diff options
468 files changed, 18099 insertions, 7686 deletions
diff --git a/Documentation/DocBook/kernel-locking.tmpl b/Documentation/DocBook/kernel-locking.tmpl index a0d479d1e1dd..f66f4df18690 100644 --- a/Documentation/DocBook/kernel-locking.tmpl +++ b/Documentation/DocBook/kernel-locking.tmpl @@ -1645,7 +1645,9 @@ the amount of locking which needs to be done. all the readers who were traversing the list when we deleted the element are finished. We use <function>call_rcu()</function> to register a callback which will actually destroy the object once - the readers are finished. + all pre-existing readers are finished. Alternatively, + <function>synchronize_rcu()</function> may be used to block until + all pre-existing are finished. </para> <para> But how does Read Copy Update know when the readers are @@ -1714,7 +1716,7 @@ the amount of locking which needs to be done. - object_put(obj); + list_del_rcu(&obj->list); cache_num--; -+ call_rcu(&obj->rcu, cache_delete_rcu, obj); ++ call_rcu(&obj->rcu, cache_delete_rcu); } /* Must be holding cache_lock */ @@ -1725,14 +1727,6 @@ the amount of locking which needs to be done. if (++cache_num > MAX_CACHE_SIZE) { struct object *i, *outcast = NULL; list_for_each_entry(i, &cache, list) { -@@ -85,6 +94,7 @@ - obj->popularity = 0; - atomic_set(&obj->refcnt, 1); /* The cache holds a reference */ - spin_lock_init(&obj->lock); -+ INIT_RCU_HEAD(&obj->rcu); - - spin_lock_irqsave(&cache_lock, flags); - __cache_add(obj); @@ -104,12 +114,11 @@ struct object *cache_find(int id) { diff --git a/Documentation/RCU/checklist.txt b/Documentation/RCU/checklist.txt index 790d1a812376..0c134f8afc6f 100644 --- a/Documentation/RCU/checklist.txt +++ b/Documentation/RCU/checklist.txt @@ -218,13 +218,22 @@ over a rather long period of time, but improvements are always welcome! include: a. Keeping a count of the number of data-structure elements - used by the RCU-protected data structure, including those - waiting for a grace period to elapse. Enforce a limit - on this number, stalling updates as needed to allow - previously deferred frees to complete. - - Alternatively, limit only the number awaiting deferred - free rather than the total number of elements. + used by the RCU-protected data structure, including + those waiting for a grace period to elapse. Enforce a + limit on this number, stalling updates as needed to allow + previously deferred frees to complete. Alternatively, + limit only the number awaiting deferred free rather than + the total number of elements. + + One way to stall the updates is to acquire the update-side + mutex. (Don't try this with a spinlock -- other CPUs + spinning on the lock could prevent the grace period + from ever ending.) Another way to stall the updates + is for the updates to use a wrapper function around + the memory allocator, so that this wrapper function + simulates OOM when there is too much memory awaiting an + RCU grace period. There are of course many other + variations on this theme. b. Limiting update rate. For example, if updates occur only once per hour, then no explicit rate limiting is required, @@ -365,3 +374,26 @@ over a rather long period of time, but improvements are always welcome! and the compiler to freely reorder code into and out of RCU read-side critical sections. It is the responsibility of the RCU update-side primitives to deal with this. + +17. Use CONFIG_PROVE_RCU, CONFIG_DEBUG_OBJECTS_RCU_HEAD, and + the __rcu sparse checks to validate your RCU code. These + can help find problems as follows: + + CONFIG_PROVE_RCU: check that accesses to RCU-protected data + structures are carried out under the proper RCU + read-side critical section, while holding the right + combination of locks, or whatever other conditions + are appropriate. + + CONFIG_DEBUG_OBJECTS_RCU_HEAD: check that you don't pass the + same object to call_rcu() (or friends) before an RCU + grace period has elapsed since the last time that you + passed that same object to call_rcu() (or friends). + + __rcu sparse checks: tag the pointer to the RCU-protected data + structure with __rcu, and sparse will warn you if you + access that pointer without the services of one of the + variants of rcu_dereference(). + + These debugging aids can help you find problems that are + otherwise extremely difficult to spot. diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt index 44c6dcc93d6d..862c08ef1fde 100644 --- a/Documentation/RCU/stallwarn.txt +++ b/Documentation/RCU/stallwarn.txt @@ -80,6 +80,24 @@ o A CPU looping with bottom halves disabled. This condition can o For !CONFIG_PREEMPT kernels, a CPU looping anywhere in the kernel without invoking schedule(). +o A CPU-bound real-time task in a CONFIG_PREEMPT kernel, which might + happen to preempt a low-priority task in the middle of an RCU + read-side critical section. This is especially damaging if + that low-priority task is not permitted to run on any other CPU, + in which case the next RCU grace period can never complete, which + will eventually cause the system to run out of memory and hang. + While the system is in the process of running itself out of + memory, you might see stall-warning messages. + +o A CPU-bound real-time task in a CONFIG_PREEMPT_RT kernel that + is running at a higher priority than the RCU softirq threads. + This will prevent RCU callbacks from ever being invoked, + and in a CONFIG_TREE_PREEMPT_RCU kernel will further prevent + RCU grace periods from ever completing. Either way, the + system will eventually run out of memory and hang. In the + CONFIG_TREE_PREEMPT_RCU case, you might see stall-warning + messages. + o A bug in the RCU implementation. o A hardware failure. This is quite unlikely, but has occurred diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt index efd8cc95c06b..a851118775d8 100644 --- a/Documentation/RCU/trace.txt +++ b/Documentation/RCU/trace.txt @@ -125,6 +125,17 @@ o "b" is the batch limit for this CPU. If more than this number of RCU callbacks is ready to invoke, then the remainder will be deferred. +o "ci" is the number of RCU callbacks that have been invoked for + this CPU. Note that ci+ql is the number of callbacks that have + been registered in absence of CPU-hotplug activity. + +o "co" is the number of RCU callbacks that have been orphaned due to + this CPU going offline. + +o "ca" is the number of RCU callbacks that have been adopted due to + other CPUs going offline. Note that ci+co-ca+ql is the number of + RCU callbacks registered on this CPU. + There is also an rcu/rcudata.csv file with the same information in comma-separated-variable spreadsheet format. @@ -180,7 +191,7 @@ o "s" is the "signaled" state that drives force_quiescent_state()'s o "jfq" is the number of jiffies remaining for this grace period before force_quiescent_state() is invoked to help push things - along. Note that CPUs in dyntick-idle mode thoughout the grace + along. Note that CPUs in dyntick-idle mode throughout the grace period will not report on their own, but rather must be check by some other CPU via force_quiescent_state(). diff --git a/Documentation/kprobes.txt b/Documentation/kprobes.txt index 1762b81fcdf2..741fe66d6eca 100644 --- a/Documentation/kprobes.txt +++ b/Documentation/kprobes.txt @@ -542,9 +542,11 @@ Kprobes does not use mutexes or allocate memory except during registration and unregistration. Probe handlers are run with preemption disabled. Depending on the -architecture, handlers may also run with interrupts disabled. In any -case, your handler should not yield the CPU (e.g., by attempting to -acquire a semaphore). +architecture and optimization state, handlers may also run with +interrupts disabled (e.g., kretprobe handlers and optimized kprobe +handlers run without interrupt disabled on x86/x86-64). In any case, +your handler should not yield the CPU (e.g., by attempting to acquire +a semaphore). Since a return probe is implemented by replacing the return address with the trampoline's address, stack backtraces and calls diff --git a/MAINTAINERS b/MAINTAINERS index 7679bf32f7bb..3d4179fbc526 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1527,6 +1527,8 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client.git S: Supported F: Documentation/filesystems/ceph.txt F: fs/ceph +F: net/ceph +F: include/linux/ceph CERTIFIED WIRELESS USB (WUSB) SUBSYSTEM: M: David Vrabel <david.vrabel@csr.com> @@ -3162,7 +3164,7 @@ F: drivers/net/ioc3-eth.c IOC3 SERIAL DRIVER M: Pat Gefre <pfg@sgi.com> -L: linux-mips@linux-mips.org +L: linux-serial@vger.kernel.org S: Maintained F: drivers/serial/ioc3_serial.c @@ -4805,6 +4807,15 @@ F: fs/qnx4/ F: include/linux/qnx4_fs.h F: include/linux/qnxtypes.h +RADOS BLOCK DEVICE (RBD) +F: include/linux/qnxtypes.h +M: Yehuda Sadeh <yehuda@hq.newdream.net> +M: Sage Weil <sage@newdream.net> +M: ceph-devel@vger.kernel.org +S: Supported +F: drivers/block/rbd.c +F: drivers/block/rbd_types.h + RADEON FRAMEBUFFER DISPLAY DRIVER M: Benjamin Herrenschmidt <benh@kernel.crashing.org> L: linux-fbdev@vger.kernel.org @@ -1,8 +1,8 @@ VERSION = 2 PATCHLEVEL = 6 SUBLEVEL = 36 -EXTRAVERSION = -rc7 -NAME = Sheep on Meth +EXTRAVERSION = +NAME = Flesh-Eating Bats with Fangs # *DOCUMENTATION* # To see a list of typical targets execute "make help" @@ -568,6 +568,12 @@ endif ifdef CONFIG_FUNCTION_TRACER KBUILD_CFLAGS += -pg +ifdef CONFIG_DYNAMIC_FTRACE + ifdef CONFIG_HAVE_C_RECORDMCOUNT + BUILD_C_RECORDMCOUNT := y + export BUILD_C_RECORDMCOUNT + endif +endif endif # We trigger additional mismatches with less inlining @@ -591,6 +597,11 @@ KBUILD_CFLAGS += $(call cc-option,-fno-strict-overflow) # conserve stack if available KBUILD_CFLAGS += $(call cc-option,-fconserve-stack) +# check for 'asm goto' +ifeq ($(shell $(CONFIG_SHELL) $(srctree)/scripts/gcc-goto.sh $(CC)), y) + KBUILD_CFLAGS += -DCC_HAVE_ASM_GOTO +endif + # Add user supplied CPPFLAGS, AFLAGS and CFLAGS as the last assignments # But warn user when we do so warn-assign = \ diff --git a/arch/Kconfig b/arch/Kconfig index fe48fc7a3eba..53d7f619a1b9 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -158,4 +158,7 @@ config HAVE_PERF_EVENTS_NMI subsystem. Also has support for calculating CPU cycle events to determine how many clock cycles in a given period. +config HAVE_ARCH_JUMP_LABEL + bool + source "kernel/gcov/Kconfig" diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig index b9647bb66d13..d04ccd73af45 100644 --- a/arch/alpha/Kconfig +++ b/arch/alpha/Kconfig @@ -9,6 +9,7 @@ config ALPHA select HAVE_IDE select HAVE_OPROFILE select HAVE_SYSCALL_WRAPPERS + select HAVE_IRQ_WORK select HAVE_PERF_EVENTS select HAVE_DMA_ATTRS help diff --git a/arch/alpha/include/asm/perf_event.h b/arch/alpha/include/asm/perf_event.h index 4157cd3c44a9..fe792ca818f6 100644 --- a/arch/alpha/include/asm/perf_event.h +++ b/arch/alpha/include/asm/perf_event.h @@ -1,11 +1,6 @@ #ifndef __ASM_ALPHA_PERF_EVENT_H #define __ASM_ALPHA_PERF_EVENT_H -/* Alpha only supports software events through this interface. */ -extern void set_perf_event_pending(void); - -#define PERF_EVENT_INDEX_OFFSET 0 - #ifdef CONFIG_PERF_EVENTS extern void init_hw_perf_events(void); #else diff --git a/arch/alpha/kernel/perf_event.c b/arch/alpha/kernel/perf_event.c index 85d8e4f58c83..1cc49683fb69 100644 --- a/arch/alpha/kernel/perf_event.c +++ b/arch/alpha/kernel/perf_event.c @@ -307,7 +307,7 @@ again: new_raw_count) != prev_raw_count) goto again; - delta = (new_raw_count - (prev_raw_count & alpha_pmu->pmc_count_mask[idx])) + ovf; + delta = (new_raw_count - (prev_raw_count & alpha_pmu->pmc_count_mask[idx])) + ovf; /* It is possible on very rare occasions that the PMC has overflowed * but the interrupt is yet to come. Detect and fix this situation. @@ -402,14 +402,13 @@ static void maybe_change_configuration(struct cpu_hw_events *cpuc) struct hw_perf_event *hwc = &pe->hw; int idx = hwc->idx; - if (cpuc->current_idx[j] != PMC_NO_INDEX) { - cpuc->idx_mask |= (1<<cpuc->current_idx[j]); - continue; + if (cpuc->current_idx[j] == PMC_NO_INDEX) { + alpha_perf_event_set_period(pe, hwc, idx); + cpuc->current_idx[j] = idx; } - alpha_perf_event_set_period(pe, hwc, idx); - cpuc->current_idx[j] = idx; - cpuc->idx_mask |= (1<<cpuc->current_idx[j]); + if (!(hwc->state & PERF_HES_STOPPED)) + cpuc->idx_mask |= (1<<cpuc->current_idx[j]); } cpuc->config = cpuc->event[0]->hw.config_base; } @@ -420,12 +419,13 @@ static void maybe_change_configuration(struct cpu_hw_events *cpuc) * - this function is called from outside this module via the pmu struct * returned from perf event initialisation. */ -static int alpha_pmu_enable(struct perf_event *event) +static int alpha_pmu_add(struct perf_event *event, int flags) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; int n0; int ret; - unsigned long flags; + unsigned long irq_flags; /* * The Sparc code has the IRQ disable first followed by the perf @@ -435,8 +435,8 @@ static int alpha_pmu_enable(struct perf_event *event) * nevertheless we disable the PMCs first to enable a potential * final PMI to occur before we disable interrupts. */ - perf_disable(); - local_irq_save(flags); + perf_pmu_disable(event->pmu); + local_irq_save(irq_flags); /* Default to error to be returned */ ret = -EAGAIN; @@ -455,8 +455,12 @@ static int alpha_pmu_enable(struct perf_event *event) } } - local_irq_restore(flags); - perf_enable(); + hwc->state = PERF_HES_UPTODATE; + if (!(flags & PERF_EF_START)) + hwc->state |= PERF_HES_STOPPED; + + local_irq_restore(irq_flags); + perf_pmu_enable(event->pmu); return ret; } @@ -467,15 +471,15 @@ static int alpha_pmu_enable(struct perf_event *event) * - this function is called from outside this module via the pmu struct * returned from perf event initialisation. */ -static void alpha_pmu_disable(struct perf_event *event) +static void alpha_pmu_del(struct perf_event *event, int flags) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); struct hw_perf_event *hwc = &event->hw; - unsigned long flags; + unsigned long irq_flags; int j; - perf_disable(); - local_irq_save(flags); + perf_pmu_disable(event->pmu); + local_irq_save(irq_flags); for (j = 0; j < cpuc->n_events; j++) { if (event == cpuc->event[j]) { @@ -501,8 +505,8 @@ static void alpha_pmu_disable(struct perf_event *event) } } - local_irq_restore(flags); - perf_enable(); + local_irq_restore(irq_flags); + perf_pmu_enable(event->pmu); } @@ -514,13 +518,44 @@ static void alpha_pmu_read(struct perf_event *event) } -static void alpha_pmu_unthrottle(struct perf_event *event) +static void alpha_pmu_stop(struct perf_event *event, int flags) +{ + struct hw_perf_event *hwc = &event->hw; + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + + if (!(hwc->state & PERF_HES_STOPPED)) { + cpuc->idx_mask &= ~(1UL<<hwc->idx); + hwc->state |= PERF_HES_STOPPED; + } + + if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { + alpha_perf_event_update(event, hwc, hwc->idx, 0); + hwc->state |= PERF_HES_UPTODATE; + } + + if (cpuc->enabled) + wrperfmon(PERFMON_CMD_DISABLE, (1UL<<hwc->idx)); +} + + +static void alpha_pmu_start(struct perf_event *event, int flags) { struct hw_perf_event *hwc = &event->hw; struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED))) + return; + + if (flags & PERF_EF_RELOAD) { + WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE)); + alpha_perf_event_set_period(event, hwc, hwc->idx); + } + + hwc->state = 0; + cpuc->idx_mask |= 1UL<<hwc->idx; - wrperfmon(PERFMON_CMD_ENABLE, (1UL<<hwc->idx)); + if (cpuc->enabled) + wrperfmon(PERFMON_CMD_ENABLE, (1UL<<hwc->idx)); } @@ -642,39 +677,36 @@ static int __hw_perf_event_init(struct perf_event *event) return 0; } -static const struct pmu pmu = { - .enable = alpha_pmu_enable, - .disable = alpha_pmu_disable, - .read = alpha_pmu_read, - .unthrottle = alpha_pmu_unthrottle, -}; - - /* * Main entry point to initialise a HW performance event. */ -const struct pmu *hw_perf_event_init(struct perf_event *event) +static int alpha_pmu_event_init(struct perf_event *event) { int err; + switch (event->attr.type) { + case PERF_TYPE_RAW: + case PERF_TYPE_HARDWARE: + case PERF_TYPE_HW_CACHE: + break; + + default: + return -ENOENT; + } + if (!alpha_pmu) - return ERR_PTR(-ENODEV); + return -ENODEV; /* Do the real initialisation work. */ err = __hw_perf_event_init(event); - if (err) - return ERR_PTR(err); - - return &pmu; + return err; } - - /* * Main entry point - enable HW performance counters. */ -void hw_perf_enable(void) +static void alpha_pmu_enable(struct pmu *pmu) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); @@ -700,7 +732,7 @@ void hw_perf_enable(void) * Main entry point - disable HW performance counters. */ -void hw_perf_disable(void) +static void alpha_pmu_disable(struct pmu *pmu) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); @@ -713,6 +745,17 @@ void hw_perf_disable(void) wrperfmon(PERFMON_CMD_DISABLE, cpuc->idx_mask); } +static struct pmu pmu = { + .pmu_enable = alpha_pmu_enable, + .pmu_disable = alpha_pmu_disable, + .event_init = alpha_pmu_event_init, + .add = alpha_pmu_add, + .del = alpha_pmu_del, + .start = alpha_pmu_start, + .stop = alpha_pmu_stop, + .read = alpha_pmu_read, +}; + /* * Main entry point - don't know when this is called but it @@ -766,7 +809,7 @@ static void alpha_perf_event_irq_handler(unsigned long la_ptr, wrperfmon(PERFMON_CMD_DISABLE, cpuc->idx_mask); /* la_ptr is the counter that overflowed. */ - if (unlikely(la_ptr >= perf_max_events)) { + if (unlikely(la_ptr >= alpha_pmu->num_pmcs)) { /* This should never occur! */ irq_err_count++; pr_warning("PMI: silly index %ld\n", la_ptr); @@ -807,7 +850,7 @@ static void alpha_perf_event_irq_handler(unsigned long la_ptr, /* Interrupts coming too quickly; "throttle" the * counter, i.e., disable it for a little while. */ - cpuc->idx_mask &= ~(1UL<<idx); + alpha_pmu_stop(event, 0); } } wrperfmon(PERFMON_CMD_ENABLE, cpuc->idx_mask); @@ -837,6 +880,7 @@ void __init init_hw_perf_events(void) /* And set up PMU specification */ alpha_pmu = &ev67_pmu; - perf_max_events = alpha_pmu->num_pmcs; + + perf_pmu_register(&pmu); } diff --git a/arch/alpha/kernel/time.c b/arch/alpha/kernel/time.c index 396af1799ea4..0f1d8493cfca 100644 --- a/arch/alpha/kernel/time.c +++ b/arch/alpha/kernel/time.c @@ -41,7 +41,7 @@ #include <linux/init.h> #include <linux/bcd.h> #include <linux/profile.h> -#include <linux/perf_event.h> +#include <linux/irq_work.h> #include <asm/uaccess.h> #include <asm/io.h> @@ -83,25 +83,25 @@ static struct { unsigned long est_cycle_freq; -#ifdef CONFIG_PERF_EVENTS +#ifdef CONFIG_IRQ_WORK -DEFINE_PER_CPU(u8, perf_event_pending); +DEFINE_PER_CPU(u8, irq_work_pending); -#define set_perf_event_pending_flag() __get_cpu_var(perf_event_pending) = 1 -#define test_perf_event_pending() __get_cpu_var(perf_event_pending) -#define clear_perf_event_pending() __get_cpu_var(perf_event_pending) = 0 +#define set_irq_work_pending_flag() __get_cpu_var(irq_work_pending) = 1 +#define test_irq_work_pending() __get_cpu_var(irq_work_pending) +#define clear_irq_work_pending() __get_cpu_var(irq_work_pending) = 0 -void set_perf_event_pending(void) +void set_irq_work_pending(void) { - set_perf_event_pending_flag(); + set_irq_work_pending_flag(); } -#else /* CONFIG_PERF_EVENTS */ +#else /* CONFIG_IRQ_WORK */ -#define test_perf_event_pending() 0 -#define clear_perf_event_pending() +#define test_irq_work_pending() 0 +#define clear_irq_work_pending() -#endif /* CONFIG_PERF_EVENTS */ +#endif /* CONFIG_IRQ_WORK */ static inline __u32 rpcc(void) @@ -191,9 +191,9 @@ irqreturn_t timer_interrupt(int irq, void *dev) write_sequnlock(&xtime_lock); - if (test_perf_event_pending()) { - clear_perf_event_pending(); - perf_event_do_pending(); + if (test_irq_work_pending()) { + clear_irq_work_pending(); + irq_work_run(); } #ifndef CONFIG_SMP diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 9c26ba7244fb..9103904b3dab 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -23,6 +23,7 @@ config ARM select HAVE_KERNEL_GZIP select HAVE_KERNEL_LZO select HAVE_KERNEL_LZMA + select HAVE_IRQ_WORK select HAVE_PERF_EVENTS select PERF_USE_VMALLOC select HAVE_REGS_AND_STACK_ACCESS_API diff --git a/arch/arm/include/asm/perf_event.h b/arch/arm/include/asm/perf_event.h index b5799a3b7117..c4aa4e8c6af9 100644 --- a/arch/arm/include/asm/perf_event.h +++ b/arch/arm/include/asm/perf_event.h @@ -12,18 +12,6 @@ #ifndef __ARM_PERF_EVENT_H__ #define __ARM_PERF_EVENT_H__ -/* - * NOP: on *most* (read: all supported) ARM platforms, the performance - * counter interrupts are regular interrupts and not an NMI. This - * means that when we receive the interrupt we can call - * perf_event_do_pending() that handles all of the work with - * interrupts disabled. - */ -static inline void -set_perf_event_pending(void) -{ -} - /* ARM performance counters start from 1 (in the cp15 accesses) so use the * same indexes here for consistency. */ #define PERF_EVENT_INDEX_OFFSET 1 diff --git a/arch/arm/kernel/perf_event.c b/arch/arm/kernel/perf_event.c index ecbb0288e5dd..49643b1467e6 100644 --- a/arch/arm/kernel/perf_event.c +++ b/arch/arm/kernel/perf_event.c @@ -123,6 +123,12 @@ armpmu_get_max_events(void) } EXPORT_SYMBOL_GPL(armpmu_get_max_events); +int perf_num_counters(void) +{ + return armpmu_get_max_events(); +} +EXPORT_SYMBOL_GPL(perf_num_counters); + #define HW_OP_UNSUPPORTED 0xFFFF #define C(_x) \ @@ -221,46 +227,56 @@ again: } static void -armpmu_disable(struct perf_event *event) +armpmu_read(struct perf_event *event) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); struct hw_perf_event *hwc = &event->hw; - int idx = hwc->idx; - - WARN_ON(idx < 0); - - clear_bit(idx, cpuc->active_mask); - armpmu->disable(hwc, idx); - - barrier(); - armpmu_event_update(event, hwc, idx); - cpuc->events[idx] = NULL; - clear_bit(idx, cpuc->used_mask); + /* Don't read disabled counters! */ + if (hwc->idx < 0) + return; - perf_event_update_userpage(event); + armpmu_event_update(event, hwc, hwc->idx); } static void -armpmu_read(struct perf_event *event) +armpmu_stop(struct perf_event *event, int flags) { struct hw_perf_event *hwc = &event->hw; - /* Don't read disabled counters! */ - if (hwc->idx < 0) + if (!armpmu) return; - armpmu_event_update(event, hwc, hwc->idx); + /* + * ARM pmu always has to update the counter, so ignore + * PERF_EF_UPDATE, see comments in armpmu_start(). + */ + if (!(hwc->state & PERF_HES_STOPPED)) { + armpmu->disable(hwc, hwc->idx); + barrier(); /* why? */ + armpmu_event_update(event, hwc, hwc->idx); + hwc->state |= PERF_HES_STOPPED | PERF_HES_UPTODATE; + } } static void -armpmu_unthrottle(struct perf_event *event) +armpmu_start(struct perf_event *event, int flags) { struct hw_perf_event *hwc = &event->hw; + if (!armpmu) + return; + + /* + * ARM pmu always has to reprogram the period, so ignore + * PERF_EF_RELOAD, see the comment below. + */ + if (flags & PERF_EF_RELOAD) + WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE)); + + hwc->state = 0; /* * Set the period again. Some counters can't be stopped, so when we - * were throttled we simply disabled the IRQ source and the counter + * were stopped we simply disabled the IRQ source and the counter * may have been left counting. If we don't do this step then we may * get an interrupt too soon or *way* too late if the overflow has * happened since disabling. @@ -269,14 +285,33 @@ armpmu_unthrottle(struct perf_event *event) armpmu->enable(hwc, hwc->idx); } +static void +armpmu_del(struct perf_event *event, int flags) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; + int idx = hwc->idx; + + WARN_ON(idx < 0); + + clear_bit(idx, cpuc->active_mask); + armpmu_stop(event, PERF_EF_UPDATE); + cpuc->events[idx] = NULL; + clear_bit(idx, cpuc->used_mask); + + perf_event_update_userpage(event); +} + static int -armpmu_enable(struct perf_event *event) +armpmu_add(struct perf_event *event, int flags) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); struct hw_perf_event *hwc = &event->hw; int idx; int err = 0; + perf_pmu_disable(event->pmu); + /* If we don't have a space for the counter then finish early. */ idx = armpmu->get_event_idx(cpuc, hwc); if (idx < 0) { @@ -293,25 +328,19 @@ armpmu_enable(struct perf_event *event) cpuc->events[idx] = event; set_bit(idx, cpuc->active_mask); - /* Set the period for the event. */ - armpmu_event_set_period(event, hwc, idx); - - /* Enable the event. */ - armpmu->enable(hwc, idx); + hwc->state = PERF_HES_STOPPED | PERF_HES_UPTODATE; + if (flags & PERF_EF_START) + armpmu_start(event, PERF_EF_RELOAD); /* Propagate our changes to the userspace mapping. */ perf_event_update_userpage(event); out: + perf_pmu_enable(event->pmu); return err; } -static struct pmu pmu = { - .enable = armpmu_enable, - .disable = armpmu_disable, - .unthrottle = armpmu_unthrottle, - .read = armpmu_read, -}; +static struct pmu pmu; static int validate_event(struct cpu_hw_events *cpuc, @@ -491,20 +520,29 @@ __hw_perf_event_init(struct perf_event *event) return err; } -const struct pmu * -hw_perf_event_init(struct perf_event *event) +static int armpmu_event_init(struct perf_event *event) { int err = 0; + switch (event->attr.type) { + case PERF_TYPE_RAW: + case PERF_TYPE_HARDWARE: + case PERF_TYPE_HW_CACHE: + break; + + default: + return -ENOENT; + } + if (!armpmu) - return ERR_PTR(-ENODEV); + return -ENODEV; event->destroy = hw_perf_event_destroy; if (!atomic_inc_not_zero(&active_events)) { - if (atomic_read(&active_events) > perf_max_events) { + if (atomic_read(&active_events) > armpmu->num_events) { atomic_dec(&active_events); - return ERR_PTR(-ENOSPC); + return -ENOSPC; } mutex_lock(&pmu_reserve_mutex); @@ -518,17 +556,16 @@ hw_perf_event_init(struct perf_event *event) } if (err) - return ERR_PTR(err); + return err; err = __hw_perf_event_init(event); if (err) hw_perf_event_destroy(event); - return err ? ERR_PTR(err) : &pmu; + return err; } -void -hw_perf_enable(void) +static void armpmu_enable(struct pmu *pmu) { /* Enable all of the perf events on hardware. */ int idx; @@ -549,13 +586,23 @@ hw_perf_enable(void) armpmu->start(); } -void -hw_perf_disable(void) +static void armpmu_disable(struct pmu *pmu) { if (armpmu) armpmu->stop(); } +static struct pmu pmu = { + .pmu_enable = armpmu_enable, + .pmu_disable = armpmu_disable, + .event_init = armpmu_event_init, + .add = armpmu_add, + .del = armpmu_del, + .start = armpmu_start, + .stop = armpmu_stop, + .read = armpmu_read, +}; + /* * ARMv6 Performance counter handling code. * @@ -1045,7 +1092,7 @@ armv6pmu_handle_irq(int irq_num, * platforms that can have the PMU interrupts raised as an NMI, this * will not work. */ - perf_event_do_pending(); + irq_work_run(); return IRQ_HANDLED; } @@ -2021,7 +2068,7 @@ static irqreturn_t armv7pmu_handle_irq(int irq_num, void *dev) * platforms that can have the PMU interrupts raised as an NMI, this * will not work. */ - perf_event_do_pending(); + irq_work_run(); return IRQ_HANDLED; } @@ -2389,7 +2436,7 @@ xscale1pmu_handle_irq(int irq_num, void *dev) armpmu->disable(hwc, idx); } - perf_event_do_pending(); + irq_work_run(); /* * Re-enable the PMU. @@ -2716,7 +2763,7 @@ xscale2pmu_handle_irq(int irq_num, void *dev) armpmu->disable(hwc, idx); } - perf_event_do_pending(); + irq_work_run(); /* * Re-enable the PMU. @@ -2933,14 +2980,12 @@ init_hw_perf_events(void) armpmu = &armv6pmu; memcpy(armpmu_perf_cache_map, armv6_perf_cache_map, sizeof(armv6_perf_cache_map)); - perf_max_events = armv6pmu.num_events; break; case 0xB020: /* ARM11mpcore */ armpmu = &armv6mpcore_pmu; memcpy(armpmu_perf_cache_map, armv6mpcore_perf_cache_map, sizeof(armv6mpcore_perf_cache_map)); - perf_max_events = armv6mpcore_pmu.num_events; break; case 0xC080: /* Cortex-A8 */ armv7pmu.id = ARM_PERF_PMU_ID_CA8; @@ -2952,7 +2997,6 @@ init_hw_perf_events(void) /* Reset PMNC and read the nb of CNTx counters supported */ armv7pmu.num_events = armv7_reset_read_pmnc(); - perf_max_events = armv7pmu.num_events; break; case 0xC090: /* Cortex-A9 */ armv7pmu.id = ARM_PERF_PMU_ID_CA9; @@ -2964,7 +3008,6 @@ init_hw_perf_events(void) /* Reset PMNC and read the nb of CNTx counters supported */ armv7pmu.num_events = armv7_reset_read_pmnc(); - perf_max_events = armv7pmu.num_events; break; } /* Intel CPUs [xscale]. */ @@ -2975,13 +3018,11 @@ init_hw_perf_events(void) armpmu = &xscale1pmu; memcpy(armpmu_perf_cache_map, xscale_perf_cache_map, sizeof(xscale_perf_cache_map)); - perf_max_events = xscale1pmu.num_events; break; case 2: armpmu = &xscale2pmu; memcpy(armpmu_perf_cache_map, xscale_perf_cache_map, sizeof(xscale_perf_cache_map)); - perf_max_events = xscale2pmu.num_events; break; } } @@ -2991,9 +3032,10 @@ init_hw_perf_events(void) arm_pmu_names[armpmu->id], armpmu->num_events); } else { pr_info("no hardware support available\n"); - perf_max_events = -1; } + perf_pmu_register(&pmu); + return 0; } arch_initcall(init_hw_perf_events); @@ -3001,13 +3043,6 @@ arch_initcall(init_hw_perf_events); /* * Callchain handling code. */ -static inline void -callchain_store(struct perf_callchain_entry *entry, - u64 ip) -{ - if (entry->nr < PERF_MAX_STACK_DEPTH) - entry->ip[entry->nr++] = ip; -} /* * The registers we're interested in are at the end of the variable @@ -3039,7 +3074,7 @@ user_backtrace(struct frame_tail *tail, if (__copy_from_user_inatomic(&buftail, tail, sizeof(buftail))) return NULL; - callchain_store(entry, buftail.lr); + perf_callchain_store(entry, buftail.lr); /* * Frame pointers should strictly progress back up the stack @@ -3051,16 +3086,11 @@ user_backtrace(struct frame_tail *tail, return buftail.fp - 1; } -static void -perf_callchain_user(struct pt_regs *regs, - struct perf_callchain_entry *entry) +void +perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) { struct frame_tail *tail; - callchain_store(entry, PERF_CONTEXT_USER); - - if (!user_mode(regs)) - regs = task_pt_regs(current); tail = (struct frame_tail *)regs->ARM_fp - 1; @@ -3078,56 +3108,18 @@ callchain_trace(struct stackframe *fr, void *data) { struct perf_callchain_entry *entry = data; - callchain_store(entry, fr->pc); + perf_callchain_store(entry, fr->pc); return 0; } -static void -perf_callchain_kernel(struct pt_regs *regs, - struct perf_callchain_entry *entry) +void +perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs) { struct stackframe fr; - callchain_store(entry, PERF_CONTEXT_KERNEL); fr.fp = regs->ARM_fp; fr.sp = regs->ARM_sp; fr.lr = regs->ARM_lr; fr.pc = regs->ARM_pc; walk_stackframe(&fr, callchain_trace, entry); } - -static void -perf_do_callchain(struct pt_regs *regs, - struct perf_callchain_entry *entry) -{ - int is_user; - - if (!regs) - return; - - is_user = user_mode(regs); - - if (!current || !current->pid) - return; - - if (is_user && current->state != TASK_RUNNING) - return; - - if (!is_user) - perf_callchain_kernel(regs, entry); - - if (current->mm) - perf_callchain_user(regs, entry); -} - -static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry); - -struct perf_callchain_entry * -perf_callchain(struct pt_regs *regs) -{ - struct perf_callchain_entry *entry = &__get_cpu_var(pmc_irq_entry); - - entry->nr = 0; - perf_do_callchain(regs, entry); - return entry; -} diff --git a/arch/arm/mach-bcmring/dma.c b/arch/arm/mach-bcmring/dma.c index 29c0a911df26..77eb35c89cd0 100644 --- a/arch/arm/mach-bcmring/dma.c +++ b/arch/arm/mach-bcmring/dma.c @@ -691,7 +691,7 @@ int dma_init(void) memset(&gDMA, 0, sizeof(gDMA)); - init_MUTEX_LOCKED(&gDMA.lock); + sema_init(&gDMA.lock, 0); init_waitqueue_head(&gDMA.freeChannelQ); /* Initialize the Hardware */ @@ -1574,7 +1574,7 @@ int dma_init_mem_map(DMA_MemMap_t *memMap) { memset(memMap, 0, sizeof(*memMap)); - init_MUTEX(&memMap->lock); + sema_init(&memMap->lock, 1); return 0; } diff --git a/arch/arm/oprofile/Makefile b/arch/arm/oprofile/Makefile index e666eafed152..b2215c61cdf0 100644 --- a/arch/arm/oprofile/Makefile +++ b/arch/arm/oprofile/Makefile @@ -6,4 +6,8 @@ DRIVER_OBJS = $(addprefix ../../../drivers/oprofile/, \ oprofilefs.o oprofile_stats.o \ timer_int.o ) +ifeq ($(CONFIG_HW_PERF_EVENTS),y) +DRIVER_OBJS += $(addprefix ../../../drivers/oprofile/, oprofile_perf.o) +endif + oprofile-y := $(DRIVER_OBJS) common.o diff --git a/arch/arm/oprofile/common.c b/arch/arm/oprofile/common.c index 72e09eb642dd..8aa974491dfc 100644 --- a/arch/arm/oprofile/common.c +++ b/arch/arm/oprofile/common.c @@ -25,139 +25,10 @@ #include <asm/ptrace.h> #ifdef CONFIG_HW_PERF_EVENTS -/* - * Per performance monitor configuration as set via oprofilefs. - */ -struct op_counter_config { - unsigned long count; - unsigned long enabled; - unsigned long event; - unsigned long unit_mask; - unsigned long kernel; - unsigned long user; - struct perf_event_attr attr; -}; - -static int op_arm_enabled; -static DEFINE_MUTEX(op_arm_mutex); - -static struct op_counter_config *counter_config; -static struct perf_event **perf_events[nr_cpumask_bits]; -static int perf_num_counters; - -/* - * Overflow callback for oprofile. - */ -static void op_overflow_handler(struct perf_event *event, int unused, - struct perf_sample_data *data, struct pt_regs *regs) +char *op_name_from_perf_id(void) { - int id; - u32 cpu = smp_processor_id(); - - for (id = 0; id < perf_num_counters; ++id) - if (perf_events[cpu][id] == event) - break; - - if (id != perf_num_counters) - oprofile_add_sample(regs, id); - else - pr_warning("oprofile: ignoring spurious overflow " - "on cpu %u\n", cpu); -} - -/* - * Called by op_arm_setup to create perf attributes to mirror the oprofile - * settings in counter_config. Attributes are created as `pinned' events and - * so are permanently scheduled on the PMU. - */ -static void op_perf_setup(void) -{ - int i; - u32 size = sizeof(struct perf_event_attr); - struct perf_event_attr *attr; - - for (i = 0; i < perf_num_counters; ++i) { - attr = &counter_config[i].attr; - memset(attr, 0, size); - attr->type = PERF_TYPE_RAW; - attr->size = size; - attr->config = counter_config[i].event; - attr->sample_period = counter_config[i].count; - attr->pinned = 1; - } -} - -static int op_create_counter(int cpu, int event) -{ - int ret = 0; - struct perf_event *pevent; - - if (!counter_config[event].enabled || (perf_events[cpu][event] != NULL)) - return ret; - - pevent = perf_event_create_kernel_counter(&counter_config[event].attr, - cpu, -1, - op_overflow_handler); - - if (IS_ERR(pevent)) { - ret = PTR_ERR(pevent); - } else if (pevent->state != PERF_EVENT_STATE_ACTIVE) { - perf_event_release_kernel(pevent); - pr_warning("oprofile: failed to enable event %d " - "on CPU %d\n", event, cpu); - ret = -EBUSY; - } else { - perf_events[cpu][event] = pevent; - } - - return ret; -} + enum arm_perf_pmu_ids id = armpmu_get_pmu_id(); -static void op_destroy_counter(int cpu, int event) -{ - struct perf_event *pevent = perf_events[cpu][event]; - - if (pevent) { - perf_event_release_kernel(pevent); - perf_events[cpu][event] = NULL; - } -} - -/* - * Called by op_arm_start to create active perf events based on the - * perviously configured attributes. - */ -static int op_perf_start(void) -{ - int cpu, event, ret = 0; - - for_each_online_cpu(cpu) { - for (event = 0; event < perf_num_counters; ++event) { - ret = op_create_counter(cpu, event); - if (ret) - goto out; - } - } - -out: - return ret; -} - -/* - * Called by op_arm_stop at the end of a profiling run. - */ -static void op_perf_stop(void) -{ - int cpu, event; - - for_each_online_cpu(cpu) - for (event = 0; event < perf_num_counters; ++event) - op_destroy_counter(cpu, event); -} - - -static char *op_name_from_perf_id(enum arm_perf_pmu_ids id) -{ switch (id) { case ARM_PERF_PMU_ID_XSCALE1: return "arm/xscale1"; @@ -176,116 +47,6 @@ static char *op_name_from_perf_id(enum arm_perf_pmu_ids id) } } -static int op_arm_create_files(struct super_block *sb, struct dentry *root) -{ - unsigned int i; - - for (i = 0; i < perf_num_counters; i++) { - struct dentry *dir; - char buf[4]; - - snprintf(buf, sizeof buf, "%d", i); - dir = oprofilefs_mkdir(sb, root, buf); - oprofilefs_create_ulong(sb, dir, "enabled", &counter_config[i].enabled); - oprofilefs_create_ulong(sb, dir, "event", &counter_config[i].event); - oprofilefs_create_ulong(sb, dir, "count", &counter_config[i].count); - oprofilefs_create_ulong(sb, dir, "unit_mask", &counter_config[i].unit_mask); - oprofilefs_create_ulong(sb, dir, "kernel", &counter_config[i].kernel); - oprofilefs_create_ulong(sb, dir, "user", &counter_config[i].user); - } - - return 0; -} - -static int op_arm_setup(void) -{ - spin_lock(&oprofilefs_lock); - op_perf_setup(); - spin_unlock(&oprofilefs_lock); - return 0; -} - -static int op_arm_start(void) -{ - int ret = -EBUSY; - - mutex_lock(&op_arm_mutex); - if (!op_arm_enabled) { - ret = 0; - op_perf_start(); - op_arm_enabled = 1; - } - mutex_unlock(&op_arm_mutex); - return ret; -} - -static void op_arm_stop(void) -{ - mutex_lock(&op_arm_mutex); - if (op_arm_enabled) - op_perf_stop(); - op_arm_enabled = 0; - mutex_unlock(&op_arm_mutex); -} - -#ifdef CONFIG_PM -static int op_arm_suspend(struct platform_device *dev, pm_message_t state) -{ - mutex_lock(&op_arm_mutex); - if (op_arm_enabled) - op_perf_stop(); - mutex_unlock(&op_arm_mutex); - return 0; -} - -static int op_arm_resume(struct platform_device *dev) -{ - mutex_lock(&op_arm_mutex); - if (op_arm_enabled && op_perf_start()) - op_arm_enabled = 0; - mutex_unlock(&op_arm_mutex); - return 0; -} - -static struct platform_driver oprofile_driver = { - .driver = { - .name = "arm-oprofile", - }, - .resume = op_arm_resume, - .suspend = op_arm_suspend, -}; - -static struct platform_device *oprofile_pdev; - -static int __init init_driverfs(void) -{ - int ret; - - ret = platform_driver_register(&oprofile_driver); - if (ret) - goto out; - - oprofile_pdev = platform_device_register_simple( - oprofile_driver.driver.name, 0, NULL, 0); - if (IS_ERR(oprofile_pdev)) { - ret = PTR_ERR(oprofile_pdev); - platform_driver_unregister(&oprofile_driver); - } - -out: - return ret; -} - -static void exit_driverfs(void) -{ - platform_device_unregister(oprofile_pdev); - platform_driver_unregister(&oprofile_driver); -} -#else -static int __init init_driverfs(void) { return 0; } -#define exit_driverfs() do { } while (0) -#endif /* CONFIG_PM */ - static int report_trace(struct stackframe *frame, void *d) { unsigned int *depth = d; @@ -350,74 +111,14 @@ static void arm_backtrace(struct pt_regs * const regs, unsigned int depth) int __init oprofile_arch_init(struct oprofile_operations *ops) { - int cpu, ret = 0; - - perf_num_counters = armpmu_get_max_events(); - - counter_config = kcalloc(perf_num_counters, - sizeof(struct op_counter_config), GFP_KERNEL); - - if (!counter_config) { - pr_info("oprofile: failed to allocate %d " - "counters\n", perf_num_counters); - return -ENOMEM; - } - - ret = init_driverfs(); - if (ret) { - kfree(counter_config); - counter_config = NULL; - return ret; - } - - for_each_possible_cpu(cpu) { - perf_events[cpu] = kcalloc(perf_num_counters, - sizeof(struct perf_event *), GFP_KERNEL); - if (!perf_events[cpu]) { - pr_info("oprofile: failed to allocate %d perf events " - "for cpu %d\n", perf_num_counters, cpu); - while (--cpu >= 0) - kfree(perf_events[cpu]); - return -ENOMEM; - } - } - ops->backtrace = arm_backtrace; - ops->create_files = op_arm_create_files; - ops->setup = op_arm_setup; - ops->start = op_arm_start; - ops->stop = op_arm_stop; - ops->shutdown = op_arm_stop; - ops->cpu_type = op_name_from_perf_id(armpmu_get_pmu_id()); - - if (!ops->cpu_type) - ret = -ENODEV; - else - pr_info("oprofile: using %s\n", ops->cpu_type); - return ret; + return oprofile_perf_init(ops); } -void oprofile_arch_exit(void) +void __exit oprofile_arch_exit(void) { - int cpu, id; - struct perf_event *event; - - if (*perf_events) { - for_each_possible_cpu(cpu) { - for (id = 0; id < perf_num_counters; ++id) { - event = perf_events[cpu][id]; - if (event != NULL) - perf_event_release_kernel(event); - } - kfree(perf_events[cpu]); - } - } - - if (counter_config) { - kfree(counter_config); - exit_driverfs(); - } + oprofile_perf_exit(); } #else int __init oprofile_arch_init(struct oprofile_operations *ops) @@ -425,5 +126,5 @@ int __init oprofile_arch_init(struct oprofile_operations *ops) pr_info("oprofile: hardware counters not available\n"); return -ENODEV; } -void oprofile_arch_exit(void) {} +void __exit oprofile_arch_exit(void) {} #endif /* CONFIG_HW_PERF_EVENTS */ diff --git a/arch/frv/Kconfig b/arch/frv/Kconfig index 16399bd24993..0f2417df6323 100644 --- a/arch/frv/Kconfig +++ b/arch/frv/Kconfig @@ -7,6 +7,7 @@ config FRV default y select HAVE_IDE select HAVE_ARCH_TRACEHOOK + select HAVE_IRQ_WORK select HAVE_PERF_EVENTS config ZONE_DMA diff --git a/arch/frv/lib/Makefile b/arch/frv/lib/Makefile index f4709756d0d9..4ff2fb1e6b16 100644 --- a/arch/frv/lib/Makefile +++ b/arch/frv/lib/Makefile @@ -5,4 +5,4 @@ lib-y := \ __ashldi3.o __lshrdi3.o __muldi3.o __ashrdi3.o __negdi2.o __ucmpdi2.o \ checksum.o memcpy.o memset.o atomic-ops.o atomic64-ops.o \ - outsl_ns.o outsl_sw.o insl_ns.o insl_sw.o cache.o perf_event.o + outsl_ns.o outsl_sw.o insl_ns.o insl_sw.o cache.o diff --git a/arch/frv/lib/perf_event.c b/arch/frv/lib/perf_event.c deleted file mode 100644 index 9ac5acfd2e91..000000000000 --- a/arch/frv/lib/perf_event.c +++ /dev/null @@ -1,19 +0,0 @@ -/* Performance event handling - * - * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. - */ - -#include <linux/perf_event.h> - -/* - * mark the performance event as pending - */ -void set_perf_event_pending(void) -{ -} diff --git a/arch/ia64/include/asm/hardirq.h b/arch/ia64/include/asm/hardirq.h index d514cd9edb49..8fb7d33a661f 100644 --- a/arch/ia64/include/asm/hardirq.h +++ b/arch/ia64/include/asm/hardirq.h @@ -6,12 +6,6 @@ * David Mosberger-Tang <davidm@hpl.hp.com> */ - -#include <linux/threads.h> -#include <linux/irq.h> - -#include <asm/processor.h> - /* * No irq_cpustat_t for IA-64. The data is held in the per-CPU data structure. */ @@ -20,6 +14,11 @@ #define local_softirq_pending() (local_cpu_data->softirq_pending) +#include <linux/threads.h> +#include <linux/irq.h> + +#include <asm/processor.h> + extern void __iomem *ipi_base_addr; void ack_bad_irq(unsigned int irq); diff --git a/arch/m32r/include/asm/elf.h b/arch/m32r/include/asm/elf.h index 2f85412ef730..b8da7d0574d2 100644 --- a/arch/m32r/include/asm/elf.h +++ b/arch/m32r/include/asm/elf.h @@ -82,9 +82,9 @@ typedef elf_fpreg_t elf_fpregset_t; * These are used to set parameters in the core dumps. */ #define ELF_CLASS ELFCLASS32 -#if defined(__LITTLE_ENDIAN) +#if defined(__LITTLE_ENDIAN__) #define ELF_DATA ELFDATA2LSB -#elif defined(__BIG_ENDIAN) +#elif defined(__BIG_ENDIAN__) #define ELF_DATA ELFDATA2MSB #else #error no endian defined diff --git a/arch/m32r/kernel/.gitignore b/arch/m32r/kernel/.gitignore new file mode 100644 index 000000000000..c5f676c3c224 --- /dev/null +++ b/arch/m32r/kernel/.gitignore @@ -0,0 +1 @@ +vmlinux.lds diff --git a/arch/m32r/kernel/signal.c b/arch/m32r/kernel/signal.c index 7bbe38645ed5..a08697f0886d 100644 --- a/arch/m32r/kernel/signal.c +++ b/arch/m32r/kernel/signal.c @@ -28,6 +28,8 @@ #define DEBUG_SIG 0 +#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) + asmlinkage int sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, unsigned long r2, unsigned long r3, unsigned long r4, @@ -254,7 +256,7 @@ give_sigsegv: static int prev_insn(struct pt_regs *regs) { u16 inst; - if (get_user(&inst, (u16 __user *)(regs->bpc - 2))) + if (get_user(inst, (u16 __user *)(regs->bpc - 2))) return -EFAULT; if ((inst & 0xfff0) == 0x10f0) /* trap ? */ regs->bpc -= 2; diff --git a/arch/mips/Kbuild b/arch/mips/Kbuild index e322d65f33a4..7dd65cfae837 100644 --- a/arch/mips/Kbuild +++ b/arch/mips/Kbuild @@ -7,6 +7,10 @@ subdir-ccflags-y := -Werror include arch/mips/Kbuild.platforms obj-y := $(platform-y) +# make clean traverses $(obj-) without having included .config, so +# everything ends up here +obj- := $(platform-) + # mips object files # The object files are linked as core-y files would be linked diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 5526faabfc21..4c9f402295dd 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -881,11 +881,15 @@ config NO_IOPORT config GENERIC_ISA_DMA bool select ZONE_DMA if GENERIC_ISA_DMA_SUPPORT_BROKEN=n + select ISA_DMA_API config GENERIC_ISA_DMA_SUPPORT_BROKEN bool select GENERIC_ISA_DMA +config ISA_DMA_API + bool + config GENERIC_GPIO bool diff --git a/arch/mips/boot/compressed/Makefile b/arch/mips/boot/compressed/Makefile index 5fd7f7a58b7e..5042d51b0512 100644 --- a/arch/mips/boot/compressed/Makefile +++ b/arch/mips/boot/compressed/Makefile @@ -105,4 +105,4 @@ OBJCOPYFLAGS_vmlinuz.srec := $(OBJCOPYFLAGS) -S -O srec vmlinuz.srec: vmlinuz $(call cmd,objcopy) -clean-files := $(objtree)/vmlinuz.* +clean-files := $(objtree)/vmlinuz $(objtree)/vmlinuz.{32,ecoff,bin,srec} diff --git a/arch/mips/dec/Platform b/arch/mips/dec/Platform index 3adbcbd95db1..cf55a6f4e720 100644 --- a/arch/mips/dec/Platform +++ b/arch/mips/dec/Platform @@ -1,7 +1,7 @@ # # DECstation family # -platform-$(CONFIG_MACH_DECSTATION) = dec/ +platform-$(CONFIG_MACH_DECSTATION) += dec/ cflags-$(CONFIG_MACH_DECSTATION) += \ -I$(srctree)/arch/mips/include/asm/mach-dec libs-$(CONFIG_MACH_DECSTATION) += arch/mips/dec/prom/ diff --git a/arch/mips/include/asm/fcntl.h b/arch/mips/include/asm/fcntl.h index e482fe90fe88..75eddedcfc3e 100644 --- a/arch/mips/include/asm/fcntl.h +++ b/arch/mips/include/asm/fcntl.h @@ -56,6 +56,7 @@ */ #ifdef CONFIG_32BIT +#include <linux/types.h> struct flock { short l_type; diff --git a/arch/mips/jz4740/Platform b/arch/mips/jz4740/Platform index 6a97230e3d05..ba91be9c21ef 100644 --- a/arch/mips/jz4740/Platform +++ b/arch/mips/jz4740/Platform @@ -1,3 +1,3 @@ -core-$(CONFIG_MACH_JZ4740) += arch/mips/jz4740/ +platform-$(CONFIG_MACH_JZ4740) += jz4740/ cflags-$(CONFIG_MACH_JZ4740) += -I$(srctree)/arch/mips/include/asm/mach-jz4740 load-$(CONFIG_MACH_JZ4740) += 0xffffffff80010000 diff --git a/arch/mips/kernel/branch.c b/arch/mips/kernel/branch.c index 0176ed015c89..32103cc2a257 100644 --- a/arch/mips/kernel/branch.c +++ b/arch/mips/kernel/branch.c @@ -40,7 +40,6 @@ int __compute_return_epc(struct pt_regs *regs) return -EFAULT; } - regs->regs[0] = 0; switch (insn.i_format.opcode) { /* * jr and jalr are in r_format format. diff --git a/arch/mips/kernel/mips-mt-fpaff.c b/arch/mips/kernel/mips-mt-fpaff.c index 2340f11dc29c..9a526ba6f257 100644 --- a/arch/mips/kernel/mips-mt-fpaff.c +++ b/arch/mips/kernel/mips-mt-fpaff.c @@ -103,7 +103,7 @@ asmlinkage long mipsmt_sys_sched_setaffinity(pid_t pid, unsigned int len, if (!check_same_owner(p) && !capable(CAP_SYS_NICE)) goto out_unlock; - retval = security_task_setscheduler(p, 0, NULL); + retval = security_task_setscheduler(p) if (retval) goto out_unlock; diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c index c51b95ff8644..c8777333e198 100644 --- a/arch/mips/kernel/ptrace.c +++ b/arch/mips/kernel/ptrace.c @@ -536,7 +536,7 @@ asmlinkage void do_syscall_trace(struct pt_regs *regs, int entryexit) { /* do the secure computing check first */ if (!entryexit) - secure_computing(regs->regs[0]); + secure_computing(regs->regs[2]); if (unlikely(current->audit_context) && entryexit) audit_syscall_exit(AUDITSC_RESULT(regs->regs[2]), @@ -565,7 +565,7 @@ asmlinkage void do_syscall_trace(struct pt_regs *regs, int entryexit) out: if (unlikely(current->audit_context) && !entryexit) - audit_syscall_entry(audit_arch(), regs->regs[0], + audit_syscall_entry(audit_arch(), regs->regs[2], regs->regs[4], regs->regs[5], regs->regs[6], regs->regs[7]); } diff --git a/arch/mips/kernel/scall32-o32.S b/arch/mips/kernel/scall32-o32.S index 584415eef8c9..fbaabad0e6e2 100644 --- a/arch/mips/kernel/scall32-o32.S +++ b/arch/mips/kernel/scall32-o32.S @@ -63,9 +63,9 @@ stack_done: sw t0, PT_R7(sp) # set error flag beqz t0, 1f + lw t1, PT_R2(sp) # syscall number negu v0 # error - sw v0, PT_R0(sp) # set flag for syscall - # restarting + sw t1, PT_R0(sp) # save it for syscall restarting 1: sw v0, PT_R2(sp) # result o32_syscall_exit: @@ -104,9 +104,9 @@ syscall_trace_entry: sw t0, PT_R7(sp) # set error flag beqz t0, 1f + lw t1, PT_R2(sp) # syscall number negu v0 # error - sw v0, PT_R0(sp) # set flag for syscall - # restarting + sw t1, PT_R0(sp) # save it for syscall restarting 1: sw v0, PT_R2(sp) # result j syscall_exit @@ -169,8 +169,7 @@ stackargs: * We probably should handle this case a bit more drastic. */ bad_stack: - negu v0 # error - sw v0, PT_R0(sp) + li v0, EFAULT sw v0, PT_R2(sp) li t0, 1 # set error flag sw t0, PT_R7(sp) diff --git a/arch/mips/kernel/scall64-64.S b/arch/mips/kernel/scall64-64.S index 5573f8e4e326..3f4179283207 100644 --- a/arch/mips/kernel/scall64-64.S +++ b/arch/mips/kernel/scall64-64.S @@ -66,9 +66,9 @@ NESTED(handle_sys64, PT_SIZE, sp) sd t0, PT_R7(sp) # set error flag beqz t0, 1f + ld t1, PT_R2(sp) # syscall number dnegu v0 # error - sd v0, PT_R0(sp) # set flag for syscall - # restarting + sd t1, PT_R0(sp) # save it for syscall restarting 1: sd v0, PT_R2(sp) # result n64_syscall_exit: @@ -109,8 +109,9 @@ syscall_trace_entry: sd t0, PT_R7(sp) # set error flag beqz t0, 1f + ld t1, PT_R2(sp) # syscall number dnegu v0 # error - sd v0, PT_R0(sp) # set flag for syscall restarting + sd t1, PT_R0(sp) # save it for syscall restarting 1: sd v0, PT_R2(sp) # result j syscall_exit diff --git a/arch/mips/kernel/scall64-n32.S b/arch/mips/kernel/scall64-n32.S index 1e38ec97672e..f08ece6d8acc 100644 --- a/arch/mips/kernel/scall64-n32.S +++ b/arch/mips/kernel/scall64-n32.S @@ -65,8 +65,9 @@ NESTED(handle_sysn32, PT_SIZE, sp) sd t0, PT_R7(sp) # set error flag beqz t0, 1f + ld t1, PT_R2(sp) # syscall number dnegu v0 # error - sd v0, PT_R0(sp) # set flag for syscall restarting + sd t1, PT_R0(sp) # save it for syscall restarting 1: sd v0, PT_R2(sp) # result local_irq_disable # make sure need_resched and @@ -106,8 +107,9 @@ n32_syscall_trace_entry: sd t0, PT_R7(sp) # set error flag beqz t0, 1f + ld t1, PT_R2(sp) # syscall number dnegu v0 # error - sd v0, PT_R0(sp) # set flag for syscall restarting + sd t1, PT_R0(sp) # save it for syscall restarting 1: sd v0, PT_R2(sp) # result j syscall_exit @@ -320,10 +322,10 @@ EXPORT(sysn32_call_table) PTR sys_cacheflush PTR sys_cachectl PTR sys_sysmips - PTR sys_io_setup /* 6200 */ + PTR compat_sys_io_setup /* 6200 */ PTR sys_io_destroy - PTR sys_io_getevents - PTR sys_io_submit + PTR compat_sys_io_getevents + PTR compat_sys_io_submit PTR sys_io_cancel PTR sys_exit_group /* 6205 */ PTR sys_lookup_dcookie diff --git a/arch/mips/kernel/scall64-o32.S b/arch/mips/kernel/scall64-o32.S index 171979fc98e5..78d768a3e19d 100644 --- a/arch/mips/kernel/scall64-o32.S +++ b/arch/mips/kernel/scall64-o32.S @@ -93,8 +93,9 @@ NESTED(handle_sys, PT_SIZE, sp) sd t0, PT_R7(sp) # set error flag beqz t0, 1f + ld t1, PT_R2(sp) # syscall number dnegu v0 # error - sd v0, PT_R0(sp) # flag for syscall restarting + sd t1, PT_R0(sp) # save it for syscall restarting 1: sd v0, PT_R2(sp) # result o32_syscall_exit: @@ -142,8 +143,9 @@ trace_a_syscall: sd t0, PT_R7(sp) # set error flag beqz t0, 1f + ld t1, PT_R2(sp) # syscall number dnegu v0 # error - sd v0, PT_R0(sp) # set flag for syscall restarting + sd t1, PT_R0(sp) # save it for syscall restarting 1: sd v0, PT_R2(sp) # result j syscall_exit @@ -154,8 +156,7 @@ trace_a_syscall: * The stackpointer for a call with more than 4 arguments is bad. */ bad_stack: - dnegu v0 # error - sd v0, PT_R0(sp) + li v0, EFAULT sd v0, PT_R2(sp) li t0, 1 # set error flag sd t0, PT_R7(sp) @@ -444,10 +445,10 @@ sys_call_table: PTR compat_sys_futex PTR compat_sys_sched_setaffinity PTR compat_sys_sched_getaffinity /* 4240 */ - PTR sys_io_setup + PTR compat_sys_io_setup PTR sys_io_destroy - PTR sys_io_getevents - PTR sys_io_submit + PTR compat_sys_io_getevents + PTR compat_sys_io_submit PTR sys_io_cancel /* 4245 */ PTR sys_exit_group PTR sys32_lookup_dcookie diff --git a/arch/mips/kernel/signal.c b/arch/mips/kernel/signal.c index 2099d5a4c4b7..5922342bca39 100644 --- a/arch/mips/kernel/signal.c +++ b/arch/mips/kernel/signal.c @@ -390,7 +390,6 @@ asmlinkage void sys_rt_sigreturn(nabi_no_regargs struct pt_regs regs) { struct rt_sigframe __user *frame; sigset_t set; - stack_t st; int sig; frame = (struct rt_sigframe __user *) regs.regs[29]; @@ -411,11 +410,9 @@ asmlinkage void sys_rt_sigreturn(nabi_no_regargs struct pt_regs regs) else if (sig) force_sig(sig, current); - if (__copy_from_user(&st, &frame->rs_uc.uc_stack, sizeof(st))) - goto badframe; /* It is more difficult to avoid calling this function than to call it and ignore errors. */ - do_sigaltstack((stack_t __user *)&st, NULL, regs.regs[29]); + do_sigaltstack(&frame->rs_uc.uc_stack, NULL, regs.regs[29]); /* * Don't let your children do this ... @@ -550,23 +547,26 @@ static int handle_signal(unsigned long sig, siginfo_t *info, struct mips_abi *abi = current->thread.abi; void *vdso = current->mm->context.vdso; - switch(regs->regs[0]) { - case ERESTART_RESTARTBLOCK: - case ERESTARTNOHAND: - regs->regs[2] = EINTR; - break; - case ERESTARTSYS: - if (!(ka->sa.sa_flags & SA_RESTART)) { + if (regs->regs[0]) { + switch(regs->regs[2]) { + case ERESTART_RESTARTBLOCK: + case ERESTARTNOHAND: regs->regs[2] = EINTR; break; + case ERESTARTSYS: + if (!(ka->sa.sa_flags & SA_RESTART)) { + regs->regs[2] = EINTR; + break; + } + /* fallthrough */ + case ERESTARTNOINTR: + regs->regs[7] = regs->regs[26]; + regs->regs[2] = regs->regs[0]; + regs->cp0_epc -= 4; } - /* fallthrough */ - case ERESTARTNOINTR: /* Userland will reload $v0. */ - regs->regs[7] = regs->regs[26]; - regs->cp0_epc -= 8; - } - regs->regs[0] = 0; /* Don't deal with this again. */ + regs->regs[0] = 0; /* Don't deal with this again. */ + } if (sig_uses_siginfo(ka)) ret = abi->setup_rt_frame(vdso + abi->rt_signal_return_offset, @@ -575,6 +575,9 @@ static int handle_signal(unsigned long sig, siginfo_t *info, ret = abi->setup_frame(vdso + abi->signal_return_offset, ka, regs, sig, oldset); + if (ret) + return ret; + spin_lock_irq(¤t->sighand->siglock); sigorsets(¤t->blocked, ¤t->blocked, &ka->sa.sa_mask); if (!(ka->sa.sa_flags & SA_NODEFER)) @@ -622,17 +625,13 @@ static void do_signal(struct pt_regs *regs) return; } - /* - * Who's code doesn't conform to the restartable syscall convention - * dies here!!! The li instruction, a single machine instruction, - * must directly be followed by the syscall instruction. - */ if (regs->regs[0]) { if (regs->regs[2] == ERESTARTNOHAND || regs->regs[2] == ERESTARTSYS || regs->regs[2] == ERESTARTNOINTR) { + regs->regs[2] = regs->regs[0]; regs->regs[7] = regs->regs[26]; - regs->cp0_epc -= 8; + regs->cp0_epc -= 4; } if (regs->regs[2] == ERESTART_RESTARTBLOCK) { regs->regs[2] = current->thread.abi->restart; diff --git a/arch/mips/kernel/signal_n32.c b/arch/mips/kernel/signal_n32.c index 2c5df818c65a..ee24d814d5b9 100644 --- a/arch/mips/kernel/signal_n32.c +++ b/arch/mips/kernel/signal_n32.c @@ -109,6 +109,7 @@ asmlinkage int sysn32_rt_sigsuspend(nabi_no_regargs struct pt_regs regs) asmlinkage void sysn32_rt_sigreturn(nabi_no_regargs struct pt_regs regs) { struct rt_sigframe_n32 __user *frame; + mm_segment_t old_fs; sigset_t set; stack_t st; s32 sp; @@ -143,7 +144,11 @@ asmlinkage void sysn32_rt_sigreturn(nabi_no_regargs struct pt_regs regs) /* It is more difficult to avoid calling this function than to call it and ignore errors. */ + old_fs = get_fs(); + set_fs(KERNEL_DS); do_sigaltstack((stack_t __user *)&st, NULL, regs.regs[29]); + set_fs(old_fs); + /* * Don't let your children do this ... diff --git a/arch/mips/kernel/unaligned.c b/arch/mips/kernel/unaligned.c index 69b039ca8d83..33d5a5ce4a29 100644 --- a/arch/mips/kernel/unaligned.c +++ b/arch/mips/kernel/unaligned.c @@ -109,8 +109,6 @@ static void emulate_load_store_insn(struct pt_regs *regs, unsigned long value; unsigned int res; - regs->regs[0] = 0; - /* * This load never faults. */ diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index 907417d187e1..79a04a9394d5 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -16,6 +16,7 @@ config PARISC select RTC_DRV_GENERIC select INIT_ALL_POSSIBLE select BUG + select HAVE_IRQ_WORK select HAVE_PERF_EVENTS select GENERIC_ATOMIC64 if !64BIT help diff --git a/arch/parisc/include/asm/perf_event.h b/arch/parisc/include/asm/perf_event.h index cc146427d8f9..1e0fd8ba6c03 100644 --- a/arch/parisc/include/asm/perf_event.h +++ b/arch/parisc/include/asm/perf_event.h @@ -1,7 +1,6 @@ #ifndef __ASM_PARISC_PERF_EVENT_H #define __ASM_PARISC_PERF_EVENT_H -/* parisc only supports software events through this interface. */ -static inline void set_perf_event_pending(void) { } +/* Empty, just to avoid compiling error */ #endif /* __ASM_PARISC_PERF_EVENT_H */ diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 631e5a0fb6ab..4b1e521d966f 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -138,6 +138,7 @@ config PPC select HAVE_OPROFILE select HAVE_SYSCALL_WRAPPERS if PPC64 select GENERIC_ATOMIC64 if PPC32 + select HAVE_IRQ_WORK select HAVE_PERF_EVENTS select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_HW_BREAKPOINT if PERF_EVENTS && PPC_BOOK3S_64 diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index 1ff6662f7faf..9b287fdd8ea3 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -129,7 +129,7 @@ struct paca_struct { u8 soft_enabled; /* irq soft-enable flag */ u8 hard_enabled; /* set if irqs are enabled in MSR */ u8 io_sync; /* writel() needs spin_unlock sync */ - u8 perf_event_pending; /* PM interrupt while soft-disabled */ + u8 irq_work_pending; /* IRQ_WORK interrupt while soft-disable */ /* Stuff for accurate time accounting */ u64 user_time; /* accumulated usermode TB ticks */ diff --git a/arch/powerpc/kernel/perf_callchain.c b/arch/powerpc/kernel/perf_callchain.c index 95ad9dad298e..d05ae4204bbf 100644 --- a/arch/powerpc/kernel/perf_callchain.c +++ b/arch/powerpc/kernel/perf_callchain.c @@ -23,18 +23,6 @@ #include "ppc32.h" #endif -/* - * Store another value in a callchain_entry. - */ -static inline void callchain_store(struct perf_callchain_entry *entry, u64 ip) -{ - unsigned int nr = entry->nr; - - if (nr < PERF_MAX_STACK_DEPTH) { - entry->ip[nr] = ip; - entry->nr = nr + 1; - } -} /* * Is sp valid as the address of the next kernel stack frame after prev_sp? @@ -58,8 +46,8 @@ static int valid_next_sp(unsigned long sp, unsigned long prev_sp) return 0; } -static void perf_callchain_kernel(struct pt_regs *regs, - struct perf_callchain_entry *entry) +void +perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs) { unsigned long sp, next_sp; unsigned long next_ip; @@ -69,8 +57,7 @@ static void perf_callchain_kernel(struct pt_regs *regs, lr = regs->link; sp = regs->gpr[1]; - callchain_store(entry, PERF_CONTEXT_KERNEL); - callchain_store(entry, regs->nip); + perf_callchain_store(entry, regs->nip); if (!validate_sp(sp, current, STACK_FRAME_OVERHEAD)) return; @@ -89,7 +76,7 @@ static void perf_callchain_kernel(struct pt_regs *regs, next_ip = regs->nip; lr = regs->link; level = 0; - callchain_store(entry, PERF_CONTEXT_KERNEL); + perf_callchain_store(entry, PERF_CONTEXT_KERNEL); } else { if (level == 0) @@ -111,7 +98,7 @@ static void perf_callchain_kernel(struct pt_regs *regs, ++level; } - callchain_store(entry, next_ip); + perf_callchain_store(entry, next_ip); if (!valid_next_sp(next_sp, sp)) return; sp = next_sp; @@ -233,8 +220,8 @@ static int sane_signal_64_frame(unsigned long sp) puc == (unsigned long) &sf->uc; } -static void perf_callchain_user_64(struct pt_regs *regs, - struct perf_callchain_entry *entry) +static void perf_callchain_user_64(struct perf_callchain_entry *entry, + struct pt_regs *regs) { unsigned long sp, next_sp; unsigned long next_ip; @@ -246,8 +233,7 @@ static void perf_callchain_user_64(struct pt_regs *regs, next_ip = regs->nip; lr = regs->link; sp = regs->gpr[1]; - callchain_store(entry, PERF_CONTEXT_USER); - callchain_store(entry, next_ip); + perf_callchain_store(entry, next_ip); for (;;) { fp = (unsigned long __user *) sp; @@ -276,14 +262,14 @@ static void perf_callchain_user_64(struct pt_regs *regs, read_user_stack_64(&uregs[PT_R1], &sp)) return; level = 0; - callchain_store(entry, PERF_CONTEXT_USER); - callchain_store(entry, next_ip); + perf_callchain_store(entry, PERF_CONTEXT_USER); + perf_callchain_store(entry, next_ip); continue; } if (level == 0) next_ip = lr; - callchain_store(entry, next_ip); + perf_callchain_store(entry, next_ip); ++level; sp = next_sp; } @@ -315,8 +301,8 @@ static int read_user_stack_32(unsigned int __user *ptr, unsigned int *ret) return __get_user_inatomic(*ret, ptr); } -static inline void perf_callchain_user_64(struct pt_regs *regs, - struct perf_callchain_entry *entry) +static inline void perf_callchain_user_64(struct perf_callchain_entry *entry, + struct pt_regs *regs) { } @@ -435,8 +421,8 @@ static unsigned int __user *signal_frame_32_regs(unsigned int sp, return mctx->mc_gregs; } -static void perf_callchain_user_32(struct pt_regs *regs, - struct perf_callchain_entry *entry) +static void perf_callchain_user_32(struct perf_callchain_entry *entry, + struct pt_regs *regs) { unsigned int sp, next_sp; unsigned int next_ip; @@ -447,8 +433,7 @@ static void perf_callchain_user_32(struct pt_regs *regs, next_ip = regs->nip; lr = regs->link; sp = regs->gpr[1]; - callchain_store(entry, PERF_CONTEXT_USER); - callchain_store(entry, next_ip); + perf_callchain_store(entry, next_ip); while (entry->nr < PERF_MAX_STACK_DEPTH) { fp = (unsigned int __user *) (unsigned long) sp; @@ -470,45 +455,24 @@ static void perf_callchain_user_32(struct pt_regs *regs, read_user_stack_32(&uregs[PT_R1], &sp)) return; level = 0; - callchain_store(entry, PERF_CONTEXT_USER); - callchain_store(entry, next_ip); + perf_callchain_store(entry, PERF_CONTEXT_USER); + perf_callchain_store(entry, next_ip); continue; } if (level == 0) next_ip = lr; - callchain_store(entry, next_ip); + perf_callchain_store(entry, next_ip); ++level; sp = next_sp; } } -/* - * Since we can't get PMU interrupts inside a PMU interrupt handler, - * we don't need separate irq and nmi entries here. - */ -static DEFINE_PER_CPU(struct perf_callchain_entry, cpu_perf_callchain); - -struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) +void +perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) { - struct perf_callchain_entry *entry = &__get_cpu_var(cpu_perf_callchain); - - entry->nr = 0; - - if (!user_mode(regs)) { - perf_callchain_kernel(regs, entry); - if (current->mm) - regs = task_pt_regs(current); - else - regs = NULL; - } - - if (regs) { - if (current_is_64bit()) - perf_callchain_user_64(regs, entry); - else - perf_callchain_user_32(regs, entry); - } - - return entry; + if (current_is_64bit()) + perf_callchain_user_64(entry, regs); + else + perf_callchain_user_32(entry, regs); } diff --git a/arch/powerpc/kernel/perf_event.c b/arch/powerpc/kernel/perf_event.c index d301a30445e0..3129c855933c 100644 --- a/arch/powerpc/kernel/perf_event.c +++ b/arch/powerpc/kernel/perf_event.c @@ -402,6 +402,9 @@ static void power_pmu_read(struct perf_event *event) { s64 val, delta, prev; + if (event->hw.state & PERF_HES_STOPPED) + return; + if (!event->hw.idx) return; /* @@ -517,7 +520,7 @@ static void write_mmcr0(struct cpu_hw_events *cpuhw, unsigned long mmcr0) * Disable all events to prevent PMU interrupts and to allow * events to be added or removed. */ -void hw_perf_disable(void) +static void power_pmu_disable(struct pmu *pmu) { struct cpu_hw_events *cpuhw; unsigned long flags; @@ -565,7 +568,7 @@ void hw_perf_disable(void) * If we were previously disabled and events were added, then * put the new config on the PMU. */ -void hw_perf_enable(void) +static void power_pmu_enable(struct pmu *pmu) { struct perf_event *event; struct cpu_hw_events *cpuhw; @@ -672,6 +675,8 @@ void hw_perf_enable(void) } local64_set(&event->hw.prev_count, val); event->hw.idx = idx; + if (event->hw.state & PERF_HES_STOPPED) + val = 0; write_pmc(idx, val); perf_event_update_userpage(event); } @@ -727,7 +732,7 @@ static int collect_events(struct perf_event *group, int max_count, * re-enable the PMU in order to get hw_perf_enable to do the * actual work of reconfiguring the PMU. */ -static int power_pmu_enable(struct perf_event *event) +static int power_pmu_add(struct perf_event *event, int ef_flags) { struct cpu_hw_events *cpuhw; unsigned long flags; @@ -735,7 +740,7 @@ static int power_pmu_enable(struct perf_event *event) int ret = -EAGAIN; local_irq_save(flags); - perf_disable(); + perf_pmu_disable(event->pmu); /* * Add the event to the list (if there is room) @@ -749,6 +754,9 @@ static int power_pmu_enable(struct perf_event *event) cpuhw->events[n0] = event->hw.config; cpuhw->flags[n0] = event->hw.event_base; + if (!(ef_flags & PERF_EF_START)) + event->hw.state = PERF_HES_STOPPED | PERF_HES_UPTODATE; + /* * If group events scheduling transaction was started, * skip the schedulability test here, it will be peformed @@ -769,7 +777,7 @@ nocheck: ret = 0; out: - perf_enable(); + perf_pmu_enable(event->pmu); local_irq_restore(flags); return ret; } @@ -777,14 +785,14 @@ nocheck: /* * Remove a event from the PMU. */ -static void power_pmu_disable(struct perf_event *event) +static void power_pmu_del(struct perf_event *event, int ef_flags) { struct cpu_hw_events *cpuhw; long i; unsigned long flags; local_irq_save(flags); - perf_disable(); + perf_pmu_disable(event->pmu); power_pmu_read(event); @@ -821,34 +829,60 @@ static void power_pmu_disable(struct perf_event *event) cpuhw->mmcr[0] &= ~(MMCR0_PMXE | MMCR0_FCECE); } - perf_enable(); + perf_pmu_enable(event->pmu); local_irq_restore(flags); } /* - * Re-enable interrupts on a event after they were throttled - * because they were coming too fast. + * POWER-PMU does not support disabling individual counters, hence + * program their cycle counter to their max value and ignore the interrupts. */ -static void power_pmu_unthrottle(struct perf_event *event) + +static void power_pmu_start(struct perf_event *event, int ef_flags) +{ + unsigned long flags; + s64 left; + + if (!event->hw.idx || !event->hw.sample_period) + return; + + if (!(event->hw.state & PERF_HES_STOPPED)) + return; + + if (ef_flags & PERF_EF_RELOAD) + WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE)); + + local_irq_save(flags); + perf_pmu_disable(event->pmu); + + event->hw.state = 0; + left = local64_read(&event->hw.period_left); + write_pmc(event->hw.idx, left); + + perf_event_update_userpage(event); + perf_pmu_enable(event->pmu); + local_irq_restore(flags); +} + +static void power_pmu_stop(struct perf_event *event, int ef_flags) { - s64 val, left; unsigned long flags; if (!event->hw.idx || !event->hw.sample_period) return; + + if (event->hw.state & PERF_HES_STOPPED) + return; + local_irq_save(flags); - perf_disable(); + perf_pmu_disable(event->pmu); + power_pmu_read(event); - left = event->hw.sample_period; - event->hw.last_period = left; - val = 0; - if (left < 0x80000000L) - val = 0x80000000L - left; - write_pmc(event->hw.idx, val); - local64_set(&event->hw.prev_count, val); - local64_set(&event->hw.period_left, left); + event->hw.state |= PERF_HES_STOPPED | PERF_HES_UPTODATE; + write_pmc(event->hw.idx, 0); + perf_event_update_userpage(event); - perf_enable(); + perf_pmu_enable(event->pmu); local_irq_restore(flags); } @@ -857,10 +891,11 @@ static void power_pmu_unthrottle(struct perf_event *event) * Set the flag to make pmu::enable() not perform the * schedulability test, it will be performed at commit time */ -void power_pmu_start_txn(const struct pmu *pmu) +void power_pmu_start_txn(struct pmu *pmu) { struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events); + perf_pmu_disable(pmu); cpuhw->group_flag |= PERF_EVENT_TXN; cpuhw->n_txn_start = cpuhw->n_events; } @@ -870,11 +905,12 @@ void power_pmu_start_txn(const struct pmu *pmu) * Clear the flag and pmu::enable() will perform the * schedulability test. */ -void power_pmu_cancel_txn(const struct pmu *pmu) +void power_pmu_cancel_txn(struct pmu *pmu) { struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events); cpuhw->group_flag &= ~PERF_EVENT_TXN; + perf_pmu_enable(pmu); } /* @@ -882,7 +918,7 @@ void power_pmu_cancel_txn(const struct pmu *pmu) * Perform the group schedulability test as a whole * Return 0 if success */ -int power_pmu_commit_txn(const struct pmu *pmu) +int power_pmu_commit_txn(struct pmu *pmu) { struct cpu_hw_events *cpuhw; long i, n; @@ -901,19 +937,10 @@ int power_pmu_commit_txn(const struct pmu *pmu) cpuhw->event[i]->hw.config = cpuhw->events[i]; cpuhw->group_flag &= ~PERF_EVENT_TXN; + perf_pmu_enable(pmu); return 0; } -struct pmu power_pmu = { - .enable = power_pmu_enable, - .disable = power_pmu_disable, - .read = power_pmu_read, - .unthrottle = power_pmu_unthrottle, - .start_txn = power_pmu_start_txn, - .cancel_txn = power_pmu_cancel_txn, - .commit_txn = power_pmu_commit_txn, -}; - /* * Return 1 if we might be able to put event on a limited PMC, * or 0 if not. @@ -1014,7 +1041,7 @@ static int hw_perf_cache_event(u64 config, u64 *eventp) return 0; } -const struct pmu *hw_perf_event_init(struct perf_event *event) +static int power_pmu_event_init(struct perf_event *event) { u64 ev; unsigned long flags; @@ -1026,25 +1053,27 @@ const struct pmu *hw_perf_event_init(struct perf_event *event) struct cpu_hw_events *cpuhw; if (!ppmu) - return ERR_PTR(-ENXIO); + return -ENOENT; + switch (event->attr.type) { case PERF_TYPE_HARDWARE: ev = event->attr.config; if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0) - return ERR_PTR(-EOPNOTSUPP); + return -EOPNOTSUPP; ev = ppmu->generic_events[ev]; break; case PERF_TYPE_HW_CACHE: err = hw_perf_cache_event(event->attr.config, &ev); if (err) - return ERR_PTR(err); + return err; break; case PERF_TYPE_RAW: ev = event->attr.config; break; default: - return ERR_PTR(-EINVAL); + return -ENOENT; } + event->hw.config_base = ev; event->hw.idx = 0; @@ -1063,7 +1092,7 @@ const struct pmu *hw_perf_event_init(struct perf_event *event) * XXX we should check if the task is an idle task. */ flags = 0; - if (event->ctx->task) + if (event->attach_state & PERF_ATTACH_TASK) flags |= PPMU_ONLY_COUNT_RUN; /* @@ -1081,7 +1110,7 @@ const struct pmu *hw_perf_event_init(struct perf_event *event) */ ev = normal_pmc_alternative(ev, flags); if (!ev) - return ERR_PTR(-EINVAL); + return -EINVAL; } } @@ -1095,19 +1124,19 @@ const struct pmu *hw_perf_event_init(struct perf_event *event) n = collect_events(event->group_leader, ppmu->n_counter - 1, ctrs, events, cflags); if (n < 0) - return ERR_PTR(-EINVAL); + return -EINVAL; } events[n] = ev; ctrs[n] = event; cflags[n] = flags; if (check_excludes(ctrs, cflags, n, 1)) - return ERR_PTR(-EINVAL); + return -EINVAL; cpuhw = &get_cpu_var(cpu_hw_events); err = power_check_constraints(cpuhw, events, cflags, n + 1); put_cpu_var(cpu_hw_events); if (err) - return ERR_PTR(-EINVAL); + return -EINVAL; event->hw.config = events[n]; event->hw.event_base = cflags[n]; @@ -1132,11 +1161,23 @@ const struct pmu *hw_perf_event_init(struct perf_event *event) } event->destroy = hw_perf_event_destroy; - if (err) - return ERR_PTR(err); - return &power_pmu; + return err; } +struct pmu power_pmu = { + .pmu_enable = power_pmu_enable, + .pmu_disable = power_pmu_disable, + .event_init = power_pmu_event_init, + .add = power_pmu_add, + .del = power_pmu_del, + .start = power_pmu_start, + .stop = power_pmu_stop, + .read = power_pmu_read, + .start_txn = power_pmu_start_txn, + .cancel_txn = power_pmu_cancel_txn, + .commit_txn = power_pmu_commit_txn, +}; + /* * A counter has overflowed; update its count and record * things if requested. Note that interrupts are hard-disabled @@ -1149,6 +1190,11 @@ static void record_and_restart(struct perf_event *event, unsigned long val, s64 prev, delta, left; int record = 0; + if (event->hw.state & PERF_HES_STOPPED) { + write_pmc(event->hw.idx, 0); + return; + } + /* we don't have to worry about interrupts here */ prev = local64_read(&event->hw.prev_count); delta = (val - prev) & 0xfffffffful; @@ -1171,6 +1217,11 @@ static void record_and_restart(struct perf_event *event, unsigned long val, val = 0x80000000LL - left; } + write_pmc(event->hw.idx, val); + local64_set(&event->hw.prev_count, val); + local64_set(&event->hw.period_left, left); + perf_event_update_userpage(event); + /* * Finally record data if requested. */ @@ -1183,23 +1234,9 @@ static void record_and_restart(struct perf_event *event, unsigned long val, if (event->attr.sample_type & PERF_SAMPLE_ADDR) perf_get_data_addr(regs, &data.addr); - if (perf_event_overflow(event, nmi, &data, regs)) { - /* - * Interrupts are coming too fast - throttle them - * by setting the event to 0, so it will be - * at least 2^30 cycles until the next interrupt - * (assuming each event counts at most 2 counts - * per cycle). - */ - val = 0; - left = ~0ULL >> 1; - } + if (perf_event_overflow(event, nmi, &data, regs)) + power_pmu_stop(event, 0); } - - write_pmc(event->hw.idx, val); - local64_set(&event->hw.prev_count, val); - local64_set(&event->hw.period_left, left); - perf_event_update_userpage(event); } /* @@ -1342,6 +1379,7 @@ int register_power_pmu(struct power_pmu *pmu) freeze_events_kernel = MMCR0_FCHV; #endif /* CONFIG_PPC64 */ + perf_pmu_register(&power_pmu); perf_cpu_notifier(power_pmu_notifier); return 0; diff --git a/arch/powerpc/kernel/perf_event_fsl_emb.c b/arch/powerpc/kernel/perf_event_fsl_emb.c index 1ba45471ae43..7ecca59ddf77 100644 --- a/arch/powerpc/kernel/perf_event_fsl_emb.c +++ b/arch/powerpc/kernel/perf_event_fsl_emb.c @@ -156,6 +156,9 @@ static void fsl_emb_pmu_read(struct perf_event *event) { s64 val, delta, prev; + if (event->hw.state & PERF_HES_STOPPED) + return; + /* * Performance monitor interrupts come even when interrupts * are soft-disabled, as long as interrupts are hard-enabled. @@ -177,7 +180,7 @@ static void fsl_emb_pmu_read(struct perf_event *event) * Disable all events to prevent PMU interrupts and to allow * events to be added or removed. */ -void hw_perf_disable(void) +static void fsl_emb_pmu_disable(struct pmu *pmu) { struct cpu_hw_events *cpuhw; unsigned long flags; @@ -216,7 +219,7 @@ void hw_perf_disable(void) * If we were previously disabled and events were added, then * put the new config on the PMU. */ -void hw_perf_enable(void) +static void fsl_emb_pmu_enable(struct pmu *pmu) { struct cpu_hw_events *cpuhw; unsigned long flags; @@ -262,8 +265,8 @@ static int collect_events(struct perf_event *group, int max_count, return n; } -/* perf must be disabled, context locked on entry */ -static int fsl_emb_pmu_enable(struct perf_event *event) +/* context locked on entry */ +static int fsl_emb_pmu_add(struct perf_event *event, int flags) { struct cpu_hw_events *cpuhw; int ret = -EAGAIN; @@ -271,6 +274,7 @@ static int fsl_emb_pmu_enable(struct perf_event *event) u64 val; int i; + perf_pmu_disable(event->pmu); cpuhw = &get_cpu_var(cpu_hw_events); if (event->hw.config & FSL_EMB_EVENT_RESTRICTED) @@ -301,6 +305,12 @@ static int fsl_emb_pmu_enable(struct perf_event *event) val = 0x80000000L - left; } local64_set(&event->hw.prev_count, val); + + if (!(flags & PERF_EF_START)) { + event->hw.state = PERF_HES_STOPPED | PERF_HES_UPTODATE; + val = 0; + } + write_pmc(i, val); perf_event_update_userpage(event); @@ -310,15 +320,17 @@ static int fsl_emb_pmu_enable(struct perf_event *event) ret = 0; out: put_cpu_var(cpu_hw_events); + perf_pmu_enable(event->pmu); return ret; } -/* perf must be disabled, context locked on entry */ -static void fsl_emb_pmu_disable(struct perf_event *event) +/* context locked on entry */ +static void fsl_emb_pmu_del(struct perf_event *event, int flags) { struct cpu_hw_events *cpuhw; int i = event->hw.idx; + perf_pmu_disable(event->pmu); if (i < 0) goto out; @@ -346,44 +358,57 @@ static void fsl_emb_pmu_disable(struct perf_event *event) cpuhw->n_events--; out: + perf_pmu_enable(event->pmu); put_cpu_var(cpu_hw_events); } -/* - * Re-enable interrupts on a event after they were throttled - * because they were coming too fast. - * - * Context is locked on entry, but perf is not disabled. - */ -static void fsl_emb_pmu_unthrottle(struct perf_event *event) +static void fsl_emb_pmu_start(struct perf_event *event, int ef_flags) { - s64 val, left; unsigned long flags; + s64 left; if (event->hw.idx < 0 || !event->hw.sample_period) return; + + if (!(event->hw.state & PERF_HES_STOPPED)) + return; + + if (ef_flags & PERF_EF_RELOAD) + WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE)); + local_irq_save(flags); - perf_disable(); - fsl_emb_pmu_read(event); - left = event->hw.sample_period; - event->hw.last_period = left; - val = 0; - if (left < 0x80000000L) - val = 0x80000000L - left; - write_pmc(event->hw.idx, val); - local64_set(&event->hw.prev_count, val); - local64_set(&event->hw.period_left, left); + perf_pmu_disable(event->pmu); + + event->hw.state = 0; + left = local64_read(&event->hw.period_left); + write_pmc(event->hw.idx, left); + perf_event_update_userpage(event); - perf_enable(); + perf_pmu_enable(event->pmu); local_irq_restore(flags); } -static struct pmu fsl_emb_pmu = { - .enable = fsl_emb_pmu_enable, - .disable = fsl_emb_pmu_disable, - .read = fsl_emb_pmu_read, - .unthrottle = fsl_emb_pmu_unthrottle, -}; +static void fsl_emb_pmu_stop(struct perf_event *event, int ef_flags) +{ + unsigned long flags; + + if (event->hw.idx < 0 || !event->hw.sample_period) + return; + + if (event->hw.state & PERF_HES_STOPPED) + return; + + local_irq_save(flags); + perf_pmu_disable(event->pmu); + + fsl_emb_pmu_read(event); + event->hw.state |= PERF_HES_STOPPED | PERF_HES_UPTODATE; + write_pmc(event->hw.idx, 0); + + perf_event_update_userpage(event); + perf_pmu_enable(event->pmu); + local_irq_restore(flags); +} /* * Release the PMU if this is the last perf_event. @@ -428,7 +453,7 @@ static int hw_perf_cache_event(u64 config, u64 *eventp) return 0; } -const struct pmu *hw_perf_event_init(struct perf_event *event) +static int fsl_emb_pmu_event_init(struct perf_event *event) { u64 ev; struct perf_event *events[MAX_HWEVENTS]; @@ -441,14 +466,14 @@ const struct pmu *hw_perf_event_init(struct perf_event *event) case PERF_TYPE_HARDWARE: ev = event->attr.config; if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0) - return ERR_PTR(-EOPNOTSUPP); + return -EOPNOTSUPP; ev = ppmu->generic_events[ev]; break; case PERF_TYPE_HW_CACHE: err = hw_perf_cache_event(event->attr.config, &ev); if (err) - return ERR_PTR(err); + return err; break; case PERF_TYPE_RAW: @@ -456,12 +481,12 @@ const struct pmu *hw_perf_event_init(struct perf_event *event) break; default: - return ERR_PTR(-EINVAL); + return -ENOENT; } event->hw.config = ppmu->xlate_event(ev); if (!(event->hw.config & FSL_EMB_EVENT_VALID)) - return ERR_PTR(-EINVAL); + return -EINVAL; /* * If this is in a group, check if it can go on with all the @@ -473,7 +498,7 @@ const struct pmu *hw_perf_event_init(struct perf_event *event) n = collect_events(event->group_leader, ppmu->n_counter - 1, events); if (n < 0) - return ERR_PTR(-EINVAL); + return -EINVAL; } if (event->hw.config & FSL_EMB_EVENT_RESTRICTED) { @@ -484,7 +509,7 @@ const struct pmu *hw_perf_event_init(struct perf_event *event) } if (num_restricted >= ppmu->n_restricted) - return ERR_PTR(-EINVAL); + return -EINVAL; } event->hw.idx = -1; @@ -497,7 +522,7 @@ const struct pmu *hw_perf_event_init(struct perf_event *event) if (event->attr.exclude_kernel) event->hw.config_base |= PMLCA_FCS; if (event->attr.exclude_idle) - return ERR_PTR(-ENOTSUPP); + return -ENOTSUPP; event->hw.last_period = event->hw.sample_period; local64_set(&event->hw.period_left, event->hw.last_period); @@ -523,11 +548,20 @@ const struct pmu *hw_perf_event_init(struct perf_event *event) } event->destroy = hw_perf_event_destroy; - if (err) - return ERR_PTR(err); - return &fsl_emb_pmu; + return err; } +static struct pmu fsl_emb_pmu = { + .pmu_enable = fsl_emb_pmu_enable, + .pmu_disable = fsl_emb_pmu_disable, + .event_init = fsl_emb_pmu_event_init, + .add = fsl_emb_pmu_add, + .del = fsl_emb_pmu_del, + .start = fsl_emb_pmu_start, + .stop = fsl_emb_pmu_stop, + .read = fsl_emb_pmu_read, +}; + /* * A counter has overflowed; update its count and record * things if requested. Note that interrupts are hard-disabled @@ -540,6 +574,11 @@ static void record_and_restart(struct perf_event *event, unsigned long val, s64 prev, delta, left; int record = 0; + if (event->hw.state & PERF_HES_STOPPED) { + write_pmc(event->hw.idx, 0); + return; + } + /* we don't have to worry about interrupts here */ prev = local64_read(&event->hw.prev_count); delta = (val - prev) & 0xfffffffful; @@ -562,6 +601,11 @@ static void record_and_restart(struct perf_event *event, unsigned long val, val = 0x80000000LL - left; } + write_pmc(event->hw.idx, val); + local64_set(&event->hw.prev_count, val); + local64_set(&event->hw.period_left, left); + perf_event_update_userpage(event); + /* * Finally record data if requested. */ @@ -571,23 +615,9 @@ static void record_and_restart(struct perf_event *event, unsigned long val, perf_sample_data_init(&data, 0); data.period = event->hw.last_period; - if (perf_event_overflow(event, nmi, &data, regs)) { - /* - * Interrupts are coming too fast - throttle them - * by setting the event to 0, so it will be - * at least 2^30 cycles until the next interrupt - * (assuming each event counts at most 2 counts - * per cycle). - */ - val = 0; - left = ~0ULL >> 1; - } + if (perf_event_overflow(event, nmi, &data, regs)) + fsl_emb_pmu_stop(event, 0); } - - write_pmc(event->hw.idx, val); - local64_set(&event->hw.prev_count, val); - local64_set(&event->hw.period_left, left); - perf_event_update_userpage(event); } static void perf_event_interrupt(struct pt_regs *regs) @@ -651,5 +681,7 @@ int register_fsl_emb_pmu(struct fsl_emb_pmu *pmu) pr_info("%s performance monitor hardware support registered\n", pmu->name); + perf_pmu_register(&fsl_emb_pmu); + return 0; } diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 8533b3b83f5d..54888eb10c3b 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -53,7 +53,7 @@ #include <linux/posix-timers.h> #include <linux/irq.h> #include <linux/delay.h> -#include <linux/perf_event.h> +#include <linux/irq_work.h> #include <asm/trace.h> #include <asm/io.h> @@ -493,60 +493,60 @@ void __init iSeries_time_init_early(void) } #endif /* CONFIG_PPC_ISERIES */ -#ifdef CONFIG_PERF_EVENTS +#ifdef CONFIG_IRQ_WORK /* * 64-bit uses a byte in the PACA, 32-bit uses a per-cpu variable... */ #ifdef CONFIG_PPC64 -static inline unsigned long test_perf_event_pending(void) +static inline unsigned long test_irq_work_pending(void) { unsigned long x; asm volatile("lbz %0,%1(13)" : "=r" (x) - : "i" (offsetof(struct paca_struct, perf_event_pending))); + : "i" (offsetof(struct paca_struct, irq_work_pending))); return x; } -static inline void set_perf_event_pending_flag(void) +static inline void set_irq_work_pending_flag(void) { asm volatile("stb %0,%1(13)" : : "r" (1), - "i" (offsetof(struct paca_struct, perf_event_pending))); + "i" (offsetof(struct paca_struct, irq_work_pending))); } -static inline void clear_perf_event_pending(void) +static inline void clear_irq_work_pending(void) { asm volatile("stb %0,%1(13)" : : "r" (0), - "i" (offsetof(struct paca_struct, perf_event_pending))); + "i" (offsetof(struct paca_struct, irq_work_pending))); } #else /* 32-bit */ -DEFINE_PER_CPU(u8, perf_event_pending); +DEFINE_PER_CPU(u8, irq_work_pending); -#define set_perf_event_pending_flag() __get_cpu_var(perf_event_pending) = 1 -#define test_perf_event_pending() __get_cpu_var(perf_event_pending) -#define clear_perf_event_pending() __get_cpu_var(perf_event_pending) = 0 +#define set_irq_work_pending_flag() __get_cpu_var(irq_work_pending) = 1 +#define test_irq_work_pending() __get_cpu_var(irq_work_pending) +#define clear_irq_work_pending() __get_cpu_var(irq_work_pending) = 0 #endif /* 32 vs 64 bit */ -void set_perf_event_pending(void) +void set_irq_work_pending(void) { preempt_disable(); - set_perf_event_pending_flag(); + set_irq_work_pending_flag(); set_dec(1); preempt_enable(); } -#else /* CONFIG_PERF_EVENTS */ +#else /* CONFIG_IRQ_WORK */ -#define test_perf_event_pending() 0 -#define clear_perf_event_pending() +#define test_irq_work_pending() 0 +#define clear_irq_work_pending() -#endif /* CONFIG_PERF_EVENTS */ +#endif /* CONFIG_IRQ_WORK */ /* * For iSeries shared processors, we have to let the hypervisor @@ -587,9 +587,9 @@ void timer_interrupt(struct pt_regs * regs) calculate_steal_time(); - if (test_perf_event_pending()) { - clear_perf_event_pending(); - perf_event_do_pending(); + if (test_irq_work_pending()) { + clear_irq_work_pending(); + irq_work_run(); } #ifdef CONFIG_PPC_ISERIES diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 74a2f1b607a4..75976a141947 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -95,6 +95,7 @@ config S390 select HAVE_KVM if 64BIT select HAVE_ARCH_TRACEHOOK select INIT_ALL_POSSIBLE + select HAVE_IRQ_WORK select HAVE_PERF_EVENTS select HAVE_KERNEL_GZIP select HAVE_KERNEL_BZIP2 diff --git a/arch/s390/include/asm/hardirq.h b/arch/s390/include/asm/hardirq.h index 498bc3892385..881d94590aeb 100644 --- a/arch/s390/include/asm/hardirq.h +++ b/arch/s390/include/asm/hardirq.h @@ -12,10 +12,6 @@ #ifndef __ASM_HARDIRQ_H #define __ASM_HARDIRQ_H -#include <linux/threads.h> -#include <linux/sched.h> -#include <linux/cache.h> -#include <linux/interrupt.h> #include <asm/lowcore.h> #define local_softirq_pending() (S390_lowcore.softirq_pending) diff --git a/arch/s390/include/asm/perf_event.h b/arch/s390/include/asm/perf_event.h index 3840cbe77637..a75f168d2718 100644 --- a/arch/s390/include/asm/perf_event.h +++ b/arch/s390/include/asm/perf_event.h @@ -4,7 +4,6 @@ * Copyright 2009 Martin Schwidefsky, IBM Corporation. */ -static inline void set_perf_event_pending(void) {} -static inline void clear_perf_event_pending(void) {} +/* Empty, just to avoid compiling error */ #define PERF_EVENT_INDEX_OFFSET 0 diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 33990fa95af0..35b6879628a0 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -16,6 +16,7 @@ config SUPERH select HAVE_ARCH_TRACEHOOK select HAVE_DMA_API_DEBUG select HAVE_DMA_ATTRS + select HAVE_IRQ_WORK select HAVE_PERF_EVENTS select PERF_USE_VMALLOC select HAVE_KERNEL_GZIP @@ -249,6 +250,11 @@ config ARCH_SHMOBILE select PM select PM_RUNTIME +config CPU_HAS_PMU + depends on CPU_SH4 || CPU_SH4A + default y + bool + if SUPERH32 choice @@ -738,6 +744,14 @@ config GUSA_RB LLSC, this should be more efficient than the other alternative of disabling interrupts around the atomic sequence. +config HW_PERF_EVENTS + bool "Enable hardware performance counter support for perf events" + depends on PERF_EVENTS && CPU_HAS_PMU + default y + help + Enable hardware performance counter support for perf events. If + disabled, perf events will use software events only. + source "drivers/sh/Kconfig" endmenu diff --git a/arch/sh/include/asm/perf_event.h b/arch/sh/include/asm/perf_event.h index 3d0c9f36d150..14308bed7ea5 100644 --- a/arch/sh/include/asm/perf_event.h +++ b/arch/sh/include/asm/perf_event.h @@ -26,11 +26,4 @@ extern int register_sh_pmu(struct sh_pmu *); extern int reserve_pmc_hardware(void); extern void release_pmc_hardware(void); -static inline void set_perf_event_pending(void) -{ - /* Nothing to see here, move along. */ -} - -#define PERF_EVENT_INDEX_OFFSET 0 - #endif /* __ASM_SH_PERF_EVENT_H */ diff --git a/arch/sh/kernel/perf_callchain.c b/arch/sh/kernel/perf_callchain.c index a9dd3abde28e..d5ca1ef50fa9 100644 --- a/arch/sh/kernel/perf_callchain.c +++ b/arch/sh/kernel/perf_callchain.c @@ -14,11 +14,6 @@ #include <asm/unwinder.h> #include <asm/ptrace.h> -static inline void callchain_store(struct perf_callchain_entry *entry, u64 ip) -{ - if (entry->nr < PERF_MAX_STACK_DEPTH) - entry->ip[entry->nr++] = ip; -} static void callchain_warning(void *data, char *msg) { @@ -39,7 +34,7 @@ static void callchain_address(void *data, unsigned long addr, int reliable) struct perf_callchain_entry *entry = data; if (reliable) - callchain_store(entry, addr); + perf_callchain_store(entry, addr); } static const struct stacktrace_ops callchain_ops = { @@ -49,47 +44,10 @@ static const struct stacktrace_ops callchain_ops = { .address = callchain_address, }; -static void -perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry) +void +perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs) { - callchain_store(entry, PERF_CONTEXT_KERNEL); - callchain_store(entry, regs->pc); + perf_callchain_store(entry, regs->pc); unwind_stack(NULL, regs, NULL, &callchain_ops, entry); } - -static void -perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry) -{ - int is_user; - - if (!regs) - return; - - is_user = user_mode(regs); - - if (is_user && current->state != TASK_RUNNING) - return; - - /* - * Only the kernel side is implemented for now. - */ - if (!is_user) - perf_callchain_kernel(regs, entry); -} - -/* - * No need for separate IRQ and NMI entries. - */ -static DEFINE_PER_CPU(struct perf_callchain_entry, callchain); - -struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) -{ - struct perf_callchain_entry *entry = &__get_cpu_var(callchain); - - entry->nr = 0; - - perf_do_callchain(regs, entry); - - return entry; -} diff --git a/arch/sh/kernel/perf_event.c b/arch/sh/kernel/perf_event.c index 7a3dc3567258..5a4b33435650 100644 --- a/arch/sh/kernel/perf_event.c +++ b/arch/sh/kernel/perf_event.c @@ -59,6 +59,24 @@ static inline int sh_pmu_initialized(void) return !!sh_pmu; } +const char *perf_pmu_name(void) +{ + if (!sh_pmu) + return NULL; + + return sh_pmu->name; +} +EXPORT_SYMBOL_GPL(perf_pmu_name); + +int perf_num_counters(void) +{ + if (!sh_pmu) + return 0; + + return sh_pmu->num_events; +} +EXPORT_SYMBOL_GPL(perf_num_counters); + /* * Release the PMU if this is the last perf_event. */ @@ -206,50 +224,80 @@ again: local64_add(delta, &event->count); } -static void sh_pmu_disable(struct perf_event *event) +static void sh_pmu_stop(struct perf_event *event, int flags) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); struct hw_perf_event *hwc = &event->hw; int idx = hwc->idx; - clear_bit(idx, cpuc->active_mask); - sh_pmu->disable(hwc, idx); + if (!(event->hw.state & PERF_HES_STOPPED)) { + sh_pmu->disable(hwc, idx); + cpuc->events[idx] = NULL; + event->hw.state |= PERF_HES_STOPPED; + } + + if ((flags & PERF_EF_UPDATE) && !(event->hw.state & PERF_HES_UPTODATE)) { + sh_perf_event_update(event, &event->hw, idx); + event->hw.state |= PERF_HES_UPTODATE; + } +} + +static void sh_pmu_start(struct perf_event *event, int flags) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; + int idx = hwc->idx; + + if (WARN_ON_ONCE(idx == -1)) + return; + + if (flags & PERF_EF_RELOAD) + WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE)); - barrier(); + cpuc->events[idx] = event; + event->hw.state = 0; + sh_pmu->enable(hwc, idx); +} - sh_perf_event_update(event, &event->hw, idx); +static void sh_pmu_del(struct perf_event *event, int flags) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - cpuc->events[idx] = NULL; - clear_bit(idx, cpuc->used_mask); + sh_pmu_stop(event, PERF_EF_UPDATE); + __clear_bit(event->hw.idx, cpuc->used_mask); perf_event_update_userpage(event); } -static int sh_pmu_enable(struct perf_event *event) +static int sh_pmu_add(struct perf_event *event, int flags) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); struct hw_perf_event *hwc = &event->hw; int idx = hwc->idx; + int ret = -EAGAIN; + + perf_pmu_disable(event->pmu); - if (test_and_set_bit(idx, cpuc->used_mask)) { + if (__test_and_set_bit(idx, cpuc->used_mask)) { idx = find_first_zero_bit(cpuc->used_mask, sh_pmu->num_events); if (idx == sh_pmu->num_events) - return -EAGAIN; + goto out; - set_bit(idx, cpuc->used_mask); + __set_bit(idx, cpuc->used_mask); hwc->idx = idx; } sh_pmu->disable(hwc, idx); - cpuc->events[idx] = event; - set_bit(idx, cpuc->active_mask); - - sh_pmu->enable(hwc, idx); + event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + if (flags & PERF_EF_START) + sh_pmu_start(event, PERF_EF_RELOAD); perf_event_update_userpage(event); - - return 0; + ret = 0; +out: + perf_pmu_enable(event->pmu); + return ret; } static void sh_pmu_read(struct perf_event *event) @@ -257,24 +305,56 @@ static void sh_pmu_read(struct perf_event *event) sh_perf_event_update(event, &event->hw, event->hw.idx); } -static const struct pmu pmu = { - .enable = sh_pmu_enable, - .disable = sh_pmu_disable, - .read = sh_pmu_read, -}; - -const struct pmu *hw_perf_event_init(struct perf_event *event) +static int sh_pmu_event_init(struct perf_event *event) { - int err = __hw_perf_event_init(event); + int err; + + switch (event->attr.type) { + case PERF_TYPE_RAW: + case PERF_TYPE_HW_CACHE: + case PERF_TYPE_HARDWARE: + err = __hw_perf_event_init(event); + break; + + default: + return -ENOENT; + } + if (unlikely(err)) { if (event->destroy) event->destroy(event); - return ERR_PTR(err); } - return &pmu; + return err; +} + +static void sh_pmu_enable(struct pmu *pmu) +{ + if (!sh_pmu_initialized()) + return; + + sh_pmu->enable_all(); +} + +static void sh_pmu_disable(struct pmu *pmu) +{ + if (!sh_pmu_initialized()) + return; + + sh_pmu->disable_all(); } +static struct pmu pmu = { + .pmu_enable = sh_pmu_enable, + .pmu_disable = sh_pmu_disable, + .event_init = sh_pmu_event_init, + .add = sh_pmu_add, + .del = sh_pmu_del, + .start = sh_pmu_start, + .stop = sh_pmu_stop, + .read = sh_pmu_read, +}; + static void sh_pmu_setup(int cpu) { struct cpu_hw_events *cpuhw = &per_cpu(cpu_hw_events, cpu); @@ -299,32 +379,17 @@ sh_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) return NOTIFY_OK; } -void hw_perf_enable(void) -{ - if (!sh_pmu_initialized()) - return; - - sh_pmu->enable_all(); -} - -void hw_perf_disable(void) -{ - if (!sh_pmu_initialized()) - return; - - sh_pmu->disable_all(); -} - -int __cpuinit register_sh_pmu(struct sh_pmu *pmu) +int __cpuinit register_sh_pmu(struct sh_pmu *_pmu) { if (sh_pmu) return -EBUSY; - sh_pmu = pmu; + sh_pmu = _pmu; - pr_info("Performance Events: %s support registered\n", pmu->name); + pr_info("Performance Events: %s support registered\n", _pmu->name); - WARN_ON(pmu->num_events > MAX_HWEVENTS); + WARN_ON(_pmu->num_events > MAX_HWEVENTS); + perf_pmu_register(&pmu); perf_cpu_notifier(sh_pmu_notifier); return 0; } diff --git a/arch/sh/oprofile/Makefile b/arch/sh/oprofile/Makefile index 4886c5c1786c..e85aae73e3dc 100644 --- a/arch/sh/oprofile/Makefile +++ b/arch/sh/oprofile/Makefile @@ -6,4 +6,8 @@ DRIVER_OBJS = $(addprefix ../../../drivers/oprofile/, \ oprofilefs.o oprofile_stats.o \ timer_int.o ) +ifeq ($(CONFIG_HW_PERF_EVENTS),y) +DRIVER_OBJS += $(addprefix ../../../drivers/oprofile/, oprofile_perf.o) +endif + oprofile-y := $(DRIVER_OBJS) common.o backtrace.o diff --git a/arch/sh/oprofile/common.c b/arch/sh/oprofile/common.c index ac604937f3ee..e10d89376f9b 100644 --- a/arch/sh/oprofile/common.c +++ b/arch/sh/oprofile/common.c @@ -17,114 +17,45 @@ #include <linux/init.h> #include <linux/errno.h> #include <linux/smp.h> +#include <linux/perf_event.h> #include <asm/processor.h> -#include "op_impl.h" - -static struct op_sh_model *model; - -static struct op_counter_config ctr[20]; +#ifdef CONFIG_HW_PERF_EVENTS extern void sh_backtrace(struct pt_regs * const regs, unsigned int depth); -static int op_sh_setup(void) -{ - /* Pre-compute the values to stuff in the hardware registers. */ - model->reg_setup(ctr); - - /* Configure the registers on all cpus. */ - on_each_cpu(model->cpu_setup, NULL, 1); - - return 0; -} - -static int op_sh_create_files(struct super_block *sb, struct dentry *root) +char *op_name_from_perf_id(void) { - int i, ret = 0; + const char *pmu; + char buf[20]; + int size; - for (i = 0; i < model->num_counters; i++) { - struct dentry *dir; - char buf[4]; + pmu = perf_pmu_name(); + if (!pmu) + return NULL; - snprintf(buf, sizeof(buf), "%d", i); - dir = oprofilefs_mkdir(sb, root, buf); + size = snprintf(buf, sizeof(buf), "sh/%s", pmu); + if (size > -1 && size < sizeof(buf)) + return buf; - ret |= oprofilefs_create_ulong(sb, dir, "enabled", &ctr[i].enabled); - ret |= oprofilefs_create_ulong(sb, dir, "event", &ctr[i].event); - ret |= oprofilefs_create_ulong(sb, dir, "kernel", &ctr[i].kernel); - ret |= oprofilefs_create_ulong(sb, dir, "user", &ctr[i].user); - - if (model->create_files) - ret |= model->create_files(sb, dir); - else - ret |= oprofilefs_create_ulong(sb, dir, "count", &ctr[i].count); - - /* Dummy entries */ - ret |= oprofilefs_create_ulong(sb, dir, "unit_mask", &ctr[i].unit_mask); - } - - return ret; + return NULL; } -static int op_sh_start(void) +int __init oprofile_arch_init(struct oprofile_operations *ops) { - /* Enable performance monitoring for all counters. */ - on_each_cpu(model->cpu_start, NULL, 1); + ops->backtrace = sh_backtrace; - return 0; + return oprofile_perf_init(ops); } -static void op_sh_stop(void) +void __exit oprofile_arch_exit(void) { - /* Disable performance monitoring for all counters. */ - on_each_cpu(model->cpu_stop, NULL, 1); + oprofile_perf_exit(); } - +#else int __init oprofile_arch_init(struct oprofile_operations *ops) { - struct op_sh_model *lmodel = NULL; - int ret; - - /* - * Always assign the backtrace op. If the counter initialization - * fails, we fall back to the timer which will still make use of - * this. - */ - ops->backtrace = sh_backtrace; - - /* - * XXX - * - * All of the SH7750/SH-4A counters have been converted to perf, - * this infrastructure hook is left for other users until they've - * had a chance to convert over, at which point all of this - * will be deleted. - */ - - if (!lmodel) - return -ENODEV; - if (!(current_cpu_data.flags & CPU_HAS_PERF_COUNTER)) - return -ENODEV; - - ret = lmodel->init(); - if (unlikely(ret != 0)) - return ret; - - model = lmodel; - - ops->setup = op_sh_setup; - ops->create_files = op_sh_create_files; - ops->start = op_sh_start; - ops->stop = op_sh_stop; - ops->cpu_type = lmodel->cpu_type; - - printk(KERN_INFO "oprofile: using %s performance monitoring.\n", - lmodel->cpu_type); - - return 0; -} - -void oprofile_arch_exit(void) -{ - if (model && model->exit) - model->exit(); + pr_info("oprofile: hardware counters not available\n"); + return -ENODEV; } +void __exit oprofile_arch_exit(void) {} +#endif /* CONFIG_HW_PERF_EVENTS */ diff --git a/arch/sh/oprofile/op_impl.h b/arch/sh/oprofile/op_impl.h deleted file mode 100644 index 1244479ceb29..000000000000 --- a/arch/sh/oprofile/op_impl.h +++ /dev/null @@ -1,33 +0,0 @@ -#ifndef __OP_IMPL_H -#define __OP_IMPL_H - -/* Per-counter configuration as set via oprofilefs. */ -struct op_counter_config { - unsigned long enabled; - unsigned long event; - - unsigned long count; - - /* Dummy values for userspace tool compliance */ - unsigned long kernel; - unsigned long user; - unsigned long unit_mask; -}; - -/* Per-architecture configury and hooks. */ -struct op_sh_model { - void (*reg_setup)(struct op_counter_config *); - int (*create_files)(struct super_block *sb, struct dentry *dir); - void (*cpu_setup)(void *dummy); - int (*init)(void); - void (*exit)(void); - void (*cpu_start)(void *args); - void (*cpu_stop)(void *args); - char *cpu_type; - unsigned char num_counters; -}; - -/* arch/sh/oprofile/common.c */ -extern void sh_backtrace(struct pt_regs * const regs, unsigned int depth); - -#endif /* __OP_IMPL_H */ diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 491e9d6de191..3e9d31401fb2 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -26,10 +26,12 @@ config SPARC select ARCH_WANT_OPTIONAL_GPIOLIB select RTC_CLASS select RTC_DRV_M48T59 + select HAVE_IRQ_WORK select HAVE_PERF_EVENTS select PERF_USE_VMALLOC select HAVE_DMA_ATTRS select HAVE_DMA_API_DEBUG + select HAVE_ARCH_JUMP_LABEL config SPARC32 def_bool !64BIT @@ -53,6 +55,7 @@ config SPARC64 select RTC_DRV_BQ4802 select RTC_DRV_SUN4V select RTC_DRV_STARFIRE + select HAVE_IRQ_WORK select HAVE_PERF_EVENTS select PERF_USE_VMALLOC diff --git a/arch/sparc/include/asm/jump_label.h b/arch/sparc/include/asm/jump_label.h new file mode 100644 index 000000000000..62e66d7b2fb6 --- /dev/null +++ b/arch/sparc/include/asm/jump_label.h @@ -0,0 +1,32 @@ +#ifndef _ASM_SPARC_JUMP_LABEL_H +#define _ASM_SPARC_JUMP_LABEL_H + +#ifdef __KERNEL__ + +#include <linux/types.h> +#include <asm/system.h> + +#define JUMP_LABEL_NOP_SIZE 4 + +#define JUMP_LABEL(key, label) \ + do { \ + asm goto("1:\n\t" \ + "nop\n\t" \ + "nop\n\t" \ + ".pushsection __jump_table, \"a\"\n\t"\ + ".word 1b, %l[" #label "], %c0\n\t" \ + ".popsection \n\t" \ + : : "i" (key) : : label);\ + } while (0) + +#endif /* __KERNEL__ */ + +typedef u32 jump_label_t; + +struct jump_entry { + jump_label_t code; + jump_label_t target; + jump_label_t key; +}; + +#endif diff --git a/arch/sparc/include/asm/perf_event.h b/arch/sparc/include/asm/perf_event.h index 727af70646cb..6e8bfa1786da 100644 --- a/arch/sparc/include/asm/perf_event.h +++ b/arch/sparc/include/asm/perf_event.h @@ -1,10 +1,6 @@ #ifndef __ASM_SPARC_PERF_EVENT_H #define __ASM_SPARC_PERF_EVENT_H -extern void set_perf_event_pending(void); - -#define PERF_EVENT_INDEX_OFFSET 0 - #ifdef CONFIG_PERF_EVENTS #include <asm/ptrace.h> diff --git a/arch/sparc/kernel/Makefile b/arch/sparc/kernel/Makefile index 0c2dc1f24a9a..599398fbbc7c 100644 --- a/arch/sparc/kernel/Makefile +++ b/arch/sparc/kernel/Makefile @@ -119,3 +119,5 @@ obj-$(CONFIG_COMPAT) += $(audit--y) pc--$(CONFIG_PERF_EVENTS) := perf_event.o obj-$(CONFIG_SPARC64) += $(pc--y) + +obj-$(CONFIG_SPARC64) += jump_label.o diff --git a/arch/sparc/kernel/jump_label.c b/arch/sparc/kernel/jump_label.c new file mode 100644 index 000000000000..ea2dafc93d78 --- /dev/null +++ b/arch/sparc/kernel/jump_label.c @@ -0,0 +1,47 @@ +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/mutex.h> +#include <linux/cpu.h> + +#include <linux/jump_label.h> +#include <linux/memory.h> + +#ifdef HAVE_JUMP_LABEL + +void arch_jump_label_transform(struct jump_entry *entry, + enum jump_label_type type) +{ + u32 val; + u32 *insn = (u32 *) (unsigned long) entry->code; + + if (type == JUMP_LABEL_ENABLE) { + s32 off = (s32)entry->target - (s32)entry->code; + +#ifdef CONFIG_SPARC64 + /* ba,pt %xcc, . + (off << 2) */ + val = 0x10680000 | ((u32) off >> 2); +#else + /* ba . + (off << 2) */ + val = 0x10800000 | ((u32) off >> 2); +#endif + } else { + val = 0x01000000; + } + + get_online_cpus(); + mutex_lock(&text_mutex); + *insn = val; + flushi(insn); + mutex_unlock(&text_mutex); + put_online_cpus(); +} + +void arch_jump_label_text_poke_early(jump_label_t addr) +{ + u32 *insn_p = (u32 *) (unsigned long) addr; + + *insn_p = 0x01000000; + flushi(insn_p); +} + +#endif diff --git a/arch/sparc/kernel/module.c b/arch/sparc/kernel/module.c index f848aadf54dc..ee3c7dde8d9f 100644 --- a/arch/sparc/kernel/module.c +++ b/arch/sparc/kernel/module.c @@ -18,6 +18,9 @@ #include <asm/spitfire.h> #ifdef CONFIG_SPARC64 + +#include <linux/jump_label.h> + static void *module_map(unsigned long size) { struct vm_struct *area; @@ -227,6 +230,9 @@ int module_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, struct module *me) { + /* make jump label nops */ + jump_label_apply_nops(me); + /* Cheetah's I-cache is fully coherent. */ if (tlb_type == spitfire) { unsigned long va; diff --git a/arch/sparc/kernel/pcr.c b/arch/sparc/kernel/pcr.c index c4a6a50b4849..b87873c0e8ea 100644 --- a/arch/sparc/kernel/pcr.c +++ b/arch/sparc/kernel/pcr.c @@ -7,7 +7,7 @@ #include <linux/init.h> #include <linux/irq.h> -#include <linux/perf_event.h> +#include <linux/irq_work.h> #include <linux/ftrace.h> #include <asm/pil.h> @@ -43,14 +43,14 @@ void __irq_entry deferred_pcr_work_irq(int irq, struct pt_regs *regs) old_regs = set_irq_regs(regs); irq_enter(); -#ifdef CONFIG_PERF_EVENTS - perf_event_do_pending(); +#ifdef CONFIG_IRQ_WORK + irq_work_run(); #endif irq_exit(); set_irq_regs(old_regs); } -void set_perf_event_pending(void) +void arch_irq_work_raise(void) { set_softint(1 << PIL_DEFERRED_PCR_WORK); } diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c index 6318e622cfb0..0d6deb55a2ae 100644 --- a/arch/sparc/kernel/perf_event.c +++ b/arch/sparc/kernel/perf_event.c @@ -658,13 +658,16 @@ static u64 maybe_change_configuration(struct cpu_hw_events *cpuc, u64 pcr) enc = perf_event_get_enc(cpuc->events[i]); pcr &= ~mask_for_index(idx); - pcr |= event_encoding(enc, idx); + if (hwc->state & PERF_HES_STOPPED) + pcr |= nop_for_index(idx); + else + pcr |= event_encoding(enc, idx); } out: return pcr; } -void hw_perf_enable(void) +static void sparc_pmu_enable(struct pmu *pmu) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); u64 pcr; @@ -691,7 +694,7 @@ void hw_perf_enable(void) pcr_ops->write(cpuc->pcr); } -void hw_perf_disable(void) +static void sparc_pmu_disable(struct pmu *pmu) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); u64 val; @@ -710,19 +713,65 @@ void hw_perf_disable(void) pcr_ops->write(cpuc->pcr); } -static void sparc_pmu_disable(struct perf_event *event) +static int active_event_index(struct cpu_hw_events *cpuc, + struct perf_event *event) +{ + int i; + + for (i = 0; i < cpuc->n_events; i++) { + if (cpuc->event[i] == event) + break; + } + BUG_ON(i == cpuc->n_events); + return cpuc->current_idx[i]; +} + +static void sparc_pmu_start(struct perf_event *event, int flags) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + int idx = active_event_index(cpuc, event); + + if (flags & PERF_EF_RELOAD) { + WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE)); + sparc_perf_event_set_period(event, &event->hw, idx); + } + + event->hw.state = 0; + + sparc_pmu_enable_event(cpuc, &event->hw, idx); +} + +static void sparc_pmu_stop(struct perf_event *event, int flags) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + int idx = active_event_index(cpuc, event); + + if (!(event->hw.state & PERF_HES_STOPPED)) { + sparc_pmu_disable_event(cpuc, &event->hw, idx); + event->hw.state |= PERF_HES_STOPPED; + } + + if (!(event->hw.state & PERF_HES_UPTODATE) && (flags & PERF_EF_UPDATE)) { + sparc_perf_event_update(event, &event->hw, idx); + event->hw.state |= PERF_HES_UPTODATE; + } +} + +static void sparc_pmu_del(struct perf_event *event, int _flags) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - struct hw_perf_event *hwc = &event->hw; unsigned long flags; int i; local_irq_save(flags); - perf_disable(); + perf_pmu_disable(event->pmu); for (i = 0; i < cpuc->n_events; i++) { if (event == cpuc->event[i]) { - int idx = cpuc->current_idx[i]; + /* Absorb the final count and turn off the + * event. + */ + sparc_pmu_stop(event, PERF_EF_UPDATE); /* Shift remaining entries down into * the existing slot. @@ -734,13 +783,6 @@ static void sparc_pmu_disable(struct perf_event *event) cpuc->current_idx[i]; } - /* Absorb the final count and turn off the - * event. - */ - sparc_pmu_disable_event(cpuc, hwc, idx); - barrier(); - sparc_perf_event_update(event, hwc, idx); - perf_event_update_userpage(event); cpuc->n_events--; @@ -748,23 +790,10 @@ static void sparc_pmu_disable(struct perf_event *event) } } - perf_enable(); + perf_pmu_enable(event->pmu); local_irq_restore(flags); } -static int active_event_index(struct cpu_hw_events *cpuc, - struct perf_event *event) -{ - int i; - - for (i = 0; i < cpuc->n_events; i++) { - if (cpuc->event[i] == event) - break; - } - BUG_ON(i == cpuc->n_events); - return cpuc->current_idx[i]; -} - static void sparc_pmu_read(struct perf_event *event) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); @@ -774,15 +803,6 @@ static void sparc_pmu_read(struct perf_event *event) sparc_perf_event_update(event, hwc, idx); } -static void sparc_pmu_unthrottle(struct perf_event *event) -{ - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - int idx = active_event_index(cpuc, event); - struct hw_perf_event *hwc = &event->hw; - - sparc_pmu_enable_event(cpuc, hwc, idx); -} - static atomic_t active_events = ATOMIC_INIT(0); static DEFINE_MUTEX(pmc_grab_mutex); @@ -877,7 +897,7 @@ static int sparc_check_constraints(struct perf_event **evts, if (!n_ev) return 0; - if (n_ev > perf_max_events) + if (n_ev > MAX_HWEVENTS) return -1; msk0 = perf_event_get_msk(events[0]); @@ -984,23 +1004,27 @@ static int collect_events(struct perf_event *group, int max_count, return n; } -static int sparc_pmu_enable(struct perf_event *event) +static int sparc_pmu_add(struct perf_event *event, int ef_flags) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); int n0, ret = -EAGAIN; unsigned long flags; local_irq_save(flags); - perf_disable(); + perf_pmu_disable(event->pmu); n0 = cpuc->n_events; - if (n0 >= perf_max_events) + if (n0 >= MAX_HWEVENTS) goto out; cpuc->event[n0] = event; cpuc->events[n0] = event->hw.event_base; cpuc->current_idx[n0] = PIC_NO_INDEX; + event->hw.state = PERF_HES_UPTODATE; + if (!(ef_flags & PERF_EF_START)) + event->hw.state |= PERF_HES_STOPPED; + /* * If group events scheduling transaction was started, * skip the schedulability test here, it will be peformed @@ -1020,12 +1044,12 @@ nocheck: ret = 0; out: - perf_enable(); + perf_pmu_enable(event->pmu); local_irq_restore(flags); return ret; } -static int __hw_perf_event_init(struct perf_event *event) +static int sparc_pmu_event_init(struct perf_event *event) { struct perf_event_attr *attr = &event->attr; struct perf_event *evts[MAX_HWEVENTS]; @@ -1038,22 +1062,33 @@ static int __hw_perf_event_init(struct perf_event *event) if (atomic_read(&nmi_active) < 0) return -ENODEV; - pmap = NULL; - if (attr->type == PERF_TYPE_HARDWARE) { + switch (attr->type) { + case PERF_TYPE_HARDWARE: if (attr->config >= sparc_pmu->max_events) return -EINVAL; pmap = sparc_pmu->event_map(attr->config); - } else if (attr->type == PERF_TYPE_HW_CACHE) { + break; + + case PERF_TYPE_HW_CACHE: pmap = sparc_map_cache_event(attr->config); if (IS_ERR(pmap)) return PTR_ERR(pmap); - } else if (attr->type != PERF_TYPE_RAW) - return -EOPNOTSUPP; + break; + + case PERF_TYPE_RAW: + pmap = NULL; + break; + + default: + return -ENOENT; + + } if (pmap) { hwc->event_base = perf_event_encode(pmap); } else { - /* User gives us "(encoding << 16) | pic_mask" for + /* + * User gives us "(encoding << 16) | pic_mask" for * PERF_TYPE_RAW events. */ hwc->event_base = attr->config; @@ -1071,7 +1106,7 @@ static int __hw_perf_event_init(struct perf_event *event) n = 0; if (event->group_leader != event) { n = collect_events(event->group_leader, - perf_max_events - 1, + MAX_HWEVENTS - 1, evts, events, current_idx_dmy); if (n < 0) return -EINVAL; @@ -1107,10 +1142,11 @@ static int __hw_perf_event_init(struct perf_event *event) * Set the flag to make pmu::enable() not perform the * schedulability test, it will be performed at commit time */ -static void sparc_pmu_start_txn(const struct pmu *pmu) +static void sparc_pmu_start_txn(struct pmu *pmu) { struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events); + perf_pmu_disable(pmu); cpuhw->group_flag |= PERF_EVENT_TXN; } @@ -1119,11 +1155,12 @@ static void sparc_pmu_start_txn(const struct pmu *pmu) * Clear the flag and pmu::enable() will perform the * schedulability test. */ -static void sparc_pmu_cancel_txn(const struct pmu *pmu) +static void sparc_pmu_cancel_txn(struct pmu *pmu) { struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events); cpuhw->group_flag &= ~PERF_EVENT_TXN; + perf_pmu_enable(pmu); } /* @@ -1131,7 +1168,7 @@ static void sparc_pmu_cancel_txn(const struct pmu *pmu) * Perform the group schedulability test as a whole * Return 0 if success */ -static int sparc_pmu_commit_txn(const struct pmu *pmu) +static int sparc_pmu_commit_txn(struct pmu *pmu) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); int n; @@ -1147,28 +1184,24 @@ static int sparc_pmu_commit_txn(const struct pmu *pmu) return -EAGAIN; cpuc->group_flag &= ~PERF_EVENT_TXN; + perf_pmu_enable(pmu); return 0; } -static const struct pmu pmu = { - .enable = sparc_pmu_enable, - .disable = sparc_pmu_disable, +static struct pmu pmu = { + .pmu_enable = sparc_pmu_enable, + .pmu_disable = sparc_pmu_disable, + .event_init = sparc_pmu_event_init, + .add = sparc_pmu_add, + .del = sparc_pmu_del, + .start = sparc_pmu_start, + .stop = sparc_pmu_stop, .read = sparc_pmu_read, - .unthrottle = sparc_pmu_unthrottle, .start_txn = sparc_pmu_start_txn, .cancel_txn = sparc_pmu_cancel_txn, .commit_txn = sparc_pmu_commit_txn, }; -const struct pmu *hw_perf_event_init(struct perf_event *event) -{ - int err = __hw_perf_event_init(event); - - if (err) - return ERR_PTR(err); - return &pmu; -} - void perf_event_print_debug(void) { unsigned long flags; @@ -1244,7 +1277,7 @@ static int __kprobes perf_event_nmi_handler(struct notifier_block *self, continue; if (perf_event_overflow(event, 1, &data, regs)) - sparc_pmu_disable_event(cpuc, hwc, idx); + sparc_pmu_stop(event, 0); } return NOTIFY_STOP; @@ -1285,28 +1318,21 @@ void __init init_hw_perf_events(void) pr_cont("Supported PMU type is '%s'\n", sparc_pmu_type); - /* All sparc64 PMUs currently have 2 events. */ - perf_max_events = 2; - + perf_pmu_register(&pmu); register_die_notifier(&perf_event_nmi_notifier); } -static inline void callchain_store(struct perf_callchain_entry *entry, u64 ip) -{ - if (entry->nr < PERF_MAX_STACK_DEPTH) - entry->ip[entry->nr++] = ip; -} - -static void perf_callchain_kernel(struct pt_regs *regs, - struct perf_callchain_entry *entry) +void perf_callchain_kernel(struct perf_callchain_entry *entry, + struct pt_regs *regs) { unsigned long ksp, fp; #ifdef CONFIG_FUNCTION_GRAPH_TRACER int graph = 0; #endif - callchain_store(entry, PERF_CONTEXT_KERNEL); - callchain_store(entry, regs->tpc); + stack_trace_flush(); + + perf_callchain_store(entry, regs->tpc); ksp = regs->u_regs[UREG_I6]; fp = ksp + STACK_BIAS; @@ -1330,13 +1356,13 @@ static void perf_callchain_kernel(struct pt_regs *regs, pc = sf->callers_pc; fp = (unsigned long)sf->fp + STACK_BIAS; } - callchain_store(entry, pc); + perf_callchain_store(entry, pc); #ifdef CONFIG_FUNCTION_GRAPH_TRACER if ((pc + 8UL) == (unsigned long) &return_to_handler) { int index = current->curr_ret_stack; if (current->ret_stack && index >= graph) { pc = current->ret_stack[index - graph].ret; - callchain_store(entry, pc); + perf_callchain_store(entry, pc); graph++; } } @@ -1344,13 +1370,12 @@ static void perf_callchain_kernel(struct pt_regs *regs, } while (entry->nr < PERF_MAX_STACK_DEPTH); } -static void perf_callchain_user_64(struct pt_regs *regs, - struct perf_callchain_entry *entry) +static void perf_callchain_user_64(struct perf_callchain_entry *entry, + struct pt_regs *regs) { unsigned long ufp; - callchain_store(entry, PERF_CONTEXT_USER); - callchain_store(entry, regs->tpc); + perf_callchain_store(entry, regs->tpc); ufp = regs->u_regs[UREG_I6] + STACK_BIAS; do { @@ -1363,17 +1388,16 @@ static void perf_callchain_user_64(struct pt_regs *regs, pc = sf.callers_pc; ufp = (unsigned long)sf.fp + STACK_BIAS; - callchain_store(entry, pc); + perf_callchain_store(entry, pc); } while (entry->nr < PERF_MAX_STACK_DEPTH); } -static void perf_callchain_user_32(struct pt_regs *regs, - struct perf_callchain_entry *entry) +static void perf_callchain_user_32(struct perf_callchain_entry *entry, + struct pt_regs *regs) { unsigned long ufp; - callchain_store(entry, PERF_CONTEXT_USER); - callchain_store(entry, regs->tpc); + perf_callchain_store(entry, regs->tpc); ufp = regs->u_regs[UREG_I6] & 0xffffffffUL; do { @@ -1386,34 +1410,16 @@ static void perf_callchain_user_32(struct pt_regs *regs, pc = sf.callers_pc; ufp = (unsigned long)sf.fp; - callchain_store(entry, pc); + perf_callchain_store(entry, pc); } while (entry->nr < PERF_MAX_STACK_DEPTH); } -/* Like powerpc we can't get PMU interrupts within the PMU handler, - * so no need for separate NMI and IRQ chains as on x86. - */ -static DEFINE_PER_CPU(struct perf_callchain_entry, callchain); - -struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) +void +perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) { - struct perf_callchain_entry *entry = &__get_cpu_var(callchain); - - entry->nr = 0; - if (!user_mode(regs)) { - stack_trace_flush(); - perf_callchain_kernel(regs, entry); - if (current->mm) - regs = task_pt_regs(current); - else - regs = NULL; - } - if (regs) { - flushw_user(); - if (test_thread_flag(TIF_32BIT)) - perf_callchain_user_32(regs, entry); - else - perf_callchain_user_64(regs, entry); - } - return entry; + flushw_user(); + if (test_thread_flag(TIF_32BIT)) + perf_callchain_user_32(entry, regs); + else + perf_callchain_user_64(entry, regs); } diff --git a/arch/um/drivers/hostaudio_kern.c b/arch/um/drivers/hostaudio_kern.c index 0c46e398cd8f..63c740a85b4c 100644 --- a/arch/um/drivers/hostaudio_kern.c +++ b/arch/um/drivers/hostaudio_kern.c @@ -40,6 +40,11 @@ static char *mixer = HOSTAUDIO_DEV_MIXER; " This is used to specify the host mixer device to the hostaudio driver.\n"\ " The default is \"" HOSTAUDIO_DEV_MIXER "\".\n\n" +module_param(dsp, charp, 0644); +MODULE_PARM_DESC(dsp, DSP_HELP); +module_param(mixer, charp, 0644); +MODULE_PARM_DESC(mixer, MIXER_HELP); + #ifndef MODULE static int set_dsp(char *name, int *add) { @@ -56,15 +61,6 @@ static int set_mixer(char *name, int *add) } __uml_setup("mixer=", set_mixer, "mixer=<mixer device>\n" MIXER_HELP); - -#else /*MODULE*/ - -module_param(dsp, charp, 0644); -MODULE_PARM_DESC(dsp, DSP_HELP); - -module_param(mixer, charp, 0644); -MODULE_PARM_DESC(mixer, MIXER_HELP); - #endif /* /dev/dsp file operations */ diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c index 1bcd208c459f..9734994cba1e 100644 --- a/arch/um/drivers/ubd_kern.c +++ b/arch/um/drivers/ubd_kern.c @@ -163,6 +163,7 @@ struct ubd { struct scatterlist sg[MAX_SG]; struct request *request; int start_sg, end_sg; + sector_t rq_pos; }; #define DEFAULT_COW { \ @@ -187,6 +188,7 @@ struct ubd { .request = NULL, \ .start_sg = 0, \ .end_sg = 0, \ + .rq_pos = 0, \ } /* Protected by ubd_lock */ @@ -1228,7 +1230,6 @@ static void do_ubd_request(struct request_queue *q) { struct io_thread_req *io_req; struct request *req; - sector_t sector; int n; while(1){ @@ -1239,12 +1240,12 @@ static void do_ubd_request(struct request_queue *q) return; dev->request = req; + dev->rq_pos = blk_rq_pos(req); dev->start_sg = 0; dev->end_sg = blk_rq_map_sg(q, req, dev->sg); } req = dev->request; - sector = blk_rq_pos(req); while(dev->start_sg < dev->end_sg){ struct scatterlist *sg = &dev->sg[dev->start_sg]; @@ -1256,10 +1257,9 @@ static void do_ubd_request(struct request_queue *q) return; } prepare_request(req, io_req, - (unsigned long long)sector << 9, + (unsigned long long)dev->rq_pos << 9, sg->offset, sg->length, sg_page(sg)); - sector += sg->length >> 9; n = os_write_file(thread_fd, &io_req, sizeof(struct io_thread_req *)); if(n != sizeof(struct io_thread_req *)){ @@ -1272,6 +1272,7 @@ static void do_ubd_request(struct request_queue *q) return; } + dev->rq_pos += sg->length >> 9; dev->start_sg++; } dev->end_sg = 0; diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index f4c70c246ffe..89b88e3a56e9 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -25,6 +25,7 @@ config X86 select HAVE_IDE select HAVE_OPROFILE select HAVE_PERF_EVENTS if (!M386 && !M486) + select HAVE_IRQ_WORK select HAVE_IOREMAP_PROT select HAVE_KPROBES select ARCH_WANT_OPTIONAL_GPIOLIB @@ -33,6 +34,7 @@ config X86 select HAVE_KRETPROBES select HAVE_OPTPROBES select HAVE_FTRACE_MCOUNT_RECORD + select HAVE_C_RECORDMCOUNT select HAVE_DYNAMIC_FTRACE select HAVE_FUNCTION_TRACER select HAVE_FUNCTION_GRAPH_TRACER @@ -59,6 +61,8 @@ config X86 select ANON_INODES select HAVE_ARCH_KMEMCHECK select HAVE_USER_RETURN_NOTIFIER + select HAVE_ARCH_JUMP_LABEL + select HAVE_TEXT_POKE_SMP config INSTRUCTION_DECODER def_bool (KPROBES || PERF_EVENTS) @@ -2136,6 +2140,10 @@ config HAVE_ATOMIC_IOMAP def_bool y depends on X86_32 +config HAVE_TEXT_POKE_SMP + bool + select STOP_MACHINE if SMP + source "net/Kconfig" source "drivers/Kconfig" diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c index 0350311906ae..2d93bdbc9ac0 100644 --- a/arch/x86/ia32/ia32_aout.c +++ b/arch/x86/ia32/ia32_aout.c @@ -34,7 +34,7 @@ #include <asm/ia32.h> #undef WARN_OLD -#undef CORE_DUMP /* probably broken */ +#undef CORE_DUMP /* definitely broken */ static int load_aout_binary(struct linux_binprm *, struct pt_regs *regs); static int load_aout_library(struct file *); @@ -131,21 +131,15 @@ static void set_brk(unsigned long start, unsigned long end) * macros to write out all the necessary info. */ -static int dump_write(struct file *file, const void *addr, int nr) -{ - return file->f_op->write(file, addr, nr, &file->f_pos) == nr; -} +#include <linux/coredump.h> #define DUMP_WRITE(addr, nr) \ if (!dump_write(file, (void *)(addr), (nr))) \ goto end_coredump; -#define DUMP_SEEK(offset) \ - if (file->f_op->llseek) { \ - if (file->f_op->llseek(file, (offset), 0) != (offset)) \ - goto end_coredump; \ - } else \ - file->f_pos = (offset) +#define DUMP_SEEK(offset) \ + if (!dump_seek(file, offset)) \ + goto end_coredump; #define START_DATA() (u.u_tsize << PAGE_SHIFT) #define START_STACK(u) (u.start_stack) @@ -217,12 +211,6 @@ static int aout_core_dump(long signr, struct pt_regs *regs, struct file *file, dump_size = dump.u_ssize << PAGE_SHIFT; DUMP_WRITE(dump_start, dump_size); } - /* - * Finally dump the task struct. Not be used by gdb, but - * could be useful - */ - set_fs(KERNEL_DS); - DUMP_WRITE(current, sizeof(*current)); end_coredump: set_fs(fs); return has_dumped; diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index bc6abb7bc7ee..76561d20ea2f 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -4,6 +4,7 @@ #include <linux/types.h> #include <linux/stddef.h> #include <linux/stringify.h> +#include <linux/jump_label.h> #include <asm/asm.h> /* @@ -160,6 +161,8 @@ static inline void apply_paravirt(struct paravirt_patch_site *start, #define __parainstructions_end NULL #endif +extern void *text_poke_early(void *addr, const void *opcode, size_t len); + /* * Clear and restore the kernel write-protection flag on the local CPU. * Allows the kernel to edit read-only pages. @@ -180,4 +183,12 @@ static inline void apply_paravirt(struct paravirt_patch_site *start, extern void *text_poke(void *addr, const void *opcode, size_t len); extern void *text_poke_smp(void *addr, const void *opcode, size_t len); +#if defined(CONFIG_DYNAMIC_FTRACE) || defined(HAVE_JUMP_LABEL) +#define IDEAL_NOP_SIZE_5 5 +extern unsigned char ideal_nop5[IDEAL_NOP_SIZE_5]; +extern void arch_init_ideal_nop5(void); +#else +static inline void arch_init_ideal_nop5(void) {} +#endif + #endif /* _ASM_X86_ALTERNATIVE_H */ diff --git a/arch/x86/include/asm/amd_iommu.h b/arch/x86/include/asm/amd_iommu.h index 5af2982133b5..f16a2caca1e0 100644 --- a/arch/x86/include/asm/amd_iommu.h +++ b/arch/x86/include/asm/amd_iommu.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2007-2009 Advanced Micro Devices, Inc. + * Copyright (C) 2007-2010 Advanced Micro Devices, Inc. * Author: Joerg Roedel <joerg.roedel@amd.com> * Leo Duran <leo.duran@amd.com> * diff --git a/arch/x86/include/asm/amd_iommu_proto.h b/arch/x86/include/asm/amd_iommu_proto.h index cb030374b90a..916bc8111a01 100644 --- a/arch/x86/include/asm/amd_iommu_proto.h +++ b/arch/x86/include/asm/amd_iommu_proto.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2009 Advanced Micro Devices, Inc. + * Copyright (C) 2009-2010 Advanced Micro Devices, Inc. * Author: Joerg Roedel <joerg.roedel@amd.com> * * This program is free software; you can redistribute it and/or modify it diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h index 08616180deaf..e3509fc303bf 100644 --- a/arch/x86/include/asm/amd_iommu_types.h +++ b/arch/x86/include/asm/amd_iommu_types.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2007-2009 Advanced Micro Devices, Inc. + * Copyright (C) 2007-2010 Advanced Micro Devices, Inc. * Author: Joerg Roedel <joerg.roedel@amd.com> * Leo Duran <leo.duran@amd.com> * @@ -416,13 +416,22 @@ struct amd_iommu { struct dma_ops_domain *default_dom; /* - * This array is required to work around a potential BIOS bug. - * The BIOS may miss to restore parts of the PCI configuration - * space when the system resumes from S3. The result is that the - * IOMMU does not execute commands anymore which leads to system - * failure. + * We can't rely on the BIOS to restore all values on reinit, so we + * need to stash them */ - u32 cache_cfg[4]; + + /* The iommu BAR */ + u32 stored_addr_lo; + u32 stored_addr_hi; + + /* + * Each iommu has 6 l1s, each of which is documented as having 0x12 + * registers + */ + u32 stored_l1[6][0x12]; + + /* The l2 indirect registers */ + u32 stored_l2[0x83]; }; /* diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h index 8e8ec663a98f..b8e96a18676b 100644 --- a/arch/x86/include/asm/entry_arch.h +++ b/arch/x86/include/asm/entry_arch.h @@ -49,8 +49,8 @@ BUILD_INTERRUPT(apic_timer_interrupt,LOCAL_TIMER_VECTOR) BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR) BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR) -#ifdef CONFIG_PERF_EVENTS -BUILD_INTERRUPT(perf_pending_interrupt, LOCAL_PENDING_VECTOR) +#ifdef CONFIG_IRQ_WORK +BUILD_INTERRUPT(irq_work_interrupt, IRQ_WORK_VECTOR) #endif #ifdef CONFIG_X86_THERMAL_VECTOR diff --git a/arch/x86/include/asm/gart.h b/arch/x86/include/asm/gart.h index 4ac5b0f33fc1..bf357f9b25f0 100644 --- a/arch/x86/include/asm/gart.h +++ b/arch/x86/include/asm/gart.h @@ -17,6 +17,7 @@ extern int fix_aperture; #define GARTEN (1<<0) #define DISGARTCPU (1<<4) #define DISGARTIO (1<<5) +#define DISTLBWALKPRB (1<<6) /* GART cache control register bits. */ #define INVGART (1<<0) @@ -27,7 +28,6 @@ extern int fix_aperture; #define AMD64_GARTAPERTUREBASE 0x94 #define AMD64_GARTTABLEBASE 0x98 #define AMD64_GARTCACHECTL 0x9c -#define AMD64_GARTEN (1<<0) #ifdef CONFIG_GART_IOMMU extern int gart_iommu_aperture; @@ -57,6 +57,19 @@ static inline void gart_iommu_hole_init(void) extern int agp_amd64_init(void); +static inline void gart_set_size_and_enable(struct pci_dev *dev, u32 order) +{ + u32 ctl; + + /* + * Don't enable translation but enable GART IO and CPU accesses. + * Also, set DISTLBWALKPRB since GART tables memory is UC. + */ + ctl = DISTLBWALKPRB | order << 1; + + pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl); +} + static inline void enable_gart_translation(struct pci_dev *dev, u64 addr) { u32 tmp, ctl; diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index aeab29aee617..55e4de613f0e 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -14,7 +14,7 @@ typedef struct { #endif unsigned int x86_platform_ipis; /* arch dependent */ unsigned int apic_perf_irqs; - unsigned int apic_pending_irqs; + unsigned int apic_irq_work_irqs; #ifdef CONFIG_SMP unsigned int irq_resched_count; unsigned int irq_call_count; diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 46c0fe05f230..3a54a1ca1a02 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -29,7 +29,7 @@ extern void apic_timer_interrupt(void); extern void x86_platform_ipi(void); extern void error_interrupt(void); -extern void perf_pending_interrupt(void); +extern void irq_work_interrupt(void); extern void spurious_interrupt(void); extern void thermal_interrupt(void); diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index e2ca30092557..6af0894dafb4 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -114,9 +114,9 @@ #define X86_PLATFORM_IPI_VECTOR 0xed /* - * Performance monitoring pending work vector: + * IRQ work vector: */ -#define LOCAL_PENDING_VECTOR 0xec +#define IRQ_WORK_VECTOR 0xec #define UV_BAU_MESSAGE 0xea diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h new file mode 100644 index 000000000000..f52d42e80585 --- /dev/null +++ b/arch/x86/include/asm/jump_label.h @@ -0,0 +1,37 @@ +#ifndef _ASM_X86_JUMP_LABEL_H +#define _ASM_X86_JUMP_LABEL_H + +#ifdef __KERNEL__ + +#include <linux/types.h> +#include <asm/nops.h> + +#define JUMP_LABEL_NOP_SIZE 5 + +# define JUMP_LABEL_INITIAL_NOP ".byte 0xe9 \n\t .long 0\n\t" + +# define JUMP_LABEL(key, label) \ + do { \ + asm goto("1:" \ + JUMP_LABEL_INITIAL_NOP \ + ".pushsection __jump_table, \"a\" \n\t"\ + _ASM_PTR "1b, %l[" #label "], %c0 \n\t" \ + ".popsection \n\t" \ + : : "i" (key) : : label); \ + } while (0) + +#endif /* __KERNEL__ */ + +#ifdef CONFIG_X86_64 +typedef u64 jump_label_t; +#else +typedef u32 jump_label_t; +#endif + +struct jump_entry { + jump_label_t code; + jump_label_t target; + jump_label_t key; +}; + +#endif diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 502e53f999cf..c52e2eb40a1e 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -652,20 +652,6 @@ static inline struct kvm_mmu_page *page_header(hpa_t shadow_page) return (struct kvm_mmu_page *)page_private(page); } -static inline u16 kvm_read_fs(void) -{ - u16 seg; - asm("mov %%fs, %0" : "=g"(seg)); - return seg; -} - -static inline u16 kvm_read_gs(void) -{ - u16 seg; - asm("mov %%gs, %0" : "=g"(seg)); - return seg; -} - static inline u16 kvm_read_ldt(void) { u16 ldt; @@ -673,16 +659,6 @@ static inline u16 kvm_read_ldt(void) return ldt; } -static inline void kvm_load_fs(u16 sel) -{ - asm("mov %0, %%fs" : : "rm"(sel)); -} - -static inline void kvm_load_gs(u16 sel) -{ - asm("mov %0, %%gs" : : "rm"(sel)); -} - static inline void kvm_load_ldt(u16 sel) { asm("lldt %0" : : "rm"(sel)); diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h index def500776b16..a70cd216be5d 100644 --- a/arch/x86/include/asm/perf_event_p4.h +++ b/arch/x86/include/asm/perf_event_p4.h @@ -36,19 +36,6 @@ #define P4_ESCR_EMASK(v) ((v) << P4_ESCR_EVENTMASK_SHIFT) #define P4_ESCR_TAG(v) ((v) << P4_ESCR_TAG_SHIFT) -/* Non HT mask */ -#define P4_ESCR_MASK \ - (P4_ESCR_EVENT_MASK | \ - P4_ESCR_EVENTMASK_MASK | \ - P4_ESCR_TAG_MASK | \ - P4_ESCR_TAG_ENABLE | \ - P4_ESCR_T0_OS | \ - P4_ESCR_T0_USR) - -/* HT mask */ -#define P4_ESCR_MASK_HT \ - (P4_ESCR_MASK | P4_ESCR_T1_OS | P4_ESCR_T1_USR) - #define P4_CCCR_OVF 0x80000000U #define P4_CCCR_CASCADE 0x40000000U #define P4_CCCR_OVF_PMI_T0 0x04000000U @@ -70,23 +57,6 @@ #define P4_CCCR_THRESHOLD(v) ((v) << P4_CCCR_THRESHOLD_SHIFT) #define P4_CCCR_ESEL(v) ((v) << P4_CCCR_ESCR_SELECT_SHIFT) -/* Non HT mask */ -#define P4_CCCR_MASK \ - (P4_CCCR_OVF | \ - P4_CCCR_CASCADE | \ - P4_CCCR_OVF_PMI_T0 | \ - P4_CCCR_FORCE_OVF | \ - P4_CCCR_EDGE | \ - P4_CCCR_THRESHOLD_MASK | \ - P4_CCCR_COMPLEMENT | \ - P4_CCCR_COMPARE | \ - P4_CCCR_ESCR_SELECT_MASK | \ - P4_CCCR_ENABLE) - -/* HT mask */ -#define P4_CCCR_MASK_HT \ - (P4_CCCR_MASK | P4_CCCR_OVF_PMI_T1 | P4_CCCR_THREAD_ANY) - #define P4_GEN_ESCR_EMASK(class, name, bit) \ class##__##name = ((1 << bit) << P4_ESCR_EVENTMASK_SHIFT) #define P4_ESCR_EMASK_BIT(class, name) class##__##name @@ -127,6 +97,28 @@ #define P4_CONFIG_HT_SHIFT 63 #define P4_CONFIG_HT (1ULL << P4_CONFIG_HT_SHIFT) +/* + * The bits we allow to pass for RAW events + */ +#define P4_CONFIG_MASK_ESCR \ + P4_ESCR_EVENT_MASK | \ + P4_ESCR_EVENTMASK_MASK | \ + P4_ESCR_TAG_MASK | \ + P4_ESCR_TAG_ENABLE + +#define P4_CONFIG_MASK_CCCR \ + P4_CCCR_EDGE | \ + P4_CCCR_THRESHOLD_MASK | \ + P4_CCCR_COMPLEMENT | \ + P4_CCCR_COMPARE | \ + P4_CCCR_THREAD_ANY | \ + P4_CCCR_RESERVED + +/* some dangerous bits are reserved for kernel internals */ +#define P4_CONFIG_MASK \ + (p4_config_pack_escr(P4_CONFIG_MASK_ESCR)) | \ + (p4_config_pack_cccr(P4_CONFIG_MASK_CCCR)) + static inline bool p4_is_event_cascaded(u64 config) { u32 cccr = p4_config_unpack_cccr(config); diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index fedf32a8c3ec..7490bf8d1459 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -34,7 +34,8 @@ GCOV_PROFILE_paravirt.o := n obj-y := process_$(BITS).o signal.o entry_$(BITS).o obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o obj-y += time.o ioport.o ldt.o dumpstack.o -obj-y += setup.o x86_init.o i8259.o irqinit.o +obj-y += setup.o x86_init.o i8259.o irqinit.o jump_label.o +obj-$(CONFIG_IRQ_WORK) += irq_work.o obj-$(CONFIG_X86_VISWS) += visws_quirks.o obj-$(CONFIG_X86_32) += probe_roms_32.o obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index f65ab8b014c4..a36bb90aef53 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -195,7 +195,7 @@ static void __init_or_module add_nops(void *insns, unsigned int len) extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; extern s32 __smp_locks[], __smp_locks_end[]; -static void *text_poke_early(void *addr, const void *opcode, size_t len); +void *text_poke_early(void *addr, const void *opcode, size_t len); /* Replace instructions with better alternatives for this CPU type. This runs before SMP is initialized to avoid SMP problems with @@ -522,7 +522,7 @@ void __init alternative_instructions(void) * instructions. And on the local CPU you need to be protected again NMI or MCE * handlers seeing an inconsistent instruction while you patch. */ -static void *__init_or_module text_poke_early(void *addr, const void *opcode, +void *__init_or_module text_poke_early(void *addr, const void *opcode, size_t len) { unsigned long flags; @@ -637,7 +637,72 @@ void *__kprobes text_poke_smp(void *addr, const void *opcode, size_t len) tpp.len = len; atomic_set(&stop_machine_first, 1); wrote_text = 0; - stop_machine(stop_machine_text_poke, (void *)&tpp, NULL); + /* Use __stop_machine() because the caller already got online_cpus. */ + __stop_machine(stop_machine_text_poke, (void *)&tpp, NULL); return addr; } +#if defined(CONFIG_DYNAMIC_FTRACE) || defined(HAVE_JUMP_LABEL) + +unsigned char ideal_nop5[IDEAL_NOP_SIZE_5]; + +void __init arch_init_ideal_nop5(void) +{ + extern const unsigned char ftrace_test_p6nop[]; + extern const unsigned char ftrace_test_nop5[]; + extern const unsigned char ftrace_test_jmp[]; + int faulted = 0; + + /* + * There is no good nop for all x86 archs. + * We will default to using the P6_NOP5, but first we + * will test to make sure that the nop will actually + * work on this CPU. If it faults, we will then + * go to a lesser efficient 5 byte nop. If that fails + * we then just use a jmp as our nop. This isn't the most + * efficient nop, but we can not use a multi part nop + * since we would then risk being preempted in the middle + * of that nop, and if we enabled tracing then, it might + * cause a system crash. + * + * TODO: check the cpuid to determine the best nop. + */ + asm volatile ( + "ftrace_test_jmp:" + "jmp ftrace_test_p6nop\n" + "nop\n" + "nop\n" + "nop\n" /* 2 byte jmp + 3 bytes */ + "ftrace_test_p6nop:" + P6_NOP5 + "jmp 1f\n" + "ftrace_test_nop5:" + ".byte 0x66,0x66,0x66,0x66,0x90\n" + "1:" + ".section .fixup, \"ax\"\n" + "2: movl $1, %0\n" + " jmp ftrace_test_nop5\n" + "3: movl $2, %0\n" + " jmp 1b\n" + ".previous\n" + _ASM_EXTABLE(ftrace_test_p6nop, 2b) + _ASM_EXTABLE(ftrace_test_nop5, 3b) + : "=r"(faulted) : "0" (faulted)); + + switch (faulted) { + case 0: + pr_info("converting mcount calls to 0f 1f 44 00 00\n"); + memcpy(ideal_nop5, ftrace_test_p6nop, IDEAL_NOP_SIZE_5); + break; + case 1: + pr_info("converting mcount calls to 66 66 66 66 90\n"); + memcpy(ideal_nop5, ftrace_test_nop5, IDEAL_NOP_SIZE_5); + break; + case 2: + pr_info("converting mcount calls to jmp . + 5\n"); + memcpy(ideal_nop5, ftrace_test_jmp, IDEAL_NOP_SIZE_5); + break; + } + +} +#endif diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 679b6450382b..d2fdb0826df2 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2007-2009 Advanced Micro Devices, Inc. + * Copyright (C) 2007-2010 Advanced Micro Devices, Inc. * Author: Joerg Roedel <joerg.roedel@amd.com> * Leo Duran <leo.duran@amd.com> * diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 5a170cbbbed8..3cb482e123de 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2007-2009 Advanced Micro Devices, Inc. + * Copyright (C) 2007-2010 Advanced Micro Devices, Inc. * Author: Joerg Roedel <joerg.roedel@amd.com> * Leo Duran <leo.duran@amd.com> * @@ -194,6 +194,39 @@ static inline unsigned long tbl_size(int entry_size) return 1UL << shift; } +/* Access to l1 and l2 indexed register spaces */ + +static u32 iommu_read_l1(struct amd_iommu *iommu, u16 l1, u8 address) +{ + u32 val; + + pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16)); + pci_read_config_dword(iommu->dev, 0xfc, &val); + return val; +} + +static void iommu_write_l1(struct amd_iommu *iommu, u16 l1, u8 address, u32 val) +{ + pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16 | 1 << 31)); + pci_write_config_dword(iommu->dev, 0xfc, val); + pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16)); +} + +static u32 iommu_read_l2(struct amd_iommu *iommu, u8 address) +{ + u32 val; + + pci_write_config_dword(iommu->dev, 0xf0, address); + pci_read_config_dword(iommu->dev, 0xf4, &val); + return val; +} + +static void iommu_write_l2(struct amd_iommu *iommu, u8 address, u32 val) +{ + pci_write_config_dword(iommu->dev, 0xf0, (address | 1 << 8)); + pci_write_config_dword(iommu->dev, 0xf4, val); +} + /**************************************************************************** * * AMD IOMMU MMIO register space handling functions @@ -619,6 +652,7 @@ static void __init init_iommu_from_pci(struct amd_iommu *iommu) { int cap_ptr = iommu->cap_ptr; u32 range, misc; + int i, j; pci_read_config_dword(iommu->dev, cap_ptr + MMIO_CAP_HDR_OFFSET, &iommu->cap); @@ -633,12 +667,29 @@ static void __init init_iommu_from_pci(struct amd_iommu *iommu) MMIO_GET_LD(range)); iommu->evt_msi_num = MMIO_MSI_NUM(misc); - if (is_rd890_iommu(iommu->dev)) { - pci_read_config_dword(iommu->dev, 0xf0, &iommu->cache_cfg[0]); - pci_read_config_dword(iommu->dev, 0xf4, &iommu->cache_cfg[1]); - pci_read_config_dword(iommu->dev, 0xf8, &iommu->cache_cfg[2]); - pci_read_config_dword(iommu->dev, 0xfc, &iommu->cache_cfg[3]); - } + if (!is_rd890_iommu(iommu->dev)) + return; + + /* + * Some rd890 systems may not be fully reconfigured by the BIOS, so + * it's necessary for us to store this information so it can be + * reprogrammed on resume + */ + + pci_read_config_dword(iommu->dev, iommu->cap_ptr + 4, + &iommu->stored_addr_lo); + pci_read_config_dword(iommu->dev, iommu->cap_ptr + 8, + &iommu->stored_addr_hi); + + /* Low bit locks writes to configuration space */ + iommu->stored_addr_lo &= ~1; + + for (i = 0; i < 6; i++) + for (j = 0; j < 0x12; j++) + iommu->stored_l1[i][j] = iommu_read_l1(iommu, i, j); + + for (i = 0; i < 0x83; i++) + iommu->stored_l2[i] = iommu_read_l2(iommu, i); } /* @@ -1127,14 +1178,53 @@ static void iommu_init_flags(struct amd_iommu *iommu) iommu_feature_enable(iommu, CONTROL_COHERENT_EN); } -static void iommu_apply_quirks(struct amd_iommu *iommu) +static void iommu_apply_resume_quirks(struct amd_iommu *iommu) { - if (is_rd890_iommu(iommu->dev)) { - pci_write_config_dword(iommu->dev, 0xf0, iommu->cache_cfg[0]); - pci_write_config_dword(iommu->dev, 0xf4, iommu->cache_cfg[1]); - pci_write_config_dword(iommu->dev, 0xf8, iommu->cache_cfg[2]); - pci_write_config_dword(iommu->dev, 0xfc, iommu->cache_cfg[3]); - } + int i, j; + u32 ioc_feature_control; + struct pci_dev *pdev = NULL; + + /* RD890 BIOSes may not have completely reconfigured the iommu */ + if (!is_rd890_iommu(iommu->dev)) + return; + + /* + * First, we need to ensure that the iommu is enabled. This is + * controlled by a register in the northbridge + */ + pdev = pci_get_bus_and_slot(iommu->dev->bus->number, PCI_DEVFN(0, 0)); + + if (!pdev) + return; + + /* Select Northbridge indirect register 0x75 and enable writing */ + pci_write_config_dword(pdev, 0x60, 0x75 | (1 << 7)); + pci_read_config_dword(pdev, 0x64, &ioc_feature_control); + + /* Enable the iommu */ + if (!(ioc_feature_control & 0x1)) + pci_write_config_dword(pdev, 0x64, ioc_feature_control | 1); + + pci_dev_put(pdev); + + /* Restore the iommu BAR */ + pci_write_config_dword(iommu->dev, iommu->cap_ptr + 4, + iommu->stored_addr_lo); + pci_write_config_dword(iommu->dev, iommu->cap_ptr + 8, + iommu->stored_addr_hi); + + /* Restore the l1 indirect regs for each of the 6 l1s */ + for (i = 0; i < 6; i++) + for (j = 0; j < 0x12; j++) + iommu_write_l1(iommu, i, j, iommu->stored_l1[i][j]); + + /* Restore the l2 indirect regs */ + for (i = 0; i < 0x83; i++) + iommu_write_l2(iommu, i, iommu->stored_l2[i]); + + /* Lock PCI setup registers */ + pci_write_config_dword(iommu->dev, iommu->cap_ptr + 4, + iommu->stored_addr_lo | 1); } /* @@ -1147,7 +1237,6 @@ static void enable_iommus(void) for_each_iommu(iommu) { iommu_disable(iommu); - iommu_apply_quirks(iommu); iommu_init_flags(iommu); iommu_set_device_table(iommu); iommu_enable_command_buffer(iommu); @@ -1173,6 +1262,11 @@ static void disable_iommus(void) static int amd_iommu_resume(struct sys_device *dev) { + struct amd_iommu *iommu; + + for_each_iommu(iommu) + iommu_apply_resume_quirks(iommu); + /* re-load the hardware */ enable_iommus(); diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index a2e0caf26e17..c9cb17368448 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -307,7 +307,7 @@ void __init early_gart_iommu_check(void) continue; ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL); - aper_enabled = ctl & AMD64_GARTEN; + aper_enabled = ctl & GARTEN; aper_order = (ctl >> 1) & 7; aper_size = (32 * 1024 * 1024) << aper_order; aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff; @@ -362,7 +362,7 @@ void __init early_gart_iommu_check(void) continue; ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL); - ctl &= ~AMD64_GARTEN; + ctl &= ~GARTEN; write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl); } } @@ -505,8 +505,13 @@ out: /* Fix up the north bridges */ for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { - int bus; - int dev_base, dev_limit; + int bus, dev_base, dev_limit; + + /* + * Don't enable translation yet but enable GART IO and CPU + * accesses and set DISTLBWALKPRB since GART table memory is UC. + */ + u32 ctl = DISTLBWALKPRB | aper_order << 1; bus = bus_dev_ranges[i].bus; dev_base = bus_dev_ranges[i].dev_base; @@ -515,10 +520,7 @@ out: if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00))) continue; - /* Don't enable translation yet. That is done later. - Assume this BIOS didn't initialise the GART so - just overwrite all previous bits */ - write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, aper_order << 1); + write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl); write_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE, aper_alloc >> 25); } } diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 03a5b0385ad6..fe73c1844a9a 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -531,7 +531,7 @@ static int x86_pmu_hw_config(struct perf_event *event) /* * Setup the hardware configuration for a given attr_type */ -static int __hw_perf_event_init(struct perf_event *event) +static int __x86_pmu_event_init(struct perf_event *event) { int err; @@ -584,7 +584,7 @@ static void x86_pmu_disable_all(void) } } -void hw_perf_disable(void) +static void x86_pmu_disable(struct pmu *pmu) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); @@ -619,7 +619,7 @@ static void x86_pmu_enable_all(int added) } } -static const struct pmu pmu; +static struct pmu pmu; static inline int is_x86_event(struct perf_event *event) { @@ -801,10 +801,10 @@ static inline int match_prev_assignment(struct hw_perf_event *hwc, hwc->last_tag == cpuc->tags[i]; } -static int x86_pmu_start(struct perf_event *event); -static void x86_pmu_stop(struct perf_event *event); +static void x86_pmu_start(struct perf_event *event, int flags); +static void x86_pmu_stop(struct perf_event *event, int flags); -void hw_perf_enable(void) +static void x86_pmu_enable(struct pmu *pmu) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); struct perf_event *event; @@ -840,7 +840,14 @@ void hw_perf_enable(void) match_prev_assignment(hwc, cpuc, i)) continue; - x86_pmu_stop(event); + /* + * Ensure we don't accidentally enable a stopped + * counter simply because we rescheduled. + */ + if (hwc->state & PERF_HES_STOPPED) + hwc->state |= PERF_HES_ARCH; + + x86_pmu_stop(event, PERF_EF_UPDATE); } for (i = 0; i < cpuc->n_events; i++) { @@ -852,7 +859,10 @@ void hw_perf_enable(void) else if (i < n_running) continue; - x86_pmu_start(event); + if (hwc->state & PERF_HES_ARCH) + continue; + + x86_pmu_start(event, PERF_EF_RELOAD); } cpuc->n_added = 0; perf_events_lapic_init(); @@ -953,15 +963,12 @@ static void x86_pmu_enable_event(struct perf_event *event) } /* - * activate a single event + * Add a single event to the PMU. * * The event is added to the group of enabled events * but only if it can be scehduled with existing events. - * - * Called with PMU disabled. If successful and return value 1, - * then guaranteed to call perf_enable() and hw_perf_enable() */ -static int x86_pmu_enable(struct perf_event *event) +static int x86_pmu_add(struct perf_event *event, int flags) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); struct hw_perf_event *hwc; @@ -970,58 +977,67 @@ static int x86_pmu_enable(struct perf_event *event) hwc = &event->hw; + perf_pmu_disable(event->pmu); n0 = cpuc->n_events; - n = collect_events(cpuc, event, false); - if (n < 0) - return n; + ret = n = collect_events(cpuc, event, false); + if (ret < 0) + goto out; + + hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + if (!(flags & PERF_EF_START)) + hwc->state |= PERF_HES_ARCH; /* * If group events scheduling transaction was started, * skip the schedulability test here, it will be peformed - * at commit time(->commit_txn) as a whole + * at commit time (->commit_txn) as a whole */ if (cpuc->group_flag & PERF_EVENT_TXN) - goto out; + goto done_collect; ret = x86_pmu.schedule_events(cpuc, n, assign); if (ret) - return ret; + goto out; /* * copy new assignment, now we know it is possible * will be used by hw_perf_enable() */ memcpy(cpuc->assign, assign, n*sizeof(int)); -out: +done_collect: cpuc->n_events = n; cpuc->n_added += n - n0; cpuc->n_txn += n - n0; - return 0; + ret = 0; +out: + perf_pmu_enable(event->pmu); + return ret; } -static int x86_pmu_start(struct perf_event *event) +static void x86_pmu_start(struct perf_event *event, int flags) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); int idx = event->hw.idx; - if (idx == -1) - return -EAGAIN; + if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) + return; + + if (WARN_ON_ONCE(idx == -1)) + return; + + if (flags & PERF_EF_RELOAD) { + WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE)); + x86_perf_event_set_period(event); + } + + event->hw.state = 0; - x86_perf_event_set_period(event); cpuc->events[idx] = event; __set_bit(idx, cpuc->active_mask); __set_bit(idx, cpuc->running); x86_pmu.enable(event); perf_event_update_userpage(event); - - return 0; -} - -static void x86_pmu_unthrottle(struct perf_event *event) -{ - int ret = x86_pmu_start(event); - WARN_ON_ONCE(ret); } void perf_event_print_debug(void) @@ -1078,27 +1094,29 @@ void perf_event_print_debug(void) local_irq_restore(flags); } -static void x86_pmu_stop(struct perf_event *event) +static void x86_pmu_stop(struct perf_event *event, int flags) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); struct hw_perf_event *hwc = &event->hw; - int idx = hwc->idx; - if (!__test_and_clear_bit(idx, cpuc->active_mask)) - return; - - x86_pmu.disable(event); - - /* - * Drain the remaining delta count out of a event - * that we are disabling: - */ - x86_perf_event_update(event); + if (__test_and_clear_bit(hwc->idx, cpuc->active_mask)) { + x86_pmu.disable(event); + cpuc->events[hwc->idx] = NULL; + WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); + hwc->state |= PERF_HES_STOPPED; + } - cpuc->events[idx] = NULL; + if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { + /* + * Drain the remaining delta count out of a event + * that we are disabling: + */ + x86_perf_event_update(event); + hwc->state |= PERF_HES_UPTODATE; + } } -static void x86_pmu_disable(struct perf_event *event) +static void x86_pmu_del(struct perf_event *event, int flags) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); int i; @@ -1111,7 +1129,7 @@ static void x86_pmu_disable(struct perf_event *event) if (cpuc->group_flag & PERF_EVENT_TXN) return; - x86_pmu_stop(event); + x86_pmu_stop(event, PERF_EF_UPDATE); for (i = 0; i < cpuc->n_events; i++) { if (event == cpuc->event_list[i]) { @@ -1134,7 +1152,6 @@ static int x86_pmu_handle_irq(struct pt_regs *regs) struct perf_sample_data data; struct cpu_hw_events *cpuc; struct perf_event *event; - struct hw_perf_event *hwc; int idx, handled = 0; u64 val; @@ -1155,7 +1172,6 @@ static int x86_pmu_handle_irq(struct pt_regs *regs) } event = cpuc->events[idx]; - hwc = &event->hw; val = x86_perf_event_update(event); if (val & (1ULL << (x86_pmu.cntval_bits - 1))) @@ -1171,7 +1187,7 @@ static int x86_pmu_handle_irq(struct pt_regs *regs) continue; if (perf_event_overflow(event, 1, &data, regs)) - x86_pmu_stop(event); + x86_pmu_stop(event, 0); } if (handled) @@ -1180,25 +1196,6 @@ static int x86_pmu_handle_irq(struct pt_regs *regs) return handled; } -void smp_perf_pending_interrupt(struct pt_regs *regs) -{ - irq_enter(); - ack_APIC_irq(); - inc_irq_stat(apic_pending_irqs); - perf_event_do_pending(); - irq_exit(); -} - -void set_perf_event_pending(void) -{ -#ifdef CONFIG_X86_LOCAL_APIC - if (!x86_pmu.apic || !x86_pmu_initialized()) - return; - - apic->send_IPI_self(LOCAL_PENDING_VECTOR); -#endif -} - void perf_events_lapic_init(void) { if (!x86_pmu.apic || !x86_pmu_initialized()) @@ -1388,7 +1385,6 @@ void __init init_hw_perf_events(void) x86_pmu.num_counters = X86_PMC_MAX_GENERIC; } x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1; - perf_max_events = x86_pmu.num_counters; if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) { WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!", @@ -1424,6 +1420,7 @@ void __init init_hw_perf_events(void) pr_info("... fixed-purpose events: %d\n", x86_pmu.num_counters_fixed); pr_info("... event mask: %016Lx\n", x86_pmu.intel_ctrl); + perf_pmu_register(&pmu); perf_cpu_notifier(x86_pmu_notifier); } @@ -1437,10 +1434,11 @@ static inline void x86_pmu_read(struct perf_event *event) * Set the flag to make pmu::enable() not perform the * schedulability test, it will be performed at commit time */ -static void x86_pmu_start_txn(const struct pmu *pmu) +static void x86_pmu_start_txn(struct pmu *pmu) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + perf_pmu_disable(pmu); cpuc->group_flag |= PERF_EVENT_TXN; cpuc->n_txn = 0; } @@ -1450,7 +1448,7 @@ static void x86_pmu_start_txn(const struct pmu *pmu) * Clear the flag and pmu::enable() will perform the * schedulability test. */ -static void x86_pmu_cancel_txn(const struct pmu *pmu) +static void x86_pmu_cancel_txn(struct pmu *pmu) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); @@ -1460,6 +1458,7 @@ static void x86_pmu_cancel_txn(const struct pmu *pmu) */ cpuc->n_added -= cpuc->n_txn; cpuc->n_events -= cpuc->n_txn; + perf_pmu_enable(pmu); } /* @@ -1467,7 +1466,7 @@ static void x86_pmu_cancel_txn(const struct pmu *pmu) * Perform the group schedulability test as a whole * Return 0 if success */ -static int x86_pmu_commit_txn(const struct pmu *pmu) +static int x86_pmu_commit_txn(struct pmu *pmu) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); int assign[X86_PMC_IDX_MAX]; @@ -1489,22 +1488,10 @@ static int x86_pmu_commit_txn(const struct pmu *pmu) memcpy(cpuc->assign, assign, n*sizeof(int)); cpuc->group_flag &= ~PERF_EVENT_TXN; - + perf_pmu_enable(pmu); return 0; } -static const struct pmu pmu = { - .enable = x86_pmu_enable, - .disable = x86_pmu_disable, - .start = x86_pmu_start, - .stop = x86_pmu_stop, - .read = x86_pmu_read, - .unthrottle = x86_pmu_unthrottle, - .start_txn = x86_pmu_start_txn, - .cancel_txn = x86_pmu_cancel_txn, - .commit_txn = x86_pmu_commit_txn, -}; - /* * validate that we can schedule this event */ @@ -1579,12 +1566,22 @@ out: return ret; } -const struct pmu *hw_perf_event_init(struct perf_event *event) +int x86_pmu_event_init(struct perf_event *event) { - const struct pmu *tmp; + struct pmu *tmp; int err; - err = __hw_perf_event_init(event); + switch (event->attr.type) { + case PERF_TYPE_RAW: + case PERF_TYPE_HARDWARE: + case PERF_TYPE_HW_CACHE: + break; + + default: + return -ENOENT; + } + + err = __x86_pmu_event_init(event); if (!err) { /* * we temporarily connect event to its pmu @@ -1604,26 +1601,31 @@ const struct pmu *hw_perf_event_init(struct perf_event *event) if (err) { if (event->destroy) event->destroy(event); - return ERR_PTR(err); } - return &pmu; + return err; } -/* - * callchain support - */ +static struct pmu pmu = { + .pmu_enable = x86_pmu_enable, + .pmu_disable = x86_pmu_disable, -static inline -void callchain_store(struct perf_callchain_entry *entry, u64 ip) -{ - if (entry->nr < PERF_MAX_STACK_DEPTH) - entry->ip[entry->nr++] = ip; -} + .event_init = x86_pmu_event_init, -static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry); -static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry); + .add = x86_pmu_add, + .del = x86_pmu_del, + .start = x86_pmu_start, + .stop = x86_pmu_stop, + .read = x86_pmu_read, + .start_txn = x86_pmu_start_txn, + .cancel_txn = x86_pmu_cancel_txn, + .commit_txn = x86_pmu_commit_txn, +}; + +/* + * callchain support + */ static void backtrace_warning_symbol(void *data, char *msg, unsigned long symbol) @@ -1645,7 +1647,7 @@ static void backtrace_address(void *data, unsigned long addr, int reliable) { struct perf_callchain_entry *entry = data; - callchain_store(entry, addr); + perf_callchain_store(entry, addr); } static const struct stacktrace_ops backtrace_ops = { @@ -1656,11 +1658,15 @@ static const struct stacktrace_ops backtrace_ops = { .walk_stack = print_context_stack_bp, }; -static void -perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry) +void +perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs) { - callchain_store(entry, PERF_CONTEXT_KERNEL); - callchain_store(entry, regs->ip); + if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { + /* TODO: We don't support guest os callchain now */ + return; + } + + perf_callchain_store(entry, regs->ip); dump_trace(NULL, regs, NULL, regs->bp, &backtrace_ops, entry); } @@ -1689,7 +1695,7 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) if (fp < compat_ptr(regs->sp)) break; - callchain_store(entry, frame.return_address); + perf_callchain_store(entry, frame.return_address); fp = compat_ptr(frame.next_frame); } return 1; @@ -1702,19 +1708,20 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) } #endif -static void -perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry) +void +perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) { struct stack_frame frame; const void __user *fp; - if (!user_mode(regs)) - regs = task_pt_regs(current); + if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { + /* TODO: We don't support guest os callchain now */ + return; + } fp = (void __user *)regs->bp; - callchain_store(entry, PERF_CONTEXT_USER); - callchain_store(entry, regs->ip); + perf_callchain_store(entry, regs->ip); if (perf_callchain_user32(regs, entry)) return; @@ -1731,52 +1738,11 @@ perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry) if ((unsigned long)fp < regs->sp) break; - callchain_store(entry, frame.return_address); + perf_callchain_store(entry, frame.return_address); fp = frame.next_frame; } } -static void -perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry) -{ - int is_user; - - if (!regs) - return; - - is_user = user_mode(regs); - - if (is_user && current->state != TASK_RUNNING) - return; - - if (!is_user) - perf_callchain_kernel(regs, entry); - - if (current->mm) - perf_callchain_user(regs, entry); -} - -struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) -{ - struct perf_callchain_entry *entry; - - if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { - /* TODO: We don't support guest os callchain now */ - return NULL; - } - - if (in_nmi()) - entry = &__get_cpu_var(pmc_nmi_entry); - else - entry = &__get_cpu_var(pmc_irq_entry); - - entry->nr = 0; - - perf_do_callchain(regs, entry); - - return entry; -} - unsigned long perf_instruction_pointer(struct pt_regs *regs) { unsigned long ip; diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index c2897b7b4a3b..46d58448c3af 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -52,7 +52,7 @@ static __initconst const u64 amd_hw_cache_event_ids [ C(DTLB) ] = { [ C(OP_READ) ] = { [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */ - [ C(RESULT_MISS) ] = 0x0046, /* L1 DTLB and L2 DLTB Miss */ + [ C(RESULT_MISS) ] = 0x0746, /* L1_DTLB_AND_L2_DLTB_MISS.ALL */ }, [ C(OP_WRITE) ] = { [ C(RESULT_ACCESS) ] = 0, @@ -66,7 +66,7 @@ static __initconst const u64 amd_hw_cache_event_ids [ C(ITLB) ] = { [ C(OP_READ) ] = { [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fecthes */ - [ C(RESULT_MISS) ] = 0x0085, /* Instr. fetch ITLB misses */ + [ C(RESULT_MISS) ] = 0x0385, /* L1_ITLB_AND_L2_ITLB_MISS.ALL */ }, [ C(OP_WRITE) ] = { [ C(RESULT_ACCESS) ] = -1, diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index ee05c90012d2..c8f5c088cad1 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -713,18 +713,18 @@ static int intel_pmu_handle_irq(struct pt_regs *regs) struct cpu_hw_events *cpuc; int bit, loops; u64 status; - int handled = 0; + int handled; perf_sample_data_init(&data, 0); cpuc = &__get_cpu_var(cpu_hw_events); intel_pmu_disable_all(); - intel_pmu_drain_bts_buffer(); + handled = intel_pmu_drain_bts_buffer(); status = intel_pmu_get_status(); if (!status) { intel_pmu_enable_all(0); - return 0; + return handled; } loops = 0; @@ -763,7 +763,7 @@ again: data.period = event->hw.last_period; if (perf_event_overflow(event, 1, &data, regs)) - x86_pmu_stop(event); + x86_pmu_stop(event, 0); } /* diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 18018d1311cd..4977f9c400e5 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -214,7 +214,7 @@ static void intel_pmu_disable_bts(void) update_debugctlmsr(debugctlmsr); } -static void intel_pmu_drain_bts_buffer(void) +static int intel_pmu_drain_bts_buffer(void) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); struct debug_store *ds = cpuc->ds; @@ -231,16 +231,16 @@ static void intel_pmu_drain_bts_buffer(void) struct pt_regs regs; if (!event) - return; + return 0; if (!ds) - return; + return 0; at = (struct bts_record *)(unsigned long)ds->bts_buffer_base; top = (struct bts_record *)(unsigned long)ds->bts_index; if (top <= at) - return; + return 0; ds->bts_index = ds->bts_buffer_base; @@ -256,7 +256,7 @@ static void intel_pmu_drain_bts_buffer(void) perf_prepare_sample(&header, &data, event, ®s); if (perf_output_begin(&handle, event, header.size * (top - at), 1, 1)) - return; + return 1; for (; at < top; at++) { data.ip = at->from; @@ -270,6 +270,7 @@ static void intel_pmu_drain_bts_buffer(void) /* There's new data available. */ event->hw.interrupts++; event->pending_kill = POLL_IN; + return 1; } /* @@ -491,7 +492,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event, regs.flags &= ~PERF_EFLAGS_EXACT; if (perf_event_overflow(event, 1, &data, ®s)) - x86_pmu_stop(event); + x86_pmu_stop(event, 0); } static void intel_pmu_drain_pebs_core(struct pt_regs *iregs) diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index 249015173992..81400b93e694 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -18,6 +18,8 @@ struct p4_event_bind { unsigned int opcode; /* Event code and ESCR selector */ unsigned int escr_msr[2]; /* ESCR MSR for this event */ + unsigned int escr_emask; /* valid ESCR EventMask bits */ + unsigned int shared; /* event is shared across threads */ char cntr[2][P4_CNTR_LIMIT]; /* counter index (offset), -1 on abscence */ }; @@ -66,231 +68,435 @@ static struct p4_event_bind p4_event_bind_map[] = { [P4_EVENT_TC_DELIVER_MODE] = { .opcode = P4_OPCODE(P4_EVENT_TC_DELIVER_MODE), .escr_msr = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DD) | + P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DB) | + P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DI) | + P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BD) | + P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BB) | + P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BI) | + P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, ID), + .shared = 1, .cntr = { {4, 5, -1}, {6, 7, -1} }, }, [P4_EVENT_BPU_FETCH_REQUEST] = { .opcode = P4_OPCODE(P4_EVENT_BPU_FETCH_REQUEST), .escr_msr = { MSR_P4_BPU_ESCR0, MSR_P4_BPU_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_BPU_FETCH_REQUEST, TCMISS), .cntr = { {0, -1, -1}, {2, -1, -1} }, }, [P4_EVENT_ITLB_REFERENCE] = { .opcode = P4_OPCODE(P4_EVENT_ITLB_REFERENCE), .escr_msr = { MSR_P4_ITLB_ESCR0, MSR_P4_ITLB_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, HIT) | + P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, MISS) | + P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, HIT_UK), .cntr = { {0, -1, -1}, {2, -1, -1} }, }, [P4_EVENT_MEMORY_CANCEL] = { .opcode = P4_OPCODE(P4_EVENT_MEMORY_CANCEL), .escr_msr = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_CANCEL, ST_RB_FULL) | + P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_CANCEL, 64K_CONF), .cntr = { {8, 9, -1}, {10, 11, -1} }, }, [P4_EVENT_MEMORY_COMPLETE] = { .opcode = P4_OPCODE(P4_EVENT_MEMORY_COMPLETE), .escr_msr = { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_COMPLETE, LSC) | + P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_COMPLETE, SSC), .cntr = { {8, 9, -1}, {10, 11, -1} }, }, [P4_EVENT_LOAD_PORT_REPLAY] = { .opcode = P4_OPCODE(P4_EVENT_LOAD_PORT_REPLAY), .escr_msr = { MSR_P4_SAAT_ESCR0, MSR_P4_SAAT_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_LOAD_PORT_REPLAY, SPLIT_LD), .cntr = { {8, 9, -1}, {10, 11, -1} }, }, [P4_EVENT_STORE_PORT_REPLAY] = { .opcode = P4_OPCODE(P4_EVENT_STORE_PORT_REPLAY), .escr_msr = { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_STORE_PORT_REPLAY, SPLIT_ST), .cntr = { {8, 9, -1}, {10, 11, -1} }, }, [P4_EVENT_MOB_LOAD_REPLAY] = { .opcode = P4_OPCODE(P4_EVENT_MOB_LOAD_REPLAY), .escr_msr = { MSR_P4_MOB_ESCR0, MSR_P4_MOB_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, NO_STA) | + P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, NO_STD) | + P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, PARTIAL_DATA) | + P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, UNALGN_ADDR), .cntr = { {0, -1, -1}, {2, -1, -1} }, }, [P4_EVENT_PAGE_WALK_TYPE] = { .opcode = P4_OPCODE(P4_EVENT_PAGE_WALK_TYPE), .escr_msr = { MSR_P4_PMH_ESCR0, MSR_P4_PMH_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_PAGE_WALK_TYPE, DTMISS) | + P4_ESCR_EMASK_BIT(P4_EVENT_PAGE_WALK_TYPE, ITMISS), + .shared = 1, .cntr = { {0, -1, -1}, {2, -1, -1} }, }, [P4_EVENT_BSQ_CACHE_REFERENCE] = { .opcode = P4_OPCODE(P4_EVENT_BSQ_CACHE_REFERENCE), .escr_msr = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITS) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITM) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITS) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITM) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_MISS) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_MISS) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, WR_2ndL_MISS), .cntr = { {0, -1, -1}, {2, -1, -1} }, }, [P4_EVENT_IOQ_ALLOCATION] = { .opcode = P4_OPCODE(P4_EVENT_IOQ_ALLOCATION), .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, DEFAULT) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, ALL_READ) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, ALL_WRITE) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_UC) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WC) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WT) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WP) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WB) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, OWN) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, OTHER) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, PREFETCH), .cntr = { {0, -1, -1}, {2, -1, -1} }, }, [P4_EVENT_IOQ_ACTIVE_ENTRIES] = { /* shared ESCR */ .opcode = P4_OPCODE(P4_EVENT_IOQ_ACTIVE_ENTRIES), .escr_msr = { MSR_P4_FSB_ESCR1, MSR_P4_FSB_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, DEFAULT) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, ALL_READ) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, ALL_WRITE) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_UC) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WC) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WT) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WP) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WB) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, OWN) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, OTHER) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, PREFETCH), .cntr = { {2, -1, -1}, {3, -1, -1} }, }, [P4_EVENT_FSB_DATA_ACTIVITY] = { .opcode = P4_OPCODE(P4_EVENT_FSB_DATA_ACTIVITY), .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_DRV) | + P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OWN) | + P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OTHER) | + P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_DRV) | + P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_OWN) | + P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_OTHER), + .shared = 1, .cntr = { {0, -1, -1}, {2, -1, -1} }, }, [P4_EVENT_BSQ_ALLOCATION] = { /* shared ESCR, broken CCCR1 */ .opcode = P4_OPCODE(P4_EVENT_BSQ_ALLOCATION), .escr_msr = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR0 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_TYPE0) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_TYPE1) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LEN0) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LEN1) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_IO_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LOCK_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_CACHE_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_SPLIT_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_DEM_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_ORD_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE0) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE1) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE2), .cntr = { {0, -1, -1}, {1, -1, -1} }, }, [P4_EVENT_BSQ_ACTIVE_ENTRIES] = { /* shared ESCR */ .opcode = P4_OPCODE(P4_EVENT_BSQ_ACTIVE_ENTRIES), .escr_msr = { MSR_P4_BSU_ESCR1 , MSR_P4_BSU_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_TYPE0) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_TYPE1) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LEN0) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LEN1) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_IO_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LOCK_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_CACHE_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_SPLIT_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_DEM_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_ORD_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE0) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE1) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE2), .cntr = { {2, -1, -1}, {3, -1, -1} }, }, [P4_EVENT_SSE_INPUT_ASSIST] = { .opcode = P4_OPCODE(P4_EVENT_SSE_INPUT_ASSIST), .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_SSE_INPUT_ASSIST, ALL), + .shared = 1, .cntr = { {8, 9, -1}, {10, 11, -1} }, }, [P4_EVENT_PACKED_SP_UOP] = { .opcode = P4_OPCODE(P4_EVENT_PACKED_SP_UOP), .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_PACKED_SP_UOP, ALL), + .shared = 1, .cntr = { {8, 9, -1}, {10, 11, -1} }, }, [P4_EVENT_PACKED_DP_UOP] = { .opcode = P4_OPCODE(P4_EVENT_PACKED_DP_UOP), .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_PACKED_DP_UOP, ALL), + .shared = 1, .cntr = { {8, 9, -1}, {10, 11, -1} }, }, [P4_EVENT_SCALAR_SP_UOP] = { .opcode = P4_OPCODE(P4_EVENT_SCALAR_SP_UOP), .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_SCALAR_SP_UOP, ALL), + .shared = 1, .cntr = { {8, 9, -1}, {10, 11, -1} }, }, [P4_EVENT_SCALAR_DP_UOP] = { .opcode = P4_OPCODE(P4_EVENT_SCALAR_DP_UOP), .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_SCALAR_DP_UOP, ALL), + .shared = 1, .cntr = { {8, 9, -1}, {10, 11, -1} }, }, [P4_EVENT_64BIT_MMX_UOP] = { .opcode = P4_OPCODE(P4_EVENT_64BIT_MMX_UOP), .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_64BIT_MMX_UOP, ALL), + .shared = 1, .cntr = { {8, 9, -1}, {10, 11, -1} }, }, [P4_EVENT_128BIT_MMX_UOP] = { .opcode = P4_OPCODE(P4_EVENT_128BIT_MMX_UOP), .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_128BIT_MMX_UOP, ALL), + .shared = 1, .cntr = { {8, 9, -1}, {10, 11, -1} }, }, [P4_EVENT_X87_FP_UOP] = { .opcode = P4_OPCODE(P4_EVENT_X87_FP_UOP), .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_X87_FP_UOP, ALL), + .shared = 1, .cntr = { {8, 9, -1}, {10, 11, -1} }, }, [P4_EVENT_TC_MISC] = { .opcode = P4_OPCODE(P4_EVENT_TC_MISC), .escr_msr = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_TC_MISC, FLUSH), .cntr = { {4, 5, -1}, {6, 7, -1} }, }, [P4_EVENT_GLOBAL_POWER_EVENTS] = { .opcode = P4_OPCODE(P4_EVENT_GLOBAL_POWER_EVENTS), .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING), .cntr = { {0, -1, -1}, {2, -1, -1} }, }, [P4_EVENT_TC_MS_XFER] = { .opcode = P4_OPCODE(P4_EVENT_TC_MS_XFER), .escr_msr = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_TC_MS_XFER, CISC), .cntr = { {4, 5, -1}, {6, 7, -1} }, }, [P4_EVENT_UOP_QUEUE_WRITES] = { .opcode = P4_OPCODE(P4_EVENT_UOP_QUEUE_WRITES), .escr_msr = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_TC_BUILD) | + P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_TC_DELIVER) | + P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_ROM), .cntr = { {4, 5, -1}, {6, 7, -1} }, }, [P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE] = { .opcode = P4_OPCODE(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE), .escr_msr = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR0 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, CONDITIONAL) | + P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, CALL) | + P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, RETURN) | + P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, INDIRECT), .cntr = { {4, 5, -1}, {6, 7, -1} }, }, [P4_EVENT_RETIRED_BRANCH_TYPE] = { .opcode = P4_OPCODE(P4_EVENT_RETIRED_BRANCH_TYPE), .escr_msr = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CONDITIONAL) | + P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CALL) | + P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, RETURN) | + P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, INDIRECT), .cntr = { {4, 5, -1}, {6, 7, -1} }, }, [P4_EVENT_RESOURCE_STALL] = { .opcode = P4_OPCODE(P4_EVENT_RESOURCE_STALL), .escr_msr = { MSR_P4_ALF_ESCR0, MSR_P4_ALF_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_RESOURCE_STALL, SBFULL), .cntr = { {12, 13, 16}, {14, 15, 17} }, }, [P4_EVENT_WC_BUFFER] = { .opcode = P4_OPCODE(P4_EVENT_WC_BUFFER), .escr_msr = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_WC_BUFFER, WCB_EVICTS) | + P4_ESCR_EMASK_BIT(P4_EVENT_WC_BUFFER, WCB_FULL_EVICTS), + .shared = 1, .cntr = { {8, 9, -1}, {10, 11, -1} }, }, [P4_EVENT_B2B_CYCLES] = { .opcode = P4_OPCODE(P4_EVENT_B2B_CYCLES), .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, + .escr_emask = 0, .cntr = { {0, -1, -1}, {2, -1, -1} }, }, [P4_EVENT_BNR] = { .opcode = P4_OPCODE(P4_EVENT_BNR), .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, + .escr_emask = 0, .cntr = { {0, -1, -1}, {2, -1, -1} }, }, [P4_EVENT_SNOOP] = { .opcode = P4_OPCODE(P4_EVENT_SNOOP), .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, + .escr_emask = 0, .cntr = { {0, -1, -1}, {2, -1, -1} }, }, [P4_EVENT_RESPONSE] = { .opcode = P4_OPCODE(P4_EVENT_RESPONSE), .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, + .escr_emask = 0, .cntr = { {0, -1, -1}, {2, -1, -1} }, }, [P4_EVENT_FRONT_END_EVENT] = { .opcode = P4_OPCODE(P4_EVENT_FRONT_END_EVENT), .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_FRONT_END_EVENT, NBOGUS) | + P4_ESCR_EMASK_BIT(P4_EVENT_FRONT_END_EVENT, BOGUS), .cntr = { {12, 13, 16}, {14, 15, 17} }, }, [P4_EVENT_EXECUTION_EVENT] = { .opcode = P4_OPCODE(P4_EVENT_EXECUTION_EVENT), .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS2) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS3) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS0) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS1) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS2) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS3), .cntr = { {12, 13, 16}, {14, 15, 17} }, }, [P4_EVENT_REPLAY_EVENT] = { .opcode = P4_OPCODE(P4_EVENT_REPLAY_EVENT), .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_REPLAY_EVENT, NBOGUS) | + P4_ESCR_EMASK_BIT(P4_EVENT_REPLAY_EVENT, BOGUS), .cntr = { {12, 13, 16}, {14, 15, 17} }, }, [P4_EVENT_INSTR_RETIRED] = { .opcode = P4_OPCODE(P4_EVENT_INSTR_RETIRED), .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSNTAG) | + P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSTAG) | + P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSNTAG) | + P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSTAG), .cntr = { {12, 13, 16}, {14, 15, 17} }, }, [P4_EVENT_UOPS_RETIRED] = { .opcode = P4_OPCODE(P4_EVENT_UOPS_RETIRED), .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_UOPS_RETIRED, NBOGUS) | + P4_ESCR_EMASK_BIT(P4_EVENT_UOPS_RETIRED, BOGUS), .cntr = { {12, 13, 16}, {14, 15, 17} }, }, [P4_EVENT_UOP_TYPE] = { .opcode = P4_OPCODE(P4_EVENT_UOP_TYPE), .escr_msr = { MSR_P4_RAT_ESCR0, MSR_P4_RAT_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_UOP_TYPE, TAGLOADS) | + P4_ESCR_EMASK_BIT(P4_EVENT_UOP_TYPE, TAGSTORES), .cntr = { {12, 13, 16}, {14, 15, 17} }, }, [P4_EVENT_BRANCH_RETIRED] = { .opcode = P4_OPCODE(P4_EVENT_BRANCH_RETIRED), .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMNP) | + P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMNM) | + P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMTP) | + P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMTM), .cntr = { {12, 13, 16}, {14, 15, 17} }, }, [P4_EVENT_MISPRED_BRANCH_RETIRED] = { .opcode = P4_OPCODE(P4_EVENT_MISPRED_BRANCH_RETIRED), .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS), .cntr = { {12, 13, 16}, {14, 15, 17} }, }, [P4_EVENT_X87_ASSIST] = { .opcode = P4_OPCODE(P4_EVENT_X87_ASSIST), .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, FPSU) | + P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, FPSO) | + P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, POAO) | + P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, POAU) | + P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, PREA), .cntr = { {12, 13, 16}, {14, 15, 17} }, }, [P4_EVENT_MACHINE_CLEAR] = { .opcode = P4_OPCODE(P4_EVENT_MACHINE_CLEAR), .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, CLEAR) | + P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, MOCLEAR) | + P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, SMCLEAR), .cntr = { {12, 13, 16}, {14, 15, 17} }, }, [P4_EVENT_INSTR_COMPLETED] = { .opcode = P4_OPCODE(P4_EVENT_INSTR_COMPLETED), .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_COMPLETED, NBOGUS) | + P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_COMPLETED, BOGUS), .cntr = { {12, 13, 16}, {14, 15, 17} }, }, }; @@ -428,29 +634,73 @@ static u64 p4_pmu_event_map(int hw_event) return config; } +/* check cpu model specifics */ +static bool p4_event_match_cpu_model(unsigned int event_idx) +{ + /* INSTR_COMPLETED event only exist for model 3, 4, 6 (Prescott) */ + if (event_idx == P4_EVENT_INSTR_COMPLETED) { + if (boot_cpu_data.x86_model != 3 && + boot_cpu_data.x86_model != 4 && + boot_cpu_data.x86_model != 6) + return false; + } + + /* + * For info + * - IQ_ESCR0, IQ_ESCR1 only for models 1 and 2 + */ + + return true; +} + static int p4_validate_raw_event(struct perf_event *event) { - unsigned int v; + unsigned int v, emask; - /* user data may have out-of-bound event index */ + /* User data may have out-of-bound event index */ v = p4_config_unpack_event(event->attr.config); - if (v >= ARRAY_SIZE(p4_event_bind_map)) { - pr_warning("P4 PMU: Unknown event code: %d\n", v); + if (v >= ARRAY_SIZE(p4_event_bind_map)) + return -EINVAL; + + /* It may be unsupported: */ + if (!p4_event_match_cpu_model(v)) return -EINVAL; + + /* + * NOTE: P4_CCCR_THREAD_ANY has not the same meaning as + * in Architectural Performance Monitoring, it means not + * on _which_ logical cpu to count but rather _when_, ie it + * depends on logical cpu state -- count event if one cpu active, + * none, both or any, so we just allow user to pass any value + * desired. + * + * In turn we always set Tx_OS/Tx_USR bits bound to logical + * cpu without their propagation to another cpu + */ + + /* + * if an event is shared accross the logical threads + * the user needs special permissions to be able to use it + */ + if (p4_event_bind_map[v].shared) { + if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) + return -EACCES; } + /* ESCR EventMask bits may be invalid */ + emask = p4_config_unpack_escr(event->attr.config) & P4_ESCR_EVENTMASK_MASK; + if (emask & ~p4_event_bind_map[v].escr_emask) + return -EINVAL; + /* - * it may have some screwed PEBS bits + * it may have some invalid PEBS bits */ - if (p4_config_pebs_has(event->attr.config, P4_PEBS_CONFIG_ENABLE)) { - pr_warning("P4 PMU: PEBS are not supported yet\n"); + if (p4_config_pebs_has(event->attr.config, P4_PEBS_CONFIG_ENABLE)) return -EINVAL; - } + v = p4_config_unpack_metric(event->attr.config); - if (v >= ARRAY_SIZE(p4_pebs_bind_map)) { - pr_warning("P4 PMU: Unknown metric code: %d\n", v); + if (v >= ARRAY_SIZE(p4_pebs_bind_map)) return -EINVAL; - } return 0; } @@ -478,27 +728,21 @@ static int p4_hw_config(struct perf_event *event) if (event->attr.type == PERF_TYPE_RAW) { + /* + * Clear bits we reserve to be managed by kernel itself + * and never allowed from a user space + */ + event->attr.config &= P4_CONFIG_MASK; + rc = p4_validate_raw_event(event); if (rc) goto out; /* - * We don't control raw events so it's up to the caller - * to pass sane values (and we don't count the thread number - * on HT machine but allow HT-compatible specifics to be - * passed on) - * * Note that for RAW events we allow user to use P4_CCCR_RESERVED * bits since we keep additional info here (for cache events and etc) - * - * XXX: HT wide things should check perf_paranoid_cpu() && - * CAP_SYS_ADMIN */ - event->hw.config |= event->attr.config & - (p4_config_pack_escr(P4_ESCR_MASK_HT) | - p4_config_pack_cccr(P4_CCCR_MASK_HT | P4_CCCR_RESERVED)); - - event->hw.config &= ~P4_CCCR_FORCE_OVF; + event->hw.config |= event->attr.config; } rc = x86_setup_perfctr(event); diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 17be5ec7cbba..c375c79065f8 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1023,9 +1023,9 @@ apicinterrupt ERROR_APIC_VECTOR \ apicinterrupt SPURIOUS_APIC_VECTOR \ spurious_interrupt smp_spurious_interrupt -#ifdef CONFIG_PERF_EVENTS -apicinterrupt LOCAL_PENDING_VECTOR \ - perf_pending_interrupt smp_perf_pending_interrupt +#ifdef CONFIG_IRQ_WORK +apicinterrupt IRQ_WORK_VECTOR \ + irq_work_interrupt smp_irq_work_interrupt #endif /* diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index cd37469b54ee..3afb33f14d2d 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -257,14 +257,9 @@ do_ftrace_mod_code(unsigned long ip, void *new_code) return mod_code_status; } - - - -static unsigned char ftrace_nop[MCOUNT_INSN_SIZE]; - static unsigned char *ftrace_nop_replace(void) { - return ftrace_nop; + return ideal_nop5; } static int @@ -338,62 +333,6 @@ int ftrace_update_ftrace_func(ftrace_func_t func) int __init ftrace_dyn_arch_init(void *data) { - extern const unsigned char ftrace_test_p6nop[]; - extern const unsigned char ftrace_test_nop5[]; - extern const unsigned char ftrace_test_jmp[]; - int faulted = 0; - - /* - * There is no good nop for all x86 archs. - * We will default to using the P6_NOP5, but first we - * will test to make sure that the nop will actually - * work on this CPU. If it faults, we will then - * go to a lesser efficient 5 byte nop. If that fails - * we then just use a jmp as our nop. This isn't the most - * efficient nop, but we can not use a multi part nop - * since we would then risk being preempted in the middle - * of that nop, and if we enabled tracing then, it might - * cause a system crash. - * - * TODO: check the cpuid to determine the best nop. - */ - asm volatile ( - "ftrace_test_jmp:" - "jmp ftrace_test_p6nop\n" - "nop\n" - "nop\n" - "nop\n" /* 2 byte jmp + 3 bytes */ - "ftrace_test_p6nop:" - P6_NOP5 - "jmp 1f\n" - "ftrace_test_nop5:" - ".byte 0x66,0x66,0x66,0x66,0x90\n" - "1:" - ".section .fixup, \"ax\"\n" - "2: movl $1, %0\n" - " jmp ftrace_test_nop5\n" - "3: movl $2, %0\n" - " jmp 1b\n" - ".previous\n" - _ASM_EXTABLE(ftrace_test_p6nop, 2b) - _ASM_EXTABLE(ftrace_test_nop5, 3b) - : "=r"(faulted) : "0" (faulted)); - - switch (faulted) { - case 0: - pr_info("converting mcount calls to 0f 1f 44 00 00\n"); - memcpy(ftrace_nop, ftrace_test_p6nop, MCOUNT_INSN_SIZE); - break; - case 1: - pr_info("converting mcount calls to 66 66 66 66 90\n"); - memcpy(ftrace_nop, ftrace_test_nop5, MCOUNT_INSN_SIZE); - break; - case 2: - pr_info("converting mcount calls to jmp . + 5\n"); - memcpy(ftrace_nop, ftrace_test_jmp, MCOUNT_INSN_SIZE); - break; - } - /* The return code is retured via data */ *(unsigned long *)data = 0; diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 91fd0c70a18a..44edb03fc9ec 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -67,10 +67,10 @@ static int show_other_interrupts(struct seq_file *p, int prec) for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs); seq_printf(p, " Performance monitoring interrupts\n"); - seq_printf(p, "%*s: ", prec, "PND"); + seq_printf(p, "%*s: ", prec, "IWI"); for_each_online_cpu(j) - seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs); - seq_printf(p, " Performance pending work\n"); + seq_printf(p, "%10u ", irq_stats(j)->apic_irq_work_irqs); + seq_printf(p, " IRQ work interrupts\n"); #endif if (x86_platform_ipi_callback) { seq_printf(p, "%*s: ", prec, "PLT"); @@ -185,7 +185,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu) sum += irq_stats(cpu)->apic_timer_irqs; sum += irq_stats(cpu)->irq_spurious_count; sum += irq_stats(cpu)->apic_perf_irqs; - sum += irq_stats(cpu)->apic_pending_irqs; + sum += irq_stats(cpu)->apic_irq_work_irqs; #endif if (x86_platform_ipi_callback) sum += irq_stats(cpu)->x86_platform_ipis; diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c new file mode 100644 index 000000000000..ca8f703a1e70 --- /dev/null +++ b/arch/x86/kernel/irq_work.c @@ -0,0 +1,30 @@ +/* + * x86 specific code for irq_work + * + * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + */ + +#include <linux/kernel.h> +#include <linux/irq_work.h> +#include <linux/hardirq.h> +#include <asm/apic.h> + +void smp_irq_work_interrupt(struct pt_regs *regs) +{ + irq_enter(); + ack_APIC_irq(); + inc_irq_stat(apic_irq_work_irqs); + irq_work_run(); + irq_exit(); +} + +void arch_irq_work_raise(void) +{ +#ifdef CONFIG_X86_LOCAL_APIC + if (!cpu_has_apic) + return; + + apic->send_IPI_self(IRQ_WORK_VECTOR); + apic_wait_icr_idle(); +#endif +} diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 990ae7cfc578..713969b9266b 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -224,9 +224,9 @@ static void __init apic_intr_init(void) alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); - /* Performance monitoring interrupts: */ -# ifdef CONFIG_PERF_EVENTS - alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt); + /* IRQ work interrupts: */ +# ifdef CONFIG_IRQ_WORK + alloc_intr_gate(IRQ_WORK_VECTOR, irq_work_interrupt); # endif #endif diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c new file mode 100644 index 000000000000..961b6b30ba90 --- /dev/null +++ b/arch/x86/kernel/jump_label.c @@ -0,0 +1,50 @@ +/* + * jump label x86 support + * + * Copyright (C) 2009 Jason Baron <jbaron@redhat.com> + * + */ +#include <linux/jump_label.h> +#include <linux/memory.h> +#include <linux/uaccess.h> +#include <linux/module.h> +#include <linux/list.h> +#include <linux/jhash.h> +#include <linux/cpu.h> +#include <asm/kprobes.h> +#include <asm/alternative.h> + +#ifdef HAVE_JUMP_LABEL + +union jump_code_union { + char code[JUMP_LABEL_NOP_SIZE]; + struct { + char jump; + int offset; + } __attribute__((packed)); +}; + +void arch_jump_label_transform(struct jump_entry *entry, + enum jump_label_type type) +{ + union jump_code_union code; + + if (type == JUMP_LABEL_ENABLE) { + code.jump = 0xe9; + code.offset = entry->target - + (entry->code + JUMP_LABEL_NOP_SIZE); + } else + memcpy(&code, ideal_nop5, JUMP_LABEL_NOP_SIZE); + get_online_cpus(); + mutex_lock(&text_mutex); + text_poke_smp((void *)entry->code, &code, JUMP_LABEL_NOP_SIZE); + mutex_unlock(&text_mutex); + put_online_cpus(); +} + +void arch_jump_label_text_poke_early(jump_label_t addr) +{ + text_poke_early((void *)addr, ideal_nop5, JUMP_LABEL_NOP_SIZE); +} + +#endif diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index 770ebfb349e9..1cbd54c0df99 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -230,9 +230,6 @@ static int recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr) return 0; } -/* Dummy buffers for kallsyms_lookup */ -static char __dummy_buf[KSYM_NAME_LEN]; - /* Check if paddr is at an instruction boundary */ static int __kprobes can_probe(unsigned long paddr) { @@ -241,7 +238,7 @@ static int __kprobes can_probe(unsigned long paddr) struct insn insn; kprobe_opcode_t buf[MAX_INSN_SIZE]; - if (!kallsyms_lookup(paddr, NULL, &offset, NULL, __dummy_buf)) + if (!kallsyms_lookup_size_offset(paddr, NULL, &offset)) return 0; /* Decode instructions */ @@ -1129,7 +1126,7 @@ static void __kprobes synthesize_set_arg1(kprobe_opcode_t *addr, *(unsigned long *)addr = val; } -void __kprobes kprobes_optinsn_template_holder(void) +static void __used __kprobes kprobes_optinsn_template_holder(void) { asm volatile ( ".global optprobe_template_entry\n" @@ -1221,7 +1218,8 @@ static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src) } /* Check whether the address range is reserved */ if (ftrace_text_reserved(src, src + len - 1) || - alternatives_text_reserved(src, src + len - 1)) + alternatives_text_reserved(src, src + len - 1) || + jump_label_text_reserved(src, src + len - 1)) return -EBUSY; return len; @@ -1269,11 +1267,9 @@ static int __kprobes can_optimize(unsigned long paddr) unsigned long addr, size = 0, offset = 0; struct insn insn; kprobe_opcode_t buf[MAX_INSN_SIZE]; - /* Dummy buffers for lookup_symbol_attrs */ - static char __dummy_buf[KSYM_NAME_LEN]; /* Lookup symbol including addr */ - if (!kallsyms_lookup(paddr, &size, &offset, NULL, __dummy_buf)) + if (!kallsyms_lookup_size_offset(paddr, &size, &offset)) return 0; /* Check there is enough space for a relative jump. */ diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index 1c355c550960..8f2956091735 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -239,6 +239,9 @@ int module_finalize(const Elf_Ehdr *hdr, apply_paravirt(pseg, pseg + para->sh_size); } + /* make jump label nops */ + jump_label_apply_nops(me); + return 0; } diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index 0f7f130caa67..6015ee13e22b 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -601,7 +601,7 @@ static void gart_fixup_northbridges(struct sys_device *dev) * Don't enable translations just yet. That is the next * step. Restore the pre-suspend aperture settings. */ - pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, aperture_order << 1); + gart_set_size_and_enable(dev, aperture_order); pci_write_config_dword(dev, AMD64_GARTAPERTUREBASE, aperture_alloc >> 25); } } diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index c3a4fbb2b996..00e167870f71 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -112,6 +112,7 @@ #include <asm/numa_64.h> #endif #include <asm/mce.h> +#include <asm/alternative.h> /* * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. @@ -726,6 +727,7 @@ void __init setup_arch(char **cmdline_p) { int acpi = 0; int k8 = 0; + unsigned long flags; #ifdef CONFIG_X86_32 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); @@ -1071,6 +1073,10 @@ void __init setup_arch(char **cmdline_p) x86_init.oem.banner(); mcheck_init(); + + local_irq_save(flags); + arch_init_ideal_nop5(); + local_irq_restore(flags); } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 81ed28cb36e6..8a3f9f64f86f 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3163,8 +3163,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) sync_lapic_to_cr8(vcpu); save_host_msrs(vcpu); - fs_selector = kvm_read_fs(); - gs_selector = kvm_read_gs(); + savesegment(fs, fs_selector); + savesegment(gs, gs_selector); ldt_selector = kvm_read_ldt(); svm->vmcb->save.cr2 = vcpu->arch.cr2; /* required for live migration with NPT */ @@ -3251,10 +3251,15 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip; - kvm_load_fs(fs_selector); - kvm_load_gs(gs_selector); - kvm_load_ldt(ldt_selector); load_host_msrs(vcpu); + loadsegment(fs, fs_selector); +#ifdef CONFIG_X86_64 + load_gs_index(gs_selector); + wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs); +#else + loadsegment(gs, gs_selector); +#endif + kvm_load_ldt(ldt_selector); reload_tss(vcpu); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 49b25eee25ac..7bddfab12013 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -803,7 +803,7 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) */ vmx->host_state.ldt_sel = kvm_read_ldt(); vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel; - vmx->host_state.fs_sel = kvm_read_fs(); + savesegment(fs, vmx->host_state.fs_sel); if (!(vmx->host_state.fs_sel & 7)) { vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel); vmx->host_state.fs_reload_needed = 0; @@ -811,7 +811,7 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) vmcs_write16(HOST_FS_SELECTOR, 0); vmx->host_state.fs_reload_needed = 1; } - vmx->host_state.gs_sel = kvm_read_gs(); + savesegment(gs, vmx->host_state.gs_sel); if (!(vmx->host_state.gs_sel & 7)) vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel); else { @@ -841,27 +841,21 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) static void __vmx_load_host_state(struct vcpu_vmx *vmx) { - unsigned long flags; - if (!vmx->host_state.loaded) return; ++vmx->vcpu.stat.host_state_reload; vmx->host_state.loaded = 0; if (vmx->host_state.fs_reload_needed) - kvm_load_fs(vmx->host_state.fs_sel); + loadsegment(fs, vmx->host_state.fs_sel); if (vmx->host_state.gs_ldt_reload_needed) { kvm_load_ldt(vmx->host_state.ldt_sel); - /* - * If we have to reload gs, we must take care to - * preserve our gs base. - */ - local_irq_save(flags); - kvm_load_gs(vmx->host_state.gs_sel); #ifdef CONFIG_X86_64 - wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE)); + load_gs_index(vmx->host_state.gs_sel); + wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs); +#else + loadsegment(gs, vmx->host_state.gs_sel); #endif - local_irq_restore(flags); } reload_tss(); #ifdef CONFIG_X86_64 @@ -2589,8 +2583,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */ - vmcs_write16(HOST_FS_SELECTOR, kvm_read_fs()); /* 22.2.4 */ - vmcs_write16(HOST_GS_SELECTOR, kvm_read_gs()); /* 22.2.4 */ + vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */ + vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */ vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ #ifdef CONFIG_X86_64 rdmsrl(MSR_FS_BASE, a); diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 4c4508e8a204..a24c6cfdccc4 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -251,6 +251,8 @@ static noinline __kprobes int vmalloc_fault(unsigned long address) if (!(address >= VMALLOC_START && address < VMALLOC_END)) return -1; + WARN_ON_ONCE(in_nmi()); + /* * Synchronize this task's top level page-table * with the 'reference' page table. @@ -369,6 +371,8 @@ static noinline __kprobes int vmalloc_fault(unsigned long address) if (!(address >= VMALLOC_START && address < VMALLOC_END)) return -1; + WARN_ON_ONCE(in_nmi()); + /* * Copy kernel mappings over when needed. This can also * happen within a race in page table update. In the later diff --git a/arch/x86/mm/kmemcheck/kmemcheck.c b/arch/x86/mm/kmemcheck/kmemcheck.c index b3b531a4f8e5..d87dd6d042d6 100644 --- a/arch/x86/mm/kmemcheck/kmemcheck.c +++ b/arch/x86/mm/kmemcheck/kmemcheck.c @@ -631,6 +631,8 @@ bool kmemcheck_fault(struct pt_regs *regs, unsigned long address, if (!pte) return false; + WARN_ON_ONCE(in_nmi()); + if (error_code & 2) kmemcheck_access(regs, address, KMEMCHECK_WRITE); else diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c index 3855096c59b8..2d49d4e19a36 100644 --- a/arch/x86/oprofile/backtrace.c +++ b/arch/x86/oprofile/backtrace.c @@ -14,6 +14,7 @@ #include <asm/ptrace.h> #include <asm/uaccess.h> #include <asm/stacktrace.h> +#include <linux/compat.h> static void backtrace_warning_symbol(void *data, char *msg, unsigned long symbol) @@ -48,14 +49,12 @@ static struct stacktrace_ops backtrace_ops = { .walk_stack = print_context_stack, }; -struct frame_head { - struct frame_head *bp; - unsigned long ret; -} __attribute__((packed)); - -static struct frame_head *dump_user_backtrace(struct frame_head *head) +#ifdef CONFIG_COMPAT +static struct stack_frame_ia32 * +dump_user_backtrace_32(struct stack_frame_ia32 *head) { - struct frame_head bufhead[2]; + struct stack_frame_ia32 bufhead[2]; + struct stack_frame_ia32 *fp; /* Also check accessibility of one struct frame_head beyond */ if (!access_ok(VERIFY_READ, head, sizeof(bufhead))) @@ -63,20 +62,66 @@ static struct frame_head *dump_user_backtrace(struct frame_head *head) if (__copy_from_user_inatomic(bufhead, head, sizeof(bufhead))) return NULL; - oprofile_add_trace(bufhead[0].ret); + fp = (struct stack_frame_ia32 *) compat_ptr(bufhead[0].next_frame); + + oprofile_add_trace(bufhead[0].return_address); + + /* frame pointers should strictly progress back up the stack + * (towards higher addresses) */ + if (head >= fp) + return NULL; + + return fp; +} + +static inline int +x86_backtrace_32(struct pt_regs * const regs, unsigned int depth) +{ + struct stack_frame_ia32 *head; + + /* User process is 32-bit */ + if (!current || !test_thread_flag(TIF_IA32)) + return 0; + + head = (struct stack_frame_ia32 *) regs->bp; + while (depth-- && head) + head = dump_user_backtrace_32(head); + + return 1; +} + +#else +static inline int +x86_backtrace_32(struct pt_regs * const regs, unsigned int depth) +{ + return 0; +} +#endif /* CONFIG_COMPAT */ + +static struct stack_frame *dump_user_backtrace(struct stack_frame *head) +{ + struct stack_frame bufhead[2]; + + /* Also check accessibility of one struct stack_frame beyond */ + if (!access_ok(VERIFY_READ, head, sizeof(bufhead))) + return NULL; + if (__copy_from_user_inatomic(bufhead, head, sizeof(bufhead))) + return NULL; + + oprofile_add_trace(bufhead[0].return_address); /* frame pointers should strictly progress back up the stack * (towards higher addresses) */ - if (head >= bufhead[0].bp) + if (head >= bufhead[0].next_frame) return NULL; - return bufhead[0].bp; + return bufhead[0].next_frame; } void x86_backtrace(struct pt_regs * const regs, unsigned int depth) { - struct frame_head *head = (struct frame_head *)frame_pointer(regs); + struct stack_frame *head = (struct stack_frame *)frame_pointer(regs); if (!user_mode_vm(regs)) { unsigned long stack = kernel_stack_pointer(regs); @@ -86,6 +131,9 @@ x86_backtrace(struct pt_regs * const regs, unsigned int depth) return; } + if (x86_backtrace_32(regs, depth)) + return; + while (depth-- && head) head = dump_user_backtrace(head); } diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index f1575c9a2572..bd1489c3ce09 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -695,9 +695,6 @@ static int __init ppro_init(char **cpu_type) return 1; } -/* in order to get sysfs right */ -static int using_nmi; - int __init op_nmi_init(struct oprofile_operations *ops) { __u8 vendor = boot_cpu_data.x86_vendor; @@ -705,8 +702,6 @@ int __init op_nmi_init(struct oprofile_operations *ops) char *cpu_type = NULL; int ret = 0; - using_nmi = 0; - if (!cpu_has_apic) return -ENODEV; @@ -790,13 +785,11 @@ int __init op_nmi_init(struct oprofile_operations *ops) if (ret) return ret; - using_nmi = 1; printk(KERN_INFO "oprofile: using NMI interrupt.\n"); return 0; } void op_nmi_exit(void) { - if (using_nmi) - exit_sysfs(); + exit_sysfs(); } diff --git a/block/bsg.c b/block/bsg.c index 82d58829ba59..0c00870553a3 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -426,7 +426,7 @@ static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr, /* * fill in all the output members */ - hdr->device_status = status_byte(rq->errors); + hdr->device_status = rq->errors & 0xff; hdr->transport_status = host_byte(rq->errors); hdr->driver_status = driver_byte(rq->errors); hdr->info = 0; diff --git a/drivers/atm/iphase.c b/drivers/atm/iphase.c index ee9ddeb53417..8cb0347dec28 100644 --- a/drivers/atm/iphase.c +++ b/drivers/atm/iphase.c @@ -3156,7 +3156,6 @@ static int __devinit ia_init_one(struct pci_dev *pdev, { struct atm_dev *dev; IADEV *iadev; - unsigned long flags; int ret; iadev = kzalloc(sizeof(*iadev), GFP_KERNEL); @@ -3188,19 +3187,14 @@ static int __devinit ia_init_one(struct pci_dev *pdev, ia_dev[iadev_count] = iadev; _ia_dev[iadev_count] = dev; iadev_count++; - spin_lock_init(&iadev->misc_lock); - /* First fixes first. I don't want to think about this now. */ - spin_lock_irqsave(&iadev->misc_lock, flags); if (ia_init(dev) || ia_start(dev)) { IF_INIT(printk("IA register failed!\n");) iadev_count--; ia_dev[iadev_count] = NULL; _ia_dev[iadev_count] = NULL; - spin_unlock_irqrestore(&iadev->misc_lock, flags); ret = -EINVAL; goto err_out_deregister_dev; } - spin_unlock_irqrestore(&iadev->misc_lock, flags); IF_EVENT(printk("iadev_count = %d\n", iadev_count);) iadev->next_board = ia_boards; diff --git a/drivers/atm/iphase.h b/drivers/atm/iphase.h index b2cd20f549cb..077735e0e04b 100644 --- a/drivers/atm/iphase.h +++ b/drivers/atm/iphase.h @@ -1022,7 +1022,7 @@ typedef struct iadev_t { struct dle_q rx_dle_q; struct free_desc_q *rx_free_desc_qhead; struct sk_buff_head rx_dma_q; - spinlock_t rx_lock, misc_lock; + spinlock_t rx_lock; struct atm_vcc **rx_open; /* list of all open VCs */ u16 num_rx_desc, rx_buf_sz, rxing; u32 rx_pkt_ram, rx_tmp_cnt; diff --git a/drivers/atm/solos-pci.c b/drivers/atm/solos-pci.c index f916ddf63938..f46138ab38b6 100644 --- a/drivers/atm/solos-pci.c +++ b/drivers/atm/solos-pci.c @@ -444,6 +444,7 @@ static ssize_t console_show(struct device *dev, struct device_attribute *attr, struct atm_dev *atmdev = container_of(dev, struct atm_dev, class_dev); struct solos_card *card = atmdev->dev_data; struct sk_buff *skb; + unsigned int len; spin_lock(&card->cli_queue_lock); skb = skb_dequeue(&card->cli_queue[SOLOS_CHAN(atmdev)]); @@ -451,11 +452,12 @@ static ssize_t console_show(struct device *dev, struct device_attribute *attr, if(skb == NULL) return sprintf(buf, "No data.\n"); - memcpy(buf, skb->data, skb->len); - dev_dbg(&card->dev->dev, "len: %d\n", skb->len); + len = skb->len; + memcpy(buf, skb->data, len); + dev_dbg(&card->dev->dev, "len: %d\n", len); kfree_skb(skb); - return skb->len; + return len; } static int send_command(struct solos_card *card, int dev, const char *buf, size_t size) diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index de277689da61..4b9359a6f6ca 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -488,4 +488,21 @@ config BLK_DEV_HD If unsure, say N. +config BLK_DEV_RBD + tristate "Rados block device (RBD)" + depends on INET && EXPERIMENTAL && BLOCK + select CEPH_LIB + select LIBCRC32C + select CRYPTO_AES + select CRYPTO + default n + help + Say Y here if you want include the Rados block device, which stripes + a block device over objects stored in the Ceph distributed object + store. + + More information at http://ceph.newdream.net/. + + If unsure, say N. + endif # BLK_DEV diff --git a/drivers/block/Makefile b/drivers/block/Makefile index aff5ac925c34..d7f463d6312d 100644 --- a/drivers/block/Makefile +++ b/drivers/block/Makefile @@ -37,5 +37,6 @@ obj-$(CONFIG_BLK_DEV_HD) += hd.o obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += xen-blkfront.o obj-$(CONFIG_BLK_DEV_DRBD) += drbd/ +obj-$(CONFIG_BLK_DEV_RBD) += rbd.o swim_mod-objs := swim.o swim_asm.o diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c index e9da874d0419..03688c2da319 100644 --- a/drivers/block/ps3disk.c +++ b/drivers/block/ps3disk.c @@ -113,7 +113,7 @@ static void ps3disk_scatter_gather(struct ps3_storage_device *dev, memcpy(buf, dev->bounce_buf+offset, size); offset += size; flush_kernel_dcache_page(bvec->bv_page); - bvec_kunmap_irq(bvec, &flags); + bvec_kunmap_irq(buf, &flags); i++; } } diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c new file mode 100644 index 000000000000..6ec9d53806c5 --- /dev/null +++ b/drivers/block/rbd.c @@ -0,0 +1,1841 @@ +/* + rbd.c -- Export ceph rados objects as a Linux block device + + + based on drivers/block/osdblk.c: + + Copyright 2009 Red Hat, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + + + + Instructions for use + -------------------- + + 1) Map a Linux block device to an existing rbd image. + + Usage: <mon ip addr> <options> <pool name> <rbd image name> [snap name] + + $ echo "192.168.0.1 name=admin rbd foo" > /sys/class/rbd/add + + The snapshot name can be "-" or omitted to map the image read/write. + + 2) List all active blkdev<->object mappings. + + In this example, we have performed step #1 twice, creating two blkdevs, + mapped to two separate rados objects in the rados rbd pool + + $ cat /sys/class/rbd/list + #id major client_name pool name snap KB + 0 254 client4143 rbd foo - 1024000 + + The columns, in order, are: + - blkdev unique id + - blkdev assigned major + - rados client id + - rados pool name + - rados block device name + - mapped snapshot ("-" if none) + - device size in KB + + + 3) Create a snapshot. + + Usage: <blkdev id> <snapname> + + $ echo "0 mysnap" > /sys/class/rbd/snap_create + + + 4) Listing a snapshot. + + $ cat /sys/class/rbd/snaps_list + #id snap KB + 0 - 1024000 (*) + 0 foo 1024000 + + The columns, in order, are: + - blkdev unique id + - snapshot name, '-' means none (active read/write version) + - size of device at time of snapshot + - the (*) indicates this is the active version + + 5) Rollback to snapshot. + + Usage: <blkdev id> <snapname> + + $ echo "0 mysnap" > /sys/class/rbd/snap_rollback + + + 6) Mapping an image using snapshot. + + A snapshot mapping is read-only. This is being done by passing + snap=<snapname> to the options when adding a device. + + $ echo "192.168.0.1 name=admin,snap=mysnap rbd foo" > /sys/class/rbd/add + + + 7) Remove an active blkdev<->rbd image mapping. + + In this example, we remove the mapping with blkdev unique id 1. + + $ echo 1 > /sys/class/rbd/remove + + + NOTE: The actual creation and deletion of rados objects is outside the scope + of this driver. + + */ + +#include <linux/ceph/libceph.h> +#include <linux/ceph/osd_client.h> +#include <linux/ceph/mon_client.h> +#include <linux/ceph/decode.h> + +#include <linux/kernel.h> +#include <linux/device.h> +#include <linux/module.h> +#include <linux/fs.h> +#include <linux/blkdev.h> + +#include "rbd_types.h" + +#define DRV_NAME "rbd" +#define DRV_NAME_LONG "rbd (rados block device)" + +#define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */ + +#define RBD_MAX_MD_NAME_LEN (96 + sizeof(RBD_SUFFIX)) +#define RBD_MAX_POOL_NAME_LEN 64 +#define RBD_MAX_SNAP_NAME_LEN 32 +#define RBD_MAX_OPT_LEN 1024 + +#define RBD_SNAP_HEAD_NAME "-" + +#define DEV_NAME_LEN 32 + +/* + * block device image metadata (in-memory version) + */ +struct rbd_image_header { + u64 image_size; + char block_name[32]; + __u8 obj_order; + __u8 crypt_type; + __u8 comp_type; + struct rw_semaphore snap_rwsem; + struct ceph_snap_context *snapc; + size_t snap_names_len; + u64 snap_seq; + u32 total_snaps; + + char *snap_names; + u64 *snap_sizes; +}; + +/* + * an instance of the client. multiple devices may share a client. + */ +struct rbd_client { + struct ceph_client *client; + struct kref kref; + struct list_head node; +}; + +/* + * a single io request + */ +struct rbd_request { + struct request *rq; /* blk layer request */ + struct bio *bio; /* cloned bio */ + struct page **pages; /* list of used pages */ + u64 len; +}; + +/* + * a single device + */ +struct rbd_device { + int id; /* blkdev unique id */ + + int major; /* blkdev assigned major */ + struct gendisk *disk; /* blkdev's gendisk and rq */ + struct request_queue *q; + + struct ceph_client *client; + struct rbd_client *rbd_client; + + char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */ + + spinlock_t lock; /* queue lock */ + + struct rbd_image_header header; + char obj[RBD_MAX_OBJ_NAME_LEN]; /* rbd image name */ + int obj_len; + char obj_md_name[RBD_MAX_MD_NAME_LEN]; /* hdr nm. */ + char pool_name[RBD_MAX_POOL_NAME_LEN]; + int poolid; + + char snap_name[RBD_MAX_SNAP_NAME_LEN]; + u32 cur_snap; /* index+1 of current snapshot within snap context + 0 - for the head */ + int read_only; + + struct list_head node; +}; + +static spinlock_t node_lock; /* protects client get/put */ + +static struct class *class_rbd; /* /sys/class/rbd */ +static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */ +static LIST_HEAD(rbd_dev_list); /* devices */ +static LIST_HEAD(rbd_client_list); /* clients */ + + +static int rbd_open(struct block_device *bdev, fmode_t mode) +{ + struct gendisk *disk = bdev->bd_disk; + struct rbd_device *rbd_dev = disk->private_data; + + set_device_ro(bdev, rbd_dev->read_only); + + if ((mode & FMODE_WRITE) && rbd_dev->read_only) + return -EROFS; + + return 0; +} + +static const struct block_device_operations rbd_bd_ops = { + .owner = THIS_MODULE, + .open = rbd_open, +}; + +/* + * Initialize an rbd client instance. + * We own *opt. + */ +static struct rbd_client *rbd_client_create(struct ceph_options *opt) +{ + struct rbd_client *rbdc; + int ret = -ENOMEM; + + dout("rbd_client_create\n"); + rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL); + if (!rbdc) + goto out_opt; + + kref_init(&rbdc->kref); + INIT_LIST_HEAD(&rbdc->node); + + rbdc->client = ceph_create_client(opt, rbdc); + if (IS_ERR(rbdc->client)) + goto out_rbdc; + opt = NULL; /* Now rbdc->client is responsible for opt */ + + ret = ceph_open_session(rbdc->client); + if (ret < 0) + goto out_err; + + spin_lock(&node_lock); + list_add_tail(&rbdc->node, &rbd_client_list); + spin_unlock(&node_lock); + + dout("rbd_client_create created %p\n", rbdc); + return rbdc; + +out_err: + ceph_destroy_client(rbdc->client); +out_rbdc: + kfree(rbdc); +out_opt: + if (opt) + ceph_destroy_options(opt); + return ERR_PTR(ret); +} + +/* + * Find a ceph client with specific addr and configuration. + */ +static struct rbd_client *__rbd_client_find(struct ceph_options *opt) +{ + struct rbd_client *client_node; + + if (opt->flags & CEPH_OPT_NOSHARE) + return NULL; + + list_for_each_entry(client_node, &rbd_client_list, node) + if (ceph_compare_options(opt, client_node->client) == 0) + return client_node; + return NULL; +} + +/* + * Get a ceph client with specific addr and configuration, if one does + * not exist create it. + */ +static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr, + char *options) +{ + struct rbd_client *rbdc; + struct ceph_options *opt; + int ret; + + ret = ceph_parse_options(&opt, options, mon_addr, + mon_addr + strlen(mon_addr), NULL, NULL); + if (ret < 0) + return ret; + + spin_lock(&node_lock); + rbdc = __rbd_client_find(opt); + if (rbdc) { + ceph_destroy_options(opt); + + /* using an existing client */ + kref_get(&rbdc->kref); + rbd_dev->rbd_client = rbdc; + rbd_dev->client = rbdc->client; + spin_unlock(&node_lock); + return 0; + } + spin_unlock(&node_lock); + + rbdc = rbd_client_create(opt); + if (IS_ERR(rbdc)) + return PTR_ERR(rbdc); + + rbd_dev->rbd_client = rbdc; + rbd_dev->client = rbdc->client; + return 0; +} + +/* + * Destroy ceph client + */ +static void rbd_client_release(struct kref *kref) +{ + struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref); + + dout("rbd_release_client %p\n", rbdc); + spin_lock(&node_lock); + list_del(&rbdc->node); + spin_unlock(&node_lock); + + ceph_destroy_client(rbdc->client); + kfree(rbdc); +} + +/* + * Drop reference to ceph client node. If it's not referenced anymore, release + * it. + */ +static void rbd_put_client(struct rbd_device *rbd_dev) +{ + kref_put(&rbd_dev->rbd_client->kref, rbd_client_release); + rbd_dev->rbd_client = NULL; + rbd_dev->client = NULL; +} + + +/* + * Create a new header structure, translate header format from the on-disk + * header. + */ +static int rbd_header_from_disk(struct rbd_image_header *header, + struct rbd_image_header_ondisk *ondisk, + int allocated_snaps, + gfp_t gfp_flags) +{ + int i; + u32 snap_count = le32_to_cpu(ondisk->snap_count); + int ret = -ENOMEM; + + init_rwsem(&header->snap_rwsem); + + header->snap_names_len = le64_to_cpu(ondisk->snap_names_len); + header->snapc = kmalloc(sizeof(struct ceph_snap_context) + + snap_count * + sizeof(struct rbd_image_snap_ondisk), + gfp_flags); + if (!header->snapc) + return -ENOMEM; + if (snap_count) { + header->snap_names = kmalloc(header->snap_names_len, + GFP_KERNEL); + if (!header->snap_names) + goto err_snapc; + header->snap_sizes = kmalloc(snap_count * sizeof(u64), + GFP_KERNEL); + if (!header->snap_sizes) + goto err_names; + } else { + header->snap_names = NULL; + header->snap_sizes = NULL; + } + memcpy(header->block_name, ondisk->block_name, + sizeof(ondisk->block_name)); + + header->image_size = le64_to_cpu(ondisk->image_size); + header->obj_order = ondisk->options.order; + header->crypt_type = ondisk->options.crypt_type; + header->comp_type = ondisk->options.comp_type; + + atomic_set(&header->snapc->nref, 1); + header->snap_seq = le64_to_cpu(ondisk->snap_seq); + header->snapc->num_snaps = snap_count; + header->total_snaps = snap_count; + + if (snap_count && + allocated_snaps == snap_count) { + for (i = 0; i < snap_count; i++) { + header->snapc->snaps[i] = + le64_to_cpu(ondisk->snaps[i].id); + header->snap_sizes[i] = + le64_to_cpu(ondisk->snaps[i].image_size); + } + + /* copy snapshot names */ + memcpy(header->snap_names, &ondisk->snaps[i], + header->snap_names_len); + } + + return 0; + +err_names: + kfree(header->snap_names); +err_snapc: + kfree(header->snapc); + return ret; +} + +static int snap_index(struct rbd_image_header *header, int snap_num) +{ + return header->total_snaps - snap_num; +} + +static u64 cur_snap_id(struct rbd_device *rbd_dev) +{ + struct rbd_image_header *header = &rbd_dev->header; + + if (!rbd_dev->cur_snap) + return 0; + + return header->snapc->snaps[snap_index(header, rbd_dev->cur_snap)]; +} + +static int snap_by_name(struct rbd_image_header *header, const char *snap_name, + u64 *seq, u64 *size) +{ + int i; + char *p = header->snap_names; + + for (i = 0; i < header->total_snaps; i++, p += strlen(p) + 1) { + if (strcmp(snap_name, p) == 0) + break; + } + if (i == header->total_snaps) + return -ENOENT; + if (seq) + *seq = header->snapc->snaps[i]; + + if (size) + *size = header->snap_sizes[i]; + + return i; +} + +static int rbd_header_set_snap(struct rbd_device *dev, + const char *snap_name, + u64 *size) +{ + struct rbd_image_header *header = &dev->header; + struct ceph_snap_context *snapc = header->snapc; + int ret = -ENOENT; + + down_write(&header->snap_rwsem); + + if (!snap_name || + !*snap_name || + strcmp(snap_name, "-") == 0 || + strcmp(snap_name, RBD_SNAP_HEAD_NAME) == 0) { + if (header->total_snaps) + snapc->seq = header->snap_seq; + else + snapc->seq = 0; + dev->cur_snap = 0; + dev->read_only = 0; + if (size) + *size = header->image_size; + } else { + ret = snap_by_name(header, snap_name, &snapc->seq, size); + if (ret < 0) + goto done; + + dev->cur_snap = header->total_snaps - ret; + dev->read_only = 1; + } + + ret = 0; +done: + up_write(&header->snap_rwsem); + return ret; +} + +static void rbd_header_free(struct rbd_image_header *header) +{ + kfree(header->snapc); + kfree(header->snap_names); + kfree(header->snap_sizes); +} + +/* + * get the actual striped segment name, offset and length + */ +static u64 rbd_get_segment(struct rbd_image_header *header, + const char *block_name, + u64 ofs, u64 len, + char *seg_name, u64 *segofs) +{ + u64 seg = ofs >> header->obj_order; + + if (seg_name) + snprintf(seg_name, RBD_MAX_SEG_NAME_LEN, + "%s.%012llx", block_name, seg); + + ofs = ofs & ((1 << header->obj_order) - 1); + len = min_t(u64, len, (1 << header->obj_order) - ofs); + + if (segofs) + *segofs = ofs; + + return len; +} + +/* + * bio helpers + */ + +static void bio_chain_put(struct bio *chain) +{ + struct bio *tmp; + + while (chain) { + tmp = chain; + chain = chain->bi_next; + bio_put(tmp); + } +} + +/* + * zeros a bio chain, starting at specific offset + */ +static void zero_bio_chain(struct bio *chain, int start_ofs) +{ + struct bio_vec *bv; + unsigned long flags; + void *buf; + int i; + int pos = 0; + + while (chain) { + bio_for_each_segment(bv, chain, i) { + if (pos + bv->bv_len > start_ofs) { + int remainder = max(start_ofs - pos, 0); + buf = bvec_kmap_irq(bv, &flags); + memset(buf + remainder, 0, + bv->bv_len - remainder); + bvec_kunmap_irq(buf, &flags); + } + pos += bv->bv_len; + } + + chain = chain->bi_next; + } +} + +/* + * bio_chain_clone - clone a chain of bios up to a certain length. + * might return a bio_pair that will need to be released. + */ +static struct bio *bio_chain_clone(struct bio **old, struct bio **next, + struct bio_pair **bp, + int len, gfp_t gfpmask) +{ + struct bio *tmp, *old_chain = *old, *new_chain = NULL, *tail = NULL; + int total = 0; + + if (*bp) { + bio_pair_release(*bp); + *bp = NULL; + } + + while (old_chain && (total < len)) { + tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs); + if (!tmp) + goto err_out; + + if (total + old_chain->bi_size > len) { + struct bio_pair *bp; + + /* + * this split can only happen with a single paged bio, + * split_bio will BUG_ON if this is not the case + */ + dout("bio_chain_clone split! total=%d remaining=%d" + "bi_size=%d\n", + (int)total, (int)len-total, + (int)old_chain->bi_size); + + /* split the bio. We'll release it either in the next + call, or it will have to be released outside */ + bp = bio_split(old_chain, (len - total) / 512ULL); + if (!bp) + goto err_out; + + __bio_clone(tmp, &bp->bio1); + + *next = &bp->bio2; + } else { + __bio_clone(tmp, old_chain); + *next = old_chain->bi_next; + } + + tmp->bi_bdev = NULL; + gfpmask &= ~__GFP_WAIT; + tmp->bi_next = NULL; + + if (!new_chain) { + new_chain = tail = tmp; + } else { + tail->bi_next = tmp; + tail = tmp; + } + old_chain = old_chain->bi_next; + + total += tmp->bi_size; + } + + BUG_ON(total < len); + + if (tail) + tail->bi_next = NULL; + + *old = old_chain; + + return new_chain; + +err_out: + dout("bio_chain_clone with err\n"); + bio_chain_put(new_chain); + return NULL; +} + +/* + * helpers for osd request op vectors. + */ +static int rbd_create_rw_ops(struct ceph_osd_req_op **ops, + int num_ops, + int opcode, + u32 payload_len) +{ + *ops = kzalloc(sizeof(struct ceph_osd_req_op) * (num_ops + 1), + GFP_NOIO); + if (!*ops) + return -ENOMEM; + (*ops)[0].op = opcode; + /* + * op extent offset and length will be set later on + * in calc_raw_layout() + */ + (*ops)[0].payload_len = payload_len; + return 0; +} + +static void rbd_destroy_ops(struct ceph_osd_req_op *ops) +{ + kfree(ops); +} + +/* + * Send ceph osd request + */ +static int rbd_do_request(struct request *rq, + struct rbd_device *dev, + struct ceph_snap_context *snapc, + u64 snapid, + const char *obj, u64 ofs, u64 len, + struct bio *bio, + struct page **pages, + int num_pages, + int flags, + struct ceph_osd_req_op *ops, + int num_reply, + void (*rbd_cb)(struct ceph_osd_request *req, + struct ceph_msg *msg)) +{ + struct ceph_osd_request *req; + struct ceph_file_layout *layout; + int ret; + u64 bno; + struct timespec mtime = CURRENT_TIME; + struct rbd_request *req_data; + struct ceph_osd_request_head *reqhead; + struct rbd_image_header *header = &dev->header; + + ret = -ENOMEM; + req_data = kzalloc(sizeof(*req_data), GFP_NOIO); + if (!req_data) + goto done; + + dout("rbd_do_request len=%lld ofs=%lld\n", len, ofs); + + down_read(&header->snap_rwsem); + + req = ceph_osdc_alloc_request(&dev->client->osdc, flags, + snapc, + ops, + false, + GFP_NOIO, pages, bio); + if (IS_ERR(req)) { + up_read(&header->snap_rwsem); + ret = PTR_ERR(req); + goto done_pages; + } + + req->r_callback = rbd_cb; + + req_data->rq = rq; + req_data->bio = bio; + req_data->pages = pages; + req_data->len = len; + + req->r_priv = req_data; + + reqhead = req->r_request->front.iov_base; + reqhead->snapid = cpu_to_le64(CEPH_NOSNAP); + + strncpy(req->r_oid, obj, sizeof(req->r_oid)); + req->r_oid_len = strlen(req->r_oid); + + layout = &req->r_file_layout; + memset(layout, 0, sizeof(*layout)); + layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); + layout->fl_stripe_count = cpu_to_le32(1); + layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); + layout->fl_pg_preferred = cpu_to_le32(-1); + layout->fl_pg_pool = cpu_to_le32(dev->poolid); + ceph_calc_raw_layout(&dev->client->osdc, layout, snapid, + ofs, &len, &bno, req, ops); + + ceph_osdc_build_request(req, ofs, &len, + ops, + snapc, + &mtime, + req->r_oid, req->r_oid_len); + up_read(&header->snap_rwsem); + + ret = ceph_osdc_start_request(&dev->client->osdc, req, false); + if (ret < 0) + goto done_err; + + if (!rbd_cb) { + ret = ceph_osdc_wait_request(&dev->client->osdc, req); + ceph_osdc_put_request(req); + } + return ret; + +done_err: + bio_chain_put(req_data->bio); + ceph_osdc_put_request(req); +done_pages: + kfree(req_data); +done: + if (rq) + blk_end_request(rq, ret, len); + return ret; +} + +/* + * Ceph osd op callback + */ +static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg) +{ + struct rbd_request *req_data = req->r_priv; + struct ceph_osd_reply_head *replyhead; + struct ceph_osd_op *op; + __s32 rc; + u64 bytes; + int read_op; + + /* parse reply */ + replyhead = msg->front.iov_base; + WARN_ON(le32_to_cpu(replyhead->num_ops) == 0); + op = (void *)(replyhead + 1); + rc = le32_to_cpu(replyhead->result); + bytes = le64_to_cpu(op->extent.length); + read_op = (le32_to_cpu(op->op) == CEPH_OSD_OP_READ); + + dout("rbd_req_cb bytes=%lld readop=%d rc=%d\n", bytes, read_op, rc); + + if (rc == -ENOENT && read_op) { + zero_bio_chain(req_data->bio, 0); + rc = 0; + } else if (rc == 0 && read_op && bytes < req_data->len) { + zero_bio_chain(req_data->bio, bytes); + bytes = req_data->len; + } + + blk_end_request(req_data->rq, rc, bytes); + + if (req_data->bio) + bio_chain_put(req_data->bio); + + ceph_osdc_put_request(req); + kfree(req_data); +} + +/* + * Do a synchronous ceph osd operation + */ +static int rbd_req_sync_op(struct rbd_device *dev, + struct ceph_snap_context *snapc, + u64 snapid, + int opcode, + int flags, + struct ceph_osd_req_op *orig_ops, + int num_reply, + const char *obj, + u64 ofs, u64 len, + char *buf) +{ + int ret; + struct page **pages; + int num_pages; + struct ceph_osd_req_op *ops = orig_ops; + u32 payload_len; + + num_pages = calc_pages_for(ofs , len); + pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); + if (IS_ERR(pages)) + return PTR_ERR(pages); + + if (!orig_ops) { + payload_len = (flags & CEPH_OSD_FLAG_WRITE ? len : 0); + ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len); + if (ret < 0) + goto done; + + if ((flags & CEPH_OSD_FLAG_WRITE) && buf) { + ret = ceph_copy_to_page_vector(pages, buf, ofs, len); + if (ret < 0) + goto done_ops; + } + } + + ret = rbd_do_request(NULL, dev, snapc, snapid, + obj, ofs, len, NULL, + pages, num_pages, + flags, + ops, + 2, + NULL); + if (ret < 0) + goto done_ops; + + if ((flags & CEPH_OSD_FLAG_READ) && buf) + ret = ceph_copy_from_page_vector(pages, buf, ofs, ret); + +done_ops: + if (!orig_ops) + rbd_destroy_ops(ops); +done: + ceph_release_page_vector(pages, num_pages); + return ret; +} + +/* + * Do an asynchronous ceph osd operation + */ +static int rbd_do_op(struct request *rq, + struct rbd_device *rbd_dev , + struct ceph_snap_context *snapc, + u64 snapid, + int opcode, int flags, int num_reply, + u64 ofs, u64 len, + struct bio *bio) +{ + char *seg_name; + u64 seg_ofs; + u64 seg_len; + int ret; + struct ceph_osd_req_op *ops; + u32 payload_len; + + seg_name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO); + if (!seg_name) + return -ENOMEM; + + seg_len = rbd_get_segment(&rbd_dev->header, + rbd_dev->header.block_name, + ofs, len, + seg_name, &seg_ofs); + + payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0); + + ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len); + if (ret < 0) + goto done; + + /* we've taken care of segment sizes earlier when we + cloned the bios. We should never have a segment + truncated at this point */ + BUG_ON(seg_len < len); + + ret = rbd_do_request(rq, rbd_dev, snapc, snapid, + seg_name, seg_ofs, seg_len, + bio, + NULL, 0, + flags, + ops, + num_reply, + rbd_req_cb); +done: + kfree(seg_name); + return ret; +} + +/* + * Request async osd write + */ +static int rbd_req_write(struct request *rq, + struct rbd_device *rbd_dev, + struct ceph_snap_context *snapc, + u64 ofs, u64 len, + struct bio *bio) +{ + return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP, + CEPH_OSD_OP_WRITE, + CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, + 2, + ofs, len, bio); +} + +/* + * Request async osd read + */ +static int rbd_req_read(struct request *rq, + struct rbd_device *rbd_dev, + u64 snapid, + u64 ofs, u64 len, + struct bio *bio) +{ + return rbd_do_op(rq, rbd_dev, NULL, + (snapid ? snapid : CEPH_NOSNAP), + CEPH_OSD_OP_READ, + CEPH_OSD_FLAG_READ, + 2, + ofs, len, bio); +} + +/* + * Request sync osd read + */ +static int rbd_req_sync_read(struct rbd_device *dev, + struct ceph_snap_context *snapc, + u64 snapid, + const char *obj, + u64 ofs, u64 len, + char *buf) +{ + return rbd_req_sync_op(dev, NULL, + (snapid ? snapid : CEPH_NOSNAP), + CEPH_OSD_OP_READ, + CEPH_OSD_FLAG_READ, + NULL, + 1, obj, ofs, len, buf); +} + +/* + * Request sync osd read + */ +static int rbd_req_sync_rollback_obj(struct rbd_device *dev, + u64 snapid, + const char *obj) +{ + struct ceph_osd_req_op *ops; + int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_ROLLBACK, 0); + if (ret < 0) + return ret; + + ops[0].snap.snapid = snapid; + + ret = rbd_req_sync_op(dev, NULL, + CEPH_NOSNAP, + 0, + CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, + ops, + 1, obj, 0, 0, NULL); + + rbd_destroy_ops(ops); + + if (ret < 0) + return ret; + + return ret; +} + +/* + * Request sync osd read + */ +static int rbd_req_sync_exec(struct rbd_device *dev, + const char *obj, + const char *cls, + const char *method, + const char *data, + int len) +{ + struct ceph_osd_req_op *ops; + int cls_len = strlen(cls); + int method_len = strlen(method); + int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_CALL, + cls_len + method_len + len); + if (ret < 0) + return ret; + + ops[0].cls.class_name = cls; + ops[0].cls.class_len = (__u8)cls_len; + ops[0].cls.method_name = method; + ops[0].cls.method_len = (__u8)method_len; + ops[0].cls.argc = 0; + ops[0].cls.indata = data; + ops[0].cls.indata_len = len; + + ret = rbd_req_sync_op(dev, NULL, + CEPH_NOSNAP, + 0, + CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, + ops, + 1, obj, 0, 0, NULL); + + rbd_destroy_ops(ops); + + dout("cls_exec returned %d\n", ret); + return ret; +} + +/* + * block device queue callback + */ +static void rbd_rq_fn(struct request_queue *q) +{ + struct rbd_device *rbd_dev = q->queuedata; + struct request *rq; + struct bio_pair *bp = NULL; + + rq = blk_fetch_request(q); + + while (1) { + struct bio *bio; + struct bio *rq_bio, *next_bio = NULL; + bool do_write; + int size, op_size = 0; + u64 ofs; + + /* peek at request from block layer */ + if (!rq) + break; + + dout("fetched request\n"); + + /* filter out block requests we don't understand */ + if ((rq->cmd_type != REQ_TYPE_FS)) { + __blk_end_request_all(rq, 0); + goto next; + } + + /* deduce our operation (read, write) */ + do_write = (rq_data_dir(rq) == WRITE); + + size = blk_rq_bytes(rq); + ofs = blk_rq_pos(rq) * 512ULL; + rq_bio = rq->bio; + if (do_write && rbd_dev->read_only) { + __blk_end_request_all(rq, -EROFS); + goto next; + } + + spin_unlock_irq(q->queue_lock); + + dout("%s 0x%x bytes at 0x%llx\n", + do_write ? "write" : "read", + size, blk_rq_pos(rq) * 512ULL); + + do { + /* a bio clone to be passed down to OSD req */ + dout("rq->bio->bi_vcnt=%d\n", rq->bio->bi_vcnt); + op_size = rbd_get_segment(&rbd_dev->header, + rbd_dev->header.block_name, + ofs, size, + NULL, NULL); + bio = bio_chain_clone(&rq_bio, &next_bio, &bp, + op_size, GFP_ATOMIC); + if (!bio) { + spin_lock_irq(q->queue_lock); + __blk_end_request_all(rq, -ENOMEM); + goto next; + } + + /* init OSD command: write or read */ + if (do_write) + rbd_req_write(rq, rbd_dev, + rbd_dev->header.snapc, + ofs, + op_size, bio); + else + rbd_req_read(rq, rbd_dev, + cur_snap_id(rbd_dev), + ofs, + op_size, bio); + + size -= op_size; + ofs += op_size; + + rq_bio = next_bio; + } while (size > 0); + + if (bp) + bio_pair_release(bp); + + spin_lock_irq(q->queue_lock); +next: + rq = blk_fetch_request(q); + } +} + +/* + * a queue callback. Makes sure that we don't create a bio that spans across + * multiple osd objects. One exception would be with a single page bios, + * which we handle later at bio_chain_clone + */ +static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd, + struct bio_vec *bvec) +{ + struct rbd_device *rbd_dev = q->queuedata; + unsigned int chunk_sectors = 1 << (rbd_dev->header.obj_order - 9); + sector_t sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev); + unsigned int bio_sectors = bmd->bi_size >> 9; + int max; + + max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + + bio_sectors)) << 9; + if (max < 0) + max = 0; /* bio_add cannot handle a negative return */ + if (max <= bvec->bv_len && bio_sectors == 0) + return bvec->bv_len; + return max; +} + +static void rbd_free_disk(struct rbd_device *rbd_dev) +{ + struct gendisk *disk = rbd_dev->disk; + + if (!disk) + return; + + rbd_header_free(&rbd_dev->header); + + if (disk->flags & GENHD_FL_UP) + del_gendisk(disk); + if (disk->queue) + blk_cleanup_queue(disk->queue); + put_disk(disk); +} + +/* + * reload the ondisk the header + */ +static int rbd_read_header(struct rbd_device *rbd_dev, + struct rbd_image_header *header) +{ + ssize_t rc; + struct rbd_image_header_ondisk *dh; + int snap_count = 0; + u64 snap_names_len = 0; + + while (1) { + int len = sizeof(*dh) + + snap_count * sizeof(struct rbd_image_snap_ondisk) + + snap_names_len; + + rc = -ENOMEM; + dh = kmalloc(len, GFP_KERNEL); + if (!dh) + return -ENOMEM; + + rc = rbd_req_sync_read(rbd_dev, + NULL, CEPH_NOSNAP, + rbd_dev->obj_md_name, + 0, len, + (char *)dh); + if (rc < 0) + goto out_dh; + + rc = rbd_header_from_disk(header, dh, snap_count, GFP_KERNEL); + if (rc < 0) + goto out_dh; + + if (snap_count != header->total_snaps) { + snap_count = header->total_snaps; + snap_names_len = header->snap_names_len; + rbd_header_free(header); + kfree(dh); + continue; + } + break; + } + +out_dh: + kfree(dh); + return rc; +} + +/* + * create a snapshot + */ +static int rbd_header_add_snap(struct rbd_device *dev, + const char *snap_name, + gfp_t gfp_flags) +{ + int name_len = strlen(snap_name); + u64 new_snapid; + int ret; + void *data, *data_start, *data_end; + + /* we should create a snapshot only if we're pointing at the head */ + if (dev->cur_snap) + return -EINVAL; + + ret = ceph_monc_create_snapid(&dev->client->monc, dev->poolid, + &new_snapid); + dout("created snapid=%lld\n", new_snapid); + if (ret < 0) + return ret; + + data = kmalloc(name_len + 16, gfp_flags); + if (!data) + return -ENOMEM; + + data_start = data; + data_end = data + name_len + 16; + + ceph_encode_string_safe(&data, data_end, snap_name, name_len, bad); + ceph_encode_64_safe(&data, data_end, new_snapid, bad); + + ret = rbd_req_sync_exec(dev, dev->obj_md_name, "rbd", "snap_add", + data_start, data - data_start); + + kfree(data_start); + + if (ret < 0) + return ret; + + dev->header.snapc->seq = new_snapid; + + return 0; +bad: + return -ERANGE; +} + +/* + * only read the first part of the ondisk header, without the snaps info + */ +static int rbd_update_snaps(struct rbd_device *rbd_dev) +{ + int ret; + struct rbd_image_header h; + u64 snap_seq; + + ret = rbd_read_header(rbd_dev, &h); + if (ret < 0) + return ret; + + down_write(&rbd_dev->header.snap_rwsem); + + snap_seq = rbd_dev->header.snapc->seq; + + kfree(rbd_dev->header.snapc); + kfree(rbd_dev->header.snap_names); + kfree(rbd_dev->header.snap_sizes); + + rbd_dev->header.total_snaps = h.total_snaps; + rbd_dev->header.snapc = h.snapc; + rbd_dev->header.snap_names = h.snap_names; + rbd_dev->header.snap_sizes = h.snap_sizes; + rbd_dev->header.snapc->seq = snap_seq; + + up_write(&rbd_dev->header.snap_rwsem); + + return 0; +} + +static int rbd_init_disk(struct rbd_device *rbd_dev) +{ + struct gendisk *disk; + struct request_queue *q; + int rc; + u64 total_size = 0; + + /* contact OSD, request size info about the object being mapped */ + rc = rbd_read_header(rbd_dev, &rbd_dev->header); + if (rc) + return rc; + + rc = rbd_header_set_snap(rbd_dev, rbd_dev->snap_name, &total_size); + if (rc) + return rc; + + /* create gendisk info */ + rc = -ENOMEM; + disk = alloc_disk(RBD_MINORS_PER_MAJOR); + if (!disk) + goto out; + + sprintf(disk->disk_name, DRV_NAME "%d", rbd_dev->id); + disk->major = rbd_dev->major; + disk->first_minor = 0; + disk->fops = &rbd_bd_ops; + disk->private_data = rbd_dev; + + /* init rq */ + rc = -ENOMEM; + q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock); + if (!q) + goto out_disk; + blk_queue_merge_bvec(q, rbd_merge_bvec); + disk->queue = q; + + q->queuedata = rbd_dev; + + rbd_dev->disk = disk; + rbd_dev->q = q; + + /* finally, announce the disk to the world */ + set_capacity(disk, total_size / 512ULL); + add_disk(disk); + + pr_info("%s: added with size 0x%llx\n", + disk->disk_name, (unsigned long long)total_size); + return 0; + +out_disk: + put_disk(disk); +out: + return rc; +} + +/******************************************************************** + * /sys/class/rbd/ + * add map rados objects to blkdev + * remove unmap rados objects + * list show mappings + *******************************************************************/ + +static void class_rbd_release(struct class *cls) +{ + kfree(cls); +} + +static ssize_t class_rbd_list(struct class *c, + struct class_attribute *attr, + char *data) +{ + int n = 0; + struct list_head *tmp; + int max = PAGE_SIZE; + + mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); + + n += snprintf(data, max, + "#id\tmajor\tclient_name\tpool\tname\tsnap\tKB\n"); + + list_for_each(tmp, &rbd_dev_list) { + struct rbd_device *rbd_dev; + + rbd_dev = list_entry(tmp, struct rbd_device, node); + n += snprintf(data+n, max-n, + "%d\t%d\tclient%lld\t%s\t%s\t%s\t%lld\n", + rbd_dev->id, + rbd_dev->major, + ceph_client_id(rbd_dev->client), + rbd_dev->pool_name, + rbd_dev->obj, rbd_dev->snap_name, + rbd_dev->header.image_size >> 10); + if (n == max) + break; + } + + mutex_unlock(&ctl_mutex); + return n; +} + +static ssize_t class_rbd_add(struct class *c, + struct class_attribute *attr, + const char *buf, size_t count) +{ + struct ceph_osd_client *osdc; + struct rbd_device *rbd_dev; + ssize_t rc = -ENOMEM; + int irc, new_id = 0; + struct list_head *tmp; + char *mon_dev_name; + char *options; + + if (!try_module_get(THIS_MODULE)) + return -ENODEV; + + mon_dev_name = kmalloc(RBD_MAX_OPT_LEN, GFP_KERNEL); + if (!mon_dev_name) + goto err_out_mod; + + options = kmalloc(RBD_MAX_OPT_LEN, GFP_KERNEL); + if (!options) + goto err_mon_dev; + + /* new rbd_device object */ + rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL); + if (!rbd_dev) + goto err_out_opt; + + /* static rbd_device initialization */ + spin_lock_init(&rbd_dev->lock); + INIT_LIST_HEAD(&rbd_dev->node); + + /* generate unique id: find highest unique id, add one */ + mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); + + list_for_each(tmp, &rbd_dev_list) { + struct rbd_device *rbd_dev; + + rbd_dev = list_entry(tmp, struct rbd_device, node); + if (rbd_dev->id >= new_id) + new_id = rbd_dev->id + 1; + } + + rbd_dev->id = new_id; + + /* add to global list */ + list_add_tail(&rbd_dev->node, &rbd_dev_list); + + /* parse add command */ + if (sscanf(buf, "%" __stringify(RBD_MAX_OPT_LEN) "s " + "%" __stringify(RBD_MAX_OPT_LEN) "s " + "%" __stringify(RBD_MAX_POOL_NAME_LEN) "s " + "%" __stringify(RBD_MAX_OBJ_NAME_LEN) "s" + "%" __stringify(RBD_MAX_SNAP_NAME_LEN) "s", + mon_dev_name, options, rbd_dev->pool_name, + rbd_dev->obj, rbd_dev->snap_name) < 4) { + rc = -EINVAL; + goto err_out_slot; + } + + if (rbd_dev->snap_name[0] == 0) + rbd_dev->snap_name[0] = '-'; + + rbd_dev->obj_len = strlen(rbd_dev->obj); + snprintf(rbd_dev->obj_md_name, sizeof(rbd_dev->obj_md_name), "%s%s", + rbd_dev->obj, RBD_SUFFIX); + + /* initialize rest of new object */ + snprintf(rbd_dev->name, DEV_NAME_LEN, DRV_NAME "%d", rbd_dev->id); + rc = rbd_get_client(rbd_dev, mon_dev_name, options); + if (rc < 0) + goto err_out_slot; + + mutex_unlock(&ctl_mutex); + + /* pick the pool */ + osdc = &rbd_dev->client->osdc; + rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name); + if (rc < 0) + goto err_out_client; + rbd_dev->poolid = rc; + + /* register our block device */ + irc = register_blkdev(0, rbd_dev->name); + if (irc < 0) { + rc = irc; + goto err_out_client; + } + rbd_dev->major = irc; + + /* set up and announce blkdev mapping */ + rc = rbd_init_disk(rbd_dev); + if (rc) + goto err_out_blkdev; + + return count; + +err_out_blkdev: + unregister_blkdev(rbd_dev->major, rbd_dev->name); +err_out_client: + rbd_put_client(rbd_dev); + mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); +err_out_slot: + list_del_init(&rbd_dev->node); + mutex_unlock(&ctl_mutex); + + kfree(rbd_dev); +err_out_opt: + kfree(options); +err_mon_dev: + kfree(mon_dev_name); +err_out_mod: + dout("Error adding device %s\n", buf); + module_put(THIS_MODULE); + return rc; +} + +static struct rbd_device *__rbd_get_dev(unsigned long id) +{ + struct list_head *tmp; + struct rbd_device *rbd_dev; + + list_for_each(tmp, &rbd_dev_list) { + rbd_dev = list_entry(tmp, struct rbd_device, node); + if (rbd_dev->id == id) + return rbd_dev; + } + return NULL; +} + +static ssize_t class_rbd_remove(struct class *c, + struct class_attribute *attr, + const char *buf, + size_t count) +{ + struct rbd_device *rbd_dev = NULL; + int target_id, rc; + unsigned long ul; + + rc = strict_strtoul(buf, 10, &ul); + if (rc) + return rc; + + /* convert to int; abort if we lost anything in the conversion */ + target_id = (int) ul; + if (target_id != ul) + return -EINVAL; + + /* remove object from list immediately */ + mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); + + rbd_dev = __rbd_get_dev(target_id); + if (rbd_dev) + list_del_init(&rbd_dev->node); + + mutex_unlock(&ctl_mutex); + + if (!rbd_dev) + return -ENOENT; + + rbd_put_client(rbd_dev); + + /* clean up and free blkdev */ + rbd_free_disk(rbd_dev); + unregister_blkdev(rbd_dev->major, rbd_dev->name); + kfree(rbd_dev); + + /* release module ref */ + module_put(THIS_MODULE); + + return count; +} + +static ssize_t class_rbd_snaps_list(struct class *c, + struct class_attribute *attr, + char *data) +{ + struct rbd_device *rbd_dev = NULL; + struct list_head *tmp; + struct rbd_image_header *header; + int i, n = 0, max = PAGE_SIZE; + int ret; + + mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); + + n += snprintf(data, max, "#id\tsnap\tKB\n"); + + list_for_each(tmp, &rbd_dev_list) { + char *names, *p; + struct ceph_snap_context *snapc; + + rbd_dev = list_entry(tmp, struct rbd_device, node); + header = &rbd_dev->header; + + down_read(&header->snap_rwsem); + + names = header->snap_names; + snapc = header->snapc; + + n += snprintf(data + n, max - n, "%d\t%s\t%lld%s\n", + rbd_dev->id, RBD_SNAP_HEAD_NAME, + header->image_size >> 10, + (!rbd_dev->cur_snap ? " (*)" : "")); + if (n == max) + break; + + p = names; + for (i = 0; i < header->total_snaps; i++, p += strlen(p) + 1) { + n += snprintf(data + n, max - n, "%d\t%s\t%lld%s\n", + rbd_dev->id, p, header->snap_sizes[i] >> 10, + (rbd_dev->cur_snap && + (snap_index(header, i) == rbd_dev->cur_snap) ? + " (*)" : "")); + if (n == max) + break; + } + + up_read(&header->snap_rwsem); + } + + + ret = n; + mutex_unlock(&ctl_mutex); + return ret; +} + +static ssize_t class_rbd_snaps_refresh(struct class *c, + struct class_attribute *attr, + const char *buf, + size_t count) +{ + struct rbd_device *rbd_dev = NULL; + int target_id, rc; + unsigned long ul; + int ret = count; + + rc = strict_strtoul(buf, 10, &ul); + if (rc) + return rc; + + /* convert to int; abort if we lost anything in the conversion */ + target_id = (int) ul; + if (target_id != ul) + return -EINVAL; + + mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); + + rbd_dev = __rbd_get_dev(target_id); + if (!rbd_dev) { + ret = -ENOENT; + goto done; + } + + rc = rbd_update_snaps(rbd_dev); + if (rc < 0) + ret = rc; + +done: + mutex_unlock(&ctl_mutex); + return ret; +} + +static ssize_t class_rbd_snap_create(struct class *c, + struct class_attribute *attr, + const char *buf, + size_t count) +{ + struct rbd_device *rbd_dev = NULL; + int target_id, ret; + char *name; + + name = kmalloc(RBD_MAX_SNAP_NAME_LEN + 1, GFP_KERNEL); + if (!name) + return -ENOMEM; + + /* parse snaps add command */ + if (sscanf(buf, "%d " + "%" __stringify(RBD_MAX_SNAP_NAME_LEN) "s", + &target_id, + name) != 2) { + ret = -EINVAL; + goto done; + } + + mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); + + rbd_dev = __rbd_get_dev(target_id); + if (!rbd_dev) { + ret = -ENOENT; + goto done_unlock; + } + + ret = rbd_header_add_snap(rbd_dev, + name, GFP_KERNEL); + if (ret < 0) + goto done_unlock; + + ret = rbd_update_snaps(rbd_dev); + if (ret < 0) + goto done_unlock; + + ret = count; +done_unlock: + mutex_unlock(&ctl_mutex); +done: + kfree(name); + return ret; +} + +static ssize_t class_rbd_rollback(struct class *c, + struct class_attribute *attr, + const char *buf, + size_t count) +{ + struct rbd_device *rbd_dev = NULL; + int target_id, ret; + u64 snapid; + char snap_name[RBD_MAX_SNAP_NAME_LEN]; + u64 cur_ofs; + char *seg_name; + + /* parse snaps add command */ + if (sscanf(buf, "%d " + "%" __stringify(RBD_MAX_SNAP_NAME_LEN) "s", + &target_id, + snap_name) != 2) { + return -EINVAL; + } + + ret = -ENOMEM; + seg_name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO); + if (!seg_name) + return ret; + + mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); + + rbd_dev = __rbd_get_dev(target_id); + if (!rbd_dev) { + ret = -ENOENT; + goto done_unlock; + } + + ret = snap_by_name(&rbd_dev->header, snap_name, &snapid, NULL); + if (ret < 0) + goto done_unlock; + + dout("snapid=%lld\n", snapid); + + cur_ofs = 0; + while (cur_ofs < rbd_dev->header.image_size) { + cur_ofs += rbd_get_segment(&rbd_dev->header, + rbd_dev->obj, + cur_ofs, (u64)-1, + seg_name, NULL); + dout("seg_name=%s\n", seg_name); + + ret = rbd_req_sync_rollback_obj(rbd_dev, snapid, seg_name); + if (ret < 0) + pr_warning("could not roll back obj %s err=%d\n", + seg_name, ret); + } + + ret = rbd_update_snaps(rbd_dev); + if (ret < 0) + goto done_unlock; + + ret = count; + +done_unlock: + mutex_unlock(&ctl_mutex); + kfree(seg_name); + + return ret; +} + +static struct class_attribute class_rbd_attrs[] = { + __ATTR(add, 0200, NULL, class_rbd_add), + __ATTR(remove, 0200, NULL, class_rbd_remove), + __ATTR(list, 0444, class_rbd_list, NULL), + __ATTR(snaps_refresh, 0200, NULL, class_rbd_snaps_refresh), + __ATTR(snap_create, 0200, NULL, class_rbd_snap_create), + __ATTR(snaps_list, 0444, class_rbd_snaps_list, NULL), + __ATTR(snap_rollback, 0200, NULL, class_rbd_rollback), + __ATTR_NULL +}; + +/* + * create control files in sysfs + * /sys/class/rbd/... + */ +static int rbd_sysfs_init(void) +{ + int ret = -ENOMEM; + + class_rbd = kzalloc(sizeof(*class_rbd), GFP_KERNEL); + if (!class_rbd) + goto out; + + class_rbd->name = DRV_NAME; + class_rbd->owner = THIS_MODULE; + class_rbd->class_release = class_rbd_release; + class_rbd->class_attrs = class_rbd_attrs; + + ret = class_register(class_rbd); + if (ret) + goto out_class; + return 0; + +out_class: + kfree(class_rbd); + class_rbd = NULL; + pr_err(DRV_NAME ": failed to create class rbd\n"); +out: + return ret; +} + +static void rbd_sysfs_cleanup(void) +{ + if (class_rbd) + class_destroy(class_rbd); + class_rbd = NULL; +} + +int __init rbd_init(void) +{ + int rc; + + rc = rbd_sysfs_init(); + if (rc) + return rc; + spin_lock_init(&node_lock); + pr_info("loaded " DRV_NAME_LONG "\n"); + return 0; +} + +void __exit rbd_exit(void) +{ + rbd_sysfs_cleanup(); +} + +module_init(rbd_init); +module_exit(rbd_exit); + +MODULE_AUTHOR("Sage Weil <sage@newdream.net>"); +MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>"); +MODULE_DESCRIPTION("rados block device"); + +/* following authorship retained from original osdblk.c */ +MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>"); + +MODULE_LICENSE("GPL"); diff --git a/drivers/block/rbd_types.h b/drivers/block/rbd_types.h new file mode 100644 index 000000000000..fc6c678aa2cb --- /dev/null +++ b/drivers/block/rbd_types.h @@ -0,0 +1,73 @@ +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2010 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_RBD_TYPES_H +#define CEPH_RBD_TYPES_H + +#include <linux/types.h> + +/* + * rbd image 'foo' consists of objects + * foo.rbd - image metadata + * foo.00000000 + * foo.00000001 + * ... - data + */ + +#define RBD_SUFFIX ".rbd" +#define RBD_DIRECTORY "rbd_directory" +#define RBD_INFO "rbd_info" + +#define RBD_DEFAULT_OBJ_ORDER 22 /* 4MB */ +#define RBD_MIN_OBJ_ORDER 16 +#define RBD_MAX_OBJ_ORDER 30 + +#define RBD_MAX_OBJ_NAME_LEN 96 +#define RBD_MAX_SEG_NAME_LEN 128 + +#define RBD_COMP_NONE 0 +#define RBD_CRYPT_NONE 0 + +#define RBD_HEADER_TEXT "<<< Rados Block Device Image >>>\n" +#define RBD_HEADER_SIGNATURE "RBD" +#define RBD_HEADER_VERSION "001.005" + +struct rbd_info { + __le64 max_id; +} __attribute__ ((packed)); + +struct rbd_image_snap_ondisk { + __le64 id; + __le64 image_size; +} __attribute__((packed)); + +struct rbd_image_header_ondisk { + char text[40]; + char block_name[24]; + char signature[4]; + char version[8]; + struct { + __u8 order; + __u8 crypt_type; + __u8 comp_type; + __u8 unused; + } __attribute__((packed)) options; + __le64 image_size; + __le64 snap_seq; + __le32 snap_count; + __le32 reserved; + __le64 snap_names_len; + struct rbd_image_snap_ondisk snaps[0]; +} __attribute__((packed)); + + +#endif diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 1101e251a629..8320490226b7 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -2,7 +2,6 @@ #include <linux/spinlock.h> #include <linux/slab.h> #include <linux/blkdev.h> -#include <linux/smp_lock.h> #include <linux/hdreg.h> #include <linux/virtio.h> #include <linux/virtio_blk.h> @@ -222,8 +221,8 @@ static int virtblk_get_id(struct gendisk *disk, char *id_str) return err; } -static int virtblk_locked_ioctl(struct block_device *bdev, fmode_t mode, - unsigned cmd, unsigned long data) +static int virtblk_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long data) { struct gendisk *disk = bdev->bd_disk; struct virtio_blk *vblk = disk->private_data; @@ -238,18 +237,6 @@ static int virtblk_locked_ioctl(struct block_device *bdev, fmode_t mode, (void __user *)data); } -static int virtblk_ioctl(struct block_device *bdev, fmode_t mode, - unsigned int cmd, unsigned long param) -{ - int ret; - - lock_kernel(); - ret = virtblk_locked_ioctl(bdev, mode, cmd, param); - unlock_kernel(); - - return ret; -} - /* We provide getgeo only to please some old bootloader/partitioning tools */ static int virtblk_getgeo(struct block_device *bd, struct hd_geometry *geo) { diff --git a/drivers/char/agp/amd64-agp.c b/drivers/char/agp/amd64-agp.c index 70312da4c968..564808a5c3c0 100644 --- a/drivers/char/agp/amd64-agp.c +++ b/drivers/char/agp/amd64-agp.c @@ -199,7 +199,7 @@ static void amd64_cleanup(void) struct pci_dev *dev = k8_northbridges[i]; /* disable gart translation */ pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &tmp); - tmp &= ~AMD64_GARTEN; + tmp &= ~GARTEN; pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, tmp); } } @@ -313,7 +313,7 @@ static __devinit int fix_northbridge(struct pci_dev *nb, struct pci_dev *agp, if (order < 0 || !agp_aperture_valid(aper, (32*1024*1024)<<order)) return -1; - pci_write_config_dword(nb, AMD64_GARTAPERTURECTL, order << 1); + gart_set_size_and_enable(nb, order); pci_write_config_dword(nb, AMD64_GARTAPERTUREBASE, aper >> 25); return 0; diff --git a/drivers/char/agp/generic.c b/drivers/char/agp/generic.c index d2abf5143983..64255cef8a7d 100644 --- a/drivers/char/agp/generic.c +++ b/drivers/char/agp/generic.c @@ -984,7 +984,9 @@ int agp_generic_create_gatt_table(struct agp_bridge_data *bridge) bridge->driver->cache_flush(); #ifdef CONFIG_X86 - set_memory_uc((unsigned long)table, 1 << page_order); + if (set_memory_uc((unsigned long)table, 1 << page_order)) + printk(KERN_WARNING "Could not set GATT table memory to UC!"); + bridge->gatt_table = (void *)table; #else bridge->gatt_table = ioremap_nocache(virt_to_phys(table), diff --git a/drivers/char/tpm/tpm.c b/drivers/char/tpm/tpm.c index 05ad4a17a28f..7c4133582dba 100644 --- a/drivers/char/tpm/tpm.c +++ b/drivers/char/tpm/tpm.c @@ -47,6 +47,16 @@ enum tpm_duration { #define TPM_MAX_PROTECTED_ORDINAL 12 #define TPM_PROTECTED_ORDINAL_MASK 0xFF +/* + * Bug workaround - some TPM's don't flush the most + * recently changed pcr on suspend, so force the flush + * with an extend to the selected _unused_ non-volatile pcr. + */ +static int tpm_suspend_pcr; +module_param_named(suspend_pcr, tpm_suspend_pcr, uint, 0644); +MODULE_PARM_DESC(suspend_pcr, + "PCR to use for dummy writes to faciltate flush on suspend."); + static LIST_HEAD(tpm_chip_list); static DEFINE_SPINLOCK(driver_lock); static DECLARE_BITMAP(dev_mask, TPM_NUM_DEVICES); @@ -1077,18 +1087,6 @@ static struct tpm_input_header savestate_header = { .ordinal = TPM_ORD_SAVESTATE }; -/* Bug workaround - some TPM's don't flush the most - * recently changed pcr on suspend, so force the flush - * with an extend to the selected _unused_ non-volatile pcr. - */ -static int tpm_suspend_pcr; -static int __init tpm_suspend_setup(char *str) -{ - get_option(&str, &tpm_suspend_pcr); - return 1; -} -__setup("tpm_suspend_pcr=", tpm_suspend_setup); - /* * We are about to suspend. Save the TPM state * so that it can be restored. diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c index c810481a5bc2..6c1b676643a9 100644 --- a/drivers/char/virtio_console.c +++ b/drivers/char/virtio_console.c @@ -48,6 +48,9 @@ struct ports_driver_data { /* Used for exporting per-port information to debugfs */ struct dentry *debugfs_dir; + /* List of all the devices we're handling */ + struct list_head portdevs; + /* Number of devices this driver is handling */ unsigned int index; @@ -108,6 +111,9 @@ struct port_buffer { * ports for that device (vdev->priv). */ struct ports_device { + /* Next portdev in the list, head is in the pdrvdata struct */ + struct list_head list; + /* * Workqueue handlers where we process deferred work after * notification @@ -178,15 +184,21 @@ struct port { struct console cons; /* Each port associates with a separate char device */ - struct cdev cdev; + struct cdev *cdev; struct device *dev; + /* Reference-counting to handle port hot-unplugs and file operations */ + struct kref kref; + /* A waitqueue for poll() or blocking read operations */ wait_queue_head_t waitqueue; /* The 'name' of the port that we expose via sysfs properties */ char *name; + /* We can notify apps of host connect / disconnect events via SIGIO */ + struct fasync_struct *async_queue; + /* The 'id' to identify the port with the Host */ u32 id; @@ -221,6 +233,41 @@ out: return port; } +static struct port *find_port_by_devt_in_portdev(struct ports_device *portdev, + dev_t dev) +{ + struct port *port; + unsigned long flags; + + spin_lock_irqsave(&portdev->ports_lock, flags); + list_for_each_entry(port, &portdev->ports, list) + if (port->cdev->dev == dev) + goto out; + port = NULL; +out: + spin_unlock_irqrestore(&portdev->ports_lock, flags); + + return port; +} + +static struct port *find_port_by_devt(dev_t dev) +{ + struct ports_device *portdev; + struct port *port; + unsigned long flags; + + spin_lock_irqsave(&pdrvdata_lock, flags); + list_for_each_entry(portdev, &pdrvdata.portdevs, list) { + port = find_port_by_devt_in_portdev(portdev, dev); + if (port) + goto out; + } + port = NULL; +out: + spin_unlock_irqrestore(&pdrvdata_lock, flags); + return port; +} + static struct port *find_port_by_id(struct ports_device *portdev, u32 id) { struct port *port; @@ -410,7 +457,10 @@ static ssize_t __send_control_msg(struct ports_device *portdev, u32 port_id, static ssize_t send_control_msg(struct port *port, unsigned int event, unsigned int value) { - return __send_control_msg(port->portdev, port->id, event, value); + /* Did the port get unplugged before userspace closed it? */ + if (port->portdev) + return __send_control_msg(port->portdev, port->id, event, value); + return 0; } /* Callers must take the port->outvq_lock */ @@ -459,9 +509,12 @@ static ssize_t send_buf(struct port *port, void *in_buf, size_t in_count, /* * Wait till the host acknowledges it pushed out the data we - * sent. This is done for ports in blocking mode or for data - * from the hvc_console; the tty operations are performed with - * spinlocks held so we can't sleep here. + * sent. This is done for data from the hvc_console; the tty + * operations are performed with spinlocks held so we can't + * sleep here. An alternative would be to copy the data to a + * buffer and relax the spinning requirement. The downside is + * we need to kmalloc a GFP_ATOMIC buffer each time the + * console driver writes something out. */ while (!virtqueue_get_buf(out_vq, &len)) cpu_relax(); @@ -522,6 +575,10 @@ static ssize_t fill_readbuf(struct port *port, char *out_buf, size_t out_count, /* The condition that must be true for polling to end */ static bool will_read_block(struct port *port) { + if (!port->guest_connected) { + /* Port got hot-unplugged. Let's exit. */ + return false; + } return !port_has_data(port) && port->host_connected; } @@ -572,6 +629,9 @@ static ssize_t port_fops_read(struct file *filp, char __user *ubuf, if (ret < 0) return ret; } + /* Port got hot-unplugged. */ + if (!port->guest_connected) + return -ENODEV; /* * We could've received a disconnection message while we were * waiting for more data. @@ -613,6 +673,9 @@ static ssize_t port_fops_write(struct file *filp, const char __user *ubuf, if (ret < 0) return ret; } + /* Port got hot-unplugged. */ + if (!port->guest_connected) + return -ENODEV; count = min((size_t)(32 * 1024), count); @@ -626,6 +689,14 @@ static ssize_t port_fops_write(struct file *filp, const char __user *ubuf, goto free_buf; } + /* + * We now ask send_buf() to not spin for generic ports -- we + * can re-use the same code path that non-blocking file + * descriptors take for blocking file descriptors since the + * wait is already done and we're certain the write will go + * through to the host. + */ + nonblock = true; ret = send_buf(port, buf, count, nonblock); if (nonblock && ret > 0) @@ -645,6 +716,10 @@ static unsigned int port_fops_poll(struct file *filp, poll_table *wait) port = filp->private_data; poll_wait(filp, &port->waitqueue, wait); + if (!port->guest_connected) { + /* Port got unplugged */ + return POLLHUP; + } ret = 0; if (!will_read_block(port)) ret |= POLLIN | POLLRDNORM; @@ -656,6 +731,8 @@ static unsigned int port_fops_poll(struct file *filp, poll_table *wait) return ret; } +static void remove_port(struct kref *kref); + static int port_fops_release(struct inode *inode, struct file *filp) { struct port *port; @@ -676,6 +753,16 @@ static int port_fops_release(struct inode *inode, struct file *filp) reclaim_consumed_buffers(port); spin_unlock_irq(&port->outvq_lock); + /* + * Locks aren't necessary here as a port can't be opened after + * unplug, and if a port isn't unplugged, a kref would already + * exist for the port. Plus, taking ports_lock here would + * create a dependency on other locks taken by functions + * inside remove_port if we're the last holder of the port, + * creating many problems. + */ + kref_put(&port->kref, remove_port); + return 0; } @@ -683,22 +770,31 @@ static int port_fops_open(struct inode *inode, struct file *filp) { struct cdev *cdev = inode->i_cdev; struct port *port; + int ret; - port = container_of(cdev, struct port, cdev); + port = find_port_by_devt(cdev->dev); filp->private_data = port; + /* Prevent against a port getting hot-unplugged at the same time */ + spin_lock_irq(&port->portdev->ports_lock); + kref_get(&port->kref); + spin_unlock_irq(&port->portdev->ports_lock); + /* * Don't allow opening of console port devices -- that's done * via /dev/hvc */ - if (is_console_port(port)) - return -ENXIO; + if (is_console_port(port)) { + ret = -ENXIO; + goto out; + } /* Allow only one process to open a particular port at a time */ spin_lock_irq(&port->inbuf_lock); if (port->guest_connected) { spin_unlock_irq(&port->inbuf_lock); - return -EMFILE; + ret = -EMFILE; + goto out; } port->guest_connected = true; @@ -713,10 +809,23 @@ static int port_fops_open(struct inode *inode, struct file *filp) reclaim_consumed_buffers(port); spin_unlock_irq(&port->outvq_lock); + nonseekable_open(inode, filp); + /* Notify host of port being opened */ send_control_msg(filp->private_data, VIRTIO_CONSOLE_PORT_OPEN, 1); return 0; +out: + kref_put(&port->kref, remove_port); + return ret; +} + +static int port_fops_fasync(int fd, struct file *filp, int mode) +{ + struct port *port; + + port = filp->private_data; + return fasync_helper(fd, filp, mode, &port->async_queue); } /* @@ -732,6 +841,8 @@ static const struct file_operations port_fops = { .write = port_fops_write, .poll = port_fops_poll, .release = port_fops_release, + .fasync = port_fops_fasync, + .llseek = no_llseek, }; /* @@ -990,6 +1101,12 @@ static unsigned int fill_queue(struct virtqueue *vq, spinlock_t *lock) return nr_added_bufs; } +static void send_sigio_to_port(struct port *port) +{ + if (port->async_queue && port->guest_connected) + kill_fasync(&port->async_queue, SIGIO, POLL_OUT); +} + static int add_port(struct ports_device *portdev, u32 id) { char debugfs_name[16]; @@ -1004,6 +1121,7 @@ static int add_port(struct ports_device *portdev, u32 id) err = -ENOMEM; goto fail; } + kref_init(&port->kref); port->portdev = portdev; port->id = id; @@ -1011,6 +1129,7 @@ static int add_port(struct ports_device *portdev, u32 id) port->name = NULL; port->inbuf = NULL; port->cons.hvc = NULL; + port->async_queue = NULL; port->cons.ws.ws_row = port->cons.ws.ws_col = 0; @@ -1021,14 +1140,20 @@ static int add_port(struct ports_device *portdev, u32 id) port->in_vq = portdev->in_vqs[port->id]; port->out_vq = portdev->out_vqs[port->id]; - cdev_init(&port->cdev, &port_fops); + port->cdev = cdev_alloc(); + if (!port->cdev) { + dev_err(&port->portdev->vdev->dev, "Error allocating cdev\n"); + err = -ENOMEM; + goto free_port; + } + port->cdev->ops = &port_fops; devt = MKDEV(portdev->chr_major, id); - err = cdev_add(&port->cdev, devt, 1); + err = cdev_add(port->cdev, devt, 1); if (err < 0) { dev_err(&port->portdev->vdev->dev, "Error %d adding cdev for port %u\n", err, id); - goto free_port; + goto free_cdev; } port->dev = device_create(pdrvdata.class, &port->portdev->vdev->dev, devt, port, "vport%up%u", @@ -1093,7 +1218,7 @@ free_inbufs: free_device: device_destroy(pdrvdata.class, port->dev->devt); free_cdev: - cdev_del(&port->cdev); + cdev_del(port->cdev); free_port: kfree(port); fail: @@ -1102,21 +1227,45 @@ fail: return err; } -/* Remove all port-specific data. */ -static int remove_port(struct port *port) +/* No users remain, remove all port-specific data. */ +static void remove_port(struct kref *kref) +{ + struct port *port; + + port = container_of(kref, struct port, kref); + + sysfs_remove_group(&port->dev->kobj, &port_attribute_group); + device_destroy(pdrvdata.class, port->dev->devt); + cdev_del(port->cdev); + + kfree(port->name); + + debugfs_remove(port->debugfs_file); + + kfree(port); +} + +/* + * Port got unplugged. Remove port from portdev's list and drop the + * kref reference. If no userspace has this port opened, it will + * result in immediate removal the port. + */ +static void unplug_port(struct port *port) { struct port_buffer *buf; + spin_lock_irq(&port->portdev->ports_lock); + list_del(&port->list); + spin_unlock_irq(&port->portdev->ports_lock); + if (port->guest_connected) { port->guest_connected = false; port->host_connected = false; wake_up_interruptible(&port->waitqueue); - send_control_msg(port, VIRTIO_CONSOLE_PORT_OPEN, 0); - } - spin_lock_irq(&port->portdev->ports_lock); - list_del(&port->list); - spin_unlock_irq(&port->portdev->ports_lock); + /* Let the app know the port is going down. */ + send_sigio_to_port(port); + } if (is_console_port(port)) { spin_lock_irq(&pdrvdata_lock); @@ -1135,9 +1284,6 @@ static int remove_port(struct port *port) hvc_remove(port->cons.hvc); #endif } - sysfs_remove_group(&port->dev->kobj, &port_attribute_group); - device_destroy(pdrvdata.class, port->dev->devt); - cdev_del(&port->cdev); /* Remove unused data this port might have received. */ discard_port_data(port); @@ -1148,12 +1294,19 @@ static int remove_port(struct port *port) while ((buf = virtqueue_detach_unused_buf(port->in_vq))) free_buf(buf); - kfree(port->name); - - debugfs_remove(port->debugfs_file); + /* + * We should just assume the device itself has gone off -- + * else a close on an open port later will try to send out a + * control message. + */ + port->portdev = NULL; - kfree(port); - return 0; + /* + * Locks around here are not necessary - a port can't be + * opened after we removed the port struct from ports_list + * above. + */ + kref_put(&port->kref, remove_port); } /* Any private messages that the Host and Guest want to share */ @@ -1192,7 +1345,7 @@ static void handle_control_message(struct ports_device *portdev, add_port(portdev, cpkt->id); break; case VIRTIO_CONSOLE_PORT_REMOVE: - remove_port(port); + unplug_port(port); break; case VIRTIO_CONSOLE_CONSOLE_PORT: if (!cpkt->value) @@ -1234,6 +1387,12 @@ static void handle_control_message(struct ports_device *portdev, spin_lock_irq(&port->outvq_lock); reclaim_consumed_buffers(port); spin_unlock_irq(&port->outvq_lock); + + /* + * If the guest is connected, it'll be interested in + * knowing the host connection state changed. + */ + send_sigio_to_port(port); break; case VIRTIO_CONSOLE_PORT_NAME: /* @@ -1330,6 +1489,9 @@ static void in_intr(struct virtqueue *vq) wake_up_interruptible(&port->waitqueue); + /* Send a SIGIO indicating new data in case the process asked for it */ + send_sigio_to_port(port); + if (is_console_port(port) && hvc_poll(port->cons.hvc)) hvc_kick(); } @@ -1566,6 +1728,10 @@ static int __devinit virtcons_probe(struct virtio_device *vdev) add_port(portdev, 0); } + spin_lock_irq(&pdrvdata_lock); + list_add_tail(&portdev->list, &pdrvdata.portdevs); + spin_unlock_irq(&pdrvdata_lock); + __send_control_msg(portdev, VIRTIO_CONSOLE_BAD_ID, VIRTIO_CONSOLE_DEVICE_READY, 1); return 0; @@ -1589,23 +1755,41 @@ static void virtcons_remove(struct virtio_device *vdev) { struct ports_device *portdev; struct port *port, *port2; - struct port_buffer *buf; - unsigned int len; portdev = vdev->priv; + spin_lock_irq(&pdrvdata_lock); + list_del(&portdev->list); + spin_unlock_irq(&pdrvdata_lock); + + /* Disable interrupts for vqs */ + vdev->config->reset(vdev); + /* Finish up work that's lined up */ cancel_work_sync(&portdev->control_work); list_for_each_entry_safe(port, port2, &portdev->ports, list) - remove_port(port); + unplug_port(port); unregister_chrdev(portdev->chr_major, "virtio-portsdev"); - while ((buf = virtqueue_get_buf(portdev->c_ivq, &len))) - free_buf(buf); + /* + * When yanking out a device, we immediately lose the + * (device-side) queues. So there's no point in keeping the + * guest side around till we drop our final reference. This + * also means that any ports which are in an open state will + * have to just stop using the port, as the vqs are going + * away. + */ + if (use_multiport(portdev)) { + struct port_buffer *buf; + unsigned int len; - while ((buf = virtqueue_detach_unused_buf(portdev->c_ivq))) - free_buf(buf); + while ((buf = virtqueue_get_buf(portdev->c_ivq, &len))) + free_buf(buf); + + while ((buf = virtqueue_detach_unused_buf(portdev->c_ivq))) + free_buf(buf); + } vdev->config->del_vqs(vdev); kfree(portdev->in_vqs); @@ -1652,6 +1836,7 @@ static int __init init(void) PTR_ERR(pdrvdata.debugfs_dir)); } INIT_LIST_HEAD(&pdrvdata.consoles); + INIT_LIST_HEAD(&pdrvdata.portdevs); return register_virtio_driver(&virtio_console); } diff --git a/drivers/firewire/ohci.c b/drivers/firewire/ohci.c index 1b05896648bc..9dcb17d51aee 100644 --- a/drivers/firewire/ohci.c +++ b/drivers/firewire/ohci.c @@ -2840,7 +2840,7 @@ static int __devinit pci_probe(struct pci_dev *dev, const struct pci_device_id *ent) { struct fw_ohci *ohci; - u32 bus_options, max_receive, link_speed, version, link_enh; + u32 bus_options, max_receive, link_speed, version; u64 guid; int i, err, n_ir, n_it; size_t size; @@ -2894,23 +2894,6 @@ static int __devinit pci_probe(struct pci_dev *dev, if (param_quirks) ohci->quirks = param_quirks; - /* TI OHCI-Lynx and compatible: set recommended configuration bits. */ - if (dev->vendor == PCI_VENDOR_ID_TI) { - pci_read_config_dword(dev, PCI_CFG_TI_LinkEnh, &link_enh); - - /* adjust latency of ATx FIFO: use 1.7 KB threshold */ - link_enh &= ~TI_LinkEnh_atx_thresh_mask; - link_enh |= TI_LinkEnh_atx_thresh_1_7K; - - /* use priority arbitration for asynchronous responses */ - link_enh |= TI_LinkEnh_enab_unfair; - - /* required for aPhyEnhanceEnable to work */ - link_enh |= TI_LinkEnh_enab_accel; - - pci_write_config_dword(dev, PCI_CFG_TI_LinkEnh, link_enh); - } - ar_context_init(&ohci->ar_request_ctx, ohci, OHCI1394_AsReqRcvContextControlSet); diff --git a/drivers/firewire/ohci.h b/drivers/firewire/ohci.h index 0e6c5a466908..ef5e7336da68 100644 --- a/drivers/firewire/ohci.h +++ b/drivers/firewire/ohci.h @@ -155,12 +155,4 @@ #define OHCI1394_phy_tcode 0xe -/* TI extensions */ - -#define PCI_CFG_TI_LinkEnh 0xf4 -#define TI_LinkEnh_enab_accel 0x00000002 -#define TI_LinkEnh_enab_unfair 0x00000080 -#define TI_LinkEnh_atx_thresh_mask 0x00003000 -#define TI_LinkEnh_atx_thresh_1_7K 0x00001000 - #endif /* _FIREWIRE_OHCI_H */ diff --git a/drivers/gpu/drm/radeon/radeon_cursor.c b/drivers/gpu/drm/radeon/radeon_cursor.c index 5731fc9b1ae3..3eef567b0421 100644 --- a/drivers/gpu/drm/radeon/radeon_cursor.c +++ b/drivers/gpu/drm/radeon/radeon_cursor.c @@ -203,6 +203,7 @@ int radeon_crtc_cursor_move(struct drm_crtc *crtc, struct radeon_crtc *radeon_crtc = to_radeon_crtc(crtc); struct radeon_device *rdev = crtc->dev->dev_private; int xorigin = 0, yorigin = 0; + int w = radeon_crtc->cursor_width; if (x < 0) xorigin = -x + 1; @@ -213,22 +214,7 @@ int radeon_crtc_cursor_move(struct drm_crtc *crtc, if (yorigin >= CURSOR_HEIGHT) yorigin = CURSOR_HEIGHT - 1; - radeon_lock_cursor(crtc, true); - if (ASIC_IS_DCE4(rdev)) { - /* cursors are offset into the total surface */ - x += crtc->x; - y += crtc->y; - DRM_DEBUG("x %d y %d c->x %d c->y %d\n", x, y, crtc->x, crtc->y); - - /* XXX: check if evergreen has the same issues as avivo chips */ - WREG32(EVERGREEN_CUR_POSITION + radeon_crtc->crtc_offset, - ((xorigin ? 0 : x) << 16) | - (yorigin ? 0 : y)); - WREG32(EVERGREEN_CUR_HOT_SPOT + radeon_crtc->crtc_offset, (xorigin << 16) | yorigin); - WREG32(EVERGREEN_CUR_SIZE + radeon_crtc->crtc_offset, - ((radeon_crtc->cursor_width - 1) << 16) | (radeon_crtc->cursor_height - 1)); - } else if (ASIC_IS_AVIVO(rdev)) { - int w = radeon_crtc->cursor_width; + if (ASIC_IS_AVIVO(rdev)) { int i = 0; struct drm_crtc *crtc_p; @@ -260,7 +246,17 @@ int radeon_crtc_cursor_move(struct drm_crtc *crtc, if (w <= 0) w = 1; } + } + radeon_lock_cursor(crtc, true); + if (ASIC_IS_DCE4(rdev)) { + WREG32(EVERGREEN_CUR_POSITION + radeon_crtc->crtc_offset, + ((xorigin ? 0 : x) << 16) | + (yorigin ? 0 : y)); + WREG32(EVERGREEN_CUR_HOT_SPOT + radeon_crtc->crtc_offset, (xorigin << 16) | yorigin); + WREG32(EVERGREEN_CUR_SIZE + radeon_crtc->crtc_offset, + ((w - 1) << 16) | (radeon_crtc->cursor_height - 1)); + } else if (ASIC_IS_AVIVO(rdev)) { WREG32(AVIVO_D1CUR_POSITION + radeon_crtc->crtc_offset, ((xorigin ? 0 : x) << 16) | (yorigin ? 0 : y)); diff --git a/drivers/hid/hid-cando.c b/drivers/hid/hid-cando.c index 4267a6fdc277..5925bdcd417d 100644 --- a/drivers/hid/hid-cando.c +++ b/drivers/hid/hid-cando.c @@ -237,6 +237,8 @@ static const struct hid_device_id cando_devices[] = { USB_DEVICE_ID_CANDO_MULTI_TOUCH) }, { HID_USB_DEVICE(USB_VENDOR_ID_CANDO, USB_DEVICE_ID_CANDO_MULTI_TOUCH_11_6) }, + { HID_USB_DEVICE(USB_VENDOR_ID_CANDO, + USB_DEVICE_ID_CANDO_MULTI_TOUCH_15_6) }, { } }; MODULE_DEVICE_TABLE(hid, cando_devices); diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c index 3f7292486024..a0dea3d1296e 100644 --- a/drivers/hid/hid-core.c +++ b/drivers/hid/hid-core.c @@ -1292,6 +1292,7 @@ static const struct hid_device_id hid_blacklist[] = { { HID_USB_DEVICE(USB_VENDOR_ID_BTC, USB_DEVICE_ID_BTC_EMPREX_REMOTE_2) }, { HID_USB_DEVICE(USB_VENDOR_ID_CANDO, USB_DEVICE_ID_CANDO_MULTI_TOUCH) }, { HID_USB_DEVICE(USB_VENDOR_ID_CANDO, USB_DEVICE_ID_CANDO_MULTI_TOUCH_11_6) }, + { HID_USB_DEVICE(USB_VENDOR_ID_CANDO, USB_DEVICE_ID_CANDO_MULTI_TOUCH_15_6) }, { HID_USB_DEVICE(USB_VENDOR_ID_CHERRY, USB_DEVICE_ID_CHERRY_CYMOTION) }, { HID_USB_DEVICE(USB_VENDOR_ID_CHERRY, USB_DEVICE_ID_CHERRY_CYMOTION_SOLAR) }, { HID_USB_DEVICE(USB_VENDOR_ID_CHICONY, USB_DEVICE_ID_CHICONY_TACTICAL_PAD) }, diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h index 765a4f53eb5c..c5ae5f1545bd 100644 --- a/drivers/hid/hid-ids.h +++ b/drivers/hid/hid-ids.h @@ -134,6 +134,7 @@ #define USB_VENDOR_ID_CANDO 0x2087 #define USB_DEVICE_ID_CANDO_MULTI_TOUCH 0x0a01 #define USB_DEVICE_ID_CANDO_MULTI_TOUCH_11_6 0x0b03 +#define USB_DEVICE_ID_CANDO_MULTI_TOUCH_15_6 0x0f01 #define USB_VENDOR_ID_CH 0x068e #define USB_DEVICE_ID_CH_PRO_PEDALS 0x00f2 @@ -503,6 +504,7 @@ #define USB_VENDOR_ID_TURBOX 0x062a #define USB_DEVICE_ID_TURBOX_KEYBOARD 0x0201 +#define USB_DEVICE_ID_TURBOX_TOUCHSCREEN_MOSART 0x7100 #define USB_VENDOR_ID_TWINHAN 0x6253 #define USB_DEVICE_ID_TWINHAN_IR_REMOTE 0x0100 diff --git a/drivers/hid/hidraw.c b/drivers/hid/hidraw.c index 47d70c523d93..a3866b5c0c43 100644 --- a/drivers/hid/hidraw.c +++ b/drivers/hid/hidraw.c @@ -109,6 +109,12 @@ static ssize_t hidraw_write(struct file *file, const char __user *buffer, size_t int ret = 0; mutex_lock(&minors_lock); + + if (!hidraw_table[minor]) { + ret = -ENODEV; + goto out; + } + dev = hidraw_table[minor]->hid; if (!dev->hid_output_raw_report) { @@ -244,6 +250,10 @@ static long hidraw_ioctl(struct file *file, unsigned int cmd, mutex_lock(&minors_lock); dev = hidraw_table[minor]; + if (!dev) { + ret = -ENODEV; + goto out; + } switch (cmd) { case HIDIOCGRDESCSIZE: @@ -317,6 +327,7 @@ static long hidraw_ioctl(struct file *file, unsigned int cmd, ret = -ENOTTY; } +out: mutex_unlock(&minors_lock); return ret; } diff --git a/drivers/hid/usbhid/hid-quirks.c b/drivers/hid/usbhid/hid-quirks.c index 70da3181c8a0..f0260c699adb 100644 --- a/drivers/hid/usbhid/hid-quirks.c +++ b/drivers/hid/usbhid/hid-quirks.c @@ -36,6 +36,7 @@ static const struct hid_blacklist { { USB_VENDOR_ID_DWAV, USB_DEVICE_ID_EGALAX_TOUCHCONTROLLER, HID_QUIRK_MULTI_INPUT | HID_QUIRK_NOGET }, { USB_VENDOR_ID_DWAV, USB_DEVICE_ID_DWAV_EGALAX_MULTITOUCH, HID_QUIRK_MULTI_INPUT }, { USB_VENDOR_ID_MOJO, USB_DEVICE_ID_RETRO_ADAPTER, HID_QUIRK_MULTI_INPUT }, + { USB_VENDOR_ID_TURBOX, USB_DEVICE_ID_TURBOX_TOUCHSCREEN_MOSART, HID_QUIRK_MULTI_INPUT }, { USB_VENDOR_ID_HAPP, USB_DEVICE_ID_UGCI_DRIVING, HID_QUIRK_BADPAD | HID_QUIRK_MULTI_INPUT }, { USB_VENDOR_ID_HAPP, USB_DEVICE_ID_UGCI_FLYING, HID_QUIRK_BADPAD | HID_QUIRK_MULTI_INPUT }, { USB_VENDOR_ID_HAPP, USB_DEVICE_ID_UGCI_FIGHTING, HID_QUIRK_BADPAD | HID_QUIRK_MULTI_INPUT }, diff --git a/drivers/i2c/busses/i2c-davinci.c b/drivers/i2c/busses/i2c-davinci.c index b8feac5f2ef4..5795c8398c7c 100644 --- a/drivers/i2c/busses/i2c-davinci.c +++ b/drivers/i2c/busses/i2c-davinci.c @@ -331,21 +331,16 @@ i2c_davinci_xfer_msg(struct i2c_adapter *adap, struct i2c_msg *msg, int stop) INIT_COMPLETION(dev->cmd_complete); dev->cmd_err = 0; - /* Take I2C out of reset, configure it as master and set the - * start bit */ - flag = DAVINCI_I2C_MDR_IRS | DAVINCI_I2C_MDR_MST | DAVINCI_I2C_MDR_STT; + /* Take I2C out of reset and configure it as master */ + flag = DAVINCI_I2C_MDR_IRS | DAVINCI_I2C_MDR_MST; /* if the slave address is ten bit address, enable XA bit */ if (msg->flags & I2C_M_TEN) flag |= DAVINCI_I2C_MDR_XA; if (!(msg->flags & I2C_M_RD)) flag |= DAVINCI_I2C_MDR_TRX; - if (stop) - flag |= DAVINCI_I2C_MDR_STP; - if (msg->len == 0) { + if (msg->len == 0) flag |= DAVINCI_I2C_MDR_RM; - flag &= ~DAVINCI_I2C_MDR_STP; - } /* Enable receive or transmit interrupts */ w = davinci_i2c_read_reg(dev, DAVINCI_I2C_IMR_REG); @@ -358,17 +353,28 @@ i2c_davinci_xfer_msg(struct i2c_adapter *adap, struct i2c_msg *msg, int stop) dev->terminate = 0; /* + * Write mode register first as needed for correct behaviour + * on OMAP-L138, but don't set STT yet to avoid a race with XRDY + * occuring before we have loaded DXR + */ + davinci_i2c_write_reg(dev, DAVINCI_I2C_MDR_REG, flag); + + /* * First byte should be set here, not after interrupt, * because transmit-data-ready interrupt can come before * NACK-interrupt during sending of previous message and * ICDXR may have wrong data + * It also saves us one interrupt, slightly faster */ if ((!(msg->flags & I2C_M_RD)) && dev->buf_len) { davinci_i2c_write_reg(dev, DAVINCI_I2C_DXR_REG, *dev->buf++); dev->buf_len--; } - /* write the data into mode register; start transmitting */ + /* Set STT to begin transmit now DXR is loaded */ + flag |= DAVINCI_I2C_MDR_STT; + if (stop && msg->len != 0) + flag |= DAVINCI_I2C_MDR_STP; davinci_i2c_write_reg(dev, DAVINCI_I2C_MDR_REG, flag); r = wait_for_completion_interruptible_timeout(&dev->cmd_complete, diff --git a/drivers/i2c/busses/i2c-imx.c b/drivers/i2c/busses/i2c-imx.c index d1ff9408dc1f..4c2a62b75b5c 100644 --- a/drivers/i2c/busses/i2c-imx.c +++ b/drivers/i2c/busses/i2c-imx.c @@ -159,15 +159,9 @@ static int i2c_imx_bus_busy(struct imx_i2c_struct *i2c_imx, int for_busy) static int i2c_imx_trx_complete(struct imx_i2c_struct *i2c_imx) { - int result; - - result = wait_event_interruptible_timeout(i2c_imx->queue, - i2c_imx->i2csr & I2SR_IIF, HZ / 10); + wait_event_timeout(i2c_imx->queue, i2c_imx->i2csr & I2SR_IIF, HZ / 10); - if (unlikely(result < 0)) { - dev_dbg(&i2c_imx->adapter.dev, "<%s> result < 0\n", __func__); - return result; - } else if (unlikely(!(i2c_imx->i2csr & I2SR_IIF))) { + if (unlikely(!(i2c_imx->i2csr & I2SR_IIF))) { dev_dbg(&i2c_imx->adapter.dev, "<%s> Timeout\n", __func__); return -ETIMEDOUT; } @@ -295,7 +289,7 @@ static irqreturn_t i2c_imx_isr(int irq, void *dev_id) i2c_imx->i2csr = temp; temp &= ~I2SR_IIF; writeb(temp, i2c_imx->base + IMX_I2C_I2SR); - wake_up_interruptible(&i2c_imx->queue); + wake_up(&i2c_imx->queue); return IRQ_HANDLED; } diff --git a/drivers/input/evdev.c b/drivers/input/evdev.c index c908c5f83645..af9ee313c10b 100644 --- a/drivers/input/evdev.c +++ b/drivers/input/evdev.c @@ -28,7 +28,7 @@ struct evdev { int minor; struct input_handle handle; wait_queue_head_t wait; - struct evdev_client *grab; + struct evdev_client __rcu *grab; struct list_head client_list; spinlock_t client_lock; /* protects client_list */ struct mutex mutex; @@ -669,6 +669,9 @@ static long evdev_do_ioctl(struct file *file, unsigned int cmd, if ((_IOC_NR(cmd) & ~ABS_MAX) == _IOC_NR(EVIOCGABS(0))) { + if (!dev->absinfo) + return -EINVAL; + t = _IOC_NR(cmd) & ABS_MAX; abs = dev->absinfo[t]; @@ -680,10 +683,13 @@ static long evdev_do_ioctl(struct file *file, unsigned int cmd, } } - if (_IOC_DIR(cmd) == _IOC_READ) { + if (_IOC_DIR(cmd) == _IOC_WRITE) { if ((_IOC_NR(cmd) & ~ABS_MAX) == _IOC_NR(EVIOCSABS(0))) { + if (!dev->absinfo) + return -EINVAL; + t = _IOC_NR(cmd) & ABS_MAX; if (copy_from_user(&abs, p, min_t(size_t, diff --git a/drivers/input/misc/hp_sdc_rtc.c b/drivers/input/misc/hp_sdc_rtc.c index c19066479057..7e2c12a5b839 100644 --- a/drivers/input/misc/hp_sdc_rtc.c +++ b/drivers/input/misc/hp_sdc_rtc.c @@ -104,7 +104,7 @@ static int hp_sdc_rtc_do_read_bbrtc (struct rtc_time *rtctm) t.endidx = 91; t.seq = tseq; t.act.semaphore = &tsem; - init_MUTEX_LOCKED(&tsem); + sema_init(&tsem, 0); if (hp_sdc_enqueue_transaction(&t)) return -1; @@ -698,7 +698,7 @@ static int __init hp_sdc_rtc_init(void) return -ENODEV; #endif - init_MUTEX(&i8042tregs); + sema_init(&i8042tregs, 1); if ((ret = hp_sdc_request_timer_irq(&hp_sdc_rtc_isr))) return ret; diff --git a/drivers/input/serio/hil_mlc.c b/drivers/input/serio/hil_mlc.c index c92f4edfee7b..e5624d8f1709 100644 --- a/drivers/input/serio/hil_mlc.c +++ b/drivers/input/serio/hil_mlc.c @@ -915,15 +915,15 @@ int hil_mlc_register(hil_mlc *mlc) mlc->ostarted = 0; rwlock_init(&mlc->lock); - init_MUTEX(&mlc->osem); + sema_init(&mlc->osem, 1); - init_MUTEX(&mlc->isem); + sema_init(&mlc->isem, 1); mlc->icount = -1; mlc->imatch = 0; mlc->opercnt = 0; - init_MUTEX_LOCKED(&(mlc->csem)); + sema_init(&(mlc->csem), 0); hil_mlc_clear_di_scratch(mlc); hil_mlc_clear_di_map(mlc, 0); diff --git a/drivers/input/serio/hp_sdc.c b/drivers/input/serio/hp_sdc.c index bcc2d30ec245..8c0b51c31424 100644 --- a/drivers/input/serio/hp_sdc.c +++ b/drivers/input/serio/hp_sdc.c @@ -905,7 +905,7 @@ static int __init hp_sdc_init(void) ts_sync[1] = 0x0f; ts_sync[2] = ts_sync[3] = ts_sync[4] = ts_sync[5] = 0; t_sync.act.semaphore = &s_sync; - init_MUTEX_LOCKED(&s_sync); + sema_init(&s_sync, 0); hp_sdc_enqueue_transaction(&t_sync); down(&s_sync); /* Wait for t_sync to complete */ @@ -1039,7 +1039,7 @@ static int __init hp_sdc_register(void) return hp_sdc.dev_err; } - init_MUTEX_LOCKED(&tq_init_sem); + sema_init(&tq_init_sem, 0); tq_init.actidx = 0; tq_init.idx = 1; diff --git a/drivers/macintosh/adb.c b/drivers/macintosh/adb.c index 1c4ee6e77937..bf64e49d996a 100644 --- a/drivers/macintosh/adb.c +++ b/drivers/macintosh/adb.c @@ -83,7 +83,7 @@ static struct adb_driver *adb_controller; BLOCKING_NOTIFIER_HEAD(adb_client_list); static int adb_got_sleep; static int adb_inited; -static DECLARE_MUTEX(adb_probe_mutex); +static DEFINE_SEMAPHORE(adb_probe_mutex); static int sleepy_trackpad; static int autopoll_devs; int __adb_probe_sync; diff --git a/drivers/media/video/v4l2-compat-ioctl32.c b/drivers/media/video/v4l2-compat-ioctl32.c index 073f01390cdd..86294ed35c9b 100644 --- a/drivers/media/video/v4l2-compat-ioctl32.c +++ b/drivers/media/video/v4l2-compat-ioctl32.c @@ -193,17 +193,24 @@ static int put_video_window32(struct video_window *kp, struct video_window32 __u struct video_code32 { char loadwhat[16]; /* name or tag of file being passed */ compat_int_t datasize; - unsigned char *data; + compat_uptr_t data; }; -static int get_microcode32(struct video_code *kp, struct video_code32 __user *up) +static struct video_code __user *get_microcode32(struct video_code32 *kp) { - if (!access_ok(VERIFY_READ, up, sizeof(struct video_code32)) || - copy_from_user(kp->loadwhat, up->loadwhat, sizeof(up->loadwhat)) || - get_user(kp->datasize, &up->datasize) || - copy_from_user(kp->data, up->data, up->datasize)) - return -EFAULT; - return 0; + struct video_code __user *up; + + up = compat_alloc_user_space(sizeof(*up)); + + /* + * NOTE! We don't actually care if these fail. If the + * user address is invalid, the native ioctl will do + * the error handling for us + */ + (void) copy_to_user(up->loadwhat, kp->loadwhat, sizeof(up->loadwhat)); + (void) put_user(kp->datasize, &up->datasize); + (void) put_user(compat_ptr(kp->data), &up->data); + return up; } #define VIDIOCGTUNER32 _IOWR('v', 4, struct video_tuner32) @@ -739,7 +746,7 @@ static long do_video_ioctl(struct file *file, unsigned int cmd, unsigned long ar struct video_tuner vt; struct video_buffer vb; struct video_window vw; - struct video_code vc; + struct video_code32 vc; struct video_audio va; #endif struct v4l2_format v2f; @@ -818,8 +825,11 @@ static long do_video_ioctl(struct file *file, unsigned int cmd, unsigned long ar break; case VIDIOCSMICROCODE: - err = get_microcode32(&karg.vc, up); - compatible_arg = 0; + /* Copy the 32-bit "video_code32" to kernel space */ + if (copy_from_user(&karg.vc, up, sizeof(karg.vc))) + return -EFAULT; + /* Convert the 32-bit version to a 64-bit version in user space */ + up = get_microcode32(&karg.vc); break; case VIDIOCSFREQ: diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c index 5db49b124ffa..09eee6df0653 100644 --- a/drivers/mmc/core/core.c +++ b/drivers/mmc/core/core.c @@ -1631,6 +1631,19 @@ int mmc_suspend_host(struct mmc_host *host) if (host->bus_ops && !host->bus_dead) { if (host->bus_ops->suspend) err = host->bus_ops->suspend(host); + if (err == -ENOSYS || !host->bus_ops->resume) { + /* + * We simply "remove" the card in this case. + * It will be redetected on resume. + */ + if (host->bus_ops->remove) + host->bus_ops->remove(host); + mmc_claim_host(host); + mmc_detach_bus(host); + mmc_release_host(host); + host->pm_flags = 0; + err = 0; + } } mmc_bus_put(host); diff --git a/drivers/mtd/nand/mxc_nand.c b/drivers/mtd/nand/mxc_nand.c index b2828e84d243..214b03afdd48 100644 --- a/drivers/mtd/nand/mxc_nand.c +++ b/drivers/mtd/nand/mxc_nand.c @@ -30,6 +30,8 @@ #include <linux/clk.h> #include <linux/err.h> #include <linux/io.h> +#include <linux/irq.h> +#include <linux/completion.h> #include <asm/mach/flash.h> #include <mach/mxc_nand.h> @@ -151,7 +153,7 @@ struct mxc_nand_host { int irq; int eccsize; - wait_queue_head_t irq_waitq; + struct completion op_completion; uint8_t *data_buf; unsigned int buf_start; @@ -164,6 +166,7 @@ struct mxc_nand_host { void (*send_read_id)(struct mxc_nand_host *); uint16_t (*get_dev_status)(struct mxc_nand_host *); int (*check_int)(struct mxc_nand_host *); + void (*irq_control)(struct mxc_nand_host *, int); }; /* OOB placement block for use with hardware ecc generation */ @@ -216,9 +219,12 @@ static irqreturn_t mxc_nfc_irq(int irq, void *dev_id) { struct mxc_nand_host *host = dev_id; - disable_irq_nosync(irq); + if (!host->check_int(host)) + return IRQ_NONE; - wake_up(&host->irq_waitq); + host->irq_control(host, 0); + + complete(&host->op_completion); return IRQ_HANDLED; } @@ -245,11 +251,54 @@ static int check_int_v1_v2(struct mxc_nand_host *host) if (!(tmp & NFC_V1_V2_CONFIG2_INT)) return 0; - writew(tmp & ~NFC_V1_V2_CONFIG2_INT, NFC_V1_V2_CONFIG2); + if (!cpu_is_mx21()) + writew(tmp & ~NFC_V1_V2_CONFIG2_INT, NFC_V1_V2_CONFIG2); return 1; } +/* + * It has been observed that the i.MX21 cannot read the CONFIG2:INT bit + * if interrupts are masked (CONFIG1:INT_MSK is set). To handle this, the + * driver can enable/disable the irq line rather than simply masking the + * interrupts. + */ +static void irq_control_mx21(struct mxc_nand_host *host, int activate) +{ + if (activate) + enable_irq(host->irq); + else + disable_irq_nosync(host->irq); +} + +static void irq_control_v1_v2(struct mxc_nand_host *host, int activate) +{ + uint16_t tmp; + + tmp = readw(NFC_V1_V2_CONFIG1); + + if (activate) + tmp &= ~NFC_V1_V2_CONFIG1_INT_MSK; + else + tmp |= NFC_V1_V2_CONFIG1_INT_MSK; + + writew(tmp, NFC_V1_V2_CONFIG1); +} + +static void irq_control_v3(struct mxc_nand_host *host, int activate) +{ + uint32_t tmp; + + tmp = readl(NFC_V3_CONFIG2); + + if (activate) + tmp &= ~NFC_V3_CONFIG2_INT_MSK; + else + tmp |= NFC_V3_CONFIG2_INT_MSK; + + writel(tmp, NFC_V3_CONFIG2); +} + /* This function polls the NANDFC to wait for the basic operation to * complete by checking the INT bit of config2 register. */ @@ -259,10 +308,9 @@ static void wait_op_done(struct mxc_nand_host *host, int useirq) if (useirq) { if (!host->check_int(host)) { - - enable_irq(host->irq); - - wait_event(host->irq_waitq, host->check_int(host)); + INIT_COMPLETION(host->op_completion); + host->irq_control(host, 1); + wait_for_completion(&host->op_completion); } } else { while (max_retries-- > 0) { @@ -799,6 +847,7 @@ static void preset_v3(struct mtd_info *mtd) NFC_V3_CONFIG2_2CMD_PHASES | NFC_V3_CONFIG2_SPAS(mtd->oobsize >> 1) | NFC_V3_CONFIG2_ST_CMD(0x70) | + NFC_V3_CONFIG2_INT_MSK | NFC_V3_CONFIG2_NUM_ADDR_PHASE0; if (chip->ecc.mode == NAND_ECC_HW) @@ -1024,6 +1073,10 @@ static int __init mxcnd_probe(struct platform_device *pdev) host->send_read_id = send_read_id_v1_v2; host->get_dev_status = get_dev_status_v1_v2; host->check_int = check_int_v1_v2; + if (cpu_is_mx21()) + host->irq_control = irq_control_mx21; + else + host->irq_control = irq_control_v1_v2; } if (nfc_is_v21()) { @@ -1062,6 +1115,7 @@ static int __init mxcnd_probe(struct platform_device *pdev) host->send_read_id = send_read_id_v3; host->check_int = check_int_v3; host->get_dev_status = get_dev_status_v3; + host->irq_control = irq_control_v3; oob_smallpage = &nandv2_hw_eccoob_smallpage; oob_largepage = &nandv2_hw_eccoob_largepage; } else @@ -1093,14 +1147,34 @@ static int __init mxcnd_probe(struct platform_device *pdev) this->options |= NAND_USE_FLASH_BBT; } - init_waitqueue_head(&host->irq_waitq); + init_completion(&host->op_completion); host->irq = platform_get_irq(pdev, 0); + /* + * mask the interrupt. For i.MX21 explicitely call + * irq_control_v1_v2 to use the mask bit. We can't call + * disable_irq_nosync() for an interrupt we do not own yet. + */ + if (cpu_is_mx21()) + irq_control_v1_v2(host, 0); + else + host->irq_control(host, 0); + err = request_irq(host->irq, mxc_nfc_irq, IRQF_DISABLED, DRIVER_NAME, host); if (err) goto eirq; + host->irq_control(host, 0); + + /* + * Now that the interrupt is disabled make sure the interrupt + * mask bit is cleared on i.MX21. Otherwise we can't read + * the interrupt status bit on this machine. + */ + if (cpu_is_mx21()) + irq_control_v1_v2(host, 1); + /* first scan to find the device and get the page size */ if (nand_scan_ident(mtd, 1, NULL)) { err = -ENXIO; diff --git a/drivers/net/3c527.c b/drivers/net/3c527.c index 70705d1306b9..eca55c52bdfd 100644 --- a/drivers/net/3c527.c +++ b/drivers/net/3c527.c @@ -522,7 +522,7 @@ static int __init mc32_probe1(struct net_device *dev, int slot) lp->tx_len = lp->exec_box->data[9]; /* Transmit list count */ lp->rx_len = lp->exec_box->data[11]; /* Receive list count */ - init_MUTEX_LOCKED(&lp->cmd_mutex); + sema_init(&lp->cmd_mutex, 0); init_completion(&lp->execution_cmd); init_completion(&lp->xceiver_cmd); diff --git a/drivers/net/b44.c b/drivers/net/b44.c index 1e620e287ae0..efeffdf9e5fa 100644 --- a/drivers/net/b44.c +++ b/drivers/net/b44.c @@ -2170,8 +2170,6 @@ static int __devinit b44_init_one(struct ssb_device *sdev, dev->irq = sdev->irq; SET_ETHTOOL_OPS(dev, &b44_ethtool_ops); - netif_carrier_off(dev); - err = ssb_bus_powerup(sdev->bus, 0); if (err) { dev_err(sdev->dev, @@ -2213,6 +2211,8 @@ static int __devinit b44_init_one(struct ssb_device *sdev, goto err_out_powerdown; } + netif_carrier_off(dev); + ssb_set_drvdata(sdev, dev); /* Chip reset provides power to the b44 MAC & PCI cores, which diff --git a/drivers/net/ehea/ehea_main.c b/drivers/net/ehea/ehea_main.c index a333b42111b8..6372610ed240 100644 --- a/drivers/net/ehea/ehea_main.c +++ b/drivers/net/ehea/ehea_main.c @@ -533,8 +533,15 @@ static inline void ehea_fill_skb(struct net_device *dev, int length = cqe->num_bytes_transfered - 4; /*remove CRC */ skb_put(skb, length); - skb->ip_summed = CHECKSUM_UNNECESSARY; skb->protocol = eth_type_trans(skb, dev); + + /* The packet was not an IPV4 packet so a complemented checksum was + calculated. The value is found in the Internet Checksum field. */ + if (cqe->status & EHEA_CQE_BLIND_CKSUM) { + skb->ip_summed = CHECKSUM_COMPLETE; + skb->csum = csum_unfold(~cqe->inet_checksum_value); + } else + skb->ip_summed = CHECKSUM_UNNECESSARY; } static inline struct sk_buff *get_skb_by_index(struct sk_buff **skb_array, diff --git a/drivers/net/ehea/ehea_qmr.h b/drivers/net/ehea/ehea_qmr.h index f608a6c54af5..38104734a3be 100644 --- a/drivers/net/ehea/ehea_qmr.h +++ b/drivers/net/ehea/ehea_qmr.h @@ -150,6 +150,7 @@ struct ehea_rwqe { #define EHEA_CQE_TYPE_RQ 0x60 #define EHEA_CQE_STAT_ERR_MASK 0x700F #define EHEA_CQE_STAT_FAT_ERR_MASK 0xF +#define EHEA_CQE_BLIND_CKSUM 0x8000 #define EHEA_CQE_STAT_ERR_TCP 0x4000 #define EHEA_CQE_STAT_ERR_IP 0x2000 #define EHEA_CQE_STAT_ERR_CRC 0x1000 diff --git a/drivers/net/fec.c b/drivers/net/fec.c index 768b840aeb6b..cce32d43175f 100644 --- a/drivers/net/fec.c +++ b/drivers/net/fec.c @@ -678,24 +678,37 @@ static int fec_enet_mii_probe(struct net_device *dev) { struct fec_enet_private *fep = netdev_priv(dev); struct phy_device *phy_dev = NULL; - int ret; + char mdio_bus_id[MII_BUS_ID_SIZE]; + char phy_name[MII_BUS_ID_SIZE + 3]; + int phy_id; fep->phy_dev = NULL; - /* find the first phy */ - phy_dev = phy_find_first(fep->mii_bus); - if (!phy_dev) { - printk(KERN_ERR "%s: no PHY found\n", dev->name); - return -ENODEV; + /* check for attached phy */ + for (phy_id = 0; (phy_id < PHY_MAX_ADDR); phy_id++) { + if ((fep->mii_bus->phy_mask & (1 << phy_id))) + continue; + if (fep->mii_bus->phy_map[phy_id] == NULL) + continue; + if (fep->mii_bus->phy_map[phy_id]->phy_id == 0) + continue; + strncpy(mdio_bus_id, fep->mii_bus->id, MII_BUS_ID_SIZE); + break; } - /* attach the mac to the phy */ - ret = phy_connect_direct(dev, phy_dev, - &fec_enet_adjust_link, 0, - PHY_INTERFACE_MODE_MII); - if (ret) { - printk(KERN_ERR "%s: Could not attach to PHY\n", dev->name); - return ret; + if (phy_id >= PHY_MAX_ADDR) { + printk(KERN_INFO "%s: no PHY, assuming direct connection " + "to switch\n", dev->name); + strncpy(mdio_bus_id, "0", MII_BUS_ID_SIZE); + phy_id = 0; + } + + snprintf(phy_name, MII_BUS_ID_SIZE, PHY_ID_FMT, mdio_bus_id, phy_id); + phy_dev = phy_connect(dev, phy_name, &fec_enet_adjust_link, 0, + PHY_INTERFACE_MODE_MII); + if (IS_ERR(phy_dev)) { + printk(KERN_ERR "%s: could not attach to PHY\n", dev->name); + return PTR_ERR(phy_dev); } /* mask with MAC supported features */ @@ -738,7 +751,7 @@ static int fec_enet_mii_init(struct platform_device *pdev) fep->mii_bus->read = fec_enet_mdio_read; fep->mii_bus->write = fec_enet_mdio_write; fep->mii_bus->reset = fec_enet_mdio_reset; - snprintf(fep->mii_bus->id, MII_BUS_ID_SIZE, "%x", pdev->id); + snprintf(fep->mii_bus->id, MII_BUS_ID_SIZE, "%x", pdev->id + 1); fep->mii_bus->priv = fep; fep->mii_bus->parent = &pdev->dev; @@ -1311,6 +1324,9 @@ fec_probe(struct platform_device *pdev) if (ret) goto failed_mii_init; + /* Carrier starts down, phylib will bring it up */ + netif_carrier_off(ndev); + ret = register_netdev(ndev); if (ret) goto failed_register; diff --git a/drivers/net/hamradio/6pack.c b/drivers/net/hamradio/6pack.c index 4b52c767ad05..3e5d0b6b6516 100644 --- a/drivers/net/hamradio/6pack.c +++ b/drivers/net/hamradio/6pack.c @@ -608,7 +608,7 @@ static int sixpack_open(struct tty_struct *tty) spin_lock_init(&sp->lock); atomic_set(&sp->refcnt, 1); - init_MUTEX_LOCKED(&sp->dead_sem); + sema_init(&sp->dead_sem, 0); /* !!! length of the buffers. MTU is IP MTU, not PACLEN! */ diff --git a/drivers/net/hamradio/mkiss.c b/drivers/net/hamradio/mkiss.c index 66e88bd59caa..4c628393c8b1 100644 --- a/drivers/net/hamradio/mkiss.c +++ b/drivers/net/hamradio/mkiss.c @@ -747,7 +747,7 @@ static int mkiss_open(struct tty_struct *tty) spin_lock_init(&ax->buflock); atomic_set(&ax->refcnt, 1); - init_MUTEX_LOCKED(&ax->dead_sem); + sema_init(&ax->dead_sem, 0); ax->tty = tty; tty->disc_data = ax; diff --git a/drivers/net/irda/sir_dev.c b/drivers/net/irda/sir_dev.c index 1b051dab7b29..51d74447f8f8 100644 --- a/drivers/net/irda/sir_dev.c +++ b/drivers/net/irda/sir_dev.c @@ -909,7 +909,7 @@ struct sir_dev * sirdev_get_instance(const struct sir_driver *drv, const char *n dev->tx_skb = NULL; spin_lock_init(&dev->tx_lock); - init_MUTEX(&dev->fsm.sem); + sema_init(&dev->fsm.sem, 1); dev->drv = drv; dev->netdev = ndev; diff --git a/drivers/net/ppp_async.c b/drivers/net/ppp_async.c index af50a530daee..78d70a6481bf 100644 --- a/drivers/net/ppp_async.c +++ b/drivers/net/ppp_async.c @@ -184,7 +184,7 @@ ppp_asynctty_open(struct tty_struct *tty) tasklet_init(&ap->tsk, ppp_async_process, (unsigned long) ap); atomic_set(&ap->refcnt, 1); - init_MUTEX_LOCKED(&ap->dead_sem); + sema_init(&ap->dead_sem, 0); ap->chan.private = ap; ap->chan.ops = &async_ops; diff --git a/drivers/net/r8169.c b/drivers/net/r8169.c index a0da4a17b025..992db2fa136e 100644 --- a/drivers/net/r8169.c +++ b/drivers/net/r8169.c @@ -1212,7 +1212,8 @@ static void rtl8169_update_counters(struct net_device *dev) if ((RTL_R8(ChipCmd) & CmdRxEnb) == 0) return; - counters = pci_alloc_consistent(tp->pci_dev, sizeof(*counters), &paddr); + counters = dma_alloc_coherent(&tp->pci_dev->dev, sizeof(*counters), + &paddr, GFP_KERNEL); if (!counters) return; @@ -1233,7 +1234,8 @@ static void rtl8169_update_counters(struct net_device *dev) RTL_W32(CounterAddrLow, 0); RTL_W32(CounterAddrHigh, 0); - pci_free_consistent(tp->pci_dev, sizeof(*counters), counters, paddr); + dma_free_coherent(&tp->pci_dev->dev, sizeof(*counters), counters, + paddr); } static void rtl8169_get_ethtool_stats(struct net_device *dev, @@ -3292,15 +3294,15 @@ static int rtl8169_open(struct net_device *dev) /* * Rx and Tx desscriptors needs 256 bytes alignment. - * pci_alloc_consistent provides more. + * dma_alloc_coherent provides more. */ - tp->TxDescArray = pci_alloc_consistent(pdev, R8169_TX_RING_BYTES, - &tp->TxPhyAddr); + tp->TxDescArray = dma_alloc_coherent(&pdev->dev, R8169_TX_RING_BYTES, + &tp->TxPhyAddr, GFP_KERNEL); if (!tp->TxDescArray) goto err_pm_runtime_put; - tp->RxDescArray = pci_alloc_consistent(pdev, R8169_RX_RING_BYTES, - &tp->RxPhyAddr); + tp->RxDescArray = dma_alloc_coherent(&pdev->dev, R8169_RX_RING_BYTES, + &tp->RxPhyAddr, GFP_KERNEL); if (!tp->RxDescArray) goto err_free_tx_0; @@ -3334,12 +3336,12 @@ out: err_release_ring_2: rtl8169_rx_clear(tp); err_free_rx_1: - pci_free_consistent(pdev, R8169_RX_RING_BYTES, tp->RxDescArray, - tp->RxPhyAddr); + dma_free_coherent(&pdev->dev, R8169_RX_RING_BYTES, tp->RxDescArray, + tp->RxPhyAddr); tp->RxDescArray = NULL; err_free_tx_0: - pci_free_consistent(pdev, R8169_TX_RING_BYTES, tp->TxDescArray, - tp->TxPhyAddr); + dma_free_coherent(&pdev->dev, R8169_TX_RING_BYTES, tp->TxDescArray, + tp->TxPhyAddr); tp->TxDescArray = NULL; err_pm_runtime_put: pm_runtime_put_noidle(&pdev->dev); @@ -3975,7 +3977,7 @@ static void rtl8169_free_rx_skb(struct rtl8169_private *tp, { struct pci_dev *pdev = tp->pci_dev; - pci_unmap_single(pdev, le64_to_cpu(desc->addr), tp->rx_buf_sz, + dma_unmap_single(&pdev->dev, le64_to_cpu(desc->addr), tp->rx_buf_sz, PCI_DMA_FROMDEVICE); dev_kfree_skb(*sk_buff); *sk_buff = NULL; @@ -4000,7 +4002,7 @@ static inline void rtl8169_map_to_asic(struct RxDesc *desc, dma_addr_t mapping, static struct sk_buff *rtl8169_alloc_rx_skb(struct pci_dev *pdev, struct net_device *dev, struct RxDesc *desc, int rx_buf_sz, - unsigned int align) + unsigned int align, gfp_t gfp) { struct sk_buff *skb; dma_addr_t mapping; @@ -4008,13 +4010,13 @@ static struct sk_buff *rtl8169_alloc_rx_skb(struct pci_dev *pdev, pad = align ? align : NET_IP_ALIGN; - skb = netdev_alloc_skb(dev, rx_buf_sz + pad); + skb = __netdev_alloc_skb(dev, rx_buf_sz + pad, gfp); if (!skb) goto err_out; skb_reserve(skb, align ? ((pad - 1) & (unsigned long)skb->data) : pad); - mapping = pci_map_single(pdev, skb->data, rx_buf_sz, + mapping = dma_map_single(&pdev->dev, skb->data, rx_buf_sz, PCI_DMA_FROMDEVICE); rtl8169_map_to_asic(desc, mapping, rx_buf_sz); @@ -4039,7 +4041,7 @@ static void rtl8169_rx_clear(struct rtl8169_private *tp) } static u32 rtl8169_rx_fill(struct rtl8169_private *tp, struct net_device *dev, - u32 start, u32 end) + u32 start, u32 end, gfp_t gfp) { u32 cur; @@ -4054,7 +4056,7 @@ static u32 rtl8169_rx_fill(struct rtl8169_private *tp, struct net_device *dev, skb = rtl8169_alloc_rx_skb(tp->pci_dev, dev, tp->RxDescArray + i, - tp->rx_buf_sz, tp->align); + tp->rx_buf_sz, tp->align, gfp); if (!skb) break; @@ -4082,7 +4084,7 @@ static int rtl8169_init_ring(struct net_device *dev) memset(tp->tx_skb, 0x0, NUM_TX_DESC * sizeof(struct ring_info)); memset(tp->Rx_skbuff, 0x0, NUM_RX_DESC * sizeof(struct sk_buff *)); - if (rtl8169_rx_fill(tp, dev, 0, NUM_RX_DESC) != NUM_RX_DESC) + if (rtl8169_rx_fill(tp, dev, 0, NUM_RX_DESC, GFP_KERNEL) != NUM_RX_DESC) goto err_out; rtl8169_mark_as_last_descriptor(tp->RxDescArray + NUM_RX_DESC - 1); @@ -4099,7 +4101,8 @@ static void rtl8169_unmap_tx_skb(struct pci_dev *pdev, struct ring_info *tx_skb, { unsigned int len = tx_skb->len; - pci_unmap_single(pdev, le64_to_cpu(desc->addr), len, PCI_DMA_TODEVICE); + dma_unmap_single(&pdev->dev, le64_to_cpu(desc->addr), len, + PCI_DMA_TODEVICE); desc->opts1 = 0x00; desc->opts2 = 0x00; desc->addr = 0x00; @@ -4243,7 +4246,8 @@ static int rtl8169_xmit_frags(struct rtl8169_private *tp, struct sk_buff *skb, txd = tp->TxDescArray + entry; len = frag->size; addr = ((void *) page_address(frag->page)) + frag->page_offset; - mapping = pci_map_single(tp->pci_dev, addr, len, PCI_DMA_TODEVICE); + mapping = dma_map_single(&tp->pci_dev->dev, addr, len, + PCI_DMA_TODEVICE); /* anti gcc 2.95.3 bugware (sic) */ status = opts1 | len | (RingEnd * !((entry + 1) % NUM_TX_DESC)); @@ -4313,7 +4317,8 @@ static netdev_tx_t rtl8169_start_xmit(struct sk_buff *skb, tp->tx_skb[entry].skb = skb; } - mapping = pci_map_single(tp->pci_dev, skb->data, len, PCI_DMA_TODEVICE); + mapping = dma_map_single(&tp->pci_dev->dev, skb->data, len, + PCI_DMA_TODEVICE); tp->tx_skb[entry].len = len; txd->addr = cpu_to_le64(mapping); @@ -4477,8 +4482,8 @@ static inline bool rtl8169_try_rx_copy(struct sk_buff **sk_buff, if (!skb) goto out; - pci_dma_sync_single_for_cpu(tp->pci_dev, addr, pkt_size, - PCI_DMA_FROMDEVICE); + dma_sync_single_for_cpu(&tp->pci_dev->dev, addr, pkt_size, + PCI_DMA_FROMDEVICE); skb_copy_from_linear_data(*sk_buff, skb->data, pkt_size); *sk_buff = skb; done = true; @@ -4549,11 +4554,11 @@ static int rtl8169_rx_interrupt(struct net_device *dev, rtl8169_rx_csum(skb, desc); if (rtl8169_try_rx_copy(&skb, tp, pkt_size, addr)) { - pci_dma_sync_single_for_device(pdev, addr, + dma_sync_single_for_device(&pdev->dev, addr, pkt_size, PCI_DMA_FROMDEVICE); rtl8169_mark_to_asic(desc, tp->rx_buf_sz); } else { - pci_unmap_single(pdev, addr, tp->rx_buf_sz, + dma_unmap_single(&pdev->dev, addr, tp->rx_buf_sz, PCI_DMA_FROMDEVICE); tp->Rx_skbuff[entry] = NULL; } @@ -4583,7 +4588,7 @@ static int rtl8169_rx_interrupt(struct net_device *dev, count = cur_rx - tp->cur_rx; tp->cur_rx = cur_rx; - delta = rtl8169_rx_fill(tp, dev, tp->dirty_rx, tp->cur_rx); + delta = rtl8169_rx_fill(tp, dev, tp->dirty_rx, tp->cur_rx, GFP_ATOMIC); if (!delta && count) netif_info(tp, intr, dev, "no Rx buffer allocated\n"); tp->dirty_rx += delta; @@ -4769,10 +4774,10 @@ static int rtl8169_close(struct net_device *dev) free_irq(dev->irq, dev); - pci_free_consistent(pdev, R8169_RX_RING_BYTES, tp->RxDescArray, - tp->RxPhyAddr); - pci_free_consistent(pdev, R8169_TX_RING_BYTES, tp->TxDescArray, - tp->TxPhyAddr); + dma_free_coherent(&pdev->dev, R8169_RX_RING_BYTES, tp->RxDescArray, + tp->RxPhyAddr); + dma_free_coherent(&pdev->dev, R8169_TX_RING_BYTES, tp->TxDescArray, + tp->TxPhyAddr); tp->TxDescArray = NULL; tp->RxDescArray = NULL; diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c index bc3af78a869f..1ec4b9e0239a 100644 --- a/drivers/net/tg3.c +++ b/drivers/net/tg3.c @@ -4666,7 +4666,7 @@ static int tg3_rx(struct tg3_napi *tnapi, int budget) desc_idx, *post_ptr); drop_it_no_recycle: /* Other statistics kept track of by card. */ - tp->net_stats.rx_dropped++; + tp->rx_dropped++; goto next_pkt; } @@ -4726,7 +4726,7 @@ static int tg3_rx(struct tg3_napi *tnapi, int budget) if (len > (tp->dev->mtu + ETH_HLEN) && skb->protocol != htons(ETH_P_8021Q)) { dev_kfree_skb(skb); - goto next_pkt; + goto drop_it_no_recycle; } if (desc->type_flags & RXD_FLAG_VLAN && @@ -9240,6 +9240,8 @@ static struct rtnl_link_stats64 *tg3_get_stats64(struct net_device *dev, stats->rx_missed_errors = old_stats->rx_missed_errors + get_stat64(&hw_stats->rx_discards); + stats->rx_dropped = tp->rx_dropped; + return stats; } diff --git a/drivers/net/tg3.h b/drivers/net/tg3.h index 4937bd190964..be7ff138a7f9 100644 --- a/drivers/net/tg3.h +++ b/drivers/net/tg3.h @@ -2759,7 +2759,7 @@ struct tg3 { /* begin "everything else" cacheline(s) section */ - struct rtnl_link_stats64 net_stats; + unsigned long rx_dropped; struct rtnl_link_stats64 net_stats_prev; struct tg3_ethtool_stats estats; struct tg3_ethtool_stats estats_prev; diff --git a/drivers/net/wan/cosa.c b/drivers/net/wan/cosa.c index 04c6cd4333f1..10bafd59f9c3 100644 --- a/drivers/net/wan/cosa.c +++ b/drivers/net/wan/cosa.c @@ -575,7 +575,7 @@ static int cosa_probe(int base, int irq, int dma) /* Initialize the chardev data structures */ mutex_init(&chan->rlock); - init_MUTEX(&chan->wsem); + sema_init(&chan->wsem, 1); /* Register the network interface */ if (!(chan->netdev = alloc_hdlcdev(chan))) { diff --git a/drivers/net/wimax/i2400m/rx.c b/drivers/net/wimax/i2400m/rx.c index 8cc9e319f435..1737d1488b35 100644 --- a/drivers/net/wimax/i2400m/rx.c +++ b/drivers/net/wimax/i2400m/rx.c @@ -1244,16 +1244,16 @@ int i2400m_rx(struct i2400m *i2400m, struct sk_buff *skb) int i, result; struct device *dev = i2400m_dev(i2400m); const struct i2400m_msg_hdr *msg_hdr; - size_t pl_itr, pl_size, skb_len; + size_t pl_itr, pl_size; unsigned long flags; - unsigned num_pls, single_last; + unsigned num_pls, single_last, skb_len; skb_len = skb->len; - d_fnstart(4, dev, "(i2400m %p skb %p [size %zu])\n", + d_fnstart(4, dev, "(i2400m %p skb %p [size %u])\n", i2400m, skb, skb_len); result = -EIO; msg_hdr = (void *) skb->data; - result = i2400m_rx_msg_hdr_check(i2400m, msg_hdr, skb->len); + result = i2400m_rx_msg_hdr_check(i2400m, msg_hdr, skb_len); if (result < 0) goto error_msg_hdr_check; result = -EIO; @@ -1261,10 +1261,10 @@ int i2400m_rx(struct i2400m *i2400m, struct sk_buff *skb) pl_itr = sizeof(*msg_hdr) + /* Check payload descriptor(s) */ num_pls * sizeof(msg_hdr->pld[0]); pl_itr = ALIGN(pl_itr, I2400M_PL_ALIGN); - if (pl_itr > skb->len) { /* got all the payload descriptors? */ + if (pl_itr > skb_len) { /* got all the payload descriptors? */ dev_err(dev, "RX: HW BUG? message too short (%u bytes) for " "%u payload descriptors (%zu each, total %zu)\n", - skb->len, num_pls, sizeof(msg_hdr->pld[0]), pl_itr); + skb_len, num_pls, sizeof(msg_hdr->pld[0]), pl_itr); goto error_pl_descr_short; } /* Walk each payload payload--check we really got it */ @@ -1272,7 +1272,7 @@ int i2400m_rx(struct i2400m *i2400m, struct sk_buff *skb) /* work around old gcc warnings */ pl_size = i2400m_pld_size(&msg_hdr->pld[i]); result = i2400m_rx_pl_descr_check(i2400m, &msg_hdr->pld[i], - pl_itr, skb->len); + pl_itr, skb_len); if (result < 0) goto error_pl_descr_check; single_last = num_pls == 1 || i == num_pls - 1; @@ -1290,16 +1290,16 @@ int i2400m_rx(struct i2400m *i2400m, struct sk_buff *skb) if (i < i2400m->rx_pl_min) i2400m->rx_pl_min = i; i2400m->rx_num++; - i2400m->rx_size_acc += skb->len; - if (skb->len < i2400m->rx_size_min) - i2400m->rx_size_min = skb->len; - if (skb->len > i2400m->rx_size_max) - i2400m->rx_size_max = skb->len; + i2400m->rx_size_acc += skb_len; + if (skb_len < i2400m->rx_size_min) + i2400m->rx_size_min = skb_len; + if (skb_len > i2400m->rx_size_max) + i2400m->rx_size_max = skb_len; spin_unlock_irqrestore(&i2400m->rx_lock, flags); error_pl_descr_check: error_pl_descr_short: error_msg_hdr_check: - d_fnend(4, dev, "(i2400m %p skb %p [size %zu]) = %d\n", + d_fnend(4, dev, "(i2400m %p skb %p [size %u]) = %d\n", i2400m, skb, skb_len, result); return result; } diff --git a/drivers/oprofile/oprof.c b/drivers/oprofile/oprof.c index b336cd9ee7a1..f9bda64fcd1b 100644 --- a/drivers/oprofile/oprof.c +++ b/drivers/oprofile/oprof.c @@ -225,26 +225,17 @@ post_sync: mutex_unlock(&start_mutex); } -int oprofile_set_backtrace(unsigned long val) +int oprofile_set_ulong(unsigned long *addr, unsigned long val) { - int err = 0; + int err = -EBUSY; mutex_lock(&start_mutex); - - if (oprofile_started) { - err = -EBUSY; - goto out; - } - - if (!oprofile_ops.backtrace) { - err = -EINVAL; - goto out; + if (!oprofile_started) { + *addr = val; + err = 0; } - - oprofile_backtrace_depth = val; - -out: mutex_unlock(&start_mutex); + return err; } @@ -257,16 +248,9 @@ static int __init oprofile_init(void) printk(KERN_INFO "oprofile: using timer interrupt.\n"); err = oprofile_timer_init(&oprofile_ops); if (err) - goto out_arch; + return err; } - err = oprofilefs_register(); - if (err) - goto out_arch; - return 0; - -out_arch: - oprofile_arch_exit(); - return err; + return oprofilefs_register(); } diff --git a/drivers/oprofile/oprof.h b/drivers/oprofile/oprof.h index 47e12cb4ee8b..177b73de5e5f 100644 --- a/drivers/oprofile/oprof.h +++ b/drivers/oprofile/oprof.h @@ -37,7 +37,7 @@ void oprofile_create_files(struct super_block *sb, struct dentry *root); int oprofile_timer_init(struct oprofile_operations *ops); void oprofile_timer_exit(void); -int oprofile_set_backtrace(unsigned long depth); +int oprofile_set_ulong(unsigned long *addr, unsigned long val); int oprofile_set_timeout(unsigned long time); #endif /* OPROF_H */ diff --git a/drivers/oprofile/oprofile_files.c b/drivers/oprofile/oprofile_files.c index bbd7516e0869..ccf099e684a4 100644 --- a/drivers/oprofile/oprofile_files.c +++ b/drivers/oprofile/oprofile_files.c @@ -79,14 +79,17 @@ static ssize_t depth_write(struct file *file, char const __user *buf, size_t cou if (*offset) return -EINVAL; + if (!oprofile_ops.backtrace) + return -EINVAL; + retval = oprofilefs_ulong_from_user(&val, buf, count); if (retval) return retval; - retval = oprofile_set_backtrace(val); - + retval = oprofile_set_ulong(&oprofile_backtrace_depth, val); if (retval) return retval; + return count; } diff --git a/drivers/oprofile/oprofile_perf.c b/drivers/oprofile/oprofile_perf.c new file mode 100644 index 000000000000..9046f7b2ed79 --- /dev/null +++ b/drivers/oprofile/oprofile_perf.c @@ -0,0 +1,328 @@ +/* + * Copyright 2010 ARM Ltd. + * + * Perf-events backend for OProfile. + */ +#include <linux/perf_event.h> +#include <linux/platform_device.h> +#include <linux/oprofile.h> +#include <linux/slab.h> + +/* + * Per performance monitor configuration as set via oprofilefs. + */ +struct op_counter_config { + unsigned long count; + unsigned long enabled; + unsigned long event; + unsigned long unit_mask; + unsigned long kernel; + unsigned long user; + struct perf_event_attr attr; +}; + +static int oprofile_perf_enabled; +static DEFINE_MUTEX(oprofile_perf_mutex); + +static struct op_counter_config *counter_config; +static struct perf_event **perf_events[nr_cpumask_bits]; +static int num_counters; + +/* + * Overflow callback for oprofile. + */ +static void op_overflow_handler(struct perf_event *event, int unused, + struct perf_sample_data *data, struct pt_regs *regs) +{ + int id; + u32 cpu = smp_processor_id(); + + for (id = 0; id < num_counters; ++id) + if (perf_events[cpu][id] == event) + break; + + if (id != num_counters) + oprofile_add_sample(regs, id); + else + pr_warning("oprofile: ignoring spurious overflow " + "on cpu %u\n", cpu); +} + +/* + * Called by oprofile_perf_setup to create perf attributes to mirror the oprofile + * settings in counter_config. Attributes are created as `pinned' events and + * so are permanently scheduled on the PMU. + */ +static void op_perf_setup(void) +{ + int i; + u32 size = sizeof(struct perf_event_attr); + struct perf_event_attr *attr; + + for (i = 0; i < num_counters; ++i) { + attr = &counter_config[i].attr; + memset(attr, 0, size); + attr->type = PERF_TYPE_RAW; + attr->size = size; + attr->config = counter_config[i].event; + attr->sample_period = counter_config[i].count; + attr->pinned = 1; + } +} + +static int op_create_counter(int cpu, int event) +{ + struct perf_event *pevent; + + if (!counter_config[event].enabled || perf_events[cpu][event]) + return 0; + + pevent = perf_event_create_kernel_counter(&counter_config[event].attr, + cpu, NULL, + op_overflow_handler); + + if (IS_ERR(pevent)) + return PTR_ERR(pevent); + + if (pevent->state != PERF_EVENT_STATE_ACTIVE) { + perf_event_release_kernel(pevent); + pr_warning("oprofile: failed to enable event %d " + "on CPU %d\n", event, cpu); + return -EBUSY; + } + + perf_events[cpu][event] = pevent; + + return 0; +} + +static void op_destroy_counter(int cpu, int event) +{ + struct perf_event *pevent = perf_events[cpu][event]; + + if (pevent) { + perf_event_release_kernel(pevent); + perf_events[cpu][event] = NULL; + } +} + +/* + * Called by oprofile_perf_start to create active perf events based on the + * perviously configured attributes. + */ +static int op_perf_start(void) +{ + int cpu, event, ret = 0; + + for_each_online_cpu(cpu) { + for (event = 0; event < num_counters; ++event) { + ret = op_create_counter(cpu, event); + if (ret) + return ret; + } + } + + return ret; +} + +/* + * Called by oprofile_perf_stop at the end of a profiling run. + */ +static void op_perf_stop(void) +{ + int cpu, event; + + for_each_online_cpu(cpu) + for (event = 0; event < num_counters; ++event) + op_destroy_counter(cpu, event); +} + +static int oprofile_perf_create_files(struct super_block *sb, struct dentry *root) +{ + unsigned int i; + + for (i = 0; i < num_counters; i++) { + struct dentry *dir; + char buf[4]; + + snprintf(buf, sizeof buf, "%d", i); + dir = oprofilefs_mkdir(sb, root, buf); + oprofilefs_create_ulong(sb, dir, "enabled", &counter_config[i].enabled); + oprofilefs_create_ulong(sb, dir, "event", &counter_config[i].event); + oprofilefs_create_ulong(sb, dir, "count", &counter_config[i].count); + oprofilefs_create_ulong(sb, dir, "unit_mask", &counter_config[i].unit_mask); + oprofilefs_create_ulong(sb, dir, "kernel", &counter_config[i].kernel); + oprofilefs_create_ulong(sb, dir, "user", &counter_config[i].user); + } + + return 0; +} + +static int oprofile_perf_setup(void) +{ + spin_lock(&oprofilefs_lock); + op_perf_setup(); + spin_unlock(&oprofilefs_lock); + return 0; +} + +static int oprofile_perf_start(void) +{ + int ret = -EBUSY; + + mutex_lock(&oprofile_perf_mutex); + if (!oprofile_perf_enabled) { + ret = 0; + op_perf_start(); + oprofile_perf_enabled = 1; + } + mutex_unlock(&oprofile_perf_mutex); + return ret; +} + +static void oprofile_perf_stop(void) +{ + mutex_lock(&oprofile_perf_mutex); + if (oprofile_perf_enabled) + op_perf_stop(); + oprofile_perf_enabled = 0; + mutex_unlock(&oprofile_perf_mutex); +} + +#ifdef CONFIG_PM + +static int oprofile_perf_suspend(struct platform_device *dev, pm_message_t state) +{ + mutex_lock(&oprofile_perf_mutex); + if (oprofile_perf_enabled) + op_perf_stop(); + mutex_unlock(&oprofile_perf_mutex); + return 0; +} + +static int oprofile_perf_resume(struct platform_device *dev) +{ + mutex_lock(&oprofile_perf_mutex); + if (oprofile_perf_enabled && op_perf_start()) + oprofile_perf_enabled = 0; + mutex_unlock(&oprofile_perf_mutex); + return 0; +} + +static struct platform_driver oprofile_driver = { + .driver = { + .name = "oprofile-perf", + }, + .resume = oprofile_perf_resume, + .suspend = oprofile_perf_suspend, +}; + +static struct platform_device *oprofile_pdev; + +static int __init init_driverfs(void) +{ + int ret; + + ret = platform_driver_register(&oprofile_driver); + if (ret) + return ret; + + oprofile_pdev = platform_device_register_simple( + oprofile_driver.driver.name, 0, NULL, 0); + if (IS_ERR(oprofile_pdev)) { + ret = PTR_ERR(oprofile_pdev); + platform_driver_unregister(&oprofile_driver); + } + + return ret; +} + +static void exit_driverfs(void) +{ + platform_device_unregister(oprofile_pdev); + platform_driver_unregister(&oprofile_driver); +} + +#else + +static inline int init_driverfs(void) { return 0; } +static inline void exit_driverfs(void) { } + +#endif /* CONFIG_PM */ + +void oprofile_perf_exit(void) +{ + int cpu, id; + struct perf_event *event; + + for_each_possible_cpu(cpu) { + for (id = 0; id < num_counters; ++id) { + event = perf_events[cpu][id]; + if (event) + perf_event_release_kernel(event); + } + + kfree(perf_events[cpu]); + } + + kfree(counter_config); + exit_driverfs(); +} + +int __init oprofile_perf_init(struct oprofile_operations *ops) +{ + int cpu, ret = 0; + + ret = init_driverfs(); + if (ret) + return ret; + + memset(&perf_events, 0, sizeof(perf_events)); + + num_counters = perf_num_counters(); + if (num_counters <= 0) { + pr_info("oprofile: no performance counters\n"); + ret = -ENODEV; + goto out; + } + + counter_config = kcalloc(num_counters, + sizeof(struct op_counter_config), GFP_KERNEL); + + if (!counter_config) { + pr_info("oprofile: failed to allocate %d " + "counters\n", num_counters); + ret = -ENOMEM; + num_counters = 0; + goto out; + } + + for_each_possible_cpu(cpu) { + perf_events[cpu] = kcalloc(num_counters, + sizeof(struct perf_event *), GFP_KERNEL); + if (!perf_events[cpu]) { + pr_info("oprofile: failed to allocate %d perf events " + "for cpu %d\n", num_counters, cpu); + ret = -ENOMEM; + goto out; + } + } + + ops->create_files = oprofile_perf_create_files; + ops->setup = oprofile_perf_setup; + ops->start = oprofile_perf_start; + ops->stop = oprofile_perf_stop; + ops->shutdown = oprofile_perf_stop; + ops->cpu_type = op_name_from_perf_id(); + + if (!ops->cpu_type) + ret = -ENODEV; + else + pr_info("oprofile: using %s\n", ops->cpu_type); + +out: + if (ret) + oprofile_perf_exit(); + + return ret; +} diff --git a/drivers/oprofile/oprofilefs.c b/drivers/oprofile/oprofilefs.c index 2766a6d3c2e9..1944621930d9 100644 --- a/drivers/oprofile/oprofilefs.c +++ b/drivers/oprofile/oprofilefs.c @@ -91,16 +91,20 @@ static ssize_t ulong_read_file(struct file *file, char __user *buf, size_t count static ssize_t ulong_write_file(struct file *file, char const __user *buf, size_t count, loff_t *offset) { - unsigned long *value = file->private_data; + unsigned long value; int retval; if (*offset) return -EINVAL; - retval = oprofilefs_ulong_from_user(value, buf, count); + retval = oprofilefs_ulong_from_user(&value, buf, count); + if (retval) + return retval; + retval = oprofile_set_ulong(file->private_data, value); if (retval) return retval; + return count; } @@ -126,50 +130,41 @@ static const struct file_operations ulong_ro_fops = { }; -static struct dentry *__oprofilefs_create_file(struct super_block *sb, +static int __oprofilefs_create_file(struct super_block *sb, struct dentry *root, char const *name, const struct file_operations *fops, - int perm) + int perm, void *priv) { struct dentry *dentry; struct inode *inode; dentry = d_alloc_name(root, name); if (!dentry) - return NULL; + return -ENOMEM; inode = oprofilefs_get_inode(sb, S_IFREG | perm); if (!inode) { dput(dentry); - return NULL; + return -ENOMEM; } inode->i_fop = fops; d_add(dentry, inode); - return dentry; + dentry->d_inode->i_private = priv; + return 0; } int oprofilefs_create_ulong(struct super_block *sb, struct dentry *root, char const *name, unsigned long *val) { - struct dentry *d = __oprofilefs_create_file(sb, root, name, - &ulong_fops, 0644); - if (!d) - return -EFAULT; - - d->d_inode->i_private = val; - return 0; + return __oprofilefs_create_file(sb, root, name, + &ulong_fops, 0644, val); } int oprofilefs_create_ro_ulong(struct super_block *sb, struct dentry *root, char const *name, unsigned long *val) { - struct dentry *d = __oprofilefs_create_file(sb, root, name, - &ulong_ro_fops, 0444); - if (!d) - return -EFAULT; - - d->d_inode->i_private = val; - return 0; + return __oprofilefs_create_file(sb, root, name, + &ulong_ro_fops, 0444, val); } @@ -189,31 +184,22 @@ static const struct file_operations atomic_ro_fops = { int oprofilefs_create_ro_atomic(struct super_block *sb, struct dentry *root, char const *name, atomic_t *val) { - struct dentry *d = __oprofilefs_create_file(sb, root, name, - &atomic_ro_fops, 0444); - if (!d) - return -EFAULT; - - d->d_inode->i_private = val; - return 0; + return __oprofilefs_create_file(sb, root, name, + &atomic_ro_fops, 0444, val); } int oprofilefs_create_file(struct super_block *sb, struct dentry *root, char const *name, const struct file_operations *fops) { - if (!__oprofilefs_create_file(sb, root, name, fops, 0644)) - return -EFAULT; - return 0; + return __oprofilefs_create_file(sb, root, name, fops, 0644, NULL); } int oprofilefs_create_file_perm(struct super_block *sb, struct dentry *root, char const *name, const struct file_operations *fops, int perm) { - if (!__oprofilefs_create_file(sb, root, name, fops, perm)) - return -EFAULT; - return 0; + return __oprofilefs_create_file(sb, root, name, fops, perm, NULL); } diff --git a/drivers/parport/share.c b/drivers/parport/share.c index dffa5d4fb298..a2d9d1e59260 100644 --- a/drivers/parport/share.c +++ b/drivers/parport/share.c @@ -306,7 +306,7 @@ struct parport *parport_register_port(unsigned long base, int irq, int dma, spin_lock_init(&tmp->pardevice_lock); tmp->ieee1284.mode = IEEE1284_MODE_COMPAT; tmp->ieee1284.phase = IEEE1284_PH_FWD_IDLE; - init_MUTEX_LOCKED (&tmp->ieee1284.irq); /* actually a semaphore at 0 */ + sema_init(&tmp->ieee1284.irq, 0); tmp->spintime = parport_default_spintime; atomic_set (&tmp->ref_count, 1); INIT_LIST_HEAD(&tmp->full_list); diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c index ad0ed212db4a..348fba0a8976 100644 --- a/drivers/scsi/scsi.c +++ b/drivers/scsi/scsi.c @@ -1046,13 +1046,13 @@ int scsi_get_vpd_page(struct scsi_device *sdev, u8 page, unsigned char *buf, /* If the user actually wanted this page, we can skip the rest */ if (page == 0) - return -EINVAL; + return 0; for (i = 0; i < min((int)buf[3], buf_len - 4); i++) if (buf[i + 4] == page) goto found; - if (i < buf[3] && i > buf_len) + if (i < buf[3] && i >= buf_len - 4) /* ran off the end of the buffer, give us benefit of doubt */ goto found; /* The device claims it doesn't support the requested page */ diff --git a/drivers/serial/ioc3_serial.c b/drivers/serial/ioc3_serial.c index 93de907b1208..800c54602339 100644 --- a/drivers/serial/ioc3_serial.c +++ b/drivers/serial/ioc3_serial.c @@ -2044,6 +2044,7 @@ ioc3uart_probe(struct ioc3_submodule *is, struct ioc3_driver_data *idd) if (!port) { printk(KERN_WARNING "IOC3 serial memory not available for port\n"); + ret = -ENOMEM; goto out4; } spin_lock_init(&port->ip_lock); diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 7c8008225ee3..17927b1f9334 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -127,7 +127,10 @@ static void handle_tx(struct vhost_net *net) size_t len, total_len = 0; int err, wmem; size_t hdr_size; - struct socket *sock = rcu_dereference(vq->private_data); + struct socket *sock; + + sock = rcu_dereference_check(vq->private_data, + lockdep_is_held(&vq->mutex)); if (!sock) return; @@ -582,7 +585,10 @@ static void vhost_net_disable_vq(struct vhost_net *n, static void vhost_net_enable_vq(struct vhost_net *n, struct vhost_virtqueue *vq) { - struct socket *sock = vq->private_data; + struct socket *sock; + + sock = rcu_dereference_protected(vq->private_data, + lockdep_is_held(&vq->mutex)); if (!sock) return; if (vq == n->vqs + VHOST_NET_VQ_TX) { @@ -598,7 +604,8 @@ static struct socket *vhost_net_stop_vq(struct vhost_net *n, struct socket *sock; mutex_lock(&vq->mutex); - sock = vq->private_data; + sock = rcu_dereference_protected(vq->private_data, + lockdep_is_held(&vq->mutex)); vhost_net_disable_vq(n, vq); rcu_assign_pointer(vq->private_data, NULL); mutex_unlock(&vq->mutex); @@ -736,7 +743,8 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd) } /* start polling new socket */ - oldsock = vq->private_data; + oldsock = rcu_dereference_protected(vq->private_data, + lockdep_is_held(&vq->mutex)); if (sock != oldsock) { vhost_net_disable_vq(n, vq); rcu_assign_pointer(vq->private_data, sock); diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index dd3d6f7406f8..8b5a1b33d0fe 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -320,7 +320,7 @@ long vhost_dev_reset_owner(struct vhost_dev *dev) vhost_dev_cleanup(dev); memory->nregions = 0; - dev->memory = memory; + RCU_INIT_POINTER(dev->memory, memory); return 0; } @@ -352,8 +352,9 @@ void vhost_dev_cleanup(struct vhost_dev *dev) fput(dev->log_file); dev->log_file = NULL; /* No one will access memory at this point */ - kfree(dev->memory); - dev->memory = NULL; + kfree(rcu_dereference_protected(dev->memory, + lockdep_is_held(&dev->mutex))); + RCU_INIT_POINTER(dev->memory, NULL); if (dev->mm) mmput(dev->mm); dev->mm = NULL; @@ -440,14 +441,22 @@ static int vq_access_ok(unsigned int num, /* Caller should have device mutex but not vq mutex */ int vhost_log_access_ok(struct vhost_dev *dev) { - return memory_access_ok(dev, dev->memory, 1); + struct vhost_memory *mp; + + mp = rcu_dereference_protected(dev->memory, + lockdep_is_held(&dev->mutex)); + return memory_access_ok(dev, mp, 1); } /* Verify access for write logging. */ /* Caller should have vq mutex and device mutex */ static int vq_log_access_ok(struct vhost_virtqueue *vq, void __user *log_base) { - return vq_memory_access_ok(log_base, vq->dev->memory, + struct vhost_memory *mp; + + mp = rcu_dereference_protected(vq->dev->memory, + lockdep_is_held(&vq->mutex)); + return vq_memory_access_ok(log_base, mp, vhost_has_feature(vq->dev, VHOST_F_LOG_ALL)) && (!vq->log_used || log_access_ok(log_base, vq->log_addr, sizeof *vq->used + @@ -487,7 +496,8 @@ static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m) kfree(newmem); return -EFAULT; } - oldmem = d->memory; + oldmem = rcu_dereference_protected(d->memory, + lockdep_is_held(&d->mutex)); rcu_assign_pointer(d->memory, newmem); synchronize_rcu(); kfree(oldmem); diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h index afd77295971c..af3c11ded5fd 100644 --- a/drivers/vhost/vhost.h +++ b/drivers/vhost/vhost.h @@ -106,7 +106,7 @@ struct vhost_virtqueue { * vhost_work execution acts instead of rcu_read_lock() and the end of * vhost_work execution acts instead of rcu_read_lock(). * Writers use virtqueue mutex. */ - void *private_data; + void __rcu *private_data; /* Log write descriptors */ void __user *log_base; struct vhost_log log[VHOST_NET_MAX_SG]; @@ -116,7 +116,7 @@ struct vhost_dev { /* Readers use RCU to access memory table pointer * log base pointer and features. * Writers use mutex below.*/ - struct vhost_memory *memory; + struct vhost_memory __rcu *memory; struct mm_struct *mm; struct mutex mutex; unsigned acked_features; @@ -173,7 +173,11 @@ enum { static inline int vhost_has_feature(struct vhost_dev *dev, int bit) { - unsigned acked_features = rcu_dereference(dev->acked_features); + unsigned acked_features; + + acked_features = + rcu_dereference_index_check(dev->acked_features, + lockdep_is_held(&dev->mutex)); return acked_features & (1 << bit); } diff --git a/fs/affs/super.c b/fs/affs/super.c index 33c4e7eef470..9581ea94d5a1 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -109,8 +109,8 @@ static void init_once(void *foo) { struct affs_inode_info *ei = (struct affs_inode_info *) foo; - init_MUTEX(&ei->i_link_lock); - init_MUTEX(&ei->i_ext_lock); + sema_init(&ei->i_link_lock, 1); + sema_init(&ei->i_ext_lock, 1); inode_init_once(&ei->vfs_inode); } diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c index f96eff04e11a..a6395bdb26ae 100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c @@ -134,10 +134,6 @@ static int aout_core_dump(struct coredump_params *cprm) if (!dump_write(file, dump_start, dump_size)) goto end_coredump; } -/* Finally dump the task struct. Not be used by gdb, but could be useful */ - set_fs(KERNEL_DS); - if (!dump_write(file, current, sizeof(*current))) - goto end_coredump; end_coredump: set_fs(fs); return has_dumped; diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig index 0fcd2640c23f..9eb134ea6eb2 100644 --- a/fs/ceph/Kconfig +++ b/fs/ceph/Kconfig @@ -1,9 +1,11 @@ config CEPH_FS tristate "Ceph distributed file system (EXPERIMENTAL)" depends on INET && EXPERIMENTAL + select CEPH_LIB select LIBCRC32C select CRYPTO_AES select CRYPTO + default n help Choose Y or M here to include support for mounting the experimental Ceph distributed file system. Ceph is an extremely @@ -14,15 +16,3 @@ config CEPH_FS If unsure, say N. -config CEPH_FS_PRETTYDEBUG - bool "Include file:line in ceph debug output" - depends on CEPH_FS - default n - help - If you say Y here, debug output will include a filename and - line to aid debugging. This icnreases kernel size and slows - execution slightly when debug call sites are enabled (e.g., - via CONFIG_DYNAMIC_DEBUG). - - If unsure, say N. - diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile index 278e1172600d..9e6c4f2e8ff1 100644 --- a/fs/ceph/Makefile +++ b/fs/ceph/Makefile @@ -8,15 +8,8 @@ obj-$(CONFIG_CEPH_FS) += ceph.o ceph-objs := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \ export.o caps.o snap.o xattr.o \ - messenger.o msgpool.o buffer.o pagelist.o \ - mds_client.o mdsmap.o \ - mon_client.o \ - osd_client.o osdmap.o crush/crush.o crush/mapper.o crush/hash.o \ - debugfs.o \ - auth.o auth_none.o \ - crypto.o armor.o \ - auth_x.o \ - ceph_fs.o ceph_strings.o ceph_hash.o ceph_frag.o + mds_client.o mdsmap.o strings.o ceph_frag.o \ + debugfs.o else #Otherwise we were called directly from the command diff --git a/fs/ceph/README b/fs/ceph/README deleted file mode 100644 index 18352fab37c0..000000000000 --- a/fs/ceph/README +++ /dev/null @@ -1,20 +0,0 @@ -# -# The following files are shared by (and manually synchronized -# between) the Ceph userland and kernel client. -# -# userland kernel -src/include/ceph_fs.h fs/ceph/ceph_fs.h -src/include/ceph_fs.cc fs/ceph/ceph_fs.c -src/include/msgr.h fs/ceph/msgr.h -src/include/rados.h fs/ceph/rados.h -src/include/ceph_strings.cc fs/ceph/ceph_strings.c -src/include/ceph_frag.h fs/ceph/ceph_frag.h -src/include/ceph_frag.cc fs/ceph/ceph_frag.c -src/include/ceph_hash.h fs/ceph/ceph_hash.h -src/include/ceph_hash.cc fs/ceph/ceph_hash.c -src/crush/crush.c fs/ceph/crush/crush.c -src/crush/crush.h fs/ceph/crush/crush.h -src/crush/mapper.c fs/ceph/crush/mapper.c -src/crush/mapper.h fs/ceph/crush/mapper.h -src/crush/hash.h fs/ceph/crush/hash.h -src/crush/hash.c fs/ceph/crush/hash.c diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index efbc604001c8..51bcc5ce3230 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1,4 +1,4 @@ -#include "ceph_debug.h" +#include <linux/ceph/ceph_debug.h> #include <linux/backing-dev.h> #include <linux/fs.h> @@ -10,7 +10,8 @@ #include <linux/task_io_accounting_ops.h> #include "super.h" -#include "osd_client.h" +#include "mds_client.h" +#include <linux/ceph/osd_client.h> /* * Ceph address space ops. @@ -193,7 +194,8 @@ static int readpage_nounlock(struct file *filp, struct page *page) { struct inode *inode = filp->f_dentry->d_inode; struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc; + struct ceph_osd_client *osdc = + &ceph_inode_to_client(inode)->client->osdc; int err = 0; u64 len = PAGE_CACHE_SIZE; @@ -265,7 +267,8 @@ static int ceph_readpages(struct file *file, struct address_space *mapping, { struct inode *inode = file->f_dentry->d_inode; struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc; + struct ceph_osd_client *osdc = + &ceph_inode_to_client(inode)->client->osdc; int rc = 0; struct page **pages; loff_t offset; @@ -365,7 +368,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) { struct inode *inode; struct ceph_inode_info *ci; - struct ceph_client *client; + struct ceph_fs_client *fsc; struct ceph_osd_client *osdc; loff_t page_off = page->index << PAGE_CACHE_SHIFT; int len = PAGE_CACHE_SIZE; @@ -383,8 +386,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) } inode = page->mapping->host; ci = ceph_inode(inode); - client = ceph_inode_to_client(inode); - osdc = &client->osdc; + fsc = ceph_inode_to_client(inode); + osdc = &fsc->client->osdc; /* verify this is a writeable snap context */ snapc = (void *)page->private; @@ -414,10 +417,10 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) dout("writepage %p page %p index %lu on %llu~%u snapc %p\n", inode, page, page->index, page_off, len, snapc); - writeback_stat = atomic_long_inc_return(&client->writeback_count); + writeback_stat = atomic_long_inc_return(&fsc->writeback_count); if (writeback_stat > - CONGESTION_ON_THRESH(client->mount_args->congestion_kb)) - set_bdi_congested(&client->backing_dev_info, BLK_RW_ASYNC); + CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb)) + set_bdi_congested(&fsc->backing_dev_info, BLK_RW_ASYNC); set_page_writeback(page); err = ceph_osdc_writepages(osdc, ceph_vino(inode), @@ -496,7 +499,7 @@ static void writepages_finish(struct ceph_osd_request *req, struct address_space *mapping = inode->i_mapping; __s32 rc = -EIO; u64 bytes = 0; - struct ceph_client *client = ceph_inode_to_client(inode); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); long writeback_stat; unsigned issued = ceph_caps_issued(ci); @@ -529,10 +532,10 @@ static void writepages_finish(struct ceph_osd_request *req, WARN_ON(!PageUptodate(page)); writeback_stat = - atomic_long_dec_return(&client->writeback_count); + atomic_long_dec_return(&fsc->writeback_count); if (writeback_stat < - CONGESTION_OFF_THRESH(client->mount_args->congestion_kb)) - clear_bdi_congested(&client->backing_dev_info, + CONGESTION_OFF_THRESH(fsc->mount_options->congestion_kb)) + clear_bdi_congested(&fsc->backing_dev_info, BLK_RW_ASYNC); ceph_put_snap_context((void *)page->private); @@ -569,13 +572,13 @@ static void writepages_finish(struct ceph_osd_request *req, * mempool. we avoid the mempool if we can because req->r_num_pages * may be less than the maximum write size. */ -static void alloc_page_vec(struct ceph_client *client, +static void alloc_page_vec(struct ceph_fs_client *fsc, struct ceph_osd_request *req) { req->r_pages = kmalloc(sizeof(struct page *) * req->r_num_pages, GFP_NOFS); if (!req->r_pages) { - req->r_pages = mempool_alloc(client->wb_pagevec_pool, GFP_NOFS); + req->r_pages = mempool_alloc(fsc->wb_pagevec_pool, GFP_NOFS); req->r_pages_from_pool = 1; WARN_ON(!req->r_pages); } @@ -590,7 +593,7 @@ static int ceph_writepages_start(struct address_space *mapping, struct inode *inode = mapping->host; struct backing_dev_info *bdi = mapping->backing_dev_info; struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_client *client; + struct ceph_fs_client *fsc; pgoff_t index, start, end; int range_whole = 0; int should_loop = 1; @@ -617,13 +620,13 @@ static int ceph_writepages_start(struct address_space *mapping, wbc->sync_mode == WB_SYNC_NONE ? "NONE" : (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD")); - client = ceph_inode_to_client(inode); - if (client->mount_state == CEPH_MOUNT_SHUTDOWN) { + fsc = ceph_inode_to_client(inode); + if (fsc->mount_state == CEPH_MOUNT_SHUTDOWN) { pr_warning("writepage_start %p on forced umount\n", inode); return -EIO; /* we're in a forced umount, don't write! */ } - if (client->mount_args->wsize && client->mount_args->wsize < wsize) - wsize = client->mount_args->wsize; + if (fsc->mount_options->wsize && fsc->mount_options->wsize < wsize) + wsize = fsc->mount_options->wsize; if (wsize < PAGE_CACHE_SIZE) wsize = PAGE_CACHE_SIZE; max_pages_ever = wsize >> PAGE_CACHE_SHIFT; @@ -769,7 +772,7 @@ get_more_pages: offset = (unsigned long long)page->index << PAGE_CACHE_SHIFT; len = wsize; - req = ceph_osdc_new_request(&client->osdc, + req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, ceph_vino(inode), offset, &len, @@ -782,7 +785,7 @@ get_more_pages: &inode->i_mtime, true, 1); max_pages = req->r_num_pages; - alloc_page_vec(client, req); + alloc_page_vec(fsc, req); req->r_callback = writepages_finish; req->r_inode = inode; } @@ -794,10 +797,10 @@ get_more_pages: inode, page, page->index); writeback_stat = - atomic_long_inc_return(&client->writeback_count); + atomic_long_inc_return(&fsc->writeback_count); if (writeback_stat > CONGESTION_ON_THRESH( - client->mount_args->congestion_kb)) { - set_bdi_congested(&client->backing_dev_info, + fsc->mount_options->congestion_kb)) { + set_bdi_congested(&fsc->backing_dev_info, BLK_RW_ASYNC); } @@ -846,7 +849,7 @@ get_more_pages: op->payload_len = cpu_to_le32(len); req->r_request->hdr.data_len = cpu_to_le32(len); - ceph_osdc_start_request(&client->osdc, req, true); + ceph_osdc_start_request(&fsc->client->osdc, req, true); req = NULL; /* continue? */ @@ -915,7 +918,7 @@ static int ceph_update_writeable_page(struct file *file, { struct inode *inode = file->f_dentry->d_inode; struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc; + struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; loff_t page_off = pos & PAGE_CACHE_MASK; int pos_in_page = pos & ~PAGE_CACHE_MASK; int end_in_page = pos_in_page + len; @@ -1053,8 +1056,8 @@ static int ceph_write_end(struct file *file, struct address_space *mapping, struct page *page, void *fsdata) { struct inode *inode = file->f_dentry->d_inode; - struct ceph_client *client = ceph_inode_to_client(inode); - struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_mds_client *mdsc = fsc->mdsc; unsigned from = pos & (PAGE_CACHE_SIZE - 1); int check_cap = 0; @@ -1123,7 +1126,7 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { struct inode *inode = vma->vm_file->f_dentry->d_inode; struct page *page = vmf->page; - struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc; + struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; loff_t off = page->index << PAGE_CACHE_SHIFT; loff_t size, len; int ret; diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 5e9da996a151..98ab13e2b71d 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1,4 +1,4 @@ -#include "ceph_debug.h" +#include <linux/ceph/ceph_debug.h> #include <linux/fs.h> #include <linux/kernel.h> @@ -9,8 +9,9 @@ #include <linux/writeback.h> #include "super.h" -#include "decode.h" -#include "messenger.h" +#include "mds_client.h" +#include <linux/ceph/decode.h> +#include <linux/ceph/messenger.h> /* * Capability management @@ -287,11 +288,11 @@ void ceph_put_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap) spin_unlock(&mdsc->caps_list_lock); } -void ceph_reservation_status(struct ceph_client *client, +void ceph_reservation_status(struct ceph_fs_client *fsc, int *total, int *avail, int *used, int *reserved, int *min) { - struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_mds_client *mdsc = fsc->mdsc; if (total) *total = mdsc->caps_total_count; @@ -399,7 +400,7 @@ static void __insert_cap_node(struct ceph_inode_info *ci, static void __cap_set_timeouts(struct ceph_mds_client *mdsc, struct ceph_inode_info *ci) { - struct ceph_mount_args *ma = mdsc->client->mount_args; + struct ceph_mount_options *ma = mdsc->fsc->mount_options; ci->i_hold_caps_min = round_jiffies(jiffies + ma->caps_wanted_delay_min * HZ); @@ -515,7 +516,7 @@ int ceph_add_cap(struct inode *inode, unsigned seq, unsigned mseq, u64 realmino, int flags, struct ceph_cap_reservation *caps_reservation) { - struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc; + struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_cap *new_cap = NULL; struct ceph_cap *cap; @@ -873,7 +874,7 @@ void __ceph_remove_cap(struct ceph_cap *cap) struct ceph_mds_session *session = cap->session; struct ceph_inode_info *ci = cap->ci; struct ceph_mds_client *mdsc = - &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; + ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; int removed = 0; dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode); @@ -1210,7 +1211,7 @@ void __ceph_flush_snaps(struct ceph_inode_info *ci, int mds; struct ceph_cap_snap *capsnap; u32 mseq; - struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc; + struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; struct ceph_mds_session *session = NULL; /* if session != NULL, we hold session->s_mutex */ u64 next_follows = 0; /* keep track of how far we've gotten through the @@ -1336,7 +1337,7 @@ static void ceph_flush_snaps(struct ceph_inode_info *ci) void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) { struct ceph_mds_client *mdsc = - &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; + ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; struct inode *inode = &ci->vfs_inode; int was = ci->i_dirty_caps; int dirty = 0; @@ -1378,7 +1379,7 @@ void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) static int __mark_caps_flushing(struct inode *inode, struct ceph_mds_session *session) { - struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_inode_info *ci = ceph_inode(inode); int flushing; @@ -1416,17 +1417,6 @@ static int __mark_caps_flushing(struct inode *inode, /* * try to invalidate mapping pages without blocking. */ -static int mapping_is_empty(struct address_space *mapping) -{ - struct page *page = find_get_page(mapping, 0); - - if (!page) - return 1; - - put_page(page); - return 0; -} - static int try_nonblocking_invalidate(struct inode *inode) { struct ceph_inode_info *ci = ceph_inode(inode); @@ -1436,7 +1426,7 @@ static int try_nonblocking_invalidate(struct inode *inode) invalidate_mapping_pages(&inode->i_data, 0, -1); spin_lock(&inode->i_lock); - if (mapping_is_empty(&inode->i_data) && + if (inode->i_data.nrpages == 0 && invalidating_gen == ci->i_rdcache_gen) { /* success. */ dout("try_nonblocking_invalidate %p success\n", inode); @@ -1462,8 +1452,8 @@ static int try_nonblocking_invalidate(struct inode *inode) void ceph_check_caps(struct ceph_inode_info *ci, int flags, struct ceph_mds_session *session) { - struct ceph_client *client = ceph_inode_to_client(&ci->vfs_inode); - struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode); + struct ceph_mds_client *mdsc = fsc->mdsc; struct inode *inode = &ci->vfs_inode; struct ceph_cap *cap; int file_wanted, used; @@ -1533,7 +1523,7 @@ retry_locked: */ if ((!is_delayed || mdsc->stopping) && ci->i_wrbuffer_ref == 0 && /* no dirty pages... */ - ci->i_rdcache_gen && /* may have cached pages */ + inode->i_data.nrpages && /* have cached pages */ (file_wanted == 0 || /* no open files */ (revoking & (CEPH_CAP_FILE_CACHE| CEPH_CAP_FILE_LAZYIO))) && /* or revoking cache */ @@ -1706,7 +1696,7 @@ ack: static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session, unsigned *flush_tid) { - struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_inode_info *ci = ceph_inode(inode); int unlock_session = session ? 0 : 1; int flushing = 0; @@ -1872,7 +1862,7 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc) caps_are_flushed(inode, flush_tid)); } else { struct ceph_mds_client *mdsc = - &ceph_sb_to_client(inode->i_sb)->mdsc; + ceph_sb_to_client(inode->i_sb)->mdsc; spin_lock(&inode->i_lock); if (__ceph_caps_dirty(ci)) @@ -2465,7 +2455,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, __releases(inode->i_lock) { struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; unsigned seq = le32_to_cpu(m->seq); int dirty = le32_to_cpu(m->dirty); int cleaned = 0; @@ -2713,7 +2703,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, struct ceph_msg *msg) { struct ceph_mds_client *mdsc = session->s_mdsc; - struct super_block *sb = mdsc->client->sb; + struct super_block *sb = mdsc->fsc->sb; struct inode *inode; struct ceph_cap *cap; struct ceph_mds_caps *h; diff --git a/fs/ceph/ceph_frag.c b/fs/ceph/ceph_frag.c index ab6cf35c4091..bdce8b1fbd06 100644 --- a/fs/ceph/ceph_frag.c +++ b/fs/ceph/ceph_frag.c @@ -1,7 +1,8 @@ /* * Ceph 'frag' type */ -#include "types.h" +#include <linux/module.h> +#include <linux/ceph/types.h> int ceph_frag_compare(__u32 a, __u32 b) { diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 6fd8b20a8611..7ae1b3d55b58 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -1,4 +1,4 @@ -#include "ceph_debug.h" +#include <linux/ceph/ceph_debug.h> #include <linux/device.h> #include <linux/slab.h> @@ -7,143 +7,49 @@ #include <linux/debugfs.h> #include <linux/seq_file.h> +#include <linux/ceph/libceph.h> +#include <linux/ceph/mon_client.h> +#include <linux/ceph/auth.h> +#include <linux/ceph/debugfs.h> + #include "super.h" -#include "mds_client.h" -#include "mon_client.h" -#include "auth.h" #ifdef CONFIG_DEBUG_FS -/* - * Implement /sys/kernel/debug/ceph fun - * - * /sys/kernel/debug/ceph/client* - an instance of the ceph client - * .../osdmap - current osdmap - * .../mdsmap - current mdsmap - * .../monmap - current monmap - * .../osdc - active osd requests - * .../mdsc - active mds requests - * .../monc - mon client state - * .../dentry_lru - dump contents of dentry lru - * .../caps - expose cap (reservation) stats - * .../bdi - symlink to ../../bdi/something - */ - -static struct dentry *ceph_debugfs_dir; - -static int monmap_show(struct seq_file *s, void *p) -{ - int i; - struct ceph_client *client = s->private; - - if (client->monc.monmap == NULL) - return 0; - - seq_printf(s, "epoch %d\n", client->monc.monmap->epoch); - for (i = 0; i < client->monc.monmap->num_mon; i++) { - struct ceph_entity_inst *inst = - &client->monc.monmap->mon_inst[i]; - - seq_printf(s, "\t%s%lld\t%s\n", - ENTITY_NAME(inst->name), - pr_addr(&inst->addr.in_addr)); - } - return 0; -} +#include "mds_client.h" static int mdsmap_show(struct seq_file *s, void *p) { int i; - struct ceph_client *client = s->private; + struct ceph_fs_client *fsc = s->private; - if (client->mdsc.mdsmap == NULL) + if (fsc->mdsc == NULL || fsc->mdsc->mdsmap == NULL) return 0; - seq_printf(s, "epoch %d\n", client->mdsc.mdsmap->m_epoch); - seq_printf(s, "root %d\n", client->mdsc.mdsmap->m_root); + seq_printf(s, "epoch %d\n", fsc->mdsc->mdsmap->m_epoch); + seq_printf(s, "root %d\n", fsc->mdsc->mdsmap->m_root); seq_printf(s, "session_timeout %d\n", - client->mdsc.mdsmap->m_session_timeout); + fsc->mdsc->mdsmap->m_session_timeout); seq_printf(s, "session_autoclose %d\n", - client->mdsc.mdsmap->m_session_autoclose); - for (i = 0; i < client->mdsc.mdsmap->m_max_mds; i++) { + fsc->mdsc->mdsmap->m_session_autoclose); + for (i = 0; i < fsc->mdsc->mdsmap->m_max_mds; i++) { struct ceph_entity_addr *addr = - &client->mdsc.mdsmap->m_info[i].addr; - int state = client->mdsc.mdsmap->m_info[i].state; + &fsc->mdsc->mdsmap->m_info[i].addr; + int state = fsc->mdsc->mdsmap->m_info[i].state; - seq_printf(s, "\tmds%d\t%s\t(%s)\n", i, pr_addr(&addr->in_addr), + seq_printf(s, "\tmds%d\t%s\t(%s)\n", i, + ceph_pr_addr(&addr->in_addr), ceph_mds_state_name(state)); } return 0; } -static int osdmap_show(struct seq_file *s, void *p) -{ - int i; - struct ceph_client *client = s->private; - struct rb_node *n; - - if (client->osdc.osdmap == NULL) - return 0; - seq_printf(s, "epoch %d\n", client->osdc.osdmap->epoch); - seq_printf(s, "flags%s%s\n", - (client->osdc.osdmap->flags & CEPH_OSDMAP_NEARFULL) ? - " NEARFULL" : "", - (client->osdc.osdmap->flags & CEPH_OSDMAP_FULL) ? - " FULL" : ""); - for (n = rb_first(&client->osdc.osdmap->pg_pools); n; n = rb_next(n)) { - struct ceph_pg_pool_info *pool = - rb_entry(n, struct ceph_pg_pool_info, node); - seq_printf(s, "pg_pool %d pg_num %d / %d, lpg_num %d / %d\n", - pool->id, pool->v.pg_num, pool->pg_num_mask, - pool->v.lpg_num, pool->lpg_num_mask); - } - for (i = 0; i < client->osdc.osdmap->max_osd; i++) { - struct ceph_entity_addr *addr = - &client->osdc.osdmap->osd_addr[i]; - int state = client->osdc.osdmap->osd_state[i]; - char sb[64]; - - seq_printf(s, "\tosd%d\t%s\t%3d%%\t(%s)\n", - i, pr_addr(&addr->in_addr), - ((client->osdc.osdmap->osd_weight[i]*100) >> 16), - ceph_osdmap_state_str(sb, sizeof(sb), state)); - } - return 0; -} - -static int monc_show(struct seq_file *s, void *p) -{ - struct ceph_client *client = s->private; - struct ceph_mon_generic_request *req; - struct ceph_mon_client *monc = &client->monc; - struct rb_node *rp; - - mutex_lock(&monc->mutex); - - if (monc->have_mdsmap) - seq_printf(s, "have mdsmap %u\n", (unsigned)monc->have_mdsmap); - if (monc->have_osdmap) - seq_printf(s, "have osdmap %u\n", (unsigned)monc->have_osdmap); - if (monc->want_next_osdmap) - seq_printf(s, "want next osdmap\n"); - - for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) { - __u16 op; - req = rb_entry(rp, struct ceph_mon_generic_request, node); - op = le16_to_cpu(req->request->hdr.type); - if (op == CEPH_MSG_STATFS) - seq_printf(s, "%lld statfs\n", req->tid); - else - seq_printf(s, "%lld unknown\n", req->tid); - } - - mutex_unlock(&monc->mutex); - return 0; -} - +/* + * mdsc debugfs + */ static int mdsc_show(struct seq_file *s, void *p) { - struct ceph_client *client = s->private; - struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_fs_client *fsc = s->private; + struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req; struct rb_node *rp; int pathlen; @@ -214,61 +120,12 @@ static int mdsc_show(struct seq_file *s, void *p) return 0; } -static int osdc_show(struct seq_file *s, void *pp) -{ - struct ceph_client *client = s->private; - struct ceph_osd_client *osdc = &client->osdc; - struct rb_node *p; - - mutex_lock(&osdc->request_mutex); - for (p = rb_first(&osdc->requests); p; p = rb_next(p)) { - struct ceph_osd_request *req; - struct ceph_osd_request_head *head; - struct ceph_osd_op *op; - int num_ops; - int opcode, olen; - int i; - - req = rb_entry(p, struct ceph_osd_request, r_node); - - seq_printf(s, "%lld\tosd%d\t%d.%x\t", req->r_tid, - req->r_osd ? req->r_osd->o_osd : -1, - le32_to_cpu(req->r_pgid.pool), - le16_to_cpu(req->r_pgid.ps)); - - head = req->r_request->front.iov_base; - op = (void *)(head + 1); - - num_ops = le16_to_cpu(head->num_ops); - olen = le32_to_cpu(head->object_len); - seq_printf(s, "%.*s", olen, - (const char *)(head->ops + num_ops)); - - if (req->r_reassert_version.epoch) - seq_printf(s, "\t%u'%llu", - (unsigned)le32_to_cpu(req->r_reassert_version.epoch), - le64_to_cpu(req->r_reassert_version.version)); - else - seq_printf(s, "\t"); - - for (i = 0; i < num_ops; i++) { - opcode = le16_to_cpu(op->op); - seq_printf(s, "\t%s", ceph_osd_op_name(opcode)); - op++; - } - - seq_printf(s, "\n"); - } - mutex_unlock(&osdc->request_mutex); - return 0; -} - static int caps_show(struct seq_file *s, void *p) { - struct ceph_client *client = s->private; + struct ceph_fs_client *fsc = s->private; int total, avail, used, reserved, min; - ceph_reservation_status(client, &total, &avail, &used, &reserved, &min); + ceph_reservation_status(fsc, &total, &avail, &used, &reserved, &min); seq_printf(s, "total\t\t%d\n" "avail\t\t%d\n" "used\t\t%d\n" @@ -280,8 +137,8 @@ static int caps_show(struct seq_file *s, void *p) static int dentry_lru_show(struct seq_file *s, void *ptr) { - struct ceph_client *client = s->private; - struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_fs_client *fsc = s->private; + struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_dentry_info *di; spin_lock(&mdsc->dentry_lru_lock); @@ -295,199 +152,124 @@ static int dentry_lru_show(struct seq_file *s, void *ptr) return 0; } -#define DEFINE_SHOW_FUNC(name) \ -static int name##_open(struct inode *inode, struct file *file) \ -{ \ - struct seq_file *sf; \ - int ret; \ - \ - ret = single_open(file, name, NULL); \ - sf = file->private_data; \ - sf->private = inode->i_private; \ - return ret; \ -} \ - \ -static const struct file_operations name##_fops = { \ - .open = name##_open, \ - .read = seq_read, \ - .llseek = seq_lseek, \ - .release = single_release, \ -}; - -DEFINE_SHOW_FUNC(monmap_show) -DEFINE_SHOW_FUNC(mdsmap_show) -DEFINE_SHOW_FUNC(osdmap_show) -DEFINE_SHOW_FUNC(monc_show) -DEFINE_SHOW_FUNC(mdsc_show) -DEFINE_SHOW_FUNC(osdc_show) -DEFINE_SHOW_FUNC(dentry_lru_show) -DEFINE_SHOW_FUNC(caps_show) +CEPH_DEFINE_SHOW_FUNC(mdsmap_show) +CEPH_DEFINE_SHOW_FUNC(mdsc_show) +CEPH_DEFINE_SHOW_FUNC(caps_show) +CEPH_DEFINE_SHOW_FUNC(dentry_lru_show) + +/* + * debugfs + */ static int congestion_kb_set(void *data, u64 val) { - struct ceph_client *client = (struct ceph_client *)data; - - if (client) - client->mount_args->congestion_kb = (int)val; + struct ceph_fs_client *fsc = (struct ceph_fs_client *)data; + fsc->mount_options->congestion_kb = (int)val; return 0; } static int congestion_kb_get(void *data, u64 *val) { - struct ceph_client *client = (struct ceph_client *)data; - - if (client) - *val = (u64)client->mount_args->congestion_kb; + struct ceph_fs_client *fsc = (struct ceph_fs_client *)data; + *val = (u64)fsc->mount_options->congestion_kb; return 0; } - DEFINE_SIMPLE_ATTRIBUTE(congestion_kb_fops, congestion_kb_get, congestion_kb_set, "%llu\n"); -int __init ceph_debugfs_init(void) -{ - ceph_debugfs_dir = debugfs_create_dir("ceph", NULL); - if (!ceph_debugfs_dir) - return -ENOMEM; - return 0; -} -void ceph_debugfs_cleanup(void) +void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc) { - debugfs_remove(ceph_debugfs_dir); + dout("ceph_fs_debugfs_cleanup\n"); + debugfs_remove(fsc->debugfs_bdi); + debugfs_remove(fsc->debugfs_congestion_kb); + debugfs_remove(fsc->debugfs_mdsmap); + debugfs_remove(fsc->debugfs_caps); + debugfs_remove(fsc->debugfs_mdsc); + debugfs_remove(fsc->debugfs_dentry_lru); } -int ceph_debugfs_client_init(struct ceph_client *client) +int ceph_fs_debugfs_init(struct ceph_fs_client *fsc) { - int ret = 0; - char name[80]; - - snprintf(name, sizeof(name), "%pU.client%lld", &client->fsid, - client->monc.auth->global_id); + char name[100]; + int err = -ENOMEM; - client->debugfs_dir = debugfs_create_dir(name, ceph_debugfs_dir); - if (!client->debugfs_dir) - goto out; - - client->monc.debugfs_file = debugfs_create_file("monc", - 0600, - client->debugfs_dir, - client, - &monc_show_fops); - if (!client->monc.debugfs_file) + dout("ceph_fs_debugfs_init\n"); + fsc->debugfs_congestion_kb = + debugfs_create_file("writeback_congestion_kb", + 0600, + fsc->client->debugfs_dir, + fsc, + &congestion_kb_fops); + if (!fsc->debugfs_congestion_kb) goto out; - client->mdsc.debugfs_file = debugfs_create_file("mdsc", - 0600, - client->debugfs_dir, - client, - &mdsc_show_fops); - if (!client->mdsc.debugfs_file) - goto out; + dout("a\n"); - client->osdc.debugfs_file = debugfs_create_file("osdc", - 0600, - client->debugfs_dir, - client, - &osdc_show_fops); - if (!client->osdc.debugfs_file) + snprintf(name, sizeof(name), "../../bdi/%s", + dev_name(fsc->backing_dev_info.dev)); + fsc->debugfs_bdi = + debugfs_create_symlink("bdi", + fsc->client->debugfs_dir, + name); + if (!fsc->debugfs_bdi) goto out; - client->debugfs_monmap = debugfs_create_file("monmap", + dout("b\n"); + fsc->debugfs_mdsmap = debugfs_create_file("mdsmap", 0600, - client->debugfs_dir, - client, - &monmap_show_fops); - if (!client->debugfs_monmap) - goto out; - - client->debugfs_mdsmap = debugfs_create_file("mdsmap", - 0600, - client->debugfs_dir, - client, + fsc->client->debugfs_dir, + fsc, &mdsmap_show_fops); - if (!client->debugfs_mdsmap) - goto out; - - client->debugfs_osdmap = debugfs_create_file("osdmap", - 0600, - client->debugfs_dir, - client, - &osdmap_show_fops); - if (!client->debugfs_osdmap) + if (!fsc->debugfs_mdsmap) goto out; - client->debugfs_dentry_lru = debugfs_create_file("dentry_lru", - 0600, - client->debugfs_dir, - client, - &dentry_lru_show_fops); - if (!client->debugfs_dentry_lru) + dout("ca\n"); + fsc->debugfs_mdsc = debugfs_create_file("mdsc", + 0600, + fsc->client->debugfs_dir, + fsc, + &mdsc_show_fops); + if (!fsc->debugfs_mdsc) goto out; - client->debugfs_caps = debugfs_create_file("caps", + dout("da\n"); + fsc->debugfs_caps = debugfs_create_file("caps", 0400, - client->debugfs_dir, - client, + fsc->client->debugfs_dir, + fsc, &caps_show_fops); - if (!client->debugfs_caps) + if (!fsc->debugfs_caps) goto out; - client->debugfs_congestion_kb = - debugfs_create_file("writeback_congestion_kb", - 0600, - client->debugfs_dir, - client, - &congestion_kb_fops); - if (!client->debugfs_congestion_kb) + dout("ea\n"); + fsc->debugfs_dentry_lru = debugfs_create_file("dentry_lru", + 0600, + fsc->client->debugfs_dir, + fsc, + &dentry_lru_show_fops); + if (!fsc->debugfs_dentry_lru) goto out; - sprintf(name, "../../bdi/%s", dev_name(client->sb->s_bdi->dev)); - client->debugfs_bdi = debugfs_create_symlink("bdi", client->debugfs_dir, - name); - return 0; out: - ceph_debugfs_client_cleanup(client); - return ret; + ceph_fs_debugfs_cleanup(fsc); + return err; } -void ceph_debugfs_client_cleanup(struct ceph_client *client) -{ - debugfs_remove(client->debugfs_bdi); - debugfs_remove(client->debugfs_caps); - debugfs_remove(client->debugfs_dentry_lru); - debugfs_remove(client->debugfs_osdmap); - debugfs_remove(client->debugfs_mdsmap); - debugfs_remove(client->debugfs_monmap); - debugfs_remove(client->osdc.debugfs_file); - debugfs_remove(client->mdsc.debugfs_file); - debugfs_remove(client->monc.debugfs_file); - debugfs_remove(client->debugfs_congestion_kb); - debugfs_remove(client->debugfs_dir); -} #else /* CONFIG_DEBUG_FS */ -int __init ceph_debugfs_init(void) -{ - return 0; -} - -void ceph_debugfs_cleanup(void) -{ -} - -int ceph_debugfs_client_init(struct ceph_client *client) +int ceph_fs_debugfs_init(struct ceph_fs_client *fsc) { return 0; } -void ceph_debugfs_client_cleanup(struct ceph_client *client) +void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc) { } diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index a1986eb52045..e0a2dc6fcafc 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -1,4 +1,4 @@ -#include "ceph_debug.h" +#include <linux/ceph/ceph_debug.h> #include <linux/spinlock.h> #include <linux/fs_struct.h> @@ -7,6 +7,7 @@ #include <linux/sched.h> #include "super.h" +#include "mds_client.h" /* * Directory operations: readdir, lookup, create, link, unlink, @@ -94,10 +95,7 @@ static unsigned fpos_off(loff_t p) */ static int __dcache_readdir(struct file *filp, void *dirent, filldir_t filldir) - __releases(inode->i_lock) - __acquires(inode->i_lock) { - struct inode *inode = filp->f_dentry->d_inode; struct ceph_file_info *fi = filp->private_data; struct dentry *parent = filp->f_dentry; struct inode *dir = parent->d_inode; @@ -153,7 +151,6 @@ more: atomic_inc(&dentry->d_count); spin_unlock(&dcache_lock); - spin_unlock(&inode->i_lock); dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, filp->f_pos, dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode); @@ -171,35 +168,30 @@ more: } else { dput(last); } - last = NULL; } - - spin_lock(&inode->i_lock); - spin_lock(&dcache_lock); - last = dentry; if (err < 0) - goto out_unlock; + goto out; - p = p->prev; filp->f_pos++; /* make sure a dentry wasn't dropped while we didn't have dcache_lock */ - if ((ceph_inode(dir)->i_ceph_flags & CEPH_I_COMPLETE)) - goto more; - dout(" lost I_COMPLETE on %p; falling back to mds\n", dir); - err = -EAGAIN; + if (!ceph_i_test(dir, CEPH_I_COMPLETE)) { + dout(" lost I_COMPLETE on %p; falling back to mds\n", dir); + err = -EAGAIN; + goto out; + } + + spin_lock(&dcache_lock); + p = p->prev; /* advance to next dentry */ + goto more; out_unlock: spin_unlock(&dcache_lock); - - if (last) { - spin_unlock(&inode->i_lock); +out: + if (last) dput(last); - spin_lock(&inode->i_lock); - } - return err; } @@ -227,15 +219,15 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir) struct ceph_file_info *fi = filp->private_data; struct inode *inode = filp->f_dentry->d_inode; struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_client *client = ceph_inode_to_client(inode); - struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_mds_client *mdsc = fsc->mdsc; unsigned frag = fpos_frag(filp->f_pos); int off = fpos_off(filp->f_pos); int err; u32 ftype; struct ceph_mds_reply_info_parsed *rinfo; - const int max_entries = client->mount_args->max_readdir; - const int max_bytes = client->mount_args->max_readdir_bytes; + const int max_entries = fsc->mount_options->max_readdir; + const int max_bytes = fsc->mount_options->max_readdir_bytes; dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off); if (fi->at_end) @@ -267,17 +259,17 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir) /* can we use the dcache? */ spin_lock(&inode->i_lock); if ((filp->f_pos == 2 || fi->dentry) && - !ceph_test_opt(client, NOASYNCREADDIR) && + !ceph_test_mount_opt(fsc, NOASYNCREADDIR) && ceph_snap(inode) != CEPH_SNAPDIR && (ci->i_ceph_flags & CEPH_I_COMPLETE) && __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) { + spin_unlock(&inode->i_lock); err = __dcache_readdir(filp, dirent, filldir); - if (err != -EAGAIN) { - spin_unlock(&inode->i_lock); + if (err != -EAGAIN) return err; - } + } else { + spin_unlock(&inode->i_lock); } - spin_unlock(&inode->i_lock); if (fi->dentry) { err = note_last_dentry(fi, fi->dentry->d_name.name, fi->dentry->d_name.len); @@ -487,14 +479,13 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin) struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, struct dentry *dentry, int err) { - struct ceph_client *client = ceph_sb_to_client(dentry->d_sb); + struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); struct inode *parent = dentry->d_parent->d_inode; /* .snap dir? */ if (err == -ENOENT && - ceph_vino(parent).ino != CEPH_INO_ROOT && /* no .snap in root dir */ strcmp(dentry->d_name.name, - client->mount_args->snapdir_name) == 0) { + fsc->mount_options->snapdir_name) == 0) { struct inode *inode = ceph_get_snapdir(parent); dout("ENOENT on snapdir %p '%.*s', linking to snapdir %p\n", dentry, dentry->d_name.len, dentry->d_name.name, inode); @@ -539,8 +530,8 @@ static int is_root_ceph_dentry(struct inode *inode, struct dentry *dentry) static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) { - struct ceph_client *client = ceph_sb_to_client(dir->i_sb); - struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req; int op; int err; @@ -572,7 +563,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, spin_lock(&dir->i_lock); dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags); if (strncmp(dentry->d_name.name, - client->mount_args->snapdir_name, + fsc->mount_options->snapdir_name, dentry->d_name.len) && !is_root_ceph_dentry(dir, dentry) && (ci->i_ceph_flags & CEPH_I_COMPLETE) && @@ -629,8 +620,8 @@ int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry) static int ceph_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev) { - struct ceph_client *client = ceph_sb_to_client(dir->i_sb); - struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req; int err; @@ -685,8 +676,8 @@ static int ceph_create(struct inode *dir, struct dentry *dentry, int mode, static int ceph_symlink(struct inode *dir, struct dentry *dentry, const char *dest) { - struct ceph_client *client = ceph_sb_to_client(dir->i_sb); - struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req; int err; @@ -716,8 +707,8 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry, static int ceph_mkdir(struct inode *dir, struct dentry *dentry, int mode) { - struct ceph_client *client = ceph_sb_to_client(dir->i_sb); - struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req; int err = -EROFS; int op; @@ -758,8 +749,8 @@ out: static int ceph_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { - struct ceph_client *client = ceph_sb_to_client(dir->i_sb); - struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req; int err; @@ -813,8 +804,8 @@ static int drop_caps_for_unlink(struct inode *inode) */ static int ceph_unlink(struct inode *dir, struct dentry *dentry) { - struct ceph_client *client = ceph_sb_to_client(dir->i_sb); - struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; struct inode *inode = dentry->d_inode; struct ceph_mds_request *req; int err = -EROFS; @@ -854,8 +845,8 @@ out: static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { - struct ceph_client *client = ceph_sb_to_client(old_dir->i_sb); - struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_fs_client *fsc = ceph_sb_to_client(old_dir->i_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req; int err; @@ -1076,7 +1067,7 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size, struct ceph_inode_info *ci = ceph_inode(inode); int left; - if (!ceph_test_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT)) + if (!ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT)) return -EISDIR; if (!cf->dir_info) { @@ -1177,7 +1168,7 @@ void ceph_dentry_lru_add(struct dentry *dn) dout("dentry_lru_add %p %p '%.*s'\n", di, dn, dn->d_name.len, dn->d_name.name); if (di) { - mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc; + mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; spin_lock(&mdsc->dentry_lru_lock); list_add_tail(&di->lru, &mdsc->dentry_lru); mdsc->num_dentry++; @@ -1193,7 +1184,7 @@ void ceph_dentry_lru_touch(struct dentry *dn) dout("dentry_lru_touch %p %p '%.*s' (offset %lld)\n", di, dn, dn->d_name.len, dn->d_name.name, di->offset); if (di) { - mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc; + mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; spin_lock(&mdsc->dentry_lru_lock); list_move_tail(&di->lru, &mdsc->dentry_lru); spin_unlock(&mdsc->dentry_lru_lock); @@ -1208,7 +1199,7 @@ void ceph_dentry_lru_del(struct dentry *dn) dout("dentry_lru_del %p %p '%.*s'\n", di, dn, dn->d_name.len, dn->d_name.name); if (di) { - mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc; + mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; spin_lock(&mdsc->dentry_lru_lock); list_del_init(&di->lru); mdsc->num_dentry--; diff --git a/fs/ceph/export.c b/fs/ceph/export.c index e38423e82f2e..2297d9426992 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -1,10 +1,11 @@ -#include "ceph_debug.h" +#include <linux/ceph/ceph_debug.h> #include <linux/exportfs.h> #include <linux/slab.h> #include <asm/unaligned.h> #include "super.h" +#include "mds_client.h" /* * NFS export support @@ -120,7 +121,7 @@ static struct dentry *__fh_to_dentry(struct super_block *sb, static struct dentry *__cfh_to_dentry(struct super_block *sb, struct ceph_nfs_confh *cfh) { - struct ceph_mds_client *mdsc = &ceph_sb_to_client(sb)->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc; struct inode *inode; struct dentry *dentry; struct ceph_vino vino; diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 66e4da6dba22..e77c28cf3690 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1,5 +1,6 @@ -#include "ceph_debug.h" +#include <linux/ceph/ceph_debug.h> +#include <linux/module.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/file.h> @@ -38,8 +39,8 @@ static struct ceph_mds_request * prepare_open_request(struct super_block *sb, int flags, int create_mode) { - struct ceph_client *client = ceph_sb_to_client(sb); - struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_fs_client *fsc = ceph_sb_to_client(sb); + struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req; int want_auth = USE_ANY_MDS; int op = (flags & O_CREAT) ? CEPH_MDS_OP_CREATE : CEPH_MDS_OP_OPEN; @@ -117,8 +118,8 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode) int ceph_open(struct inode *inode, struct file *file) { struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_client *client = ceph_sb_to_client(inode->i_sb); - struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req; struct ceph_file_info *cf = file->private_data; struct inode *parent_inode = file->f_dentry->d_parent->d_inode; @@ -216,8 +217,8 @@ struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd, int mode, int locked_dir) { - struct ceph_client *client = ceph_sb_to_client(dir->i_sb); - struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; struct file *file = nd->intent.open.file; struct inode *parent_inode = get_dentry_parent_inode(file->f_dentry); struct ceph_mds_request *req; @@ -270,163 +271,6 @@ int ceph_release(struct inode *inode, struct file *file) } /* - * build a vector of user pages - */ -static struct page **get_direct_page_vector(const char __user *data, - int num_pages, - loff_t off, size_t len) -{ - struct page **pages; - int rc; - - pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS); - if (!pages) - return ERR_PTR(-ENOMEM); - - down_read(¤t->mm->mmap_sem); - rc = get_user_pages(current, current->mm, (unsigned long)data, - num_pages, 0, 0, pages, NULL); - up_read(¤t->mm->mmap_sem); - if (rc < 0) - goto fail; - return pages; - -fail: - kfree(pages); - return ERR_PTR(rc); -} - -static void put_page_vector(struct page **pages, int num_pages) -{ - int i; - - for (i = 0; i < num_pages; i++) - put_page(pages[i]); - kfree(pages); -} - -void ceph_release_page_vector(struct page **pages, int num_pages) -{ - int i; - - for (i = 0; i < num_pages; i++) - __free_pages(pages[i], 0); - kfree(pages); -} - -/* - * allocate a vector new pages - */ -static struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags) -{ - struct page **pages; - int i; - - pages = kmalloc(sizeof(*pages) * num_pages, flags); - if (!pages) - return ERR_PTR(-ENOMEM); - for (i = 0; i < num_pages; i++) { - pages[i] = __page_cache_alloc(flags); - if (pages[i] == NULL) { - ceph_release_page_vector(pages, i); - return ERR_PTR(-ENOMEM); - } - } - return pages; -} - -/* - * copy user data into a page vector - */ -static int copy_user_to_page_vector(struct page **pages, - const char __user *data, - loff_t off, size_t len) -{ - int i = 0; - int po = off & ~PAGE_CACHE_MASK; - int left = len; - int l, bad; - - while (left > 0) { - l = min_t(int, PAGE_CACHE_SIZE-po, left); - bad = copy_from_user(page_address(pages[i]) + po, data, l); - if (bad == l) - return -EFAULT; - data += l - bad; - left -= l - bad; - po += l - bad; - if (po == PAGE_CACHE_SIZE) { - po = 0; - i++; - } - } - return len; -} - -/* - * copy user data from a page vector into a user pointer - */ -static int copy_page_vector_to_user(struct page **pages, char __user *data, - loff_t off, size_t len) -{ - int i = 0; - int po = off & ~PAGE_CACHE_MASK; - int left = len; - int l, bad; - - while (left > 0) { - l = min_t(int, left, PAGE_CACHE_SIZE-po); - bad = copy_to_user(data, page_address(pages[i]) + po, l); - if (bad == l) - return -EFAULT; - data += l - bad; - left -= l - bad; - if (po) { - po += l - bad; - if (po == PAGE_CACHE_SIZE) - po = 0; - } - i++; - } - return len; -} - -/* - * Zero an extent within a page vector. Offset is relative to the - * start of the first page. - */ -static void zero_page_vector_range(int off, int len, struct page **pages) -{ - int i = off >> PAGE_CACHE_SHIFT; - - off &= ~PAGE_CACHE_MASK; - - dout("zero_page_vector_page %u~%u\n", off, len); - - /* leading partial page? */ - if (off) { - int end = min((int)PAGE_CACHE_SIZE, off + len); - dout("zeroing %d %p head from %d\n", i, pages[i], - (int)off); - zero_user_segment(pages[i], off, end); - len -= (end - off); - i++; - } - while (len >= PAGE_CACHE_SIZE) { - dout("zeroing %d %p len=%d\n", i, pages[i], len); - zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE); - len -= PAGE_CACHE_SIZE; - i++; - } - /* trailing partial page? */ - if (len) { - dout("zeroing %d %p tail to %d\n", i, pages[i], (int)len); - zero_user_segment(pages[i], 0, len); - } -} - - -/* * Read a range of bytes striped over one or more objects. Iterate over * objects we stripe over. (That's not atomic, but good enough for now.) * @@ -438,7 +282,7 @@ static int striped_read(struct inode *inode, struct page **pages, int num_pages, int *checkeof) { - struct ceph_client *client = ceph_inode_to_client(inode); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); u64 pos, this_len; int page_off = off & ~PAGE_CACHE_MASK; /* first byte's offset in page */ @@ -459,7 +303,7 @@ static int striped_read(struct inode *inode, more: this_len = left; - ret = ceph_osdc_readpages(&client->osdc, ceph_vino(inode), + ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode), &ci->i_layout, pos, &this_len, ci->i_truncate_seq, ci->i_truncate_size, @@ -477,8 +321,8 @@ more: if (read < pos - off) { dout(" zero gap %llu to %llu\n", off + read, pos); - zero_page_vector_range(page_off + read, - pos - off - read, pages); + ceph_zero_page_vector_range(page_off + read, + pos - off - read, pages); } pos += ret; read = pos - off; @@ -495,8 +339,8 @@ more: /* was original extent fully inside i_size? */ if (pos + left <= inode->i_size) { dout("zero tail\n"); - zero_page_vector_range(page_off + read, len - read, - pages); + ceph_zero_page_vector_range(page_off + read, len - read, + pages); read = len; goto out; } @@ -531,7 +375,7 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data, (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); if (file->f_flags & O_DIRECT) { - pages = get_direct_page_vector(data, num_pages, off, len); + pages = ceph_get_direct_page_vector(data, num_pages, off, len); /* * flush any page cache pages in this range. this @@ -552,13 +396,13 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data, ret = striped_read(inode, off, len, pages, num_pages, checkeof); if (ret >= 0 && (file->f_flags & O_DIRECT) == 0) - ret = copy_page_vector_to_user(pages, data, off, ret); + ret = ceph_copy_page_vector_to_user(pages, data, off, ret); if (ret >= 0) *poff = off + ret; done: if (file->f_flags & O_DIRECT) - put_page_vector(pages, num_pages); + ceph_put_page_vector(pages, num_pages); else ceph_release_page_vector(pages, num_pages); dout("sync_read result %d\n", ret); @@ -594,7 +438,7 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data, { struct inode *inode = file->f_dentry->d_inode; struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_client *client = ceph_inode_to_client(inode); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_osd_request *req; struct page **pages; int num_pages; @@ -642,7 +486,7 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data, */ more: len = left; - req = ceph_osdc_new_request(&client->osdc, &ci->i_layout, + req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, ceph_vino(inode), pos, &len, CEPH_OSD_OP_WRITE, flags, ci->i_snap_realm->cached_context, @@ -655,7 +499,7 @@ more: num_pages = calc_pages_for(pos, len); if (file->f_flags & O_DIRECT) { - pages = get_direct_page_vector(data, num_pages, pos, len); + pages = ceph_get_direct_page_vector(data, num_pages, pos, len); if (IS_ERR(pages)) { ret = PTR_ERR(pages); goto out; @@ -673,7 +517,7 @@ more: ret = PTR_ERR(pages); goto out; } - ret = copy_user_to_page_vector(pages, data, pos, len); + ret = ceph_copy_user_to_page_vector(pages, data, pos, len); if (ret < 0) { ceph_release_page_vector(pages, num_pages); goto out; @@ -689,7 +533,7 @@ more: req->r_num_pages = num_pages; req->r_inode = inode; - ret = ceph_osdc_start_request(&client->osdc, req, false); + ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); if (!ret) { if (req->r_safe_callback) { /* @@ -701,11 +545,11 @@ more: spin_unlock(&ci->i_unsafe_lock); ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR); } - ret = ceph_osdc_wait_request(&client->osdc, req); + ret = ceph_osdc_wait_request(&fsc->client->osdc, req); } if (file->f_flags & O_DIRECT) - put_page_vector(pages, num_pages); + ceph_put_page_vector(pages, num_pages); else if (file->f_flags & O_SYNC) ceph_release_page_vector(pages, num_pages); @@ -814,7 +658,8 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov, struct ceph_file_info *fi = file->private_data; struct inode *inode = file->f_dentry->d_inode; struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc; + struct ceph_osd_client *osdc = + &ceph_sb_to_client(inode->i_sb)->client->osdc; loff_t endoff = pos + iov->iov_len; int want, got = 0; int ret, err; diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 62377ec37edf..1d6a45b5a04c 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1,4 +1,4 @@ -#include "ceph_debug.h" +#include <linux/ceph/ceph_debug.h> #include <linux/module.h> #include <linux/fs.h> @@ -13,7 +13,8 @@ #include <linux/pagevec.h> #include "super.h" -#include "decode.h" +#include "mds_client.h" +#include <linux/ceph/decode.h> /* * Ceph inode operations @@ -384,7 +385,7 @@ void ceph_destroy_inode(struct inode *inode) */ if (ci->i_snap_realm) { struct ceph_mds_client *mdsc = - &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; + ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; struct ceph_snap_realm *realm = ci->i_snap_realm; dout(" dropping residual ref to snap realm %p\n", realm); @@ -685,7 +686,7 @@ static int fill_inode(struct inode *inode, } /* it may be better to set st_size in getattr instead? */ - if (ceph_test_opt(ceph_sb_to_client(inode->i_sb), RBYTES)) + if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), RBYTES)) inode->i_size = ci->i_rbytes; break; default: @@ -901,7 +902,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, struct inode *in = NULL; struct ceph_mds_reply_inode *ininfo; struct ceph_vino vino; - struct ceph_client *client = ceph_sb_to_client(sb); + struct ceph_fs_client *fsc = ceph_sb_to_client(sb); int i = 0; int err = 0; @@ -965,7 +966,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, */ if (rinfo->head->is_dentry && !req->r_aborted && (rinfo->head->is_target || strncmp(req->r_dentry->d_name.name, - client->mount_args->snapdir_name, + fsc->mount_options->snapdir_name, req->r_dentry->d_name.len))) { /* * lookup link rename : null -> possibly existing inode @@ -1533,7 +1534,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) struct inode *parent_inode = dentry->d_parent->d_inode; const unsigned int ia_valid = attr->ia_valid; struct ceph_mds_request *req; - struct ceph_mds_client *mdsc = &ceph_sb_to_client(dentry->d_sb)->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc; int issued; int release = 0, dirtied = 0; int mask = 0; @@ -1728,8 +1729,8 @@ out: */ int ceph_do_getattr(struct inode *inode, int mask) { - struct ceph_client *client = ceph_sb_to_client(inode->i_sb); - struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req; int err; diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index 76e307d2aba1..8888c9ba68db 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -1,8 +1,10 @@ #include <linux/in.h> -#include "ioctl.h" #include "super.h" -#include "ceph_debug.h" +#include "mds_client.h" +#include <linux/ceph/ceph_debug.h> + +#include "ioctl.h" /* @@ -37,7 +39,7 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg) { struct inode *inode = file->f_dentry->d_inode; struct inode *parent_inode = file->f_dentry->d_parent->d_inode; - struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_mds_request *req; struct ceph_ioctl_layout l; int err, i; @@ -90,6 +92,68 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg) } /* + * Set a layout policy on a directory inode. All items in the tree + * rooted at this inode will inherit this layout on creation, + * (It doesn't apply retroactively ) + * unless a subdirectory has its own layout policy. + */ +static long ceph_ioctl_set_layout_policy (struct file *file, void __user *arg) +{ + struct inode *inode = file->f_dentry->d_inode; + struct ceph_mds_request *req; + struct ceph_ioctl_layout l; + int err, i; + struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; + + /* copy and validate */ + if (copy_from_user(&l, arg, sizeof(l))) + return -EFAULT; + + if ((l.object_size & ~PAGE_MASK) || + (l.stripe_unit & ~PAGE_MASK) || + !l.stripe_unit || + (l.object_size && + (unsigned)l.object_size % (unsigned)l.stripe_unit)) + return -EINVAL; + + /* make sure it's a valid data pool */ + if (l.data_pool > 0) { + mutex_lock(&mdsc->mutex); + err = -EINVAL; + for (i = 0; i < mdsc->mdsmap->m_num_data_pg_pools; i++) + if (mdsc->mdsmap->m_data_pg_pools[i] == l.data_pool) { + err = 0; + break; + } + mutex_unlock(&mdsc->mutex); + if (err) + return err; + } + + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETDIRLAYOUT, + USE_AUTH_MDS); + + if (IS_ERR(req)) + return PTR_ERR(req); + req->r_inode = igrab(inode); + + req->r_args.setlayout.layout.fl_stripe_unit = + cpu_to_le32(l.stripe_unit); + req->r_args.setlayout.layout.fl_stripe_count = + cpu_to_le32(l.stripe_count); + req->r_args.setlayout.layout.fl_object_size = + cpu_to_le32(l.object_size); + req->r_args.setlayout.layout.fl_pg_pool = + cpu_to_le32(l.data_pool); + req->r_args.setlayout.layout.fl_pg_preferred = + cpu_to_le32(l.preferred_osd); + + err = ceph_mdsc_do_request(mdsc, inode, req); + ceph_mdsc_put_request(req); + return err; +} + +/* * Return object name, size/offset information, and location (OSD * number, network address) for a given file offset. */ @@ -98,7 +162,8 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) struct ceph_ioctl_dataloc dl; struct inode *inode = file->f_dentry->d_inode; struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc; + struct ceph_osd_client *osdc = + &ceph_sb_to_client(inode->i_sb)->client->osdc; u64 len = 1, olen; u64 tmp; struct ceph_object_layout ol; @@ -174,11 +239,15 @@ long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case CEPH_IOC_SET_LAYOUT: return ceph_ioctl_set_layout(file, (void __user *)arg); + case CEPH_IOC_SET_LAYOUT_POLICY: + return ceph_ioctl_set_layout_policy(file, (void __user *)arg); + case CEPH_IOC_GET_DATALOC: return ceph_ioctl_get_dataloc(file, (void __user *)arg); case CEPH_IOC_LAZYIO: return ceph_ioctl_lazyio(file); } + return -ENOTTY; } diff --git a/fs/ceph/ioctl.h b/fs/ceph/ioctl.h index 88451a3b6857..a6ce54e94eb5 100644 --- a/fs/ceph/ioctl.h +++ b/fs/ceph/ioctl.h @@ -4,7 +4,7 @@ #include <linux/ioctl.h> #include <linux/types.h> -#define CEPH_IOCTL_MAGIC 0x97 +#define CEPH_IOCTL_MAGIC 0x98 /* just use u64 to align sanely on all archs */ struct ceph_ioctl_layout { @@ -17,6 +17,8 @@ struct ceph_ioctl_layout { struct ceph_ioctl_layout) #define CEPH_IOC_SET_LAYOUT _IOW(CEPH_IOCTL_MAGIC, 2, \ struct ceph_ioctl_layout) +#define CEPH_IOC_SET_LAYOUT_POLICY _IOW(CEPH_IOCTL_MAGIC, 5, \ + struct ceph_ioctl_layout) /* * Extract identity, address of the OSD and object storing a given diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index ff4e753aae92..40abde93c345 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -1,11 +1,11 @@ -#include "ceph_debug.h" +#include <linux/ceph/ceph_debug.h> #include <linux/file.h> #include <linux/namei.h> #include "super.h" #include "mds_client.h" -#include "pagelist.h" +#include <linux/ceph/pagelist.h> /** * Implement fcntl and flock locking functions. @@ -16,7 +16,7 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file, { struct inode *inode = file->f_dentry->d_inode; struct ceph_mds_client *mdsc = - &ceph_sb_to_client(inode->i_sb)->mdsc; + ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_mds_request *req; int err; @@ -181,8 +181,9 @@ void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count) * Encode the flock and fcntl locks for the given inode into the pagelist. * Format is: #fcntl locks, sequential fcntl locks, #flock locks, * sequential flock locks. - * Must be called with BLK already held, and the lock numbers should have - * been gathered under the same lock holding window. + * Must be called with lock_flocks() already held. + * If we encounter more of a specific lock type than expected, + * we return the value 1. */ int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist, int num_fcntl_locks, int num_flock_locks) @@ -190,6 +191,8 @@ int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist, struct file_lock *lock; struct ceph_filelock cephlock; int err = 0; + int seen_fcntl = 0; + int seen_flock = 0; dout("encoding %d flock and %d fcntl locks", num_flock_locks, num_fcntl_locks); @@ -198,6 +201,11 @@ int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist, goto fail; for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) { if (lock->fl_flags & FL_POSIX) { + ++seen_fcntl; + if (seen_fcntl > num_fcntl_locks) { + err = -ENOSPC; + goto fail; + } err = lock_to_ceph_filelock(lock, &cephlock); if (err) goto fail; @@ -213,6 +221,11 @@ int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist, goto fail; for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) { if (lock->fl_flags & FL_FLOCK) { + ++seen_flock; + if (seen_flock > num_flock_locks) { + err = -ENOSPC; + goto fail; + } err = lock_to_ceph_filelock(lock, &cephlock); if (err) goto fail; diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index fad95f8f2608..3142b15940c2 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1,17 +1,21 @@ -#include "ceph_debug.h" +#include <linux/ceph/ceph_debug.h> +#include <linux/fs.h> #include <linux/wait.h> #include <linux/slab.h> #include <linux/sched.h> +#include <linux/debugfs.h> +#include <linux/seq_file.h> #include <linux/smp_lock.h> -#include "mds_client.h" -#include "mon_client.h" #include "super.h" -#include "messenger.h" -#include "decode.h" -#include "auth.h" -#include "pagelist.h" +#include "mds_client.h" + +#include <linux/ceph/messenger.h> +#include <linux/ceph/decode.h> +#include <linux/ceph/pagelist.h> +#include <linux/ceph/auth.h> +#include <linux/ceph/debugfs.h> /* * A cluster of MDS (metadata server) daemons is responsible for @@ -286,8 +290,9 @@ void ceph_put_mds_session(struct ceph_mds_session *s) atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1); if (atomic_dec_and_test(&s->s_ref)) { if (s->s_authorizer) - s->s_mdsc->client->monc.auth->ops->destroy_authorizer( - s->s_mdsc->client->monc.auth, s->s_authorizer); + s->s_mdsc->fsc->client->monc.auth->ops->destroy_authorizer( + s->s_mdsc->fsc->client->monc.auth, + s->s_authorizer); kfree(s); } } @@ -344,7 +349,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, s->s_seq = 0; mutex_init(&s->s_mutex); - ceph_con_init(mdsc->client->msgr, &s->s_con); + ceph_con_init(mdsc->fsc->client->msgr, &s->s_con); s->s_con.private = s; s->s_con.ops = &mds_con_ops; s->s_con.peer_name.type = CEPH_ENTITY_TYPE_MDS; @@ -599,7 +604,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc, } else if (req->r_dentry) { struct inode *dir = req->r_dentry->d_parent->d_inode; - if (dir->i_sb != mdsc->client->sb) { + if (dir->i_sb != mdsc->fsc->sb) { /* not this fs! */ inode = req->r_dentry->d_inode; } else if (ceph_snap(dir) != CEPH_NOSNAP) { @@ -884,7 +889,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, __ceph_remove_cap(cap); if (!__ceph_is_any_real_caps(ci)) { struct ceph_mds_client *mdsc = - &ceph_sb_to_client(inode->i_sb)->mdsc; + ceph_sb_to_client(inode->i_sb)->mdsc; spin_lock(&mdsc->cap_dirty_lock); if (!list_empty(&ci->i_dirty_item)) { @@ -1146,7 +1151,7 @@ int ceph_add_cap_releases(struct ceph_mds_client *mdsc, struct ceph_msg *msg, *partial = NULL; struct ceph_mds_cap_release *head; int err = -ENOMEM; - int extra = mdsc->client->mount_args->cap_release_safety; + int extra = mdsc->fsc->mount_options->cap_release_safety; int num; dout("add_cap_releases %p mds%d extra %d\n", session, session->s_mds, @@ -2085,7 +2090,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) /* insert trace into our cache */ mutex_lock(&req->r_fill_mutex); - err = ceph_fill_trace(mdsc->client->sb, req, req->r_session); + err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session); if (err == 0) { if (result == 0 && rinfo->dir_nr) ceph_readdir_prepopulate(req, req->r_session); @@ -2361,19 +2366,35 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, if (recon_state->flock) { int num_fcntl_locks, num_flock_locks; - - lock_kernel(); - ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks); - rec.v2.flock_len = (2*sizeof(u32) + - (num_fcntl_locks+num_flock_locks) * - sizeof(struct ceph_filelock)); - - err = ceph_pagelist_append(pagelist, &rec, reclen); - if (!err) - err = ceph_encode_locks(inode, pagelist, - num_fcntl_locks, - num_flock_locks); - unlock_kernel(); + struct ceph_pagelist_cursor trunc_point; + + ceph_pagelist_set_cursor(pagelist, &trunc_point); + do { + lock_flocks(); + ceph_count_locks(inode, &num_fcntl_locks, + &num_flock_locks); + rec.v2.flock_len = (2*sizeof(u32) + + (num_fcntl_locks+num_flock_locks) * + sizeof(struct ceph_filelock)); + unlock_flocks(); + + /* pre-alloc pagelist */ + ceph_pagelist_truncate(pagelist, &trunc_point); + err = ceph_pagelist_append(pagelist, &rec, reclen); + if (!err) + err = ceph_pagelist_reserve(pagelist, + rec.v2.flock_len); + + /* encode locks */ + if (!err) { + lock_flocks(); + err = ceph_encode_locks(inode, + pagelist, + num_fcntl_locks, + num_flock_locks); + unlock_flocks(); + } + } while (err == -ENOSPC); } else { err = ceph_pagelist_append(pagelist, &rec, reclen); } @@ -2613,7 +2634,7 @@ static void handle_lease(struct ceph_mds_client *mdsc, struct ceph_mds_session *session, struct ceph_msg *msg) { - struct super_block *sb = mdsc->client->sb; + struct super_block *sb = mdsc->fsc->sb; struct inode *inode; struct ceph_inode_info *ci; struct dentry *parent, *dentry; @@ -2891,10 +2912,16 @@ static void delayed_work(struct work_struct *work) schedule_delayed(mdsc); } +int ceph_mdsc_init(struct ceph_fs_client *fsc) -int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client) { - mdsc->client = client; + struct ceph_mds_client *mdsc; + + mdsc = kzalloc(sizeof(struct ceph_mds_client), GFP_NOFS); + if (!mdsc) + return -ENOMEM; + mdsc->fsc = fsc; + fsc->mdsc = mdsc; mutex_init(&mdsc->mutex); mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS); if (mdsc->mdsmap == NULL) @@ -2927,7 +2954,7 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client) INIT_LIST_HEAD(&mdsc->dentry_lru); ceph_caps_init(mdsc); - ceph_adjust_min_caps(mdsc, client->min_caps); + ceph_adjust_min_caps(mdsc, fsc->min_caps); return 0; } @@ -2939,7 +2966,7 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client) static void wait_requests(struct ceph_mds_client *mdsc) { struct ceph_mds_request *req; - struct ceph_client *client = mdsc->client; + struct ceph_fs_client *fsc = mdsc->fsc; mutex_lock(&mdsc->mutex); if (__get_oldest_req(mdsc)) { @@ -2947,7 +2974,7 @@ static void wait_requests(struct ceph_mds_client *mdsc) dout("wait_requests waiting for requests\n"); wait_for_completion_timeout(&mdsc->safe_umount_waiters, - client->mount_args->mount_timeout * HZ); + fsc->client->options->mount_timeout * HZ); /* tear down remaining requests */ mutex_lock(&mdsc->mutex); @@ -3030,7 +3057,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc) { u64 want_tid, want_flush; - if (mdsc->client->mount_state == CEPH_MOUNT_SHUTDOWN) + if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN) return; dout("sync\n"); @@ -3053,7 +3080,7 @@ bool done_closing_sessions(struct ceph_mds_client *mdsc) { int i, n = 0; - if (mdsc->client->mount_state == CEPH_MOUNT_SHUTDOWN) + if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN) return true; mutex_lock(&mdsc->mutex); @@ -3071,8 +3098,8 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc) { struct ceph_mds_session *session; int i; - struct ceph_client *client = mdsc->client; - unsigned long timeout = client->mount_args->mount_timeout * HZ; + struct ceph_fs_client *fsc = mdsc->fsc; + unsigned long timeout = fsc->client->options->mount_timeout * HZ; dout("close_sessions\n"); @@ -3119,7 +3146,7 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc) dout("stopped\n"); } -void ceph_mdsc_stop(struct ceph_mds_client *mdsc) +static void ceph_mdsc_stop(struct ceph_mds_client *mdsc) { dout("stop\n"); cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */ @@ -3129,6 +3156,15 @@ void ceph_mdsc_stop(struct ceph_mds_client *mdsc) ceph_caps_finalize(mdsc); } +void ceph_mdsc_destroy(struct ceph_fs_client *fsc) +{ + struct ceph_mds_client *mdsc = fsc->mdsc; + + ceph_mdsc_stop(mdsc); + fsc->mdsc = NULL; + kfree(mdsc); +} + /* * handle mds map update. @@ -3145,14 +3181,14 @@ void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg) ceph_decode_need(&p, end, sizeof(fsid)+2*sizeof(u32), bad); ceph_decode_copy(&p, &fsid, sizeof(fsid)); - if (ceph_check_fsid(mdsc->client, &fsid) < 0) + if (ceph_check_fsid(mdsc->fsc->client, &fsid) < 0) return; epoch = ceph_decode_32(&p); maplen = ceph_decode_32(&p); dout("handle_map epoch %u len %d\n", epoch, (int)maplen); /* do we need it? */ - ceph_monc_got_mdsmap(&mdsc->client->monc, epoch); + ceph_monc_got_mdsmap(&mdsc->fsc->client->monc, epoch); mutex_lock(&mdsc->mutex); if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) { dout("handle_map epoch %u <= our %u\n", @@ -3176,7 +3212,7 @@ void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg) } else { mdsc->mdsmap = newmap; /* first mds map */ } - mdsc->client->sb->s_maxbytes = mdsc->mdsmap->m_max_file_size; + mdsc->fsc->sb->s_maxbytes = mdsc->mdsmap->m_max_file_size; __wake_requests(mdsc, &mdsc->waiting_for_map); @@ -3277,7 +3313,7 @@ static int get_authorizer(struct ceph_connection *con, { struct ceph_mds_session *s = con->private; struct ceph_mds_client *mdsc = s->s_mdsc; - struct ceph_auth_client *ac = mdsc->client->monc.auth; + struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth; int ret = 0; if (force_new && s->s_authorizer) { @@ -3311,7 +3347,7 @@ static int verify_authorizer_reply(struct ceph_connection *con, int len) { struct ceph_mds_session *s = con->private; struct ceph_mds_client *mdsc = s->s_mdsc; - struct ceph_auth_client *ac = mdsc->client->monc.auth; + struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth; return ac->ops->verify_authorizer_reply(ac, s->s_authorizer, len); } @@ -3320,12 +3356,12 @@ static int invalidate_authorizer(struct ceph_connection *con) { struct ceph_mds_session *s = con->private; struct ceph_mds_client *mdsc = s->s_mdsc; - struct ceph_auth_client *ac = mdsc->client->monc.auth; + struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth; if (ac->ops->invalidate_authorizer) ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS); - return ceph_monc_validate_auth(&mdsc->client->monc); + return ceph_monc_validate_auth(&mdsc->fsc->client->monc); } static const struct ceph_connection_operations mds_con_ops = { @@ -3338,7 +3374,4 @@ static const struct ceph_connection_operations mds_con_ops = { .peer_reset = peer_reset, }; - - - /* eof */ diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index c98267ce6d2a..d66d63c72355 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -8,9 +8,9 @@ #include <linux/rbtree.h> #include <linux/spinlock.h> -#include "types.h" -#include "messenger.h" -#include "mdsmap.h" +#include <linux/ceph/types.h> +#include <linux/ceph/messenger.h> +#include <linux/ceph/mdsmap.h> /* * Some lock dependencies: @@ -26,7 +26,7 @@ * */ -struct ceph_client; +struct ceph_fs_client; struct ceph_cap; /* @@ -230,7 +230,7 @@ struct ceph_mds_request { * mds client state */ struct ceph_mds_client { - struct ceph_client *client; + struct ceph_fs_client *fsc; struct mutex mutex; /* all nested structures */ struct ceph_mdsmap *mdsmap; @@ -289,11 +289,6 @@ struct ceph_mds_client { int caps_avail_count; /* unused, unreserved */ int caps_min_count; /* keep at least this many (unreserved) */ - -#ifdef CONFIG_DEBUG_FS - struct dentry *debugfs_file; -#endif - spinlock_t dentry_lru_lock; struct list_head dentry_lru; int num_dentry; @@ -316,10 +311,9 @@ extern void ceph_put_mds_session(struct ceph_mds_session *s); extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc, struct ceph_msg *msg, int mds); -extern int ceph_mdsc_init(struct ceph_mds_client *mdsc, - struct ceph_client *client); +extern int ceph_mdsc_init(struct ceph_fs_client *fsc); extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc); -extern void ceph_mdsc_stop(struct ceph_mds_client *mdsc); +extern void ceph_mdsc_destroy(struct ceph_fs_client *fsc); extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc); diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index 040be6d1150b..73b7d44e8a35 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c @@ -1,4 +1,4 @@ -#include "ceph_debug.h" +#include <linux/ceph/ceph_debug.h> #include <linux/bug.h> #include <linux/err.h> @@ -6,9 +6,9 @@ #include <linux/slab.h> #include <linux/types.h> -#include "mdsmap.h" -#include "messenger.h" -#include "decode.h" +#include <linux/ceph/mdsmap.h> +#include <linux/ceph/messenger.h> +#include <linux/ceph/decode.h> #include "super.h" @@ -117,7 +117,8 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) } dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n", - i+1, n, global_id, mds, inc, pr_addr(&addr.in_addr), + i+1, n, global_id, mds, inc, + ceph_pr_addr(&addr.in_addr), ceph_mds_state_name(state)); if (mds >= 0 && mds < m->m_max_mds && state > 0) { m->m_info[mds].global_id = global_id; diff --git a/fs/ceph/pagelist.c b/fs/ceph/pagelist.c deleted file mode 100644 index 46a368b6dce5..000000000000 --- a/fs/ceph/pagelist.c +++ /dev/null @@ -1,63 +0,0 @@ - -#include <linux/gfp.h> -#include <linux/pagemap.h> -#include <linux/highmem.h> - -#include "pagelist.h" - -static void ceph_pagelist_unmap_tail(struct ceph_pagelist *pl) -{ - struct page *page = list_entry(pl->head.prev, struct page, - lru); - kunmap(page); -} - -int ceph_pagelist_release(struct ceph_pagelist *pl) -{ - if (pl->mapped_tail) - ceph_pagelist_unmap_tail(pl); - - while (!list_empty(&pl->head)) { - struct page *page = list_first_entry(&pl->head, struct page, - lru); - list_del(&page->lru); - __free_page(page); - } - return 0; -} - -static int ceph_pagelist_addpage(struct ceph_pagelist *pl) -{ - struct page *page = __page_cache_alloc(GFP_NOFS); - if (!page) - return -ENOMEM; - pl->room += PAGE_SIZE; - list_add_tail(&page->lru, &pl->head); - if (pl->mapped_tail) - ceph_pagelist_unmap_tail(pl); - pl->mapped_tail = kmap(page); - return 0; -} - -int ceph_pagelist_append(struct ceph_pagelist *pl, void *buf, size_t len) -{ - while (pl->room < len) { - size_t bit = pl->room; - int ret; - - memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK), - buf, bit); - pl->length += bit; - pl->room -= bit; - buf += bit; - len -= bit; - ret = ceph_pagelist_addpage(pl); - if (ret) - return ret; - } - - memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK), buf, len); - pl->length += len; - pl->room -= len; - return 0; -} diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 190b6c4a6f2b..39c243acd062 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -1,10 +1,12 @@ -#include "ceph_debug.h" +#include <linux/ceph/ceph_debug.h> #include <linux/sort.h> #include <linux/slab.h> #include "super.h" -#include "decode.h" +#include "mds_client.h" + +#include <linux/ceph/decode.h> /* * Snapshots in ceph are driven in large part by cooperation from the @@ -526,7 +528,7 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci, struct ceph_cap_snap *capsnap) { struct inode *inode = &ci->vfs_inode; - struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; BUG_ON(capsnap->writing); capsnap->size = inode->i_size; @@ -747,7 +749,7 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc, struct ceph_mds_session *session, struct ceph_msg *msg) { - struct super_block *sb = mdsc->client->sb; + struct super_block *sb = mdsc->fsc->sb; int mds = session->s_mds; u64 split; int op; diff --git a/fs/ceph/ceph_strings.c b/fs/ceph/strings.c index c6179d3a26a2..cd5097d7c804 100644 --- a/fs/ceph/ceph_strings.c +++ b/fs/ceph/strings.c @@ -1,71 +1,9 @@ /* - * Ceph string constants + * Ceph fs string constants */ -#include "types.h" +#include <linux/module.h> +#include <linux/ceph/types.h> -const char *ceph_entity_type_name(int type) -{ - switch (type) { - case CEPH_ENTITY_TYPE_MDS: return "mds"; - case CEPH_ENTITY_TYPE_OSD: return "osd"; - case CEPH_ENTITY_TYPE_MON: return "mon"; - case CEPH_ENTITY_TYPE_CLIENT: return "client"; - case CEPH_ENTITY_TYPE_AUTH: return "auth"; - default: return "unknown"; - } -} - -const char *ceph_osd_op_name(int op) -{ - switch (op) { - case CEPH_OSD_OP_READ: return "read"; - case CEPH_OSD_OP_STAT: return "stat"; - - case CEPH_OSD_OP_MASKTRUNC: return "masktrunc"; - - case CEPH_OSD_OP_WRITE: return "write"; - case CEPH_OSD_OP_DELETE: return "delete"; - case CEPH_OSD_OP_TRUNCATE: return "truncate"; - case CEPH_OSD_OP_ZERO: return "zero"; - case CEPH_OSD_OP_WRITEFULL: return "writefull"; - case CEPH_OSD_OP_ROLLBACK: return "rollback"; - - case CEPH_OSD_OP_APPEND: return "append"; - case CEPH_OSD_OP_STARTSYNC: return "startsync"; - case CEPH_OSD_OP_SETTRUNC: return "settrunc"; - case CEPH_OSD_OP_TRIMTRUNC: return "trimtrunc"; - - case CEPH_OSD_OP_TMAPUP: return "tmapup"; - case CEPH_OSD_OP_TMAPGET: return "tmapget"; - case CEPH_OSD_OP_TMAPPUT: return "tmapput"; - - case CEPH_OSD_OP_GETXATTR: return "getxattr"; - case CEPH_OSD_OP_GETXATTRS: return "getxattrs"; - case CEPH_OSD_OP_SETXATTR: return "setxattr"; - case CEPH_OSD_OP_SETXATTRS: return "setxattrs"; - case CEPH_OSD_OP_RESETXATTRS: return "resetxattrs"; - case CEPH_OSD_OP_RMXATTR: return "rmxattr"; - case CEPH_OSD_OP_CMPXATTR: return "cmpxattr"; - - case CEPH_OSD_OP_PULL: return "pull"; - case CEPH_OSD_OP_PUSH: return "push"; - case CEPH_OSD_OP_BALANCEREADS: return "balance-reads"; - case CEPH_OSD_OP_UNBALANCEREADS: return "unbalance-reads"; - case CEPH_OSD_OP_SCRUB: return "scrub"; - - case CEPH_OSD_OP_WRLOCK: return "wrlock"; - case CEPH_OSD_OP_WRUNLOCK: return "wrunlock"; - case CEPH_OSD_OP_RDLOCK: return "rdlock"; - case CEPH_OSD_OP_RDUNLOCK: return "rdunlock"; - case CEPH_OSD_OP_UPLOCK: return "uplock"; - case CEPH_OSD_OP_DNLOCK: return "dnlock"; - - case CEPH_OSD_OP_CALL: return "call"; - - case CEPH_OSD_OP_PGLS: return "pgls"; - } - return "???"; -} const char *ceph_mds_state_name(int s) { @@ -177,17 +115,3 @@ const char *ceph_snap_op_name(int o) } return "???"; } - -const char *ceph_pool_op_name(int op) -{ - switch (op) { - case POOL_OP_CREATE: return "create"; - case POOL_OP_DELETE: return "delete"; - case POOL_OP_AUID_CHANGE: return "auid change"; - case POOL_OP_CREATE_SNAP: return "create snap"; - case POOL_OP_DELETE_SNAP: return "delete snap"; - case POOL_OP_CREATE_UNMANAGED_SNAP: return "create unmanaged snap"; - case POOL_OP_DELETE_UNMANAGED_SNAP: return "delete unmanaged snap"; - } - return "???"; -} diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 9922628532b2..d6e0e0421891 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -1,5 +1,5 @@ -#include "ceph_debug.h" +#include <linux/ceph/ceph_debug.h> #include <linux/backing-dev.h> #include <linux/ctype.h> @@ -15,10 +15,13 @@ #include <linux/statfs.h> #include <linux/string.h> -#include "decode.h" #include "super.h" -#include "mon_client.h" -#include "auth.h" +#include "mds_client.h" + +#include <linux/ceph/decode.h> +#include <linux/ceph/mon_client.h> +#include <linux/ceph/auth.h> +#include <linux/ceph/debugfs.h> /* * Ceph superblock operations @@ -26,36 +29,22 @@ * Handle the basics of mounting, unmounting. */ - -/* - * find filename portion of a path (/foo/bar/baz -> baz) - */ -const char *ceph_file_part(const char *s, int len) -{ - const char *e = s + len; - - while (e != s && *(e-1) != '/') - e--; - return e; -} - - /* * super ops */ static void ceph_put_super(struct super_block *s) { - struct ceph_client *client = ceph_sb_to_client(s); + struct ceph_fs_client *fsc = ceph_sb_to_client(s); dout("put_super\n"); - ceph_mdsc_close_sessions(&client->mdsc); + ceph_mdsc_close_sessions(fsc->mdsc); /* * ensure we release the bdi before put_anon_super releases * the device name. */ - if (s->s_bdi == &client->backing_dev_info) { - bdi_unregister(&client->backing_dev_info); + if (s->s_bdi == &fsc->backing_dev_info) { + bdi_unregister(&fsc->backing_dev_info); s->s_bdi = NULL; } @@ -64,14 +53,14 @@ static void ceph_put_super(struct super_block *s) static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf) { - struct ceph_client *client = ceph_inode_to_client(dentry->d_inode); - struct ceph_monmap *monmap = client->monc.monmap; + struct ceph_fs_client *fsc = ceph_inode_to_client(dentry->d_inode); + struct ceph_monmap *monmap = fsc->client->monc.monmap; struct ceph_statfs st; u64 fsid; int err; dout("statfs\n"); - err = ceph_monc_do_statfs(&client->monc, &st); + err = ceph_monc_do_statfs(&fsc->client->monc, &st); if (err < 0) return err; @@ -104,238 +93,28 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf) static int ceph_sync_fs(struct super_block *sb, int wait) { - struct ceph_client *client = ceph_sb_to_client(sb); + struct ceph_fs_client *fsc = ceph_sb_to_client(sb); if (!wait) { dout("sync_fs (non-blocking)\n"); - ceph_flush_dirty_caps(&client->mdsc); + ceph_flush_dirty_caps(fsc->mdsc); dout("sync_fs (non-blocking) done\n"); return 0; } dout("sync_fs (blocking)\n"); - ceph_osdc_sync(&ceph_sb_to_client(sb)->osdc); - ceph_mdsc_sync(&ceph_sb_to_client(sb)->mdsc); + ceph_osdc_sync(&fsc->client->osdc); + ceph_mdsc_sync(fsc->mdsc); dout("sync_fs (blocking) done\n"); return 0; } -static int default_congestion_kb(void) -{ - int congestion_kb; - - /* - * Copied from NFS - * - * congestion size, scale with available memory. - * - * 64MB: 8192k - * 128MB: 11585k - * 256MB: 16384k - * 512MB: 23170k - * 1GB: 32768k - * 2GB: 46340k - * 4GB: 65536k - * 8GB: 92681k - * 16GB: 131072k - * - * This allows larger machines to have larger/more transfers. - * Limit the default to 256M - */ - congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10); - if (congestion_kb > 256*1024) - congestion_kb = 256*1024; - - return congestion_kb; -} - -/** - * ceph_show_options - Show mount options in /proc/mounts - * @m: seq_file to write to - * @mnt: mount descriptor - */ -static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt) -{ - struct ceph_client *client = ceph_sb_to_client(mnt->mnt_sb); - struct ceph_mount_args *args = client->mount_args; - - if (args->flags & CEPH_OPT_FSID) - seq_printf(m, ",fsid=%pU", &args->fsid); - if (args->flags & CEPH_OPT_NOSHARE) - seq_puts(m, ",noshare"); - if (args->flags & CEPH_OPT_DIRSTAT) - seq_puts(m, ",dirstat"); - if ((args->flags & CEPH_OPT_RBYTES) == 0) - seq_puts(m, ",norbytes"); - if (args->flags & CEPH_OPT_NOCRC) - seq_puts(m, ",nocrc"); - if (args->flags & CEPH_OPT_NOASYNCREADDIR) - seq_puts(m, ",noasyncreaddir"); - - if (args->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT) - seq_printf(m, ",mount_timeout=%d", args->mount_timeout); - if (args->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT) - seq_printf(m, ",osd_idle_ttl=%d", args->osd_idle_ttl); - if (args->osd_timeout != CEPH_OSD_TIMEOUT_DEFAULT) - seq_printf(m, ",osdtimeout=%d", args->osd_timeout); - if (args->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT) - seq_printf(m, ",osdkeepalivetimeout=%d", - args->osd_keepalive_timeout); - if (args->wsize) - seq_printf(m, ",wsize=%d", args->wsize); - if (args->rsize != CEPH_MOUNT_RSIZE_DEFAULT) - seq_printf(m, ",rsize=%d", args->rsize); - if (args->congestion_kb != default_congestion_kb()) - seq_printf(m, ",write_congestion_kb=%d", args->congestion_kb); - if (args->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT) - seq_printf(m, ",caps_wanted_delay_min=%d", - args->caps_wanted_delay_min); - if (args->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT) - seq_printf(m, ",caps_wanted_delay_max=%d", - args->caps_wanted_delay_max); - if (args->cap_release_safety != CEPH_CAP_RELEASE_SAFETY_DEFAULT) - seq_printf(m, ",cap_release_safety=%d", - args->cap_release_safety); - if (args->max_readdir != CEPH_MAX_READDIR_DEFAULT) - seq_printf(m, ",readdir_max_entries=%d", args->max_readdir); - if (args->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT) - seq_printf(m, ",readdir_max_bytes=%d", args->max_readdir_bytes); - if (strcmp(args->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT)) - seq_printf(m, ",snapdirname=%s", args->snapdir_name); - if (args->name) - seq_printf(m, ",name=%s", args->name); - if (args->secret) - seq_puts(m, ",secret=<hidden>"); - return 0; -} - -/* - * caches - */ -struct kmem_cache *ceph_inode_cachep; -struct kmem_cache *ceph_cap_cachep; -struct kmem_cache *ceph_dentry_cachep; -struct kmem_cache *ceph_file_cachep; - -static void ceph_inode_init_once(void *foo) -{ - struct ceph_inode_info *ci = foo; - inode_init_once(&ci->vfs_inode); -} - -static int __init init_caches(void) -{ - ceph_inode_cachep = kmem_cache_create("ceph_inode_info", - sizeof(struct ceph_inode_info), - __alignof__(struct ceph_inode_info), - (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD), - ceph_inode_init_once); - if (ceph_inode_cachep == NULL) - return -ENOMEM; - - ceph_cap_cachep = KMEM_CACHE(ceph_cap, - SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); - if (ceph_cap_cachep == NULL) - goto bad_cap; - - ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info, - SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); - if (ceph_dentry_cachep == NULL) - goto bad_dentry; - - ceph_file_cachep = KMEM_CACHE(ceph_file_info, - SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); - if (ceph_file_cachep == NULL) - goto bad_file; - - return 0; - -bad_file: - kmem_cache_destroy(ceph_dentry_cachep); -bad_dentry: - kmem_cache_destroy(ceph_cap_cachep); -bad_cap: - kmem_cache_destroy(ceph_inode_cachep); - return -ENOMEM; -} - -static void destroy_caches(void) -{ - kmem_cache_destroy(ceph_inode_cachep); - kmem_cache_destroy(ceph_cap_cachep); - kmem_cache_destroy(ceph_dentry_cachep); - kmem_cache_destroy(ceph_file_cachep); -} - - -/* - * ceph_umount_begin - initiate forced umount. Tear down down the - * mount, skipping steps that may hang while waiting for server(s). - */ -static void ceph_umount_begin(struct super_block *sb) -{ - struct ceph_client *client = ceph_sb_to_client(sb); - - dout("ceph_umount_begin - starting forced umount\n"); - if (!client) - return; - client->mount_state = CEPH_MOUNT_SHUTDOWN; - return; -} - -static const struct super_operations ceph_super_ops = { - .alloc_inode = ceph_alloc_inode, - .destroy_inode = ceph_destroy_inode, - .write_inode = ceph_write_inode, - .sync_fs = ceph_sync_fs, - .put_super = ceph_put_super, - .show_options = ceph_show_options, - .statfs = ceph_statfs, - .umount_begin = ceph_umount_begin, -}; - - -const char *ceph_msg_type_name(int type) -{ - switch (type) { - case CEPH_MSG_SHUTDOWN: return "shutdown"; - case CEPH_MSG_PING: return "ping"; - case CEPH_MSG_AUTH: return "auth"; - case CEPH_MSG_AUTH_REPLY: return "auth_reply"; - case CEPH_MSG_MON_MAP: return "mon_map"; - case CEPH_MSG_MON_GET_MAP: return "mon_get_map"; - case CEPH_MSG_MON_SUBSCRIBE: return "mon_subscribe"; - case CEPH_MSG_MON_SUBSCRIBE_ACK: return "mon_subscribe_ack"; - case CEPH_MSG_STATFS: return "statfs"; - case CEPH_MSG_STATFS_REPLY: return "statfs_reply"; - case CEPH_MSG_MDS_MAP: return "mds_map"; - case CEPH_MSG_CLIENT_SESSION: return "client_session"; - case CEPH_MSG_CLIENT_RECONNECT: return "client_reconnect"; - case CEPH_MSG_CLIENT_REQUEST: return "client_request"; - case CEPH_MSG_CLIENT_REQUEST_FORWARD: return "client_request_forward"; - case CEPH_MSG_CLIENT_REPLY: return "client_reply"; - case CEPH_MSG_CLIENT_CAPS: return "client_caps"; - case CEPH_MSG_CLIENT_CAPRELEASE: return "client_cap_release"; - case CEPH_MSG_CLIENT_SNAP: return "client_snap"; - case CEPH_MSG_CLIENT_LEASE: return "client_lease"; - case CEPH_MSG_OSD_MAP: return "osd_map"; - case CEPH_MSG_OSD_OP: return "osd_op"; - case CEPH_MSG_OSD_OPREPLY: return "osd_opreply"; - default: return "unknown"; - } -} - - /* * mount options */ enum { Opt_wsize, Opt_rsize, - Opt_osdtimeout, - Opt_osdkeepalivetimeout, - Opt_mount_timeout, - Opt_osd_idle_ttl, Opt_caps_wanted_delay_min, Opt_caps_wanted_delay_max, Opt_cap_release_safety, @@ -344,29 +123,19 @@ enum { Opt_congestion_kb, Opt_last_int, /* int args above */ - Opt_fsid, Opt_snapdirname, - Opt_name, - Opt_secret, Opt_last_string, /* string args above */ - Opt_ip, - Opt_noshare, Opt_dirstat, Opt_nodirstat, Opt_rbytes, Opt_norbytes, - Opt_nocrc, Opt_noasyncreaddir, }; -static match_table_t arg_tokens = { +static match_table_t fsopt_tokens = { {Opt_wsize, "wsize=%d"}, {Opt_rsize, "rsize=%d"}, - {Opt_osdtimeout, "osdtimeout=%d"}, - {Opt_osdkeepalivetimeout, "osdkeepalive=%d"}, - {Opt_mount_timeout, "mount_timeout=%d"}, - {Opt_osd_idle_ttl, "osd_idle_ttl=%d"}, {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"}, {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"}, {Opt_cap_release_safety, "cap_release_safety=%d"}, @@ -374,403 +143,459 @@ static match_table_t arg_tokens = { {Opt_readdir_max_bytes, "readdir_max_bytes=%d"}, {Opt_congestion_kb, "write_congestion_kb=%d"}, /* int args above */ - {Opt_fsid, "fsid=%s"}, {Opt_snapdirname, "snapdirname=%s"}, - {Opt_name, "name=%s"}, - {Opt_secret, "secret=%s"}, /* string args above */ - {Opt_ip, "ip=%s"}, - {Opt_noshare, "noshare"}, {Opt_dirstat, "dirstat"}, {Opt_nodirstat, "nodirstat"}, {Opt_rbytes, "rbytes"}, {Opt_norbytes, "norbytes"}, - {Opt_nocrc, "nocrc"}, {Opt_noasyncreaddir, "noasyncreaddir"}, {-1, NULL} }; -static int parse_fsid(const char *str, struct ceph_fsid *fsid) +static int parse_fsopt_token(char *c, void *private) { - int i = 0; - char tmp[3]; - int err = -EINVAL; - int d; - - dout("parse_fsid '%s'\n", str); - tmp[2] = 0; - while (*str && i < 16) { - if (ispunct(*str)) { - str++; - continue; + struct ceph_mount_options *fsopt = private; + substring_t argstr[MAX_OPT_ARGS]; + int token, intval, ret; + + token = match_token((char *)c, fsopt_tokens, argstr); + if (token < 0) + return -EINVAL; + + if (token < Opt_last_int) { + ret = match_int(&argstr[0], &intval); + if (ret < 0) { + pr_err("bad mount option arg (not int) " + "at '%s'\n", c); + return ret; } - if (!isxdigit(str[0]) || !isxdigit(str[1])) - break; - tmp[0] = str[0]; - tmp[1] = str[1]; - if (sscanf(tmp, "%x", &d) < 1) - break; - fsid->fsid[i] = d & 0xff; - i++; - str += 2; + dout("got int token %d val %d\n", token, intval); + } else if (token > Opt_last_int && token < Opt_last_string) { + dout("got string token %d val %s\n", token, + argstr[0].from); + } else { + dout("got token %d\n", token); } - if (i == 16) - err = 0; - dout("parse_fsid ret %d got fsid %pU", err, fsid); - return err; + switch (token) { + case Opt_snapdirname: + kfree(fsopt->snapdir_name); + fsopt->snapdir_name = kstrndup(argstr[0].from, + argstr[0].to-argstr[0].from, + GFP_KERNEL); + if (!fsopt->snapdir_name) + return -ENOMEM; + break; + + /* misc */ + case Opt_wsize: + fsopt->wsize = intval; + break; + case Opt_rsize: + fsopt->rsize = intval; + break; + case Opt_caps_wanted_delay_min: + fsopt->caps_wanted_delay_min = intval; + break; + case Opt_caps_wanted_delay_max: + fsopt->caps_wanted_delay_max = intval; + break; + case Opt_readdir_max_entries: + fsopt->max_readdir = intval; + break; + case Opt_readdir_max_bytes: + fsopt->max_readdir_bytes = intval; + break; + case Opt_congestion_kb: + fsopt->congestion_kb = intval; + break; + case Opt_dirstat: + fsopt->flags |= CEPH_MOUNT_OPT_DIRSTAT; + break; + case Opt_nodirstat: + fsopt->flags &= ~CEPH_MOUNT_OPT_DIRSTAT; + break; + case Opt_rbytes: + fsopt->flags |= CEPH_MOUNT_OPT_RBYTES; + break; + case Opt_norbytes: + fsopt->flags &= ~CEPH_MOUNT_OPT_RBYTES; + break; + case Opt_noasyncreaddir: + fsopt->flags |= CEPH_MOUNT_OPT_NOASYNCREADDIR; + break; + default: + BUG_ON(token); + } + return 0; } -static struct ceph_mount_args *parse_mount_args(int flags, char *options, - const char *dev_name, - const char **path) +static void destroy_mount_options(struct ceph_mount_options *args) { - struct ceph_mount_args *args; - const char *c; - int err = -ENOMEM; - substring_t argstr[MAX_OPT_ARGS]; + dout("destroy_mount_options %p\n", args); + kfree(args->snapdir_name); + kfree(args); +} - args = kzalloc(sizeof(*args), GFP_KERNEL); - if (!args) - return ERR_PTR(-ENOMEM); - args->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*args->mon_addr), - GFP_KERNEL); - if (!args->mon_addr) - goto out; +static int strcmp_null(const char *s1, const char *s2) +{ + if (!s1 && !s2) + return 0; + if (s1 && !s2) + return -1; + if (!s1 && s2) + return 1; + return strcmp(s1, s2); +} - dout("parse_mount_args %p, dev_name '%s'\n", args, dev_name); - - /* start with defaults */ - args->sb_flags = flags; - args->flags = CEPH_OPT_DEFAULT; - args->osd_timeout = CEPH_OSD_TIMEOUT_DEFAULT; - args->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT; - args->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */ - args->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; /* seconds */ - args->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT; - args->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT; - args->rsize = CEPH_MOUNT_RSIZE_DEFAULT; - args->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL); - args->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT; - args->max_readdir = CEPH_MAX_READDIR_DEFAULT; - args->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT; - args->congestion_kb = default_congestion_kb(); - - /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */ - err = -EINVAL; - if (!dev_name) - goto out; - *path = strstr(dev_name, ":/"); - if (*path == NULL) { - pr_err("device name is missing path (no :/ in %s)\n", - dev_name); - goto out; - } +static int compare_mount_options(struct ceph_mount_options *new_fsopt, + struct ceph_options *new_opt, + struct ceph_fs_client *fsc) +{ + struct ceph_mount_options *fsopt1 = new_fsopt; + struct ceph_mount_options *fsopt2 = fsc->mount_options; + int ofs = offsetof(struct ceph_mount_options, snapdir_name); + int ret; - /* get mon ip(s) */ - err = ceph_parse_ips(dev_name, *path, args->mon_addr, - CEPH_MAX_MON, &args->num_mon); - if (err < 0) - goto out; + ret = memcmp(fsopt1, fsopt2, ofs); + if (ret) + return ret; + + ret = strcmp_null(fsopt1->snapdir_name, fsopt2->snapdir_name); + if (ret) + return ret; + + return ceph_compare_options(new_opt, fsc->client); +} + +static int parse_mount_options(struct ceph_mount_options **pfsopt, + struct ceph_options **popt, + int flags, char *options, + const char *dev_name, + const char **path) +{ + struct ceph_mount_options *fsopt; + const char *dev_name_end; + int err = -ENOMEM; + + fsopt = kzalloc(sizeof(*fsopt), GFP_KERNEL); + if (!fsopt) + return -ENOMEM; + + dout("parse_mount_options %p, dev_name '%s'\n", fsopt, dev_name); + + fsopt->sb_flags = flags; + fsopt->flags = CEPH_MOUNT_OPT_DEFAULT; + + fsopt->rsize = CEPH_MOUNT_RSIZE_DEFAULT; + fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL); + fsopt->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT; + fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT; + fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT; + fsopt->congestion_kb = default_congestion_kb(); + + /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */ + err = -EINVAL; + if (!dev_name) + goto out; + *path = strstr(dev_name, ":/"); + if (*path == NULL) { + pr_err("device name is missing path (no :/ in %s)\n", + dev_name); + goto out; + } + dev_name_end = *path; + dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name); /* path on server */ *path += 2; dout("server path '%s'\n", *path); - /* parse mount options */ - while ((c = strsep(&options, ",")) != NULL) { - int token, intval, ret; - if (!*c) - continue; - err = -EINVAL; - token = match_token((char *)c, arg_tokens, argstr); - if (token < 0) { - pr_err("bad mount option at '%s'\n", c); - goto out; - } - if (token < Opt_last_int) { - ret = match_int(&argstr[0], &intval); - if (ret < 0) { - pr_err("bad mount option arg (not int) " - "at '%s'\n", c); - continue; - } - dout("got int token %d val %d\n", token, intval); - } else if (token > Opt_last_int && token < Opt_last_string) { - dout("got string token %d val %s\n", token, - argstr[0].from); - } else { - dout("got token %d\n", token); - } - switch (token) { - case Opt_ip: - err = ceph_parse_ips(argstr[0].from, - argstr[0].to, - &args->my_addr, - 1, NULL); - if (err < 0) - goto out; - args->flags |= CEPH_OPT_MYIP; - break; - - case Opt_fsid: - err = parse_fsid(argstr[0].from, &args->fsid); - if (err == 0) - args->flags |= CEPH_OPT_FSID; - break; - case Opt_snapdirname: - kfree(args->snapdir_name); - args->snapdir_name = kstrndup(argstr[0].from, - argstr[0].to-argstr[0].from, - GFP_KERNEL); - break; - case Opt_name: - args->name = kstrndup(argstr[0].from, - argstr[0].to-argstr[0].from, - GFP_KERNEL); - break; - case Opt_secret: - args->secret = kstrndup(argstr[0].from, - argstr[0].to-argstr[0].from, - GFP_KERNEL); - break; - - /* misc */ - case Opt_wsize: - args->wsize = intval; - break; - case Opt_rsize: - args->rsize = intval; - break; - case Opt_osdtimeout: - args->osd_timeout = intval; - break; - case Opt_osdkeepalivetimeout: - args->osd_keepalive_timeout = intval; - break; - case Opt_osd_idle_ttl: - args->osd_idle_ttl = intval; - break; - case Opt_mount_timeout: - args->mount_timeout = intval; - break; - case Opt_caps_wanted_delay_min: - args->caps_wanted_delay_min = intval; - break; - case Opt_caps_wanted_delay_max: - args->caps_wanted_delay_max = intval; - break; - case Opt_readdir_max_entries: - args->max_readdir = intval; - break; - case Opt_readdir_max_bytes: - args->max_readdir_bytes = intval; - break; - case Opt_congestion_kb: - args->congestion_kb = intval; - break; - - case Opt_noshare: - args->flags |= CEPH_OPT_NOSHARE; - break; - - case Opt_dirstat: - args->flags |= CEPH_OPT_DIRSTAT; - break; - case Opt_nodirstat: - args->flags &= ~CEPH_OPT_DIRSTAT; - break; - case Opt_rbytes: - args->flags |= CEPH_OPT_RBYTES; - break; - case Opt_norbytes: - args->flags &= ~CEPH_OPT_RBYTES; - break; - case Opt_nocrc: - args->flags |= CEPH_OPT_NOCRC; - break; - case Opt_noasyncreaddir: - args->flags |= CEPH_OPT_NOASYNCREADDIR; - break; - - default: - BUG_ON(token); - } - } - return args; + err = ceph_parse_options(popt, options, dev_name, dev_name_end, + parse_fsopt_token, (void *)fsopt); + if (err) + goto out; + + /* success */ + *pfsopt = fsopt; + return 0; out: - kfree(args->mon_addr); - kfree(args); - return ERR_PTR(err); + destroy_mount_options(fsopt); + return err; } -static void destroy_mount_args(struct ceph_mount_args *args) +/** + * ceph_show_options - Show mount options in /proc/mounts + * @m: seq_file to write to + * @mnt: mount descriptor + */ +static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt) { - dout("destroy_mount_args %p\n", args); - kfree(args->snapdir_name); - args->snapdir_name = NULL; - kfree(args->name); - args->name = NULL; - kfree(args->secret); - args->secret = NULL; - kfree(args); + struct ceph_fs_client *fsc = ceph_sb_to_client(mnt->mnt_sb); + struct ceph_mount_options *fsopt = fsc->mount_options; + struct ceph_options *opt = fsc->client->options; + + if (opt->flags & CEPH_OPT_FSID) + seq_printf(m, ",fsid=%pU", &opt->fsid); + if (opt->flags & CEPH_OPT_NOSHARE) + seq_puts(m, ",noshare"); + if (opt->flags & CEPH_OPT_NOCRC) + seq_puts(m, ",nocrc"); + + if (opt->name) + seq_printf(m, ",name=%s", opt->name); + if (opt->secret) + seq_puts(m, ",secret=<hidden>"); + + if (opt->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT) + seq_printf(m, ",mount_timeout=%d", opt->mount_timeout); + if (opt->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT) + seq_printf(m, ",osd_idle_ttl=%d", opt->osd_idle_ttl); + if (opt->osd_timeout != CEPH_OSD_TIMEOUT_DEFAULT) + seq_printf(m, ",osdtimeout=%d", opt->osd_timeout); + if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT) + seq_printf(m, ",osdkeepalivetimeout=%d", + opt->osd_keepalive_timeout); + + if (fsopt->flags & CEPH_MOUNT_OPT_DIRSTAT) + seq_puts(m, ",dirstat"); + if ((fsopt->flags & CEPH_MOUNT_OPT_RBYTES) == 0) + seq_puts(m, ",norbytes"); + if (fsopt->flags & CEPH_MOUNT_OPT_NOASYNCREADDIR) + seq_puts(m, ",noasyncreaddir"); + + if (fsopt->wsize) + seq_printf(m, ",wsize=%d", fsopt->wsize); + if (fsopt->rsize != CEPH_MOUNT_RSIZE_DEFAULT) + seq_printf(m, ",rsize=%d", fsopt->rsize); + if (fsopt->congestion_kb != default_congestion_kb()) + seq_printf(m, ",write_congestion_kb=%d", fsopt->congestion_kb); + if (fsopt->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT) + seq_printf(m, ",caps_wanted_delay_min=%d", + fsopt->caps_wanted_delay_min); + if (fsopt->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT) + seq_printf(m, ",caps_wanted_delay_max=%d", + fsopt->caps_wanted_delay_max); + if (fsopt->cap_release_safety != CEPH_CAP_RELEASE_SAFETY_DEFAULT) + seq_printf(m, ",cap_release_safety=%d", + fsopt->cap_release_safety); + if (fsopt->max_readdir != CEPH_MAX_READDIR_DEFAULT) + seq_printf(m, ",readdir_max_entries=%d", fsopt->max_readdir); + if (fsopt->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT) + seq_printf(m, ",readdir_max_bytes=%d", fsopt->max_readdir_bytes); + if (strcmp(fsopt->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT)) + seq_printf(m, ",snapdirname=%s", fsopt->snapdir_name); + return 0; } /* - * create a fresh client instance + * handle any mon messages the standard library doesn't understand. + * return error if we don't either. */ -static struct ceph_client *ceph_create_client(struct ceph_mount_args *args) +static int extra_mon_dispatch(struct ceph_client *client, struct ceph_msg *msg) { - struct ceph_client *client; + struct ceph_fs_client *fsc = client->private; + int type = le16_to_cpu(msg->hdr.type); + + switch (type) { + case CEPH_MSG_MDS_MAP: + ceph_mdsc_handle_map(fsc->mdsc, msg); + return 0; + + default: + return -1; + } +} + +/* + * create a new fs client + */ +struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, + struct ceph_options *opt) +{ + struct ceph_fs_client *fsc; int err = -ENOMEM; - client = kzalloc(sizeof(*client), GFP_KERNEL); - if (client == NULL) + fsc = kzalloc(sizeof(*fsc), GFP_KERNEL); + if (!fsc) return ERR_PTR(-ENOMEM); - mutex_init(&client->mount_mutex); - - init_waitqueue_head(&client->auth_wq); + fsc->client = ceph_create_client(opt, fsc); + if (IS_ERR(fsc->client)) { + err = PTR_ERR(fsc->client); + goto fail; + } + fsc->client->extra_mon_dispatch = extra_mon_dispatch; + fsc->client->supported_features |= CEPH_FEATURE_FLOCK; + fsc->client->monc.want_mdsmap = 1; - client->sb = NULL; - client->mount_state = CEPH_MOUNT_MOUNTING; - client->mount_args = args; + fsc->mount_options = fsopt; - client->msgr = NULL; + fsc->sb = NULL; + fsc->mount_state = CEPH_MOUNT_MOUNTING; - client->auth_err = 0; - atomic_long_set(&client->writeback_count, 0); + atomic_long_set(&fsc->writeback_count, 0); - err = bdi_init(&client->backing_dev_info); + err = bdi_init(&fsc->backing_dev_info); if (err < 0) - goto fail; + goto fail_client; err = -ENOMEM; - client->wb_wq = create_workqueue("ceph-writeback"); - if (client->wb_wq == NULL) + fsc->wb_wq = create_workqueue("ceph-writeback"); + if (fsc->wb_wq == NULL) goto fail_bdi; - client->pg_inv_wq = create_singlethread_workqueue("ceph-pg-invalid"); - if (client->pg_inv_wq == NULL) + fsc->pg_inv_wq = create_singlethread_workqueue("ceph-pg-invalid"); + if (fsc->pg_inv_wq == NULL) goto fail_wb_wq; - client->trunc_wq = create_singlethread_workqueue("ceph-trunc"); - if (client->trunc_wq == NULL) + fsc->trunc_wq = create_singlethread_workqueue("ceph-trunc"); + if (fsc->trunc_wq == NULL) goto fail_pg_inv_wq; /* set up mempools */ err = -ENOMEM; - client->wb_pagevec_pool = mempool_create_kmalloc_pool(10, - client->mount_args->wsize >> PAGE_CACHE_SHIFT); - if (!client->wb_pagevec_pool) + fsc->wb_pagevec_pool = mempool_create_kmalloc_pool(10, + fsc->mount_options->wsize >> PAGE_CACHE_SHIFT); + if (!fsc->wb_pagevec_pool) goto fail_trunc_wq; /* caps */ - client->min_caps = args->max_readdir; + fsc->min_caps = fsopt->max_readdir; + + return fsc; - /* subsystems */ - err = ceph_monc_init(&client->monc, client); - if (err < 0) - goto fail_mempool; - err = ceph_osdc_init(&client->osdc, client); - if (err < 0) - goto fail_monc; - err = ceph_mdsc_init(&client->mdsc, client); - if (err < 0) - goto fail_osdc; - return client; - -fail_osdc: - ceph_osdc_stop(&client->osdc); -fail_monc: - ceph_monc_stop(&client->monc); -fail_mempool: - mempool_destroy(client->wb_pagevec_pool); fail_trunc_wq: - destroy_workqueue(client->trunc_wq); + destroy_workqueue(fsc->trunc_wq); fail_pg_inv_wq: - destroy_workqueue(client->pg_inv_wq); + destroy_workqueue(fsc->pg_inv_wq); fail_wb_wq: - destroy_workqueue(client->wb_wq); + destroy_workqueue(fsc->wb_wq); fail_bdi: - bdi_destroy(&client->backing_dev_info); + bdi_destroy(&fsc->backing_dev_info); +fail_client: + ceph_destroy_client(fsc->client); fail: - kfree(client); + kfree(fsc); return ERR_PTR(err); } -static void ceph_destroy_client(struct ceph_client *client) +void destroy_fs_client(struct ceph_fs_client *fsc) { - dout("destroy_client %p\n", client); + dout("destroy_fs_client %p\n", fsc); - /* unmount */ - ceph_mdsc_stop(&client->mdsc); - ceph_osdc_stop(&client->osdc); + destroy_workqueue(fsc->wb_wq); + destroy_workqueue(fsc->pg_inv_wq); + destroy_workqueue(fsc->trunc_wq); - /* - * make sure mds and osd connections close out before destroying - * the auth module, which is needed to free those connections' - * ceph_authorizers. - */ - ceph_msgr_flush(); - - ceph_monc_stop(&client->monc); + bdi_destroy(&fsc->backing_dev_info); - ceph_debugfs_client_cleanup(client); - destroy_workqueue(client->wb_wq); - destroy_workqueue(client->pg_inv_wq); - destroy_workqueue(client->trunc_wq); + mempool_destroy(fsc->wb_pagevec_pool); - bdi_destroy(&client->backing_dev_info); + destroy_mount_options(fsc->mount_options); - if (client->msgr) - ceph_messenger_destroy(client->msgr); - mempool_destroy(client->wb_pagevec_pool); + ceph_fs_debugfs_cleanup(fsc); - destroy_mount_args(client->mount_args); + ceph_destroy_client(fsc->client); - kfree(client); - dout("destroy_client %p done\n", client); + kfree(fsc); + dout("destroy_fs_client %p done\n", fsc); } /* - * Initially learn our fsid, or verify an fsid matches. + * caches */ -int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid) +struct kmem_cache *ceph_inode_cachep; +struct kmem_cache *ceph_cap_cachep; +struct kmem_cache *ceph_dentry_cachep; +struct kmem_cache *ceph_file_cachep; + +static void ceph_inode_init_once(void *foo) { - if (client->have_fsid) { - if (ceph_fsid_compare(&client->fsid, fsid)) { - pr_err("bad fsid, had %pU got %pU", - &client->fsid, fsid); - return -1; - } - } else { - pr_info("client%lld fsid %pU\n", client->monc.auth->global_id, - fsid); - memcpy(&client->fsid, fsid, sizeof(*fsid)); - ceph_debugfs_client_init(client); - client->have_fsid = true; - } + struct ceph_inode_info *ci = foo; + inode_init_once(&ci->vfs_inode); +} + +static int __init init_caches(void) +{ + ceph_inode_cachep = kmem_cache_create("ceph_inode_info", + sizeof(struct ceph_inode_info), + __alignof__(struct ceph_inode_info), + (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD), + ceph_inode_init_once); + if (ceph_inode_cachep == NULL) + return -ENOMEM; + + ceph_cap_cachep = KMEM_CACHE(ceph_cap, + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); + if (ceph_cap_cachep == NULL) + goto bad_cap; + + ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info, + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); + if (ceph_dentry_cachep == NULL) + goto bad_dentry; + + ceph_file_cachep = KMEM_CACHE(ceph_file_info, + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); + if (ceph_file_cachep == NULL) + goto bad_file; + return 0; + +bad_file: + kmem_cache_destroy(ceph_dentry_cachep); +bad_dentry: + kmem_cache_destroy(ceph_cap_cachep); +bad_cap: + kmem_cache_destroy(ceph_inode_cachep); + return -ENOMEM; } +static void destroy_caches(void) +{ + kmem_cache_destroy(ceph_inode_cachep); + kmem_cache_destroy(ceph_cap_cachep); + kmem_cache_destroy(ceph_dentry_cachep); + kmem_cache_destroy(ceph_file_cachep); +} + + /* - * true if we have the mon map (and have thus joined the cluster) + * ceph_umount_begin - initiate forced umount. Tear down down the + * mount, skipping steps that may hang while waiting for server(s). */ -static int have_mon_and_osd_map(struct ceph_client *client) +static void ceph_umount_begin(struct super_block *sb) { - return client->monc.monmap && client->monc.monmap->epoch && - client->osdc.osdmap && client->osdc.osdmap->epoch; + struct ceph_fs_client *fsc = ceph_sb_to_client(sb); + + dout("ceph_umount_begin - starting forced umount\n"); + if (!fsc) + return; + fsc->mount_state = CEPH_MOUNT_SHUTDOWN; + return; } +static const struct super_operations ceph_super_ops = { + .alloc_inode = ceph_alloc_inode, + .destroy_inode = ceph_destroy_inode, + .write_inode = ceph_write_inode, + .sync_fs = ceph_sync_fs, + .put_super = ceph_put_super, + .show_options = ceph_show_options, + .statfs = ceph_statfs, + .umount_begin = ceph_umount_begin, +}; + /* * Bootstrap mount by opening the root directory. Note the mount * @started time from caller, and time out if this takes too long. */ -static struct dentry *open_root_dentry(struct ceph_client *client, +static struct dentry *open_root_dentry(struct ceph_fs_client *fsc, const char *path, unsigned long started) { - struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req = NULL; int err; struct dentry *root; @@ -784,14 +609,14 @@ static struct dentry *open_root_dentry(struct ceph_client *client, req->r_ino1.ino = CEPH_INO_ROOT; req->r_ino1.snap = CEPH_NOSNAP; req->r_started = started; - req->r_timeout = client->mount_args->mount_timeout * HZ; + req->r_timeout = fsc->client->options->mount_timeout * HZ; req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE); req->r_num_caps = 2; err = ceph_mdsc_do_request(mdsc, NULL, req); if (err == 0) { dout("open_root_inode success\n"); if (ceph_ino(req->r_target_inode) == CEPH_INO_ROOT && - client->sb->s_root == NULL) + fsc->sb->s_root == NULL) root = d_alloc_root(req->r_target_inode); else root = d_obtain_alias(req->r_target_inode); @@ -804,105 +629,86 @@ static struct dentry *open_root_dentry(struct ceph_client *client, return root; } + + + /* * mount: join the ceph cluster, and open root directory. */ -static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt, +static int ceph_mount(struct ceph_fs_client *fsc, struct vfsmount *mnt, const char *path) { - struct ceph_entity_addr *myaddr = NULL; int err; - unsigned long timeout = client->mount_args->mount_timeout * HZ; unsigned long started = jiffies; /* note the start time */ struct dentry *root; + int first = 0; /* first vfsmount for this super_block */ dout("mount start\n"); - mutex_lock(&client->mount_mutex); - - /* initialize the messenger */ - if (client->msgr == NULL) { - if (ceph_test_opt(client, MYIP)) - myaddr = &client->mount_args->my_addr; - client->msgr = ceph_messenger_create(myaddr); - if (IS_ERR(client->msgr)) { - err = PTR_ERR(client->msgr); - client->msgr = NULL; - goto out; - } - client->msgr->nocrc = ceph_test_opt(client, NOCRC); - } + mutex_lock(&fsc->client->mount_mutex); - /* open session, and wait for mon, mds, and osd maps */ - err = ceph_monc_open_session(&client->monc); + err = __ceph_open_session(fsc->client, started); if (err < 0) goto out; - while (!have_mon_and_osd_map(client)) { - err = -EIO; - if (timeout && time_after_eq(jiffies, started + timeout)) - goto out; - - /* wait */ - dout("mount waiting for mon_map\n"); - err = wait_event_interruptible_timeout(client->auth_wq, - have_mon_and_osd_map(client) || (client->auth_err < 0), - timeout); - if (err == -EINTR || err == -ERESTARTSYS) - goto out; - if (client->auth_err < 0) { - err = client->auth_err; - goto out; - } - } - dout("mount opening root\n"); - root = open_root_dentry(client, "", started); + root = open_root_dentry(fsc, "", started); if (IS_ERR(root)) { err = PTR_ERR(root); goto out; } - if (client->sb->s_root) + if (fsc->sb->s_root) { dput(root); - else - client->sb->s_root = root; + } else { + fsc->sb->s_root = root; + first = 1; + + err = ceph_fs_debugfs_init(fsc); + if (err < 0) + goto fail; + } if (path[0] == 0) { dget(root); } else { dout("mount opening base mountpoint\n"); - root = open_root_dentry(client, path, started); + root = open_root_dentry(fsc, path, started); if (IS_ERR(root)) { err = PTR_ERR(root); - dput(client->sb->s_root); - client->sb->s_root = NULL; - goto out; + goto fail; } } mnt->mnt_root = root; - mnt->mnt_sb = client->sb; + mnt->mnt_sb = fsc->sb; - client->mount_state = CEPH_MOUNT_MOUNTED; + fsc->mount_state = CEPH_MOUNT_MOUNTED; dout("mount success\n"); err = 0; out: - mutex_unlock(&client->mount_mutex); + mutex_unlock(&fsc->client->mount_mutex); return err; + +fail: + if (first) { + dput(fsc->sb->s_root); + fsc->sb->s_root = NULL; + } + goto out; } static int ceph_set_super(struct super_block *s, void *data) { - struct ceph_client *client = data; + struct ceph_fs_client *fsc = data; int ret; dout("set_super %p data %p\n", s, data); - s->s_flags = client->mount_args->sb_flags; + s->s_flags = fsc->mount_options->sb_flags; s->s_maxbytes = 1ULL << 40; /* temp value until we get mdsmap */ - s->s_fs_info = client; - client->sb = s; + s->s_fs_info = fsc; + fsc->sb = s; s->s_op = &ceph_super_ops; s->s_export_op = &ceph_export_ops; @@ -917,7 +723,7 @@ static int ceph_set_super(struct super_block *s, void *data) fail: s->s_fs_info = NULL; - client->sb = NULL; + fsc->sb = NULL; return ret; } @@ -926,30 +732,23 @@ fail: */ static int ceph_compare_super(struct super_block *sb, void *data) { - struct ceph_client *new = data; - struct ceph_mount_args *args = new->mount_args; - struct ceph_client *other = ceph_sb_to_client(sb); - int i; + struct ceph_fs_client *new = data; + struct ceph_mount_options *fsopt = new->mount_options; + struct ceph_options *opt = new->client->options; + struct ceph_fs_client *other = ceph_sb_to_client(sb); dout("ceph_compare_super %p\n", sb); - if (args->flags & CEPH_OPT_FSID) { - if (ceph_fsid_compare(&args->fsid, &other->fsid)) { - dout("fsid doesn't match\n"); - return 0; - } - } else { - /* do we share (a) monitor? */ - for (i = 0; i < new->monc.monmap->num_mon; i++) - if (ceph_monmap_contains(other->monc.monmap, - &new->monc.monmap->mon_inst[i].addr)) - break; - if (i == new->monc.monmap->num_mon) { - dout("mon ip not part of monmap\n"); - return 0; - } - dout("mon ip matches existing sb %p\n", sb); + + if (compare_mount_options(fsopt, opt, other)) { + dout("monitor(s)/mount options don't match\n"); + return 0; } - if (args->sb_flags != other->mount_args->sb_flags) { + if ((opt->flags & CEPH_OPT_FSID) && + ceph_fsid_compare(&opt->fsid, &other->client->fsid)) { + dout("fsid doesn't match\n"); + return 0; + } + if (fsopt->sb_flags != other->mount_options->sb_flags) { dout("flags differ\n"); return 0; } @@ -961,19 +760,20 @@ static int ceph_compare_super(struct super_block *sb, void *data) */ static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); -static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client) +static int ceph_register_bdi(struct super_block *sb, + struct ceph_fs_client *fsc) { int err; /* set ra_pages based on rsize mount option? */ - if (client->mount_args->rsize >= PAGE_CACHE_SIZE) - client->backing_dev_info.ra_pages = - (client->mount_args->rsize + PAGE_CACHE_SIZE - 1) + if (fsc->mount_options->rsize >= PAGE_CACHE_SIZE) + fsc->backing_dev_info.ra_pages = + (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_SHIFT; - err = bdi_register(&client->backing_dev_info, NULL, "ceph-%d", + err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%d", atomic_long_inc_return(&bdi_seq)); if (!err) - sb->s_bdi = &client->backing_dev_info; + sb->s_bdi = &fsc->backing_dev_info; return err; } @@ -982,46 +782,52 @@ static int ceph_get_sb(struct file_system_type *fs_type, struct vfsmount *mnt) { struct super_block *sb; - struct ceph_client *client; + struct ceph_fs_client *fsc; int err; int (*compare_super)(struct super_block *, void *) = ceph_compare_super; const char *path = NULL; - struct ceph_mount_args *args; + struct ceph_mount_options *fsopt = NULL; + struct ceph_options *opt = NULL; dout("ceph_get_sb\n"); - args = parse_mount_args(flags, data, dev_name, &path); - if (IS_ERR(args)) { - err = PTR_ERR(args); + err = parse_mount_options(&fsopt, &opt, flags, data, dev_name, &path); + if (err < 0) goto out_final; - } /* create client (which we may/may not use) */ - client = ceph_create_client(args); - if (IS_ERR(client)) { - err = PTR_ERR(client); + fsc = create_fs_client(fsopt, opt); + if (IS_ERR(fsc)) { + err = PTR_ERR(fsc); + kfree(fsopt); + kfree(opt); goto out_final; } - if (client->mount_args->flags & CEPH_OPT_NOSHARE) + err = ceph_mdsc_init(fsc); + if (err < 0) + goto out; + + if (ceph_test_opt(fsc->client, NOSHARE)) compare_super = NULL; - sb = sget(fs_type, compare_super, ceph_set_super, client); + sb = sget(fs_type, compare_super, ceph_set_super, fsc); if (IS_ERR(sb)) { err = PTR_ERR(sb); goto out; } - if (ceph_sb_to_client(sb) != client) { - ceph_destroy_client(client); - client = ceph_sb_to_client(sb); - dout("get_sb got existing client %p\n", client); + if (ceph_sb_to_client(sb) != fsc) { + ceph_mdsc_destroy(fsc); + destroy_fs_client(fsc); + fsc = ceph_sb_to_client(sb); + dout("get_sb got existing client %p\n", fsc); } else { - dout("get_sb using new client %p\n", client); - err = ceph_register_bdi(sb, client); + dout("get_sb using new client %p\n", fsc); + err = ceph_register_bdi(sb, fsc); if (err < 0) goto out_splat; } - err = ceph_mount(client, mnt, path); + err = ceph_mount(fsc, mnt, path); if (err < 0) goto out_splat; dout("root %p inode %p ino %llx.%llx\n", mnt->mnt_root, @@ -1029,12 +835,13 @@ static int ceph_get_sb(struct file_system_type *fs_type, return 0; out_splat: - ceph_mdsc_close_sessions(&client->mdsc); + ceph_mdsc_close_sessions(fsc->mdsc); deactivate_locked_super(sb); goto out_final; out: - ceph_destroy_client(client); + ceph_mdsc_destroy(fsc); + destroy_fs_client(fsc); out_final: dout("ceph_get_sb fail %d\n", err); return err; @@ -1042,11 +849,12 @@ out_final: static void ceph_kill_sb(struct super_block *s) { - struct ceph_client *client = ceph_sb_to_client(s); + struct ceph_fs_client *fsc = ceph_sb_to_client(s); dout("kill_sb %p\n", s); - ceph_mdsc_pre_umount(&client->mdsc); + ceph_mdsc_pre_umount(fsc->mdsc); kill_anon_super(s); /* will call put_super after sb is r/o */ - ceph_destroy_client(client); + ceph_mdsc_destroy(fsc); + destroy_fs_client(fsc); } static struct file_system_type ceph_fs_type = { @@ -1062,36 +870,20 @@ static struct file_system_type ceph_fs_type = { static int __init init_ceph(void) { - int ret = 0; - - ret = ceph_debugfs_init(); - if (ret < 0) - goto out; - - ret = ceph_msgr_init(); - if (ret < 0) - goto out_debugfs; - - ret = init_caches(); + int ret = init_caches(); if (ret) - goto out_msgr; + goto out; ret = register_filesystem(&ceph_fs_type); if (ret) goto out_icache; - pr_info("loaded (mon/mds/osd proto %d/%d/%d, osdmap %d/%d %d/%d)\n", - CEPH_MONC_PROTOCOL, CEPH_MDSC_PROTOCOL, CEPH_OSDC_PROTOCOL, - CEPH_OSDMAP_VERSION, CEPH_OSDMAP_VERSION_EXT, - CEPH_OSDMAP_INC_VERSION, CEPH_OSDMAP_INC_VERSION_EXT); + pr_info("loaded (mds proto %d)\n", CEPH_MDSC_PROTOCOL); + return 0; out_icache: destroy_caches(); -out_msgr: - ceph_msgr_exit(); -out_debugfs: - ceph_debugfs_cleanup(); out: return ret; } @@ -1101,8 +893,6 @@ static void __exit exit_ceph(void) dout("exit_ceph\n"); unregister_filesystem(&ceph_fs_type); destroy_caches(); - ceph_msgr_exit(); - ceph_debugfs_cleanup(); } module_init(init_ceph); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index b87638e84c4b..1886294e12f7 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -1,7 +1,7 @@ #ifndef _FS_CEPH_SUPER_H #define _FS_CEPH_SUPER_H -#include "ceph_debug.h" +#include <linux/ceph/ceph_debug.h> #include <asm/unaligned.h> #include <linux/backing-dev.h> @@ -14,13 +14,7 @@ #include <linux/writeback.h> #include <linux/slab.h> -#include "types.h" -#include "messenger.h" -#include "msgpool.h" -#include "mon_client.h" -#include "mds_client.h" -#include "osd_client.h" -#include "ceph_fs.h" +#include <linux/ceph/libceph.h> /* f_type in struct statfs */ #define CEPH_SUPER_MAGIC 0x00c36400 @@ -30,42 +24,25 @@ #define CEPH_BLOCK_SHIFT 20 /* 1 MB */ #define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT) -/* - * Supported features - */ -#define CEPH_FEATURE_SUPPORTED CEPH_FEATURE_NOSRCADDR | CEPH_FEATURE_FLOCK -#define CEPH_FEATURE_REQUIRED CEPH_FEATURE_NOSRCADDR +#define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */ +#define CEPH_MOUNT_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */ +#define CEPH_MOUNT_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */ -/* - * mount options - */ -#define CEPH_OPT_FSID (1<<0) -#define CEPH_OPT_NOSHARE (1<<1) /* don't share client with other sbs */ -#define CEPH_OPT_MYIP (1<<2) /* specified my ip */ -#define CEPH_OPT_DIRSTAT (1<<4) /* funky `cat dirname` for stats */ -#define CEPH_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */ -#define CEPH_OPT_NOCRC (1<<6) /* no data crc on writes */ -#define CEPH_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */ +#define CEPH_MOUNT_OPT_DEFAULT (CEPH_MOUNT_OPT_RBYTES) -#define CEPH_OPT_DEFAULT (CEPH_OPT_RBYTES) +#define ceph_set_mount_opt(fsc, opt) \ + (fsc)->mount_options->flags |= CEPH_MOUNT_OPT_##opt; +#define ceph_test_mount_opt(fsc, opt) \ + (!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt)) -#define ceph_set_opt(client, opt) \ - (client)->mount_args->flags |= CEPH_OPT_##opt; -#define ceph_test_opt(client, opt) \ - (!!((client)->mount_args->flags & CEPH_OPT_##opt)) +#define CEPH_MAX_READDIR_DEFAULT 1024 +#define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024) +#define CEPH_SNAPDIRNAME_DEFAULT ".snap" - -struct ceph_mount_args { - int sb_flags; +struct ceph_mount_options { int flags; - struct ceph_fsid fsid; - struct ceph_entity_addr my_addr; - int num_mon; - struct ceph_entity_addr *mon_addr; - int mount_timeout; - int osd_idle_ttl; - int osd_timeout; - int osd_keepalive_timeout; + int sb_flags; + int wsize; int rsize; /* max readahead */ int congestion_kb; /* max writeback in flight */ @@ -73,82 +50,25 @@ struct ceph_mount_args { int cap_release_safety; int max_readdir; /* max readdir result (entires) */ int max_readdir_bytes; /* max readdir result (bytes) */ - char *snapdir_name; /* default ".snap" */ - char *name; - char *secret; -}; -/* - * defaults - */ -#define CEPH_MOUNT_TIMEOUT_DEFAULT 60 -#define CEPH_OSD_TIMEOUT_DEFAULT 60 /* seconds */ -#define CEPH_OSD_KEEPALIVE_DEFAULT 5 -#define CEPH_OSD_IDLE_TTL_DEFAULT 60 -#define CEPH_MOUNT_RSIZE_DEFAULT (512*1024) /* readahead */ -#define CEPH_MAX_READDIR_DEFAULT 1024 -#define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024) - -#define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024) -#define CEPH_MSG_MAX_DATA_LEN (16*1024*1024) - -#define CEPH_SNAPDIRNAME_DEFAULT ".snap" -#define CEPH_AUTH_NAME_DEFAULT "guest" -/* - * Delay telling the MDS we no longer want caps, in case we reopen - * the file. Delay a minimum amount of time, even if we send a cap - * message for some other reason. Otherwise, take the oppotunity to - * update the mds to avoid sending another message later. - */ -#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT 5 /* cap release delay */ -#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */ - -#define CEPH_CAP_RELEASE_SAFETY_DEFAULT (CEPH_CAPS_PER_RELEASE * 4) - -/* mount state */ -enum { - CEPH_MOUNT_MOUNTING, - CEPH_MOUNT_MOUNTED, - CEPH_MOUNT_UNMOUNTING, - CEPH_MOUNT_UNMOUNTED, - CEPH_MOUNT_SHUTDOWN, -}; - -/* - * subtract jiffies - */ -static inline unsigned long time_sub(unsigned long a, unsigned long b) -{ - BUG_ON(time_after(b, a)); - return (long)a - (long)b; -} - -/* - * per-filesystem client state - * - * possibly shared by multiple mount points, if they are - * mounting the same ceph filesystem/cluster. - */ -struct ceph_client { - struct ceph_fsid fsid; - bool have_fsid; + /* + * everything above this point can be memcmp'd; everything below + * is handled in compare_mount_options() + */ - struct mutex mount_mutex; /* serialize mount attempts */ - struct ceph_mount_args *mount_args; + char *snapdir_name; /* default ".snap" */ +}; +struct ceph_fs_client { struct super_block *sb; - unsigned long mount_state; - wait_queue_head_t auth_wq; - - int auth_err; + struct ceph_mount_options *mount_options; + struct ceph_client *client; + unsigned long mount_state; int min_caps; /* min caps i added */ - struct ceph_messenger *msgr; /* messenger instance */ - struct ceph_mon_client monc; - struct ceph_mds_client mdsc; - struct ceph_osd_client osdc; + struct ceph_mds_client *mdsc; /* writeback */ mempool_t *wb_pagevec_pool; @@ -160,14 +80,14 @@ struct ceph_client { struct backing_dev_info backing_dev_info; #ifdef CONFIG_DEBUG_FS - struct dentry *debugfs_monmap; - struct dentry *debugfs_mdsmap, *debugfs_osdmap; - struct dentry *debugfs_dir, *debugfs_dentry_lru, *debugfs_caps; + struct dentry *debugfs_dentry_lru, *debugfs_caps; struct dentry *debugfs_congestion_kb; struct dentry *debugfs_bdi; + struct dentry *debugfs_mdsc, *debugfs_mdsmap; #endif }; + /* * File i/o capability. This tracks shared state with the metadata * server that allows us to cache or writeback attributes or to read @@ -275,6 +195,20 @@ struct ceph_inode_xattr { int should_free_val; }; +/* + * Ceph dentry state + */ +struct ceph_dentry_info { + struct ceph_mds_session *lease_session; + u32 lease_gen, lease_shared_gen; + u32 lease_seq; + unsigned long lease_renew_after, lease_renew_from; + struct list_head lru; + struct dentry *dentry; + u64 time; + u64 offset; +}; + struct ceph_inode_xattrs_info { /* * (still encoded) xattr blob. we avoid the overhead of parsing @@ -296,11 +230,6 @@ struct ceph_inode_xattrs_info { /* * Ceph inode. */ -#define CEPH_I_COMPLETE 1 /* we have complete directory cached */ -#define CEPH_I_NODELAY 4 /* do not delay cap release */ -#define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */ -#define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */ - struct ceph_inode_info { struct ceph_vino i_vino; /* ceph ino + snap */ @@ -391,6 +320,63 @@ static inline struct ceph_inode_info *ceph_inode(struct inode *inode) return container_of(inode, struct ceph_inode_info, vfs_inode); } +static inline struct ceph_vino ceph_vino(struct inode *inode) +{ + return ceph_inode(inode)->i_vino; +} + +/* + * ino_t is <64 bits on many architectures, blech. + * + * don't include snap in ino hash, at least for now. + */ +static inline ino_t ceph_vino_to_ino(struct ceph_vino vino) +{ + ino_t ino = (ino_t)vino.ino; /* ^ (vino.snap << 20); */ +#if BITS_PER_LONG == 32 + ino ^= vino.ino >> (sizeof(u64)-sizeof(ino_t)) * 8; + if (!ino) + ino = 1; +#endif + return ino; +} + +/* for printf-style formatting */ +#define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap + +static inline u64 ceph_ino(struct inode *inode) +{ + return ceph_inode(inode)->i_vino.ino; +} +static inline u64 ceph_snap(struct inode *inode) +{ + return ceph_inode(inode)->i_vino.snap; +} + +static inline int ceph_ino_compare(struct inode *inode, void *data) +{ + struct ceph_vino *pvino = (struct ceph_vino *)data; + struct ceph_inode_info *ci = ceph_inode(inode); + return ci->i_vino.ino == pvino->ino && + ci->i_vino.snap == pvino->snap; +} + +static inline struct inode *ceph_find_inode(struct super_block *sb, + struct ceph_vino vino) +{ + ino_t t = ceph_vino_to_ino(vino); + return ilookup5(sb, t, ceph_ino_compare, &vino); +} + + +/* + * Ceph inode. + */ +#define CEPH_I_COMPLETE 1 /* we have complete directory cached */ +#define CEPH_I_NODELAY 4 /* do not delay cap release */ +#define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */ +#define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */ + static inline void ceph_i_clear(struct inode *inode, unsigned mask) { struct ceph_inode_info *ci = ceph_inode(inode); @@ -414,8 +400,9 @@ static inline bool ceph_i_test(struct inode *inode, unsigned mask) struct ceph_inode_info *ci = ceph_inode(inode); bool r; - smp_mb(); + spin_lock(&inode->i_lock); r = (ci->i_ceph_flags & mask) == mask; + spin_unlock(&inode->i_lock); return r; } @@ -432,20 +419,6 @@ extern u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v, struct ceph_inode_frag *pfrag, int *found); -/* - * Ceph dentry state - */ -struct ceph_dentry_info { - struct ceph_mds_session *lease_session; - u32 lease_gen, lease_shared_gen; - u32 lease_seq; - unsigned long lease_renew_after, lease_renew_from; - struct list_head lru; - struct dentry *dentry; - u64 time; - u64 offset; -}; - static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry) { return (struct ceph_dentry_info *)dentry->d_fsdata; @@ -456,22 +429,6 @@ static inline loff_t ceph_make_fpos(unsigned frag, unsigned off) return ((loff_t)frag << 32) | (loff_t)off; } -/* - * ino_t is <64 bits on many architectures, blech. - * - * don't include snap in ino hash, at least for now. - */ -static inline ino_t ceph_vino_to_ino(struct ceph_vino vino) -{ - ino_t ino = (ino_t)vino.ino; /* ^ (vino.snap << 20); */ -#if BITS_PER_LONG == 32 - ino ^= vino.ino >> (sizeof(u64)-sizeof(ino_t)) * 8; - if (!ino) - ino = 1; -#endif - return ino; -} - static inline int ceph_set_ino_cb(struct inode *inode, void *data) { ceph_inode(inode)->i_vino = *(struct ceph_vino *)data; @@ -479,39 +436,6 @@ static inline int ceph_set_ino_cb(struct inode *inode, void *data) return 0; } -static inline struct ceph_vino ceph_vino(struct inode *inode) -{ - return ceph_inode(inode)->i_vino; -} - -/* for printf-style formatting */ -#define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap - -static inline u64 ceph_ino(struct inode *inode) -{ - return ceph_inode(inode)->i_vino.ino; -} -static inline u64 ceph_snap(struct inode *inode) -{ - return ceph_inode(inode)->i_vino.snap; -} - -static inline int ceph_ino_compare(struct inode *inode, void *data) -{ - struct ceph_vino *pvino = (struct ceph_vino *)data; - struct ceph_inode_info *ci = ceph_inode(inode); - return ci->i_vino.ino == pvino->ino && - ci->i_vino.snap == pvino->snap; -} - -static inline struct inode *ceph_find_inode(struct super_block *sb, - struct ceph_vino vino) -{ - ino_t t = ceph_vino_to_ino(vino); - return ilookup5(sb, t, ceph_ino_compare, &vino); -} - - /* * caps helpers */ @@ -576,18 +500,18 @@ extern int ceph_reserve_caps(struct ceph_mds_client *mdsc, struct ceph_cap_reservation *ctx, int need); extern int ceph_unreserve_caps(struct ceph_mds_client *mdsc, struct ceph_cap_reservation *ctx); -extern void ceph_reservation_status(struct ceph_client *client, +extern void ceph_reservation_status(struct ceph_fs_client *client, int *total, int *avail, int *used, int *reserved, int *min); -static inline struct ceph_client *ceph_inode_to_client(struct inode *inode) +static inline struct ceph_fs_client *ceph_inode_to_client(struct inode *inode) { - return (struct ceph_client *)inode->i_sb->s_fs_info; + return (struct ceph_fs_client *)inode->i_sb->s_fs_info; } -static inline struct ceph_client *ceph_sb_to_client(struct super_block *sb) +static inline struct ceph_fs_client *ceph_sb_to_client(struct super_block *sb) { - return (struct ceph_client *)sb->s_fs_info; + return (struct ceph_fs_client *)sb->s_fs_info; } @@ -617,51 +541,6 @@ struct ceph_file_info { /* - * snapshots - */ - -/* - * A "snap context" is the set of existing snapshots when we - * write data. It is used by the OSD to guide its COW behavior. - * - * The ceph_snap_context is refcounted, and attached to each dirty - * page, indicating which context the dirty data belonged when it was - * dirtied. - */ -struct ceph_snap_context { - atomic_t nref; - u64 seq; - int num_snaps; - u64 snaps[]; -}; - -static inline struct ceph_snap_context * -ceph_get_snap_context(struct ceph_snap_context *sc) -{ - /* - printk("get_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref), - atomic_read(&sc->nref)+1); - */ - if (sc) - atomic_inc(&sc->nref); - return sc; -} - -static inline void ceph_put_snap_context(struct ceph_snap_context *sc) -{ - if (!sc) - return; - /* - printk("put_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref), - atomic_read(&sc->nref)-1); - */ - if (atomic_dec_and_test(&sc->nref)) { - /*printk(" deleting snap_context %p\n", sc);*/ - kfree(sc); - } -} - -/* * A "snap realm" describes a subset of the file hierarchy sharing * the same set of snapshots that apply to it. The realms themselves * are organized into a hierarchy, such that children inherit (some of) @@ -699,16 +578,33 @@ struct ceph_snap_realm { spinlock_t inodes_with_caps_lock; }; - - -/* - * calculate the number of pages a given length and offset map onto, - * if we align the data. - */ -static inline int calc_pages_for(u64 off, u64 len) +static inline int default_congestion_kb(void) { - return ((off+len+PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT) - - (off >> PAGE_CACHE_SHIFT); + int congestion_kb; + + /* + * Copied from NFS + * + * congestion size, scale with available memory. + * + * 64MB: 8192k + * 128MB: 11585k + * 256MB: 16384k + * 512MB: 23170k + * 1GB: 32768k + * 2GB: 46340k + * 4GB: 65536k + * 8GB: 92681k + * 16GB: 131072k + * + * This allows larger machines to have larger/more transfers. + * Limit the default to 256M + */ + congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10); + if (congestion_kb > 256*1024) + congestion_kb = 256*1024; + + return congestion_kb; } @@ -741,16 +637,6 @@ static inline bool __ceph_have_pending_cap_snap(struct ceph_inode_info *ci) ci_item)->writing; } - -/* super.c */ -extern struct kmem_cache *ceph_inode_cachep; -extern struct kmem_cache *ceph_cap_cachep; -extern struct kmem_cache *ceph_dentry_cachep; -extern struct kmem_cache *ceph_file_cachep; - -extern const char *ceph_msg_type_name(int type); -extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid); - /* inode.c */ extern const struct inode_operations ceph_file_iops; @@ -857,12 +743,18 @@ extern int ceph_mmap(struct file *file, struct vm_area_struct *vma); /* file.c */ extern const struct file_operations ceph_file_fops; extern const struct address_space_operations ceph_aops; +extern int ceph_copy_to_page_vector(struct page **pages, + const char *data, + loff_t off, size_t len); +extern int ceph_copy_from_page_vector(struct page **pages, + char *data, + loff_t off, size_t len); +extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags); extern int ceph_open(struct inode *inode, struct file *file); extern struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd, int mode, int locked_dir); extern int ceph_release(struct inode *inode, struct file *filp); -extern void ceph_release_page_vector(struct page **pages, int num_pages); /* dir.c */ extern const struct file_operations ceph_dir_fops; @@ -892,12 +784,6 @@ extern long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg); /* export.c */ extern const struct export_operations ceph_export_ops; -/* debugfs.c */ -extern int ceph_debugfs_init(void); -extern void ceph_debugfs_cleanup(void); -extern int ceph_debugfs_client_init(struct ceph_client *client); -extern void ceph_debugfs_client_cleanup(struct ceph_client *client); - /* locks.c */ extern int ceph_lock(struct file *file, int cmd, struct file_lock *fl); extern int ceph_flock(struct file *file, int cmd, struct file_lock *fl); @@ -914,4 +800,8 @@ static inline struct inode *get_dentry_parent_inode(struct dentry *dentry) return NULL; } +/* debugfs.c */ +extern int ceph_fs_debugfs_init(struct ceph_fs_client *client); +extern void ceph_fs_debugfs_cleanup(struct ceph_fs_client *client); + #endif /* _FS_CEPH_SUPER_H */ diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 9578af610b73..6e12a6ba5f79 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -1,6 +1,9 @@ -#include "ceph_debug.h" +#include <linux/ceph/ceph_debug.h> + #include "super.h" -#include "decode.h" +#include "mds_client.h" + +#include <linux/ceph/decode.h> #include <linux/xattr.h> #include <linux/slab.h> @@ -620,12 +623,12 @@ out: static int ceph_sync_setxattr(struct dentry *dentry, const char *name, const char *value, size_t size, int flags) { - struct ceph_client *client = ceph_sb_to_client(dentry->d_sb); + struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); struct inode *inode = dentry->d_inode; struct ceph_inode_info *ci = ceph_inode(inode); struct inode *parent_inode = dentry->d_parent->d_inode; struct ceph_mds_request *req; - struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_mds_client *mdsc = fsc->mdsc; int err; int i, nr_pages; struct page **pages = NULL; @@ -713,10 +716,9 @@ int ceph_setxattr(struct dentry *dentry, const char *name, /* preallocate memory for xattr name, value, index node */ err = -ENOMEM; - newname = kmalloc(name_len + 1, GFP_NOFS); + newname = kmemdup(name, name_len + 1, GFP_NOFS); if (!newname) goto out; - memcpy(newname, name, name_len + 1); if (val_len) { newval = kmalloc(val_len + 1, GFP_NOFS); @@ -777,8 +779,8 @@ out: static int ceph_send_removexattr(struct dentry *dentry, const char *name) { - struct ceph_client *client = ceph_sb_to_client(dentry->d_sb); - struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; struct inode *inode = dentry->d_inode; struct inode *parent_inode = dentry->d_parent->d_inode; struct ceph_mds_request *req; diff --git a/fs/exec.c b/fs/exec.c index 828dd2461d6b..6d2b6f936858 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -2014,3 +2014,43 @@ fail_creds: fail: return; } + +/* + * Core dumping helper functions. These are the only things you should + * do on a core-file: use only these functions to write out all the + * necessary info. + */ +int dump_write(struct file *file, const void *addr, int nr) +{ + return access_ok(VERIFY_READ, addr, nr) && file->f_op->write(file, addr, nr, &file->f_pos) == nr; +} +EXPORT_SYMBOL(dump_write); + +int dump_seek(struct file *file, loff_t off) +{ + int ret = 1; + + if (file->f_op->llseek && file->f_op->llseek != no_llseek) { + if (file->f_op->llseek(file, off, SEEK_CUR) < 0) + return 0; + } else { + char *buf = (char *)get_zeroed_page(GFP_KERNEL); + + if (!buf) + return 0; + while (off > 0) { + unsigned long n = off; + + if (n > PAGE_SIZE) + n = PAGE_SIZE; + if (!dump_write(file, buf, n)) { + ret = 0; + break; + } + off -= n; + } + free_page((unsigned long)buf); + } + return ret; +} +EXPORT_SYMBOL(dump_seek); diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig index cc9665522148..c465ae066c62 100644 --- a/fs/gfs2/Kconfig +++ b/fs/gfs2/Kconfig @@ -1,6 +1,6 @@ config GFS2_FS tristate "GFS2 file system support" - depends on EXPERIMENTAL && (64BIT || LBDAF) + depends on (64BIT || LBDAF) select DLM if GFS2_FS_LOCKING_DLM select CONFIGFS_FS if GFS2_FS_LOCKING_DLM select SYSFS if GFS2_FS_LOCKING_DLM diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 194fe16d8418..6b24afb96aae 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -36,8 +36,8 @@ #include "glops.h" -static void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page, - unsigned int from, unsigned int to) +void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page, + unsigned int from, unsigned int to) { struct buffer_head *head = page_buffers(page); unsigned int bsize = head->b_size; @@ -615,7 +615,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping, unsigned int data_blocks = 0, ind_blocks = 0, rblocks; int alloc_required; int error = 0; - struct gfs2_alloc *al; + struct gfs2_alloc *al = NULL; pgoff_t index = pos >> PAGE_CACHE_SHIFT; unsigned from = pos & (PAGE_CACHE_SIZE - 1); unsigned to = from + len; @@ -663,6 +663,8 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping, rblocks += RES_STATFS + RES_QUOTA; if (&ip->i_inode == sdp->sd_rindex) rblocks += 2 * RES_STATFS; + if (alloc_required) + rblocks += gfs2_rg_blocks(al); error = gfs2_trans_begin(sdp, rblocks, PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize); @@ -696,13 +698,11 @@ out: page_cache_release(page); - /* - * XXX(truncate): the call below should probably be replaced with - * a call to the gfs2-specific truncate blocks helper to actually - * release disk blocks.. - */ + gfs2_trans_end(sdp); if (pos + len > ip->i_inode.i_size) - truncate_setsize(&ip->i_inode, ip->i_inode.i_size); + gfs2_trim_blocks(&ip->i_inode); + goto out_trans_fail; + out_endtrans: gfs2_trans_end(sdp); out_trans_fail: @@ -802,10 +802,8 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh, page_cache_release(page); if (copied) { - if (inode->i_size < to) { + if (inode->i_size < to) i_size_write(inode, to); - ip->i_disksize = inode->i_size; - } gfs2_dinode_out(ip, di); mark_inode_dirty(inode); } @@ -876,8 +874,6 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping, ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); if (ret > 0) { - if (inode->i_size > ip->i_disksize) - ip->i_disksize = inode->i_size; gfs2_dinode_out(ip, dibh->b_data); mark_inode_dirty(inode); } diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 6f482809d1a3..5476c066d4ee 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -50,7 +50,7 @@ struct strip_mine { * @ip: the inode * @dibh: the dinode buffer * @block: the block number that was allocated - * @private: any locked page held by the caller process + * @page: The (optional) page. This is looked up if @page is NULL * * Returns: errno */ @@ -109,8 +109,7 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh, /** * gfs2_unstuff_dinode - Unstuff a dinode when the data has grown too big * @ip: The GFS2 inode to unstuff - * @unstuffer: the routine that handles unstuffing a non-zero length file - * @private: private data for the unstuffer + * @page: The (optional) page. This is looked up if the @page is NULL * * This routine unstuffs a dinode and returns it to a "normal" state such * that the height can be grown in the traditional way. @@ -132,7 +131,7 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page) if (error) goto out; - if (ip->i_disksize) { + if (i_size_read(&ip->i_inode)) { /* Get a free block, fill it with the stuffed data, and write it out to disk */ @@ -161,7 +160,7 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page) di = (struct gfs2_dinode *)dibh->b_data; gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); - if (ip->i_disksize) { + if (i_size_read(&ip->i_inode)) { *(__be64 *)(di + 1) = cpu_to_be64(block); gfs2_add_inode_blocks(&ip->i_inode, 1); di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode)); @@ -885,83 +884,14 @@ out: } /** - * do_grow - Make a file look bigger than it is - * @ip: the inode - * @size: the size to set the file to - * - * Called with an exclusive lock on @ip. - * - * Returns: errno - */ - -static int do_grow(struct gfs2_inode *ip, u64 size) -{ - struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - struct gfs2_alloc *al; - struct buffer_head *dibh; - int error; - - al = gfs2_alloc_get(ip); - if (!al) - return -ENOMEM; - - error = gfs2_quota_lock_check(ip); - if (error) - goto out; - - al->al_requested = sdp->sd_max_height + RES_DATA; - - error = gfs2_inplace_reserve(ip); - if (error) - goto out_gunlock_q; - - error = gfs2_trans_begin(sdp, - sdp->sd_max_height + al->al_rgd->rd_length + - RES_JDATA + RES_DINODE + RES_STATFS + RES_QUOTA, 0); - if (error) - goto out_ipres; - - error = gfs2_meta_inode_buffer(ip, &dibh); - if (error) - goto out_end_trans; - - if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) { - if (gfs2_is_stuffed(ip)) { - error = gfs2_unstuff_dinode(ip, NULL); - if (error) - goto out_brelse; - } - } - - ip->i_disksize = size; - ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; - gfs2_trans_add_bh(ip->i_gl, dibh, 1); - gfs2_dinode_out(ip, dibh->b_data); - -out_brelse: - brelse(dibh); -out_end_trans: - gfs2_trans_end(sdp); -out_ipres: - gfs2_inplace_release(ip); -out_gunlock_q: - gfs2_quota_unlock(ip); -out: - gfs2_alloc_put(ip); - return error; -} - - -/** * gfs2_block_truncate_page - Deal with zeroing out data for truncate * * This is partly borrowed from ext3. */ -static int gfs2_block_truncate_page(struct address_space *mapping) +static int gfs2_block_truncate_page(struct address_space *mapping, loff_t from) { struct inode *inode = mapping->host; struct gfs2_inode *ip = GFS2_I(inode); - loff_t from = inode->i_size; unsigned long index = from >> PAGE_CACHE_SHIFT; unsigned offset = from & (PAGE_CACHE_SIZE-1); unsigned blocksize, iblock, length, pos; @@ -1023,9 +953,11 @@ unlock: return err; } -static int trunc_start(struct gfs2_inode *ip, u64 size) +static int trunc_start(struct inode *inode, u64 oldsize, u64 newsize) { - struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct address_space *mapping = inode->i_mapping; struct buffer_head *dibh; int journaled = gfs2_is_jdata(ip); int error; @@ -1039,31 +971,26 @@ static int trunc_start(struct gfs2_inode *ip, u64 size) if (error) goto out; + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + if (gfs2_is_stuffed(ip)) { - u64 dsize = size + sizeof(struct gfs2_dinode); - ip->i_disksize = size; - ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; - gfs2_trans_add_bh(ip->i_gl, dibh, 1); - gfs2_dinode_out(ip, dibh->b_data); - if (dsize > dibh->b_size) - dsize = dibh->b_size; - gfs2_buffer_clear_tail(dibh, dsize); - error = 1; + gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + newsize); } else { - if (size & (u64)(sdp->sd_sb.sb_bsize - 1)) - error = gfs2_block_truncate_page(ip->i_inode.i_mapping); - - if (!error) { - ip->i_disksize = size; - ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; - ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG; - gfs2_trans_add_bh(ip->i_gl, dibh, 1); - gfs2_dinode_out(ip, dibh->b_data); + if (newsize & (u64)(sdp->sd_sb.sb_bsize - 1)) { + error = gfs2_block_truncate_page(mapping, newsize); + if (error) + goto out_brelse; } + ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG; } - brelse(dibh); + i_size_write(inode, newsize); + ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; + gfs2_dinode_out(ip, dibh->b_data); + truncate_pagecache(inode, oldsize, newsize); +out_brelse: + brelse(dibh); out: gfs2_trans_end(sdp); return error; @@ -1123,7 +1050,7 @@ static int trunc_end(struct gfs2_inode *ip) if (error) goto out; - if (!ip->i_disksize) { + if (!i_size_read(&ip->i_inode)) { ip->i_height = 0; ip->i_goal = ip->i_no_addr; gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); @@ -1143,92 +1070,154 @@ out: /** * do_shrink - make a file smaller - * @ip: the inode - * @size: the size to make the file - * @truncator: function to truncate the last partial block + * @inode: the inode + * @oldsize: the current inode size + * @newsize: the size to make the file * - * Called with an exclusive lock on @ip. + * Called with an exclusive lock on @inode. The @size must + * be equal to or smaller than the current inode size. * * Returns: errno */ -static int do_shrink(struct gfs2_inode *ip, u64 size) +static int do_shrink(struct inode *inode, u64 oldsize, u64 newsize) { + struct gfs2_inode *ip = GFS2_I(inode); int error; - error = trunc_start(ip, size); + error = trunc_start(inode, oldsize, newsize); if (error < 0) return error; - if (error > 0) + if (gfs2_is_stuffed(ip)) return 0; - error = trunc_dealloc(ip, size); - if (!error) + error = trunc_dealloc(ip, newsize); + if (error == 0) error = trunc_end(ip); return error; } -static int do_touch(struct gfs2_inode *ip, u64 size) +void gfs2_trim_blocks(struct inode *inode) { - struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + u64 size = inode->i_size; + int ret; + + ret = do_shrink(inode, size, size); + WARN_ON(ret != 0); +} + +/** + * do_grow - Touch and update inode size + * @inode: The inode + * @size: The new size + * + * This function updates the timestamps on the inode and + * may also increase the size of the inode. This function + * must not be called with @size any smaller than the current + * inode size. + * + * Although it is not strictly required to unstuff files here, + * earlier versions of GFS2 have a bug in the stuffed file reading + * code which will result in a buffer overrun if the size is larger + * than the max stuffed file size. In order to prevent this from + * occuring, such files are unstuffed, but in other cases we can + * just update the inode size directly. + * + * Returns: 0 on success, or -ve on error + */ + +static int do_grow(struct inode *inode, u64 size) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); struct buffer_head *dibh; + struct gfs2_alloc *al = NULL; int error; - error = gfs2_trans_begin(sdp, RES_DINODE, 0); + if (gfs2_is_stuffed(ip) && + (size > (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)))) { + al = gfs2_alloc_get(ip); + if (al == NULL) + return -ENOMEM; + + error = gfs2_quota_lock_check(ip); + if (error) + goto do_grow_alloc_put; + + al->al_requested = 1; + error = gfs2_inplace_reserve(ip); + if (error) + goto do_grow_qunlock; + } + + error = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + RES_RG_BIT, 0); if (error) - return error; + goto do_grow_release; - down_write(&ip->i_rw_mutex); + if (al) { + error = gfs2_unstuff_dinode(ip, NULL); + if (error) + goto do_end_trans; + } error = gfs2_meta_inode_buffer(ip, &dibh); if (error) - goto do_touch_out; + goto do_end_trans; + i_size_write(inode, size); ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; gfs2_trans_add_bh(ip->i_gl, dibh, 1); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); -do_touch_out: - up_write(&ip->i_rw_mutex); +do_end_trans: gfs2_trans_end(sdp); +do_grow_release: + if (al) { + gfs2_inplace_release(ip); +do_grow_qunlock: + gfs2_quota_unlock(ip); +do_grow_alloc_put: + gfs2_alloc_put(ip); + } return error; } /** - * gfs2_truncatei - make a file a given size - * @ip: the inode - * @size: the size to make the file - * @truncator: function to truncate the last partial block + * gfs2_setattr_size - make a file a given size + * @inode: the inode + * @newsize: the size to make the file * - * The file size can grow, shrink, or stay the same size. + * The file size can grow, shrink, or stay the same size. This + * is called holding i_mutex and an exclusive glock on the inode + * in question. * * Returns: errno */ -int gfs2_truncatei(struct gfs2_inode *ip, u64 size) +int gfs2_setattr_size(struct inode *inode, u64 newsize) { - int error; + int ret; + u64 oldsize; - if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), S_ISREG(ip->i_inode.i_mode))) - return -EINVAL; + BUG_ON(!S_ISREG(inode->i_mode)); - if (size > ip->i_disksize) - error = do_grow(ip, size); - else if (size < ip->i_disksize) - error = do_shrink(ip, size); - else - /* update time stamps */ - error = do_touch(ip, size); + ret = inode_newsize_ok(inode, newsize); + if (ret) + return ret; - return error; + oldsize = inode->i_size; + if (newsize >= oldsize) + return do_grow(inode, newsize); + + return do_shrink(inode, oldsize, newsize); } int gfs2_truncatei_resume(struct gfs2_inode *ip) { int error; - error = trunc_dealloc(ip, ip->i_disksize); + error = trunc_dealloc(ip, i_size_read(&ip->i_inode)); if (!error) error = trunc_end(ip); return error; @@ -1269,7 +1258,7 @@ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset, shift = sdp->sd_sb.sb_bsize_shift; BUG_ON(gfs2_is_dir(ip)); - end_of_file = (ip->i_disksize + sdp->sd_sb.sb_bsize - 1) >> shift; + end_of_file = (i_size_read(&ip->i_inode) + sdp->sd_sb.sb_bsize - 1) >> shift; lblock = offset >> shift; lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift; if (lblock_stop > end_of_file) diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h index a20a5213135a..42fea03e2bd9 100644 --- a/fs/gfs2/bmap.h +++ b/fs/gfs2/bmap.h @@ -44,14 +44,16 @@ static inline void gfs2_write_calc_reserv(const struct gfs2_inode *ip, } } -int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page); -int gfs2_block_map(struct inode *inode, sector_t lblock, struct buffer_head *bh, int create); -int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen); - -int gfs2_truncatei(struct gfs2_inode *ip, u64 size); -int gfs2_truncatei_resume(struct gfs2_inode *ip); -int gfs2_file_dealloc(struct gfs2_inode *ip); -int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset, - unsigned int len); +extern int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page); +extern int gfs2_block_map(struct inode *inode, sector_t lblock, + struct buffer_head *bh, int create); +extern int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, + u64 *dblock, unsigned *extlen); +extern int gfs2_setattr_size(struct inode *inode, u64 size); +extern void gfs2_trim_blocks(struct inode *inode); +extern int gfs2_truncatei_resume(struct gfs2_inode *ip); +extern int gfs2_file_dealloc(struct gfs2_inode *ip); +extern int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset, + unsigned int len); #endif /* __BMAP_DOT_H__ */ diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c index bb7907bde3d8..6798755b3858 100644 --- a/fs/gfs2/dentry.c +++ b/fs/gfs2/dentry.c @@ -49,7 +49,7 @@ static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd) ip = GFS2_I(inode); } - if (sdp->sd_args.ar_localcaching) + if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL) goto valid; had_lock = (gfs2_glock_is_locked_by_me(dip->i_gl) != NULL); diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index b9dd88a78dd4..5c356d09c321 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -79,6 +79,9 @@ #define gfs2_disk_hash2offset(h) (((u64)(h)) >> 1) #define gfs2_dir_offset2hash(p) ((u32)(((u64)(p)) << 1)) +struct qstr gfs2_qdot __read_mostly; +struct qstr gfs2_qdotdot __read_mostly; + typedef int (*leaf_call_t) (struct gfs2_inode *dip, u32 index, u32 len, u64 leaf_no, void *data); typedef int (*gfs2_dscan_t)(const struct gfs2_dirent *dent, @@ -127,8 +130,8 @@ static int gfs2_dir_write_stuffed(struct gfs2_inode *ip, const char *buf, gfs2_trans_add_bh(ip->i_gl, dibh, 1); memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size); - if (ip->i_disksize < offset + size) - ip->i_disksize = offset + size; + if (ip->i_inode.i_size < offset + size) + i_size_write(&ip->i_inode, offset + size); ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; gfs2_dinode_out(ip, dibh->b_data); @@ -225,8 +228,8 @@ out: if (error) return error; - if (ip->i_disksize < offset + copied) - ip->i_disksize = offset + copied; + if (ip->i_inode.i_size < offset + copied) + i_size_write(&ip->i_inode, offset + copied); ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; gfs2_trans_add_bh(ip->i_gl, dibh, 1); @@ -275,12 +278,13 @@ static int gfs2_dir_read_data(struct gfs2_inode *ip, char *buf, u64 offset, unsigned int o; int copied = 0; int error = 0; + u64 disksize = i_size_read(&ip->i_inode); - if (offset >= ip->i_disksize) + if (offset >= disksize) return 0; - if (offset + size > ip->i_disksize) - size = ip->i_disksize - offset; + if (offset + size > disksize) + size = disksize - offset; if (!size) return 0; @@ -727,7 +731,7 @@ static struct gfs2_dirent *gfs2_dirent_search(struct inode *inode, unsigned hsize = 1 << ip->i_depth; unsigned index; u64 ln; - if (hsize * sizeof(u64) != ip->i_disksize) { + if (hsize * sizeof(u64) != i_size_read(inode)) { gfs2_consist_inode(ip); return ERR_PTR(-EIO); } @@ -879,7 +883,7 @@ static int dir_make_exhash(struct inode *inode) for (x = sdp->sd_hash_ptrs; x--; lp++) *lp = cpu_to_be64(bn); - dip->i_disksize = sdp->sd_sb.sb_bsize / 2; + i_size_write(inode, sdp->sd_sb.sb_bsize / 2); gfs2_add_inode_blocks(&dip->i_inode, 1); dip->i_diskflags |= GFS2_DIF_EXHASH; @@ -1057,11 +1061,12 @@ static int dir_double_exhash(struct gfs2_inode *dip) u64 *buf; u64 *from, *to; u64 block; + u64 disksize = i_size_read(&dip->i_inode); int x; int error = 0; hsize = 1 << dip->i_depth; - if (hsize * sizeof(u64) != dip->i_disksize) { + if (hsize * sizeof(u64) != disksize) { gfs2_consist_inode(dip); return -EIO; } @@ -1072,7 +1077,7 @@ static int dir_double_exhash(struct gfs2_inode *dip) if (!buf) return -ENOMEM; - for (block = dip->i_disksize >> sdp->sd_hash_bsize_shift; block--;) { + for (block = disksize >> sdp->sd_hash_bsize_shift; block--;) { error = gfs2_dir_read_data(dip, (char *)buf, block * sdp->sd_hash_bsize, sdp->sd_hash_bsize, 1); @@ -1370,7 +1375,7 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque, unsigned depth = 0; hsize = 1 << dip->i_depth; - if (hsize * sizeof(u64) != dip->i_disksize) { + if (hsize * sizeof(u64) != i_size_read(inode)) { gfs2_consist_inode(dip); return -EIO; } @@ -1784,7 +1789,7 @@ static int foreach_leaf(struct gfs2_inode *dip, leaf_call_t lc, void *data) int error = 0; hsize = 1 << dip->i_depth; - if (hsize * sizeof(u64) != dip->i_disksize) { + if (hsize * sizeof(u64) != i_size_read(&dip->i_inode)) { gfs2_consist_inode(dip); return -EIO; } diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h index 4f919440c3be..a98f644bd3df 100644 --- a/fs/gfs2/dir.h +++ b/fs/gfs2/dir.h @@ -17,23 +17,24 @@ struct inode; struct gfs2_inode; struct gfs2_inum; -struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *filename); -int gfs2_dir_check(struct inode *dir, const struct qstr *filename, - const struct gfs2_inode *ip); -int gfs2_dir_add(struct inode *inode, const struct qstr *filename, - const struct gfs2_inode *ip, unsigned int type); -int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *filename); -int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque, - filldir_t filldir); -int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename, - const struct gfs2_inode *nip, unsigned int new_type); +extern struct inode *gfs2_dir_search(struct inode *dir, + const struct qstr *filename); +extern int gfs2_dir_check(struct inode *dir, const struct qstr *filename, + const struct gfs2_inode *ip); +extern int gfs2_dir_add(struct inode *inode, const struct qstr *filename, + const struct gfs2_inode *ip, unsigned int type); +extern int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *filename); +extern int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque, + filldir_t filldir); +extern int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename, + const struct gfs2_inode *nip, unsigned int new_type); -int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip); +extern int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip); -int gfs2_diradd_alloc_required(struct inode *dir, - const struct qstr *filename); -int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block, - struct buffer_head **bhp); +extern int gfs2_diradd_alloc_required(struct inode *dir, + const struct qstr *filename); +extern int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block, + struct buffer_head **bhp); static inline u32 gfs2_disk_hash(const char *data, int len) { @@ -61,4 +62,7 @@ static inline void gfs2_qstr2dirent(const struct qstr *name, u16 reclen, struct memcpy(dent + 1, name->name, name->len); } +extern struct qstr gfs2_qdot; +extern struct qstr gfs2_qdotdot; + #endif /* __DIR_DOT_H__ */ diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c index dfe237a3f8ad..06d582732d34 100644 --- a/fs/gfs2/export.c +++ b/fs/gfs2/export.c @@ -126,16 +126,9 @@ static int gfs2_get_name(struct dentry *parent, char *name, static struct dentry *gfs2_get_parent(struct dentry *child) { - struct qstr dotdot; struct dentry *dentry; - /* - * XXX(hch): it would be a good idea to keep this around as a - * static variable. - */ - gfs2_str2qstr(&dotdot, ".."); - - dentry = d_obtain_alias(gfs2_lookupi(child->d_inode, &dotdot, 1)); + dentry = d_obtain_alias(gfs2_lookupi(child->d_inode, &gfs2_qdotdot, 1)); if (!IS_ERR(dentry)) dentry->d_op = &gfs2_dops; return dentry; diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 4edd662c8232..237ee6a940df 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -382,8 +382,10 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) rblocks = RES_DINODE + ind_blocks; if (gfs2_is_jdata(ip)) rblocks += data_blocks ? data_blocks : 1; - if (ind_blocks || data_blocks) + if (ind_blocks || data_blocks) { rblocks += RES_STATFS + RES_QUOTA; + rblocks += gfs2_rg_blocks(al); + } ret = gfs2_trans_begin(sdp, rblocks, 0); if (ret) goto out_trans_fail; @@ -491,7 +493,7 @@ static int gfs2_open(struct inode *inode, struct file *file) goto fail; if (!(file->f_flags & O_LARGEFILE) && - ip->i_disksize > MAX_NON_LFS) { + i_size_read(inode) > MAX_NON_LFS) { error = -EOVERFLOW; goto fail_gunlock; } diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 9adf8f924e08..87778857f099 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -441,6 +441,8 @@ static void state_change(struct gfs2_glock *gl, unsigned int new_state) else gfs2_glock_put_nolock(gl); } + if (held1 && held2 && list_empty(&gl->gl_holders)) + clear_bit(GLF_QUEUED, &gl->gl_flags); gl->gl_state = new_state; gl->gl_tchange = jiffies; @@ -1012,6 +1014,7 @@ fail: if (unlikely((gh->gh_flags & LM_FLAG_PRIORITY) && !insert_pt)) insert_pt = &gh2->gh_list; } + set_bit(GLF_QUEUED, &gl->gl_flags); if (likely(insert_pt == NULL)) { list_add_tail(&gh->gh_list, &gl->gl_holders); if (unlikely(gh->gh_flags & LM_FLAG_PRIORITY)) @@ -1310,10 +1313,12 @@ void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state) gfs2_glock_hold(gl); holdtime = gl->gl_tchange + gl->gl_ops->go_min_hold_time; - if (time_before(now, holdtime)) - delay = holdtime - now; - if (test_bit(GLF_REPLY_PENDING, &gl->gl_flags)) - delay = gl->gl_ops->go_min_hold_time; + if (test_bit(GLF_QUEUED, &gl->gl_flags)) { + if (time_before(now, holdtime)) + delay = holdtime - now; + if (test_bit(GLF_REPLY_PENDING, &gl->gl_flags)) + delay = gl->gl_ops->go_min_hold_time; + } spin_lock(&gl->gl_spin); handle_callback(gl, state, delay); @@ -1512,7 +1517,7 @@ static void clear_glock(struct gfs2_glock *gl) spin_unlock(&lru_lock); spin_lock(&gl->gl_spin); - if (find_first_holder(gl) == NULL && gl->gl_state != LM_ST_UNLOCKED) + if (gl->gl_state != LM_ST_UNLOCKED) handle_callback(gl, LM_ST_UNLOCKED, 0); spin_unlock(&gl->gl_spin); gfs2_glock_hold(gl); @@ -1660,6 +1665,8 @@ static const char *gflags2str(char *buf, const unsigned long *gflags) *p++ = 'I'; if (test_bit(GLF_FROZEN, gflags)) *p++ = 'F'; + if (test_bit(GLF_QUEUED, gflags)) + *p++ = 'q'; *p = 0; return buf; } @@ -1776,10 +1783,12 @@ int __init gfs2_glock_init(void) } #endif - glock_workqueue = create_workqueue("glock_workqueue"); + glock_workqueue = alloc_workqueue("glock_workqueue", WQ_RESCUER | + WQ_HIGHPRI | WQ_FREEZEABLE, 0); if (IS_ERR(glock_workqueue)) return PTR_ERR(glock_workqueue); - gfs2_delete_workqueue = create_workqueue("delete_workqueue"); + gfs2_delete_workqueue = alloc_workqueue("delete_workqueue", WQ_RESCUER | + WQ_FREEZEABLE, 0); if (IS_ERR(gfs2_delete_workqueue)) { destroy_workqueue(glock_workqueue); return PTR_ERR(gfs2_delete_workqueue); diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index 2bda1911b156..db1c26d6d220 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -215,7 +215,7 @@ void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs); void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...); /** - * gfs2_glock_nq_init - intialize a holder and enqueue it on a glock + * gfs2_glock_nq_init - initialize a holder and enqueue it on a glock * @gl: the glock * @state: the state we're requesting * @flags: the modifier flags diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 49f97d3bb690..0d149dcc04e5 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -262,13 +262,12 @@ static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl) const struct gfs2_inode *ip = gl->gl_object; if (ip == NULL) return 0; - gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%02lx d:0x%08x s:%llu/%llu\n", + gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%02lx d:0x%08x s:%llu\n", (unsigned long long)ip->i_no_formal_ino, (unsigned long long)ip->i_no_addr, IF2DT(ip->i_inode.i_mode), ip->i_flags, (unsigned int)ip->i_diskflags, - (unsigned long long)ip->i_inode.i_size, - (unsigned long long)ip->i_disksize); + (unsigned long long)i_size_read(&ip->i_inode)); return 0; } @@ -453,7 +452,6 @@ const struct gfs2_glock_operations *gfs2_glops_list[] = { [LM_TYPE_META] = &gfs2_meta_glops, [LM_TYPE_INODE] = &gfs2_inode_glops, [LM_TYPE_RGRP] = &gfs2_rgrp_glops, - [LM_TYPE_NONDISK] = &gfs2_trans_glops, [LM_TYPE_IOPEN] = &gfs2_iopen_glops, [LM_TYPE_FLOCK] = &gfs2_flock_glops, [LM_TYPE_NONDISK] = &gfs2_nondisk_glops, diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index fdbf4b366fa5..764fbb49efc8 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -196,6 +196,7 @@ enum { GLF_REPLY_PENDING = 9, GLF_INITIAL = 10, GLF_FROZEN = 11, + GLF_QUEUED = 12, }; struct gfs2_glock { @@ -267,7 +268,6 @@ struct gfs2_inode { u64 i_no_formal_ino; u64 i_generation; u64 i_eattr; - loff_t i_disksize; unsigned long i_flags; /* GIF_... */ struct gfs2_glock *i_gl; /* Move into i_gh? */ struct gfs2_holder i_iopen_gh; @@ -416,11 +416,8 @@ struct gfs2_args { char ar_locktable[GFS2_LOCKNAME_LEN]; /* Name of the Lock Table */ char ar_hostdata[GFS2_LOCKNAME_LEN]; /* Host specific data */ unsigned int ar_spectator:1; /* Don't get a journal */ - unsigned int ar_ignore_local_fs:1; /* Ignore optimisations */ unsigned int ar_localflocks:1; /* Let the VFS do flock|fcntl */ - unsigned int ar_localcaching:1; /* Local caching */ unsigned int ar_debug:1; /* Oops on errors */ - unsigned int ar_upgrade:1; /* Upgrade ondisk format */ unsigned int ar_posix_acl:1; /* Enable posix acls */ unsigned int ar_quota:2; /* off/account/on */ unsigned int ar_suiddir:1; /* suiddir support */ @@ -497,7 +494,7 @@ struct gfs2_sb_host { */ struct lm_lockstruct { - unsigned int ls_jid; + int ls_jid; unsigned int ls_first; unsigned int ls_first_done; unsigned int ls_nodir; @@ -572,6 +569,7 @@ struct gfs2_sbd { struct list_head sd_rindex_mru_list; struct gfs2_rgrpd *sd_rindex_forward; unsigned int sd_rgrps; + unsigned int sd_max_rg_data; /* Journal index stuff */ diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 08140f185a37..06370f8bd8cf 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -359,8 +359,7 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) * to do that. */ ip->i_inode.i_nlink = be32_to_cpu(str->di_nlink); - ip->i_disksize = be64_to_cpu(str->di_size); - i_size_write(&ip->i_inode, ip->i_disksize); + i_size_write(&ip->i_inode, be64_to_cpu(str->di_size)); gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks)); atime.tv_sec = be64_to_cpu(str->di_atime); atime.tv_nsec = be32_to_cpu(str->di_atime_nsec); @@ -1055,7 +1054,7 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf) str->di_uid = cpu_to_be32(ip->i_inode.i_uid); str->di_gid = cpu_to_be32(ip->i_inode.i_gid); str->di_nlink = cpu_to_be32(ip->i_inode.i_nlink); - str->di_size = cpu_to_be64(ip->i_disksize); + str->di_size = cpu_to_be64(i_size_read(&ip->i_inode)); str->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode)); str->di_atime = cpu_to_be64(ip->i_inode.i_atime.tv_sec); str->di_mtime = cpu_to_be64(ip->i_inode.i_mtime.tv_sec); @@ -1085,8 +1084,8 @@ void gfs2_dinode_print(const struct gfs2_inode *ip) (unsigned long long)ip->i_no_formal_ino); printk(KERN_INFO " no_addr = %llu\n", (unsigned long long)ip->i_no_addr); - printk(KERN_INFO " i_disksize = %llu\n", - (unsigned long long)ip->i_disksize); + printk(KERN_INFO " i_size = %llu\n", + (unsigned long long)i_size_read(&ip->i_inode)); printk(KERN_INFO " blocks = %llu\n", (unsigned long long)gfs2_get_inode_blocks(&ip->i_inode)); printk(KERN_INFO " i_goal = %llu\n", diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index 300ada3f21de..6720d7d5fbc6 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h @@ -19,6 +19,8 @@ extern int gfs2_releasepage(struct page *page, gfp_t gfp_mask); extern int gfs2_internal_read(struct gfs2_inode *ip, struct file_ra_state *ra_state, char *buf, loff_t *pos, unsigned size); +extern void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page, + unsigned int from, unsigned int to); extern void gfs2_set_aops(struct inode *inode); static inline int gfs2_is_stuffed(const struct gfs2_inode *ip) @@ -80,6 +82,19 @@ static inline void gfs2_inum_out(const struct gfs2_inode *ip, dent->de_inum.no_addr = cpu_to_be64(ip->i_no_addr); } +static inline int gfs2_check_internal_file_size(struct inode *inode, + u64 minsize, u64 maxsize) +{ + u64 size = i_size_read(inode); + if (size < minsize || size > maxsize) + goto err; + if (size & ((1 << inode->i_blkbits) - 1)) + goto err; + return 0; +err: + gfs2_consist_inode(GFS2_I(inode)); + return -EIO; +} extern void gfs2_set_iop(struct inode *inode); extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index 0e0470ed34c2..1c09425b45fd 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -42,9 +42,9 @@ static void gdlm_ast(void *arg) ret |= LM_OUT_CANCELED; goto out; case -EAGAIN: /* Try lock fails */ + case -EDEADLK: /* Deadlock detected */ goto out; - case -EINVAL: /* Invalid */ - case -ENOMEM: /* Out of memory */ + case -ETIMEDOUT: /* Canceled due to timeout */ ret |= LM_OUT_ERROR; goto out; case 0: /* Success */ diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index b1e9630eb46a..d7eb1e209aa8 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -24,6 +24,7 @@ #include "glock.h" #include "quota.h" #include "recovery.h" +#include "dir.h" static struct shrinker qd_shrinker = { .shrink = gfs2_shrink_qd_memory, @@ -78,6 +79,9 @@ static int __init init_gfs2_fs(void) { int error; + gfs2_str2qstr(&gfs2_qdot, "."); + gfs2_str2qstr(&gfs2_qdotdot, ".."); + error = gfs2_sys_init(); if (error) return error; @@ -140,7 +144,7 @@ static int __init init_gfs2_fs(void) error = -ENOMEM; gfs_recovery_wq = alloc_workqueue("gfs_recovery", - WQ_NON_REENTRANT | WQ_RESCUER, 0); + WQ_RESCUER | WQ_FREEZEABLE, 0); if (!gfs_recovery_wq) goto fail_wq; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 4d4b1e8ac64c..aeafc233dc89 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -38,14 +38,6 @@ #define DO 0 #define UNDO 1 -static const u32 gfs2_old_fs_formats[] = { - 0 -}; - -static const u32 gfs2_old_multihost_formats[] = { - 0 -}; - /** * gfs2_tune_init - Fill a gfs2_tune structure with default values * @gt: tune @@ -135,8 +127,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) static int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int silent) { - unsigned int x; - if (sb->sb_magic != GFS2_MAGIC || sb->sb_type != GFS2_METATYPE_SB) { if (!silent) @@ -150,55 +140,9 @@ static int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int sile sb->sb_multihost_format == GFS2_FORMAT_MULTI) return 0; - if (sb->sb_fs_format != GFS2_FORMAT_FS) { - for (x = 0; gfs2_old_fs_formats[x]; x++) - if (gfs2_old_fs_formats[x] == sb->sb_fs_format) - break; + fs_warn(sdp, "Unknown on-disk format, unable to mount\n"); - if (!gfs2_old_fs_formats[x]) { - printk(KERN_WARNING - "GFS2: code version (%u, %u) is incompatible " - "with ondisk format (%u, %u)\n", - GFS2_FORMAT_FS, GFS2_FORMAT_MULTI, - sb->sb_fs_format, sb->sb_multihost_format); - printk(KERN_WARNING - "GFS2: I don't know how to upgrade this FS\n"); - return -EINVAL; - } - } - - if (sb->sb_multihost_format != GFS2_FORMAT_MULTI) { - for (x = 0; gfs2_old_multihost_formats[x]; x++) - if (gfs2_old_multihost_formats[x] == - sb->sb_multihost_format) - break; - - if (!gfs2_old_multihost_formats[x]) { - printk(KERN_WARNING - "GFS2: code version (%u, %u) is incompatible " - "with ondisk format (%u, %u)\n", - GFS2_FORMAT_FS, GFS2_FORMAT_MULTI, - sb->sb_fs_format, sb->sb_multihost_format); - printk(KERN_WARNING - "GFS2: I don't know how to upgrade this FS\n"); - return -EINVAL; - } - } - - if (!sdp->sd_args.ar_upgrade) { - printk(KERN_WARNING - "GFS2: code version (%u, %u) is incompatible " - "with ondisk format (%u, %u)\n", - GFS2_FORMAT_FS, GFS2_FORMAT_MULTI, - sb->sb_fs_format, sb->sb_multihost_format); - printk(KERN_INFO - "GFS2: Use the \"upgrade\" mount option to upgrade " - "the FS\n"); - printk(KERN_INFO "GFS2: See the manual for more details\n"); - return -EINVAL; - } - - return 0; + return -EINVAL; } static void end_bio_io_page(struct bio *bio, int error) @@ -586,7 +530,7 @@ static int map_journal_extents(struct gfs2_sbd *sdp) prev_db = 0; - for (lb = 0; lb < ip->i_disksize >> sdp->sd_sb.sb_bsize_shift; lb++) { + for (lb = 0; lb < i_size_read(jd->jd_inode) >> sdp->sd_sb.sb_bsize_shift; lb++) { bh.b_state = 0; bh.b_blocknr = 0; bh.b_size = 1 << ip->i_inode.i_blkbits; @@ -1022,7 +966,6 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent) if (!strcmp("lock_nolock", proto)) { lm = &nolock_ops; sdp->sd_args.ar_localflocks = 1; - sdp->sd_args.ar_localcaching = 1; #ifdef CONFIG_GFS2_FS_LOCKING_DLM } else if (!strcmp("lock_dlm", proto)) { lm = &gfs2_dlm_ops; @@ -1113,8 +1056,6 @@ static int gfs2_journalid_wait(void *word) static int wait_on_journal(struct gfs2_sbd *sdp) { - if (sdp->sd_args.ar_spectator) - return 0; if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL) return 0; @@ -1217,6 +1158,20 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent if (error) goto fail_sb; + /* + * If user space has failed to join the cluster or some similar + * failure has occurred, then the journal id will contain a + * negative (error) number. This will then be returned to the + * caller (of the mount syscall). We do this even for spectator + * mounts (which just write a jid of 0 to indicate "ok" even though + * the jid is unused in the spectator case) + */ + if (sdp->sd_lockstruct.ls_jid < 0) { + error = sdp->sd_lockstruct.ls_jid; + sdp->sd_lockstruct.ls_jid = 0; + goto fail_sb; + } + error = init_inodes(sdp, DO); if (error) goto fail_sb; diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c index 1009be2c9737..0534510200d5 100644 --- a/fs/gfs2/ops_inode.c +++ b/fs/gfs2/ops_inode.c @@ -18,6 +18,8 @@ #include <linux/gfs2_ondisk.h> #include <linux/crc32.h> #include <linux/fiemap.h> +#include <linux/swap.h> +#include <linux/falloc.h> #include <asm/uaccess.h> #include "gfs2.h" @@ -217,7 +219,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, goto out_gunlock_q; error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + - al->al_rgd->rd_length + + gfs2_rg_blocks(al) + 2 * RES_DINODE + RES_STATFS + RES_QUOTA, 0); if (error) @@ -406,7 +408,6 @@ static int gfs2_symlink(struct inode *dir, struct dentry *dentry, ip = ghs[1].gh_gl->gl_object; - ip->i_disksize = size; i_size_write(inode, size); error = gfs2_meta_inode_buffer(ip, &dibh); @@ -461,7 +462,7 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode) ip = ghs[1].gh_gl->gl_object; ip->i_inode.i_nlink = 2; - ip->i_disksize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode); + i_size_write(inode, sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)); ip->i_diskflags |= GFS2_DIF_JDATA; ip->i_entries = 2; @@ -470,18 +471,15 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode) if (!gfs2_assert_withdraw(sdp, !error)) { struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data; struct gfs2_dirent *dent = (struct gfs2_dirent *)(di+1); - struct qstr str; - gfs2_str2qstr(&str, "."); gfs2_trans_add_bh(ip->i_gl, dibh, 1); - gfs2_qstr2dirent(&str, GFS2_DIRENT_SIZE(str.len), dent); + gfs2_qstr2dirent(&gfs2_qdot, GFS2_DIRENT_SIZE(gfs2_qdot.len), dent); dent->de_inum = di->di_num; /* already GFS2 endian */ dent->de_type = cpu_to_be16(DT_DIR); di->di_entries = cpu_to_be32(1); - gfs2_str2qstr(&str, ".."); dent = (struct gfs2_dirent *)((char*)dent + GFS2_DIRENT_SIZE(1)); - gfs2_qstr2dirent(&str, dibh->b_size - GFS2_DIRENT_SIZE(1) - sizeof(struct gfs2_dinode), dent); + gfs2_qstr2dirent(&gfs2_qdotdot, dibh->b_size - GFS2_DIRENT_SIZE(1) - sizeof(struct gfs2_dinode), dent); gfs2_inum_out(dip, dent); dent->de_type = cpu_to_be16(DT_DIR); @@ -522,7 +520,6 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode) static int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name, struct gfs2_inode *ip) { - struct qstr dotname; int error; if (ip->i_entries != 2) { @@ -539,13 +536,11 @@ static int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name, if (error) return error; - gfs2_str2qstr(&dotname, "."); - error = gfs2_dir_del(ip, &dotname); + error = gfs2_dir_del(ip, &gfs2_qdot); if (error) return error; - gfs2_str2qstr(&dotname, ".."); - error = gfs2_dir_del(ip, &dotname); + error = gfs2_dir_del(ip, &gfs2_qdotdot); if (error) return error; @@ -694,11 +689,8 @@ static int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to) struct inode *dir = &to->i_inode; struct super_block *sb = dir->i_sb; struct inode *tmp; - struct qstr dotdot; int error = 0; - gfs2_str2qstr(&dotdot, ".."); - igrab(dir); for (;;) { @@ -711,7 +703,7 @@ static int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to) break; } - tmp = gfs2_lookupi(dir, &dotdot, 1); + tmp = gfs2_lookupi(dir, &gfs2_qdotdot, 1); if (IS_ERR(tmp)) { error = PTR_ERR(tmp); break; @@ -744,7 +736,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, struct gfs2_inode *ip = GFS2_I(odentry->d_inode); struct gfs2_inode *nip = NULL; struct gfs2_sbd *sdp = GFS2_SB(odir); - struct gfs2_holder ghs[5], r_gh = { .gh_gl = NULL, }; + struct gfs2_holder ghs[5], r_gh = { .gh_gl = NULL, }, ri_gh; struct gfs2_rgrpd *nrgd; unsigned int num_gh; int dir_rename = 0; @@ -758,6 +750,9 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, return 0; } + error = gfs2_rindex_hold(sdp, &ri_gh); + if (error) + return error; if (odip != ndip) { error = gfs2_glock_nq_init(sdp->sd_rename_gl, LM_ST_EXCLUSIVE, @@ -887,12 +882,12 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, al->al_requested = sdp->sd_max_dirres; - error = gfs2_inplace_reserve(ndip); + error = gfs2_inplace_reserve_ri(ndip); if (error) goto out_gunlock_q; error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + - al->al_rgd->rd_length + + gfs2_rg_blocks(al) + 4 * RES_DINODE + 4 * RES_LEAF + RES_STATFS + RES_QUOTA + 4, 0); if (error) @@ -920,9 +915,6 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, } if (dir_rename) { - struct qstr name; - gfs2_str2qstr(&name, ".."); - error = gfs2_change_nlink(ndip, +1); if (error) goto out_end_trans; @@ -930,7 +922,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, if (error) goto out_end_trans; - error = gfs2_dir_mvino(ip, &name, ndip, DT_DIR); + error = gfs2_dir_mvino(ip, &gfs2_qdotdot, ndip, DT_DIR); if (error) goto out_end_trans; } else { @@ -972,6 +964,7 @@ out_gunlock_r: if (r_gh.gh_gl) gfs2_glock_dq_uninit(&r_gh); out: + gfs2_glock_dq_uninit(&ri_gh); return error; } @@ -990,7 +983,7 @@ static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd) struct gfs2_inode *ip = GFS2_I(dentry->d_inode); struct gfs2_holder i_gh; struct buffer_head *dibh; - unsigned int x; + unsigned int x, size; char *buf; int error; @@ -1002,7 +995,8 @@ static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd) return NULL; } - if (!ip->i_disksize) { + size = (unsigned int)i_size_read(&ip->i_inode); + if (size == 0) { gfs2_consist_inode(ip); buf = ERR_PTR(-EIO); goto out; @@ -1014,7 +1008,7 @@ static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd) goto out; } - x = ip->i_disksize + 1; + x = size + 1; buf = kmalloc(x, GFP_NOFS); if (!buf) buf = ERR_PTR(-ENOMEM); @@ -1071,30 +1065,6 @@ int gfs2_permission(struct inode *inode, int mask) return error; } -/* - * XXX(truncate): the truncate_setsize calls should be moved to the end. - */ -static int setattr_size(struct inode *inode, struct iattr *attr) -{ - struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_sbd *sdp = GFS2_SB(inode); - int error; - - if (attr->ia_size != ip->i_disksize) { - error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks); - if (error) - return error; - truncate_setsize(inode, attr->ia_size); - gfs2_trans_end(sdp); - } - - error = gfs2_truncatei(ip, attr->ia_size); - if (error && (inode->i_size != ip->i_disksize)) - i_size_write(inode, ip->i_disksize); - - return error; -} - static int setattr_chown(struct inode *inode, struct iattr *attr) { struct gfs2_inode *ip = GFS2_I(inode); @@ -1195,7 +1165,7 @@ static int gfs2_setattr(struct dentry *dentry, struct iattr *attr) goto out; if (attr->ia_valid & ATTR_SIZE) - error = setattr_size(inode, attr); + error = gfs2_setattr_size(inode, attr->ia_size); else if (attr->ia_valid & (ATTR_UID | ATTR_GID)) error = setattr_chown(inode, attr); else if ((attr->ia_valid & ATTR_MODE) && IS_POSIXACL(inode)) @@ -1301,6 +1271,257 @@ static int gfs2_removexattr(struct dentry *dentry, const char *name) return ret; } +static void empty_write_end(struct page *page, unsigned from, + unsigned to) +{ + struct gfs2_inode *ip = GFS2_I(page->mapping->host); + + page_zero_new_buffers(page, from, to); + flush_dcache_page(page); + mark_page_accessed(page); + + if (!gfs2_is_writeback(ip)) + gfs2_page_add_databufs(ip, page, from, to); + + block_commit_write(page, from, to); +} + + +static int write_empty_blocks(struct page *page, unsigned from, unsigned to) +{ + unsigned start, end, next; + struct buffer_head *bh, *head; + int error; + + if (!page_has_buffers(page)) { + error = block_prepare_write(page, from, to, gfs2_block_map); + if (unlikely(error)) + return error; + + empty_write_end(page, from, to); + return 0; + } + + bh = head = page_buffers(page); + next = end = 0; + while (next < from) { + next += bh->b_size; + bh = bh->b_this_page; + } + start = next; + do { + next += bh->b_size; + if (buffer_mapped(bh)) { + if (end) { + error = block_prepare_write(page, start, end, + gfs2_block_map); + if (unlikely(error)) + return error; + empty_write_end(page, start, end); + end = 0; + } + start = next; + } + else + end = next; + bh = bh->b_this_page; + } while (next < to); + + if (end) { + error = block_prepare_write(page, start, end, gfs2_block_map); + if (unlikely(error)) + return error; + empty_write_end(page, start, end); + } + + return 0; +} + +static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len, + int mode) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct buffer_head *dibh; + int error; + u64 start = offset >> PAGE_CACHE_SHIFT; + unsigned int start_offset = offset & ~PAGE_CACHE_MASK; + u64 end = (offset + len - 1) >> PAGE_CACHE_SHIFT; + pgoff_t curr; + struct page *page; + unsigned int end_offset = (offset + len) & ~PAGE_CACHE_MASK; + unsigned int from, to; + + if (!end_offset) + end_offset = PAGE_CACHE_SIZE; + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (unlikely(error)) + goto out; + + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + + if (gfs2_is_stuffed(ip)) { + error = gfs2_unstuff_dinode(ip, NULL); + if (unlikely(error)) + goto out; + } + + curr = start; + offset = start << PAGE_CACHE_SHIFT; + from = start_offset; + to = PAGE_CACHE_SIZE; + while (curr <= end) { + page = grab_cache_page_write_begin(inode->i_mapping, curr, + AOP_FLAG_NOFS); + if (unlikely(!page)) { + error = -ENOMEM; + goto out; + } + + if (curr == end) + to = end_offset; + error = write_empty_blocks(page, from, to); + if (!error && offset + to > inode->i_size && + !(mode & FALLOC_FL_KEEP_SIZE)) { + i_size_write(inode, offset + to); + } + unlock_page(page); + page_cache_release(page); + if (error) + goto out; + curr++; + offset += PAGE_CACHE_SIZE; + from = 0; + } + + gfs2_dinode_out(ip, dibh->b_data); + mark_inode_dirty(inode); + + brelse(dibh); + +out: + return error; +} + +static void calc_max_reserv(struct gfs2_inode *ip, loff_t max, loff_t *len, + unsigned int *data_blocks, unsigned int *ind_blocks) +{ + const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + unsigned int max_blocks = ip->i_alloc->al_rgd->rd_free_clone; + unsigned int tmp, max_data = max_blocks - 3 * (sdp->sd_max_height - 1); + + for (tmp = max_data; tmp > sdp->sd_diptrs;) { + tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs); + max_data -= tmp; + } + /* This calculation isn't the exact reverse of gfs2_write_calc_reserve, + so it might end up with fewer data blocks */ + if (max_data <= *data_blocks) + return; + *data_blocks = max_data; + *ind_blocks = max_blocks - max_data; + *len = ((loff_t)max_data - 3) << sdp->sd_sb.sb_bsize_shift; + if (*len > max) { + *len = max; + gfs2_write_calc_reserv(ip, max, data_blocks, ind_blocks); + } +} + +static long gfs2_fallocate(struct inode *inode, int mode, loff_t offset, + loff_t len) +{ + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct gfs2_inode *ip = GFS2_I(inode); + unsigned int data_blocks = 0, ind_blocks = 0, rblocks; + loff_t bytes, max_bytes; + struct gfs2_alloc *al; + int error; + loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift; + next = (next + 1) << sdp->sd_sb.sb_bsize_shift; + + offset = (offset >> sdp->sd_sb.sb_bsize_shift) << + sdp->sd_sb.sb_bsize_shift; + + len = next - offset; + bytes = sdp->sd_max_rg_data * sdp->sd_sb.sb_bsize / 2; + if (!bytes) + bytes = UINT_MAX; + + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh); + error = gfs2_glock_nq(&ip->i_gh); + if (unlikely(error)) + goto out_uninit; + + if (!gfs2_write_alloc_required(ip, offset, len)) + goto out_unlock; + + while (len > 0) { + if (len < bytes) + bytes = len; + al = gfs2_alloc_get(ip); + if (!al) { + error = -ENOMEM; + goto out_unlock; + } + + error = gfs2_quota_lock_check(ip); + if (error) + goto out_alloc_put; + +retry: + gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks); + + al->al_requested = data_blocks + ind_blocks; + error = gfs2_inplace_reserve(ip); + if (error) { + if (error == -ENOSPC && bytes > sdp->sd_sb.sb_bsize) { + bytes >>= 1; + goto retry; + } + goto out_qunlock; + } + max_bytes = bytes; + calc_max_reserv(ip, len, &max_bytes, &data_blocks, &ind_blocks); + al->al_requested = data_blocks + ind_blocks; + + rblocks = RES_DINODE + ind_blocks + RES_STATFS + RES_QUOTA + + RES_RG_HDR + gfs2_rg_blocks(al); + if (gfs2_is_jdata(ip)) + rblocks += data_blocks ? data_blocks : 1; + + error = gfs2_trans_begin(sdp, rblocks, + PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize); + if (error) + goto out_trans_fail; + + error = fallocate_chunk(inode, offset, max_bytes, mode); + gfs2_trans_end(sdp); + + if (error) + goto out_trans_fail; + + len -= max_bytes; + offset += max_bytes; + gfs2_inplace_release(ip); + gfs2_quota_unlock(ip); + gfs2_alloc_put(ip); + } + goto out_unlock; + +out_trans_fail: + gfs2_inplace_release(ip); +out_qunlock: + gfs2_quota_unlock(ip); +out_alloc_put: + gfs2_alloc_put(ip); +out_unlock: + gfs2_glock_dq(&ip->i_gh); +out_uninit: + gfs2_holder_uninit(&ip->i_gh); + return error; +} + + static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { @@ -1351,6 +1572,7 @@ const struct inode_operations gfs2_file_iops = { .getxattr = gfs2_getxattr, .listxattr = gfs2_listxattr, .removexattr = gfs2_removexattr, + .fallocate = gfs2_fallocate, .fiemap = gfs2_fiemap, }; diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 1bc6b5695e6d..58a9b9998b42 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -735,10 +735,8 @@ get_a_page: goto out; size = loc + sizeof(struct gfs2_quota); - if (size > inode->i_size) { - ip->i_disksize = size; + if (size > inode->i_size) i_size_write(inode, size); - } inode->i_mtime = inode->i_atime = CURRENT_TIME; gfs2_trans_add_bh(ip->i_gl, dibh, 1); gfs2_dinode_out(ip, dibh->b_data); @@ -817,7 +815,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda) goto out_alloc; if (nalloc) - blocks += al->al_rgd->rd_length + nalloc * ind_blocks + RES_STATFS; + blocks += gfs2_rg_blocks(al) + nalloc * ind_blocks + RES_STATFS; error = gfs2_trans_begin(sdp, blocks, 0); if (error) @@ -1190,18 +1188,17 @@ static void gfs2_quota_change_in(struct gfs2_quota_change_host *qc, const void * int gfs2_quota_init(struct gfs2_sbd *sdp) { struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode); - unsigned int blocks = ip->i_disksize >> sdp->sd_sb.sb_bsize_shift; + u64 size = i_size_read(sdp->sd_qc_inode); + unsigned int blocks = size >> sdp->sd_sb.sb_bsize_shift; unsigned int x, slot = 0; unsigned int found = 0; u64 dblock; u32 extlen = 0; int error; - if (!ip->i_disksize || ip->i_disksize > (64 << 20) || - ip->i_disksize & (sdp->sd_sb.sb_bsize - 1)) { - gfs2_consist_inode(ip); + if (gfs2_check_internal_file_size(sdp->sd_qc_inode, 1, 64 << 20)) return -EIO; - } + sdp->sd_quota_slots = blocks * sdp->sd_qc_per_block; sdp->sd_quota_chunks = DIV_ROUND_UP(sdp->sd_quota_slots, 8 * PAGE_SIZE); @@ -1589,6 +1586,7 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id, error = gfs2_inplace_reserve(ip); if (error) goto out_alloc; + blocks += gfs2_rg_blocks(al); } error = gfs2_trans_begin(sdp, blocks + RES_DINODE + 1, 0); diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index f7f89a94a5a4..f2a02edcac8f 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -455,11 +455,13 @@ void gfs2_recover_func(struct work_struct *work) int ro = 0; unsigned int pass; int error; + int jlocked = 0; - if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) { + if (sdp->sd_args.ar_spectator || + (jd->jd_jid != sdp->sd_lockstruct.ls_jid)) { fs_info(sdp, "jid=%u: Trying to acquire journal lock...\n", jd->jd_jid); - + jlocked = 1; /* Acquire the journal lock so we can do recovery */ error = gfs2_glock_nq_num(sdp, jd->jd_jid, &gfs2_journal_glops, @@ -554,13 +556,12 @@ void gfs2_recover_func(struct work_struct *work) jd->jd_jid, t); } - if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) - gfs2_glock_dq_uninit(&ji_gh); - gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_SUCCESS); - if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) + if (jlocked) { + gfs2_glock_dq_uninit(&ji_gh); gfs2_glock_dq_uninit(&j_gh); + } fs_info(sdp, "jid=%u: Done\n", jd->jd_jid); goto done; @@ -568,7 +569,7 @@ void gfs2_recover_func(struct work_struct *work) fail_gunlock_tr: gfs2_glock_dq_uninit(&t_gh); fail_gunlock_ji: - if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) { + if (jlocked) { gfs2_glock_dq_uninit(&ji_gh); fail_gunlock_j: gfs2_glock_dq_uninit(&j_gh); diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 171a744f8e45..fb67f593f408 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -500,7 +500,7 @@ u64 gfs2_ri_total(struct gfs2_sbd *sdp) for (rgrps = 0;; rgrps++) { loff_t pos = rgrps * sizeof(struct gfs2_rindex); - if (pos + sizeof(struct gfs2_rindex) >= ip->i_disksize) + if (pos + sizeof(struct gfs2_rindex) >= i_size_read(inode)) break; error = gfs2_internal_read(ip, &ra_state, buf, &pos, sizeof(struct gfs2_rindex)); @@ -588,7 +588,9 @@ static int gfs2_ri_update(struct gfs2_inode *ip) struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct inode *inode = &ip->i_inode; struct file_ra_state ra_state; - u64 rgrp_count = ip->i_disksize; + u64 rgrp_count = i_size_read(inode); + struct gfs2_rgrpd *rgd; + unsigned int max_data = 0; int error; do_div(rgrp_count, sizeof(struct gfs2_rindex)); @@ -603,6 +605,10 @@ static int gfs2_ri_update(struct gfs2_inode *ip) } } + list_for_each_entry(rgd, &sdp->sd_rindex_list, rd_list) + if (rgd->rd_data > max_data) + max_data = rgd->rd_data; + sdp->sd_max_rg_data = max_data; sdp->sd_rindex_uptodate = 1; return 0; } @@ -622,13 +628,15 @@ static int gfs2_ri_update_special(struct gfs2_inode *ip) struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct inode *inode = &ip->i_inode; struct file_ra_state ra_state; + struct gfs2_rgrpd *rgd; + unsigned int max_data = 0; int error; file_ra_state_init(&ra_state, inode->i_mapping); for (sdp->sd_rgrps = 0;; sdp->sd_rgrps++) { /* Ignore partials */ if ((sdp->sd_rgrps + 1) * sizeof(struct gfs2_rindex) > - ip->i_disksize) + i_size_read(inode)) break; error = read_rindex_entry(ip, &ra_state); if (error) { @@ -636,6 +644,10 @@ static int gfs2_ri_update_special(struct gfs2_inode *ip) return error; } } + list_for_each_entry(rgd, &sdp->sd_rindex_list, rd_list) + if (rgd->rd_data > max_data) + max_data = rgd->rd_data; + sdp->sd_max_rg_data = max_data; sdp->sd_rindex_uptodate = 1; return 0; @@ -1188,7 +1200,8 @@ out: * Returns: errno */ -int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line) +int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex, + char *file, unsigned int line) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_alloc *al = ip->i_alloc; @@ -1199,12 +1212,15 @@ int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line) return -EINVAL; try_again: - /* We need to hold the rindex unless the inode we're using is - the rindex itself, in which case it's already held. */ - if (ip != GFS2_I(sdp->sd_rindex)) - error = gfs2_rindex_hold(sdp, &al->al_ri_gh); - else if (!sdp->sd_rgrps) /* We may not have the rindex read in, so: */ - error = gfs2_ri_update_special(ip); + if (hold_rindex) { + /* We need to hold the rindex unless the inode we're using is + the rindex itself, in which case it's already held. */ + if (ip != GFS2_I(sdp->sd_rindex)) + error = gfs2_rindex_hold(sdp, &al->al_ri_gh); + else if (!sdp->sd_rgrps) /* We may not have the rindex read + in, so: */ + error = gfs2_ri_update_special(ip); + } if (error) return error; @@ -1215,7 +1231,7 @@ try_again: try to free it, and try the allocation again. */ error = get_local_rgrp(ip, &unlinked, &last_unlinked); if (error) { - if (ip != GFS2_I(sdp->sd_rindex)) + if (hold_rindex && ip != GFS2_I(sdp->sd_rindex)) gfs2_glock_dq_uninit(&al->al_ri_gh); if (error != -EAGAIN) return error; @@ -1257,7 +1273,7 @@ void gfs2_inplace_release(struct gfs2_inode *ip) al->al_rgd = NULL; if (al->al_rgd_gh.gh_gl) gfs2_glock_dq_uninit(&al->al_rgd_gh); - if (ip != GFS2_I(sdp->sd_rindex)) + if (ip != GFS2_I(sdp->sd_rindex) && al->al_ri_gh.gh_gl) gfs2_glock_dq_uninit(&al->al_ri_gh); } @@ -1496,11 +1512,19 @@ int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n) struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct buffer_head *dibh; struct gfs2_alloc *al = ip->i_alloc; - struct gfs2_rgrpd *rgd = al->al_rgd; + struct gfs2_rgrpd *rgd; u32 goal, blk; u64 block; int error; + /* Only happens if there is a bug in gfs2, return something distinctive + * to ensure that it is noticed. + */ + if (al == NULL) + return -ECANCELED; + + rgd = al->al_rgd; + if (rgrp_contains_block(rgd, ip->i_goal)) goal = ip->i_goal - rgd->rd_data0; else diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h index f07119d89557..0e35c0466f9a 100644 --- a/fs/gfs2/rgrp.h +++ b/fs/gfs2/rgrp.h @@ -39,10 +39,12 @@ static inline void gfs2_alloc_put(struct gfs2_inode *ip) ip->i_alloc = NULL; } -extern int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, - unsigned int line); +extern int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex, + char *file, unsigned int line); #define gfs2_inplace_reserve(ip) \ -gfs2_inplace_reserve_i((ip), __FILE__, __LINE__) + gfs2_inplace_reserve_i((ip), 1, __FILE__, __LINE__) +#define gfs2_inplace_reserve_ri(ip) \ + gfs2_inplace_reserve_i((ip), 0, __FILE__, __LINE__) extern void gfs2_inplace_release(struct gfs2_inode *ip); diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 77cb9f830ee4..047d1176096c 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -85,6 +85,7 @@ static const match_table_t tokens = { {Opt_locktable, "locktable=%s"}, {Opt_hostdata, "hostdata=%s"}, {Opt_spectator, "spectator"}, + {Opt_spectator, "norecovery"}, {Opt_ignore_local_fs, "ignore_local_fs"}, {Opt_localflocks, "localflocks"}, {Opt_localcaching, "localcaching"}, @@ -159,13 +160,13 @@ int gfs2_mount_args(struct gfs2_args *args, char *options) args->ar_spectator = 1; break; case Opt_ignore_local_fs: - args->ar_ignore_local_fs = 1; + /* Retained for backwards compat only */ break; case Opt_localflocks: args->ar_localflocks = 1; break; case Opt_localcaching: - args->ar_localcaching = 1; + /* Retained for backwards compat only */ break; case Opt_debug: if (args->ar_errors == GFS2_ERRORS_PANIC) { @@ -179,7 +180,7 @@ int gfs2_mount_args(struct gfs2_args *args, char *options) args->ar_debug = 0; break; case Opt_upgrade: - args->ar_upgrade = 1; + /* Retained for backwards compat only */ break; case Opt_acl: args->ar_posix_acl = 1; @@ -342,15 +343,14 @@ int gfs2_jdesc_check(struct gfs2_jdesc *jd) { struct gfs2_inode *ip = GFS2_I(jd->jd_inode); struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + u64 size = i_size_read(jd->jd_inode); - if (ip->i_disksize < (8 << 20) || ip->i_disksize > (1 << 30) || - (ip->i_disksize & (sdp->sd_sb.sb_bsize - 1))) { - gfs2_consist_inode(ip); + if (gfs2_check_internal_file_size(jd->jd_inode, 8 << 20, 1 << 30)) return -EIO; - } - jd->jd_blocks = ip->i_disksize >> sdp->sd_sb.sb_bsize_shift; - if (gfs2_write_alloc_required(ip, 0, ip->i_disksize)) { + jd->jd_blocks = size >> sdp->sd_sb.sb_bsize_shift; + + if (gfs2_write_alloc_required(ip, 0, size)) { gfs2_consist_inode(ip); return -EIO; } @@ -1129,9 +1129,7 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data) /* Some flags must not be changed */ if (args_neq(&args, &sdp->sd_args, spectator) || - args_neq(&args, &sdp->sd_args, ignore_local_fs) || args_neq(&args, &sdp->sd_args, localflocks) || - args_neq(&args, &sdp->sd_args, localcaching) || args_neq(&args, &sdp->sd_args, meta)) return -EINVAL; @@ -1234,16 +1232,10 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt) seq_printf(s, ",hostdata=%s", args->ar_hostdata); if (args->ar_spectator) seq_printf(s, ",spectator"); - if (args->ar_ignore_local_fs) - seq_printf(s, ",ignore_local_fs"); if (args->ar_localflocks) seq_printf(s, ",localflocks"); - if (args->ar_localcaching) - seq_printf(s, ",localcaching"); if (args->ar_debug) seq_printf(s, ",debug"); - if (args->ar_upgrade) - seq_printf(s, ",upgrade"); if (args->ar_posix_acl) seq_printf(s, ",acl"); if (args->ar_quota != GFS2_QUOTA_DEFAULT) { diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index ccacffd2faaa..748ccb557c18 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -230,7 +230,10 @@ static ssize_t demote_rq_store(struct gfs2_sbd *sdp, const char *buf, size_t len if (gltype > LM_TYPE_JOURNAL) return -EINVAL; - glops = gfs2_glops_list[gltype]; + if (gltype == LM_TYPE_NONDISK && glnum == GFS2_TRANS_LOCK) + glops = &gfs2_trans_glops; + else + glops = gfs2_glops_list[gltype]; if (glops == NULL) return -EINVAL; if (!test_and_set_bit(SDF_DEMOTE, &sdp->sd_flags)) @@ -399,31 +402,32 @@ static ssize_t recover_status_show(struct gfs2_sbd *sdp, char *buf) static ssize_t jid_show(struct gfs2_sbd *sdp, char *buf) { - return sprintf(buf, "%u\n", sdp->sd_lockstruct.ls_jid); + return sprintf(buf, "%d\n", sdp->sd_lockstruct.ls_jid); } static ssize_t jid_store(struct gfs2_sbd *sdp, const char *buf, size_t len) { - unsigned jid; + int jid; int rv; - rv = sscanf(buf, "%u", &jid); + rv = sscanf(buf, "%d", &jid); if (rv != 1) return -EINVAL; spin_lock(&sdp->sd_jindex_spin); rv = -EINVAL; - if (sdp->sd_args.ar_spectator) - goto out; if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL) goto out; rv = -EBUSY; - if (test_and_clear_bit(SDF_NOJOURNALID, &sdp->sd_flags) == 0) + if (test_bit(SDF_NOJOURNALID, &sdp->sd_flags) == 0) goto out; + rv = 0; + if (sdp->sd_args.ar_spectator && jid > 0) + rv = jid = -EINVAL; sdp->sd_lockstruct.ls_jid = jid; + clear_bit(SDF_NOJOURNALID, &sdp->sd_flags); smp_mb__after_clear_bit(); wake_up_bit(&sdp->sd_flags, SDF_NOJOURNALID); - rv = 0; out: spin_unlock(&sdp->sd_jindex_spin); return rv ? rv : len; @@ -617,7 +621,7 @@ static int gfs2_uevent(struct kset *kset, struct kobject *kobj, add_uevent_var(env, "LOCKTABLE=%s", sdp->sd_table_name); add_uevent_var(env, "LOCKPROTO=%s", sdp->sd_proto_name); if (!test_bit(SDF_NOJOURNALID, &sdp->sd_flags)) - add_uevent_var(env, "JOURNALID=%u", sdp->sd_lockstruct.ls_jid); + add_uevent_var(env, "JOURNALID=%d", sdp->sd_lockstruct.ls_jid); if (gfs2_uuid_valid(uuid)) add_uevent_var(env, "UUID=%pUB", uuid); return 0; diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h index 148d55c14171..cedb0bb96d96 100644 --- a/fs/gfs2/trace_gfs2.h +++ b/fs/gfs2/trace_gfs2.h @@ -39,7 +39,8 @@ {(1UL << GLF_INVALIDATE_IN_PROGRESS), "i" }, \ {(1UL << GLF_REPLY_PENDING), "r" }, \ {(1UL << GLF_INITIAL), "I" }, \ - {(1UL << GLF_FROZEN), "F" }) + {(1UL << GLF_FROZEN), "F" }, \ + {(1UL << GLF_QUEUED), "q" }) #ifndef NUMPTY #define NUMPTY diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h index edf9d4bd908e..fb56b783e028 100644 --- a/fs/gfs2/trans.h +++ b/fs/gfs2/trans.h @@ -20,11 +20,20 @@ struct gfs2_glock; #define RES_JDATA 1 #define RES_DATA 1 #define RES_LEAF 1 +#define RES_RG_HDR 1 #define RES_RG_BIT 2 #define RES_EATTR 1 #define RES_STATFS 1 #define RES_QUOTA 2 +/* reserve either the number of blocks to be allocated plus the rg header + * block, or all of the blocks in the rg, whichever is smaller */ +static inline unsigned int gfs2_rg_blocks(const struct gfs2_alloc *al) +{ + return (al->al_requested < al->al_rgd->rd_length)? + al->al_requested + 1 : al->al_rgd->rd_length; +} + int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, unsigned int revokes); diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index 776af6eb4bcb..30b58f07c8a6 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -734,7 +734,7 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er, goto out_gunlock_q; error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), - blks + al->al_rgd->rd_length + + blks + gfs2_rg_blocks(al) + RES_DINODE + RES_STATFS + RES_QUOTA, 0); if (error) goto out_ipres; diff --git a/fs/hfs/bfind.c b/fs/hfs/bfind.c index 4129cdb3f0d8..571abe97b42a 100644 --- a/fs/hfs/bfind.c +++ b/fs/hfs/bfind.c @@ -23,7 +23,7 @@ int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd) fd->search_key = ptr; fd->key = ptr + tree->max_key_len + 2; dprint(DBG_BNODE_REFS, "find_init: %d (%p)\n", tree->cnid, __builtin_return_address(0)); - down(&tree->tree_lock); + mutex_lock(&tree->tree_lock); return 0; } @@ -32,7 +32,7 @@ void hfs_find_exit(struct hfs_find_data *fd) hfs_bnode_put(fd->bnode); kfree(fd->search_key); dprint(DBG_BNODE_REFS, "find_exit: %d (%p)\n", fd->tree->cnid, __builtin_return_address(0)); - up(&fd->tree->tree_lock); + mutex_unlock(&fd->tree->tree_lock); fd->tree = NULL; } diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c index 38a0a9917d7f..3ebc437736fe 100644 --- a/fs/hfs/btree.c +++ b/fs/hfs/btree.c @@ -27,7 +27,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke if (!tree) return NULL; - init_MUTEX(&tree->tree_lock); + mutex_init(&tree->tree_lock); spin_lock_init(&tree->hash_lock); /* Set the correct compare function */ tree->sb = sb; diff --git a/fs/hfs/btree.h b/fs/hfs/btree.h index cc51905ac21d..2a1d712f85dc 100644 --- a/fs/hfs/btree.h +++ b/fs/hfs/btree.h @@ -33,7 +33,7 @@ struct hfs_btree { unsigned int depth; //unsigned int map1_size, map_size; - struct semaphore tree_lock; + struct mutex tree_lock; unsigned int pages_per_bnode; spinlock_t hash_lock; diff --git a/fs/hfsplus/bfind.c b/fs/hfsplus/bfind.c index 5007a41f1be9..d182438c7ae4 100644 --- a/fs/hfsplus/bfind.c +++ b/fs/hfsplus/bfind.c @@ -23,7 +23,7 @@ int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd) fd->search_key = ptr; fd->key = ptr + tree->max_key_len + 2; dprint(DBG_BNODE_REFS, "find_init: %d (%p)\n", tree->cnid, __builtin_return_address(0)); - down(&tree->tree_lock); + mutex_lock(&tree->tree_lock); return 0; } @@ -32,7 +32,7 @@ void hfs_find_exit(struct hfs_find_data *fd) hfs_bnode_put(fd->bnode); kfree(fd->search_key); dprint(DBG_BNODE_REFS, "find_exit: %d (%p)\n", fd->tree->cnid, __builtin_return_address(0)); - up(&fd->tree->tree_lock); + mutex_unlock(&fd->tree->tree_lock); fd->tree = NULL; } @@ -52,6 +52,10 @@ int __hfs_brec_find(struct hfs_bnode *bnode, struct hfs_find_data *fd) rec = (e + b) / 2; len = hfs_brec_lenoff(bnode, rec, &off); keylen = hfs_brec_keylen(bnode, rec); + if (keylen == 0) { + res = -EINVAL; + goto fail; + } hfs_bnode_read(bnode, fd->key, off, keylen); cmpval = bnode->tree->keycmp(fd->key, fd->search_key); if (!cmpval) { @@ -67,6 +71,10 @@ int __hfs_brec_find(struct hfs_bnode *bnode, struct hfs_find_data *fd) if (rec != e && e >= 0) { len = hfs_brec_lenoff(bnode, e, &off); keylen = hfs_brec_keylen(bnode, e); + if (keylen == 0) { + res = -EINVAL; + goto fail; + } hfs_bnode_read(bnode, fd->key, off, keylen); } done: @@ -75,6 +83,7 @@ done: fd->keylength = keylen; fd->entryoffset = off + keylen; fd->entrylength = len - keylen; +fail: return res; } @@ -198,6 +207,10 @@ int hfs_brec_goto(struct hfs_find_data *fd, int cnt) len = hfs_brec_lenoff(bnode, fd->record, &off); keylen = hfs_brec_keylen(bnode, fd->record); + if (keylen == 0) { + res = -EINVAL; + goto out; + } fd->keyoffset = off; fd->keylength = keylen; fd->entryoffset = off + keylen; diff --git a/fs/hfsplus/bitmap.c b/fs/hfsplus/bitmap.c index ea30afc2a03c..ad57f5991eb1 100644 --- a/fs/hfsplus/bitmap.c +++ b/fs/hfsplus/bitmap.c @@ -17,6 +17,7 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, u32 offset, u32 *max) { + struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); struct page *page; struct address_space *mapping; __be32 *pptr, *curr, *end; @@ -29,8 +30,8 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, u32 offset, u32 *ma return size; dprint(DBG_BITMAP, "block_allocate: %u,%u,%u\n", size, offset, len); - mutex_lock(&HFSPLUS_SB(sb).alloc_file->i_mutex); - mapping = HFSPLUS_SB(sb).alloc_file->i_mapping; + mutex_lock(&sbi->alloc_mutex); + mapping = sbi->alloc_file->i_mapping; page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS, NULL); if (IS_ERR(page)) { start = size; @@ -150,16 +151,17 @@ done: set_page_dirty(page); kunmap(page); *max = offset + (curr - pptr) * 32 + i - start; - HFSPLUS_SB(sb).free_blocks -= *max; + sbi->free_blocks -= *max; sb->s_dirt = 1; dprint(DBG_BITMAP, "-> %u,%u\n", start, *max); out: - mutex_unlock(&HFSPLUS_SB(sb).alloc_file->i_mutex); + mutex_unlock(&sbi->alloc_mutex); return start; } int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count) { + struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); struct page *page; struct address_space *mapping; __be32 *pptr, *curr, *end; @@ -172,11 +174,11 @@ int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count) dprint(DBG_BITMAP, "block_free: %u,%u\n", offset, count); /* are all of the bits in range? */ - if ((offset + count) > HFSPLUS_SB(sb).total_blocks) + if ((offset + count) > sbi->total_blocks) return -2; - mutex_lock(&HFSPLUS_SB(sb).alloc_file->i_mutex); - mapping = HFSPLUS_SB(sb).alloc_file->i_mapping; + mutex_lock(&sbi->alloc_mutex); + mapping = sbi->alloc_file->i_mapping; pnr = offset / PAGE_CACHE_BITS; page = read_mapping_page(mapping, pnr, NULL); pptr = kmap(page); @@ -224,9 +226,9 @@ done: out: set_page_dirty(page); kunmap(page); - HFSPLUS_SB(sb).free_blocks += len; + sbi->free_blocks += len; sb->s_dirt = 1; - mutex_unlock(&HFSPLUS_SB(sb).alloc_file->i_mutex); + mutex_unlock(&sbi->alloc_mutex); return 0; } diff --git a/fs/hfsplus/brec.c b/fs/hfsplus/brec.c index c88e5d72a402..2f39d05443e1 100644 --- a/fs/hfsplus/brec.c +++ b/fs/hfsplus/brec.c @@ -42,10 +42,13 @@ u16 hfs_brec_keylen(struct hfs_bnode *node, u16 rec) recoff = hfs_bnode_read_u16(node, node->tree->node_size - (rec + 1) * 2); if (!recoff) return 0; - if (node->tree->attributes & HFS_TREE_BIGKEYS) - retval = hfs_bnode_read_u16(node, recoff) + 2; - else - retval = (hfs_bnode_read_u8(node, recoff) | 1) + 1; + + retval = hfs_bnode_read_u16(node, recoff) + 2; + if (retval > node->tree->max_key_len + 2) { + printk(KERN_ERR "hfs: keylen %d too large\n", + retval); + retval = 0; + } } return retval; } @@ -216,7 +219,7 @@ skip: static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd) { struct hfs_btree *tree; - struct hfs_bnode *node, *new_node; + struct hfs_bnode *node, *new_node, *next_node; struct hfs_bnode_desc node_desc; int num_recs, new_rec_off, new_off, old_rec_off; int data_start, data_end, size; @@ -235,6 +238,17 @@ static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd) new_node->type = node->type; new_node->height = node->height; + if (node->next) + next_node = hfs_bnode_find(tree, node->next); + else + next_node = NULL; + + if (IS_ERR(next_node)) { + hfs_bnode_put(node); + hfs_bnode_put(new_node); + return next_node; + } + size = tree->node_size / 2 - node->num_recs * 2 - 14; old_rec_off = tree->node_size - 4; num_recs = 1; @@ -248,6 +262,8 @@ static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd) /* panic? */ hfs_bnode_put(node); hfs_bnode_put(new_node); + if (next_node) + hfs_bnode_put(next_node); return ERR_PTR(-ENOSPC); } @@ -302,8 +318,7 @@ static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd) hfs_bnode_write(node, &node_desc, 0, sizeof(node_desc)); /* update next bnode header */ - if (new_node->next) { - struct hfs_bnode *next_node = hfs_bnode_find(tree, new_node->next); + if (next_node) { next_node->prev = new_node->this; hfs_bnode_read(next_node, &node_desc, 0, sizeof(node_desc)); node_desc.prev = cpu_to_be32(next_node->prev); diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c index e49fcee1e293..22e4d4e32999 100644 --- a/fs/hfsplus/btree.c +++ b/fs/hfsplus/btree.c @@ -30,7 +30,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id) if (!tree) return NULL; - init_MUTEX(&tree->tree_lock); + mutex_init(&tree->tree_lock); spin_lock_init(&tree->hash_lock); tree->sb = sb; tree->cnid = id; @@ -39,10 +39,16 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id) goto free_tree; tree->inode = inode; + if (!HFSPLUS_I(tree->inode)->first_blocks) { + printk(KERN_ERR + "hfs: invalid btree extent records (0 size).\n"); + goto free_inode; + } + mapping = tree->inode->i_mapping; page = read_mapping_page(mapping, 0, NULL); if (IS_ERR(page)) - goto free_tree; + goto free_inode; /* Load the header */ head = (struct hfs_btree_header_rec *)(kmap(page) + sizeof(struct hfs_bnode_desc)); @@ -57,27 +63,56 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id) tree->max_key_len = be16_to_cpu(head->max_key_len); tree->depth = be16_to_cpu(head->depth); - /* Set the correct compare function */ - if (id == HFSPLUS_EXT_CNID) { + /* Verify the tree and set the correct compare function */ + switch (id) { + case HFSPLUS_EXT_CNID: + if (tree->max_key_len != HFSPLUS_EXT_KEYLEN - sizeof(u16)) { + printk(KERN_ERR "hfs: invalid extent max_key_len %d\n", + tree->max_key_len); + goto fail_page; + } + if (tree->attributes & HFS_TREE_VARIDXKEYS) { + printk(KERN_ERR "hfs: invalid extent btree flag\n"); + goto fail_page; + } + tree->keycmp = hfsplus_ext_cmp_key; - } else if (id == HFSPLUS_CAT_CNID) { - if ((HFSPLUS_SB(sb).flags & HFSPLUS_SB_HFSX) && + break; + case HFSPLUS_CAT_CNID: + if (tree->max_key_len != HFSPLUS_CAT_KEYLEN - sizeof(u16)) { + printk(KERN_ERR "hfs: invalid catalog max_key_len %d\n", + tree->max_key_len); + goto fail_page; + } + if (!(tree->attributes & HFS_TREE_VARIDXKEYS)) { + printk(KERN_ERR "hfs: invalid catalog btree flag\n"); + goto fail_page; + } + + if (test_bit(HFSPLUS_SB_HFSX, &HFSPLUS_SB(sb)->flags) && (head->key_type == HFSPLUS_KEY_BINARY)) tree->keycmp = hfsplus_cat_bin_cmp_key; else { tree->keycmp = hfsplus_cat_case_cmp_key; - HFSPLUS_SB(sb).flags |= HFSPLUS_SB_CASEFOLD; + set_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags); } - } else { + break; + default: printk(KERN_ERR "hfs: unknown B*Tree requested\n"); goto fail_page; } + if (!(tree->attributes & HFS_TREE_BIGKEYS)) { + printk(KERN_ERR "hfs: invalid btree flag\n"); + goto fail_page; + } + size = tree->node_size; if (!is_power_of_2(size)) goto fail_page; if (!tree->node_count) goto fail_page; + tree->node_size_shift = ffs(size) - 1; tree->pages_per_bnode = (tree->node_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; @@ -87,10 +122,11 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id) return tree; fail_page: - tree->inode->i_mapping->a_ops = &hfsplus_aops; page_cache_release(page); - free_tree: + free_inode: + tree->inode->i_mapping->a_ops = &hfsplus_aops; iput(tree->inode); + free_tree: kfree(tree); return NULL; } @@ -192,17 +228,18 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) while (!tree->free_nodes) { struct inode *inode = tree->inode; + struct hfsplus_inode_info *hip = HFSPLUS_I(inode); u32 count; int res; res = hfsplus_file_extend(inode); if (res) return ERR_PTR(res); - HFSPLUS_I(inode).phys_size = inode->i_size = - (loff_t)HFSPLUS_I(inode).alloc_blocks << - HFSPLUS_SB(tree->sb).alloc_blksz_shift; - HFSPLUS_I(inode).fs_blocks = HFSPLUS_I(inode).alloc_blocks << - HFSPLUS_SB(tree->sb).fs_shift; + hip->phys_size = inode->i_size = + (loff_t)hip->alloc_blocks << + HFSPLUS_SB(tree->sb)->alloc_blksz_shift; + hip->fs_blocks = + hip->alloc_blocks << HFSPLUS_SB(tree->sb)->fs_shift; inode_set_bytes(inode, inode->i_size); count = inode->i_size >> tree->node_size_shift; tree->free_nodes = count - tree->node_count; diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c index f6874acb2cf2..8af45fc5b051 100644 --- a/fs/hfsplus/catalog.c +++ b/fs/hfsplus/catalog.c @@ -67,7 +67,7 @@ static void hfsplus_cat_build_key_uni(hfsplus_btree_key *key, u32 parent, key->key_len = cpu_to_be16(6 + ustrlen); } -static void hfsplus_set_perms(struct inode *inode, struct hfsplus_perm *perms) +void hfsplus_cat_set_perms(struct inode *inode, struct hfsplus_perm *perms) { if (inode->i_flags & S_IMMUTABLE) perms->rootflags |= HFSPLUS_FLG_IMMUTABLE; @@ -77,15 +77,24 @@ static void hfsplus_set_perms(struct inode *inode, struct hfsplus_perm *perms) perms->rootflags |= HFSPLUS_FLG_APPEND; else perms->rootflags &= ~HFSPLUS_FLG_APPEND; - HFSPLUS_I(inode).rootflags = perms->rootflags; - HFSPLUS_I(inode).userflags = perms->userflags; + + perms->userflags = HFSPLUS_I(inode)->userflags; perms->mode = cpu_to_be16(inode->i_mode); perms->owner = cpu_to_be32(inode->i_uid); perms->group = cpu_to_be32(inode->i_gid); + + if (S_ISREG(inode->i_mode)) + perms->dev = cpu_to_be32(inode->i_nlink); + else if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode)) + perms->dev = cpu_to_be32(inode->i_rdev); + else + perms->dev = 0; } static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, u32 cnid, struct inode *inode) { + struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb); + if (S_ISDIR(inode->i_mode)) { struct hfsplus_cat_folder *folder; @@ -93,13 +102,13 @@ static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, u32 cnid, struct i memset(folder, 0, sizeof(*folder)); folder->type = cpu_to_be16(HFSPLUS_FOLDER); folder->id = cpu_to_be32(inode->i_ino); - HFSPLUS_I(inode).create_date = + HFSPLUS_I(inode)->create_date = folder->create_date = folder->content_mod_date = folder->attribute_mod_date = folder->access_date = hfsp_now2mt(); - hfsplus_set_perms(inode, &folder->permissions); - if (inode == HFSPLUS_SB(inode->i_sb).hidden_dir) + hfsplus_cat_set_perms(inode, &folder->permissions); + if (inode == sbi->hidden_dir) /* invisible and namelocked */ folder->user_info.frFlags = cpu_to_be16(0x5000); return sizeof(*folder); @@ -111,19 +120,19 @@ static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, u32 cnid, struct i file->type = cpu_to_be16(HFSPLUS_FILE); file->flags = cpu_to_be16(HFSPLUS_FILE_THREAD_EXISTS); file->id = cpu_to_be32(cnid); - HFSPLUS_I(inode).create_date = + HFSPLUS_I(inode)->create_date = file->create_date = file->content_mod_date = file->attribute_mod_date = file->access_date = hfsp_now2mt(); if (cnid == inode->i_ino) { - hfsplus_set_perms(inode, &file->permissions); + hfsplus_cat_set_perms(inode, &file->permissions); if (S_ISLNK(inode->i_mode)) { file->user_info.fdType = cpu_to_be32(HFSP_SYMLINK_TYPE); file->user_info.fdCreator = cpu_to_be32(HFSP_SYMLINK_CREATOR); } else { - file->user_info.fdType = cpu_to_be32(HFSPLUS_SB(inode->i_sb).type); - file->user_info.fdCreator = cpu_to_be32(HFSPLUS_SB(inode->i_sb).creator); + file->user_info.fdType = cpu_to_be32(sbi->type); + file->user_info.fdCreator = cpu_to_be32(sbi->creator); } if ((file->permissions.rootflags | file->permissions.userflags) & HFSPLUS_FLG_IMMUTABLE) file->flags |= cpu_to_be16(HFSPLUS_FILE_LOCKED); @@ -131,8 +140,8 @@ static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, u32 cnid, struct i file->user_info.fdType = cpu_to_be32(HFSP_HARDLINK_TYPE); file->user_info.fdCreator = cpu_to_be32(HFSP_HFSPLUS_CREATOR); file->user_info.fdFlags = cpu_to_be16(0x100); - file->create_date = HFSPLUS_I(HFSPLUS_SB(inode->i_sb).hidden_dir).create_date; - file->permissions.dev = cpu_to_be32(HFSPLUS_I(inode).dev); + file->create_date = HFSPLUS_I(sbi->hidden_dir)->create_date; + file->permissions.dev = cpu_to_be32(HFSPLUS_I(inode)->linkid); } return sizeof(*file); } @@ -180,15 +189,14 @@ int hfsplus_find_cat(struct super_block *sb, u32 cnid, int hfsplus_create_cat(u32 cnid, struct inode *dir, struct qstr *str, struct inode *inode) { + struct super_block *sb = dir->i_sb; struct hfs_find_data fd; - struct super_block *sb; hfsplus_cat_entry entry; int entry_size; int err; dprint(DBG_CAT_MOD, "create_cat: %s,%u(%d)\n", str->name, cnid, inode->i_nlink); - sb = dir->i_sb; - hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd); + hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); hfsplus_cat_build_key(sb, fd.search_key, cnid, NULL); entry_size = hfsplus_fill_cat_thread(sb, &entry, S_ISDIR(inode->i_mode) ? @@ -234,7 +242,7 @@ err2: int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str) { - struct super_block *sb; + struct super_block *sb = dir->i_sb; struct hfs_find_data fd; struct hfsplus_fork_raw fork; struct list_head *pos; @@ -242,8 +250,7 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str) u16 type; dprint(DBG_CAT_MOD, "delete_cat: %s,%u\n", str ? str->name : NULL, cnid); - sb = dir->i_sb; - hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd); + hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); if (!str) { int len; @@ -279,7 +286,7 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str) hfsplus_free_fork(sb, cnid, &fork, HFSPLUS_TYPE_RSRC); } - list_for_each(pos, &HFSPLUS_I(dir).open_dir_list) { + list_for_each(pos, &HFSPLUS_I(dir)->open_dir_list) { struct hfsplus_readdir_data *rd = list_entry(pos, struct hfsplus_readdir_data, list); if (fd.tree->keycmp(fd.search_key, (void *)&rd->key) < 0) @@ -312,7 +319,7 @@ int hfsplus_rename_cat(u32 cnid, struct inode *src_dir, struct qstr *src_name, struct inode *dst_dir, struct qstr *dst_name) { - struct super_block *sb; + struct super_block *sb = src_dir->i_sb; struct hfs_find_data src_fd, dst_fd; hfsplus_cat_entry entry; int entry_size, type; @@ -320,8 +327,7 @@ int hfsplus_rename_cat(u32 cnid, dprint(DBG_CAT_MOD, "rename_cat: %u - %lu,%s - %lu,%s\n", cnid, src_dir->i_ino, src_name->name, dst_dir->i_ino, dst_name->name); - sb = src_dir->i_sb; - hfs_find_init(HFSPLUS_SB(sb).cat_tree, &src_fd); + hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &src_fd); dst_fd = src_fd; /* find the old dir entry and read the data */ diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index 764fd1bdca88..d236d85ec9d7 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -39,7 +39,7 @@ static struct dentry *hfsplus_lookup(struct inode *dir, struct dentry *dentry, dentry->d_op = &hfsplus_dentry_operations; dentry->d_fsdata = NULL; - hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd); + hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, &dentry->d_name); again: err = hfs_brec_read(&fd, &entry, sizeof(entry)); @@ -68,9 +68,9 @@ again: cnid = be32_to_cpu(entry.file.id); if (entry.file.user_info.fdType == cpu_to_be32(HFSP_HARDLINK_TYPE) && entry.file.user_info.fdCreator == cpu_to_be32(HFSP_HFSPLUS_CREATOR) && - (entry.file.create_date == HFSPLUS_I(HFSPLUS_SB(sb).hidden_dir).create_date || - entry.file.create_date == HFSPLUS_I(sb->s_root->d_inode).create_date) && - HFSPLUS_SB(sb).hidden_dir) { + (entry.file.create_date == HFSPLUS_I(HFSPLUS_SB(sb)->hidden_dir)->create_date || + entry.file.create_date == HFSPLUS_I(sb->s_root->d_inode)->create_date) && + HFSPLUS_SB(sb)->hidden_dir) { struct qstr str; char name[32]; @@ -86,7 +86,8 @@ again: linkid = be32_to_cpu(entry.file.permissions.dev); str.len = sprintf(name, "iNode%d", linkid); str.name = name; - hfsplus_cat_build_key(sb, fd.search_key, HFSPLUS_SB(sb).hidden_dir->i_ino, &str); + hfsplus_cat_build_key(sb, fd.search_key, + HFSPLUS_SB(sb)->hidden_dir->i_ino, &str); goto again; } } else if (!dentry->d_fsdata) @@ -101,7 +102,7 @@ again: if (IS_ERR(inode)) return ERR_CAST(inode); if (S_ISREG(inode->i_mode)) - HFSPLUS_I(inode).dev = linkid; + HFSPLUS_I(inode)->linkid = linkid; out: d_add(dentry, inode); return NULL; @@ -124,7 +125,7 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir) if (filp->f_pos >= inode->i_size) return 0; - hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd); + hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); hfsplus_cat_build_key(sb, fd.search_key, inode->i_ino, NULL); err = hfs_brec_find(&fd); if (err) @@ -180,8 +181,9 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir) err = -EIO; goto out; } - if (HFSPLUS_SB(sb).hidden_dir && - HFSPLUS_SB(sb).hidden_dir->i_ino == be32_to_cpu(entry.folder.id)) + if (HFSPLUS_SB(sb)->hidden_dir && + HFSPLUS_SB(sb)->hidden_dir->i_ino == + be32_to_cpu(entry.folder.id)) goto next; if (filldir(dirent, strbuf, len, filp->f_pos, be32_to_cpu(entry.folder.id), DT_DIR)) @@ -217,7 +219,7 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir) } filp->private_data = rd; rd->file = filp; - list_add(&rd->list, &HFSPLUS_I(inode).open_dir_list); + list_add(&rd->list, &HFSPLUS_I(inode)->open_dir_list); } memcpy(&rd->key, fd.key, sizeof(struct hfsplus_cat_key)); out: @@ -229,38 +231,18 @@ static int hfsplus_dir_release(struct inode *inode, struct file *file) { struct hfsplus_readdir_data *rd = file->private_data; if (rd) { + mutex_lock(&inode->i_mutex); list_del(&rd->list); + mutex_unlock(&inode->i_mutex); kfree(rd); } return 0; } -static int hfsplus_create(struct inode *dir, struct dentry *dentry, int mode, - struct nameidata *nd) -{ - struct inode *inode; - int res; - - inode = hfsplus_new_inode(dir->i_sb, mode); - if (!inode) - return -ENOSPC; - - res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode); - if (res) { - inode->i_nlink = 0; - hfsplus_delete_inode(inode); - iput(inode); - return res; - } - hfsplus_instantiate(dentry, inode, inode->i_ino); - mark_inode_dirty(inode); - return 0; -} - static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir, struct dentry *dst_dentry) { - struct super_block *sb = dst_dir->i_sb; + struct hfsplus_sb_info *sbi = HFSPLUS_SB(dst_dir->i_sb); struct inode *inode = src_dentry->d_inode; struct inode *src_dir = src_dentry->d_parent->d_inode; struct qstr str; @@ -270,7 +252,10 @@ static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir, if (HFSPLUS_IS_RSRC(inode)) return -EPERM; + if (!S_ISREG(inode->i_mode)) + return -EPERM; + mutex_lock(&sbi->vh_mutex); if (inode->i_ino == (u32)(unsigned long)src_dentry->d_fsdata) { for (;;) { get_random_bytes(&id, sizeof(cnid)); @@ -279,40 +264,41 @@ static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir, str.len = sprintf(name, "iNode%d", id); res = hfsplus_rename_cat(inode->i_ino, src_dir, &src_dentry->d_name, - HFSPLUS_SB(sb).hidden_dir, &str); + sbi->hidden_dir, &str); if (!res) break; if (res != -EEXIST) - return res; + goto out; } - HFSPLUS_I(inode).dev = id; - cnid = HFSPLUS_SB(sb).next_cnid++; + HFSPLUS_I(inode)->linkid = id; + cnid = sbi->next_cnid++; src_dentry->d_fsdata = (void *)(unsigned long)cnid; res = hfsplus_create_cat(cnid, src_dir, &src_dentry->d_name, inode); if (res) /* panic? */ - return res; - HFSPLUS_SB(sb).file_count++; + goto out; + sbi->file_count++; } - cnid = HFSPLUS_SB(sb).next_cnid++; + cnid = sbi->next_cnid++; res = hfsplus_create_cat(cnid, dst_dir, &dst_dentry->d_name, inode); if (res) - return res; + goto out; inc_nlink(inode); hfsplus_instantiate(dst_dentry, inode, cnid); atomic_inc(&inode->i_count); inode->i_ctime = CURRENT_TIME_SEC; mark_inode_dirty(inode); - HFSPLUS_SB(sb).file_count++; - sb->s_dirt = 1; - - return 0; + sbi->file_count++; + dst_dir->i_sb->s_dirt = 1; +out: + mutex_unlock(&sbi->vh_mutex); + return res; } static int hfsplus_unlink(struct inode *dir, struct dentry *dentry) { - struct super_block *sb = dir->i_sb; + struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb); struct inode *inode = dentry->d_inode; struct qstr str; char name[32]; @@ -322,21 +308,22 @@ static int hfsplus_unlink(struct inode *dir, struct dentry *dentry) if (HFSPLUS_IS_RSRC(inode)) return -EPERM; + mutex_lock(&sbi->vh_mutex); cnid = (u32)(unsigned long)dentry->d_fsdata; if (inode->i_ino == cnid && - atomic_read(&HFSPLUS_I(inode).opencnt)) { + atomic_read(&HFSPLUS_I(inode)->opencnt)) { str.name = name; str.len = sprintf(name, "temp%lu", inode->i_ino); res = hfsplus_rename_cat(inode->i_ino, dir, &dentry->d_name, - HFSPLUS_SB(sb).hidden_dir, &str); + sbi->hidden_dir, &str); if (!res) inode->i_flags |= S_DEAD; - return res; + goto out; } res = hfsplus_delete_cat(cnid, dir, &dentry->d_name); if (res) - return res; + goto out; if (inode->i_nlink > 0) drop_nlink(inode); @@ -344,10 +331,10 @@ static int hfsplus_unlink(struct inode *dir, struct dentry *dentry) clear_nlink(inode); if (!inode->i_nlink) { if (inode->i_ino != cnid) { - HFSPLUS_SB(sb).file_count--; - if (!atomic_read(&HFSPLUS_I(inode).opencnt)) { + sbi->file_count--; + if (!atomic_read(&HFSPLUS_I(inode)->opencnt)) { res = hfsplus_delete_cat(inode->i_ino, - HFSPLUS_SB(sb).hidden_dir, + sbi->hidden_dir, NULL); if (!res) hfsplus_delete_inode(inode); @@ -356,107 +343,108 @@ static int hfsplus_unlink(struct inode *dir, struct dentry *dentry) } else hfsplus_delete_inode(inode); } else - HFSPLUS_SB(sb).file_count--; + sbi->file_count--; inode->i_ctime = CURRENT_TIME_SEC; mark_inode_dirty(inode); - +out: + mutex_unlock(&sbi->vh_mutex); return res; } -static int hfsplus_mkdir(struct inode *dir, struct dentry *dentry, int mode) -{ - struct inode *inode; - int res; - - inode = hfsplus_new_inode(dir->i_sb, S_IFDIR | mode); - if (!inode) - return -ENOSPC; - - res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode); - if (res) { - inode->i_nlink = 0; - hfsplus_delete_inode(inode); - iput(inode); - return res; - } - hfsplus_instantiate(dentry, inode, inode->i_ino); - mark_inode_dirty(inode); - return 0; -} - static int hfsplus_rmdir(struct inode *dir, struct dentry *dentry) { - struct inode *inode; + struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb); + struct inode *inode = dentry->d_inode; int res; - inode = dentry->d_inode; if (inode->i_size != 2) return -ENOTEMPTY; + + mutex_lock(&sbi->vh_mutex); res = hfsplus_delete_cat(inode->i_ino, dir, &dentry->d_name); if (res) - return res; + goto out; clear_nlink(inode); inode->i_ctime = CURRENT_TIME_SEC; hfsplus_delete_inode(inode); mark_inode_dirty(inode); - return 0; +out: + mutex_unlock(&sbi->vh_mutex); + return res; } static int hfsplus_symlink(struct inode *dir, struct dentry *dentry, const char *symname) { - struct super_block *sb; + struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb); struct inode *inode; - int res; + int res = -ENOSPC; - sb = dir->i_sb; - inode = hfsplus_new_inode(sb, S_IFLNK | S_IRWXUGO); + mutex_lock(&sbi->vh_mutex); + inode = hfsplus_new_inode(dir->i_sb, S_IFLNK | S_IRWXUGO); if (!inode) - return -ENOSPC; + goto out; res = page_symlink(inode, symname, strlen(symname) + 1); - if (res) { - inode->i_nlink = 0; - hfsplus_delete_inode(inode); - iput(inode); - return res; - } + if (res) + goto out_err; - mark_inode_dirty(inode); res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode); + if (res) + goto out_err; - if (!res) { - hfsplus_instantiate(dentry, inode, inode->i_ino); - mark_inode_dirty(inode); - } + hfsplus_instantiate(dentry, inode, inode->i_ino); + mark_inode_dirty(inode); + goto out; +out_err: + inode->i_nlink = 0; + hfsplus_delete_inode(inode); + iput(inode); +out: + mutex_unlock(&sbi->vh_mutex); return res; } static int hfsplus_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev) { - struct super_block *sb; + struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb); struct inode *inode; - int res; + int res = -ENOSPC; - sb = dir->i_sb; - inode = hfsplus_new_inode(sb, mode); + mutex_lock(&sbi->vh_mutex); + inode = hfsplus_new_inode(dir->i_sb, mode); if (!inode) - return -ENOSPC; + goto out; + + if (S_ISBLK(mode) || S_ISCHR(mode) || S_ISFIFO(mode) || S_ISSOCK(mode)) + init_special_inode(inode, mode, rdev); res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode); if (res) { inode->i_nlink = 0; hfsplus_delete_inode(inode); iput(inode); - return res; + goto out; } - init_special_inode(inode, mode, rdev); + hfsplus_instantiate(dentry, inode, inode->i_ino); mark_inode_dirty(inode); +out: + mutex_unlock(&sbi->vh_mutex); + return res; +} - return 0; +static int hfsplus_create(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd) +{ + return hfsplus_mknod(dir, dentry, mode, 0); +} + +static int hfsplus_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + return hfsplus_mknod(dir, dentry, mode | S_IFDIR, 0); } static int hfsplus_rename(struct inode *old_dir, struct dentry *old_dentry, @@ -466,7 +454,10 @@ static int hfsplus_rename(struct inode *old_dir, struct dentry *old_dentry, /* Unlink destination if it already exists */ if (new_dentry->d_inode) { - res = hfsplus_unlink(new_dir, new_dentry); + if (S_ISDIR(new_dentry->d_inode->i_mode)) + res = hfsplus_rmdir(new_dir, new_dentry); + else + res = hfsplus_unlink(new_dir, new_dentry); if (res) return res; } diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c index 0022eec63cda..0c9cb1820a52 100644 --- a/fs/hfsplus/extents.c +++ b/fs/hfsplus/extents.c @@ -85,35 +85,49 @@ static u32 hfsplus_ext_lastblock(struct hfsplus_extent *ext) static void __hfsplus_ext_write_extent(struct inode *inode, struct hfs_find_data *fd) { + struct hfsplus_inode_info *hip = HFSPLUS_I(inode); int res; - hfsplus_ext_build_key(fd->search_key, inode->i_ino, HFSPLUS_I(inode).cached_start, - HFSPLUS_IS_RSRC(inode) ? HFSPLUS_TYPE_RSRC : HFSPLUS_TYPE_DATA); + WARN_ON(!mutex_is_locked(&hip->extents_lock)); + + hfsplus_ext_build_key(fd->search_key, inode->i_ino, hip->cached_start, + HFSPLUS_IS_RSRC(inode) ? + HFSPLUS_TYPE_RSRC : HFSPLUS_TYPE_DATA); + res = hfs_brec_find(fd); - if (HFSPLUS_I(inode).flags & HFSPLUS_FLG_EXT_NEW) { + if (hip->flags & HFSPLUS_FLG_EXT_NEW) { if (res != -ENOENT) return; - hfs_brec_insert(fd, HFSPLUS_I(inode).cached_extents, sizeof(hfsplus_extent_rec)); - HFSPLUS_I(inode).flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW); + hfs_brec_insert(fd, hip->cached_extents, + sizeof(hfsplus_extent_rec)); + hip->flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW); } else { if (res) return; - hfs_bnode_write(fd->bnode, HFSPLUS_I(inode).cached_extents, fd->entryoffset, fd->entrylength); - HFSPLUS_I(inode).flags &= ~HFSPLUS_FLG_EXT_DIRTY; + hfs_bnode_write(fd->bnode, hip->cached_extents, + fd->entryoffset, fd->entrylength); + hip->flags &= ~HFSPLUS_FLG_EXT_DIRTY; } } -void hfsplus_ext_write_extent(struct inode *inode) +static void hfsplus_ext_write_extent_locked(struct inode *inode) { - if (HFSPLUS_I(inode).flags & HFSPLUS_FLG_EXT_DIRTY) { + if (HFSPLUS_I(inode)->flags & HFSPLUS_FLG_EXT_DIRTY) { struct hfs_find_data fd; - hfs_find_init(HFSPLUS_SB(inode->i_sb).ext_tree, &fd); + hfs_find_init(HFSPLUS_SB(inode->i_sb)->ext_tree, &fd); __hfsplus_ext_write_extent(inode, &fd); hfs_find_exit(&fd); } } +void hfsplus_ext_write_extent(struct inode *inode) +{ + mutex_lock(&HFSPLUS_I(inode)->extents_lock); + hfsplus_ext_write_extent_locked(inode); + mutex_unlock(&HFSPLUS_I(inode)->extents_lock); +} + static inline int __hfsplus_ext_read_extent(struct hfs_find_data *fd, struct hfsplus_extent *extent, u32 cnid, u32 block, u8 type) @@ -136,33 +150,39 @@ static inline int __hfsplus_ext_read_extent(struct hfs_find_data *fd, static inline int __hfsplus_ext_cache_extent(struct hfs_find_data *fd, struct inode *inode, u32 block) { + struct hfsplus_inode_info *hip = HFSPLUS_I(inode); int res; - if (HFSPLUS_I(inode).flags & HFSPLUS_FLG_EXT_DIRTY) + WARN_ON(!mutex_is_locked(&hip->extents_lock)); + + if (hip->flags & HFSPLUS_FLG_EXT_DIRTY) __hfsplus_ext_write_extent(inode, fd); - res = __hfsplus_ext_read_extent(fd, HFSPLUS_I(inode).cached_extents, inode->i_ino, - block, HFSPLUS_IS_RSRC(inode) ? HFSPLUS_TYPE_RSRC : HFSPLUS_TYPE_DATA); + res = __hfsplus_ext_read_extent(fd, hip->cached_extents, inode->i_ino, + block, HFSPLUS_IS_RSRC(inode) ? + HFSPLUS_TYPE_RSRC : + HFSPLUS_TYPE_DATA); if (!res) { - HFSPLUS_I(inode).cached_start = be32_to_cpu(fd->key->ext.start_block); - HFSPLUS_I(inode).cached_blocks = hfsplus_ext_block_count(HFSPLUS_I(inode).cached_extents); + hip->cached_start = be32_to_cpu(fd->key->ext.start_block); + hip->cached_blocks = hfsplus_ext_block_count(hip->cached_extents); } else { - HFSPLUS_I(inode).cached_start = HFSPLUS_I(inode).cached_blocks = 0; - HFSPLUS_I(inode).flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW); + hip->cached_start = hip->cached_blocks = 0; + hip->flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW); } return res; } static int hfsplus_ext_read_extent(struct inode *inode, u32 block) { + struct hfsplus_inode_info *hip = HFSPLUS_I(inode); struct hfs_find_data fd; int res; - if (block >= HFSPLUS_I(inode).cached_start && - block < HFSPLUS_I(inode).cached_start + HFSPLUS_I(inode).cached_blocks) + if (block >= hip->cached_start && + block < hip->cached_start + hip->cached_blocks) return 0; - hfs_find_init(HFSPLUS_SB(inode->i_sb).ext_tree, &fd); + hfs_find_init(HFSPLUS_SB(inode->i_sb)->ext_tree, &fd); res = __hfsplus_ext_cache_extent(&fd, inode, block); hfs_find_exit(&fd); return res; @@ -172,21 +192,21 @@ static int hfsplus_ext_read_extent(struct inode *inode, u32 block) int hfsplus_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { - struct super_block *sb; + struct super_block *sb = inode->i_sb; + struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); + struct hfsplus_inode_info *hip = HFSPLUS_I(inode); int res = -EIO; u32 ablock, dblock, mask; int shift; - sb = inode->i_sb; - /* Convert inode block to disk allocation block */ - shift = HFSPLUS_SB(sb).alloc_blksz_shift - sb->s_blocksize_bits; - ablock = iblock >> HFSPLUS_SB(sb).fs_shift; + shift = sbi->alloc_blksz_shift - sb->s_blocksize_bits; + ablock = iblock >> sbi->fs_shift; - if (iblock >= HFSPLUS_I(inode).fs_blocks) { - if (iblock > HFSPLUS_I(inode).fs_blocks || !create) + if (iblock >= hip->fs_blocks) { + if (iblock > hip->fs_blocks || !create) return -EIO; - if (ablock >= HFSPLUS_I(inode).alloc_blocks) { + if (ablock >= hip->alloc_blocks) { res = hfsplus_file_extend(inode); if (res) return res; @@ -194,33 +214,33 @@ int hfsplus_get_block(struct inode *inode, sector_t iblock, } else create = 0; - if (ablock < HFSPLUS_I(inode).first_blocks) { - dblock = hfsplus_ext_find_block(HFSPLUS_I(inode).first_extents, ablock); + if (ablock < hip->first_blocks) { + dblock = hfsplus_ext_find_block(hip->first_extents, ablock); goto done; } if (inode->i_ino == HFSPLUS_EXT_CNID) return -EIO; - mutex_lock(&HFSPLUS_I(inode).extents_lock); + mutex_lock(&hip->extents_lock); res = hfsplus_ext_read_extent(inode, ablock); if (!res) { - dblock = hfsplus_ext_find_block(HFSPLUS_I(inode).cached_extents, ablock - - HFSPLUS_I(inode).cached_start); + dblock = hfsplus_ext_find_block(hip->cached_extents, + ablock - hip->cached_start); } else { - mutex_unlock(&HFSPLUS_I(inode).extents_lock); + mutex_unlock(&hip->extents_lock); return -EIO; } - mutex_unlock(&HFSPLUS_I(inode).extents_lock); + mutex_unlock(&hip->extents_lock); done: dprint(DBG_EXTENT, "get_block(%lu): %llu - %u\n", inode->i_ino, (long long)iblock, dblock); - mask = (1 << HFSPLUS_SB(sb).fs_shift) - 1; - map_bh(bh_result, sb, (dblock << HFSPLUS_SB(sb).fs_shift) + HFSPLUS_SB(sb).blockoffset + (iblock & mask)); + mask = (1 << sbi->fs_shift) - 1; + map_bh(bh_result, sb, (dblock << sbi->fs_shift) + sbi->blockoffset + (iblock & mask)); if (create) { set_buffer_new(bh_result); - HFSPLUS_I(inode).phys_size += sb->s_blocksize; - HFSPLUS_I(inode).fs_blocks++; + hip->phys_size += sb->s_blocksize; + hip->fs_blocks++; inode_add_bytes(inode, sb->s_blocksize); mark_inode_dirty(inode); } @@ -327,7 +347,7 @@ int hfsplus_free_fork(struct super_block *sb, u32 cnid, struct hfsplus_fork_raw if (total_blocks == blocks) return 0; - hfs_find_init(HFSPLUS_SB(sb).ext_tree, &fd); + hfs_find_init(HFSPLUS_SB(sb)->ext_tree, &fd); do { res = __hfsplus_ext_read_extent(&fd, ext_entry, cnid, total_blocks, type); @@ -348,29 +368,33 @@ int hfsplus_free_fork(struct super_block *sb, u32 cnid, struct hfsplus_fork_raw int hfsplus_file_extend(struct inode *inode) { struct super_block *sb = inode->i_sb; + struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); + struct hfsplus_inode_info *hip = HFSPLUS_I(inode); u32 start, len, goal; int res; - if (HFSPLUS_SB(sb).alloc_file->i_size * 8 < HFSPLUS_SB(sb).total_blocks - HFSPLUS_SB(sb).free_blocks + 8) { + if (sbi->alloc_file->i_size * 8 < + sbi->total_blocks - sbi->free_blocks + 8) { // extend alloc file - printk(KERN_ERR "hfs: extend alloc file! (%Lu,%u,%u)\n", HFSPLUS_SB(sb).alloc_file->i_size * 8, - HFSPLUS_SB(sb).total_blocks, HFSPLUS_SB(sb).free_blocks); + printk(KERN_ERR "hfs: extend alloc file! (%Lu,%u,%u)\n", + sbi->alloc_file->i_size * 8, + sbi->total_blocks, sbi->free_blocks); return -ENOSPC; } - mutex_lock(&HFSPLUS_I(inode).extents_lock); - if (HFSPLUS_I(inode).alloc_blocks == HFSPLUS_I(inode).first_blocks) - goal = hfsplus_ext_lastblock(HFSPLUS_I(inode).first_extents); + mutex_lock(&hip->extents_lock); + if (hip->alloc_blocks == hip->first_blocks) + goal = hfsplus_ext_lastblock(hip->first_extents); else { - res = hfsplus_ext_read_extent(inode, HFSPLUS_I(inode).alloc_blocks); + res = hfsplus_ext_read_extent(inode, hip->alloc_blocks); if (res) goto out; - goal = hfsplus_ext_lastblock(HFSPLUS_I(inode).cached_extents); + goal = hfsplus_ext_lastblock(hip->cached_extents); } - len = HFSPLUS_I(inode).clump_blocks; - start = hfsplus_block_allocate(sb, HFSPLUS_SB(sb).total_blocks, goal, &len); - if (start >= HFSPLUS_SB(sb).total_blocks) { + len = hip->clump_blocks; + start = hfsplus_block_allocate(sb, sbi->total_blocks, goal, &len); + if (start >= sbi->total_blocks) { start = hfsplus_block_allocate(sb, goal, 0, &len); if (start >= goal) { res = -ENOSPC; @@ -379,56 +403,56 @@ int hfsplus_file_extend(struct inode *inode) } dprint(DBG_EXTENT, "extend %lu: %u,%u\n", inode->i_ino, start, len); - if (HFSPLUS_I(inode).alloc_blocks <= HFSPLUS_I(inode).first_blocks) { - if (!HFSPLUS_I(inode).first_blocks) { + + if (hip->alloc_blocks <= hip->first_blocks) { + if (!hip->first_blocks) { dprint(DBG_EXTENT, "first extents\n"); /* no extents yet */ - HFSPLUS_I(inode).first_extents[0].start_block = cpu_to_be32(start); - HFSPLUS_I(inode).first_extents[0].block_count = cpu_to_be32(len); + hip->first_extents[0].start_block = cpu_to_be32(start); + hip->first_extents[0].block_count = cpu_to_be32(len); res = 0; } else { /* try to append to extents in inode */ - res = hfsplus_add_extent(HFSPLUS_I(inode).first_extents, - HFSPLUS_I(inode).alloc_blocks, + res = hfsplus_add_extent(hip->first_extents, + hip->alloc_blocks, start, len); if (res == -ENOSPC) goto insert_extent; } if (!res) { - hfsplus_dump_extent(HFSPLUS_I(inode).first_extents); - HFSPLUS_I(inode).first_blocks += len; + hfsplus_dump_extent(hip->first_extents); + hip->first_blocks += len; } } else { - res = hfsplus_add_extent(HFSPLUS_I(inode).cached_extents, - HFSPLUS_I(inode).alloc_blocks - - HFSPLUS_I(inode).cached_start, + res = hfsplus_add_extent(hip->cached_extents, + hip->alloc_blocks - hip->cached_start, start, len); if (!res) { - hfsplus_dump_extent(HFSPLUS_I(inode).cached_extents); - HFSPLUS_I(inode).flags |= HFSPLUS_FLG_EXT_DIRTY; - HFSPLUS_I(inode).cached_blocks += len; + hfsplus_dump_extent(hip->cached_extents); + hip->flags |= HFSPLUS_FLG_EXT_DIRTY; + hip->cached_blocks += len; } else if (res == -ENOSPC) goto insert_extent; } out: - mutex_unlock(&HFSPLUS_I(inode).extents_lock); + mutex_unlock(&hip->extents_lock); if (!res) { - HFSPLUS_I(inode).alloc_blocks += len; + hip->alloc_blocks += len; mark_inode_dirty(inode); } return res; insert_extent: dprint(DBG_EXTENT, "insert new extent\n"); - hfsplus_ext_write_extent(inode); + hfsplus_ext_write_extent_locked(inode); - memset(HFSPLUS_I(inode).cached_extents, 0, sizeof(hfsplus_extent_rec)); - HFSPLUS_I(inode).cached_extents[0].start_block = cpu_to_be32(start); - HFSPLUS_I(inode).cached_extents[0].block_count = cpu_to_be32(len); - hfsplus_dump_extent(HFSPLUS_I(inode).cached_extents); - HFSPLUS_I(inode).flags |= HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW; - HFSPLUS_I(inode).cached_start = HFSPLUS_I(inode).alloc_blocks; - HFSPLUS_I(inode).cached_blocks = len; + memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec)); + hip->cached_extents[0].start_block = cpu_to_be32(start); + hip->cached_extents[0].block_count = cpu_to_be32(len); + hfsplus_dump_extent(hip->cached_extents); + hip->flags |= HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW; + hip->cached_start = hip->alloc_blocks; + hip->cached_blocks = len; res = 0; goto out; @@ -437,13 +461,15 @@ insert_extent: void hfsplus_file_truncate(struct inode *inode) { struct super_block *sb = inode->i_sb; + struct hfsplus_inode_info *hip = HFSPLUS_I(inode); struct hfs_find_data fd; u32 alloc_cnt, blk_cnt, start; int res; - dprint(DBG_INODE, "truncate: %lu, %Lu -> %Lu\n", inode->i_ino, - (long long)HFSPLUS_I(inode).phys_size, inode->i_size); - if (inode->i_size > HFSPLUS_I(inode).phys_size) { + dprint(DBG_INODE, "truncate: %lu, %Lu -> %Lu\n", + inode->i_ino, (long long)hip->phys_size, inode->i_size); + + if (inode->i_size > hip->phys_size) { struct address_space *mapping = inode->i_mapping; struct page *page; void *fsdata; @@ -460,47 +486,48 @@ void hfsplus_file_truncate(struct inode *inode) return; mark_inode_dirty(inode); return; - } else if (inode->i_size == HFSPLUS_I(inode).phys_size) + } else if (inode->i_size == hip->phys_size) return; - blk_cnt = (inode->i_size + HFSPLUS_SB(sb).alloc_blksz - 1) >> HFSPLUS_SB(sb).alloc_blksz_shift; - alloc_cnt = HFSPLUS_I(inode).alloc_blocks; + blk_cnt = (inode->i_size + HFSPLUS_SB(sb)->alloc_blksz - 1) >> + HFSPLUS_SB(sb)->alloc_blksz_shift; + alloc_cnt = hip->alloc_blocks; if (blk_cnt == alloc_cnt) goto out; - mutex_lock(&HFSPLUS_I(inode).extents_lock); - hfs_find_init(HFSPLUS_SB(sb).ext_tree, &fd); + mutex_lock(&hip->extents_lock); + hfs_find_init(HFSPLUS_SB(sb)->ext_tree, &fd); while (1) { - if (alloc_cnt == HFSPLUS_I(inode).first_blocks) { - hfsplus_free_extents(sb, HFSPLUS_I(inode).first_extents, + if (alloc_cnt == hip->first_blocks) { + hfsplus_free_extents(sb, hip->first_extents, alloc_cnt, alloc_cnt - blk_cnt); - hfsplus_dump_extent(HFSPLUS_I(inode).first_extents); - HFSPLUS_I(inode).first_blocks = blk_cnt; + hfsplus_dump_extent(hip->first_extents); + hip->first_blocks = blk_cnt; break; } res = __hfsplus_ext_cache_extent(&fd, inode, alloc_cnt); if (res) break; - start = HFSPLUS_I(inode).cached_start; - hfsplus_free_extents(sb, HFSPLUS_I(inode).cached_extents, + start = hip->cached_start; + hfsplus_free_extents(sb, hip->cached_extents, alloc_cnt - start, alloc_cnt - blk_cnt); - hfsplus_dump_extent(HFSPLUS_I(inode).cached_extents); + hfsplus_dump_extent(hip->cached_extents); if (blk_cnt > start) { - HFSPLUS_I(inode).flags |= HFSPLUS_FLG_EXT_DIRTY; + hip->flags |= HFSPLUS_FLG_EXT_DIRTY; break; } alloc_cnt = start; - HFSPLUS_I(inode).cached_start = HFSPLUS_I(inode).cached_blocks = 0; - HFSPLUS_I(inode).flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW); + hip->cached_start = hip->cached_blocks = 0; + hip->flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW); hfs_brec_remove(&fd); } hfs_find_exit(&fd); - mutex_unlock(&HFSPLUS_I(inode).extents_lock); + mutex_unlock(&hip->extents_lock); - HFSPLUS_I(inode).alloc_blocks = blk_cnt; + hip->alloc_blocks = blk_cnt; out: - HFSPLUS_I(inode).phys_size = inode->i_size; - HFSPLUS_I(inode).fs_blocks = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits; - inode_set_bytes(inode, HFSPLUS_I(inode).fs_blocks << sb->s_blocksize_bits); + hip->phys_size = inode->i_size; + hip->fs_blocks = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits; + inode_set_bytes(inode, hip->fs_blocks << sb->s_blocksize_bits); mark_inode_dirty(inode); } diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index dc856be3c2b0..cb3653efb57a 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -62,7 +62,7 @@ struct hfs_btree { unsigned int depth; //unsigned int map1_size, map_size; - struct semaphore tree_lock; + struct mutex tree_lock; unsigned int pages_per_bnode; spinlock_t hash_lock; @@ -121,16 +121,21 @@ struct hfsplus_sb_info { u32 sect_count; int fs_shift; - /* Stuff in host order from Vol Header */ + /* immutable data from the volume header */ u32 alloc_blksz; int alloc_blksz_shift; u32 total_blocks; + u32 data_clump_blocks, rsrc_clump_blocks; + + /* mutable data from the volume header, protected by alloc_mutex */ u32 free_blocks; - u32 next_alloc; + struct mutex alloc_mutex; + + /* mutable data from the volume header, protected by vh_mutex */ u32 next_cnid; u32 file_count; u32 folder_count; - u32 data_clump_blocks, rsrc_clump_blocks; + struct mutex vh_mutex; /* Config options */ u32 creator; @@ -143,40 +148,50 @@ struct hfsplus_sb_info { int part, session; unsigned long flags; - - struct hlist_head rsrc_inodes; }; -#define HFSPLUS_SB_WRITEBACKUP 0x0001 -#define HFSPLUS_SB_NODECOMPOSE 0x0002 -#define HFSPLUS_SB_FORCE 0x0004 -#define HFSPLUS_SB_HFSX 0x0008 -#define HFSPLUS_SB_CASEFOLD 0x0010 +#define HFSPLUS_SB_WRITEBACKUP 0 +#define HFSPLUS_SB_NODECOMPOSE 1 +#define HFSPLUS_SB_FORCE 2 +#define HFSPLUS_SB_HFSX 3 +#define HFSPLUS_SB_CASEFOLD 4 struct hfsplus_inode_info { - struct mutex extents_lock; - u32 clump_blocks, alloc_blocks; - sector_t fs_blocks; - /* Allocation extents from catalog record or volume header */ - hfsplus_extent_rec first_extents; - u32 first_blocks; - hfsplus_extent_rec cached_extents; - u32 cached_start, cached_blocks; atomic_t opencnt; - struct inode *rsrc_inode; + /* + * Extent allocation information, protected by extents_lock. + */ + u32 first_blocks; + u32 clump_blocks; + u32 alloc_blocks; + u32 cached_start; + u32 cached_blocks; + hfsplus_extent_rec first_extents; + hfsplus_extent_rec cached_extents; unsigned long flags; + struct mutex extents_lock; + /* + * Immutable data. + */ + struct inode *rsrc_inode; __be32 create_date; - /* Device number in hfsplus_permissions in catalog */ - u32 dev; - /* BSD system and user file flags */ - u8 rootflags; - u8 userflags; + /* + * Protected by sbi->vh_mutex. + */ + u32 linkid; + + /* + * Protected by i_mutex. + */ + sector_t fs_blocks; + u8 userflags; /* BSD user file flags */ struct list_head open_dir_list; loff_t phys_size; + struct inode vfs_inode; }; @@ -184,8 +199,8 @@ struct hfsplus_inode_info { #define HFSPLUS_FLG_EXT_DIRTY 0x0002 #define HFSPLUS_FLG_EXT_NEW 0x0004 -#define HFSPLUS_IS_DATA(inode) (!(HFSPLUS_I(inode).flags & HFSPLUS_FLG_RSRC)) -#define HFSPLUS_IS_RSRC(inode) (HFSPLUS_I(inode).flags & HFSPLUS_FLG_RSRC) +#define HFSPLUS_IS_DATA(inode) (!(HFSPLUS_I(inode)->flags & HFSPLUS_FLG_RSRC)) +#define HFSPLUS_IS_RSRC(inode) (HFSPLUS_I(inode)->flags & HFSPLUS_FLG_RSRC) struct hfs_find_data { /* filled by caller */ @@ -311,6 +326,7 @@ int hfsplus_create_cat(u32, struct inode *, struct qstr *, struct inode *); int hfsplus_delete_cat(u32, struct inode *, struct qstr *); int hfsplus_rename_cat(u32, struct inode *, struct qstr *, struct inode *, struct qstr *); +void hfsplus_cat_set_perms(struct inode *inode, struct hfsplus_perm *perms); /* dir.c */ extern const struct inode_operations hfsplus_dir_inode_operations; @@ -372,26 +388,15 @@ int hfsplus_read_wrapper(struct super_block *); int hfs_part_find(struct super_block *, sector_t *, sector_t *); /* access macros */ -/* static inline struct hfsplus_sb_info *HFSPLUS_SB(struct super_block *sb) { return sb->s_fs_info; } + static inline struct hfsplus_inode_info *HFSPLUS_I(struct inode *inode) { return list_entry(inode, struct hfsplus_inode_info, vfs_inode); } -*/ -#define HFSPLUS_SB(super) (*(struct hfsplus_sb_info *)(super)->s_fs_info) -#define HFSPLUS_I(inode) (*list_entry(inode, struct hfsplus_inode_info, vfs_inode)) - -#if 1 -#define hfsplus_kmap(p) ({ struct page *__p = (p); kmap(__p); }) -#define hfsplus_kunmap(p) ({ struct page *__p = (p); kunmap(__p); __p; }) -#else -#define hfsplus_kmap(p) kmap(p) -#define hfsplus_kunmap(p) kunmap(p) -#endif #define sb_bread512(sb, sec, data) ({ \ struct buffer_head *__bh; \ @@ -419,6 +424,4 @@ static inline struct hfsplus_inode_info *HFSPLUS_I(struct inode *inode) #define hfsp_ut2mt(t) __hfsp_ut2mt((t).tv_sec) #define hfsp_now2mt() __hfsp_ut2mt(get_seconds()) -#define kdev_t_to_nr(x) (x) - #endif diff --git a/fs/hfsplus/hfsplus_raw.h b/fs/hfsplus/hfsplus_raw.h index fe99fe8db61a..6892899fd6fb 100644 --- a/fs/hfsplus/hfsplus_raw.h +++ b/fs/hfsplus/hfsplus_raw.h @@ -200,6 +200,7 @@ struct hfsplus_cat_key { struct hfsplus_unistr name; } __packed; +#define HFSPLUS_CAT_KEYLEN (sizeof(struct hfsplus_cat_key)) /* Structs from hfs.h */ struct hfsp_point { @@ -323,7 +324,7 @@ struct hfsplus_ext_key { __be32 start_block; } __packed; -#define HFSPLUS_EXT_KEYLEN 12 +#define HFSPLUS_EXT_KEYLEN sizeof(struct hfsplus_ext_key) /* HFS+ generic BTree key */ typedef union { diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index c5a979d62c65..78449280dae0 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -36,7 +36,7 @@ static int hfsplus_write_begin(struct file *file, struct address_space *mapping, *pagep = NULL; ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata, hfsplus_get_block, - &HFSPLUS_I(mapping->host).phys_size); + &HFSPLUS_I(mapping->host)->phys_size); if (unlikely(ret)) { loff_t isize = mapping->host->i_size; if (pos + len > isize) @@ -62,13 +62,13 @@ static int hfsplus_releasepage(struct page *page, gfp_t mask) switch (inode->i_ino) { case HFSPLUS_EXT_CNID: - tree = HFSPLUS_SB(sb).ext_tree; + tree = HFSPLUS_SB(sb)->ext_tree; break; case HFSPLUS_CAT_CNID: - tree = HFSPLUS_SB(sb).cat_tree; + tree = HFSPLUS_SB(sb)->cat_tree; break; case HFSPLUS_ATTR_CNID: - tree = HFSPLUS_SB(sb).attr_tree; + tree = HFSPLUS_SB(sb)->attr_tree; break; default: BUG(); @@ -172,12 +172,13 @@ static struct dentry *hfsplus_file_lookup(struct inode *dir, struct dentry *dent struct hfs_find_data fd; struct super_block *sb = dir->i_sb; struct inode *inode = NULL; + struct hfsplus_inode_info *hip; int err; if (HFSPLUS_IS_RSRC(dir) || strcmp(dentry->d_name.name, "rsrc")) goto out; - inode = HFSPLUS_I(dir).rsrc_inode; + inode = HFSPLUS_I(dir)->rsrc_inode; if (inode) goto out; @@ -185,12 +186,13 @@ static struct dentry *hfsplus_file_lookup(struct inode *dir, struct dentry *dent if (!inode) return ERR_PTR(-ENOMEM); + hip = HFSPLUS_I(inode); inode->i_ino = dir->i_ino; - INIT_LIST_HEAD(&HFSPLUS_I(inode).open_dir_list); - mutex_init(&HFSPLUS_I(inode).extents_lock); - HFSPLUS_I(inode).flags = HFSPLUS_FLG_RSRC; + INIT_LIST_HEAD(&hip->open_dir_list); + mutex_init(&hip->extents_lock); + hip->flags = HFSPLUS_FLG_RSRC; - hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd); + hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); err = hfsplus_find_cat(sb, dir->i_ino, &fd); if (!err) err = hfsplus_cat_read_inode(inode, &fd); @@ -199,10 +201,18 @@ static struct dentry *hfsplus_file_lookup(struct inode *dir, struct dentry *dent iput(inode); return ERR_PTR(err); } - HFSPLUS_I(inode).rsrc_inode = dir; - HFSPLUS_I(dir).rsrc_inode = inode; + hip->rsrc_inode = dir; + HFSPLUS_I(dir)->rsrc_inode = inode; igrab(dir); - hlist_add_head(&inode->i_hash, &HFSPLUS_SB(sb).rsrc_inodes); + + /* + * __mark_inode_dirty expects inodes to be hashed. Since we don't + * want resource fork inodes in the regular inode space, we make them + * appear hashed, but do not put on any lists. hlist_del() + * will work fine and require no locking. + */ + inode->i_hash.pprev = &inode->i_hash.next; + mark_inode_dirty(inode); out: d_add(dentry, inode); @@ -211,30 +221,27 @@ out: static void hfsplus_get_perms(struct inode *inode, struct hfsplus_perm *perms, int dir) { - struct super_block *sb = inode->i_sb; + struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb); u16 mode; mode = be16_to_cpu(perms->mode); inode->i_uid = be32_to_cpu(perms->owner); if (!inode->i_uid && !mode) - inode->i_uid = HFSPLUS_SB(sb).uid; + inode->i_uid = sbi->uid; inode->i_gid = be32_to_cpu(perms->group); if (!inode->i_gid && !mode) - inode->i_gid = HFSPLUS_SB(sb).gid; + inode->i_gid = sbi->gid; if (dir) { - mode = mode ? (mode & S_IALLUGO) : - (S_IRWXUGO & ~(HFSPLUS_SB(sb).umask)); + mode = mode ? (mode & S_IALLUGO) : (S_IRWXUGO & ~(sbi->umask)); mode |= S_IFDIR; } else if (!mode) - mode = S_IFREG | ((S_IRUGO|S_IWUGO) & - ~(HFSPLUS_SB(sb).umask)); + mode = S_IFREG | ((S_IRUGO|S_IWUGO) & ~(sbi->umask)); inode->i_mode = mode; - HFSPLUS_I(inode).rootflags = perms->rootflags; - HFSPLUS_I(inode).userflags = perms->userflags; + HFSPLUS_I(inode)->userflags = perms->userflags; if (perms->rootflags & HFSPLUS_FLG_IMMUTABLE) inode->i_flags |= S_IMMUTABLE; else @@ -245,30 +252,13 @@ static void hfsplus_get_perms(struct inode *inode, struct hfsplus_perm *perms, i inode->i_flags &= ~S_APPEND; } -static void hfsplus_set_perms(struct inode *inode, struct hfsplus_perm *perms) -{ - if (inode->i_flags & S_IMMUTABLE) - perms->rootflags |= HFSPLUS_FLG_IMMUTABLE; - else - perms->rootflags &= ~HFSPLUS_FLG_IMMUTABLE; - if (inode->i_flags & S_APPEND) - perms->rootflags |= HFSPLUS_FLG_APPEND; - else - perms->rootflags &= ~HFSPLUS_FLG_APPEND; - perms->userflags = HFSPLUS_I(inode).userflags; - perms->mode = cpu_to_be16(inode->i_mode); - perms->owner = cpu_to_be32(inode->i_uid); - perms->group = cpu_to_be32(inode->i_gid); - perms->dev = cpu_to_be32(HFSPLUS_I(inode).dev); -} - static int hfsplus_file_open(struct inode *inode, struct file *file) { if (HFSPLUS_IS_RSRC(inode)) - inode = HFSPLUS_I(inode).rsrc_inode; + inode = HFSPLUS_I(inode)->rsrc_inode; if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS) return -EOVERFLOW; - atomic_inc(&HFSPLUS_I(inode).opencnt); + atomic_inc(&HFSPLUS_I(inode)->opencnt); return 0; } @@ -277,12 +267,13 @@ static int hfsplus_file_release(struct inode *inode, struct file *file) struct super_block *sb = inode->i_sb; if (HFSPLUS_IS_RSRC(inode)) - inode = HFSPLUS_I(inode).rsrc_inode; - if (atomic_dec_and_test(&HFSPLUS_I(inode).opencnt)) { + inode = HFSPLUS_I(inode)->rsrc_inode; + if (atomic_dec_and_test(&HFSPLUS_I(inode)->opencnt)) { mutex_lock(&inode->i_mutex); hfsplus_file_truncate(inode); if (inode->i_flags & S_DEAD) { - hfsplus_delete_cat(inode->i_ino, HFSPLUS_SB(sb).hidden_dir, NULL); + hfsplus_delete_cat(inode->i_ino, + HFSPLUS_SB(sb)->hidden_dir, NULL); hfsplus_delete_inode(inode); } mutex_unlock(&inode->i_mutex); @@ -361,47 +352,52 @@ static const struct file_operations hfsplus_file_operations = { struct inode *hfsplus_new_inode(struct super_block *sb, int mode) { + struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); struct inode *inode = new_inode(sb); + struct hfsplus_inode_info *hip; + if (!inode) return NULL; - inode->i_ino = HFSPLUS_SB(sb).next_cnid++; + inode->i_ino = sbi->next_cnid++; inode->i_mode = mode; inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); inode->i_nlink = 1; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; - INIT_LIST_HEAD(&HFSPLUS_I(inode).open_dir_list); - mutex_init(&HFSPLUS_I(inode).extents_lock); - atomic_set(&HFSPLUS_I(inode).opencnt, 0); - HFSPLUS_I(inode).flags = 0; - memset(HFSPLUS_I(inode).first_extents, 0, sizeof(hfsplus_extent_rec)); - memset(HFSPLUS_I(inode).cached_extents, 0, sizeof(hfsplus_extent_rec)); - HFSPLUS_I(inode).alloc_blocks = 0; - HFSPLUS_I(inode).first_blocks = 0; - HFSPLUS_I(inode).cached_start = 0; - HFSPLUS_I(inode).cached_blocks = 0; - HFSPLUS_I(inode).phys_size = 0; - HFSPLUS_I(inode).fs_blocks = 0; - HFSPLUS_I(inode).rsrc_inode = NULL; + + hip = HFSPLUS_I(inode); + INIT_LIST_HEAD(&hip->open_dir_list); + mutex_init(&hip->extents_lock); + atomic_set(&hip->opencnt, 0); + hip->flags = 0; + memset(hip->first_extents, 0, sizeof(hfsplus_extent_rec)); + memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec)); + hip->alloc_blocks = 0; + hip->first_blocks = 0; + hip->cached_start = 0; + hip->cached_blocks = 0; + hip->phys_size = 0; + hip->fs_blocks = 0; + hip->rsrc_inode = NULL; if (S_ISDIR(inode->i_mode)) { inode->i_size = 2; - HFSPLUS_SB(sb).folder_count++; + sbi->folder_count++; inode->i_op = &hfsplus_dir_inode_operations; inode->i_fop = &hfsplus_dir_operations; } else if (S_ISREG(inode->i_mode)) { - HFSPLUS_SB(sb).file_count++; + sbi->file_count++; inode->i_op = &hfsplus_file_inode_operations; inode->i_fop = &hfsplus_file_operations; inode->i_mapping->a_ops = &hfsplus_aops; - HFSPLUS_I(inode).clump_blocks = HFSPLUS_SB(sb).data_clump_blocks; + hip->clump_blocks = sbi->data_clump_blocks; } else if (S_ISLNK(inode->i_mode)) { - HFSPLUS_SB(sb).file_count++; + sbi->file_count++; inode->i_op = &page_symlink_inode_operations; inode->i_mapping->a_ops = &hfsplus_aops; - HFSPLUS_I(inode).clump_blocks = 1; + hip->clump_blocks = 1; } else - HFSPLUS_SB(sb).file_count++; + sbi->file_count++; insert_inode_hash(inode); mark_inode_dirty(inode); sb->s_dirt = 1; @@ -414,11 +410,11 @@ void hfsplus_delete_inode(struct inode *inode) struct super_block *sb = inode->i_sb; if (S_ISDIR(inode->i_mode)) { - HFSPLUS_SB(sb).folder_count--; + HFSPLUS_SB(sb)->folder_count--; sb->s_dirt = 1; return; } - HFSPLUS_SB(sb).file_count--; + HFSPLUS_SB(sb)->file_count--; if (S_ISREG(inode->i_mode)) { if (!inode->i_nlink) { inode->i_size = 0; @@ -434,34 +430,39 @@ void hfsplus_delete_inode(struct inode *inode) void hfsplus_inode_read_fork(struct inode *inode, struct hfsplus_fork_raw *fork) { struct super_block *sb = inode->i_sb; + struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); + struct hfsplus_inode_info *hip = HFSPLUS_I(inode); u32 count; int i; - memcpy(&HFSPLUS_I(inode).first_extents, &fork->extents, - sizeof(hfsplus_extent_rec)); + memcpy(&hip->first_extents, &fork->extents, sizeof(hfsplus_extent_rec)); for (count = 0, i = 0; i < 8; i++) count += be32_to_cpu(fork->extents[i].block_count); - HFSPLUS_I(inode).first_blocks = count; - memset(HFSPLUS_I(inode).cached_extents, 0, sizeof(hfsplus_extent_rec)); - HFSPLUS_I(inode).cached_start = 0; - HFSPLUS_I(inode).cached_blocks = 0; - - HFSPLUS_I(inode).alloc_blocks = be32_to_cpu(fork->total_blocks); - inode->i_size = HFSPLUS_I(inode).phys_size = be64_to_cpu(fork->total_size); - HFSPLUS_I(inode).fs_blocks = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits; - inode_set_bytes(inode, HFSPLUS_I(inode).fs_blocks << sb->s_blocksize_bits); - HFSPLUS_I(inode).clump_blocks = be32_to_cpu(fork->clump_size) >> HFSPLUS_SB(sb).alloc_blksz_shift; - if (!HFSPLUS_I(inode).clump_blocks) - HFSPLUS_I(inode).clump_blocks = HFSPLUS_IS_RSRC(inode) ? HFSPLUS_SB(sb).rsrc_clump_blocks : - HFSPLUS_SB(sb).data_clump_blocks; + hip->first_blocks = count; + memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec)); + hip->cached_start = 0; + hip->cached_blocks = 0; + + hip->alloc_blocks = be32_to_cpu(fork->total_blocks); + hip->phys_size = inode->i_size = be64_to_cpu(fork->total_size); + hip->fs_blocks = + (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits; + inode_set_bytes(inode, hip->fs_blocks << sb->s_blocksize_bits); + hip->clump_blocks = + be32_to_cpu(fork->clump_size) >> sbi->alloc_blksz_shift; + if (!hip->clump_blocks) { + hip->clump_blocks = HFSPLUS_IS_RSRC(inode) ? + sbi->rsrc_clump_blocks : + sbi->data_clump_blocks; + } } void hfsplus_inode_write_fork(struct inode *inode, struct hfsplus_fork_raw *fork) { - memcpy(&fork->extents, &HFSPLUS_I(inode).first_extents, + memcpy(&fork->extents, &HFSPLUS_I(inode)->first_extents, sizeof(hfsplus_extent_rec)); fork->total_size = cpu_to_be64(inode->i_size); - fork->total_blocks = cpu_to_be32(HFSPLUS_I(inode).alloc_blocks); + fork->total_blocks = cpu_to_be32(HFSPLUS_I(inode)->alloc_blocks); } int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd) @@ -472,7 +473,7 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd) type = hfs_bnode_read_u16(fd->bnode, fd->entryoffset); - HFSPLUS_I(inode).dev = 0; + HFSPLUS_I(inode)->linkid = 0; if (type == HFSPLUS_FOLDER) { struct hfsplus_cat_folder *folder = &entry.folder; @@ -486,8 +487,8 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd) inode->i_atime = hfsp_mt2ut(folder->access_date); inode->i_mtime = hfsp_mt2ut(folder->content_mod_date); inode->i_ctime = hfsp_mt2ut(folder->attribute_mod_date); - HFSPLUS_I(inode).create_date = folder->create_date; - HFSPLUS_I(inode).fs_blocks = 0; + HFSPLUS_I(inode)->create_date = folder->create_date; + HFSPLUS_I(inode)->fs_blocks = 0; inode->i_op = &hfsplus_dir_inode_operations; inode->i_fop = &hfsplus_dir_operations; } else if (type == HFSPLUS_FILE) { @@ -518,7 +519,7 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd) inode->i_atime = hfsp_mt2ut(file->access_date); inode->i_mtime = hfsp_mt2ut(file->content_mod_date); inode->i_ctime = hfsp_mt2ut(file->attribute_mod_date); - HFSPLUS_I(inode).create_date = file->create_date; + HFSPLUS_I(inode)->create_date = file->create_date; } else { printk(KERN_ERR "hfs: bad catalog entry used to create inode\n"); res = -EIO; @@ -533,12 +534,12 @@ int hfsplus_cat_write_inode(struct inode *inode) hfsplus_cat_entry entry; if (HFSPLUS_IS_RSRC(inode)) - main_inode = HFSPLUS_I(inode).rsrc_inode; + main_inode = HFSPLUS_I(inode)->rsrc_inode; if (!main_inode->i_nlink) return 0; - if (hfs_find_init(HFSPLUS_SB(main_inode->i_sb).cat_tree, &fd)) + if (hfs_find_init(HFSPLUS_SB(main_inode->i_sb)->cat_tree, &fd)) /* panic? */ return -EIO; @@ -554,7 +555,7 @@ int hfsplus_cat_write_inode(struct inode *inode) hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, sizeof(struct hfsplus_cat_folder)); /* simple node checks? */ - hfsplus_set_perms(inode, &folder->permissions); + hfsplus_cat_set_perms(inode, &folder->permissions); folder->access_date = hfsp_ut2mt(inode->i_atime); folder->content_mod_date = hfsp_ut2mt(inode->i_mtime); folder->attribute_mod_date = hfsp_ut2mt(inode->i_ctime); @@ -576,11 +577,7 @@ int hfsplus_cat_write_inode(struct inode *inode) hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, sizeof(struct hfsplus_cat_file)); hfsplus_inode_write_fork(inode, &file->data_fork); - if (S_ISREG(inode->i_mode)) - HFSPLUS_I(inode).dev = inode->i_nlink; - if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) - HFSPLUS_I(inode).dev = kdev_t_to_nr(inode->i_rdev); - hfsplus_set_perms(inode, &file->permissions); + hfsplus_cat_set_perms(inode, &file->permissions); if ((file->permissions.rootflags | file->permissions.userflags) & HFSPLUS_FLG_IMMUTABLE) file->flags |= cpu_to_be16(HFSPLUS_FILE_LOCKED); else diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c index ac405f099026..5b4667e08ef7 100644 --- a/fs/hfsplus/ioctl.c +++ b/fs/hfsplus/ioctl.c @@ -17,83 +17,98 @@ #include <linux/mount.h> #include <linux/sched.h> #include <linux/xattr.h> -#include <linux/smp_lock.h> #include <asm/uaccess.h> #include "hfsplus_fs.h" -long hfsplus_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +static int hfsplus_ioctl_getflags(struct file *file, int __user *user_flags) { - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file->f_path.dentry->d_inode; + struct hfsplus_inode_info *hip = HFSPLUS_I(inode); + unsigned int flags = 0; + + if (inode->i_flags & S_IMMUTABLE) + flags |= FS_IMMUTABLE_FL; + if (inode->i_flags |= S_APPEND) + flags |= FS_APPEND_FL; + if (hip->userflags & HFSPLUS_FLG_NODUMP) + flags |= FS_NODUMP_FL; + + return put_user(flags, user_flags); +} + +static int hfsplus_ioctl_setflags(struct file *file, int __user *user_flags) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct hfsplus_inode_info *hip = HFSPLUS_I(inode); unsigned int flags; + int err = 0; - lock_kernel(); - switch (cmd) { - case HFSPLUS_IOC_EXT2_GETFLAGS: - flags = 0; - if (HFSPLUS_I(inode).rootflags & HFSPLUS_FLG_IMMUTABLE) - flags |= FS_IMMUTABLE_FL; /* EXT2_IMMUTABLE_FL */ - if (HFSPLUS_I(inode).rootflags & HFSPLUS_FLG_APPEND) - flags |= FS_APPEND_FL; /* EXT2_APPEND_FL */ - if (HFSPLUS_I(inode).userflags & HFSPLUS_FLG_NODUMP) - flags |= FS_NODUMP_FL; /* EXT2_NODUMP_FL */ - return put_user(flags, (int __user *)arg); - case HFSPLUS_IOC_EXT2_SETFLAGS: { - int err = 0; - err = mnt_want_write(filp->f_path.mnt); - if (err) { - unlock_kernel(); - return err; - } + err = mnt_want_write(file->f_path.mnt); + if (err) + goto out; - if (!is_owner_or_cap(inode)) { - err = -EACCES; - goto setflags_out; - } - if (get_user(flags, (int __user *)arg)) { - err = -EFAULT; - goto setflags_out; - } - if (flags & (FS_IMMUTABLE_FL|FS_APPEND_FL) || - HFSPLUS_I(inode).rootflags & (HFSPLUS_FLG_IMMUTABLE|HFSPLUS_FLG_APPEND)) { - if (!capable(CAP_LINUX_IMMUTABLE)) { - err = -EPERM; - goto setflags_out; - } - } + if (!is_owner_or_cap(inode)) { + err = -EACCES; + goto out_drop_write; + } - /* don't silently ignore unsupported ext2 flags */ - if (flags & ~(FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NODUMP_FL)) { - err = -EOPNOTSUPP; - goto setflags_out; - } - if (flags & FS_IMMUTABLE_FL) { /* EXT2_IMMUTABLE_FL */ - inode->i_flags |= S_IMMUTABLE; - HFSPLUS_I(inode).rootflags |= HFSPLUS_FLG_IMMUTABLE; - } else { - inode->i_flags &= ~S_IMMUTABLE; - HFSPLUS_I(inode).rootflags &= ~HFSPLUS_FLG_IMMUTABLE; - } - if (flags & FS_APPEND_FL) { /* EXT2_APPEND_FL */ - inode->i_flags |= S_APPEND; - HFSPLUS_I(inode).rootflags |= HFSPLUS_FLG_APPEND; - } else { - inode->i_flags &= ~S_APPEND; - HFSPLUS_I(inode).rootflags &= ~HFSPLUS_FLG_APPEND; + if (get_user(flags, user_flags)) { + err = -EFAULT; + goto out_drop_write; + } + + mutex_lock(&inode->i_mutex); + + if ((flags & (FS_IMMUTABLE_FL|FS_APPEND_FL)) || + inode->i_flags & (S_IMMUTABLE|S_APPEND)) { + if (!capable(CAP_LINUX_IMMUTABLE)) { + err = -EPERM; + goto out_unlock_inode; } - if (flags & FS_NODUMP_FL) /* EXT2_NODUMP_FL */ - HFSPLUS_I(inode).userflags |= HFSPLUS_FLG_NODUMP; - else - HFSPLUS_I(inode).userflags &= ~HFSPLUS_FLG_NODUMP; - - inode->i_ctime = CURRENT_TIME_SEC; - mark_inode_dirty(inode); -setflags_out: - mnt_drop_write(filp->f_path.mnt); - unlock_kernel(); - return err; } + + /* don't silently ignore unsupported ext2 flags */ + if (flags & ~(FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NODUMP_FL)) { + err = -EOPNOTSUPP; + goto out_unlock_inode; + } + + if (flags & FS_IMMUTABLE_FL) + inode->i_flags |= S_IMMUTABLE; + else + inode->i_flags &= ~S_IMMUTABLE; + + if (flags & FS_APPEND_FL) + inode->i_flags |= S_APPEND; + else + inode->i_flags &= ~S_APPEND; + + if (flags & FS_NODUMP_FL) + hip->userflags |= HFSPLUS_FLG_NODUMP; + else + hip->userflags &= ~HFSPLUS_FLG_NODUMP; + + inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); + +out_unlock_inode: + mutex_lock(&inode->i_mutex); +out_drop_write: + mnt_drop_write(file->f_path.mnt); +out: + return err; +} + +long hfsplus_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + void __user *argp = (void __user *)arg; + + switch (cmd) { + case HFSPLUS_IOC_EXT2_GETFLAGS: + return hfsplus_ioctl_getflags(file, argp); + case HFSPLUS_IOC_EXT2_SETFLAGS: + return hfsplus_ioctl_setflags(file, argp); default: - unlock_kernel(); return -ENOTTY; } } @@ -110,7 +125,7 @@ int hfsplus_setxattr(struct dentry *dentry, const char *name, if (!S_ISREG(inode->i_mode) || HFSPLUS_IS_RSRC(inode)) return -EOPNOTSUPP; - res = hfs_find_init(HFSPLUS_SB(inode->i_sb).cat_tree, &fd); + res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd); if (res) return res; res = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd); @@ -153,7 +168,7 @@ ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name, return -EOPNOTSUPP; if (size) { - res = hfs_find_init(HFSPLUS_SB(inode->i_sb).cat_tree, &fd); + res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd); if (res) return res; res = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd); @@ -177,7 +192,7 @@ ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name, } else res = size ? -ERANGE : 4; } else - res = -ENODATA; + res = -EOPNOTSUPP; out: if (size) hfs_find_exit(&fd); diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c index 572628b4b07d..f9ab276a4d8d 100644 --- a/fs/hfsplus/options.c +++ b/fs/hfsplus/options.c @@ -143,13 +143,13 @@ int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi) kfree(p); break; case opt_decompose: - sbi->flags &= ~HFSPLUS_SB_NODECOMPOSE; + clear_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags); break; case opt_nodecompose: - sbi->flags |= HFSPLUS_SB_NODECOMPOSE; + set_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags); break; case opt_force: - sbi->flags |= HFSPLUS_SB_FORCE; + set_bit(HFSPLUS_SB_FORCE, &sbi->flags); break; default: return 0; @@ -171,7 +171,7 @@ done: int hfsplus_show_options(struct seq_file *seq, struct vfsmount *mnt) { - struct hfsplus_sb_info *sbi = &HFSPLUS_SB(mnt->mnt_sb); + struct hfsplus_sb_info *sbi = HFSPLUS_SB(mnt->mnt_sb); if (sbi->creator != HFSPLUS_DEF_CR_TYPE) seq_printf(seq, ",creator=%.4s", (char *)&sbi->creator); @@ -184,7 +184,7 @@ int hfsplus_show_options(struct seq_file *seq, struct vfsmount *mnt) seq_printf(seq, ",session=%u", sbi->session); if (sbi->nls) seq_printf(seq, ",nls=%s", sbi->nls->charset); - if (sbi->flags & HFSPLUS_SB_NODECOMPOSE) + if (test_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags)) seq_printf(seq, ",nodecompose"); return 0; } diff --git a/fs/hfsplus/part_tbl.c b/fs/hfsplus/part_tbl.c index 1528a6fd0299..208b16c645cc 100644 --- a/fs/hfsplus/part_tbl.c +++ b/fs/hfsplus/part_tbl.c @@ -74,6 +74,7 @@ struct old_pmap { int hfs_part_find(struct super_block *sb, sector_t *part_start, sector_t *part_size) { + struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); struct buffer_head *bh; __be16 *data; int i, size, res; @@ -95,7 +96,7 @@ int hfs_part_find(struct super_block *sb, for (i = 0; i < size; p++, i++) { if (p->pdStart && p->pdSize && p->pdFSID == cpu_to_be32(0x54465331)/*"TFS1"*/ && - (HFSPLUS_SB(sb).part < 0 || HFSPLUS_SB(sb).part == i)) { + (sbi->part < 0 || sbi->part == i)) { *part_start += be32_to_cpu(p->pdStart); *part_size = be32_to_cpu(p->pdSize); res = 0; @@ -111,7 +112,7 @@ int hfs_part_find(struct super_block *sb, size = be32_to_cpu(pm->pmMapBlkCnt); for (i = 0; i < size;) { if (!memcmp(pm->pmPartType,"Apple_HFS", 9) && - (HFSPLUS_SB(sb).part < 0 || HFSPLUS_SB(sb).part == i)) { + (sbi->part < 0 || sbi->part == i)) { *part_start += be32_to_cpu(pm->pmPyPartStart); *part_size = be32_to_cpu(pm->pmPartBlkCnt); res = 0; diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 3b55c050c742..9a88d7536103 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -12,7 +12,6 @@ #include <linux/pagemap.h> #include <linux/fs.h> #include <linux/slab.h> -#include <linux/smp_lock.h> #include <linux/vfs.h> #include <linux/nls.h> @@ -21,40 +20,11 @@ static void hfsplus_destroy_inode(struct inode *inode); #include "hfsplus_fs.h" -struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino) +static int hfsplus_system_read_inode(struct inode *inode) { - struct hfs_find_data fd; - struct hfsplus_vh *vhdr; - struct inode *inode; - long err = -EIO; - - inode = iget_locked(sb, ino); - if (!inode) - return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) - return inode; + struct hfsplus_vh *vhdr = HFSPLUS_SB(inode->i_sb)->s_vhdr; - INIT_LIST_HEAD(&HFSPLUS_I(inode).open_dir_list); - mutex_init(&HFSPLUS_I(inode).extents_lock); - HFSPLUS_I(inode).flags = 0; - HFSPLUS_I(inode).rsrc_inode = NULL; - atomic_set(&HFSPLUS_I(inode).opencnt, 0); - - if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID) { - read_inode: - hfs_find_init(HFSPLUS_SB(inode->i_sb).cat_tree, &fd); - err = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd); - if (!err) - err = hfsplus_cat_read_inode(inode, &fd); - hfs_find_exit(&fd); - if (err) - goto bad_inode; - goto done; - } - vhdr = HFSPLUS_SB(inode->i_sb).s_vhdr; - switch(inode->i_ino) { - case HFSPLUS_ROOT_CNID: - goto read_inode; + switch (inode->i_ino) { case HFSPLUS_EXT_CNID: hfsplus_inode_read_fork(inode, &vhdr->ext_file); inode->i_mapping->a_ops = &hfsplus_btree_aops; @@ -75,74 +45,101 @@ struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino) inode->i_mapping->a_ops = &hfsplus_btree_aops; break; default: - goto bad_inode; + return -EIO; + } + + return 0; +} + +struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino) +{ + struct hfs_find_data fd; + struct inode *inode; + int err; + + inode = iget_locked(sb, ino); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + INIT_LIST_HEAD(&HFSPLUS_I(inode)->open_dir_list); + mutex_init(&HFSPLUS_I(inode)->extents_lock); + HFSPLUS_I(inode)->flags = 0; + HFSPLUS_I(inode)->rsrc_inode = NULL; + atomic_set(&HFSPLUS_I(inode)->opencnt, 0); + + if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID || + inode->i_ino == HFSPLUS_ROOT_CNID) { + hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd); + err = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd); + if (!err) + err = hfsplus_cat_read_inode(inode, &fd); + hfs_find_exit(&fd); + } else { + err = hfsplus_system_read_inode(inode); + } + + if (err) { + iget_failed(inode); + return ERR_PTR(err); } -done: unlock_new_inode(inode); return inode; - -bad_inode: - iget_failed(inode); - return ERR_PTR(err); } -static int hfsplus_write_inode(struct inode *inode, - struct writeback_control *wbc) +static int hfsplus_system_write_inode(struct inode *inode) { - struct hfsplus_vh *vhdr; - int ret = 0; + struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb); + struct hfsplus_vh *vhdr = sbi->s_vhdr; + struct hfsplus_fork_raw *fork; + struct hfs_btree *tree = NULL; - dprint(DBG_INODE, "hfsplus_write_inode: %lu\n", inode->i_ino); - hfsplus_ext_write_extent(inode); - if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID) { - return hfsplus_cat_write_inode(inode); - } - vhdr = HFSPLUS_SB(inode->i_sb).s_vhdr; switch (inode->i_ino) { - case HFSPLUS_ROOT_CNID: - ret = hfsplus_cat_write_inode(inode); - break; case HFSPLUS_EXT_CNID: - if (vhdr->ext_file.total_size != cpu_to_be64(inode->i_size)) { - HFSPLUS_SB(inode->i_sb).flags |= HFSPLUS_SB_WRITEBACKUP; - inode->i_sb->s_dirt = 1; - } - hfsplus_inode_write_fork(inode, &vhdr->ext_file); - hfs_btree_write(HFSPLUS_SB(inode->i_sb).ext_tree); + fork = &vhdr->ext_file; + tree = sbi->ext_tree; break; case HFSPLUS_CAT_CNID: - if (vhdr->cat_file.total_size != cpu_to_be64(inode->i_size)) { - HFSPLUS_SB(inode->i_sb).flags |= HFSPLUS_SB_WRITEBACKUP; - inode->i_sb->s_dirt = 1; - } - hfsplus_inode_write_fork(inode, &vhdr->cat_file); - hfs_btree_write(HFSPLUS_SB(inode->i_sb).cat_tree); + fork = &vhdr->cat_file; + tree = sbi->cat_tree; break; case HFSPLUS_ALLOC_CNID: - if (vhdr->alloc_file.total_size != cpu_to_be64(inode->i_size)) { - HFSPLUS_SB(inode->i_sb).flags |= HFSPLUS_SB_WRITEBACKUP; - inode->i_sb->s_dirt = 1; - } - hfsplus_inode_write_fork(inode, &vhdr->alloc_file); + fork = &vhdr->alloc_file; break; case HFSPLUS_START_CNID: - if (vhdr->start_file.total_size != cpu_to_be64(inode->i_size)) { - HFSPLUS_SB(inode->i_sb).flags |= HFSPLUS_SB_WRITEBACKUP; - inode->i_sb->s_dirt = 1; - } - hfsplus_inode_write_fork(inode, &vhdr->start_file); + fork = &vhdr->start_file; break; case HFSPLUS_ATTR_CNID: - if (vhdr->attr_file.total_size != cpu_to_be64(inode->i_size)) { - HFSPLUS_SB(inode->i_sb).flags |= HFSPLUS_SB_WRITEBACKUP; - inode->i_sb->s_dirt = 1; - } - hfsplus_inode_write_fork(inode, &vhdr->attr_file); - hfs_btree_write(HFSPLUS_SB(inode->i_sb).attr_tree); - break; + fork = &vhdr->attr_file; + tree = sbi->attr_tree; + default: + return -EIO; + } + + if (fork->total_size != cpu_to_be64(inode->i_size)) { + set_bit(HFSPLUS_SB_WRITEBACKUP, &sbi->flags); + inode->i_sb->s_dirt = 1; } - return ret; + hfsplus_inode_write_fork(inode, fork); + if (tree) + hfs_btree_write(tree); + return 0; +} + +static int hfsplus_write_inode(struct inode *inode, + struct writeback_control *wbc) +{ + dprint(DBG_INODE, "hfsplus_write_inode: %lu\n", inode->i_ino); + + hfsplus_ext_write_extent(inode); + + if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID || + inode->i_ino == HFSPLUS_ROOT_CNID) + return hfsplus_cat_write_inode(inode); + else + return hfsplus_system_write_inode(inode); } static void hfsplus_evict_inode(struct inode *inode) @@ -151,51 +148,53 @@ static void hfsplus_evict_inode(struct inode *inode) truncate_inode_pages(&inode->i_data, 0); end_writeback(inode); if (HFSPLUS_IS_RSRC(inode)) { - HFSPLUS_I(HFSPLUS_I(inode).rsrc_inode).rsrc_inode = NULL; - iput(HFSPLUS_I(inode).rsrc_inode); + HFSPLUS_I(HFSPLUS_I(inode)->rsrc_inode)->rsrc_inode = NULL; + iput(HFSPLUS_I(inode)->rsrc_inode); } } int hfsplus_sync_fs(struct super_block *sb, int wait) { - struct hfsplus_vh *vhdr = HFSPLUS_SB(sb).s_vhdr; + struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); + struct hfsplus_vh *vhdr = sbi->s_vhdr; dprint(DBG_SUPER, "hfsplus_write_super\n"); - lock_super(sb); + mutex_lock(&sbi->vh_mutex); + mutex_lock(&sbi->alloc_mutex); sb->s_dirt = 0; - vhdr->free_blocks = cpu_to_be32(HFSPLUS_SB(sb).free_blocks); - vhdr->next_alloc = cpu_to_be32(HFSPLUS_SB(sb).next_alloc); - vhdr->next_cnid = cpu_to_be32(HFSPLUS_SB(sb).next_cnid); - vhdr->folder_count = cpu_to_be32(HFSPLUS_SB(sb).folder_count); - vhdr->file_count = cpu_to_be32(HFSPLUS_SB(sb).file_count); + vhdr->free_blocks = cpu_to_be32(sbi->free_blocks); + vhdr->next_cnid = cpu_to_be32(sbi->next_cnid); + vhdr->folder_count = cpu_to_be32(sbi->folder_count); + vhdr->file_count = cpu_to_be32(sbi->file_count); - mark_buffer_dirty(HFSPLUS_SB(sb).s_vhbh); - if (HFSPLUS_SB(sb).flags & HFSPLUS_SB_WRITEBACKUP) { - if (HFSPLUS_SB(sb).sect_count) { + mark_buffer_dirty(sbi->s_vhbh); + if (test_and_clear_bit(HFSPLUS_SB_WRITEBACKUP, &sbi->flags)) { + if (sbi->sect_count) { struct buffer_head *bh; u32 block, offset; - block = HFSPLUS_SB(sb).blockoffset; - block += (HFSPLUS_SB(sb).sect_count - 2) >> (sb->s_blocksize_bits - 9); - offset = ((HFSPLUS_SB(sb).sect_count - 2) << 9) & (sb->s_blocksize - 1); - printk(KERN_DEBUG "hfs: backup: %u,%u,%u,%u\n", HFSPLUS_SB(sb).blockoffset, - HFSPLUS_SB(sb).sect_count, block, offset); + block = sbi->blockoffset; + block += (sbi->sect_count - 2) >> (sb->s_blocksize_bits - 9); + offset = ((sbi->sect_count - 2) << 9) & (sb->s_blocksize - 1); + printk(KERN_DEBUG "hfs: backup: %u,%u,%u,%u\n", + sbi->blockoffset, sbi->sect_count, + block, offset); bh = sb_bread(sb, block); if (bh) { vhdr = (struct hfsplus_vh *)(bh->b_data + offset); if (be16_to_cpu(vhdr->signature) == HFSPLUS_VOLHEAD_SIG) { - memcpy(vhdr, HFSPLUS_SB(sb).s_vhdr, sizeof(*vhdr)); + memcpy(vhdr, sbi->s_vhdr, sizeof(*vhdr)); mark_buffer_dirty(bh); brelse(bh); } else printk(KERN_WARNING "hfs: backup not found!\n"); } } - HFSPLUS_SB(sb).flags &= ~HFSPLUS_SB_WRITEBACKUP; } - unlock_super(sb); + mutex_unlock(&sbi->alloc_mutex); + mutex_unlock(&sbi->vh_mutex); return 0; } @@ -209,48 +208,48 @@ static void hfsplus_write_super(struct super_block *sb) static void hfsplus_put_super(struct super_block *sb) { + struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); + dprint(DBG_SUPER, "hfsplus_put_super\n"); + if (!sb->s_fs_info) return; - lock_kernel(); - if (sb->s_dirt) hfsplus_write_super(sb); - if (!(sb->s_flags & MS_RDONLY) && HFSPLUS_SB(sb).s_vhdr) { - struct hfsplus_vh *vhdr = HFSPLUS_SB(sb).s_vhdr; + if (!(sb->s_flags & MS_RDONLY) && sbi->s_vhdr) { + struct hfsplus_vh *vhdr = sbi->s_vhdr; vhdr->modify_date = hfsp_now2mt(); vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_UNMNT); vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_INCNSTNT); - mark_buffer_dirty(HFSPLUS_SB(sb).s_vhbh); - sync_dirty_buffer(HFSPLUS_SB(sb).s_vhbh); + mark_buffer_dirty(sbi->s_vhbh); + sync_dirty_buffer(sbi->s_vhbh); } - hfs_btree_close(HFSPLUS_SB(sb).cat_tree); - hfs_btree_close(HFSPLUS_SB(sb).ext_tree); - iput(HFSPLUS_SB(sb).alloc_file); - iput(HFSPLUS_SB(sb).hidden_dir); - brelse(HFSPLUS_SB(sb).s_vhbh); - unload_nls(HFSPLUS_SB(sb).nls); + hfs_btree_close(sbi->cat_tree); + hfs_btree_close(sbi->ext_tree); + iput(sbi->alloc_file); + iput(sbi->hidden_dir); + brelse(sbi->s_vhbh); + unload_nls(sbi->nls); kfree(sb->s_fs_info); sb->s_fs_info = NULL; - - unlock_kernel(); } static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; + struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); u64 id = huge_encode_dev(sb->s_bdev->bd_dev); buf->f_type = HFSPLUS_SUPER_MAGIC; buf->f_bsize = sb->s_blocksize; - buf->f_blocks = HFSPLUS_SB(sb).total_blocks << HFSPLUS_SB(sb).fs_shift; - buf->f_bfree = HFSPLUS_SB(sb).free_blocks << HFSPLUS_SB(sb).fs_shift; + buf->f_blocks = sbi->total_blocks << sbi->fs_shift; + buf->f_bfree = sbi->free_blocks << sbi->fs_shift; buf->f_bavail = buf->f_bfree; buf->f_files = 0xFFFFFFFF; - buf->f_ffree = 0xFFFFFFFF - HFSPLUS_SB(sb).next_cnid; + buf->f_ffree = 0xFFFFFFFF - sbi->next_cnid; buf->f_fsid.val[0] = (u32)id; buf->f_fsid.val[1] = (u32)(id >> 32); buf->f_namelen = HFSPLUS_MAX_STRLEN; @@ -263,11 +262,11 @@ static int hfsplus_remount(struct super_block *sb, int *flags, char *data) if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) return 0; if (!(*flags & MS_RDONLY)) { - struct hfsplus_vh *vhdr = HFSPLUS_SB(sb).s_vhdr; + struct hfsplus_vh *vhdr = HFSPLUS_SB(sb)->s_vhdr; struct hfsplus_sb_info sbi; memset(&sbi, 0, sizeof(struct hfsplus_sb_info)); - sbi.nls = HFSPLUS_SB(sb).nls; + sbi.nls = HFSPLUS_SB(sb)->nls; if (!hfsplus_parse_options(data, &sbi)) return -EINVAL; @@ -276,7 +275,7 @@ static int hfsplus_remount(struct super_block *sb, int *flags, char *data) "running fsck.hfsplus is recommended. leaving read-only.\n"); sb->s_flags |= MS_RDONLY; *flags |= MS_RDONLY; - } else if (sbi.flags & HFSPLUS_SB_FORCE) { + } else if (test_bit(HFSPLUS_SB_FORCE, &sbi.flags)) { /* nothing */ } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) { printk(KERN_WARNING "hfs: filesystem is marked locked, leaving read-only.\n"); @@ -320,7 +319,8 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) return -ENOMEM; sb->s_fs_info = sbi; - INIT_HLIST_HEAD(&sbi->rsrc_inodes); + mutex_init(&sbi->alloc_mutex); + mutex_init(&sbi->vh_mutex); hfsplus_fill_defaults(sbi); if (!hfsplus_parse_options(data, sbi)) { printk(KERN_ERR "hfs: unable to parse mount options\n"); @@ -344,7 +344,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) err = -EINVAL; goto cleanup; } - vhdr = HFSPLUS_SB(sb).s_vhdr; + vhdr = sbi->s_vhdr; /* Copy parts of the volume header into the superblock */ sb->s_magic = HFSPLUS_VOLHEAD_SIG; @@ -353,18 +353,19 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) printk(KERN_ERR "hfs: wrong filesystem version\n"); goto cleanup; } - HFSPLUS_SB(sb).total_blocks = be32_to_cpu(vhdr->total_blocks); - HFSPLUS_SB(sb).free_blocks = be32_to_cpu(vhdr->free_blocks); - HFSPLUS_SB(sb).next_alloc = be32_to_cpu(vhdr->next_alloc); - HFSPLUS_SB(sb).next_cnid = be32_to_cpu(vhdr->next_cnid); - HFSPLUS_SB(sb).file_count = be32_to_cpu(vhdr->file_count); - HFSPLUS_SB(sb).folder_count = be32_to_cpu(vhdr->folder_count); - HFSPLUS_SB(sb).data_clump_blocks = be32_to_cpu(vhdr->data_clump_sz) >> HFSPLUS_SB(sb).alloc_blksz_shift; - if (!HFSPLUS_SB(sb).data_clump_blocks) - HFSPLUS_SB(sb).data_clump_blocks = 1; - HFSPLUS_SB(sb).rsrc_clump_blocks = be32_to_cpu(vhdr->rsrc_clump_sz) >> HFSPLUS_SB(sb).alloc_blksz_shift; - if (!HFSPLUS_SB(sb).rsrc_clump_blocks) - HFSPLUS_SB(sb).rsrc_clump_blocks = 1; + sbi->total_blocks = be32_to_cpu(vhdr->total_blocks); + sbi->free_blocks = be32_to_cpu(vhdr->free_blocks); + sbi->next_cnid = be32_to_cpu(vhdr->next_cnid); + sbi->file_count = be32_to_cpu(vhdr->file_count); + sbi->folder_count = be32_to_cpu(vhdr->folder_count); + sbi->data_clump_blocks = + be32_to_cpu(vhdr->data_clump_sz) >> sbi->alloc_blksz_shift; + if (!sbi->data_clump_blocks) + sbi->data_clump_blocks = 1; + sbi->rsrc_clump_blocks = + be32_to_cpu(vhdr->rsrc_clump_sz) >> sbi->alloc_blksz_shift; + if (!sbi->rsrc_clump_blocks) + sbi->rsrc_clump_blocks = 1; /* Set up operations so we can load metadata */ sb->s_op = &hfsplus_sops; @@ -374,7 +375,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) printk(KERN_WARNING "hfs: Filesystem was not cleanly unmounted, " "running fsck.hfsplus is recommended. mounting read-only.\n"); sb->s_flags |= MS_RDONLY; - } else if (sbi->flags & HFSPLUS_SB_FORCE) { + } else if (test_and_clear_bit(HFSPLUS_SB_FORCE, &sbi->flags)) { /* nothing */ } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) { printk(KERN_WARNING "hfs: Filesystem is marked locked, mounting read-only.\n"); @@ -384,16 +385,15 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) "use the force option at your own risk, mounting read-only.\n"); sb->s_flags |= MS_RDONLY; } - sbi->flags &= ~HFSPLUS_SB_FORCE; /* Load metadata objects (B*Trees) */ - HFSPLUS_SB(sb).ext_tree = hfs_btree_open(sb, HFSPLUS_EXT_CNID); - if (!HFSPLUS_SB(sb).ext_tree) { + sbi->ext_tree = hfs_btree_open(sb, HFSPLUS_EXT_CNID); + if (!sbi->ext_tree) { printk(KERN_ERR "hfs: failed to load extents file\n"); goto cleanup; } - HFSPLUS_SB(sb).cat_tree = hfs_btree_open(sb, HFSPLUS_CAT_CNID); - if (!HFSPLUS_SB(sb).cat_tree) { + sbi->cat_tree = hfs_btree_open(sb, HFSPLUS_CAT_CNID); + if (!sbi->cat_tree) { printk(KERN_ERR "hfs: failed to load catalog file\n"); goto cleanup; } @@ -404,7 +404,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) err = PTR_ERR(inode); goto cleanup; } - HFSPLUS_SB(sb).alloc_file = inode; + sbi->alloc_file = inode; /* Load the root directory */ root = hfsplus_iget(sb, HFSPLUS_ROOT_CNID); @@ -423,7 +423,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) str.len = sizeof(HFSP_HIDDENDIR_NAME) - 1; str.name = HFSP_HIDDENDIR_NAME; - hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd); + hfs_find_init(sbi->cat_tree, &fd); hfsplus_cat_build_key(sb, fd.search_key, HFSPLUS_ROOT_CNID, &str); if (!hfs_brec_read(&fd, &entry, sizeof(entry))) { hfs_find_exit(&fd); @@ -434,7 +434,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) err = PTR_ERR(inode); goto cleanup; } - HFSPLUS_SB(sb).hidden_dir = inode; + sbi->hidden_dir = inode; } else hfs_find_exit(&fd); @@ -449,15 +449,19 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) be32_add_cpu(&vhdr->write_count, 1); vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_UNMNT); vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_INCNSTNT); - mark_buffer_dirty(HFSPLUS_SB(sb).s_vhbh); - sync_dirty_buffer(HFSPLUS_SB(sb).s_vhbh); + mark_buffer_dirty(sbi->s_vhbh); + sync_dirty_buffer(sbi->s_vhbh); - if (!HFSPLUS_SB(sb).hidden_dir) { + if (!sbi->hidden_dir) { printk(KERN_DEBUG "hfs: create hidden dir...\n"); - HFSPLUS_SB(sb).hidden_dir = hfsplus_new_inode(sb, S_IFDIR); - hfsplus_create_cat(HFSPLUS_SB(sb).hidden_dir->i_ino, sb->s_root->d_inode, - &str, HFSPLUS_SB(sb).hidden_dir); - mark_inode_dirty(HFSPLUS_SB(sb).hidden_dir); + + mutex_lock(&sbi->vh_mutex); + sbi->hidden_dir = hfsplus_new_inode(sb, S_IFDIR); + hfsplus_create_cat(sbi->hidden_dir->i_ino, sb->s_root->d_inode, + &str, sbi->hidden_dir); + mutex_unlock(&sbi->vh_mutex); + + mark_inode_dirty(sbi->hidden_dir); } out: unload_nls(sbi->nls); @@ -486,7 +490,7 @@ static struct inode *hfsplus_alloc_inode(struct super_block *sb) static void hfsplus_destroy_inode(struct inode *inode) { - kmem_cache_free(hfsplus_inode_cachep, &HFSPLUS_I(inode)); + kmem_cache_free(hfsplus_inode_cachep, HFSPLUS_I(inode)); } #define HFSPLUS_INODE_SIZE sizeof(struct hfsplus_inode_info) diff --git a/fs/hfsplus/unicode.c b/fs/hfsplus/unicode.c index 628ccf6fa402..b66d67de882c 100644 --- a/fs/hfsplus/unicode.c +++ b/fs/hfsplus/unicode.c @@ -121,7 +121,7 @@ static u16 *hfsplus_compose_lookup(u16 *p, u16 cc) int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr, char *astr, int *len_p) { const hfsplus_unichr *ip; - struct nls_table *nls = HFSPLUS_SB(sb).nls; + struct nls_table *nls = HFSPLUS_SB(sb)->nls; u8 *op; u16 cc, c0, c1; u16 *ce1, *ce2; @@ -132,7 +132,7 @@ int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr, c ustrlen = be16_to_cpu(ustr->length); len = *len_p; ce1 = NULL; - compose = !(HFSPLUS_SB(sb).flags & HFSPLUS_SB_NODECOMPOSE); + compose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags); while (ustrlen > 0) { c0 = be16_to_cpu(*ip++); @@ -246,7 +246,7 @@ out: static inline int asc2unichar(struct super_block *sb, const char *astr, int len, wchar_t *uc) { - int size = HFSPLUS_SB(sb).nls->char2uni(astr, len, uc); + int size = HFSPLUS_SB(sb)->nls->char2uni(astr, len, uc); if (size <= 0) { *uc = '?'; size = 1; @@ -293,7 +293,7 @@ int hfsplus_asc2uni(struct super_block *sb, struct hfsplus_unistr *ustr, u16 *dstr, outlen = 0; wchar_t c; - decompose = !(HFSPLUS_SB(sb).flags & HFSPLUS_SB_NODECOMPOSE); + decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags); while (outlen < HFSPLUS_MAX_STRLEN && len > 0) { size = asc2unichar(sb, astr, len, &c); @@ -330,8 +330,8 @@ int hfsplus_hash_dentry(struct dentry *dentry, struct qstr *str) wchar_t c; u16 c2; - casefold = (HFSPLUS_SB(sb).flags & HFSPLUS_SB_CASEFOLD); - decompose = !(HFSPLUS_SB(sb).flags & HFSPLUS_SB_NODECOMPOSE); + casefold = test_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags); + decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags); hash = init_name_hash(); astr = str->name; len = str->len; @@ -373,8 +373,8 @@ int hfsplus_compare_dentry(struct dentry *dentry, struct qstr *s1, struct qstr * u16 c1, c2; wchar_t c; - casefold = (HFSPLUS_SB(sb).flags & HFSPLUS_SB_CASEFOLD); - decompose = !(HFSPLUS_SB(sb).flags & HFSPLUS_SB_NODECOMPOSE); + casefold = test_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags); + decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags); astr1 = s1->name; len1 = s1->len; astr2 = s2->name; diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c index bed78ac8f6d1..8972c20b3216 100644 --- a/fs/hfsplus/wrapper.c +++ b/fs/hfsplus/wrapper.c @@ -65,8 +65,8 @@ static int hfsplus_get_last_session(struct super_block *sb, *start = 0; *size = sb->s_bdev->bd_inode->i_size >> 9; - if (HFSPLUS_SB(sb).session >= 0) { - te.cdte_track = HFSPLUS_SB(sb).session; + if (HFSPLUS_SB(sb)->session >= 0) { + te.cdte_track = HFSPLUS_SB(sb)->session; te.cdte_format = CDROM_LBA; res = ioctl_by_bdev(sb->s_bdev, CDROMREADTOCENTRY, (unsigned long)&te); if (!res && (te.cdte_ctrl & CDROM_DATA_TRACK) == 4) { @@ -87,6 +87,7 @@ static int hfsplus_get_last_session(struct super_block *sb, /* Takes in super block, returns true if good data read */ int hfsplus_read_wrapper(struct super_block *sb) { + struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); struct buffer_head *bh; struct hfsplus_vh *vhdr; struct hfsplus_wd wd; @@ -122,7 +123,7 @@ int hfsplus_read_wrapper(struct super_block *sb) if (vhdr->signature == cpu_to_be16(HFSPLUS_VOLHEAD_SIG)) break; if (vhdr->signature == cpu_to_be16(HFSPLUS_VOLHEAD_SIGX)) { - HFSPLUS_SB(sb).flags |= HFSPLUS_SB_HFSX; + set_bit(HFSPLUS_SB_HFSX, &sbi->flags); break; } brelse(bh); @@ -143,11 +144,11 @@ int hfsplus_read_wrapper(struct super_block *sb) if (blocksize < HFSPLUS_SECTOR_SIZE || ((blocksize - 1) & blocksize)) return -EINVAL; - HFSPLUS_SB(sb).alloc_blksz = blocksize; - HFSPLUS_SB(sb).alloc_blksz_shift = 0; + sbi->alloc_blksz = blocksize; + sbi->alloc_blksz_shift = 0; while ((blocksize >>= 1) != 0) - HFSPLUS_SB(sb).alloc_blksz_shift++; - blocksize = min(HFSPLUS_SB(sb).alloc_blksz, (u32)PAGE_SIZE); + sbi->alloc_blksz_shift++; + blocksize = min(sbi->alloc_blksz, (u32)PAGE_SIZE); /* align block size to block offset */ while (part_start & ((blocksize >> HFSPLUS_SECTOR_SHIFT) - 1)) @@ -158,23 +159,26 @@ int hfsplus_read_wrapper(struct super_block *sb) return -EINVAL; } - HFSPLUS_SB(sb).blockoffset = part_start >> - (sb->s_blocksize_bits - HFSPLUS_SECTOR_SHIFT); - HFSPLUS_SB(sb).sect_count = part_size; - HFSPLUS_SB(sb).fs_shift = HFSPLUS_SB(sb).alloc_blksz_shift - - sb->s_blocksize_bits; + sbi->blockoffset = + part_start >> (sb->s_blocksize_bits - HFSPLUS_SECTOR_SHIFT); + sbi->sect_count = part_size; + sbi->fs_shift = sbi->alloc_blksz_shift - sb->s_blocksize_bits; bh = sb_bread512(sb, part_start + HFSPLUS_VOLHEAD_SECTOR, vhdr); if (!bh) return -EIO; /* should still be the same... */ - if (vhdr->signature != (HFSPLUS_SB(sb).flags & HFSPLUS_SB_HFSX ? - cpu_to_be16(HFSPLUS_VOLHEAD_SIGX) : - cpu_to_be16(HFSPLUS_VOLHEAD_SIG))) - goto error; - HFSPLUS_SB(sb).s_vhbh = bh; - HFSPLUS_SB(sb).s_vhdr = vhdr; + if (test_bit(HFSPLUS_SB_HFSX, &sbi->flags)) { + if (vhdr->signature != cpu_to_be16(HFSPLUS_VOLHEAD_SIGX)) + goto error; + } else { + if (vhdr->signature != cpu_to_be16(HFSPLUS_VOLHEAD_SIG)) + goto error; + } + + sbi->s_vhbh = bh; + sbi->s_vhdr = vhdr; return 0; error: diff --git a/include/asm-generic/hardirq.h b/include/asm-generic/hardirq.h index 62f59080e5cc..04d0a977cd43 100644 --- a/include/asm-generic/hardirq.h +++ b/include/asm-generic/hardirq.h @@ -3,13 +3,13 @@ #include <linux/cache.h> #include <linux/threads.h> -#include <linux/irq.h> typedef struct { unsigned int __softirq_pending; } ____cacheline_aligned irq_cpustat_t; #include <linux/irq_cpustat.h> /* Standard mappings for irq_cpustat_t above */ +#include <linux/irq.h> #ifndef ack_bad_irq static inline void ack_bad_irq(unsigned int irq) diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 8a92a170fb7d..ef2af9948eac 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -220,6 +220,8 @@ \ BUG_TABLE \ \ + JUMP_TABLE \ + \ /* PCI quirks */ \ .pci_fixup : AT(ADDR(.pci_fixup) - LOAD_OFFSET) { \ VMLINUX_SYMBOL(__start_pci_fixups_early) = .; \ @@ -563,6 +565,14 @@ #define BUG_TABLE #endif +#define JUMP_TABLE \ + . = ALIGN(8); \ + __jump_table : AT(ADDR(__jump_table) - LOAD_OFFSET) { \ + VMLINUX_SYMBOL(__start___jump_table) = .; \ + *(__jump_table) \ + VMLINUX_SYMBOL(__stop___jump_table) = .; \ + } + #ifdef CONFIG_PM_TRACE #define TRACEDATA \ . = ALIGN(4); \ diff --git a/fs/ceph/auth.h b/include/linux/ceph/auth.h index d38a2fb4a137..7fff521d7eb5 100644 --- a/fs/ceph/auth.h +++ b/include/linux/ceph/auth.h @@ -1,8 +1,8 @@ #ifndef _FS_CEPH_AUTH_H #define _FS_CEPH_AUTH_H -#include "types.h" -#include "buffer.h" +#include <linux/ceph/types.h> +#include <linux/ceph/buffer.h> /* * Abstract interface for communicating with the authenticate module. diff --git a/fs/ceph/buffer.h b/include/linux/ceph/buffer.h index 58d19014068f..58d19014068f 100644 --- a/fs/ceph/buffer.h +++ b/include/linux/ceph/buffer.h diff --git a/fs/ceph/ceph_debug.h b/include/linux/ceph/ceph_debug.h index 1818c2305610..aa2e19182d99 100644 --- a/fs/ceph/ceph_debug.h +++ b/include/linux/ceph/ceph_debug.h @@ -3,7 +3,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#ifdef CONFIG_CEPH_FS_PRETTYDEBUG +#ifdef CONFIG_CEPH_LIB_PRETTYDEBUG /* * wrap pr_debug to include a filename:lineno prefix on each line. @@ -14,7 +14,8 @@ # if defined(DEBUG) || defined(CONFIG_DYNAMIC_DEBUG) extern const char *ceph_file_part(const char *s, int len); # define dout(fmt, ...) \ - pr_debug(" %12.12s:%-4d : " fmt, \ + pr_debug("%.*s %12.12s:%-4d : " fmt, \ + 8 - (int)sizeof(KBUILD_MODNAME), " ", \ ceph_file_part(__FILE__, sizeof(__FILE__)), \ __LINE__, ##__VA_ARGS__) # else diff --git a/fs/ceph/ceph_frag.h b/include/linux/ceph/ceph_frag.h index 5babb8e95352..5babb8e95352 100644 --- a/fs/ceph/ceph_frag.h +++ b/include/linux/ceph/ceph_frag.h diff --git a/fs/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index d5619ac86711..c3c74aef289d 100644 --- a/fs/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -299,6 +299,7 @@ enum { CEPH_MDS_OP_SETATTR = 0x01108, CEPH_MDS_OP_SETFILELOCK= 0x01109, CEPH_MDS_OP_GETFILELOCK= 0x00110, + CEPH_MDS_OP_SETDIRLAYOUT=0x0110a, CEPH_MDS_OP_MKNOD = 0x01201, CEPH_MDS_OP_LINK = 0x01202, diff --git a/fs/ceph/ceph_hash.h b/include/linux/ceph/ceph_hash.h index d099c3f90236..d099c3f90236 100644 --- a/fs/ceph/ceph_hash.h +++ b/include/linux/ceph/ceph_hash.h diff --git a/include/linux/ceph/debugfs.h b/include/linux/ceph/debugfs.h new file mode 100644 index 000000000000..2a79702e092b --- /dev/null +++ b/include/linux/ceph/debugfs.h @@ -0,0 +1,33 @@ +#ifndef _FS_CEPH_DEBUGFS_H +#define _FS_CEPH_DEBUGFS_H + +#include "ceph_debug.h" +#include "types.h" + +#define CEPH_DEFINE_SHOW_FUNC(name) \ +static int name##_open(struct inode *inode, struct file *file) \ +{ \ + struct seq_file *sf; \ + int ret; \ + \ + ret = single_open(file, name, NULL); \ + sf = file->private_data; \ + sf->private = inode->i_private; \ + return ret; \ +} \ + \ +static const struct file_operations name##_fops = { \ + .open = name##_open, \ + .read = seq_read, \ + .llseek = seq_lseek, \ + .release = single_release, \ +}; + +/* debugfs.c */ +extern int ceph_debugfs_init(void); +extern void ceph_debugfs_cleanup(void); +extern int ceph_debugfs_client_init(struct ceph_client *client); +extern void ceph_debugfs_client_cleanup(struct ceph_client *client); + +#endif + diff --git a/fs/ceph/decode.h b/include/linux/ceph/decode.h index 3d25415afe63..c5b6939fb32a 100644 --- a/fs/ceph/decode.h +++ b/include/linux/ceph/decode.h @@ -191,6 +191,11 @@ static inline void ceph_encode_string(void **p, void *end, ceph_encode_need(p, end, n, bad); \ ceph_encode_copy(p, pv, n); \ } while (0) +#define ceph_encode_string_safe(p, end, s, n, bad) \ + do { \ + ceph_encode_need(p, end, n, bad); \ + ceph_encode_string(p, end, s, n); \ + } while (0) #endif diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h new file mode 100644 index 000000000000..f22b2e941686 --- /dev/null +++ b/include/linux/ceph/libceph.h @@ -0,0 +1,249 @@ +#ifndef _FS_CEPH_LIBCEPH_H +#define _FS_CEPH_LIBCEPH_H + +#include "ceph_debug.h" + +#include <asm/unaligned.h> +#include <linux/backing-dev.h> +#include <linux/completion.h> +#include <linux/exportfs.h> +#include <linux/fs.h> +#include <linux/mempool.h> +#include <linux/pagemap.h> +#include <linux/wait.h> +#include <linux/writeback.h> +#include <linux/slab.h> + +#include "types.h" +#include "messenger.h" +#include "msgpool.h" +#include "mon_client.h" +#include "osd_client.h" +#include "ceph_fs.h" + +/* + * Supported features + */ +#define CEPH_FEATURE_SUPPORTED_DEFAULT CEPH_FEATURE_NOSRCADDR +#define CEPH_FEATURE_REQUIRED_DEFAULT CEPH_FEATURE_NOSRCADDR + +/* + * mount options + */ +#define CEPH_OPT_FSID (1<<0) +#define CEPH_OPT_NOSHARE (1<<1) /* don't share client with other sbs */ +#define CEPH_OPT_MYIP (1<<2) /* specified my ip */ +#define CEPH_OPT_NOCRC (1<<3) /* no data crc on writes */ + +#define CEPH_OPT_DEFAULT (0); + +#define ceph_set_opt(client, opt) \ + (client)->options->flags |= CEPH_OPT_##opt; +#define ceph_test_opt(client, opt) \ + (!!((client)->options->flags & CEPH_OPT_##opt)) + +struct ceph_options { + int flags; + struct ceph_fsid fsid; + struct ceph_entity_addr my_addr; + int mount_timeout; + int osd_idle_ttl; + int osd_timeout; + int osd_keepalive_timeout; + + /* + * any type that can't be simply compared or doesn't need need + * to be compared should go beyond this point, + * ceph_compare_options() should be updated accordingly + */ + + struct ceph_entity_addr *mon_addr; /* should be the first + pointer type of args */ + int num_mon; + char *name; + char *secret; +}; + +/* + * defaults + */ +#define CEPH_MOUNT_TIMEOUT_DEFAULT 60 +#define CEPH_OSD_TIMEOUT_DEFAULT 60 /* seconds */ +#define CEPH_OSD_KEEPALIVE_DEFAULT 5 +#define CEPH_OSD_IDLE_TTL_DEFAULT 60 +#define CEPH_MOUNT_RSIZE_DEFAULT (512*1024) /* readahead */ + +#define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024) +#define CEPH_MSG_MAX_DATA_LEN (16*1024*1024) + +#define CEPH_AUTH_NAME_DEFAULT "guest" + +/* + * Delay telling the MDS we no longer want caps, in case we reopen + * the file. Delay a minimum amount of time, even if we send a cap + * message for some other reason. Otherwise, take the oppotunity to + * update the mds to avoid sending another message later. + */ +#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT 5 /* cap release delay */ +#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */ + +#define CEPH_CAP_RELEASE_SAFETY_DEFAULT (CEPH_CAPS_PER_RELEASE * 4) + +/* mount state */ +enum { + CEPH_MOUNT_MOUNTING, + CEPH_MOUNT_MOUNTED, + CEPH_MOUNT_UNMOUNTING, + CEPH_MOUNT_UNMOUNTED, + CEPH_MOUNT_SHUTDOWN, +}; + +/* + * subtract jiffies + */ +static inline unsigned long time_sub(unsigned long a, unsigned long b) +{ + BUG_ON(time_after(b, a)); + return (long)a - (long)b; +} + +struct ceph_mds_client; + +/* + * per client state + * + * possibly shared by multiple mount points, if they are + * mounting the same ceph filesystem/cluster. + */ +struct ceph_client { + struct ceph_fsid fsid; + bool have_fsid; + + void *private; + + struct ceph_options *options; + + struct mutex mount_mutex; /* serialize mount attempts */ + wait_queue_head_t auth_wq; + int auth_err; + + int (*extra_mon_dispatch)(struct ceph_client *, struct ceph_msg *); + + u32 supported_features; + u32 required_features; + + struct ceph_messenger *msgr; /* messenger instance */ + struct ceph_mon_client monc; + struct ceph_osd_client osdc; + +#ifdef CONFIG_DEBUG_FS + struct dentry *debugfs_dir; + struct dentry *debugfs_monmap; + struct dentry *debugfs_osdmap; +#endif +}; + + + +/* + * snapshots + */ + +/* + * A "snap context" is the set of existing snapshots when we + * write data. It is used by the OSD to guide its COW behavior. + * + * The ceph_snap_context is refcounted, and attached to each dirty + * page, indicating which context the dirty data belonged when it was + * dirtied. + */ +struct ceph_snap_context { + atomic_t nref; + u64 seq; + int num_snaps; + u64 snaps[]; +}; + +static inline struct ceph_snap_context * +ceph_get_snap_context(struct ceph_snap_context *sc) +{ + /* + printk("get_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref), + atomic_read(&sc->nref)+1); + */ + if (sc) + atomic_inc(&sc->nref); + return sc; +} + +static inline void ceph_put_snap_context(struct ceph_snap_context *sc) +{ + if (!sc) + return; + /* + printk("put_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref), + atomic_read(&sc->nref)-1); + */ + if (atomic_dec_and_test(&sc->nref)) { + /*printk(" deleting snap_context %p\n", sc);*/ + kfree(sc); + } +} + +/* + * calculate the number of pages a given length and offset map onto, + * if we align the data. + */ +static inline int calc_pages_for(u64 off, u64 len) +{ + return ((off+len+PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT) - + (off >> PAGE_CACHE_SHIFT); +} + +/* ceph_common.c */ +extern const char *ceph_msg_type_name(int type); +extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid); +extern struct kmem_cache *ceph_inode_cachep; +extern struct kmem_cache *ceph_cap_cachep; +extern struct kmem_cache *ceph_dentry_cachep; +extern struct kmem_cache *ceph_file_cachep; + +extern int ceph_parse_options(struct ceph_options **popt, char *options, + const char *dev_name, const char *dev_name_end, + int (*parse_extra_token)(char *c, void *private), + void *private); +extern void ceph_destroy_options(struct ceph_options *opt); +extern int ceph_compare_options(struct ceph_options *new_opt, + struct ceph_client *client); +extern struct ceph_client *ceph_create_client(struct ceph_options *opt, + void *private); +extern u64 ceph_client_id(struct ceph_client *client); +extern void ceph_destroy_client(struct ceph_client *client); +extern int __ceph_open_session(struct ceph_client *client, + unsigned long started); +extern int ceph_open_session(struct ceph_client *client); + +/* pagevec.c */ +extern void ceph_release_page_vector(struct page **pages, int num_pages); + +extern struct page **ceph_get_direct_page_vector(const char __user *data, + int num_pages, + loff_t off, size_t len); +extern void ceph_put_page_vector(struct page **pages, int num_pages); +extern void ceph_release_page_vector(struct page **pages, int num_pages); +extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags); +extern int ceph_copy_user_to_page_vector(struct page **pages, + const char __user *data, + loff_t off, size_t len); +extern int ceph_copy_to_page_vector(struct page **pages, + const char *data, + loff_t off, size_t len); +extern int ceph_copy_from_page_vector(struct page **pages, + char *data, + loff_t off, size_t len); +extern int ceph_copy_page_vector_to_user(struct page **pages, char __user *data, + loff_t off, size_t len); +extern void ceph_zero_page_vector_range(int off, int len, struct page **pages); + + +#endif /* _FS_CEPH_SUPER_H */ diff --git a/fs/ceph/mdsmap.h b/include/linux/ceph/mdsmap.h index 4c5cb0880bba..4c5cb0880bba 100644 --- a/fs/ceph/mdsmap.h +++ b/include/linux/ceph/mdsmap.h diff --git a/fs/ceph/messenger.h b/include/linux/ceph/messenger.h index 76fbc957bc13..5956d62c3057 100644 --- a/fs/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -65,6 +65,9 @@ struct ceph_messenger { */ u32 global_seq; spinlock_t global_seq_lock; + + u32 supported_features; + u32 required_features; }; /* @@ -82,6 +85,10 @@ struct ceph_msg { struct ceph_pagelist *pagelist; /* instead of pages */ struct list_head list_head; struct kref kref; + struct bio *bio; /* instead of pages/pagelist */ + struct bio *bio_iter; /* bio iterator */ + int bio_seg; /* current bio segment */ + struct ceph_pagelist *trail; /* the trailing part of the data */ bool front_is_vmalloc; bool more_to_follow; bool needs_out_seq; @@ -205,7 +212,7 @@ struct ceph_connection { }; -extern const char *pr_addr(const struct sockaddr_storage *ss); +extern const char *ceph_pr_addr(const struct sockaddr_storage *ss); extern int ceph_parse_ips(const char *c, const char *end, struct ceph_entity_addr *addr, int max_count, int *count); @@ -216,7 +223,8 @@ extern void ceph_msgr_exit(void); extern void ceph_msgr_flush(void); extern struct ceph_messenger *ceph_messenger_create( - struct ceph_entity_addr *myaddr); + struct ceph_entity_addr *myaddr, + u32 features, u32 required); extern void ceph_messenger_destroy(struct ceph_messenger *); extern void ceph_con_init(struct ceph_messenger *msgr, diff --git a/fs/ceph/mon_client.h b/include/linux/ceph/mon_client.h index 8e396f2c0963..545f85917780 100644 --- a/fs/ceph/mon_client.h +++ b/include/linux/ceph/mon_client.h @@ -79,6 +79,7 @@ struct ceph_mon_client { u64 last_tid; /* mds/osd map */ + int want_mdsmap; int want_next_osdmap; /* 1 = want, 2 = want+asked */ u32 have_osdmap, have_mdsmap; diff --git a/fs/ceph/msgpool.h b/include/linux/ceph/msgpool.h index a362605f9368..a362605f9368 100644 --- a/fs/ceph/msgpool.h +++ b/include/linux/ceph/msgpool.h diff --git a/fs/ceph/msgr.h b/include/linux/ceph/msgr.h index 680d3d648cac..680d3d648cac 100644 --- a/fs/ceph/msgr.h +++ b/include/linux/ceph/msgr.h diff --git a/fs/ceph/osd_client.h b/include/linux/ceph/osd_client.h index ce776989ef6a..6c91fb032c39 100644 --- a/fs/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -15,6 +15,7 @@ struct ceph_snap_context; struct ceph_osd_request; struct ceph_osd_client; struct ceph_authorizer; +struct ceph_pagelist; /* * completion callback for async writepages @@ -68,6 +69,7 @@ struct ceph_osd_request { struct list_head r_unsafe_item; struct inode *r_inode; /* for use by callbacks */ + void *r_priv; /* ditto */ char r_oid[40]; /* object name */ int r_oid_len; @@ -80,6 +82,11 @@ struct ceph_osd_request { struct page **r_pages; /* pages for data payload */ int r_pages_from_pool; int r_own_pages; /* if true, i own page list */ +#ifdef CONFIG_BLOCK + struct bio *r_bio; /* instead of pages */ +#endif + + struct ceph_pagelist *r_trail; /* trailing part of the data */ }; struct ceph_osd_client { @@ -110,6 +117,42 @@ struct ceph_osd_client { struct ceph_msgpool msgpool_op_reply; }; +struct ceph_osd_req_op { + u16 op; /* CEPH_OSD_OP_* */ + u32 flags; /* CEPH_OSD_FLAG_* */ + union { + struct { + u64 offset, length; + u64 truncate_size; + u32 truncate_seq; + } extent; + struct { + const char *name; + u32 name_len; + const char *val; + u32 value_len; + __u8 cmp_op; /* CEPH_OSD_CMPXATTR_OP_* */ + __u8 cmp_mode; /* CEPH_OSD_CMPXATTR_MODE_* */ + } xattr; + struct { + const char *class_name; + __u8 class_len; + const char *method_name; + __u8 method_len; + __u8 argc; + const char *indata; + u32 indata_len; + } cls; + struct { + u64 cookie, count; + } pgls; + struct { + u64 snapid; + } snap; + }; + u32 payload_len; +}; + extern int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client); extern void ceph_osdc_stop(struct ceph_osd_client *osdc); @@ -119,6 +162,30 @@ extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc, extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg); +extern void ceph_calc_raw_layout(struct ceph_osd_client *osdc, + struct ceph_file_layout *layout, + u64 snapid, + u64 off, u64 *plen, u64 *bno, + struct ceph_osd_request *req, + struct ceph_osd_req_op *op); + +extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, + int flags, + struct ceph_snap_context *snapc, + struct ceph_osd_req_op *ops, + bool use_mempool, + gfp_t gfp_flags, + struct page **pages, + struct bio *bio); + +extern void ceph_osdc_build_request(struct ceph_osd_request *req, + u64 off, u64 *plen, + struct ceph_osd_req_op *src_ops, + struct ceph_snap_context *snapc, + struct timespec *mtime, + const char *oid, + int oid_len); + extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *, struct ceph_file_layout *layout, struct ceph_vino vino, diff --git a/fs/ceph/osdmap.h b/include/linux/ceph/osdmap.h index 970b547e510d..ba4c205cbb01 100644 --- a/fs/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -4,7 +4,7 @@ #include <linux/rbtree.h> #include "types.h" #include "ceph_fs.h" -#include "crush/crush.h" +#include <linux/crush/crush.h> /* * The osd map describes the current membership of the osd cluster and @@ -125,4 +125,6 @@ extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid, extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid); +extern int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name); + #endif diff --git a/fs/ceph/pagelist.h b/include/linux/ceph/pagelist.h index e8a4187e1087..9660d6b0a35d 100644 --- a/fs/ceph/pagelist.h +++ b/include/linux/ceph/pagelist.h @@ -8,6 +8,14 @@ struct ceph_pagelist { void *mapped_tail; size_t length; size_t room; + struct list_head free_list; + size_t num_pages_free; +}; + +struct ceph_pagelist_cursor { + struct ceph_pagelist *pl; /* pagelist, for error checking */ + struct list_head *page_lru; /* page in list */ + size_t room; /* room remaining to reset to */ }; static inline void ceph_pagelist_init(struct ceph_pagelist *pl) @@ -16,10 +24,23 @@ static inline void ceph_pagelist_init(struct ceph_pagelist *pl) pl->mapped_tail = NULL; pl->length = 0; pl->room = 0; + INIT_LIST_HEAD(&pl->free_list); + pl->num_pages_free = 0; } + extern int ceph_pagelist_release(struct ceph_pagelist *pl); -extern int ceph_pagelist_append(struct ceph_pagelist *pl, void *d, size_t l); +extern int ceph_pagelist_append(struct ceph_pagelist *pl, const void *d, size_t l); + +extern int ceph_pagelist_reserve(struct ceph_pagelist *pl, size_t space); + +extern int ceph_pagelist_free_reserve(struct ceph_pagelist *pl); + +extern void ceph_pagelist_set_cursor(struct ceph_pagelist *pl, + struct ceph_pagelist_cursor *c); + +extern int ceph_pagelist_truncate(struct ceph_pagelist *pl, + struct ceph_pagelist_cursor *c); static inline int ceph_pagelist_encode_64(struct ceph_pagelist *pl, u64 v) { diff --git a/fs/ceph/rados.h b/include/linux/ceph/rados.h index 6d5247f2e81b..6d5247f2e81b 100644 --- a/fs/ceph/rados.h +++ b/include/linux/ceph/rados.h diff --git a/fs/ceph/types.h b/include/linux/ceph/types.h index 28b35a005ec2..28b35a005ec2 100644 --- a/fs/ceph/types.h +++ b/include/linux/ceph/types.h diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 0c991023ee47..709dfb901d11 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -75,7 +75,7 @@ struct cgroup_subsys_state { unsigned long flags; /* ID for this css, if possible */ - struct css_id *id; + struct css_id __rcu *id; }; /* bits in struct cgroup_subsys_state flags field */ @@ -205,7 +205,7 @@ struct cgroup { struct list_head children; /* my children */ struct cgroup *parent; /* my parent */ - struct dentry *dentry; /* cgroup fs entry, RCU protected */ + struct dentry __rcu *dentry; /* cgroup fs entry, RCU protected */ /* Private pointers for each registered subsystem */ struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT]; diff --git a/include/linux/compiler.h b/include/linux/compiler.h index c1a62c56a660..320d6c94ff84 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -16,7 +16,11 @@ # define __release(x) __context__(x,-1) # define __cond_lock(x,c) ((c) ? ({ __acquire(x); 1; }) : 0) # define __percpu __attribute__((noderef, address_space(3))) +#ifdef CONFIG_SPARSE_RCU_POINTER +# define __rcu __attribute__((noderef, address_space(4))) +#else # define __rcu +#endif extern void __chk_user_ptr(const volatile void __user *); extern void __chk_io_ptr(const volatile void __iomem *); #else diff --git a/include/linux/coredump.h b/include/linux/coredump.h index 8ba66a9d9022..ba4b85a6d9b8 100644 --- a/include/linux/coredump.h +++ b/include/linux/coredump.h @@ -9,37 +9,7 @@ * These are the only things you should do on a core-file: use only these * functions to write out all the necessary info. */ -static inline int dump_write(struct file *file, const void *addr, int nr) -{ - return file->f_op->write(file, addr, nr, &file->f_pos) == nr; -} - -static inline int dump_seek(struct file *file, loff_t off) -{ - int ret = 1; - - if (file->f_op->llseek && file->f_op->llseek != no_llseek) { - if (file->f_op->llseek(file, off, SEEK_CUR) < 0) - return 0; - } else { - char *buf = (char *)get_zeroed_page(GFP_KERNEL); - - if (!buf) - return 0; - while (off > 0) { - unsigned long n = off; - - if (n > PAGE_SIZE) - n = PAGE_SIZE; - if (!dump_write(file, buf, n)) { - ret = 0; - break; - } - off -= n; - } - free_page((unsigned long)buf); - } - return ret; -} +extern int dump_write(struct file *file, const void *addr, int nr); +extern int dump_seek(struct file *file, loff_t off); #endif /* _LINUX_COREDUMP_H */ diff --git a/include/linux/cred.h b/include/linux/cred.h index 4d2c39573f36..4aaeab376446 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -84,7 +84,7 @@ struct thread_group_cred { atomic_t usage; pid_t tgid; /* thread group process ID */ spinlock_t lock; - struct key *session_keyring; /* keyring inherited over fork */ + struct key __rcu *session_keyring; /* keyring inherited over fork */ struct key *process_keyring; /* keyring private to this process */ struct rcu_head rcu; /* RCU deletion hook */ }; diff --git a/fs/ceph/crush/crush.h b/include/linux/crush/crush.h index 97e435b191f4..97e435b191f4 100644 --- a/fs/ceph/crush/crush.h +++ b/include/linux/crush/crush.h diff --git a/fs/ceph/crush/hash.h b/include/linux/crush/hash.h index 91e884230d5d..91e884230d5d 100644 --- a/fs/ceph/crush/hash.h +++ b/include/linux/crush/hash.h diff --git a/fs/ceph/crush/mapper.h b/include/linux/crush/mapper.h index c46b99c18bb0..c46b99c18bb0 100644 --- a/fs/ceph/crush/mapper.h +++ b/include/linux/crush/mapper.h diff --git a/include/linux/debug_locks.h b/include/linux/debug_locks.h index 29b3ce3f2a1d..2833452ea01c 100644 --- a/include/linux/debug_locks.h +++ b/include/linux/debug_locks.h @@ -49,7 +49,6 @@ struct task_struct; #ifdef CONFIG_LOCKDEP extern void debug_show_all_locks(void); -extern void __debug_show_held_locks(struct task_struct *task); extern void debug_show_held_locks(struct task_struct *task); extern void debug_check_no_locks_freed(const void *from, unsigned long len); extern void debug_check_no_locks_held(struct task_struct *task); @@ -58,10 +57,6 @@ static inline void debug_show_all_locks(void) { } -static inline void __debug_show_held_locks(struct task_struct *task) -{ -} - static inline void debug_show_held_locks(struct task_struct *task) { } diff --git a/include/linux/dynamic_debug.h b/include/linux/dynamic_debug.h index 52c0da4bdd18..bef3cda44c4c 100644 --- a/include/linux/dynamic_debug.h +++ b/include/linux/dynamic_debug.h @@ -1,6 +1,8 @@ #ifndef _DYNAMIC_DEBUG_H #define _DYNAMIC_DEBUG_H +#include <linux/jump_label.h> + /* dynamic_printk_enabled, and dynamic_printk_enabled2 are bitmasks in which * bit n is set to 1 if any modname hashes into the bucket n, 0 otherwise. They * use independent hash functions, to reduce the chance of false positives. @@ -22,8 +24,6 @@ struct _ddebug { const char *function; const char *filename; const char *format; - char primary_hash; - char secondary_hash; unsigned int lineno:24; /* * The flags field controls the behaviour at the callsite. @@ -33,6 +33,7 @@ struct _ddebug { #define _DPRINTK_FLAGS_PRINT (1<<0) /* printk() a message using the format */ #define _DPRINTK_FLAGS_DEFAULT 0 unsigned int flags:8; + char enabled; } __attribute__((aligned(8))); @@ -42,33 +43,35 @@ int ddebug_add_module(struct _ddebug *tab, unsigned int n, #if defined(CONFIG_DYNAMIC_DEBUG) extern int ddebug_remove_module(const char *mod_name); -#define __dynamic_dbg_enabled(dd) ({ \ - int __ret = 0; \ - if (unlikely((dynamic_debug_enabled & (1LL << DEBUG_HASH)) && \ - (dynamic_debug_enabled2 & (1LL << DEBUG_HASH2)))) \ - if (unlikely(dd.flags)) \ - __ret = 1; \ - __ret; }) - #define dynamic_pr_debug(fmt, ...) do { \ + __label__ do_printk; \ + __label__ out; \ static struct _ddebug descriptor \ __used \ __attribute__((section("__verbose"), aligned(8))) = \ - { KBUILD_MODNAME, __func__, __FILE__, fmt, DEBUG_HASH, \ - DEBUG_HASH2, __LINE__, _DPRINTK_FLAGS_DEFAULT }; \ - if (__dynamic_dbg_enabled(descriptor)) \ - printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__); \ + { KBUILD_MODNAME, __func__, __FILE__, fmt, __LINE__, \ + _DPRINTK_FLAGS_DEFAULT }; \ + JUMP_LABEL(&descriptor.enabled, do_printk); \ + goto out; \ +do_printk: \ + printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__); \ +out: ; \ } while (0) #define dynamic_dev_dbg(dev, fmt, ...) do { \ + __label__ do_printk; \ + __label__ out; \ static struct _ddebug descriptor \ __used \ __attribute__((section("__verbose"), aligned(8))) = \ - { KBUILD_MODNAME, __func__, __FILE__, fmt, DEBUG_HASH, \ - DEBUG_HASH2, __LINE__, _DPRINTK_FLAGS_DEFAULT }; \ - if (__dynamic_dbg_enabled(descriptor)) \ - dev_printk(KERN_DEBUG, dev, fmt, ##__VA_ARGS__); \ + { KBUILD_MODNAME, __func__, __FILE__, fmt, __LINE__, \ + _DPRINTK_FLAGS_DEFAULT }; \ + JUMP_LABEL(&descriptor.enabled, do_printk); \ + goto out; \ +do_printk: \ + dev_printk(KERN_DEBUG, dev, fmt, ##__VA_ARGS__); \ +out: ; \ } while (0) #else diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h index f59ed297b661..133c0ba25e30 100644 --- a/include/linux/fdtable.h +++ b/include/linux/fdtable.h @@ -31,7 +31,7 @@ struct embedded_fd_set { struct fdtable { unsigned int max_fds; - struct file ** fd; /* current fd array */ + struct file __rcu **fd; /* current fd array */ fd_set *close_on_exec; fd_set *open_fds; struct rcu_head rcu; @@ -46,7 +46,7 @@ struct files_struct { * read mostly part */ atomic_t count; - struct fdtable *fdt; + struct fdtable __rcu *fdt; struct fdtable fdtab; /* * written part on a separate cache line in SMP @@ -55,7 +55,7 @@ struct files_struct { int next_fd; struct embedded_fd_set close_on_exec_init; struct embedded_fd_set open_fds_init; - struct file * fd_array[NR_OPEN_DEFAULT]; + struct file __rcu * fd_array[NR_OPEN_DEFAULT]; }; #define rcu_dereference_check_fdtable(files, fdtfd) \ diff --git a/include/linux/fs.h b/include/linux/fs.h index 63d069bd80b7..3168dcfb94f2 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1384,7 +1384,7 @@ struct super_block { * Saved mount options for lazy filesystems using * generic_show_options() */ - char *s_options; + char __rcu *s_options; }; extern struct timespec current_fs_time(struct super_block *sb); diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 02b8b24f8f51..8beabb958f61 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -191,8 +191,8 @@ struct ftrace_event_call { unsigned int flags; #ifdef CONFIG_PERF_EVENTS - int perf_refcount; - struct hlist_head *perf_events; + int perf_refcount; + struct hlist_head __percpu *perf_events; #endif }; @@ -252,8 +252,8 @@ DECLARE_PER_CPU(struct pt_regs, perf_trace_regs); extern int perf_trace_init(struct perf_event *event); extern void perf_trace_destroy(struct perf_event *event); -extern int perf_trace_enable(struct perf_event *event); -extern void perf_trace_disable(struct perf_event *event); +extern int perf_trace_add(struct perf_event *event, int flags); +extern void perf_trace_del(struct perf_event *event, int flags); extern int ftrace_profile_set_filter(struct perf_event *event, int event_id, char *filter_str); extern void ftrace_profile_free_filter(struct perf_event *event); diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 5f2f4c4d8fb0..af3f06b41dc1 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -129,8 +129,8 @@ struct blk_scsi_cmd_filter { struct disk_part_tbl { struct rcu_head rcu_head; int len; - struct hd_struct *last_lookup; - struct hd_struct *part[]; + struct hd_struct __rcu *last_lookup; + struct hd_struct __rcu *part[]; }; struct gendisk { @@ -149,7 +149,7 @@ struct gendisk { * non-critical accesses use RCU. Always access through * helpers. */ - struct disk_part_tbl *part_tbl; + struct disk_part_tbl __rcu *part_tbl; struct hd_struct part0; const struct block_device_operations *fops; diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index ff43e9268449..96c323ac44df 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -146,7 +146,7 @@ extern void account_system_vtime(struct task_struct *tsk); #endif #if defined(CONFIG_NO_HZ) -#if defined(CONFIG_TINY_RCU) +#if defined(CONFIG_TINY_RCU) || defined(CONFIG_TINY_PREEMPT_RCU) extern void rcu_enter_nohz(void); extern void rcu_exit_nohz(void); diff --git a/include/linux/idr.h b/include/linux/idr.h index e968db71e33a..cdb715e58e3e 100644 --- a/include/linux/idr.h +++ b/include/linux/idr.h @@ -50,14 +50,14 @@ struct idr_layer { unsigned long bitmap; /* A zero bit means "space here" */ - struct idr_layer *ary[1<<IDR_BITS]; + struct idr_layer __rcu *ary[1<<IDR_BITS]; int count; /* When zero, we can release it */ int layer; /* distance from leaf */ struct rcu_head rcu_head; }; struct idr { - struct idr_layer *top; + struct idr_layer __rcu *top; struct idr_layer *id_free; int layers; /* only valid without concurrent changes */ int id_free_cnt; diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 1f43fa56f600..2fea6c8ef6ba 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -82,11 +82,17 @@ extern struct group_info init_groups; # define CAP_INIT_BSET CAP_FULL_SET #ifdef CONFIG_TREE_PREEMPT_RCU +#define INIT_TASK_RCU_TREE_PREEMPT() \ + .rcu_blocked_node = NULL, +#else +#define INIT_TASK_RCU_TREE_PREEMPT(tsk) +#endif +#ifdef CONFIG_PREEMPT_RCU #define INIT_TASK_RCU_PREEMPT(tsk) \ .rcu_read_lock_nesting = 0, \ .rcu_read_unlock_special = 0, \ - .rcu_blocked_node = NULL, \ - .rcu_node_entry = LIST_HEAD_INIT(tsk.rcu_node_entry), + .rcu_node_entry = LIST_HEAD_INIT(tsk.rcu_node_entry), \ + INIT_TASK_RCU_TREE_PREEMPT() #else #define INIT_TASK_RCU_PREEMPT(tsk) #endif @@ -137,8 +143,8 @@ extern struct cred init_cred; .children = LIST_HEAD_INIT(tsk.children), \ .sibling = LIST_HEAD_INIT(tsk.sibling), \ .group_leader = &tsk, \ - .real_cred = &init_cred, \ - .cred = &init_cred, \ + RCU_INIT_POINTER(.real_cred, &init_cred), \ + RCU_INIT_POINTER(.cred, &init_cred), \ .cred_guard_mutex = \ __MUTEX_INITIALIZER(tsk.cred_guard_mutex), \ .comm = "swapper", \ diff --git a/include/linux/input.h b/include/linux/input.h index 896a92227bc4..d6ae1761be97 100644 --- a/include/linux/input.h +++ b/include/linux/input.h @@ -1196,7 +1196,7 @@ struct input_dev { int (*flush)(struct input_dev *dev, struct file *file); int (*event)(struct input_dev *dev, unsigned int type, unsigned int code, int value); - struct input_handle *grab; + struct input_handle __rcu *grab; spinlock_t event_lock; struct mutex mutex; diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index a0384a4d1e6f..531495db1708 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -18,6 +18,7 @@ #include <asm/atomic.h> #include <asm/ptrace.h> #include <asm/system.h> +#include <trace/events/irq.h> /* * These correspond to the IORESOURCE_IRQ_* defines in @@ -407,7 +408,12 @@ asmlinkage void do_softirq(void); asmlinkage void __do_softirq(void); extern void open_softirq(int nr, void (*action)(struct softirq_action *)); extern void softirq_init(void); -#define __raise_softirq_irqoff(nr) do { or_softirq_pending(1UL << (nr)); } while (0) +static inline void __raise_softirq_irqoff(unsigned int nr) +{ + trace_softirq_raise((struct softirq_action *)(unsigned long)nr, NULL); + or_softirq_pending(1UL << nr); +} + extern void raise_softirq_irqoff(unsigned int nr); extern void raise_softirq(unsigned int nr); extern void wakeup_softirqd(void); diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h index 64d529133031..3e70b21884a9 100644 --- a/include/linux/iocontext.h +++ b/include/linux/iocontext.h @@ -53,7 +53,7 @@ struct io_context { struct radix_tree_root radix_root; struct hlist_head cic_list; - void *ioc_data; + void __rcu *ioc_data; }; static inline struct io_context *ioc_task_link(struct io_context *ioc) diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h new file mode 100644 index 000000000000..4fa09d4d0b71 --- /dev/null +++ b/include/linux/irq_work.h @@ -0,0 +1,20 @@ +#ifndef _LINUX_IRQ_WORK_H +#define _LINUX_IRQ_WORK_H + +struct irq_work { + struct irq_work *next; + void (*func)(struct irq_work *); +}; + +static inline +void init_irq_work(struct irq_work *entry, void (*func)(struct irq_work *)) +{ + entry->next = NULL; + entry->func = func; +} + +bool irq_work_queue(struct irq_work *entry); +void irq_work_run(void); +void irq_work_sync(struct irq_work *entry); + +#endif /* _LINUX_IRQ_WORK_H */ diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h new file mode 100644 index 000000000000..b67cb180e6e9 --- /dev/null +++ b/include/linux/jump_label.h @@ -0,0 +1,74 @@ +#ifndef _LINUX_JUMP_LABEL_H +#define _LINUX_JUMP_LABEL_H + +#if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_HAVE_ARCH_JUMP_LABEL) +# include <asm/jump_label.h> +# define HAVE_JUMP_LABEL +#endif + +enum jump_label_type { + JUMP_LABEL_ENABLE, + JUMP_LABEL_DISABLE +}; + +struct module; + +#ifdef HAVE_JUMP_LABEL + +extern struct jump_entry __start___jump_table[]; +extern struct jump_entry __stop___jump_table[]; + +extern void arch_jump_label_transform(struct jump_entry *entry, + enum jump_label_type type); +extern void arch_jump_label_text_poke_early(jump_label_t addr); +extern void jump_label_update(unsigned long key, enum jump_label_type type); +extern void jump_label_apply_nops(struct module *mod); +extern int jump_label_text_reserved(void *start, void *end); + +#define jump_label_enable(key) \ + jump_label_update((unsigned long)key, JUMP_LABEL_ENABLE); + +#define jump_label_disable(key) \ + jump_label_update((unsigned long)key, JUMP_LABEL_DISABLE); + +#else + +#define JUMP_LABEL(key, label) \ +do { \ + if (unlikely(*key)) \ + goto label; \ +} while (0) + +#define jump_label_enable(cond_var) \ +do { \ + *(cond_var) = 1; \ +} while (0) + +#define jump_label_disable(cond_var) \ +do { \ + *(cond_var) = 0; \ +} while (0) + +static inline int jump_label_apply_nops(struct module *mod) +{ + return 0; +} + +static inline int jump_label_text_reserved(void *start, void *end) +{ + return 0; +} + +#endif + +#define COND_STMT(key, stmt) \ +do { \ + __label__ jl_enabled; \ + JUMP_LABEL(key, jl_enabled); \ + if (0) { \ +jl_enabled: \ + stmt; \ + } \ +} while (0) + +#endif diff --git a/include/linux/jump_label_ref.h b/include/linux/jump_label_ref.h new file mode 100644 index 000000000000..e5d012ad92c6 --- /dev/null +++ b/include/linux/jump_label_ref.h @@ -0,0 +1,44 @@ +#ifndef _LINUX_JUMP_LABEL_REF_H +#define _LINUX_JUMP_LABEL_REF_H + +#include <linux/jump_label.h> +#include <asm/atomic.h> + +#ifdef HAVE_JUMP_LABEL + +static inline void jump_label_inc(atomic_t *key) +{ + if (atomic_add_return(1, key) == 1) + jump_label_enable(key); +} + +static inline void jump_label_dec(atomic_t *key) +{ + if (atomic_dec_and_test(key)) + jump_label_disable(key); +} + +#else /* !HAVE_JUMP_LABEL */ + +static inline void jump_label_inc(atomic_t *key) +{ + atomic_inc(key); +} + +static inline void jump_label_dec(atomic_t *key) +{ + atomic_dec(key); +} + +#undef JUMP_LABEL +#define JUMP_LABEL(key, label) \ +do { \ + if (unlikely(__builtin_choose_expr( \ + __builtin_types_compatible_p(typeof(key), atomic_t *), \ + atomic_read((atomic_t *)(key)), *(key)))) \ + goto label; \ +} while (0) + +#endif /* HAVE_JUMP_LABEL */ + +#endif /* _LINUX_JUMP_LABEL_REF_H */ diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 2b0a35e6bc69..1759ba5adce8 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -58,7 +58,18 @@ extern const char linux_proc_banner[]; #define FIELD_SIZEOF(t, f) (sizeof(((t*)0)->f)) #define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d)) -#define roundup(x, y) ((((x) + ((y) - 1)) / (y)) * (y)) +#define roundup(x, y) ( \ +{ \ + typeof(y) __y = y; \ + (((x) + (__y - 1)) / __y) * __y; \ +} \ +) +#define rounddown(x, y) ( \ +{ \ + typeof(x) __x = (x); \ + __x - (__x % (y)); \ +} \ +) #define DIV_ROUND_CLOSEST(x, divisor)( \ { \ typeof(divisor) __divisor = divisor; \ diff --git a/include/linux/key.h b/include/linux/key.h index cd50dfa1d4c2..3db0adce1fda 100644 --- a/include/linux/key.h +++ b/include/linux/key.h @@ -178,8 +178,9 @@ struct key { */ union { unsigned long value; + void __rcu *rcudata; void *data; - struct keyring_list *subscriptions; + struct keyring_list __rcu *subscriptions; } payload; }; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index c13cc48697aa..ac740b26eb10 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -205,7 +205,7 @@ struct kvm { struct mutex irq_lock; #ifdef CONFIG_HAVE_KVM_IRQCHIP - struct kvm_irq_routing_table *irq_routing; + struct kvm_irq_routing_table __rcu *irq_routing; struct hlist_head mask_notifier_list; struct hlist_head irq_ack_notifier_list; #endif diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 06aed8305bf3..2186a64ee4b5 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -32,6 +32,17 @@ extern int lock_stat; #define MAX_LOCKDEP_SUBCLASSES 8UL /* + * NR_LOCKDEP_CACHING_CLASSES ... Number of classes + * cached in the instance of lockdep_map + * + * Currently main class (subclass == 0) and signle depth subclass + * are cached in lockdep_map. This optimization is mainly targeting + * on rq->lock. double_rq_lock() acquires this highly competitive with + * single depth. + */ +#define NR_LOCKDEP_CACHING_CLASSES 2 + +/* * Lock-classes are keyed via unique addresses, by embedding the * lockclass-key into the kernel (or module) .data section. (For * static locks we use the lock address itself as the key.) @@ -138,7 +149,7 @@ void clear_lock_stats(struct lock_class *class); */ struct lockdep_map { struct lock_class_key *key; - struct lock_class *class_cache; + struct lock_class *class_cache[NR_LOCKDEP_CACHING_CLASSES]; const char *name; #ifdef CONFIG_LOCK_STAT int cpu; diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index ee7e258627f9..cb57d657ce4d 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -299,7 +299,7 @@ struct mm_struct { * new_owner->mm == mm * new_owner->alloc_lock is held */ - struct task_struct *owner; + struct task_struct __rcu *owner; #endif #ifdef CONFIG_PROC_FS diff --git a/include/linux/module.h b/include/linux/module.h index aace066bad8f..b29e7458b966 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -350,7 +350,10 @@ struct module struct tracepoint *tracepoints; unsigned int num_tracepoints; #endif - +#ifdef HAVE_JUMP_LABEL + struct jump_entry *jump_entries; + unsigned int num_jump_entries; +#endif #ifdef CONFIG_TRACING const char **trace_bprintk_fmt_start; unsigned int num_trace_bprintk_fmt; diff --git a/include/linux/netfilter/nfnetlink_conntrack.h b/include/linux/netfilter/nfnetlink_conntrack.h index 9ed534c991b9..70cd0603911c 100644 --- a/include/linux/netfilter/nfnetlink_conntrack.h +++ b/include/linux/netfilter/nfnetlink_conntrack.h @@ -39,8 +39,9 @@ enum ctattr_type { CTA_TUPLE_MASTER, CTA_NAT_SEQ_ADJ_ORIG, CTA_NAT_SEQ_ADJ_REPLY, - CTA_SECMARK, + CTA_SECMARK, /* obsolete */ CTA_ZONE, + CTA_SECCTX, __CTA_MAX }; #define CTA_MAX (__CTA_MAX - 1) @@ -172,4 +173,11 @@ enum ctattr_help { }; #define CTA_HELP_MAX (__CTA_HELP_MAX - 1) +enum ctattr_secctx { + CTA_SECCTX_UNSPEC, + CTA_SECCTX_NAME, + __CTA_SECCTX_MAX +}; +#define CTA_SECCTX_MAX (__CTA_SECCTX_MAX - 1) + #endif /* _IPCONNTRACK_NETLINK_H */ diff --git a/include/linux/netfilter/xt_SECMARK.h b/include/linux/netfilter/xt_SECMARK.h index 6fcd3448b186..989092bd6274 100644 --- a/include/linux/netfilter/xt_SECMARK.h +++ b/include/linux/netfilter/xt_SECMARK.h @@ -11,18 +11,12 @@ * packets are being marked for. */ #define SECMARK_MODE_SEL 0x01 /* SELinux */ -#define SECMARK_SELCTX_MAX 256 - -struct xt_secmark_target_selinux_info { - __u32 selsid; - char selctx[SECMARK_SELCTX_MAX]; -}; +#define SECMARK_SECCTX_MAX 256 struct xt_secmark_target_info { __u8 mode; - union { - struct xt_secmark_target_selinux_info sel; - } u; + __u32 secid; + char secctx[SECMARK_SECCTX_MAX]; }; #endif /*_XT_SECMARK_H_target */ diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 508f8cf6da37..d0edf7d823ae 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -185,7 +185,7 @@ struct nfs_inode { struct nfs4_cached_acl *nfs4_acl; /* NFSv4 state */ struct list_head open_states; - struct nfs_delegation *delegation; + struct nfs_delegation __rcu *delegation; fmode_t delegation_state; struct rw_semaphore rwsem; #endif /* CONFIG_NFS_V4*/ diff --git a/include/linux/notifier.h b/include/linux/notifier.h index b2f1a4d83550..2026f9e1ceb8 100644 --- a/include/linux/notifier.h +++ b/include/linux/notifier.h @@ -49,28 +49,28 @@ struct notifier_block { int (*notifier_call)(struct notifier_block *, unsigned long, void *); - struct notifier_block *next; + struct notifier_block __rcu *next; int priority; }; struct atomic_notifier_head { spinlock_t lock; - struct notifier_block *head; + struct notifier_block __rcu *head; }; struct blocking_notifier_head { struct rw_semaphore rwsem; - struct notifier_block *head; + struct notifier_block __rcu *head; }; struct raw_notifier_head { - struct notifier_block *head; + struct notifier_block __rcu *head; }; struct srcu_notifier_head { struct mutex mutex; struct srcu_struct srcu; - struct notifier_block *head; + struct notifier_block __rcu *head; }; #define ATOMIC_INIT_NOTIFIER_HEAD(name) do { \ diff --git a/include/linux/oprofile.h b/include/linux/oprofile.h index 5171639ecf0f..32fb81212fd1 100644 --- a/include/linux/oprofile.h +++ b/include/linux/oprofile.h @@ -15,6 +15,7 @@ #include <linux/types.h> #include <linux/spinlock.h> +#include <linux/init.h> #include <asm/atomic.h> /* Each escaped entry is prefixed by ESCAPE_CODE @@ -185,4 +186,10 @@ int oprofile_add_data(struct op_entry *entry, unsigned long val); int oprofile_add_data64(struct op_entry *entry, u64 val); int oprofile_write_commit(struct op_entry *entry); +#ifdef CONFIG_PERF_EVENTS +int __init oprofile_perf_init(struct oprofile_operations *ops); +void oprofile_perf_exit(void); +char *op_name_from_perf_id(void); +#endif /* CONFIG_PERF_EVENTS */ + #endif /* OPROFILE_H */ diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 49466b13c5c6..0eb50832aa00 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -39,6 +39,15 @@ preempt_enable(); \ } while (0) +#define get_cpu_ptr(var) ({ \ + preempt_disable(); \ + this_cpu_ptr(var); }) + +#define put_cpu_ptr(var) do { \ + (void)(var); \ + preempt_enable(); \ +} while (0) + #ifdef CONFIG_SMP /* minimum unit size, also is the maximum supported allocation size */ diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 716f99b682c1..057bf22a8323 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -486,6 +486,8 @@ struct perf_guest_info_callbacks { #include <linux/workqueue.h> #include <linux/ftrace.h> #include <linux/cpu.h> +#include <linux/irq_work.h> +#include <linux/jump_label_ref.h> #include <asm/atomic.h> #include <asm/local.h> @@ -529,16 +531,22 @@ struct hw_perf_event { int last_cpu; }; struct { /* software */ - s64 remaining; struct hrtimer hrtimer; }; #ifdef CONFIG_HAVE_HW_BREAKPOINT struct { /* breakpoint */ struct arch_hw_breakpoint info; struct list_head bp_list; + /* + * Crufty hack to avoid the chicken and egg + * problem hw_breakpoint has with context + * creation and event initalization. + */ + struct task_struct *bp_target; }; #endif }; + int state; local64_t prev_count; u64 sample_period; u64 last_period; @@ -550,6 +558,13 @@ struct hw_perf_event { #endif }; +/* + * hw_perf_event::state flags + */ +#define PERF_HES_STOPPED 0x01 /* the counter is stopped */ +#define PERF_HES_UPTODATE 0x02 /* event->count up-to-date */ +#define PERF_HES_ARCH 0x04 + struct perf_event; /* @@ -561,36 +576,70 @@ struct perf_event; * struct pmu - generic performance monitoring unit */ struct pmu { - int (*enable) (struct perf_event *event); - void (*disable) (struct perf_event *event); - int (*start) (struct perf_event *event); - void (*stop) (struct perf_event *event); - void (*read) (struct perf_event *event); - void (*unthrottle) (struct perf_event *event); + struct list_head entry; + + int * __percpu pmu_disable_count; + struct perf_cpu_context * __percpu pmu_cpu_context; + int task_ctx_nr; + + /* + * Fully disable/enable this PMU, can be used to protect from the PMI + * as well as for lazy/batch writing of the MSRs. + */ + void (*pmu_enable) (struct pmu *pmu); /* optional */ + void (*pmu_disable) (struct pmu *pmu); /* optional */ /* - * Group events scheduling is treated as a transaction, add group - * events as a whole and perform one schedulability test. If the test - * fails, roll back the whole group + * Try and initialize the event for this PMU. + * Should return -ENOENT when the @event doesn't match this PMU. */ + int (*event_init) (struct perf_event *event); + +#define PERF_EF_START 0x01 /* start the counter when adding */ +#define PERF_EF_RELOAD 0x02 /* reload the counter when starting */ +#define PERF_EF_UPDATE 0x04 /* update the counter when stopping */ /* - * Start the transaction, after this ->enable() doesn't need - * to do schedulability tests. + * Adds/Removes a counter to/from the PMU, can be done inside + * a transaction, see the ->*_txn() methods. */ - void (*start_txn) (const struct pmu *pmu); + int (*add) (struct perf_event *event, int flags); + void (*del) (struct perf_event *event, int flags); + /* - * If ->start_txn() disabled the ->enable() schedulability test + * Starts/Stops a counter present on the PMU. The PMI handler + * should stop the counter when perf_event_overflow() returns + * !0. ->start() will be used to continue. + */ + void (*start) (struct perf_event *event, int flags); + void (*stop) (struct perf_event *event, int flags); + + /* + * Updates the counter value of the event. + */ + void (*read) (struct perf_event *event); + + /* + * Group events scheduling is treated as a transaction, add + * group events as a whole and perform one schedulability test. + * If the test fails, roll back the whole group + * + * Start the transaction, after this ->add() doesn't need to + * do schedulability tests. + */ + void (*start_txn) (struct pmu *pmu); /* optional */ + /* + * If ->start_txn() disabled the ->add() schedulability test * then ->commit_txn() is required to perform one. On success * the transaction is closed. On error the transaction is kept * open until ->cancel_txn() is called. */ - int (*commit_txn) (const struct pmu *pmu); + int (*commit_txn) (struct pmu *pmu); /* optional */ /* - * Will cancel the transaction, assumes ->disable() is called for - * each successfull ->enable() during the transaction. + * Will cancel the transaction, assumes ->del() is called + * for each successfull ->add() during the transaction. */ - void (*cancel_txn) (const struct pmu *pmu); + void (*cancel_txn) (struct pmu *pmu); /* optional */ }; /** @@ -631,11 +680,6 @@ struct perf_buffer { void *data_pages[0]; }; -struct perf_pending_entry { - struct perf_pending_entry *next; - void (*func)(struct perf_pending_entry *); -}; - struct perf_sample_data; typedef void (*perf_overflow_handler_t)(struct perf_event *, int, @@ -656,6 +700,7 @@ struct swevent_hlist { #define PERF_ATTACH_CONTEXT 0x01 #define PERF_ATTACH_GROUP 0x02 +#define PERF_ATTACH_TASK 0x04 /** * struct perf_event - performance event kernel representation: @@ -669,7 +714,7 @@ struct perf_event { int nr_siblings; int group_flags; struct perf_event *group_leader; - const struct pmu *pmu; + struct pmu *pmu; enum perf_event_active_state state; unsigned int attach_state; @@ -743,7 +788,7 @@ struct perf_event { int pending_wakeup; int pending_kill; int pending_disable; - struct perf_pending_entry pending; + struct irq_work pending; atomic_t event_limit; @@ -763,12 +808,19 @@ struct perf_event { #endif /* CONFIG_PERF_EVENTS */ }; +enum perf_event_context_type { + task_context, + cpu_context, +}; + /** * struct perf_event_context - event context structure * * Used as a container for task events and CPU events as well: */ struct perf_event_context { + enum perf_event_context_type type; + struct pmu *pmu; /* * Protect the states of the events in the list, * nr_active, and the list: @@ -808,6 +860,12 @@ struct perf_event_context { struct rcu_head rcu_head; }; +/* + * Number of contexts where an event can trigger: + * task, softirq, hardirq, nmi. + */ +#define PERF_NR_CONTEXTS 4 + /** * struct perf_event_cpu_context - per cpu event context structure */ @@ -815,18 +873,9 @@ struct perf_cpu_context { struct perf_event_context ctx; struct perf_event_context *task_ctx; int active_oncpu; - int max_pertask; int exclusive; - struct swevent_hlist *swevent_hlist; - struct mutex hlist_mutex; - int hlist_refcount; - - /* - * Recursion avoidance: - * - * task, softirq, irq, nmi context - */ - int recursion[4]; + struct list_head rotation_list; + int jiffies_interval; }; struct perf_output_handle { @@ -842,26 +891,34 @@ struct perf_output_handle { #ifdef CONFIG_PERF_EVENTS -/* - * Set by architecture code: - */ -extern int perf_max_events; +extern int perf_pmu_register(struct pmu *pmu); +extern void perf_pmu_unregister(struct pmu *pmu); + +extern int perf_num_counters(void); +extern const char *perf_pmu_name(void); +extern void __perf_event_task_sched_in(struct task_struct *task); +extern void __perf_event_task_sched_out(struct task_struct *task, struct task_struct *next); -extern const struct pmu *hw_perf_event_init(struct perf_event *event); +extern atomic_t perf_task_events; + +static inline void perf_event_task_sched_in(struct task_struct *task) +{ + COND_STMT(&perf_task_events, __perf_event_task_sched_in(task)); +} + +static inline +void perf_event_task_sched_out(struct task_struct *task, struct task_struct *next) +{ + COND_STMT(&perf_task_events, __perf_event_task_sched_out(task, next)); +} -extern void perf_event_task_sched_in(struct task_struct *task); -extern void perf_event_task_sched_out(struct task_struct *task, struct task_struct *next); -extern void perf_event_task_tick(struct task_struct *task); extern int perf_event_init_task(struct task_struct *child); extern void perf_event_exit_task(struct task_struct *child); extern void perf_event_free_task(struct task_struct *task); -extern void set_perf_event_pending(void); -extern void perf_event_do_pending(void); +extern void perf_event_delayed_put(struct task_struct *task); extern void perf_event_print_debug(void); -extern void __perf_disable(void); -extern bool __perf_enable(void); -extern void perf_disable(void); -extern void perf_enable(void); +extern void perf_pmu_disable(struct pmu *pmu); +extern void perf_pmu_enable(struct pmu *pmu); extern int perf_event_task_disable(void); extern int perf_event_task_enable(void); extern void perf_event_update_userpage(struct perf_event *event); @@ -869,7 +926,7 @@ extern int perf_event_release_kernel(struct perf_event *event); extern struct perf_event * perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, - pid_t pid, + struct task_struct *task, perf_overflow_handler_t callback); extern u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running); @@ -920,14 +977,7 @@ extern int perf_event_overflow(struct perf_event *event, int nmi, */ static inline int is_software_event(struct perf_event *event) { - switch (event->attr.type) { - case PERF_TYPE_SOFTWARE: - case PERF_TYPE_TRACEPOINT: - /* for now the breakpoint stuff also works as software event */ - case PERF_TYPE_BREAKPOINT: - return 1; - } - return 0; + return event->pmu->task_ctx_nr == perf_sw_context; } extern atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX]; @@ -954,18 +1004,20 @@ static inline void perf_fetch_caller_regs(struct pt_regs *regs) perf_arch_fetch_caller_regs(regs, CALLER_ADDR0); } -static inline void +static __always_inline void perf_sw_event(u32 event_id, u64 nr, int nmi, struct pt_regs *regs, u64 addr) { - if (atomic_read(&perf_swevent_enabled[event_id])) { - struct pt_regs hot_regs; - - if (!regs) { - perf_fetch_caller_regs(&hot_regs); - regs = &hot_regs; - } - __perf_sw_event(event_id, nr, nmi, regs, addr); + struct pt_regs hot_regs; + + JUMP_LABEL(&perf_swevent_enabled[event_id], have_event); + return; + +have_event: + if (!regs) { + perf_fetch_caller_regs(&hot_regs); + regs = &hot_regs; } + __perf_sw_event(event_id, nr, nmi, regs, addr); } extern void perf_event_mmap(struct vm_area_struct *vma); @@ -976,7 +1028,21 @@ extern int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks extern void perf_event_comm(struct task_struct *tsk); extern void perf_event_fork(struct task_struct *tsk); -extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs); +/* Callchains */ +DECLARE_PER_CPU(struct perf_callchain_entry, perf_callchain_entry); + +extern void perf_callchain_user(struct perf_callchain_entry *entry, + struct pt_regs *regs); +extern void perf_callchain_kernel(struct perf_callchain_entry *entry, + struct pt_regs *regs); + + +static inline void +perf_callchain_store(struct perf_callchain_entry *entry, u64 ip) +{ + if (entry->nr < PERF_MAX_STACK_DEPTH) + entry->ip[entry->nr++] = ip; +} extern int sysctl_perf_event_paranoid; extern int sysctl_perf_event_mlock; @@ -1019,21 +1085,18 @@ extern int perf_swevent_get_recursion_context(void); extern void perf_swevent_put_recursion_context(int rctx); extern void perf_event_enable(struct perf_event *event); extern void perf_event_disable(struct perf_event *event); +extern void perf_event_task_tick(void); #else static inline void perf_event_task_sched_in(struct task_struct *task) { } static inline void perf_event_task_sched_out(struct task_struct *task, struct task_struct *next) { } -static inline void -perf_event_task_tick(struct task_struct *task) { } static inline int perf_event_init_task(struct task_struct *child) { return 0; } static inline void perf_event_exit_task(struct task_struct *child) { } static inline void perf_event_free_task(struct task_struct *task) { } -static inline void perf_event_do_pending(void) { } +static inline void perf_event_delayed_put(struct task_struct *task) { } static inline void perf_event_print_debug(void) { } -static inline void perf_disable(void) { } -static inline void perf_enable(void) { } static inline int perf_event_task_disable(void) { return -EINVAL; } static inline int perf_event_task_enable(void) { return -EINVAL; } @@ -1056,6 +1119,7 @@ static inline int perf_swevent_get_recursion_context(void) { return -1; } static inline void perf_swevent_put_recursion_context(int rctx) { } static inline void perf_event_enable(struct perf_event *event) { } static inline void perf_event_disable(struct perf_event *event) { } +static inline void perf_event_task_tick(void) { } #endif #define perf_output_put(handle, x) \ diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 634b8e674ac5..a39cbed9ee17 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -47,6 +47,8 @@ static inline void *radix_tree_indirect_to_ptr(void *ptr) { return (void *)((unsigned long)ptr & ~RADIX_TREE_INDIRECT_PTR); } +#define radix_tree_indirect_to_ptr(ptr) \ + radix_tree_indirect_to_ptr((void __force *)(ptr)) static inline int radix_tree_is_indirect_ptr(void *ptr) { @@ -61,7 +63,7 @@ static inline int radix_tree_is_indirect_ptr(void *ptr) struct radix_tree_root { unsigned int height; gfp_t gfp_mask; - struct radix_tree_node *rnode; + struct radix_tree_node __rcu *rnode; }; #define RADIX_TREE_INIT(mask) { \ diff --git a/include/linux/rculist.h b/include/linux/rculist.h index 4ec3b38ce9c5..f31ef61f1c65 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -10,6 +10,21 @@ #include <linux/rcupdate.h> /* + * Why is there no list_empty_rcu()? Because list_empty() serves this + * purpose. The list_empty() function fetches the RCU-protected pointer + * and compares it to the address of the list head, but neither dereferences + * this pointer itself nor provides this pointer to the caller. Therefore, + * it is not necessary to use rcu_dereference(), so that list_empty() can + * be used anywhere you would want to use a list_empty_rcu(). + */ + +/* + * return the ->next pointer of a list_head in an rcu safe + * way, we must not access it directly + */ +#define list_next_rcu(list) (*((struct list_head __rcu **)(&(list)->next))) + +/* * Insert a new entry between two known consecutive entries. * * This is only for internal list manipulation where we know @@ -20,7 +35,7 @@ static inline void __list_add_rcu(struct list_head *new, { new->next = next; new->prev = prev; - rcu_assign_pointer(prev->next, new); + rcu_assign_pointer(list_next_rcu(prev), new); next->prev = new; } @@ -138,7 +153,7 @@ static inline void list_replace_rcu(struct list_head *old, { new->next = old->next; new->prev = old->prev; - rcu_assign_pointer(new->prev->next, new); + rcu_assign_pointer(list_next_rcu(new->prev), new); new->next->prev = new; old->prev = LIST_POISON2; } @@ -193,7 +208,7 @@ static inline void list_splice_init_rcu(struct list_head *list, */ last->next = at; - rcu_assign_pointer(head->next, first); + rcu_assign_pointer(list_next_rcu(head), first); first->prev = head; at->prev = last; } @@ -208,7 +223,9 @@ static inline void list_splice_init_rcu(struct list_head *list, * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock(). */ #define list_entry_rcu(ptr, type, member) \ - container_of(rcu_dereference_raw(ptr), type, member) + ({typeof (*ptr) __rcu *__ptr = (typeof (*ptr) __rcu __force *)ptr; \ + container_of((typeof(ptr))rcu_dereference_raw(__ptr), type, member); \ + }) /** * list_first_entry_rcu - get the first element from a list @@ -225,9 +242,9 @@ static inline void list_splice_init_rcu(struct list_head *list, list_entry_rcu((ptr)->next, type, member) #define __list_for_each_rcu(pos, head) \ - for (pos = rcu_dereference_raw((head)->next); \ + for (pos = rcu_dereference_raw(list_next_rcu(head)); \ pos != (head); \ - pos = rcu_dereference_raw(pos->next)) + pos = rcu_dereference_raw(list_next_rcu((pos))) /** * list_for_each_entry_rcu - iterate over rcu list of given type @@ -257,9 +274,9 @@ static inline void list_splice_init_rcu(struct list_head *list, * as long as the traversal is guarded by rcu_read_lock(). */ #define list_for_each_continue_rcu(pos, head) \ - for ((pos) = rcu_dereference_raw((pos)->next); \ + for ((pos) = rcu_dereference_raw(list_next_rcu(pos)); \ prefetch((pos)->next), (pos) != (head); \ - (pos) = rcu_dereference_raw((pos)->next)) + (pos) = rcu_dereference_raw(list_next_rcu(pos))) /** * list_for_each_entry_continue_rcu - continue iteration over list of given type @@ -314,12 +331,19 @@ static inline void hlist_replace_rcu(struct hlist_node *old, new->next = next; new->pprev = old->pprev; - rcu_assign_pointer(*new->pprev, new); + rcu_assign_pointer(*(struct hlist_node __rcu **)new->pprev, new); if (next) new->next->pprev = &new->next; old->pprev = LIST_POISON2; } +/* + * return the first or the next element in an RCU protected hlist + */ +#define hlist_first_rcu(head) (*((struct hlist_node __rcu **)(&(head)->first))) +#define hlist_next_rcu(node) (*((struct hlist_node __rcu **)(&(node)->next))) +#define hlist_pprev_rcu(node) (*((struct hlist_node __rcu **)((node)->pprev))) + /** * hlist_add_head_rcu * @n: the element to add to the hash list. @@ -346,7 +370,7 @@ static inline void hlist_add_head_rcu(struct hlist_node *n, n->next = first; n->pprev = &h->first; - rcu_assign_pointer(h->first, n); + rcu_assign_pointer(hlist_first_rcu(h), n); if (first) first->pprev = &n->next; } @@ -374,7 +398,7 @@ static inline void hlist_add_before_rcu(struct hlist_node *n, { n->pprev = next->pprev; n->next = next; - rcu_assign_pointer(*(n->pprev), n); + rcu_assign_pointer(hlist_pprev_rcu(n), n); next->pprev = &n->next; } @@ -401,15 +425,15 @@ static inline void hlist_add_after_rcu(struct hlist_node *prev, { n->next = prev->next; n->pprev = &prev->next; - rcu_assign_pointer(prev->next, n); + rcu_assign_pointer(hlist_next_rcu(prev), n); if (n->next) n->next->pprev = &n->next; } -#define __hlist_for_each_rcu(pos, head) \ - for (pos = rcu_dereference((head)->first); \ - pos && ({ prefetch(pos->next); 1; }); \ - pos = rcu_dereference(pos->next)) +#define __hlist_for_each_rcu(pos, head) \ + for (pos = rcu_dereference(hlist_first_rcu(head)); \ + pos && ({ prefetch(pos->next); 1; }); \ + pos = rcu_dereference(hlist_next_rcu(pos))) /** * hlist_for_each_entry_rcu - iterate over rcu list of given type @@ -422,11 +446,11 @@ static inline void hlist_add_after_rcu(struct hlist_node *prev, * the _rcu list-mutation primitives such as hlist_add_head_rcu() * as long as the traversal is guarded by rcu_read_lock(). */ -#define hlist_for_each_entry_rcu(tpos, pos, head, member) \ - for (pos = rcu_dereference_raw((head)->first); \ +#define hlist_for_each_entry_rcu(tpos, pos, head, member) \ + for (pos = rcu_dereference_raw(hlist_first_rcu(head)); \ pos && ({ prefetch(pos->next); 1; }) && \ ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1; }); \ - pos = rcu_dereference_raw(pos->next)) + pos = rcu_dereference_raw(hlist_next_rcu(pos))) /** * hlist_for_each_entry_rcu_bh - iterate over rcu list of given type diff --git a/include/linux/rculist_nulls.h b/include/linux/rculist_nulls.h index b70ffe53cb9f..2ae13714828b 100644 --- a/include/linux/rculist_nulls.h +++ b/include/linux/rculist_nulls.h @@ -37,6 +37,12 @@ static inline void hlist_nulls_del_init_rcu(struct hlist_nulls_node *n) } } +#define hlist_nulls_first_rcu(head) \ + (*((struct hlist_nulls_node __rcu __force **)&(head)->first)) + +#define hlist_nulls_next_rcu(node) \ + (*((struct hlist_nulls_node __rcu __force **)&(node)->next)) + /** * hlist_nulls_del_rcu - deletes entry from hash list without re-initialization * @n: the element to delete from the hash list. @@ -88,7 +94,7 @@ static inline void hlist_nulls_add_head_rcu(struct hlist_nulls_node *n, n->next = first; n->pprev = &h->first; - rcu_assign_pointer(h->first, n); + rcu_assign_pointer(hlist_nulls_first_rcu(h), n); if (!is_a_nulls(first)) first->pprev = &n->next; } @@ -100,11 +106,11 @@ static inline void hlist_nulls_add_head_rcu(struct hlist_nulls_node *n, * @member: the name of the hlist_nulls_node within the struct. * */ -#define hlist_nulls_for_each_entry_rcu(tpos, pos, head, member) \ - for (pos = rcu_dereference_raw((head)->first); \ - (!is_a_nulls(pos)) && \ +#define hlist_nulls_for_each_entry_rcu(tpos, pos, head, member) \ + for (pos = rcu_dereference_raw(hlist_nulls_first_rcu(head)); \ + (!is_a_nulls(pos)) && \ ({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1; }); \ - pos = rcu_dereference_raw(pos->next)) + pos = rcu_dereference_raw(hlist_nulls_next_rcu(pos))) #endif #endif diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 83af1f8d8b74..03cda7bed985 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -41,11 +41,15 @@ #include <linux/lockdep.h> #include <linux/completion.h> #include <linux/debugobjects.h> +#include <linux/compiler.h> #ifdef CONFIG_RCU_TORTURE_TEST extern int rcutorture_runnable; /* for sysctl */ #endif /* #ifdef CONFIG_RCU_TORTURE_TEST */ +#define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b)) +#define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b)) + /** * struct rcu_head - callback structure for use with RCU * @next: next update requests in a list @@ -57,29 +61,94 @@ struct rcu_head { }; /* Exported common interfaces */ -extern void rcu_barrier(void); +extern void call_rcu_sched(struct rcu_head *head, + void (*func)(struct rcu_head *rcu)); +extern void synchronize_sched(void); extern void rcu_barrier_bh(void); extern void rcu_barrier_sched(void); extern void synchronize_sched_expedited(void); extern int sched_expedited_torture_stats(char *page); +static inline void __rcu_read_lock_bh(void) +{ + local_bh_disable(); +} + +static inline void __rcu_read_unlock_bh(void) +{ + local_bh_enable(); +} + +#ifdef CONFIG_PREEMPT_RCU + +extern void __rcu_read_lock(void); +extern void __rcu_read_unlock(void); +void synchronize_rcu(void); + +/* + * Defined as a macro as it is a very low level header included from + * areas that don't even know about current. This gives the rcu_read_lock() + * nesting depth, but makes sense only if CONFIG_PREEMPT_RCU -- in other + * types of kernel builds, the rcu_read_lock() nesting depth is unknowable. + */ +#define rcu_preempt_depth() (current->rcu_read_lock_nesting) + +#else /* #ifdef CONFIG_PREEMPT_RCU */ + +static inline void __rcu_read_lock(void) +{ + preempt_disable(); +} + +static inline void __rcu_read_unlock(void) +{ + preempt_enable(); +} + +static inline void synchronize_rcu(void) +{ + synchronize_sched(); +} + +static inline int rcu_preempt_depth(void) +{ + return 0; +} + +#endif /* #else #ifdef CONFIG_PREEMPT_RCU */ + /* Internal to kernel */ extern void rcu_init(void); +extern void rcu_sched_qs(int cpu); +extern void rcu_bh_qs(int cpu); +extern void rcu_check_callbacks(int cpu, int user); +struct notifier_block; + +#ifdef CONFIG_NO_HZ + +extern void rcu_enter_nohz(void); +extern void rcu_exit_nohz(void); + +#else /* #ifdef CONFIG_NO_HZ */ + +static inline void rcu_enter_nohz(void) +{ +} + +static inline void rcu_exit_nohz(void) +{ +} + +#endif /* #else #ifdef CONFIG_NO_HZ */ #if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) #include <linux/rcutree.h> -#elif defined(CONFIG_TINY_RCU) +#elif defined(CONFIG_TINY_RCU) || defined(CONFIG_TINY_PREEMPT_RCU) #include <linux/rcutiny.h> #else #error "Unknown RCU implementation specified to kernel configuration" #endif -#define RCU_HEAD_INIT { .next = NULL, .func = NULL } -#define RCU_HEAD(head) struct rcu_head head = RCU_HEAD_INIT -#define INIT_RCU_HEAD(ptr) do { \ - (ptr)->next = NULL; (ptr)->func = NULL; \ -} while (0) - /* * init_rcu_head_on_stack()/destroy_rcu_head_on_stack() are needed for dynamic * initialization and destruction of rcu_head on the stack. rcu_head structures @@ -120,14 +189,15 @@ extern struct lockdep_map rcu_sched_lock_map; extern int debug_lockdep_rcu_enabled(void); /** - * rcu_read_lock_held - might we be in RCU read-side critical section? + * rcu_read_lock_held() - might we be in RCU read-side critical section? * * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an RCU * read-side critical section. In absence of CONFIG_DEBUG_LOCK_ALLOC, * this assumes we are in an RCU read-side critical section unless it can - * prove otherwise. + * prove otherwise. This is useful for debug checks in functions that + * require that they be called within an RCU read-side critical section. * - * Check debug_lockdep_rcu_enabled() to prevent false positives during boot + * Checks debug_lockdep_rcu_enabled() to prevent false positives during boot * and while lockdep is disabled. */ static inline int rcu_read_lock_held(void) @@ -144,14 +214,16 @@ static inline int rcu_read_lock_held(void) extern int rcu_read_lock_bh_held(void); /** - * rcu_read_lock_sched_held - might we be in RCU-sched read-side critical section? + * rcu_read_lock_sched_held() - might we be in RCU-sched read-side critical section? * * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an * RCU-sched read-side critical section. In absence of * CONFIG_DEBUG_LOCK_ALLOC, this assumes we are in an RCU-sched read-side * critical section unless it can prove otherwise. Note that disabling * of preemption (including disabling irqs) counts as an RCU-sched - * read-side critical section. + * read-side critical section. This is useful for debug checks in functions + * that required that they be called within an RCU-sched read-side + * critical section. * * Check debug_lockdep_rcu_enabled() to prevent false positives during boot * and while lockdep is disabled. @@ -211,7 +283,11 @@ static inline int rcu_read_lock_sched_held(void) extern int rcu_my_thread_group_empty(void); -#define __do_rcu_dereference_check(c) \ +/** + * rcu_lockdep_assert - emit lockdep splat if specified condition not met + * @c: condition to check + */ +#define rcu_lockdep_assert(c) \ do { \ static bool __warned; \ if (debug_lockdep_rcu_enabled() && !__warned && !(c)) { \ @@ -220,41 +296,163 @@ extern int rcu_my_thread_group_empty(void); } \ } while (0) +#else /* #ifdef CONFIG_PROVE_RCU */ + +#define rcu_lockdep_assert(c) do { } while (0) + +#endif /* #else #ifdef CONFIG_PROVE_RCU */ + +/* + * Helper functions for rcu_dereference_check(), rcu_dereference_protected() + * and rcu_assign_pointer(). Some of these could be folded into their + * callers, but they are left separate in order to ease introduction of + * multiple flavors of pointers to match the multiple flavors of RCU + * (e.g., __rcu_bh, * __rcu_sched, and __srcu), should this make sense in + * the future. + */ + +#ifdef __CHECKER__ +#define rcu_dereference_sparse(p, space) \ + ((void)(((typeof(*p) space *)p) == p)) +#else /* #ifdef __CHECKER__ */ +#define rcu_dereference_sparse(p, space) +#endif /* #else #ifdef __CHECKER__ */ + +#define __rcu_access_pointer(p, space) \ + ({ \ + typeof(*p) *_________p1 = (typeof(*p)*__force )ACCESS_ONCE(p); \ + rcu_dereference_sparse(p, space); \ + ((typeof(*p) __force __kernel *)(_________p1)); \ + }) +#define __rcu_dereference_check(p, c, space) \ + ({ \ + typeof(*p) *_________p1 = (typeof(*p)*__force )ACCESS_ONCE(p); \ + rcu_lockdep_assert(c); \ + rcu_dereference_sparse(p, space); \ + smp_read_barrier_depends(); \ + ((typeof(*p) __force __kernel *)(_________p1)); \ + }) +#define __rcu_dereference_protected(p, c, space) \ + ({ \ + rcu_lockdep_assert(c); \ + rcu_dereference_sparse(p, space); \ + ((typeof(*p) __force __kernel *)(p)); \ + }) + +#define __rcu_dereference_index_check(p, c) \ + ({ \ + typeof(p) _________p1 = ACCESS_ONCE(p); \ + rcu_lockdep_assert(c); \ + smp_read_barrier_depends(); \ + (_________p1); \ + }) +#define __rcu_assign_pointer(p, v, space) \ + ({ \ + if (!__builtin_constant_p(v) || \ + ((v) != NULL)) \ + smp_wmb(); \ + (p) = (typeof(*v) __force space *)(v); \ + }) + + +/** + * rcu_access_pointer() - fetch RCU pointer with no dereferencing + * @p: The pointer to read + * + * Return the value of the specified RCU-protected pointer, but omit the + * smp_read_barrier_depends() and keep the ACCESS_ONCE(). This is useful + * when the value of this pointer is accessed, but the pointer is not + * dereferenced, for example, when testing an RCU-protected pointer against + * NULL. Although rcu_access_pointer() may also be used in cases where + * update-side locks prevent the value of the pointer from changing, you + * should instead use rcu_dereference_protected() for this use case. + */ +#define rcu_access_pointer(p) __rcu_access_pointer((p), __rcu) + /** - * rcu_dereference_check - rcu_dereference with debug checking + * rcu_dereference_check() - rcu_dereference with debug checking * @p: The pointer to read, prior to dereferencing * @c: The conditions under which the dereference will take place * * Do an rcu_dereference(), but check that the conditions under which the - * dereference will take place are correct. Typically the conditions indicate - * the various locking conditions that should be held at that point. The check - * should return true if the conditions are satisfied. + * dereference will take place are correct. Typically the conditions + * indicate the various locking conditions that should be held at that + * point. The check should return true if the conditions are satisfied. + * An implicit check for being in an RCU read-side critical section + * (rcu_read_lock()) is included. * * For example: * - * bar = rcu_dereference_check(foo->bar, rcu_read_lock_held() || - * lockdep_is_held(&foo->lock)); + * bar = rcu_dereference_check(foo->bar, lockdep_is_held(&foo->lock)); * * could be used to indicate to lockdep that foo->bar may only be dereferenced - * if either the RCU read lock is held, or that the lock required to replace + * if either rcu_read_lock() is held, or that the lock required to replace * the bar struct at foo->bar is held. * * Note that the list of conditions may also include indications of when a lock * need not be held, for example during initialisation or destruction of the * target struct: * - * bar = rcu_dereference_check(foo->bar, rcu_read_lock_held() || - * lockdep_is_held(&foo->lock) || + * bar = rcu_dereference_check(foo->bar, lockdep_is_held(&foo->lock) || * atomic_read(&foo->usage) == 0); + * + * Inserts memory barriers on architectures that require them + * (currently only the Alpha), prevents the compiler from refetching + * (and from merging fetches), and, more importantly, documents exactly + * which pointers are protected by RCU and checks that the pointer is + * annotated as __rcu. */ #define rcu_dereference_check(p, c) \ - ({ \ - __do_rcu_dereference_check(c); \ - rcu_dereference_raw(p); \ - }) + __rcu_dereference_check((p), rcu_read_lock_held() || (c), __rcu) + +/** + * rcu_dereference_bh_check() - rcu_dereference_bh with debug checking + * @p: The pointer to read, prior to dereferencing + * @c: The conditions under which the dereference will take place + * + * This is the RCU-bh counterpart to rcu_dereference_check(). + */ +#define rcu_dereference_bh_check(p, c) \ + __rcu_dereference_check((p), rcu_read_lock_bh_held() || (c), __rcu) /** - * rcu_dereference_protected - fetch RCU pointer when updates prevented + * rcu_dereference_sched_check() - rcu_dereference_sched with debug checking + * @p: The pointer to read, prior to dereferencing + * @c: The conditions under which the dereference will take place + * + * This is the RCU-sched counterpart to rcu_dereference_check(). + */ +#define rcu_dereference_sched_check(p, c) \ + __rcu_dereference_check((p), rcu_read_lock_sched_held() || (c), \ + __rcu) + +#define rcu_dereference_raw(p) rcu_dereference_check(p, 1) /*@@@ needed? @@@*/ + +/** + * rcu_dereference_index_check() - rcu_dereference for indices with debug checking + * @p: The pointer to read, prior to dereferencing + * @c: The conditions under which the dereference will take place + * + * Similar to rcu_dereference_check(), but omits the sparse checking. + * This allows rcu_dereference_index_check() to be used on integers, + * which can then be used as array indices. Attempting to use + * rcu_dereference_check() on an integer will give compiler warnings + * because the sparse address-space mechanism relies on dereferencing + * the RCU-protected pointer. Dereferencing integers is not something + * that even gcc will put up with. + * + * Note that this function does not implicitly check for RCU read-side + * critical sections. If this function gains lots of uses, it might + * make sense to provide versions for each flavor of RCU, but it does + * not make sense as of early 2010. + */ +#define rcu_dereference_index_check(p, c) \ + __rcu_dereference_index_check((p), (c)) + +/** + * rcu_dereference_protected() - fetch RCU pointer when updates prevented + * @p: The pointer to read, prior to dereferencing + * @c: The conditions under which the dereference will take place * * Return the value of the specified RCU-protected pointer, but omit * both the smp_read_barrier_depends() and the ACCESS_ONCE(). This @@ -263,35 +461,61 @@ extern int rcu_my_thread_group_empty(void); * prevent the compiler from repeating this reference or combining it * with other references, so it should not be used without protection * of appropriate locks. + * + * This function is only for update-side use. Using this function + * when protected only by rcu_read_lock() will result in infrequent + * but very ugly failures. */ #define rcu_dereference_protected(p, c) \ - ({ \ - __do_rcu_dereference_check(c); \ - (p); \ - }) + __rcu_dereference_protected((p), (c), __rcu) -#else /* #ifdef CONFIG_PROVE_RCU */ +/** + * rcu_dereference_bh_protected() - fetch RCU-bh pointer when updates prevented + * @p: The pointer to read, prior to dereferencing + * @c: The conditions under which the dereference will take place + * + * This is the RCU-bh counterpart to rcu_dereference_protected(). + */ +#define rcu_dereference_bh_protected(p, c) \ + __rcu_dereference_protected((p), (c), __rcu) -#define rcu_dereference_check(p, c) rcu_dereference_raw(p) -#define rcu_dereference_protected(p, c) (p) +/** + * rcu_dereference_sched_protected() - fetch RCU-sched pointer when updates prevented + * @p: The pointer to read, prior to dereferencing + * @c: The conditions under which the dereference will take place + * + * This is the RCU-sched counterpart to rcu_dereference_protected(). + */ +#define rcu_dereference_sched_protected(p, c) \ + __rcu_dereference_protected((p), (c), __rcu) -#endif /* #else #ifdef CONFIG_PROVE_RCU */ /** - * rcu_access_pointer - fetch RCU pointer with no dereferencing + * rcu_dereference() - fetch RCU-protected pointer for dereferencing + * @p: The pointer to read, prior to dereferencing * - * Return the value of the specified RCU-protected pointer, but omit the - * smp_read_barrier_depends() and keep the ACCESS_ONCE(). This is useful - * when the value of this pointer is accessed, but the pointer is not - * dereferenced, for example, when testing an RCU-protected pointer against - * NULL. This may also be used in cases where update-side locks prevent - * the value of the pointer from changing, but rcu_dereference_protected() - * is a lighter-weight primitive for this use case. + * This is a simple wrapper around rcu_dereference_check(). + */ +#define rcu_dereference(p) rcu_dereference_check(p, 0) + +/** + * rcu_dereference_bh() - fetch an RCU-bh-protected pointer for dereferencing + * @p: The pointer to read, prior to dereferencing + * + * Makes rcu_dereference_check() do the dirty work. + */ +#define rcu_dereference_bh(p) rcu_dereference_bh_check(p, 0) + +/** + * rcu_dereference_sched() - fetch RCU-sched-protected pointer for dereferencing + * @p: The pointer to read, prior to dereferencing + * + * Makes rcu_dereference_check() do the dirty work. */ -#define rcu_access_pointer(p) ACCESS_ONCE(p) +#define rcu_dereference_sched(p) rcu_dereference_sched_check(p, 0) /** - * rcu_read_lock - mark the beginning of an RCU read-side critical section. + * rcu_read_lock() - mark the beginning of an RCU read-side critical section * * When synchronize_rcu() is invoked on one CPU while other CPUs * are within RCU read-side critical sections, then the @@ -302,7 +526,7 @@ extern int rcu_my_thread_group_empty(void); * until after the all the other CPUs exit their critical sections. * * Note, however, that RCU callbacks are permitted to run concurrently - * with RCU read-side critical sections. One way that this can happen + * with new RCU read-side critical sections. One way that this can happen * is via the following sequence of events: (1) CPU 0 enters an RCU * read-side critical section, (2) CPU 1 invokes call_rcu() to register * an RCU callback, (3) CPU 0 exits the RCU read-side critical section, @@ -317,7 +541,20 @@ extern int rcu_my_thread_group_empty(void); * will be deferred until the outermost RCU read-side critical section * completes. * - * It is illegal to block while in an RCU read-side critical section. + * You can avoid reading and understanding the next paragraph by + * following this rule: don't put anything in an rcu_read_lock() RCU + * read-side critical section that would block in a !PREEMPT kernel. + * But if you want the full story, read on! + * + * In non-preemptible RCU implementations (TREE_RCU and TINY_RCU), it + * is illegal to block while in an RCU read-side critical section. In + * preemptible RCU implementations (TREE_PREEMPT_RCU and TINY_PREEMPT_RCU) + * in CONFIG_PREEMPT kernel builds, RCU read-side critical sections may + * be preempted, but explicit blocking is illegal. Finally, in preemptible + * RCU implementations in real-time (CONFIG_PREEMPT_RT) kernel builds, + * RCU read-side critical sections may be preempted and they may also + * block, but only when acquiring spinlocks that are subject to priority + * inheritance. */ static inline void rcu_read_lock(void) { @@ -337,7 +574,7 @@ static inline void rcu_read_lock(void) */ /** - * rcu_read_unlock - marks the end of an RCU read-side critical section. + * rcu_read_unlock() - marks the end of an RCU read-side critical section. * * See rcu_read_lock() for more information. */ @@ -349,15 +586,16 @@ static inline void rcu_read_unlock(void) } /** - * rcu_read_lock_bh - mark the beginning of a softirq-only RCU critical section + * rcu_read_lock_bh() - mark the beginning of an RCU-bh critical section * * This is equivalent of rcu_read_lock(), but to be used when updates - * are being done using call_rcu_bh(). Since call_rcu_bh() callbacks - * consider completion of a softirq handler to be a quiescent state, - * a process in RCU read-side critical section must be protected by - * disabling softirqs. Read-side critical sections in interrupt context - * can use just rcu_read_lock(). - * + * are being done using call_rcu_bh() or synchronize_rcu_bh(). Since + * both call_rcu_bh() and synchronize_rcu_bh() consider completion of a + * softirq handler to be a quiescent state, a process in RCU read-side + * critical section must be protected by disabling softirqs. Read-side + * critical sections in interrupt context can use just rcu_read_lock(), + * though this should at least be commented to avoid confusing people + * reading the code. */ static inline void rcu_read_lock_bh(void) { @@ -379,13 +617,12 @@ static inline void rcu_read_unlock_bh(void) } /** - * rcu_read_lock_sched - mark the beginning of a RCU-classic critical section + * rcu_read_lock_sched() - mark the beginning of a RCU-sched critical section * - * Should be used with either - * - synchronize_sched() - * or - * - call_rcu_sched() and rcu_barrier_sched() - * on the write-side to insure proper synchronization. + * This is equivalent of rcu_read_lock(), but to be used when updates + * are being done using call_rcu_sched() or synchronize_rcu_sched(). + * Read-side critical sections can also be introduced by anything that + * disables preemption, including local_irq_disable() and friends. */ static inline void rcu_read_lock_sched(void) { @@ -420,54 +657,14 @@ static inline notrace void rcu_read_unlock_sched_notrace(void) preempt_enable_notrace(); } - /** - * rcu_dereference_raw - fetch an RCU-protected pointer + * rcu_assign_pointer() - assign to RCU-protected pointer + * @p: pointer to assign to + * @v: value to assign (publish) * - * The caller must be within some flavor of RCU read-side critical - * section, or must be otherwise preventing the pointer from changing, - * for example, by holding an appropriate lock. This pointer may later - * be safely dereferenced. It is the caller's responsibility to have - * done the right thing, as this primitive does no checking of any kind. - * - * Inserts memory barriers on architectures that require them - * (currently only the Alpha), and, more importantly, documents - * exactly which pointers are protected by RCU. - */ -#define rcu_dereference_raw(p) ({ \ - typeof(p) _________p1 = ACCESS_ONCE(p); \ - smp_read_barrier_depends(); \ - (_________p1); \ - }) - -/** - * rcu_dereference - fetch an RCU-protected pointer, checking for RCU - * - * Makes rcu_dereference_check() do the dirty work. - */ -#define rcu_dereference(p) \ - rcu_dereference_check(p, rcu_read_lock_held()) - -/** - * rcu_dereference_bh - fetch an RCU-protected pointer, checking for RCU-bh - * - * Makes rcu_dereference_check() do the dirty work. - */ -#define rcu_dereference_bh(p) \ - rcu_dereference_check(p, rcu_read_lock_bh_held() || irqs_disabled()) - -/** - * rcu_dereference_sched - fetch RCU-protected pointer, checking for RCU-sched - * - * Makes rcu_dereference_check() do the dirty work. - */ -#define rcu_dereference_sched(p) \ - rcu_dereference_check(p, rcu_read_lock_sched_held()) - -/** - * rcu_assign_pointer - assign (publicize) a pointer to a newly - * initialized structure that will be dereferenced by RCU read-side - * critical sections. Returns the value assigned. + * Assigns the specified value to the specified RCU-protected + * pointer, ensuring that any concurrent RCU readers will see + * any prior initialization. Returns the value assigned. * * Inserts memory barriers on architectures that require them * (pretty much all of them other than x86), and also prevents @@ -476,14 +673,17 @@ static inline notrace void rcu_read_unlock_sched_notrace(void) * call documents which pointers will be dereferenced by RCU read-side * code. */ - #define rcu_assign_pointer(p, v) \ - ({ \ - if (!__builtin_constant_p(v) || \ - ((v) != NULL)) \ - smp_wmb(); \ - (p) = (v); \ - }) + __rcu_assign_pointer((p), (v), __rcu) + +/** + * RCU_INIT_POINTER() - initialize an RCU protected pointer + * + * Initialize an RCU-protected pointer in such a way to avoid RCU-lockdep + * splats. + */ +#define RCU_INIT_POINTER(p, v) \ + p = (typeof(*v) __force __rcu *)(v) /* Infrastructure to implement the synchronize_() primitives. */ @@ -494,26 +694,37 @@ struct rcu_synchronize { extern void wakeme_after_rcu(struct rcu_head *head); +#ifdef CONFIG_PREEMPT_RCU + /** - * call_rcu - Queue an RCU callback for invocation after a grace period. + * call_rcu() - Queue an RCU callback for invocation after a grace period. * @head: structure to be used for queueing the RCU updates. - * @func: actual update function to be invoked after the grace period + * @func: actual callback function to be invoked after the grace period * - * The update function will be invoked some time after a full grace - * period elapses, in other words after all currently executing RCU - * read-side critical sections have completed. RCU read-side critical + * The callback function will be invoked some time after a full grace + * period elapses, in other words after all pre-existing RCU read-side + * critical sections have completed. However, the callback function + * might well execute concurrently with RCU read-side critical sections + * that started after call_rcu() was invoked. RCU read-side critical * sections are delimited by rcu_read_lock() and rcu_read_unlock(), * and may be nested. */ extern void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *head)); +#else /* #ifdef CONFIG_PREEMPT_RCU */ + +/* In classic RCU, call_rcu() is just call_rcu_sched(). */ +#define call_rcu call_rcu_sched + +#endif /* #else #ifdef CONFIG_PREEMPT_RCU */ + /** - * call_rcu_bh - Queue an RCU for invocation after a quicker grace period. + * call_rcu_bh() - Queue an RCU for invocation after a quicker grace period. * @head: structure to be used for queueing the RCU updates. - * @func: actual update function to be invoked after the grace period + * @func: actual callback function to be invoked after the grace period * - * The update function will be invoked some time after a full grace + * The callback function will be invoked some time after a full grace * period elapses, in other words after all currently executing RCU * read-side critical sections have completed. call_rcu_bh() assumes * that the read-side critical sections end on completion of a softirq @@ -566,37 +777,4 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head) } #endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ -#ifndef CONFIG_PROVE_RCU -#define __do_rcu_dereference_check(c) do { } while (0) -#endif /* #ifdef CONFIG_PROVE_RCU */ - -#define __rcu_dereference_index_check(p, c) \ - ({ \ - typeof(p) _________p1 = ACCESS_ONCE(p); \ - __do_rcu_dereference_check(c); \ - smp_read_barrier_depends(); \ - (_________p1); \ - }) - -/** - * rcu_dereference_index_check() - rcu_dereference for indices with debug checking - * @p: The pointer to read, prior to dereferencing - * @c: The conditions under which the dereference will take place - * - * Similar to rcu_dereference_check(), but omits the sparse checking. - * This allows rcu_dereference_index_check() to be used on integers, - * which can then be used as array indices. Attempting to use - * rcu_dereference_check() on an integer will give compiler warnings - * because the sparse address-space mechanism relies on dereferencing - * the RCU-protected pointer. Dereferencing integers is not something - * that even gcc will put up with. - * - * Note that this function does not implicitly check for RCU read-side - * critical sections. If this function gains lots of uses, it might - * make sense to provide versions for each flavor of RCU, but it does - * not make sense as of early 2010. - */ -#define rcu_dereference_index_check(p, c) \ - __rcu_dereference_index_check((p), (c)) - #endif /* __LINUX_RCUPDATE_H */ diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index e2e893144a84..13877cb93a60 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -27,103 +27,101 @@ #include <linux/cache.h> -void rcu_sched_qs(int cpu); -void rcu_bh_qs(int cpu); -static inline void rcu_note_context_switch(int cpu) -{ - rcu_sched_qs(cpu); -} +#define rcu_init_sched() do { } while (0) -#define __rcu_read_lock() preempt_disable() -#define __rcu_read_unlock() preempt_enable() -#define __rcu_read_lock_bh() local_bh_disable() -#define __rcu_read_unlock_bh() local_bh_enable() -#define call_rcu_sched call_rcu +#ifdef CONFIG_TINY_RCU -#define rcu_init_sched() do { } while (0) -extern void rcu_check_callbacks(int cpu, int user); +static inline void synchronize_rcu_expedited(void) +{ + synchronize_sched(); /* Only one CPU, so pretty fast anyway!!! */ +} -static inline int rcu_needs_cpu(int cpu) +static inline void rcu_barrier(void) { - return 0; + rcu_barrier_sched(); /* Only one CPU, so only one list of callbacks! */ } -/* - * Return the number of grace periods. - */ -static inline long rcu_batches_completed(void) +#else /* #ifdef CONFIG_TINY_RCU */ + +void rcu_barrier(void); +void synchronize_rcu_expedited(void); + +#endif /* #else #ifdef CONFIG_TINY_RCU */ + +static inline void synchronize_rcu_bh(void) { - return 0; + synchronize_sched(); } -/* - * Return the number of bottom-half grace periods. - */ -static inline long rcu_batches_completed_bh(void) +static inline void synchronize_rcu_bh_expedited(void) { - return 0; + synchronize_sched(); } -static inline void rcu_force_quiescent_state(void) +#ifdef CONFIG_TINY_RCU + +static inline void rcu_preempt_note_context_switch(void) { } -static inline void rcu_bh_force_quiescent_state(void) +static inline void exit_rcu(void) { } -static inline void rcu_sched_force_quiescent_state(void) +static inline int rcu_needs_cpu(int cpu) { + return 0; } -extern void synchronize_sched(void); +#else /* #ifdef CONFIG_TINY_RCU */ + +void rcu_preempt_note_context_switch(void); +extern void exit_rcu(void); +int rcu_preempt_needs_cpu(void); -static inline void synchronize_rcu(void) +static inline int rcu_needs_cpu(int cpu) { - synchronize_sched(); + return rcu_preempt_needs_cpu(); } -static inline void synchronize_rcu_bh(void) +#endif /* #else #ifdef CONFIG_TINY_RCU */ + +static inline void rcu_note_context_switch(int cpu) { - synchronize_sched(); + rcu_sched_qs(cpu); + rcu_preempt_note_context_switch(); } -static inline void synchronize_rcu_expedited(void) +/* + * Return the number of grace periods. + */ +static inline long rcu_batches_completed(void) { - synchronize_sched(); + return 0; } -static inline void synchronize_rcu_bh_expedited(void) +/* + * Return the number of bottom-half grace periods. + */ +static inline long rcu_batches_completed_bh(void) { - synchronize_sched(); + return 0; } -struct notifier_block; - -#ifdef CONFIG_NO_HZ - -extern void rcu_enter_nohz(void); -extern void rcu_exit_nohz(void); - -#else /* #ifdef CONFIG_NO_HZ */ - -static inline void rcu_enter_nohz(void) +static inline void rcu_force_quiescent_state(void) { } -static inline void rcu_exit_nohz(void) +static inline void rcu_bh_force_quiescent_state(void) { } -#endif /* #else #ifdef CONFIG_NO_HZ */ - -static inline void exit_rcu(void) +static inline void rcu_sched_force_quiescent_state(void) { } -static inline int rcu_preempt_depth(void) +static inline void rcu_cpu_stall_reset(void) { - return 0; } #ifdef CONFIG_DEBUG_LOCK_ALLOC diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index c0ed1c056f29..95518e628794 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -30,64 +30,23 @@ #ifndef __LINUX_RCUTREE_H #define __LINUX_RCUTREE_H -struct notifier_block; - -extern void rcu_sched_qs(int cpu); -extern void rcu_bh_qs(int cpu); extern void rcu_note_context_switch(int cpu); extern int rcu_needs_cpu(int cpu); +extern void rcu_cpu_stall_reset(void); #ifdef CONFIG_TREE_PREEMPT_RCU -extern void __rcu_read_lock(void); -extern void __rcu_read_unlock(void); -extern void synchronize_rcu(void); extern void exit_rcu(void); -/* - * Defined as macro as it is a very low level header - * included from areas that don't even know about current - */ -#define rcu_preempt_depth() (current->rcu_read_lock_nesting) - #else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ -static inline void __rcu_read_lock(void) -{ - preempt_disable(); -} - -static inline void __rcu_read_unlock(void) -{ - preempt_enable(); -} - -#define synchronize_rcu synchronize_sched - static inline void exit_rcu(void) { } -static inline int rcu_preempt_depth(void) -{ - return 0; -} - #endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ -static inline void __rcu_read_lock_bh(void) -{ - local_bh_disable(); -} -static inline void __rcu_read_unlock_bh(void) -{ - local_bh_enable(); -} - -extern void call_rcu_sched(struct rcu_head *head, - void (*func)(struct rcu_head *rcu)); extern void synchronize_rcu_bh(void); -extern void synchronize_sched(void); extern void synchronize_rcu_expedited(void); static inline void synchronize_rcu_bh_expedited(void) @@ -95,7 +54,7 @@ static inline void synchronize_rcu_bh_expedited(void) synchronize_sched_expedited(); } -extern void rcu_check_callbacks(int cpu, int user); +extern void rcu_barrier(void); extern long rcu_batches_completed(void); extern long rcu_batches_completed_bh(void); @@ -104,18 +63,6 @@ extern void rcu_force_quiescent_state(void); extern void rcu_bh_force_quiescent_state(void); extern void rcu_sched_force_quiescent_state(void); -#ifdef CONFIG_NO_HZ -void rcu_enter_nohz(void); -void rcu_exit_nohz(void); -#else /* CONFIG_NO_HZ */ -static inline void rcu_enter_nohz(void) -{ -} -static inline void rcu_exit_nohz(void) -{ -} -#endif /* CONFIG_NO_HZ */ - /* A context switch is a grace period for RCU-sched and RCU-bh. */ static inline int rcu_blocking_is_gp(void) { diff --git a/include/linux/sched.h b/include/linux/sched.h index 2cca9a92f5e5..0383601a927c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1161,6 +1161,13 @@ struct sched_rt_entity { struct rcu_node; +enum perf_event_task_context { + perf_invalid_context = -1, + perf_hw_context = 0, + perf_sw_context, + perf_nr_task_contexts, +}; + struct task_struct { volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ void *stack; @@ -1203,11 +1210,13 @@ struct task_struct { unsigned int policy; cpumask_t cpus_allowed; -#ifdef CONFIG_TREE_PREEMPT_RCU +#ifdef CONFIG_PREEMPT_RCU int rcu_read_lock_nesting; char rcu_read_unlock_special; - struct rcu_node *rcu_blocked_node; struct list_head rcu_node_entry; +#endif /* #ifdef CONFIG_PREEMPT_RCU */ +#ifdef CONFIG_TREE_PREEMPT_RCU + struct rcu_node *rcu_blocked_node; #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) @@ -1289,9 +1298,9 @@ struct task_struct { struct list_head cpu_timers[3]; /* process credentials */ - const struct cred *real_cred; /* objective and real subjective task + const struct cred __rcu *real_cred; /* objective and real subjective task * credentials (COW) */ - const struct cred *cred; /* effective (overridable) subjective task + const struct cred __rcu *cred; /* effective (overridable) subjective task * credentials (COW) */ struct mutex cred_guard_mutex; /* guard against foreign influences on * credential calculations @@ -1419,7 +1428,7 @@ struct task_struct { #endif #ifdef CONFIG_CGROUPS /* Control Group info protected by css_set_lock */ - struct css_set *cgroups; + struct css_set __rcu *cgroups; /* cg_list protected by css_set_lock and tsk->alloc_lock */ struct list_head cg_list; #endif @@ -1432,7 +1441,7 @@ struct task_struct { struct futex_pi_state *pi_state_cache; #endif #ifdef CONFIG_PERF_EVENTS - struct perf_event_context *perf_event_ctxp; + struct perf_event_context *perf_event_ctxp[perf_nr_task_contexts]; struct mutex perf_event_mutex; struct list_head perf_event_list; #endif @@ -1740,7 +1749,7 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t * #define tsk_used_math(p) ((p)->flags & PF_USED_MATH) #define used_math() tsk_used_math(current) -#ifdef CONFIG_TREE_PREEMPT_RCU +#ifdef CONFIG_PREEMPT_RCU #define RCU_READ_UNLOCK_BLOCKED (1 << 0) /* blocked while in RCU read-side. */ #define RCU_READ_UNLOCK_NEED_QS (1 << 1) /* RCU core needs CPU response. */ @@ -1749,7 +1758,9 @@ static inline void rcu_copy_process(struct task_struct *p) { p->rcu_read_lock_nesting = 0; p->rcu_read_unlock_special = 0; +#ifdef CONFIG_TREE_PREEMPT_RCU p->rcu_blocked_node = NULL; +#endif INIT_LIST_HEAD(&p->rcu_node_entry); } diff --git a/include/linux/security.h b/include/linux/security.h index a22219afff09..b8246a8df7d2 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -74,7 +74,7 @@ extern int cap_file_mmap(struct file *file, unsigned long reqprot, extern int cap_task_fix_setuid(struct cred *new, const struct cred *old, int flags); extern int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5); -extern int cap_task_setscheduler(struct task_struct *p, int policy, struct sched_param *lp); +extern int cap_task_setscheduler(struct task_struct *p); extern int cap_task_setioprio(struct task_struct *p, int ioprio); extern int cap_task_setnice(struct task_struct *p, int nice); extern int cap_syslog(int type, bool from_file); @@ -959,6 +959,12 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * Sets the new child socket's sid to the openreq sid. * @inet_conn_established: * Sets the connection's peersid to the secmark on skb. + * @secmark_relabel_packet: + * check if the process should be allowed to relabel packets to the given secid + * @security_secmark_refcount_inc + * tells the LSM to increment the number of secmark labeling rules loaded + * @security_secmark_refcount_dec + * tells the LSM to decrement the number of secmark labeling rules loaded * @req_classify_flow: * Sets the flow's sid to the openreq sid. * @tun_dev_create: @@ -1279,9 +1285,13 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * Return 0 if permission is granted. * * @secid_to_secctx: - * Convert secid to security context. + * Convert secid to security context. If secdata is NULL the length of + * the result will be returned in seclen, but no secdata will be returned. + * This does mean that the length could change between calls to check the + * length and the next call which actually allocates and returns the secdata. * @secid contains the security ID. * @secdata contains the pointer that stores the converted security context. + * @seclen pointer which contains the length of the data * @secctx_to_secid: * Convert security context to secid. * @secid contains the pointer to the generated security ID. @@ -1501,8 +1511,7 @@ struct security_operations { int (*task_getioprio) (struct task_struct *p); int (*task_setrlimit) (struct task_struct *p, unsigned int resource, struct rlimit *new_rlim); - int (*task_setscheduler) (struct task_struct *p, int policy, - struct sched_param *lp); + int (*task_setscheduler) (struct task_struct *p); int (*task_getscheduler) (struct task_struct *p); int (*task_movememory) (struct task_struct *p); int (*task_kill) (struct task_struct *p, @@ -1594,6 +1603,9 @@ struct security_operations { struct request_sock *req); void (*inet_csk_clone) (struct sock *newsk, const struct request_sock *req); void (*inet_conn_established) (struct sock *sk, struct sk_buff *skb); + int (*secmark_relabel_packet) (u32 secid); + void (*secmark_refcount_inc) (void); + void (*secmark_refcount_dec) (void); void (*req_classify_flow) (const struct request_sock *req, struct flowi *fl); int (*tun_dev_create)(void); void (*tun_dev_post_create)(struct sock *sk); @@ -1752,8 +1764,7 @@ int security_task_setioprio(struct task_struct *p, int ioprio); int security_task_getioprio(struct task_struct *p); int security_task_setrlimit(struct task_struct *p, unsigned int resource, struct rlimit *new_rlim); -int security_task_setscheduler(struct task_struct *p, - int policy, struct sched_param *lp); +int security_task_setscheduler(struct task_struct *p); int security_task_getscheduler(struct task_struct *p); int security_task_movememory(struct task_struct *p); int security_task_kill(struct task_struct *p, struct siginfo *info, @@ -2320,11 +2331,9 @@ static inline int security_task_setrlimit(struct task_struct *p, return 0; } -static inline int security_task_setscheduler(struct task_struct *p, - int policy, - struct sched_param *lp) +static inline int security_task_setscheduler(struct task_struct *p) { - return cap_task_setscheduler(p, policy, lp); + return cap_task_setscheduler(p); } static inline int security_task_getscheduler(struct task_struct *p) @@ -2551,6 +2560,9 @@ void security_inet_csk_clone(struct sock *newsk, const struct request_sock *req); void security_inet_conn_established(struct sock *sk, struct sk_buff *skb); +int security_secmark_relabel_packet(u32 secid); +void security_secmark_refcount_inc(void); +void security_secmark_refcount_dec(void); int security_tun_dev_create(void); void security_tun_dev_post_create(struct sock *sk); int security_tun_dev_attach(struct sock *sk); @@ -2705,6 +2717,19 @@ static inline void security_inet_conn_established(struct sock *sk, { } +static inline int security_secmark_relabel_packet(u32 secid) +{ + return 0; +} + +static inline void security_secmark_refcount_inc(void) +{ +} + +static inline void security_secmark_refcount_dec(void) +{ +} + static inline int security_tun_dev_create(void) { return 0; diff --git a/include/linux/selinux.h b/include/linux/selinux.h index 82e0f26a1299..44f459612690 100644 --- a/include/linux/selinux.h +++ b/include/linux/selinux.h @@ -21,74 +21,11 @@ struct kern_ipc_perm; #ifdef CONFIG_SECURITY_SELINUX /** - * selinux_string_to_sid - map a security context string to a security ID - * @str: the security context string to be mapped - * @sid: ID value returned via this. - * - * Returns 0 if successful, with the SID stored in sid. A value - * of zero for sid indicates no SID could be determined (but no error - * occurred). - */ -int selinux_string_to_sid(char *str, u32 *sid); - -/** - * selinux_secmark_relabel_packet_permission - secmark permission check - * @sid: SECMARK ID value to be applied to network packet - * - * Returns 0 if the current task is allowed to set the SECMARK label of - * packets with the supplied security ID. Note that it is implicit that - * the packet is always being relabeled from the default unlabeled value, - * and that the access control decision is made in the AVC. - */ -int selinux_secmark_relabel_packet_permission(u32 sid); - -/** - * selinux_secmark_refcount_inc - increments the secmark use counter - * - * SELinux keeps track of the current SECMARK targets in use so it knows - * when to apply SECMARK label access checks to network packets. This - * function incements this reference count to indicate that a new SECMARK - * target has been configured. - */ -void selinux_secmark_refcount_inc(void); - -/** - * selinux_secmark_refcount_dec - decrements the secmark use counter - * - * SELinux keeps track of the current SECMARK targets in use so it knows - * when to apply SECMARK label access checks to network packets. This - * function decements this reference count to indicate that one of the - * existing SECMARK targets has been removed/flushed. - */ -void selinux_secmark_refcount_dec(void); - -/** * selinux_is_enabled - is SELinux enabled? */ bool selinux_is_enabled(void); #else -static inline int selinux_string_to_sid(const char *str, u32 *sid) -{ - *sid = 0; - return 0; -} - -static inline int selinux_secmark_relabel_packet_permission(u32 sid) -{ - return 0; -} - -static inline void selinux_secmark_refcount_inc(void) -{ - return; -} - -static inline void selinux_secmark_refcount_dec(void) -{ - return; -} - static inline bool selinux_is_enabled(void) { return false; diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 4d5d2f546dbf..58971e891f48 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -108,19 +108,43 @@ static inline int srcu_read_lock_held(struct srcu_struct *sp) #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ /** - * srcu_dereference - fetch SRCU-protected pointer with checking + * srcu_dereference_check - fetch SRCU-protected pointer for later dereferencing + * @p: the pointer to fetch and protect for later dereferencing + * @sp: pointer to the srcu_struct, which is used to check that we + * really are in an SRCU read-side critical section. + * @c: condition to check for update-side use * - * Makes rcu_dereference_check() do the dirty work. + * If PROVE_RCU is enabled, invoking this outside of an RCU read-side + * critical section will result in an RCU-lockdep splat, unless @c evaluates + * to 1. The @c argument will normally be a logical expression containing + * lockdep_is_held() calls. */ -#define srcu_dereference(p, sp) \ - rcu_dereference_check(p, srcu_read_lock_held(sp)) +#define srcu_dereference_check(p, sp, c) \ + __rcu_dereference_check((p), srcu_read_lock_held(sp) || (c), __rcu) + +/** + * srcu_dereference - fetch SRCU-protected pointer for later dereferencing + * @p: the pointer to fetch and protect for later dereferencing + * @sp: pointer to the srcu_struct, which is used to check that we + * really are in an SRCU read-side critical section. + * + * Makes rcu_dereference_check() do the dirty work. If PROVE_RCU + * is enabled, invoking this outside of an RCU read-side critical + * section will result in an RCU-lockdep splat. + */ +#define srcu_dereference(p, sp) srcu_dereference_check((p), (sp), 0) /** * srcu_read_lock - register a new reader for an SRCU-protected structure. * @sp: srcu_struct in which to register the new reader. * * Enter an SRCU read-side critical section. Note that SRCU read-side - * critical sections may be nested. + * critical sections may be nested. However, it is illegal to + * call anything that waits on an SRCU grace period for the same + * srcu_struct, whether directly or indirectly. Please note that + * one way to indirectly wait on an SRCU grace period is to acquire + * a mutex that is held elsewhere while calling synchronize_srcu() or + * synchronize_srcu_expedited(). */ static inline int srcu_read_lock(struct srcu_struct *sp) __acquires(sp) { diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h index 6b524a0d02e4..1808960c5059 100644 --- a/include/linux/stop_machine.h +++ b/include/linux/stop_machine.h @@ -126,8 +126,8 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus); #else /* CONFIG_STOP_MACHINE && CONFIG_SMP */ -static inline int stop_machine(int (*fn)(void *), void *data, - const struct cpumask *cpus) +static inline int __stop_machine(int (*fn)(void *), void *data, + const struct cpumask *cpus) { int ret; local_irq_disable(); @@ -136,5 +136,11 @@ static inline int stop_machine(int (*fn)(void *), void *data, return ret; } +static inline int stop_machine(int (*fn)(void *), void *data, + const struct cpumask *cpus) +{ + return __stop_machine(fn, data, cpus); +} + #endif /* CONFIG_STOP_MACHINE && CONFIG_SMP */ #endif /* _LINUX_STOP_MACHINE */ diff --git a/include/linux/sunrpc/auth_gss.h b/include/linux/sunrpc/auth_gss.h index 671538d25bc1..8eee9dbbfe7a 100644 --- a/include/linux/sunrpc/auth_gss.h +++ b/include/linux/sunrpc/auth_gss.h @@ -69,7 +69,7 @@ struct gss_cl_ctx { enum rpc_gss_proc gc_proc; u32 gc_seq; spinlock_t gc_seq_lock; - struct gss_ctx *gc_gss_ctx; + struct gss_ctx __rcu *gc_gss_ctx; struct xdr_netobj gc_wire_ctx; u32 gc_win; unsigned long gc_expiry; @@ -80,7 +80,7 @@ struct gss_upcall_msg; struct gss_cred { struct rpc_cred gc_base; enum rpc_gss_svc gc_service; - struct gss_cl_ctx *gc_ctx; + struct gss_cl_ctx __rcu *gc_ctx; struct gss_upcall_msg *gc_upcall; unsigned long gc_upcall_timestamp; unsigned char gc_machine_cred : 1; diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 103d1b61aacb..a4a90b6726ce 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -17,6 +17,7 @@ #include <linux/errno.h> #include <linux/types.h> #include <linux/rcupdate.h> +#include <linux/jump_label.h> struct module; struct tracepoint; @@ -145,7 +146,9 @@ static inline void tracepoint_update_probe_range(struct tracepoint *begin, extern struct tracepoint __tracepoint_##name; \ static inline void trace_##name(proto) \ { \ - if (unlikely(__tracepoint_##name.state)) \ + JUMP_LABEL(&__tracepoint_##name.state, do_trace); \ + return; \ +do_trace: \ __DO_TRACE(&__tracepoint_##name, \ TP_PROTO(data_proto), \ TP_ARGS(data_args)); \ diff --git a/include/linux/types.h b/include/linux/types.h index 01a082f56ef4..357dbc19606f 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -121,7 +121,15 @@ typedef __u64 u_int64_t; typedef __s64 int64_t; #endif -/* this is a special 64bit data type that is 8-byte aligned */ +/* + * aligned_u64 should be used in defining kernel<->userspace ABIs to avoid + * common 32/64-bit compat problems. + * 64-bit values align to 4-byte boundaries on x86_32 (and possibly other + * architectures) and to 8-byte boundaries on 64-bit architetures. The new + * aligned_64 type enforces 8-byte alignment so that structs containing + * aligned_64 values have the same alignment on 32-bit and 64-bit architectures. + * No conversions are necessary between 32-bit user-space and a 64-bit kernel. + */ #define aligned_u64 __u64 __attribute__((aligned(8))) #define aligned_be64 __be64 __attribute__((aligned(8))) #define aligned_le64 __le64 __attribute__((aligned(8))) @@ -178,6 +186,11 @@ typedef __u64 __bitwise __be64; typedef __u16 __bitwise __sum16; typedef __u32 __bitwise __wsum; +/* this is a special 64bit data type that is 8-byte aligned */ +#define __aligned_u64 __u64 __attribute__((aligned(8))) +#define __aligned_be64 __be64 __attribute__((aligned(8))) +#define __aligned_le64 __le64 __attribute__((aligned(8))) + #ifdef __KERNEL__ typedef unsigned __bitwise__ gfp_t; typedef unsigned __bitwise__ fmode_t; diff --git a/include/net/cls_cgroup.h b/include/net/cls_cgroup.h index ef6c24a529e1..a4dc5b027bd9 100644 --- a/include/net/cls_cgroup.h +++ b/include/net/cls_cgroup.h @@ -51,7 +51,8 @@ static inline u32 task_cls_classid(struct task_struct *p) return 0; rcu_read_lock(); - id = rcu_dereference(net_cls_subsys_id); + id = rcu_dereference_index_check(net_cls_subsys_id, + rcu_read_lock_held()); if (id >= 0) classid = container_of(task_subsys_state(p, id), struct cgroup_cls_state, css)->classid; diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index e624dae54fa4..caf17db87dbc 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -75,7 +75,7 @@ struct nf_conntrack_helper; /* nf_conn feature for connections that have a helper */ struct nf_conn_help { /* Helper. if any */ - struct nf_conntrack_helper *helper; + struct nf_conntrack_helper __rcu *helper; union nf_conntrack_help help; diff --git a/include/trace/events/irq.h b/include/trace/events/irq.h index 0e4cfb694fe7..6fa7cbab7d93 100644 --- a/include/trace/events/irq.h +++ b/include/trace/events/irq.h @@ -5,7 +5,9 @@ #define _TRACE_IRQ_H #include <linux/tracepoint.h> -#include <linux/interrupt.h> + +struct irqaction; +struct softirq_action; #define softirq_name(sirq) { sirq##_SOFTIRQ, #sirq } #define show_softirq_name(val) \ @@ -93,7 +95,10 @@ DECLARE_EVENT_CLASS(softirq, ), TP_fast_assign( - __entry->vec = (int)(h - vec); + if (vec) + __entry->vec = (int)(h - vec); + else + __entry->vec = (int)(long)h; ), TP_printk("vec=%d [action=%s]", __entry->vec, @@ -136,6 +141,23 @@ DEFINE_EVENT(softirq, softirq_exit, TP_ARGS(h, vec) ); +/** + * softirq_raise - called immediately when a softirq is raised + * @h: pointer to struct softirq_action + * @vec: pointer to first struct softirq_action in softirq_vec array + * + * The @h parameter contains a pointer to the softirq vector number which is + * raised. @vec is NULL and it means @h includes vector number not + * softirq_action. When used in combination with the softirq_entry tracepoint + * we can determine the softirq raise latency. + */ +DEFINE_EVENT(softirq, softirq_raise, + + TP_PROTO(struct softirq_action *h, struct softirq_action *vec), + + TP_ARGS(h, vec) +); + #endif /* _TRACE_IRQ_H */ /* This part must be outside protection */ diff --git a/include/trace/events/napi.h b/include/trace/events/napi.h index 188deca2f3c7..8fe1e93f531d 100644 --- a/include/trace/events/napi.h +++ b/include/trace/events/napi.h @@ -6,10 +6,31 @@ #include <linux/netdevice.h> #include <linux/tracepoint.h> +#include <linux/ftrace.h> + +#define NO_DEV "(no_device)" + +TRACE_EVENT(napi_poll, -DECLARE_TRACE(napi_poll, TP_PROTO(struct napi_struct *napi), - TP_ARGS(napi)); + + TP_ARGS(napi), + + TP_STRUCT__entry( + __field( struct napi_struct *, napi) + __string( dev_name, napi->dev ? napi->dev->name : NO_DEV) + ), + + TP_fast_assign( + __entry->napi = napi; + __assign_str(dev_name, napi->dev ? napi->dev->name : NO_DEV); + ), + + TP_printk("napi poll on napi struct %p for device %s", + __entry->napi, __get_str(dev_name)) +); + +#undef NO_DEV #endif /* _TRACE_NAPI_H_ */ diff --git a/include/trace/events/net.h b/include/trace/events/net.h new file mode 100644 index 000000000000..5f247f5ffc56 --- /dev/null +++ b/include/trace/events/net.h @@ -0,0 +1,82 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM net + +#if !defined(_TRACE_NET_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_NET_H + +#include <linux/skbuff.h> +#include <linux/netdevice.h> +#include <linux/ip.h> +#include <linux/tracepoint.h> + +TRACE_EVENT(net_dev_xmit, + + TP_PROTO(struct sk_buff *skb, + int rc), + + TP_ARGS(skb, rc), + + TP_STRUCT__entry( + __field( void *, skbaddr ) + __field( unsigned int, len ) + __field( int, rc ) + __string( name, skb->dev->name ) + ), + + TP_fast_assign( + __entry->skbaddr = skb; + __entry->len = skb->len; + __entry->rc = rc; + __assign_str(name, skb->dev->name); + ), + + TP_printk("dev=%s skbaddr=%p len=%u rc=%d", + __get_str(name), __entry->skbaddr, __entry->len, __entry->rc) +); + +DECLARE_EVENT_CLASS(net_dev_template, + + TP_PROTO(struct sk_buff *skb), + + TP_ARGS(skb), + + TP_STRUCT__entry( + __field( void *, skbaddr ) + __field( unsigned int, len ) + __string( name, skb->dev->name ) + ), + + TP_fast_assign( + __entry->skbaddr = skb; + __entry->len = skb->len; + __assign_str(name, skb->dev->name); + ), + + TP_printk("dev=%s skbaddr=%p len=%u", + __get_str(name), __entry->skbaddr, __entry->len) +) + +DEFINE_EVENT(net_dev_template, net_dev_queue, + + TP_PROTO(struct sk_buff *skb), + + TP_ARGS(skb) +); + +DEFINE_EVENT(net_dev_template, netif_receive_skb, + + TP_PROTO(struct sk_buff *skb), + + TP_ARGS(skb) +); + +DEFINE_EVENT(net_dev_template, netif_rx, + + TP_PROTO(struct sk_buff *skb), + + TP_ARGS(skb) +); +#endif /* _TRACE_NET_H */ + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/include/trace/events/power.h b/include/trace/events/power.h index 35a2a6e7bf1e..286784d69b8f 100644 --- a/include/trace/events/power.h +++ b/include/trace/events/power.h @@ -10,12 +10,17 @@ #ifndef _TRACE_POWER_ENUM_ #define _TRACE_POWER_ENUM_ enum { - POWER_NONE = 0, - POWER_CSTATE = 1, - POWER_PSTATE = 2, + POWER_NONE = 0, + POWER_CSTATE = 1, /* C-State */ + POWER_PSTATE = 2, /* Fequency change or DVFS */ + POWER_SSTATE = 3, /* Suspend */ }; #endif +/* + * The power events are used for cpuidle & suspend (power_start, power_end) + * and for cpufreq (power_frequency) + */ DECLARE_EVENT_CLASS(power, TP_PROTO(unsigned int type, unsigned int state, unsigned int cpu_id), @@ -70,6 +75,85 @@ TRACE_EVENT(power_end, ); +/* + * The clock events are used for clock enable/disable and for + * clock rate change + */ +DECLARE_EVENT_CLASS(clock, + + TP_PROTO(const char *name, unsigned int state, unsigned int cpu_id), + + TP_ARGS(name, state, cpu_id), + + TP_STRUCT__entry( + __string( name, name ) + __field( u64, state ) + __field( u64, cpu_id ) + ), + + TP_fast_assign( + __assign_str(name, name); + __entry->state = state; + __entry->cpu_id = cpu_id; + ), + + TP_printk("%s state=%lu cpu_id=%lu", __get_str(name), + (unsigned long)__entry->state, (unsigned long)__entry->cpu_id) +); + +DEFINE_EVENT(clock, clock_enable, + + TP_PROTO(const char *name, unsigned int state, unsigned int cpu_id), + + TP_ARGS(name, state, cpu_id) +); + +DEFINE_EVENT(clock, clock_disable, + + TP_PROTO(const char *name, unsigned int state, unsigned int cpu_id), + + TP_ARGS(name, state, cpu_id) +); + +DEFINE_EVENT(clock, clock_set_rate, + + TP_PROTO(const char *name, unsigned int state, unsigned int cpu_id), + + TP_ARGS(name, state, cpu_id) +); + +/* + * The power domain events are used for power domains transitions + */ +DECLARE_EVENT_CLASS(power_domain, + + TP_PROTO(const char *name, unsigned int state, unsigned int cpu_id), + + TP_ARGS(name, state, cpu_id), + + TP_STRUCT__entry( + __string( name, name ) + __field( u64, state ) + __field( u64, cpu_id ) + ), + + TP_fast_assign( + __assign_str(name, name); + __entry->state = state; + __entry->cpu_id = cpu_id; +), + + TP_printk("%s state=%lu cpu_id=%lu", __get_str(name), + (unsigned long)__entry->state, (unsigned long)__entry->cpu_id) +); + +DEFINE_EVENT(power_domain, power_domain_target, + + TP_PROTO(const char *name, unsigned int state, unsigned int cpu_id), + + TP_ARGS(name, state, cpu_id) +); + #endif /* _TRACE_POWER_H */ /* This part must be outside protection */ diff --git a/include/trace/events/skb.h b/include/trace/events/skb.h index 4b2be6dc76f0..75ce9d500d8e 100644 --- a/include/trace/events/skb.h +++ b/include/trace/events/skb.h @@ -35,6 +35,23 @@ TRACE_EVENT(kfree_skb, __entry->skbaddr, __entry->protocol, __entry->location) ); +TRACE_EVENT(consume_skb, + + TP_PROTO(struct sk_buff *skb), + + TP_ARGS(skb), + + TP_STRUCT__entry( + __field( void *, skbaddr ) + ), + + TP_fast_assign( + __entry->skbaddr = skb; + ), + + TP_printk("skbaddr=%p", __entry->skbaddr) +); + TRACE_EVENT(skb_copy_datagram_iovec, TP_PROTO(const struct sk_buff *skb, int len), diff --git a/init/Kconfig b/init/Kconfig index 2de5b1cbadd9..7b920aafa98a 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -21,6 +21,13 @@ config CONSTRUCTORS depends on !UML default y +config HAVE_IRQ_WORK + bool + +config IRQ_WORK + bool + depends on HAVE_IRQ_WORK + menu "General setup" config EXPERIMENTAL @@ -340,6 +347,7 @@ choice config TREE_RCU bool "Tree-based hierarchical RCU" + depends on !PREEMPT && SMP help This option selects the RCU implementation that is designed for very large SMP system with hundreds or @@ -347,7 +355,7 @@ config TREE_RCU smaller systems. config TREE_PREEMPT_RCU - bool "Preemptable tree-based hierarchical RCU" + bool "Preemptible tree-based hierarchical RCU" depends on PREEMPT help This option selects the RCU implementation that is @@ -365,8 +373,22 @@ config TINY_RCU is not required. This option greatly reduces the memory footprint of RCU. +config TINY_PREEMPT_RCU + bool "Preemptible UP-only small-memory-footprint RCU" + depends on !SMP && PREEMPT + help + This option selects the RCU implementation that is designed + for real-time UP systems. This option greatly reduces the + memory footprint of RCU. + endchoice +config PREEMPT_RCU + def_bool ( TREE_PREEMPT_RCU || TINY_PREEMPT_RCU ) + help + This option enables preemptible-RCU code that is common between + the TREE_PREEMPT_RCU and TINY_PREEMPT_RCU implementations. + config RCU_TRACE bool "Enable tracing for RCU" depends on TREE_RCU || TREE_PREEMPT_RCU @@ -387,9 +409,12 @@ config RCU_FANOUT help This option controls the fanout of hierarchical implementations of RCU, allowing RCU to work efficiently on machines with - large numbers of CPUs. This value must be at least the cube - root of NR_CPUS, which allows NR_CPUS up to 32,768 for 32-bit - systems and up to 262,144 for 64-bit systems. + large numbers of CPUs. This value must be at least the fourth + root of NR_CPUS, which allows NR_CPUS to be insanely large. + The default value of RCU_FANOUT should be used for production + systems, but if you are stress-testing the RCU implementation + itself, small RCU_FANOUT values allow you to test large-system + code paths on small(er) systems. Select a specific number if testing RCU itself. Take the default if unsure. @@ -987,6 +1012,7 @@ config PERF_EVENTS default y if (PROFILING || PERF_COUNTERS) depends on HAVE_PERF_EVENTS select ANON_INODES + select IRQ_WORK help Enable kernel support for various performance events provided by software and hardware. diff --git a/kernel/Makefile b/kernel/Makefile index 0b72d1a74be0..e2c9d52cfe9e 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -10,7 +10,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \ - async.o range.o + async.o range.o jump_label.o obj-$(CONFIG_HAVE_EARLY_RES) += early_res.o obj-y += groups.o @@ -23,6 +23,7 @@ CFLAGS_REMOVE_rtmutex-debug.o = -pg CFLAGS_REMOVE_cgroup-debug.o = -pg CFLAGS_REMOVE_sched_clock.o = -pg CFLAGS_REMOVE_perf_event.o = -pg +CFLAGS_REMOVE_irq_work.o = -pg endif obj-$(CONFIG_FREEZER) += freezer.o @@ -86,6 +87,7 @@ obj-$(CONFIG_TREE_RCU) += rcutree.o obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o obj-$(CONFIG_TINY_RCU) += rcutiny.o +obj-$(CONFIG_TINY_PREEMPT_RCU) += rcutiny.o obj-$(CONFIG_RELAY) += relay.o obj-$(CONFIG_SYSCTL) += utsname_sysctl.o obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o @@ -100,6 +102,7 @@ obj-$(CONFIG_TRACING) += trace/ obj-$(CONFIG_X86_DS) += trace/ obj-$(CONFIG_RING_BUFFER) += trace/ obj-$(CONFIG_SMP) += sched_cpupri.o +obj-$(CONFIG_IRQ_WORK) += irq_work.o obj-$(CONFIG_PERF_EVENTS) += perf_event.o obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o diff --git a/kernel/cgroup.c b/kernel/cgroup.c index c9483d8f6140..291ba3d04bea 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -138,7 +138,7 @@ struct css_id { * is called after synchronize_rcu(). But for safe use, css_is_removed() * css_tryget() should be used for avoiding race. */ - struct cgroup_subsys_state *css; + struct cgroup_subsys_state __rcu *css; /* * ID of this css. */ diff --git a/kernel/cpuset.c b/kernel/cpuset.c index b23c0979bbe7..51b143e2a07a 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1397,7 +1397,7 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont, if (tsk->flags & PF_THREAD_BOUND) return -EINVAL; - ret = security_task_setscheduler(tsk, 0, NULL); + ret = security_task_setscheduler(tsk); if (ret) return ret; if (threadgroup) { @@ -1405,7 +1405,7 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont, rcu_read_lock(); list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { - ret = security_task_setscheduler(c, 0, NULL); + ret = security_task_setscheduler(c); if (ret) { rcu_read_unlock(); return ret; diff --git a/kernel/exit.c b/kernel/exit.c index 03120229db28..e2bdf37f9fde 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -149,9 +149,7 @@ static void delayed_put_task_struct(struct rcu_head *rhp) { struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); -#ifdef CONFIG_PERF_EVENTS - WARN_ON_ONCE(tsk->perf_event_ctxp); -#endif + perf_event_delayed_put(tsk); trace_sched_process_free(tsk); put_task_struct(tsk); } diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 1decafbb6b1a..72206cf5c6cf 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -931,6 +931,7 @@ static inline int remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base) { if (hrtimer_is_queued(timer)) { + unsigned long state; int reprogram; /* @@ -944,8 +945,13 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base) debug_deactivate(timer); timer_stats_hrtimer_clear_start_info(timer); reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases); - __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, - reprogram); + /* + * We must preserve the CALLBACK state flag here, + * otherwise we could move the timer base in + * switch_hrtimer_base. + */ + state = timer->state & HRTIMER_STATE_CALLBACK; + __remove_hrtimer(timer, base, state, reprogram); return 1; } return 0; @@ -1231,6 +1237,9 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now) BUG_ON(timer->state != HRTIMER_STATE_CALLBACK); enqueue_hrtimer(timer, base); } + + WARN_ON_ONCE(!(timer->state & HRTIMER_STATE_CALLBACK)); + timer->state &= ~HRTIMER_STATE_CALLBACK; } diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 0c642d51aac2..53ead174da2f 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -98,7 +98,7 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" " disables this message.\n"); sched_show_task(t); - __debug_show_held_locks(t); + debug_show_held_locks(t); touch_nmi_watchdog(); @@ -111,7 +111,7 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) * periodically exit the critical section and enter a new one. * * For preemptible RCU it is sufficient to call rcu_read_unlock in order - * exit the grace period. For classic RCU, a reschedule is required. + * to exit the grace period. For classic RCU, a reschedule is required. */ static void rcu_lock_break(struct task_struct *g, struct task_struct *t) { diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index c7c2aed9e2dc..2c9120f0afca 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c @@ -113,12 +113,12 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type) */ static int task_bp_pinned(struct perf_event *bp, enum bp_type_idx type) { - struct perf_event_context *ctx = bp->ctx; + struct task_struct *tsk = bp->hw.bp_target; struct perf_event *iter; int count = 0; list_for_each_entry(iter, &bp_task_head, hw.bp_list) { - if (iter->ctx == ctx && find_slot_idx(iter) == type) + if (iter->hw.bp_target == tsk && find_slot_idx(iter) == type) count += hw_breakpoint_weight(iter); } @@ -134,7 +134,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp, enum bp_type_idx type) { int cpu = bp->cpu; - struct task_struct *tsk = bp->ctx->task; + struct task_struct *tsk = bp->hw.bp_target; if (cpu >= 0) { slots->pinned = per_cpu(nr_cpu_bp_pinned[type], cpu); @@ -213,7 +213,7 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, int weight) { int cpu = bp->cpu; - struct task_struct *tsk = bp->ctx->task; + struct task_struct *tsk = bp->hw.bp_target; /* Pinned counter cpu profiling */ if (!tsk) { @@ -433,8 +433,7 @@ register_user_hw_breakpoint(struct perf_event_attr *attr, perf_overflow_handler_t triggered, struct task_struct *tsk) { - return perf_event_create_kernel_counter(attr, -1, task_pid_vnr(tsk), - triggered); + return perf_event_create_kernel_counter(attr, -1, tsk, triggered); } EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); @@ -516,7 +515,7 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr, get_online_cpus(); for_each_online_cpu(cpu) { pevent = per_cpu_ptr(cpu_events, cpu); - bp = perf_event_create_kernel_counter(attr, cpu, -1, triggered); + bp = perf_event_create_kernel_counter(attr, cpu, NULL, triggered); *pevent = bp; @@ -566,6 +565,61 @@ static struct notifier_block hw_breakpoint_exceptions_nb = { .priority = 0x7fffffff }; +static void bp_perf_event_destroy(struct perf_event *event) +{ + release_bp_slot(event); +} + +static int hw_breakpoint_event_init(struct perf_event *bp) +{ + int err; + + if (bp->attr.type != PERF_TYPE_BREAKPOINT) + return -ENOENT; + + err = register_perf_hw_breakpoint(bp); + if (err) + return err; + + bp->destroy = bp_perf_event_destroy; + + return 0; +} + +static int hw_breakpoint_add(struct perf_event *bp, int flags) +{ + if (!(flags & PERF_EF_START)) + bp->hw.state = PERF_HES_STOPPED; + + return arch_install_hw_breakpoint(bp); +} + +static void hw_breakpoint_del(struct perf_event *bp, int flags) +{ + arch_uninstall_hw_breakpoint(bp); +} + +static void hw_breakpoint_start(struct perf_event *bp, int flags) +{ + bp->hw.state = 0; +} + +static void hw_breakpoint_stop(struct perf_event *bp, int flags) +{ + bp->hw.state = PERF_HES_STOPPED; +} + +static struct pmu perf_breakpoint = { + .task_ctx_nr = perf_sw_context, /* could eventually get its own */ + + .event_init = hw_breakpoint_event_init, + .add = hw_breakpoint_add, + .del = hw_breakpoint_del, + .start = hw_breakpoint_start, + .stop = hw_breakpoint_stop, + .read = hw_breakpoint_pmu_read, +}; + static int __init init_hw_breakpoint(void) { unsigned int **task_bp_pinned; @@ -587,6 +641,8 @@ static int __init init_hw_breakpoint(void) constraints_initialized = 1; + perf_pmu_register(&perf_breakpoint); + return register_die_notifier(&hw_breakpoint_exceptions_nb); err_alloc: @@ -602,8 +658,3 @@ static int __init init_hw_breakpoint(void) core_initcall(init_hw_breakpoint); -struct pmu perf_ops_bp = { - .enable = arch_install_hw_breakpoint, - .disable = arch_uninstall_hw_breakpoint, - .read = hw_breakpoint_pmu_read, -}; diff --git a/kernel/irq_work.c b/kernel/irq_work.c new file mode 100644 index 000000000000..f16763ff8481 --- /dev/null +++ b/kernel/irq_work.c @@ -0,0 +1,164 @@ +/* + * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + * + * Provides a framework for enqueueing and running callbacks from hardirq + * context. The enqueueing is NMI-safe. + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/irq_work.h> +#include <linux/hardirq.h> + +/* + * An entry can be in one of four states: + * + * free NULL, 0 -> {claimed} : free to be used + * claimed NULL, 3 -> {pending} : claimed to be enqueued + * pending next, 3 -> {busy} : queued, pending callback + * busy NULL, 2 -> {free, claimed} : callback in progress, can be claimed + * + * We use the lower two bits of the next pointer to keep PENDING and BUSY + * flags. + */ + +#define IRQ_WORK_PENDING 1UL +#define IRQ_WORK_BUSY 2UL +#define IRQ_WORK_FLAGS 3UL + +static inline bool irq_work_is_set(struct irq_work *entry, int flags) +{ + return (unsigned long)entry->next & flags; +} + +static inline struct irq_work *irq_work_next(struct irq_work *entry) +{ + unsigned long next = (unsigned long)entry->next; + next &= ~IRQ_WORK_FLAGS; + return (struct irq_work *)next; +} + +static inline struct irq_work *next_flags(struct irq_work *entry, int flags) +{ + unsigned long next = (unsigned long)entry; + next |= flags; + return (struct irq_work *)next; +} + +static DEFINE_PER_CPU(struct irq_work *, irq_work_list); + +/* + * Claim the entry so that no one else will poke at it. + */ +static bool irq_work_claim(struct irq_work *entry) +{ + struct irq_work *next, *nflags; + + do { + next = entry->next; + if ((unsigned long)next & IRQ_WORK_PENDING) + return false; + nflags = next_flags(next, IRQ_WORK_FLAGS); + } while (cmpxchg(&entry->next, next, nflags) != next); + + return true; +} + + +void __weak arch_irq_work_raise(void) +{ + /* + * Lame architectures will get the timer tick callback + */ +} + +/* + * Queue the entry and raise the IPI if needed. + */ +static void __irq_work_queue(struct irq_work *entry) +{ + struct irq_work **head, *next; + + head = &get_cpu_var(irq_work_list); + + do { + next = *head; + /* Can assign non-atomic because we keep the flags set. */ + entry->next = next_flags(next, IRQ_WORK_FLAGS); + } while (cmpxchg(head, next, entry) != next); + + /* The list was empty, raise self-interrupt to start processing. */ + if (!irq_work_next(entry)) + arch_irq_work_raise(); + + put_cpu_var(irq_work_list); +} + +/* + * Enqueue the irq_work @entry, returns true on success, failure when the + * @entry was already enqueued by someone else. + * + * Can be re-enqueued while the callback is still in progress. + */ +bool irq_work_queue(struct irq_work *entry) +{ + if (!irq_work_claim(entry)) { + /* + * Already enqueued, can't do! + */ + return false; + } + + __irq_work_queue(entry); + return true; +} +EXPORT_SYMBOL_GPL(irq_work_queue); + +/* + * Run the irq_work entries on this cpu. Requires to be ran from hardirq + * context with local IRQs disabled. + */ +void irq_work_run(void) +{ + struct irq_work *list, **head; + + head = &__get_cpu_var(irq_work_list); + if (*head == NULL) + return; + + BUG_ON(!in_irq()); + BUG_ON(!irqs_disabled()); + + list = xchg(head, NULL); + while (list != NULL) { + struct irq_work *entry = list; + + list = irq_work_next(list); + + /* + * Clear the PENDING bit, after this point the @entry + * can be re-used. + */ + entry->next = next_flags(NULL, IRQ_WORK_BUSY); + entry->func(entry); + /* + * Clear the BUSY bit and return to the free state if + * no-one else claimed it meanwhile. + */ + cmpxchg(&entry->next, next_flags(NULL, IRQ_WORK_BUSY), NULL); + } +} +EXPORT_SYMBOL_GPL(irq_work_run); + +/* + * Synchronize against the irq_work @entry, ensures the entry is not + * currently in use. + */ +void irq_work_sync(struct irq_work *entry) +{ + WARN_ON_ONCE(irqs_disabled()); + + while (irq_work_is_set(entry, IRQ_WORK_BUSY)) + cpu_relax(); +} +EXPORT_SYMBOL_GPL(irq_work_sync); diff --git a/kernel/jump_label.c b/kernel/jump_label.c new file mode 100644 index 000000000000..7be868bf25c6 --- /dev/null +++ b/kernel/jump_label.c @@ -0,0 +1,429 @@ +/* + * jump label support + * + * Copyright (C) 2009 Jason Baron <jbaron@redhat.com> + * + */ +#include <linux/jump_label.h> +#include <linux/memory.h> +#include <linux/uaccess.h> +#include <linux/module.h> +#include <linux/list.h> +#include <linux/jhash.h> +#include <linux/slab.h> +#include <linux/sort.h> +#include <linux/err.h> + +#ifdef HAVE_JUMP_LABEL + +#define JUMP_LABEL_HASH_BITS 6 +#define JUMP_LABEL_TABLE_SIZE (1 << JUMP_LABEL_HASH_BITS) +static struct hlist_head jump_label_table[JUMP_LABEL_TABLE_SIZE]; + +/* mutex to protect coming/going of the the jump_label table */ +static DEFINE_MUTEX(jump_label_mutex); + +struct jump_label_entry { + struct hlist_node hlist; + struct jump_entry *table; + int nr_entries; + /* hang modules off here */ + struct hlist_head modules; + unsigned long key; +}; + +struct jump_label_module_entry { + struct hlist_node hlist; + struct jump_entry *table; + int nr_entries; + struct module *mod; +}; + +static int jump_label_cmp(const void *a, const void *b) +{ + const struct jump_entry *jea = a; + const struct jump_entry *jeb = b; + + if (jea->key < jeb->key) + return -1; + + if (jea->key > jeb->key) + return 1; + + return 0; +} + +static void +sort_jump_label_entries(struct jump_entry *start, struct jump_entry *stop) +{ + unsigned long size; + + size = (((unsigned long)stop - (unsigned long)start) + / sizeof(struct jump_entry)); + sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL); +} + +static struct jump_label_entry *get_jump_label_entry(jump_label_t key) +{ + struct hlist_head *head; + struct hlist_node *node; + struct jump_label_entry *e; + u32 hash = jhash((void *)&key, sizeof(jump_label_t), 0); + + head = &jump_label_table[hash & (JUMP_LABEL_TABLE_SIZE - 1)]; + hlist_for_each_entry(e, node, head, hlist) { + if (key == e->key) + return e; + } + return NULL; +} + +static struct jump_label_entry * +add_jump_label_entry(jump_label_t key, int nr_entries, struct jump_entry *table) +{ + struct hlist_head *head; + struct jump_label_entry *e; + u32 hash; + + e = get_jump_label_entry(key); + if (e) + return ERR_PTR(-EEXIST); + + e = kmalloc(sizeof(struct jump_label_entry), GFP_KERNEL); + if (!e) + return ERR_PTR(-ENOMEM); + + hash = jhash((void *)&key, sizeof(jump_label_t), 0); + head = &jump_label_table[hash & (JUMP_LABEL_TABLE_SIZE - 1)]; + e->key = key; + e->table = table; + e->nr_entries = nr_entries; + INIT_HLIST_HEAD(&(e->modules)); + hlist_add_head(&e->hlist, head); + return e; +} + +static int +build_jump_label_hashtable(struct jump_entry *start, struct jump_entry *stop) +{ + struct jump_entry *iter, *iter_begin; + struct jump_label_entry *entry; + int count; + + sort_jump_label_entries(start, stop); + iter = start; + while (iter < stop) { + entry = get_jump_label_entry(iter->key); + if (!entry) { + iter_begin = iter; + count = 0; + while ((iter < stop) && + (iter->key == iter_begin->key)) { + iter++; + count++; + } + entry = add_jump_label_entry(iter_begin->key, + count, iter_begin); + if (IS_ERR(entry)) + return PTR_ERR(entry); + } else { + WARN_ONCE(1, KERN_ERR "build_jump_hashtable: unexpected entry!\n"); + return -1; + } + } + return 0; +} + +/*** + * jump_label_update - update jump label text + * @key - key value associated with a a jump label + * @type - enum set to JUMP_LABEL_ENABLE or JUMP_LABEL_DISABLE + * + * Will enable/disable the jump for jump label @key, depending on the + * value of @type. + * + */ + +void jump_label_update(unsigned long key, enum jump_label_type type) +{ + struct jump_entry *iter; + struct jump_label_entry *entry; + struct hlist_node *module_node; + struct jump_label_module_entry *e_module; + int count; + + mutex_lock(&jump_label_mutex); + entry = get_jump_label_entry((jump_label_t)key); + if (entry) { + count = entry->nr_entries; + iter = entry->table; + while (count--) { + if (kernel_text_address(iter->code)) + arch_jump_label_transform(iter, type); + iter++; + } + /* eanble/disable jump labels in modules */ + hlist_for_each_entry(e_module, module_node, &(entry->modules), + hlist) { + count = e_module->nr_entries; + iter = e_module->table; + while (count--) { + if (kernel_text_address(iter->code)) + arch_jump_label_transform(iter, type); + iter++; + } + } + } + mutex_unlock(&jump_label_mutex); +} + +static int addr_conflict(struct jump_entry *entry, void *start, void *end) +{ + if (entry->code <= (unsigned long)end && + entry->code + JUMP_LABEL_NOP_SIZE > (unsigned long)start) + return 1; + + return 0; +} + +#ifdef CONFIG_MODULES + +static int module_conflict(void *start, void *end) +{ + struct hlist_head *head; + struct hlist_node *node, *node_next, *module_node, *module_node_next; + struct jump_label_entry *e; + struct jump_label_module_entry *e_module; + struct jump_entry *iter; + int i, count; + int conflict = 0; + + for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) { + head = &jump_label_table[i]; + hlist_for_each_entry_safe(e, node, node_next, head, hlist) { + hlist_for_each_entry_safe(e_module, module_node, + module_node_next, + &(e->modules), hlist) { + count = e_module->nr_entries; + iter = e_module->table; + while (count--) { + if (addr_conflict(iter, start, end)) { + conflict = 1; + goto out; + } + iter++; + } + } + } + } +out: + return conflict; +} + +#endif + +/*** + * jump_label_text_reserved - check if addr range is reserved + * @start: start text addr + * @end: end text addr + * + * checks if the text addr located between @start and @end + * overlaps with any of the jump label patch addresses. Code + * that wants to modify kernel text should first verify that + * it does not overlap with any of the jump label addresses. + * + * returns 1 if there is an overlap, 0 otherwise + */ +int jump_label_text_reserved(void *start, void *end) +{ + struct jump_entry *iter; + struct jump_entry *iter_start = __start___jump_table; + struct jump_entry *iter_stop = __start___jump_table; + int conflict = 0; + + mutex_lock(&jump_label_mutex); + iter = iter_start; + while (iter < iter_stop) { + if (addr_conflict(iter, start, end)) { + conflict = 1; + goto out; + } + iter++; + } + + /* now check modules */ +#ifdef CONFIG_MODULES + conflict = module_conflict(start, end); +#endif +out: + mutex_unlock(&jump_label_mutex); + return conflict; +} + +static __init int init_jump_label(void) +{ + int ret; + struct jump_entry *iter_start = __start___jump_table; + struct jump_entry *iter_stop = __stop___jump_table; + struct jump_entry *iter; + + mutex_lock(&jump_label_mutex); + ret = build_jump_label_hashtable(__start___jump_table, + __stop___jump_table); + iter = iter_start; + while (iter < iter_stop) { + arch_jump_label_text_poke_early(iter->code); + iter++; + } + mutex_unlock(&jump_label_mutex); + return ret; +} +early_initcall(init_jump_label); + +#ifdef CONFIG_MODULES + +static struct jump_label_module_entry * +add_jump_label_module_entry(struct jump_label_entry *entry, + struct jump_entry *iter_begin, + int count, struct module *mod) +{ + struct jump_label_module_entry *e; + + e = kmalloc(sizeof(struct jump_label_module_entry), GFP_KERNEL); + if (!e) + return ERR_PTR(-ENOMEM); + e->mod = mod; + e->nr_entries = count; + e->table = iter_begin; + hlist_add_head(&e->hlist, &entry->modules); + return e; +} + +static int add_jump_label_module(struct module *mod) +{ + struct jump_entry *iter, *iter_begin; + struct jump_label_entry *entry; + struct jump_label_module_entry *module_entry; + int count; + + /* if the module doesn't have jump label entries, just return */ + if (!mod->num_jump_entries) + return 0; + + sort_jump_label_entries(mod->jump_entries, + mod->jump_entries + mod->num_jump_entries); + iter = mod->jump_entries; + while (iter < mod->jump_entries + mod->num_jump_entries) { + entry = get_jump_label_entry(iter->key); + iter_begin = iter; + count = 0; + while ((iter < mod->jump_entries + mod->num_jump_entries) && + (iter->key == iter_begin->key)) { + iter++; + count++; + } + if (!entry) { + entry = add_jump_label_entry(iter_begin->key, 0, NULL); + if (IS_ERR(entry)) + return PTR_ERR(entry); + } + module_entry = add_jump_label_module_entry(entry, iter_begin, + count, mod); + if (IS_ERR(module_entry)) + return PTR_ERR(module_entry); + } + return 0; +} + +static void remove_jump_label_module(struct module *mod) +{ + struct hlist_head *head; + struct hlist_node *node, *node_next, *module_node, *module_node_next; + struct jump_label_entry *e; + struct jump_label_module_entry *e_module; + int i; + + /* if the module doesn't have jump label entries, just return */ + if (!mod->num_jump_entries) + return; + + for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) { + head = &jump_label_table[i]; + hlist_for_each_entry_safe(e, node, node_next, head, hlist) { + hlist_for_each_entry_safe(e_module, module_node, + module_node_next, + &(e->modules), hlist) { + if (e_module->mod == mod) { + hlist_del(&e_module->hlist); + kfree(e_module); + } + } + if (hlist_empty(&e->modules) && (e->nr_entries == 0)) { + hlist_del(&e->hlist); + kfree(e); + } + } + } +} + +static int +jump_label_module_notify(struct notifier_block *self, unsigned long val, + void *data) +{ + struct module *mod = data; + int ret = 0; + + switch (val) { + case MODULE_STATE_COMING: + mutex_lock(&jump_label_mutex); + ret = add_jump_label_module(mod); + if (ret) + remove_jump_label_module(mod); + mutex_unlock(&jump_label_mutex); + break; + case MODULE_STATE_GOING: + mutex_lock(&jump_label_mutex); + remove_jump_label_module(mod); + mutex_unlock(&jump_label_mutex); + break; + } + return ret; +} + +/*** + * apply_jump_label_nops - patch module jump labels with arch_get_jump_label_nop() + * @mod: module to patch + * + * Allow for run-time selection of the optimal nops. Before the module + * loads patch these with arch_get_jump_label_nop(), which is specified by + * the arch specific jump label code. + */ +void jump_label_apply_nops(struct module *mod) +{ + struct jump_entry *iter; + + /* if the module doesn't have jump label entries, just return */ + if (!mod->num_jump_entries) + return; + + iter = mod->jump_entries; + while (iter < mod->jump_entries + mod->num_jump_entries) { + arch_jump_label_text_poke_early(iter->code); + iter++; + } +} + +struct notifier_block jump_label_module_nb = { + .notifier_call = jump_label_module_notify, + .priority = 0, +}; + +static __init int init_jump_label_module(void) +{ + return register_module_notifier(&jump_label_module_nb); +} +early_initcall(init_jump_label_module); + +#endif /* CONFIG_MODULES */ + +#endif diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 282035f3ae96..ec4210c6501e 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -47,6 +47,7 @@ #include <linux/memory.h> #include <linux/ftrace.h> #include <linux/cpu.h> +#include <linux/jump_label.h> #include <asm-generic/sections.h> #include <asm/cacheflush.h> @@ -399,7 +400,7 @@ static inline int kprobe_optready(struct kprobe *p) * Return an optimized kprobe whose optimizing code replaces * instructions including addr (exclude breakpoint). */ -struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr) +static struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr) { int i; struct kprobe *p = NULL; @@ -831,6 +832,7 @@ void __kprobes recycle_rp_inst(struct kretprobe_instance *ri, void __kprobes kretprobe_hash_lock(struct task_struct *tsk, struct hlist_head **head, unsigned long *flags) +__acquires(hlist_lock) { unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); spinlock_t *hlist_lock; @@ -842,6 +844,7 @@ void __kprobes kretprobe_hash_lock(struct task_struct *tsk, static void __kprobes kretprobe_table_lock(unsigned long hash, unsigned long *flags) +__acquires(hlist_lock) { spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); spin_lock_irqsave(hlist_lock, *flags); @@ -849,6 +852,7 @@ static void __kprobes kretprobe_table_lock(unsigned long hash, void __kprobes kretprobe_hash_unlock(struct task_struct *tsk, unsigned long *flags) +__releases(hlist_lock) { unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); spinlock_t *hlist_lock; @@ -857,7 +861,9 @@ void __kprobes kretprobe_hash_unlock(struct task_struct *tsk, spin_unlock_irqrestore(hlist_lock, *flags); } -void __kprobes kretprobe_table_unlock(unsigned long hash, unsigned long *flags) +static void __kprobes kretprobe_table_unlock(unsigned long hash, + unsigned long *flags) +__releases(hlist_lock) { spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); spin_unlock_irqrestore(hlist_lock, *flags); @@ -1141,7 +1147,8 @@ int __kprobes register_kprobe(struct kprobe *p) preempt_disable(); if (!kernel_text_address((unsigned long) p->addr) || in_kprobes_functions((unsigned long) p->addr) || - ftrace_text_reserved(p->addr, p->addr)) { + ftrace_text_reserved(p->addr, p->addr) || + jump_label_text_reserved(p->addr, p->addr)) { preempt_enable(); return -EINVAL; } @@ -1339,18 +1346,19 @@ int __kprobes register_jprobes(struct jprobe **jps, int num) if (num <= 0) return -EINVAL; for (i = 0; i < num; i++) { - unsigned long addr; + unsigned long addr, offset; jp = jps[i]; addr = arch_deref_entry_point(jp->entry); - if (!kernel_text_address(addr)) - ret = -EINVAL; - else { - /* Todo: Verify probepoint is a function entry point */ + /* Verify probepoint is a function entry point */ + if (kallsyms_lookup_size_offset(addr, NULL, &offset) && + offset == 0) { jp->kp.pre_handler = setjmp_pre_handler; jp->kp.break_handler = longjmp_break_handler; ret = register_kprobe(&jp->kp); - } + } else + ret = -EINVAL; + if (ret < 0) { if (i > 0) unregister_jprobes(jps, i); diff --git a/kernel/lockdep.c b/kernel/lockdep.c index f2852a510232..42ba65dff7d9 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -639,6 +639,16 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) } #endif + if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) { + debug_locks_off(); + printk(KERN_ERR + "BUG: looking up invalid subclass: %u\n", subclass); + printk(KERN_ERR + "turning off the locking correctness validator.\n"); + dump_stack(); + return NULL; + } + /* * Static locks do not have their class-keys yet - for them the key * is the lock object itself: @@ -774,7 +784,9 @@ out_unlock_set: raw_local_irq_restore(flags); if (!subclass || force) - lock->class_cache = class; + lock->class_cache[0] = class; + else if (subclass < NR_LOCKDEP_CACHING_CLASSES) + lock->class_cache[subclass] = class; if (DEBUG_LOCKS_WARN_ON(class->subclass != subclass)) return NULL; @@ -2679,7 +2691,11 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, void lockdep_init_map(struct lockdep_map *lock, const char *name, struct lock_class_key *key, int subclass) { - lock->class_cache = NULL; + int i; + + for (i = 0; i < NR_LOCKDEP_CACHING_CLASSES; i++) + lock->class_cache[i] = NULL; + #ifdef CONFIG_LOCK_STAT lock->cpu = raw_smp_processor_id(); #endif @@ -2739,21 +2755,13 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return 0; - if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) { - debug_locks_off(); - printk("BUG: MAX_LOCKDEP_SUBCLASSES too low!\n"); - printk("turning off the locking correctness validator.\n"); - dump_stack(); - return 0; - } - if (lock->key == &__lockdep_no_validate__) check = 1; - if (!subclass) - class = lock->class_cache; + if (subclass < NR_LOCKDEP_CACHING_CLASSES) + class = lock->class_cache[subclass]; /* - * Not cached yet or subclass? + * Not cached? */ if (unlikely(!class)) { class = register_lock_class(lock, subclass, 0); @@ -2918,7 +2926,7 @@ static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock) return 1; if (hlock->references) { - struct lock_class *class = lock->class_cache; + struct lock_class *class = lock->class_cache[0]; if (!class) class = look_up_lock_class(lock, 0); @@ -3559,7 +3567,12 @@ void lockdep_reset_lock(struct lockdep_map *lock) if (list_empty(head)) continue; list_for_each_entry_safe(class, next, head, hash_entry) { - if (unlikely(class == lock->class_cache)) { + int match = 0; + + for (j = 0; j < NR_LOCKDEP_CACHING_CLASSES; j++) + match |= class == lock->class_cache[j]; + + if (unlikely(match)) { if (debug_locks_off_graph_unlock()) WARN_ON(1); goto out_restore; @@ -3775,7 +3788,7 @@ EXPORT_SYMBOL_GPL(debug_show_all_locks); * Careful: only use this function if you are sure that * the task cannot run in parallel! */ -void __debug_show_held_locks(struct task_struct *task) +void debug_show_held_locks(struct task_struct *task) { if (unlikely(!debug_locks)) { printk("INFO: lockdep is turned off.\n"); @@ -3783,12 +3796,6 @@ void __debug_show_held_locks(struct task_struct *task) } lockdep_print_held_locks(task); } -EXPORT_SYMBOL_GPL(__debug_show_held_locks); - -void debug_show_held_locks(struct task_struct *task) -{ - __debug_show_held_locks(task); -} EXPORT_SYMBOL_GPL(debug_show_held_locks); void lockdep_sys_exit(void) diff --git a/kernel/module.c b/kernel/module.c index ccd641991842..2df46301a7a4 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -55,6 +55,7 @@ #include <linux/async.h> #include <linux/percpu.h> #include <linux/kmemleak.h> +#include <linux/jump_label.h> #define CREATE_TRACE_POINTS #include <trace/events/module.h> @@ -2309,6 +2310,11 @@ static void find_module_sections(struct module *mod, struct load_info *info) sizeof(*mod->tracepoints), &mod->num_tracepoints); #endif +#ifdef HAVE_JUMP_LABEL + mod->jump_entries = section_objs(info, "__jump_table", + sizeof(*mod->jump_entries), + &mod->num_jump_entries); +#endif #ifdef CONFIG_EVENT_TRACING mod->trace_events = section_objs(info, "_ftrace_events", sizeof(*mod->trace_events), diff --git a/kernel/perf_event.c b/kernel/perf_event.c index b98bed3d8182..f309e8014c78 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -31,24 +31,18 @@ #include <linux/kernel_stat.h> #include <linux/perf_event.h> #include <linux/ftrace_event.h> -#include <linux/hw_breakpoint.h> #include <asm/irq_regs.h> -/* - * Each CPU has a list of per CPU events: - */ -static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context); - -int perf_max_events __read_mostly = 1; -static int perf_reserved_percpu __read_mostly; -static int perf_overcommit __read_mostly = 1; - -static atomic_t nr_events __read_mostly; +atomic_t perf_task_events __read_mostly; static atomic_t nr_mmap_events __read_mostly; static atomic_t nr_comm_events __read_mostly; static atomic_t nr_task_events __read_mostly; +static LIST_HEAD(pmus); +static DEFINE_MUTEX(pmus_lock); +static struct srcu_struct pmus_srcu; + /* * perf event paranoia level: * -1 - not paranoid at all @@ -67,36 +61,43 @@ int sysctl_perf_event_sample_rate __read_mostly = 100000; static atomic64_t perf_event_id; -/* - * Lock for (sysadmin-configurable) event reservations: - */ -static DEFINE_SPINLOCK(perf_resource_lock); +void __weak perf_event_print_debug(void) { } -/* - * Architecture provided APIs - weak aliases: - */ -extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event) +extern __weak const char *perf_pmu_name(void) { - return NULL; + return "pmu"; } -void __weak hw_perf_disable(void) { barrier(); } -void __weak hw_perf_enable(void) { barrier(); } - -void __weak perf_event_print_debug(void) { } - -static DEFINE_PER_CPU(int, perf_disable_count); +void perf_pmu_disable(struct pmu *pmu) +{ + int *count = this_cpu_ptr(pmu->pmu_disable_count); + if (!(*count)++) + pmu->pmu_disable(pmu); +} -void perf_disable(void) +void perf_pmu_enable(struct pmu *pmu) { - if (!__get_cpu_var(perf_disable_count)++) - hw_perf_disable(); + int *count = this_cpu_ptr(pmu->pmu_disable_count); + if (!--(*count)) + pmu->pmu_enable(pmu); } -void perf_enable(void) +static DEFINE_PER_CPU(struct list_head, rotation_list); + +/* + * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized + * because they're strictly cpu affine and rotate_start is called with IRQs + * disabled, while rotate_context is called from IRQ context. + */ +static void perf_pmu_rotate_start(struct pmu *pmu) { - if (!--__get_cpu_var(perf_disable_count)) - hw_perf_enable(); + struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + struct list_head *head = &__get_cpu_var(rotation_list); + + WARN_ON(!irqs_disabled()); + + if (list_empty(&cpuctx->rotation_list)) + list_add(&cpuctx->rotation_list, head); } static void get_ctx(struct perf_event_context *ctx) @@ -151,13 +152,13 @@ static u64 primary_event_id(struct perf_event *event) * the context could get moved to another task. */ static struct perf_event_context * -perf_lock_task_context(struct task_struct *task, unsigned long *flags) +perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags) { struct perf_event_context *ctx; rcu_read_lock(); - retry: - ctx = rcu_dereference(task->perf_event_ctxp); +retry: + ctx = rcu_dereference(task->perf_event_ctxp[ctxn]); if (ctx) { /* * If this context is a clone of another, it might @@ -170,7 +171,7 @@ perf_lock_task_context(struct task_struct *task, unsigned long *flags) * can't get swapped on us any more. */ raw_spin_lock_irqsave(&ctx->lock, *flags); - if (ctx != rcu_dereference(task->perf_event_ctxp)) { + if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) { raw_spin_unlock_irqrestore(&ctx->lock, *flags); goto retry; } @@ -189,12 +190,13 @@ perf_lock_task_context(struct task_struct *task, unsigned long *flags) * can't get swapped to another task. This also increments its * reference count so that the context can't get freed. */ -static struct perf_event_context *perf_pin_task_context(struct task_struct *task) +static struct perf_event_context * +perf_pin_task_context(struct task_struct *task, int ctxn) { struct perf_event_context *ctx; unsigned long flags; - ctx = perf_lock_task_context(task, &flags); + ctx = perf_lock_task_context(task, ctxn, &flags); if (ctx) { ++ctx->pin_count; raw_spin_unlock_irqrestore(&ctx->lock, flags); @@ -302,6 +304,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) } list_add_rcu(&event->event_entry, &ctx->event_list); + if (!ctx->nr_events) + perf_pmu_rotate_start(ctx->pmu); ctx->nr_events++; if (event->attr.inherit_stat) ctx->nr_stat++; @@ -311,7 +315,12 @@ static void perf_group_attach(struct perf_event *event) { struct perf_event *group_leader = event->group_leader; - WARN_ON_ONCE(event->attach_state & PERF_ATTACH_GROUP); + /* + * We can have double attach due to group movement in perf_event_open. + */ + if (event->attach_state & PERF_ATTACH_GROUP) + return; + event->attach_state |= PERF_ATTACH_GROUP; if (group_leader == event) @@ -408,8 +417,8 @@ event_filter_match(struct perf_event *event) return event->cpu == -1 || event->cpu == smp_processor_id(); } -static void -event_sched_out(struct perf_event *event, +static int +__event_sched_out(struct perf_event *event, struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { @@ -428,15 +437,14 @@ event_sched_out(struct perf_event *event, } if (event->state != PERF_EVENT_STATE_ACTIVE) - return; + return 0; event->state = PERF_EVENT_STATE_INACTIVE; if (event->pending_disable) { event->pending_disable = 0; event->state = PERF_EVENT_STATE_OFF; } - event->tstamp_stopped = ctx->time; - event->pmu->disable(event); + event->pmu->del(event, 0); event->oncpu = -1; if (!is_software_event(event)) @@ -444,6 +452,19 @@ event_sched_out(struct perf_event *event, ctx->nr_active--; if (event->attr.exclusive || !cpuctx->active_oncpu) cpuctx->exclusive = 0; + return 1; +} + +static void +event_sched_out(struct perf_event *event, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx) +{ + int ret; + + ret = __event_sched_out(event, cpuctx, ctx); + if (ret) + event->tstamp_stopped = ctx->time; } static void @@ -466,6 +487,12 @@ group_sched_out(struct perf_event *group_event, cpuctx->exclusive = 0; } +static inline struct perf_cpu_context * +__get_cpu_context(struct perf_event_context *ctx) +{ + return this_cpu_ptr(ctx->pmu->pmu_cpu_context); +} + /* * Cross CPU call to remove a performance event * @@ -474,9 +501,9 @@ group_sched_out(struct perf_event *group_event, */ static void __perf_event_remove_from_context(void *info) { - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); struct perf_event *event = info; struct perf_event_context *ctx = event->ctx; + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); /* * If this is a task context, we need to check whether it is @@ -487,27 +514,11 @@ static void __perf_event_remove_from_context(void *info) return; raw_spin_lock(&ctx->lock); - /* - * Protect the list operation against NMI by disabling the - * events on a global level. - */ - perf_disable(); event_sched_out(event, cpuctx, ctx); list_del_event(event, ctx); - if (!ctx->task) { - /* - * Allow more per task events with respect to the - * reservation: - */ - cpuctx->max_pertask = - min(perf_max_events - ctx->nr_events, - perf_max_events - perf_reserved_percpu); - } - - perf_enable(); raw_spin_unlock(&ctx->lock); } @@ -572,8 +583,8 @@ retry: static void __perf_event_disable(void *info) { struct perf_event *event = info; - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); struct perf_event_context *ctx = event->ctx; + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); /* * If this is a per-task event, need to check whether this @@ -628,7 +639,7 @@ void perf_event_disable(struct perf_event *event) return; } - retry: +retry: task_oncpu_function_call(task, __perf_event_disable, event); raw_spin_lock_irq(&ctx->lock); @@ -653,7 +664,7 @@ void perf_event_disable(struct perf_event *event) } static int -event_sched_in(struct perf_event *event, +__event_sched_in(struct perf_event *event, struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { @@ -667,14 +678,12 @@ event_sched_in(struct perf_event *event, */ smp_wmb(); - if (event->pmu->enable(event)) { + if (event->pmu->add(event, PERF_EF_START)) { event->state = PERF_EVENT_STATE_INACTIVE; event->oncpu = -1; return -EAGAIN; } - event->tstamp_running += ctx->time - event->tstamp_stopped; - if (!is_software_event(event)) cpuctx->active_oncpu++; ctx->nr_active++; @@ -685,28 +694,56 @@ event_sched_in(struct perf_event *event, return 0; } +static inline int +event_sched_in(struct perf_event *event, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx) +{ + int ret = __event_sched_in(event, cpuctx, ctx); + if (ret) + return ret; + event->tstamp_running += ctx->time - event->tstamp_stopped; + return 0; +} + +static void +group_commit_event_sched_in(struct perf_event *group_event, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx) +{ + struct perf_event *event; + u64 now = ctx->time; + + group_event->tstamp_running += now - group_event->tstamp_stopped; + /* + * Schedule in siblings as one group (if any): + */ + list_for_each_entry(event, &group_event->sibling_list, group_entry) { + event->tstamp_running += now - event->tstamp_stopped; + } +} + static int group_sched_in(struct perf_event *group_event, struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { struct perf_event *event, *partial_group = NULL; - const struct pmu *pmu = group_event->pmu; - bool txn = false; + struct pmu *pmu = group_event->pmu; if (group_event->state == PERF_EVENT_STATE_OFF) return 0; - /* Check if group transaction availabe */ - if (pmu->start_txn) - txn = true; + pmu->start_txn(pmu); - if (txn) - pmu->start_txn(pmu); - - if (event_sched_in(group_event, cpuctx, ctx)) { - if (txn) - pmu->cancel_txn(pmu); + /* + * use __event_sched_in() to delay updating tstamp_running + * until the transaction is committed. In case of failure + * we will keep an unmodified tstamp_running which is a + * requirement to get correct timing information + */ + if (__event_sched_in(group_event, cpuctx, ctx)) { + pmu->cancel_txn(pmu); return -EAGAIN; } @@ -714,29 +751,33 @@ group_sched_in(struct perf_event *group_event, * Schedule in siblings as one group (if any): */ list_for_each_entry(event, &group_event->sibling_list, group_entry) { - if (event_sched_in(event, cpuctx, ctx)) { + if (__event_sched_in(event, cpuctx, ctx)) { partial_group = event; goto group_error; } } - if (!txn || !pmu->commit_txn(pmu)) + if (!pmu->commit_txn(pmu)) { + /* commit tstamp_running */ + group_commit_event_sched_in(group_event, cpuctx, ctx); return 0; - + } group_error: /* * Groups can be scheduled in as one unit only, so undo any * partial group before returning: + * + * use __event_sched_out() to avoid updating tstamp_stopped + * because the event never actually ran */ list_for_each_entry(event, &group_event->sibling_list, group_entry) { if (event == partial_group) break; - event_sched_out(event, cpuctx, ctx); + __event_sched_out(event, cpuctx, ctx); } - event_sched_out(group_event, cpuctx, ctx); + __event_sched_out(group_event, cpuctx, ctx); - if (txn) - pmu->cancel_txn(pmu); + pmu->cancel_txn(pmu); return -EAGAIN; } @@ -789,10 +830,10 @@ static void add_event_to_ctx(struct perf_event *event, */ static void __perf_install_in_context(void *info) { - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); struct perf_event *event = info; struct perf_event_context *ctx = event->ctx; struct perf_event *leader = event->group_leader; + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); int err; /* @@ -812,12 +853,6 @@ static void __perf_install_in_context(void *info) ctx->is_active = 1; update_context_time(ctx); - /* - * Protect the list operation against NMI by disabling the - * events on a global level. NOP for non NMI based events. - */ - perf_disable(); - add_event_to_ctx(event, ctx); if (event->cpu != -1 && event->cpu != smp_processor_id()) @@ -855,12 +890,7 @@ static void __perf_install_in_context(void *info) } } - if (!err && !ctx->task && cpuctx->max_pertask) - cpuctx->max_pertask--; - - unlock: - perf_enable(); - +unlock: raw_spin_unlock(&ctx->lock); } @@ -883,6 +913,8 @@ perf_install_in_context(struct perf_event_context *ctx, { struct task_struct *task = ctx->task; + event->ctx = ctx; + if (!task) { /* * Per cpu events are installed via an smp call and @@ -931,10 +963,12 @@ static void __perf_event_mark_enabled(struct perf_event *event, event->state = PERF_EVENT_STATE_INACTIVE; event->tstamp_enabled = ctx->time - event->total_time_enabled; - list_for_each_entry(sub, &event->sibling_list, group_entry) - if (sub->state >= PERF_EVENT_STATE_INACTIVE) + list_for_each_entry(sub, &event->sibling_list, group_entry) { + if (sub->state >= PERF_EVENT_STATE_INACTIVE) { sub->tstamp_enabled = ctx->time - sub->total_time_enabled; + } + } } /* @@ -943,9 +977,9 @@ static void __perf_event_mark_enabled(struct perf_event *event, static void __perf_event_enable(void *info) { struct perf_event *event = info; - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); struct perf_event_context *ctx = event->ctx; struct perf_event *leader = event->group_leader; + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); int err; /* @@ -979,12 +1013,10 @@ static void __perf_event_enable(void *info) if (!group_can_go_on(event, cpuctx, 1)) { err = -EEXIST; } else { - perf_disable(); if (event == leader) err = group_sched_in(event, cpuctx, ctx); else err = event_sched_in(event, cpuctx, ctx); - perf_enable(); } if (err) { @@ -1000,7 +1032,7 @@ static void __perf_event_enable(void *info) } } - unlock: +unlock: raw_spin_unlock(&ctx->lock); } @@ -1041,7 +1073,7 @@ void perf_event_enable(struct perf_event *event) if (event->state == PERF_EVENT_STATE_ERROR) event->state = PERF_EVENT_STATE_OFF; - retry: +retry: raw_spin_unlock_irq(&ctx->lock); task_oncpu_function_call(task, __perf_event_enable, event); @@ -1061,7 +1093,7 @@ void perf_event_enable(struct perf_event *event) if (event->state == PERF_EVENT_STATE_OFF) __perf_event_mark_enabled(event, ctx); - out: +out: raw_spin_unlock_irq(&ctx->lock); } @@ -1092,26 +1124,26 @@ static void ctx_sched_out(struct perf_event_context *ctx, struct perf_event *event; raw_spin_lock(&ctx->lock); + perf_pmu_disable(ctx->pmu); ctx->is_active = 0; if (likely(!ctx->nr_events)) goto out; update_context_time(ctx); - perf_disable(); if (!ctx->nr_active) - goto out_enable; + goto out; - if (event_type & EVENT_PINNED) + if (event_type & EVENT_PINNED) { list_for_each_entry(event, &ctx->pinned_groups, group_entry) group_sched_out(event, cpuctx, ctx); + } - if (event_type & EVENT_FLEXIBLE) + if (event_type & EVENT_FLEXIBLE) { list_for_each_entry(event, &ctx->flexible_groups, group_entry) group_sched_out(event, cpuctx, ctx); - - out_enable: - perf_enable(); - out: + } +out: + perf_pmu_enable(ctx->pmu); raw_spin_unlock(&ctx->lock); } @@ -1209,34 +1241,25 @@ static void perf_event_sync_stat(struct perf_event_context *ctx, } } -/* - * Called from scheduler to remove the events of the current task, - * with interrupts disabled. - * - * We stop each event and update the event value in event->count. - * - * This does not protect us against NMI, but disable() - * sets the disabled bit in the control field of event _before_ - * accessing the event control register. If a NMI hits, then it will - * not restart the event. - */ -void perf_event_task_sched_out(struct task_struct *task, - struct task_struct *next) +void perf_event_context_sched_out(struct task_struct *task, int ctxn, + struct task_struct *next) { - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); - struct perf_event_context *ctx = task->perf_event_ctxp; + struct perf_event_context *ctx = task->perf_event_ctxp[ctxn]; struct perf_event_context *next_ctx; struct perf_event_context *parent; + struct perf_cpu_context *cpuctx; int do_switch = 1; - perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0); + if (likely(!ctx)) + return; - if (likely(!ctx || !cpuctx->task_ctx)) + cpuctx = __get_cpu_context(ctx); + if (!cpuctx->task_ctx) return; rcu_read_lock(); parent = rcu_dereference(ctx->parent_ctx); - next_ctx = next->perf_event_ctxp; + next_ctx = next->perf_event_ctxp[ctxn]; if (parent && next_ctx && rcu_dereference(next_ctx->parent_ctx) == parent) { /* @@ -1255,8 +1278,8 @@ void perf_event_task_sched_out(struct task_struct *task, * XXX do we need a memory barrier of sorts * wrt to rcu_dereference() of perf_event_ctxp */ - task->perf_event_ctxp = next_ctx; - next->perf_event_ctxp = ctx; + task->perf_event_ctxp[ctxn] = next_ctx; + next->perf_event_ctxp[ctxn] = ctx; ctx->task = next; next_ctx->task = task; do_switch = 0; @@ -1274,10 +1297,35 @@ void perf_event_task_sched_out(struct task_struct *task, } } +#define for_each_task_context_nr(ctxn) \ + for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++) + +/* + * Called from scheduler to remove the events of the current task, + * with interrupts disabled. + * + * We stop each event and update the event value in event->count. + * + * This does not protect us against NMI, but disable() + * sets the disabled bit in the control field of event _before_ + * accessing the event control register. If a NMI hits, then it will + * not restart the event. + */ +void __perf_event_task_sched_out(struct task_struct *task, + struct task_struct *next) +{ + int ctxn; + + perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0); + + for_each_task_context_nr(ctxn) + perf_event_context_sched_out(task, ctxn, next); +} + static void task_ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type) { - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); if (!cpuctx->task_ctx) return; @@ -1292,14 +1340,6 @@ static void task_ctx_sched_out(struct perf_event_context *ctx, /* * Called with IRQs disabled */ -static void __perf_event_task_sched_out(struct perf_event_context *ctx) -{ - task_ctx_sched_out(ctx, EVENT_ALL); -} - -/* - * Called with IRQs disabled - */ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, enum event_type_t event_type) { @@ -1350,9 +1390,10 @@ ctx_flexible_sched_in(struct perf_event_context *ctx, if (event->cpu != -1 && event->cpu != smp_processor_id()) continue; - if (group_can_go_on(event, cpuctx, can_add_hw)) + if (group_can_go_on(event, cpuctx, can_add_hw)) { if (group_sched_in(event, cpuctx, ctx)) can_add_hw = 0; + } } } @@ -1368,8 +1409,6 @@ ctx_sched_in(struct perf_event_context *ctx, ctx->timestamp = perf_clock(); - perf_disable(); - /* * First go through the list and put on any pinned groups * in order to give them the best chance of going on. @@ -1381,8 +1420,7 @@ ctx_sched_in(struct perf_event_context *ctx, if (event_type & EVENT_FLEXIBLE) ctx_flexible_sched_in(ctx, cpuctx); - perf_enable(); - out: +out: raw_spin_unlock(&ctx->lock); } @@ -1394,43 +1432,28 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, ctx_sched_in(ctx, cpuctx, event_type); } -static void task_ctx_sched_in(struct task_struct *task, +static void task_ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type) { - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); - struct perf_event_context *ctx = task->perf_event_ctxp; + struct perf_cpu_context *cpuctx; - if (likely(!ctx)) - return; + cpuctx = __get_cpu_context(ctx); if (cpuctx->task_ctx == ctx) return; + ctx_sched_in(ctx, cpuctx, event_type); cpuctx->task_ctx = ctx; } -/* - * Called from scheduler to add the events of the current task - * with interrupts disabled. - * - * We restore the event value and then enable it. - * - * This does not protect us against NMI, but enable() - * sets the enabled bit in the control field of event _before_ - * accessing the event control register. If a NMI hits, then it will - * keep the event running. - */ -void perf_event_task_sched_in(struct task_struct *task) -{ - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); - struct perf_event_context *ctx = task->perf_event_ctxp; - if (likely(!ctx)) - return; +void perf_event_context_sched_in(struct perf_event_context *ctx) +{ + struct perf_cpu_context *cpuctx; + cpuctx = __get_cpu_context(ctx); if (cpuctx->task_ctx == ctx) return; - perf_disable(); - + perf_pmu_disable(ctx->pmu); /* * We want to keep the following priority order: * cpu pinned (that don't need to move), task pinned, @@ -1444,7 +1467,37 @@ void perf_event_task_sched_in(struct task_struct *task) cpuctx->task_ctx = ctx; - perf_enable(); + /* + * Since these rotations are per-cpu, we need to ensure the + * cpu-context we got scheduled on is actually rotating. + */ + perf_pmu_rotate_start(ctx->pmu); + perf_pmu_enable(ctx->pmu); +} + +/* + * Called from scheduler to add the events of the current task + * with interrupts disabled. + * + * We restore the event value and then enable it. + * + * This does not protect us against NMI, but enable() + * sets the enabled bit in the control field of event _before_ + * accessing the event control register. If a NMI hits, then it will + * keep the event running. + */ +void __perf_event_task_sched_in(struct task_struct *task) +{ + struct perf_event_context *ctx; + int ctxn; + + for_each_task_context_nr(ctxn) { + ctx = task->perf_event_ctxp[ctxn]; + if (likely(!ctx)) + continue; + + perf_event_context_sched_in(ctx); + } } #define MAX_INTERRUPTS (~0ULL) @@ -1524,22 +1577,6 @@ do { \ return div64_u64(dividend, divisor); } -static void perf_event_stop(struct perf_event *event) -{ - if (!event->pmu->stop) - return event->pmu->disable(event); - - return event->pmu->stop(event); -} - -static int perf_event_start(struct perf_event *event) -{ - if (!event->pmu->start) - return event->pmu->enable(event); - - return event->pmu->start(event); -} - static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) { struct hw_perf_event *hwc = &event->hw; @@ -1559,15 +1596,13 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) hwc->sample_period = sample_period; if (local64_read(&hwc->period_left) > 8*sample_period) { - perf_disable(); - perf_event_stop(event); + event->pmu->stop(event, PERF_EF_UPDATE); local64_set(&hwc->period_left, 0); - perf_event_start(event); - perf_enable(); + event->pmu->start(event, PERF_EF_RELOAD); } } -static void perf_ctx_adjust_freq(struct perf_event_context *ctx) +static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period) { struct perf_event *event; struct hw_perf_event *hwc; @@ -1592,23 +1627,19 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx) */ if (interrupts == MAX_INTERRUPTS) { perf_log_throttle(event, 1); - perf_disable(); - event->pmu->unthrottle(event); - perf_enable(); + event->pmu->start(event, 0); } if (!event->attr.freq || !event->attr.sample_freq) continue; - perf_disable(); event->pmu->read(event); now = local64_read(&event->count); delta = now - hwc->freq_count_stamp; hwc->freq_count_stamp = now; if (delta > 0) - perf_adjust_period(event, TICK_NSEC, delta); - perf_enable(); + perf_adjust_period(event, period, delta); } raw_spin_unlock(&ctx->lock); } @@ -1626,32 +1657,38 @@ static void rotate_ctx(struct perf_event_context *ctx) raw_spin_unlock(&ctx->lock); } -void perf_event_task_tick(struct task_struct *curr) +/* + * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized + * because they're strictly cpu affine and rotate_start is called with IRQs + * disabled, while rotate_context is called from IRQ context. + */ +static void perf_rotate_context(struct perf_cpu_context *cpuctx) { - struct perf_cpu_context *cpuctx; - struct perf_event_context *ctx; - int rotate = 0; - - if (!atomic_read(&nr_events)) - return; + u64 interval = (u64)cpuctx->jiffies_interval * TICK_NSEC; + struct perf_event_context *ctx = NULL; + int rotate = 0, remove = 1; - cpuctx = &__get_cpu_var(perf_cpu_context); - if (cpuctx->ctx.nr_events && - cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) - rotate = 1; + if (cpuctx->ctx.nr_events) { + remove = 0; + if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) + rotate = 1; + } - ctx = curr->perf_event_ctxp; - if (ctx && ctx->nr_events && ctx->nr_events != ctx->nr_active) - rotate = 1; + ctx = cpuctx->task_ctx; + if (ctx && ctx->nr_events) { + remove = 0; + if (ctx->nr_events != ctx->nr_active) + rotate = 1; + } - perf_ctx_adjust_freq(&cpuctx->ctx); + perf_pmu_disable(cpuctx->ctx.pmu); + perf_ctx_adjust_freq(&cpuctx->ctx, interval); if (ctx) - perf_ctx_adjust_freq(ctx); + perf_ctx_adjust_freq(ctx, interval); if (!rotate) - return; + goto done; - perf_disable(); cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); if (ctx) task_ctx_sched_out(ctx, EVENT_FLEXIBLE); @@ -1662,8 +1699,27 @@ void perf_event_task_tick(struct task_struct *curr) cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); if (ctx) - task_ctx_sched_in(curr, EVENT_FLEXIBLE); - perf_enable(); + task_ctx_sched_in(ctx, EVENT_FLEXIBLE); + +done: + if (remove) + list_del_init(&cpuctx->rotation_list); + + perf_pmu_enable(cpuctx->ctx.pmu); +} + +void perf_event_task_tick(void) +{ + struct list_head *head = &__get_cpu_var(rotation_list); + struct perf_cpu_context *cpuctx, *tmp; + + WARN_ON(!irqs_disabled()); + + list_for_each_entry_safe(cpuctx, tmp, head, rotation_list) { + if (cpuctx->jiffies_interval == 1 || + !(jiffies % cpuctx->jiffies_interval)) + perf_rotate_context(cpuctx); + } } static int event_enable_on_exec(struct perf_event *event, @@ -1685,20 +1741,18 @@ static int event_enable_on_exec(struct perf_event *event, * Enable all of a task's events that have been marked enable-on-exec. * This expects task == current. */ -static void perf_event_enable_on_exec(struct task_struct *task) +static void perf_event_enable_on_exec(struct perf_event_context *ctx) { - struct perf_event_context *ctx; struct perf_event *event; unsigned long flags; int enabled = 0; int ret; local_irq_save(flags); - ctx = task->perf_event_ctxp; if (!ctx || !ctx->nr_events) goto out; - __perf_event_task_sched_out(ctx); + task_ctx_sched_out(ctx, EVENT_ALL); raw_spin_lock(&ctx->lock); @@ -1722,8 +1776,8 @@ static void perf_event_enable_on_exec(struct task_struct *task) raw_spin_unlock(&ctx->lock); - perf_event_task_sched_in(task); - out: + perf_event_context_sched_in(ctx); +out: local_irq_restore(flags); } @@ -1732,9 +1786,9 @@ static void perf_event_enable_on_exec(struct task_struct *task) */ static void __perf_event_read(void *info) { - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); struct perf_event *event = info; struct perf_event_context *ctx = event->ctx; + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); /* * If this is a task context, we need to check whether it is @@ -1773,7 +1827,13 @@ static u64 perf_event_read(struct perf_event *event) unsigned long flags; raw_spin_lock_irqsave(&ctx->lock, flags); - update_context_time(ctx); + /* + * may read while context is not active + * (e.g., thread is blocked), in that case + * we cannot update context time + */ + if (ctx->is_active) + update_context_time(ctx); update_event_times(event); raw_spin_unlock_irqrestore(&ctx->lock, flags); } @@ -1782,11 +1842,219 @@ static u64 perf_event_read(struct perf_event *event) } /* - * Initialize the perf_event context in a task_struct: + * Callchain support */ + +struct callchain_cpus_entries { + struct rcu_head rcu_head; + struct perf_callchain_entry *cpu_entries[0]; +}; + +static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]); +static atomic_t nr_callchain_events; +static DEFINE_MUTEX(callchain_mutex); +struct callchain_cpus_entries *callchain_cpus_entries; + + +__weak void perf_callchain_kernel(struct perf_callchain_entry *entry, + struct pt_regs *regs) +{ +} + +__weak void perf_callchain_user(struct perf_callchain_entry *entry, + struct pt_regs *regs) +{ +} + +static void release_callchain_buffers_rcu(struct rcu_head *head) +{ + struct callchain_cpus_entries *entries; + int cpu; + + entries = container_of(head, struct callchain_cpus_entries, rcu_head); + + for_each_possible_cpu(cpu) + kfree(entries->cpu_entries[cpu]); + + kfree(entries); +} + +static void release_callchain_buffers(void) +{ + struct callchain_cpus_entries *entries; + + entries = callchain_cpus_entries; + rcu_assign_pointer(callchain_cpus_entries, NULL); + call_rcu(&entries->rcu_head, release_callchain_buffers_rcu); +} + +static int alloc_callchain_buffers(void) +{ + int cpu; + int size; + struct callchain_cpus_entries *entries; + + /* + * We can't use the percpu allocation API for data that can be + * accessed from NMI. Use a temporary manual per cpu allocation + * until that gets sorted out. + */ + size = sizeof(*entries) + sizeof(struct perf_callchain_entry *) * + num_possible_cpus(); + + entries = kzalloc(size, GFP_KERNEL); + if (!entries) + return -ENOMEM; + + size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS; + + for_each_possible_cpu(cpu) { + entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL, + cpu_to_node(cpu)); + if (!entries->cpu_entries[cpu]) + goto fail; + } + + rcu_assign_pointer(callchain_cpus_entries, entries); + + return 0; + +fail: + for_each_possible_cpu(cpu) + kfree(entries->cpu_entries[cpu]); + kfree(entries); + + return -ENOMEM; +} + +static int get_callchain_buffers(void) +{ + int err = 0; + int count; + + mutex_lock(&callchain_mutex); + + count = atomic_inc_return(&nr_callchain_events); + if (WARN_ON_ONCE(count < 1)) { + err = -EINVAL; + goto exit; + } + + if (count > 1) { + /* If the allocation failed, give up */ + if (!callchain_cpus_entries) + err = -ENOMEM; + goto exit; + } + + err = alloc_callchain_buffers(); + if (err) + release_callchain_buffers(); +exit: + mutex_unlock(&callchain_mutex); + + return err; +} + +static void put_callchain_buffers(void) +{ + if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) { + release_callchain_buffers(); + mutex_unlock(&callchain_mutex); + } +} + +static int get_recursion_context(int *recursion) +{ + int rctx; + + if (in_nmi()) + rctx = 3; + else if (in_irq()) + rctx = 2; + else if (in_softirq()) + rctx = 1; + else + rctx = 0; + + if (recursion[rctx]) + return -1; + + recursion[rctx]++; + barrier(); + + return rctx; +} + +static inline void put_recursion_context(int *recursion, int rctx) +{ + barrier(); + recursion[rctx]--; +} + +static struct perf_callchain_entry *get_callchain_entry(int *rctx) +{ + int cpu; + struct callchain_cpus_entries *entries; + + *rctx = get_recursion_context(__get_cpu_var(callchain_recursion)); + if (*rctx == -1) + return NULL; + + entries = rcu_dereference(callchain_cpus_entries); + if (!entries) + return NULL; + + cpu = smp_processor_id(); + + return &entries->cpu_entries[cpu][*rctx]; +} + static void -__perf_event_init_context(struct perf_event_context *ctx, - struct task_struct *task) +put_callchain_entry(int rctx) +{ + put_recursion_context(__get_cpu_var(callchain_recursion), rctx); +} + +static struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) +{ + int rctx; + struct perf_callchain_entry *entry; + + + entry = get_callchain_entry(&rctx); + if (rctx == -1) + return NULL; + + if (!entry) + goto exit_put; + + entry->nr = 0; + + if (!user_mode(regs)) { + perf_callchain_store(entry, PERF_CONTEXT_KERNEL); + perf_callchain_kernel(entry, regs); + if (current->mm) + regs = task_pt_regs(current); + else + regs = NULL; + } + + if (regs) { + perf_callchain_store(entry, PERF_CONTEXT_USER); + perf_callchain_user(entry, regs); + } + +exit_put: + put_callchain_entry(rctx); + + return entry; +} + +/* + * Initialize the perf_event context in a task_struct: + */ +static void __perf_event_init_context(struct perf_event_context *ctx) { raw_spin_lock_init(&ctx->lock); mutex_init(&ctx->mutex); @@ -1794,45 +2062,38 @@ __perf_event_init_context(struct perf_event_context *ctx, INIT_LIST_HEAD(&ctx->flexible_groups); INIT_LIST_HEAD(&ctx->event_list); atomic_set(&ctx->refcount, 1); - ctx->task = task; } -static struct perf_event_context *find_get_context(pid_t pid, int cpu) +static struct perf_event_context * +alloc_perf_context(struct pmu *pmu, struct task_struct *task) { struct perf_event_context *ctx; - struct perf_cpu_context *cpuctx; - struct task_struct *task; - unsigned long flags; - int err; - - if (pid == -1 && cpu != -1) { - /* Must be root to operate on a CPU event: */ - if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) - return ERR_PTR(-EACCES); - if (cpu < 0 || cpu >= nr_cpumask_bits) - return ERR_PTR(-EINVAL); + ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL); + if (!ctx) + return NULL; - /* - * We could be clever and allow to attach a event to an - * offline CPU and activate it when the CPU comes up, but - * that's for later. - */ - if (!cpu_online(cpu)) - return ERR_PTR(-ENODEV); + __perf_event_init_context(ctx); + if (task) { + ctx->task = task; + get_task_struct(task); + } + ctx->pmu = pmu; - cpuctx = &per_cpu(perf_cpu_context, cpu); - ctx = &cpuctx->ctx; - get_ctx(ctx); + return ctx; +} - return ctx; - } +static struct task_struct * +find_lively_task_by_vpid(pid_t vpid) +{ + struct task_struct *task; + int err; rcu_read_lock(); - if (!pid) + if (!vpid) task = current; else - task = find_task_by_vpid(pid); + task = find_task_by_vpid(vpid); if (task) get_task_struct(task); rcu_read_unlock(); @@ -1852,36 +2113,78 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu) if (!ptrace_may_access(task, PTRACE_MODE_READ)) goto errout; - retry: - ctx = perf_lock_task_context(task, &flags); + return task; +errout: + put_task_struct(task); + return ERR_PTR(err); + +} + +static struct perf_event_context * +find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) +{ + struct perf_event_context *ctx; + struct perf_cpu_context *cpuctx; + unsigned long flags; + int ctxn, err; + + if (!task && cpu != -1) { + /* Must be root to operate on a CPU event: */ + if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EACCES); + + if (cpu < 0 || cpu >= nr_cpumask_bits) + return ERR_PTR(-EINVAL); + + /* + * We could be clever and allow to attach a event to an + * offline CPU and activate it when the CPU comes up, but + * that's for later. + */ + if (!cpu_online(cpu)) + return ERR_PTR(-ENODEV); + + cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); + ctx = &cpuctx->ctx; + get_ctx(ctx); + + return ctx; + } + + err = -EINVAL; + ctxn = pmu->task_ctx_nr; + if (ctxn < 0) + goto errout; + +retry: + ctx = perf_lock_task_context(task, ctxn, &flags); if (ctx) { unclone_ctx(ctx); raw_spin_unlock_irqrestore(&ctx->lock, flags); } if (!ctx) { - ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL); + ctx = alloc_perf_context(pmu, task); err = -ENOMEM; if (!ctx) goto errout; - __perf_event_init_context(ctx, task); + get_ctx(ctx); - if (cmpxchg(&task->perf_event_ctxp, NULL, ctx)) { + + if (cmpxchg(&task->perf_event_ctxp[ctxn], NULL, ctx)) { /* * We raced with some other task; use * the context they set. */ + put_task_struct(task); kfree(ctx); goto retry; } - get_task_struct(task); } - put_task_struct(task); return ctx; - errout: - put_task_struct(task); +errout: return ERR_PTR(err); } @@ -1898,21 +2201,23 @@ static void free_event_rcu(struct rcu_head *head) kfree(event); } -static void perf_pending_sync(struct perf_event *event); static void perf_buffer_put(struct perf_buffer *buffer); static void free_event(struct perf_event *event) { - perf_pending_sync(event); + irq_work_sync(&event->pending); if (!event->parent) { - atomic_dec(&nr_events); + if (event->attach_state & PERF_ATTACH_TASK) + jump_label_dec(&perf_task_events); if (event->attr.mmap || event->attr.mmap_data) atomic_dec(&nr_mmap_events); if (event->attr.comm) atomic_dec(&nr_comm_events); if (event->attr.task) atomic_dec(&nr_task_events); + if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) + put_callchain_buffers(); } if (event->buffer) { @@ -1923,7 +2228,9 @@ static void free_event(struct perf_event *event) if (event->destroy) event->destroy(event); - put_ctx(event->ctx); + if (event->ctx) + put_ctx(event->ctx); + call_rcu(&event->rcu_head, free_event_rcu); } @@ -2342,6 +2649,9 @@ int perf_event_task_disable(void) static int perf_event_index(struct perf_event *event) { + if (event->hw.state & PERF_HES_STOPPED) + return 0; + if (event->state != PERF_EVENT_STATE_ACTIVE) return 0; @@ -2845,16 +3155,7 @@ void perf_event_wakeup(struct perf_event *event) } } -/* - * Pending wakeups - * - * Handle the case where we need to wakeup up from NMI (or rq->lock) context. - * - * The NMI bit means we cannot possibly take locks. Therefore, maintain a - * single linked list and use cmpxchg() to add entries lockless. - */ - -static void perf_pending_event(struct perf_pending_entry *entry) +static void perf_pending_event(struct irq_work *entry) { struct perf_event *event = container_of(entry, struct perf_event, pending); @@ -2870,99 +3171,6 @@ static void perf_pending_event(struct perf_pending_entry *entry) } } -#define PENDING_TAIL ((struct perf_pending_entry *)-1UL) - -static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = { - PENDING_TAIL, -}; - -static void perf_pending_queue(struct perf_pending_entry *entry, - void (*func)(struct perf_pending_entry *)) -{ - struct perf_pending_entry **head; - - if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL) - return; - - entry->func = func; - - head = &get_cpu_var(perf_pending_head); - - do { - entry->next = *head; - } while (cmpxchg(head, entry->next, entry) != entry->next); - - set_perf_event_pending(); - - put_cpu_var(perf_pending_head); -} - -static int __perf_pending_run(void) -{ - struct perf_pending_entry *list; - int nr = 0; - - list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL); - while (list != PENDING_TAIL) { - void (*func)(struct perf_pending_entry *); - struct perf_pending_entry *entry = list; - - list = list->next; - - func = entry->func; - entry->next = NULL; - /* - * Ensure we observe the unqueue before we issue the wakeup, - * so that we won't be waiting forever. - * -- see perf_not_pending(). - */ - smp_wmb(); - - func(entry); - nr++; - } - - return nr; -} - -static inline int perf_not_pending(struct perf_event *event) -{ - /* - * If we flush on whatever cpu we run, there is a chance we don't - * need to wait. - */ - get_cpu(); - __perf_pending_run(); - put_cpu(); - - /* - * Ensure we see the proper queue state before going to sleep - * so that we do not miss the wakeup. -- see perf_pending_handle() - */ - smp_rmb(); - return event->pending.next == NULL; -} - -static void perf_pending_sync(struct perf_event *event) -{ - wait_event(event->waitq, perf_not_pending(event)); -} - -void perf_event_do_pending(void) -{ - __perf_pending_run(); -} - -/* - * Callchain support -- arch specific - */ - -__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) -{ - return NULL; -} - - /* * We assume there is only KVM supporting the callbacks. * Later on, we might change it to a list if there is @@ -3012,8 +3220,7 @@ static void perf_output_wakeup(struct perf_output_handle *handle) if (handle->nmi) { handle->event->pending_wakeup = 1; - perf_pending_queue(&handle->event->pending, - perf_pending_event); + irq_work_queue(&handle->event->pending); } else perf_event_wakeup(handle->event); } @@ -3069,7 +3276,7 @@ again: if (handle->wakeup != local_read(&buffer->wakeup)) perf_output_wakeup(handle); - out: +out: preempt_enable(); } @@ -3457,14 +3664,20 @@ static void perf_event_output(struct perf_event *event, int nmi, struct perf_output_handle handle; struct perf_event_header header; + /* protect the callchain buffers */ + rcu_read_lock(); + perf_prepare_sample(&header, data, event, regs); if (perf_output_begin(&handle, event, header.size, nmi, 1)) - return; + goto exit; perf_output_sample(&handle, &header, data, event); perf_output_end(&handle); + +exit: + rcu_read_unlock(); } /* @@ -3578,16 +3791,27 @@ static void perf_event_task_ctx(struct perf_event_context *ctx, static void perf_event_task_event(struct perf_task_event *task_event) { struct perf_cpu_context *cpuctx; - struct perf_event_context *ctx = task_event->task_ctx; + struct perf_event_context *ctx; + struct pmu *pmu; + int ctxn; rcu_read_lock(); - cpuctx = &get_cpu_var(perf_cpu_context); - perf_event_task_ctx(&cpuctx->ctx, task_event); - if (!ctx) - ctx = rcu_dereference(current->perf_event_ctxp); - if (ctx) - perf_event_task_ctx(ctx, task_event); - put_cpu_var(perf_cpu_context); + list_for_each_entry_rcu(pmu, &pmus, entry) { + cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); + perf_event_task_ctx(&cpuctx->ctx, task_event); + + ctx = task_event->task_ctx; + if (!ctx) { + ctxn = pmu->task_ctx_nr; + if (ctxn < 0) + goto next; + ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); + } + if (ctx) + perf_event_task_ctx(ctx, task_event); +next: + put_cpu_ptr(pmu->pmu_cpu_context); + } rcu_read_unlock(); } @@ -3692,8 +3916,10 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) { struct perf_cpu_context *cpuctx; struct perf_event_context *ctx; - unsigned int size; char comm[TASK_COMM_LEN]; + unsigned int size; + struct pmu *pmu; + int ctxn; memset(comm, 0, sizeof(comm)); strlcpy(comm, comm_event->task->comm, sizeof(comm)); @@ -3705,21 +3931,36 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; rcu_read_lock(); - cpuctx = &get_cpu_var(perf_cpu_context); - perf_event_comm_ctx(&cpuctx->ctx, comm_event); - ctx = rcu_dereference(current->perf_event_ctxp); - if (ctx) - perf_event_comm_ctx(ctx, comm_event); - put_cpu_var(perf_cpu_context); + list_for_each_entry_rcu(pmu, &pmus, entry) { + cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); + perf_event_comm_ctx(&cpuctx->ctx, comm_event); + + ctxn = pmu->task_ctx_nr; + if (ctxn < 0) + goto next; + + ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); + if (ctx) + perf_event_comm_ctx(ctx, comm_event); +next: + put_cpu_ptr(pmu->pmu_cpu_context); + } rcu_read_unlock(); } void perf_event_comm(struct task_struct *task) { struct perf_comm_event comm_event; + struct perf_event_context *ctx; + int ctxn; + + for_each_task_context_nr(ctxn) { + ctx = task->perf_event_ctxp[ctxn]; + if (!ctx) + continue; - if (task->perf_event_ctxp) - perf_event_enable_on_exec(task); + perf_event_enable_on_exec(ctx); + } if (!atomic_read(&nr_comm_events)) return; @@ -3821,6 +4062,8 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) char tmp[16]; char *buf = NULL; const char *name; + struct pmu *pmu; + int ctxn; memset(tmp, 0, sizeof(tmp)); @@ -3873,12 +4116,23 @@ got_name: mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; rcu_read_lock(); - cpuctx = &get_cpu_var(perf_cpu_context); - perf_event_mmap_ctx(&cpuctx->ctx, mmap_event, vma->vm_flags & VM_EXEC); - ctx = rcu_dereference(current->perf_event_ctxp); - if (ctx) - perf_event_mmap_ctx(ctx, mmap_event, vma->vm_flags & VM_EXEC); - put_cpu_var(perf_cpu_context); + list_for_each_entry_rcu(pmu, &pmus, entry) { + cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); + perf_event_mmap_ctx(&cpuctx->ctx, mmap_event, + vma->vm_flags & VM_EXEC); + + ctxn = pmu->task_ctx_nr; + if (ctxn < 0) + goto next; + + ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); + if (ctx) { + perf_event_mmap_ctx(ctx, mmap_event, + vma->vm_flags & VM_EXEC); + } +next: + put_cpu_ptr(pmu->pmu_cpu_context); + } rcu_read_unlock(); kfree(buf); @@ -3960,8 +4214,6 @@ static int __perf_event_overflow(struct perf_event *event, int nmi, struct hw_perf_event *hwc = &event->hw; int ret = 0; - throttle = (throttle && event->pmu->unthrottle != NULL); - if (!throttle) { hwc->interrupts++; } else { @@ -4004,8 +4256,7 @@ static int __perf_event_overflow(struct perf_event *event, int nmi, event->pending_kill = POLL_HUP; if (nmi) { event->pending_disable = 1; - perf_pending_queue(&event->pending, - perf_pending_event); + irq_work_queue(&event->pending); } else perf_event_disable(event); } @@ -4029,6 +4280,17 @@ int perf_event_overflow(struct perf_event *event, int nmi, * Generic software event infrastructure */ +struct swevent_htable { + struct swevent_hlist *swevent_hlist; + struct mutex hlist_mutex; + int hlist_refcount; + + /* Recursion avoidance in each contexts */ + int recursion[PERF_NR_CONTEXTS]; +}; + +static DEFINE_PER_CPU(struct swevent_htable, swevent_htable); + /* * We directly increment event->count and keep a second value in * event->hw.period_left to count intervals. This period event @@ -4086,7 +4348,7 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow, } } -static void perf_swevent_add(struct perf_event *event, u64 nr, +static void perf_swevent_event(struct perf_event *event, u64 nr, int nmi, struct perf_sample_data *data, struct pt_regs *regs) { @@ -4112,6 +4374,9 @@ static void perf_swevent_add(struct perf_event *event, u64 nr, static int perf_exclude_event(struct perf_event *event, struct pt_regs *regs) { + if (event->hw.state & PERF_HES_STOPPED) + return 0; + if (regs) { if (event->attr.exclude_user && user_mode(regs)) return 1; @@ -4158,11 +4423,11 @@ __find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id) /* For the read side: events when they trigger */ static inline struct hlist_head * -find_swevent_head_rcu(struct perf_cpu_context *ctx, u64 type, u32 event_id) +find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id) { struct swevent_hlist *hlist; - hlist = rcu_dereference(ctx->swevent_hlist); + hlist = rcu_dereference(swhash->swevent_hlist); if (!hlist) return NULL; @@ -4171,7 +4436,7 @@ find_swevent_head_rcu(struct perf_cpu_context *ctx, u64 type, u32 event_id) /* For the event head insertion and removal in the hlist */ static inline struct hlist_head * -find_swevent_head(struct perf_cpu_context *ctx, struct perf_event *event) +find_swevent_head(struct swevent_htable *swhash, struct perf_event *event) { struct swevent_hlist *hlist; u32 event_id = event->attr.config; @@ -4182,7 +4447,7 @@ find_swevent_head(struct perf_cpu_context *ctx, struct perf_event *event) * and release. Which makes the protected version suitable here. * The context lock guarantees that. */ - hlist = rcu_dereference_protected(ctx->swevent_hlist, + hlist = rcu_dereference_protected(swhash->swevent_hlist, lockdep_is_held(&event->ctx->lock)); if (!hlist) return NULL; @@ -4195,23 +4460,19 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id, struct perf_sample_data *data, struct pt_regs *regs) { - struct perf_cpu_context *cpuctx; + struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); struct perf_event *event; struct hlist_node *node; struct hlist_head *head; - cpuctx = &__get_cpu_var(perf_cpu_context); - rcu_read_lock(); - - head = find_swevent_head_rcu(cpuctx, type, event_id); - + head = find_swevent_head_rcu(swhash, type, event_id); if (!head) goto end; hlist_for_each_entry_rcu(event, node, head, hlist_entry) { if (perf_swevent_match(event, type, event_id, data, regs)) - perf_swevent_add(event, nr, nmi, data, regs); + perf_swevent_event(event, nr, nmi, data, regs); } end: rcu_read_unlock(); @@ -4219,33 +4480,17 @@ end: int perf_swevent_get_recursion_context(void) { - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); - int rctx; - - if (in_nmi()) - rctx = 3; - else if (in_irq()) - rctx = 2; - else if (in_softirq()) - rctx = 1; - else - rctx = 0; - - if (cpuctx->recursion[rctx]) - return -1; + struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); - cpuctx->recursion[rctx]++; - barrier(); - - return rctx; + return get_recursion_context(swhash->recursion); } EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); void inline perf_swevent_put_recursion_context(int rctx) { - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); - barrier(); - cpuctx->recursion[rctx]--; + struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); + + put_recursion_context(swhash->recursion, rctx); } void __perf_sw_event(u32 event_id, u64 nr, int nmi, @@ -4271,20 +4516,20 @@ static void perf_swevent_read(struct perf_event *event) { } -static int perf_swevent_enable(struct perf_event *event) +static int perf_swevent_add(struct perf_event *event, int flags) { + struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); struct hw_perf_event *hwc = &event->hw; - struct perf_cpu_context *cpuctx; struct hlist_head *head; - cpuctx = &__get_cpu_var(perf_cpu_context); - if (hwc->sample_period) { hwc->last_period = hwc->sample_period; perf_swevent_set_period(event); } - head = find_swevent_head(cpuctx, event); + hwc->state = !(flags & PERF_EF_START); + + head = find_swevent_head(swhash, event); if (WARN_ON_ONCE(!head)) return -EINVAL; @@ -4293,202 +4538,27 @@ static int perf_swevent_enable(struct perf_event *event) return 0; } -static void perf_swevent_disable(struct perf_event *event) +static void perf_swevent_del(struct perf_event *event, int flags) { hlist_del_rcu(&event->hlist_entry); } -static void perf_swevent_void(struct perf_event *event) -{ -} - -static int perf_swevent_int(struct perf_event *event) -{ - return 0; -} - -static const struct pmu perf_ops_generic = { - .enable = perf_swevent_enable, - .disable = perf_swevent_disable, - .start = perf_swevent_int, - .stop = perf_swevent_void, - .read = perf_swevent_read, - .unthrottle = perf_swevent_void, /* hwc->interrupts already reset */ -}; - -/* - * hrtimer based swevent callback - */ - -static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) +static void perf_swevent_start(struct perf_event *event, int flags) { - enum hrtimer_restart ret = HRTIMER_RESTART; - struct perf_sample_data data; - struct pt_regs *regs; - struct perf_event *event; - u64 period; - - event = container_of(hrtimer, struct perf_event, hw.hrtimer); - event->pmu->read(event); - - perf_sample_data_init(&data, 0); - data.period = event->hw.last_period; - regs = get_irq_regs(); - - if (regs && !perf_exclude_event(event, regs)) { - if (!(event->attr.exclude_idle && current->pid == 0)) - if (perf_event_overflow(event, 0, &data, regs)) - ret = HRTIMER_NORESTART; - } - - period = max_t(u64, 10000, event->hw.sample_period); - hrtimer_forward_now(hrtimer, ns_to_ktime(period)); - - return ret; + event->hw.state = 0; } -static void perf_swevent_start_hrtimer(struct perf_event *event) +static void perf_swevent_stop(struct perf_event *event, int flags) { - struct hw_perf_event *hwc = &event->hw; - - hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hwc->hrtimer.function = perf_swevent_hrtimer; - if (hwc->sample_period) { - u64 period; - - if (hwc->remaining) { - if (hwc->remaining < 0) - period = 10000; - else - period = hwc->remaining; - hwc->remaining = 0; - } else { - period = max_t(u64, 10000, hwc->sample_period); - } - __hrtimer_start_range_ns(&hwc->hrtimer, - ns_to_ktime(period), 0, - HRTIMER_MODE_REL, 0); - } -} - -static void perf_swevent_cancel_hrtimer(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - - if (hwc->sample_period) { - ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer); - hwc->remaining = ktime_to_ns(remaining); - - hrtimer_cancel(&hwc->hrtimer); - } -} - -/* - * Software event: cpu wall time clock - */ - -static void cpu_clock_perf_event_update(struct perf_event *event) -{ - int cpu = raw_smp_processor_id(); - s64 prev; - u64 now; - - now = cpu_clock(cpu); - prev = local64_xchg(&event->hw.prev_count, now); - local64_add(now - prev, &event->count); -} - -static int cpu_clock_perf_event_enable(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - int cpu = raw_smp_processor_id(); - - local64_set(&hwc->prev_count, cpu_clock(cpu)); - perf_swevent_start_hrtimer(event); - - return 0; -} - -static void cpu_clock_perf_event_disable(struct perf_event *event) -{ - perf_swevent_cancel_hrtimer(event); - cpu_clock_perf_event_update(event); -} - -static void cpu_clock_perf_event_read(struct perf_event *event) -{ - cpu_clock_perf_event_update(event); -} - -static const struct pmu perf_ops_cpu_clock = { - .enable = cpu_clock_perf_event_enable, - .disable = cpu_clock_perf_event_disable, - .read = cpu_clock_perf_event_read, -}; - -/* - * Software event: task time clock - */ - -static void task_clock_perf_event_update(struct perf_event *event, u64 now) -{ - u64 prev; - s64 delta; - - prev = local64_xchg(&event->hw.prev_count, now); - delta = now - prev; - local64_add(delta, &event->count); -} - -static int task_clock_perf_event_enable(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - u64 now; - - now = event->ctx->time; - - local64_set(&hwc->prev_count, now); - - perf_swevent_start_hrtimer(event); - - return 0; -} - -static void task_clock_perf_event_disable(struct perf_event *event) -{ - perf_swevent_cancel_hrtimer(event); - task_clock_perf_event_update(event, event->ctx->time); - -} - -static void task_clock_perf_event_read(struct perf_event *event) -{ - u64 time; - - if (!in_nmi()) { - update_context_time(event->ctx); - time = event->ctx->time; - } else { - u64 now = perf_clock(); - u64 delta = now - event->ctx->timestamp; - time = event->ctx->time + delta; - } - - task_clock_perf_event_update(event, time); + event->hw.state = PERF_HES_STOPPED; } -static const struct pmu perf_ops_task_clock = { - .enable = task_clock_perf_event_enable, - .disable = task_clock_perf_event_disable, - .read = task_clock_perf_event_read, -}; - /* Deref the hlist from the update side */ static inline struct swevent_hlist * -swevent_hlist_deref(struct perf_cpu_context *cpuctx) +swevent_hlist_deref(struct swevent_htable *swhash) { - return rcu_dereference_protected(cpuctx->swevent_hlist, - lockdep_is_held(&cpuctx->hlist_mutex)); + return rcu_dereference_protected(swhash->swevent_hlist, + lockdep_is_held(&swhash->hlist_mutex)); } static void swevent_hlist_release_rcu(struct rcu_head *rcu_head) @@ -4499,27 +4569,27 @@ static void swevent_hlist_release_rcu(struct rcu_head *rcu_head) kfree(hlist); } -static void swevent_hlist_release(struct perf_cpu_context *cpuctx) +static void swevent_hlist_release(struct swevent_htable *swhash) { - struct swevent_hlist *hlist = swevent_hlist_deref(cpuctx); + struct swevent_hlist *hlist = swevent_hlist_deref(swhash); if (!hlist) return; - rcu_assign_pointer(cpuctx->swevent_hlist, NULL); + rcu_assign_pointer(swhash->swevent_hlist, NULL); call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu); } static void swevent_hlist_put_cpu(struct perf_event *event, int cpu) { - struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); + struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); - mutex_lock(&cpuctx->hlist_mutex); + mutex_lock(&swhash->hlist_mutex); - if (!--cpuctx->hlist_refcount) - swevent_hlist_release(cpuctx); + if (!--swhash->hlist_refcount) + swevent_hlist_release(swhash); - mutex_unlock(&cpuctx->hlist_mutex); + mutex_unlock(&swhash->hlist_mutex); } static void swevent_hlist_put(struct perf_event *event) @@ -4537,12 +4607,12 @@ static void swevent_hlist_put(struct perf_event *event) static int swevent_hlist_get_cpu(struct perf_event *event, int cpu) { - struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); + struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); int err = 0; - mutex_lock(&cpuctx->hlist_mutex); + mutex_lock(&swhash->hlist_mutex); - if (!swevent_hlist_deref(cpuctx) && cpu_online(cpu)) { + if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) { struct swevent_hlist *hlist; hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); @@ -4550,11 +4620,11 @@ static int swevent_hlist_get_cpu(struct perf_event *event, int cpu) err = -ENOMEM; goto exit; } - rcu_assign_pointer(cpuctx->swevent_hlist, hlist); + rcu_assign_pointer(swhash->swevent_hlist, hlist); } - cpuctx->hlist_refcount++; - exit: - mutex_unlock(&cpuctx->hlist_mutex); + swhash->hlist_refcount++; +exit: + mutex_unlock(&swhash->hlist_mutex); return err; } @@ -4578,7 +4648,7 @@ static int swevent_hlist_get(struct perf_event *event) put_online_cpus(); return 0; - fail: +fail: for_each_possible_cpu(cpu) { if (cpu == failed_cpu) break; @@ -4589,17 +4659,64 @@ static int swevent_hlist_get(struct perf_event *event) return err; } -#ifdef CONFIG_EVENT_TRACING +atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX]; + +static void sw_perf_event_destroy(struct perf_event *event) +{ + u64 event_id = event->attr.config; + + WARN_ON(event->parent); + + jump_label_dec(&perf_swevent_enabled[event_id]); + swevent_hlist_put(event); +} + +static int perf_swevent_init(struct perf_event *event) +{ + int event_id = event->attr.config; + + if (event->attr.type != PERF_TYPE_SOFTWARE) + return -ENOENT; + + switch (event_id) { + case PERF_COUNT_SW_CPU_CLOCK: + case PERF_COUNT_SW_TASK_CLOCK: + return -ENOENT; -static const struct pmu perf_ops_tracepoint = { - .enable = perf_trace_enable, - .disable = perf_trace_disable, - .start = perf_swevent_int, - .stop = perf_swevent_void, + default: + break; + } + + if (event_id > PERF_COUNT_SW_MAX) + return -ENOENT; + + if (!event->parent) { + int err; + + err = swevent_hlist_get(event); + if (err) + return err; + + jump_label_inc(&perf_swevent_enabled[event_id]); + event->destroy = sw_perf_event_destroy; + } + + return 0; +} + +static struct pmu perf_swevent = { + .task_ctx_nr = perf_sw_context, + + .event_init = perf_swevent_init, + .add = perf_swevent_add, + .del = perf_swevent_del, + .start = perf_swevent_start, + .stop = perf_swevent_stop, .read = perf_swevent_read, - .unthrottle = perf_swevent_void, }; +#ifdef CONFIG_EVENT_TRACING + static int perf_tp_filter_match(struct perf_event *event, struct perf_sample_data *data) { @@ -4643,7 +4760,7 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, hlist_for_each_entry_rcu(event, node, head, hlist_entry) { if (perf_tp_event_match(event, &data, regs)) - perf_swevent_add(event, count, 1, &data, regs); + perf_swevent_event(event, count, 1, &data, regs); } perf_swevent_put_recursion_context(rctx); @@ -4655,10 +4772,13 @@ static void tp_perf_event_destroy(struct perf_event *event) perf_trace_destroy(event); } -static const struct pmu *tp_perf_event_init(struct perf_event *event) +static int perf_tp_event_init(struct perf_event *event) { int err; + if (event->attr.type != PERF_TYPE_TRACEPOINT) + return -ENOENT; + /* * Raw tracepoint data is a severe data leak, only allow root to * have these. @@ -4666,15 +4786,31 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event) if ((event->attr.sample_type & PERF_SAMPLE_RAW) && perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN)) - return ERR_PTR(-EPERM); + return -EPERM; err = perf_trace_init(event); if (err) - return NULL; + return err; event->destroy = tp_perf_event_destroy; - return &perf_ops_tracepoint; + return 0; +} + +static struct pmu perf_tracepoint = { + .task_ctx_nr = perf_sw_context, + + .event_init = perf_tp_event_init, + .add = perf_trace_add, + .del = perf_trace_del, + .start = perf_swevent_start, + .stop = perf_swevent_stop, + .read = perf_swevent_read, +}; + +static inline void perf_tp_register(void) +{ + perf_pmu_register(&perf_tracepoint); } static int perf_event_set_filter(struct perf_event *event, void __user *arg) @@ -4702,9 +4838,8 @@ static void perf_event_free_filter(struct perf_event *event) #else -static const struct pmu *tp_perf_event_init(struct perf_event *event) +static inline void perf_tp_register(void) { - return NULL; } static int perf_event_set_filter(struct perf_event *event, void __user *arg) @@ -4719,105 +4854,389 @@ static void perf_event_free_filter(struct perf_event *event) #endif /* CONFIG_EVENT_TRACING */ #ifdef CONFIG_HAVE_HW_BREAKPOINT -static void bp_perf_event_destroy(struct perf_event *event) +void perf_bp_event(struct perf_event *bp, void *data) { - release_bp_slot(event); + struct perf_sample_data sample; + struct pt_regs *regs = data; + + perf_sample_data_init(&sample, bp->attr.bp_addr); + + if (!bp->hw.state && !perf_exclude_event(bp, regs)) + perf_swevent_event(bp, 1, 1, &sample, regs); } +#endif -static const struct pmu *bp_perf_event_init(struct perf_event *bp) +/* + * hrtimer based swevent callback + */ + +static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) { - int err; + enum hrtimer_restart ret = HRTIMER_RESTART; + struct perf_sample_data data; + struct pt_regs *regs; + struct perf_event *event; + u64 period; - err = register_perf_hw_breakpoint(bp); - if (err) - return ERR_PTR(err); + event = container_of(hrtimer, struct perf_event, hw.hrtimer); + event->pmu->read(event); + + perf_sample_data_init(&data, 0); + data.period = event->hw.last_period; + regs = get_irq_regs(); + + if (regs && !perf_exclude_event(event, regs)) { + if (!(event->attr.exclude_idle && current->pid == 0)) + if (perf_event_overflow(event, 0, &data, regs)) + ret = HRTIMER_NORESTART; + } + + period = max_t(u64, 10000, event->hw.sample_period); + hrtimer_forward_now(hrtimer, ns_to_ktime(period)); - bp->destroy = bp_perf_event_destroy; + return ret; +} - return &perf_ops_bp; +static void perf_swevent_start_hrtimer(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hwc->hrtimer.function = perf_swevent_hrtimer; + if (hwc->sample_period) { + s64 period = local64_read(&hwc->period_left); + + if (period) { + if (period < 0) + period = 10000; + + local64_set(&hwc->period_left, 0); + } else { + period = max_t(u64, 10000, hwc->sample_period); + } + __hrtimer_start_range_ns(&hwc->hrtimer, + ns_to_ktime(period), 0, + HRTIMER_MODE_REL_PINNED, 0); + } } -void perf_bp_event(struct perf_event *bp, void *data) +static void perf_swevent_cancel_hrtimer(struct perf_event *event) { - struct perf_sample_data sample; - struct pt_regs *regs = data; + struct hw_perf_event *hwc = &event->hw; - perf_sample_data_init(&sample, bp->attr.bp_addr); + if (hwc->sample_period) { + ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer); + local64_set(&hwc->period_left, ktime_to_ns(remaining)); - if (!perf_exclude_event(bp, regs)) - perf_swevent_add(bp, 1, 1, &sample, regs); + hrtimer_cancel(&hwc->hrtimer); + } } -#else -static const struct pmu *bp_perf_event_init(struct perf_event *bp) + +/* + * Software event: cpu wall time clock + */ + +static void cpu_clock_event_update(struct perf_event *event) { - return NULL; + s64 prev; + u64 now; + + now = local_clock(); + prev = local64_xchg(&event->hw.prev_count, now); + local64_add(now - prev, &event->count); } -void perf_bp_event(struct perf_event *bp, void *regs) +static void cpu_clock_event_start(struct perf_event *event, int flags) { + local64_set(&event->hw.prev_count, local_clock()); + perf_swevent_start_hrtimer(event); } -#endif -atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX]; +static void cpu_clock_event_stop(struct perf_event *event, int flags) +{ + perf_swevent_cancel_hrtimer(event); + cpu_clock_event_update(event); +} -static void sw_perf_event_destroy(struct perf_event *event) +static int cpu_clock_event_add(struct perf_event *event, int flags) { - u64 event_id = event->attr.config; + if (flags & PERF_EF_START) + cpu_clock_event_start(event, flags); - WARN_ON(event->parent); + return 0; +} - atomic_dec(&perf_swevent_enabled[event_id]); - swevent_hlist_put(event); +static void cpu_clock_event_del(struct perf_event *event, int flags) +{ + cpu_clock_event_stop(event, flags); } -static const struct pmu *sw_perf_event_init(struct perf_event *event) +static void cpu_clock_event_read(struct perf_event *event) { - const struct pmu *pmu = NULL; - u64 event_id = event->attr.config; + cpu_clock_event_update(event); +} + +static int cpu_clock_event_init(struct perf_event *event) +{ + if (event->attr.type != PERF_TYPE_SOFTWARE) + return -ENOENT; + + if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK) + return -ENOENT; + + return 0; +} +static struct pmu perf_cpu_clock = { + .task_ctx_nr = perf_sw_context, + + .event_init = cpu_clock_event_init, + .add = cpu_clock_event_add, + .del = cpu_clock_event_del, + .start = cpu_clock_event_start, + .stop = cpu_clock_event_stop, + .read = cpu_clock_event_read, +}; + +/* + * Software event: task time clock + */ + +static void task_clock_event_update(struct perf_event *event, u64 now) +{ + u64 prev; + s64 delta; + + prev = local64_xchg(&event->hw.prev_count, now); + delta = now - prev; + local64_add(delta, &event->count); +} + +static void task_clock_event_start(struct perf_event *event, int flags) +{ + local64_set(&event->hw.prev_count, event->ctx->time); + perf_swevent_start_hrtimer(event); +} + +static void task_clock_event_stop(struct perf_event *event, int flags) +{ + perf_swevent_cancel_hrtimer(event); + task_clock_event_update(event, event->ctx->time); +} + +static int task_clock_event_add(struct perf_event *event, int flags) +{ + if (flags & PERF_EF_START) + task_clock_event_start(event, flags); + + return 0; +} + +static void task_clock_event_del(struct perf_event *event, int flags) +{ + task_clock_event_stop(event, PERF_EF_UPDATE); +} + +static void task_clock_event_read(struct perf_event *event) +{ + u64 time; + + if (!in_nmi()) { + update_context_time(event->ctx); + time = event->ctx->time; + } else { + u64 now = perf_clock(); + u64 delta = now - event->ctx->timestamp; + time = event->ctx->time + delta; + } + + task_clock_event_update(event, time); +} + +static int task_clock_event_init(struct perf_event *event) +{ + if (event->attr.type != PERF_TYPE_SOFTWARE) + return -ENOENT; + + if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK) + return -ENOENT; + + return 0; +} + +static struct pmu perf_task_clock = { + .task_ctx_nr = perf_sw_context, + + .event_init = task_clock_event_init, + .add = task_clock_event_add, + .del = task_clock_event_del, + .start = task_clock_event_start, + .stop = task_clock_event_stop, + .read = task_clock_event_read, +}; + +static void perf_pmu_nop_void(struct pmu *pmu) +{ +} + +static int perf_pmu_nop_int(struct pmu *pmu) +{ + return 0; +} + +static void perf_pmu_start_txn(struct pmu *pmu) +{ + perf_pmu_disable(pmu); +} + +static int perf_pmu_commit_txn(struct pmu *pmu) +{ + perf_pmu_enable(pmu); + return 0; +} + +static void perf_pmu_cancel_txn(struct pmu *pmu) +{ + perf_pmu_enable(pmu); +} + +/* + * Ensures all contexts with the same task_ctx_nr have the same + * pmu_cpu_context too. + */ +static void *find_pmu_context(int ctxn) +{ + struct pmu *pmu; + + if (ctxn < 0) + return NULL; + + list_for_each_entry(pmu, &pmus, entry) { + if (pmu->task_ctx_nr == ctxn) + return pmu->pmu_cpu_context; + } + + return NULL; +} + +static void free_pmu_context(void * __percpu cpu_context) +{ + struct pmu *pmu; + + mutex_lock(&pmus_lock); /* - * Software events (currently) can't in general distinguish - * between user, kernel and hypervisor events. - * However, context switches and cpu migrations are considered - * to be kernel events, and page faults are never hypervisor - * events. + * Like a real lame refcount. */ - switch (event_id) { - case PERF_COUNT_SW_CPU_CLOCK: - pmu = &perf_ops_cpu_clock; + list_for_each_entry(pmu, &pmus, entry) { + if (pmu->pmu_cpu_context == cpu_context) + goto out; + } - break; - case PERF_COUNT_SW_TASK_CLOCK: - /* - * If the user instantiates this as a per-cpu event, - * use the cpu_clock event instead. - */ - if (event->ctx->task) - pmu = &perf_ops_task_clock; - else - pmu = &perf_ops_cpu_clock; + free_percpu(cpu_context); +out: + mutex_unlock(&pmus_lock); +} - break; - case PERF_COUNT_SW_PAGE_FAULTS: - case PERF_COUNT_SW_PAGE_FAULTS_MIN: - case PERF_COUNT_SW_PAGE_FAULTS_MAJ: - case PERF_COUNT_SW_CONTEXT_SWITCHES: - case PERF_COUNT_SW_CPU_MIGRATIONS: - case PERF_COUNT_SW_ALIGNMENT_FAULTS: - case PERF_COUNT_SW_EMULATION_FAULTS: - if (!event->parent) { - int err; - - err = swevent_hlist_get(event); - if (err) - return ERR_PTR(err); +int perf_pmu_register(struct pmu *pmu) +{ + int cpu, ret; + + mutex_lock(&pmus_lock); + ret = -ENOMEM; + pmu->pmu_disable_count = alloc_percpu(int); + if (!pmu->pmu_disable_count) + goto unlock; - atomic_inc(&perf_swevent_enabled[event_id]); - event->destroy = sw_perf_event_destroy; + pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr); + if (pmu->pmu_cpu_context) + goto got_cpu_context; + + pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context); + if (!pmu->pmu_cpu_context) + goto free_pdc; + + for_each_possible_cpu(cpu) { + struct perf_cpu_context *cpuctx; + + cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); + __perf_event_init_context(&cpuctx->ctx); + cpuctx->ctx.type = cpu_context; + cpuctx->ctx.pmu = pmu; + cpuctx->jiffies_interval = 1; + INIT_LIST_HEAD(&cpuctx->rotation_list); + } + +got_cpu_context: + if (!pmu->start_txn) { + if (pmu->pmu_enable) { + /* + * If we have pmu_enable/pmu_disable calls, install + * transaction stubs that use that to try and batch + * hardware accesses. + */ + pmu->start_txn = perf_pmu_start_txn; + pmu->commit_txn = perf_pmu_commit_txn; + pmu->cancel_txn = perf_pmu_cancel_txn; + } else { + pmu->start_txn = perf_pmu_nop_void; + pmu->commit_txn = perf_pmu_nop_int; + pmu->cancel_txn = perf_pmu_nop_void; + } + } + + if (!pmu->pmu_enable) { + pmu->pmu_enable = perf_pmu_nop_void; + pmu->pmu_disable = perf_pmu_nop_void; + } + + list_add_rcu(&pmu->entry, &pmus); + ret = 0; +unlock: + mutex_unlock(&pmus_lock); + + return ret; + +free_pdc: + free_percpu(pmu->pmu_disable_count); + goto unlock; +} + +void perf_pmu_unregister(struct pmu *pmu) +{ + mutex_lock(&pmus_lock); + list_del_rcu(&pmu->entry); + mutex_unlock(&pmus_lock); + + /* + * We dereference the pmu list under both SRCU and regular RCU, so + * synchronize against both of those. + */ + synchronize_srcu(&pmus_srcu); + synchronize_rcu(); + + free_percpu(pmu->pmu_disable_count); + free_pmu_context(pmu->pmu_cpu_context); +} + +struct pmu *perf_init_event(struct perf_event *event) +{ + struct pmu *pmu = NULL; + int idx; + + idx = srcu_read_lock(&pmus_srcu); + list_for_each_entry_rcu(pmu, &pmus, entry) { + int ret = pmu->event_init(event); + if (!ret) + goto unlock; + + if (ret != -ENOENT) { + pmu = ERR_PTR(ret); + goto unlock; } - pmu = &perf_ops_generic; - break; } + pmu = ERR_PTR(-ENOENT); +unlock: + srcu_read_unlock(&pmus_srcu, idx); return pmu; } @@ -4826,20 +5245,18 @@ static const struct pmu *sw_perf_event_init(struct perf_event *event) * Allocate and initialize a event structure */ static struct perf_event * -perf_event_alloc(struct perf_event_attr *attr, - int cpu, - struct perf_event_context *ctx, - struct perf_event *group_leader, - struct perf_event *parent_event, - perf_overflow_handler_t overflow_handler, - gfp_t gfpflags) -{ - const struct pmu *pmu; +perf_event_alloc(struct perf_event_attr *attr, int cpu, + struct task_struct *task, + struct perf_event *group_leader, + struct perf_event *parent_event, + perf_overflow_handler_t overflow_handler) +{ + struct pmu *pmu; struct perf_event *event; struct hw_perf_event *hwc; long err; - event = kzalloc(sizeof(*event), gfpflags); + event = kzalloc(sizeof(*event), GFP_KERNEL); if (!event) return ERR_PTR(-ENOMEM); @@ -4857,6 +5274,7 @@ perf_event_alloc(struct perf_event_attr *attr, INIT_LIST_HEAD(&event->event_entry); INIT_LIST_HEAD(&event->sibling_list); init_waitqueue_head(&event->waitq); + init_irq_work(&event->pending, perf_pending_event); mutex_init(&event->mmap_mutex); @@ -4864,7 +5282,6 @@ perf_event_alloc(struct perf_event_attr *attr, event->attr = *attr; event->group_leader = group_leader; event->pmu = NULL; - event->ctx = ctx; event->oncpu = -1; event->parent = parent_event; @@ -4874,6 +5291,17 @@ perf_event_alloc(struct perf_event_attr *attr, event->state = PERF_EVENT_STATE_INACTIVE; + if (task) { + event->attach_state = PERF_ATTACH_TASK; +#ifdef CONFIG_HAVE_HW_BREAKPOINT + /* + * hw_breakpoint is a bit difficult here.. + */ + if (attr->type == PERF_TYPE_BREAKPOINT) + event->hw.bp_target = task; +#endif + } + if (!overflow_handler && parent_event) overflow_handler = parent_event->overflow_handler; @@ -4898,29 +5326,8 @@ perf_event_alloc(struct perf_event_attr *attr, if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) goto done; - switch (attr->type) { - case PERF_TYPE_RAW: - case PERF_TYPE_HARDWARE: - case PERF_TYPE_HW_CACHE: - pmu = hw_perf_event_init(event); - break; - - case PERF_TYPE_SOFTWARE: - pmu = sw_perf_event_init(event); - break; - - case PERF_TYPE_TRACEPOINT: - pmu = tp_perf_event_init(event); - break; + pmu = perf_init_event(event); - case PERF_TYPE_BREAKPOINT: - pmu = bp_perf_event_init(event); - break; - - - default: - break; - } done: err = 0; if (!pmu) @@ -4938,13 +5345,21 @@ done: event->pmu = pmu; if (!event->parent) { - atomic_inc(&nr_events); + if (event->attach_state & PERF_ATTACH_TASK) + jump_label_inc(&perf_task_events); if (event->attr.mmap || event->attr.mmap_data) atomic_inc(&nr_mmap_events); if (event->attr.comm) atomic_inc(&nr_comm_events); if (event->attr.task) atomic_inc(&nr_task_events); + if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { + err = get_callchain_buffers(); + if (err) { + free_event(event); + return ERR_PTR(err); + } + } } return event; @@ -5092,12 +5507,16 @@ SYSCALL_DEFINE5(perf_event_open, struct perf_event_attr __user *, attr_uptr, pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) { - struct perf_event *event, *group_leader = NULL, *output_event = NULL; + struct perf_event *group_leader = NULL, *output_event = NULL; + struct perf_event *event, *sibling; struct perf_event_attr attr; struct perf_event_context *ctx; struct file *event_file = NULL; struct file *group_file = NULL; + struct task_struct *task = NULL; + struct pmu *pmu; int event_fd; + int move_group = 0; int fput_needed = 0; int err; @@ -5123,20 +5542,11 @@ SYSCALL_DEFINE5(perf_event_open, if (event_fd < 0) return event_fd; - /* - * Get the target context (task or percpu): - */ - ctx = find_get_context(pid, cpu); - if (IS_ERR(ctx)) { - err = PTR_ERR(ctx); - goto err_fd; - } - if (group_fd != -1) { group_leader = perf_fget_light(group_fd, &fput_needed); if (IS_ERR(group_leader)) { err = PTR_ERR(group_leader); - goto err_put_context; + goto err_fd; } group_file = group_leader->filp; if (flags & PERF_FLAG_FD_OUTPUT) @@ -5145,6 +5555,58 @@ SYSCALL_DEFINE5(perf_event_open, group_leader = NULL; } + if (pid != -1) { + task = find_lively_task_by_vpid(pid); + if (IS_ERR(task)) { + err = PTR_ERR(task); + goto err_group_fd; + } + } + + event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, NULL); + if (IS_ERR(event)) { + err = PTR_ERR(event); + goto err_task; + } + + /* + * Special case software events and allow them to be part of + * any hardware group. + */ + pmu = event->pmu; + + if (group_leader && + (is_software_event(event) != is_software_event(group_leader))) { + if (is_software_event(event)) { + /* + * If event and group_leader are not both a software + * event, and event is, then group leader is not. + * + * Allow the addition of software events to !software + * groups, this is safe because software events never + * fail to schedule. + */ + pmu = group_leader->pmu; + } else if (is_software_event(group_leader) && + (group_leader->group_flags & PERF_GROUP_SOFTWARE)) { + /* + * In case the group is a pure software group, and we + * try to add a hardware event, move the whole group to + * the hardware context. + */ + move_group = 1; + } + } + + /* + * Get the target context (task or percpu): + */ + ctx = find_get_context(pmu, task, cpu); + if (IS_ERR(ctx)) { + err = PTR_ERR(ctx); + goto err_alloc; + } + /* * Look up the group leader (we will attach this event to it): */ @@ -5156,42 +5618,66 @@ SYSCALL_DEFINE5(perf_event_open, * becoming part of another group-sibling): */ if (group_leader->group_leader != group_leader) - goto err_put_context; + goto err_context; /* * Do not allow to attach to a group in a different * task or CPU context: */ - if (group_leader->ctx != ctx) - goto err_put_context; + if (move_group) { + if (group_leader->ctx->type != ctx->type) + goto err_context; + } else { + if (group_leader->ctx != ctx) + goto err_context; + } + /* * Only a group leader can be exclusive or pinned */ if (attr.exclusive || attr.pinned) - goto err_put_context; - } - - event = perf_event_alloc(&attr, cpu, ctx, group_leader, - NULL, NULL, GFP_KERNEL); - if (IS_ERR(event)) { - err = PTR_ERR(event); - goto err_put_context; + goto err_context; } if (output_event) { err = perf_event_set_output(event, output_event); if (err) - goto err_free_put_context; + goto err_context; } event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR); if (IS_ERR(event_file)) { err = PTR_ERR(event_file); - goto err_free_put_context; + goto err_context; + } + + if (move_group) { + struct perf_event_context *gctx = group_leader->ctx; + + mutex_lock(&gctx->mutex); + perf_event_remove_from_context(group_leader); + list_for_each_entry(sibling, &group_leader->sibling_list, + group_entry) { + perf_event_remove_from_context(sibling); + put_ctx(gctx); + } + mutex_unlock(&gctx->mutex); + put_ctx(gctx); } event->filp = event_file; WARN_ON_ONCE(ctx->parent_ctx); mutex_lock(&ctx->mutex); + + if (move_group) { + perf_install_in_context(ctx, group_leader, cpu); + get_ctx(ctx); + list_for_each_entry(sibling, &group_leader->sibling_list, + group_entry) { + perf_install_in_context(ctx, sibling, cpu); + get_ctx(ctx); + } + } + perf_install_in_context(ctx, event, cpu); ++ctx->generation; mutex_unlock(&ctx->mutex); @@ -5212,11 +5698,15 @@ SYSCALL_DEFINE5(perf_event_open, fd_install(event_fd, event_file); return event_fd; -err_free_put_context: +err_context: + put_ctx(ctx); +err_alloc: free_event(event); -err_put_context: +err_task: + if (task) + put_task_struct(task); +err_group_fd: fput_light(group_file, fput_needed); - put_ctx(ctx); err_fd: put_unused_fd(event_fd); return err; @@ -5227,32 +5717,31 @@ err_fd: * * @attr: attributes of the counter to create * @cpu: cpu in which the counter is bound - * @pid: task to profile + * @task: task to profile (NULL for percpu) */ struct perf_event * perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, - pid_t pid, + struct task_struct *task, perf_overflow_handler_t overflow_handler) { - struct perf_event *event; struct perf_event_context *ctx; + struct perf_event *event; int err; /* * Get the target context (task or percpu): */ - ctx = find_get_context(pid, cpu); - if (IS_ERR(ctx)) { - err = PTR_ERR(ctx); - goto err_exit; - } - - event = perf_event_alloc(attr, cpu, ctx, NULL, - NULL, overflow_handler, GFP_KERNEL); + event = perf_event_alloc(attr, cpu, task, NULL, NULL, overflow_handler); if (IS_ERR(event)) { err = PTR_ERR(event); - goto err_put_context; + goto err; + } + + ctx = find_get_context(event->pmu, task, cpu); + if (IS_ERR(ctx)) { + err = PTR_ERR(ctx); + goto err_free; } event->filp = NULL; @@ -5270,112 +5759,13 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, return event; - err_put_context: - put_ctx(ctx); - err_exit: +err_free: + free_event(event); +err: return ERR_PTR(err); } EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter); -/* - * inherit a event from parent task to child task: - */ -static struct perf_event * -inherit_event(struct perf_event *parent_event, - struct task_struct *parent, - struct perf_event_context *parent_ctx, - struct task_struct *child, - struct perf_event *group_leader, - struct perf_event_context *child_ctx) -{ - struct perf_event *child_event; - - /* - * Instead of creating recursive hierarchies of events, - * we link inherited events back to the original parent, - * which has a filp for sure, which we use as the reference - * count: - */ - if (parent_event->parent) - parent_event = parent_event->parent; - - child_event = perf_event_alloc(&parent_event->attr, - parent_event->cpu, child_ctx, - group_leader, parent_event, - NULL, GFP_KERNEL); - if (IS_ERR(child_event)) - return child_event; - get_ctx(child_ctx); - - /* - * Make the child state follow the state of the parent event, - * not its attr.disabled bit. We hold the parent's mutex, - * so we won't race with perf_event_{en, dis}able_family. - */ - if (parent_event->state >= PERF_EVENT_STATE_INACTIVE) - child_event->state = PERF_EVENT_STATE_INACTIVE; - else - child_event->state = PERF_EVENT_STATE_OFF; - - if (parent_event->attr.freq) { - u64 sample_period = parent_event->hw.sample_period; - struct hw_perf_event *hwc = &child_event->hw; - - hwc->sample_period = sample_period; - hwc->last_period = sample_period; - - local64_set(&hwc->period_left, sample_period); - } - - child_event->overflow_handler = parent_event->overflow_handler; - - /* - * Link it up in the child's context: - */ - add_event_to_ctx(child_event, child_ctx); - - /* - * Get a reference to the parent filp - we will fput it - * when the child event exits. This is safe to do because - * we are in the parent and we know that the filp still - * exists and has a nonzero count: - */ - atomic_long_inc(&parent_event->filp->f_count); - - /* - * Link this into the parent event's child list - */ - WARN_ON_ONCE(parent_event->ctx->parent_ctx); - mutex_lock(&parent_event->child_mutex); - list_add_tail(&child_event->child_list, &parent_event->child_list); - mutex_unlock(&parent_event->child_mutex); - - return child_event; -} - -static int inherit_group(struct perf_event *parent_event, - struct task_struct *parent, - struct perf_event_context *parent_ctx, - struct task_struct *child, - struct perf_event_context *child_ctx) -{ - struct perf_event *leader; - struct perf_event *sub; - struct perf_event *child_ctr; - - leader = inherit_event(parent_event, parent, parent_ctx, - child, NULL, child_ctx); - if (IS_ERR(leader)) - return PTR_ERR(leader); - list_for_each_entry(sub, &parent_event->sibling_list, group_entry) { - child_ctr = inherit_event(sub, parent, parent_ctx, - child, leader, child_ctx); - if (IS_ERR(child_ctr)) - return PTR_ERR(child_ctr); - } - return 0; -} - static void sync_child_event(struct perf_event *child_event, struct task_struct *child) { @@ -5432,16 +5822,13 @@ __perf_event_exit_task(struct perf_event *child_event, } } -/* - * When a child task exits, feed back event values to parent events. - */ -void perf_event_exit_task(struct task_struct *child) +static void perf_event_exit_task_context(struct task_struct *child, int ctxn) { struct perf_event *child_event, *tmp; struct perf_event_context *child_ctx; unsigned long flags; - if (likely(!child->perf_event_ctxp)) { + if (likely(!child->perf_event_ctxp[ctxn])) { perf_event_task(child, NULL, 0); return; } @@ -5453,8 +5840,8 @@ void perf_event_exit_task(struct task_struct *child) * scheduled, so we are now safe from rescheduling changing * our context. */ - child_ctx = child->perf_event_ctxp; - __perf_event_task_sched_out(child_ctx); + child_ctx = child->perf_event_ctxp[ctxn]; + task_ctx_sched_out(child_ctx, EVENT_ALL); /* * Take the context lock here so that if find_get_context is @@ -5462,7 +5849,7 @@ void perf_event_exit_task(struct task_struct *child) * incremented the context's refcount before we do put_ctx below. */ raw_spin_lock(&child_ctx->lock); - child->perf_event_ctxp = NULL; + child->perf_event_ctxp[ctxn] = NULL; /* * If this context is a clone; unclone it so it can't get * swapped to another process while we're removing all @@ -5515,6 +5902,17 @@ again: put_ctx(child_ctx); } +/* + * When a child task exits, feed back event values to parent events. + */ +void perf_event_exit_task(struct task_struct *child) +{ + int ctxn; + + for_each_task_context_nr(ctxn) + perf_event_exit_task_context(child, ctxn); +} + static void perf_free_event(struct perf_event *event, struct perf_event_context *ctx) { @@ -5536,48 +5934,166 @@ static void perf_free_event(struct perf_event *event, /* * free an unexposed, unused context as created by inheritance by - * init_task below, used by fork() in case of fail. + * perf_event_init_task below, used by fork() in case of fail. */ void perf_event_free_task(struct task_struct *task) { - struct perf_event_context *ctx = task->perf_event_ctxp; + struct perf_event_context *ctx; struct perf_event *event, *tmp; + int ctxn; - if (!ctx) - return; + for_each_task_context_nr(ctxn) { + ctx = task->perf_event_ctxp[ctxn]; + if (!ctx) + continue; - mutex_lock(&ctx->mutex); + mutex_lock(&ctx->mutex); again: - list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) - perf_free_event(event, ctx); + list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, + group_entry) + perf_free_event(event, ctx); - list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, - group_entry) - perf_free_event(event, ctx); + list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, + group_entry) + perf_free_event(event, ctx); - if (!list_empty(&ctx->pinned_groups) || - !list_empty(&ctx->flexible_groups)) - goto again; + if (!list_empty(&ctx->pinned_groups) || + !list_empty(&ctx->flexible_groups)) + goto again; - mutex_unlock(&ctx->mutex); + mutex_unlock(&ctx->mutex); - put_ctx(ctx); + put_ctx(ctx); + } +} + +void perf_event_delayed_put(struct task_struct *task) +{ + int ctxn; + + for_each_task_context_nr(ctxn) + WARN_ON_ONCE(task->perf_event_ctxp[ctxn]); +} + +/* + * inherit a event from parent task to child task: + */ +static struct perf_event * +inherit_event(struct perf_event *parent_event, + struct task_struct *parent, + struct perf_event_context *parent_ctx, + struct task_struct *child, + struct perf_event *group_leader, + struct perf_event_context *child_ctx) +{ + struct perf_event *child_event; + unsigned long flags; + + /* + * Instead of creating recursive hierarchies of events, + * we link inherited events back to the original parent, + * which has a filp for sure, which we use as the reference + * count: + */ + if (parent_event->parent) + parent_event = parent_event->parent; + + child_event = perf_event_alloc(&parent_event->attr, + parent_event->cpu, + child, + group_leader, parent_event, + NULL); + if (IS_ERR(child_event)) + return child_event; + get_ctx(child_ctx); + + /* + * Make the child state follow the state of the parent event, + * not its attr.disabled bit. We hold the parent's mutex, + * so we won't race with perf_event_{en, dis}able_family. + */ + if (parent_event->state >= PERF_EVENT_STATE_INACTIVE) + child_event->state = PERF_EVENT_STATE_INACTIVE; + else + child_event->state = PERF_EVENT_STATE_OFF; + + if (parent_event->attr.freq) { + u64 sample_period = parent_event->hw.sample_period; + struct hw_perf_event *hwc = &child_event->hw; + + hwc->sample_period = sample_period; + hwc->last_period = sample_period; + + local64_set(&hwc->period_left, sample_period); + } + + child_event->ctx = child_ctx; + child_event->overflow_handler = parent_event->overflow_handler; + + /* + * Link it up in the child's context: + */ + raw_spin_lock_irqsave(&child_ctx->lock, flags); + add_event_to_ctx(child_event, child_ctx); + raw_spin_unlock_irqrestore(&child_ctx->lock, flags); + + /* + * Get a reference to the parent filp - we will fput it + * when the child event exits. This is safe to do because + * we are in the parent and we know that the filp still + * exists and has a nonzero count: + */ + atomic_long_inc(&parent_event->filp->f_count); + + /* + * Link this into the parent event's child list + */ + WARN_ON_ONCE(parent_event->ctx->parent_ctx); + mutex_lock(&parent_event->child_mutex); + list_add_tail(&child_event->child_list, &parent_event->child_list); + mutex_unlock(&parent_event->child_mutex); + + return child_event; +} + +static int inherit_group(struct perf_event *parent_event, + struct task_struct *parent, + struct perf_event_context *parent_ctx, + struct task_struct *child, + struct perf_event_context *child_ctx) +{ + struct perf_event *leader; + struct perf_event *sub; + struct perf_event *child_ctr; + + leader = inherit_event(parent_event, parent, parent_ctx, + child, NULL, child_ctx); + if (IS_ERR(leader)) + return PTR_ERR(leader); + list_for_each_entry(sub, &parent_event->sibling_list, group_entry) { + child_ctr = inherit_event(sub, parent, parent_ctx, + child, leader, child_ctx); + if (IS_ERR(child_ctr)) + return PTR_ERR(child_ctr); + } + return 0; } static int inherit_task_group(struct perf_event *event, struct task_struct *parent, struct perf_event_context *parent_ctx, - struct task_struct *child, + struct task_struct *child, int ctxn, int *inherited_all) { int ret; - struct perf_event_context *child_ctx = child->perf_event_ctxp; + struct perf_event_context *child_ctx; if (!event->attr.inherit) { *inherited_all = 0; return 0; } + child_ctx = child->perf_event_ctxp[ctxn]; if (!child_ctx) { /* * This is executed from the parent task context, so @@ -5586,14 +6102,11 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent, * child. */ - child_ctx = kzalloc(sizeof(struct perf_event_context), - GFP_KERNEL); + child_ctx = alloc_perf_context(event->pmu, child); if (!child_ctx) return -ENOMEM; - __perf_event_init_context(child_ctx, child); - child->perf_event_ctxp = child_ctx; - get_task_struct(child); + child->perf_event_ctxp[ctxn] = child_ctx; } ret = inherit_group(event, parent, parent_ctx, @@ -5605,11 +6118,10 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent, return ret; } - /* * Initialize the perf_event context in task_struct */ -int perf_event_init_task(struct task_struct *child) +int perf_event_init_context(struct task_struct *child, int ctxn) { struct perf_event_context *child_ctx, *parent_ctx; struct perf_event_context *cloned_ctx; @@ -5618,19 +6130,19 @@ int perf_event_init_task(struct task_struct *child) int inherited_all = 1; int ret = 0; - child->perf_event_ctxp = NULL; + child->perf_event_ctxp[ctxn] = NULL; mutex_init(&child->perf_event_mutex); INIT_LIST_HEAD(&child->perf_event_list); - if (likely(!parent->perf_event_ctxp)) + if (likely(!parent->perf_event_ctxp[ctxn])) return 0; /* * If the parent's context is a clone, pin it so it won't get * swapped under us. */ - parent_ctx = perf_pin_task_context(parent); + parent_ctx = perf_pin_task_context(parent, ctxn); /* * No need to check if parent_ctx != NULL here; since we saw @@ -5650,20 +6162,20 @@ int perf_event_init_task(struct task_struct *child) * the list, not manipulating it: */ list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) { - ret = inherit_task_group(event, parent, parent_ctx, child, - &inherited_all); + ret = inherit_task_group(event, parent, parent_ctx, + child, ctxn, &inherited_all); if (ret) break; } list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) { - ret = inherit_task_group(event, parent, parent_ctx, child, - &inherited_all); + ret = inherit_task_group(event, parent, parent_ctx, + child, ctxn, &inherited_all); if (ret) break; } - child_ctx = child->perf_event_ctxp; + child_ctx = child->perf_event_ctxp[ctxn]; if (child_ctx && inherited_all) { /* @@ -5692,63 +6204,98 @@ int perf_event_init_task(struct task_struct *child) return ret; } +/* + * Initialize the perf_event context in task_struct + */ +int perf_event_init_task(struct task_struct *child) +{ + int ctxn, ret; + + for_each_task_context_nr(ctxn) { + ret = perf_event_init_context(child, ctxn); + if (ret) + return ret; + } + + return 0; +} + static void __init perf_event_init_all_cpus(void) { + struct swevent_htable *swhash; int cpu; - struct perf_cpu_context *cpuctx; for_each_possible_cpu(cpu) { - cpuctx = &per_cpu(perf_cpu_context, cpu); - mutex_init(&cpuctx->hlist_mutex); - __perf_event_init_context(&cpuctx->ctx, NULL); + swhash = &per_cpu(swevent_htable, cpu); + mutex_init(&swhash->hlist_mutex); + INIT_LIST_HEAD(&per_cpu(rotation_list, cpu)); } } static void __cpuinit perf_event_init_cpu(int cpu) { - struct perf_cpu_context *cpuctx; - - cpuctx = &per_cpu(perf_cpu_context, cpu); + struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); - spin_lock(&perf_resource_lock); - cpuctx->max_pertask = perf_max_events - perf_reserved_percpu; - spin_unlock(&perf_resource_lock); - - mutex_lock(&cpuctx->hlist_mutex); - if (cpuctx->hlist_refcount > 0) { + mutex_lock(&swhash->hlist_mutex); + if (swhash->hlist_refcount > 0) { struct swevent_hlist *hlist; - hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); - WARN_ON_ONCE(!hlist); - rcu_assign_pointer(cpuctx->swevent_hlist, hlist); + hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu)); + WARN_ON(!hlist); + rcu_assign_pointer(swhash->swevent_hlist, hlist); } - mutex_unlock(&cpuctx->hlist_mutex); + mutex_unlock(&swhash->hlist_mutex); } #ifdef CONFIG_HOTPLUG_CPU -static void __perf_event_exit_cpu(void *info) +static void perf_pmu_rotate_stop(struct pmu *pmu) { - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); - struct perf_event_context *ctx = &cpuctx->ctx; + struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + + WARN_ON(!irqs_disabled()); + + list_del_init(&cpuctx->rotation_list); +} + +static void __perf_event_exit_context(void *__info) +{ + struct perf_event_context *ctx = __info; struct perf_event *event, *tmp; + perf_pmu_rotate_stop(ctx->pmu); + list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) __perf_event_remove_from_context(event); list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry) __perf_event_remove_from_context(event); } + +static void perf_event_exit_cpu_context(int cpu) +{ + struct perf_event_context *ctx; + struct pmu *pmu; + int idx; + + idx = srcu_read_lock(&pmus_srcu); + list_for_each_entry_rcu(pmu, &pmus, entry) { + ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx; + + mutex_lock(&ctx->mutex); + smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1); + mutex_unlock(&ctx->mutex); + } + srcu_read_unlock(&pmus_srcu, idx); +} + static void perf_event_exit_cpu(int cpu) { - struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); - struct perf_event_context *ctx = &cpuctx->ctx; + struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); - mutex_lock(&cpuctx->hlist_mutex); - swevent_hlist_release(cpuctx); - mutex_unlock(&cpuctx->hlist_mutex); + mutex_lock(&swhash->hlist_mutex); + swevent_hlist_release(swhash); + mutex_unlock(&swhash->hlist_mutex); - mutex_lock(&ctx->mutex); - smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1); - mutex_unlock(&ctx->mutex); + perf_event_exit_cpu_context(cpu); } #else static inline void perf_event_exit_cpu(int cpu) { } @@ -5778,118 +6325,13 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) return NOTIFY_OK; } -/* - * This has to have a higher priority than migration_notifier in sched.c. - */ -static struct notifier_block __cpuinitdata perf_cpu_nb = { - .notifier_call = perf_cpu_notify, - .priority = 20, -}; - void __init perf_event_init(void) { perf_event_init_all_cpus(); - perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE, - (void *)(long)smp_processor_id()); - perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE, - (void *)(long)smp_processor_id()); - register_cpu_notifier(&perf_cpu_nb); -} - -static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, - struct sysdev_class_attribute *attr, - char *buf) -{ - return sprintf(buf, "%d\n", perf_reserved_percpu); -} - -static ssize_t -perf_set_reserve_percpu(struct sysdev_class *class, - struct sysdev_class_attribute *attr, - const char *buf, - size_t count) -{ - struct perf_cpu_context *cpuctx; - unsigned long val; - int err, cpu, mpt; - - err = strict_strtoul(buf, 10, &val); - if (err) - return err; - if (val > perf_max_events) - return -EINVAL; - - spin_lock(&perf_resource_lock); - perf_reserved_percpu = val; - for_each_online_cpu(cpu) { - cpuctx = &per_cpu(perf_cpu_context, cpu); - raw_spin_lock_irq(&cpuctx->ctx.lock); - mpt = min(perf_max_events - cpuctx->ctx.nr_events, - perf_max_events - perf_reserved_percpu); - cpuctx->max_pertask = mpt; - raw_spin_unlock_irq(&cpuctx->ctx.lock); - } - spin_unlock(&perf_resource_lock); - - return count; -} - -static ssize_t perf_show_overcommit(struct sysdev_class *class, - struct sysdev_class_attribute *attr, - char *buf) -{ - return sprintf(buf, "%d\n", perf_overcommit); -} - -static ssize_t -perf_set_overcommit(struct sysdev_class *class, - struct sysdev_class_attribute *attr, - const char *buf, size_t count) -{ - unsigned long val; - int err; - - err = strict_strtoul(buf, 10, &val); - if (err) - return err; - if (val > 1) - return -EINVAL; - - spin_lock(&perf_resource_lock); - perf_overcommit = val; - spin_unlock(&perf_resource_lock); - - return count; -} - -static SYSDEV_CLASS_ATTR( - reserve_percpu, - 0644, - perf_show_reserve_percpu, - perf_set_reserve_percpu - ); - -static SYSDEV_CLASS_ATTR( - overcommit, - 0644, - perf_show_overcommit, - perf_set_overcommit - ); - -static struct attribute *perfclass_attrs[] = { - &attr_reserve_percpu.attr, - &attr_overcommit.attr, - NULL -}; - -static struct attribute_group perfclass_attr_group = { - .attrs = perfclass_attrs, - .name = "perf_events", -}; - -static int __init perf_event_sysfs_init(void) -{ - return sysfs_create_group(&cpu_sysdev_class.kset.kobj, - &perfclass_attr_group); + init_srcu_struct(&pmus_srcu); + perf_pmu_register(&perf_swevent); + perf_pmu_register(&perf_cpu_clock); + perf_pmu_register(&perf_task_clock); + perf_tp_register(); + perf_cpu_notifier(perf_cpu_notify); } -device_initcall(perf_event_sysfs_init); diff --git a/kernel/pid.c b/kernel/pid.c index d55c6fb8d087..39b65b69584f 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -401,7 +401,7 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type) struct task_struct *result = NULL; if (pid) { struct hlist_node *first; - first = rcu_dereference_check(pid->tasks[type].first, + first = rcu_dereference_check(hlist_first_rcu(&pid->tasks[type]), rcu_read_lock_held() || lockdep_tasklist_lock_is_held()); if (first) @@ -416,6 +416,7 @@ EXPORT_SYMBOL(pid_task); */ struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) { + rcu_lockdep_assert(rcu_read_lock_held()); return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID); } diff --git a/kernel/printk.c b/kernel/printk.c index 8fe465ac008a..2531017795f6 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -85,7 +85,7 @@ EXPORT_SYMBOL(oops_in_progress); * provides serialisation for access to the entire console * driver system. */ -static DECLARE_MUTEX(console_sem); +static DEFINE_SEMAPHORE(console_sem); struct console *console_drivers; EXPORT_SYMBOL_GPL(console_drivers); @@ -556,7 +556,7 @@ static void zap_locks(void) /* If a crash is occurring, make sure we can't deadlock */ spin_lock_init(&logbuf_lock); /* And make sure that we print immediately */ - init_MUTEX(&console_sem); + sema_init(&console_sem, 1); } #if defined(CONFIG_PRINTK_TIME) diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 4d169835fb36..a23a57a976d1 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -73,12 +73,14 @@ int debug_lockdep_rcu_enabled(void) EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled); /** - * rcu_read_lock_bh_held - might we be in RCU-bh read-side critical section? + * rcu_read_lock_bh_held() - might we be in RCU-bh read-side critical section? * * Check for bottom half being disabled, which covers both the * CONFIG_PROVE_RCU and not cases. Note that if someone uses * rcu_read_lock_bh(), but then later enables BH, lockdep (if enabled) - * will show the situation. + * will show the situation. This is useful for debug checks in functions + * that require that they be called within an RCU read-side critical + * section. * * Check debug_lockdep_rcu_enabled() to prevent false positives during boot. */ @@ -86,7 +88,7 @@ int rcu_read_lock_bh_held(void) { if (!debug_lockdep_rcu_enabled()) return 1; - return in_softirq(); + return in_softirq() || irqs_disabled(); } EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 196ec02f8be0..d806735342ac 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -59,6 +59,14 @@ int rcu_scheduler_active __read_mostly; EXPORT_SYMBOL_GPL(rcu_scheduler_active); #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ +/* Forward declarations for rcutiny_plugin.h. */ +static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp); +static void __call_rcu(struct rcu_head *head, + void (*func)(struct rcu_head *rcu), + struct rcu_ctrlblk *rcp); + +#include "rcutiny_plugin.h" + #ifdef CONFIG_NO_HZ static long rcu_dynticks_nesting = 1; @@ -140,6 +148,7 @@ void rcu_check_callbacks(int cpu, int user) rcu_sched_qs(cpu); else if (!in_softirq()) rcu_bh_qs(cpu); + rcu_preempt_check_callbacks(); } /* @@ -162,6 +171,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) *rcp->donetail = NULL; if (rcp->curtail == rcp->donetail) rcp->curtail = &rcp->rcucblist; + rcu_preempt_remove_callbacks(rcp); rcp->donetail = &rcp->rcucblist; local_irq_restore(flags); @@ -182,6 +192,7 @@ static void rcu_process_callbacks(struct softirq_action *unused) { __rcu_process_callbacks(&rcu_sched_ctrlblk); __rcu_process_callbacks(&rcu_bh_ctrlblk); + rcu_preempt_process_callbacks(); } /* @@ -223,15 +234,15 @@ static void __call_rcu(struct rcu_head *head, } /* - * Post an RCU callback to be invoked after the end of an RCU grace + * Post an RCU callback to be invoked after the end of an RCU-sched grace * period. But since we have but one CPU, that would be after any * quiescent state. */ -void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) +void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) { __call_rcu(head, func, &rcu_sched_ctrlblk); } -EXPORT_SYMBOL_GPL(call_rcu); +EXPORT_SYMBOL_GPL(call_rcu_sched); /* * Post an RCU bottom-half callback to be invoked after any subsequent @@ -243,20 +254,6 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) } EXPORT_SYMBOL_GPL(call_rcu_bh); -void rcu_barrier(void) -{ - struct rcu_synchronize rcu; - - init_rcu_head_on_stack(&rcu.head); - init_completion(&rcu.completion); - /* Will wake me after RCU finished. */ - call_rcu(&rcu.head, wakeme_after_rcu); - /* Wait for it. */ - wait_for_completion(&rcu.completion); - destroy_rcu_head_on_stack(&rcu.head); -} -EXPORT_SYMBOL_GPL(rcu_barrier); - void rcu_barrier_bh(void) { struct rcu_synchronize rcu; @@ -289,5 +286,3 @@ void __init rcu_init(void) { open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); } - -#include "rcutiny_plugin.h" diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index d223a92bc742..6ceca4f745ff 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -1,7 +1,7 @@ /* - * Read-Copy Update mechanism for mutual exclusion (tree-based version) + * Read-Copy Update mechanism for mutual exclusion, the Bloatwatch edition * Internal non-public definitions that provide either classic - * or preemptable semantics. + * or preemptible semantics. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -17,11 +17,587 @@ * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * - * Copyright IBM Corporation, 2009 + * Copyright (c) 2010 Linaro * * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> */ +#ifdef CONFIG_TINY_PREEMPT_RCU + +#include <linux/delay.h> + +/* Global control variables for preemptible RCU. */ +struct rcu_preempt_ctrlblk { + struct rcu_ctrlblk rcb; /* curtail: ->next ptr of last CB for GP. */ + struct rcu_head **nexttail; + /* Tasks blocked in a preemptible RCU */ + /* read-side critical section while an */ + /* preemptible-RCU grace period is in */ + /* progress must wait for a later grace */ + /* period. This pointer points to the */ + /* ->next pointer of the last task that */ + /* must wait for a later grace period, or */ + /* to &->rcb.rcucblist if there is no */ + /* such task. */ + struct list_head blkd_tasks; + /* Tasks blocked in RCU read-side critical */ + /* section. Tasks are placed at the head */ + /* of this list and age towards the tail. */ + struct list_head *gp_tasks; + /* Pointer to the first task blocking the */ + /* current grace period, or NULL if there */ + /* is not such task. */ + struct list_head *exp_tasks; + /* Pointer to first task blocking the */ + /* current expedited grace period, or NULL */ + /* if there is no such task. If there */ + /* is no current expedited grace period, */ + /* then there cannot be any such task. */ + u8 gpnum; /* Current grace period. */ + u8 gpcpu; /* Last grace period blocked by the CPU. */ + u8 completed; /* Last grace period completed. */ + /* If all three are equal, RCU is idle. */ +}; + +static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = { + .rcb.donetail = &rcu_preempt_ctrlblk.rcb.rcucblist, + .rcb.curtail = &rcu_preempt_ctrlblk.rcb.rcucblist, + .nexttail = &rcu_preempt_ctrlblk.rcb.rcucblist, + .blkd_tasks = LIST_HEAD_INIT(rcu_preempt_ctrlblk.blkd_tasks), +}; + +static int rcu_preempted_readers_exp(void); +static void rcu_report_exp_done(void); + +/* + * Return true if the CPU has not yet responded to the current grace period. + */ +static int rcu_cpu_blocking_cur_gp(void) +{ + return rcu_preempt_ctrlblk.gpcpu != rcu_preempt_ctrlblk.gpnum; +} + +/* + * Check for a running RCU reader. Because there is only one CPU, + * there can be but one running RCU reader at a time. ;-) + */ +static int rcu_preempt_running_reader(void) +{ + return current->rcu_read_lock_nesting; +} + +/* + * Check for preempted RCU readers blocking any grace period. + * If the caller needs a reliable answer, it must disable hard irqs. + */ +static int rcu_preempt_blocked_readers_any(void) +{ + return !list_empty(&rcu_preempt_ctrlblk.blkd_tasks); +} + +/* + * Check for preempted RCU readers blocking the current grace period. + * If the caller needs a reliable answer, it must disable hard irqs. + */ +static int rcu_preempt_blocked_readers_cgp(void) +{ + return rcu_preempt_ctrlblk.gp_tasks != NULL; +} + +/* + * Return true if another preemptible-RCU grace period is needed. + */ +static int rcu_preempt_needs_another_gp(void) +{ + return *rcu_preempt_ctrlblk.rcb.curtail != NULL; +} + +/* + * Return true if a preemptible-RCU grace period is in progress. + * The caller must disable hardirqs. + */ +static int rcu_preempt_gp_in_progress(void) +{ + return rcu_preempt_ctrlblk.completed != rcu_preempt_ctrlblk.gpnum; +} + +/* + * Record a preemptible-RCU quiescent state for the specified CPU. Note + * that this just means that the task currently running on the CPU is + * in a quiescent state. There might be any number of tasks blocked + * while in an RCU read-side critical section. + * + * Unlike the other rcu_*_qs() functions, callers to this function + * must disable irqs in order to protect the assignment to + * ->rcu_read_unlock_special. + * + * Because this is a single-CPU implementation, the only way a grace + * period can end is if the CPU is in a quiescent state. The reason is + * that a blocked preemptible-RCU reader can exit its critical section + * only if the CPU is running it at the time. Therefore, when the + * last task blocking the current grace period exits its RCU read-side + * critical section, neither the CPU nor blocked tasks will be stopping + * the current grace period. (In contrast, SMP implementations + * might have CPUs running in RCU read-side critical sections that + * block later grace periods -- but this is not possible given only + * one CPU.) + */ +static void rcu_preempt_cpu_qs(void) +{ + /* Record both CPU and task as having responded to current GP. */ + rcu_preempt_ctrlblk.gpcpu = rcu_preempt_ctrlblk.gpnum; + current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; + + /* + * If there is no GP, or if blocked readers are still blocking GP, + * then there is nothing more to do. + */ + if (!rcu_preempt_gp_in_progress() || rcu_preempt_blocked_readers_cgp()) + return; + + /* Advance callbacks. */ + rcu_preempt_ctrlblk.completed = rcu_preempt_ctrlblk.gpnum; + rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.rcb.curtail; + rcu_preempt_ctrlblk.rcb.curtail = rcu_preempt_ctrlblk.nexttail; + + /* If there are no blocked readers, next GP is done instantly. */ + if (!rcu_preempt_blocked_readers_any()) + rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.nexttail; + + /* If there are done callbacks, make RCU_SOFTIRQ process them. */ + if (*rcu_preempt_ctrlblk.rcb.donetail != NULL) + raise_softirq(RCU_SOFTIRQ); +} + +/* + * Start a new RCU grace period if warranted. Hard irqs must be disabled. + */ +static void rcu_preempt_start_gp(void) +{ + if (!rcu_preempt_gp_in_progress() && rcu_preempt_needs_another_gp()) { + + /* Official start of GP. */ + rcu_preempt_ctrlblk.gpnum++; + + /* Any blocked RCU readers block new GP. */ + if (rcu_preempt_blocked_readers_any()) + rcu_preempt_ctrlblk.gp_tasks = + rcu_preempt_ctrlblk.blkd_tasks.next; + + /* If there is no running reader, CPU is done with GP. */ + if (!rcu_preempt_running_reader()) + rcu_preempt_cpu_qs(); + } +} + +/* + * We have entered the scheduler, and the current task might soon be + * context-switched away from. If this task is in an RCU read-side + * critical section, we will no longer be able to rely on the CPU to + * record that fact, so we enqueue the task on the blkd_tasks list. + * If the task started after the current grace period began, as recorded + * by ->gpcpu, we enqueue at the beginning of the list. Otherwise + * before the element referenced by ->gp_tasks (or at the tail if + * ->gp_tasks is NULL) and point ->gp_tasks at the newly added element. + * The task will dequeue itself when it exits the outermost enclosing + * RCU read-side critical section. Therefore, the current grace period + * cannot be permitted to complete until the ->gp_tasks pointer becomes + * NULL. + * + * Caller must disable preemption. + */ +void rcu_preempt_note_context_switch(void) +{ + struct task_struct *t = current; + unsigned long flags; + + local_irq_save(flags); /* must exclude scheduler_tick(). */ + if (rcu_preempt_running_reader() && + (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { + + /* Possibly blocking in an RCU read-side critical section. */ + t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; + + /* + * If this CPU has already checked in, then this task + * will hold up the next grace period rather than the + * current grace period. Queue the task accordingly. + * If the task is queued for the current grace period + * (i.e., this CPU has not yet passed through a quiescent + * state for the current grace period), then as long + * as that task remains queued, the current grace period + * cannot end. + */ + list_add(&t->rcu_node_entry, &rcu_preempt_ctrlblk.blkd_tasks); + if (rcu_cpu_blocking_cur_gp()) + rcu_preempt_ctrlblk.gp_tasks = &t->rcu_node_entry; + } + + /* + * Either we were not in an RCU read-side critical section to + * begin with, or we have now recorded that critical section + * globally. Either way, we can now note a quiescent state + * for this CPU. Again, if we were in an RCU read-side critical + * section, and if that critical section was blocking the current + * grace period, then the fact that the task has been enqueued + * means that current grace period continues to be blocked. + */ + rcu_preempt_cpu_qs(); + local_irq_restore(flags); +} + +/* + * Tiny-preemptible RCU implementation for rcu_read_lock(). + * Just increment ->rcu_read_lock_nesting, shared state will be updated + * if we block. + */ +void __rcu_read_lock(void) +{ + current->rcu_read_lock_nesting++; + barrier(); /* needed if we ever invoke rcu_read_lock in rcutiny.c */ +} +EXPORT_SYMBOL_GPL(__rcu_read_lock); + +/* + * Handle special cases during rcu_read_unlock(), such as needing to + * notify RCU core processing or task having blocked during the RCU + * read-side critical section. + */ +static void rcu_read_unlock_special(struct task_struct *t) +{ + int empty; + int empty_exp; + unsigned long flags; + struct list_head *np; + int special; + + /* + * NMI handlers cannot block and cannot safely manipulate state. + * They therefore cannot possibly be special, so just leave. + */ + if (in_nmi()) + return; + + local_irq_save(flags); + + /* + * If RCU core is waiting for this CPU to exit critical section, + * let it know that we have done so. + */ + special = t->rcu_read_unlock_special; + if (special & RCU_READ_UNLOCK_NEED_QS) + rcu_preempt_cpu_qs(); + + /* Hardware IRQ handlers cannot block. */ + if (in_irq()) { + local_irq_restore(flags); + return; + } + + /* Clean up if blocked during RCU read-side critical section. */ + if (special & RCU_READ_UNLOCK_BLOCKED) { + t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BLOCKED; + + /* + * Remove this task from the ->blkd_tasks list and adjust + * any pointers that might have been referencing it. + */ + empty = !rcu_preempt_blocked_readers_cgp(); + empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL; + np = t->rcu_node_entry.next; + if (np == &rcu_preempt_ctrlblk.blkd_tasks) + np = NULL; + list_del(&t->rcu_node_entry); + if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks) + rcu_preempt_ctrlblk.gp_tasks = np; + if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks) + rcu_preempt_ctrlblk.exp_tasks = np; + INIT_LIST_HEAD(&t->rcu_node_entry); + + /* + * If this was the last task on the current list, and if + * we aren't waiting on the CPU, report the quiescent state + * and start a new grace period if needed. + */ + if (!empty && !rcu_preempt_blocked_readers_cgp()) { + rcu_preempt_cpu_qs(); + rcu_preempt_start_gp(); + } + + /* + * If this was the last task on the expedited lists, + * then we need wake up the waiting task. + */ + if (!empty_exp && rcu_preempt_ctrlblk.exp_tasks == NULL) + rcu_report_exp_done(); + } + local_irq_restore(flags); +} + +/* + * Tiny-preemptible RCU implementation for rcu_read_unlock(). + * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost + * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then + * invoke rcu_read_unlock_special() to clean up after a context switch + * in an RCU read-side critical section and other special cases. + */ +void __rcu_read_unlock(void) +{ + struct task_struct *t = current; + + barrier(); /* needed if we ever invoke rcu_read_unlock in rcutiny.c */ + --t->rcu_read_lock_nesting; + barrier(); /* decrement before load of ->rcu_read_unlock_special */ + if (t->rcu_read_lock_nesting == 0 && + unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) + rcu_read_unlock_special(t); +#ifdef CONFIG_PROVE_LOCKING + WARN_ON_ONCE(t->rcu_read_lock_nesting < 0); +#endif /* #ifdef CONFIG_PROVE_LOCKING */ +} +EXPORT_SYMBOL_GPL(__rcu_read_unlock); + +/* + * Check for a quiescent state from the current CPU. When a task blocks, + * the task is recorded in the rcu_preempt_ctrlblk structure, which is + * checked elsewhere. This is called from the scheduling-clock interrupt. + * + * Caller must disable hard irqs. + */ +static void rcu_preempt_check_callbacks(void) +{ + struct task_struct *t = current; + + if (rcu_preempt_gp_in_progress() && + (!rcu_preempt_running_reader() || + !rcu_cpu_blocking_cur_gp())) + rcu_preempt_cpu_qs(); + if (&rcu_preempt_ctrlblk.rcb.rcucblist != + rcu_preempt_ctrlblk.rcb.donetail) + raise_softirq(RCU_SOFTIRQ); + if (rcu_preempt_gp_in_progress() && + rcu_cpu_blocking_cur_gp() && + rcu_preempt_running_reader()) + t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS; +} + +/* + * TINY_PREEMPT_RCU has an extra callback-list tail pointer to + * update, so this is invoked from __rcu_process_callbacks() to + * handle that case. Of course, it is invoked for all flavors of + * RCU, but RCU callbacks can appear only on one of the lists, and + * neither ->nexttail nor ->donetail can possibly be NULL, so there + * is no need for an explicit check. + */ +static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp) +{ + if (rcu_preempt_ctrlblk.nexttail == rcp->donetail) + rcu_preempt_ctrlblk.nexttail = &rcp->rcucblist; +} + +/* + * Process callbacks for preemptible RCU. + */ +static void rcu_preempt_process_callbacks(void) +{ + __rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb); +} + +/* + * Queue a preemptible -RCU callback for invocation after a grace period. + */ +void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) +{ + unsigned long flags; + + debug_rcu_head_queue(head); + head->func = func; + head->next = NULL; + + local_irq_save(flags); + *rcu_preempt_ctrlblk.nexttail = head; + rcu_preempt_ctrlblk.nexttail = &head->next; + rcu_preempt_start_gp(); /* checks to see if GP needed. */ + local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(call_rcu); + +void rcu_barrier(void) +{ + struct rcu_synchronize rcu; + + init_rcu_head_on_stack(&rcu.head); + init_completion(&rcu.completion); + /* Will wake me after RCU finished. */ + call_rcu(&rcu.head, wakeme_after_rcu); + /* Wait for it. */ + wait_for_completion(&rcu.completion); + destroy_rcu_head_on_stack(&rcu.head); +} +EXPORT_SYMBOL_GPL(rcu_barrier); + +/* + * synchronize_rcu - wait until a grace period has elapsed. + * + * Control will return to the caller some time after a full grace + * period has elapsed, in other words after all currently executing RCU + * read-side critical sections have completed. RCU read-side critical + * sections are delimited by rcu_read_lock() and rcu_read_unlock(), + * and may be nested. + */ +void synchronize_rcu(void) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + if (!rcu_scheduler_active) + return; +#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ + + WARN_ON_ONCE(rcu_preempt_running_reader()); + if (!rcu_preempt_blocked_readers_any()) + return; + + /* Once we get past the fastpath checks, same code as rcu_barrier(). */ + rcu_barrier(); +} +EXPORT_SYMBOL_GPL(synchronize_rcu); + +static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq); +static unsigned long sync_rcu_preempt_exp_count; +static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex); + +/* + * Return non-zero if there are any tasks in RCU read-side critical + * sections blocking the current preemptible-RCU expedited grace period. + * If there is no preemptible-RCU expedited grace period currently in + * progress, returns zero unconditionally. + */ +static int rcu_preempted_readers_exp(void) +{ + return rcu_preempt_ctrlblk.exp_tasks != NULL; +} + +/* + * Report the exit from RCU read-side critical section for the last task + * that queued itself during or before the current expedited preemptible-RCU + * grace period. + */ +static void rcu_report_exp_done(void) +{ + wake_up(&sync_rcu_preempt_exp_wq); +} + +/* + * Wait for an rcu-preempt grace period, but expedite it. The basic idea + * is to rely in the fact that there is but one CPU, and that it is + * illegal for a task to invoke synchronize_rcu_expedited() while in a + * preemptible-RCU read-side critical section. Therefore, any such + * critical sections must correspond to blocked tasks, which must therefore + * be on the ->blkd_tasks list. So just record the current head of the + * list in the ->exp_tasks pointer, and wait for all tasks including and + * after the task pointed to by ->exp_tasks to drain. + */ +void synchronize_rcu_expedited(void) +{ + unsigned long flags; + struct rcu_preempt_ctrlblk *rpcp = &rcu_preempt_ctrlblk; + unsigned long snap; + + barrier(); /* ensure prior action seen before grace period. */ + + WARN_ON_ONCE(rcu_preempt_running_reader()); + + /* + * Acquire lock so that there is only one preemptible RCU grace + * period in flight. Of course, if someone does the expedited + * grace period for us while we are acquiring the lock, just leave. + */ + snap = sync_rcu_preempt_exp_count + 1; + mutex_lock(&sync_rcu_preempt_exp_mutex); + if (ULONG_CMP_LT(snap, sync_rcu_preempt_exp_count)) + goto unlock_mb_ret; /* Others did our work for us. */ + + local_irq_save(flags); + + /* + * All RCU readers have to already be on blkd_tasks because + * we cannot legally be executing in an RCU read-side critical + * section. + */ + + /* Snapshot current head of ->blkd_tasks list. */ + rpcp->exp_tasks = rpcp->blkd_tasks.next; + if (rpcp->exp_tasks == &rpcp->blkd_tasks) + rpcp->exp_tasks = NULL; + local_irq_restore(flags); + + /* Wait for tail of ->blkd_tasks list to drain. */ + if (rcu_preempted_readers_exp()) + wait_event(sync_rcu_preempt_exp_wq, + !rcu_preempted_readers_exp()); + + /* Clean up and exit. */ + barrier(); /* ensure expedited GP seen before counter increment. */ + sync_rcu_preempt_exp_count++; +unlock_mb_ret: + mutex_unlock(&sync_rcu_preempt_exp_mutex); + barrier(); /* ensure subsequent action seen after grace period. */ +} +EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); + +/* + * Does preemptible RCU need the CPU to stay out of dynticks mode? + */ +int rcu_preempt_needs_cpu(void) +{ + if (!rcu_preempt_running_reader()) + rcu_preempt_cpu_qs(); + return rcu_preempt_ctrlblk.rcb.rcucblist != NULL; +} + +/* + * Check for a task exiting while in a preemptible -RCU read-side + * critical section, clean up if so. No need to issue warnings, + * as debug_check_no_locks_held() already does this if lockdep + * is enabled. + */ +void exit_rcu(void) +{ + struct task_struct *t = current; + + if (t->rcu_read_lock_nesting == 0) + return; + t->rcu_read_lock_nesting = 1; + rcu_read_unlock(); +} + +#else /* #ifdef CONFIG_TINY_PREEMPT_RCU */ + +/* + * Because preemptible RCU does not exist, it never has any callbacks + * to check. + */ +static void rcu_preempt_check_callbacks(void) +{ +} + +/* + * Because preemptible RCU does not exist, it never has any callbacks + * to remove. + */ +static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp) +{ +} + +/* + * Because preemptible RCU does not exist, it never has any callbacks + * to process. + */ +static void rcu_preempt_process_callbacks(void) +{ +} + +#endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */ + #ifdef CONFIG_DEBUG_LOCK_ALLOC #include <linux/kernel_stat.h> diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 2e2726d790b9..9d8e8fb2515f 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -120,7 +120,7 @@ struct rcu_torture { }; static LIST_HEAD(rcu_torture_freelist); -static struct rcu_torture *rcu_torture_current; +static struct rcu_torture __rcu *rcu_torture_current; static long rcu_torture_current_version; static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; static DEFINE_SPINLOCK(rcu_torture_lock); @@ -153,8 +153,10 @@ int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; #define FULLSTOP_SHUTDOWN 1 /* System shutdown with rcutorture running. */ #define FULLSTOP_RMMOD 2 /* Normal rmmod of rcutorture. */ static int fullstop = FULLSTOP_RMMOD; -DEFINE_MUTEX(fullstop_mutex); /* Protect fullstop transitions and spawning */ - /* of kthreads. */ +/* + * Protect fullstop transitions and spawning of kthreads. + */ +static DEFINE_MUTEX(fullstop_mutex); /* * Detect and respond to a system shutdown. @@ -303,6 +305,10 @@ static void rcu_read_delay(struct rcu_random_state *rrsp) mdelay(longdelay_ms); if (!(rcu_random(rrsp) % (nrealreaders * 2 * shortdelay_us))) udelay(shortdelay_us); +#ifdef CONFIG_PREEMPT + if (!preempt_count() && !(rcu_random(rrsp) % (nrealreaders * 20000))) + preempt_schedule(); /* No QS if preempt_disable() in effect */ +#endif } static void rcu_torture_read_unlock(int idx) __releases(RCU) @@ -536,6 +542,8 @@ static void srcu_read_delay(struct rcu_random_state *rrsp) delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay * uspertick); if (!delay) schedule_timeout_interruptible(longdelay); + else + rcu_read_delay(rrsp); } static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl) @@ -731,7 +739,8 @@ rcu_torture_writer(void *arg) continue; rp->rtort_pipe_count = 0; udelay(rcu_random(&rand) & 0x3ff); - old_rp = rcu_torture_current; + old_rp = rcu_dereference_check(rcu_torture_current, + current == writer_task); rp->rtort_mbtest = 1; rcu_assign_pointer(rcu_torture_current, rp); smp_wmb(); /* Mods to old_rp must follow rcu_assign_pointer() */ diff --git a/kernel/rcutree.c b/kernel/rcutree.c index d5bc43976c5a..ccdc04c47981 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -143,6 +143,11 @@ module_param(blimit, int, 0); module_param(qhimark, int, 0); module_param(qlowmark, int, 0); +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR +int rcu_cpu_stall_suppress __read_mostly = RCU_CPU_STALL_SUPPRESS_INIT; +module_param(rcu_cpu_stall_suppress, int, 0644); +#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ + static void force_quiescent_state(struct rcu_state *rsp, int relaxed); static int rcu_pending(int cpu); @@ -450,7 +455,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) #ifdef CONFIG_RCU_CPU_STALL_DETECTOR -int rcu_cpu_stall_panicking __read_mostly; +int rcu_cpu_stall_suppress __read_mostly; static void record_gp_stall_check_time(struct rcu_state *rsp) { @@ -482,8 +487,11 @@ static void print_other_cpu_stall(struct rcu_state *rsp) rcu_print_task_stall(rnp); raw_spin_unlock_irqrestore(&rnp->lock, flags); - /* OK, time to rat on our buddy... */ - + /* + * OK, time to rat on our buddy... + * See Documentation/RCU/stallwarn.txt for info on how to debug + * RCU CPU stall warnings. + */ printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks: {", rsp->name); rcu_for_each_leaf_node(rsp, rnp) { @@ -512,6 +520,11 @@ static void print_cpu_stall(struct rcu_state *rsp) unsigned long flags; struct rcu_node *rnp = rcu_get_root(rsp); + /* + * OK, time to rat on ourselves... + * See Documentation/RCU/stallwarn.txt for info on how to debug + * RCU CPU stall warnings. + */ printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n", rsp->name, smp_processor_id(), jiffies - rsp->gp_start); trigger_all_cpu_backtrace(); @@ -530,11 +543,11 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) long delta; struct rcu_node *rnp; - if (rcu_cpu_stall_panicking) + if (rcu_cpu_stall_suppress) return; - delta = jiffies - rsp->jiffies_stall; + delta = jiffies - ACCESS_ONCE(rsp->jiffies_stall); rnp = rdp->mynode; - if ((rnp->qsmask & rdp->grpmask) && delta >= 0) { + if ((ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && delta >= 0) { /* We haven't checked in, so go dump stack. */ print_cpu_stall(rsp); @@ -548,10 +561,26 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr) { - rcu_cpu_stall_panicking = 1; + rcu_cpu_stall_suppress = 1; return NOTIFY_DONE; } +/** + * rcu_cpu_stall_reset - prevent further stall warnings in current grace period + * + * Set the stall-warning timeout way off into the future, thus preventing + * any RCU CPU stall-warning messages from appearing in the current set of + * RCU grace periods. + * + * The caller must disable hard irqs. + */ +void rcu_cpu_stall_reset(void) +{ + rcu_sched_state.jiffies_stall = jiffies + ULONG_MAX / 2; + rcu_bh_state.jiffies_stall = jiffies + ULONG_MAX / 2; + rcu_preempt_stall_reset(); +} + static struct notifier_block rcu_panic_block = { .notifier_call = rcu_panic, }; @@ -571,6 +600,10 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) { } +void rcu_cpu_stall_reset(void) +{ +} + static void __init check_cpu_stall_init(void) { } @@ -712,7 +745,7 @@ static void rcu_start_gp(struct rcu_state *rsp, unsigned long flags) __releases(rcu_get_root(rsp)->lock) { - struct rcu_data *rdp = rsp->rda[smp_processor_id()]; + struct rcu_data *rdp = this_cpu_ptr(rsp->rda); struct rcu_node *rnp = rcu_get_root(rsp); if (!cpu_needs_another_gp(rsp, rdp) || rsp->fqs_active) { @@ -960,7 +993,7 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) { int i; - struct rcu_data *rdp = rsp->rda[smp_processor_id()]; + struct rcu_data *rdp = this_cpu_ptr(rsp->rda); if (rdp->nxtlist == NULL) return; /* irqs disabled, so comparison is stable. */ @@ -971,6 +1004,7 @@ static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) for (i = 0; i < RCU_NEXT_SIZE; i++) rdp->nxttail[i] = &rdp->nxtlist; rsp->orphan_qlen += rdp->qlen; + rdp->n_cbs_orphaned += rdp->qlen; rdp->qlen = 0; raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ } @@ -984,7 +1018,7 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) struct rcu_data *rdp; raw_spin_lock_irqsave(&rsp->onofflock, flags); - rdp = rsp->rda[smp_processor_id()]; + rdp = this_cpu_ptr(rsp->rda); if (rsp->orphan_cbs_list == NULL) { raw_spin_unlock_irqrestore(&rsp->onofflock, flags); return; @@ -992,6 +1026,7 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list; rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_tail; rdp->qlen += rsp->orphan_qlen; + rdp->n_cbs_adopted += rsp->orphan_qlen; rsp->orphan_cbs_list = NULL; rsp->orphan_cbs_tail = &rsp->orphan_cbs_list; rsp->orphan_qlen = 0; @@ -1007,7 +1042,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) unsigned long flags; unsigned long mask; int need_report = 0; - struct rcu_data *rdp = rsp->rda[cpu]; + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp; /* Exclude any attempts to start a new grace period. */ @@ -1123,6 +1158,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) /* Update count, and requeue any remaining callbacks. */ rdp->qlen -= count; + rdp->n_cbs_invoked += count; if (list != NULL) { *tail = rdp->nxtlist; rdp->nxtlist = list; @@ -1226,7 +1262,8 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) cpu = rnp->grplo; bit = 1; for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { - if ((rnp->qsmask & bit) != 0 && f(rsp->rda[cpu])) + if ((rnp->qsmask & bit) != 0 && + f(per_cpu_ptr(rsp->rda, cpu))) mask |= bit; } if (mask != 0) { @@ -1402,7 +1439,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), * a quiescent state betweentimes. */ local_irq_save(flags); - rdp = rsp->rda[smp_processor_id()]; + rdp = this_cpu_ptr(rsp->rda); rcu_process_gp_end(rsp, rdp); check_for_new_grace_period(rsp, rdp); @@ -1701,7 +1738,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) { unsigned long flags; int i; - struct rcu_data *rdp = rsp->rda[cpu]; + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp = rcu_get_root(rsp); /* Set up local state, ensuring consistent view of global state. */ @@ -1729,7 +1766,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable) { unsigned long flags; unsigned long mask; - struct rcu_data *rdp = rsp->rda[cpu]; + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp = rcu_get_root(rsp); /* Set up local state, ensuring consistent view of global state. */ @@ -1865,7 +1902,8 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp) /* * Helper function for rcu_init() that initializes one rcu_state structure. */ -static void __init rcu_init_one(struct rcu_state *rsp) +static void __init rcu_init_one(struct rcu_state *rsp, + struct rcu_data __percpu *rda) { static char *buf[] = { "rcu_node_level_0", "rcu_node_level_1", @@ -1918,37 +1956,23 @@ static void __init rcu_init_one(struct rcu_state *rsp) } } + rsp->rda = rda; rnp = rsp->level[NUM_RCU_LVLS - 1]; for_each_possible_cpu(i) { while (i > rnp->grphi) rnp++; - rsp->rda[i]->mynode = rnp; + per_cpu_ptr(rsp->rda, i)->mynode = rnp; rcu_boot_init_percpu_data(i, rsp); } } -/* - * Helper macro for __rcu_init() and __rcu_init_preempt(). To be used - * nowhere else! Assigns leaf node pointers into each CPU's rcu_data - * structure. - */ -#define RCU_INIT_FLAVOR(rsp, rcu_data) \ -do { \ - int i; \ - \ - for_each_possible_cpu(i) { \ - (rsp)->rda[i] = &per_cpu(rcu_data, i); \ - } \ - rcu_init_one(rsp); \ -} while (0) - void __init rcu_init(void) { int cpu; rcu_bootup_announce(); - RCU_INIT_FLAVOR(&rcu_sched_state, rcu_sched_data); - RCU_INIT_FLAVOR(&rcu_bh_state, rcu_bh_data); + rcu_init_one(&rcu_sched_state, &rcu_sched_data); + rcu_init_one(&rcu_bh_state, &rcu_bh_data); __rcu_init_preempt(); open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 14c040b18ed0..91d4170c5c13 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -202,6 +202,9 @@ struct rcu_data { long qlen; /* # of queued callbacks */ long qlen_last_fqs_check; /* qlen at last check for QS forcing */ + unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */ + unsigned long n_cbs_orphaned; /* RCU cbs sent to orphanage. */ + unsigned long n_cbs_adopted; /* RCU cbs adopted from orphanage. */ unsigned long n_force_qs_snap; /* did other CPU force QS recently? */ long blimit; /* Upper limit on a processed batch */ @@ -254,19 +257,23 @@ struct rcu_data { #define RCU_STALL_DELAY_DELTA 0 #endif -#define RCU_SECONDS_TILL_STALL_CHECK (10 * HZ + RCU_STALL_DELAY_DELTA) +#define RCU_SECONDS_TILL_STALL_CHECK (CONFIG_RCU_CPU_STALL_TIMEOUT * HZ + \ + RCU_STALL_DELAY_DELTA) /* for rsp->jiffies_stall */ -#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ + RCU_STALL_DELAY_DELTA) +#define RCU_SECONDS_TILL_STALL_RECHECK (3 * RCU_SECONDS_TILL_STALL_CHECK + 30) /* for rsp->jiffies_stall */ #define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */ /* to take at least one */ /* scheduling clock irq */ /* before ratting on them. */ -#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR_RUNNABLE +#define RCU_CPU_STALL_SUPPRESS_INIT 0 +#else +#define RCU_CPU_STALL_SUPPRESS_INIT 1 +#endif -#define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b)) -#define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b)) +#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ /* * RCU global state, including node hierarchy. This hierarchy is @@ -283,7 +290,7 @@ struct rcu_state { struct rcu_node *level[NUM_RCU_LVLS]; /* Hierarchy levels. */ u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */ u8 levelspread[NUM_RCU_LVLS]; /* kids/node in each level. */ - struct rcu_data *rda[NR_CPUS]; /* array of rdp pointers. */ + struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */ /* The following fields are guarded by the root rcu_node's lock. */ @@ -365,6 +372,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, #ifdef CONFIG_RCU_CPU_STALL_DETECTOR static void rcu_print_detail_task_stall(struct rcu_state *rsp); static void rcu_print_task_stall(struct rcu_node *rnp); +static void rcu_preempt_stall_reset(void); #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); #ifdef CONFIG_HOTPLUG_CPU diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 0e4f420245d9..71a4147473f9 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -57,7 +57,7 @@ static void __init rcu_bootup_announce_oddness(void) printk(KERN_INFO "\tRCU-based detection of stalled CPUs is disabled.\n"); #endif -#ifndef CONFIG_RCU_CPU_STALL_VERBOSE +#if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE) printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n"); #endif #if NUM_RCU_LVL_4 != 0 @@ -154,7 +154,7 @@ static void rcu_preempt_note_context_switch(int cpu) (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { /* Possibly blocking in an RCU read-side critical section. */ - rdp = rcu_preempt_state.rda[cpu]; + rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu); rnp = rdp->mynode; raw_spin_lock_irqsave(&rnp->lock, flags); t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; @@ -201,7 +201,7 @@ static void rcu_preempt_note_context_switch(int cpu) */ void __rcu_read_lock(void) { - ACCESS_ONCE(current->rcu_read_lock_nesting)++; + current->rcu_read_lock_nesting++; barrier(); /* needed if we ever invoke rcu_read_lock in rcutree.c */ } EXPORT_SYMBOL_GPL(__rcu_read_lock); @@ -344,7 +344,9 @@ void __rcu_read_unlock(void) struct task_struct *t = current; barrier(); /* needed if we ever invoke rcu_read_unlock in rcutree.c */ - if (--ACCESS_ONCE(t->rcu_read_lock_nesting) == 0 && + --t->rcu_read_lock_nesting; + barrier(); /* decrement before load of ->rcu_read_unlock_special */ + if (t->rcu_read_lock_nesting == 0 && unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) rcu_read_unlock_special(t); #ifdef CONFIG_PROVE_LOCKING @@ -417,6 +419,16 @@ static void rcu_print_task_stall(struct rcu_node *rnp) } } +/* + * Suppress preemptible RCU's CPU stall warnings by pushing the + * time of the next stall-warning message comfortably far into the + * future. + */ +static void rcu_preempt_stall_reset(void) +{ + rcu_preempt_state.jiffies_stall = jiffies + ULONG_MAX / 2; +} + #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ /* @@ -546,9 +558,11 @@ EXPORT_SYMBOL_GPL(call_rcu); * * Control will return to the caller some time after a full grace * period has elapsed, in other words after all currently executing RCU - * read-side critical sections have completed. RCU read-side critical - * sections are delimited by rcu_read_lock() and rcu_read_unlock(), - * and may be nested. + * read-side critical sections have completed. Note, however, that + * upon return from synchronize_rcu(), the caller might well be executing + * concurrently with new RCU read-side critical sections that began while + * synchronize_rcu() was waiting. RCU read-side critical sections are + * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested. */ void synchronize_rcu(void) { @@ -771,7 +785,7 @@ static void rcu_preempt_send_cbs_to_orphanage(void) */ static void __init __rcu_init_preempt(void) { - RCU_INIT_FLAVOR(&rcu_preempt_state, rcu_preempt_data); + rcu_init_one(&rcu_preempt_state, &rcu_preempt_data); } /* @@ -865,6 +879,14 @@ static void rcu_print_task_stall(struct rcu_node *rnp) { } +/* + * Because preemptible RCU does not exist, there is no need to suppress + * its CPU stall warnings. + */ +static void rcu_preempt_stall_reset(void) +{ +} + #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ /* @@ -919,15 +941,6 @@ static void rcu_preempt_process_callbacks(void) } /* - * In classic RCU, call_rcu() is just call_rcu_sched(). - */ -void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) -{ - call_rcu_sched(head, func); -} -EXPORT_SYMBOL_GPL(call_rcu); - -/* * Wait for an rcu-preempt grace period, but make it happen quickly. * But because preemptable RCU does not exist, map to rcu-sched. */ diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 36c95b45738e..d15430b9d122 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -64,7 +64,9 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) rdp->dynticks_fqs); #endif /* #ifdef CONFIG_NO_HZ */ seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi); - seq_printf(m, " ql=%ld b=%ld\n", rdp->qlen, rdp->blimit); + seq_printf(m, " ql=%ld b=%ld", rdp->qlen, rdp->blimit); + seq_printf(m, " ci=%lu co=%lu ca=%lu\n", + rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); } #define PRINT_RCU_DATA(name, func, m) \ @@ -119,7 +121,9 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) rdp->dynticks_fqs); #endif /* #ifdef CONFIG_NO_HZ */ seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi); - seq_printf(m, ",%ld,%ld\n", rdp->qlen, rdp->blimit); + seq_printf(m, ",%ld,%ld", rdp->qlen, rdp->blimit); + seq_printf(m, ",%lu,%lu,%lu\n", + rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); } static int show_rcudata_csv(struct seq_file *m, void *unused) @@ -128,7 +132,7 @@ static int show_rcudata_csv(struct seq_file *m, void *unused) #ifdef CONFIG_NO_HZ seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\","); #endif /* #ifdef CONFIG_NO_HZ */ - seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\"\n"); + seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\",\"ci\",\"co\",\"ca\"\n"); #ifdef CONFIG_TREE_PREEMPT_RCU seq_puts(m, "\"rcu_preempt:\"\n"); PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data_csv, m); @@ -262,7 +266,7 @@ static void print_rcu_pendings(struct seq_file *m, struct rcu_state *rsp) struct rcu_data *rdp; for_each_possible_cpu(cpu) { - rdp = rsp->rda[cpu]; + rdp = per_cpu_ptr(rsp->rda, cpu); if (rdp->beenonline) print_one_rcu_pending(m, rdp); } diff --git a/kernel/sched.c b/kernel/sched.c index 5998222f901c..d42992bccdfa 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3714,7 +3714,7 @@ void scheduler_tick(void) curr->sched_class->task_tick(rq, curr, 0); raw_spin_unlock(&rq->lock); - perf_event_task_tick(curr); + perf_event_task_tick(); #ifdef CONFIG_SMP rq->idle_at_tick = idle_cpu(cpu); @@ -4772,7 +4772,7 @@ recheck: } if (user) { - retval = security_task_setscheduler(p, policy, param); + retval = security_task_setscheduler(p); if (retval) return retval; } @@ -5023,7 +5023,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) if (!check_same_owner(p) && !capable(CAP_SYS_NICE)) goto out_unlock; - retval = security_task_setscheduler(p, 0, NULL); + retval = security_task_setscheduler(p); if (retval) goto out_unlock; @@ -5473,7 +5473,19 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) idle->se.exec_start = sched_clock(); cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); + /* + * We're having a chicken and egg problem, even though we are + * holding rq->lock, the cpu isn't yet set to this cpu so the + * lockdep check in task_group() will fail. + * + * Similar case to sched_fork(). / Alternatively we could + * use task_rq_lock() here and obtain the other rq->lock. + * + * Silence PROVE_RCU + */ + rcu_read_lock(); __set_task_cpu(idle, cpu); + rcu_read_unlock(); rq->curr = rq->idle = idle; #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 74cccfae87a8..933f3d1b62ea 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -3793,8 +3793,11 @@ static void task_fork_fair(struct task_struct *p) update_rq_clock(rq); - if (unlikely(task_cpu(p) != this_cpu)) + if (unlikely(task_cpu(p) != this_cpu)) { + rcu_read_lock(); __set_task_cpu(p, this_cpu); + rcu_read_unlock(); + } update_curr(cfs_rq); diff --git a/kernel/srcu.c b/kernel/srcu.c index 2980da3fd509..c71e07500536 100644 --- a/kernel/srcu.c +++ b/kernel/srcu.c @@ -46,11 +46,9 @@ static int init_srcu_struct_fields(struct srcu_struct *sp) int __init_srcu_struct(struct srcu_struct *sp, const char *name, struct lock_class_key *key) { -#ifdef CONFIG_DEBUG_LOCK_ALLOC /* Don't re-initialize a lock while it is held. */ debug_check_no_locks_freed((void *)sp, sizeof(*sp)); lockdep_init_map(&sp->dep_map, name, key, 0); -#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ return init_srcu_struct_fields(sp); } EXPORT_SYMBOL_GPL(__init_srcu_struct); diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c index 04cdcf72c827..10b90d8a03c4 100644 --- a/kernel/sysctl_check.c +++ b/kernel/sysctl_check.c @@ -143,15 +143,6 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table) if (!table->maxlen) set_fail(&fail, table, "No maxlen"); } - if ((table->proc_handler == proc_doulongvec_minmax) || - (table->proc_handler == proc_doulongvec_ms_jiffies_minmax)) { - if (table->maxlen > sizeof (unsigned long)) { - if (!table->extra1) - set_fail(&fail, table, "No min"); - if (!table->extra2) - set_fail(&fail, table, "No max"); - } - } #ifdef CONFIG_PROC_SYSCTL if (table->procname && !table->proc_handler) set_fail(&fail, table, "No proc_handler"); diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c index 4f104515a19b..f8b11a283171 100644 --- a/kernel/test_kprobes.c +++ b/kernel/test_kprobes.c @@ -115,7 +115,9 @@ static int test_kprobes(void) int ret; struct kprobe *kps[2] = {&kp, &kp2}; - kp.addr = 0; /* addr should be cleard for reusing kprobe. */ + /* addr and flags should be cleard for reusing kprobe. */ + kp.addr = NULL; + kp.flags = 0; ret = register_kprobes(kps, 2); if (ret < 0) { printk(KERN_ERR "Kprobe smoke test failed: " @@ -210,7 +212,9 @@ static int test_jprobes(void) int ret; struct jprobe *jps[2] = {&jp, &jp2}; - jp.kp.addr = 0; /* addr should be cleard for reusing kprobe. */ + /* addr and flags should be cleard for reusing kprobe. */ + jp.kp.addr = NULL; + jp.kp.flags = 0; ret = register_jprobes(jps, 2); if (ret < 0) { printk(KERN_ERR "Kprobe smoke test failed: " @@ -323,7 +327,9 @@ static int test_kretprobes(void) int ret; struct kretprobe *rps[2] = {&rp, &rp2}; - rp.kp.addr = 0; /* addr should be cleard for reusing kprobe. */ + /* addr and flags should be cleard for reusing kprobe. */ + rp.kp.addr = NULL; + rp.kp.flags = 0; ret = register_kretprobes(rps, 2); if (ret < 0) { printk(KERN_ERR "Kprobe smoke test failed: " diff --git a/kernel/timer.c b/kernel/timer.c index 97bf05baade7..68a9ae7679b7 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -37,7 +37,7 @@ #include <linux/delay.h> #include <linux/tick.h> #include <linux/kallsyms.h> -#include <linux/perf_event.h> +#include <linux/irq_work.h> #include <linux/sched.h> #include <linux/slab.h> @@ -1279,7 +1279,10 @@ void update_process_times(int user_tick) run_local_timers(); rcu_check_callbacks(cpu, user_tick); printk_tick(); - perf_event_do_pending(); +#ifdef CONFIG_IRQ_WORK + if (in_irq()) + irq_work_run(); +#endif scheduler_tick(); run_posix_cpu_timers(p); } diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 538501c6ea50..e550d2eda1df 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -49,6 +49,11 @@ config HAVE_SYSCALL_TRACEPOINTS help See Documentation/trace/ftrace-design.txt +config HAVE_C_RECORDMCOUNT + bool + help + C version of recordmcount available? + config TRACER_MAX_TRACE bool diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index fa7ece649fe1..ebd80d50c474 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -884,10 +884,8 @@ enum { FTRACE_ENABLE_CALLS = (1 << 0), FTRACE_DISABLE_CALLS = (1 << 1), FTRACE_UPDATE_TRACE_FUNC = (1 << 2), - FTRACE_ENABLE_MCOUNT = (1 << 3), - FTRACE_DISABLE_MCOUNT = (1 << 4), - FTRACE_START_FUNC_RET = (1 << 5), - FTRACE_STOP_FUNC_RET = (1 << 6), + FTRACE_START_FUNC_RET = (1 << 3), + FTRACE_STOP_FUNC_RET = (1 << 4), }; static int ftrace_filtered; @@ -1226,8 +1224,6 @@ static void ftrace_shutdown(int command) static void ftrace_startup_sysctl(void) { - int command = FTRACE_ENABLE_MCOUNT; - if (unlikely(ftrace_disabled)) return; @@ -1235,23 +1231,17 @@ static void ftrace_startup_sysctl(void) saved_ftrace_func = NULL; /* ftrace_start_up is true if we want ftrace running */ if (ftrace_start_up) - command |= FTRACE_ENABLE_CALLS; - - ftrace_run_update_code(command); + ftrace_run_update_code(FTRACE_ENABLE_CALLS); } static void ftrace_shutdown_sysctl(void) { - int command = FTRACE_DISABLE_MCOUNT; - if (unlikely(ftrace_disabled)) return; /* ftrace_start_up is true if ftrace is running */ if (ftrace_start_up) - command |= FTRACE_DISABLE_CALLS; - - ftrace_run_update_code(command); + ftrace_run_update_code(FTRACE_DISABLE_CALLS); } static cycle_t ftrace_update_time; @@ -1368,24 +1358,29 @@ enum { #define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ struct ftrace_iterator { - struct ftrace_page *pg; - int hidx; - int idx; - unsigned flags; - struct trace_parser parser; + loff_t pos; + loff_t func_pos; + struct ftrace_page *pg; + struct dyn_ftrace *func; + struct ftrace_func_probe *probe; + struct trace_parser parser; + int hidx; + int idx; + unsigned flags; }; static void * -t_hash_next(struct seq_file *m, void *v, loff_t *pos) +t_hash_next(struct seq_file *m, loff_t *pos) { struct ftrace_iterator *iter = m->private; - struct hlist_node *hnd = v; + struct hlist_node *hnd = NULL; struct hlist_head *hhd; - WARN_ON(!(iter->flags & FTRACE_ITER_HASH)); - (*pos)++; + iter->pos = *pos; + if (iter->probe) + hnd = &iter->probe->node; retry: if (iter->hidx >= FTRACE_FUNC_HASHSIZE) return NULL; @@ -1408,7 +1403,12 @@ t_hash_next(struct seq_file *m, void *v, loff_t *pos) } } - return hnd; + if (WARN_ON_ONCE(!hnd)) + return NULL; + + iter->probe = hlist_entry(hnd, struct ftrace_func_probe, node); + + return iter; } static void *t_hash_start(struct seq_file *m, loff_t *pos) @@ -1417,26 +1417,32 @@ static void *t_hash_start(struct seq_file *m, loff_t *pos) void *p = NULL; loff_t l; - if (!(iter->flags & FTRACE_ITER_HASH)) - *pos = 0; - - iter->flags |= FTRACE_ITER_HASH; + if (iter->func_pos > *pos) + return NULL; iter->hidx = 0; - for (l = 0; l <= *pos; ) { - p = t_hash_next(m, p, &l); + for (l = 0; l <= (*pos - iter->func_pos); ) { + p = t_hash_next(m, &l); if (!p) break; } - return p; + if (!p) + return NULL; + + /* Only set this if we have an item */ + iter->flags |= FTRACE_ITER_HASH; + + return iter; } -static int t_hash_show(struct seq_file *m, void *v) +static int +t_hash_show(struct seq_file *m, struct ftrace_iterator *iter) { struct ftrace_func_probe *rec; - struct hlist_node *hnd = v; - rec = hlist_entry(hnd, struct ftrace_func_probe, node); + rec = iter->probe; + if (WARN_ON_ONCE(!rec)) + return -EIO; if (rec->ops->print) return rec->ops->print(m, rec->ip, rec->ops, rec->data); @@ -1457,12 +1463,13 @@ t_next(struct seq_file *m, void *v, loff_t *pos) struct dyn_ftrace *rec = NULL; if (iter->flags & FTRACE_ITER_HASH) - return t_hash_next(m, v, pos); + return t_hash_next(m, pos); (*pos)++; + iter->pos = *pos; if (iter->flags & FTRACE_ITER_PRINTALL) - return NULL; + return t_hash_start(m, pos); retry: if (iter->idx >= iter->pg->index) { @@ -1491,7 +1498,20 @@ t_next(struct seq_file *m, void *v, loff_t *pos) } } - return rec; + if (!rec) + return t_hash_start(m, pos); + + iter->func_pos = *pos; + iter->func = rec; + + return iter; +} + +static void reset_iter_read(struct ftrace_iterator *iter) +{ + iter->pos = 0; + iter->func_pos = 0; + iter->flags &= ~(FTRACE_ITER_PRINTALL & FTRACE_ITER_HASH); } static void *t_start(struct seq_file *m, loff_t *pos) @@ -1502,6 +1522,12 @@ static void *t_start(struct seq_file *m, loff_t *pos) mutex_lock(&ftrace_lock); /* + * If an lseek was done, then reset and start from beginning. + */ + if (*pos < iter->pos) + reset_iter_read(iter); + + /* * For set_ftrace_filter reading, if we have the filter * off, we can short cut and just print out that all * functions are enabled. @@ -1518,6 +1544,11 @@ static void *t_start(struct seq_file *m, loff_t *pos) if (iter->flags & FTRACE_ITER_HASH) return t_hash_start(m, pos); + /* + * Unfortunately, we need to restart at ftrace_pages_start + * every time we let go of the ftrace_mutex. This is because + * those pointers can change without the lock. + */ iter->pg = ftrace_pages_start; iter->idx = 0; for (l = 0; l <= *pos; ) { @@ -1526,10 +1557,14 @@ static void *t_start(struct seq_file *m, loff_t *pos) break; } - if (!p && iter->flags & FTRACE_ITER_FILTER) - return t_hash_start(m, pos); + if (!p) { + if (iter->flags & FTRACE_ITER_FILTER) + return t_hash_start(m, pos); - return p; + return NULL; + } + + return iter; } static void t_stop(struct seq_file *m, void *p) @@ -1540,16 +1575,18 @@ static void t_stop(struct seq_file *m, void *p) static int t_show(struct seq_file *m, void *v) { struct ftrace_iterator *iter = m->private; - struct dyn_ftrace *rec = v; + struct dyn_ftrace *rec; if (iter->flags & FTRACE_ITER_HASH) - return t_hash_show(m, v); + return t_hash_show(m, iter); if (iter->flags & FTRACE_ITER_PRINTALL) { seq_printf(m, "#### all functions enabled ####\n"); return 0; } + rec = iter->func; + if (!rec) return 0; @@ -1601,8 +1638,8 @@ ftrace_failures_open(struct inode *inode, struct file *file) ret = ftrace_avail_open(inode, file); if (!ret) { - m = (struct seq_file *)file->private_data; - iter = (struct ftrace_iterator *)m->private; + m = file->private_data; + iter = m->private; iter->flags = FTRACE_ITER_FAILURES; } @@ -2418,7 +2455,7 @@ static const struct file_operations ftrace_filter_fops = { .open = ftrace_filter_open, .read = seq_read, .write = ftrace_filter_write, - .llseek = no_llseek, + .llseek = ftrace_regex_lseek, .release = ftrace_filter_release, }; diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index bca96377fd4e..c5a632a669e1 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2606,6 +2606,19 @@ void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu) } EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu); +/* + * The total entries in the ring buffer is the running counter + * of entries entered into the ring buffer, minus the sum of + * the entries read from the ring buffer and the number of + * entries that were overwritten. + */ +static inline unsigned long +rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer) +{ + return local_read(&cpu_buffer->entries) - + (local_read(&cpu_buffer->overrun) + cpu_buffer->read); +} + /** * ring_buffer_entries_cpu - get the number of entries in a cpu buffer * @buffer: The ring buffer @@ -2614,16 +2627,13 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu); unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; - unsigned long ret; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return 0; cpu_buffer = buffer->buffers[cpu]; - ret = (local_read(&cpu_buffer->entries) - local_read(&cpu_buffer->overrun)) - - cpu_buffer->read; - return ret; + return rb_num_of_entries(cpu_buffer); } EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu); @@ -2684,8 +2694,7 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer) /* if you care about this being correct, lock the buffer */ for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; - entries += (local_read(&cpu_buffer->entries) - - local_read(&cpu_buffer->overrun)) - cpu_buffer->read; + entries += rb_num_of_entries(cpu_buffer); } return entries; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9ec59f541156..001bcd2ccf4a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2196,7 +2196,7 @@ int tracing_open_generic(struct inode *inode, struct file *filp) static int tracing_release(struct inode *inode, struct file *file) { - struct seq_file *m = (struct seq_file *)file->private_data; + struct seq_file *m = file->private_data; struct trace_iterator *iter; int cpu; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index d39b3c5454a5..9021f8c0c0c3 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -343,6 +343,10 @@ void trace_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, unsigned long flags, int pc); +void trace_graph_function(struct trace_array *tr, + unsigned long ip, + unsigned long parent_ip, + unsigned long flags, int pc); void trace_default_header(struct seq_file *m); void print_trace_header(struct seq_file *m, struct trace_iterator *iter); int trace_empty(struct trace_iterator *iter); diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 31cc4cb0dbf2..39c059ca670e 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -9,7 +9,7 @@ #include <linux/kprobes.h> #include "trace.h" -static char *perf_trace_buf[4]; +static char __percpu *perf_trace_buf[PERF_NR_CONTEXTS]; /* * Force it to be aligned to unsigned long to avoid misaligned accesses @@ -24,7 +24,7 @@ static int total_ref_count; static int perf_trace_event_init(struct ftrace_event_call *tp_event, struct perf_event *p_event) { - struct hlist_head *list; + struct hlist_head __percpu *list; int ret = -ENOMEM; int cpu; @@ -42,11 +42,11 @@ static int perf_trace_event_init(struct ftrace_event_call *tp_event, tp_event->perf_events = list; if (!total_ref_count) { - char *buf; + char __percpu *buf; int i; - for (i = 0; i < 4; i++) { - buf = (char *)alloc_percpu(perf_trace_t); + for (i = 0; i < PERF_NR_CONTEXTS; i++) { + buf = (char __percpu *)alloc_percpu(perf_trace_t); if (!buf) goto fail; @@ -65,7 +65,7 @@ fail: if (!total_ref_count) { int i; - for (i = 0; i < 4; i++) { + for (i = 0; i < PERF_NR_CONTEXTS; i++) { free_percpu(perf_trace_buf[i]); perf_trace_buf[i] = NULL; } @@ -101,22 +101,26 @@ int perf_trace_init(struct perf_event *p_event) return ret; } -int perf_trace_enable(struct perf_event *p_event) +int perf_trace_add(struct perf_event *p_event, int flags) { struct ftrace_event_call *tp_event = p_event->tp_event; + struct hlist_head __percpu *pcpu_list; struct hlist_head *list; - list = tp_event->perf_events; - if (WARN_ON_ONCE(!list)) + pcpu_list = tp_event->perf_events; + if (WARN_ON_ONCE(!pcpu_list)) return -EINVAL; - list = this_cpu_ptr(list); + if (!(flags & PERF_EF_START)) + p_event->hw.state = PERF_HES_STOPPED; + + list = this_cpu_ptr(pcpu_list); hlist_add_head_rcu(&p_event->hlist_entry, list); return 0; } -void perf_trace_disable(struct perf_event *p_event) +void perf_trace_del(struct perf_event *p_event, int flags) { hlist_del_rcu(&p_event->hlist_entry); } @@ -142,7 +146,7 @@ void perf_trace_destroy(struct perf_event *p_event) tp_event->perf_events = NULL; if (!--total_ref_count) { - for (i = 0; i < 4; i++) { + for (i = 0; i < PERF_NR_CONTEXTS; i++) { free_percpu(perf_trace_buf[i]); perf_trace_buf[i] = NULL; } diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 4c758f146328..398c0e8b332c 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -600,21 +600,29 @@ out: enum { FORMAT_HEADER = 1, - FORMAT_PRINTFMT = 2, + FORMAT_FIELD_SEPERATOR = 2, + FORMAT_PRINTFMT = 3, }; static void *f_next(struct seq_file *m, void *v, loff_t *pos) { struct ftrace_event_call *call = m->private; struct ftrace_event_field *field; - struct list_head *head; + struct list_head *common_head = &ftrace_common_fields; + struct list_head *head = trace_get_fields(call); (*pos)++; switch ((unsigned long)v) { case FORMAT_HEADER: - head = &ftrace_common_fields; + if (unlikely(list_empty(common_head))) + return NULL; + + field = list_entry(common_head->prev, + struct ftrace_event_field, link); + return field; + case FORMAT_FIELD_SEPERATOR: if (unlikely(list_empty(head))) return NULL; @@ -626,31 +634,10 @@ static void *f_next(struct seq_file *m, void *v, loff_t *pos) return NULL; } - head = trace_get_fields(call); - - /* - * To separate common fields from event fields, the - * LSB is set on the first event field. Clear it in case. - */ - v = (void *)((unsigned long)v & ~1L); - field = v; - /* - * If this is a common field, and at the end of the list, then - * continue with main list. - */ - if (field->link.prev == &ftrace_common_fields) { - if (unlikely(list_empty(head))) - return NULL; - field = list_entry(head->prev, struct ftrace_event_field, link); - /* Set the LSB to notify f_show to print an extra newline */ - field = (struct ftrace_event_field *) - ((unsigned long)field | 1); - return field; - } - - /* If we are done tell f_show to print the format */ - if (field->link.prev == head) + if (field->link.prev == common_head) + return (void *)FORMAT_FIELD_SEPERATOR; + else if (field->link.prev == head) return (void *)FORMAT_PRINTFMT; field = list_entry(field->link.prev, struct ftrace_event_field, link); @@ -688,22 +675,16 @@ static int f_show(struct seq_file *m, void *v) seq_printf(m, "format:\n"); return 0; + case FORMAT_FIELD_SEPERATOR: + seq_putc(m, '\n'); + return 0; + case FORMAT_PRINTFMT: seq_printf(m, "\nprint fmt: %s\n", call->print_fmt); return 0; } - /* - * To separate common fields from event fields, the - * LSB is set on the first event field. Clear it and - * print a newline if it is set. - */ - if ((unsigned long)v & 1) { - seq_putc(m, '\n'); - v = (void *)((unsigned long)v & ~1L); - } - field = v; /* diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 6f233698518e..76b05980225c 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -15,15 +15,19 @@ #include "trace.h" #include "trace_output.h" +/* When set, irq functions will be ignored */ +static int ftrace_graph_skip_irqs; + struct fgraph_cpu_data { pid_t last_pid; int depth; + int depth_irq; int ignore; unsigned long enter_funcs[FTRACE_RETFUNC_DEPTH]; }; struct fgraph_data { - struct fgraph_cpu_data *cpu_data; + struct fgraph_cpu_data __percpu *cpu_data; /* Place to preserve last processed entry. */ struct ftrace_graph_ent_entry ent; @@ -41,6 +45,7 @@ struct fgraph_data { #define TRACE_GRAPH_PRINT_PROC 0x8 #define TRACE_GRAPH_PRINT_DURATION 0x10 #define TRACE_GRAPH_PRINT_ABS_TIME 0x20 +#define TRACE_GRAPH_PRINT_IRQS 0x40 static struct tracer_opt trace_opts[] = { /* Display overruns? (for self-debug purpose) */ @@ -55,13 +60,15 @@ static struct tracer_opt trace_opts[] = { { TRACER_OPT(funcgraph-duration, TRACE_GRAPH_PRINT_DURATION) }, /* Display absolute time of an entry */ { TRACER_OPT(funcgraph-abstime, TRACE_GRAPH_PRINT_ABS_TIME) }, + /* Display interrupts */ + { TRACER_OPT(funcgraph-irqs, TRACE_GRAPH_PRINT_IRQS) }, { } /* Empty entry */ }; static struct tracer_flags tracer_flags = { /* Don't display overruns and proc by default */ .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD | - TRACE_GRAPH_PRINT_DURATION, + TRACE_GRAPH_PRINT_DURATION | TRACE_GRAPH_PRINT_IRQS, .opts = trace_opts }; @@ -204,6 +211,14 @@ int __trace_graph_entry(struct trace_array *tr, return 1; } +static inline int ftrace_graph_ignore_irqs(void) +{ + if (!ftrace_graph_skip_irqs) + return 0; + + return in_irq(); +} + int trace_graph_entry(struct ftrace_graph_ent *trace) { struct trace_array *tr = graph_array; @@ -218,7 +233,8 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) return 0; /* trace it when it is-nested-in or is a function enabled. */ - if (!(trace->depth || ftrace_graph_addr(trace->func))) + if (!(trace->depth || ftrace_graph_addr(trace->func)) || + ftrace_graph_ignore_irqs()) return 0; local_irq_save(flags); @@ -246,6 +262,34 @@ int trace_graph_thresh_entry(struct ftrace_graph_ent *trace) return trace_graph_entry(trace); } +static void +__trace_graph_function(struct trace_array *tr, + unsigned long ip, unsigned long flags, int pc) +{ + u64 time = trace_clock_local(); + struct ftrace_graph_ent ent = { + .func = ip, + .depth = 0, + }; + struct ftrace_graph_ret ret = { + .func = ip, + .depth = 0, + .calltime = time, + .rettime = time, + }; + + __trace_graph_entry(tr, &ent, flags, pc); + __trace_graph_return(tr, &ret, flags, pc); +} + +void +trace_graph_function(struct trace_array *tr, + unsigned long ip, unsigned long parent_ip, + unsigned long flags, int pc) +{ + __trace_graph_function(tr, ip, flags, pc); +} + void __trace_graph_return(struct trace_array *tr, struct ftrace_graph_ret *trace, unsigned long flags, @@ -649,8 +693,9 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s) /* Print nsecs (we don't want to exceed 7 numbers) */ if (len < 7) { - snprintf(nsecs_str, min(sizeof(nsecs_str), 8UL - len), "%03lu", - nsecs_rem); + size_t slen = min_t(size_t, sizeof(nsecs_str), 8UL - len); + + snprintf(nsecs_str, slen, "%03lu", nsecs_rem); ret = trace_seq_printf(s, ".%s", nsecs_str); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -855,6 +900,108 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, return 0; } +/* + * Entry check for irq code + * + * returns 1 if + * - we are inside irq code + * - we just extered irq code + * + * retunns 0 if + * - funcgraph-interrupts option is set + * - we are not inside irq code + */ +static int +check_irq_entry(struct trace_iterator *iter, u32 flags, + unsigned long addr, int depth) +{ + int cpu = iter->cpu; + int *depth_irq; + struct fgraph_data *data = iter->private; + + /* + * If we are either displaying irqs, or we got called as + * a graph event and private data does not exist, + * then we bypass the irq check. + */ + if ((flags & TRACE_GRAPH_PRINT_IRQS) || + (!data)) + return 0; + + depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq); + + /* + * We are inside the irq code + */ + if (*depth_irq >= 0) + return 1; + + if ((addr < (unsigned long)__irqentry_text_start) || + (addr >= (unsigned long)__irqentry_text_end)) + return 0; + + /* + * We are entering irq code. + */ + *depth_irq = depth; + return 1; +} + +/* + * Return check for irq code + * + * returns 1 if + * - we are inside irq code + * - we just left irq code + * + * returns 0 if + * - funcgraph-interrupts option is set + * - we are not inside irq code + */ +static int +check_irq_return(struct trace_iterator *iter, u32 flags, int depth) +{ + int cpu = iter->cpu; + int *depth_irq; + struct fgraph_data *data = iter->private; + + /* + * If we are either displaying irqs, or we got called as + * a graph event and private data does not exist, + * then we bypass the irq check. + */ + if ((flags & TRACE_GRAPH_PRINT_IRQS) || + (!data)) + return 0; + + depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq); + + /* + * We are not inside the irq code. + */ + if (*depth_irq == -1) + return 0; + + /* + * We are inside the irq code, and this is returning entry. + * Let's not trace it and clear the entry depth, since + * we are out of irq code. + * + * This condition ensures that we 'leave the irq code' once + * we are out of the entry depth. Thus protecting us from + * the RETURN entry loss. + */ + if (*depth_irq >= depth) { + *depth_irq = -1; + return 1; + } + + /* + * We are inside the irq code, and this is not the entry. + */ + return 1; +} + static enum print_line_t print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, struct trace_iterator *iter, u32 flags) @@ -865,6 +1012,9 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, static enum print_line_t ret; int cpu = iter->cpu; + if (check_irq_entry(iter, flags, call->func, call->depth)) + return TRACE_TYPE_HANDLED; + if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags)) return TRACE_TYPE_PARTIAL_LINE; @@ -902,6 +1052,9 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, int ret; int i; + if (check_irq_return(iter, flags, trace->depth)) + return TRACE_TYPE_HANDLED; + if (data) { struct fgraph_cpu_data *cpu_data; int cpu = iter->cpu; @@ -1054,7 +1207,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent, enum print_line_t -print_graph_function_flags(struct trace_iterator *iter, u32 flags) +__print_graph_function_flags(struct trace_iterator *iter, u32 flags) { struct ftrace_graph_ent_entry *field; struct fgraph_data *data = iter->private; @@ -1117,7 +1270,18 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags) static enum print_line_t print_graph_function(struct trace_iterator *iter) { - return print_graph_function_flags(iter, tracer_flags.val); + return __print_graph_function_flags(iter, tracer_flags.val); +} + +enum print_line_t print_graph_function_flags(struct trace_iterator *iter, + u32 flags) +{ + if (trace_flags & TRACE_ITER_LATENCY_FMT) + flags |= TRACE_GRAPH_PRINT_DURATION; + else + flags |= TRACE_GRAPH_PRINT_ABS_TIME; + + return __print_graph_function_flags(iter, flags); } static enum print_line_t @@ -1149,7 +1313,7 @@ static void print_lat_header(struct seq_file *s, u32 flags) seq_printf(s, "#%.*s|||| / \n", size, spaces); } -void print_graph_headers_flags(struct seq_file *s, u32 flags) +static void __print_graph_headers_flags(struct seq_file *s, u32 flags) { int lat = trace_flags & TRACE_ITER_LATENCY_FMT; @@ -1190,6 +1354,23 @@ void print_graph_headers(struct seq_file *s) print_graph_headers_flags(s, tracer_flags.val); } +void print_graph_headers_flags(struct seq_file *s, u32 flags) +{ + struct trace_iterator *iter = s->private; + + if (trace_flags & TRACE_ITER_LATENCY_FMT) { + /* print nothing if the buffers are empty */ + if (trace_empty(iter)) + return; + + print_trace_header(s, iter); + flags |= TRACE_GRAPH_PRINT_DURATION; + } else + flags |= TRACE_GRAPH_PRINT_ABS_TIME; + + __print_graph_headers_flags(s, flags); +} + void graph_trace_open(struct trace_iterator *iter) { /* pid and depth on the last trace processed */ @@ -1210,9 +1391,12 @@ void graph_trace_open(struct trace_iterator *iter) pid_t *pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid); int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth); int *ignore = &(per_cpu_ptr(data->cpu_data, cpu)->ignore); + int *depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq); + *pid = -1; *depth = 0; *ignore = 0; + *depth_irq = -1; } iter->private = data; @@ -1235,6 +1419,14 @@ void graph_trace_close(struct trace_iterator *iter) } } +static int func_graph_set_flag(u32 old_flags, u32 bit, int set) +{ + if (bit == TRACE_GRAPH_PRINT_IRQS) + ftrace_graph_skip_irqs = !set; + + return 0; +} + static struct trace_event_functions graph_functions = { .trace = print_graph_function_event, }; @@ -1261,6 +1453,7 @@ static struct tracer graph_trace __read_mostly = { .print_line = print_graph_function, .print_header = print_graph_headers, .flags = &tracer_flags, + .set_flag = func_graph_set_flag, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_function_graph, #endif diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 73a6b0601f2e..5cf8c602b880 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -87,14 +87,22 @@ static __cacheline_aligned_in_smp unsigned long max_sequence; #ifdef CONFIG_FUNCTION_TRACER /* - * irqsoff uses its own tracer function to keep the overhead down: + * Prologue for the preempt and irqs off function tracers. + * + * Returns 1 if it is OK to continue, and data->disabled is + * incremented. + * 0 if the trace is to be ignored, and data->disabled + * is kept the same. + * + * Note, this function is also used outside this ifdef but + * inside the #ifdef of the function graph tracer below. + * This is OK, since the function graph tracer is + * dependent on the function tracer. */ -static void -irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) +static int func_prolog_dec(struct trace_array *tr, + struct trace_array_cpu **data, + unsigned long *flags) { - struct trace_array *tr = irqsoff_trace; - struct trace_array_cpu *data; - unsigned long flags; long disabled; int cpu; @@ -106,18 +114,38 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) */ cpu = raw_smp_processor_id(); if (likely(!per_cpu(tracing_cpu, cpu))) - return; + return 0; - local_save_flags(flags); + local_save_flags(*flags); /* slight chance to get a false positive on tracing_cpu */ - if (!irqs_disabled_flags(flags)) - return; + if (!irqs_disabled_flags(*flags)) + return 0; - data = tr->data[cpu]; - disabled = atomic_inc_return(&data->disabled); + *data = tr->data[cpu]; + disabled = atomic_inc_return(&(*data)->disabled); if (likely(disabled == 1)) - trace_function(tr, ip, parent_ip, flags, preempt_count()); + return 1; + + atomic_dec(&(*data)->disabled); + + return 0; +} + +/* + * irqsoff uses its own tracer function to keep the overhead down: + */ +static void +irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) +{ + struct trace_array *tr = irqsoff_trace; + struct trace_array_cpu *data; + unsigned long flags; + + if (!func_prolog_dec(tr, &data, &flags)) + return; + + trace_function(tr, ip, parent_ip, flags, preempt_count()); atomic_dec(&data->disabled); } @@ -155,30 +183,16 @@ static int irqsoff_graph_entry(struct ftrace_graph_ent *trace) struct trace_array *tr = irqsoff_trace; struct trace_array_cpu *data; unsigned long flags; - long disabled; int ret; - int cpu; int pc; - cpu = raw_smp_processor_id(); - if (likely(!per_cpu(tracing_cpu, cpu))) + if (!func_prolog_dec(tr, &data, &flags)) return 0; - local_save_flags(flags); - /* slight chance to get a false positive on tracing_cpu */ - if (!irqs_disabled_flags(flags)) - return 0; - - data = tr->data[cpu]; - disabled = atomic_inc_return(&data->disabled); - - if (likely(disabled == 1)) { - pc = preempt_count(); - ret = __trace_graph_entry(tr, trace, flags, pc); - } else - ret = 0; - + pc = preempt_count(); + ret = __trace_graph_entry(tr, trace, flags, pc); atomic_dec(&data->disabled); + return ret; } @@ -187,27 +201,13 @@ static void irqsoff_graph_return(struct ftrace_graph_ret *trace) struct trace_array *tr = irqsoff_trace; struct trace_array_cpu *data; unsigned long flags; - long disabled; - int cpu; int pc; - cpu = raw_smp_processor_id(); - if (likely(!per_cpu(tracing_cpu, cpu))) + if (!func_prolog_dec(tr, &data, &flags)) return; - local_save_flags(flags); - /* slight chance to get a false positive on tracing_cpu */ - if (!irqs_disabled_flags(flags)) - return; - - data = tr->data[cpu]; - disabled = atomic_inc_return(&data->disabled); - - if (likely(disabled == 1)) { - pc = preempt_count(); - __trace_graph_return(tr, trace, flags, pc); - } - + pc = preempt_count(); + __trace_graph_return(tr, trace, flags, pc); atomic_dec(&data->disabled); } @@ -229,75 +229,33 @@ static void irqsoff_trace_close(struct trace_iterator *iter) static enum print_line_t irqsoff_print_line(struct trace_iterator *iter) { - u32 flags = GRAPH_TRACER_FLAGS; - - if (trace_flags & TRACE_ITER_LATENCY_FMT) - flags |= TRACE_GRAPH_PRINT_DURATION; - else - flags |= TRACE_GRAPH_PRINT_ABS_TIME; - /* * In graph mode call the graph tracer output function, * otherwise go with the TRACE_FN event handler */ if (is_graph()) - return print_graph_function_flags(iter, flags); + return print_graph_function_flags(iter, GRAPH_TRACER_FLAGS); return TRACE_TYPE_UNHANDLED; } static void irqsoff_print_header(struct seq_file *s) { - if (is_graph()) { - struct trace_iterator *iter = s->private; - u32 flags = GRAPH_TRACER_FLAGS; - - if (trace_flags & TRACE_ITER_LATENCY_FMT) { - /* print nothing if the buffers are empty */ - if (trace_empty(iter)) - return; - - print_trace_header(s, iter); - flags |= TRACE_GRAPH_PRINT_DURATION; - } else - flags |= TRACE_GRAPH_PRINT_ABS_TIME; - - print_graph_headers_flags(s, flags); - } else + if (is_graph()) + print_graph_headers_flags(s, GRAPH_TRACER_FLAGS); + else trace_default_header(s); } static void -trace_graph_function(struct trace_array *tr, - unsigned long ip, unsigned long flags, int pc) -{ - u64 time = trace_clock_local(); - struct ftrace_graph_ent ent = { - .func = ip, - .depth = 0, - }; - struct ftrace_graph_ret ret = { - .func = ip, - .depth = 0, - .calltime = time, - .rettime = time, - }; - - __trace_graph_entry(tr, &ent, flags, pc); - __trace_graph_return(tr, &ret, flags, pc); -} - -static void __trace_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, unsigned long flags, int pc) { - if (!is_graph()) + if (is_graph()) + trace_graph_function(tr, ip, parent_ip, flags, pc); + else trace_function(tr, ip, parent_ip, flags, pc); - else { - trace_graph_function(tr, parent_ip, flags, pc); - trace_graph_function(tr, ip, flags, pc); - } } #else diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 4086eae6e81b..7319559ed59f 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -31,48 +31,98 @@ static int wakeup_rt; static arch_spinlock_t wakeup_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; +static void wakeup_reset(struct trace_array *tr); static void __wakeup_reset(struct trace_array *tr); +static int wakeup_graph_entry(struct ftrace_graph_ent *trace); +static void wakeup_graph_return(struct ftrace_graph_ret *trace); static int save_lat_flag; +#define TRACE_DISPLAY_GRAPH 1 + +static struct tracer_opt trace_opts[] = { +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + /* display latency trace as call graph */ + { TRACER_OPT(display-graph, TRACE_DISPLAY_GRAPH) }, +#endif + { } /* Empty entry */ +}; + +static struct tracer_flags tracer_flags = { + .val = 0, + .opts = trace_opts, +}; + +#define is_graph() (tracer_flags.val & TRACE_DISPLAY_GRAPH) + #ifdef CONFIG_FUNCTION_TRACER + /* - * irqsoff uses its own tracer function to keep the overhead down: + * Prologue for the wakeup function tracers. + * + * Returns 1 if it is OK to continue, and preemption + * is disabled and data->disabled is incremented. + * 0 if the trace is to be ignored, and preemption + * is not disabled and data->disabled is + * kept the same. + * + * Note, this function is also used outside this ifdef but + * inside the #ifdef of the function graph tracer below. + * This is OK, since the function graph tracer is + * dependent on the function tracer. */ -static void -wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) +static int +func_prolog_preempt_disable(struct trace_array *tr, + struct trace_array_cpu **data, + int *pc) { - struct trace_array *tr = wakeup_trace; - struct trace_array_cpu *data; - unsigned long flags; long disabled; int cpu; - int pc; if (likely(!wakeup_task)) - return; + return 0; - pc = preempt_count(); + *pc = preempt_count(); preempt_disable_notrace(); cpu = raw_smp_processor_id(); if (cpu != wakeup_current_cpu) goto out_enable; - data = tr->data[cpu]; - disabled = atomic_inc_return(&data->disabled); + *data = tr->data[cpu]; + disabled = atomic_inc_return(&(*data)->disabled); if (unlikely(disabled != 1)) goto out; - local_irq_save(flags); + return 1; - trace_function(tr, ip, parent_ip, flags, pc); +out: + atomic_dec(&(*data)->disabled); + +out_enable: + preempt_enable_notrace(); + return 0; +} +/* + * wakeup uses its own tracer function to keep the overhead down: + */ +static void +wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) +{ + struct trace_array *tr = wakeup_trace; + struct trace_array_cpu *data; + unsigned long flags; + int pc; + + if (!func_prolog_preempt_disable(tr, &data, &pc)) + return; + + local_irq_save(flags); + trace_function(tr, ip, parent_ip, flags, pc); local_irq_restore(flags); - out: atomic_dec(&data->disabled); - out_enable: preempt_enable_notrace(); } @@ -82,6 +132,156 @@ static struct ftrace_ops trace_ops __read_mostly = }; #endif /* CONFIG_FUNCTION_TRACER */ +static int start_func_tracer(int graph) +{ + int ret; + + if (!graph) + ret = register_ftrace_function(&trace_ops); + else + ret = register_ftrace_graph(&wakeup_graph_return, + &wakeup_graph_entry); + + if (!ret && tracing_is_enabled()) + tracer_enabled = 1; + else + tracer_enabled = 0; + + return ret; +} + +static void stop_func_tracer(int graph) +{ + tracer_enabled = 0; + + if (!graph) + unregister_ftrace_function(&trace_ops); + else + unregister_ftrace_graph(); +} + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +static int wakeup_set_flag(u32 old_flags, u32 bit, int set) +{ + + if (!(bit & TRACE_DISPLAY_GRAPH)) + return -EINVAL; + + if (!(is_graph() ^ set)) + return 0; + + stop_func_tracer(!set); + + wakeup_reset(wakeup_trace); + tracing_max_latency = 0; + + return start_func_tracer(set); +} + +static int wakeup_graph_entry(struct ftrace_graph_ent *trace) +{ + struct trace_array *tr = wakeup_trace; + struct trace_array_cpu *data; + unsigned long flags; + int pc, ret = 0; + + if (!func_prolog_preempt_disable(tr, &data, &pc)) + return 0; + + local_save_flags(flags); + ret = __trace_graph_entry(tr, trace, flags, pc); + atomic_dec(&data->disabled); + preempt_enable_notrace(); + + return ret; +} + +static void wakeup_graph_return(struct ftrace_graph_ret *trace) +{ + struct trace_array *tr = wakeup_trace; + struct trace_array_cpu *data; + unsigned long flags; + int pc; + + if (!func_prolog_preempt_disable(tr, &data, &pc)) + return; + + local_save_flags(flags); + __trace_graph_return(tr, trace, flags, pc); + atomic_dec(&data->disabled); + + preempt_enable_notrace(); + return; +} + +static void wakeup_trace_open(struct trace_iterator *iter) +{ + if (is_graph()) + graph_trace_open(iter); +} + +static void wakeup_trace_close(struct trace_iterator *iter) +{ + if (iter->private) + graph_trace_close(iter); +} + +#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_PROC) + +static enum print_line_t wakeup_print_line(struct trace_iterator *iter) +{ + /* + * In graph mode call the graph tracer output function, + * otherwise go with the TRACE_FN event handler + */ + if (is_graph()) + return print_graph_function_flags(iter, GRAPH_TRACER_FLAGS); + + return TRACE_TYPE_UNHANDLED; +} + +static void wakeup_print_header(struct seq_file *s) +{ + if (is_graph()) + print_graph_headers_flags(s, GRAPH_TRACER_FLAGS); + else + trace_default_header(s); +} + +static void +__trace_function(struct trace_array *tr, + unsigned long ip, unsigned long parent_ip, + unsigned long flags, int pc) +{ + if (is_graph()) + trace_graph_function(tr, ip, parent_ip, flags, pc); + else + trace_function(tr, ip, parent_ip, flags, pc); +} +#else +#define __trace_function trace_function + +static int wakeup_set_flag(u32 old_flags, u32 bit, int set) +{ + return -EINVAL; +} + +static int wakeup_graph_entry(struct ftrace_graph_ent *trace) +{ + return -1; +} + +static enum print_line_t wakeup_print_line(struct trace_iterator *iter) +{ + return TRACE_TYPE_UNHANDLED; +} + +static void wakeup_graph_return(struct ftrace_graph_ret *trace) { } +static void wakeup_print_header(struct seq_file *s) { } +static void wakeup_trace_open(struct trace_iterator *iter) { } +static void wakeup_trace_close(struct trace_iterator *iter) { } +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + /* * Should this new latency be reported/recorded? */ @@ -152,7 +352,7 @@ probe_wakeup_sched_switch(void *ignore, /* The task we are waiting for is waking up */ data = wakeup_trace->data[wakeup_cpu]; - trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc); + __trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc); tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc); T0 = data->preempt_timestamp; @@ -252,7 +452,7 @@ probe_wakeup(void *ignore, struct task_struct *p, int success) * is not called by an assembly function (where as schedule is) * it should be safe to use it here. */ - trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc); + __trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc); out_locked: arch_spin_unlock(&wakeup_lock); @@ -303,12 +503,8 @@ static void start_wakeup_tracer(struct trace_array *tr) */ smp_wmb(); - register_ftrace_function(&trace_ops); - - if (tracing_is_enabled()) - tracer_enabled = 1; - else - tracer_enabled = 0; + if (start_func_tracer(is_graph())) + printk(KERN_ERR "failed to start wakeup tracer\n"); return; fail_deprobe_wake_new: @@ -320,7 +516,7 @@ fail_deprobe: static void stop_wakeup_tracer(struct trace_array *tr) { tracer_enabled = 0; - unregister_ftrace_function(&trace_ops); + stop_func_tracer(is_graph()); unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL); unregister_trace_sched_wakeup_new(probe_wakeup, NULL); unregister_trace_sched_wakeup(probe_wakeup, NULL); @@ -379,9 +575,15 @@ static struct tracer wakeup_tracer __read_mostly = .start = wakeup_tracer_start, .stop = wakeup_tracer_stop, .print_max = 1, + .print_header = wakeup_print_header, + .print_line = wakeup_print_line, + .flags = &tracer_flags, + .set_flag = wakeup_set_flag, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_wakeup, #endif + .open = wakeup_trace_open, + .close = wakeup_trace_close, .use_max_tr = 1, }; @@ -394,9 +596,15 @@ static struct tracer wakeup_rt_tracer __read_mostly = .stop = wakeup_tracer_stop, .wait_pipe = poll_wait_pipe, .print_max = 1, + .print_header = wakeup_print_header, + .print_line = wakeup_print_line, + .flags = &tracer_flags, + .set_flag = wakeup_set_flag, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_wakeup, #endif + .open = wakeup_trace_open, + .close = wakeup_trace_close, .use_max_tr = 1, }; diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c index a7cc3793baf6..209b379a4721 100644 --- a/kernel/trace/trace_workqueue.c +++ b/kernel/trace/trace_workqueue.c @@ -263,6 +263,11 @@ int __init trace_workqueue_early_init(void) { int ret, cpu; + for_each_possible_cpu(cpu) { + spin_lock_init(&workqueue_cpu_stat(cpu)->lock); + INIT_LIST_HEAD(&workqueue_cpu_stat(cpu)->list); + } + ret = register_trace_workqueue_insertion(probe_workqueue_insertion, NULL); if (ret) goto out; @@ -279,11 +284,6 @@ int __init trace_workqueue_early_init(void) if (ret) goto no_creation; - for_each_possible_cpu(cpu) { - spin_lock_init(&workqueue_cpu_stat(cpu)->lock); - INIT_LIST_HEAD(&workqueue_cpu_stat(cpu)->list); - } - return 0; no_creation: diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index c77f3eceea25..e95ee7f31d43 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -25,6 +25,7 @@ #include <linux/err.h> #include <linux/slab.h> #include <linux/sched.h> +#include <linux/jump_label.h> extern struct tracepoint __start___tracepoints[]; extern struct tracepoint __stop___tracepoints[]; @@ -263,7 +264,13 @@ static void set_tracepoint(struct tracepoint_entry **entry, * is used. */ rcu_assign_pointer(elem->funcs, (*entry)->funcs); - elem->state = active; + if (!elem->state && active) { + jump_label_enable(&elem->state); + elem->state = active; + } else if (elem->state && !active) { + jump_label_disable(&elem->state); + elem->state = active; + } } /* @@ -277,7 +284,10 @@ static void disable_tracepoint(struct tracepoint *elem) if (elem->unregfunc && elem->state) elem->unregfunc(); - elem->state = 0; + if (elem->state) { + jump_label_disable(&elem->state); + elem->state = 0; + } rcu_assign_pointer(elem->funcs, NULL); } diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 7f9c3c52ecc1..bafba687a6d8 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -43,7 +43,6 @@ static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); #endif -static int __read_mostly did_panic; static int __initdata no_watchdog; @@ -187,18 +186,6 @@ static int is_softlockup(unsigned long touch_ts) return 0; } -static int -watchdog_panic(struct notifier_block *this, unsigned long event, void *ptr) -{ - did_panic = 1; - - return NOTIFY_DONE; -} - -static struct notifier_block panic_block = { - .notifier_call = watchdog_panic, -}; - #ifdef CONFIG_HARDLOCKUP_DETECTOR static struct perf_event_attr wd_hw_attr = { .type = PERF_TYPE_HARDWARE, @@ -209,7 +196,7 @@ static struct perf_event_attr wd_hw_attr = { }; /* Callback function for perf event subsystem */ -void watchdog_overflow_callback(struct perf_event *event, int nmi, +static void watchdog_overflow_callback(struct perf_event *event, int nmi, struct perf_sample_data *data, struct pt_regs *regs) { @@ -371,14 +358,14 @@ static int watchdog_nmi_enable(int cpu) /* Try to register using hardware perf events */ wd_attr = &wd_hw_attr; wd_attr->sample_period = hw_nmi_get_sample_period(); - event = perf_event_create_kernel_counter(wd_attr, cpu, -1, watchdog_overflow_callback); + event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback); if (!IS_ERR(event)) { printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n"); goto out_save; } printk(KERN_ERR "NMI watchdog failed to create perf event on cpu%i: %p\n", cpu, event); - return -1; + return PTR_ERR(event); /* success path */ out_save: @@ -422,17 +409,19 @@ static int watchdog_prepare_cpu(int cpu) static int watchdog_enable(int cpu) { struct task_struct *p = per_cpu(softlockup_watchdog, cpu); + int err; /* enable the perf event */ - if (watchdog_nmi_enable(cpu) != 0) - return -1; + err = watchdog_nmi_enable(cpu); + if (err) + return err; /* create the watchdog thread */ if (!p) { p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu); if (IS_ERR(p)) { printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu); - return -1; + return PTR_ERR(p); } kthread_bind(p, cpu); per_cpu(watchdog_touch_ts, cpu) = 0; @@ -484,6 +473,9 @@ static void watchdog_disable_all_cpus(void) { int cpu; + if (no_watchdog) + return; + for_each_online_cpu(cpu) watchdog_disable(cpu); @@ -526,17 +518,16 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { int hotcpu = (unsigned long)hcpu; + int err = 0; switch (action) { case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: - if (watchdog_prepare_cpu(hotcpu)) - return NOTIFY_BAD; + err = watchdog_prepare_cpu(hotcpu); break; case CPU_ONLINE: case CPU_ONLINE_FROZEN: - if (watchdog_enable(hotcpu)) - return NOTIFY_BAD; + err = watchdog_enable(hotcpu); break; #ifdef CONFIG_HOTPLUG_CPU case CPU_UP_CANCELED: @@ -549,7 +540,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) break; #endif /* CONFIG_HOTPLUG_CPU */ } - return NOTIFY_OK; + return notifier_from_errno(err); } static struct notifier_block __cpuinitdata cpu_nfb = { @@ -565,13 +556,11 @@ static int __init spawn_watchdog_task(void) return 0; err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); - WARN_ON(err == NOTIFY_BAD); + WARN_ON(notifier_to_errno(err)); cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); register_cpu_notifier(&cpu_nfb); - atomic_notifier_chain_register(&panic_notifier_list, &panic_block); - return 0; } early_initcall(spawn_watchdog_task); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 1b4afd2e6ca0..21ac83070a80 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -482,6 +482,7 @@ config PROVE_LOCKING select DEBUG_SPINLOCK select DEBUG_MUTEXES select DEBUG_LOCK_ALLOC + select TRACE_IRQFLAGS default n help This feature enables the kernel to prove that all locking @@ -539,6 +540,23 @@ config PROVE_RCU_REPEATEDLY disabling, allowing multiple RCU-lockdep warnings to be printed on a single reboot. + Say Y to allow multiple RCU-lockdep warnings per boot. + + Say N if you are unsure. + +config SPARSE_RCU_POINTER + bool "RCU debugging: sparse-based checks for pointer usage" + default n + help + This feature enables the __rcu sparse annotation for + RCU-protected pointers. This annotation will cause sparse + to flag any non-RCU used of annotated pointers. This can be + helpful when debugging RCU usage. Please note that this feature + is not intended to enforce code cleanliness; it is instead merely + a debugging aid. + + Say Y to make sparse flag questionable use of RCU-protected pointers + Say N if you are unsure. config LOCKDEP @@ -579,11 +597,10 @@ config DEBUG_LOCKDEP of more runtime overhead. config TRACE_IRQFLAGS - depends on DEBUG_KERNEL bool - default y - depends on TRACE_IRQFLAGS_SUPPORT - depends on PROVE_LOCKING + help + Enables hooks to interrupt enabling and disabling for + either tracing or lock debugging. config DEBUG_SPINLOCK_SLEEP bool "Spinlock debugging: sleep-inside-spinlock checking" @@ -832,6 +849,30 @@ config RCU_CPU_STALL_DETECTOR Say Y if you are unsure. +config RCU_CPU_STALL_TIMEOUT + int "RCU CPU stall timeout in seconds" + depends on RCU_CPU_STALL_DETECTOR + range 3 300 + default 60 + help + If a given RCU grace period extends more than the specified + number of seconds, a CPU stall warning is printed. If the + RCU grace period persists, additional CPU stall warnings are + printed at more widely spaced intervals. + +config RCU_CPU_STALL_DETECTOR_RUNNABLE + bool "RCU CPU stall checking starts automatically at boot" + depends on RCU_CPU_STALL_DETECTOR + default y + help + If set, start checking for RCU CPU stalls immediately on + boot. Otherwise, RCU CPU stall checking must be manually + enabled. + + Say Y if you are unsure. + + Say N if you wish to suppress RCU CPU stall checking during boot. + config RCU_CPU_STALL_VERBOSE bool "Print additional per-task information for RCU_CPU_STALL_DETECTOR" depends on RCU_CPU_STALL_DETECTOR && TREE_PREEMPT_RCU diff --git a/lib/dynamic_debug.c b/lib/dynamic_debug.c index 02afc2533728..7bd6df781ce5 100644 --- a/lib/dynamic_debug.c +++ b/lib/dynamic_debug.c @@ -26,19 +26,11 @@ #include <linux/dynamic_debug.h> #include <linux/debugfs.h> #include <linux/slab.h> +#include <linux/jump_label.h> extern struct _ddebug __start___verbose[]; extern struct _ddebug __stop___verbose[]; -/* dynamic_debug_enabled, and dynamic_debug_enabled2 are bitmasks in which - * bit n is set to 1 if any modname hashes into the bucket n, 0 otherwise. They - * use independent hash functions, to reduce the chance of false positives. - */ -long long dynamic_debug_enabled; -EXPORT_SYMBOL_GPL(dynamic_debug_enabled); -long long dynamic_debug_enabled2; -EXPORT_SYMBOL_GPL(dynamic_debug_enabled2); - struct ddebug_table { struct list_head link; char *mod_name; @@ -88,26 +80,6 @@ static char *ddebug_describe_flags(struct _ddebug *dp, char *buf, } /* - * must be called with ddebug_lock held - */ - -static int disabled_hash(char hash, bool first_table) -{ - struct ddebug_table *dt; - char table_hash_value; - - list_for_each_entry(dt, &ddebug_tables, link) { - if (first_table) - table_hash_value = dt->ddebugs->primary_hash; - else - table_hash_value = dt->ddebugs->secondary_hash; - if (dt->num_enabled && (hash == table_hash_value)) - return 0; - } - return 1; -} - -/* * Search the tables for _ddebug's which match the given * `query' and apply the `flags' and `mask' to them. Tells * the user which ddebug's were changed, or whether none @@ -170,17 +142,9 @@ static void ddebug_change(const struct ddebug_query *query, dt->num_enabled++; dp->flags = newflags; if (newflags) { - dynamic_debug_enabled |= - (1LL << dp->primary_hash); - dynamic_debug_enabled2 |= - (1LL << dp->secondary_hash); + jump_label_enable(&dp->enabled); } else { - if (disabled_hash(dp->primary_hash, true)) - dynamic_debug_enabled &= - ~(1LL << dp->primary_hash); - if (disabled_hash(dp->secondary_hash, false)) - dynamic_debug_enabled2 &= - ~(1LL << dp->secondary_hash); + jump_label_disable(&dp->enabled); } if (verbose) printk(KERN_INFO diff --git a/lib/radix-tree.c b/lib/radix-tree.c index efd16fa80b1c..6f412ab4c24f 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -49,7 +49,7 @@ struct radix_tree_node { unsigned int height; /* Height from the bottom */ unsigned int count; struct rcu_head rcu_head; - void *slots[RADIX_TREE_MAP_SIZE]; + void __rcu *slots[RADIX_TREE_MAP_SIZE]; unsigned long tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS]; }; diff --git a/net/Kconfig b/net/Kconfig index e926884c1675..55fd82e9ffd9 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -293,6 +293,7 @@ source "net/wimax/Kconfig" source "net/rfkill/Kconfig" source "net/9p/Kconfig" source "net/caif/Kconfig" +source "net/ceph/Kconfig" endif # if NET diff --git a/net/Makefile b/net/Makefile index ea60fbce9b1b..6b7bfd7f1416 100644 --- a/net/Makefile +++ b/net/Makefile @@ -68,3 +68,4 @@ obj-$(CONFIG_SYSCTL) += sysctl_net.o endif obj-$(CONFIG_WIMAX) += wimax/ obj-$(CONFIG_DNS_RESOLVER) += dns_resolver/ +obj-$(CONFIG_CEPH_LIB) += ceph/ diff --git a/net/atm/mpc.c b/net/atm/mpc.c index 622b471e14e0..74bcc662c3dd 100644 --- a/net/atm/mpc.c +++ b/net/atm/mpc.c @@ -778,7 +778,7 @@ static void mpc_push(struct atm_vcc *vcc, struct sk_buff *skb) eg->packets_rcvd++; mpc->eg_ops->put(eg); - memset(ATM_SKB(skb), 0, sizeof(struct atm_skb_data)); + memset(ATM_SKB(new_skb), 0, sizeof(struct atm_skb_data)); netif_rx(new_skb); } diff --git a/net/ceph/Kconfig b/net/ceph/Kconfig new file mode 100644 index 000000000000..ad424049b0cf --- /dev/null +++ b/net/ceph/Kconfig @@ -0,0 +1,28 @@ +config CEPH_LIB + tristate "Ceph core library (EXPERIMENTAL)" + depends on INET && EXPERIMENTAL + select LIBCRC32C + select CRYPTO_AES + select CRYPTO + default n + help + Choose Y or M here to include cephlib, which provides the + common functionality to both the Ceph filesystem and + to the rados block device (rbd). + + More information at http://ceph.newdream.net/. + + If unsure, say N. + +config CEPH_LIB_PRETTYDEBUG + bool "Include file:line in ceph debug output" + depends on CEPH_LIB + default n + help + If you say Y here, debug output will include a filename and + line to aid debugging. This increases kernel size and slows + execution slightly when debug call sites are enabled (e.g., + via CONFIG_DYNAMIC_DEBUG). + + If unsure, say N. + diff --git a/net/ceph/Makefile b/net/ceph/Makefile new file mode 100644 index 000000000000..aab1cabb8035 --- /dev/null +++ b/net/ceph/Makefile @@ -0,0 +1,37 @@ +# +# Makefile for CEPH filesystem. +# + +ifneq ($(KERNELRELEASE),) + +obj-$(CONFIG_CEPH_LIB) += libceph.o + +libceph-objs := ceph_common.o messenger.o msgpool.o buffer.o pagelist.o \ + mon_client.o \ + osd_client.o osdmap.o crush/crush.o crush/mapper.o crush/hash.o \ + debugfs.o \ + auth.o auth_none.o \ + crypto.o armor.o \ + auth_x.o \ + ceph_fs.o ceph_strings.o ceph_hash.o \ + pagevec.o + +else +#Otherwise we were called directly from the command +# line; invoke the kernel build system. + +KERNELDIR ?= /lib/modules/$(shell uname -r)/build +PWD := $(shell pwd) + +default: all + +all: + $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_LIB=m modules + +modules_install: + $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_LIB=m modules_install + +clean: + $(MAKE) -C $(KERNELDIR) M=$(PWD) clean + +endif diff --git a/fs/ceph/armor.c b/net/ceph/armor.c index eb2a666b0be7..eb2a666b0be7 100644 --- a/fs/ceph/armor.c +++ b/net/ceph/armor.c diff --git a/fs/ceph/auth.c b/net/ceph/auth.c index 6d2e30600627..549c1f43e1d5 100644 --- a/fs/ceph/auth.c +++ b/net/ceph/auth.c @@ -1,16 +1,16 @@ -#include "ceph_debug.h" +#include <linux/ceph/ceph_debug.h> #include <linux/module.h> #include <linux/err.h> #include <linux/slab.h> -#include "types.h" +#include <linux/ceph/types.h> +#include <linux/ceph/decode.h> +#include <linux/ceph/libceph.h> +#include <linux/ceph/messenger.h> #include "auth_none.h" #include "auth_x.h" -#include "decode.h" -#include "super.h" -#include "messenger.h" /* * get protocol handler diff --git a/fs/ceph/auth_none.c b/net/ceph/auth_none.c index ad1dc21286c7..214c2bb43d62 100644 --- a/fs/ceph/auth_none.c +++ b/net/ceph/auth_none.c @@ -1,14 +1,15 @@ -#include "ceph_debug.h" +#include <linux/ceph/ceph_debug.h> #include <linux/err.h> #include <linux/module.h> #include <linux/random.h> #include <linux/slab.h> +#include <linux/ceph/decode.h> +#include <linux/ceph/auth.h> + #include "auth_none.h" -#include "auth.h" -#include "decode.h" static void reset(struct ceph_auth_client *ac) { diff --git a/fs/ceph/auth_none.h b/net/ceph/auth_none.h index 8164df1a08be..ed7d088b1bc9 100644 --- a/fs/ceph/auth_none.h +++ b/net/ceph/auth_none.h @@ -2,8 +2,7 @@ #define _FS_CEPH_AUTH_NONE_H #include <linux/slab.h> - -#include "auth.h" +#include <linux/ceph/auth.h> /* * null security mode. diff --git a/fs/ceph/auth_x.c b/net/ceph/auth_x.c index a2d002cbdec2..7fd5dfcf6e18 100644 --- a/fs/ceph/auth_x.c +++ b/net/ceph/auth_x.c @@ -1,16 +1,17 @@ -#include "ceph_debug.h" +#include <linux/ceph/ceph_debug.h> #include <linux/err.h> #include <linux/module.h> #include <linux/random.h> #include <linux/slab.h> +#include <linux/ceph/decode.h> +#include <linux/ceph/auth.h> + +#include "crypto.h" #include "auth_x.h" #include "auth_x_protocol.h" -#include "crypto.h" -#include "auth.h" -#include "decode.h" #define TEMP_TICKET_BUF_LEN 256 diff --git a/fs/ceph/auth_x.h b/net/ceph/auth_x.h index ff6f8180e681..e02da7a5c5a1 100644 --- a/fs/ceph/auth_x.h +++ b/net/ceph/auth_x.h @@ -3,8 +3,9 @@ #include <linux/rbtree.h> +#include <linux/ceph/auth.h> + #include "crypto.h" -#include "auth.h" #include "auth_x_protocol.h" /* diff --git a/fs/ceph/auth_x_protocol.h b/net/ceph/auth_x_protocol.h index 671d30576c4f..671d30576c4f 100644 --- a/fs/ceph/auth_x_protocol.h +++ b/net/ceph/auth_x_protocol.h diff --git a/fs/ceph/buffer.c b/net/ceph/buffer.c index cd39f17021de..53d8abfa25d5 100644 --- a/fs/ceph/buffer.c +++ b/net/ceph/buffer.c @@ -1,10 +1,11 @@ -#include "ceph_debug.h" +#include <linux/ceph/ceph_debug.h> +#include <linux/module.h> #include <linux/slab.h> -#include "buffer.h" -#include "decode.h" +#include <linux/ceph/buffer.h> +#include <linux/ceph/decode.h> struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp) { @@ -32,6 +33,7 @@ struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp) dout("buffer_new %p\n", b); return b; } +EXPORT_SYMBOL(ceph_buffer_new); void ceph_buffer_release(struct kref *kref) { @@ -46,6 +48,7 @@ void ceph_buffer_release(struct kref *kref) } kfree(b); } +EXPORT_SYMBOL(ceph_buffer_release); int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end) { diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c new file mode 100644 index 000000000000..f3e4a13fea0c --- /dev/null +++ b/net/ceph/ceph_common.c @@ -0,0 +1,529 @@ + +#include <linux/ceph/ceph_debug.h> +#include <linux/backing-dev.h> +#include <linux/ctype.h> +#include <linux/fs.h> +#include <linux/inet.h> +#include <linux/in6.h> +#include <linux/module.h> +#include <linux/mount.h> +#include <linux/parser.h> +#include <linux/sched.h> +#include <linux/seq_file.h> +#include <linux/slab.h> +#include <linux/statfs.h> +#include <linux/string.h> + + +#include <linux/ceph/libceph.h> +#include <linux/ceph/debugfs.h> +#include <linux/ceph/decode.h> +#include <linux/ceph/mon_client.h> +#include <linux/ceph/auth.h> + + + +/* + * find filename portion of a path (/foo/bar/baz -> baz) + */ +const char *ceph_file_part(const char *s, int len) +{ + const char *e = s + len; + + while (e != s && *(e-1) != '/') + e--; + return e; +} +EXPORT_SYMBOL(ceph_file_part); + +const char *ceph_msg_type_name(int type) +{ + switch (type) { + case CEPH_MSG_SHUTDOWN: return "shutdown"; + case CEPH_MSG_PING: return "ping"; + case CEPH_MSG_AUTH: return "auth"; + case CEPH_MSG_AUTH_REPLY: return "auth_reply"; + case CEPH_MSG_MON_MAP: return "mon_map"; + case CEPH_MSG_MON_GET_MAP: return "mon_get_map"; + case CEPH_MSG_MON_SUBSCRIBE: return "mon_subscribe"; + case CEPH_MSG_MON_SUBSCRIBE_ACK: return "mon_subscribe_ack"; + case CEPH_MSG_STATFS: return "statfs"; + case CEPH_MSG_STATFS_REPLY: return "statfs_reply"; + case CEPH_MSG_MDS_MAP: return "mds_map"; + case CEPH_MSG_CLIENT_SESSION: return "client_session"; + case CEPH_MSG_CLIENT_RECONNECT: return "client_reconnect"; + case CEPH_MSG_CLIENT_REQUEST: return "client_request"; + case CEPH_MSG_CLIENT_REQUEST_FORWARD: return "client_request_forward"; + case CEPH_MSG_CLIENT_REPLY: return "client_reply"; + case CEPH_MSG_CLIENT_CAPS: return "client_caps"; + case CEPH_MSG_CLIENT_CAPRELEASE: return "client_cap_release"; + case CEPH_MSG_CLIENT_SNAP: return "client_snap"; + case CEPH_MSG_CLIENT_LEASE: return "client_lease"; + case CEPH_MSG_OSD_MAP: return "osd_map"; + case CEPH_MSG_OSD_OP: return "osd_op"; + case CEPH_MSG_OSD_OPREPLY: return "osd_opreply"; + default: return "unknown"; + } +} +EXPORT_SYMBOL(ceph_msg_type_name); + +/* + * Initially learn our fsid, or verify an fsid matches. + */ +int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid) +{ + if (client->have_fsid) { + if (ceph_fsid_compare(&client->fsid, fsid)) { + pr_err("bad fsid, had %pU got %pU", + &client->fsid, fsid); + return -1; + } + } else { + pr_info("client%lld fsid %pU\n", ceph_client_id(client), fsid); + memcpy(&client->fsid, fsid, sizeof(*fsid)); + ceph_debugfs_client_init(client); + client->have_fsid = true; + } + return 0; +} +EXPORT_SYMBOL(ceph_check_fsid); + +static int strcmp_null(const char *s1, const char *s2) +{ + if (!s1 && !s2) + return 0; + if (s1 && !s2) + return -1; + if (!s1 && s2) + return 1; + return strcmp(s1, s2); +} + +int ceph_compare_options(struct ceph_options *new_opt, + struct ceph_client *client) +{ + struct ceph_options *opt1 = new_opt; + struct ceph_options *opt2 = client->options; + int ofs = offsetof(struct ceph_options, mon_addr); + int i; + int ret; + + ret = memcmp(opt1, opt2, ofs); + if (ret) + return ret; + + ret = strcmp_null(opt1->name, opt2->name); + if (ret) + return ret; + + ret = strcmp_null(opt1->secret, opt2->secret); + if (ret) + return ret; + + /* any matching mon ip implies a match */ + for (i = 0; i < opt1->num_mon; i++) { + if (ceph_monmap_contains(client->monc.monmap, + &opt1->mon_addr[i])) + return 0; + } + return -1; +} +EXPORT_SYMBOL(ceph_compare_options); + + +static int parse_fsid(const char *str, struct ceph_fsid *fsid) +{ + int i = 0; + char tmp[3]; + int err = -EINVAL; + int d; + + dout("parse_fsid '%s'\n", str); + tmp[2] = 0; + while (*str && i < 16) { + if (ispunct(*str)) { + str++; + continue; + } + if (!isxdigit(str[0]) || !isxdigit(str[1])) + break; + tmp[0] = str[0]; + tmp[1] = str[1]; + if (sscanf(tmp, "%x", &d) < 1) + break; + fsid->fsid[i] = d & 0xff; + i++; + str += 2; + } + + if (i == 16) + err = 0; + dout("parse_fsid ret %d got fsid %pU", err, fsid); + return err; +} + +/* + * ceph options + */ +enum { + Opt_osdtimeout, + Opt_osdkeepalivetimeout, + Opt_mount_timeout, + Opt_osd_idle_ttl, + Opt_last_int, + /* int args above */ + Opt_fsid, + Opt_name, + Opt_secret, + Opt_ip, + Opt_last_string, + /* string args above */ + Opt_noshare, + Opt_nocrc, +}; + +static match_table_t opt_tokens = { + {Opt_osdtimeout, "osdtimeout=%d"}, + {Opt_osdkeepalivetimeout, "osdkeepalive=%d"}, + {Opt_mount_timeout, "mount_timeout=%d"}, + {Opt_osd_idle_ttl, "osd_idle_ttl=%d"}, + /* int args above */ + {Opt_fsid, "fsid=%s"}, + {Opt_name, "name=%s"}, + {Opt_secret, "secret=%s"}, + {Opt_ip, "ip=%s"}, + /* string args above */ + {Opt_noshare, "noshare"}, + {Opt_nocrc, "nocrc"}, + {-1, NULL} +}; + +void ceph_destroy_options(struct ceph_options *opt) +{ + dout("destroy_options %p\n", opt); + kfree(opt->name); + kfree(opt->secret); + kfree(opt); +} +EXPORT_SYMBOL(ceph_destroy_options); + +int ceph_parse_options(struct ceph_options **popt, char *options, + const char *dev_name, const char *dev_name_end, + int (*parse_extra_token)(char *c, void *private), + void *private) +{ + struct ceph_options *opt; + const char *c; + int err = -ENOMEM; + substring_t argstr[MAX_OPT_ARGS]; + + opt = kzalloc(sizeof(*opt), GFP_KERNEL); + if (!opt) + return err; + opt->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*opt->mon_addr), + GFP_KERNEL); + if (!opt->mon_addr) + goto out; + + dout("parse_options %p options '%s' dev_name '%s'\n", opt, options, + dev_name); + + /* start with defaults */ + opt->flags = CEPH_OPT_DEFAULT; + opt->osd_timeout = CEPH_OSD_TIMEOUT_DEFAULT; + opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT; + opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */ + opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; /* seconds */ + + /* get mon ip(s) */ + /* ip1[:port1][,ip2[:port2]...] */ + err = ceph_parse_ips(dev_name, dev_name_end, opt->mon_addr, + CEPH_MAX_MON, &opt->num_mon); + if (err < 0) + goto out; + + /* parse mount options */ + while ((c = strsep(&options, ",")) != NULL) { + int token, intval, ret; + if (!*c) + continue; + err = -EINVAL; + token = match_token((char *)c, opt_tokens, argstr); + if (token < 0 && parse_extra_token) { + /* extra? */ + err = parse_extra_token((char *)c, private); + if (err < 0) { + pr_err("bad option at '%s'\n", c); + goto out; + } + continue; + } + if (token < Opt_last_int) { + ret = match_int(&argstr[0], &intval); + if (ret < 0) { + pr_err("bad mount option arg (not int) " + "at '%s'\n", c); + continue; + } + dout("got int token %d val %d\n", token, intval); + } else if (token > Opt_last_int && token < Opt_last_string) { + dout("got string token %d val %s\n", token, + argstr[0].from); + } else { + dout("got token %d\n", token); + } + switch (token) { + case Opt_ip: + err = ceph_parse_ips(argstr[0].from, + argstr[0].to, + &opt->my_addr, + 1, NULL); + if (err < 0) + goto out; + opt->flags |= CEPH_OPT_MYIP; + break; + + case Opt_fsid: + err = parse_fsid(argstr[0].from, &opt->fsid); + if (err == 0) + opt->flags |= CEPH_OPT_FSID; + break; + case Opt_name: + opt->name = kstrndup(argstr[0].from, + argstr[0].to-argstr[0].from, + GFP_KERNEL); + break; + case Opt_secret: + opt->secret = kstrndup(argstr[0].from, + argstr[0].to-argstr[0].from, + GFP_KERNEL); + break; + + /* misc */ + case Opt_osdtimeout: + opt->osd_timeout = intval; + break; + case Opt_osdkeepalivetimeout: + opt->osd_keepalive_timeout = intval; + break; + case Opt_osd_idle_ttl: + opt->osd_idle_ttl = intval; + break; + case Opt_mount_timeout: + opt->mount_timeout = intval; + break; + + case Opt_noshare: + opt->flags |= CEPH_OPT_NOSHARE; + break; + + case Opt_nocrc: + opt->flags |= CEPH_OPT_NOCRC; + break; + + default: + BUG_ON(token); + } + } + + /* success */ + *popt = opt; + return 0; + +out: + ceph_destroy_options(opt); + return err; +} +EXPORT_SYMBOL(ceph_parse_options); + +u64 ceph_client_id(struct ceph_client *client) +{ + return client->monc.auth->global_id; +} +EXPORT_SYMBOL(ceph_client_id); + +/* + * create a fresh client instance + */ +struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private) +{ + struct ceph_client *client; + int err = -ENOMEM; + + client = kzalloc(sizeof(*client), GFP_KERNEL); + if (client == NULL) + return ERR_PTR(-ENOMEM); + + client->private = private; + client->options = opt; + + mutex_init(&client->mount_mutex); + init_waitqueue_head(&client->auth_wq); + client->auth_err = 0; + + client->extra_mon_dispatch = NULL; + client->supported_features = CEPH_FEATURE_SUPPORTED_DEFAULT; + client->required_features = CEPH_FEATURE_REQUIRED_DEFAULT; + + client->msgr = NULL; + + /* subsystems */ + err = ceph_monc_init(&client->monc, client); + if (err < 0) + goto fail; + err = ceph_osdc_init(&client->osdc, client); + if (err < 0) + goto fail_monc; + + return client; + +fail_monc: + ceph_monc_stop(&client->monc); +fail: + kfree(client); + return ERR_PTR(err); +} +EXPORT_SYMBOL(ceph_create_client); + +void ceph_destroy_client(struct ceph_client *client) +{ + dout("destroy_client %p\n", client); + + /* unmount */ + ceph_osdc_stop(&client->osdc); + + /* + * make sure mds and osd connections close out before destroying + * the auth module, which is needed to free those connections' + * ceph_authorizers. + */ + ceph_msgr_flush(); + + ceph_monc_stop(&client->monc); + + ceph_debugfs_client_cleanup(client); + + if (client->msgr) + ceph_messenger_destroy(client->msgr); + + ceph_destroy_options(client->options); + + kfree(client); + dout("destroy_client %p done\n", client); +} +EXPORT_SYMBOL(ceph_destroy_client); + +/* + * true if we have the mon map (and have thus joined the cluster) + */ +static int have_mon_and_osd_map(struct ceph_client *client) +{ + return client->monc.monmap && client->monc.monmap->epoch && + client->osdc.osdmap && client->osdc.osdmap->epoch; +} + +/* + * mount: join the ceph cluster, and open root directory. + */ +int __ceph_open_session(struct ceph_client *client, unsigned long started) +{ + struct ceph_entity_addr *myaddr = NULL; + int err; + unsigned long timeout = client->options->mount_timeout * HZ; + + /* initialize the messenger */ + if (client->msgr == NULL) { + if (ceph_test_opt(client, MYIP)) + myaddr = &client->options->my_addr; + client->msgr = ceph_messenger_create(myaddr, + client->supported_features, + client->required_features); + if (IS_ERR(client->msgr)) { + client->msgr = NULL; + return PTR_ERR(client->msgr); + } + client->msgr->nocrc = ceph_test_opt(client, NOCRC); + } + + /* open session, and wait for mon and osd maps */ + err = ceph_monc_open_session(&client->monc); + if (err < 0) + return err; + + while (!have_mon_and_osd_map(client)) { + err = -EIO; + if (timeout && time_after_eq(jiffies, started + timeout)) + return err; + + /* wait */ + dout("mount waiting for mon_map\n"); + err = wait_event_interruptible_timeout(client->auth_wq, + have_mon_and_osd_map(client) || (client->auth_err < 0), + timeout); + if (err == -EINTR || err == -ERESTARTSYS) + return err; + if (client->auth_err < 0) + return client->auth_err; + } + + return 0; +} +EXPORT_SYMBOL(__ceph_open_session); + + +int ceph_open_session(struct ceph_client *client) +{ + int ret; + unsigned long started = jiffies; /* note the start time */ + + dout("open_session start\n"); + mutex_lock(&client->mount_mutex); + + ret = __ceph_open_session(client, started); + + mutex_unlock(&client->mount_mutex); + return ret; +} +EXPORT_SYMBOL(ceph_open_session); + + +static int __init init_ceph_lib(void) +{ + int ret = 0; + + ret = ceph_debugfs_init(); + if (ret < 0) + goto out; + + ret = ceph_msgr_init(); + if (ret < 0) + goto out_debugfs; + + pr_info("loaded (mon/osd proto %d/%d, osdmap %d/%d %d/%d)\n", + CEPH_MONC_PROTOCOL, CEPH_OSDC_PROTOCOL, + CEPH_OSDMAP_VERSION, CEPH_OSDMAP_VERSION_EXT, + CEPH_OSDMAP_INC_VERSION, CEPH_OSDMAP_INC_VERSION_EXT); + + return 0; + +out_debugfs: + ceph_debugfs_cleanup(); +out: + return ret; +} + +static void __exit exit_ceph_lib(void) +{ + dout("exit_ceph_lib\n"); + ceph_msgr_exit(); + ceph_debugfs_cleanup(); +} + +module_init(init_ceph_lib); +module_exit(exit_ceph_lib); + +MODULE_AUTHOR("Sage Weil <sage@newdream.net>"); +MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>"); +MODULE_AUTHOR("Patience Warnick <patience@newdream.net>"); +MODULE_DESCRIPTION("Ceph filesystem for Linux"); +MODULE_LICENSE("GPL"); diff --git a/fs/ceph/ceph_fs.c b/net/ceph/ceph_fs.c index 3ac6cc7c1156..a3a3a31d3c37 100644 --- a/fs/ceph/ceph_fs.c +++ b/net/ceph/ceph_fs.c @@ -1,7 +1,8 @@ /* * Some non-inline ceph helpers */ -#include "types.h" +#include <linux/module.h> +#include <linux/ceph/types.h> /* * return true if @layout appears to be valid @@ -52,6 +53,7 @@ int ceph_flags_to_mode(int flags) return mode; } +EXPORT_SYMBOL(ceph_flags_to_mode); int ceph_caps_for_mode(int mode) { @@ -70,3 +72,4 @@ int ceph_caps_for_mode(int mode) return caps; } +EXPORT_SYMBOL(ceph_caps_for_mode); diff --git a/fs/ceph/ceph_hash.c b/net/ceph/ceph_hash.c index bd570015d147..815ef8826796 100644 --- a/fs/ceph/ceph_hash.c +++ b/net/ceph/ceph_hash.c @@ -1,5 +1,5 @@ -#include "types.h" +#include <linux/ceph/types.h> /* * Robert Jenkin's hash function. diff --git a/net/ceph/ceph_strings.c b/net/ceph/ceph_strings.c new file mode 100644 index 000000000000..3fbda04de29c --- /dev/null +++ b/net/ceph/ceph_strings.c @@ -0,0 +1,84 @@ +/* + * Ceph string constants + */ +#include <linux/module.h> +#include <linux/ceph/types.h> + +const char *ceph_entity_type_name(int type) +{ + switch (type) { + case CEPH_ENTITY_TYPE_MDS: return "mds"; + case CEPH_ENTITY_TYPE_OSD: return "osd"; + case CEPH_ENTITY_TYPE_MON: return "mon"; + case CEPH_ENTITY_TYPE_CLIENT: return "client"; + case CEPH_ENTITY_TYPE_AUTH: return "auth"; + default: return "unknown"; + } +} + +const char *ceph_osd_op_name(int op) +{ + switch (op) { + case CEPH_OSD_OP_READ: return "read"; + case CEPH_OSD_OP_STAT: return "stat"; + + case CEPH_OSD_OP_MASKTRUNC: return "masktrunc"; + + case CEPH_OSD_OP_WRITE: return "write"; + case CEPH_OSD_OP_DELETE: return "delete"; + case CEPH_OSD_OP_TRUNCATE: return "truncate"; + case CEPH_OSD_OP_ZERO: return "zero"; + case CEPH_OSD_OP_WRITEFULL: return "writefull"; + case CEPH_OSD_OP_ROLLBACK: return "rollback"; + + case CEPH_OSD_OP_APPEND: return "append"; + case CEPH_OSD_OP_STARTSYNC: return "startsync"; + case CEPH_OSD_OP_SETTRUNC: return "settrunc"; + case CEPH_OSD_OP_TRIMTRUNC: return "trimtrunc"; + + case CEPH_OSD_OP_TMAPUP: return "tmapup"; + case CEPH_OSD_OP_TMAPGET: return "tmapget"; + case CEPH_OSD_OP_TMAPPUT: return "tmapput"; + + case CEPH_OSD_OP_GETXATTR: return "getxattr"; + case CEPH_OSD_OP_GETXATTRS: return "getxattrs"; + case CEPH_OSD_OP_SETXATTR: return "setxattr"; + case CEPH_OSD_OP_SETXATTRS: return "setxattrs"; + case CEPH_OSD_OP_RESETXATTRS: return "resetxattrs"; + case CEPH_OSD_OP_RMXATTR: return "rmxattr"; + case CEPH_OSD_OP_CMPXATTR: return "cmpxattr"; + + case CEPH_OSD_OP_PULL: return "pull"; + case CEPH_OSD_OP_PUSH: return "push"; + case CEPH_OSD_OP_BALANCEREADS: return "balance-reads"; + case CEPH_OSD_OP_UNBALANCEREADS: return "unbalance-reads"; + case CEPH_OSD_OP_SCRUB: return "scrub"; + + case CEPH_OSD_OP_WRLOCK: return "wrlock"; + case CEPH_OSD_OP_WRUNLOCK: return "wrunlock"; + case CEPH_OSD_OP_RDLOCK: return "rdlock"; + case CEPH_OSD_OP_RDUNLOCK: return "rdunlock"; + case CEPH_OSD_OP_UPLOCK: return "uplock"; + case CEPH_OSD_OP_DNLOCK: return "dnlock"; + + case CEPH_OSD_OP_CALL: return "call"; + + case CEPH_OSD_OP_PGLS: return "pgls"; + } + return "???"; +} + + +const char *ceph_pool_op_name(int op) +{ + switch (op) { + case POOL_OP_CREATE: return "create"; + case POOL_OP_DELETE: return "delete"; + case POOL_OP_AUID_CHANGE: return "auid change"; + case POOL_OP_CREATE_SNAP: return "create snap"; + case POOL_OP_DELETE_SNAP: return "delete snap"; + case POOL_OP_CREATE_UNMANAGED_SNAP: return "create unmanaged snap"; + case POOL_OP_DELETE_UNMANAGED_SNAP: return "delete unmanaged snap"; + } + return "???"; +} diff --git a/fs/ceph/crush/crush.c b/net/ceph/crush/crush.c index fabd302e5779..d6ebb13a18a4 100644 --- a/fs/ceph/crush/crush.c +++ b/net/ceph/crush/crush.c @@ -8,7 +8,7 @@ # define BUG_ON(x) assert(!(x)) #endif -#include "crush.h" +#include <linux/crush/crush.h> const char *crush_bucket_alg_name(int alg) { diff --git a/fs/ceph/crush/hash.c b/net/ceph/crush/hash.c index 5873aed694bf..5bb63e37a8a1 100644 --- a/fs/ceph/crush/hash.c +++ b/net/ceph/crush/hash.c @@ -1,6 +1,6 @@ #include <linux/types.h> -#include "hash.h" +#include <linux/crush/hash.h> /* * Robert Jenkins' function for mixing 32-bit values diff --git a/fs/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index a4eec133258e..42599e31dcad 100644 --- a/fs/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c @@ -18,8 +18,8 @@ # define kfree(x) free(x) #endif -#include "crush.h" -#include "hash.h" +#include <linux/crush/crush.h> +#include <linux/crush/hash.h> /* * Implement the core CRUSH mapping algorithm. diff --git a/fs/ceph/crypto.c b/net/ceph/crypto.c index a3e627f63293..7b505b0c983f 100644 --- a/fs/ceph/crypto.c +++ b/net/ceph/crypto.c @@ -1,13 +1,13 @@ -#include "ceph_debug.h" +#include <linux/ceph/ceph_debug.h> #include <linux/err.h> #include <linux/scatterlist.h> #include <linux/slab.h> #include <crypto/hash.h> +#include <linux/ceph/decode.h> #include "crypto.h" -#include "decode.h" int ceph_crypto_key_encode(struct ceph_crypto_key *key, void **p, void *end) { diff --git a/fs/ceph/crypto.h b/net/ceph/crypto.h index bdf38607323c..f9eccace592b 100644 --- a/fs/ceph/crypto.h +++ b/net/ceph/crypto.h @@ -1,8 +1,8 @@ #ifndef _FS_CEPH_CRYPTO_H #define _FS_CEPH_CRYPTO_H -#include "types.h" -#include "buffer.h" +#include <linux/ceph/types.h> +#include <linux/ceph/buffer.h> /* * cryptographic secret diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c new file mode 100644 index 000000000000..27d4ea315d12 --- /dev/null +++ b/net/ceph/debugfs.c @@ -0,0 +1,267 @@ +#include <linux/ceph/ceph_debug.h> + +#include <linux/device.h> +#include <linux/slab.h> +#include <linux/module.h> +#include <linux/ctype.h> +#include <linux/debugfs.h> +#include <linux/seq_file.h> + +#include <linux/ceph/libceph.h> +#include <linux/ceph/mon_client.h> +#include <linux/ceph/auth.h> +#include <linux/ceph/debugfs.h> + +#ifdef CONFIG_DEBUG_FS + +/* + * Implement /sys/kernel/debug/ceph fun + * + * /sys/kernel/debug/ceph/client* - an instance of the ceph client + * .../osdmap - current osdmap + * .../monmap - current monmap + * .../osdc - active osd requests + * .../monc - mon client state + * .../dentry_lru - dump contents of dentry lru + * .../caps - expose cap (reservation) stats + * .../bdi - symlink to ../../bdi/something + */ + +static struct dentry *ceph_debugfs_dir; + +static int monmap_show(struct seq_file *s, void *p) +{ + int i; + struct ceph_client *client = s->private; + + if (client->monc.monmap == NULL) + return 0; + + seq_printf(s, "epoch %d\n", client->monc.monmap->epoch); + for (i = 0; i < client->monc.monmap->num_mon; i++) { + struct ceph_entity_inst *inst = + &client->monc.monmap->mon_inst[i]; + + seq_printf(s, "\t%s%lld\t%s\n", + ENTITY_NAME(inst->name), + ceph_pr_addr(&inst->addr.in_addr)); + } + return 0; +} + +static int osdmap_show(struct seq_file *s, void *p) +{ + int i; + struct ceph_client *client = s->private; + struct rb_node *n; + + if (client->osdc.osdmap == NULL) + return 0; + seq_printf(s, "epoch %d\n", client->osdc.osdmap->epoch); + seq_printf(s, "flags%s%s\n", + (client->osdc.osdmap->flags & CEPH_OSDMAP_NEARFULL) ? + " NEARFULL" : "", + (client->osdc.osdmap->flags & CEPH_OSDMAP_FULL) ? + " FULL" : ""); + for (n = rb_first(&client->osdc.osdmap->pg_pools); n; n = rb_next(n)) { + struct ceph_pg_pool_info *pool = + rb_entry(n, struct ceph_pg_pool_info, node); + seq_printf(s, "pg_pool %d pg_num %d / %d, lpg_num %d / %d\n", + pool->id, pool->v.pg_num, pool->pg_num_mask, + pool->v.lpg_num, pool->lpg_num_mask); + } + for (i = 0; i < client->osdc.osdmap->max_osd; i++) { + struct ceph_entity_addr *addr = + &client->osdc.osdmap->osd_addr[i]; + int state = client->osdc.osdmap->osd_state[i]; + char sb[64]; + + seq_printf(s, "\tosd%d\t%s\t%3d%%\t(%s)\n", + i, ceph_pr_addr(&addr->in_addr), + ((client->osdc.osdmap->osd_weight[i]*100) >> 16), + ceph_osdmap_state_str(sb, sizeof(sb), state)); + } + return 0; +} + +static int monc_show(struct seq_file *s, void *p) +{ + struct ceph_client *client = s->private; + struct ceph_mon_generic_request *req; + struct ceph_mon_client *monc = &client->monc; + struct rb_node *rp; + + mutex_lock(&monc->mutex); + + if (monc->have_mdsmap) + seq_printf(s, "have mdsmap %u\n", (unsigned)monc->have_mdsmap); + if (monc->have_osdmap) + seq_printf(s, "have osdmap %u\n", (unsigned)monc->have_osdmap); + if (monc->want_next_osdmap) + seq_printf(s, "want next osdmap\n"); + + for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) { + __u16 op; + req = rb_entry(rp, struct ceph_mon_generic_request, node); + op = le16_to_cpu(req->request->hdr.type); + if (op == CEPH_MSG_STATFS) + seq_printf(s, "%lld statfs\n", req->tid); + else + seq_printf(s, "%lld unknown\n", req->tid); + } + + mutex_unlock(&monc->mutex); + return 0; +} + +static int osdc_show(struct seq_file *s, void *pp) +{ + struct ceph_client *client = s->private; + struct ceph_osd_client *osdc = &client->osdc; + struct rb_node *p; + + mutex_lock(&osdc->request_mutex); + for (p = rb_first(&osdc->requests); p; p = rb_next(p)) { + struct ceph_osd_request *req; + struct ceph_osd_request_head *head; + struct ceph_osd_op *op; + int num_ops; + int opcode, olen; + int i; + + req = rb_entry(p, struct ceph_osd_request, r_node); + + seq_printf(s, "%lld\tosd%d\t%d.%x\t", req->r_tid, + req->r_osd ? req->r_osd->o_osd : -1, + le32_to_cpu(req->r_pgid.pool), + le16_to_cpu(req->r_pgid.ps)); + + head = req->r_request->front.iov_base; + op = (void *)(head + 1); + + num_ops = le16_to_cpu(head->num_ops); + olen = le32_to_cpu(head->object_len); + seq_printf(s, "%.*s", olen, + (const char *)(head->ops + num_ops)); + + if (req->r_reassert_version.epoch) + seq_printf(s, "\t%u'%llu", + (unsigned)le32_to_cpu(req->r_reassert_version.epoch), + le64_to_cpu(req->r_reassert_version.version)); + else + seq_printf(s, "\t"); + + for (i = 0; i < num_ops; i++) { + opcode = le16_to_cpu(op->op); + seq_printf(s, "\t%s", ceph_osd_op_name(opcode)); + op++; + } + + seq_printf(s, "\n"); + } + mutex_unlock(&osdc->request_mutex); + return 0; +} + +CEPH_DEFINE_SHOW_FUNC(monmap_show) +CEPH_DEFINE_SHOW_FUNC(osdmap_show) +CEPH_DEFINE_SHOW_FUNC(monc_show) +CEPH_DEFINE_SHOW_FUNC(osdc_show) + +int ceph_debugfs_init(void) +{ + ceph_debugfs_dir = debugfs_create_dir("ceph", NULL); + if (!ceph_debugfs_dir) + return -ENOMEM; + return 0; +} + +void ceph_debugfs_cleanup(void) +{ + debugfs_remove(ceph_debugfs_dir); +} + +int ceph_debugfs_client_init(struct ceph_client *client) +{ + int ret = -ENOMEM; + char name[80]; + + snprintf(name, sizeof(name), "%pU.client%lld", &client->fsid, + client->monc.auth->global_id); + + client->debugfs_dir = debugfs_create_dir(name, ceph_debugfs_dir); + if (!client->debugfs_dir) + goto out; + + client->monc.debugfs_file = debugfs_create_file("monc", + 0600, + client->debugfs_dir, + client, + &monc_show_fops); + if (!client->monc.debugfs_file) + goto out; + + client->osdc.debugfs_file = debugfs_create_file("osdc", + 0600, + client->debugfs_dir, + client, + &osdc_show_fops); + if (!client->osdc.debugfs_file) + goto out; + + client->debugfs_monmap = debugfs_create_file("monmap", + 0600, + client->debugfs_dir, + client, + &monmap_show_fops); + if (!client->debugfs_monmap) + goto out; + + client->debugfs_osdmap = debugfs_create_file("osdmap", + 0600, + client->debugfs_dir, + client, + &osdmap_show_fops); + if (!client->debugfs_osdmap) + goto out; + + return 0; + +out: + ceph_debugfs_client_cleanup(client); + return ret; +} + +void ceph_debugfs_client_cleanup(struct ceph_client *client) +{ + debugfs_remove(client->debugfs_osdmap); + debugfs_remove(client->debugfs_monmap); + debugfs_remove(client->osdc.debugfs_file); + debugfs_remove(client->monc.debugfs_file); + debugfs_remove(client->debugfs_dir); +} + +#else /* CONFIG_DEBUG_FS */ + +int ceph_debugfs_init(void) +{ + return 0; +} + +void ceph_debugfs_cleanup(void) +{ +} + +int ceph_debugfs_client_init(struct ceph_client *client) +{ + return 0; +} + +void ceph_debugfs_client_cleanup(struct ceph_client *client) +{ +} + +#endif /* CONFIG_DEBUG_FS */ + +EXPORT_SYMBOL(ceph_debugfs_init); +EXPORT_SYMBOL(ceph_debugfs_cleanup); diff --git a/fs/ceph/messenger.c b/net/ceph/messenger.c index 2502d76fcec1..0e8157ee5d43 100644 --- a/fs/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -1,4 +1,4 @@ -#include "ceph_debug.h" +#include <linux/ceph/ceph_debug.h> #include <linux/crc32c.h> #include <linux/ctype.h> @@ -9,12 +9,14 @@ #include <linux/slab.h> #include <linux/socket.h> #include <linux/string.h> +#include <linux/bio.h> +#include <linux/blkdev.h> #include <net/tcp.h> -#include "super.h" -#include "messenger.h" -#include "decode.h" -#include "pagelist.h" +#include <linux/ceph/libceph.h> +#include <linux/ceph/messenger.h> +#include <linux/ceph/decode.h> +#include <linux/ceph/pagelist.h> /* * Ceph uses the messenger to exchange ceph_msg messages with other @@ -48,7 +50,7 @@ static char addr_str[MAX_ADDR_STR][MAX_ADDR_STR_LEN]; static DEFINE_SPINLOCK(addr_str_lock); static int last_addr_str; -const char *pr_addr(const struct sockaddr_storage *ss) +const char *ceph_pr_addr(const struct sockaddr_storage *ss) { int i; char *s; @@ -79,6 +81,7 @@ const char *pr_addr(const struct sockaddr_storage *ss) return s; } +EXPORT_SYMBOL(ceph_pr_addr); static void encode_my_addr(struct ceph_messenger *msgr) { @@ -91,7 +94,7 @@ static void encode_my_addr(struct ceph_messenger *msgr) */ struct workqueue_struct *ceph_msgr_wq; -int __init ceph_msgr_init(void) +int ceph_msgr_init(void) { ceph_msgr_wq = create_workqueue("ceph-msgr"); if (IS_ERR(ceph_msgr_wq)) { @@ -102,16 +105,19 @@ int __init ceph_msgr_init(void) } return 0; } +EXPORT_SYMBOL(ceph_msgr_init); void ceph_msgr_exit(void) { destroy_workqueue(ceph_msgr_wq); } +EXPORT_SYMBOL(ceph_msgr_exit); void ceph_msgr_flush(void) { flush_workqueue(ceph_msgr_wq); } +EXPORT_SYMBOL(ceph_msgr_flush); /* @@ -221,19 +227,19 @@ static struct socket *ceph_tcp_connect(struct ceph_connection *con) set_sock_callbacks(sock, con); - dout("connect %s\n", pr_addr(&con->peer_addr.in_addr)); + dout("connect %s\n", ceph_pr_addr(&con->peer_addr.in_addr)); ret = sock->ops->connect(sock, (struct sockaddr *)paddr, sizeof(*paddr), O_NONBLOCK); if (ret == -EINPROGRESS) { dout("connect %s EINPROGRESS sk_state = %u\n", - pr_addr(&con->peer_addr.in_addr), + ceph_pr_addr(&con->peer_addr.in_addr), sock->sk->sk_state); ret = 0; } if (ret < 0) { pr_err("connect %s error %d\n", - pr_addr(&con->peer_addr.in_addr), ret); + ceph_pr_addr(&con->peer_addr.in_addr), ret); sock_release(sock); con->sock = NULL; con->error_msg = "connect error"; @@ -334,7 +340,8 @@ static void reset_connection(struct ceph_connection *con) */ void ceph_con_close(struct ceph_connection *con) { - dout("con_close %p peer %s\n", con, pr_addr(&con->peer_addr.in_addr)); + dout("con_close %p peer %s\n", con, + ceph_pr_addr(&con->peer_addr.in_addr)); set_bit(CLOSED, &con->state); /* in case there's queued work */ clear_bit(STANDBY, &con->state); /* avoid connect_seq bump */ clear_bit(LOSSYTX, &con->state); /* so we retry next connect */ @@ -347,19 +354,21 @@ void ceph_con_close(struct ceph_connection *con) mutex_unlock(&con->mutex); queue_con(con); } +EXPORT_SYMBOL(ceph_con_close); /* * Reopen a closed connection, with a new peer address. */ void ceph_con_open(struct ceph_connection *con, struct ceph_entity_addr *addr) { - dout("con_open %p %s\n", con, pr_addr(&addr->in_addr)); + dout("con_open %p %s\n", con, ceph_pr_addr(&addr->in_addr)); set_bit(OPENING, &con->state); clear_bit(CLOSED, &con->state); memcpy(&con->peer_addr, addr, sizeof(*addr)); con->delay = 0; /* reset backoff memory */ queue_con(con); } +EXPORT_SYMBOL(ceph_con_open); /* * return true if this connection ever successfully opened @@ -406,6 +415,7 @@ void ceph_con_init(struct ceph_messenger *msgr, struct ceph_connection *con) INIT_LIST_HEAD(&con->out_sent); INIT_DELAYED_WORK(&con->work, con_work); } +EXPORT_SYMBOL(ceph_con_init); /* @@ -529,8 +539,11 @@ static void prepare_write_message(struct ceph_connection *con) if (le32_to_cpu(m->hdr.data_len) > 0) { /* initialize page iterator */ con->out_msg_pos.page = 0; - con->out_msg_pos.page_pos = - le16_to_cpu(m->hdr.data_off) & ~PAGE_MASK; + if (m->pages) + con->out_msg_pos.page_pos = + le16_to_cpu(m->hdr.data_off) & ~PAGE_MASK; + else + con->out_msg_pos.page_pos = 0; con->out_msg_pos.data_pos = 0; con->out_msg_pos.did_page_crc = 0; con->out_more = 1; /* data + footer will follow */ @@ -647,7 +660,7 @@ static void prepare_write_connect(struct ceph_messenger *msgr, dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con, con->connect_seq, global_seq, proto); - con->out_connect.features = cpu_to_le64(CEPH_FEATURE_SUPPORTED); + con->out_connect.features = cpu_to_le64(msgr->supported_features); con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT); con->out_connect.connect_seq = cpu_to_le32(con->connect_seq); con->out_connect.global_seq = cpu_to_le32(global_seq); @@ -712,6 +725,31 @@ out: return ret; /* done! */ } +#ifdef CONFIG_BLOCK +static void init_bio_iter(struct bio *bio, struct bio **iter, int *seg) +{ + if (!bio) { + *iter = NULL; + *seg = 0; + return; + } + *iter = bio; + *seg = bio->bi_idx; +} + +static void iter_bio_next(struct bio **bio_iter, int *seg) +{ + if (*bio_iter == NULL) + return; + + BUG_ON(*seg >= (*bio_iter)->bi_vcnt); + + (*seg)++; + if (*seg == (*bio_iter)->bi_vcnt) + init_bio_iter((*bio_iter)->bi_next, bio_iter, seg); +} +#endif + /* * Write as much message data payload as we can. If we finish, queue * up the footer. @@ -726,21 +764,46 @@ static int write_partial_msg_pages(struct ceph_connection *con) size_t len; int crc = con->msgr->nocrc; int ret; + int total_max_write; + int in_trail = 0; + size_t trail_len = (msg->trail ? msg->trail->length : 0); dout("write_partial_msg_pages %p msg %p page %d/%d offset %d\n", con, con->out_msg, con->out_msg_pos.page, con->out_msg->nr_pages, con->out_msg_pos.page_pos); - while (con->out_msg_pos.page < con->out_msg->nr_pages) { +#ifdef CONFIG_BLOCK + if (msg->bio && !msg->bio_iter) + init_bio_iter(msg->bio, &msg->bio_iter, &msg->bio_seg); +#endif + + while (data_len > con->out_msg_pos.data_pos) { struct page *page = NULL; void *kaddr = NULL; + int max_write = PAGE_SIZE; + int page_shift = 0; + + total_max_write = data_len - trail_len - + con->out_msg_pos.data_pos; /* * if we are calculating the data crc (the default), we need * to map the page. if our pages[] has been revoked, use the * zero page. */ - if (msg->pages) { + + /* have we reached the trail part of the data? */ + if (con->out_msg_pos.data_pos >= data_len - trail_len) { + in_trail = 1; + + total_max_write = data_len - con->out_msg_pos.data_pos; + + page = list_first_entry(&msg->trail->head, + struct page, lru); + if (crc) + kaddr = kmap(page); + max_write = PAGE_SIZE; + } else if (msg->pages) { page = msg->pages[con->out_msg_pos.page]; if (crc) kaddr = kmap(page); @@ -749,13 +812,25 @@ static int write_partial_msg_pages(struct ceph_connection *con) struct page, lru); if (crc) kaddr = kmap(page); +#ifdef CONFIG_BLOCK + } else if (msg->bio) { + struct bio_vec *bv; + + bv = bio_iovec_idx(msg->bio_iter, msg->bio_seg); + page = bv->bv_page; + page_shift = bv->bv_offset; + if (crc) + kaddr = kmap(page) + page_shift; + max_write = bv->bv_len; +#endif } else { page = con->msgr->zero_page; if (crc) kaddr = page_address(con->msgr->zero_page); } - len = min((int)(PAGE_SIZE - con->out_msg_pos.page_pos), - (int)(data_len - con->out_msg_pos.data_pos)); + len = min_t(int, max_write - con->out_msg_pos.page_pos, + total_max_write); + if (crc && !con->out_msg_pos.did_page_crc) { void *base = kaddr + con->out_msg_pos.page_pos; u32 tmpcrc = le32_to_cpu(con->out_msg->footer.data_crc); @@ -765,13 +840,14 @@ static int write_partial_msg_pages(struct ceph_connection *con) cpu_to_le32(crc32c(tmpcrc, base, len)); con->out_msg_pos.did_page_crc = 1; } - ret = kernel_sendpage(con->sock, page, - con->out_msg_pos.page_pos, len, + con->out_msg_pos.page_pos + page_shift, + len, MSG_DONTWAIT | MSG_NOSIGNAL | MSG_MORE); - if (crc && (msg->pages || msg->pagelist)) + if (crc && + (msg->pages || msg->pagelist || msg->bio || in_trail)) kunmap(page); if (ret <= 0) @@ -783,9 +859,16 @@ static int write_partial_msg_pages(struct ceph_connection *con) con->out_msg_pos.page_pos = 0; con->out_msg_pos.page++; con->out_msg_pos.did_page_crc = 0; - if (msg->pagelist) + if (in_trail) + list_move_tail(&page->lru, + &msg->trail->head); + else if (msg->pagelist) list_move_tail(&page->lru, &msg->pagelist->head); +#ifdef CONFIG_BLOCK + else if (msg->bio) + iter_bio_next(&msg->bio_iter, &msg->bio_seg); +#endif } } @@ -938,7 +1021,7 @@ static int verify_hello(struct ceph_connection *con) { if (memcmp(con->in_banner, CEPH_BANNER, strlen(CEPH_BANNER))) { pr_err("connect to %s got bad banner\n", - pr_addr(&con->peer_addr.in_addr)); + ceph_pr_addr(&con->peer_addr.in_addr)); con->error_msg = "protocol error, bad banner"; return -1; } @@ -1041,7 +1124,7 @@ int ceph_parse_ips(const char *c, const char *end, addr_set_port(ss, port); - dout("parse_ips got %s\n", pr_addr(ss)); + dout("parse_ips got %s\n", ceph_pr_addr(ss)); if (p == end) break; @@ -1061,6 +1144,7 @@ bad: pr_err("parse_ips bad ip '%.*s'\n", (int)(end - c), c); return -EINVAL; } +EXPORT_SYMBOL(ceph_parse_ips); static int process_banner(struct ceph_connection *con) { @@ -1082,9 +1166,9 @@ static int process_banner(struct ceph_connection *con) !(addr_is_blank(&con->actual_peer_addr.in_addr) && con->actual_peer_addr.nonce == con->peer_addr.nonce)) { pr_warning("wrong peer, want %s/%d, got %s/%d\n", - pr_addr(&con->peer_addr.in_addr), + ceph_pr_addr(&con->peer_addr.in_addr), (int)le32_to_cpu(con->peer_addr.nonce), - pr_addr(&con->actual_peer_addr.in_addr), + ceph_pr_addr(&con->actual_peer_addr.in_addr), (int)le32_to_cpu(con->actual_peer_addr.nonce)); con->error_msg = "wrong peer at address"; return -1; @@ -1102,7 +1186,7 @@ static int process_banner(struct ceph_connection *con) addr_set_port(&con->msgr->inst.addr.in_addr, port); encode_my_addr(con->msgr); dout("process_banner learned my addr is %s\n", - pr_addr(&con->msgr->inst.addr.in_addr)); + ceph_pr_addr(&con->msgr->inst.addr.in_addr)); } set_bit(NEGOTIATING, &con->state); @@ -1123,8 +1207,8 @@ static void fail_protocol(struct ceph_connection *con) static int process_connect(struct ceph_connection *con) { - u64 sup_feat = CEPH_FEATURE_SUPPORTED; - u64 req_feat = CEPH_FEATURE_REQUIRED; + u64 sup_feat = con->msgr->supported_features; + u64 req_feat = con->msgr->required_features; u64 server_feat = le64_to_cpu(con->in_reply.features); dout("process_connect on %p tag %d\n", con, (int)con->in_tag); @@ -1134,7 +1218,7 @@ static int process_connect(struct ceph_connection *con) pr_err("%s%lld %s feature set mismatch," " my %llx < server's %llx, missing %llx\n", ENTITY_NAME(con->peer_name), - pr_addr(&con->peer_addr.in_addr), + ceph_pr_addr(&con->peer_addr.in_addr), sup_feat, server_feat, server_feat & ~sup_feat); con->error_msg = "missing required protocol features"; fail_protocol(con); @@ -1144,7 +1228,7 @@ static int process_connect(struct ceph_connection *con) pr_err("%s%lld %s protocol version mismatch," " my %d != server's %d\n", ENTITY_NAME(con->peer_name), - pr_addr(&con->peer_addr.in_addr), + ceph_pr_addr(&con->peer_addr.in_addr), le32_to_cpu(con->out_connect.protocol_version), le32_to_cpu(con->in_reply.protocol_version)); con->error_msg = "protocol version mismatch"; @@ -1178,7 +1262,7 @@ static int process_connect(struct ceph_connection *con) le32_to_cpu(con->in_connect.connect_seq)); pr_err("%s%lld %s connection reset\n", ENTITY_NAME(con->peer_name), - pr_addr(&con->peer_addr.in_addr)); + ceph_pr_addr(&con->peer_addr.in_addr)); reset_connection(con); prepare_write_connect(con->msgr, con, 0); prepare_read_connect(con); @@ -1223,7 +1307,7 @@ static int process_connect(struct ceph_connection *con) pr_err("%s%lld %s protocol feature mismatch," " my required %llx > server's %llx, need %llx\n", ENTITY_NAME(con->peer_name), - pr_addr(&con->peer_addr.in_addr), + ceph_pr_addr(&con->peer_addr.in_addr), req_feat, server_feat, req_feat & ~server_feat); con->error_msg = "missing required protocol features"; fail_protocol(con); @@ -1305,8 +1389,7 @@ static int read_partial_message_section(struct ceph_connection *con, struct kvec *section, unsigned int sec_len, u32 *crc) { - int left; - int ret; + int ret, left; BUG_ON(!section); @@ -1329,13 +1412,83 @@ static int read_partial_message_section(struct ceph_connection *con, static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con, struct ceph_msg_header *hdr, int *skip); + + +static int read_partial_message_pages(struct ceph_connection *con, + struct page **pages, + unsigned data_len, int datacrc) +{ + void *p; + int ret; + int left; + + left = min((int)(data_len - con->in_msg_pos.data_pos), + (int)(PAGE_SIZE - con->in_msg_pos.page_pos)); + /* (page) data */ + BUG_ON(pages == NULL); + p = kmap(pages[con->in_msg_pos.page]); + ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos, + left); + if (ret > 0 && datacrc) + con->in_data_crc = + crc32c(con->in_data_crc, + p + con->in_msg_pos.page_pos, ret); + kunmap(pages[con->in_msg_pos.page]); + if (ret <= 0) + return ret; + con->in_msg_pos.data_pos += ret; + con->in_msg_pos.page_pos += ret; + if (con->in_msg_pos.page_pos == PAGE_SIZE) { + con->in_msg_pos.page_pos = 0; + con->in_msg_pos.page++; + } + + return ret; +} + +#ifdef CONFIG_BLOCK +static int read_partial_message_bio(struct ceph_connection *con, + struct bio **bio_iter, int *bio_seg, + unsigned data_len, int datacrc) +{ + struct bio_vec *bv = bio_iovec_idx(*bio_iter, *bio_seg); + void *p; + int ret, left; + + if (IS_ERR(bv)) + return PTR_ERR(bv); + + left = min((int)(data_len - con->in_msg_pos.data_pos), + (int)(bv->bv_len - con->in_msg_pos.page_pos)); + + p = kmap(bv->bv_page) + bv->bv_offset; + + ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos, + left); + if (ret > 0 && datacrc) + con->in_data_crc = + crc32c(con->in_data_crc, + p + con->in_msg_pos.page_pos, ret); + kunmap(bv->bv_page); + if (ret <= 0) + return ret; + con->in_msg_pos.data_pos += ret; + con->in_msg_pos.page_pos += ret; + if (con->in_msg_pos.page_pos == bv->bv_len) { + con->in_msg_pos.page_pos = 0; + iter_bio_next(bio_iter, bio_seg); + } + + return ret; +} +#endif + /* * read (part of) a message. */ static int read_partial_message(struct ceph_connection *con) { struct ceph_msg *m = con->in_msg; - void *p; int ret; int to, left; unsigned front_len, middle_len, data_len, data_off; @@ -1381,7 +1534,7 @@ static int read_partial_message(struct ceph_connection *con) if ((s64)seq - (s64)con->in_seq < 1) { pr_info("skipping %s%lld %s seq %lld, expected %lld\n", ENTITY_NAME(con->peer_name), - pr_addr(&con->peer_addr.in_addr), + ceph_pr_addr(&con->peer_addr.in_addr), seq, con->in_seq + 1); con->in_base_pos = -front_len - middle_len - data_len - sizeof(m->footer); @@ -1422,7 +1575,10 @@ static int read_partial_message(struct ceph_connection *con) m->middle->vec.iov_len = 0; con->in_msg_pos.page = 0; - con->in_msg_pos.page_pos = data_off & ~PAGE_MASK; + if (m->pages) + con->in_msg_pos.page_pos = data_off & ~PAGE_MASK; + else + con->in_msg_pos.page_pos = 0; con->in_msg_pos.data_pos = 0; } @@ -1440,27 +1596,29 @@ static int read_partial_message(struct ceph_connection *con) if (ret <= 0) return ret; } +#ifdef CONFIG_BLOCK + if (m->bio && !m->bio_iter) + init_bio_iter(m->bio, &m->bio_iter, &m->bio_seg); +#endif /* (page) data */ while (con->in_msg_pos.data_pos < data_len) { - left = min((int)(data_len - con->in_msg_pos.data_pos), - (int)(PAGE_SIZE - con->in_msg_pos.page_pos)); - BUG_ON(m->pages == NULL); - p = kmap(m->pages[con->in_msg_pos.page]); - ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos, - left); - if (ret > 0 && datacrc) - con->in_data_crc = - crc32c(con->in_data_crc, - p + con->in_msg_pos.page_pos, ret); - kunmap(m->pages[con->in_msg_pos.page]); - if (ret <= 0) - return ret; - con->in_msg_pos.data_pos += ret; - con->in_msg_pos.page_pos += ret; - if (con->in_msg_pos.page_pos == PAGE_SIZE) { - con->in_msg_pos.page_pos = 0; - con->in_msg_pos.page++; + if (m->pages) { + ret = read_partial_message_pages(con, m->pages, + data_len, datacrc); + if (ret <= 0) + return ret; +#ifdef CONFIG_BLOCK + } else if (m->bio) { + + ret = read_partial_message_bio(con, + &m->bio_iter, &m->bio_seg, + data_len, datacrc); + if (ret <= 0) + return ret; +#endif + } else { + BUG_ON(1); } } @@ -1874,9 +2032,9 @@ out: static void ceph_fault(struct ceph_connection *con) { pr_err("%s%lld %s %s\n", ENTITY_NAME(con->peer_name), - pr_addr(&con->peer_addr.in_addr), con->error_msg); + ceph_pr_addr(&con->peer_addr.in_addr), con->error_msg); dout("fault %p state %lu to peer %s\n", - con, con->state, pr_addr(&con->peer_addr.in_addr)); + con, con->state, ceph_pr_addr(&con->peer_addr.in_addr)); if (test_bit(LOSSYTX, &con->state)) { dout("fault on LOSSYTX channel\n"); @@ -1936,7 +2094,9 @@ out: /* * create a new messenger instance */ -struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr) +struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr, + u32 supported_features, + u32 required_features) { struct ceph_messenger *msgr; @@ -1944,6 +2104,9 @@ struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr) if (msgr == NULL) return ERR_PTR(-ENOMEM); + msgr->supported_features = supported_features; + msgr->required_features = required_features; + spin_lock_init(&msgr->global_seq_lock); /* the zero page is needed if a request is "canceled" while the message @@ -1966,6 +2129,7 @@ struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr) dout("messenger_create %p\n", msgr); return msgr; } +EXPORT_SYMBOL(ceph_messenger_create); void ceph_messenger_destroy(struct ceph_messenger *msgr) { @@ -1975,6 +2139,7 @@ void ceph_messenger_destroy(struct ceph_messenger *msgr) kfree(msgr); dout("destroyed messenger %p\n", msgr); } +EXPORT_SYMBOL(ceph_messenger_destroy); /* * Queue up an outgoing message on the given connection. @@ -2011,6 +2176,7 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg) if (test_and_set_bit(WRITE_PENDING, &con->state) == 0) queue_con(con); } +EXPORT_SYMBOL(ceph_con_send); /* * Revoke a message that was previously queued for send @@ -2076,6 +2242,7 @@ void ceph_con_keepalive(struct ceph_connection *con) test_and_set_bit(WRITE_PENDING, &con->state) == 0) queue_con(con); } +EXPORT_SYMBOL(ceph_con_keepalive); /* @@ -2136,6 +2303,10 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags) m->nr_pages = 0; m->pages = NULL; m->pagelist = NULL; + m->bio = NULL; + m->bio_iter = NULL; + m->bio_seg = 0; + m->trail = NULL; dout("ceph_msg_new %p front %d\n", m, front_len); return m; @@ -2146,6 +2317,7 @@ out: pr_err("msg_new can't create type %d front %d\n", type, front_len); return NULL; } +EXPORT_SYMBOL(ceph_msg_new); /* * Allocate "middle" portion of a message, if it is needed and wasn't @@ -2250,11 +2422,14 @@ void ceph_msg_last_put(struct kref *kref) m->pagelist = NULL; } + m->trail = NULL; + if (m->pool) ceph_msgpool_put(m->pool, m); else ceph_msg_kfree(m); } +EXPORT_SYMBOL(ceph_msg_last_put); void ceph_msg_dump(struct ceph_msg *msg) { @@ -2275,3 +2450,4 @@ void ceph_msg_dump(struct ceph_msg *msg) DUMP_PREFIX_OFFSET, 16, 1, &msg->footer, sizeof(msg->footer), true); } +EXPORT_SYMBOL(ceph_msg_dump); diff --git a/fs/ceph/mon_client.c b/net/ceph/mon_client.c index b2a5a3e4a671..8a079399174a 100644 --- a/fs/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -1,14 +1,16 @@ -#include "ceph_debug.h" +#include <linux/ceph/ceph_debug.h> +#include <linux/module.h> #include <linux/types.h> #include <linux/slab.h> #include <linux/random.h> #include <linux/sched.h> -#include "mon_client.h" -#include "super.h" -#include "auth.h" -#include "decode.h" +#include <linux/ceph/mon_client.h> +#include <linux/ceph/libceph.h> +#include <linux/ceph/decode.h> + +#include <linux/ceph/auth.h> /* * Interact with Ceph monitor cluster. Handle requests for new map @@ -74,7 +76,7 @@ struct ceph_monmap *ceph_monmap_decode(void *p, void *end) m->num_mon); for (i = 0; i < m->num_mon; i++) dout("monmap_decode mon%d is %s\n", i, - pr_addr(&m->mon_inst[i].addr.in_addr)); + ceph_pr_addr(&m->mon_inst[i].addr.in_addr)); return m; bad: @@ -191,30 +193,33 @@ static void __send_subscribe(struct ceph_mon_client *monc) struct ceph_msg *msg = monc->m_subscribe; struct ceph_mon_subscribe_item *i; void *p, *end; + int num; p = msg->front.iov_base; end = p + msg->front_max; - dout("__send_subscribe to 'mdsmap' %u+\n", - (unsigned)monc->have_mdsmap); + num = 1 + !!monc->want_next_osdmap + !!monc->want_mdsmap; + ceph_encode_32(&p, num); + if (monc->want_next_osdmap) { dout("__send_subscribe to 'osdmap' %u\n", (unsigned)monc->have_osdmap); - ceph_encode_32(&p, 3); ceph_encode_string(&p, end, "osdmap", 6); i = p; i->have = cpu_to_le64(monc->have_osdmap); i->onetime = 1; p += sizeof(*i); monc->want_next_osdmap = 2; /* requested */ - } else { - ceph_encode_32(&p, 2); } - ceph_encode_string(&p, end, "mdsmap", 6); - i = p; - i->have = cpu_to_le64(monc->have_mdsmap); - i->onetime = 0; - p += sizeof(*i); + if (monc->want_mdsmap) { + dout("__send_subscribe to 'mdsmap' %u+\n", + (unsigned)monc->have_mdsmap); + ceph_encode_string(&p, end, "mdsmap", 6); + i = p; + i->have = cpu_to_le64(monc->have_mdsmap); + i->onetime = 0; + p += sizeof(*i); + } ceph_encode_string(&p, end, "monmap", 6); i = p; i->have = 0; @@ -243,7 +248,8 @@ static void handle_subscribe_ack(struct ceph_mon_client *monc, mutex_lock(&monc->mutex); if (monc->hunting) { pr_info("mon%d %s session established\n", - monc->cur_mon, pr_addr(&monc->con->peer_addr.in_addr)); + monc->cur_mon, + ceph_pr_addr(&monc->con->peer_addr.in_addr)); monc->hunting = false; } dout("handle_subscribe_ack after %d seconds\n", seconds); @@ -266,6 +272,7 @@ int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 got) mutex_unlock(&monc->mutex); return 0; } +EXPORT_SYMBOL(ceph_monc_got_mdsmap); int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 got) { @@ -310,6 +317,7 @@ int ceph_monc_open_session(struct ceph_mon_client *monc) mutex_unlock(&monc->mutex); return 0; } +EXPORT_SYMBOL(ceph_monc_open_session); /* * The monitor responds with mount ack indicate mount success. The @@ -540,6 +548,7 @@ out: kref_put(&req->kref, release_generic_request); return err; } +EXPORT_SYMBOL(ceph_monc_do_statfs); /* * pool ops @@ -651,6 +660,7 @@ int ceph_monc_create_snapid(struct ceph_mon_client *monc, pool, 0, (char *)snapid, sizeof(*snapid)); } +EXPORT_SYMBOL(ceph_monc_create_snapid); int ceph_monc_delete_snapid(struct ceph_mon_client *monc, u32 pool, u64 snapid) @@ -708,9 +718,9 @@ static void delayed_work(struct work_struct *work) */ static int build_initial_monmap(struct ceph_mon_client *monc) { - struct ceph_mount_args *args = monc->client->mount_args; - struct ceph_entity_addr *mon_addr = args->mon_addr; - int num_mon = args->num_mon; + struct ceph_options *opt = monc->client->options; + struct ceph_entity_addr *mon_addr = opt->mon_addr; + int num_mon = opt->num_mon; int i; /* build initial monmap */ @@ -728,11 +738,6 @@ static int build_initial_monmap(struct ceph_mon_client *monc) } monc->monmap->num_mon = num_mon; monc->have_fsid = false; - - /* release addr memory */ - kfree(args->mon_addr); - args->mon_addr = NULL; - args->num_mon = 0; return 0; } @@ -753,8 +758,8 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl) monc->con = NULL; /* authentication */ - monc->auth = ceph_auth_init(cl->mount_args->name, - cl->mount_args->secret); + monc->auth = ceph_auth_init(cl->options->name, + cl->options->secret); if (IS_ERR(monc->auth)) return PTR_ERR(monc->auth); monc->auth->want_keys = @@ -808,6 +813,7 @@ out_monmap: out: return err; } +EXPORT_SYMBOL(ceph_monc_init); void ceph_monc_stop(struct ceph_mon_client *monc) { @@ -832,6 +838,7 @@ void ceph_monc_stop(struct ceph_mon_client *monc) kfree(monc->monmap); } +EXPORT_SYMBOL(ceph_monc_stop); static void handle_auth_reply(struct ceph_mon_client *monc, struct ceph_msg *msg) @@ -889,6 +896,7 @@ int ceph_monc_validate_auth(struct ceph_mon_client *monc) mutex_unlock(&monc->mutex); return ret; } +EXPORT_SYMBOL(ceph_monc_validate_auth); /* * handle incoming message @@ -922,15 +930,16 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) ceph_monc_handle_map(monc, msg); break; - case CEPH_MSG_MDS_MAP: - ceph_mdsc_handle_map(&monc->client->mdsc, msg); - break; - case CEPH_MSG_OSD_MAP: ceph_osdc_handle_map(&monc->client->osdc, msg); break; default: + /* can the chained handler handle it? */ + if (monc->client->extra_mon_dispatch && + monc->client->extra_mon_dispatch(monc->client, msg) == 0) + break; + pr_err("received unknown message type %d %s\n", type, ceph_msg_type_name(type)); } @@ -994,7 +1003,7 @@ static void mon_fault(struct ceph_connection *con) if (monc->con && !monc->hunting) pr_info("mon%d %s session lost, " "hunting for new mon\n", monc->cur_mon, - pr_addr(&monc->con->peer_addr.in_addr)); + ceph_pr_addr(&monc->con->peer_addr.in_addr)); __close_session(monc); if (!monc->hunting) { diff --git a/fs/ceph/msgpool.c b/net/ceph/msgpool.c index dd65a6438131..d5f2d97ac05c 100644 --- a/fs/ceph/msgpool.c +++ b/net/ceph/msgpool.c @@ -1,11 +1,11 @@ -#include "ceph_debug.h" +#include <linux/ceph/ceph_debug.h> #include <linux/err.h> #include <linux/sched.h> #include <linux/types.h> #include <linux/vmalloc.h> -#include "msgpool.h" +#include <linux/ceph/msgpool.h> static void *alloc_fn(gfp_t gfp_mask, void *arg) { diff --git a/fs/ceph/osd_client.c b/net/ceph/osd_client.c index 3b5571b8ce22..79391994b3ed 100644 --- a/fs/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1,17 +1,22 @@ -#include "ceph_debug.h" +#include <linux/ceph/ceph_debug.h> +#include <linux/module.h> #include <linux/err.h> #include <linux/highmem.h> #include <linux/mm.h> #include <linux/pagemap.h> #include <linux/slab.h> #include <linux/uaccess.h> +#ifdef CONFIG_BLOCK +#include <linux/bio.h> +#endif -#include "super.h" -#include "osd_client.h" -#include "messenger.h" -#include "decode.h" -#include "auth.h" +#include <linux/ceph/libceph.h> +#include <linux/ceph/osd_client.h> +#include <linux/ceph/messenger.h> +#include <linux/ceph/decode.h> +#include <linux/ceph/auth.h> +#include <linux/ceph/pagelist.h> #define OSD_OP_FRONT_LEN 4096 #define OSD_OPREPLY_FRONT_LEN 512 @@ -22,6 +27,59 @@ static int __kick_requests(struct ceph_osd_client *osdc, static void kick_requests(struct ceph_osd_client *osdc, struct ceph_osd *osd); +static int op_needs_trail(int op) +{ + switch (op) { + case CEPH_OSD_OP_GETXATTR: + case CEPH_OSD_OP_SETXATTR: + case CEPH_OSD_OP_CMPXATTR: + case CEPH_OSD_OP_CALL: + return 1; + default: + return 0; + } +} + +static int op_has_extent(int op) +{ + return (op == CEPH_OSD_OP_READ || + op == CEPH_OSD_OP_WRITE); +} + +void ceph_calc_raw_layout(struct ceph_osd_client *osdc, + struct ceph_file_layout *layout, + u64 snapid, + u64 off, u64 *plen, u64 *bno, + struct ceph_osd_request *req, + struct ceph_osd_req_op *op) +{ + struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base; + u64 orig_len = *plen; + u64 objoff, objlen; /* extent in object */ + + reqhead->snapid = cpu_to_le64(snapid); + + /* object extent? */ + ceph_calc_file_object_mapping(layout, off, plen, bno, + &objoff, &objlen); + if (*plen < orig_len) + dout(" skipping last %llu, final file extent %llu~%llu\n", + orig_len - *plen, off, *plen); + + if (op_has_extent(op->op)) { + op->extent.offset = objoff; + op->extent.length = objlen; + } + req->r_num_pages = calc_pages_for(off, *plen); + if (op->op == CEPH_OSD_OP_WRITE) + op->payload_len = *plen; + + dout("calc_layout bno=%llx %llu~%llu (%d pages)\n", + *bno, objoff, objlen, req->r_num_pages); + +} +EXPORT_SYMBOL(ceph_calc_raw_layout); + /* * Implement client access to distributed object storage cluster. * @@ -48,34 +106,19 @@ static void kick_requests(struct ceph_osd_client *osdc, struct ceph_osd *osd); * fill osd op in request message. */ static void calc_layout(struct ceph_osd_client *osdc, - struct ceph_vino vino, struct ceph_file_layout *layout, + struct ceph_vino vino, + struct ceph_file_layout *layout, u64 off, u64 *plen, - struct ceph_osd_request *req) + struct ceph_osd_request *req, + struct ceph_osd_req_op *op) { - struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base; - struct ceph_osd_op *op = (void *)(reqhead + 1); - u64 orig_len = *plen; - u64 objoff, objlen; /* extent in object */ u64 bno; - reqhead->snapid = cpu_to_le64(vino.snap); - - /* object extent? */ - ceph_calc_file_object_mapping(layout, off, plen, &bno, - &objoff, &objlen); - if (*plen < orig_len) - dout(" skipping last %llu, final file extent %llu~%llu\n", - orig_len - *plen, off, *plen); + ceph_calc_raw_layout(osdc, layout, vino.snap, off, + plen, &bno, req, op); sprintf(req->r_oid, "%llx.%08llx", vino.ino, bno); req->r_oid_len = strlen(req->r_oid); - - op->extent.offset = cpu_to_le64(objoff); - op->extent.length = cpu_to_le64(objlen); - req->r_num_pages = calc_pages_for(off, *plen); - - dout("calc_layout %s (%d) %llu~%llu (%d pages)\n", - req->r_oid, req->r_oid_len, objoff, objlen, req->r_num_pages); } /* @@ -101,56 +144,66 @@ void ceph_osdc_release_request(struct kref *kref) if (req->r_own_pages) ceph_release_page_vector(req->r_pages, req->r_num_pages); +#ifdef CONFIG_BLOCK + if (req->r_bio) + bio_put(req->r_bio); +#endif ceph_put_snap_context(req->r_snapc); + if (req->r_trail) { + ceph_pagelist_release(req->r_trail); + kfree(req->r_trail); + } if (req->r_mempool) mempool_free(req, req->r_osdc->req_mempool); else kfree(req); } +EXPORT_SYMBOL(ceph_osdc_release_request); -/* - * build new request AND message, calculate layout, and adjust file - * extent as needed. - * - * if the file was recently truncated, we include information about its - * old and new size so that the object can be updated appropriately. (we - * avoid synchronously deleting truncated objects because it's slow.) - * - * if @do_sync, include a 'startsync' command so that the osd will flush - * data quickly. - */ -struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, - struct ceph_file_layout *layout, - struct ceph_vino vino, - u64 off, u64 *plen, - int opcode, int flags, +static int get_num_ops(struct ceph_osd_req_op *ops, int *needs_trail) +{ + int i = 0; + + if (needs_trail) + *needs_trail = 0; + while (ops[i].op) { + if (needs_trail && op_needs_trail(ops[i].op)) + *needs_trail = 1; + i++; + } + + return i; +} + +struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, + int flags, struct ceph_snap_context *snapc, - int do_sync, - u32 truncate_seq, - u64 truncate_size, - struct timespec *mtime, - bool use_mempool, int num_reply) + struct ceph_osd_req_op *ops, + bool use_mempool, + gfp_t gfp_flags, + struct page **pages, + struct bio *bio) { struct ceph_osd_request *req; struct ceph_msg *msg; - struct ceph_osd_request_head *head; - struct ceph_osd_op *op; - void *p; - int num_op = 1 + do_sync; - size_t msg_size = sizeof(*head) + num_op*sizeof(*op); - int i; + int needs_trail; + int num_op = get_num_ops(ops, &needs_trail); + size_t msg_size = sizeof(struct ceph_osd_request_head); + + msg_size += num_op*sizeof(struct ceph_osd_op); if (use_mempool) { - req = mempool_alloc(osdc->req_mempool, GFP_NOFS); + req = mempool_alloc(osdc->req_mempool, gfp_flags); memset(req, 0, sizeof(*req)); } else { - req = kzalloc(sizeof(*req), GFP_NOFS); + req = kzalloc(sizeof(*req), gfp_flags); } if (req == NULL) return NULL; req->r_osdc = osdc; req->r_mempool = use_mempool; + kref_init(&req->r_kref); init_completion(&req->r_completion); init_completion(&req->r_safe_completion); @@ -164,13 +217,22 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0); else msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, - OSD_OPREPLY_FRONT_LEN, GFP_NOFS); + OSD_OPREPLY_FRONT_LEN, gfp_flags); if (!msg) { ceph_osdc_put_request(req); return NULL; } req->r_reply = msg; + /* allocate space for the trailing data */ + if (needs_trail) { + req->r_trail = kmalloc(sizeof(struct ceph_pagelist), gfp_flags); + if (!req->r_trail) { + ceph_osdc_put_request(req); + return NULL; + } + ceph_pagelist_init(req->r_trail); + } /* create request message; allow space for oid */ msg_size += 40; if (snapc) @@ -178,18 +240,115 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, if (use_mempool) msg = ceph_msgpool_get(&osdc->msgpool_op, 0); else - msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, GFP_NOFS); + msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, gfp_flags); if (!msg) { ceph_osdc_put_request(req); return NULL; } + msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP); memset(msg->front.iov_base, 0, msg->front.iov_len); + + req->r_request = msg; + req->r_pages = pages; +#ifdef CONFIG_BLOCK + if (bio) { + req->r_bio = bio; + bio_get(req->r_bio); + } +#endif + + return req; +} +EXPORT_SYMBOL(ceph_osdc_alloc_request); + +static void osd_req_encode_op(struct ceph_osd_request *req, + struct ceph_osd_op *dst, + struct ceph_osd_req_op *src) +{ + dst->op = cpu_to_le16(src->op); + + switch (dst->op) { + case CEPH_OSD_OP_READ: + case CEPH_OSD_OP_WRITE: + dst->extent.offset = + cpu_to_le64(src->extent.offset); + dst->extent.length = + cpu_to_le64(src->extent.length); + dst->extent.truncate_size = + cpu_to_le64(src->extent.truncate_size); + dst->extent.truncate_seq = + cpu_to_le32(src->extent.truncate_seq); + break; + + case CEPH_OSD_OP_GETXATTR: + case CEPH_OSD_OP_SETXATTR: + case CEPH_OSD_OP_CMPXATTR: + BUG_ON(!req->r_trail); + + dst->xattr.name_len = cpu_to_le32(src->xattr.name_len); + dst->xattr.value_len = cpu_to_le32(src->xattr.value_len); + dst->xattr.cmp_op = src->xattr.cmp_op; + dst->xattr.cmp_mode = src->xattr.cmp_mode; + ceph_pagelist_append(req->r_trail, src->xattr.name, + src->xattr.name_len); + ceph_pagelist_append(req->r_trail, src->xattr.val, + src->xattr.value_len); + break; + case CEPH_OSD_OP_CALL: + BUG_ON(!req->r_trail); + + dst->cls.class_len = src->cls.class_len; + dst->cls.method_len = src->cls.method_len; + dst->cls.indata_len = cpu_to_le32(src->cls.indata_len); + + ceph_pagelist_append(req->r_trail, src->cls.class_name, + src->cls.class_len); + ceph_pagelist_append(req->r_trail, src->cls.method_name, + src->cls.method_len); + ceph_pagelist_append(req->r_trail, src->cls.indata, + src->cls.indata_len); + break; + case CEPH_OSD_OP_ROLLBACK: + dst->snap.snapid = cpu_to_le64(src->snap.snapid); + break; + case CEPH_OSD_OP_STARTSYNC: + break; + default: + pr_err("unrecognized osd opcode %d\n", dst->op); + WARN_ON(1); + break; + } + dst->payload_len = cpu_to_le32(src->payload_len); +} + +/* + * build new request AND message + * + */ +void ceph_osdc_build_request(struct ceph_osd_request *req, + u64 off, u64 *plen, + struct ceph_osd_req_op *src_ops, + struct ceph_snap_context *snapc, + struct timespec *mtime, + const char *oid, + int oid_len) +{ + struct ceph_msg *msg = req->r_request; + struct ceph_osd_request_head *head; + struct ceph_osd_req_op *src_op; + struct ceph_osd_op *op; + void *p; + int num_op = get_num_ops(src_ops, NULL); + size_t msg_size = sizeof(*head) + num_op*sizeof(*op); + int flags = req->r_flags; + u64 data_len = 0; + int i; + head = msg->front.iov_base; op = (void *)(head + 1); p = (void *)(op + num_op); - req->r_request = msg; req->r_snapc = ceph_get_snap_context(snapc); head->client_inc = cpu_to_le32(1); /* always, for now. */ @@ -197,29 +356,23 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, if (flags & CEPH_OSD_FLAG_WRITE) ceph_encode_timespec(&head->mtime, mtime); head->num_ops = cpu_to_le16(num_op); - op->op = cpu_to_le16(opcode); - /* calculate max write size */ - calc_layout(osdc, vino, layout, off, plen, req); - req->r_file_layout = *layout; /* keep a copy */ - - if (flags & CEPH_OSD_FLAG_WRITE) { - req->r_request->hdr.data_off = cpu_to_le16(off); - req->r_request->hdr.data_len = cpu_to_le32(*plen); - op->payload_len = cpu_to_le32(*plen); - } - op->extent.truncate_size = cpu_to_le64(truncate_size); - op->extent.truncate_seq = cpu_to_le32(truncate_seq); /* fill in oid */ - head->object_len = cpu_to_le32(req->r_oid_len); - memcpy(p, req->r_oid, req->r_oid_len); - p += req->r_oid_len; - - if (do_sync) { + head->object_len = cpu_to_le32(oid_len); + memcpy(p, oid, oid_len); + p += oid_len; + + src_op = src_ops; + while (src_op->op) { + osd_req_encode_op(req, op, src_op); + src_op++; op++; - op->op = cpu_to_le16(CEPH_OSD_OP_STARTSYNC); } + + if (req->r_trail) + data_len += req->r_trail->length; + if (snapc) { head->snap_seq = cpu_to_le64(snapc->seq); head->num_snaps = cpu_to_le32(snapc->num_snaps); @@ -229,12 +382,79 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, } } + if (flags & CEPH_OSD_FLAG_WRITE) { + req->r_request->hdr.data_off = cpu_to_le16(off); + req->r_request->hdr.data_len = cpu_to_le32(*plen + data_len); + } else if (data_len) { + req->r_request->hdr.data_off = 0; + req->r_request->hdr.data_len = cpu_to_le32(data_len); + } + BUG_ON(p > msg->front.iov_base + msg->front.iov_len); msg_size = p - msg->front.iov_base; msg->front.iov_len = msg_size; msg->hdr.front_len = cpu_to_le32(msg_size); + return; +} +EXPORT_SYMBOL(ceph_osdc_build_request); + +/* + * build new request AND message, calculate layout, and adjust file + * extent as needed. + * + * if the file was recently truncated, we include information about its + * old and new size so that the object can be updated appropriately. (we + * avoid synchronously deleting truncated objects because it's slow.) + * + * if @do_sync, include a 'startsync' command so that the osd will flush + * data quickly. + */ +struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, + struct ceph_file_layout *layout, + struct ceph_vino vino, + u64 off, u64 *plen, + int opcode, int flags, + struct ceph_snap_context *snapc, + int do_sync, + u32 truncate_seq, + u64 truncate_size, + struct timespec *mtime, + bool use_mempool, int num_reply) +{ + struct ceph_osd_req_op ops[3]; + struct ceph_osd_request *req; + + ops[0].op = opcode; + ops[0].extent.truncate_seq = truncate_seq; + ops[0].extent.truncate_size = truncate_size; + ops[0].payload_len = 0; + + if (do_sync) { + ops[1].op = CEPH_OSD_OP_STARTSYNC; + ops[1].payload_len = 0; + ops[2].op = 0; + } else + ops[1].op = 0; + + req = ceph_osdc_alloc_request(osdc, flags, + snapc, ops, + use_mempool, + GFP_NOFS, NULL, NULL); + if (IS_ERR(req)) + return req; + + /* calculate max write size */ + calc_layout(osdc, vino, layout, off, plen, req, ops); + req->r_file_layout = *layout; /* keep a copy */ + + ceph_osdc_build_request(req, off, plen, ops, + snapc, + mtime, + req->r_oid, req->r_oid_len); + return req; } +EXPORT_SYMBOL(ceph_osdc_new_request); /* * We keep osd requests in an rbtree, sorted by ->r_tid. @@ -389,7 +609,7 @@ static void __move_osd_to_lru(struct ceph_osd_client *osdc, dout("__move_osd_to_lru %p\n", osd); BUG_ON(!list_empty(&osd->o_osd_lru)); list_add_tail(&osd->o_osd_lru, &osdc->osd_lru); - osd->lru_ttl = jiffies + osdc->client->mount_args->osd_idle_ttl * HZ; + osd->lru_ttl = jiffies + osdc->client->options->osd_idle_ttl * HZ; } static void __remove_osd_from_lru(struct ceph_osd *osd) @@ -483,7 +703,7 @@ static struct ceph_osd *__lookup_osd(struct ceph_osd_client *osdc, int o) static void __schedule_osd_timeout(struct ceph_osd_client *osdc) { schedule_delayed_work(&osdc->timeout_work, - osdc->client->mount_args->osd_keepalive_timeout * HZ); + osdc->client->options->osd_keepalive_timeout * HZ); } static void __cancel_osd_timeout(struct ceph_osd_client *osdc) @@ -684,9 +904,9 @@ static void handle_timeout(struct work_struct *work) container_of(work, struct ceph_osd_client, timeout_work.work); struct ceph_osd_request *req, *last_req = NULL; struct ceph_osd *osd; - unsigned long timeout = osdc->client->mount_args->osd_timeout * HZ; + unsigned long timeout = osdc->client->options->osd_timeout * HZ; unsigned long keepalive = - osdc->client->mount_args->osd_keepalive_timeout * HZ; + osdc->client->options->osd_keepalive_timeout * HZ; unsigned long last_stamp = 0; struct rb_node *p; struct list_head slow_osds; @@ -773,7 +993,7 @@ static void handle_osds_timeout(struct work_struct *work) container_of(work, struct ceph_osd_client, osds_timeout_work.work); unsigned long delay = - osdc->client->mount_args->osd_idle_ttl * HZ >> 2; + osdc->client->options->osd_idle_ttl * HZ >> 2; dout("osds timeout\n"); down_read(&osdc->map_sem); @@ -1104,6 +1324,10 @@ int ceph_osdc_start_request(struct ceph_osd_client *osdc, req->r_request->pages = req->r_pages; req->r_request->nr_pages = req->r_num_pages; +#ifdef CONFIG_BLOCK + req->r_request->bio = req->r_bio; +#endif + req->r_request->trail = req->r_trail; register_request(osdc, req); @@ -1131,6 +1355,7 @@ int ceph_osdc_start_request(struct ceph_osd_client *osdc, up_read(&osdc->map_sem); return rc; } +EXPORT_SYMBOL(ceph_osdc_start_request); /* * wait for a request to complete @@ -1153,6 +1378,7 @@ int ceph_osdc_wait_request(struct ceph_osd_client *osdc, dout("wait_request tid %llu result %d\n", req->r_tid, req->r_result); return req->r_result; } +EXPORT_SYMBOL(ceph_osdc_wait_request); /* * sync - wait for all in-flight requests to flush. avoid starvation. @@ -1186,6 +1412,7 @@ void ceph_osdc_sync(struct ceph_osd_client *osdc) mutex_unlock(&osdc->request_mutex); dout("sync done (thru tid %llu)\n", last_tid); } +EXPORT_SYMBOL(ceph_osdc_sync); /* * init, shutdown @@ -1211,7 +1438,7 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout); schedule_delayed_work(&osdc->osds_timeout_work, - round_jiffies_relative(osdc->client->mount_args->osd_idle_ttl * HZ)); + round_jiffies_relative(osdc->client->options->osd_idle_ttl * HZ)); err = -ENOMEM; osdc->req_mempool = mempool_create_kmalloc_pool(10, @@ -1237,6 +1464,7 @@ out_mempool: out: return err; } +EXPORT_SYMBOL(ceph_osdc_init); void ceph_osdc_stop(struct ceph_osd_client *osdc) { @@ -1251,6 +1479,7 @@ void ceph_osdc_stop(struct ceph_osd_client *osdc) ceph_msgpool_destroy(&osdc->msgpool_op); ceph_msgpool_destroy(&osdc->msgpool_op_reply); } +EXPORT_SYMBOL(ceph_osdc_stop); /* * Read some contiguous pages. If we cross a stripe boundary, shorten @@ -1288,6 +1517,7 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc, dout("readpages result %d\n", rc); return rc; } +EXPORT_SYMBOL(ceph_osdc_readpages); /* * do a synchronous write on N pages @@ -1330,6 +1560,7 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, dout("writepages result %d\n", rc); return rc; } +EXPORT_SYMBOL(ceph_osdc_writepages); /* * handle incoming message @@ -1420,6 +1651,9 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, } m->pages = req->r_pages; m->nr_pages = req->r_num_pages; +#ifdef CONFIG_BLOCK + m->bio = req->r_bio; +#endif } *skip = 0; req->r_con_filling_msg = ceph_con_get(con); diff --git a/fs/ceph/osdmap.c b/net/ceph/osdmap.c index e31f118f1392..d73f3f6efa36 100644 --- a/fs/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -1,14 +1,15 @@ -#include "ceph_debug.h" +#include <linux/ceph/ceph_debug.h> +#include <linux/module.h> #include <linux/slab.h> #include <asm/div64.h> -#include "super.h" -#include "osdmap.h" -#include "crush/hash.h" -#include "crush/mapper.h" -#include "decode.h" +#include <linux/ceph/libceph.h> +#include <linux/ceph/osdmap.h> +#include <linux/ceph/decode.h> +#include <linux/crush/hash.h> +#include <linux/crush/mapper.h> char *ceph_osdmap_state_str(char *str, int len, int state) { @@ -417,6 +418,20 @@ static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, int id) return NULL; } +int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name) +{ + struct rb_node *rbp; + + for (rbp = rb_first(&map->pg_pools); rbp; rbp = rb_next(rbp)) { + struct ceph_pg_pool_info *pi = + rb_entry(rbp, struct ceph_pg_pool_info, node); + if (pi->name && strcmp(pi->name, name) == 0) + return pi->id; + } + return -ENOENT; +} +EXPORT_SYMBOL(ceph_pg_poolid_by_name); + static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi) { rb_erase(&pi->node, root); @@ -966,6 +981,7 @@ void ceph_calc_file_object_mapping(struct ceph_file_layout *layout, dout(" obj extent %llu~%llu\n", *oxoff, *oxlen); } +EXPORT_SYMBOL(ceph_calc_file_object_mapping); /* * calculate an object layout (i.e. pgid) from an oid, @@ -1011,6 +1027,7 @@ int ceph_calc_object_layout(struct ceph_object_layout *ol, ol->ol_stripe_unit = fl->fl_object_stripe_unit; return 0; } +EXPORT_SYMBOL(ceph_calc_object_layout); /* * Calculate raw osd vector for the given pgid. Return pointer to osd @@ -1108,3 +1125,4 @@ int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid) return osds[i]; return -1; } +EXPORT_SYMBOL(ceph_calc_pg_primary); diff --git a/net/ceph/pagelist.c b/net/ceph/pagelist.c new file mode 100644 index 000000000000..13cb409a7bba --- /dev/null +++ b/net/ceph/pagelist.c @@ -0,0 +1,154 @@ + +#include <linux/module.h> +#include <linux/gfp.h> +#include <linux/pagemap.h> +#include <linux/highmem.h> +#include <linux/ceph/pagelist.h> + +static void ceph_pagelist_unmap_tail(struct ceph_pagelist *pl) +{ + if (pl->mapped_tail) { + struct page *page = list_entry(pl->head.prev, struct page, lru); + kunmap(page); + pl->mapped_tail = NULL; + } +} + +int ceph_pagelist_release(struct ceph_pagelist *pl) +{ + ceph_pagelist_unmap_tail(pl); + while (!list_empty(&pl->head)) { + struct page *page = list_first_entry(&pl->head, struct page, + lru); + list_del(&page->lru); + __free_page(page); + } + ceph_pagelist_free_reserve(pl); + return 0; +} +EXPORT_SYMBOL(ceph_pagelist_release); + +static int ceph_pagelist_addpage(struct ceph_pagelist *pl) +{ + struct page *page; + + if (!pl->num_pages_free) { + page = __page_cache_alloc(GFP_NOFS); + } else { + page = list_first_entry(&pl->free_list, struct page, lru); + list_del(&page->lru); + --pl->num_pages_free; + } + if (!page) + return -ENOMEM; + pl->room += PAGE_SIZE; + ceph_pagelist_unmap_tail(pl); + list_add_tail(&page->lru, &pl->head); + pl->mapped_tail = kmap(page); + return 0; +} + +int ceph_pagelist_append(struct ceph_pagelist *pl, const void *buf, size_t len) +{ + while (pl->room < len) { + size_t bit = pl->room; + int ret; + + memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK), + buf, bit); + pl->length += bit; + pl->room -= bit; + buf += bit; + len -= bit; + ret = ceph_pagelist_addpage(pl); + if (ret) + return ret; + } + + memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK), buf, len); + pl->length += len; + pl->room -= len; + return 0; +} +EXPORT_SYMBOL(ceph_pagelist_append); + +/** + * Allocate enough pages for a pagelist to append the given amount + * of data without without allocating. + * Returns: 0 on success, -ENOMEM on error. + */ +int ceph_pagelist_reserve(struct ceph_pagelist *pl, size_t space) +{ + if (space <= pl->room) + return 0; + space -= pl->room; + space = (space + PAGE_SIZE - 1) >> PAGE_SHIFT; /* conv to num pages */ + + while (space > pl->num_pages_free) { + struct page *page = __page_cache_alloc(GFP_NOFS); + if (!page) + return -ENOMEM; + list_add_tail(&page->lru, &pl->free_list); + ++pl->num_pages_free; + } + return 0; +} +EXPORT_SYMBOL(ceph_pagelist_reserve); + +/** + * Free any pages that have been preallocated. + */ +int ceph_pagelist_free_reserve(struct ceph_pagelist *pl) +{ + while (!list_empty(&pl->free_list)) { + struct page *page = list_first_entry(&pl->free_list, + struct page, lru); + list_del(&page->lru); + __free_page(page); + --pl->num_pages_free; + } + BUG_ON(pl->num_pages_free); + return 0; +} +EXPORT_SYMBOL(ceph_pagelist_free_reserve); + +/** + * Create a truncation point. + */ +void ceph_pagelist_set_cursor(struct ceph_pagelist *pl, + struct ceph_pagelist_cursor *c) +{ + c->pl = pl; + c->page_lru = pl->head.prev; + c->room = pl->room; +} +EXPORT_SYMBOL(ceph_pagelist_set_cursor); + +/** + * Truncate a pagelist to the given point. Move extra pages to reserve. + * This won't sleep. + * Returns: 0 on success, + * -EINVAL if the pagelist doesn't match the trunc point pagelist + */ +int ceph_pagelist_truncate(struct ceph_pagelist *pl, + struct ceph_pagelist_cursor *c) +{ + struct page *page; + + if (pl != c->pl) + return -EINVAL; + ceph_pagelist_unmap_tail(pl); + while (pl->head.prev != c->page_lru) { + page = list_entry(pl->head.prev, struct page, lru); + list_del(&page->lru); /* remove from pagelist */ + list_add_tail(&page->lru, &pl->free_list); /* add to reserve */ + ++pl->num_pages_free; + } + pl->room = c->room; + if (!list_empty(&pl->head)) { + page = list_entry(pl->head.prev, struct page, lru); + pl->mapped_tail = kmap(page); + } + return 0; +} +EXPORT_SYMBOL(ceph_pagelist_truncate); diff --git a/net/ceph/pagevec.c b/net/ceph/pagevec.c new file mode 100644 index 000000000000..54caf0687155 --- /dev/null +++ b/net/ceph/pagevec.c @@ -0,0 +1,223 @@ +#include <linux/ceph/ceph_debug.h> + +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/file.h> +#include <linux/namei.h> +#include <linux/writeback.h> + +#include <linux/ceph/libceph.h> + +/* + * build a vector of user pages + */ +struct page **ceph_get_direct_page_vector(const char __user *data, + int num_pages, + loff_t off, size_t len) +{ + struct page **pages; + int rc; + + pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS); + if (!pages) + return ERR_PTR(-ENOMEM); + + down_read(¤t->mm->mmap_sem); + rc = get_user_pages(current, current->mm, (unsigned long)data, + num_pages, 0, 0, pages, NULL); + up_read(¤t->mm->mmap_sem); + if (rc < 0) + goto fail; + return pages; + +fail: + kfree(pages); + return ERR_PTR(rc); +} +EXPORT_SYMBOL(ceph_get_direct_page_vector); + +void ceph_put_page_vector(struct page **pages, int num_pages) +{ + int i; + + for (i = 0; i < num_pages; i++) + put_page(pages[i]); + kfree(pages); +} +EXPORT_SYMBOL(ceph_put_page_vector); + +void ceph_release_page_vector(struct page **pages, int num_pages) +{ + int i; + + for (i = 0; i < num_pages; i++) + __free_pages(pages[i], 0); + kfree(pages); +} +EXPORT_SYMBOL(ceph_release_page_vector); + +/* + * allocate a vector new pages + */ +struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags) +{ + struct page **pages; + int i; + + pages = kmalloc(sizeof(*pages) * num_pages, flags); + if (!pages) + return ERR_PTR(-ENOMEM); + for (i = 0; i < num_pages; i++) { + pages[i] = __page_cache_alloc(flags); + if (pages[i] == NULL) { + ceph_release_page_vector(pages, i); + return ERR_PTR(-ENOMEM); + } + } + return pages; +} +EXPORT_SYMBOL(ceph_alloc_page_vector); + +/* + * copy user data into a page vector + */ +int ceph_copy_user_to_page_vector(struct page **pages, + const char __user *data, + loff_t off, size_t len) +{ + int i = 0; + int po = off & ~PAGE_CACHE_MASK; + int left = len; + int l, bad; + + while (left > 0) { + l = min_t(int, PAGE_CACHE_SIZE-po, left); + bad = copy_from_user(page_address(pages[i]) + po, data, l); + if (bad == l) + return -EFAULT; + data += l - bad; + left -= l - bad; + po += l - bad; + if (po == PAGE_CACHE_SIZE) { + po = 0; + i++; + } + } + return len; +} +EXPORT_SYMBOL(ceph_copy_user_to_page_vector); + +int ceph_copy_to_page_vector(struct page **pages, + const char *data, + loff_t off, size_t len) +{ + int i = 0; + size_t po = off & ~PAGE_CACHE_MASK; + size_t left = len; + size_t l; + + while (left > 0) { + l = min_t(size_t, PAGE_CACHE_SIZE-po, left); + memcpy(page_address(pages[i]) + po, data, l); + data += l; + left -= l; + po += l; + if (po == PAGE_CACHE_SIZE) { + po = 0; + i++; + } + } + return len; +} +EXPORT_SYMBOL(ceph_copy_to_page_vector); + +int ceph_copy_from_page_vector(struct page **pages, + char *data, + loff_t off, size_t len) +{ + int i = 0; + size_t po = off & ~PAGE_CACHE_MASK; + size_t left = len; + size_t l; + + while (left > 0) { + l = min_t(size_t, PAGE_CACHE_SIZE-po, left); + memcpy(data, page_address(pages[i]) + po, l); + data += l; + left -= l; + po += l; + if (po == PAGE_CACHE_SIZE) { + po = 0; + i++; + } + } + return len; +} +EXPORT_SYMBOL(ceph_copy_from_page_vector); + +/* + * copy user data from a page vector into a user pointer + */ +int ceph_copy_page_vector_to_user(struct page **pages, + char __user *data, + loff_t off, size_t len) +{ + int i = 0; + int po = off & ~PAGE_CACHE_MASK; + int left = len; + int l, bad; + + while (left > 0) { + l = min_t(int, left, PAGE_CACHE_SIZE-po); + bad = copy_to_user(data, page_address(pages[i]) + po, l); + if (bad == l) + return -EFAULT; + data += l - bad; + left -= l - bad; + if (po) { + po += l - bad; + if (po == PAGE_CACHE_SIZE) + po = 0; + } + i++; + } + return len; +} +EXPORT_SYMBOL(ceph_copy_page_vector_to_user); + +/* + * Zero an extent within a page vector. Offset is relative to the + * start of the first page. + */ +void ceph_zero_page_vector_range(int off, int len, struct page **pages) +{ + int i = off >> PAGE_CACHE_SHIFT; + + off &= ~PAGE_CACHE_MASK; + + dout("zero_page_vector_page %u~%u\n", off, len); + + /* leading partial page? */ + if (off) { + int end = min((int)PAGE_CACHE_SIZE, off + len); + dout("zeroing %d %p head from %d\n", i, pages[i], + (int)off); + zero_user_segment(pages[i], off, end); + len -= (end - off); + i++; + } + while (len >= PAGE_CACHE_SIZE) { + dout("zeroing %d %p len=%d\n", i, pages[i], len); + zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE); + len -= PAGE_CACHE_SIZE; + i++; + } + /* trailing partial page? */ + if (len) { + dout("zeroing %d %p tail to %d\n", i, pages[i], (int)len); + zero_user_segment(pages[i], 0, len); + } +} +EXPORT_SYMBOL(ceph_zero_page_vector_range); + diff --git a/net/core/datagram.c b/net/core/datagram.c index 251997a95483..282806ba7a57 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -243,6 +243,7 @@ void skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb) unlock_sock_fast(sk, slow); /* skb is now orphaned, can be freed outside of locked section */ + trace_kfree_skb(skb, skb_free_datagram_locked); __kfree_skb(skb); } EXPORT_SYMBOL(skb_free_datagram_locked); diff --git a/net/core/dev.c b/net/core/dev.c index 660dd41aaaa6..7ec85e27beed 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -128,6 +128,8 @@ #include <linux/jhash.h> #include <linux/random.h> #include <trace/events/napi.h> +#include <trace/events/net.h> +#include <trace/events/skb.h> #include <linux/pci.h> #include "net-sysfs.h" @@ -1978,6 +1980,7 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, } rc = ops->ndo_start_xmit(skb, dev); + trace_net_dev_xmit(skb, rc); if (rc == NETDEV_TX_OK) txq_trans_update(txq); return rc; @@ -1998,6 +2001,7 @@ gso: skb_dst_drop(nskb); rc = ops->ndo_start_xmit(nskb, dev); + trace_net_dev_xmit(nskb, rc); if (unlikely(rc != NETDEV_TX_OK)) { if (rc & ~NETDEV_TX_MASK) goto out_kfree_gso_skb; @@ -2186,6 +2190,7 @@ int dev_queue_xmit(struct sk_buff *skb) #ifdef CONFIG_NET_CLS_ACT skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS); #endif + trace_net_dev_queue(skb); if (q->enqueue) { rc = __dev_xmit_skb(skb, q, dev, txq); goto out; @@ -2512,6 +2517,7 @@ int netif_rx(struct sk_buff *skb) if (netdev_tstamp_prequeue) net_timestamp_check(skb); + trace_netif_rx(skb); #ifdef CONFIG_RPS { struct rps_dev_flow voidflow, *rflow = &voidflow; @@ -2571,6 +2577,7 @@ static void net_tx_action(struct softirq_action *h) clist = clist->next; WARN_ON(atomic_read(&skb->users)); + trace_kfree_skb(skb, net_tx_action); __kfree_skb(skb); } } @@ -2828,6 +2835,7 @@ static int __netif_receive_skb(struct sk_buff *skb) if (!netdev_tstamp_prequeue) net_timestamp_check(skb); + trace_netif_receive_skb(skb); if (vlan_tx_tag_present(skb) && vlan_hwaccel_do_receive(skb)) return NET_RX_SUCCESS; diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 4016ac6bdd5e..8451ab481095 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -397,7 +397,7 @@ static noinline_for_stack int ethtool_get_rxfh_indir(struct net_device *dev, (KMALLOC_MAX_SIZE - sizeof(*indir)) / sizeof(*indir->ring_index)) return -ENOMEM; full_size = sizeof(*indir) + sizeof(*indir->ring_index) * table_size; - indir = kmalloc(full_size, GFP_USER); + indir = kzalloc(full_size, GFP_USER); if (!indir) return -ENOMEM; @@ -538,7 +538,7 @@ static int ethtool_get_rx_ntuple(struct net_device *dev, void __user *useraddr) gstrings.len = ret; - data = kmalloc(gstrings.len * ETH_GSTRING_LEN, GFP_USER); + data = kzalloc(gstrings.len * ETH_GSTRING_LEN, GFP_USER); if (!data) return -ENOMEM; @@ -775,7 +775,7 @@ static int ethtool_get_regs(struct net_device *dev, char __user *useraddr) if (regs.len > reglen) regs.len = reglen; - regbuf = kmalloc(reglen, GFP_USER); + regbuf = kzalloc(reglen, GFP_USER); if (!regbuf) return -ENOMEM; diff --git a/net/core/net-traces.c b/net/core/net-traces.c index afa6380ed88a..7f1bb2aba03b 100644 --- a/net/core/net-traces.c +++ b/net/core/net-traces.c @@ -26,6 +26,7 @@ #define CREATE_TRACE_POINTS #include <trace/events/skb.h> +#include <trace/events/net.h> #include <trace/events/napi.h> EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index c83b421341c0..56ba3c4e4761 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -466,6 +466,7 @@ void consume_skb(struct sk_buff *skb) smp_rmb(); else if (likely(!atomic_dec_and_test(&skb->users))) return; + trace_consume_skb(skb); __kfree_skb(skb); } EXPORT_SYMBOL(consume_skb); diff --git a/net/core/sock.c b/net/core/sock.c index ef30e9d286e7..7d99e13148e6 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1078,8 +1078,11 @@ static void sk_prot_free(struct proto *prot, struct sock *sk) #ifdef CONFIG_CGROUPS void sock_update_classid(struct sock *sk) { - u32 classid = task_cls_classid(current); + u32 classid; + rcu_read_lock(); /* doing current task, which cannot vanish. */ + classid = task_cls_classid(current); + rcu_read_unlock(); if (classid && classid != sk->sk_classid) sk->sk_classid = classid; } diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c index 244f7cb08d68..37f8adb68c79 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c @@ -11,6 +11,7 @@ #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/percpu.h> +#include <linux/security.h> #include <net/net_namespace.h> #include <linux/netfilter.h> @@ -87,6 +88,29 @@ static void ct_seq_stop(struct seq_file *s, void *v) rcu_read_unlock(); } +#ifdef CONFIG_NF_CONNTRACK_SECMARK +static int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct) +{ + int ret; + u32 len; + char *secctx; + + ret = security_secid_to_secctx(ct->secmark, &secctx, &len); + if (ret) + return ret; + + ret = seq_printf(s, "secctx=%s ", secctx); + + security_release_secctx(secctx, len); + return ret; +} +#else +static inline int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct) +{ + return 0; +} +#endif + static int ct_seq_show(struct seq_file *s, void *v) { struct nf_conntrack_tuple_hash *hash = v; @@ -148,10 +172,8 @@ static int ct_seq_show(struct seq_file *s, void *v) goto release; #endif -#ifdef CONFIG_NF_CONNTRACK_SECMARK - if (seq_printf(s, "secmark=%u ", ct->secmark)) + if (ct_show_secctx(s, ct)) goto release; -#endif if (seq_printf(s, "use=%u\n", atomic_read(&ct->ct_general.use))) goto release; diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c index 8c8632d9b93c..957c9241fb0c 100644 --- a/net/ipv4/netfilter/nf_nat_core.c +++ b/net/ipv4/netfilter/nf_nat_core.c @@ -38,7 +38,7 @@ static DEFINE_SPINLOCK(nf_nat_lock); static struct nf_conntrack_l3proto *l3proto __read_mostly; #define MAX_IP_NAT_PROTO 256 -static const struct nf_nat_protocol *nf_nat_protos[MAX_IP_NAT_PROTO] +static const struct nf_nat_protocol __rcu *nf_nat_protos[MAX_IP_NAT_PROTO] __read_mostly; static inline const struct nf_nat_protocol * diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 78b505d33bfb..fdaec7daff1d 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -27,7 +27,7 @@ static DEFINE_MUTEX(afinfo_mutex); -const struct nf_afinfo *nf_afinfo[NFPROTO_NUMPROTO] __read_mostly; +const struct nf_afinfo __rcu *nf_afinfo[NFPROTO_NUMPROTO] __read_mostly; EXPORT_SYMBOL(nf_afinfo); int nf_register_afinfo(const struct nf_afinfo *afinfo) diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c index cdcc7649476b..5702de35e2bb 100644 --- a/net/netfilter/nf_conntrack_ecache.c +++ b/net/netfilter/nf_conntrack_ecache.c @@ -26,10 +26,10 @@ static DEFINE_MUTEX(nf_ct_ecache_mutex); -struct nf_ct_event_notifier *nf_conntrack_event_cb __read_mostly; +struct nf_ct_event_notifier __rcu *nf_conntrack_event_cb __read_mostly; EXPORT_SYMBOL_GPL(nf_conntrack_event_cb); -struct nf_exp_event_notifier *nf_expect_event_cb __read_mostly; +struct nf_exp_event_notifier __rcu *nf_expect_event_cb __read_mostly; EXPORT_SYMBOL_GPL(nf_expect_event_cb); /* deliver cached events and clear cache entry - must be called with locally diff --git a/net/netfilter/nf_conntrack_extend.c b/net/netfilter/nf_conntrack_extend.c index 8d9e4c949b96..bd82450c193f 100644 --- a/net/netfilter/nf_conntrack_extend.c +++ b/net/netfilter/nf_conntrack_extend.c @@ -16,7 +16,7 @@ #include <linux/skbuff.h> #include <net/netfilter/nf_conntrack_extend.h> -static struct nf_ct_ext_type *nf_ct_ext_types[NF_CT_EXT_NUM]; +static struct nf_ct_ext_type __rcu *nf_ct_ext_types[NF_CT_EXT_NUM]; static DEFINE_MUTEX(nf_ct_ext_type_mutex); void __nf_ct_ext_destroy(struct nf_conn *ct) diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 5bae1cd15eea..146476c6441a 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -22,6 +22,7 @@ #include <linux/rculist_nulls.h> #include <linux/types.h> #include <linux/timer.h> +#include <linux/security.h> #include <linux/skbuff.h> #include <linux/errno.h> #include <linux/netlink.h> @@ -245,16 +246,31 @@ nla_put_failure: #ifdef CONFIG_NF_CONNTRACK_SECMARK static inline int -ctnetlink_dump_secmark(struct sk_buff *skb, const struct nf_conn *ct) +ctnetlink_dump_secctx(struct sk_buff *skb, const struct nf_conn *ct) { - NLA_PUT_BE32(skb, CTA_SECMARK, htonl(ct->secmark)); - return 0; + struct nlattr *nest_secctx; + int len, ret; + char *secctx; + + ret = security_secid_to_secctx(ct->secmark, &secctx, &len); + if (ret) + return ret; + + ret = -1; + nest_secctx = nla_nest_start(skb, CTA_SECCTX | NLA_F_NESTED); + if (!nest_secctx) + goto nla_put_failure; + + NLA_PUT_STRING(skb, CTA_SECCTX_NAME, secctx); + nla_nest_end(skb, nest_secctx); + ret = 0; nla_put_failure: - return -1; + security_release_secctx(secctx, len); + return ret; } #else -#define ctnetlink_dump_secmark(a, b) (0) +#define ctnetlink_dump_secctx(a, b) (0) #endif #define master_tuple(ct) &(ct->master->tuplehash[IP_CT_DIR_ORIGINAL].tuple) @@ -391,7 +407,7 @@ ctnetlink_fill_info(struct sk_buff *skb, u32 pid, u32 seq, ctnetlink_dump_protoinfo(skb, ct) < 0 || ctnetlink_dump_helpinfo(skb, ct) < 0 || ctnetlink_dump_mark(skb, ct) < 0 || - ctnetlink_dump_secmark(skb, ct) < 0 || + ctnetlink_dump_secctx(skb, ct) < 0 || ctnetlink_dump_id(skb, ct) < 0 || ctnetlink_dump_use(skb, ct) < 0 || ctnetlink_dump_master(skb, ct) < 0 || @@ -437,6 +453,17 @@ ctnetlink_counters_size(const struct nf_conn *ct) ; } +#ifdef CONFIG_NF_CONNTRACK_SECMARK +static int ctnetlink_nlmsg_secctx_size(const struct nf_conn *ct) +{ + int len; + + security_secid_to_secctx(ct->secmark, NULL, &len); + + return sizeof(char) * len; +} +#endif + static inline size_t ctnetlink_nlmsg_size(const struct nf_conn *ct) { @@ -453,7 +480,8 @@ ctnetlink_nlmsg_size(const struct nf_conn *ct) + nla_total_size(0) /* CTA_HELP */ + nla_total_size(NF_CT_HELPER_NAME_LEN) /* CTA_HELP_NAME */ #ifdef CONFIG_NF_CONNTRACK_SECMARK - + nla_total_size(sizeof(u_int32_t)) /* CTA_SECMARK */ + + nla_total_size(0) /* CTA_SECCTX */ + + nla_total_size(ctnetlink_nlmsg_secctx_size(ct)) /* CTA_SECCTX_NAME */ #endif #ifdef CONFIG_NF_NAT_NEEDED + 2 * nla_total_size(0) /* CTA_NAT_SEQ_ADJ_ORIG|REPL */ @@ -556,7 +584,7 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item) #ifdef CONFIG_NF_CONNTRACK_SECMARK if ((events & (1 << IPCT_SECMARK) || ct->secmark) - && ctnetlink_dump_secmark(skb, ct) < 0) + && ctnetlink_dump_secctx(skb, ct) < 0) goto nla_put_failure; #endif diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index 5886ba1d52a0..ed6d92958023 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -28,8 +28,8 @@ #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_core.h> -static struct nf_conntrack_l4proto **nf_ct_protos[PF_MAX] __read_mostly; -struct nf_conntrack_l3proto *nf_ct_l3protos[AF_MAX] __read_mostly; +static struct nf_conntrack_l4proto __rcu **nf_ct_protos[PF_MAX] __read_mostly; +struct nf_conntrack_l3proto __rcu *nf_ct_l3protos[AF_MAX] __read_mostly; EXPORT_SYMBOL_GPL(nf_ct_l3protos); static DEFINE_MUTEX(nf_ct_proto_mutex); diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index eb973fcd67ab..0fb65705b44b 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -15,6 +15,7 @@ #include <linux/seq_file.h> #include <linux/percpu.h> #include <linux/netdevice.h> +#include <linux/security.h> #include <net/net_namespace.h> #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> @@ -108,6 +109,29 @@ static void ct_seq_stop(struct seq_file *s, void *v) rcu_read_unlock(); } +#ifdef CONFIG_NF_CONNTRACK_SECMARK +static int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct) +{ + int ret; + u32 len; + char *secctx; + + ret = security_secid_to_secctx(ct->secmark, &secctx, &len); + if (ret) + return ret; + + ret = seq_printf(s, "secctx=%s ", secctx); + + security_release_secctx(secctx, len); + return ret; +} +#else +static inline int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct) +{ + return 0; +} +#endif + /* return 0 on success, 1 in case of error */ static int ct_seq_show(struct seq_file *s, void *v) { @@ -168,10 +192,8 @@ static int ct_seq_show(struct seq_file *s, void *v) goto release; #endif -#ifdef CONFIG_NF_CONNTRACK_SECMARK - if (seq_printf(s, "secmark=%u ", ct->secmark)) + if (ct_show_secctx(s, ct)) goto release; -#endif #ifdef CONFIG_NF_CONNTRACK_ZONES if (seq_printf(s, "zone=%u ", nf_ct_zone(ct))) diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c index 7df37fd786bc..b07393eab88e 100644 --- a/net/netfilter/nf_log.c +++ b/net/netfilter/nf_log.c @@ -16,7 +16,7 @@ #define NF_LOG_PREFIXLEN 128 #define NFLOGGER_NAME_LEN 64 -static const struct nf_logger *nf_loggers[NFPROTO_NUMPROTO] __read_mostly; +static const struct nf_logger __rcu *nf_loggers[NFPROTO_NUMPROTO] __read_mostly; static struct list_head nf_loggers_l[NFPROTO_NUMPROTO] __read_mostly; static DEFINE_MUTEX(nf_log_mutex); diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index 78b3cf9c519c..74aebed5bd28 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -18,7 +18,7 @@ * long term mutex. The handler must provide an an outfn() to accept packets * for queueing and must reinject all packets it receives, no matter what. */ -static const struct nf_queue_handler *queue_handler[NFPROTO_NUMPROTO] __read_mostly; +static const struct nf_queue_handler __rcu *queue_handler[NFPROTO_NUMPROTO] __read_mostly; static DEFINE_MUTEX(queue_handler_mutex); diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c index 0cb6053f02fd..782e51986a6f 100644 --- a/net/netfilter/xt_CT.c +++ b/net/netfilter/xt_CT.c @@ -9,7 +9,6 @@ #include <linux/module.h> #include <linux/gfp.h> #include <linux/skbuff.h> -#include <linux/selinux.h> #include <linux/netfilter_ipv4/ip_tables.h> #include <linux/netfilter_ipv6/ip6_tables.h> #include <linux/netfilter/x_tables.h> diff --git a/net/netfilter/xt_SECMARK.c b/net/netfilter/xt_SECMARK.c index 23b2d6c486b5..9faf5e050b79 100644 --- a/net/netfilter/xt_SECMARK.c +++ b/net/netfilter/xt_SECMARK.c @@ -14,8 +14,8 @@ */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> +#include <linux/security.h> #include <linux/skbuff.h> -#include <linux/selinux.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_SECMARK.h> @@ -39,9 +39,8 @@ secmark_tg(struct sk_buff *skb, const struct xt_action_param *par) switch (mode) { case SECMARK_MODE_SEL: - secmark = info->u.sel.selsid; + secmark = info->secid; break; - default: BUG(); } @@ -50,33 +49,33 @@ secmark_tg(struct sk_buff *skb, const struct xt_action_param *par) return XT_CONTINUE; } -static int checkentry_selinux(struct xt_secmark_target_info *info) +static int checkentry_lsm(struct xt_secmark_target_info *info) { int err; - struct xt_secmark_target_selinux_info *sel = &info->u.sel; - sel->selctx[SECMARK_SELCTX_MAX - 1] = '\0'; + info->secctx[SECMARK_SECCTX_MAX - 1] = '\0'; + info->secid = 0; - err = selinux_string_to_sid(sel->selctx, &sel->selsid); + err = security_secctx_to_secid(info->secctx, strlen(info->secctx), + &info->secid); if (err) { if (err == -EINVAL) - pr_info("invalid SELinux context \'%s\'\n", - sel->selctx); + pr_info("invalid security context \'%s\'\n", info->secctx); return err; } - if (!sel->selsid) { - pr_info("unable to map SELinux context \'%s\'\n", sel->selctx); + if (!info->secid) { + pr_info("unable to map security context \'%s\'\n", info->secctx); return -ENOENT; } - err = selinux_secmark_relabel_packet_permission(sel->selsid); + err = security_secmark_relabel_packet(info->secid); if (err) { pr_info("unable to obtain relabeling permission\n"); return err; } - selinux_secmark_refcount_inc(); + security_secmark_refcount_inc(); return 0; } @@ -100,16 +99,16 @@ static int secmark_tg_check(const struct xt_tgchk_param *par) switch (info->mode) { case SECMARK_MODE_SEL: - err = checkentry_selinux(info); - if (err <= 0) - return err; break; - default: pr_info("invalid mode: %hu\n", info->mode); return -EINVAL; } + err = checkentry_lsm(info); + if (err) + return err; + if (!mode) mode = info->mode; return 0; @@ -119,7 +118,7 @@ static void secmark_tg_destroy(const struct xt_tgdtor_param *par) { switch (mode) { case SECMARK_MODE_SEL: - selinux_secmark_refcount_dec(); + security_secmark_refcount_dec(); } } diff --git a/net/rds/page.c b/net/rds/page.c index 595a952d4b17..1dfbfea12e9b 100644 --- a/net/rds/page.c +++ b/net/rds/page.c @@ -57,30 +57,17 @@ int rds_page_copy_user(struct page *page, unsigned long offset, unsigned long ret; void *addr; - if (to_user) + addr = kmap(page); + if (to_user) { rds_stats_add(s_copy_to_user, bytes); - else + ret = copy_to_user(ptr, addr + offset, bytes); + } else { rds_stats_add(s_copy_from_user, bytes); - - addr = kmap_atomic(page, KM_USER0); - if (to_user) - ret = __copy_to_user_inatomic(ptr, addr + offset, bytes); - else - ret = __copy_from_user_inatomic(addr + offset, ptr, bytes); - kunmap_atomic(addr, KM_USER0); - - if (ret) { - addr = kmap(page); - if (to_user) - ret = copy_to_user(ptr, addr + offset, bytes); - else - ret = copy_from_user(addr + offset, ptr, bytes); - kunmap(page); - if (ret) - return -EFAULT; + ret = copy_from_user(addr + offset, ptr, bytes); } + kunmap(page); - return 0; + return ret ? -EFAULT : 0; } EXPORT_SYMBOL_GPL(rds_page_copy_user); diff --git a/scripts/Makefile b/scripts/Makefile index 842dbc2d5aed..2e088109fbd5 100644 --- a/scripts/Makefile +++ b/scripts/Makefile @@ -11,6 +11,7 @@ hostprogs-$(CONFIG_KALLSYMS) += kallsyms hostprogs-$(CONFIG_LOGO) += pnmtologo hostprogs-$(CONFIG_VT) += conmakehash hostprogs-$(CONFIG_IKCONFIG) += bin2c +hostprogs-$(BUILD_C_RECORDMCOUNT) += recordmcount always := $(hostprogs-y) $(hostprogs-m) diff --git a/scripts/Makefile.build b/scripts/Makefile.build index a1a5cf95a68d..843bd4f4ffc9 100644 --- a/scripts/Makefile.build +++ b/scripts/Makefile.build @@ -209,12 +209,22 @@ cmd_modversions = \ endif ifdef CONFIG_FTRACE_MCOUNT_RECORD +ifdef BUILD_C_RECORDMCOUNT +# Due to recursion, we must skip empty.o. +# The empty.o file is created in the make process in order to determine +# the target endianness and word size. It is made before all other C +# files, including recordmcount. +cmd_record_mcount = if [ $(@) != "scripts/mod/empty.o" ]; then \ + $(objtree)/scripts/recordmcount "$(@)"; \ + fi; +else cmd_record_mcount = set -e ; perl $(srctree)/scripts/recordmcount.pl "$(ARCH)" \ "$(if $(CONFIG_CPU_BIG_ENDIAN),big,little)" \ "$(if $(CONFIG_64BIT),64,32)" \ "$(OBJDUMP)" "$(OBJCOPY)" "$(CC)" "$(LD)" "$(NM)" "$(RM)" "$(MV)" \ "$(if $(part-of-module),1,0)" "$(@)"; endif +endif define rule_cc_o_c $(call echo-cmd,checksrc) $(cmd_checksrc) \ diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index 54fd1b700131..7bfcf1a09ac5 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -101,14 +101,6 @@ basename_flags = -D"KBUILD_BASENAME=KBUILD_STR($(call name-fix,$(basetarget)))" modname_flags = $(if $(filter 1,$(words $(modname))),\ -D"KBUILD_MODNAME=KBUILD_STR($(call name-fix,$(modname)))") -#hash values -ifdef CONFIG_DYNAMIC_DEBUG -debug_flags = -D"DEBUG_HASH=$(shell ./scripts/basic/hash djb2 $(@D)$(modname))"\ - -D"DEBUG_HASH2=$(shell ./scripts/basic/hash r5 $(@D)$(modname))" -else -debug_flags = -endif - orig_c_flags = $(KBUILD_CPPFLAGS) $(KBUILD_CFLAGS) $(KBUILD_SUBDIR_CCFLAGS) \ $(ccflags-y) $(CFLAGS_$(basetarget).o) _c_flags = $(filter-out $(CFLAGS_REMOVE_$(basetarget).o), $(orig_c_flags)) @@ -152,8 +144,7 @@ endif c_flags = -Wp,-MD,$(depfile) $(NOSTDINC_FLAGS) $(LINUXINCLUDE) \ $(__c_flags) $(modkern_cflags) \ - -D"KBUILD_STR(s)=\#s" $(basename_flags) $(modname_flags) \ - $(debug_flags) + -D"KBUILD_STR(s)=\#s" $(basename_flags) $(modname_flags) a_flags = -Wp,-MD,$(depfile) $(NOSTDINC_FLAGS) $(LINUXINCLUDE) \ $(__a_flags) $(modkern_aflags) diff --git a/scripts/basic/Makefile b/scripts/basic/Makefile index 09559951df12..4c324a1f1e0e 100644 --- a/scripts/basic/Makefile +++ b/scripts/basic/Makefile @@ -9,7 +9,7 @@ # fixdep: Used to generate dependency information during build process # docproc: Used in Documentation/DocBook -hostprogs-y := fixdep docproc hash +hostprogs-y := fixdep docproc always := $(hostprogs-y) # fixdep is needed to compile other host programs diff --git a/scripts/basic/hash.c b/scripts/basic/hash.c deleted file mode 100644 index 2ef5d3f666b8..000000000000 --- a/scripts/basic/hash.c +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright (C) 2008 Red Hat, Inc., Jason Baron <jbaron@redhat.com> - * - */ - -#include <stdio.h> -#include <stdlib.h> -#include <string.h> - -#define DYNAMIC_DEBUG_HASH_BITS 6 - -static const char *program; - -static void usage(void) -{ - printf("Usage: %s <djb2|r5> <modname>\n", program); - exit(1); -} - -/* djb2 hashing algorithm by Dan Bernstein. From: - * http://www.cse.yorku.ca/~oz/hash.html - */ - -static unsigned int djb2_hash(char *str) -{ - unsigned long hash = 5381; - int c; - - c = *str; - while (c) { - hash = ((hash << 5) + hash) + c; - c = *++str; - } - return (unsigned int)(hash & ((1 << DYNAMIC_DEBUG_HASH_BITS) - 1)); -} - -static unsigned int r5_hash(char *str) -{ - unsigned long hash = 0; - int c; - - c = *str; - while (c) { - hash = (hash + (c << 4) + (c >> 4)) * 11; - c = *++str; - } - return (unsigned int)(hash & ((1 << DYNAMIC_DEBUG_HASH_BITS) - 1)); -} - -int main(int argc, char *argv[]) -{ - program = argv[0]; - - if (argc != 3) - usage(); - if (!strcmp(argv[1], "djb2")) - printf("%d\n", djb2_hash(argv[2])); - else if (!strcmp(argv[1], "r5")) - printf("%d\n", r5_hash(argv[2])); - else - usage(); - exit(0); -} - diff --git a/scripts/gcc-goto.sh b/scripts/gcc-goto.sh new file mode 100644 index 000000000000..520d16b1ffaf --- /dev/null +++ b/scripts/gcc-goto.sh @@ -0,0 +1,5 @@ +#!/bin/sh +# Test for gcc 'asm goto' suport +# Copyright (C) 2010, Jason Baron <jbaron@redhat.com> + +echo "int main(void) { entry: asm goto (\"\"::::entry); return 0; }" | $@ -x c - -c -o /dev/null >/dev/null 2>&1 && echo "y" diff --git a/scripts/recordmcount.c b/scripts/recordmcount.c new file mode 100644 index 000000000000..26e1271259ba --- /dev/null +++ b/scripts/recordmcount.c @@ -0,0 +1,363 @@ +/* + * recordmcount.c: construct a table of the locations of calls to 'mcount' + * so that ftrace can find them quickly. + * Copyright 2009 John F. Reiser <jreiser@BitWagon.com>. All rights reserved. + * Licensed under the GNU General Public License, version 2 (GPLv2). + * + * Restructured to fit Linux format, as well as other updates: + * Copyright 2010 Steven Rostedt <srostedt@redhat.com>, Red Hat Inc. + */ + +/* + * Strategy: alter the .o file in-place. + * + * Append a new STRTAB that has the new section names, followed by a new array + * ElfXX_Shdr[] that has the new section headers, followed by the section + * contents for __mcount_loc and its relocations. The old shstrtab strings, + * and the old ElfXX_Shdr[] array, remain as "garbage" (commonly, a couple + * kilobytes.) Subsequent processing by /bin/ld (or the kernel module loader) + * will ignore the garbage regions, because they are not designated by the + * new .e_shoff nor the new ElfXX_Shdr[]. [In order to remove the garbage, + * then use "ld -r" to create a new file that omits the garbage.] + */ + +#include <sys/types.h> +#include <sys/mman.h> +#include <sys/stat.h> +#include <elf.h> +#include <fcntl.h> +#include <setjmp.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +static int fd_map; /* File descriptor for file being modified. */ +static int mmap_failed; /* Boolean flag. */ +static void *ehdr_curr; /* current ElfXX_Ehdr * for resource cleanup */ +static char gpfx; /* prefix for global symbol name (sometimes '_') */ +static struct stat sb; /* Remember .st_size, etc. */ +static jmp_buf jmpenv; /* setjmp/longjmp per-file error escape */ + +/* setjmp() return values */ +enum { + SJ_SETJMP = 0, /* hardwired first return */ + SJ_FAIL, + SJ_SUCCEED +}; + +/* Per-file resource cleanup when multiple files. */ +static void +cleanup(void) +{ + if (!mmap_failed) + munmap(ehdr_curr, sb.st_size); + else + free(ehdr_curr); + close(fd_map); +} + +static void __attribute__((noreturn)) +fail_file(void) +{ + cleanup(); + longjmp(jmpenv, SJ_FAIL); +} + +static void __attribute__((noreturn)) +succeed_file(void) +{ + cleanup(); + longjmp(jmpenv, SJ_SUCCEED); +} + +/* ulseek, uread, ...: Check return value for errors. */ + +static off_t +ulseek(int const fd, off_t const offset, int const whence) +{ + off_t const w = lseek(fd, offset, whence); + if ((off_t)-1 == w) { + perror("lseek"); + fail_file(); + } + return w; +} + +static size_t +uread(int const fd, void *const buf, size_t const count) +{ + size_t const n = read(fd, buf, count); + if (n != count) { + perror("read"); + fail_file(); + } + return n; +} + +static size_t +uwrite(int const fd, void const *const buf, size_t const count) +{ + size_t const n = write(fd, buf, count); + if (n != count) { + perror("write"); + fail_file(); + } + return n; +} + +static void * +umalloc(size_t size) +{ + void *const addr = malloc(size); + if (0 == addr) { + fprintf(stderr, "malloc failed: %zu bytes\n", size); + fail_file(); + } + return addr; +} + +/* + * Get the whole file as a programming convenience in order to avoid + * malloc+lseek+read+free of many pieces. If successful, then mmap + * avoids copying unused pieces; else just read the whole file. + * Open for both read and write; new info will be appended to the file. + * Use MAP_PRIVATE so that a few changes to the in-memory ElfXX_Ehdr + * do not propagate to the file until an explicit overwrite at the last. + * This preserves most aspects of consistency (all except .st_size) + * for simultaneous readers of the file while we are appending to it. + * However, multiple writers still are bad. We choose not to use + * locking because it is expensive and the use case of kernel build + * makes multiple writers unlikely. + */ +static void *mmap_file(char const *fname) +{ + void *addr; + + fd_map = open(fname, O_RDWR); + if (0 > fd_map || 0 > fstat(fd_map, &sb)) { + perror(fname); + fail_file(); + } + if (!S_ISREG(sb.st_mode)) { + fprintf(stderr, "not a regular file: %s\n", fname); + fail_file(); + } + addr = mmap(0, sb.st_size, PROT_READ|PROT_WRITE, MAP_PRIVATE, + fd_map, 0); + mmap_failed = 0; + if (MAP_FAILED == addr) { + mmap_failed = 1; + addr = umalloc(sb.st_size); + uread(fd_map, addr, sb.st_size); + } + return addr; +} + +/* w8rev, w8nat, ...: Handle endianness. */ + +static uint64_t w8rev(uint64_t const x) +{ + return ((0xff & (x >> (0 * 8))) << (7 * 8)) + | ((0xff & (x >> (1 * 8))) << (6 * 8)) + | ((0xff & (x >> (2 * 8))) << (5 * 8)) + | ((0xff & (x >> (3 * 8))) << (4 * 8)) + | ((0xff & (x >> (4 * 8))) << (3 * 8)) + | ((0xff & (x >> (5 * 8))) << (2 * 8)) + | ((0xff & (x >> (6 * 8))) << (1 * 8)) + | ((0xff & (x >> (7 * 8))) << (0 * 8)); +} + +static uint32_t w4rev(uint32_t const x) +{ + return ((0xff & (x >> (0 * 8))) << (3 * 8)) + | ((0xff & (x >> (1 * 8))) << (2 * 8)) + | ((0xff & (x >> (2 * 8))) << (1 * 8)) + | ((0xff & (x >> (3 * 8))) << (0 * 8)); +} + +static uint32_t w2rev(uint16_t const x) +{ + return ((0xff & (x >> (0 * 8))) << (1 * 8)) + | ((0xff & (x >> (1 * 8))) << (0 * 8)); +} + +static uint64_t w8nat(uint64_t const x) +{ + return x; +} + +static uint32_t w4nat(uint32_t const x) +{ + return x; +} + +static uint32_t w2nat(uint16_t const x) +{ + return x; +} + +static uint64_t (*w8)(uint64_t); +static uint32_t (*w)(uint32_t); +static uint32_t (*w2)(uint16_t); + +/* Names of the sections that could contain calls to mcount. */ +static int +is_mcounted_section_name(char const *const txtname) +{ + return 0 == strcmp(".text", txtname) || + 0 == strcmp(".sched.text", txtname) || + 0 == strcmp(".spinlock.text", txtname) || + 0 == strcmp(".irqentry.text", txtname) || + 0 == strcmp(".text.unlikely", txtname); +} + +/* 32 bit and 64 bit are very similar */ +#include "recordmcount.h" +#define RECORD_MCOUNT_64 +#include "recordmcount.h" + +static void +do_file(char const *const fname) +{ + Elf32_Ehdr *const ehdr = mmap_file(fname); + unsigned int reltype = 0; + + ehdr_curr = ehdr; + w = w4nat; + w2 = w2nat; + w8 = w8nat; + switch (ehdr->e_ident[EI_DATA]) { + static unsigned int const endian = 1; + default: { + fprintf(stderr, "unrecognized ELF data encoding %d: %s\n", + ehdr->e_ident[EI_DATA], fname); + fail_file(); + } break; + case ELFDATA2LSB: { + if (1 != *(unsigned char const *)&endian) { + /* main() is big endian, file.o is little endian. */ + w = w4rev; + w2 = w2rev; + w8 = w8rev; + } + } break; + case ELFDATA2MSB: { + if (0 != *(unsigned char const *)&endian) { + /* main() is little endian, file.o is big endian. */ + w = w4rev; + w2 = w2rev; + w8 = w8rev; + } + } break; + } /* end switch */ + if (0 != memcmp(ELFMAG, ehdr->e_ident, SELFMAG) + || ET_REL != w2(ehdr->e_type) + || EV_CURRENT != ehdr->e_ident[EI_VERSION]) { + fprintf(stderr, "unrecognized ET_REL file %s\n", fname); + fail_file(); + } + + gpfx = 0; + switch (w2(ehdr->e_machine)) { + default: { + fprintf(stderr, "unrecognized e_machine %d %s\n", + w2(ehdr->e_machine), fname); + fail_file(); + } break; + case EM_386: reltype = R_386_32; break; + case EM_ARM: reltype = R_ARM_ABS32; break; + case EM_IA_64: reltype = R_IA64_IMM64; gpfx = '_'; break; + case EM_PPC: reltype = R_PPC_ADDR32; gpfx = '_'; break; + case EM_PPC64: reltype = R_PPC64_ADDR64; gpfx = '_'; break; + case EM_S390: /* reltype: e_class */ gpfx = '_'; break; + case EM_SH: reltype = R_SH_DIR32; break; + case EM_SPARCV9: reltype = R_SPARC_64; gpfx = '_'; break; + case EM_X86_64: reltype = R_X86_64_64; break; + } /* end switch */ + + switch (ehdr->e_ident[EI_CLASS]) { + default: { + fprintf(stderr, "unrecognized ELF class %d %s\n", + ehdr->e_ident[EI_CLASS], fname); + fail_file(); + } break; + case ELFCLASS32: { + if (sizeof(Elf32_Ehdr) != w2(ehdr->e_ehsize) + || sizeof(Elf32_Shdr) != w2(ehdr->e_shentsize)) { + fprintf(stderr, + "unrecognized ET_REL file: %s\n", fname); + fail_file(); + } + if (EM_S390 == w2(ehdr->e_machine)) + reltype = R_390_32; + do32(ehdr, fname, reltype); + } break; + case ELFCLASS64: { + Elf64_Ehdr *const ghdr = (Elf64_Ehdr *)ehdr; + if (sizeof(Elf64_Ehdr) != w2(ghdr->e_ehsize) + || sizeof(Elf64_Shdr) != w2(ghdr->e_shentsize)) { + fprintf(stderr, + "unrecognized ET_REL file: %s\n", fname); + fail_file(); + } + if (EM_S390 == w2(ghdr->e_machine)) + reltype = R_390_64; + do64(ghdr, fname, reltype); + } break; + } /* end switch */ + + cleanup(); +} + +int +main(int argc, char const *argv[]) +{ + const char ftrace[] = "kernel/trace/ftrace.o"; + int ftrace_size = sizeof(ftrace) - 1; + int n_error = 0; /* gcc-4.3.0 false positive complaint */ + + if (argc <= 1) { + fprintf(stderr, "usage: recordmcount file.o...\n"); + return 0; + } + + /* Process each file in turn, allowing deep failure. */ + for (--argc, ++argv; 0 < argc; --argc, ++argv) { + int const sjval = setjmp(jmpenv); + int len; + + /* + * The file kernel/trace/ftrace.o references the mcount + * function but does not call it. Since ftrace.o should + * not be traced anyway, we just skip it. + */ + len = strlen(argv[0]); + if (len >= ftrace_size && + strcmp(argv[0] + (len - ftrace_size), ftrace) == 0) + continue; + + switch (sjval) { + default: { + fprintf(stderr, "internal error: %s\n", argv[0]); + exit(1); + } break; + case SJ_SETJMP: { /* normal sequence */ + /* Avoid problems if early cleanup() */ + fd_map = -1; + ehdr_curr = NULL; + mmap_failed = 1; + do_file(argv[0]); + } break; + case SJ_FAIL: { /* error in do_file or below */ + ++n_error; + } break; + case SJ_SUCCEED: { /* premature success */ + /* do nothing */ + } break; + } /* end switch */ + } + return !!n_error; +} + + diff --git a/scripts/recordmcount.h b/scripts/recordmcount.h new file mode 100644 index 000000000000..7f39d0943d2d --- /dev/null +++ b/scripts/recordmcount.h @@ -0,0 +1,366 @@ +/* + * recordmcount.h + * + * This code was taken out of recordmcount.c written by + * Copyright 2009 John F. Reiser <jreiser@BitWagon.com>. All rights reserved. + * + * The original code had the same algorithms for both 32bit + * and 64bit ELF files, but the code was duplicated to support + * the difference in structures that were used. This + * file creates a macro of everything that is different between + * the 64 and 32 bit code, such that by including this header + * twice we can create both sets of functions by including this + * header once with RECORD_MCOUNT_64 undefined, and again with + * it defined. + * + * This conversion to macros was done by: + * Copyright 2010 Steven Rostedt <srostedt@redhat.com>, Red Hat Inc. + * + * Licensed under the GNU General Public License, version 2 (GPLv2). + */ +#undef append_func +#undef sift_rel_mcount +#undef find_secsym_ndx +#undef __has_rel_mcount +#undef has_rel_mcount +#undef tot_relsize +#undef do_func +#undef Elf_Ehdr +#undef Elf_Shdr +#undef Elf_Rel +#undef Elf_Rela +#undef Elf_Sym +#undef ELF_R_SYM +#undef ELF_R_INFO +#undef ELF_ST_BIND +#undef uint_t +#undef _w +#undef _align +#undef _size + +#ifdef RECORD_MCOUNT_64 +# define append_func append64 +# define sift_rel_mcount sift64_rel_mcount +# define find_secsym_ndx find64_secsym_ndx +# define __has_rel_mcount __has64_rel_mcount +# define has_rel_mcount has64_rel_mcount +# define tot_relsize tot64_relsize +# define do_func do64 +# define Elf_Ehdr Elf64_Ehdr +# define Elf_Shdr Elf64_Shdr +# define Elf_Rel Elf64_Rel +# define Elf_Rela Elf64_Rela +# define Elf_Sym Elf64_Sym +# define ELF_R_SYM ELF64_R_SYM +# define ELF_R_INFO ELF64_R_INFO +# define ELF_ST_BIND ELF64_ST_BIND +# define uint_t uint64_t +# define _w w8 +# define _align 7u +# define _size 8 +#else +# define append_func append32 +# define sift_rel_mcount sift32_rel_mcount +# define find_secsym_ndx find32_secsym_ndx +# define __has_rel_mcount __has32_rel_mcount +# define has_rel_mcount has32_rel_mcount +# define tot_relsize tot32_relsize +# define do_func do32 +# define Elf_Ehdr Elf32_Ehdr +# define Elf_Shdr Elf32_Shdr +# define Elf_Rel Elf32_Rel +# define Elf_Rela Elf32_Rela +# define Elf_Sym Elf32_Sym +# define ELF_R_SYM ELF32_R_SYM +# define ELF_R_INFO ELF32_R_INFO +# define ELF_ST_BIND ELF32_ST_BIND +# define uint_t uint32_t +# define _w w +# define _align 3u +# define _size 4 +#endif + +/* Append the new shstrtab, Elf_Shdr[], __mcount_loc and its relocations. */ +static void append_func(Elf_Ehdr *const ehdr, + Elf_Shdr *const shstr, + uint_t const *const mloc0, + uint_t const *const mlocp, + Elf_Rel const *const mrel0, + Elf_Rel const *const mrelp, + unsigned int const rel_entsize, + unsigned int const symsec_sh_link) +{ + /* Begin constructing output file */ + Elf_Shdr mcsec; + char const *mc_name = (sizeof(Elf_Rela) == rel_entsize) + ? ".rela__mcount_loc" + : ".rel__mcount_loc"; + unsigned const old_shnum = w2(ehdr->e_shnum); + uint_t const old_shoff = _w(ehdr->e_shoff); + uint_t const old_shstr_sh_size = _w(shstr->sh_size); + uint_t const old_shstr_sh_offset = _w(shstr->sh_offset); + uint_t t = 1 + strlen(mc_name) + _w(shstr->sh_size); + uint_t new_e_shoff; + + shstr->sh_size = _w(t); + shstr->sh_offset = _w(sb.st_size); + t += sb.st_size; + t += (_align & -t); /* word-byte align */ + new_e_shoff = t; + + /* body for new shstrtab */ + ulseek(fd_map, sb.st_size, SEEK_SET); + uwrite(fd_map, old_shstr_sh_offset + (void *)ehdr, old_shstr_sh_size); + uwrite(fd_map, mc_name, 1 + strlen(mc_name)); + + /* old(modified) Elf_Shdr table, word-byte aligned */ + ulseek(fd_map, t, SEEK_SET); + t += sizeof(Elf_Shdr) * old_shnum; + uwrite(fd_map, old_shoff + (void *)ehdr, + sizeof(Elf_Shdr) * old_shnum); + + /* new sections __mcount_loc and .rel__mcount_loc */ + t += 2*sizeof(mcsec); + mcsec.sh_name = w((sizeof(Elf_Rela) == rel_entsize) + strlen(".rel") + + old_shstr_sh_size); + mcsec.sh_type = w(SHT_PROGBITS); + mcsec.sh_flags = _w(SHF_ALLOC); + mcsec.sh_addr = 0; + mcsec.sh_offset = _w(t); + mcsec.sh_size = _w((void *)mlocp - (void *)mloc0); + mcsec.sh_link = 0; + mcsec.sh_info = 0; + mcsec.sh_addralign = _w(_size); + mcsec.sh_entsize = _w(_size); + uwrite(fd_map, &mcsec, sizeof(mcsec)); + + mcsec.sh_name = w(old_shstr_sh_size); + mcsec.sh_type = (sizeof(Elf_Rela) == rel_entsize) + ? w(SHT_RELA) + : w(SHT_REL); + mcsec.sh_flags = 0; + mcsec.sh_addr = 0; + mcsec.sh_offset = _w((void *)mlocp - (void *)mloc0 + t); + mcsec.sh_size = _w((void *)mrelp - (void *)mrel0); + mcsec.sh_link = w(symsec_sh_link); + mcsec.sh_info = w(old_shnum); + mcsec.sh_addralign = _w(_size); + mcsec.sh_entsize = _w(rel_entsize); + uwrite(fd_map, &mcsec, sizeof(mcsec)); + + uwrite(fd_map, mloc0, (void *)mlocp - (void *)mloc0); + uwrite(fd_map, mrel0, (void *)mrelp - (void *)mrel0); + + ehdr->e_shoff = _w(new_e_shoff); + ehdr->e_shnum = w2(2 + w2(ehdr->e_shnum)); /* {.rel,}__mcount_loc */ + ulseek(fd_map, 0, SEEK_SET); + uwrite(fd_map, ehdr, sizeof(*ehdr)); +} + + +/* + * Look at the relocations in order to find the calls to mcount. + * Accumulate the section offsets that are found, and their relocation info, + * onto the end of the existing arrays. + */ +static uint_t *sift_rel_mcount(uint_t *mlocp, + unsigned const offbase, + Elf_Rel **const mrelpp, + Elf_Shdr const *const relhdr, + Elf_Ehdr const *const ehdr, + unsigned const recsym, + uint_t const recval, + unsigned const reltype) +{ + uint_t *const mloc0 = mlocp; + Elf_Rel *mrelp = *mrelpp; + Elf_Shdr *const shdr0 = (Elf_Shdr *)(_w(ehdr->e_shoff) + + (void *)ehdr); + unsigned const symsec_sh_link = w(relhdr->sh_link); + Elf_Shdr const *const symsec = &shdr0[symsec_sh_link]; + Elf_Sym const *const sym0 = (Elf_Sym const *)(_w(symsec->sh_offset) + + (void *)ehdr); + + Elf_Shdr const *const strsec = &shdr0[w(symsec->sh_link)]; + char const *const str0 = (char const *)(_w(strsec->sh_offset) + + (void *)ehdr); + + Elf_Rel const *const rel0 = (Elf_Rel const *)(_w(relhdr->sh_offset) + + (void *)ehdr); + unsigned rel_entsize = _w(relhdr->sh_entsize); + unsigned const nrel = _w(relhdr->sh_size) / rel_entsize; + Elf_Rel const *relp = rel0; + + unsigned mcountsym = 0; + unsigned t; + + for (t = nrel; t; --t) { + if (!mcountsym) { + Elf_Sym const *const symp = + &sym0[ELF_R_SYM(_w(relp->r_info))]; + char const *symname = &str0[w(symp->st_name)]; + + if ('.' == symname[0]) + ++symname; /* ppc64 hack */ + if (0 == strcmp((('_' == gpfx) ? "_mcount" : "mcount"), + symname)) + mcountsym = ELF_R_SYM(_w(relp->r_info)); + } + + if (mcountsym == ELF_R_SYM(_w(relp->r_info))) { + uint_t const addend = _w(_w(relp->r_offset) - recval); + + mrelp->r_offset = _w(offbase + + ((void *)mlocp - (void *)mloc0)); + mrelp->r_info = _w(ELF_R_INFO(recsym, reltype)); + if (sizeof(Elf_Rela) == rel_entsize) { + ((Elf_Rela *)mrelp)->r_addend = addend; + *mlocp++ = 0; + } else + *mlocp++ = addend; + + mrelp = (Elf_Rel *)(rel_entsize + (void *)mrelp); + } + relp = (Elf_Rel const *)(rel_entsize + (void *)relp); + } + *mrelpp = mrelp; + return mlocp; +} + + +/* + * Find a symbol in the given section, to be used as the base for relocating + * the table of offsets of calls to mcount. A local or global symbol suffices, + * but avoid a Weak symbol because it may be overridden; the change in value + * would invalidate the relocations of the offsets of the calls to mcount. + * Often the found symbol will be the unnamed local symbol generated by + * GNU 'as' for the start of each section. For example: + * Num: Value Size Type Bind Vis Ndx Name + * 2: 00000000 0 SECTION LOCAL DEFAULT 1 + */ +static unsigned find_secsym_ndx(unsigned const txtndx, + char const *const txtname, + uint_t *const recvalp, + Elf_Shdr const *const symhdr, + Elf_Ehdr const *const ehdr) +{ + Elf_Sym const *const sym0 = (Elf_Sym const *)(_w(symhdr->sh_offset) + + (void *)ehdr); + unsigned const nsym = _w(symhdr->sh_size) / _w(symhdr->sh_entsize); + Elf_Sym const *symp; + unsigned t; + + for (symp = sym0, t = nsym; t; --t, ++symp) { + unsigned int const st_bind = ELF_ST_BIND(symp->st_info); + + if (txtndx == w2(symp->st_shndx) + /* avoid STB_WEAK */ + && (STB_LOCAL == st_bind || STB_GLOBAL == st_bind)) { + *recvalp = _w(symp->st_value); + return symp - sym0; + } + } + fprintf(stderr, "Cannot find symbol for section %d: %s.\n", + txtndx, txtname); + fail_file(); +} + + +/* Evade ISO C restriction: no declaration after statement in has_rel_mcount. */ +static char const * +__has_rel_mcount(Elf_Shdr const *const relhdr, /* is SHT_REL or SHT_RELA */ + Elf_Shdr const *const shdr0, + char const *const shstrtab, + char const *const fname) +{ + /* .sh_info depends on .sh_type == SHT_REL[,A] */ + Elf_Shdr const *const txthdr = &shdr0[w(relhdr->sh_info)]; + char const *const txtname = &shstrtab[w(txthdr->sh_name)]; + + if (0 == strcmp("__mcount_loc", txtname)) { + fprintf(stderr, "warning: __mcount_loc already exists: %s\n", + fname); + succeed_file(); + } + if (SHT_PROGBITS != w(txthdr->sh_type) || + !is_mcounted_section_name(txtname)) + return NULL; + return txtname; +} + +static char const *has_rel_mcount(Elf_Shdr const *const relhdr, + Elf_Shdr const *const shdr0, + char const *const shstrtab, + char const *const fname) +{ + if (SHT_REL != w(relhdr->sh_type) && SHT_RELA != w(relhdr->sh_type)) + return NULL; + return __has_rel_mcount(relhdr, shdr0, shstrtab, fname); +} + + +static unsigned tot_relsize(Elf_Shdr const *const shdr0, + unsigned nhdr, + const char *const shstrtab, + const char *const fname) +{ + unsigned totrelsz = 0; + Elf_Shdr const *shdrp = shdr0; + + for (; nhdr; --nhdr, ++shdrp) { + if (has_rel_mcount(shdrp, shdr0, shstrtab, fname)) + totrelsz += _w(shdrp->sh_size); + } + return totrelsz; +} + + +/* Overall supervision for Elf32 ET_REL file. */ +static void +do_func(Elf_Ehdr *const ehdr, char const *const fname, unsigned const reltype) +{ + Elf_Shdr *const shdr0 = (Elf_Shdr *)(_w(ehdr->e_shoff) + + (void *)ehdr); + unsigned const nhdr = w2(ehdr->e_shnum); + Elf_Shdr *const shstr = &shdr0[w2(ehdr->e_shstrndx)]; + char const *const shstrtab = (char const *)(_w(shstr->sh_offset) + + (void *)ehdr); + + Elf_Shdr const *relhdr; + unsigned k; + + /* Upper bound on space: assume all relevant relocs are for mcount. */ + unsigned const totrelsz = tot_relsize(shdr0, nhdr, shstrtab, fname); + Elf_Rel *const mrel0 = umalloc(totrelsz); + Elf_Rel * mrelp = mrel0; + + /* 2*sizeof(address) <= sizeof(Elf_Rel) */ + uint_t *const mloc0 = umalloc(totrelsz>>1); + uint_t * mlocp = mloc0; + + unsigned rel_entsize = 0; + unsigned symsec_sh_link = 0; + + for (relhdr = shdr0, k = nhdr; k; --k, ++relhdr) { + char const *const txtname = has_rel_mcount(relhdr, shdr0, + shstrtab, fname); + if (txtname) { + uint_t recval = 0; + unsigned const recsym = find_secsym_ndx( + w(relhdr->sh_info), txtname, &recval, + &shdr0[symsec_sh_link = w(relhdr->sh_link)], + ehdr); + + rel_entsize = _w(relhdr->sh_entsize); + mlocp = sift_rel_mcount(mlocp, + (void *)mlocp - (void *)mloc0, &mrelp, + relhdr, ehdr, recsym, recval, reltype); + } + } + if (mloc0 != mlocp) { + append_func(ehdr, shstr, mloc0, mlocp, mrel0, mrelp, + rel_entsize, symsec_sh_link); + } + free(mrel0); + free(mloc0); +} diff --git a/security/apparmor/.gitignore b/security/apparmor/.gitignore index 0a0a99f3b083..4d995aeaebc0 100644 --- a/security/apparmor/.gitignore +++ b/security/apparmor/.gitignore @@ -3,3 +3,4 @@ # af_names.h capability_names.h +rlim_names.h diff --git a/security/apparmor/apparmorfs.c b/security/apparmor/apparmorfs.c index 7320331b44ab..544ff5837cb6 100644 --- a/security/apparmor/apparmorfs.c +++ b/security/apparmor/apparmorfs.c @@ -29,7 +29,7 @@ * aa_simple_write_to_buffer - common routine for getting policy from user * @op: operation doing the user buffer copy * @userbuf: user buffer to copy data from (NOT NULL) - * @alloc_size: size of user buffer + * @alloc_size: size of user buffer (REQUIRES: @alloc_size >= @copy_size) * @copy_size: size of data to copy from user buffer * @pos: position write is at in the file (NOT NULL) * @@ -42,6 +42,8 @@ static char *aa_simple_write_to_buffer(int op, const char __user *userbuf, { char *data; + BUG_ON(copy_size > alloc_size); + if (*pos != 0) /* only writes from pos 0, that is complete writes */ return ERR_PTR(-ESPIPE); diff --git a/security/capability.c b/security/capability.c index 95a6599a37bb..30ae00fbecd5 100644 --- a/security/capability.c +++ b/security/capability.c @@ -677,7 +677,18 @@ static void cap_inet_conn_established(struct sock *sk, struct sk_buff *skb) { } +static int cap_secmark_relabel_packet(u32 secid) +{ + return 0; +} +static void cap_secmark_refcount_inc(void) +{ +} + +static void cap_secmark_refcount_dec(void) +{ +} static void cap_req_classify_flow(const struct request_sock *req, struct flowi *fl) @@ -777,7 +788,8 @@ static int cap_secid_to_secctx(u32 secid, char **secdata, u32 *seclen) static int cap_secctx_to_secid(const char *secdata, u32 seclen, u32 *secid) { - return -EOPNOTSUPP; + *secid = 0; + return 0; } static void cap_release_secctx(char *secdata, u32 seclen) @@ -1018,6 +1030,9 @@ void __init security_fixup_ops(struct security_operations *ops) set_to_cap_if_null(ops, inet_conn_request); set_to_cap_if_null(ops, inet_csk_clone); set_to_cap_if_null(ops, inet_conn_established); + set_to_cap_if_null(ops, secmark_relabel_packet); + set_to_cap_if_null(ops, secmark_refcount_inc); + set_to_cap_if_null(ops, secmark_refcount_dec); set_to_cap_if_null(ops, req_classify_flow); set_to_cap_if_null(ops, tun_dev_create); set_to_cap_if_null(ops, tun_dev_post_create); diff --git a/security/commoncap.c b/security/commoncap.c index 9d172e6e330c..5e632b4857e4 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -719,14 +719,11 @@ static int cap_safe_nice(struct task_struct *p) /** * cap_task_setscheduler - Detemine if scheduler policy change is permitted * @p: The task to affect - * @policy: The policy to effect - * @lp: The parameters to the scheduling policy * * Detemine if the requested scheduler policy change is permitted for the * specified task, returning 0 if permission is granted, -ve if denied. */ -int cap_task_setscheduler(struct task_struct *p, int policy, - struct sched_param *lp) +int cap_task_setscheduler(struct task_struct *p) { return cap_safe_nice(p); } diff --git a/security/security.c b/security/security.c index c53949f17d9e..b50f472061a4 100644 --- a/security/security.c +++ b/security/security.c @@ -89,20 +89,12 @@ __setup("security=", choose_lsm); * Return true if: * -The passed LSM is the one chosen by user at boot time, * -or the passed LSM is configured as the default and the user did not - * choose an alternate LSM at boot time, - * -or there is no default LSM set and the user didn't specify a - * specific LSM and we're the first to ask for registration permission, - * -or the passed LSM is currently loaded. + * choose an alternate LSM at boot time. * Otherwise, return false. */ int __init security_module_enable(struct security_operations *ops) { - if (!*chosen_lsm) - strncpy(chosen_lsm, ops->name, SECURITY_NAME_MAX); - else if (strncmp(ops->name, chosen_lsm, SECURITY_NAME_MAX)) - return 0; - - return 1; + return !strcmp(ops->name, chosen_lsm); } /** @@ -786,10 +778,9 @@ int security_task_setrlimit(struct task_struct *p, unsigned int resource, return security_ops->task_setrlimit(p, resource, new_rlim); } -int security_task_setscheduler(struct task_struct *p, - int policy, struct sched_param *lp) +int security_task_setscheduler(struct task_struct *p) { - return security_ops->task_setscheduler(p, policy, lp); + return security_ops->task_setscheduler(p); } int security_task_getscheduler(struct task_struct *p) @@ -1145,6 +1136,24 @@ void security_inet_conn_established(struct sock *sk, security_ops->inet_conn_established(sk, skb); } +int security_secmark_relabel_packet(u32 secid) +{ + return security_ops->secmark_relabel_packet(secid); +} +EXPORT_SYMBOL(security_secmark_relabel_packet); + +void security_secmark_refcount_inc(void) +{ + security_ops->secmark_refcount_inc(); +} +EXPORT_SYMBOL(security_secmark_refcount_inc); + +void security_secmark_refcount_dec(void) +{ + security_ops->secmark_refcount_dec(); +} +EXPORT_SYMBOL(security_secmark_refcount_dec); + int security_tun_dev_create(void) { return security_ops->tun_dev_create(); diff --git a/security/selinux/Makefile b/security/selinux/Makefile index 58d80f3bd6f6..ad5cd76ec231 100644 --- a/security/selinux/Makefile +++ b/security/selinux/Makefile @@ -2,25 +2,20 @@ # Makefile for building the SELinux module as part of the kernel tree. # -obj-$(CONFIG_SECURITY_SELINUX) := selinux.o ss/ - -selinux-y := avc.o \ - hooks.o \ - selinuxfs.o \ - netlink.o \ - nlmsgtab.o \ - netif.o \ - netnode.o \ - netport.o \ - exports.o +obj-$(CONFIG_SECURITY_SELINUX) := selinux.o + +selinux-y := avc.o hooks.o selinuxfs.o netlink.o nlmsgtab.o netif.o \ + netnode.o netport.o exports.o \ + ss/ebitmap.o ss/hashtab.o ss/symtab.o ss/sidtab.o ss/avtab.o \ + ss/policydb.o ss/services.o ss/conditional.o ss/mls.o ss/status.o selinux-$(CONFIG_SECURITY_NETWORK_XFRM) += xfrm.o selinux-$(CONFIG_NETLABEL) += netlabel.o -EXTRA_CFLAGS += -Isecurity/selinux -Isecurity/selinux/include +ccflags-y := -Isecurity/selinux -Isecurity/selinux/include -$(obj)/avc.o: $(obj)/flask.h +$(addprefix $(obj)/,$(selinux-y)): $(obj)/flask.h quiet_cmd_flask = GEN $(obj)/flask.h $(obj)/av_permissions.h cmd_flask = scripts/selinux/genheaders/genheaders $(obj)/flask.h $(obj)/av_permissions.h diff --git a/security/selinux/exports.c b/security/selinux/exports.c index c0a454aee1e0..90664385dead 100644 --- a/security/selinux/exports.c +++ b/security/selinux/exports.c @@ -11,58 +11,9 @@ * it under the terms of the GNU General Public License version 2, * as published by the Free Software Foundation. */ -#include <linux/types.h> -#include <linux/kernel.h> #include <linux/module.h> -#include <linux/selinux.h> -#include <linux/fs.h> -#include <linux/ipc.h> -#include <asm/atomic.h> #include "security.h" -#include "objsec.h" - -/* SECMARK reference count */ -extern atomic_t selinux_secmark_refcount; - -int selinux_string_to_sid(char *str, u32 *sid) -{ - if (selinux_enabled) - return security_context_to_sid(str, strlen(str), sid); - else { - *sid = 0; - return 0; - } -} -EXPORT_SYMBOL_GPL(selinux_string_to_sid); - -int selinux_secmark_relabel_packet_permission(u32 sid) -{ - if (selinux_enabled) { - const struct task_security_struct *__tsec; - u32 tsid; - - __tsec = current_security(); - tsid = __tsec->sid; - - return avc_has_perm(tsid, sid, SECCLASS_PACKET, - PACKET__RELABELTO, NULL); - } - return 0; -} -EXPORT_SYMBOL_GPL(selinux_secmark_relabel_packet_permission); - -void selinux_secmark_refcount_inc(void) -{ - atomic_inc(&selinux_secmark_refcount); -} -EXPORT_SYMBOL_GPL(selinux_secmark_refcount_inc); - -void selinux_secmark_refcount_dec(void) -{ - atomic_dec(&selinux_secmark_refcount); -} -EXPORT_SYMBOL_GPL(selinux_secmark_refcount_dec); bool selinux_is_enabled(void) { diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 4796ddd4e721..d9154cf90ae1 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -3354,11 +3354,11 @@ static int selinux_task_setrlimit(struct task_struct *p, unsigned int resource, return 0; } -static int selinux_task_setscheduler(struct task_struct *p, int policy, struct sched_param *lp) +static int selinux_task_setscheduler(struct task_struct *p) { int rc; - rc = cap_task_setscheduler(p, policy, lp); + rc = cap_task_setscheduler(p); if (rc) return rc; @@ -4279,6 +4279,27 @@ static void selinux_inet_conn_established(struct sock *sk, struct sk_buff *skb) selinux_skb_peerlbl_sid(skb, family, &sksec->peer_sid); } +static int selinux_secmark_relabel_packet(u32 sid) +{ + const struct task_security_struct *__tsec; + u32 tsid; + + __tsec = current_security(); + tsid = __tsec->sid; + + return avc_has_perm(tsid, sid, SECCLASS_PACKET, PACKET__RELABELTO, NULL); +} + +static void selinux_secmark_refcount_inc(void) +{ + atomic_inc(&selinux_secmark_refcount); +} + +static void selinux_secmark_refcount_dec(void) +{ + atomic_dec(&selinux_secmark_refcount); +} + static void selinux_req_classify_flow(const struct request_sock *req, struct flowi *fl) { @@ -5533,6 +5554,9 @@ static struct security_operations selinux_ops = { .inet_conn_request = selinux_inet_conn_request, .inet_csk_clone = selinux_inet_csk_clone, .inet_conn_established = selinux_inet_conn_established, + .secmark_relabel_packet = selinux_secmark_relabel_packet, + .secmark_refcount_inc = selinux_secmark_refcount_inc, + .secmark_refcount_dec = selinux_secmark_refcount_dec, .req_classify_flow = selinux_req_classify_flow, .tun_dev_create = selinux_tun_dev_create, .tun_dev_post_create = selinux_tun_dev_post_create, diff --git a/security/selinux/include/classmap.h b/security/selinux/include/classmap.h index b4c9eb4bd6f9..8858d2b2d4b6 100644 --- a/security/selinux/include/classmap.h +++ b/security/selinux/include/classmap.h @@ -17,7 +17,7 @@ struct security_class_mapping secclass_map[] = { { "compute_av", "compute_create", "compute_member", "check_context", "load_policy", "compute_relabel", "compute_user", "setenforce", "setbool", "setsecparam", - "setcheckreqprot", NULL } }, + "setcheckreqprot", "read_policy", NULL } }, { "process", { "fork", "transition", "sigchld", "sigkill", "sigstop", "signull", "signal", "ptrace", "getsched", "setsched", diff --git a/security/selinux/include/security.h b/security/selinux/include/security.h index 1f7c2491d3dc..671273eb1115 100644 --- a/security/selinux/include/security.h +++ b/security/selinux/include/security.h @@ -9,6 +9,7 @@ #define _SELINUX_SECURITY_H_ #include <linux/magic.h> +#include <linux/types.h> #include "flask.h" #define SECSID_NULL 0x00000000 /* unspecified SID */ @@ -82,6 +83,8 @@ extern int selinux_policycap_openperm; int security_mls_enabled(void); int security_load_policy(void *data, size_t len); +int security_read_policy(void **data, ssize_t *len); +size_t security_policydb_len(void); int security_policycap_supported(unsigned int req_cap); @@ -191,5 +194,25 @@ static inline int security_netlbl_sid_to_secattr(u32 sid, const char *security_get_initial_sid_context(u32 sid); +/* + * status notifier using mmap interface + */ +extern struct page *selinux_kernel_status_page(void); + +#define SELINUX_KERNEL_STATUS_VERSION 1 +struct selinux_kernel_status { + u32 version; /* version number of thie structure */ + u32 sequence; /* sequence number of seqlock logic */ + u32 enforcing; /* current setting of enforcing mode */ + u32 policyload; /* times of policy reloaded */ + u32 deny_unknown; /* current setting of deny_unknown */ + /* + * The version > 0 supports above members. + */ +} __attribute__((packed)); + +extern void selinux_status_update_setenforce(int enforcing); +extern void selinux_status_update_policyload(int seqno); + #endif /* _SELINUX_SECURITY_H_ */ diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c index 79a1bb635662..87e0556bae70 100644 --- a/security/selinux/selinuxfs.c +++ b/security/selinux/selinuxfs.c @@ -68,6 +68,8 @@ static int *bool_pending_values; static struct dentry *class_dir; static unsigned long last_class_ino; +static char policy_opened; + /* global data for policy capabilities */ static struct dentry *policycap_dir; @@ -110,6 +112,8 @@ enum sel_inos { SEL_COMPAT_NET, /* whether to use old compat network packet controls */ SEL_REJECT_UNKNOWN, /* export unknown reject handling to userspace */ SEL_DENY_UNKNOWN, /* export unknown deny handling to userspace */ + SEL_STATUS, /* export current status using mmap() */ + SEL_POLICY, /* allow userspace to read the in kernel policy */ SEL_INO_NEXT, /* The next inode number to use */ }; @@ -171,6 +175,7 @@ static ssize_t sel_write_enforce(struct file *file, const char __user *buf, if (selinux_enforcing) avc_ss_reset(0); selnl_notify_setenforce(selinux_enforcing); + selinux_status_update_setenforce(selinux_enforcing); } length = count; out: @@ -205,6 +210,59 @@ static const struct file_operations sel_handle_unknown_ops = { .llseek = generic_file_llseek, }; +static int sel_open_handle_status(struct inode *inode, struct file *filp) +{ + struct page *status = selinux_kernel_status_page(); + + if (!status) + return -ENOMEM; + + filp->private_data = status; + + return 0; +} + +static ssize_t sel_read_handle_status(struct file *filp, char __user *buf, + size_t count, loff_t *ppos) +{ + struct page *status = filp->private_data; + + BUG_ON(!status); + + return simple_read_from_buffer(buf, count, ppos, + page_address(status), + sizeof(struct selinux_kernel_status)); +} + +static int sel_mmap_handle_status(struct file *filp, + struct vm_area_struct *vma) +{ + struct page *status = filp->private_data; + unsigned long size = vma->vm_end - vma->vm_start; + + BUG_ON(!status); + + /* only allows one page from the head */ + if (vma->vm_pgoff > 0 || size != PAGE_SIZE) + return -EIO; + /* disallow writable mapping */ + if (vma->vm_flags & VM_WRITE) + return -EPERM; + /* disallow mprotect() turns it into writable */ + vma->vm_flags &= ~VM_MAYWRITE; + + return remap_pfn_range(vma, vma->vm_start, + page_to_pfn(status), + size, vma->vm_page_prot); +} + +static const struct file_operations sel_handle_status_ops = { + .open = sel_open_handle_status, + .read = sel_read_handle_status, + .mmap = sel_mmap_handle_status, + .llseek = generic_file_llseek, +}; + #ifdef CONFIG_SECURITY_SELINUX_DISABLE static ssize_t sel_write_disable(struct file *file, const char __user *buf, size_t count, loff_t *ppos) @@ -296,6 +354,141 @@ static const struct file_operations sel_mls_ops = { .llseek = generic_file_llseek, }; +struct policy_load_memory { + size_t len; + void *data; +}; + +static int sel_open_policy(struct inode *inode, struct file *filp) +{ + struct policy_load_memory *plm = NULL; + int rc; + + BUG_ON(filp->private_data); + + mutex_lock(&sel_mutex); + + rc = task_has_security(current, SECURITY__READ_POLICY); + if (rc) + goto err; + + rc = -EBUSY; + if (policy_opened) + goto err; + + rc = -ENOMEM; + plm = kzalloc(sizeof(*plm), GFP_KERNEL); + if (!plm) + goto err; + + if (i_size_read(inode) != security_policydb_len()) { + mutex_lock(&inode->i_mutex); + i_size_write(inode, security_policydb_len()); + mutex_unlock(&inode->i_mutex); + } + + rc = security_read_policy(&plm->data, &plm->len); + if (rc) + goto err; + + policy_opened = 1; + + filp->private_data = plm; + + mutex_unlock(&sel_mutex); + + return 0; +err: + mutex_unlock(&sel_mutex); + + if (plm) + vfree(plm->data); + kfree(plm); + return rc; +} + +static int sel_release_policy(struct inode *inode, struct file *filp) +{ + struct policy_load_memory *plm = filp->private_data; + + BUG_ON(!plm); + + policy_opened = 0; + + vfree(plm->data); + kfree(plm); + + return 0; +} + +static ssize_t sel_read_policy(struct file *filp, char __user *buf, + size_t count, loff_t *ppos) +{ + struct policy_load_memory *plm = filp->private_data; + int ret; + + mutex_lock(&sel_mutex); + + ret = task_has_security(current, SECURITY__READ_POLICY); + if (ret) + goto out; + + ret = simple_read_from_buffer(buf, count, ppos, plm->data, plm->len); +out: + mutex_unlock(&sel_mutex); + return ret; +} + +static int sel_mmap_policy_fault(struct vm_area_struct *vma, + struct vm_fault *vmf) +{ + struct policy_load_memory *plm = vma->vm_file->private_data; + unsigned long offset; + struct page *page; + + if (vmf->flags & (FAULT_FLAG_MKWRITE | FAULT_FLAG_WRITE)) + return VM_FAULT_SIGBUS; + + offset = vmf->pgoff << PAGE_SHIFT; + if (offset >= roundup(plm->len, PAGE_SIZE)) + return VM_FAULT_SIGBUS; + + page = vmalloc_to_page(plm->data + offset); + get_page(page); + + vmf->page = page; + + return 0; +} + +static struct vm_operations_struct sel_mmap_policy_ops = { + .fault = sel_mmap_policy_fault, + .page_mkwrite = sel_mmap_policy_fault, +}; + +int sel_mmap_policy(struct file *filp, struct vm_area_struct *vma) +{ + if (vma->vm_flags & VM_SHARED) { + /* do not allow mprotect to make mapping writable */ + vma->vm_flags &= ~VM_MAYWRITE; + + if (vma->vm_flags & VM_WRITE) + return -EACCES; + } + + vma->vm_flags |= VM_RESERVED; + vma->vm_ops = &sel_mmap_policy_ops; + + return 0; +} + +static const struct file_operations sel_policy_ops = { + .open = sel_open_policy, + .read = sel_read_policy, + .mmap = sel_mmap_policy, + .release = sel_release_policy, +}; + static ssize_t sel_write_load(struct file *file, const char __user *buf, size_t count, loff_t *ppos) @@ -1612,6 +1805,8 @@ static int sel_fill_super(struct super_block *sb, void *data, int silent) [SEL_CHECKREQPROT] = {"checkreqprot", &sel_checkreqprot_ops, S_IRUGO|S_IWUSR}, [SEL_REJECT_UNKNOWN] = {"reject_unknown", &sel_handle_unknown_ops, S_IRUGO}, [SEL_DENY_UNKNOWN] = {"deny_unknown", &sel_handle_unknown_ops, S_IRUGO}, + [SEL_STATUS] = {"status", &sel_handle_status_ops, S_IRUGO}, + [SEL_POLICY] = {"policy", &sel_policy_ops, S_IRUSR}, /* last one */ {""} }; ret = simple_fill_super(sb, SELINUX_MAGIC, selinux_files); diff --git a/security/selinux/ss/Makefile b/security/selinux/ss/Makefile deleted file mode 100644 index 15d4e62917de..000000000000 --- a/security/selinux/ss/Makefile +++ /dev/null @@ -1,9 +0,0 @@ -# -# Makefile for building the SELinux security server as part of the kernel tree. -# - -EXTRA_CFLAGS += -Isecurity/selinux -Isecurity/selinux/include -obj-y := ss.o - -ss-y := ebitmap.o hashtab.o symtab.o sidtab.o avtab.o policydb.o services.o conditional.o mls.o - diff --git a/security/selinux/ss/avtab.c b/security/selinux/ss/avtab.c index 929480c6c430..a3dd9faa19c0 100644 --- a/security/selinux/ss/avtab.c +++ b/security/selinux/ss/avtab.c @@ -266,8 +266,8 @@ int avtab_alloc(struct avtab *h, u32 nrules) if (shift > 2) shift = shift - 2; nslot = 1 << shift; - if (nslot > MAX_AVTAB_SIZE) - nslot = MAX_AVTAB_SIZE; + if (nslot > MAX_AVTAB_HASH_BUCKETS) + nslot = MAX_AVTAB_HASH_BUCKETS; mask = nslot - 1; h->htable = kcalloc(nslot, sizeof(*(h->htable)), GFP_KERNEL); @@ -501,6 +501,48 @@ bad: goto out; } +int avtab_write_item(struct policydb *p, struct avtab_node *cur, void *fp) +{ + __le16 buf16[4]; + __le32 buf32[1]; + int rc; + + buf16[0] = cpu_to_le16(cur->key.source_type); + buf16[1] = cpu_to_le16(cur->key.target_type); + buf16[2] = cpu_to_le16(cur->key.target_class); + buf16[3] = cpu_to_le16(cur->key.specified); + rc = put_entry(buf16, sizeof(u16), 4, fp); + if (rc) + return rc; + buf32[0] = cpu_to_le32(cur->datum.data); + rc = put_entry(buf32, sizeof(u32), 1, fp); + if (rc) + return rc; + return 0; +} + +int avtab_write(struct policydb *p, struct avtab *a, void *fp) +{ + unsigned int i; + int rc = 0; + struct avtab_node *cur; + __le32 buf[1]; + + buf[0] = cpu_to_le32(a->nel); + rc = put_entry(buf, sizeof(u32), 1, fp); + if (rc) + return rc; + + for (i = 0; i < a->nslot; i++) { + for (cur = a->htable[i]; cur; cur = cur->next) { + rc = avtab_write_item(p, cur, fp); + if (rc) + return rc; + } + } + + return rc; +} void avtab_cache_init(void) { avtab_node_cachep = kmem_cache_create("avtab_node", diff --git a/security/selinux/ss/avtab.h b/security/selinux/ss/avtab.h index cd4f734e2749..dff0c75345c1 100644 --- a/security/selinux/ss/avtab.h +++ b/security/selinux/ss/avtab.h @@ -71,6 +71,8 @@ int avtab_read_item(struct avtab *a, void *fp, struct policydb *pol, void *p); int avtab_read(struct avtab *a, void *fp, struct policydb *pol); +int avtab_write_item(struct policydb *p, struct avtab_node *cur, void *fp); +int avtab_write(struct policydb *p, struct avtab *a, void *fp); struct avtab_node *avtab_insert_nonunique(struct avtab *h, struct avtab_key *key, struct avtab_datum *datum); @@ -85,7 +87,6 @@ void avtab_cache_destroy(void); #define MAX_AVTAB_HASH_BITS 11 #define MAX_AVTAB_HASH_BUCKETS (1 << MAX_AVTAB_HASH_BITS) #define MAX_AVTAB_HASH_MASK (MAX_AVTAB_HASH_BUCKETS-1) -#define MAX_AVTAB_SIZE MAX_AVTAB_HASH_BUCKETS #endif /* _SS_AVTAB_H_ */ diff --git a/security/selinux/ss/conditional.c b/security/selinux/ss/conditional.c index c91e150c3087..655fe1c6cc69 100644 --- a/security/selinux/ss/conditional.c +++ b/security/selinux/ss/conditional.c @@ -490,6 +490,129 @@ err: return rc; } +int cond_write_bool(void *vkey, void *datum, void *ptr) +{ + char *key = vkey; + struct cond_bool_datum *booldatum = datum; + struct policy_data *pd = ptr; + void *fp = pd->fp; + __le32 buf[3]; + u32 len; + int rc; + + len = strlen(key); + buf[0] = cpu_to_le32(booldatum->value); + buf[1] = cpu_to_le32(booldatum->state); + buf[2] = cpu_to_le32(len); + rc = put_entry(buf, sizeof(u32), 3, fp); + if (rc) + return rc; + rc = put_entry(key, 1, len, fp); + if (rc) + return rc; + return 0; +} + +/* + * cond_write_cond_av_list doesn't write out the av_list nodes. + * Instead it writes out the key/value pairs from the avtab. This + * is necessary because there is no way to uniquely identifying rules + * in the avtab so it is not possible to associate individual rules + * in the avtab with a conditional without saving them as part of + * the conditional. This means that the avtab with the conditional + * rules will not be saved but will be rebuilt on policy load. + */ +static int cond_write_av_list(struct policydb *p, + struct cond_av_list *list, struct policy_file *fp) +{ + __le32 buf[1]; + struct cond_av_list *cur_list; + u32 len; + int rc; + + len = 0; + for (cur_list = list; cur_list != NULL; cur_list = cur_list->next) + len++; + + buf[0] = cpu_to_le32(len); + rc = put_entry(buf, sizeof(u32), 1, fp); + if (rc) + return rc; + + if (len == 0) + return 0; + + for (cur_list = list; cur_list != NULL; cur_list = cur_list->next) { + rc = avtab_write_item(p, cur_list->node, fp); + if (rc) + return rc; + } + + return 0; +} + +int cond_write_node(struct policydb *p, struct cond_node *node, + struct policy_file *fp) +{ + struct cond_expr *cur_expr; + __le32 buf[2]; + int rc; + u32 len = 0; + + buf[0] = cpu_to_le32(node->cur_state); + rc = put_entry(buf, sizeof(u32), 1, fp); + if (rc) + return rc; + + for (cur_expr = node->expr; cur_expr != NULL; cur_expr = cur_expr->next) + len++; + + buf[0] = cpu_to_le32(len); + rc = put_entry(buf, sizeof(u32), 1, fp); + if (rc) + return rc; + + for (cur_expr = node->expr; cur_expr != NULL; cur_expr = cur_expr->next) { + buf[0] = cpu_to_le32(cur_expr->expr_type); + buf[1] = cpu_to_le32(cur_expr->bool); + rc = put_entry(buf, sizeof(u32), 2, fp); + if (rc) + return rc; + } + + rc = cond_write_av_list(p, node->true_list, fp); + if (rc) + return rc; + rc = cond_write_av_list(p, node->false_list, fp); + if (rc) + return rc; + + return 0; +} + +int cond_write_list(struct policydb *p, struct cond_node *list, void *fp) +{ + struct cond_node *cur; + u32 len; + __le32 buf[1]; + int rc; + + len = 0; + for (cur = list; cur != NULL; cur = cur->next) + len++; + buf[0] = cpu_to_le32(len); + rc = put_entry(buf, sizeof(u32), 1, fp); + if (rc) + return rc; + + for (cur = list; cur != NULL; cur = cur->next) { + rc = cond_write_node(p, cur, fp); + if (rc) + return rc; + } + + return 0; +} /* Determine whether additional permissions are granted by the conditional * av table, and if so, add them to the result */ diff --git a/security/selinux/ss/conditional.h b/security/selinux/ss/conditional.h index 53ddb013ae57..3f209c635295 100644 --- a/security/selinux/ss/conditional.h +++ b/security/selinux/ss/conditional.h @@ -69,6 +69,8 @@ int cond_index_bool(void *key, void *datum, void *datap); int cond_read_bool(struct policydb *p, struct hashtab *h, void *fp); int cond_read_list(struct policydb *p, void *fp); +int cond_write_bool(void *key, void *datum, void *ptr); +int cond_write_list(struct policydb *p, struct cond_node *list, void *fp); void cond_compute_av(struct avtab *ctab, struct avtab_key *key, struct av_decision *avd); diff --git a/security/selinux/ss/ebitmap.c b/security/selinux/ss/ebitmap.c index 04b6145d767f..d42951fcbe87 100644 --- a/security/selinux/ss/ebitmap.c +++ b/security/selinux/ss/ebitmap.c @@ -22,6 +22,8 @@ #include "ebitmap.h" #include "policydb.h" +#define BITS_PER_U64 (sizeof(u64) * 8) + int ebitmap_cmp(struct ebitmap *e1, struct ebitmap *e2) { struct ebitmap_node *n1, *n2; @@ -363,10 +365,10 @@ int ebitmap_read(struct ebitmap *e, void *fp) e->highbit = le32_to_cpu(buf[1]); count = le32_to_cpu(buf[2]); - if (mapunit != sizeof(u64) * 8) { + if (mapunit != BITS_PER_U64) { printk(KERN_ERR "SELinux: ebitmap: map size %u does not " "match my size %Zd (high bit was %d)\n", - mapunit, sizeof(u64) * 8, e->highbit); + mapunit, BITS_PER_U64, e->highbit); goto bad; } @@ -446,3 +448,78 @@ bad: ebitmap_destroy(e); goto out; } + +int ebitmap_write(struct ebitmap *e, void *fp) +{ + struct ebitmap_node *n; + u32 count; + __le32 buf[3]; + u64 map; + int bit, last_bit, last_startbit, rc; + + buf[0] = cpu_to_le32(BITS_PER_U64); + + count = 0; + last_bit = 0; + last_startbit = -1; + ebitmap_for_each_positive_bit(e, n, bit) { + if (rounddown(bit, (int)BITS_PER_U64) > last_startbit) { + count++; + last_startbit = rounddown(bit, BITS_PER_U64); + } + last_bit = roundup(bit + 1, BITS_PER_U64); + } + buf[1] = cpu_to_le32(last_bit); + buf[2] = cpu_to_le32(count); + + rc = put_entry(buf, sizeof(u32), 3, fp); + if (rc) + return rc; + + map = 0; + last_startbit = INT_MIN; + ebitmap_for_each_positive_bit(e, n, bit) { + if (rounddown(bit, (int)BITS_PER_U64) > last_startbit) { + __le64 buf64[1]; + + /* this is the very first bit */ + if (!map) { + last_startbit = rounddown(bit, BITS_PER_U64); + map = (u64)1 << (bit - last_startbit); + continue; + } + + /* write the last node */ + buf[0] = cpu_to_le32(last_startbit); + rc = put_entry(buf, sizeof(u32), 1, fp); + if (rc) + return rc; + + buf64[0] = cpu_to_le64(map); + rc = put_entry(buf64, sizeof(u64), 1, fp); + if (rc) + return rc; + + /* set up for the next node */ + map = 0; + last_startbit = rounddown(bit, BITS_PER_U64); + } + map |= (u64)1 << (bit - last_startbit); + } + /* write the last node */ + if (map) { + __le64 buf64[1]; + + /* write the last node */ + buf[0] = cpu_to_le32(last_startbit); + rc = put_entry(buf, sizeof(u32), 1, fp); + if (rc) + return rc; + + buf64[0] = cpu_to_le64(map); + rc = put_entry(buf64, sizeof(u64), 1, fp); + if (rc) + return rc; + } + return 0; +} diff --git a/security/selinux/ss/ebitmap.h b/security/selinux/ss/ebitmap.h index f283b4367f54..1f4e93c2ae86 100644 --- a/security/selinux/ss/ebitmap.h +++ b/security/selinux/ss/ebitmap.h @@ -123,6 +123,7 @@ int ebitmap_get_bit(struct ebitmap *e, unsigned long bit); int ebitmap_set_bit(struct ebitmap *e, unsigned long bit, int value); void ebitmap_destroy(struct ebitmap *e); int ebitmap_read(struct ebitmap *e, void *fp); +int ebitmap_write(struct ebitmap *e, void *fp); #ifdef CONFIG_NETLABEL int ebitmap_netlbl_export(struct ebitmap *ebmap, diff --git a/security/selinux/ss/policydb.c b/security/selinux/ss/policydb.c index 3a29704be8ce..94f630d93a5c 100644 --- a/security/selinux/ss/policydb.c +++ b/security/selinux/ss/policydb.c @@ -37,6 +37,7 @@ #include "policydb.h" #include "conditional.h" #include "mls.h" +#include "services.h" #define _DEBUG_HASHES @@ -185,9 +186,19 @@ static u32 rangetr_hash(struct hashtab *h, const void *k) static int rangetr_cmp(struct hashtab *h, const void *k1, const void *k2) { const struct range_trans *key1 = k1, *key2 = k2; - return (key1->source_type != key2->source_type || - key1->target_type != key2->target_type || - key1->target_class != key2->target_class); + int v; + + v = key1->source_type - key2->source_type; + if (v) + return v; + + v = key1->target_type - key2->target_type; + if (v) + return v; + + v = key1->target_class - key2->target_class; + + return v; } /* @@ -1624,11 +1635,11 @@ static int role_bounds_sanity_check(void *key, void *datum, void *datap) static int type_bounds_sanity_check(void *key, void *datum, void *datap) { - struct type_datum *upper, *type; + struct type_datum *upper; struct policydb *p = datap; int depth = 0; - upper = type = datum; + upper = datum; while (upper->bounds) { if (++depth == POLICYDB_BOUNDS_MAXDEPTH) { printk(KERN_ERR "SELinux: type %s: " @@ -2306,3 +2317,843 @@ bad: policydb_destroy(p); goto out; } + +/* + * Write a MLS level structure to a policydb binary + * representation file. + */ +static int mls_write_level(struct mls_level *l, void *fp) +{ + __le32 buf[1]; + int rc; + + buf[0] = cpu_to_le32(l->sens); + rc = put_entry(buf, sizeof(u32), 1, fp); + if (rc) + return rc; + + rc = ebitmap_write(&l->cat, fp); + if (rc) + return rc; + + return 0; +} + +/* + * Write a MLS range structure to a policydb binary + * representation file. + */ +static int mls_write_range_helper(struct mls_range *r, void *fp) +{ + __le32 buf[3]; + size_t items; + int rc, eq; + + eq = mls_level_eq(&r->level[1], &r->level[0]); + + if (eq) + items = 2; + else + items = 3; + buf[0] = cpu_to_le32(items-1); + buf[1] = cpu_to_le32(r->level[0].sens); + if (!eq) + buf[2] = cpu_to_le32(r->level[1].sens); + + BUG_ON(items > (sizeof(buf)/sizeof(buf[0]))); + + rc = put_entry(buf, sizeof(u32), items, fp); + if (rc) + return rc; + + rc = ebitmap_write(&r->level[0].cat, fp); + if (rc) + return rc; + if (!eq) { + rc = ebitmap_write(&r->level[1].cat, fp); + if (rc) + return rc; + } + + return 0; +} + +static int sens_write(void *vkey, void *datum, void *ptr) +{ + char *key = vkey; + struct level_datum *levdatum = datum; + struct policy_data *pd = ptr; + void *fp = pd->fp; + __le32 buf[2]; + size_t len; + int rc; + + len = strlen(key); + buf[0] = cpu_to_le32(len); + buf[1] = cpu_to_le32(levdatum->isalias); + rc = put_entry(buf, sizeof(u32), 2, fp); + if (rc) + return rc; + + rc = put_entry(key, 1, len, fp); + if (rc) + return rc; + + rc = mls_write_level(levdatum->level, fp); + if (rc) + return rc; + + return 0; +} + +static int cat_write(void *vkey, void *datum, void *ptr) +{ + char *key = vkey; + struct cat_datum *catdatum = datum; + struct policy_data *pd = ptr; + void *fp = pd->fp; + __le32 buf[3]; + size_t len; + int rc; + + len = strlen(key); + buf[0] = cpu_to_le32(len); + buf[1] = cpu_to_le32(catdatum->value); + buf[2] = cpu_to_le32(catdatum->isalias); + rc = put_entry(buf, sizeof(u32), 3, fp); + if (rc) + return rc; + + rc = put_entry(key, 1, len, fp); + if (rc) + return rc; + + return 0; +} + +static int role_trans_write(struct role_trans *r, void *fp) +{ + struct role_trans *tr; + u32 buf[3]; + size_t nel; + int rc; + + nel = 0; + for (tr = r; tr; tr = tr->next) + nel++; + buf[0] = cpu_to_le32(nel); + rc = put_entry(buf, sizeof(u32), 1, fp); + if (rc) + return rc; + for (tr = r; tr; tr = tr->next) { + buf[0] = cpu_to_le32(tr->role); + buf[1] = cpu_to_le32(tr->type); + buf[2] = cpu_to_le32(tr->new_role); + rc = put_entry(buf, sizeof(u32), 3, fp); + if (rc) + return rc; + } + + return 0; +} + +static int role_allow_write(struct role_allow *r, void *fp) +{ + struct role_allow *ra; + u32 buf[2]; + size_t nel; + int rc; + + nel = 0; + for (ra = r; ra; ra = ra->next) + nel++; + buf[0] = cpu_to_le32(nel); + rc = put_entry(buf, sizeof(u32), 1, fp); + if (rc) + return rc; + for (ra = r; ra; ra = ra->next) { + buf[0] = cpu_to_le32(ra->role); + buf[1] = cpu_to_le32(ra->new_role); + rc = put_entry(buf, sizeof(u32), 2, fp); + if (rc) + return rc; + } + return 0; +} + +/* + * Write a security context structure + * to a policydb binary representation file. + */ +static int context_write(struct policydb *p, struct context *c, + void *fp) +{ + int rc; + __le32 buf[3]; + + buf[0] = cpu_to_le32(c->user); + buf[1] = cpu_to_le32(c->role); + buf[2] = cpu_to_le32(c->type); + + rc = put_entry(buf, sizeof(u32), 3, fp); + if (rc) + return rc; + + rc = mls_write_range_helper(&c->range, fp); + if (rc) + return rc; + + return 0; +} + +/* + * The following *_write functions are used to + * write the symbol data to a policy database + * binary representation file. + */ + +static int perm_write(void *vkey, void *datum, void *fp) +{ + char *key = vkey; + struct perm_datum *perdatum = datum; + __le32 buf[2]; + size_t len; + int rc; + + len = strlen(key); + buf[0] = cpu_to_le32(len); + buf[1] = cpu_to_le32(perdatum->value); + rc = put_entry(buf, sizeof(u32), 2, fp); + if (rc) + return rc; + + rc = put_entry(key, 1, len, fp); + if (rc) + return rc; + + return 0; +} + +static int common_write(void *vkey, void *datum, void *ptr) +{ + char *key = vkey; + struct common_datum *comdatum = datum; + struct policy_data *pd = ptr; + void *fp = pd->fp; + __le32 buf[4]; + size_t len; + int rc; + + len = strlen(key); + buf[0] = cpu_to_le32(len); + buf[1] = cpu_to_le32(comdatum->value); + buf[2] = cpu_to_le32(comdatum->permissions.nprim); + buf[3] = cpu_to_le32(comdatum->permissions.table->nel); + rc = put_entry(buf, sizeof(u32), 4, fp); + if (rc) + return rc; + + rc = put_entry(key, 1, len, fp); + if (rc) + return rc; + + rc = hashtab_map(comdatum->permissions.table, perm_write, fp); + if (rc) + return rc; + + return 0; +} + +static int write_cons_helper(struct policydb *p, struct constraint_node *node, + void *fp) +{ + struct constraint_node *c; + struct constraint_expr *e; + __le32 buf[3]; + u32 nel; + int rc; + + for (c = node; c; c = c->next) { + nel = 0; + for (e = c->expr; e; e = e->next) + nel++; + buf[0] = cpu_to_le32(c->permissions); + buf[1] = cpu_to_le32(nel); + rc = put_entry(buf, sizeof(u32), 2, fp); + if (rc) + return rc; + for (e = c->expr; e; e = e->next) { + buf[0] = cpu_to_le32(e->expr_type); + buf[1] = cpu_to_le32(e->attr); + buf[2] = cpu_to_le32(e->op); + rc = put_entry(buf, sizeof(u32), 3, fp); + if (rc) + return rc; + + switch (e->expr_type) { + case CEXPR_NAMES: + rc = ebitmap_write(&e->names, fp); + if (rc) + return rc; + break; + default: + break; + } + } + } + + return 0; +} + +static int class_write(void *vkey, void *datum, void *ptr) +{ + char *key = vkey; + struct class_datum *cladatum = datum; + struct policy_data *pd = ptr; + void *fp = pd->fp; + struct policydb *p = pd->p; + struct constraint_node *c; + __le32 buf[6]; + u32 ncons; + size_t len, len2; + int rc; + + len = strlen(key); + if (cladatum->comkey) + len2 = strlen(cladatum->comkey); + else + len2 = 0; + + ncons = 0; + for (c = cladatum->constraints; c; c = c->next) + ncons++; + + buf[0] = cpu_to_le32(len); + buf[1] = cpu_to_le32(len2); + buf[2] = cpu_to_le32(cladatum->value); + buf[3] = cpu_to_le32(cladatum->permissions.nprim); + if (cladatum->permissions.table) + buf[4] = cpu_to_le32(cladatum->permissions.table->nel); + else + buf[4] = 0; + buf[5] = cpu_to_le32(ncons); + rc = put_entry(buf, sizeof(u32), 6, fp); + if (rc) + return rc; + + rc = put_entry(key, 1, len, fp); + if (rc) + return rc; + + if (cladatum->comkey) { + rc = put_entry(cladatum->comkey, 1, len2, fp); + if (rc) + return rc; + } + + rc = hashtab_map(cladatum->permissions.table, perm_write, fp); + if (rc) + return rc; + + rc = write_cons_helper(p, cladatum->constraints, fp); + if (rc) + return rc; + + /* write out the validatetrans rule */ + ncons = 0; + for (c = cladatum->validatetrans; c; c = c->next) + ncons++; + + buf[0] = cpu_to_le32(ncons); + rc = put_entry(buf, sizeof(u32), 1, fp); + if (rc) + return rc; + + rc = write_cons_helper(p, cladatum->validatetrans, fp); + if (rc) + return rc; + + return 0; +} + +static int role_write(void *vkey, void *datum, void *ptr) +{ + char *key = vkey; + struct role_datum *role = datum; + struct policy_data *pd = ptr; + void *fp = pd->fp; + struct policydb *p = pd->p; + __le32 buf[3]; + size_t items, len; + int rc; + + len = strlen(key); + items = 0; + buf[items++] = cpu_to_le32(len); + buf[items++] = cpu_to_le32(role->value); + if (p->policyvers >= POLICYDB_VERSION_BOUNDARY) + buf[items++] = cpu_to_le32(role->bounds); + + BUG_ON(items > (sizeof(buf)/sizeof(buf[0]))); + + rc = put_entry(buf, sizeof(u32), items, fp); + if (rc) + return rc; + + rc = put_entry(key, 1, len, fp); + if (rc) + return rc; + + rc = ebitmap_write(&role->dominates, fp); + if (rc) + return rc; + + rc = ebitmap_write(&role->types, fp); + if (rc) + return rc; + + return 0; +} + +static int type_write(void *vkey, void *datum, void *ptr) +{ + char *key = vkey; + struct type_datum *typdatum = datum; + struct policy_data *pd = ptr; + struct policydb *p = pd->p; + void *fp = pd->fp; + __le32 buf[4]; + int rc; + size_t items, len; + + len = strlen(key); + items = 0; + buf[items++] = cpu_to_le32(len); + buf[items++] = cpu_to_le32(typdatum->value); + if (p->policyvers >= POLICYDB_VERSION_BOUNDARY) { + u32 properties = 0; + + if (typdatum->primary) + properties |= TYPEDATUM_PROPERTY_PRIMARY; + + if (typdatum->attribute) + properties |= TYPEDATUM_PROPERTY_ATTRIBUTE; + + buf[items++] = cpu_to_le32(properties); + buf[items++] = cpu_to_le32(typdatum->bounds); + } else { + buf[items++] = cpu_to_le32(typdatum->primary); + } + BUG_ON(items > (sizeof(buf) / sizeof(buf[0]))); + rc = put_entry(buf, sizeof(u32), items, fp); + if (rc) + return rc; + + rc = put_entry(key, 1, len, fp); + if (rc) + return rc; + + return 0; +} + +static int user_write(void *vkey, void *datum, void *ptr) +{ + char *key = vkey; + struct user_datum *usrdatum = datum; + struct policy_data *pd = ptr; + struct policydb *p = pd->p; + void *fp = pd->fp; + __le32 buf[3]; + size_t items, len; + int rc; + + len = strlen(key); + items = 0; + buf[items++] = cpu_to_le32(len); + buf[items++] = cpu_to_le32(usrdatum->value); + if (p->policyvers >= POLICYDB_VERSION_BOUNDARY) + buf[items++] = cpu_to_le32(usrdatum->bounds); + BUG_ON(items > (sizeof(buf) / sizeof(buf[0]))); + rc = put_entry(buf, sizeof(u32), items, fp); + if (rc) + return rc; + + rc = put_entry(key, 1, len, fp); + if (rc) + return rc; + + rc = ebitmap_write(&usrdatum->roles, fp); + if (rc) + return rc; + + rc = mls_write_range_helper(&usrdatum->range, fp); + if (rc) + return rc; + + rc = mls_write_level(&usrdatum->dfltlevel, fp); + if (rc) + return rc; + + return 0; +} + +static int (*write_f[SYM_NUM]) (void *key, void *datum, + void *datap) = +{ + common_write, + class_write, + role_write, + type_write, + user_write, + cond_write_bool, + sens_write, + cat_write, +}; + +static int ocontext_write(struct policydb *p, struct policydb_compat_info *info, + void *fp) +{ + unsigned int i, j, rc; + size_t nel, len; + __le32 buf[3]; + u32 nodebuf[8]; + struct ocontext *c; + for (i = 0; i < info->ocon_num; i++) { + nel = 0; + for (c = p->ocontexts[i]; c; c = c->next) + nel++; + buf[0] = cpu_to_le32(nel); + rc = put_entry(buf, sizeof(u32), 1, fp); + if (rc) + return rc; + for (c = p->ocontexts[i]; c; c = c->next) { + switch (i) { + case OCON_ISID: + buf[0] = cpu_to_le32(c->sid[0]); + rc = put_entry(buf, sizeof(u32), 1, fp); + if (rc) + return rc; + rc = context_write(p, &c->context[0], fp); + if (rc) + return rc; + break; + case OCON_FS: + case OCON_NETIF: + len = strlen(c->u.name); + buf[0] = cpu_to_le32(len); + rc = put_entry(buf, sizeof(u32), 1, fp); + if (rc) + return rc; + rc = put_entry(c->u.name, 1, len, fp); + if (rc) + return rc; + rc = context_write(p, &c->context[0], fp); + if (rc) + return rc; + rc = context_write(p, &c->context[1], fp); + if (rc) + return rc; + break; + case OCON_PORT: + buf[0] = cpu_to_le32(c->u.port.protocol); + buf[1] = cpu_to_le32(c->u.port.low_port); + buf[2] = cpu_to_le32(c->u.port.high_port); + rc = put_entry(buf, sizeof(u32), 3, fp); + if (rc) + return rc; + rc = context_write(p, &c->context[0], fp); + if (rc) + return rc; + break; + case OCON_NODE: + nodebuf[0] = c->u.node.addr; /* network order */ + nodebuf[1] = c->u.node.mask; /* network order */ + rc = put_entry(nodebuf, sizeof(u32), 2, fp); + if (rc) + return rc; + rc = context_write(p, &c->context[0], fp); + if (rc) + return rc; + break; + case OCON_FSUSE: + buf[0] = cpu_to_le32(c->v.behavior); + len = strlen(c->u.name); + buf[1] = cpu_to_le32(len); + rc = put_entry(buf, sizeof(u32), 2, fp); + if (rc) + return rc; + rc = put_entry(c->u.name, 1, len, fp); + if (rc) + return rc; + rc = context_write(p, &c->context[0], fp); + if (rc) + return rc; + break; + case OCON_NODE6: + for (j = 0; j < 4; j++) + nodebuf[j] = c->u.node6.addr[j]; /* network order */ + for (j = 0; j < 4; j++) + nodebuf[j + 4] = c->u.node6.mask[j]; /* network order */ + rc = put_entry(nodebuf, sizeof(u32), 8, fp); + if (rc) + return rc; + rc = context_write(p, &c->context[0], fp); + if (rc) + return rc; + break; + } + } + } + return 0; +} + +static int genfs_write(struct policydb *p, void *fp) +{ + struct genfs *genfs; + struct ocontext *c; + size_t len; + __le32 buf[1]; + int rc; + + len = 0; + for (genfs = p->genfs; genfs; genfs = genfs->next) + len++; + buf[0] = cpu_to_le32(len); + rc = put_entry(buf, sizeof(u32), 1, fp); + if (rc) + return rc; + for (genfs = p->genfs; genfs; genfs = genfs->next) { + len = strlen(genfs->fstype); + buf[0] = cpu_to_le32(len); + rc = put_entry(buf, sizeof(u32), 1, fp); + if (rc) + return rc; + rc = put_entry(genfs->fstype, 1, len, fp); + if (rc) + return rc; + len = 0; + for (c = genfs->head; c; c = c->next) + len++; + buf[0] = cpu_to_le32(len); + rc = put_entry(buf, sizeof(u32), 1, fp); + if (rc) + return rc; + for (c = genfs->head; c; c = c->next) { + len = strlen(c->u.name); + buf[0] = cpu_to_le32(len); + rc = put_entry(buf, sizeof(u32), 1, fp); + if (rc) + return rc; + rc = put_entry(c->u.name, 1, len, fp); + if (rc) + return rc; + buf[0] = cpu_to_le32(c->v.sclass); + rc = put_entry(buf, sizeof(u32), 1, fp); + if (rc) + return rc; + rc = context_write(p, &c->context[0], fp); + if (rc) + return rc; + } + } + return 0; +} + +static int range_count(void *key, void *data, void *ptr) +{ + int *cnt = ptr; + *cnt = *cnt + 1; + + return 0; +} + +static int range_write_helper(void *key, void *data, void *ptr) +{ + __le32 buf[2]; + struct range_trans *rt = key; + struct mls_range *r = data; + struct policy_data *pd = ptr; + void *fp = pd->fp; + struct policydb *p = pd->p; + int rc; + + buf[0] = cpu_to_le32(rt->source_type); + buf[1] = cpu_to_le32(rt->target_type); + rc = put_entry(buf, sizeof(u32), 2, fp); + if (rc) + return rc; + if (p->policyvers >= POLICYDB_VERSION_RANGETRANS) { + buf[0] = cpu_to_le32(rt->target_class); + rc = put_entry(buf, sizeof(u32), 1, fp); + if (rc) + return rc; + } + rc = mls_write_range_helper(r, fp); + if (rc) + return rc; + + return 0; +} + +static int range_write(struct policydb *p, void *fp) +{ + size_t nel; + __le32 buf[1]; + int rc; + struct policy_data pd; + + pd.p = p; + pd.fp = fp; + + /* count the number of entries in the hashtab */ + nel = 0; + rc = hashtab_map(p->range_tr, range_count, &nel); + if (rc) + return rc; + + buf[0] = cpu_to_le32(nel); + rc = put_entry(buf, sizeof(u32), 1, fp); + if (rc) + return rc; + + /* actually write all of the entries */ + rc = hashtab_map(p->range_tr, range_write_helper, &pd); + if (rc) + return rc; + + return 0; +} + +/* + * Write the configuration data in a policy database + * structure to a policy database binary representation + * file. + */ +int policydb_write(struct policydb *p, void *fp) +{ + unsigned int i, num_syms; + int rc; + __le32 buf[4]; + u32 config; + size_t len; + struct policydb_compat_info *info; + + /* + * refuse to write policy older than compressed avtab + * to simplify the writer. There are other tests dropped + * since we assume this throughout the writer code. Be + * careful if you ever try to remove this restriction + */ + if (p->policyvers < POLICYDB_VERSION_AVTAB) { + printk(KERN_ERR "SELinux: refusing to write policy version %d." + " Because it is less than version %d\n", p->policyvers, + POLICYDB_VERSION_AVTAB); + return -EINVAL; + } + + config = 0; + if (p->mls_enabled) + config |= POLICYDB_CONFIG_MLS; + + if (p->reject_unknown) + config |= REJECT_UNKNOWN; + if (p->allow_unknown) + config |= ALLOW_UNKNOWN; + + /* Write the magic number and string identifiers. */ + buf[0] = cpu_to_le32(POLICYDB_MAGIC); + len = strlen(POLICYDB_STRING); + buf[1] = cpu_to_le32(len); + rc = put_entry(buf, sizeof(u32), 2, fp); + if (rc) + return rc; + rc = put_entry(POLICYDB_STRING, 1, len, fp); + if (rc) + return rc; + + /* Write the version, config, and table sizes. */ + info = policydb_lookup_compat(p->policyvers); + if (!info) { + printk(KERN_ERR "SELinux: compatibility lookup failed for policy " + "version %d", p->policyvers); + return rc; + } + + buf[0] = cpu_to_le32(p->policyvers); + buf[1] = cpu_to_le32(config); + buf[2] = cpu_to_le32(info->sym_num); + buf[3] = cpu_to_le32(info->ocon_num); + + rc = put_entry(buf, sizeof(u32), 4, fp); + if (rc) + return rc; + + if (p->policyvers >= POLICYDB_VERSION_POLCAP) { + rc = ebitmap_write(&p->policycaps, fp); + if (rc) + return rc; + } + + if (p->policyvers >= POLICYDB_VERSION_PERMISSIVE) { + rc = ebitmap_write(&p->permissive_map, fp); + if (rc) + return rc; + } + + num_syms = info->sym_num; + for (i = 0; i < num_syms; i++) { + struct policy_data pd; + + pd.fp = fp; + pd.p = p; + + buf[0] = cpu_to_le32(p->symtab[i].nprim); + buf[1] = cpu_to_le32(p->symtab[i].table->nel); + + rc = put_entry(buf, sizeof(u32), 2, fp); + if (rc) + return rc; + rc = hashtab_map(p->symtab[i].table, write_f[i], &pd); + if (rc) + return rc; + } + + rc = avtab_write(p, &p->te_avtab, fp); + if (rc) + return rc; + + rc = cond_write_list(p, p->cond_list, fp); + if (rc) + return rc; + + rc = role_trans_write(p->role_tr, fp); + if (rc) + return rc; + + rc = role_allow_write(p->role_allow, fp); + if (rc) + return rc; + + rc = ocontext_write(p, info, fp); + if (rc) + return rc; + + rc = genfs_write(p, fp); + if (rc) + return rc; + + rc = range_write(p, fp); + if (rc) + return rc; + + for (i = 0; i < p->p_types.nprim; i++) { + struct ebitmap *e = flex_array_get(p->type_attr_map_array, i); + + BUG_ON(!e); + rc = ebitmap_write(e, fp); + if (rc) + return rc; + } + + return 0; +} diff --git a/security/selinux/ss/policydb.h b/security/selinux/ss/policydb.h index 310e94442cb8..95d3d7de361e 100644 --- a/security/selinux/ss/policydb.h +++ b/security/selinux/ss/policydb.h @@ -254,6 +254,9 @@ struct policydb { struct ebitmap permissive_map; + /* length of this policy when it was loaded */ + size_t len; + unsigned int policyvers; unsigned int reject_unknown : 1; @@ -270,6 +273,7 @@ extern int policydb_class_isvalid(struct policydb *p, unsigned int class); extern int policydb_type_isvalid(struct policydb *p, unsigned int type); extern int policydb_role_isvalid(struct policydb *p, unsigned int role); extern int policydb_read(struct policydb *p, void *fp); +extern int policydb_write(struct policydb *p, void *fp); #define PERM_SYMTAB_SIZE 32 @@ -290,6 +294,11 @@ struct policy_file { size_t len; }; +struct policy_data { + struct policydb *p; + void *fp; +}; + static inline int next_entry(void *buf, struct policy_file *fp, size_t bytes) { if (bytes > fp->len) @@ -301,6 +310,17 @@ static inline int next_entry(void *buf, struct policy_file *fp, size_t bytes) return 0; } +static inline int put_entry(void *buf, size_t bytes, int num, struct policy_file *fp) +{ + size_t len = bytes * num; + + memcpy(fp->data, buf, len); + fp->data += len; + fp->len -= len; + + return 0; +} + extern u16 string_to_security_class(struct policydb *p, const char *name); extern u32 string_to_av_perm(struct policydb *p, u16 tclass, const char *name); diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c index 9ea2feca3cd4..223c1ff6ef23 100644 --- a/security/selinux/ss/services.c +++ b/security/selinux/ss/services.c @@ -51,6 +51,7 @@ #include <linux/mutex.h> #include <linux/selinux.h> #include <linux/flex_array.h> +#include <linux/vmalloc.h> #include <net/netlabel.h> #include "flask.h" @@ -991,7 +992,8 @@ static int context_struct_to_string(struct context *context, char **scontext, u3 { char *scontextp; - *scontext = NULL; + if (scontext) + *scontext = NULL; *scontext_len = 0; if (context->len) { @@ -1008,6 +1010,9 @@ static int context_struct_to_string(struct context *context, char **scontext, u3 *scontext_len += strlen(policydb.p_type_val_to_name[context->type - 1]) + 1; *scontext_len += mls_compute_context_len(context); + if (!scontext) + return 0; + /* Allocate space for the context; caller must free this space. */ scontextp = kmalloc(*scontext_len, GFP_ATOMIC); if (!scontextp) @@ -1047,7 +1052,8 @@ static int security_sid_to_context_core(u32 sid, char **scontext, struct context *context; int rc = 0; - *scontext = NULL; + if (scontext) + *scontext = NULL; *scontext_len = 0; if (!ss_initialized) { @@ -1055,6 +1061,8 @@ static int security_sid_to_context_core(u32 sid, char **scontext, char *scontextp; *scontext_len = strlen(initial_sid_to_string[sid]) + 1; + if (!scontext) + goto out; scontextp = kmalloc(*scontext_len, GFP_ATOMIC); if (!scontextp) { rc = -ENOMEM; @@ -1769,6 +1777,7 @@ int security_load_policy(void *data, size_t len) return rc; } + policydb.len = len; rc = selinux_set_mapping(&policydb, secclass_map, ¤t_mapping, ¤t_mapping_size); @@ -1791,6 +1800,7 @@ int security_load_policy(void *data, size_t len) selinux_complete_init(); avc_ss_reset(seqno); selnl_notify_policyload(seqno); + selinux_status_update_policyload(seqno); selinux_netlbl_cache_invalidate(); selinux_xfrm_notify_policyload(); return 0; @@ -1804,6 +1814,7 @@ int security_load_policy(void *data, size_t len) if (rc) return rc; + newpolicydb.len = len; /* If switching between different policy types, log MLS status */ if (policydb.mls_enabled && !newpolicydb.mls_enabled) printk(KERN_INFO "SELinux: Disabling MLS support...\n"); @@ -1870,6 +1881,7 @@ int security_load_policy(void *data, size_t len) avc_ss_reset(seqno); selnl_notify_policyload(seqno); + selinux_status_update_policyload(seqno); selinux_netlbl_cache_invalidate(); selinux_xfrm_notify_policyload(); @@ -1883,6 +1895,17 @@ err: } +size_t security_policydb_len(void) +{ + size_t len; + + read_lock(&policy_rwlock); + len = policydb.len; + read_unlock(&policy_rwlock); + + return len; +} + /** * security_port_sid - Obtain the SID for a port. * @protocol: protocol number @@ -2374,6 +2397,7 @@ out: if (!rc) { avc_ss_reset(seqno); selnl_notify_policyload(seqno); + selinux_status_update_policyload(seqno); selinux_xfrm_notify_policyload(); } return rc; @@ -3129,3 +3153,38 @@ netlbl_sid_to_secattr_failure: return rc; } #endif /* CONFIG_NETLABEL */ + +/** + * security_read_policy - read the policy. + * @data: binary policy data + * @len: length of data in bytes + * + */ +int security_read_policy(void **data, ssize_t *len) +{ + int rc; + struct policy_file fp; + + if (!ss_initialized) + return -EINVAL; + + *len = security_policydb_len(); + + *data = vmalloc_user(*len); + if (!*data) + return -ENOMEM; + + fp.data = *data; + fp.len = *len; + + read_lock(&policy_rwlock); + rc = policydb_write(&policydb, &fp); + read_unlock(&policy_rwlock); + + if (rc) + return rc; + + *len = (unsigned long)fp.data - (unsigned long)*data; + return 0; + +} diff --git a/security/selinux/ss/status.c b/security/selinux/ss/status.c new file mode 100644 index 000000000000..d982365f9d1a --- /dev/null +++ b/security/selinux/ss/status.c @@ -0,0 +1,126 @@ +/* + * mmap based event notifications for SELinux + * + * Author: KaiGai Kohei <kaigai@ak.jp.nec.com> + * + * Copyright (C) 2010 NEC corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2, + * as published by the Free Software Foundation. + */ +#include <linux/kernel.h> +#include <linux/gfp.h> +#include <linux/mm.h> +#include <linux/mutex.h> +#include "avc.h" +#include "services.h" + +/* + * The selinux_status_page shall be exposed to userspace applications + * using mmap interface on /selinux/status. + * It enables to notify applications a few events that will cause reset + * of userspace access vector without context switching. + * + * The selinux_kernel_status structure on the head of status page is + * protected from concurrent accesses using seqlock logic, so userspace + * application should reference the status page according to the seqlock + * logic. + * + * Typically, application checks status->sequence at the head of access + * control routine. If it is odd-number, kernel is updating the status, + * so please wait for a moment. If it is changed from the last sequence + * number, it means something happen, so application will reset userspace + * avc, if needed. + * In most cases, application shall confirm the kernel status is not + * changed without any system call invocations. + */ +static struct page *selinux_status_page; +static DEFINE_MUTEX(selinux_status_lock); + +/* + * selinux_kernel_status_page + * + * It returns a reference to selinux_status_page. If the status page is + * not allocated yet, it also tries to allocate it at the first time. + */ +struct page *selinux_kernel_status_page(void) +{ + struct selinux_kernel_status *status; + struct page *result = NULL; + + mutex_lock(&selinux_status_lock); + if (!selinux_status_page) { + selinux_status_page = alloc_page(GFP_KERNEL|__GFP_ZERO); + + if (selinux_status_page) { + status = page_address(selinux_status_page); + + status->version = SELINUX_KERNEL_STATUS_VERSION; + status->sequence = 0; + status->enforcing = selinux_enforcing; + /* + * NOTE: the next policyload event shall set + * a positive value on the status->policyload, + * although it may not be 1, but never zero. + * So, application can know it was updated. + */ + status->policyload = 0; + status->deny_unknown = !security_get_allow_unknown(); + } + } + result = selinux_status_page; + mutex_unlock(&selinux_status_lock); + + return result; +} + +/* + * selinux_status_update_setenforce + * + * It updates status of the current enforcing/permissive mode. + */ +void selinux_status_update_setenforce(int enforcing) +{ + struct selinux_kernel_status *status; + + mutex_lock(&selinux_status_lock); + if (selinux_status_page) { + status = page_address(selinux_status_page); + + status->sequence++; + smp_wmb(); + + status->enforcing = enforcing; + + smp_wmb(); + status->sequence++; + } + mutex_unlock(&selinux_status_lock); +} + +/* + * selinux_status_update_policyload + * + * It updates status of the times of policy reloaded, and current + * setting of deny_unknown. + */ +void selinux_status_update_policyload(int seqno) +{ + struct selinux_kernel_status *status; + + mutex_lock(&selinux_status_lock); + if (selinux_status_page) { + status = page_address(selinux_status_page); + + status->sequence++; + smp_wmb(); + + status->policyload = seqno; + status->deny_unknown = !security_get_allow_unknown(); + + smp_wmb(); + status->sequence++; + } + mutex_unlock(&selinux_status_lock); +} diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index c448d57ae2b7..bc39f4067af6 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -1281,12 +1281,11 @@ static int smack_task_getioprio(struct task_struct *p) * * Return 0 if read access is permitted */ -static int smack_task_setscheduler(struct task_struct *p, int policy, - struct sched_param *lp) +static int smack_task_setscheduler(struct task_struct *p) { int rc; - rc = cap_task_setscheduler(p, policy, lp); + rc = cap_task_setscheduler(p); if (rc == 0) rc = smk_curacc_on_task(p, MAY_WRITE); return rc; @@ -3005,7 +3004,8 @@ static int smack_secid_to_secctx(u32 secid, char **secdata, u32 *seclen) { char *sp = smack_from_secid(secid); - *secdata = sp; + if (secdata) + *secdata = sp; *seclen = strlen(sp); return 0; } diff --git a/security/tomoyo/common.c b/security/tomoyo/common.c index c668b447c725..7556315c1978 100644 --- a/security/tomoyo/common.c +++ b/security/tomoyo/common.c @@ -768,8 +768,10 @@ static bool tomoyo_select_one(struct tomoyo_io_buffer *head, const char *data) return true; /* Do nothing if open(O_WRONLY). */ memset(&head->r, 0, sizeof(head->r)); head->r.print_this_domain_only = true; - head->r.eof = !domain; - head->r.domain = &domain->list; + if (domain) + head->r.domain = &domain->list; + else + head->r.eof = 1; tomoyo_io_printf(head, "# select %s\n", data); if (domain && domain->is_deleted) tomoyo_io_printf(head, "# This is a deleted domain.\n"); @@ -2051,13 +2053,22 @@ void tomoyo_check_profile(void) const u8 profile = domain->profile; if (tomoyo_profile_ptr[profile]) continue; + printk(KERN_ERR "You need to define profile %u before using it.\n", + profile); + printk(KERN_ERR "Please see http://tomoyo.sourceforge.jp/2.3/ " + "for more information.\n"); panic("Profile %u (used by '%s') not defined.\n", profile, domain->domainname->name); } tomoyo_read_unlock(idx); - if (tomoyo_profile_version != 20090903) + if (tomoyo_profile_version != 20090903) { + printk(KERN_ERR "You need to install userland programs for " + "TOMOYO 2.3 and initialize policy configuration.\n"); + printk(KERN_ERR "Please see http://tomoyo.sourceforge.jp/2.3/ " + "for more information.\n"); panic("Profile version %u is not supported.\n", tomoyo_profile_version); + } printk(KERN_INFO "TOMOYO: 2.3.0\n"); printk(KERN_INFO "Mandatory Access Control activated.\n"); } diff --git a/sound/core/rawmidi.c b/sound/core/rawmidi.c index a7868ad4d530..cbbed0db9e56 100644 --- a/sound/core/rawmidi.c +++ b/sound/core/rawmidi.c @@ -535,13 +535,15 @@ static int snd_rawmidi_release(struct inode *inode, struct file *file) { struct snd_rawmidi_file *rfile; struct snd_rawmidi *rmidi; + struct module *module; rfile = file->private_data; rmidi = rfile->rmidi; rawmidi_release_priv(rfile); kfree(rfile); + module = rmidi->card->module; snd_card_file_remove(rmidi->card, file); - module_put(rmidi->card->module); + module_put(module); return 0; } diff --git a/tools/perf/Documentation/perf-annotate.txt b/tools/perf/Documentation/perf-annotate.txt index 5164a655c39f..b2c63309a651 100644 --- a/tools/perf/Documentation/perf-annotate.txt +++ b/tools/perf/Documentation/perf-annotate.txt @@ -8,7 +8,7 @@ perf-annotate - Read perf.data (created by perf record) and display annotated co SYNOPSIS -------- [verse] -'perf annotate' [-i <file> | --input=file] symbol_name +'perf annotate' [-i <file> | --input=file] [symbol_name] DESCRIPTION ----------- @@ -24,6 +24,13 @@ OPTIONS --input=:: Input file name. (default: perf.data) +--stdio:: Use the stdio interface. + +--tui:: Use the TUI interface Use of --tui requires a tty, if one is not + present, as when piping to other commands, the stdio interface is + used. This interfaces starts by centering on the line with more + samples, TAB/UNTAB cycles thru the lines with more samples. + SEE ALSO -------- -linkperf:perf-record[1] +linkperf:perf-record[1], linkperf:perf-report[1] diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt index abfabe9147a4..12052c9ed0ba 100644 --- a/tools/perf/Documentation/perf-report.txt +++ b/tools/perf/Documentation/perf-report.txt @@ -65,6 +65,13 @@ OPTIONS the tree is considered as a new profiled object. + Default: fractal,0.5. +--stdio:: Use the stdio interface. + +--tui:: Use the TUI interface, that is integrated with annotate and allows + zooming into DSOs or threads, among other features. Use of --tui + requires a tty, if one is not present, as when piping to other + commands, the stdio interface is used. + SEE ALSO -------- linkperf:perf-stat[1] diff --git a/tools/perf/Makefile b/tools/perf/Makefile index 1950e19af1cf..d1db0f676a4b 100644 --- a/tools/perf/Makefile +++ b/tools/perf/Makefile @@ -313,6 +313,9 @@ TEST_PROGRAMS = SCRIPT_SH += perf-archive.sh +grep-libs = $(filter -l%,$(1)) +strip-libs = $(filter-out -l%,$(1)) + # # No Perl scripts right now: # @@ -588,14 +591,17 @@ endif ifdef NO_LIBPERL BASIC_CFLAGS += -DNO_LIBPERL else - PERL_EMBED_LDOPTS = `perl -MExtUtils::Embed -e ldopts 2>/dev/null` + PERL_EMBED_LDOPTS = $(shell perl -MExtUtils::Embed -e ldopts 2>/dev/null) + PERL_EMBED_LDFLAGS = $(call strip-libs,$(PERL_EMBED_LDOPTS)) + PERL_EMBED_LIBADD = $(call grep-libs,$(PERL_EMBED_LDOPTS)) PERL_EMBED_CCOPTS = `perl -MExtUtils::Embed -e ccopts 2>/dev/null` FLAGS_PERL_EMBED=$(PERL_EMBED_CCOPTS) $(PERL_EMBED_LDOPTS) ifneq ($(call try-cc,$(SOURCE_PERL_EMBED),$(FLAGS_PERL_EMBED)),y) BASIC_CFLAGS += -DNO_LIBPERL else - ALL_LDFLAGS += $(PERL_EMBED_LDOPTS) + ALL_LDFLAGS += $(PERL_EMBED_LDFLAGS) + EXTLIBS += $(PERL_EMBED_LIBADD) LIB_OBJS += $(OUTPUT)util/scripting-engines/trace-event-perl.o LIB_OBJS += $(OUTPUT)scripts/perl/Perf-Trace-Util/Context.o endif @@ -604,13 +610,16 @@ endif ifdef NO_LIBPYTHON BASIC_CFLAGS += -DNO_LIBPYTHON else - PYTHON_EMBED_LDOPTS = `python-config --ldflags 2>/dev/null` + PYTHON_EMBED_LDOPTS = $(shell python-config --ldflags 2>/dev/null) + PYTHON_EMBED_LDFLAGS = $(call strip-libs,$(PYTHON_EMBED_LDOPTS)) + PYTHON_EMBED_LIBADD = $(call grep-libs,$(PYTHON_EMBED_LDOPTS)) PYTHON_EMBED_CCOPTS = `python-config --cflags 2>/dev/null` FLAGS_PYTHON_EMBED=$(PYTHON_EMBED_CCOPTS) $(PYTHON_EMBED_LDOPTS) ifneq ($(call try-cc,$(SOURCE_PYTHON_EMBED),$(FLAGS_PYTHON_EMBED)),y) BASIC_CFLAGS += -DNO_LIBPYTHON else - ALL_LDFLAGS += $(PYTHON_EMBED_LDOPTS) + ALL_LDFLAGS += $(PYTHON_EMBED_LDFLAGS) + EXTLIBS += $(PYTHON_EMBED_LIBADD) LIB_OBJS += $(OUTPUT)util/scripting-engines/trace-event-python.o LIB_OBJS += $(OUTPUT)scripts/python/Perf-Trace-Util/Context.o endif @@ -653,6 +662,15 @@ else endif endif + +ifdef NO_STRLCPY + BASIC_CFLAGS += -DNO_STRLCPY +else + ifneq ($(call try-cc,$(SOURCE_STRLCPY),),y) + BASIC_CFLAGS += -DNO_STRLCPY + endif +endif + ifndef CC_LD_DYNPATH ifdef NO_R_TO_GCC_LINKER # Some gcc does not accept and pass -R to the linker to specify @@ -910,8 +928,8 @@ $(OUTPUT)perf.o: perf.c $(OUTPUT)common-cmds.h $(OUTPUT)PERF-CFLAGS $(ALL_CFLAGS) -c $(filter %.c,$^) -o $@ $(OUTPUT)perf$X: $(OUTPUT)perf.o $(BUILTIN_OBJS) $(PERFLIBS) - $(QUIET_LINK)$(CC) $(ALL_CFLAGS) -o $@ $(OUTPUT)perf.o \ - $(BUILTIN_OBJS) $(ALL_LDFLAGS) $(LIBS) + $(QUIET_LINK)$(CC) $(ALL_CFLAGS) $(ALL_LDFLAGS) $(OUTPUT)perf.o \ + $(BUILTIN_OBJS) $(LIBS) -o $@ $(OUTPUT)builtin-help.o: builtin-help.c $(OUTPUT)common-cmds.h $(OUTPUT)PERF-CFLAGS $(QUIET_CC)$(CC) -o $@ -c $(ALL_CFLAGS) \ diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c index 1478dc64bf15..6d5604d8df95 100644 --- a/tools/perf/builtin-annotate.c +++ b/tools/perf/builtin-annotate.c @@ -28,7 +28,7 @@ static char const *input_name = "perf.data"; -static bool force; +static bool force, use_tui, use_stdio; static bool full_paths; @@ -321,7 +321,7 @@ static int hist_entry__tty_annotate(struct hist_entry *he) static void hists__find_annotations(struct hists *self) { - struct rb_node *first = rb_first(&self->entries), *nd = first; + struct rb_node *nd = rb_first(&self->entries), *next; int key = KEY_RIGHT; while (nd) { @@ -343,20 +343,19 @@ find_next: if (use_browser > 0) { key = hist_entry__tui_annotate(he); - if (is_exit_key(key)) - break; switch (key) { case KEY_RIGHT: - case '\t': - nd = rb_next(nd); + next = rb_next(nd); break; case KEY_LEFT: - if (nd == first) - continue; - nd = rb_prev(nd); - default: + next = rb_prev(nd); break; + default: + return; } + + if (next != NULL) + nd = next; } else { hist_entry__tty_annotate(he); nd = rb_next(nd); @@ -428,6 +427,8 @@ static const struct option options[] = { "be more verbose (show symbol address, etc)"), OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace, "dump raw trace in ASCII"), + OPT_BOOLEAN(0, "tui", &use_tui, "Use the TUI interface"), + OPT_BOOLEAN(0, "stdio", &use_stdio, "Use the stdio interface"), OPT_STRING('k', "vmlinux", &symbol_conf.vmlinux_name, "file", "vmlinux pathname"), OPT_BOOLEAN('m', "modules", &symbol_conf.use_modules, @@ -443,6 +444,11 @@ int cmd_annotate(int argc, const char **argv, const char *prefix __used) { argc = parse_options(argc, argv, options, annotate_usage, 0); + if (use_stdio) + use_browser = 0; + else if (use_tui) + use_browser = 1; + setup_browser(); symbol_conf.priv_size = sizeof(struct sym_priv); diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 55fc1f46892a..5de405d45230 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -32,7 +32,7 @@ static char const *input_name = "perf.data"; -static bool force; +static bool force, use_tui, use_stdio; static bool hide_unresolved; static bool dont_use_callchains; @@ -107,7 +107,8 @@ static int perf_session__add_hist_entry(struct perf_session *self, goto out_free_syms; err = 0; if (symbol_conf.use_callchain) { - err = append_chain(he->callchain, data->callchain, syms, data->period); + err = callchain_append(he->callchain, data->callchain, syms, + data->period); if (err) goto out_free_syms; } @@ -450,6 +451,8 @@ static const struct option options[] = { "Show per-thread event counters"), OPT_STRING(0, "pretty", &pretty_printing_style, "key", "pretty printing style key: normal raw"), + OPT_BOOLEAN(0, "tui", &use_tui, "Use the TUI interface"), + OPT_BOOLEAN(0, "stdio", &use_stdio, "Use the stdio interface"), OPT_STRING('s', "sort", &sort_order, "key[,key2...]", "sort by key(s): pid, comm, dso, symbol, parent"), OPT_BOOLEAN(0, "showcpuutilization", &symbol_conf.show_cpu_utilization, @@ -482,8 +485,15 @@ int cmd_report(int argc, const char **argv, const char *prefix __used) { argc = parse_options(argc, argv, options, report_usage, 0); + if (use_stdio) + use_browser = 0; + else if (use_tui) + use_browser = 1; + if (strcmp(input_name, "-") != 0) setup_browser(); + else + use_browser = 0; /* * Only in the newt browser we are doing integrated annotation, * so don't allocate extra space that won't be used in the stdio diff --git a/tools/perf/feature-tests.mak b/tools/perf/feature-tests.mak index 7a7b60859053..b253db634f04 100644 --- a/tools/perf/feature-tests.mak +++ b/tools/perf/feature-tests.mak @@ -110,6 +110,17 @@ int main(void) } endef +define SOURCE_STRLCPY +#include <stdlib.h> +extern size_t strlcpy(char *dest, const char *src, size_t size); + +int main(void) +{ + strlcpy(NULL, NULL, 0); + return 0; +} +endef + # try-cc # Usage: option = $(call try-cc, source-to-build, cc-options) try-cc = $(shell sh -c \ diff --git a/tools/perf/scripts/python/bin/netdev-times-record b/tools/perf/scripts/python/bin/netdev-times-record new file mode 100644 index 000000000000..d931a828126b --- /dev/null +++ b/tools/perf/scripts/python/bin/netdev-times-record @@ -0,0 +1,8 @@ +#!/bin/bash +perf record -a -e net:net_dev_xmit -e net:net_dev_queue \ + -e net:netif_receive_skb -e net:netif_rx \ + -e skb:consume_skb -e skb:kfree_skb \ + -e skb:skb_copy_datagram_iovec -e napi:napi_poll \ + -e irq:irq_handler_entry -e irq:irq_handler_exit \ + -e irq:softirq_entry -e irq:softirq_exit \ + -e irq:softirq_raise $@ diff --git a/tools/perf/scripts/python/bin/netdev-times-report b/tools/perf/scripts/python/bin/netdev-times-report new file mode 100644 index 000000000000..c3d0a638123d --- /dev/null +++ b/tools/perf/scripts/python/bin/netdev-times-report @@ -0,0 +1,5 @@ +#!/bin/bash +# description: display a process of packet and processing time +# args: [tx] [rx] [dev=] [debug] + +perf trace -s ~/libexec/perf-core/scripts/python/netdev-times.py $@ diff --git a/tools/perf/scripts/python/netdev-times.py b/tools/perf/scripts/python/netdev-times.py new file mode 100644 index 000000000000..9aa0a32972e8 --- /dev/null +++ b/tools/perf/scripts/python/netdev-times.py @@ -0,0 +1,464 @@ +# Display a process of packets and processed time. +# It helps us to investigate networking or network device. +# +# options +# tx: show only tx chart +# rx: show only rx chart +# dev=: show only thing related to specified device +# debug: work with debug mode. It shows buffer status. + +import os +import sys + +sys.path.append(os.environ['PERF_EXEC_PATH'] + \ + '/scripts/python/Perf-Trace-Util/lib/Perf/Trace') + +from perf_trace_context import * +from Core import * +from Util import * + +all_event_list = []; # insert all tracepoint event related with this script +irq_dic = {}; # key is cpu and value is a list which stacks irqs + # which raise NET_RX softirq +net_rx_dic = {}; # key is cpu and value include time of NET_RX softirq-entry + # and a list which stacks receive +receive_hunk_list = []; # a list which include a sequence of receive events +rx_skb_list = []; # received packet list for matching + # skb_copy_datagram_iovec + +buffer_budget = 65536; # the budget of rx_skb_list, tx_queue_list and + # tx_xmit_list +of_count_rx_skb_list = 0; # overflow count + +tx_queue_list = []; # list of packets which pass through dev_queue_xmit +of_count_tx_queue_list = 0; # overflow count + +tx_xmit_list = []; # list of packets which pass through dev_hard_start_xmit +of_count_tx_xmit_list = 0; # overflow count + +tx_free_list = []; # list of packets which is freed + +# options +show_tx = 0; +show_rx = 0; +dev = 0; # store a name of device specified by option "dev=" +debug = 0; + +# indices of event_info tuple +EINFO_IDX_NAME= 0 +EINFO_IDX_CONTEXT=1 +EINFO_IDX_CPU= 2 +EINFO_IDX_TIME= 3 +EINFO_IDX_PID= 4 +EINFO_IDX_COMM= 5 + +# Calculate a time interval(msec) from src(nsec) to dst(nsec) +def diff_msec(src, dst): + return (dst - src) / 1000000.0 + +# Display a process of transmitting a packet +def print_transmit(hunk): + if dev != 0 and hunk['dev'].find(dev) < 0: + return + print "%7s %5d %6d.%06dsec %12.3fmsec %12.3fmsec" % \ + (hunk['dev'], hunk['len'], + nsecs_secs(hunk['queue_t']), + nsecs_nsecs(hunk['queue_t'])/1000, + diff_msec(hunk['queue_t'], hunk['xmit_t']), + diff_msec(hunk['xmit_t'], hunk['free_t'])) + +# Format for displaying rx packet processing +PF_IRQ_ENTRY= " irq_entry(+%.3fmsec irq=%d:%s)" +PF_SOFT_ENTRY=" softirq_entry(+%.3fmsec)" +PF_NAPI_POLL= " napi_poll_exit(+%.3fmsec %s)" +PF_JOINT= " |" +PF_WJOINT= " | |" +PF_NET_RECV= " |---netif_receive_skb(+%.3fmsec skb=%x len=%d)" +PF_NET_RX= " |---netif_rx(+%.3fmsec skb=%x)" +PF_CPY_DGRAM= " | skb_copy_datagram_iovec(+%.3fmsec %d:%s)" +PF_KFREE_SKB= " | kfree_skb(+%.3fmsec location=%x)" +PF_CONS_SKB= " | consume_skb(+%.3fmsec)" + +# Display a process of received packets and interrputs associated with +# a NET_RX softirq +def print_receive(hunk): + show_hunk = 0 + irq_list = hunk['irq_list'] + cpu = irq_list[0]['cpu'] + base_t = irq_list[0]['irq_ent_t'] + # check if this hunk should be showed + if dev != 0: + for i in range(len(irq_list)): + if irq_list[i]['name'].find(dev) >= 0: + show_hunk = 1 + break + else: + show_hunk = 1 + if show_hunk == 0: + return + + print "%d.%06dsec cpu=%d" % \ + (nsecs_secs(base_t), nsecs_nsecs(base_t)/1000, cpu) + for i in range(len(irq_list)): + print PF_IRQ_ENTRY % \ + (diff_msec(base_t, irq_list[i]['irq_ent_t']), + irq_list[i]['irq'], irq_list[i]['name']) + print PF_JOINT + irq_event_list = irq_list[i]['event_list'] + for j in range(len(irq_event_list)): + irq_event = irq_event_list[j] + if irq_event['event'] == 'netif_rx': + print PF_NET_RX % \ + (diff_msec(base_t, irq_event['time']), + irq_event['skbaddr']) + print PF_JOINT + print PF_SOFT_ENTRY % \ + diff_msec(base_t, hunk['sirq_ent_t']) + print PF_JOINT + event_list = hunk['event_list'] + for i in range(len(event_list)): + event = event_list[i] + if event['event_name'] == 'napi_poll': + print PF_NAPI_POLL % \ + (diff_msec(base_t, event['event_t']), event['dev']) + if i == len(event_list) - 1: + print "" + else: + print PF_JOINT + else: + print PF_NET_RECV % \ + (diff_msec(base_t, event['event_t']), event['skbaddr'], + event['len']) + if 'comm' in event.keys(): + print PF_WJOINT + print PF_CPY_DGRAM % \ + (diff_msec(base_t, event['comm_t']), + event['pid'], event['comm']) + elif 'handle' in event.keys(): + print PF_WJOINT + if event['handle'] == "kfree_skb": + print PF_KFREE_SKB % \ + (diff_msec(base_t, + event['comm_t']), + event['location']) + elif event['handle'] == "consume_skb": + print PF_CONS_SKB % \ + diff_msec(base_t, + event['comm_t']) + print PF_JOINT + +def trace_begin(): + global show_tx + global show_rx + global dev + global debug + + for i in range(len(sys.argv)): + if i == 0: + continue + arg = sys.argv[i] + if arg == 'tx': + show_tx = 1 + elif arg =='rx': + show_rx = 1 + elif arg.find('dev=',0, 4) >= 0: + dev = arg[4:] + elif arg == 'debug': + debug = 1 + if show_tx == 0 and show_rx == 0: + show_tx = 1 + show_rx = 1 + +def trace_end(): + # order all events in time + all_event_list.sort(lambda a,b :cmp(a[EINFO_IDX_TIME], + b[EINFO_IDX_TIME])) + # process all events + for i in range(len(all_event_list)): + event_info = all_event_list[i] + name = event_info[EINFO_IDX_NAME] + if name == 'irq__softirq_exit': + handle_irq_softirq_exit(event_info) + elif name == 'irq__softirq_entry': + handle_irq_softirq_entry(event_info) + elif name == 'irq__softirq_raise': + handle_irq_softirq_raise(event_info) + elif name == 'irq__irq_handler_entry': + handle_irq_handler_entry(event_info) + elif name == 'irq__irq_handler_exit': + handle_irq_handler_exit(event_info) + elif name == 'napi__napi_poll': + handle_napi_poll(event_info) + elif name == 'net__netif_receive_skb': + handle_netif_receive_skb(event_info) + elif name == 'net__netif_rx': + handle_netif_rx(event_info) + elif name == 'skb__skb_copy_datagram_iovec': + handle_skb_copy_datagram_iovec(event_info) + elif name == 'net__net_dev_queue': + handle_net_dev_queue(event_info) + elif name == 'net__net_dev_xmit': + handle_net_dev_xmit(event_info) + elif name == 'skb__kfree_skb': + handle_kfree_skb(event_info) + elif name == 'skb__consume_skb': + handle_consume_skb(event_info) + # display receive hunks + if show_rx: + for i in range(len(receive_hunk_list)): + print_receive(receive_hunk_list[i]) + # display transmit hunks + if show_tx: + print " dev len Qdisc " \ + " netdevice free" + for i in range(len(tx_free_list)): + print_transmit(tx_free_list[i]) + if debug: + print "debug buffer status" + print "----------------------------" + print "xmit Qdisc:remain:%d overflow:%d" % \ + (len(tx_queue_list), of_count_tx_queue_list) + print "xmit netdevice:remain:%d overflow:%d" % \ + (len(tx_xmit_list), of_count_tx_xmit_list) + print "receive:remain:%d overflow:%d" % \ + (len(rx_skb_list), of_count_rx_skb_list) + +# called from perf, when it finds a correspoinding event +def irq__softirq_entry(name, context, cpu, sec, nsec, pid, comm, vec): + if symbol_str("irq__softirq_entry", "vec", vec) != "NET_RX": + return + event_info = (name, context, cpu, nsecs(sec, nsec), pid, comm, vec) + all_event_list.append(event_info) + +def irq__softirq_exit(name, context, cpu, sec, nsec, pid, comm, vec): + if symbol_str("irq__softirq_entry", "vec", vec) != "NET_RX": + return + event_info = (name, context, cpu, nsecs(sec, nsec), pid, comm, vec) + all_event_list.append(event_info) + +def irq__softirq_raise(name, context, cpu, sec, nsec, pid, comm, vec): + if symbol_str("irq__softirq_entry", "vec", vec) != "NET_RX": + return + event_info = (name, context, cpu, nsecs(sec, nsec), pid, comm, vec) + all_event_list.append(event_info) + +def irq__irq_handler_entry(name, context, cpu, sec, nsec, pid, comm, + irq, irq_name): + event_info = (name, context, cpu, nsecs(sec, nsec), pid, comm, + irq, irq_name) + all_event_list.append(event_info) + +def irq__irq_handler_exit(name, context, cpu, sec, nsec, pid, comm, irq, ret): + event_info = (name, context, cpu, nsecs(sec, nsec), pid, comm, irq, ret) + all_event_list.append(event_info) + +def napi__napi_poll(name, context, cpu, sec, nsec, pid, comm, napi, dev_name): + event_info = (name, context, cpu, nsecs(sec, nsec), pid, comm, + napi, dev_name) + all_event_list.append(event_info) + +def net__netif_receive_skb(name, context, cpu, sec, nsec, pid, comm, skbaddr, + skblen, dev_name): + event_info = (name, context, cpu, nsecs(sec, nsec), pid, comm, + skbaddr, skblen, dev_name) + all_event_list.append(event_info) + +def net__netif_rx(name, context, cpu, sec, nsec, pid, comm, skbaddr, + skblen, dev_name): + event_info = (name, context, cpu, nsecs(sec, nsec), pid, comm, + skbaddr, skblen, dev_name) + all_event_list.append(event_info) + +def net__net_dev_queue(name, context, cpu, sec, nsec, pid, comm, + skbaddr, skblen, dev_name): + event_info = (name, context, cpu, nsecs(sec, nsec), pid, comm, + skbaddr, skblen, dev_name) + all_event_list.append(event_info) + +def net__net_dev_xmit(name, context, cpu, sec, nsec, pid, comm, + skbaddr, skblen, rc, dev_name): + event_info = (name, context, cpu, nsecs(sec, nsec), pid, comm, + skbaddr, skblen, rc ,dev_name) + all_event_list.append(event_info) + +def skb__kfree_skb(name, context, cpu, sec, nsec, pid, comm, + skbaddr, protocol, location): + event_info = (name, context, cpu, nsecs(sec, nsec), pid, comm, + skbaddr, protocol, location) + all_event_list.append(event_info) + +def skb__consume_skb(name, context, cpu, sec, nsec, pid, comm, skbaddr): + event_info = (name, context, cpu, nsecs(sec, nsec), pid, comm, + skbaddr) + all_event_list.append(event_info) + +def skb__skb_copy_datagram_iovec(name, context, cpu, sec, nsec, pid, comm, + skbaddr, skblen): + event_info = (name, context, cpu, nsecs(sec, nsec), pid, comm, + skbaddr, skblen) + all_event_list.append(event_info) + +def handle_irq_handler_entry(event_info): + (name, context, cpu, time, pid, comm, irq, irq_name) = event_info + if cpu not in irq_dic.keys(): + irq_dic[cpu] = [] + irq_record = {'irq':irq, 'name':irq_name, 'cpu':cpu, 'irq_ent_t':time} + irq_dic[cpu].append(irq_record) + +def handle_irq_handler_exit(event_info): + (name, context, cpu, time, pid, comm, irq, ret) = event_info + if cpu not in irq_dic.keys(): + return + irq_record = irq_dic[cpu].pop() + if irq != irq_record['irq']: + return + irq_record.update({'irq_ext_t':time}) + # if an irq doesn't include NET_RX softirq, drop. + if 'event_list' in irq_record.keys(): + irq_dic[cpu].append(irq_record) + +def handle_irq_softirq_raise(event_info): + (name, context, cpu, time, pid, comm, vec) = event_info + if cpu not in irq_dic.keys() \ + or len(irq_dic[cpu]) == 0: + return + irq_record = irq_dic[cpu].pop() + if 'event_list' in irq_record.keys(): + irq_event_list = irq_record['event_list'] + else: + irq_event_list = [] + irq_event_list.append({'time':time, 'event':'sirq_raise'}) + irq_record.update({'event_list':irq_event_list}) + irq_dic[cpu].append(irq_record) + +def handle_irq_softirq_entry(event_info): + (name, context, cpu, time, pid, comm, vec) = event_info + net_rx_dic[cpu] = {'sirq_ent_t':time, 'event_list':[]} + +def handle_irq_softirq_exit(event_info): + (name, context, cpu, time, pid, comm, vec) = event_info + irq_list = [] + event_list = 0 + if cpu in irq_dic.keys(): + irq_list = irq_dic[cpu] + del irq_dic[cpu] + if cpu in net_rx_dic.keys(): + sirq_ent_t = net_rx_dic[cpu]['sirq_ent_t'] + event_list = net_rx_dic[cpu]['event_list'] + del net_rx_dic[cpu] + if irq_list == [] or event_list == 0: + return + rec_data = {'sirq_ent_t':sirq_ent_t, 'sirq_ext_t':time, + 'irq_list':irq_list, 'event_list':event_list} + # merge information realted to a NET_RX softirq + receive_hunk_list.append(rec_data) + +def handle_napi_poll(event_info): + (name, context, cpu, time, pid, comm, napi, dev_name) = event_info + if cpu in net_rx_dic.keys(): + event_list = net_rx_dic[cpu]['event_list'] + rec_data = {'event_name':'napi_poll', + 'dev':dev_name, 'event_t':time} + event_list.append(rec_data) + +def handle_netif_rx(event_info): + (name, context, cpu, time, pid, comm, + skbaddr, skblen, dev_name) = event_info + if cpu not in irq_dic.keys() \ + or len(irq_dic[cpu]) == 0: + return + irq_record = irq_dic[cpu].pop() + if 'event_list' in irq_record.keys(): + irq_event_list = irq_record['event_list'] + else: + irq_event_list = [] + irq_event_list.append({'time':time, 'event':'netif_rx', + 'skbaddr':skbaddr, 'skblen':skblen, 'dev_name':dev_name}) + irq_record.update({'event_list':irq_event_list}) + irq_dic[cpu].append(irq_record) + +def handle_netif_receive_skb(event_info): + global of_count_rx_skb_list + + (name, context, cpu, time, pid, comm, + skbaddr, skblen, dev_name) = event_info + if cpu in net_rx_dic.keys(): + rec_data = {'event_name':'netif_receive_skb', + 'event_t':time, 'skbaddr':skbaddr, 'len':skblen} + event_list = net_rx_dic[cpu]['event_list'] + event_list.append(rec_data) + rx_skb_list.insert(0, rec_data) + if len(rx_skb_list) > buffer_budget: + rx_skb_list.pop() + of_count_rx_skb_list += 1 + +def handle_net_dev_queue(event_info): + global of_count_tx_queue_list + + (name, context, cpu, time, pid, comm, + skbaddr, skblen, dev_name) = event_info + skb = {'dev':dev_name, 'skbaddr':skbaddr, 'len':skblen, 'queue_t':time} + tx_queue_list.insert(0, skb) + if len(tx_queue_list) > buffer_budget: + tx_queue_list.pop() + of_count_tx_queue_list += 1 + +def handle_net_dev_xmit(event_info): + global of_count_tx_xmit_list + + (name, context, cpu, time, pid, comm, + skbaddr, skblen, rc, dev_name) = event_info + if rc == 0: # NETDEV_TX_OK + for i in range(len(tx_queue_list)): + skb = tx_queue_list[i] + if skb['skbaddr'] == skbaddr: + skb['xmit_t'] = time + tx_xmit_list.insert(0, skb) + del tx_queue_list[i] + if len(tx_xmit_list) > buffer_budget: + tx_xmit_list.pop() + of_count_tx_xmit_list += 1 + return + +def handle_kfree_skb(event_info): + (name, context, cpu, time, pid, comm, + skbaddr, protocol, location) = event_info + for i in range(len(tx_queue_list)): + skb = tx_queue_list[i] + if skb['skbaddr'] == skbaddr: + del tx_queue_list[i] + return + for i in range(len(tx_xmit_list)): + skb = tx_xmit_list[i] + if skb['skbaddr'] == skbaddr: + skb['free_t'] = time + tx_free_list.append(skb) + del tx_xmit_list[i] + return + for i in range(len(rx_skb_list)): + rec_data = rx_skb_list[i] + if rec_data['skbaddr'] == skbaddr: + rec_data.update({'handle':"kfree_skb", + 'comm':comm, 'pid':pid, 'comm_t':time}) + del rx_skb_list[i] + return + +def handle_consume_skb(event_info): + (name, context, cpu, time, pid, comm, skbaddr) = event_info + for i in range(len(tx_xmit_list)): + skb = tx_xmit_list[i] + if skb['skbaddr'] == skbaddr: + skb['free_t'] = time + tx_free_list.append(skb) + del tx_xmit_list[i] + return + +def handle_skb_copy_datagram_iovec(event_info): + (name, context, cpu, time, pid, comm, skbaddr, skblen) = event_info + for i in range(len(rx_skb_list)): + rec_data = rx_skb_list[i] + if skbaddr == rec_data['skbaddr']: + rec_data.update({'handle':"skb_copy_datagram_iovec", + 'comm':comm, 'pid':pid, 'comm_t':time}) + del rx_skb_list[i] + return diff --git a/tools/perf/util/cache.h b/tools/perf/util/cache.h index 27e9ebe4076e..a7729797fd96 100644 --- a/tools/perf/util/cache.h +++ b/tools/perf/util/cache.h @@ -82,6 +82,8 @@ extern char *perf_path(const char *fmt, ...) __attribute__((format (printf, 1, 2 extern char *perf_pathdup(const char *fmt, ...) __attribute__((format (printf, 1, 2))); +#ifdef NO_STRLCPY extern size_t strlcpy(char *dest, const char *src, size_t size); +#endif #endif /* __PERF_CACHE_H */ diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c index f231f43424d2..e12d539417b2 100644 --- a/tools/perf/util/callchain.c +++ b/tools/perf/util/callchain.c @@ -28,6 +28,9 @@ bool ip_callchain__valid(struct ip_callchain *chain, const event_t *event) #define chain_for_each_child(child, parent) \ list_for_each_entry(child, &parent->children, brothers) +#define chain_for_each_child_safe(child, next, parent) \ + list_for_each_entry_safe(child, next, &parent->children, brothers) + static void rb_insert_callchain(struct rb_root *root, struct callchain_node *chain, enum chain_mode mode) @@ -86,10 +89,10 @@ __sort_chain_flat(struct rb_root *rb_root, struct callchain_node *node, * sort them by hit */ static void -sort_chain_flat(struct rb_root *rb_root, struct callchain_node *node, +sort_chain_flat(struct rb_root *rb_root, struct callchain_root *root, u64 min_hit, struct callchain_param *param __used) { - __sort_chain_flat(rb_root, node, min_hit); + __sort_chain_flat(rb_root, &root->node, min_hit); } static void __sort_chain_graph_abs(struct callchain_node *node, @@ -108,11 +111,11 @@ static void __sort_chain_graph_abs(struct callchain_node *node, } static void -sort_chain_graph_abs(struct rb_root *rb_root, struct callchain_node *chain_root, +sort_chain_graph_abs(struct rb_root *rb_root, struct callchain_root *chain_root, u64 min_hit, struct callchain_param *param __used) { - __sort_chain_graph_abs(chain_root, min_hit); - rb_root->rb_node = chain_root->rb_root.rb_node; + __sort_chain_graph_abs(&chain_root->node, min_hit); + rb_root->rb_node = chain_root->node.rb_root.rb_node; } static void __sort_chain_graph_rel(struct callchain_node *node, @@ -133,11 +136,11 @@ static void __sort_chain_graph_rel(struct callchain_node *node, } static void -sort_chain_graph_rel(struct rb_root *rb_root, struct callchain_node *chain_root, +sort_chain_graph_rel(struct rb_root *rb_root, struct callchain_root *chain_root, u64 min_hit __used, struct callchain_param *param) { - __sort_chain_graph_rel(chain_root, param->min_percent / 100.0); - rb_root->rb_node = chain_root->rb_root.rb_node; + __sort_chain_graph_rel(&chain_root->node, param->min_percent / 100.0); + rb_root->rb_node = chain_root->node.rb_root.rb_node; } int register_callchain_param(struct callchain_param *param) @@ -284,19 +287,18 @@ split_add_child(struct callchain_node *parent, struct resolved_chain *chain, } static int -__append_chain(struct callchain_node *root, struct resolved_chain *chain, - unsigned int start, u64 period); +append_chain(struct callchain_node *root, struct resolved_chain *chain, + unsigned int start, u64 period); static void -__append_chain_children(struct callchain_node *root, - struct resolved_chain *chain, - unsigned int start, u64 period) +append_chain_children(struct callchain_node *root, struct resolved_chain *chain, + unsigned int start, u64 period) { struct callchain_node *rnode; /* lookup in childrens */ chain_for_each_child(rnode, root) { - unsigned int ret = __append_chain(rnode, chain, start, period); + unsigned int ret = append_chain(rnode, chain, start, period); if (!ret) goto inc_children_hit; @@ -309,8 +311,8 @@ inc_children_hit: } static int -__append_chain(struct callchain_node *root, struct resolved_chain *chain, - unsigned int start, u64 period) +append_chain(struct callchain_node *root, struct resolved_chain *chain, + unsigned int start, u64 period) { struct callchain_list *cnode; unsigned int i = start; @@ -357,7 +359,7 @@ __append_chain(struct callchain_node *root, struct resolved_chain *chain, } /* We match the node and still have a part remaining */ - __append_chain_children(root, chain, i, period); + append_chain_children(root, chain, i, period); return 0; } @@ -380,8 +382,8 @@ static void filter_context(struct ip_callchain *old, struct resolved_chain *new, } -int append_chain(struct callchain_node *root, struct ip_callchain *chain, - struct map_symbol *syms, u64 period) +int callchain_append(struct callchain_root *root, struct ip_callchain *chain, + struct map_symbol *syms, u64 period) { struct resolved_chain *filtered; @@ -398,9 +400,65 @@ int append_chain(struct callchain_node *root, struct ip_callchain *chain, if (!filtered->nr) goto end; - __append_chain_children(root, filtered, 0, period); + append_chain_children(&root->node, filtered, 0, period); + + if (filtered->nr > root->max_depth) + root->max_depth = filtered->nr; end: free(filtered); return 0; } + +static int +merge_chain_branch(struct callchain_node *dst, struct callchain_node *src, + struct resolved_chain *chain) +{ + struct callchain_node *child, *next_child; + struct callchain_list *list, *next_list; + int old_pos = chain->nr; + int err = 0; + + list_for_each_entry_safe(list, next_list, &src->val, list) { + chain->ips[chain->nr].ip = list->ip; + chain->ips[chain->nr].ms = list->ms; + chain->nr++; + list_del(&list->list); + free(list); + } + + if (src->hit) + append_chain_children(dst, chain, 0, src->hit); + + chain_for_each_child_safe(child, next_child, src) { + err = merge_chain_branch(dst, child, chain); + if (err) + break; + + list_del(&child->brothers); + free(child); + } + + chain->nr = old_pos; + + return err; +} + +int callchain_merge(struct callchain_root *dst, struct callchain_root *src) +{ + struct resolved_chain *chain; + int err; + + chain = malloc(sizeof(*chain) + + src->max_depth * sizeof(struct resolved_ip)); + if (!chain) + return -ENOMEM; + + chain->nr = 0; + + err = merge_chain_branch(&dst->node, &src->node, chain); + + free(chain); + + return err; +} diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h index 6de4313924fb..c15fb8c24ad2 100644 --- a/tools/perf/util/callchain.h +++ b/tools/perf/util/callchain.h @@ -26,9 +26,14 @@ struct callchain_node { u64 children_hit; }; +struct callchain_root { + u64 max_depth; + struct callchain_node node; +}; + struct callchain_param; -typedef void (*sort_chain_func_t)(struct rb_root *, struct callchain_node *, +typedef void (*sort_chain_func_t)(struct rb_root *, struct callchain_root *, u64, struct callchain_param *); struct callchain_param { @@ -44,15 +49,16 @@ struct callchain_list { struct list_head list; }; -static inline void callchain_init(struct callchain_node *node) +static inline void callchain_init(struct callchain_root *root) { - INIT_LIST_HEAD(&node->brothers); - INIT_LIST_HEAD(&node->children); - INIT_LIST_HEAD(&node->val); + INIT_LIST_HEAD(&root->node.brothers); + INIT_LIST_HEAD(&root->node.children); + INIT_LIST_HEAD(&root->node.val); - node->children_hit = 0; - node->parent = NULL; - node->hit = 0; + root->node.parent = NULL; + root->node.hit = 0; + root->node.children_hit = 0; + root->max_depth = 0; } static inline u64 cumul_hits(struct callchain_node *node) @@ -61,8 +67,9 @@ static inline u64 cumul_hits(struct callchain_node *node) } int register_callchain_param(struct callchain_param *param); -int append_chain(struct callchain_node *root, struct ip_callchain *chain, - struct map_symbol *syms, u64 period); +int callchain_append(struct callchain_root *root, struct ip_callchain *chain, + struct map_symbol *syms, u64 period); +int callchain_merge(struct callchain_root *dst, struct callchain_root *src); bool ip_callchain__valid(struct ip_callchain *chain, const event_t *event); #endif /* __PERF_CALLCHAIN_H */ diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index be22ae6ef055..2022e8740994 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -87,7 +87,7 @@ static void hist_entry__add_cpumode_period(struct hist_entry *self, static struct hist_entry *hist_entry__new(struct hist_entry *template) { - size_t callchain_size = symbol_conf.use_callchain ? sizeof(struct callchain_node) : 0; + size_t callchain_size = symbol_conf.use_callchain ? sizeof(struct callchain_root) : 0; struct hist_entry *self = malloc(sizeof(*self) + callchain_size); if (self != NULL) { @@ -226,6 +226,8 @@ static bool collapse__insert_entry(struct rb_root *root, struct hist_entry *he) if (!cmp) { iter->period += he->period; + if (symbol_conf.use_callchain) + callchain_merge(iter->callchain, he->callchain); hist_entry__free(he); return false; } diff --git a/tools/perf/util/path.c b/tools/perf/util/path.c index 58a470d036dd..bd7497711424 100644 --- a/tools/perf/util/path.c +++ b/tools/perf/util/path.c @@ -22,6 +22,7 @@ static const char *get_perf_dir(void) return "."; } +#ifdef NO_STRLCPY size_t strlcpy(char *dest, const char *src, size_t size) { size_t ret = strlen(src); @@ -33,7 +34,7 @@ size_t strlcpy(char *dest, const char *src, size_t size) } return ret; } - +#endif static char *get_pathname(void) { diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h index 46e531d09e8b..0b91053a7d11 100644 --- a/tools/perf/util/sort.h +++ b/tools/perf/util/sort.h @@ -70,7 +70,7 @@ struct hist_entry { struct hist_entry *pair; struct rb_root sorted_chain; }; - struct callchain_node callchain[0]; + struct callchain_root callchain[0]; }; enum sort_type { diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c index b2f5ae97f33d..b39f499e575a 100644 --- a/tools/perf/util/symbol.c +++ b/tools/perf/util/symbol.c @@ -388,6 +388,20 @@ size_t dso__fprintf_buildid(struct dso *self, FILE *fp) return fprintf(fp, "%s", sbuild_id); } +size_t dso__fprintf_symbols_by_name(struct dso *self, enum map_type type, FILE *fp) +{ + size_t ret = 0; + struct rb_node *nd; + struct symbol_name_rb_node *pos; + + for (nd = rb_first(&self->symbol_names[type]); nd; nd = rb_next(nd)) { + pos = rb_entry(nd, struct symbol_name_rb_node, rb_node); + fprintf(fp, "%s\n", pos->sym.name); + } + + return ret; +} + size_t dso__fprintf(struct dso *self, enum map_type type, FILE *fp) { struct rb_node *nd; diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h index ea95c2756f05..038f2201ee09 100644 --- a/tools/perf/util/symbol.h +++ b/tools/perf/util/symbol.h @@ -182,6 +182,7 @@ size_t machines__fprintf_dsos(struct rb_root *self, FILE *fp); size_t machines__fprintf_dsos_buildid(struct rb_root *self, FILE *fp, bool with_hits); size_t dso__fprintf_buildid(struct dso *self, FILE *fp); +size_t dso__fprintf_symbols_by_name(struct dso *self, enum map_type type, FILE *fp); size_t dso__fprintf(struct dso *self, enum map_type type, FILE *fp); enum dso_origin { diff --git a/tools/perf/util/ui/browser.c b/tools/perf/util/ui/browser.c index 66f2d583d8c4..6d0df809a2ed 100644 --- a/tools/perf/util/ui/browser.c +++ b/tools/perf/util/ui/browser.c @@ -1,16 +1,6 @@ -#define _GNU_SOURCE -#include <stdio.h> -#undef _GNU_SOURCE -/* - * slang versions <= 2.0.6 have a "#if HAVE_LONG_LONG" that breaks - * the build if it isn't defined. Use the equivalent one that glibc - * has on features.h. - */ -#include <features.h> -#ifndef HAVE_LONG_LONG -#define HAVE_LONG_LONG __GLIBC_HAVE_LONG_LONG -#endif #include <slang.h> +#include "libslang.h" +#include <linux/compiler.h> #include <linux/list.h> #include <linux/rbtree.h> #include <stdlib.h> @@ -19,17 +9,9 @@ #include "helpline.h" #include "../color.h" #include "../util.h" +#include <stdio.h> -#if SLANG_VERSION < 20104 -#define sltt_set_color(obj, name, fg, bg) \ - SLtt_set_color(obj,(char *)name, (char *)fg, (char *)bg) -#else -#define sltt_set_color SLtt_set_color -#endif - -newtComponent newt_form__new(void); - -int ui_browser__percent_color(double percent, bool current) +static int ui_browser__percent_color(double percent, bool current) { if (current) return HE_COLORSET_SELECTED; @@ -40,6 +22,23 @@ int ui_browser__percent_color(double percent, bool current) return HE_COLORSET_NORMAL; } +void ui_browser__set_color(struct ui_browser *self __used, int color) +{ + SLsmg_set_color(color); +} + +void ui_browser__set_percent_color(struct ui_browser *self, + double percent, bool current) +{ + int color = ui_browser__percent_color(percent, current); + ui_browser__set_color(self, color); +} + +void ui_browser__gotorc(struct ui_browser *self, int y, int x) +{ + SLsmg_gotorc(self->y + y, self->x + x); +} + void ui_browser__list_head_seek(struct ui_browser *self, off_t offset, int whence) { struct list_head *head = self->entries; @@ -111,7 +110,7 @@ unsigned int ui_browser__rb_tree_refresh(struct ui_browser *self) nd = self->top; while (nd != NULL) { - SLsmg_gotorc(self->y + row, self->x); + ui_browser__gotorc(self, row, 0); self->write(self, nd, row); if (++row == self->height) break; @@ -131,13 +130,10 @@ void ui_browser__refresh_dimensions(struct ui_browser *self) int cols, rows; newtGetScreenSize(&cols, &rows); - if (self->width > cols - 4) - self->width = cols - 4; - self->height = rows - 5; - if (self->height > self->nr_entries) - self->height = self->nr_entries; - self->y = (rows - self->height) / 2; - self->x = (cols - self->width) / 2; + self->width = cols - 1; + self->height = rows - 2; + self->y = 1; + self->x = 0; } void ui_browser__reset_index(struct ui_browser *self) @@ -146,34 +142,48 @@ void ui_browser__reset_index(struct ui_browser *self) self->seek(self, 0, SEEK_SET); } +void ui_browser__add_exit_key(struct ui_browser *self, int key) +{ + newtFormAddHotKey(self->form, key); +} + +void ui_browser__add_exit_keys(struct ui_browser *self, int keys[]) +{ + int i = 0; + + while (keys[i] && i < 64) { + ui_browser__add_exit_key(self, keys[i]); + ++i; + } +} + int ui_browser__show(struct ui_browser *self, const char *title, const char *helpline, ...) { va_list ap; + int keys[] = { NEWT_KEY_UP, NEWT_KEY_DOWN, NEWT_KEY_PGUP, + NEWT_KEY_PGDN, NEWT_KEY_HOME, NEWT_KEY_END, ' ', + NEWT_KEY_LEFT, NEWT_KEY_ESCAPE, 'q', CTRL('c'), 0 }; - if (self->form != NULL) { + if (self->form != NULL) newtFormDestroy(self->form); - newtPopWindow(); - } + ui_browser__refresh_dimensions(self); - newtCenteredWindow(self->width, self->height, title); - self->form = newt_form__new(); + self->form = newtForm(NULL, NULL, 0); if (self->form == NULL) return -1; - self->sb = newtVerticalScrollbar(self->width, 0, self->height, + self->sb = newtVerticalScrollbar(self->width, 1, self->height, HE_COLORSET_NORMAL, HE_COLORSET_SELECTED); if (self->sb == NULL) return -1; - newtFormAddHotKey(self->form, NEWT_KEY_UP); - newtFormAddHotKey(self->form, NEWT_KEY_DOWN); - newtFormAddHotKey(self->form, NEWT_KEY_PGUP); - newtFormAddHotKey(self->form, NEWT_KEY_PGDN); - newtFormAddHotKey(self->form, NEWT_KEY_HOME); - newtFormAddHotKey(self->form, NEWT_KEY_END); - newtFormAddHotKey(self->form, ' '); + SLsmg_gotorc(0, 0); + ui_browser__set_color(self, NEWT_COLORSET_ROOT); + slsmg_write_nstring(title, self->width); + + ui_browser__add_exit_keys(self, keys); newtFormAddComponent(self->form, self->sb); va_start(ap, helpline); @@ -185,7 +195,6 @@ int ui_browser__show(struct ui_browser *self, const char *title, void ui_browser__hide(struct ui_browser *self) { newtFormDestroy(self->form); - newtPopWindow(); self->form = NULL; ui_helpline__pop(); } @@ -196,28 +205,28 @@ int ui_browser__refresh(struct ui_browser *self) newtScrollbarSet(self->sb, self->index, self->nr_entries - 1); row = self->refresh(self); - SLsmg_set_color(HE_COLORSET_NORMAL); + ui_browser__set_color(self, HE_COLORSET_NORMAL); SLsmg_fill_region(self->y + row, self->x, self->height - row, self->width, ' '); return 0; } -int ui_browser__run(struct ui_browser *self, struct newtExitStruct *es) +int ui_browser__run(struct ui_browser *self) { + struct newtExitStruct es; + if (ui_browser__refresh(self) < 0) return -1; while (1) { off_t offset; - newtFormRun(self->form, es); + newtFormRun(self->form, &es); - if (es->reason != NEWT_EXIT_HOTKEY) + if (es.reason != NEWT_EXIT_HOTKEY) break; - if (is_exit_key(es->u.key)) - return es->u.key; - switch (es->u.key) { + switch (es.u.key) { case NEWT_KEY_DOWN: if (self->index == self->nr_entries - 1) break; @@ -274,12 +283,12 @@ int ui_browser__run(struct ui_browser *self, struct newtExitStruct *es) self->seek(self, -offset, SEEK_END); break; default: - return es->u.key; + return es.u.key; } if (ui_browser__refresh(self) < 0) return -1; } - return 0; + return -1; } unsigned int ui_browser__list_head_refresh(struct ui_browser *self) @@ -294,7 +303,7 @@ unsigned int ui_browser__list_head_refresh(struct ui_browser *self) pos = self->top; list_for_each_from(pos, head) { - SLsmg_gotorc(self->y + row, self->x); + ui_browser__gotorc(self, row, 0); self->write(self, pos, row); if (++row == self->height) break; diff --git a/tools/perf/util/ui/browser.h b/tools/perf/util/ui/browser.h index 0b9f829214f7..0dc7e4da36f5 100644 --- a/tools/perf/util/ui/browser.h +++ b/tools/perf/util/ui/browser.h @@ -25,16 +25,21 @@ struct ui_browser { }; -int ui_browser__percent_color(double percent, bool current); +void ui_browser__set_color(struct ui_browser *self, int color); +void ui_browser__set_percent_color(struct ui_browser *self, + double percent, bool current); bool ui_browser__is_current_entry(struct ui_browser *self, unsigned row); void ui_browser__refresh_dimensions(struct ui_browser *self); void ui_browser__reset_index(struct ui_browser *self); +void ui_browser__gotorc(struct ui_browser *self, int y, int x); +void ui_browser__add_exit_key(struct ui_browser *self, int key); +void ui_browser__add_exit_keys(struct ui_browser *self, int keys[]); int ui_browser__show(struct ui_browser *self, const char *title, const char *helpline, ...); void ui_browser__hide(struct ui_browser *self); int ui_browser__refresh(struct ui_browser *self); -int ui_browser__run(struct ui_browser *self, struct newtExitStruct *es); +int ui_browser__run(struct ui_browser *self); void ui_browser__rb_tree_seek(struct ui_browser *self, off_t offset, int whence); unsigned int ui_browser__rb_tree_refresh(struct ui_browser *self); diff --git a/tools/perf/util/ui/browsers/annotate.c b/tools/perf/util/ui/browsers/annotate.c index a90273e63f4f..82b78f99251b 100644 --- a/tools/perf/util/ui/browsers/annotate.c +++ b/tools/perf/util/ui/browsers/annotate.c @@ -40,14 +40,12 @@ static void annotate_browser__write(struct ui_browser *self, void *entry, int ro if (ol->offset != -1) { struct objdump_line_rb_node *olrb = objdump_line__rb(ol); - int color = ui_browser__percent_color(olrb->percent, current_entry); - SLsmg_set_color(color); + ui_browser__set_percent_color(self, olrb->percent, current_entry); slsmg_printf(" %7.2f ", olrb->percent); if (!current_entry) - SLsmg_set_color(HE_COLORSET_CODE); + ui_browser__set_color(self, HE_COLORSET_CODE); } else { - int color = ui_browser__percent_color(0, current_entry); - SLsmg_set_color(color); + ui_browser__set_percent_color(self, 0, current_entry); slsmg_write_nstring(" ", 9); } @@ -135,32 +133,31 @@ static void annotate_browser__set_top(struct annotate_browser *self, self->curr_hot = nd; } -static int annotate_browser__run(struct annotate_browser *self, - struct newtExitStruct *es) +static int annotate_browser__run(struct annotate_browser *self) { struct rb_node *nd; struct hist_entry *he = self->b.priv; + int key; if (ui_browser__show(&self->b, he->ms.sym->name, - "<- or ESC: exit, TAB/shift+TAB: cycle thru samples") < 0) + "<-, -> or ESC: exit, TAB/shift+TAB: cycle thru samples") < 0) return -1; - - newtFormAddHotKey(self->b.form, NEWT_KEY_LEFT); - newtFormAddHotKey(self->b.form, NEWT_KEY_RIGHT); + /* + * To allow builtin-annotate to cycle thru multiple symbols by + * examining the exit key for this function. + */ + ui_browser__add_exit_key(&self->b, NEWT_KEY_RIGHT); nd = self->curr_hot; if (nd) { - newtFormAddHotKey(self->b.form, NEWT_KEY_TAB); - newtFormAddHotKey(self->b.form, NEWT_KEY_UNTAB); + int tabs[] = { NEWT_KEY_TAB, NEWT_KEY_UNTAB, 0 }; + ui_browser__add_exit_keys(&self->b, tabs); } while (1) { - ui_browser__run(&self->b, es); - - if (es->reason != NEWT_EXIT_HOTKEY) - break; + key = ui_browser__run(&self->b); - switch (es->u.key) { + switch (key) { case NEWT_KEY_TAB: nd = rb_prev(nd); if (nd == NULL) @@ -179,12 +176,11 @@ static int annotate_browser__run(struct annotate_browser *self, } out: ui_browser__hide(&self->b); - return es->u.key; + return key; } int hist_entry__tui_annotate(struct hist_entry *self) { - struct newtExitStruct es; struct objdump_line *pos, *n; struct objdump_line_rb_node *rbpos; LIST_HEAD(head); @@ -232,7 +228,7 @@ int hist_entry__tui_annotate(struct hist_entry *self) annotate_browser__set_top(&browser, browser.curr_hot); browser.b.width += 18; /* Percentage */ - ret = annotate_browser__run(&browser, &es); + ret = annotate_browser__run(&browser); list_for_each_entry_safe(pos, n, &head, node) { list_del(&pos->node); objdump_line__free(pos); diff --git a/tools/perf/util/ui/browsers/hists.c b/tools/perf/util/ui/browsers/hists.c index 6866aa4c41e0..ebda8c3fde9e 100644 --- a/tools/perf/util/ui/browsers/hists.c +++ b/tools/perf/util/ui/browsers/hists.c @@ -58,6 +58,11 @@ static char callchain_list__folded(const struct callchain_list *self) return map_symbol__folded(&self->ms); } +static void map_symbol__set_folding(struct map_symbol *self, bool unfold) +{ + self->unfolded = unfold ? self->has_children : false; +} + static int callchain_node__count_rows_rb_tree(struct callchain_node *self) { int n = 0; @@ -129,16 +134,16 @@ static void callchain_node__init_have_children_rb_tree(struct callchain_node *se for (nd = rb_first(&self->rb_root); nd; nd = rb_next(nd)) { struct callchain_node *child = rb_entry(nd, struct callchain_node, rb_node); struct callchain_list *chain; - int first = true; + bool first = true; list_for_each_entry(chain, &child->val, list) { if (first) { first = false; chain->ms.has_children = chain->list.next != &child->val || - rb_first(&child->rb_root) != NULL; + !RB_EMPTY_ROOT(&child->rb_root); } else chain->ms.has_children = chain->list.next == &child->val && - rb_first(&child->rb_root) != NULL; + !RB_EMPTY_ROOT(&child->rb_root); } callchain_node__init_have_children_rb_tree(child); @@ -150,7 +155,7 @@ static void callchain_node__init_have_children(struct callchain_node *self) struct callchain_list *chain; list_for_each_entry(chain, &self->val, list) - chain->ms.has_children = rb_first(&self->rb_root) != NULL; + chain->ms.has_children = !RB_EMPTY_ROOT(&self->rb_root); callchain_node__init_have_children_rb_tree(self); } @@ -168,6 +173,7 @@ static void callchain__init_have_children(struct rb_root *self) static void hist_entry__init_have_children(struct hist_entry *self) { if (!self->init_have_children) { + self->ms.has_children = !RB_EMPTY_ROOT(&self->sorted_chain); callchain__init_have_children(&self->sorted_chain); self->init_have_children = true; } @@ -195,43 +201,114 @@ static bool hist_browser__toggle_fold(struct hist_browser *self) return false; } -static int hist_browser__run(struct hist_browser *self, const char *title, - struct newtExitStruct *es) +static int callchain_node__set_folding_rb_tree(struct callchain_node *self, bool unfold) +{ + int n = 0; + struct rb_node *nd; + + for (nd = rb_first(&self->rb_root); nd; nd = rb_next(nd)) { + struct callchain_node *child = rb_entry(nd, struct callchain_node, rb_node); + struct callchain_list *chain; + bool has_children = false; + + list_for_each_entry(chain, &child->val, list) { + ++n; + map_symbol__set_folding(&chain->ms, unfold); + has_children = chain->ms.has_children; + } + + if (has_children) + n += callchain_node__set_folding_rb_tree(child, unfold); + } + + return n; +} + +static int callchain_node__set_folding(struct callchain_node *node, bool unfold) +{ + struct callchain_list *chain; + bool has_children = false; + int n = 0; + + list_for_each_entry(chain, &node->val, list) { + ++n; + map_symbol__set_folding(&chain->ms, unfold); + has_children = chain->ms.has_children; + } + + if (has_children) + n += callchain_node__set_folding_rb_tree(node, unfold); + + return n; +} + +static int callchain__set_folding(struct rb_root *chain, bool unfold) +{ + struct rb_node *nd; + int n = 0; + + for (nd = rb_first(chain); nd; nd = rb_next(nd)) { + struct callchain_node *node = rb_entry(nd, struct callchain_node, rb_node); + n += callchain_node__set_folding(node, unfold); + } + + return n; +} + +static void hist_entry__set_folding(struct hist_entry *self, bool unfold) +{ + hist_entry__init_have_children(self); + map_symbol__set_folding(&self->ms, unfold); + + if (self->ms.has_children) { + int n = callchain__set_folding(&self->sorted_chain, unfold); + self->nr_rows = unfold ? n : 0; + } else + self->nr_rows = 0; +} + +static void hists__set_folding(struct hists *self, bool unfold) +{ + struct rb_node *nd; + + self->nr_entries = 0; + + for (nd = rb_first(&self->entries); nd; nd = rb_next(nd)) { + struct hist_entry *he = rb_entry(nd, struct hist_entry, rb_node); + hist_entry__set_folding(he, unfold); + self->nr_entries += 1 + he->nr_rows; + } +} + +static void hist_browser__set_folding(struct hist_browser *self, bool unfold) +{ + hists__set_folding(self->hists, unfold); + self->b.nr_entries = self->hists->nr_entries; + /* Go to the start, we may be way after valid entries after a collapse */ + ui_browser__reset_index(&self->b); +} + +static int hist_browser__run(struct hist_browser *self, const char *title) { - char str[256], unit; - unsigned long nr_events = self->hists->stats.nr_events[PERF_RECORD_SAMPLE]; + int key; + int exit_keys[] = { 'a', '?', 'h', 'C', 'd', 'D', 'E', 't', + NEWT_KEY_ENTER, NEWT_KEY_RIGHT, NEWT_KEY_LEFT, 0, }; self->b.entries = &self->hists->entries; self->b.nr_entries = self->hists->nr_entries; hist_browser__refresh_dimensions(self); - nr_events = convert_unit(nr_events, &unit); - snprintf(str, sizeof(str), "Events: %lu%c ", - nr_events, unit); - newtDrawRootText(0, 0, str); - if (ui_browser__show(&self->b, title, "Press '?' for help on key bindings") < 0) return -1; - newtFormAddHotKey(self->b.form, 'a'); - newtFormAddHotKey(self->b.form, '?'); - newtFormAddHotKey(self->b.form, 'h'); - newtFormAddHotKey(self->b.form, 'd'); - newtFormAddHotKey(self->b.form, 'D'); - newtFormAddHotKey(self->b.form, 't'); - - newtFormAddHotKey(self->b.form, NEWT_KEY_LEFT); - newtFormAddHotKey(self->b.form, NEWT_KEY_RIGHT); - newtFormAddHotKey(self->b.form, NEWT_KEY_ENTER); + ui_browser__add_exit_keys(&self->b, exit_keys); while (1) { - ui_browser__run(&self->b, es); + key = ui_browser__run(&self->b); - if (es->reason != NEWT_EXIT_HOTKEY) - break; - switch (es->u.key) { + switch (key) { case 'D': { /* Debug */ static int seq; struct hist_entry *h = rb_entry(self->b.top, @@ -245,18 +322,26 @@ static int hist_browser__run(struct hist_browser *self, const char *title, self->b.top_idx, h->row_offset, h->nr_rows); } - continue; + break; + case 'C': + /* Collapse the whole world. */ + hist_browser__set_folding(self, false); + break; + case 'E': + /* Expand the whole world. */ + hist_browser__set_folding(self, true); + break; case NEWT_KEY_ENTER: if (hist_browser__toggle_fold(self)) break; /* fall thru */ default: - return 0; + goto out; } } - +out: ui_browser__hide(&self->b); - return 0; + return key; } static char *callchain_list__sym_name(struct callchain_list *self, @@ -306,15 +391,10 @@ static int hist_browser__show_callchain_node_rb_tree(struct hist_browser *self, int color; bool was_first = first; - if (first) { + if (first) first = false; - chain->ms.has_children = chain->list.next != &child->val || - rb_first(&child->rb_root) != NULL; - } else { + else extra_offset = LEVEL_OFFSET_STEP; - chain->ms.has_children = chain->list.next == &child->val && - rb_first(&child->rb_root) != NULL; - } folded_sign = callchain_list__folded(chain); if (*row_offset != 0) { @@ -341,8 +421,8 @@ static int hist_browser__show_callchain_node_rb_tree(struct hist_browser *self, *is_current_entry = true; } - SLsmg_set_color(color); - SLsmg_gotorc(self->b.y + row, self->b.x); + ui_browser__set_color(&self->b, color); + ui_browser__gotorc(&self->b, row, 0); slsmg_write_nstring(" ", offset + extra_offset); slsmg_printf("%c ", folded_sign); slsmg_write_nstring(str, width); @@ -384,12 +464,7 @@ static int hist_browser__show_callchain_node(struct hist_browser *self, list_for_each_entry(chain, &node->val, list) { char ipstr[BITS_PER_LONG / 4 + 1], *s; int color; - /* - * FIXME: This should be moved to somewhere else, - * probably when the callchain is created, so as not to - * traverse it all over again - */ - chain->ms.has_children = rb_first(&node->rb_root) != NULL; + folded_sign = callchain_list__folded(chain); if (*row_offset != 0) { @@ -405,8 +480,8 @@ static int hist_browser__show_callchain_node(struct hist_browser *self, } s = callchain_list__sym_name(chain, ipstr, sizeof(ipstr)); - SLsmg_gotorc(self->b.y + row, self->b.x); - SLsmg_set_color(color); + ui_browser__gotorc(&self->b, row, 0); + ui_browser__set_color(&self->b, color); slsmg_write_nstring(" ", offset); slsmg_printf("%c ", folded_sign); slsmg_write_nstring(s, width - 2); @@ -465,7 +540,7 @@ static int hist_browser__show_entry(struct hist_browser *self, } if (symbol_conf.use_callchain) { - entry->ms.has_children = !RB_EMPTY_ROOT(&entry->sorted_chain); + hist_entry__init_have_children(entry); folded_sign = hist_entry__folded(entry); } @@ -484,8 +559,8 @@ static int hist_browser__show_entry(struct hist_browser *self, color = HE_COLORSET_NORMAL; } - SLsmg_set_color(color); - SLsmg_gotorc(self->b.y + row, self->b.x); + ui_browser__set_color(&self->b, color); + ui_browser__gotorc(&self->b, row, 0); if (symbol_conf.use_callchain) { slsmg_printf("%c ", folded_sign); width -= 2; @@ -687,8 +762,6 @@ static struct hist_browser *hist_browser__new(struct hists *hists) static void hist_browser__delete(struct hist_browser *self) { - newtFormDestroy(self->b.form); - newtPopWindow(); free(self); } @@ -702,21 +775,26 @@ static struct thread *hist_browser__selected_thread(struct hist_browser *self) return self->he_selection->thread; } -static int hist_browser__title(char *bf, size_t size, const char *ev_name, - const struct dso *dso, const struct thread *thread) +static int hists__browser_title(struct hists *self, char *bf, size_t size, + const char *ev_name, const struct dso *dso, + const struct thread *thread) { - int printed = 0; + char unit; + int printed; + unsigned long nr_events = self->stats.nr_events[PERF_RECORD_SAMPLE]; + + nr_events = convert_unit(nr_events, &unit); + printed = snprintf(bf, size, "Events: %lu%c %s", nr_events, unit, ev_name); if (thread) printed += snprintf(bf + printed, size - printed, - "Thread: %s(%d)", - (thread->comm_set ? thread->comm : ""), + ", Thread: %s(%d)", + (thread->comm_set ? thread->comm : ""), thread->pid); if (dso) printed += snprintf(bf + printed, size - printed, - "%sDSO: %s", thread ? " " : "", - dso->short_name); - return printed ?: snprintf(bf, size, "Event: %s", ev_name); + ", DSO: %s", dso->short_name); + return printed; } int hists__browse(struct hists *self, const char *helpline, const char *ev_name) @@ -725,7 +803,6 @@ int hists__browse(struct hists *self, const char *helpline, const char *ev_name) struct pstack *fstack; const struct thread *thread_filter = NULL; const struct dso *dso_filter = NULL; - struct newtExitStruct es; char msg[160]; int key = -1; @@ -738,9 +815,8 @@ int hists__browse(struct hists *self, const char *helpline, const char *ev_name) ui_helpline__push(helpline); - hist_browser__title(msg, sizeof(msg), ev_name, - dso_filter, thread_filter); - + hists__browser_title(self, msg, sizeof(msg), ev_name, + dso_filter, thread_filter); while (1) { const struct thread *thread; const struct dso *dso; @@ -749,70 +825,63 @@ int hists__browse(struct hists *self, const char *helpline, const char *ev_name) annotate = -2, zoom_dso = -2, zoom_thread = -2, browse_map = -2; - if (hist_browser__run(browser, msg, &es)) - break; + key = hist_browser__run(browser, msg); thread = hist_browser__selected_thread(browser); dso = browser->selection->map ? browser->selection->map->dso : NULL; - if (es.reason == NEWT_EXIT_HOTKEY) { - key = es.u.key; - - switch (key) { - case NEWT_KEY_F1: - goto do_help; - case NEWT_KEY_TAB: - case NEWT_KEY_UNTAB: - /* - * Exit the browser, let hists__browser_tree - * go to the next or previous - */ - goto out_free_stack; - default:; - } - - switch (key) { - case 'a': - if (browser->selection->map == NULL || - browser->selection->map->dso->annotate_warned) - continue; - goto do_annotate; - case 'd': - goto zoom_dso; - case 't': - goto zoom_thread; - case 'h': - case '?': -do_help: - ui__help_window("-> Zoom into DSO/Threads & Annotate current symbol\n" - "<- Zoom out\n" - "a Annotate current symbol\n" - "h/?/F1 Show this window\n" - "d Zoom into current DSO\n" - "t Zoom into current Thread\n" - "q/CTRL+C Exit browser"); + switch (key) { + case NEWT_KEY_TAB: + case NEWT_KEY_UNTAB: + /* + * Exit the browser, let hists__browser_tree + * go to the next or previous + */ + goto out_free_stack; + case 'a': + if (browser->selection->map == NULL && + browser->selection->map->dso->annotate_warned) continue; - default:; - } - if (is_exit_key(key)) { - if (key == NEWT_KEY_ESCAPE && - !ui__dialog_yesno("Do you really want to exit?")) - continue; - break; - } - - if (es.u.key == NEWT_KEY_LEFT) { - const void *top; + goto do_annotate; + case 'd': + goto zoom_dso; + case 't': + goto zoom_thread; + case NEWT_KEY_F1: + case 'h': + case '?': + ui__help_window("-> Zoom into DSO/Threads & Annotate current symbol\n" + "<- Zoom out\n" + "a Annotate current symbol\n" + "h/?/F1 Show this window\n" + "C Collapse all callchains\n" + "E Expand all callchains\n" + "d Zoom into current DSO\n" + "t Zoom into current Thread\n" + "q/CTRL+C Exit browser"); + continue; + case NEWT_KEY_ENTER: + case NEWT_KEY_RIGHT: + /* menu */ + break; + case NEWT_KEY_LEFT: { + const void *top; - if (pstack__empty(fstack)) - continue; - top = pstack__pop(fstack); - if (top == &dso_filter) - goto zoom_out_dso; - if (top == &thread_filter) - goto zoom_out_thread; + if (pstack__empty(fstack)) continue; - } + top = pstack__pop(fstack); + if (top == &dso_filter) + goto zoom_out_dso; + if (top == &thread_filter) + goto zoom_out_thread; + continue; + } + case NEWT_KEY_ESCAPE: + if (!ui__dialog_yesno("Do you really want to exit?")) + continue; + /* Fall thru */ + default: + goto out_free_stack; } if (browser->selection->sym != NULL && @@ -885,8 +954,8 @@ zoom_out_dso: pstack__push(fstack, &dso_filter); } hists__filter_by_dso(self, dso_filter); - hist_browser__title(msg, sizeof(msg), ev_name, - dso_filter, thread_filter); + hists__browser_title(self, msg, sizeof(msg), ev_name, + dso_filter, thread_filter); hist_browser__reset(browser); } else if (choice == zoom_thread) { zoom_thread: @@ -903,8 +972,8 @@ zoom_out_thread: pstack__push(fstack, &thread_filter); } hists__filter_by_thread(self, thread_filter); - hist_browser__title(msg, sizeof(msg), ev_name, - dso_filter, thread_filter); + hists__browser_title(self, msg, sizeof(msg), ev_name, + dso_filter, thread_filter); hist_browser__reset(browser); } } @@ -925,10 +994,6 @@ int hists__tui_browse_tree(struct rb_root *self, const char *help) const char *ev_name = __event_name(hists->type, hists->config); key = hists__browse(hists, help, ev_name); - - if (is_exit_key(key)) - break; - switch (key) { case NEWT_KEY_TAB: next = rb_next(nd); @@ -940,7 +1005,7 @@ int hists__tui_browse_tree(struct rb_root *self, const char *help) continue; nd = rb_prev(nd); default: - break; + return key; } } diff --git a/tools/perf/util/ui/browsers/map.c b/tools/perf/util/ui/browsers/map.c index 142b825b42bf..e35437dfa5b4 100644 --- a/tools/perf/util/ui/browsers/map.c +++ b/tools/perf/util/ui/browsers/map.c @@ -1,6 +1,5 @@ #include "../libslang.h" #include <elf.h> -#include <newt.h> #include <sys/ttydefaults.h> #include <ctype.h> #include <string.h> @@ -47,7 +46,6 @@ out_free_form: struct map_browser { struct ui_browser b; struct map *map; - u16 namelen; u8 addrlen; }; @@ -56,14 +54,16 @@ static void map_browser__write(struct ui_browser *self, void *nd, int row) struct symbol *sym = rb_entry(nd, struct symbol, rb_node); struct map_browser *mb = container_of(self, struct map_browser, b); bool current_entry = ui_browser__is_current_entry(self, row); - int color = ui_browser__percent_color(0, current_entry); + int width; - SLsmg_set_color(color); + ui_browser__set_percent_color(self, 0, current_entry); slsmg_printf("%*llx %*llx %c ", mb->addrlen, sym->start, mb->addrlen, sym->end, sym->binding == STB_GLOBAL ? 'g' : sym->binding == STB_LOCAL ? 'l' : 'w'); - slsmg_write_nstring(sym->name, mb->namelen); + width = self->width - ((mb->addrlen * 2) + 4); + if (width > 0) + slsmg_write_nstring(sym->name, width); } /* FIXME uber-kludgy, see comment on cmd_report... */ @@ -98,31 +98,29 @@ static int map_browser__search(struct map_browser *self) return 0; } -static int map_browser__run(struct map_browser *self, struct newtExitStruct *es) +static int map_browser__run(struct map_browser *self) { + int key; + if (ui_browser__show(&self->b, self->map->dso->long_name, "Press <- or ESC to exit, %s / to search", verbose ? "" : "restart with -v to use") < 0) return -1; - newtFormAddHotKey(self->b.form, NEWT_KEY_LEFT); - newtFormAddHotKey(self->b.form, NEWT_KEY_ENTER); if (verbose) - newtFormAddHotKey(self->b.form, '/'); + ui_browser__add_exit_key(&self->b, '/'); while (1) { - ui_browser__run(&self->b, es); + key = ui_browser__run(&self->b); - if (es->reason != NEWT_EXIT_HOTKEY) - break; - if (verbose && es->u.key == '/') + if (verbose && key == '/') map_browser__search(self); else break; } ui_browser__hide(&self->b); - return 0; + return key; } int map__browse(struct map *self) @@ -136,7 +134,6 @@ int map__browse(struct map *self) }, .map = self, }; - struct newtExitStruct es; struct rb_node *nd; char tmp[BITS_PER_LONG / 4]; u64 maxaddr = 0; @@ -144,8 +141,6 @@ int map__browse(struct map *self) for (nd = rb_first(mb.b.entries); nd; nd = rb_next(nd)) { struct symbol *pos = rb_entry(nd, struct symbol, rb_node); - if (mb.namelen < pos->namelen) - mb.namelen = pos->namelen; if (maxaddr < pos->end) maxaddr = pos->end; if (verbose) { @@ -156,6 +151,5 @@ int map__browse(struct map *self) } mb.addrlen = snprintf(tmp, sizeof(tmp), "%llx", maxaddr); - mb.b.width += mb.addrlen * 2 + 4 + mb.namelen; - return map_browser__run(&mb, &es); + return map_browser__run(&mb); } diff --git a/tools/perf/util/ui/util.c b/tools/perf/util/ui/util.c index 04600e26ceea..9706d9d40279 100644 --- a/tools/perf/util/ui/util.c +++ b/tools/perf/util/ui/util.c @@ -11,8 +11,6 @@ #include "helpline.h" #include "util.h" -newtComponent newt_form__new(void); - static void newt_form__set_exit_keys(newtComponent self) { newtFormAddHotKey(self, NEWT_KEY_LEFT); @@ -22,7 +20,7 @@ static void newt_form__set_exit_keys(newtComponent self) newtFormAddHotKey(self, CTRL('c')); } -newtComponent newt_form__new(void) +static newtComponent newt_form__new(void) { newtComponent self = newtForm(NULL, NULL, 0); if (self) diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h index f380fed74359..7562707ddd1c 100644 --- a/tools/perf/util/util.h +++ b/tools/perf/util/util.h @@ -266,19 +266,6 @@ bool strglobmatch(const char *str, const char *pat); bool strlazymatch(const char *str, const char *pat); unsigned long convert_unit(unsigned long value, char *unit); -#ifndef ESC -#define ESC 27 -#endif - -static inline bool is_exit_key(int key) -{ - char up; - if (key == CTRL('c') || key == ESC) - return true; - up = toupper(key); - return up == 'Q'; -} - #define _STR(x) #x #define STR(x) _STR(x) |