diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2020-01-28 18:44:15 +0100 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2020-01-28 18:44:15 +0100 |
commit | c0e809e244804d428bcd976eaf9369f60508ea8a (patch) | |
tree | 99fa85899a3c11d2ebeb6d090f218fda968a0e6a /arch/x86/kernel | |
parent | Merge branch 'locking-core-for-linus' of git://git.kernel.org/pub/scm/linux/k... (diff) | |
parent | Merge branch 'core/kprobes' into perf/core, to pick up fixes (diff) | |
download | linux-c0e809e244804d428bcd976eaf9369f60508ea8a.tar.xz linux-c0e809e244804d428bcd976eaf9369f60508ea8a.zip |
Merge branch 'perf-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull perf updates from Ingo Molnar:
"Kernel side changes:
- Ftrace is one of the last W^X violators (after this only KLP is
left). These patches move it over to the generic text_poke()
interface and thereby get rid of this oddity. This requires a
surprising amount of surgery, by Peter Zijlstra.
- x86/AMD PMUs: add support for 'Large Increment per Cycle Events' to
count certain types of events that have a special, quirky hw ABI
(by Kim Phillips)
- kprobes fixes by Masami Hiramatsu
Lots of tooling updates as well, the following subcommands were
updated: annotate/report/top, c2c, clang, record, report/top TUI,
sched timehist, tests; plus updates were done to the gtk ui, libperf,
headers and the parser"
* 'perf-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (57 commits)
perf/x86/amd: Add support for Large Increment per Cycle Events
perf/x86/amd: Constrain Large Increment per Cycle events
perf/x86/intel/rapl: Add Comet Lake support
tracing: Initialize ret in syscall_enter_define_fields()
perf header: Use last modification time for timestamp
perf c2c: Fix return type for histogram sorting comparision functions
perf beauty sockaddr: Fix augmented syscall format warning
perf/ui/gtk: Fix gtk2 build
perf ui gtk: Add missing zalloc object
perf tools: Use %define api.pure full instead of %pure-parser
libperf: Setup initial evlist::all_cpus value
perf report: Fix no libunwind compiled warning break s390 issue
perf tools: Support --prefix/--prefix-strip
perf report: Clarify in help that --children is default
tools build: Fix test-clang.cpp with Clang 8+
perf clang: Fix build with Clang 9
kprobes: Fix optimize_kprobe()/unoptimize_kprobe() cancellation logic
tools lib: Fix builds when glibc contains strlcpy()
perf report/top: Make 'e' visible in the help and make it toggle showing callchains
perf report/top: Do not offer annotation for symbols without samples
...
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r-- | arch/x86/kernel/alternative.c | 198 | ||||
-rw-r--r-- | arch/x86/kernel/ftrace.c | 688 | ||||
-rw-r--r-- | arch/x86/kernel/jump_label.c | 116 | ||||
-rw-r--r-- | arch/x86/kernel/kprobes/core.c | 20 | ||||
-rw-r--r-- | arch/x86/kernel/kprobes/opt.c | 67 | ||||
-rw-r--r-- | arch/x86/kernel/traps.c | 9 |
6 files changed, 358 insertions, 740 deletions
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 9ec463fe96f2..34360ca301a2 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -936,44 +936,81 @@ static void do_sync_core(void *info) sync_core(); } -static struct bp_patching_desc { +void text_poke_sync(void) +{ + on_each_cpu(do_sync_core, NULL, 1); +} + +struct text_poke_loc { + s32 rel_addr; /* addr := _stext + rel_addr */ + s32 rel32; + u8 opcode; + const u8 text[POKE_MAX_OPCODE_SIZE]; +}; + +struct bp_patching_desc { struct text_poke_loc *vec; int nr_entries; -} bp_patching; + atomic_t refs; +}; + +static struct bp_patching_desc *bp_desc; + +static inline struct bp_patching_desc *try_get_desc(struct bp_patching_desc **descp) +{ + struct bp_patching_desc *desc = READ_ONCE(*descp); /* rcu_dereference */ + + if (!desc || !atomic_inc_not_zero(&desc->refs)) + return NULL; + + return desc; +} + +static inline void put_desc(struct bp_patching_desc *desc) +{ + smp_mb__before_atomic(); + atomic_dec(&desc->refs); +} -static int patch_cmp(const void *key, const void *elt) +static inline void *text_poke_addr(struct text_poke_loc *tp) +{ + return _stext + tp->rel_addr; +} + +static int notrace patch_cmp(const void *key, const void *elt) { struct text_poke_loc *tp = (struct text_poke_loc *) elt; - if (key < tp->addr) + if (key < text_poke_addr(tp)) return -1; - if (key > tp->addr) + if (key > text_poke_addr(tp)) return 1; return 0; } NOKPROBE_SYMBOL(patch_cmp); -int poke_int3_handler(struct pt_regs *regs) +int notrace poke_int3_handler(struct pt_regs *regs) { + struct bp_patching_desc *desc; struct text_poke_loc *tp; + int len, ret = 0; void *ip; + if (user_mode(regs)) + return 0; + /* * Having observed our INT3 instruction, we now must observe - * bp_patching.nr_entries. + * bp_desc: * - * nr_entries != 0 INT3 + * bp_desc = desc INT3 * WMB RMB - * write INT3 if (nr_entries) - * - * Idem for other elements in bp_patching. + * write INT3 if (desc) */ smp_rmb(); - if (likely(!bp_patching.nr_entries)) - return 0; - - if (user_mode(regs)) + desc = try_get_desc(&bp_desc); + if (!desc) return 0; /* @@ -984,19 +1021,20 @@ int poke_int3_handler(struct pt_regs *regs) /* * Skip the binary search if there is a single member in the vector. */ - if (unlikely(bp_patching.nr_entries > 1)) { - tp = bsearch(ip, bp_patching.vec, bp_patching.nr_entries, + if (unlikely(desc->nr_entries > 1)) { + tp = bsearch(ip, desc->vec, desc->nr_entries, sizeof(struct text_poke_loc), patch_cmp); if (!tp) - return 0; + goto out_put; } else { - tp = bp_patching.vec; - if (tp->addr != ip) - return 0; + tp = desc->vec; + if (text_poke_addr(tp) != ip) + goto out_put; } - ip += tp->len; + len = text_opcode_size(tp->opcode); + ip += len; switch (tp->opcode) { case INT3_INSN_OPCODE: @@ -1004,7 +1042,7 @@ int poke_int3_handler(struct pt_regs *regs) * Someone poked an explicit INT3, they'll want to handle it, * do not consume. */ - return 0; + goto out_put; case CALL_INSN_OPCODE: int3_emulate_call(regs, (long)ip + tp->rel32); @@ -1019,10 +1057,18 @@ int poke_int3_handler(struct pt_regs *regs) BUG(); } - return 1; + ret = 1; + +out_put: + put_desc(desc); + return ret; } NOKPROBE_SYMBOL(poke_int3_handler); +#define TP_VEC_MAX (PAGE_SIZE / sizeof(struct text_poke_loc)) +static struct text_poke_loc tp_vec[TP_VEC_MAX]; +static int tp_vec_nr; + /** * text_poke_bp_batch() -- update instructions on live kernel on SMP * @tp: vector of instructions to patch @@ -1044,16 +1090,20 @@ NOKPROBE_SYMBOL(poke_int3_handler); * replacing opcode * - sync cores */ -void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries) +static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries) { + struct bp_patching_desc desc = { + .vec = tp, + .nr_entries = nr_entries, + .refs = ATOMIC_INIT(1), + }; unsigned char int3 = INT3_INSN_OPCODE; unsigned int i; int do_sync; lockdep_assert_held(&text_mutex); - bp_patching.vec = tp; - bp_patching.nr_entries = nr_entries; + smp_store_release(&bp_desc, &desc); /* rcu_assign_pointer */ /* * Corresponding read barrier in int3 notifier for making sure the @@ -1065,18 +1115,20 @@ void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries) * First step: add a int3 trap to the address that will be patched. */ for (i = 0; i < nr_entries; i++) - text_poke(tp[i].addr, &int3, sizeof(int3)); + text_poke(text_poke_addr(&tp[i]), &int3, INT3_INSN_SIZE); - on_each_cpu(do_sync_core, NULL, 1); + text_poke_sync(); /* * Second step: update all but the first byte of the patched range. */ for (do_sync = 0, i = 0; i < nr_entries; i++) { - if (tp[i].len - sizeof(int3) > 0) { - text_poke((char *)tp[i].addr + sizeof(int3), - (const char *)tp[i].text + sizeof(int3), - tp[i].len - sizeof(int3)); + int len = text_opcode_size(tp[i].opcode); + + if (len - INT3_INSN_SIZE > 0) { + text_poke(text_poke_addr(&tp[i]) + INT3_INSN_SIZE, + (const char *)tp[i].text + INT3_INSN_SIZE, + len - INT3_INSN_SIZE); do_sync++; } } @@ -1087,7 +1139,7 @@ void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries) * not necessary and we'd be safe even without it. But * better safe than sorry (plus there's not only Intel). */ - on_each_cpu(do_sync_core, NULL, 1); + text_poke_sync(); } /* @@ -1098,19 +1150,20 @@ void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries) if (tp[i].text[0] == INT3_INSN_OPCODE) continue; - text_poke(tp[i].addr, tp[i].text, sizeof(int3)); + text_poke(text_poke_addr(&tp[i]), tp[i].text, INT3_INSN_SIZE); do_sync++; } if (do_sync) - on_each_cpu(do_sync_core, NULL, 1); + text_poke_sync(); /* - * sync_core() implies an smp_mb() and orders this store against - * the writing of the new instruction. + * Remove and synchronize_rcu(), except we have a very primitive + * refcount based completion. */ - bp_patching.vec = NULL; - bp_patching.nr_entries = 0; + WRITE_ONCE(bp_desc, NULL); /* RCU_INIT_POINTER */ + if (!atomic_dec_and_test(&desc.refs)) + atomic_cond_read_acquire(&desc.refs, !VAL); } void text_poke_loc_init(struct text_poke_loc *tp, void *addr, @@ -1118,11 +1171,7 @@ void text_poke_loc_init(struct text_poke_loc *tp, void *addr, { struct insn insn; - if (!opcode) - opcode = (void *)tp->text; - else - memcpy((void *)tp->text, opcode, len); - + memcpy((void *)tp->text, opcode, len); if (!emulate) emulate = opcode; @@ -1132,8 +1181,7 @@ void text_poke_loc_init(struct text_poke_loc *tp, void *addr, BUG_ON(!insn_complete(&insn)); BUG_ON(len != insn.length); - tp->addr = addr; - tp->len = len; + tp->rel_addr = addr - (void *)_stext; tp->opcode = insn.opcode.bytes[0]; switch (tp->opcode) { @@ -1167,6 +1215,55 @@ void text_poke_loc_init(struct text_poke_loc *tp, void *addr, } } +/* + * We hard rely on the tp_vec being ordered; ensure this is so by flushing + * early if needed. + */ +static bool tp_order_fail(void *addr) +{ + struct text_poke_loc *tp; + + if (!tp_vec_nr) + return false; + + if (!addr) /* force */ + return true; + + tp = &tp_vec[tp_vec_nr - 1]; + if ((unsigned long)text_poke_addr(tp) > (unsigned long)addr) + return true; + + return false; +} + +static void text_poke_flush(void *addr) +{ + if (tp_vec_nr == TP_VEC_MAX || tp_order_fail(addr)) { + text_poke_bp_batch(tp_vec, tp_vec_nr); + tp_vec_nr = 0; + } +} + +void text_poke_finish(void) +{ + text_poke_flush(NULL); +} + +void __ref text_poke_queue(void *addr, const void *opcode, size_t len, const void *emulate) +{ + struct text_poke_loc *tp; + + if (unlikely(system_state == SYSTEM_BOOTING)) { + text_poke_early(addr, opcode, len); + return; + } + + text_poke_flush(addr); + + tp = &tp_vec[tp_vec_nr++]; + text_poke_loc_init(tp, addr, opcode, len, emulate); +} + /** * text_poke_bp() -- update instructions on live kernel on SMP * @addr: address to patch @@ -1178,10 +1275,15 @@ void text_poke_loc_init(struct text_poke_loc *tp, void *addr, * dynamically allocated memory. This function should be used when it is * not possible to allocate memory. */ -void text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate) +void __ref text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate) { struct text_poke_loc tp; + if (unlikely(system_state == SYSTEM_BOOTING)) { + text_poke_early(addr, opcode, len); + return; + } + text_poke_loc_init(&tp, addr, opcode, len, emulate); text_poke_bp_batch(&tp, 1); } diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 2009047bb015..37a0aeaf89e7 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -35,6 +35,8 @@ #ifdef CONFIG_DYNAMIC_FTRACE +static int ftrace_poke_late = 0; + int ftrace_arch_code_modify_prepare(void) __acquires(&text_mutex) { @@ -44,84 +46,37 @@ int ftrace_arch_code_modify_prepare(void) * ftrace has it set to "read/write". */ mutex_lock(&text_mutex); - set_kernel_text_rw(); - set_all_modules_text_rw(); + ftrace_poke_late = 1; return 0; } int ftrace_arch_code_modify_post_process(void) __releases(&text_mutex) { - set_all_modules_text_ro(); - set_kernel_text_ro(); + /* + * ftrace_make_{call,nop}() may be called during + * module load, and we need to finish the text_poke_queue() + * that they do, here. + */ + text_poke_finish(); + ftrace_poke_late = 0; mutex_unlock(&text_mutex); return 0; } -union ftrace_code_union { - char code[MCOUNT_INSN_SIZE]; - struct { - unsigned char op; - int offset; - } __attribute__((packed)); -}; - -static int ftrace_calc_offset(long ip, long addr) -{ - return (int)(addr - ip); -} - -static unsigned char * -ftrace_text_replace(unsigned char op, unsigned long ip, unsigned long addr) +static const char *ftrace_nop_replace(void) { - static union ftrace_code_union calc; - - calc.op = op; - calc.offset = ftrace_calc_offset(ip + MCOUNT_INSN_SIZE, addr); - - return calc.code; -} - -static unsigned char * -ftrace_call_replace(unsigned long ip, unsigned long addr) -{ - return ftrace_text_replace(0xe8, ip, addr); -} - -static inline int -within(unsigned long addr, unsigned long start, unsigned long end) -{ - return addr >= start && addr < end; + return ideal_nops[NOP_ATOMIC5]; } -static unsigned long text_ip_addr(unsigned long ip) +static const char *ftrace_call_replace(unsigned long ip, unsigned long addr) { - /* - * On x86_64, kernel text mappings are mapped read-only, so we use - * the kernel identity mapping instead of the kernel text mapping - * to modify the kernel text. - * - * For 32bit kernels, these mappings are same and we can use - * kernel identity mapping to modify code. - */ - if (within(ip, (unsigned long)_text, (unsigned long)_etext)) - ip = (unsigned long)__va(__pa_symbol(ip)); - - return ip; + return text_gen_insn(CALL_INSN_OPCODE, (void *)ip, (void *)addr); } -static const unsigned char *ftrace_nop_replace(void) +static int ftrace_verify_code(unsigned long ip, const char *old_code) { - return ideal_nops[NOP_ATOMIC5]; -} - -static int -ftrace_modify_code_direct(unsigned long ip, unsigned const char *old_code, - unsigned const char *new_code) -{ - unsigned char replaced[MCOUNT_INSN_SIZE]; - - ftrace_expected = old_code; + char cur_code[MCOUNT_INSN_SIZE]; /* * Note: @@ -130,31 +85,46 @@ ftrace_modify_code_direct(unsigned long ip, unsigned const char *old_code, * Carefully read and modify the code with probe_kernel_*(), and make * sure what we read is what we expected it to be before modifying it. */ - /* read the text we want to modify */ - if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE)) + if (probe_kernel_read(cur_code, (void *)ip, MCOUNT_INSN_SIZE)) { + WARN_ON(1); return -EFAULT; + } /* Make sure it is what we expect it to be */ - if (memcmp(replaced, old_code, MCOUNT_INSN_SIZE) != 0) + if (memcmp(cur_code, old_code, MCOUNT_INSN_SIZE) != 0) { + WARN_ON(1); return -EINVAL; + } - ip = text_ip_addr(ip); - - /* replace the text with the new text */ - if (probe_kernel_write((void *)ip, new_code, MCOUNT_INSN_SIZE)) - return -EPERM; + return 0; +} - sync_core(); +/* + * Marked __ref because it calls text_poke_early() which is .init.text. That is + * ok because that call will happen early, during boot, when .init sections are + * still present. + */ +static int __ref +ftrace_modify_code_direct(unsigned long ip, const char *old_code, + const char *new_code) +{ + int ret = ftrace_verify_code(ip, old_code); + if (ret) + return ret; + /* replace the text with the new text */ + if (ftrace_poke_late) + text_poke_queue((void *)ip, new_code, MCOUNT_INSN_SIZE, NULL); + else + text_poke_early((void *)ip, new_code, MCOUNT_INSN_SIZE); return 0; } -int ftrace_make_nop(struct module *mod, - struct dyn_ftrace *rec, unsigned long addr) +int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, unsigned long addr) { - unsigned const char *new, *old; unsigned long ip = rec->ip; + const char *new, *old; old = ftrace_call_replace(ip, addr); new = ftrace_nop_replace(); @@ -168,19 +138,20 @@ int ftrace_make_nop(struct module *mod, * just modify the code directly. */ if (addr == MCOUNT_ADDR) - return ftrace_modify_code_direct(rec->ip, old, new); + return ftrace_modify_code_direct(ip, old, new); - ftrace_expected = NULL; - - /* Normal cases use add_brk_on_nop */ + /* + * x86 overrides ftrace_replace_code -- this function will never be used + * in this case. + */ WARN_ONCE(1, "invalid use of ftrace_make_nop"); return -EINVAL; } int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) { - unsigned const char *new, *old; unsigned long ip = rec->ip; + const char *new, *old; old = ftrace_nop_replace(); new = ftrace_call_replace(ip, addr); @@ -190,43 +161,6 @@ int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) } /* - * The modifying_ftrace_code is used to tell the breakpoint - * handler to call ftrace_int3_handler(). If it fails to - * call this handler for a breakpoint added by ftrace, then - * the kernel may crash. - * - * As atomic_writes on x86 do not need a barrier, we do not - * need to add smp_mb()s for this to work. It is also considered - * that we can not read the modifying_ftrace_code before - * executing the breakpoint. That would be quite remarkable if - * it could do that. Here's the flow that is required: - * - * CPU-0 CPU-1 - * - * atomic_inc(mfc); - * write int3s - * <trap-int3> // implicit (r)mb - * if (atomic_read(mfc)) - * call ftrace_int3_handler() - * - * Then when we are finished: - * - * atomic_dec(mfc); - * - * If we hit a breakpoint that was not set by ftrace, it does not - * matter if ftrace_int3_handler() is called or not. It will - * simply be ignored. But it is crucial that a ftrace nop/caller - * breakpoint is handled. No other user should ever place a - * breakpoint on an ftrace nop/caller location. It must only - * be done by this code. - */ -atomic_t modifying_ftrace_code __read_mostly; - -static int -ftrace_modify_code(unsigned long ip, unsigned const char *old_code, - unsigned const char *new_code); - -/* * Should never be called: * As it is only called by __ftrace_replace_code() which is called by * ftrace_replace_code() that x86 overrides, and by ftrace_update_code() @@ -238,452 +172,84 @@ int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, unsigned long addr) { WARN_ON(1); - ftrace_expected = NULL; return -EINVAL; } -static unsigned long ftrace_update_func; -static unsigned long ftrace_update_func_call; - -static int update_ftrace_func(unsigned long ip, void *new) -{ - unsigned char old[MCOUNT_INSN_SIZE]; - int ret; - - memcpy(old, (void *)ip, MCOUNT_INSN_SIZE); - - ftrace_update_func = ip; - /* Make sure the breakpoints see the ftrace_update_func update */ - smp_wmb(); - - /* See comment above by declaration of modifying_ftrace_code */ - atomic_inc(&modifying_ftrace_code); - - ret = ftrace_modify_code(ip, old, new); - - atomic_dec(&modifying_ftrace_code); - - return ret; -} - int ftrace_update_ftrace_func(ftrace_func_t func) { - unsigned long ip = (unsigned long)(&ftrace_call); - unsigned char *new; - int ret; - - ftrace_update_func_call = (unsigned long)func; - - new = ftrace_call_replace(ip, (unsigned long)func); - ret = update_ftrace_func(ip, new); - - /* Also update the regs callback function */ - if (!ret) { - ip = (unsigned long)(&ftrace_regs_call); - new = ftrace_call_replace(ip, (unsigned long)func); - ret = update_ftrace_func(ip, new); - } - - return ret; -} - -static nokprobe_inline int is_ftrace_caller(unsigned long ip) -{ - if (ip == ftrace_update_func) - return 1; - - return 0; -} - -/* - * A breakpoint was added to the code address we are about to - * modify, and this is the handle that will just skip over it. - * We are either changing a nop into a trace call, or a trace - * call to a nop. While the change is taking place, we treat - * it just like it was a nop. - */ -int ftrace_int3_handler(struct pt_regs *regs) -{ unsigned long ip; + const char *new; - if (WARN_ON_ONCE(!regs)) - return 0; - - ip = regs->ip - INT3_INSN_SIZE; - - if (ftrace_location(ip)) { - int3_emulate_call(regs, (unsigned long)ftrace_regs_caller); - return 1; - } else if (is_ftrace_caller(ip)) { - if (!ftrace_update_func_call) { - int3_emulate_jmp(regs, ip + CALL_INSN_SIZE); - return 1; - } - int3_emulate_call(regs, ftrace_update_func_call); - return 1; - } - - return 0; -} -NOKPROBE_SYMBOL(ftrace_int3_handler); - -static int ftrace_write(unsigned long ip, const char *val, int size) -{ - ip = text_ip_addr(ip); - - if (probe_kernel_write((void *)ip, val, size)) - return -EPERM; - - return 0; -} - -static int add_break(unsigned long ip, const char *old) -{ - unsigned char replaced[MCOUNT_INSN_SIZE]; - unsigned char brk = BREAKPOINT_INSTRUCTION; - - if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE)) - return -EFAULT; - - ftrace_expected = old; - - /* Make sure it is what we expect it to be */ - if (memcmp(replaced, old, MCOUNT_INSN_SIZE) != 0) - return -EINVAL; - - return ftrace_write(ip, &brk, 1); -} - -static int add_brk_on_call(struct dyn_ftrace *rec, unsigned long addr) -{ - unsigned const char *old; - unsigned long ip = rec->ip; - - old = ftrace_call_replace(ip, addr); - - return add_break(rec->ip, old); -} - - -static int add_brk_on_nop(struct dyn_ftrace *rec) -{ - unsigned const char *old; - - old = ftrace_nop_replace(); - - return add_break(rec->ip, old); -} - -static int add_breakpoints(struct dyn_ftrace *rec, bool enable) -{ - unsigned long ftrace_addr; - int ret; - - ftrace_addr = ftrace_get_addr_curr(rec); - - ret = ftrace_test_record(rec, enable); - - switch (ret) { - case FTRACE_UPDATE_IGNORE: - return 0; - - case FTRACE_UPDATE_MAKE_CALL: - /* converting nop to call */ - return add_brk_on_nop(rec); - - case FTRACE_UPDATE_MODIFY_CALL: - case FTRACE_UPDATE_MAKE_NOP: - /* converting a call to a nop */ - return add_brk_on_call(rec, ftrace_addr); - } - return 0; -} - -/* - * On error, we need to remove breakpoints. This needs to - * be done caefully. If the address does not currently have a - * breakpoint, we know we are done. Otherwise, we look at the - * remaining 4 bytes of the instruction. If it matches a nop - * we replace the breakpoint with the nop. Otherwise we replace - * it with the call instruction. - */ -static int remove_breakpoint(struct dyn_ftrace *rec) -{ - unsigned char ins[MCOUNT_INSN_SIZE]; - unsigned char brk = BREAKPOINT_INSTRUCTION; - const unsigned char *nop; - unsigned long ftrace_addr; - unsigned long ip = rec->ip; - - /* If we fail the read, just give up */ - if (probe_kernel_read(ins, (void *)ip, MCOUNT_INSN_SIZE)) - return -EFAULT; - - /* If this does not have a breakpoint, we are done */ - if (ins[0] != brk) - return 0; - - nop = ftrace_nop_replace(); - - /* - * If the last 4 bytes of the instruction do not match - * a nop, then we assume that this is a call to ftrace_addr. - */ - if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) != 0) { - /* - * For extra paranoidism, we check if the breakpoint is on - * a call that would actually jump to the ftrace_addr. - * If not, don't touch the breakpoint, we make just create - * a disaster. - */ - ftrace_addr = ftrace_get_addr_new(rec); - nop = ftrace_call_replace(ip, ftrace_addr); - - if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) == 0) - goto update; - - /* Check both ftrace_addr and ftrace_old_addr */ - ftrace_addr = ftrace_get_addr_curr(rec); - nop = ftrace_call_replace(ip, ftrace_addr); - - ftrace_expected = nop; - - if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) != 0) - return -EINVAL; - } - - update: - return ftrace_write(ip, nop, 1); -} - -static int add_update_code(unsigned long ip, unsigned const char *new) -{ - /* skip breakpoint */ - ip++; - new++; - return ftrace_write(ip, new, MCOUNT_INSN_SIZE - 1); -} - -static int add_update_call(struct dyn_ftrace *rec, unsigned long addr) -{ - unsigned long ip = rec->ip; - unsigned const char *new; - - new = ftrace_call_replace(ip, addr); - return add_update_code(ip, new); -} - -static int add_update_nop(struct dyn_ftrace *rec) -{ - unsigned long ip = rec->ip; - unsigned const char *new; - - new = ftrace_nop_replace(); - return add_update_code(ip, new); -} - -static int add_update(struct dyn_ftrace *rec, bool enable) -{ - unsigned long ftrace_addr; - int ret; - - ret = ftrace_test_record(rec, enable); - - ftrace_addr = ftrace_get_addr_new(rec); - - switch (ret) { - case FTRACE_UPDATE_IGNORE: - return 0; - - case FTRACE_UPDATE_MODIFY_CALL: - case FTRACE_UPDATE_MAKE_CALL: - /* converting nop to call */ - return add_update_call(rec, ftrace_addr); - - case FTRACE_UPDATE_MAKE_NOP: - /* converting a call to a nop */ - return add_update_nop(rec); - } - - return 0; -} - -static int finish_update_call(struct dyn_ftrace *rec, unsigned long addr) -{ - unsigned long ip = rec->ip; - unsigned const char *new; - - new = ftrace_call_replace(ip, addr); - - return ftrace_write(ip, new, 1); -} - -static int finish_update_nop(struct dyn_ftrace *rec) -{ - unsigned long ip = rec->ip; - unsigned const char *new; - - new = ftrace_nop_replace(); - - return ftrace_write(ip, new, 1); -} - -static int finish_update(struct dyn_ftrace *rec, bool enable) -{ - unsigned long ftrace_addr; - int ret; - - ret = ftrace_update_record(rec, enable); - - ftrace_addr = ftrace_get_addr_new(rec); - - switch (ret) { - case FTRACE_UPDATE_IGNORE: - return 0; - - case FTRACE_UPDATE_MODIFY_CALL: - case FTRACE_UPDATE_MAKE_CALL: - /* converting nop to call */ - return finish_update_call(rec, ftrace_addr); + ip = (unsigned long)(&ftrace_call); + new = ftrace_call_replace(ip, (unsigned long)func); + text_poke_bp((void *)ip, new, MCOUNT_INSN_SIZE, NULL); - case FTRACE_UPDATE_MAKE_NOP: - /* converting a call to a nop */ - return finish_update_nop(rec); - } + ip = (unsigned long)(&ftrace_regs_call); + new = ftrace_call_replace(ip, (unsigned long)func); + text_poke_bp((void *)ip, new, MCOUNT_INSN_SIZE, NULL); return 0; } -static void do_sync_core(void *data) -{ - sync_core(); -} - -static void run_sync(void) -{ - int enable_irqs; - - /* No need to sync if there's only one CPU */ - if (num_online_cpus() == 1) - return; - - enable_irqs = irqs_disabled(); - - /* We may be called with interrupts disabled (on bootup). */ - if (enable_irqs) - local_irq_enable(); - on_each_cpu(do_sync_core, NULL, 1); - if (enable_irqs) - local_irq_disable(); -} - void ftrace_replace_code(int enable) { struct ftrace_rec_iter *iter; struct dyn_ftrace *rec; - const char *report = "adding breakpoints"; - int count = 0; + const char *new, *old; int ret; for_ftrace_rec_iter(iter) { rec = ftrace_rec_iter_record(iter); - ret = add_breakpoints(rec, enable); - if (ret) - goto remove_breakpoints; - count++; - } - - run_sync(); + switch (ftrace_test_record(rec, enable)) { + case FTRACE_UPDATE_IGNORE: + default: + continue; - report = "updating code"; - count = 0; + case FTRACE_UPDATE_MAKE_CALL: + old = ftrace_nop_replace(); + break; - for_ftrace_rec_iter(iter) { - rec = ftrace_rec_iter_record(iter); + case FTRACE_UPDATE_MODIFY_CALL: + case FTRACE_UPDATE_MAKE_NOP: + old = ftrace_call_replace(rec->ip, ftrace_get_addr_curr(rec)); + break; + } - ret = add_update(rec, enable); - if (ret) - goto remove_breakpoints; - count++; + ret = ftrace_verify_code(rec->ip, old); + if (ret) { + ftrace_bug(ret, rec); + return; + } } - run_sync(); - - report = "removing breakpoints"; - count = 0; - for_ftrace_rec_iter(iter) { rec = ftrace_rec_iter_record(iter); - ret = finish_update(rec, enable); - if (ret) - goto remove_breakpoints; - count++; - } + switch (ftrace_test_record(rec, enable)) { + case FTRACE_UPDATE_IGNORE: + default: + continue; - run_sync(); + case FTRACE_UPDATE_MAKE_CALL: + case FTRACE_UPDATE_MODIFY_CALL: + new = ftrace_call_replace(rec->ip, ftrace_get_addr_new(rec)); + break; - return; + case FTRACE_UPDATE_MAKE_NOP: + new = ftrace_nop_replace(); + break; + } - remove_breakpoints: - pr_warn("Failed on %s (%d):\n", report, count); - ftrace_bug(ret, rec); - for_ftrace_rec_iter(iter) { - rec = ftrace_rec_iter_record(iter); - /* - * Breakpoints are handled only when this function is in - * progress. The system could not work with them. - */ - if (remove_breakpoint(rec)) - BUG(); + text_poke_queue((void *)rec->ip, new, MCOUNT_INSN_SIZE, NULL); + ftrace_update_record(rec, enable); } - run_sync(); -} - -static int -ftrace_modify_code(unsigned long ip, unsigned const char *old_code, - unsigned const char *new_code) -{ - int ret; - - ret = add_break(ip, old_code); - if (ret) - goto out; - - run_sync(); - - ret = add_update_code(ip, new_code); - if (ret) - goto fail_update; - - run_sync(); - - ret = ftrace_write(ip, new_code, 1); - /* - * The breakpoint is handled only when this function is in progress. - * The system could not work if we could not remove it. - */ - BUG_ON(ret); - out: - run_sync(); - return ret; - - fail_update: - /* Also here the system could not work with the breakpoint */ - if (ftrace_write(ip, old_code, 1)) - BUG(); - goto out; + text_poke_finish(); } void arch_ftrace_update_code(int command) { - /* See comment above by declaration of modifying_ftrace_code */ - atomic_inc(&modifying_ftrace_code); - ftrace_modify_all_code(command); - - atomic_dec(&modifying_ftrace_code); } int __init ftrace_dyn_arch_init(void) @@ -748,6 +314,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) unsigned long start_offset; unsigned long end_offset; unsigned long op_offset; + unsigned long call_offset; unsigned long offset; unsigned long npages; unsigned long size; @@ -764,10 +331,12 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) start_offset = (unsigned long)ftrace_regs_caller; end_offset = (unsigned long)ftrace_regs_caller_end; op_offset = (unsigned long)ftrace_regs_caller_op_ptr; + call_offset = (unsigned long)ftrace_regs_call; } else { start_offset = (unsigned long)ftrace_caller; end_offset = (unsigned long)ftrace_epilogue; op_offset = (unsigned long)ftrace_caller_op_ptr; + call_offset = (unsigned long)ftrace_call; } size = end_offset - start_offset; @@ -824,16 +393,21 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) /* put in the new offset to the ftrace_ops */ memcpy(trampoline + op_offset, &op_ptr, OP_REF_SIZE); + /* put in the call to the function */ + mutex_lock(&text_mutex); + call_offset -= start_offset; + memcpy(trampoline + call_offset, + text_gen_insn(CALL_INSN_OPCODE, + trampoline + call_offset, + ftrace_ops_get_func(ops)), CALL_INSN_SIZE); + mutex_unlock(&text_mutex); + /* ALLOC_TRAMP flags lets us know we created it */ ops->flags |= FTRACE_OPS_FL_ALLOC_TRAMP; set_vm_flush_reset_perms(trampoline); - /* - * Module allocation needs to be completed by making the page - * executable. The page is still writable, which is a security hazard, - * but anyhow ftrace breaks W^X completely. - */ + set_memory_ro((unsigned long)trampoline, npages); set_memory_x((unsigned long)trampoline, npages); return (unsigned long)trampoline; fail: @@ -860,62 +434,54 @@ static unsigned long calc_trampoline_call_offset(bool save_regs) void arch_ftrace_update_trampoline(struct ftrace_ops *ops) { ftrace_func_t func; - unsigned char *new; unsigned long offset; unsigned long ip; unsigned int size; - int ret, npages; + const char *new; - if (ops->trampoline) { - /* - * The ftrace_ops caller may set up its own trampoline. - * In such a case, this code must not modify it. - */ - if (!(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP)) - return; - npages = PAGE_ALIGN(ops->trampoline_size) >> PAGE_SHIFT; - set_memory_rw(ops->trampoline, npages); - } else { + if (!ops->trampoline) { ops->trampoline = create_trampoline(ops, &size); if (!ops->trampoline) return; ops->trampoline_size = size; - npages = PAGE_ALIGN(size) >> PAGE_SHIFT; + return; } + /* + * The ftrace_ops caller may set up its own trampoline. + * In such a case, this code must not modify it. + */ + if (!(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP)) + return; + offset = calc_trampoline_call_offset(ops->flags & FTRACE_OPS_FL_SAVE_REGS); ip = ops->trampoline + offset; - func = ftrace_ops_get_func(ops); - ftrace_update_func_call = (unsigned long)func; - + mutex_lock(&text_mutex); /* Do a safe modify in case the trampoline is executing */ new = ftrace_call_replace(ip, (unsigned long)func); - ret = update_ftrace_func(ip, new); - set_memory_ro(ops->trampoline, npages); - - /* The update should never fail */ - WARN_ON(ret); + text_poke_bp((void *)ip, new, MCOUNT_INSN_SIZE, NULL); + mutex_unlock(&text_mutex); } /* Return the address of the function the trampoline calls */ static void *addr_from_call(void *ptr) { - union ftrace_code_union calc; + union text_poke_insn call; int ret; - ret = probe_kernel_read(&calc, ptr, MCOUNT_INSN_SIZE); + ret = probe_kernel_read(&call, ptr, CALL_INSN_SIZE); if (WARN_ON_ONCE(ret < 0)) return NULL; /* Make sure this is a call */ - if (WARN_ON_ONCE(calc.op != 0xe8)) { - pr_warn("Expected e8, got %x\n", calc.op); + if (WARN_ON_ONCE(call.opcode != CALL_INSN_OPCODE)) { + pr_warn("Expected E8, got %x\n", call.opcode); return NULL; } - return ptr + MCOUNT_INSN_SIZE + calc.offset; + return ptr + CALL_INSN_SIZE + call.disp; } void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent, @@ -982,19 +548,18 @@ void arch_ftrace_trampoline_free(struct ftrace_ops *ops) #ifdef CONFIG_DYNAMIC_FTRACE extern void ftrace_graph_call(void); -static unsigned char *ftrace_jmp_replace(unsigned long ip, unsigned long addr) +static const char *ftrace_jmp_replace(unsigned long ip, unsigned long addr) { - return ftrace_text_replace(0xe9, ip, addr); + return text_gen_insn(JMP32_INSN_OPCODE, (void *)ip, (void *)addr); } static int ftrace_mod_jmp(unsigned long ip, void *func) { - unsigned char *new; + const char *new; - ftrace_update_func_call = 0UL; new = ftrace_jmp_replace(ip, (unsigned long)func); - - return update_ftrace_func(ip, new); + text_poke_bp((void *)ip, new, MCOUNT_INSN_SIZE, NULL); + return 0; } int ftrace_enable_ftrace_graph_caller(void) @@ -1020,10 +585,9 @@ int ftrace_disable_ftrace_graph_caller(void) void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent, unsigned long frame_pointer) { + unsigned long return_hooker = (unsigned long)&return_to_handler; unsigned long old; int faulted; - unsigned long return_hooker = (unsigned long) - &return_to_handler; /* * When resuming from suspend-to-ram, this function can be indirectly diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c index c1a8b9e71408..9c4498ea0b3c 100644 --- a/arch/x86/kernel/jump_label.c +++ b/arch/x86/kernel/jump_label.c @@ -16,15 +16,7 @@ #include <asm/alternative.h> #include <asm/text-patching.h> -union jump_code_union { - char code[JUMP_LABEL_NOP_SIZE]; - struct { - char jump; - int offset; - } __attribute__((packed)); -}; - -static void bug_at(unsigned char *ip, int line) +static void bug_at(const void *ip, int line) { /* * The location is not an op that we were expecting. @@ -35,42 +27,42 @@ static void bug_at(unsigned char *ip, int line) BUG(); } -static void __jump_label_set_jump_code(struct jump_entry *entry, - enum jump_label_type type, - union jump_code_union *code, - int init) +static const void * +__jump_label_set_jump_code(struct jump_entry *entry, enum jump_label_type type, int init) { const unsigned char default_nop[] = { STATIC_KEY_INIT_NOP }; const unsigned char *ideal_nop = ideal_nops[NOP_ATOMIC5]; - const void *expect; + const void *expect, *code; + const void *addr, *dest; int line; - code->jump = 0xe9; - code->offset = jump_entry_target(entry) - - (jump_entry_code(entry) + JUMP_LABEL_NOP_SIZE); + addr = (void *)jump_entry_code(entry); + dest = (void *)jump_entry_target(entry); + + code = text_gen_insn(JMP32_INSN_OPCODE, addr, dest); if (init) { expect = default_nop; line = __LINE__; } else if (type == JUMP_LABEL_JMP) { expect = ideal_nop; line = __LINE__; } else { - expect = code->code; line = __LINE__; + expect = code; line = __LINE__; } - if (memcmp((void *)jump_entry_code(entry), expect, JUMP_LABEL_NOP_SIZE)) - bug_at((void *)jump_entry_code(entry), line); + if (memcmp(addr, expect, JUMP_LABEL_NOP_SIZE)) + bug_at(addr, line); if (type == JUMP_LABEL_NOP) - memcpy(code, ideal_nop, JUMP_LABEL_NOP_SIZE); + code = ideal_nop; + + return code; } -static void __ref __jump_label_transform(struct jump_entry *entry, - enum jump_label_type type, - int init) +static void inline __jump_label_transform(struct jump_entry *entry, + enum jump_label_type type, + int init) { - union jump_code_union code; - - __jump_label_set_jump_code(entry, type, &code, init); + const void *opcode = __jump_label_set_jump_code(entry, type, init); /* * As long as only a single processor is running and the code is still @@ -84,31 +76,33 @@ static void __ref __jump_label_transform(struct jump_entry *entry, * always nop being the 'currently valid' instruction */ if (init || system_state == SYSTEM_BOOTING) { - text_poke_early((void *)jump_entry_code(entry), &code, + text_poke_early((void *)jump_entry_code(entry), opcode, JUMP_LABEL_NOP_SIZE); return; } - text_poke_bp((void *)jump_entry_code(entry), &code, JUMP_LABEL_NOP_SIZE, NULL); + text_poke_bp((void *)jump_entry_code(entry), opcode, JUMP_LABEL_NOP_SIZE, NULL); } -void arch_jump_label_transform(struct jump_entry *entry, - enum jump_label_type type) +static void __ref jump_label_transform(struct jump_entry *entry, + enum jump_label_type type, + int init) { mutex_lock(&text_mutex); - __jump_label_transform(entry, type, 0); + __jump_label_transform(entry, type, init); mutex_unlock(&text_mutex); } -#define TP_VEC_MAX (PAGE_SIZE / sizeof(struct text_poke_loc)) -static struct text_poke_loc tp_vec[TP_VEC_MAX]; -static int tp_vec_nr; +void arch_jump_label_transform(struct jump_entry *entry, + enum jump_label_type type) +{ + jump_label_transform(entry, type, 0); +} bool arch_jump_label_transform_queue(struct jump_entry *entry, enum jump_label_type type) { - struct text_poke_loc *tp; - void *entry_code; + const void *opcode; if (system_state == SYSTEM_BOOTING) { /* @@ -118,53 +112,19 @@ bool arch_jump_label_transform_queue(struct jump_entry *entry, return true; } - /* - * No more space in the vector, tell upper layer to apply - * the queue before continuing. - */ - if (tp_vec_nr == TP_VEC_MAX) - return false; - - tp = &tp_vec[tp_vec_nr]; - - entry_code = (void *)jump_entry_code(entry); - - /* - * The INT3 handler will do a bsearch in the queue, so we need entries - * to be sorted. We can survive an unsorted list by rejecting the entry, - * forcing the generic jump_label code to apply the queue. Warning once, - * to raise the attention to the case of an unsorted entry that is - * better not happen, because, in the worst case we will perform in the - * same way as we do without batching - with some more overhead. - */ - if (tp_vec_nr > 0) { - int prev = tp_vec_nr - 1; - struct text_poke_loc *prev_tp = &tp_vec[prev]; - - if (WARN_ON_ONCE(prev_tp->addr > entry_code)) - return false; - } - - __jump_label_set_jump_code(entry, type, - (union jump_code_union *)&tp->text, 0); - - text_poke_loc_init(tp, entry_code, NULL, JUMP_LABEL_NOP_SIZE, NULL); - - tp_vec_nr++; - + mutex_lock(&text_mutex); + opcode = __jump_label_set_jump_code(entry, type, 0); + text_poke_queue((void *)jump_entry_code(entry), + opcode, JUMP_LABEL_NOP_SIZE, NULL); + mutex_unlock(&text_mutex); return true; } void arch_jump_label_transform_apply(void) { - if (!tp_vec_nr) - return; - mutex_lock(&text_mutex); - text_poke_bp_batch(tp_vec, tp_vec_nr); + text_poke_finish(); mutex_unlock(&text_mutex); - - tp_vec_nr = 0; } static enum { @@ -193,5 +153,5 @@ __init_or_module void arch_jump_label_transform_static(struct jump_entry *entry, jlstate = JL_STATE_NO_UPDATE; } if (jlstate == JL_STATE_UPDATE) - __jump_label_transform(entry, type, 1); + jump_label_transform(entry, type, 1); } diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index a0c223ab7264..4d7022a740ab 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -120,14 +120,14 @@ __synthesize_relative_insn(void *dest, void *from, void *to, u8 op) /* Insert a jump instruction at address 'from', which jumps to address 'to'.*/ void synthesize_reljump(void *dest, void *from, void *to) { - __synthesize_relative_insn(dest, from, to, RELATIVEJUMP_OPCODE); + __synthesize_relative_insn(dest, from, to, JMP32_INSN_OPCODE); } NOKPROBE_SYMBOL(synthesize_reljump); /* Insert a call instruction at address 'from', which calls address 'to'.*/ void synthesize_relcall(void *dest, void *from, void *to) { - __synthesize_relative_insn(dest, from, to, RELATIVECALL_OPCODE); + __synthesize_relative_insn(dest, from, to, CALL_INSN_OPCODE); } NOKPROBE_SYMBOL(synthesize_relcall); @@ -302,7 +302,7 @@ static int can_probe(unsigned long paddr) * Another debugging subsystem might insert this breakpoint. * In that case, we can't recover it. */ - if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) + if (insn.opcode.bytes[0] == INT3_INSN_OPCODE) return 0; addr += insn.length; } @@ -357,7 +357,7 @@ int __copy_instruction(u8 *dest, u8 *src, u8 *real, struct insn *insn) return 0; /* Another subsystem puts a breakpoint, failed to recover */ - if (insn->opcode.bytes[0] == BREAKPOINT_INSTRUCTION) + if (insn->opcode.bytes[0] == INT3_INSN_OPCODE) return 0; /* We should not singlestep on the exception masking instructions */ @@ -401,14 +401,14 @@ static int prepare_boost(kprobe_opcode_t *buf, struct kprobe *p, int len = insn->length; if (can_boost(insn, p->addr) && - MAX_INSN_SIZE - len >= RELATIVEJUMP_SIZE) { + MAX_INSN_SIZE - len >= JMP32_INSN_SIZE) { /* * These instructions can be executed directly if it * jumps back to correct address. */ synthesize_reljump(buf + len, p->ainsn.insn + len, p->addr + insn->length); - len += RELATIVEJUMP_SIZE; + len += JMP32_INSN_SIZE; p->ainsn.boostable = true; } else { p->ainsn.boostable = false; @@ -502,12 +502,14 @@ int arch_prepare_kprobe(struct kprobe *p) void arch_arm_kprobe(struct kprobe *p) { - text_poke(p->addr, ((unsigned char []){BREAKPOINT_INSTRUCTION}), 1); + text_poke(p->addr, ((unsigned char []){INT3_INSN_OPCODE}), 1); + text_poke_sync(); } void arch_disarm_kprobe(struct kprobe *p) { text_poke(p->addr, &p->opcode, 1); + text_poke_sync(); } void arch_remove_kprobe(struct kprobe *p) @@ -610,7 +612,7 @@ static void setup_singlestep(struct kprobe *p, struct pt_regs *regs, regs->flags |= X86_EFLAGS_TF; regs->flags &= ~X86_EFLAGS_IF; /* single step inline if the instruction is an int3 */ - if (p->opcode == BREAKPOINT_INSTRUCTION) + if (p->opcode == INT3_INSN_OPCODE) regs->ip = (unsigned long)p->addr; else regs->ip = (unsigned long)p->ainsn.insn; @@ -696,7 +698,7 @@ int kprobe_int3_handler(struct pt_regs *regs) reset_current_kprobe(); return 1; } - } else if (*addr != BREAKPOINT_INSTRUCTION) { + } else if (*addr != INT3_INSN_OPCODE) { /* * The breakpoint instruction was removed right * after we hit it. Another cpu has removed diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 8900329c28a7..3f45b5c43a71 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -38,7 +38,7 @@ unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr) long offs; int i; - for (i = 0; i < RELATIVEJUMP_SIZE; i++) { + for (i = 0; i < JMP32_INSN_SIZE; i++) { kp = get_kprobe((void *)addr - i); /* This function only handles jump-optimized kprobe */ if (kp && kprobe_optimized(kp)) { @@ -62,10 +62,10 @@ found: if (addr == (unsigned long)kp->addr) { buf[0] = kp->opcode; - memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE); + memcpy(buf + 1, op->optinsn.copied_insn, DISP32_SIZE); } else { offs = addr - (unsigned long)kp->addr - 1; - memcpy(buf, op->optinsn.copied_insn + offs, RELATIVE_ADDR_SIZE - offs); + memcpy(buf, op->optinsn.copied_insn + offs, DISP32_SIZE - offs); } return (unsigned long)buf; @@ -141,8 +141,6 @@ STACK_FRAME_NON_STANDARD(optprobe_template_func); #define TMPL_END_IDX \ ((long)optprobe_template_end - (long)optprobe_template_entry) -#define INT3_SIZE sizeof(kprobe_opcode_t) - /* Optimized kprobe call back function: called from optinsn */ static void optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs) @@ -162,7 +160,7 @@ optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs) regs->cs |= get_kernel_rpl(); regs->gs = 0; #endif - regs->ip = (unsigned long)op->kp.addr + INT3_SIZE; + regs->ip = (unsigned long)op->kp.addr + INT3_INSN_SIZE; regs->orig_ax = ~0UL; __this_cpu_write(current_kprobe, &op->kp); @@ -179,7 +177,7 @@ static int copy_optimized_instructions(u8 *dest, u8 *src, u8 *real) struct insn insn; int len = 0, ret; - while (len < RELATIVEJUMP_SIZE) { + while (len < JMP32_INSN_SIZE) { ret = __copy_instruction(dest + len, src + len, real + len, &insn); if (!ret || !can_boost(&insn, src + len)) return -EINVAL; @@ -271,7 +269,7 @@ static int can_optimize(unsigned long paddr) return 0; /* Check there is enough space for a relative jump. */ - if (size - offset < RELATIVEJUMP_SIZE) + if (size - offset < JMP32_INSN_SIZE) return 0; /* Decode instructions */ @@ -290,15 +288,15 @@ static int can_optimize(unsigned long paddr) kernel_insn_init(&insn, (void *)recovered_insn, MAX_INSN_SIZE); insn_get_length(&insn); /* Another subsystem puts a breakpoint */ - if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) + if (insn.opcode.bytes[0] == INT3_INSN_OPCODE) return 0; /* Recover address */ insn.kaddr = (void *)addr; insn.next_byte = (void *)(addr + insn.length); /* Check any instructions don't jump into target */ if (insn_is_indirect_jump(&insn) || - insn_jump_into_range(&insn, paddr + INT3_SIZE, - RELATIVE_ADDR_SIZE)) + insn_jump_into_range(&insn, paddr + INT3_INSN_SIZE, + DISP32_SIZE)) return 0; addr += insn.length; } @@ -374,7 +372,7 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, * Verify if the address gap is in 2GB range, because this uses * a relative jump. */ - rel = (long)slot - (long)op->kp.addr + RELATIVEJUMP_SIZE; + rel = (long)slot - (long)op->kp.addr + JMP32_INSN_SIZE; if (abs(rel) > 0x7fffffff) { ret = -ERANGE; goto err; @@ -401,7 +399,7 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, /* Set returning jmp instruction at the tail of out-of-line buffer */ synthesize_reljump(buf + len, slot + len, (u8 *)op->kp.addr + op->optinsn.size); - len += RELATIVEJUMP_SIZE; + len += JMP32_INSN_SIZE; /* We have to use text_poke() for instruction buffer because it is RO */ text_poke(slot, buf, len); @@ -416,49 +414,50 @@ err: } /* - * Replace breakpoints (int3) with relative jumps. + * Replace breakpoints (INT3) with relative jumps (JMP.d32). * Caller must call with locking kprobe_mutex and text_mutex. + * + * The caller will have installed a regular kprobe and after that issued + * syncrhonize_rcu_tasks(), this ensures that the instruction(s) that live in + * the 4 bytes after the INT3 are unused and can now be overwritten. */ void arch_optimize_kprobes(struct list_head *oplist) { struct optimized_kprobe *op, *tmp; - u8 insn_buff[RELATIVEJUMP_SIZE]; + u8 insn_buff[JMP32_INSN_SIZE]; list_for_each_entry_safe(op, tmp, oplist, list) { s32 rel = (s32)((long)op->optinsn.insn - - ((long)op->kp.addr + RELATIVEJUMP_SIZE)); + ((long)op->kp.addr + JMP32_INSN_SIZE)); WARN_ON(kprobe_disabled(&op->kp)); /* Backup instructions which will be replaced by jump address */ - memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE, - RELATIVE_ADDR_SIZE); + memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_INSN_SIZE, + DISP32_SIZE); - insn_buff[0] = RELATIVEJUMP_OPCODE; + insn_buff[0] = JMP32_INSN_OPCODE; *(s32 *)(&insn_buff[1]) = rel; - text_poke_bp(op->kp.addr, insn_buff, RELATIVEJUMP_SIZE, NULL); + text_poke_bp(op->kp.addr, insn_buff, JMP32_INSN_SIZE, NULL); list_del_init(&op->list); } } -/* Replace a relative jump with a breakpoint (int3). */ +/* + * Replace a relative jump (JMP.d32) with a breakpoint (INT3). + * + * After that, we can restore the 4 bytes after the INT3 to undo what + * arch_optimize_kprobes() scribbled. This is safe since those bytes will be + * unused once the INT3 lands. + */ void arch_unoptimize_kprobe(struct optimized_kprobe *op) { - u8 insn_buff[RELATIVEJUMP_SIZE]; - u8 emulate_buff[RELATIVEJUMP_SIZE]; - - /* Set int3 to first byte for kprobes */ - insn_buff[0] = BREAKPOINT_INSTRUCTION; - memcpy(insn_buff + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE); - - emulate_buff[0] = RELATIVEJUMP_OPCODE; - *(s32 *)(&emulate_buff[1]) = (s32)((long)op->optinsn.insn - - ((long)op->kp.addr + RELATIVEJUMP_SIZE)); - - text_poke_bp(op->kp.addr, insn_buff, RELATIVEJUMP_SIZE, - emulate_buff); + arch_arm_kprobe(&op->kp); + text_poke(op->kp.addr + INT3_INSN_SIZE, + op->optinsn.copied_insn, DISP32_SIZE); + text_poke_sync(); } /* diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 05da6b5b167b..f19de6f45d48 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -572,15 +572,6 @@ NOKPROBE_SYMBOL(do_general_protection); dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code) { -#ifdef CONFIG_DYNAMIC_FTRACE - /* - * ftrace must be first, everything else may cause a recursive crash. - * See note by declaration of modifying_ftrace_code in ftrace.c - */ - if (unlikely(atomic_read(&modifying_ftrace_code)) && - ftrace_int3_handler(regs)) - return; -#endif if (poke_int3_handler(regs)) return; |