Diffstat (limited to 'kernel')
 kernel/bpf/hashtab.c            | 84
 kernel/bpf/verifier.c           |  7
 kernel/events/core.c            | 95
 kernel/events/uprobes.c         |  5
 kernel/power/snapshot.c         | 10
 kernel/printk/braille.c         |  4
 kernel/sched/cputime.c          | 33
 kernel/sysctl.c                 | 45
 kernel/time/timekeeping.c       |  5
 kernel/time/timekeeping_debug.c |  9
 kernel/trace/blktrace.c         |  2
11 files changed, 235 insertions, 64 deletions
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index fff3650d52fc..570eeca7bdfa 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -26,11 +26,18 @@ struct bpf_htab {
 	struct bucket *buckets;
 	void *elems;
 	struct pcpu_freelist freelist;
+	void __percpu *extra_elems;
 	atomic_t count;	/* number of elements in this hashtable */
 	u32 n_buckets;	/* number of hash buckets */
 	u32 elem_size;	/* size of each element in bytes */
 };

+enum extra_elem_state {
+	HTAB_NOT_AN_EXTRA_ELEM = 0,
+	HTAB_EXTRA_ELEM_FREE,
+	HTAB_EXTRA_ELEM_USED
+};
+
 /* each htab element is struct htab_elem + key + value */
 struct htab_elem {
 	union {
@@ -38,7 +45,10 @@ struct htab_elem {
 		struct bpf_htab *htab;
 		struct pcpu_freelist_node fnode;
 	};
-	struct rcu_head rcu;
+	union {
+		struct rcu_head rcu;
+		enum extra_elem_state state;
+	};
 	u32 hash;
 	char key[0] __aligned(8);
 };
@@ -113,6 +123,23 @@ free_elems:
 	return err;
 }

+static int alloc_extra_elems(struct bpf_htab *htab)
+{
+	void __percpu *pptr;
+	int cpu;
+
+	pptr = __alloc_percpu_gfp(htab->elem_size, 8, GFP_USER | __GFP_NOWARN);
+	if (!pptr)
+		return -ENOMEM;
+
+	for_each_possible_cpu(cpu) {
+		((struct htab_elem *)per_cpu_ptr(pptr, cpu))->state =
+			HTAB_EXTRA_ELEM_FREE;
+	}
+	htab->extra_elems = pptr;
+	return 0;
+}
+
 /* Called from syscall */
 static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 {
@@ -185,6 +212,8 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 	if (percpu)
 		cost += (u64) round_up(htab->map.value_size, 8) *
 			num_possible_cpus() * htab->map.max_entries;
+	else
+		cost += (u64) htab->elem_size * num_possible_cpus();

 	if (cost >= U32_MAX - PAGE_SIZE)
 		/* make sure page count doesn't overflow */
@@ -212,14 +241,22 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 		raw_spin_lock_init(&htab->buckets[i].lock);
 	}

+	if (!percpu) {
+		err = alloc_extra_elems(htab);
+		if (err)
+			goto free_buckets;
+	}
+
 	if (!(attr->map_flags & BPF_F_NO_PREALLOC)) {
 		err = prealloc_elems_and_freelist(htab);
 		if (err)
-			goto free_buckets;
+			goto free_extra_elems;
 	}

 	return &htab->map;

+free_extra_elems:
+	free_percpu(htab->extra_elems);
 free_buckets:
 	kvfree(htab->buckets);
 free_htab:
@@ -349,7 +386,6 @@ static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
 	if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
 		free_percpu(htab_elem_get_ptr(l, htab->map.key_size));
 	kfree(l);
-
 }

 static void htab_elem_free_rcu(struct rcu_head *head)
@@ -370,6 +406,11 @@ static void htab_elem_free_rcu(struct rcu_head *head)

 static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
 {
+	if (l->state == HTAB_EXTRA_ELEM_USED) {
+		l->state = HTAB_EXTRA_ELEM_FREE;
+		return;
+	}
+
 	if (!(htab->map.map_flags & BPF_F_NO_PREALLOC)) {
 		pcpu_freelist_push(&htab->freelist, &l->fnode);
 	} else {
@@ -381,25 +422,44 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)

 static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
 					 void *value, u32 key_size, u32 hash,
-					 bool percpu, bool onallcpus)
+					 bool percpu, bool onallcpus,
+					 bool old_elem_exists)
 {
 	u32 size = htab->map.value_size;
 	bool prealloc = !(htab->map.map_flags & BPF_F_NO_PREALLOC);
 	struct htab_elem *l_new;
 	void __percpu *pptr;
+	int err = 0;

 	if (prealloc) {
 		l_new = (struct htab_elem *)pcpu_freelist_pop(&htab->freelist);
 		if (!l_new)
-			return ERR_PTR(-E2BIG);
+			err = -E2BIG;
 	} else {
 		if (atomic_inc_return(&htab->count) > htab->map.max_entries) {
 			atomic_dec(&htab->count);
-			return ERR_PTR(-E2BIG);
+			err = -E2BIG;
+		} else {
+			l_new = kmalloc(htab->elem_size,
+					GFP_ATOMIC | __GFP_NOWARN);
+			if (!l_new)
+				return ERR_PTR(-ENOMEM);
 		}
-		l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN);
-		if (!l_new)
-			return ERR_PTR(-ENOMEM);
+	}
+
+	if (err) {
+		if (!old_elem_exists)
+			return ERR_PTR(err);
+
+		/* if we're updating the existing element and the hash table
+		 * is full, use per-cpu extra elems
+		 */
+		l_new = this_cpu_ptr(htab->extra_elems);
+		if (l_new->state != HTAB_EXTRA_ELEM_FREE)
+			return ERR_PTR(-E2BIG);
+		l_new->state = HTAB_EXTRA_ELEM_USED;
+	} else {
+		l_new->state = HTAB_NOT_AN_EXTRA_ELEM;
 	}

 	memcpy(l_new->key, key, key_size);
@@ -489,7 +549,8 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 	if (ret)
 		goto err;

-	l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false);
+	l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false,
+				!!l_old);
 	if (IS_ERR(l_new)) {
 		/* all pre-allocated elements are in use or memory exhausted */
 		ret = PTR_ERR(l_new);
@@ -563,7 +624,7 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
 		}
 	} else {
 		l_new = alloc_htab_elem(htab, key, value, key_size,
-					hash, true, onallcpus);
+					hash, true, onallcpus, false);
 		if (IS_ERR(l_new)) {
 			ret = PTR_ERR(l_new);
 			goto err;
@@ -652,6 +713,7 @@ static void htab_map_free(struct bpf_map *map)
 		htab_free_elems(htab);
 		pcpu_freelist_destroy(&htab->freelist);
 	}
+	free_percpu(htab->extra_elems);
 	kvfree(htab->buckets);
 	kfree(htab);
 }
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index f72f23b8fdab..daea765d72e6 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -194,6 +194,7 @@ struct verifier_env {
 	struct verifier_state_list **explored_states; /* search pruning optimization */
 	struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */
 	u32 used_map_cnt;		/* number of used maps */
+	u32 id_gen;			/* used to generate unique reg IDs */
 	bool allow_ptr_leaks;
 };

@@ -1052,7 +1053,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
 			goto error;
 		break;
 	case BPF_MAP_TYPE_CGROUP_ARRAY:
-		if (func_id != BPF_FUNC_skb_in_cgroup)
+		if (func_id != BPF_FUNC_skb_under_cgroup)
 			goto error;
 		break;
 	default:
@@ -1074,7 +1075,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
 		if (map->map_type != BPF_MAP_TYPE_STACK_TRACE)
 			goto error;
 		break;
-	case BPF_FUNC_skb_in_cgroup:
+	case BPF_FUNC_skb_under_cgroup:
 		if (map->map_type != BPF_MAP_TYPE_CGROUP_ARRAY)
 			goto error;
 		break;
@@ -1301,7 +1302,7 @@ add_imm:
 		/* dst_reg stays as pkt_ptr type and since some positive
 		 * integer value was added to the pointer, increment its 'id'
 		 */
-		dst_reg->id++;
+		dst_reg->id = ++env->id_gen;

 		/* something was added to pkt_ptr, set range and off to zero */
 		dst_reg->off = 0;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 1903b8f3a705..5650f5317e0c 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -242,18 +242,6 @@ unlock:
 	return ret;
 }

-static void event_function_local(struct perf_event *event, event_f func, void *data)
-{
-	struct event_function_struct efs = {
-		.event = event,
-		.func = func,
-		.data = data,
-	};
-
-	int ret = event_function(&efs);
-	WARN_ON_ONCE(ret);
-}
-
 static void event_function_call(struct perf_event *event, event_f func, void *data)
 {
 	struct perf_event_context *ctx = event->ctx;
@@ -303,6 +291,54 @@ again:
 	raw_spin_unlock_irq(&ctx->lock);
 }

+/*
+ * Similar to event_function_call() + event_function(), but hard assumes IRQs
+ * are already disabled and we're on the right CPU.
+ */
+static void event_function_local(struct perf_event *event, event_f func, void *data)
+{
+	struct perf_event_context *ctx = event->ctx;
+	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+	struct task_struct *task = READ_ONCE(ctx->task);
+	struct perf_event_context *task_ctx = NULL;
+
+	WARN_ON_ONCE(!irqs_disabled());
+
+	if (task) {
+		if (task == TASK_TOMBSTONE)
+			return;
+
+		task_ctx = ctx;
+	}
+
+	perf_ctx_lock(cpuctx, task_ctx);
+
+	task = ctx->task;
+	if (task == TASK_TOMBSTONE)
+		goto unlock;
+
+	if (task) {
+		/*
+		 * We must be either inactive or active and the right task,
+		 * otherwise we're screwed, since we cannot IPI to somewhere
+		 * else.
+		 */
+		if (ctx->is_active) {
+			if (WARN_ON_ONCE(task != current))
+				goto unlock;
+
+			if (WARN_ON_ONCE(cpuctx->task_ctx != ctx))
+				goto unlock;
+		}
+	} else {
+		WARN_ON_ONCE(&cpuctx->ctx != ctx);
+	}
+
+	func(event, cpuctx, ctx, data);
+unlock:
+	perf_ctx_unlock(cpuctx, task_ctx);
+}
+
 #define PERF_FLAG_ALL	(PERF_FLAG_FD_NO_GROUP |\
 			 PERF_FLAG_FD_OUTPUT  |\
 			 PERF_FLAG_PID_CGROUP |\
@@ -3513,9 +3549,10 @@ static int perf_event_read(struct perf_event *event, bool group)
 			.group = group,
 			.ret = 0,
 		};
-		smp_call_function_single(event->oncpu,
-					 __perf_event_read, &data, 1);
-		ret = data.ret;
+		ret = smp_call_function_single(event->oncpu, __perf_event_read, &data, 1);
+		/* The event must have been read from an online CPU: */
+		WARN_ON_ONCE(ret);
+		ret = ret ? : data.ret;
 	} else if (event->state == PERF_EVENT_STATE_INACTIVE) {
 		struct perf_event_context *ctx = event->ctx;
 		unsigned long flags;
@@ -6584,15 +6621,6 @@ got_name:
 }

 /*
- * Whether this @filter depends on a dynamic object which is not loaded
- * yet or its load addresses are not known.
- */
-static bool perf_addr_filter_needs_mmap(struct perf_addr_filter *filter)
-{
-	return filter->filter && filter->inode;
-}
-
-/*
  * Check whether inode and address range match filter criteria.
  */
 static bool perf_addr_filter_match(struct perf_addr_filter *filter,
@@ -6653,6 +6681,13 @@ static void perf_addr_filters_adjust(struct vm_area_struct *vma)
 	struct perf_event_context *ctx;
 	int ctxn;

+	/*
+	 * Data tracing isn't supported yet and as such there is no need
+	 * to keep track of anything that isn't related to executable code:
+	 */
+	if (!(vma->vm_flags & VM_EXEC))
+		return;
+
 	rcu_read_lock();
 	for_each_task_context_nr(ctxn) {
 		ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
@@ -7805,7 +7840,11 @@ static void perf_event_addr_filters_apply(struct perf_event *event)
 	list_for_each_entry(filter, &ifh->list, entry) {
 		event->addr_filters_offs[count] = 0;

-		if (perf_addr_filter_needs_mmap(filter))
+		/*
+		 * Adjust base offset if the filter is associated to a binary
+		 * that needs to be mapped:
+		 */
+		if (filter->inode)
 			event->addr_filters_offs[count] =
 				perf_addr_filter_apply(filter, mm);

@@ -7936,8 +7975,10 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
 					goto fail;
 			}

-			if (token == IF_SRC_FILE) {
-				filename = match_strdup(&args[2]);
+			if (token == IF_SRC_FILE || token == IF_SRC_FILEADDR) {
+				int fpos = filter->range ? 2 : 1;
+
+				filename = match_strdup(&args[fpos]);
 				if (!filename) {
 					ret = -ENOMEM;
 					goto fail;
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index b7a525ab2083..8c50276b60d1 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -172,8 +172,10 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
 	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
 	err = -EAGAIN;
 	ptep = page_check_address(page, mm, addr, &ptl, 0);
-	if (!ptep)
+	if (!ptep) {
+		mem_cgroup_cancel_charge(kpage, memcg, false);
 		goto unlock;
+	}

 	get_page(kpage);
 	page_add_new_anon_rmap(kpage, vma, addr, false);
@@ -200,7 +202,6 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
 	err = 0;
  unlock:
-	mem_cgroup_cancel_charge(kpage, memcg, false);
 	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
 	unlock_page(page);
 	return err;
 }
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 9a0178c2ac1d..b02228411d57 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -835,9 +835,9 @@ static bool memory_bm_pfn_present(struct memory_bitmap *bm, unsigned long pfn)
  */
 static bool rtree_next_node(struct memory_bitmap *bm)
 {
-	bm->cur.node = list_entry(bm->cur.node->list.next,
-				  struct rtree_node, list);
-	if (&bm->cur.node->list != &bm->cur.zone->leaves) {
+	if (!list_is_last(&bm->cur.node->list, &bm->cur.zone->leaves)) {
+		bm->cur.node = list_entry(bm->cur.node->list.next,
+					  struct rtree_node, list);
 		bm->cur.node_pfn += BM_BITS_PER_BLOCK;
 		bm->cur.node_bit = 0;
 		touch_softlockup_watchdog();
@@ -845,9 +845,9 @@ static bool rtree_next_node(struct memory_bitmap *bm)
 	}

 	/* No more nodes, goto next zone */
-	bm->cur.zone = list_entry(bm->cur.zone->list.next,
+	if (!list_is_last(&bm->cur.zone->list, &bm->zones)) {
+		bm->cur.zone = list_entry(bm->cur.zone->list.next,
 				  struct mem_zone_bm_rtree, list);
-	if (&bm->cur.zone->list != &bm->zones) {
 		bm->cur.node = list_entry(bm->cur.zone->leaves.next,
 					  struct rtree_node, list);
 		bm->cur.node_pfn = 0;
diff --git a/kernel/printk/braille.c b/kernel/printk/braille.c
index 276762f3a460..d5760c42f042 100644
--- a/kernel/printk/braille.c
+++ b/kernel/printk/braille.c
@@ -9,10 +9,10 @@

 char *_braille_console_setup(char **str, char **brl_options)
 {
-	if (!memcmp(*str, "brl,", 4)) {
+	if (!strncmp(*str, "brl,", 4)) {
 		*brl_options = "";
 		*str += 4;
-	} else if (!memcmp(str, "brl=", 4)) {
+	} else if (!strncmp(*str, "brl=", 4)) {
 		*brl_options = *str + 4;
 		*str = strchr(*brl_options, ',');
 		if (!*str)
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 9858266fb0b3..a846cf89eb96 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -263,6 +263,11 @@ void account_idle_time(cputime_t cputime)
 		cpustat[CPUTIME_IDLE] += (__force u64) cputime;
 }

+/*
+ * When a guest is interrupted for a longer amount of time, missed clock
+ * ticks are not redelivered later. Due to that, this function may on
+ * occasion account more time than the calling functions think elapsed.
+ */
 static __always_inline cputime_t steal_account_process_time(cputime_t maxtime)
 {
 #ifdef CONFIG_PARAVIRT
@@ -371,7 +376,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
 	 * idle, or potentially user or system time. Due to rounding,
 	 * other time can exceed ticks occasionally.
 	 */
-	other = account_other_time(cputime);
+	other = account_other_time(ULONG_MAX);
 	if (other >= cputime)
 		return;
 	cputime -= other;
@@ -486,7 +491,7 @@ void account_process_tick(struct task_struct *p, int user_tick)
 	}

 	cputime = cputime_one_jiffy;
-	steal = steal_account_process_time(cputime);
+	steal = steal_account_process_time(ULONG_MAX);

 	if (steal >= cputime)
 		return;
@@ -516,7 +521,7 @@ void account_idle_ticks(unsigned long ticks)
 	}

 	cputime = jiffies_to_cputime(ticks);
-	steal = steal_account_process_time(cputime);
+	steal = steal_account_process_time(ULONG_MAX);

 	if (steal >= cputime)
 		return;
@@ -614,19 +619,25 @@ static void cputime_adjust(struct task_cputime *curr,
 	stime = curr->stime;
 	utime = curr->utime;

-	if (utime == 0) {
-		stime = rtime;
+	/*
+	 * If either stime or both stime and utime are 0, assume all runtime is
+	 * userspace. Once a task gets some ticks, the monotonicy code at
+	 * 'update' will ensure things converge to the observed ratio.
+	 */
+	if (stime == 0) {
+		utime = rtime;
 		goto update;
 	}

-	if (stime == 0) {
-		utime = rtime;
+	if (utime == 0) {
+		stime = rtime;
 		goto update;
 	}

 	stime = scale_stime((__force u64)stime, (__force u64)rtime,
 			    (__force u64)(stime + utime));

+update:
 	/*
 	 * Make sure stime doesn't go backwards; this preserves monotonicity
 	 * for utime because rtime is monotonic.
@@ -649,7 +660,6 @@ static void cputime_adjust(struct task_cputime *curr,
 		stime = rtime - utime;
 	}

-update:
 	prev->stime = stime;
 	prev->utime = utime;
 out:
@@ -694,6 +704,13 @@ static cputime_t get_vtime_delta(struct task_struct *tsk)
 	unsigned long now = READ_ONCE(jiffies);
 	cputime_t delta, other;

+	/*
+	 * Unlike tick based timing, vtime based timing never has lost
+	 * ticks, and no need for steal time accounting to make up for
+	 * lost ticks. Vtime accounts a rounded version of actual
+	 * elapsed time. Limit account_other_time to prevent rounding
+	 * errors from causing elapsed vtime to go negative.
+	 */
 	delta = jiffies_to_cputime(now - tsk->vtime_snap);
 	other = account_other_time(delta);
 	WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index b43d0b27c1fe..a13bbdaab47d 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2140,6 +2140,21 @@ static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp,
 	return 0;
 }

+static int do_proc_douintvec_conv(bool *negp, unsigned long *lvalp,
+				 int *valp,
+				 int write, void *data)
+{
+	if (write) {
+		if (*negp)
+			return -EINVAL;
+		*valp = *lvalp;
+	} else {
+		unsigned int val = *valp;
+		*lvalp = (unsigned long)val;
+	}
+	return 0;
+}
+
 static const char proc_wspace_sep[] = { ' ', '\t', '\n' };

 static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
@@ -2259,8 +2274,27 @@ static int do_proc_dointvec(struct ctl_table *table, int write,
 int proc_dointvec(struct ctl_table *table, int write,
 		     void __user *buffer, size_t *lenp, loff_t *ppos)
 {
-	return do_proc_dointvec(table,write,buffer,lenp,ppos,
-				NULL,NULL);
+	return do_proc_dointvec(table, write, buffer, lenp, ppos, NULL, NULL);
+}
+
+/**
+ * proc_douintvec - read a vector of unsigned integers
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned int) unsigned integer
+ * values from/to the user buffer, treated as an ASCII string.
+ *
+ * Returns 0 on success.
+ */
+int proc_douintvec(struct ctl_table *table, int write,
+		   void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	return do_proc_dointvec(table, write, buffer, lenp, ppos,
+				do_proc_douintvec_conv, NULL);
 }

 /*
@@ -2858,6 +2892,12 @@ int proc_dointvec(struct ctl_table *table, int write,
 		  void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	return -ENOSYS;
 }

+int proc_douintvec(struct ctl_table *table, int write,
+		  void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	return -ENOSYS;
+}
+
 int proc_dointvec_minmax(struct ctl_table *table, int write,
 		    void __user *buffer, size_t *lenp, loff_t *ppos)
 {
@@ -2903,6 +2943,7 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
  * exception granted :-)
  */
 EXPORT_SYMBOL(proc_dointvec);
+EXPORT_SYMBOL(proc_douintvec);
 EXPORT_SYMBOL(proc_dointvec_jiffies);
 EXPORT_SYMBOL(proc_dointvec_minmax);
 EXPORT_SYMBOL(proc_dointvec_userhz_jiffies);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 3b65746c7f15..e07fb093f819 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -401,7 +401,10 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf)
 	do {
 		seq = raw_read_seqcount_latch(&tkf->seq);
 		tkr = tkf->base + (seq & 0x01);
-		now = ktime_to_ns(tkr->base) + timekeeping_get_ns(tkr);
+		now = ktime_to_ns(tkr->base);
+
+		now += clocksource_delta(tkr->read(tkr->clock),
+					 tkr->cycle_last, tkr->mask);
 	} while (read_seqcount_retry(&tkf->seq, seq));

 	return now;
diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c
index f6bd65236712..107310a6f36f 100644
--- a/kernel/time/timekeeping_debug.c
+++ b/kernel/time/timekeeping_debug.c
@@ -23,7 +23,9 @@

 #include "timekeeping_internal.h"

-static unsigned int sleep_time_bin[32] = {0};
+#define NUM_BINS 32
+
+static unsigned int sleep_time_bin[NUM_BINS] = {0};

 static int tk_debug_show_sleep_time(struct seq_file *s, void *data)
 {
@@ -69,6 +71,9 @@ late_initcall(tk_debug_sleep_time_init);

 void tk_debug_account_sleep_time(struct timespec64 *t)
 {
-	sleep_time_bin[fls(t->tv_sec)]++;
+	/* Cap bin index so we don't overflow the array */
+	int bin = min(fls(t->tv_sec), NUM_BINS-1);
+
+	sleep_time_bin[bin]++;
 }

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 7598e6ca817a..dbafc5df03f3 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -223,7 +223,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
 	what |= MASK_TC_BIT(op_flags, META);
 	what |= MASK_TC_BIT(op_flags, PREFLUSH);
 	what |= MASK_TC_BIT(op_flags, FUA);
-	if (op == REQ_OP_DISCARD)
+	if (op == REQ_OP_DISCARD || op == REQ_OP_SECURE_ERASE)
 		what |= BLK_TC_ACT(BLK_TC_DISCARD);
 	if (op == REQ_OP_FLUSH)
 		what |= BLK_TC_ACT(BLK_TC_FLUSH);
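
Usage note for the kernel/sysctl.c change above: proc_douintvec() gives ctl_table entries a handler for unsigned int values, and its helper do_proc_douintvec_conv() rejects negative input with -EINVAL on the write path while printing the stored value as unsigned on the read path. The sketch below shows one way a module could wire an unsigned int knob to the new handler. It is illustrative only, not part of this commit: the example_* identifiers and the /proc/sys/example/threshold path are hypothetical; only proc_douintvec() itself is what the patch adds.

/* Hypothetical sketch (not from this commit): expose an unsigned int knob
 * at /proc/sys/example/threshold through the new proc_douintvec() handler.
 */
#include <linux/module.h>
#include <linux/sysctl.h>

static unsigned int example_threshold = 42;	/* hypothetical tunable */

static struct ctl_table example_table[] = {
	{
		.procname	= "threshold",
		.data		= &example_threshold,
		.maxlen		= sizeof(unsigned int),
		.mode		= 0644,
		.proc_handler	= proc_douintvec,	/* write of "-1" now fails with -EINVAL */
	},
	{ }
};

static struct ctl_table example_dir[] = {
	{
		.procname	= "example",
		.mode		= 0555,
		.child		= example_table,
	},
	{ }
};

static struct ctl_table_header *example_header;

static int __init example_sysctl_init(void)
{
	example_header = register_sysctl_table(example_dir);
	return example_header ? 0 : -ENOMEM;
}

static void __exit example_sysctl_exit(void)
{
	unregister_sysctl_table(example_header);
}

module_init(example_sysctl_init);
module_exit(example_sysctl_exit);
MODULE_LICENSE("GPL");

Compared with proc_dointvec, the practical difference is that the full 32-bit unsigned range round-trips through /proc without being reinterpreted as a negative signed value.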