Diffstat (limited to 'kernel')
-rw-r--r--  kernel/bpf/btf.c                       46
-rw-r--r--  kernel/bpf/cgroup.c                    54
-rw-r--r--  kernel/bpf/core.c                      30
-rw-r--r--  kernel/bpf/devmap.c                     7
-rw-r--r--  kernel/bpf/hashtab.c                   16
-rw-r--r--  kernel/bpf/sockmap.c                  297
-rw-r--r--  kernel/bpf/syscall.c                  103
-rw-r--r--  kernel/bpf/verifier.c                  11
-rw-r--r--  kernel/fork.c                          33
-rw-r--r--  kernel/kthread.c                       38
-rw-r--r--  kernel/memremap.c                      22
-rw-r--r--  kernel/rseq.c                          41
-rw-r--r--  kernel/sched/core.c                    67
-rw-r--r--  kernel/sched/cpufreq_schedutil.c        2
-rw-r--r--  kernel/sched/deadline.c                11
-rw-r--r--  kernel/sched/fair.c                    45
-rw-r--r--  kernel/sched/rt.c                      16
-rw-r--r--  kernel/sched/sched.h                   11
-rw-r--r--  kernel/softirq.c                       12
-rw-r--r--  kernel/stop_machine.c                   6
-rw-r--r--  kernel/time/tick-common.c               3
-rw-r--r--  kernel/trace/ftrace.c                  13
-rw-r--r--  kernel/trace/ring_buffer.c             16
-rw-r--r--  kernel/trace/trace.c                   19
-rw-r--r--  kernel/trace/trace.h                    4
-rw-r--r--  kernel/trace/trace_events_filter.c      5
-rw-r--r--  kernel/trace/trace_events_hist.c        2
-rw-r--r--  kernel/trace/trace_events_trigger.c    18
-rw-r--r--  kernel/trace/trace_functions_graph.c    5
-rw-r--r--  kernel/trace/trace_kprobe.c            21
-rw-r--r--  kernel/trace/trace_output.c             5
31 files changed, 617 insertions, 362 deletions
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 2d49d18b793a..9704934252b3 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -450,7 +450,7 @@ static const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id)
*/
static bool btf_type_int_is_regular(const struct btf_type *t)
{
- u16 nr_bits, nr_bytes;
+ u8 nr_bits, nr_bytes;
u32 int_data;
int_data = btf_type_int(t);
@@ -991,38 +991,38 @@ static void btf_int_bits_seq_show(const struct btf *btf,
void *data, u8 bits_offset,
struct seq_file *m)
{
+ u16 left_shift_bits, right_shift_bits;
u32 int_data = btf_type_int(t);
- u16 nr_bits = BTF_INT_BITS(int_data);
- u16 total_bits_offset;
- u16 nr_copy_bytes;
- u16 nr_copy_bits;
- u8 nr_upper_bits;
- union {
- u64 u64_num;
- u8 u8_nums[8];
- } print_num;
+ u8 nr_bits = BTF_INT_BITS(int_data);
+ u8 total_bits_offset;
+ u8 nr_copy_bytes;
+ u8 nr_copy_bits;
+ u64 print_num;
+ /*
+ * bits_offset is at most 7.
+ * BTF_INT_OFFSET() cannot exceed 64 bits.
+ */
total_bits_offset = bits_offset + BTF_INT_OFFSET(int_data);
data += BITS_ROUNDDOWN_BYTES(total_bits_offset);
bits_offset = BITS_PER_BYTE_MASKED(total_bits_offset);
nr_copy_bits = nr_bits + bits_offset;
nr_copy_bytes = BITS_ROUNDUP_BYTES(nr_copy_bits);
- print_num.u64_num = 0;
- memcpy(&print_num.u64_num, data, nr_copy_bytes);
+ print_num = 0;
+ memcpy(&print_num, data, nr_copy_bytes);
- /* Ditch the higher order bits */
- nr_upper_bits = BITS_PER_BYTE_MASKED(nr_copy_bits);
- if (nr_upper_bits) {
- /* We need to mask out some bits of the upper byte. */
- u8 mask = (1 << nr_upper_bits) - 1;
-
- print_num.u8_nums[nr_copy_bytes - 1] &= mask;
- }
+#ifdef __BIG_ENDIAN_BITFIELD
+ left_shift_bits = bits_offset;
+#else
+ left_shift_bits = BITS_PER_U64 - nr_copy_bits;
+#endif
+ right_shift_bits = BITS_PER_U64 - nr_bits;
- print_num.u64_num >>= bits_offset;
+ print_num <<= left_shift_bits;
+ print_num >>= right_shift_bits;
- seq_printf(m, "0x%llx", print_num.u64_num);
+ seq_printf(m, "0x%llx", print_num);
}
static void btf_int_seq_show(const struct btf *btf, const struct btf_type *t,
@@ -1032,7 +1032,7 @@ static void btf_int_seq_show(const struct btf *btf, const struct btf_type *t,
u32 int_data = btf_type_int(t);
u8 encoding = BTF_INT_ENCODING(int_data);
bool sign = encoding & BTF_INT_SIGNED;
- u32 nr_bits = BTF_INT_BITS(int_data);
+ u8 nr_bits = BTF_INT_BITS(int_data);
if (bits_offset || BTF_INT_OFFSET(int_data) ||
BITS_PER_BYTE_MASKED(nr_bits)) {
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index f7c00bd6f8e4..3d83ee7df381 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -428,6 +428,60 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
return ret;
}
+int cgroup_bpf_prog_attach(const union bpf_attr *attr,
+ enum bpf_prog_type ptype, struct bpf_prog *prog)
+{
+ struct cgroup *cgrp;
+ int ret;
+
+ cgrp = cgroup_get_from_fd(attr->target_fd);
+ if (IS_ERR(cgrp))
+ return PTR_ERR(cgrp);
+
+ ret = cgroup_bpf_attach(cgrp, prog, attr->attach_type,
+ attr->attach_flags);
+ cgroup_put(cgrp);
+ return ret;
+}
+
+int cgroup_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype)
+{
+ struct bpf_prog *prog;
+ struct cgroup *cgrp;
+ int ret;
+
+ cgrp = cgroup_get_from_fd(attr->target_fd);
+ if (IS_ERR(cgrp))
+ return PTR_ERR(cgrp);
+
+ prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
+ if (IS_ERR(prog))
+ prog = NULL;
+
+ ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type, 0);
+ if (prog)
+ bpf_prog_put(prog);
+
+ cgroup_put(cgrp);
+ return ret;
+}
+
+int cgroup_bpf_prog_query(const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ struct cgroup *cgrp;
+ int ret;
+
+ cgrp = cgroup_get_from_fd(attr->query.target_fd);
+ if (IS_ERR(cgrp))
+ return PTR_ERR(cgrp);
+
+ ret = cgroup_bpf_query(cgrp, attr, uattr);
+
+ cgroup_put(cgrp);
+ return ret;
+}
+
/**
* __cgroup_bpf_run_filter_skb() - Run a program for packet filtering
* @sk: The socket sending or receiving traffic
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index a9e6c04d0f4a..1e5625d46414 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -598,8 +598,6 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
bpf_fill_ill_insns(hdr, size);
hdr->pages = size / PAGE_SIZE;
- hdr->locked = 0;
-
hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)),
PAGE_SIZE - sizeof(*hdr));
start = (get_random_int() % hole) & ~(alignment - 1);
@@ -1450,22 +1448,6 @@ static int bpf_check_tail_call(const struct bpf_prog *fp)
return 0;
}
-static int bpf_prog_check_pages_ro_locked(const struct bpf_prog *fp)
-{
-#ifdef CONFIG_ARCH_HAS_SET_MEMORY
- int i, err;
-
- for (i = 0; i < fp->aux->func_cnt; i++) {
- err = bpf_prog_check_pages_ro_single(fp->aux->func[i]);
- if (err)
- return err;
- }
-
- return bpf_prog_check_pages_ro_single(fp);
-#endif
- return 0;
-}
-
static void bpf_prog_select_func(struct bpf_prog *fp)
{
#ifndef CONFIG_BPF_JIT_ALWAYS_ON
@@ -1524,17 +1506,7 @@ finalize:
* all eBPF JITs might immediately support all features.
*/
*err = bpf_check_tail_call(fp);
- if (*err)
- return fp;
-
- /* Checkpoint: at this point onwards any cBPF -> eBPF or
- * native eBPF program is read-only. If we failed to change
- * the page attributes (e.g. allocation failure from
- * splitting large pages), then reject the whole program
- * in order to guarantee not ending up with any W+X pages
- * from BPF side in kernel.
- */
- *err = bpf_prog_check_pages_ro_locked(fp);
+
return fp;
}
EXPORT_SYMBOL_GPL(bpf_prog_select_runtime);
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 642c97f6d1b8..d361fc1e3bf3 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -334,10 +334,15 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
{
struct net_device *dev = dst->dev;
struct xdp_frame *xdpf;
+ int err;
if (!dev->netdev_ops->ndo_xdp_xmit)
return -EOPNOTSUPP;
+ err = xdp_ok_fwd_dev(dev, xdp->data_end - xdp->data);
+ if (unlikely(err))
+ return err;
+
xdpf = convert_to_xdp_frame(xdp);
if (unlikely(!xdpf))
return -EOVERFLOW;
@@ -350,7 +355,7 @@ int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
{
int err;
- err = __xdp_generic_ok_fwd_dev(skb, dst->dev);
+ err = xdp_ok_fwd_dev(dst->dev, skb->len);
if (unlikely(err))
return err;
skb->dev = dst->dev;
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 3ca2198a6d22..513d9dfcf4ee 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -747,13 +747,15 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
* old element will be freed immediately.
* Otherwise return an error
*/
- atomic_dec(&htab->count);
- return ERR_PTR(-E2BIG);
+ l_new = ERR_PTR(-E2BIG);
+ goto dec_count;
}
l_new = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN,
htab->map.numa_node);
- if (!l_new)
- return ERR_PTR(-ENOMEM);
+ if (!l_new) {
+ l_new = ERR_PTR(-ENOMEM);
+ goto dec_count;
+ }
}
memcpy(l_new->key, key, key_size);
@@ -766,7 +768,8 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
GFP_ATOMIC | __GFP_NOWARN);
if (!pptr) {
kfree(l_new);
- return ERR_PTR(-ENOMEM);
+ l_new = ERR_PTR(-ENOMEM);
+ goto dec_count;
}
}
@@ -780,6 +783,9 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
l_new->hash = hash;
return l_new;
+dec_count:
+ atomic_dec(&htab->count);
+ return l_new;
}
static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old,
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 52a91d816c0e..98fb7938beea 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -72,6 +72,7 @@ struct bpf_htab {
u32 n_buckets;
u32 elem_size;
struct bpf_sock_progs progs;
+ struct rcu_head rcu;
};
struct htab_elem {
@@ -89,8 +90,8 @@ enum smap_psock_state {
struct smap_psock_map_entry {
struct list_head list;
struct sock **entry;
- struct htab_elem *hash_link;
- struct bpf_htab *htab;
+ struct htab_elem __rcu *hash_link;
+ struct bpf_htab __rcu *htab;
};
struct smap_psock {
@@ -120,6 +121,7 @@ struct smap_psock {
struct bpf_prog *bpf_parse;
struct bpf_prog *bpf_verdict;
struct list_head maps;
+ spinlock_t maps_lock;
/* Back reference used when sock callback trigger sockmap operations */
struct sock *sock;
@@ -140,6 +142,7 @@ static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size);
static int bpf_tcp_sendpage(struct sock *sk, struct page *page,
int offset, size_t size, int flags);
+static void bpf_tcp_close(struct sock *sk, long timeout);
static inline struct smap_psock *smap_psock_sk(const struct sock *sk)
{
@@ -161,7 +164,42 @@ out:
return !empty;
}
-static struct proto tcp_bpf_proto;
+enum {
+ SOCKMAP_IPV4,
+ SOCKMAP_IPV6,
+ SOCKMAP_NUM_PROTS,
+};
+
+enum {
+ SOCKMAP_BASE,
+ SOCKMAP_TX,
+ SOCKMAP_NUM_CONFIGS,
+};
+
+static struct proto *saved_tcpv6_prot __read_mostly;
+static DEFINE_SPINLOCK(tcpv6_prot_lock);
+static struct proto bpf_tcp_prots[SOCKMAP_NUM_PROTS][SOCKMAP_NUM_CONFIGS];
+static void build_protos(struct proto prot[SOCKMAP_NUM_CONFIGS],
+ struct proto *base)
+{
+ prot[SOCKMAP_BASE] = *base;
+ prot[SOCKMAP_BASE].close = bpf_tcp_close;
+ prot[SOCKMAP_BASE].recvmsg = bpf_tcp_recvmsg;
+ prot[SOCKMAP_BASE].stream_memory_read = bpf_tcp_stream_read;
+
+ prot[SOCKMAP_TX] = prot[SOCKMAP_BASE];
+ prot[SOCKMAP_TX].sendmsg = bpf_tcp_sendmsg;
+ prot[SOCKMAP_TX].sendpage = bpf_tcp_sendpage;
+}
+
+static void update_sk_prot(struct sock *sk, struct smap_psock *psock)
+{
+ int family = sk->sk_family == AF_INET6 ? SOCKMAP_IPV6 : SOCKMAP_IPV4;
+ int conf = psock->bpf_tx_msg ? SOCKMAP_TX : SOCKMAP_BASE;
+
+ sk->sk_prot = &bpf_tcp_prots[family][conf];
+}
+
static int bpf_tcp_init(struct sock *sk)
{
struct smap_psock *psock;
@@ -181,14 +219,17 @@ static int bpf_tcp_init(struct sock *sk)
psock->save_close = sk->sk_prot->close;
psock->sk_proto = sk->sk_prot;
- if (psock->bpf_tx_msg) {
- tcp_bpf_proto.sendmsg = bpf_tcp_sendmsg;
- tcp_bpf_proto.sendpage = bpf_tcp_sendpage;
- tcp_bpf_proto.recvmsg = bpf_tcp_recvmsg;
- tcp_bpf_proto.stream_memory_read = bpf_tcp_stream_read;
+ /* Build IPv6 sockmap whenever the address of tcpv6_prot changes */
+ if (sk->sk_family == AF_INET6 &&
+ unlikely(sk->sk_prot != smp_load_acquire(&saved_tcpv6_prot))) {
+ spin_lock_bh(&tcpv6_prot_lock);
+ if (likely(sk->sk_prot != saved_tcpv6_prot)) {
+ build_protos(bpf_tcp_prots[SOCKMAP_IPV6], sk->sk_prot);
+ smp_store_release(&saved_tcpv6_prot, sk->sk_prot);
+ }
+ spin_unlock_bh(&tcpv6_prot_lock);
}
-
- sk->sk_prot = &tcp_bpf_proto;
+ update_sk_prot(sk, psock);
rcu_read_unlock();
return 0;
}
@@ -219,24 +260,64 @@ out:
rcu_read_unlock();
}
+static struct htab_elem *lookup_elem_raw(struct hlist_head *head,
+ u32 hash, void *key, u32 key_size)
+{
+ struct htab_elem *l;
+
+ hlist_for_each_entry_rcu(l, head, hash_node) {
+ if (l->hash == hash && !memcmp(&l->key, key, key_size))
+ return l;
+ }
+
+ return NULL;
+}
+
+static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
+{
+ return &htab->buckets[hash & (htab->n_buckets - 1)];
+}
+
+static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
+{
+ return &__select_bucket(htab, hash)->head;
+}
+
static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
{
atomic_dec(&htab->count);
kfree_rcu(l, rcu);
}
+static struct smap_psock_map_entry *psock_map_pop(struct sock *sk,
+ struct smap_psock *psock)
+{
+ struct smap_psock_map_entry *e;
+
+ spin_lock_bh(&psock->maps_lock);
+ e = list_first_entry_or_null(&psock->maps,
+ struct smap_psock_map_entry,
+ list);
+ if (e)
+ list_del(&e->list);
+ spin_unlock_bh(&psock->maps_lock);
+ return e;
+}
+
static void bpf_tcp_close(struct sock *sk, long timeout)
{
void (*close_fun)(struct sock *sk, long timeout);
- struct smap_psock_map_entry *e, *tmp;
+ struct smap_psock_map_entry *e;
struct sk_msg_buff *md, *mtmp;
struct smap_psock *psock;
struct sock *osk;
+ lock_sock(sk);
rcu_read_lock();
psock = smap_psock_sk(sk);
if (unlikely(!psock)) {
rcu_read_unlock();
+ release_sock(sk);
return sk->sk_prot->close(sk, timeout);
}
@@ -247,7 +328,6 @@ static void bpf_tcp_close(struct sock *sk, long timeout)
*/
close_fun = psock->save_close;
- write_lock_bh(&sk->sk_callback_lock);
if (psock->cork) {
free_start_sg(psock->sock, psock->cork);
kfree(psock->cork);
@@ -260,21 +340,40 @@ static void bpf_tcp_close(struct sock *sk, long timeout)
kfree(md);
}
- list_for_each_entry_safe(e, tmp, &psock->maps, list) {
+ e = psock_map_pop(sk, psock);
+ while (e) {
if (e->entry) {
osk = cmpxchg(e->entry, sk, NULL);
if (osk == sk) {
- list_del(&e->list);
smap_release_sock(psock, sk);
}
} else {
- hlist_del_rcu(&e->hash_link->hash_node);
- smap_release_sock(psock, e->hash_link->sk);
- free_htab_elem(e->htab, e->hash_link);
+ struct htab_elem *link = rcu_dereference(e->hash_link);
+ struct bpf_htab *htab = rcu_dereference(e->htab);
+ struct hlist_head *head;
+ struct htab_elem *l;
+ struct bucket *b;
+
+ b = __select_bucket(htab, link->hash);
+ head = &b->head;
+ raw_spin_lock_bh(&b->lock);
+ l = lookup_elem_raw(head,
+ link->hash, link->key,
+ htab->map.key_size);
+ /* If another thread deleted this object skip deletion.
+ * The refcnt on psock may or may not be zero.
+ */
+ if (l) {
+ hlist_del_rcu(&link->hash_node);
+ smap_release_sock(psock, link->sk);
+ free_htab_elem(htab, link);
+ }
+ raw_spin_unlock_bh(&b->lock);
}
+ e = psock_map_pop(sk, psock);
}
- write_unlock_bh(&sk->sk_callback_lock);
rcu_read_unlock();
+ release_sock(sk);
close_fun(sk, timeout);
}
@@ -472,7 +571,8 @@ static int free_sg(struct sock *sk, int start, struct sk_msg_buff *md)
while (sg[i].length) {
free += sg[i].length;
sk_mem_uncharge(sk, sg[i].length);
- put_page(sg_page(&sg[i]));
+ if (!md->skb)
+ put_page(sg_page(&sg[i]));
sg[i].length = 0;
sg[i].page_link = 0;
sg[i].offset = 0;
@@ -481,6 +581,8 @@ static int free_sg(struct sock *sk, int start, struct sk_msg_buff *md)
if (i == MAX_SKB_FRAGS)
i = 0;
}
+ if (md->skb)
+ consume_skb(md->skb);
return free;
}
@@ -1111,8 +1213,7 @@ static void bpf_tcp_msg_add(struct smap_psock *psock,
static int bpf_tcp_ulp_register(void)
{
- tcp_bpf_proto = tcp_prot;
- tcp_bpf_proto.close = bpf_tcp_close;
+ build_protos(bpf_tcp_prots[SOCKMAP_IPV4], &tcp_prot);
/* Once BPF TX ULP is registered it is never unregistered. It
* will be in the ULP list for the lifetime of the system. Doing
* duplicate registers is not a problem.
@@ -1135,7 +1236,7 @@ static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb)
*/
TCP_SKB_CB(skb)->bpf.sk_redir = NULL;
skb->sk = psock->sock;
- bpf_compute_data_pointers(skb);
+ bpf_compute_data_end_sk_skb(skb);
preempt_disable();
rc = (*prog->bpf_func)(skb, prog->insnsi);
preempt_enable();
@@ -1357,7 +1458,9 @@ static void smap_release_sock(struct smap_psock *psock, struct sock *sock)
{
if (refcount_dec_and_test(&psock->refcnt)) {
tcp_cleanup_ulp(sock);
+ write_lock_bh(&sock->sk_callback_lock);
smap_stop_sock(psock, sock);
+ write_unlock_bh(&sock->sk_callback_lock);
clear_bit(SMAP_TX_RUNNING, &psock->state);
rcu_assign_sk_user_data(sock, NULL);
call_rcu_sched(&psock->rcu, smap_destroy_psock);
@@ -1388,7 +1491,7 @@ static int smap_parse_func_strparser(struct strparser *strp,
* any socket yet.
*/
skb->sk = psock->sock;
- bpf_compute_data_pointers(skb);
+ bpf_compute_data_end_sk_skb(skb);
rc = (*prog->bpf_func)(skb, prog->insnsi);
skb->sk = NULL;
rcu_read_unlock();
@@ -1508,6 +1611,7 @@ static struct smap_psock *smap_init_psock(struct sock *sock, int node)
INIT_LIST_HEAD(&psock->maps);
INIT_LIST_HEAD(&psock->ingress);
refcount_set(&psock->refcnt, 1);
+ spin_lock_init(&psock->maps_lock);
rcu_assign_sk_user_data(sock, psock);
sock_hold(sock);
@@ -1564,18 +1668,32 @@ free_stab:
return ERR_PTR(err);
}
-static void smap_list_remove(struct smap_psock *psock,
- struct sock **entry,
- struct htab_elem *hash_link)
+static void smap_list_map_remove(struct smap_psock *psock,
+ struct sock **entry)
{
struct smap_psock_map_entry *e, *tmp;
+ spin_lock_bh(&psock->maps_lock);
list_for_each_entry_safe(e, tmp, &psock->maps, list) {
- if (e->entry == entry || e->hash_link == hash_link) {
+ if (e->entry == entry)
+ list_del(&e->list);
+ }
+ spin_unlock_bh(&psock->maps_lock);
+}
+
+static void smap_list_hash_remove(struct smap_psock *psock,
+ struct htab_elem *hash_link)
+{
+ struct smap_psock_map_entry *e, *tmp;
+
+ spin_lock_bh(&psock->maps_lock);
+ list_for_each_entry_safe(e, tmp, &psock->maps, list) {
+ struct htab_elem *c = rcu_dereference(e->hash_link);
+
+ if (c == hash_link)
list_del(&e->list);
- break;
- }
}
+ spin_unlock_bh(&psock->maps_lock);
}
static void sock_map_free(struct bpf_map *map)
@@ -1601,7 +1719,6 @@ static void sock_map_free(struct bpf_map *map)
if (!sock)
continue;
- write_lock_bh(&sock->sk_callback_lock);
psock = smap_psock_sk(sock);
/* This check handles a racing sock event that can get the
* sk_callback_lock before this case but after xchg happens
@@ -1609,10 +1726,9 @@ static void sock_map_free(struct bpf_map *map)
* to be null and queued for garbage collection.
*/
if (likely(psock)) {
- smap_list_remove(psock, &stab->sock_map[i], NULL);
+ smap_list_map_remove(psock, &stab->sock_map[i]);
smap_release_sock(psock, sock);
}
- write_unlock_bh(&sock->sk_callback_lock);
}
rcu_read_unlock();
@@ -1661,17 +1777,15 @@ static int sock_map_delete_elem(struct bpf_map *map, void *key)
if (!sock)
return -EINVAL;
- write_lock_bh(&sock->sk_callback_lock);
psock = smap_psock_sk(sock);
if (!psock)
goto out;
if (psock->bpf_parse)
smap_stop_sock(psock, sock);
- smap_list_remove(psock, &stab->sock_map[k], NULL);
+ smap_list_map_remove(psock, &stab->sock_map[k]);
smap_release_sock(psock, sock);
out:
- write_unlock_bh(&sock->sk_callback_lock);
return 0;
}
@@ -1752,7 +1866,6 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map,
}
}
- write_lock_bh(&sock->sk_callback_lock);
psock = smap_psock_sk(sock);
/* 2. Do not allow inheriting programs if psock exists and has
@@ -1789,7 +1902,7 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map,
e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN);
if (!e) {
err = -ENOMEM;
- goto out_progs;
+ goto out_free;
}
}
@@ -1809,7 +1922,9 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map,
if (err)
goto out_free;
smap_init_progs(psock, verdict, parse);
+ write_lock_bh(&sock->sk_callback_lock);
smap_start_sock(psock, sock);
+ write_unlock_bh(&sock->sk_callback_lock);
}
/* 4. Place psock in sockmap for use and stop any programs on
@@ -1819,9 +1934,10 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map,
*/
if (map_link) {
e->entry = map_link;
+ spin_lock_bh(&psock->maps_lock);
list_add_tail(&e->list, &psock->maps);
+ spin_unlock_bh(&psock->maps_lock);
}
- write_unlock_bh(&sock->sk_callback_lock);
return err;
out_free:
smap_release_sock(psock, sock);
@@ -1832,7 +1948,6 @@ out_progs:
}
if (tx_msg)
bpf_prog_put(tx_msg);
- write_unlock_bh(&sock->sk_callback_lock);
kfree(e);
return err;
}
@@ -1869,10 +1984,8 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
if (osock) {
struct smap_psock *opsock = smap_psock_sk(osock);
- write_lock_bh(&osock->sk_callback_lock);
- smap_list_remove(opsock, &stab->sock_map[i], NULL);
+ smap_list_map_remove(opsock, &stab->sock_map[i]);
smap_release_sock(opsock, osock);
- write_unlock_bh(&osock->sk_callback_lock);
}
out:
return err;
@@ -1915,6 +2028,24 @@ int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type)
return 0;
}
+int sockmap_get_from_fd(const union bpf_attr *attr, int type,
+ struct bpf_prog *prog)
+{
+ int ufd = attr->target_fd;
+ struct bpf_map *map;
+ struct fd f;
+ int err;
+
+ f = fdget(ufd);
+ map = __bpf_map_get(f);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+
+ err = sock_map_prog(map, prog, attr->attach_type);
+ fdput(f);
+ return err;
+}
+
static void *sock_map_lookup(struct bpf_map *map, void *key)
{
return NULL;
@@ -1944,7 +2075,13 @@ static int sock_map_update_elem(struct bpf_map *map,
return -EOPNOTSUPP;
}
+ lock_sock(skops.sk);
+ preempt_disable();
+ rcu_read_lock();
err = sock_map_ctx_update_elem(&skops, map, key, flags);
+ rcu_read_unlock();
+ preempt_enable();
+ release_sock(skops.sk);
fput(socket->file);
return err;
}
@@ -2043,14 +2180,13 @@ free_htab:
return ERR_PTR(err);
}
-static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
+static void __bpf_htab_free(struct rcu_head *rcu)
{
- return &htab->buckets[hash & (htab->n_buckets - 1)];
-}
+ struct bpf_htab *htab;
-static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
-{
- return &__select_bucket(htab, hash)->head;
+ htab = container_of(rcu, struct bpf_htab, rcu);
+ bpf_map_area_free(htab->buckets);
+ kfree(htab);
}
static void sock_hash_free(struct bpf_map *map)
@@ -2069,16 +2205,18 @@ static void sock_hash_free(struct bpf_map *map)
*/
rcu_read_lock();
for (i = 0; i < htab->n_buckets; i++) {
- struct hlist_head *head = select_bucket(htab, i);
+ struct bucket *b = __select_bucket(htab, i);
+ struct hlist_head *head;
struct hlist_node *n;
struct htab_elem *l;
+ raw_spin_lock_bh(&b->lock);
+ head = &b->head;
hlist_for_each_entry_safe(l, n, head, hash_node) {
struct sock *sock = l->sk;
struct smap_psock *psock;
hlist_del_rcu(&l->hash_node);
- write_lock_bh(&sock->sk_callback_lock);
psock = smap_psock_sk(sock);
/* This check handles a racing sock event that can get
* the sk_callback_lock before this case but after xchg
@@ -2086,16 +2224,15 @@ static void sock_hash_free(struct bpf_map *map)
* (psock) to be null and queued for garbage collection.
*/
if (likely(psock)) {
- smap_list_remove(psock, NULL, l);
+ smap_list_hash_remove(psock, l);
smap_release_sock(psock, sock);
}
- write_unlock_bh(&sock->sk_callback_lock);
- kfree(l);
+ free_htab_elem(htab, l);
}
+ raw_spin_unlock_bh(&b->lock);
}
rcu_read_unlock();
- bpf_map_area_free(htab->buckets);
- kfree(htab);
+ call_rcu(&htab->rcu, __bpf_htab_free);
}
static struct htab_elem *alloc_sock_hash_elem(struct bpf_htab *htab,
@@ -2122,19 +2259,6 @@ static struct htab_elem *alloc_sock_hash_elem(struct bpf_htab *htab,
return l_new;
}
-static struct htab_elem *lookup_elem_raw(struct hlist_head *head,
- u32 hash, void *key, u32 key_size)
-{
- struct htab_elem *l;
-
- hlist_for_each_entry_rcu(l, head, hash_node) {
- if (l->hash == hash && !memcmp(&l->key, key, key_size))
- return l;
- }
-
- return NULL;
-}
-
static inline u32 htab_map_hash(const void *key, u32 key_len)
{
return jhash(key, key_len, 0);
@@ -2230,7 +2354,10 @@ static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops,
if (err)
goto err;
- /* bpf_map_update_elem() can be called in_irq() */
+ /* psock is valid here because otherwise above *ctx_update_elem would
+ * have thrown an error. It is safe to skip error check.
+ */
+ psock = smap_psock_sk(sock);
raw_spin_lock_bh(&b->lock);
l_old = lookup_elem_raw(head, hash, key, key_size);
if (l_old && map_flags == BPF_NOEXIST) {
@@ -2248,15 +2375,12 @@ static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops,
goto bucket_err;
}
- psock = smap_psock_sk(sock);
- if (unlikely(!psock)) {
- err = -EINVAL;
- goto bucket_err;
- }
-
- e->hash_link = l_new;
- e->htab = container_of(map, struct bpf_htab, map);
+ rcu_assign_pointer(e->hash_link, l_new);
+ rcu_assign_pointer(e->htab,
+ container_of(map, struct bpf_htab, map));
+ spin_lock_bh(&psock->maps_lock);
list_add_tail(&e->list, &psock->maps);
+ spin_unlock_bh(&psock->maps_lock);
/* add new element to the head of the list, so that
* concurrent search will find it before old elem
@@ -2266,19 +2390,17 @@ static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops,
psock = smap_psock_sk(l_old->sk);
hlist_del_rcu(&l_old->hash_node);
- smap_list_remove(psock, NULL, l_old);
+ smap_list_hash_remove(psock, l_old);
smap_release_sock(psock, l_old->sk);
free_htab_elem(htab, l_old);
}
raw_spin_unlock_bh(&b->lock);
return 0;
bucket_err:
+ smap_release_sock(psock, sock);
raw_spin_unlock_bh(&b->lock);
err:
kfree(e);
- psock = smap_psock_sk(sock);
- if (psock)
- smap_release_sock(psock, sock);
return err;
}
@@ -2300,7 +2422,13 @@ static int sock_hash_update_elem(struct bpf_map *map,
return -EINVAL;
}
+ lock_sock(skops.sk);
+ preempt_disable();
+ rcu_read_lock();
err = sock_hash_ctx_update_elem(&skops, map, key, flags);
+ rcu_read_unlock();
+ preempt_enable();
+ release_sock(skops.sk);
fput(socket->file);
return err;
}
@@ -2326,7 +2454,6 @@ static int sock_hash_delete_elem(struct bpf_map *map, void *key)
struct smap_psock *psock;
hlist_del_rcu(&l->hash_node);
- write_lock_bh(&sock->sk_callback_lock);
psock = smap_psock_sk(sock);
/* This check handles a racing sock event that can get the
* sk_callback_lock before this case but after xchg happens
@@ -2334,10 +2461,9 @@ static int sock_hash_delete_elem(struct bpf_map *map, void *key)
* to be null and queued for garbage collection.
*/
if (likely(psock)) {
- smap_list_remove(psock, NULL, l);
+ smap_list_hash_remove(psock, l);
smap_release_sock(psock, sock);
}
- write_unlock_bh(&sock->sk_callback_lock);
free_htab_elem(htab, l);
ret = 0;
}
@@ -2359,10 +2485,8 @@ struct sock *__sock_hash_lookup_elem(struct bpf_map *map, void *key)
b = __select_bucket(htab, hash);
head = &b->head;
- raw_spin_lock_bh(&b->lock);
l = lookup_elem_raw(head, hash, key, key_size);
sk = l ? l->sk : NULL;
- raw_spin_unlock_bh(&b->lock);
return sk;
}
@@ -2383,6 +2507,7 @@ const struct bpf_map_ops sock_hash_ops = {
.map_get_next_key = sock_hash_get_next_key,
.map_update_elem = sock_hash_update_elem,
.map_delete_elem = sock_hash_delete_elem,
+ .map_release_uref = sock_map_release,
};
BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock,
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 35dc466641f2..a31a1ba0f8ea 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -735,7 +735,9 @@ static int map_update_elem(union bpf_attr *attr)
if (bpf_map_is_dev_bound(map)) {
err = bpf_map_offload_update_elem(map, key, value, attr->flags);
goto out;
- } else if (map->map_type == BPF_MAP_TYPE_CPUMAP) {
+ } else if (map->map_type == BPF_MAP_TYPE_CPUMAP ||
+ map->map_type == BPF_MAP_TYPE_SOCKHASH ||
+ map->map_type == BPF_MAP_TYPE_SOCKMAP) {
err = map->ops->map_update_elem(map, key, value, attr->flags);
goto out;
}
@@ -1483,8 +1485,6 @@ out_free_tp:
return err;
}
-#ifdef CONFIG_CGROUP_BPF
-
static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
enum bpf_attach_type attach_type)
{
@@ -1499,40 +1499,6 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
#define BPF_PROG_ATTACH_LAST_FIELD attach_flags
-static int sockmap_get_from_fd(const union bpf_attr *attr,
- int type, bool attach)
-{
- struct bpf_prog *prog = NULL;
- int ufd = attr->target_fd;
- struct bpf_map *map;
- struct fd f;
- int err;
-
- f = fdget(ufd);
- map = __bpf_map_get(f);
- if (IS_ERR(map))
- return PTR_ERR(map);
-
- if (attach) {
- prog = bpf_prog_get_type(attr->attach_bpf_fd, type);
- if (IS_ERR(prog)) {
- fdput(f);
- return PTR_ERR(prog);
- }
- }
-
- err = sock_map_prog(map, prog, attr->attach_type);
- if (err) {
- fdput(f);
- if (prog)
- bpf_prog_put(prog);
- return err;
- }
-
- fdput(f);
- return 0;
-}
-
#define BPF_F_ATTACH_MASK \
(BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI)
@@ -1540,7 +1506,6 @@ static int bpf_prog_attach(const union bpf_attr *attr)
{
enum bpf_prog_type ptype;
struct bpf_prog *prog;
- struct cgroup *cgrp;
int ret;
if (!capable(CAP_NET_ADMIN))
@@ -1577,12 +1542,15 @@ static int bpf_prog_attach(const union bpf_attr *attr)
ptype = BPF_PROG_TYPE_CGROUP_DEVICE;
break;
case BPF_SK_MSG_VERDICT:
- return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_MSG, true);
+ ptype = BPF_PROG_TYPE_SK_MSG;
+ break;
case BPF_SK_SKB_STREAM_PARSER:
case BPF_SK_SKB_STREAM_VERDICT:
- return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, true);
+ ptype = BPF_PROG_TYPE_SK_SKB;
+ break;
case BPF_LIRC_MODE2:
- return lirc_prog_attach(attr);
+ ptype = BPF_PROG_TYPE_LIRC_MODE2;
+ break;
default:
return -EINVAL;
}
@@ -1596,18 +1564,20 @@ static int bpf_prog_attach(const union bpf_attr *attr)
return -EINVAL;
}
- cgrp = cgroup_get_from_fd(attr->target_fd);
- if (IS_ERR(cgrp)) {
- bpf_prog_put(prog);
- return PTR_ERR(cgrp);
+ switch (ptype) {
+ case BPF_PROG_TYPE_SK_SKB:
+ case BPF_PROG_TYPE_SK_MSG:
+ ret = sockmap_get_from_fd(attr, ptype, prog);
+ break;
+ case BPF_PROG_TYPE_LIRC_MODE2:
+ ret = lirc_prog_attach(attr, prog);
+ break;
+ default:
+ ret = cgroup_bpf_prog_attach(attr, ptype, prog);
}
- ret = cgroup_bpf_attach(cgrp, prog, attr->attach_type,
- attr->attach_flags);
if (ret)
bpf_prog_put(prog);
- cgroup_put(cgrp);
-
return ret;
}
@@ -1616,9 +1586,6 @@ static int bpf_prog_attach(const union bpf_attr *attr)
static int bpf_prog_detach(const union bpf_attr *attr)
{
enum bpf_prog_type ptype;
- struct bpf_prog *prog;
- struct cgroup *cgrp;
- int ret;
if (!capable(CAP_NET_ADMIN))
return -EPERM;
@@ -1651,29 +1618,17 @@ static int bpf_prog_detach(const union bpf_attr *attr)
ptype = BPF_PROG_TYPE_CGROUP_DEVICE;
break;
case BPF_SK_MSG_VERDICT:
- return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_MSG, false);
+ return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_MSG, NULL);
case BPF_SK_SKB_STREAM_PARSER:
case BPF_SK_SKB_STREAM_VERDICT:
- return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, false);
+ return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, NULL);
case BPF_LIRC_MODE2:
return lirc_prog_detach(attr);
default:
return -EINVAL;
}
- cgrp = cgroup_get_from_fd(attr->target_fd);
- if (IS_ERR(cgrp))
- return PTR_ERR(cgrp);
-
- prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
- if (IS_ERR(prog))
- prog = NULL;
-
- ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type, 0);
- if (prog)
- bpf_prog_put(prog);
- cgroup_put(cgrp);
- return ret;
+ return cgroup_bpf_prog_detach(attr, ptype);
}
#define BPF_PROG_QUERY_LAST_FIELD query.prog_cnt
@@ -1681,9 +1636,6 @@ static int bpf_prog_detach(const union bpf_attr *attr)
static int bpf_prog_query(const union bpf_attr *attr,
union bpf_attr __user *uattr)
{
- struct cgroup *cgrp;
- int ret;
-
if (!capable(CAP_NET_ADMIN))
return -EPERM;
if (CHECK_ATTR(BPF_PROG_QUERY))
@@ -1711,14 +1663,9 @@ static int bpf_prog_query(const union bpf_attr *attr,
default:
return -EINVAL;
}
- cgrp = cgroup_get_from_fd(attr->query.target_fd);
- if (IS_ERR(cgrp))
- return PTR_ERR(cgrp);
- ret = cgroup_bpf_query(cgrp, attr, uattr);
- cgroup_put(cgrp);
- return ret;
+
+ return cgroup_bpf_prog_query(attr, uattr);
}
-#endif /* CONFIG_CGROUP_BPF */
#define BPF_PROG_TEST_RUN_LAST_FIELD test.duration
@@ -2365,7 +2312,6 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
case BPF_OBJ_GET:
err = bpf_obj_get(&attr);
break;
-#ifdef CONFIG_CGROUP_BPF
case BPF_PROG_ATTACH:
err = bpf_prog_attach(&attr);
break;
@@ -2375,7 +2321,6 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
case BPF_PROG_QUERY:
err = bpf_prog_query(&attr, uattr);
break;
-#endif
case BPF_PROG_TEST_RUN:
err = bpf_prog_test_run(&attr, uattr);
break;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 9e2bf834f13a..63aaac52a265 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5430,6 +5430,10 @@ static int jit_subprogs(struct bpf_verifier_env *env)
if (insn->code != (BPF_JMP | BPF_CALL) ||
insn->src_reg != BPF_PSEUDO_CALL)
continue;
+ /* Upon error here we cannot fall back to interpreter but
+ * need a hard reject of the program. Thus -EFAULT is
+ * propagated in any case.
+ */
subprog = find_subprog(env, i + insn->imm + 1);
if (subprog < 0) {
WARN_ONCE(1, "verifier bug. No program starts at insn %d\n",
@@ -5450,7 +5454,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
func = kcalloc(env->subprog_cnt, sizeof(prog), GFP_KERNEL);
if (!func)
- return -ENOMEM;
+ goto out_undo_insn;
for (i = 0; i < env->subprog_cnt; i++) {
subprog_start = subprog_end;
@@ -5515,7 +5519,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
tmp = bpf_int_jit_compile(func[i]);
if (tmp != func[i] || func[i]->bpf_func != old_bpf_func) {
verbose(env, "JIT doesn't support bpf-to-bpf calls\n");
- err = -EFAULT;
+ err = -ENOTSUPP;
goto out_free;
}
cond_resched();
@@ -5552,6 +5556,7 @@ out_free:
if (func[i])
bpf_jit_free(func[i]);
kfree(func);
+out_undo_insn:
/* cleanup main prog to be interpreted */
prog->jit_requested = 0;
for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
@@ -5578,6 +5583,8 @@ static int fixup_call_args(struct bpf_verifier_env *env)
err = jit_subprogs(env);
if (err == 0)
return 0;
+ if (err == -EFAULT)
+ return err;
}
#ifndef CONFIG_BPF_JIT_ALWAYS_ON
for (i = 0; i < prog->len; i++, insn++) {
diff --git a/kernel/fork.c b/kernel/fork.c
index 9440d61b925c..1b27babc4c78 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -303,11 +303,36 @@ struct kmem_cache *files_cachep;
struct kmem_cache *fs_cachep;
/* SLAB cache for vm_area_struct structures */
-struct kmem_cache *vm_area_cachep;
+static struct kmem_cache *vm_area_cachep;
/* SLAB cache for mm_struct structures (tsk->mm) */
static struct kmem_cache *mm_cachep;
+struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
+{
+ struct vm_area_struct *vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
+
+ if (vma)
+ vma_init(vma, mm);
+ return vma;
+}
+
+struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
+{
+ struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
+
+ if (new) {
+ *new = *orig;
+ INIT_LIST_HEAD(&new->anon_vma_chain);
+ }
+ return new;
+}
+
+void vm_area_free(struct vm_area_struct *vma)
+{
+ kmem_cache_free(vm_area_cachep, vma);
+}
+
static void account_kernel_stack(struct task_struct *tsk, int account)
{
void *stack = task_stack_page(tsk);
@@ -455,11 +480,9 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
goto fail_nomem;
charge = len;
}
- tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
+ tmp = vm_area_dup(mpnt);
if (!tmp)
goto fail_nomem;
- *tmp = *mpnt;
- INIT_LIST_HEAD(&tmp->anon_vma_chain);
retval = vma_dup_policy(mpnt, tmp);
if (retval)
goto fail_nomem_policy;
@@ -539,7 +562,7 @@ fail_uprobe_end:
fail_nomem_anon_vma_fork:
mpol_put(vma_policy(tmp));
fail_nomem_policy:
- kmem_cache_free(vm_area_cachep, tmp);
+ vm_area_free(tmp);
fail_nomem:
retval = -ENOMEM;
vm_unacct_memory(charge);
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 481951bf091d..486dedbd9af5 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -177,9 +177,20 @@ void *kthread_probe_data(struct task_struct *task)
static void __kthread_parkme(struct kthread *self)
{
for (;;) {
- set_current_state(TASK_PARKED);
+ /*
+ * TASK_PARKED is a special state; we must serialize against
+ * possible pending wakeups to avoid store-store collisions on
+ * task->state.
+ *
+ * Such a collision might possibly result in the task state
+ * changing from TASK_PARKED and us failing the
+ * wait_task_inactive() in kthread_park().
+ */
+ set_special_state(TASK_PARKED);
if (!test_bit(KTHREAD_SHOULD_PARK, &self->flags))
break;
+
+ complete_all(&self->parked);
schedule();
}
__set_current_state(TASK_RUNNING);
@@ -191,11 +202,6 @@ void kthread_parkme(void)
}
EXPORT_SYMBOL_GPL(kthread_parkme);
-void kthread_park_complete(struct task_struct *k)
-{
- complete_all(&to_kthread(k)->parked);
-}
-
static int kthread(void *_create)
{
/* Copy data: it's on kthread's stack */
@@ -319,8 +325,14 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
task = create->result;
if (!IS_ERR(task)) {
static const struct sched_param param = { .sched_priority = 0 };
+ char name[TASK_COMM_LEN];
- vsnprintf(task->comm, sizeof(task->comm), namefmt, args);
+ /*
+ * task is already visible to other tasks, so updating
+ * COMM must be protected.
+ */
+ vsnprintf(name, sizeof(name), namefmt, args);
+ set_task_comm(task, name);
/*
* root may have changed our (kthreadd's) priority or CPU mask.
* The kernel thread should not inherit these properties.
@@ -461,6 +473,9 @@ void kthread_unpark(struct task_struct *k)
reinit_completion(&kthread->parked);
clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
+ /*
+ * __kthread_parkme() will either see !SHOULD_PARK or get the wakeup.
+ */
wake_up_state(k, TASK_PARKED);
}
EXPORT_SYMBOL_GPL(kthread_unpark);
@@ -487,7 +502,16 @@ int kthread_park(struct task_struct *k)
set_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
if (k != current) {
wake_up_process(k);
+ /*
+ * Wait for __kthread_parkme() to complete(), this means we
+ * _will_ have TASK_PARKED and are about to call schedule().
+ */
wait_for_completion(&kthread->parked);
+ /*
+ * Now wait for that schedule() to complete and the task to
+ * get scheduled out.
+ */
+ WARN_ON_ONCE(!wait_task_inactive(k, TASK_PARKED));
}
return 0;
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 5857267a4af5..38283363da06 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -176,10 +176,27 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
unsigned long pfn, pgoff, order;
pgprot_t pgprot = PAGE_KERNEL;
int error, nid, is_ram;
+ struct dev_pagemap *conflict_pgmap;
align_start = res->start & ~(SECTION_SIZE - 1);
align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE)
- align_start;
+ align_end = align_start + align_size - 1;
+
+ conflict_pgmap = get_dev_pagemap(PHYS_PFN(align_start), NULL);
+ if (conflict_pgmap) {
+ dev_WARN(dev, "Conflicting mapping in same section\n");
+ put_dev_pagemap(conflict_pgmap);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ conflict_pgmap = get_dev_pagemap(PHYS_PFN(align_end), NULL);
+ if (conflict_pgmap) {
+ dev_WARN(dev, "Conflicting mapping in same section\n");
+ put_dev_pagemap(conflict_pgmap);
+ return ERR_PTR(-ENOMEM);
+ }
+
is_ram = region_intersects(align_start, align_size,
IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE);
@@ -199,7 +216,6 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
mutex_lock(&pgmap_lock);
error = 0;
- align_end = align_start + align_size - 1;
foreach_order_pgoff(res, order, pgoff) {
error = __radix_tree_insert(&pgmap_radix,
@@ -305,7 +321,7 @@ EXPORT_SYMBOL_GPL(get_dev_pagemap);
#ifdef CONFIG_DEV_PAGEMAP_OPS
DEFINE_STATIC_KEY_FALSE(devmap_managed_key);
-EXPORT_SYMBOL_GPL(devmap_managed_key);
+EXPORT_SYMBOL(devmap_managed_key);
static atomic_t devmap_enable;
/*
@@ -346,5 +362,5 @@ void __put_devmap_managed_page(struct page *page)
} else if (!count)
__put_page(page);
}
-EXPORT_SYMBOL_GPL(__put_devmap_managed_page);
+EXPORT_SYMBOL(__put_devmap_managed_page);
#endif /* CONFIG_DEV_PAGEMAP_OPS */
diff --git a/kernel/rseq.c b/kernel/rseq.c
index 22b6acf1ad63..c6242d8594dc 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -85,9 +85,9 @@ static int rseq_update_cpu_id(struct task_struct *t)
{
u32 cpu_id = raw_smp_processor_id();
- if (__put_user(cpu_id, &t->rseq->cpu_id_start))
+ if (put_user(cpu_id, &t->rseq->cpu_id_start))
return -EFAULT;
- if (__put_user(cpu_id, &t->rseq->cpu_id))
+ if (put_user(cpu_id, &t->rseq->cpu_id))
return -EFAULT;
trace_rseq_update(t);
return 0;
@@ -100,14 +100,14 @@ static int rseq_reset_rseq_cpu_id(struct task_struct *t)
/*
* Reset cpu_id_start to its initial state (0).
*/
- if (__put_user(cpu_id_start, &t->rseq->cpu_id_start))
+ if (put_user(cpu_id_start, &t->rseq->cpu_id_start))
return -EFAULT;
/*
* Reset cpu_id to RSEQ_CPU_ID_UNINITIALIZED, so any user coming
* in after unregistration can figure out that rseq needs to be
* registered again.
*/
- if (__put_user(cpu_id, &t->rseq->cpu_id))
+ if (put_user(cpu_id, &t->rseq->cpu_id))
return -EFAULT;
return 0;
}
@@ -115,29 +115,36 @@ static int rseq_reset_rseq_cpu_id(struct task_struct *t)
static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs)
{
struct rseq_cs __user *urseq_cs;
- unsigned long ptr;
+ u64 ptr;
u32 __user *usig;
u32 sig;
int ret;
- ret = __get_user(ptr, &t->rseq->rseq_cs);
- if (ret)
- return ret;
+ if (copy_from_user(&ptr, &t->rseq->rseq_cs.ptr64, sizeof(ptr)))
+ return -EFAULT;
if (!ptr) {
memset(rseq_cs, 0, sizeof(*rseq_cs));
return 0;
}
- urseq_cs = (struct rseq_cs __user *)ptr;
+ if (ptr >= TASK_SIZE)
+ return -EINVAL;
+ urseq_cs = (struct rseq_cs __user *)(unsigned long)ptr;
if (copy_from_user(rseq_cs, urseq_cs, sizeof(*rseq_cs)))
return -EFAULT;
- if (rseq_cs->version > 0)
- return -EINVAL;
+ if (rseq_cs->start_ip >= TASK_SIZE ||
+ rseq_cs->start_ip + rseq_cs->post_commit_offset >= TASK_SIZE ||
+ rseq_cs->abort_ip >= TASK_SIZE ||
+ rseq_cs->version > 0)
+ return -EINVAL;
+ /* Check for overflow. */
+ if (rseq_cs->start_ip + rseq_cs->post_commit_offset < rseq_cs->start_ip)
+ return -EINVAL;
/* Ensure that abort_ip is not in the critical section. */
if (rseq_cs->abort_ip - rseq_cs->start_ip < rseq_cs->post_commit_offset)
return -EINVAL;
- usig = (u32 __user *)(rseq_cs->abort_ip - sizeof(u32));
+ usig = (u32 __user *)(unsigned long)(rseq_cs->abort_ip - sizeof(u32));
ret = get_user(sig, usig);
if (ret)
return ret;
@@ -146,7 +153,7 @@ static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs)
printk_ratelimited(KERN_WARNING
"Possible attack attempt. Unexpected rseq signature 0x%x, expecting 0x%x (pid=%d, addr=%p).\n",
sig, current->rseq_sig, current->pid, usig);
- return -EPERM;
+ return -EINVAL;
}
return 0;
}
@@ -157,7 +164,7 @@ static int rseq_need_restart(struct task_struct *t, u32 cs_flags)
int ret;
/* Get thread flags. */
- ret = __get_user(flags, &t->rseq->flags);
+ ret = get_user(flags, &t->rseq->flags);
if (ret)
return ret;
@@ -195,9 +202,11 @@ static int clear_rseq_cs(struct task_struct *t)
* of code outside of the rseq assembly block. This performs
* a lazy clear of the rseq_cs field.
*
- * Set rseq_cs to NULL with single-copy atomicity.
+ * Set rseq_cs to NULL.
*/
- return __put_user(0UL, &t->rseq->rseq_cs);
+ if (clear_user(&t->rseq->rseq_cs.ptr64, sizeof(t->rseq->rseq_cs.ptr64)))
+ return -EFAULT;
+ return 0;
}
/*
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 78d8facba456..fe365c9a08e9 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7,7 +7,6 @@
*/
#include "sched.h"
-#include <linux/kthread.h>
#include <linux/nospec.h>
#include <linux/kcov.h>
@@ -2724,28 +2723,20 @@ static struct rq *finish_task_switch(struct task_struct *prev)
membarrier_mm_sync_core_before_usermode(mm);
mmdrop(mm);
}
- if (unlikely(prev_state & (TASK_DEAD|TASK_PARKED))) {
- switch (prev_state) {
- case TASK_DEAD:
- if (prev->sched_class->task_dead)
- prev->sched_class->task_dead(prev);
+ if (unlikely(prev_state == TASK_DEAD)) {
+ if (prev->sched_class->task_dead)
+ prev->sched_class->task_dead(prev);
- /*
- * Remove function-return probe instances associated with this
- * task and put them back on the free list.
- */
- kprobe_flush_task(prev);
-
- /* Task is done with its stack. */
- put_task_stack(prev);
+ /*
+ * Remove function-return probe instances associated with this
+ * task and put them back on the free list.
+ */
+ kprobe_flush_task(prev);
- put_task_struct(prev);
- break;
+ /* Task is done with its stack. */
+ put_task_stack(prev);
- case TASK_PARKED:
- kthread_park_complete(prev);
- break;
- }
+ put_task_struct(prev);
}
tick_nohz_task_switch();
@@ -3113,7 +3104,9 @@ static void sched_tick_remote(struct work_struct *work)
struct tick_work *twork = container_of(dwork, struct tick_work, work);
int cpu = twork->cpu;
struct rq *rq = cpu_rq(cpu);
+ struct task_struct *curr;
struct rq_flags rf;
+ u64 delta;
/*
* Handle the tick only if it appears the remote CPU is running in full
@@ -3122,24 +3115,28 @@ static void sched_tick_remote(struct work_struct *work)
* statistics and checks timeslices in a time-independent way, regardless
* of when exactly it is running.
*/
- if (!idle_cpu(cpu) && tick_nohz_tick_stopped_cpu(cpu)) {
- struct task_struct *curr;
- u64 delta;
+ if (idle_cpu(cpu) || !tick_nohz_tick_stopped_cpu(cpu))
+ goto out_requeue;
- rq_lock_irq(rq, &rf);
- update_rq_clock(rq);
- curr = rq->curr;
- delta = rq_clock_task(rq) - curr->se.exec_start;
+ rq_lock_irq(rq, &rf);
+ curr = rq->curr;
+ if (is_idle_task(curr))
+ goto out_unlock;
- /*
- * Make sure the next tick runs within a reasonable
- * amount of time.
- */
- WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
- curr->sched_class->task_tick(rq, curr, 0);
- rq_unlock_irq(rq, &rf);
- }
+ update_rq_clock(rq);
+ delta = rq_clock_task(rq) - curr->se.exec_start;
+
+ /*
+ * Make sure the next tick runs within a reasonable
+ * amount of time.
+ */
+ WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
+ curr->sched_class->task_tick(rq, curr, 0);
+
+out_unlock:
+ rq_unlock_irq(rq, &rf);
+out_requeue:
/*
* Run the remote tick once per second (1Hz). This arbitrary
* frequency is large enough to avoid overload but short enough
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 3cde46483f0a..c907fde01eaa 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -192,7 +192,7 @@ static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu)
{
struct rq *rq = cpu_rq(sg_cpu->cpu);
- if (rq->rt.rt_nr_running)
+ if (rt_rq_is_runnable(&rq->rt))
return sg_cpu->max;
/*
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index fbfc3f1d368a..10c7b51c0d1f 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2290,8 +2290,17 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
if (task_on_rq_queued(p) && p->dl.dl_runtime)
task_non_contending(p);
- if (!task_on_rq_queued(p))
+ if (!task_on_rq_queued(p)) {
+ /*
+ * Inactive timer is armed. However, p is leaving DEADLINE and
+ * might migrate away from this rq while continuing to run on
+ * some other class. We need to remove its contribution from
+ * this rq running_bw now, or sub_rq_bw (below) will complain.
+ */
+ if (p->dl.dl_non_contending)
+ sub_running_bw(&p->dl, &rq->dl);
sub_rq_bw(&p->dl, &rq->dl);
+ }
/*
* We cannot use inactive_task_timer() to invoke sub_running_bw()
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 1866e64792a7..2f0a0be4d344 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3982,18 +3982,10 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
if (!sched_feat(UTIL_EST))
return;
- /*
- * Update root cfs_rq's estimated utilization
- *
- * If *p is the last task then the root cfs_rq's estimated utilization
- * of a CPU is 0 by definition.
- */
- ue.enqueued = 0;
- if (cfs_rq->nr_running) {
- ue.enqueued = cfs_rq->avg.util_est.enqueued;
- ue.enqueued -= min_t(unsigned int, ue.enqueued,
- (_task_util_est(p) | UTIL_AVG_UNCHANGED));
- }
+ /* Update root cfs_rq's estimated utilization */
+ ue.enqueued = cfs_rq->avg.util_est.enqueued;
+ ue.enqueued -= min_t(unsigned int, ue.enqueued,
+ (_task_util_est(p) | UTIL_AVG_UNCHANGED));
WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued);
/*
@@ -4590,6 +4582,7 @@ void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
now = sched_clock_cpu(smp_processor_id());
cfs_b->runtime = cfs_b->quota;
cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
+ cfs_b->expires_seq++;
}
static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
@@ -4612,6 +4605,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
struct task_group *tg = cfs_rq->tg;
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
u64 amount = 0, min_amount, expires;
+ int expires_seq;
/* note: this is a positive sum as runtime_remaining <= 0 */
min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
@@ -4628,6 +4622,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
cfs_b->idle = 0;
}
}
+ expires_seq = cfs_b->expires_seq;
expires = cfs_b->runtime_expires;
raw_spin_unlock(&cfs_b->lock);
@@ -4637,8 +4632,10 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
* spread between our sched_clock and the one on which runtime was
* issued.
*/
- if ((s64)(expires - cfs_rq->runtime_expires) > 0)
+ if (cfs_rq->expires_seq != expires_seq) {
+ cfs_rq->expires_seq = expires_seq;
cfs_rq->runtime_expires = expires;
+ }
return cfs_rq->runtime_remaining > 0;
}
@@ -4664,12 +4661,9 @@ static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
* has not truly expired.
*
* Fortunately we can check determine whether this the case by checking
- * whether the global deadline has advanced. It is valid to compare
- * cfs_b->runtime_expires without any locks since we only care about
- * exact equality, so a partial write will still work.
+ * whether the global deadline(cfs_b->expires_seq) has advanced.
*/
-
- if (cfs_rq->runtime_expires != cfs_b->runtime_expires) {
+ if (cfs_rq->expires_seq == cfs_b->expires_seq) {
/* extend local deadline, drift is bounded above by 2 ticks */
cfs_rq->runtime_expires += TICK_NSEC;
} else {
@@ -5202,13 +5196,18 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
{
+ u64 overrun;
+
lockdep_assert_held(&cfs_b->lock);
- if (!cfs_b->period_active) {
- cfs_b->period_active = 1;
- hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
- hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
- }
+ if (cfs_b->period_active)
+ return;
+
+ cfs_b->period_active = 1;
+ overrun = hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
+ cfs_b->runtime_expires += (overrun + 1) * ktime_to_ns(cfs_b->period);
+ cfs_b->expires_seq++;
+ hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
}
static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 47556b0c9a95..572567078b60 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -508,8 +508,11 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
rt_se = rt_rq->tg->rt_se[cpu];
- if (!rt_se)
+ if (!rt_se) {
dequeue_top_rt_rq(rt_rq);
+ /* Kick cpufreq (see the comment in kernel/sched/sched.h). */
+ cpufreq_update_util(rq_of_rt_rq(rt_rq), 0);
+ }
else if (on_rt_rq(rt_se))
dequeue_rt_entity(rt_se, 0);
}
@@ -1001,8 +1004,6 @@ dequeue_top_rt_rq(struct rt_rq *rt_rq)
sub_nr_running(rq, rt_rq->rt_nr_running);
rt_rq->rt_queued = 0;
- /* Kick cpufreq (see the comment in kernel/sched/sched.h). */
- cpufreq_update_util(rq, 0);
}
static void
@@ -1014,11 +1015,14 @@ enqueue_top_rt_rq(struct rt_rq *rt_rq)
if (rt_rq->rt_queued)
return;
- if (rt_rq_throttled(rt_rq) || !rt_rq->rt_nr_running)
+
+ if (rt_rq_throttled(rt_rq))
return;
- add_nr_running(rq, rt_rq->rt_nr_running);
- rt_rq->rt_queued = 1;
+ if (rt_rq->rt_nr_running) {
+ add_nr_running(rq, rt_rq->rt_nr_running);
+ rt_rq->rt_queued = 1;
+ }
/* Kick cpufreq (see the comment in kernel/sched/sched.h). */
cpufreq_update_util(rq, 0);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 6601baf2361c..c7742dcc136c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -334,9 +334,10 @@ struct cfs_bandwidth {
u64 runtime;
s64 hierarchical_quota;
u64 runtime_expires;
+ int expires_seq;
- int idle;
- int period_active;
+ short idle;
+ short period_active;
struct hrtimer period_timer;
struct hrtimer slack_timer;
struct list_head throttled_cfs_rq;
@@ -551,6 +552,7 @@ struct cfs_rq {
#ifdef CONFIG_CFS_BANDWIDTH
int runtime_enabled;
+ int expires_seq;
u64 runtime_expires;
s64 runtime_remaining;
@@ -609,6 +611,11 @@ struct rt_rq {
#endif
};
+static inline bool rt_rq_is_runnable(struct rt_rq *rt_rq)
+{
+ return rt_rq->rt_queued && rt_rq->rt_nr_running;
+}
+
/* Deadline class' related fields in a runqueue */
struct dl_rq {
/* runqueue is an rbtree, ordered by deadline */
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 900dcfee542c..75ffc1d1a2e0 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -79,12 +79,16 @@ static void wakeup_softirqd(void)
/*
* If ksoftirqd is scheduled, we do not want to process pending softirqs
- * right now. Let ksoftirqd handle this at its own rate, to get fairness.
+ * right now. Let ksoftirqd handle this at its own rate, to get fairness,
+ * unless we're doing some of the synchronous softirqs.
*/
-static bool ksoftirqd_running(void)
+#define SOFTIRQ_NOW_MASK ((1 << HI_SOFTIRQ) | (1 << TASKLET_SOFTIRQ))
+static bool ksoftirqd_running(unsigned long pending)
{
struct task_struct *tsk = __this_cpu_read(ksoftirqd);
+ if (pending & SOFTIRQ_NOW_MASK)
+ return false;
return tsk && (tsk->state == TASK_RUNNING);
}
@@ -328,7 +332,7 @@ asmlinkage __visible void do_softirq(void)
pending = local_softirq_pending();
- if (pending && !ksoftirqd_running())
+ if (pending && !ksoftirqd_running(pending))
do_softirq_own_stack();
local_irq_restore(flags);
@@ -355,7 +359,7 @@ void irq_enter(void)
static inline void invoke_softirq(void)
{
- if (ksoftirqd_running())
+ if (ksoftirqd_running(local_softirq_pending()))
return;
if (!force_irqthreads) {
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index f89014a2c238..1ff523dae6e2 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -270,7 +270,11 @@ unlock:
goto retry;
}
- wake_up_q(&wakeq);
+ if (!err) {
+ preempt_disable();
+ wake_up_q(&wakeq);
+ preempt_enable();
+ }
return err;
}
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index b7005dd21ec1..14de3727b18e 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -277,8 +277,7 @@ static bool tick_check_preferred(struct clock_event_device *curdev,
*/
return !curdev ||
newdev->rating > curdev->rating ||
- (!cpumask_equal(curdev->cpumask, newdev->cpumask) &&
- !tick_check_percpu(curdev, newdev, smp_processor_id()));
+ !cpumask_equal(curdev->cpumask, newdev->cpumask);
}
/*
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index efed9c1cfb7e..caf9cbf35816 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -192,17 +192,6 @@ static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip,
op->saved_func(ip, parent_ip, op, regs);
}
-/**
- * clear_ftrace_function - reset the ftrace function
- *
- * This NULLs the ftrace function and in essence stops
- * tracing. There may be lag
- */
-void clear_ftrace_function(void)
-{
- ftrace_trace_function = ftrace_stub;
-}
-
static void ftrace_sync(struct work_struct *work)
{
/*
@@ -6689,7 +6678,7 @@ void ftrace_kill(void)
{
ftrace_disabled = 1;
ftrace_enabled = 0;
- clear_ftrace_function();
+ ftrace_trace_function = ftrace_stub;
}
/**
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 6a46af21765c..0b0b688ea166 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -3227,6 +3227,22 @@ int ring_buffer_record_is_on(struct ring_buffer *buffer)
}
/**
+ * ring_buffer_record_is_set_on - return true if the ring buffer is set writable
+ * @buffer: The ring buffer to see if write is set enabled
+ *
+ * Returns true if the ring buffer is set writable by ring_buffer_record_on().
+ * Note that this does NOT mean it is in a writable state.
+ *
+ * It may return true when the ring buffer has been disabled by
+ * ring_buffer_record_disable(), as that is a temporary disabling of
+ * the ring buffer.
+ */
+int ring_buffer_record_is_set_on(struct ring_buffer *buffer)
+{
+ return !(atomic_read(&buffer->record_disabled) & RB_BUFFER_OFF);
+}
+
+/**
* ring_buffer_record_disable_cpu - stop all writes into the cpu_buffer
* @buffer: The ring buffer to stop writes to.
* @cpu: The CPU buffer to stop
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index a0079b4c7a49..823687997b01 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1373,6 +1373,12 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
arch_spin_lock(&tr->max_lock);
+ /* Inherit the recordable setting from trace_buffer */
+ if (ring_buffer_record_is_set_on(tr->trace_buffer.buffer))
+ ring_buffer_record_on(tr->max_buffer.buffer);
+ else
+ ring_buffer_record_off(tr->max_buffer.buffer);
+
swap(tr->trace_buffer.buffer, tr->max_buffer.buffer);
__update_max_tr(tr, tsk, cpu);
@@ -2953,6 +2959,7 @@ out_nobuffer:
}
EXPORT_SYMBOL_GPL(trace_vbprintk);
+__printf(3, 0)
static int
__trace_array_vprintk(struct ring_buffer *buffer,
unsigned long ip, const char *fmt, va_list args)
@@ -3007,12 +3014,14 @@ out_nobuffer:
return len;
}
+__printf(3, 0)
int trace_array_vprintk(struct trace_array *tr,
unsigned long ip, const char *fmt, va_list args)
{
return __trace_array_vprintk(tr->trace_buffer.buffer, ip, fmt, args);
}
+__printf(3, 0)
int trace_array_printk(struct trace_array *tr,
unsigned long ip, const char *fmt, ...)
{
@@ -3028,6 +3037,7 @@ int trace_array_printk(struct trace_array *tr,
return ret;
}
+__printf(3, 4)
int trace_array_printk_buf(struct ring_buffer *buffer,
unsigned long ip, const char *fmt, ...)
{
@@ -3043,6 +3053,7 @@ int trace_array_printk_buf(struct ring_buffer *buffer,
return ret;
}
+__printf(2, 0)
int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
{
return trace_array_vprintk(&global_trace, ip, fmt, args);
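The __printf() annotations added in this hunk let the compiler check format strings against their arguments; the second argument is 0 for the va_list variants because there is no variadic argument list to verify. A small sketch, assuming the usual definition of __printf() in terms of the printf format attribute (the function names are placeholders):

    #define __printf(a, b)	__attribute__((format(printf, a, b)))

    /* Format string is parameter 3, variadic arguments start at parameter 4. */
    __printf(3, 4)
    int buf_printf(struct ring_buffer *buffer, unsigned long ip, const char *fmt, ...);

    /* va_list variant: format is parameter 3, 0 means "no variadic args to check". */
    __printf(3, 0)
    int buf_vprintk(struct ring_buffer *buffer, unsigned long ip, const char *fmt, va_list args);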
@@ -3360,8 +3371,8 @@ static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m,
print_event_info(buf, m);
- seq_printf(m, "# TASK-PID CPU# %s TIMESTAMP FUNCTION\n", tgid ? "TGID " : "");
- seq_printf(m, "# | | | %s | |\n", tgid ? " | " : "");
+ seq_printf(m, "# TASK-PID %s CPU# TIMESTAMP FUNCTION\n", tgid ? "TGID " : "");
+ seq_printf(m, "# | | %s | | |\n", tgid ? " | " : "");
}
static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m,
@@ -3381,9 +3392,9 @@ static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file
tgid ? tgid_space : space);
seq_printf(m, "# %s||| / delay\n",
tgid ? tgid_space : space);
- seq_printf(m, "# TASK-PID CPU#%s|||| TIMESTAMP FUNCTION\n",
+ seq_printf(m, "# TASK-PID %sCPU# |||| TIMESTAMP FUNCTION\n",
tgid ? " TGID " : space);
- seq_printf(m, "# | | | %s|||| | |\n",
+ seq_printf(m, "# | | %s | |||| | |\n",
tgid ? " | " : space);
}
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 630c5a24b2b2..f8f86231ad90 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -583,9 +583,7 @@ static __always_inline void trace_clear_recursion(int bit)
static inline struct ring_buffer_iter *
trace_buffer_iter(struct trace_iterator *iter, int cpu)
{
- if (iter->buffer_iter && iter->buffer_iter[cpu])
- return iter->buffer_iter[cpu];
- return NULL;
+ return iter->buffer_iter ? iter->buffer_iter[cpu] : NULL;
}
int tracer_init(struct tracer *t, struct trace_array *tr);
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 0dceb77d1d42..893a206bcba4 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -1701,6 +1701,7 @@ static void create_filter_finish(struct filter_parse_error *pe)
* @filter_str: filter string
* @set_str: remember @filter_str and enable detailed error in filter
* @filterp: out param for created filter (always updated on return)
+ * Must be a pointer that references a NULL pointer.
*
* Creates a filter for @call with @filter_str. If @set_str is %true,
* @filter_str is copied and recorded in the new filter.
@@ -1718,6 +1719,10 @@ static int create_filter(struct trace_event_call *call,
struct filter_parse_error *pe = NULL;
int err;
+ /* filterp must point to NULL */
+ if (WARN_ON(*filterp))
+ *filterp = NULL;
+
err = create_filter_start(filter_string, set_str, &pe, filterp);
if (err)
return err;
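The added WARN_ON() enforces the documented contract that *filterp starts out NULL, since create_filter() always writes a result (or error details) through it. A hypothetical caller honoring that contract; free_event_filter() is assumed to be the matching disposal helper:

    struct event_filter *filter = NULL;	/* must start out NULL */
    int err;

    err = create_filter(call, filter_str, true, &filter);
    if (err)
    	/* on failure the filter may still carry the detailed parse error */
    	free_event_filter(filter);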
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 046c716a6536..aae18af94c94 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -393,7 +393,7 @@ static void hist_err_event(char *str, char *system, char *event, char *var)
else if (system)
snprintf(err, MAX_FILTER_STR_VAL, "%s.%s", system, event);
else
- strncpy(err, var, MAX_FILTER_STR_VAL);
+ strscpy(err, var, MAX_FILTER_STR_VAL);
hist_err(str, err);
}
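strscpy() always NUL-terminates the destination (for a non-zero buffer size) and returns -E2BIG on truncation, whereas strncpy() leaves the destination unterminated when the source does not fit, which is what the hunk above guards against. A small illustration with an arbitrary 8-byte buffer:

    char err[8];

    strncpy(err, "0123456789", sizeof(err));	/* err is NOT NUL-terminated        */
    strscpy(err, "0123456789", sizeof(err));	/* err = "0123456", returns -E2BIG  */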
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index d18249683682..5dea177cef53 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -679,6 +679,8 @@ event_trigger_callback(struct event_command *cmd_ops,
goto out_free;
out_reg:
+ /* Up the trigger_data count to make sure reg doesn't free it on failure */
+ event_trigger_init(trigger_ops, trigger_data);
ret = cmd_ops->reg(glob, trigger_ops, trigger_data, file);
/*
* The above returns on success the # of functions enabled,
@@ -686,11 +688,13 @@ event_trigger_callback(struct event_command *cmd_ops,
* Consider no functions a failure too.
*/
if (!ret) {
+ cmd_ops->unreg(glob, trigger_ops, trigger_data, file);
ret = -ENOENT;
- goto out_free;
- } else if (ret < 0)
- goto out_free;
- ret = 0;
+ } else if (ret > 0)
+ ret = 0;
+
+ /* Drop the trigger_data reference; it is freed if no longer in use */
+ event_trigger_free(trigger_ops, trigger_data);
out:
return ret;
@@ -1416,6 +1420,9 @@ int event_enable_trigger_func(struct event_command *cmd_ops,
goto out;
}
+ /* Up the trigger_data count to make sure nothing frees it on failure */
+ event_trigger_init(trigger_ops, trigger_data);
+
if (trigger) {
number = strsep(&trigger, ":");
@@ -1466,6 +1473,7 @@ int event_enable_trigger_func(struct event_command *cmd_ops,
goto out_disable;
/* Just return zero, not the number of enabled functions */
ret = 0;
+ event_trigger_free(trigger_ops, trigger_data);
out:
return ret;
@@ -1476,7 +1484,7 @@ int event_enable_trigger_func(struct event_command *cmd_ops,
out_free:
if (cmd_ops->set_filter)
cmd_ops->set_filter(NULL, trigger_data, NULL);
- kfree(trigger_data);
+ event_trigger_free(trigger_ops, trigger_data);
kfree(enable_data);
goto out;
}
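Both call sites above take an extra reference on trigger_data before invoking an operation that may free it on failure, and drop that reference once the outcome is known. A generic sketch of this pin-across-a-risky-call pattern; get()/put()/register_thing()/unregister_thing() are placeholder names, not kernel APIs:

    get(data);			/* pin: register_thing() may drop its own ref on failure */
    ret = register_thing(data);
    if (!ret) {
    	unregister_thing(data);	/* registered nothing: undo and report failure */
    	ret = -ENOENT;
    } else if (ret > 0)
    	ret = 0;		/* positive count means success */
    put(data);			/* unpin; frees data if nothing else holds it */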
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 23c0b0cb5fb9..169b3c44ee97 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -831,6 +831,7 @@ print_graph_entry_leaf(struct trace_iterator *iter,
struct ftrace_graph_ret *graph_ret;
struct ftrace_graph_ent *call;
unsigned long long duration;
+ int cpu = iter->cpu;
int i;
graph_ret = &ret_entry->ret;
@@ -839,7 +840,6 @@ print_graph_entry_leaf(struct trace_iterator *iter,
if (data) {
struct fgraph_cpu_data *cpu_data;
- int cpu = iter->cpu;
cpu_data = per_cpu_ptr(data->cpu_data, cpu);
@@ -869,6 +869,9 @@ print_graph_entry_leaf(struct trace_iterator *iter,
trace_seq_printf(s, "%ps();\n", (void *)call->func);
+ print_graph_irq(iter, graph_ret->func, TRACE_GRAPH_RET,
+ cpu, iter->ent->pid, flags);
+
return trace_handle_return(s);
}
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index daa81571b22a..6b71860f3998 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -400,11 +400,10 @@ static struct trace_kprobe *find_trace_kprobe(const char *event,
static int
enable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file)
{
+ struct event_file_link *link = NULL;
int ret = 0;
if (file) {
- struct event_file_link *link;
-
link = kmalloc(sizeof(*link), GFP_KERNEL);
if (!link) {
ret = -ENOMEM;
@@ -424,6 +423,18 @@ enable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file)
else
ret = enable_kprobe(&tk->rp.kp);
}
+
+ if (ret) {
+ if (file) {
+ /* list_del_rcu() only runs if WARN_ON_ONCE() did not trigger, i.e. link is non-NULL */
+ if (!WARN_ON_ONCE(!link))
+ list_del_rcu(&link->list);
+ kfree(link);
+ tk->tp.flags &= ~TP_FLAG_TRACE;
+ } else {
+ tk->tp.flags &= ~TP_FLAG_PROFILE;
+ }
+ }
out:
return ret;
}
@@ -1480,8 +1491,10 @@ create_local_trace_kprobe(char *func, void *addr, unsigned long offs,
}
ret = __register_trace_kprobe(tk);
- if (ret < 0)
+ if (ret < 0) {
+ kfree(tk->tp.call.print_fmt);
goto error;
+ }
return &tk->tp.call;
error:
@@ -1501,6 +1514,8 @@ void destroy_local_trace_kprobe(struct trace_event_call *event_call)
}
__unregister_trace_kprobe(tk);
+
+ kfree(tk->tp.call.print_fmt);
free_trace_kprobe(tk);
}
#endif /* CONFIG_PERF_EVENTS */
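The two hunks above plug a leak of the print_fmt buffer allocated for a locally created (perf) kprobe event: it is now freed both when registration fails and when the event is destroyed. A generic sketch of the rule being applied; alloc_print_fmt() and register_event() are placeholder names:

    call->print_fmt = alloc_print_fmt();	/* allocated solely for this event */
    if (!call->print_fmt)
    	return -ENOMEM;

    ret = register_event(call);
    if (ret < 0) {
    	kfree(call->print_fmt);		/* release it on the failure path too */
    	goto error;
    }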
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 90db994ac900..1c8e30fda46a 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -594,8 +594,7 @@ int trace_print_context(struct trace_iterator *iter)
trace_find_cmdline(entry->pid, comm);
- trace_seq_printf(s, "%16s-%-5d [%03d] ",
- comm, entry->pid, iter->cpu);
+ trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid);
if (tr->trace_flags & TRACE_ITER_RECORD_TGID) {
unsigned int tgid = trace_find_tgid(entry->pid);
@@ -606,6 +605,8 @@ int trace_print_context(struct trace_iterator *iter)
trace_seq_printf(s, "(%5d) ", tgid);
}
+ trace_seq_printf(s, "[%03d] ", iter->cpu);
+
if (tr->trace_flags & TRACE_ITER_IRQ_INFO)
trace_print_lat_fmt(s, entry);