diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/bpf/core.c | 4 | ||||
-rw-r--r-- | kernel/bpf/hashtab.c | 64 | ||||
-rw-r--r-- | kernel/bpf/inode.c | 20 | ||||
-rw-r--r-- | kernel/bpf/syscall.c | 22 | ||||
-rw-r--r-- | kernel/bpf/verifier.c | 10 | ||||
-rw-r--r-- | kernel/cgroup.c | 126 |
6 files changed, 187 insertions, 59 deletions
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 334b1bdd572c..972d9a8e4ac4 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -306,10 +306,6 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn) FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; ARG1 = (u64) (unsigned long) ctx; - /* Registers used in classic BPF programs need to be reset first. */ - regs[BPF_REG_A] = 0; - regs[BPF_REG_X] = 0; - select_insn: goto *jumptable[insn->code]; diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 34777b3746fa..c5b30fd8a315 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -14,11 +14,15 @@ #include <linux/filter.h> #include <linux/vmalloc.h> +struct bucket { + struct hlist_head head; + raw_spinlock_t lock; +}; + struct bpf_htab { struct bpf_map map; - struct hlist_head *buckets; - raw_spinlock_t lock; - u32 count; /* number of elements in this hashtable */ + struct bucket *buckets; + atomic_t count; /* number of elements in this hashtable */ u32 n_buckets; /* number of hash buckets */ u32 elem_size; /* size of each element in bytes */ }; @@ -79,34 +83,35 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) /* prevent zero size kmalloc and check for u32 overflow */ if (htab->n_buckets == 0 || - htab->n_buckets > U32_MAX / sizeof(struct hlist_head)) + htab->n_buckets > U32_MAX / sizeof(struct bucket)) goto free_htab; - if ((u64) htab->n_buckets * sizeof(struct hlist_head) + + if ((u64) htab->n_buckets * sizeof(struct bucket) + (u64) htab->elem_size * htab->map.max_entries >= U32_MAX - PAGE_SIZE) /* make sure page count doesn't overflow */ goto free_htab; - htab->map.pages = round_up(htab->n_buckets * sizeof(struct hlist_head) + + htab->map.pages = round_up(htab->n_buckets * sizeof(struct bucket) + htab->elem_size * htab->map.max_entries, PAGE_SIZE) >> PAGE_SHIFT; err = -ENOMEM; - htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct hlist_head), + htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct bucket), GFP_USER | __GFP_NOWARN); if (!htab->buckets) { - htab->buckets = vmalloc(htab->n_buckets * sizeof(struct hlist_head)); + htab->buckets = vmalloc(htab->n_buckets * sizeof(struct bucket)); if (!htab->buckets) goto free_htab; } - for (i = 0; i < htab->n_buckets; i++) - INIT_HLIST_HEAD(&htab->buckets[i]); + for (i = 0; i < htab->n_buckets; i++) { + INIT_HLIST_HEAD(&htab->buckets[i].head); + raw_spin_lock_init(&htab->buckets[i].lock); + } - raw_spin_lock_init(&htab->lock); - htab->count = 0; + atomic_set(&htab->count, 0); return &htab->map; @@ -120,11 +125,16 @@ static inline u32 htab_map_hash(const void *key, u32 key_len) return jhash(key, key_len, 0); } -static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash) +static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash) { return &htab->buckets[hash & (htab->n_buckets - 1)]; } +static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash) +{ + return &__select_bucket(htab, hash)->head; +} + static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash, void *key, u32 key_size) { @@ -227,6 +237,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct htab_elem *l_new, *l_old; struct hlist_head *head; + struct bucket *b; unsigned long flags; u32 key_size; int ret; @@ -248,15 +259,15 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, memcpy(l_new->key + round_up(key_size, 8), value, map->value_size); l_new->hash = htab_map_hash(l_new->key, key_size); + b = __select_bucket(htab, l_new->hash); + head = &b->head; /* bpf_map_update_elem() can be called in_irq() */ - raw_spin_lock_irqsave(&htab->lock, flags); - - head = select_bucket(htab, l_new->hash); + raw_spin_lock_irqsave(&b->lock, flags); l_old = lookup_elem_raw(head, l_new->hash, key, key_size); - if (!l_old && unlikely(htab->count >= map->max_entries)) { + if (!l_old && unlikely(atomic_read(&htab->count) >= map->max_entries)) { /* if elem with this 'key' doesn't exist and we've reached * max_entries limit, fail insertion of new elem */ @@ -284,13 +295,13 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, hlist_del_rcu(&l_old->hash_node); kfree_rcu(l_old, rcu); } else { - htab->count++; + atomic_inc(&htab->count); } - raw_spin_unlock_irqrestore(&htab->lock, flags); + raw_spin_unlock_irqrestore(&b->lock, flags); return 0; err: - raw_spin_unlock_irqrestore(&htab->lock, flags); + raw_spin_unlock_irqrestore(&b->lock, flags); kfree(l_new); return ret; } @@ -300,6 +311,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct hlist_head *head; + struct bucket *b; struct htab_elem *l; unsigned long flags; u32 hash, key_size; @@ -310,21 +322,21 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key) key_size = map->key_size; hash = htab_map_hash(key, key_size); + b = __select_bucket(htab, hash); + head = &b->head; - raw_spin_lock_irqsave(&htab->lock, flags); - - head = select_bucket(htab, hash); + raw_spin_lock_irqsave(&b->lock, flags); l = lookup_elem_raw(head, hash, key, key_size); if (l) { hlist_del_rcu(&l->hash_node); - htab->count--; + atomic_dec(&htab->count); kfree_rcu(l, rcu); ret = 0; } - raw_spin_unlock_irqrestore(&htab->lock, flags); + raw_spin_unlock_irqrestore(&b->lock, flags); return ret; } @@ -339,7 +351,7 @@ static void delete_all_elements(struct bpf_htab *htab) hlist_for_each_entry_safe(l, n, head, hash_node) { hlist_del_rcu(&l->hash_node); - htab->count--; + atomic_dec(&htab->count); kfree(l); } } diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 5a8a797d50b7..f2ece3c174a5 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -187,11 +187,31 @@ static int bpf_mkobj(struct inode *dir, struct dentry *dentry, umode_t mode, } } +static int bpf_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *new_dentry) +{ + if (bpf_dname_reserved(new_dentry)) + return -EPERM; + + return simple_link(old_dentry, dir, new_dentry); +} + +static int bpf_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + if (bpf_dname_reserved(new_dentry)) + return -EPERM; + + return simple_rename(old_dir, old_dentry, new_dir, new_dentry); +} + static const struct inode_operations bpf_dir_iops = { .lookup = simple_lookup, .mknod = bpf_mkobj, .mkdir = bpf_mkdir, .rmdir = simple_rmdir, + .rename = bpf_rename, + .link = bpf_link, .unlink = simple_unlink, }; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 3b39550d8485..637397059f76 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -113,8 +113,28 @@ static int bpf_map_release(struct inode *inode, struct file *filp) return 0; } +#ifdef CONFIG_PROC_FS +static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) +{ + const struct bpf_map *map = filp->private_data; + + seq_printf(m, + "map_type:\t%u\n" + "key_size:\t%u\n" + "value_size:\t%u\n" + "max_entries:\t%u\n", + map->map_type, + map->key_size, + map->value_size, + map->max_entries); +} +#endif + static const struct file_operations bpf_map_fops = { - .release = bpf_map_release, +#ifdef CONFIG_PROC_FS + .show_fdinfo = bpf_map_show_fdinfo, +#endif + .release = bpf_map_release, }; int bpf_map_new_fd(struct bpf_map *map) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a7945d10b378..d1d3e8f57de9 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1121,6 +1121,16 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn) return -EINVAL; } + if ((opcode == BPF_LSH || opcode == BPF_RSH || + opcode == BPF_ARSH) && BPF_SRC(insn->code) == BPF_K) { + int size = BPF_CLASS(insn->code) == BPF_ALU64 ? 64 : 32; + + if (insn->imm < 0 || insn->imm >= size) { + verbose("invalid shift %d\n", insn->imm); + return -EINVAL; + } + } + /* pattern match 'bpf_add Rx, imm' instruction */ if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 && regs[insn->dst_reg].type == FRAME_PTR && diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 470f6536b9e8..fe95970b1f79 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -57,8 +57,8 @@ #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ #include <linux/kthread.h> #include <linux/delay.h> - #include <linux/atomic.h> +#include <net/sock.h> /* * pidlists linger the following amount before being destroyed. The goal @@ -440,11 +440,6 @@ static bool cgroup_tryget(struct cgroup *cgrp) return css_tryget(&cgrp->self); } -static void cgroup_put(struct cgroup *cgrp) -{ - css_put(&cgrp->self); -} - struct cgroup_subsys_state *of_css(struct kernfs_open_file *of) { struct cgroup *cgrp = of->kn->parent->priv; @@ -465,25 +460,6 @@ struct cgroup_subsys_state *of_css(struct kernfs_open_file *of) } EXPORT_SYMBOL_GPL(of_css); -/** - * cgroup_is_descendant - test ancestry - * @cgrp: the cgroup to be tested - * @ancestor: possible ancestor of @cgrp - * - * Test whether @cgrp is a descendant of @ancestor. It also returns %true - * if @cgrp == @ancestor. This function is safe to call as long as @cgrp - * and @ancestor are accessible. - */ -bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor) -{ - while (cgrp) { - if (cgrp == ancestor) - return true; - cgrp = cgroup_parent(cgrp); - } - return false; -} - static int notify_on_release(const struct cgroup *cgrp) { return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); @@ -1924,6 +1900,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask) if (ret < 0) goto out; root_cgrp->id = ret; + root_cgrp->ancestor_ids[0] = ret; ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, 0, GFP_KERNEL); @@ -4903,11 +4880,11 @@ err_free_css: static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode) { - struct cgroup *parent, *cgrp; + struct cgroup *parent, *cgrp, *tcgrp; struct cgroup_root *root; struct cgroup_subsys *ss; struct kernfs_node *kn; - int ssid, ret; + int level, ssid, ret; /* Do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable. */ @@ -4918,9 +4895,11 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, if (!parent) return -ENODEV; root = parent->root; + level = parent->level + 1; /* allocate the cgroup and its ID, 0 is reserved for the root */ - cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); + cgrp = kzalloc(sizeof(*cgrp) + + sizeof(cgrp->ancestor_ids[0]) * (level + 1), GFP_KERNEL); if (!cgrp) { ret = -ENOMEM; goto out_unlock; @@ -4944,6 +4923,10 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, cgrp->self.parent = &parent->self; cgrp->root = root; + cgrp->level = level; + + for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) + cgrp->ancestor_ids[tcgrp->level] = tcgrp->id; if (notify_on_release(parent)) set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); @@ -5822,6 +5805,93 @@ struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) return id > 0 ? idr_find(&ss->css_idr, id) : NULL; } +/** + * cgroup_get_from_path - lookup and get a cgroup from its default hierarchy path + * @path: path on the default hierarchy + * + * Find the cgroup at @path on the default hierarchy, increment its + * reference count and return it. Returns pointer to the found cgroup on + * success, ERR_PTR(-ENOENT) if @path doens't exist and ERR_PTR(-ENOTDIR) + * if @path points to a non-directory. + */ +struct cgroup *cgroup_get_from_path(const char *path) +{ + struct kernfs_node *kn; + struct cgroup *cgrp; + + mutex_lock(&cgroup_mutex); + + kn = kernfs_walk_and_get(cgrp_dfl_root.cgrp.kn, path); + if (kn) { + if (kernfs_type(kn) == KERNFS_DIR) { + cgrp = kn->priv; + cgroup_get(cgrp); + } else { + cgrp = ERR_PTR(-ENOTDIR); + } + kernfs_put(kn); + } else { + cgrp = ERR_PTR(-ENOENT); + } + + mutex_unlock(&cgroup_mutex); + return cgrp; +} +EXPORT_SYMBOL_GPL(cgroup_get_from_path); + +/* + * sock->sk_cgrp_data handling. For more info, see sock_cgroup_data + * definition in cgroup-defs.h. + */ +#ifdef CONFIG_SOCK_CGROUP_DATA + +#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID) + +DEFINE_SPINLOCK(cgroup_sk_update_lock); +static bool cgroup_sk_alloc_disabled __read_mostly; + +void cgroup_sk_alloc_disable(void) +{ + if (cgroup_sk_alloc_disabled) + return; + pr_info("cgroup: disabling cgroup2 socket matching due to net_prio or net_cls activation\n"); + cgroup_sk_alloc_disabled = true; +} + +#else + +#define cgroup_sk_alloc_disabled false + +#endif + +void cgroup_sk_alloc(struct sock_cgroup_data *skcd) +{ + if (cgroup_sk_alloc_disabled) + return; + + rcu_read_lock(); + + while (true) { + struct css_set *cset; + + cset = task_css_set(current); + if (likely(cgroup_tryget(cset->dfl_cgrp))) { + skcd->val = (unsigned long)cset->dfl_cgrp; + break; + } + cpu_relax(); + } + + rcu_read_unlock(); +} + +void cgroup_sk_free(struct sock_cgroup_data *skcd) +{ + cgroup_put(sock_cgroup_ptr(skcd)); +} + +#endif /* CONFIG_SOCK_CGROUP_DATA */ + #ifdef CONFIG_CGROUP_DEBUG static struct cgroup_subsys_state * debug_css_alloc(struct cgroup_subsys_state *parent_css) |