Diffstat (limited to 'kernel')
-rw-r--r--  kernel/audit.c  61
-rw-r--r--  kernel/audit.h  29
-rw-r--r--  kernel/auditsc.c  12
-rw-r--r--  kernel/bpf/arraymap.c  55
-rw-r--r--  kernel/bpf/cgroup.c  37
-rw-r--r--  kernel/bpf/core.c  49
-rw-r--r--  kernel/bpf/hashtab.c  21
-rw-r--r--  kernel/bpf/map_in_map.c  5
-rw-r--r--  kernel/bpf/map_in_map.h  1
-rw-r--r--  kernel/bpf/syscall.c  510
-rw-r--r--  kernel/bpf/verifier.c  189
-rw-r--r--  kernel/cgroup/Makefile  1
-rw-r--r--  kernel/cgroup/cgroup-internal.h  2
-rw-r--r--  kernel/cgroup/cgroup-v1.c  155
-rw-r--r--  kernel/cgroup/cgroup.c  155
-rw-r--r--  kernel/cgroup/cpuset.c  33
-rw-r--r--  kernel/cgroup/debug.c  357
-rw-r--r--  kernel/compat.c  398
-rw-r--r--  kernel/cpu.c  37
-rw-r--r--  kernel/events/core.c  47
-rw-r--r--  kernel/exit.c  319
-rw-r--r--  kernel/extable.c  2
-rw-r--r--  kernel/fork.c  14
-rw-r--r--  kernel/irq/affinity.c  13
-rw-r--r--  kernel/irq/chip.c  2
-rw-r--r--  kernel/irq/internals.h  4
-rw-r--r--  kernel/irq/irqdesc.c  1
-rw-r--r--  kernel/irq/irqdomain.c  19
-rw-r--r--  kernel/irq/manage.c  62
-rw-r--r--  kernel/kexec_file.c  14
-rw-r--r--  kernel/kprobes.c  42
-rw-r--r--  kernel/locking/qrwlock.c  1
-rw-r--r--  kernel/locking/qspinlock.c  1
-rw-r--r--  kernel/locking/qspinlock_paravirt.h  3
-rw-r--r--  kernel/locking/rwsem-spinlock.c  4
-rw-r--r--  kernel/memremap.c  6
-rw-r--r--  kernel/module.c  10
-rw-r--r--  kernel/pid.c  7
-rw-r--r--  kernel/power/snapshot.c  4
-rw-r--r--  kernel/printk/internal.h  6
-rw-r--r--  kernel/printk/printk.c  19
-rw-r--r--  kernel/printk/printk_safe.c  36
-rw-r--r--  kernel/sched/cputime.c  180
-rw-r--r--  kernel/sched/fair.c  32
-rw-r--r--  kernel/seccomp.c  16
-rw-r--r--  kernel/signal.c  155
-rw-r--r--  kernel/sys.c  116
-rw-r--r--  kernel/time/alarmtimer.c  4
-rw-r--r--  kernel/time/hrtimer.c  30
-rw-r--r--  kernel/time/posix-cpu-timers.c  8
-rw-r--r--  kernel/time/posix-stubs.c  96
-rw-r--r--  kernel/time/posix-timers.c  127
-rw-r--r--  kernel/time/time.c  58
-rw-r--r--  kernel/trace/Kconfig  22
-rw-r--r--  kernel/trace/bpf_trace.c  66
-rw-r--r--  kernel/trace/ftrace.c  366
-rw-r--r--  kernel/trace/trace.c  361
-rw-r--r--  kernel/trace/trace.h  30
-rw-r--r--  kernel/trace/trace_events.c  66
-rw-r--r--  kernel/trace/trace_kprobe.c  2
-rw-r--r--  kernel/trace/trace_output.c  27
-rw-r--r--  kernel/trace/trace_sched_switch.c  72
62 files changed, 3083 insertions, 1494 deletions
diff --git a/kernel/audit.c b/kernel/audit.c
index 4b7d49868ce1..833267bbd80b 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -575,12 +575,16 @@ static void kauditd_retry_skb(struct sk_buff *skb)
/**
* auditd_reset - Disconnect the auditd connection
+ * @ac: auditd connection state
*
* Description:
* Break the auditd/kauditd connection and move all the queued records into the
- * hold queue in case auditd reconnects.
+ * hold queue in case auditd reconnects. It is important to note that the @ac
+ * pointer should never be dereferenced inside this function as it may be NULL
+ * or invalid; you can only compare the memory address! If @ac is NULL then
+ * the connection will always be reset.
*/
-static void auditd_reset(void)
+static void auditd_reset(const struct auditd_connection *ac)
{
unsigned long flags;
struct sk_buff *skb;
@@ -590,17 +594,21 @@ static void auditd_reset(void)
spin_lock_irqsave(&auditd_conn_lock, flags);
ac_old = rcu_dereference_protected(auditd_conn,
lockdep_is_held(&auditd_conn_lock));
+ if (ac && ac != ac_old) {
+ /* someone already registered a new auditd connection */
+ spin_unlock_irqrestore(&auditd_conn_lock, flags);
+ return;
+ }
rcu_assign_pointer(auditd_conn, NULL);
spin_unlock_irqrestore(&auditd_conn_lock, flags);
if (ac_old)
call_rcu(&ac_old->rcu, auditd_conn_free);
- /* flush all of the main and retry queues to the hold queue */
+ /* flush the retry queue to the hold queue, but don't touch the main
+ * queue since we need to process that normally for multicast */
while ((skb = skb_dequeue(&audit_retry_queue)))
kauditd_hold_skb(skb);
- while ((skb = skb_dequeue(&audit_queue)))
- kauditd_hold_skb(skb);
}
/**
@@ -649,8 +657,8 @@ static int auditd_send_unicast_skb(struct sk_buff *skb)
return rc;
err:
- if (rc == -ECONNREFUSED)
- auditd_reset();
+ if (ac && rc == -ECONNREFUSED)
+ auditd_reset(ac);
return rc;
}
@@ -795,9 +803,9 @@ static int kauditd_thread(void *dummy)
rc = kauditd_send_queue(sk, portid,
&audit_hold_queue, UNICAST_RETRIES,
NULL, kauditd_rehold_skb);
- if (rc < 0) {
+ if (ac && rc < 0) {
sk = NULL;
- auditd_reset();
+ auditd_reset(ac);
goto main_queue;
}
@@ -805,9 +813,9 @@ static int kauditd_thread(void *dummy)
rc = kauditd_send_queue(sk, portid,
&audit_retry_queue, UNICAST_RETRIES,
NULL, kauditd_hold_skb);
- if (rc < 0) {
+ if (ac && rc < 0) {
sk = NULL;
- auditd_reset();
+ auditd_reset(ac);
goto main_queue;
}
@@ -815,12 +823,13 @@ main_queue:
/* process the main queue - do the multicast send and attempt
* unicast, dump failed record sends to the retry queue; if
* sk == NULL due to previous failures we will just do the
- * multicast send and move the record to the retry queue */
+ * multicast send and move the record to the hold queue */
rc = kauditd_send_queue(sk, portid, &audit_queue, 1,
kauditd_send_multicast_skb,
- kauditd_retry_skb);
- if (sk == NULL || rc < 0)
- auditd_reset();
+ (sk ?
+ kauditd_retry_skb : kauditd_hold_skb));
+ if (ac && rc < 0)
+ auditd_reset(ac);
sk = NULL;
/* drop our netns reference, no auditd sends past this line */
@@ -1230,7 +1239,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
auditd_pid, 1);
/* unregister the auditd connection */
- auditd_reset();
+ auditd_reset(NULL);
}
}
if (s.mask & AUDIT_STATUS_RATE_LIMIT) {
@@ -1999,22 +2008,10 @@ void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap)
static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name)
{
- kernel_cap_t *perm = &name->fcap.permitted;
- kernel_cap_t *inh = &name->fcap.inheritable;
- int log = 0;
-
- if (!cap_isclear(*perm)) {
- audit_log_cap(ab, "cap_fp", perm);
- log = 1;
- }
- if (!cap_isclear(*inh)) {
- audit_log_cap(ab, "cap_fi", inh);
- log = 1;
- }
-
- if (log)
- audit_log_format(ab, " cap_fe=%d cap_fver=%x",
- name->fcap.fE, name->fcap_ver);
+ audit_log_cap(ab, "cap_fp", &name->fcap.permitted);
+ audit_log_cap(ab, "cap_fi", &name->fcap.inheritable);
+ audit_log_format(ab, " cap_fe=%d cap_fver=%x",
+ name->fcap.fE, name->fcap_ver);
}
static inline int audit_copy_fcaps(struct audit_names *name,
diff --git a/kernel/audit.h b/kernel/audit.h
index ddfce2ea4891..b331d9b83f63 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -68,6 +68,7 @@ struct audit_cap_data {
unsigned int fE; /* effective bit of file cap */
kernel_cap_t effective; /* effective set of process */
};
+ kernel_cap_t ambient;
};
/* When fs/namei.c:getname() is called, we store the pointer in name and bump
@@ -247,13 +248,13 @@ struct audit_netlink_list {
struct sk_buff_head q;
};
-int audit_send_list(void *);
+int audit_send_list(void *_dest);
extern int selinux_audit_rule_update(void);
extern struct mutex audit_filter_mutex;
-extern int audit_del_rule(struct audit_entry *);
-extern void audit_free_rule_rcu(struct rcu_head *);
+extern int audit_del_rule(struct audit_entry *entry);
+extern void audit_free_rule_rcu(struct rcu_head *head);
extern struct list_head audit_filter_list[];
extern struct audit_entry *audit_dupe_rule(struct audit_krule *old);
@@ -301,17 +302,17 @@ extern int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark
#endif /* CONFIG_AUDIT_WATCH */
#ifdef CONFIG_AUDIT_TREE
-extern struct audit_chunk *audit_tree_lookup(const struct inode *);
-extern void audit_put_chunk(struct audit_chunk *);
-extern bool audit_tree_match(struct audit_chunk *, struct audit_tree *);
-extern int audit_make_tree(struct audit_krule *, char *, u32);
-extern int audit_add_tree_rule(struct audit_krule *);
-extern int audit_remove_tree_rule(struct audit_krule *);
+extern struct audit_chunk *audit_tree_lookup(const struct inode *inode);
+extern void audit_put_chunk(struct audit_chunk *chunk);
+extern bool audit_tree_match(struct audit_chunk *chunk, struct audit_tree *tree);
+extern int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op);
+extern int audit_add_tree_rule(struct audit_krule *rule);
+extern int audit_remove_tree_rule(struct audit_krule *rule);
extern void audit_trim_trees(void);
extern int audit_tag_tree(char *old, char *new);
-extern const char *audit_tree_path(struct audit_tree *);
-extern void audit_put_tree(struct audit_tree *);
-extern void audit_kill_trees(struct list_head *);
+extern const char *audit_tree_path(struct audit_tree *tree);
+extern void audit_put_tree(struct audit_tree *tree);
+extern void audit_kill_trees(struct list_head *list);
#else
#define audit_remove_tree_rule(rule) BUG()
#define audit_add_tree_rule(rule) -EINVAL
@@ -323,7 +324,7 @@ extern void audit_kill_trees(struct list_head *);
#define audit_kill_trees(list) BUG()
#endif
-extern char *audit_unpack_string(void **, size_t *, size_t);
+extern char *audit_unpack_string(void **bufp, size_t *remain, size_t len);
extern pid_t audit_sig_pid;
extern kuid_t audit_sig_uid;
@@ -333,7 +334,7 @@ extern int audit_filter(int msgtype, unsigned int listtype);
#ifdef CONFIG_AUDITSYSCALL
extern int audit_signal_info(int sig, struct task_struct *t);
-extern void audit_filter_inodes(struct task_struct *, struct audit_context *);
+extern void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx);
extern struct list_head *audit_killed_trees(void);
#else
#define audit_signal_info(s,t) AUDIT_DISABLED
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index bb724baa7ac9..3260ba2312a9 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1261,6 +1261,7 @@ static void show_special(struct audit_context *context, int *call_panic)
audit_log_cap(ab, "cap_pi", &context->capset.cap.inheritable);
audit_log_cap(ab, "cap_pp", &context->capset.cap.permitted);
audit_log_cap(ab, "cap_pe", &context->capset.cap.effective);
+ audit_log_cap(ab, "cap_pa", &context->capset.cap.ambient);
break;
case AUDIT_MMAP:
audit_log_format(ab, "fd=%d flags=0x%x", context->mmap.fd,
@@ -1382,9 +1383,11 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
audit_log_cap(ab, "old_pp", &axs->old_pcap.permitted);
audit_log_cap(ab, "old_pi", &axs->old_pcap.inheritable);
audit_log_cap(ab, "old_pe", &axs->old_pcap.effective);
- audit_log_cap(ab, "new_pp", &axs->new_pcap.permitted);
- audit_log_cap(ab, "new_pi", &axs->new_pcap.inheritable);
- audit_log_cap(ab, "new_pe", &axs->new_pcap.effective);
+ audit_log_cap(ab, "old_pa", &axs->old_pcap.ambient);
+ audit_log_cap(ab, "pp", &axs->new_pcap.permitted);
+ audit_log_cap(ab, "pi", &axs->new_pcap.inheritable);
+ audit_log_cap(ab, "pe", &axs->new_pcap.effective);
+ audit_log_cap(ab, "pa", &axs->new_pcap.ambient);
break; }
}
@@ -2342,10 +2345,12 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
ax->old_pcap.permitted = old->cap_permitted;
ax->old_pcap.inheritable = old->cap_inheritable;
ax->old_pcap.effective = old->cap_effective;
+ ax->old_pcap.ambient = old->cap_ambient;
ax->new_pcap.permitted = new->cap_permitted;
ax->new_pcap.inheritable = new->cap_inheritable;
ax->new_pcap.effective = new->cap_effective;
+ ax->new_pcap.ambient = new->cap_ambient;
return 0;
}
@@ -2364,6 +2369,7 @@ void __audit_log_capset(const struct cred *new, const struct cred *old)
context->capset.cap.effective = new->cap_effective;
context->capset.cap.inheritable = new->cap_effective;
context->capset.cap.permitted = new->cap_permitted;
+ context->capset.cap.ambient = new->cap_ambient;
context->type = AUDIT_CAPSET;
}
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 172dc8ee0e3b..d771a3872500 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -335,6 +335,26 @@ static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key)
}
/* only called from syscall */
+int bpf_fd_array_map_lookup_elem(struct bpf_map *map, void *key, u32 *value)
+{
+ void **elem, *ptr;
+ int ret = 0;
+
+ if (!map->ops->map_fd_sys_lookup_elem)
+ return -ENOTSUPP;
+
+ rcu_read_lock();
+ elem = array_map_lookup_elem(map, key);
+ if (elem && (ptr = READ_ONCE(*elem)))
+ *value = map->ops->map_fd_sys_lookup_elem(ptr);
+ else
+ ret = -ENOENT;
+ rcu_read_unlock();
+
+ return ret;
+}
+
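bpf_fd_array_map_lookup_elem() gives syscall-side lookups on prog arrays (and the other fd maps) a defined result: the 32-bit ID of the object stored in the slot. A rough userspace sketch, assuming prog_array_fd is an existing BPF_MAP_TYPE_PROG_ARRAY fd and using only the standard BPF_MAP_LOOKUP_ELEM attribute fields:

#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

/* Sketch: read slot 0 of a prog array via the bpf(2) syscall. With this
 * series the value that comes back is the 32-bit program ID (the
 * syscall.c hunk below uses value_size = sizeof(u32) for fd maps),
 * which can in turn be resolved to an fd with BPF_PROG_GET_FD_BY_ID.
 */
static int prog_array_slot_id(int prog_array_fd, __u32 *id)
{
    union bpf_attr attr;
    __u32 key = 0;

    memset(&attr, 0, sizeof(attr));
    attr.map_fd = prog_array_fd;
    attr.key = (__u64)(unsigned long)&key;
    attr.value = (__u64)(unsigned long)id;

    return syscall(__NR_bpf, BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr));
}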
+/* only called from syscall */
int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file,
void *key, void *value, u64 map_flags)
{
@@ -400,6 +420,11 @@ static void prog_fd_array_put_ptr(void *ptr)
bpf_prog_put(ptr);
}
+static u32 prog_fd_array_sys_lookup_elem(void *ptr)
+{
+ return ((struct bpf_prog *)ptr)->aux->id;
+}
+
/* decrement refcnt of all bpf_progs that are stored in this map */
void bpf_fd_array_map_clear(struct bpf_map *map)
{
@@ -418,6 +443,7 @@ const struct bpf_map_ops prog_array_map_ops = {
.map_delete_elem = fd_array_map_delete_elem,
.map_fd_get_ptr = prog_fd_array_get_ptr,
.map_fd_put_ptr = prog_fd_array_put_ptr,
+ .map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem,
};
static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file,
@@ -452,38 +478,24 @@ static void bpf_event_entry_free_rcu(struct bpf_event_entry *ee)
static void *perf_event_fd_array_get_ptr(struct bpf_map *map,
struct file *map_file, int fd)
{
- const struct perf_event_attr *attr;
struct bpf_event_entry *ee;
struct perf_event *event;
struct file *perf_file;
+ u64 value;
perf_file = perf_event_get(fd);
if (IS_ERR(perf_file))
return perf_file;
+ ee = ERR_PTR(-EOPNOTSUPP);
event = perf_file->private_data;
- ee = ERR_PTR(-EINVAL);
-
- attr = perf_event_attrs(event);
- if (IS_ERR(attr) || attr->inherit)
+ if (perf_event_read_local(event, &value) == -EOPNOTSUPP)
goto err_out;
- switch (attr->type) {
- case PERF_TYPE_SOFTWARE:
- if (attr->config != PERF_COUNT_SW_BPF_OUTPUT)
- goto err_out;
- /* fall-through */
- case PERF_TYPE_RAW:
- case PERF_TYPE_HARDWARE:
- ee = bpf_event_entry_gen(perf_file, map_file);
- if (ee)
- return ee;
- ee = ERR_PTR(-ENOMEM);
- /* fall-through */
- default:
- break;
- }
-
+ ee = bpf_event_entry_gen(perf_file, map_file);
+ if (ee)
+ return ee;
+ ee = ERR_PTR(-ENOMEM);
err_out:
fput(perf_file);
return ee;
@@ -599,4 +611,5 @@ const struct bpf_map_ops array_of_maps_map_ops = {
.map_delete_elem = fd_array_map_delete_elem,
.map_fd_get_ptr = bpf_map_fd_get_ptr,
.map_fd_put_ptr = bpf_map_fd_put_ptr,
+ .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,
};
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index ea6033cba947..546113430049 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -236,3 +236,40 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk,
return ret;
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
+
+/**
+ * __cgroup_bpf_run_filter_sock_ops() - Run a program on a sock
+ * @sk: socket to get cgroup from
+ * @sock_ops: bpf_sock_ops_kern struct to pass to program. Contains
+ * sk with connection information (IP addresses, etc.) May not contain
+ * cgroup info if it is a req sock.
+ * @type: The type of program to be executed
+ *
+ * socket passed is expected to be of type INET or INET6.
+ *
+ * The program type passed in via @type must be suitable for sock_ops
+ * filtering. No further check is performed to assert that.
+ *
+ * This function will return %-EPERM if an attached program was found and
+ * it returned != 1 during execution. In all other cases, 0 is returned.
+ */
+int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
+ struct bpf_sock_ops_kern *sock_ops,
+ enum bpf_attach_type type)
+{
+ struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+ struct bpf_prog *prog;
+ int ret = 0;
+
+
+ rcu_read_lock();
+
+ prog = rcu_dereference(cgrp->bpf.effective[type]);
+ if (prog)
+ ret = BPF_PROG_RUN(prog, sock_ops) == 1 ? 0 : -EPERM;
+
+ rcu_read_unlock();
+
+ return ret;
+}
+EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops);
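The BPF_CGROUP_SOCK_OPS attach type wired up in the syscall.c hunk below is what routes programs into __cgroup_bpf_run_filter_sock_ops(). A minimal userspace sketch of the attach step, assuming prog_fd is an already-loaded BPF_PROG_TYPE_SOCK_OPS program and cgroup_fd an open cgroup directory fd:

#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

/* Sketch: attach an already-loaded sock_ops program to a cgroup; both
 * fds are assumed to come from the caller.
 */
static int attach_sock_ops(int cgroup_fd, int prog_fd)
{
    union bpf_attr attr;

    memset(&attr, 0, sizeof(attr));
    attr.target_fd = cgroup_fd;               /* cgroup to attach to */
    attr.attach_bpf_fd = prog_fd;             /* BPF_PROG_TYPE_SOCK_OPS program */
    attr.attach_type = BPF_CGROUP_SOCK_OPS;   /* attach type added by this series */

    return syscall(__NR_bpf, BPF_PROG_ATTACH, &attr, sizeof(attr));
}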
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index dedf367f59bb..ad5f55922a13 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -763,10 +763,10 @@ EXPORT_SYMBOL_GPL(__bpf_call_base);
*
* Decode and execute eBPF instructions.
*/
-static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
+static unsigned int ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn,
+ u64 *stack)
{
- u64 stack[MAX_BPF_STACK / sizeof(u64)];
- u64 regs[MAX_BPF_REG], tmp;
+ u64 tmp;
static const void *jumptable[256] = {
[0 ... 255] = &&default_label,
/* Now overwrite non-defaults ... */
@@ -824,7 +824,7 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
[BPF_ALU64 | BPF_NEG] = &&ALU64_NEG,
/* Call instruction */
[BPF_JMP | BPF_CALL] = &&JMP_CALL,
- [BPF_JMP | BPF_CALL | BPF_X] = &&JMP_TAIL_CALL,
+ [BPF_JMP | BPF_TAIL_CALL] = &&JMP_TAIL_CALL,
/* Jumps */
[BPF_JMP | BPF_JA] = &&JMP_JA,
[BPF_JMP | BPF_JEQ | BPF_X] = &&JMP_JEQ_X,
@@ -874,9 +874,6 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
#define CONT ({ insn++; goto select_insn; })
#define CONT_JMP ({ insn++; goto select_insn; })
- FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)];
- ARG1 = (u64) (unsigned long) ctx;
-
select_insn:
goto *jumptable[insn->code];
@@ -1219,7 +1216,39 @@ load_byte:
WARN_RATELIMIT(1, "unknown opcode %02x\n", insn->code);
return 0;
}
-STACK_FRAME_NON_STANDARD(__bpf_prog_run); /* jump table */
+STACK_FRAME_NON_STANDARD(___bpf_prog_run); /* jump table */
+
+#define PROG_NAME(stack_size) __bpf_prog_run##stack_size
+#define DEFINE_BPF_PROG_RUN(stack_size) \
+static unsigned int PROG_NAME(stack_size)(const void *ctx, const struct bpf_insn *insn) \
+{ \
+ u64 stack[stack_size / sizeof(u64)]; \
+ u64 regs[MAX_BPF_REG]; \
+\
+ FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \
+ ARG1 = (u64) (unsigned long) ctx; \
+ return ___bpf_prog_run(regs, insn, stack); \
+}
+
+#define EVAL1(FN, X) FN(X)
+#define EVAL2(FN, X, Y...) FN(X) EVAL1(FN, Y)
+#define EVAL3(FN, X, Y...) FN(X) EVAL2(FN, Y)
+#define EVAL4(FN, X, Y...) FN(X) EVAL3(FN, Y)
+#define EVAL5(FN, X, Y...) FN(X) EVAL4(FN, Y)
+#define EVAL6(FN, X, Y...) FN(X) EVAL5(FN, Y)
+
+EVAL6(DEFINE_BPF_PROG_RUN, 32, 64, 96, 128, 160, 192);
+EVAL6(DEFINE_BPF_PROG_RUN, 224, 256, 288, 320, 352, 384);
+EVAL4(DEFINE_BPF_PROG_RUN, 416, 448, 480, 512);
+
+#define PROG_NAME_LIST(stack_size) PROG_NAME(stack_size),
+
+static unsigned int (*interpreters[])(const void *ctx,
+ const struct bpf_insn *insn) = {
+EVAL6(PROG_NAME_LIST, 32, 64, 96, 128, 160, 192)
+EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384)
+EVAL4(PROG_NAME_LIST, 416, 448, 480, 512)
+};
bool bpf_prog_array_compatible(struct bpf_array *array,
const struct bpf_prog *fp)
@@ -1268,7 +1297,9 @@ static int bpf_check_tail_call(const struct bpf_prog *fp)
*/
struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
{
- fp->bpf_func = (void *) __bpf_prog_run;
+ u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1);
+
+ fp->bpf_func = interpreters[(round_up(stack_depth, 32) / 32) - 1];
/* eBPF JITs can rewrite the program in case constant
* blinding is active. However, in case of error during
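The verifier now records the deepest stack offset a program uses in aux->stack_depth (see the verifier.c hunks below), and bpf_prog_select_runtime() rounds it up to a multiple of 32 to pick one of the interpreters generated above. A small illustration of that index arithmetic; interp_index() is a hypothetical helper, not part of the patch:

/* Illustrative only: how a recorded stack depth maps onto the
 * interpreters[] slots generated above (32, 64, ..., 512 byte stacks).
 */
static unsigned int interp_index(unsigned int stack_depth)
{
    if (stack_depth == 0)              /* mirrors max_t(u32, stack_depth, 1) */
        stack_depth = 1;
    /* round up to a multiple of 32, then convert to a 0-based slot */
    return ((stack_depth + 31) / 32) - 1;
}

/* interp_index(1)   == 0  -> __bpf_prog_run32,  32-byte stack
 * interp_index(100) == 3  -> __bpf_prog_run128, 128-byte stack
 * interp_index(512) == 15 -> __bpf_prog_run512, full 512-byte stack
 */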
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 004334ea13ba..4fb463172aa8 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -1244,6 +1244,26 @@ static void fd_htab_map_free(struct bpf_map *map)
}
/* only called from syscall */
+int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value)
+{
+ void **ptr;
+ int ret = 0;
+
+ if (!map->ops->map_fd_sys_lookup_elem)
+ return -ENOTSUPP;
+
+ rcu_read_lock();
+ ptr = htab_map_lookup_elem(map, key);
+ if (ptr)
+ *value = map->ops->map_fd_sys_lookup_elem(READ_ONCE(*ptr));
+ else
+ ret = -ENOENT;
+ rcu_read_unlock();
+
+ return ret;
+}
+
+/* only called from syscall */
int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file,
void *key, void *value, u64 map_flags)
{
@@ -1305,4 +1325,5 @@ const struct bpf_map_ops htab_of_maps_map_ops = {
.map_delete_elem = htab_map_delete_elem,
.map_fd_get_ptr = bpf_map_fd_get_ptr,
.map_fd_put_ptr = bpf_map_fd_put_ptr,
+ .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,
};
diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
index 59bcdf821ae4..1da574612bea 100644
--- a/kernel/bpf/map_in_map.c
+++ b/kernel/bpf/map_in_map.c
@@ -95,3 +95,8 @@ void bpf_map_fd_put_ptr(void *ptr)
*/
bpf_map_put(ptr);
}
+
+u32 bpf_map_fd_sys_lookup_elem(void *ptr)
+{
+ return ((struct bpf_map *)ptr)->id;
+}
diff --git a/kernel/bpf/map_in_map.h b/kernel/bpf/map_in_map.h
index 177fadb689dc..6183db9ec08c 100644
--- a/kernel/bpf/map_in_map.h
+++ b/kernel/bpf/map_in_map.h
@@ -19,5 +19,6 @@ bool bpf_map_meta_equal(const struct bpf_map *meta0,
void *bpf_map_fd_get_ptr(struct bpf_map *map, struct file *map_file,
int ufd);
void bpf_map_fd_put_ptr(void *ptr);
+u32 bpf_map_fd_sys_lookup_elem(void *ptr);
#endif
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 265a0d854e33..045646da97cc 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -22,8 +22,20 @@
#include <linux/filter.h>
#include <linux/version.h>
#include <linux/kernel.h>
+#include <linux/idr.h>
+
+#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY || \
+ (map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
+ (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \
+ (map)->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS)
+#define IS_FD_HASH(map) ((map)->map_type == BPF_MAP_TYPE_HASH_OF_MAPS)
+#define IS_FD_MAP(map) (IS_FD_ARRAY(map) || IS_FD_HASH(map))
DEFINE_PER_CPU(int, bpf_prog_active);
+static DEFINE_IDR(prog_idr);
+static DEFINE_SPINLOCK(prog_idr_lock);
+static DEFINE_IDR(map_idr);
+static DEFINE_SPINLOCK(map_idr_lock);
int sysctl_unprivileged_bpf_disabled __read_mostly;
@@ -114,6 +126,37 @@ static void bpf_map_uncharge_memlock(struct bpf_map *map)
free_uid(user);
}
+static int bpf_map_alloc_id(struct bpf_map *map)
+{
+ int id;
+
+ spin_lock_bh(&map_idr_lock);
+ id = idr_alloc_cyclic(&map_idr, map, 1, INT_MAX, GFP_ATOMIC);
+ if (id > 0)
+ map->id = id;
+ spin_unlock_bh(&map_idr_lock);
+
+ if (WARN_ON_ONCE(!id))
+ return -ENOSPC;
+
+ return id > 0 ? 0 : id;
+}
+
+static void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock)
+{
+ if (do_idr_lock)
+ spin_lock_bh(&map_idr_lock);
+ else
+ __acquire(&map_idr_lock);
+
+ idr_remove(&map_idr, map->id);
+
+ if (do_idr_lock)
+ spin_unlock_bh(&map_idr_lock);
+ else
+ __release(&map_idr_lock);
+}
+
/* called from workqueue */
static void bpf_map_free_deferred(struct work_struct *work)
{
@@ -135,14 +178,21 @@ static void bpf_map_put_uref(struct bpf_map *map)
/* decrement map refcnt and schedule it for freeing via workqueue
* (underlying map implementation ops->map_free() might sleep)
*/
-void bpf_map_put(struct bpf_map *map)
+static void __bpf_map_put(struct bpf_map *map, bool do_idr_lock)
{
if (atomic_dec_and_test(&map->refcnt)) {
+ /* bpf_map_free_id() must be called first */
+ bpf_map_free_id(map, do_idr_lock);
INIT_WORK(&map->work, bpf_map_free_deferred);
schedule_work(&map->work);
}
}
+void bpf_map_put(struct bpf_map *map)
+{
+ __bpf_map_put(map, true);
+}
+
void bpf_map_put_with_uref(struct bpf_map *map)
{
bpf_map_put_uref(map);
@@ -166,10 +216,12 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
const struct bpf_map *map = filp->private_data;
const struct bpf_array *array;
u32 owner_prog_type = 0;
+ u32 owner_jited = 0;
if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) {
array = container_of(map, struct bpf_array, map);
owner_prog_type = array->owner_prog_type;
+ owner_jited = array->owner_jited;
}
seq_printf(m,
@@ -186,9 +238,12 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
map->map_flags,
map->pages * 1ULL << PAGE_SHIFT);
- if (owner_prog_type)
+ if (owner_prog_type) {
seq_printf(m, "owner_prog_type:\t%u\n",
owner_prog_type);
+ seq_printf(m, "owner_jited:\t%u\n",
+ owner_jited);
+ }
}
#endif
@@ -236,11 +291,22 @@ static int map_create(union bpf_attr *attr)
if (err)
goto free_map_nouncharge;
- err = bpf_map_new_fd(map);
- if (err < 0)
- /* failed to allocate fd */
+ err = bpf_map_alloc_id(map);
+ if (err)
goto free_map;
+ err = bpf_map_new_fd(map);
+ if (err < 0) {
+ /* failed to allocate fd.
+ * bpf_map_put() is needed because the above
+ * bpf_map_alloc_id() has published the map
+ * to the userspace and the userspace may
+ * have refcnt-ed it through BPF_MAP_GET_FD_BY_ID.
+ */
+ bpf_map_put(map);
+ return err;
+ }
+
trace_bpf_map_create(map, err);
return err;
@@ -295,6 +361,28 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd)
return map;
}
+/* map_idr_lock should have been held */
+static struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map,
+ bool uref)
+{
+ int refold;
+
+ refold = __atomic_add_unless(&map->refcnt, 1, 0);
+
+ if (refold >= BPF_MAX_REFCNT) {
+ __bpf_map_put(map, false);
+ return ERR_PTR(-EBUSY);
+ }
+
+ if (!refold)
+ return ERR_PTR(-ENOENT);
+
+ if (uref)
+ atomic_inc(&map->usercnt);
+
+ return map;
+}
+
int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
{
return -ENOTSUPP;
@@ -322,19 +410,18 @@ static int map_lookup_elem(union bpf_attr *attr)
if (IS_ERR(map))
return PTR_ERR(map);
- err = -ENOMEM;
- key = kmalloc(map->key_size, GFP_USER);
- if (!key)
+ key = memdup_user(ukey, map->key_size);
+ if (IS_ERR(key)) {
+ err = PTR_ERR(key);
goto err_put;
-
- err = -EFAULT;
- if (copy_from_user(key, ukey, map->key_size) != 0)
- goto free_key;
+ }
if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
value_size = round_up(map->value_size, 8) * num_possible_cpus();
+ else if (IS_FD_MAP(map))
+ value_size = sizeof(u32);
else
value_size = map->value_size;
@@ -350,9 +437,10 @@ static int map_lookup_elem(union bpf_attr *attr)
err = bpf_percpu_array_copy(map, key, value);
} else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
err = bpf_stackmap_copy(map, key, value);
- } else if (map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS ||
- map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) {
- err = -ENOTSUPP;
+ } else if (IS_FD_ARRAY(map)) {
+ err = bpf_fd_array_map_lookup_elem(map, key, value);
+ } else if (IS_FD_HASH(map)) {
+ err = bpf_fd_htab_map_lookup_elem(map, key, value);
} else {
rcu_read_lock();
ptr = map->ops->map_lookup_elem(map, key);
@@ -402,14 +490,11 @@ static int map_update_elem(union bpf_attr *attr)
if (IS_ERR(map))
return PTR_ERR(map);
- err = -ENOMEM;
- key = kmalloc(map->key_size, GFP_USER);
- if (!key)
+ key = memdup_user(ukey, map->key_size);
+ if (IS_ERR(key)) {
+ err = PTR_ERR(key);
goto err_put;
-
- err = -EFAULT;
- if (copy_from_user(key, ukey, map->key_size) != 0)
- goto free_key;
+ }
if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
@@ -488,14 +573,11 @@ static int map_delete_elem(union bpf_attr *attr)
if (IS_ERR(map))
return PTR_ERR(map);
- err = -ENOMEM;
- key = kmalloc(map->key_size, GFP_USER);
- if (!key)
+ key = memdup_user(ukey, map->key_size);
+ if (IS_ERR(key)) {
+ err = PTR_ERR(key);
goto err_put;
-
- err = -EFAULT;
- if (copy_from_user(key, ukey, map->key_size) != 0)
- goto free_key;
+ }
preempt_disable();
__this_cpu_inc(bpf_prog_active);
@@ -507,7 +589,6 @@ static int map_delete_elem(union bpf_attr *attr)
if (!err)
trace_bpf_map_delete_elem(map, ufd, key);
-free_key:
kfree(key);
err_put:
fdput(f);
@@ -536,14 +617,11 @@ static int map_get_next_key(union bpf_attr *attr)
return PTR_ERR(map);
if (ukey) {
- err = -ENOMEM;
- key = kmalloc(map->key_size, GFP_USER);
- if (!key)
+ key = memdup_user(ukey, map->key_size);
+ if (IS_ERR(key)) {
+ err = PTR_ERR(key);
goto err_put;
-
- err = -EFAULT;
- if (copy_from_user(key, ukey, map->key_size) != 0)
- goto free_key;
+ }
} else {
key = NULL;
}
@@ -650,6 +728,42 @@ static void bpf_prog_uncharge_memlock(struct bpf_prog *prog)
free_uid(user);
}
+static int bpf_prog_alloc_id(struct bpf_prog *prog)
+{
+ int id;
+
+ spin_lock_bh(&prog_idr_lock);
+ id = idr_alloc_cyclic(&prog_idr, prog, 1, INT_MAX, GFP_ATOMIC);
+ if (id > 0)
+ prog->aux->id = id;
+ spin_unlock_bh(&prog_idr_lock);
+
+ /* id is in [1, INT_MAX) */
+ if (WARN_ON_ONCE(!id))
+ return -ENOSPC;
+
+ return id > 0 ? 0 : id;
+}
+
+static void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock)
+{
+ /* cBPF to eBPF migrations are currently not in the idr store. */
+ if (!prog->aux->id)
+ return;
+
+ if (do_idr_lock)
+ spin_lock_bh(&prog_idr_lock);
+ else
+ __acquire(&prog_idr_lock);
+
+ idr_remove(&prog_idr, prog->aux->id);
+
+ if (do_idr_lock)
+ spin_unlock_bh(&prog_idr_lock);
+ else
+ __release(&prog_idr_lock);
+}
+
static void __bpf_prog_put_rcu(struct rcu_head *rcu)
{
struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu);
@@ -659,14 +773,21 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu)
bpf_prog_free(aux->prog);
}
-void bpf_prog_put(struct bpf_prog *prog)
+static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
{
if (atomic_dec_and_test(&prog->aux->refcnt)) {
trace_bpf_prog_put_rcu(prog);
+ /* bpf_prog_free_id() must be called first */
+ bpf_prog_free_id(prog, do_idr_lock);
bpf_prog_kallsyms_del(prog);
call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
}
}
+
+void bpf_prog_put(struct bpf_prog *prog)
+{
+ __bpf_prog_put(prog, true);
+}
EXPORT_SYMBOL_GPL(bpf_prog_put);
static int bpf_prog_release(struct inode *inode, struct file *filp)
@@ -748,6 +869,24 @@ struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog)
}
EXPORT_SYMBOL_GPL(bpf_prog_inc);
+/* prog_idr_lock should have been held */
+static struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog)
+{
+ int refold;
+
+ refold = __atomic_add_unless(&prog->aux->refcnt, 1, 0);
+
+ if (refold >= BPF_MAX_REFCNT) {
+ __bpf_prog_put(prog, false);
+ return ERR_PTR(-EBUSY);
+ }
+
+ if (!refold)
+ return ERR_PTR(-ENOENT);
+
+ return prog;
+}
+
static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type)
{
struct fd f = fdget(ufd);
@@ -815,7 +954,9 @@ static int bpf_prog_load(union bpf_attr *attr)
attr->kern_version != LINUX_VERSION_CODE)
return -EINVAL;
- if (type != BPF_PROG_TYPE_SOCKET_FILTER && !capable(CAP_SYS_ADMIN))
+ if (type != BPF_PROG_TYPE_SOCKET_FILTER &&
+ type != BPF_PROG_TYPE_CGROUP_SKB &&
+ !capable(CAP_SYS_ADMIN))
return -EPERM;
/* plain bpf_prog allocation */
@@ -855,11 +996,22 @@ static int bpf_prog_load(union bpf_attr *attr)
if (err < 0)
goto free_used_maps;
- err = bpf_prog_new_fd(prog);
- if (err < 0)
- /* failed to allocate fd */
+ err = bpf_prog_alloc_id(prog);
+ if (err)
goto free_used_maps;
+ err = bpf_prog_new_fd(prog);
+ if (err < 0) {
+ /* failed to allocate fd.
+ * bpf_prog_put() is needed because the above
+ * bpf_prog_alloc_id() has published the prog
+ * to the userspace and the userspace may
+ * have refcnt-ed it through BPF_PROG_GET_FD_BY_ID.
+ */
+ bpf_prog_put(prog);
+ return err;
+ }
+
bpf_prog_kallsyms_add(prog);
trace_bpf_prog_load(prog, err);
return err;
@@ -919,6 +1071,9 @@ static int bpf_prog_attach(const union bpf_attr *attr)
case BPF_CGROUP_INET_SOCK_CREATE:
ptype = BPF_PROG_TYPE_CGROUP_SOCK;
break;
+ case BPF_CGROUP_SOCK_OPS:
+ ptype = BPF_PROG_TYPE_SOCK_OPS;
+ break;
default:
return -EINVAL;
}
@@ -959,6 +1114,7 @@ static int bpf_prog_detach(const union bpf_attr *attr)
case BPF_CGROUP_INET_INGRESS:
case BPF_CGROUP_INET_EGRESS:
case BPF_CGROUP_INET_SOCK_CREATE:
+ case BPF_CGROUP_SOCK_OPS:
cgrp = cgroup_get_from_fd(attr->target_fd);
if (IS_ERR(cgrp))
return PTR_ERR(cgrp);
@@ -973,6 +1129,7 @@ static int bpf_prog_detach(const union bpf_attr *attr)
return ret;
}
+
#endif /* CONFIG_CGROUP_BPF */
#define BPF_PROG_TEST_RUN_LAST_FIELD test.duration
@@ -997,6 +1154,237 @@ static int bpf_prog_test_run(const union bpf_attr *attr,
return ret;
}
+#define BPF_OBJ_GET_NEXT_ID_LAST_FIELD next_id
+
+static int bpf_obj_get_next_id(const union bpf_attr *attr,
+ union bpf_attr __user *uattr,
+ struct idr *idr,
+ spinlock_t *lock)
+{
+ u32 next_id = attr->start_id;
+ int err = 0;
+
+ if (CHECK_ATTR(BPF_OBJ_GET_NEXT_ID) || next_id >= INT_MAX)
+ return -EINVAL;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ next_id++;
+ spin_lock_bh(lock);
+ if (!idr_get_next(idr, &next_id))
+ err = -ENOENT;
+ spin_unlock_bh(lock);
+
+ if (!err)
+ err = put_user(next_id, &uattr->next_id);
+
+ return err;
+}
+
+#define BPF_PROG_GET_FD_BY_ID_LAST_FIELD prog_id
+
+static int bpf_prog_get_fd_by_id(const union bpf_attr *attr)
+{
+ struct bpf_prog *prog;
+ u32 id = attr->prog_id;
+ int fd;
+
+ if (CHECK_ATTR(BPF_PROG_GET_FD_BY_ID))
+ return -EINVAL;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ spin_lock_bh(&prog_idr_lock);
+ prog = idr_find(&prog_idr, id);
+ if (prog)
+ prog = bpf_prog_inc_not_zero(prog);
+ else
+ prog = ERR_PTR(-ENOENT);
+ spin_unlock_bh(&prog_idr_lock);
+
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ fd = bpf_prog_new_fd(prog);
+ if (fd < 0)
+ bpf_prog_put(prog);
+
+ return fd;
+}
+
+#define BPF_MAP_GET_FD_BY_ID_LAST_FIELD map_id
+
+static int bpf_map_get_fd_by_id(const union bpf_attr *attr)
+{
+ struct bpf_map *map;
+ u32 id = attr->map_id;
+ int fd;
+
+ if (CHECK_ATTR(BPF_MAP_GET_FD_BY_ID))
+ return -EINVAL;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ spin_lock_bh(&map_idr_lock);
+ map = idr_find(&map_idr, id);
+ if (map)
+ map = bpf_map_inc_not_zero(map, true);
+ else
+ map = ERR_PTR(-ENOENT);
+ spin_unlock_bh(&map_idr_lock);
+
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+
+ fd = bpf_map_new_fd(map);
+ if (fd < 0)
+ bpf_map_put(map);
+
+ return fd;
+}
+
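Together, BPF_PROG_GET_NEXT_ID and BPF_PROG_GET_FD_BY_ID let a privileged (CAP_SYS_ADMIN) process walk every loaded program; the map variants work the same way against map_idr. A rough userspace sketch, using only the start_id/next_id/prog_id attribute names visible in this hunk:

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

/* Sketch: walk all loaded program IDs and open an fd for each one.
 * Needs CAP_SYS_ADMIN, as enforced in bpf_obj_get_next_id() and
 * bpf_prog_get_fd_by_id() above.
 */
static void list_progs(void)
{
    union bpf_attr attr;
    __u32 id = 0;
    int fd;

    for (;;) {
        memset(&attr, 0, sizeof(attr));
        attr.start_id = id;
        if (syscall(__NR_bpf, BPF_PROG_GET_NEXT_ID, &attr, sizeof(attr)))
            break;                       /* ENOENT: no more programs */
        id = attr.next_id;

        memset(&attr, 0, sizeof(attr));
        attr.prog_id = id;
        fd = syscall(__NR_bpf, BPF_PROG_GET_FD_BY_ID, &attr, sizeof(attr));
        if (fd < 0)
            continue;                    /* program went away in between */
        printf("prog id %u -> fd %d\n", id, fd);
        close(fd);
    }
}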
+static int check_uarg_tail_zero(void __user *uaddr,
+ size_t expected_size,
+ size_t actual_size)
+{
+ unsigned char __user *addr;
+ unsigned char __user *end;
+ unsigned char val;
+ int err;
+
+ if (actual_size <= expected_size)
+ return 0;
+
+ addr = uaddr + expected_size;
+ end = uaddr + actual_size;
+
+ for (; addr < end; addr++) {
+ err = get_user(val, addr);
+ if (err)
+ return err;
+ if (val)
+ return -E2BIG;
+ }
+
+ return 0;
+}
+
+static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ struct bpf_prog_info __user *uinfo = u64_to_user_ptr(attr->info.info);
+ struct bpf_prog_info info = {};
+ u32 info_len = attr->info.info_len;
+ char __user *uinsns;
+ u32 ulen;
+ int err;
+
+ err = check_uarg_tail_zero(uinfo, sizeof(info), info_len);
+ if (err)
+ return err;
+ info_len = min_t(u32, sizeof(info), info_len);
+
+ if (copy_from_user(&info, uinfo, info_len))
+ return err;
+
+ info.type = prog->type;
+ info.id = prog->aux->id;
+
+ memcpy(info.tag, prog->tag, sizeof(prog->tag));
+
+ if (!capable(CAP_SYS_ADMIN)) {
+ info.jited_prog_len = 0;
+ info.xlated_prog_len = 0;
+ goto done;
+ }
+
+ ulen = info.jited_prog_len;
+ info.jited_prog_len = prog->jited_len;
+ if (info.jited_prog_len && ulen) {
+ uinsns = u64_to_user_ptr(info.jited_prog_insns);
+ ulen = min_t(u32, info.jited_prog_len, ulen);
+ if (copy_to_user(uinsns, prog->bpf_func, ulen))
+ return -EFAULT;
+ }
+
+ ulen = info.xlated_prog_len;
+ info.xlated_prog_len = bpf_prog_size(prog->len);
+ if (info.xlated_prog_len && ulen) {
+ uinsns = u64_to_user_ptr(info.xlated_prog_insns);
+ ulen = min_t(u32, info.xlated_prog_len, ulen);
+ if (copy_to_user(uinsns, prog->insnsi, ulen))
+ return -EFAULT;
+ }
+
+done:
+ if (copy_to_user(uinfo, &info, info_len) ||
+ put_user(info_len, &uattr->info.info_len))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int bpf_map_get_info_by_fd(struct bpf_map *map,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ struct bpf_map_info __user *uinfo = u64_to_user_ptr(attr->info.info);
+ struct bpf_map_info info = {};
+ u32 info_len = attr->info.info_len;
+ int err;
+
+ err = check_uarg_tail_zero(uinfo, sizeof(info), info_len);
+ if (err)
+ return err;
+ info_len = min_t(u32, sizeof(info), info_len);
+
+ info.type = map->map_type;
+ info.id = map->id;
+ info.key_size = map->key_size;
+ info.value_size = map->value_size;
+ info.max_entries = map->max_entries;
+ info.map_flags = map->map_flags;
+
+ if (copy_to_user(uinfo, &info, info_len) ||
+ put_user(info_len, &uattr->info.info_len))
+ return -EFAULT;
+
+ return 0;
+}
+
+#define BPF_OBJ_GET_INFO_BY_FD_LAST_FIELD info.info
+
+static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ int ufd = attr->info.bpf_fd;
+ struct fd f;
+ int err;
+
+ if (CHECK_ATTR(BPF_OBJ_GET_INFO_BY_FD))
+ return -EINVAL;
+
+ f = fdget(ufd);
+ if (!f.file)
+ return -EBADFD;
+
+ if (f.file->f_op == &bpf_prog_fops)
+ err = bpf_prog_get_info_by_fd(f.file->private_data, attr,
+ uattr);
+ else if (f.file->f_op == &bpf_map_fops)
+ err = bpf_map_get_info_by_fd(f.file->private_data, attr,
+ uattr);
+ else
+ err = -EINVAL;
+
+ fdput(f);
+ return err;
+}
+
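bpf_obj_get_info_by_fd() dispatches on the fd's file_operations, so one command serves both programs and maps. A hedged userspace sketch for the program side, built only on the attr.info.* and bpf_prog_info fields visible in this hunk:

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

/* Sketch: query metadata for an already-open program fd. The info struct
 * must be zeroed: non-zero jited_prog_len/xlated_prog_len would ask the
 * kernel to dump instructions into the (unset) pointers, and a non-zero
 * tail would trip check_uarg_tail_zero() for a larger userspace struct.
 */
static int show_prog_info(int prog_fd)
{
    struct bpf_prog_info info;
    union bpf_attr attr;

    memset(&info, 0, sizeof(info));
    memset(&attr, 0, sizeof(attr));
    attr.info.bpf_fd = prog_fd;
    attr.info.info_len = sizeof(info);
    attr.info.info = (__u64)(unsigned long)&info;

    if (syscall(__NR_bpf, BPF_OBJ_GET_INFO_BY_FD, &attr, sizeof(attr)))
        return -1;

    printf("type %u id %u jited %u bytes xlated %u bytes\n",
           info.type, info.id, info.jited_prog_len, info.xlated_prog_len);
    return 0;
}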
SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
{
union bpf_attr attr = {};
@@ -1016,23 +1404,10 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
* user-space does not rely on any kernel feature
* extensions we don't know about yet.
*/
- if (size > sizeof(attr)) {
- unsigned char __user *addr;
- unsigned char __user *end;
- unsigned char val;
-
- addr = (void __user *)uattr + sizeof(attr);
- end = (void __user *)uattr + size;
-
- for (; addr < end; addr++) {
- err = get_user(val, addr);
- if (err)
- return err;
- if (val)
- return -E2BIG;
- }
- size = sizeof(attr);
- }
+ err = check_uarg_tail_zero(uattr, sizeof(attr), size);
+ if (err)
+ return err;
+ size = min_t(u32, size, sizeof(attr));
/* copy attributes from user space, may be less than sizeof(bpf_attr) */
if (copy_from_user(&attr, uattr, size) != 0)
@@ -1074,6 +1449,23 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
case BPF_PROG_TEST_RUN:
err = bpf_prog_test_run(&attr, uattr);
break;
+ case BPF_PROG_GET_NEXT_ID:
+ err = bpf_obj_get_next_id(&attr, uattr,
+ &prog_idr, &prog_idr_lock);
+ break;
+ case BPF_MAP_GET_NEXT_ID:
+ err = bpf_obj_get_next_id(&attr, uattr,
+ &map_idr, &map_idr_lock);
+ break;
+ case BPF_PROG_GET_FD_BY_ID:
+ err = bpf_prog_get_fd_by_id(&attr);
+ break;
+ case BPF_MAP_GET_FD_BY_ID:
+ err = bpf_map_get_fd_by_id(&attr);
+ break;
+ case BPF_OBJ_GET_INFO_BY_FD:
+ err = bpf_obj_get_info_by_fd(&attr, uattr);
+ break;
default:
err = -EINVAL;
break;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index a8a725697bed..6a86723c5b64 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -546,20 +546,6 @@ static int check_reg_arg(struct bpf_reg_state *regs, u32 regno,
return 0;
}
-static int bpf_size_to_bytes(int bpf_size)
-{
- if (bpf_size == BPF_W)
- return 4;
- else if (bpf_size == BPF_H)
- return 2;
- else if (bpf_size == BPF_B)
- return 1;
- else if (bpf_size == BPF_DW)
- return 8;
- else
- return -EINVAL;
-}
-
static bool is_spillable_regtype(enum bpf_reg_type type)
{
switch (type) {
@@ -758,15 +744,29 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
}
/* check access to 'struct bpf_context' fields */
-static int check_ctx_access(struct bpf_verifier_env *env, int off, int size,
+static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size,
enum bpf_access_type t, enum bpf_reg_type *reg_type)
{
+ struct bpf_insn_access_aux info = {
+ .reg_type = *reg_type,
+ };
+
/* for analyzer ctx accesses are already validated and converted */
if (env->analyzer_ops)
return 0;
if (env->prog->aux->ops->is_valid_access &&
- env->prog->aux->ops->is_valid_access(off, size, t, reg_type)) {
+ env->prog->aux->ops->is_valid_access(off, size, t, &info)) {
+ /* A non zero info.ctx_field_size indicates that this field is a
+ * candidate for later verifier transformation to load the whole
+ * field and then apply a mask when accessed with a narrower
+ * access than actual ctx access size. A zero info.ctx_field_size
+ * will only allow for whole field access and rejects any other
+ * type of narrower access.
+ */
+ env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size;
+ *reg_type = info.reg_type;
+
/* remember the offset of last byte accessed in ctx */
if (env->prog->aux->max_ctx_offset < off + size)
env->prog->aux->max_ctx_offset = off + size;
@@ -868,7 +868,7 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
* if t==write && value_regno==-1, some unknown value is stored into memory
* if t==read && value_regno==-1, don't care what we read from memory
*/
-static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off,
+static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regno, int off,
int bpf_size, enum bpf_access_type t,
int value_regno)
{
@@ -911,7 +911,7 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off,
verbose("R%d leaks addr into ctx\n", value_regno);
return -EACCES;
}
- err = check_ctx_access(env, off, size, t, &reg_type);
+ err = check_ctx_access(env, insn_idx, off, size, t, &reg_type);
if (!err && t == BPF_READ && value_regno >= 0) {
mark_reg_unknown_value_and_range(state->regs,
value_regno);
@@ -926,6 +926,10 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off,
verbose("invalid stack off=%d size=%d\n", off, size);
return -EACCES;
}
+
+ if (env->prog->aux->stack_depth < -off)
+ env->prog->aux->stack_depth = -off;
+
if (t == BPF_WRITE) {
if (!env->allow_ptr_leaks &&
state->stack_slot_type[MAX_BPF_STACK + off] == STACK_SPILL &&
@@ -968,7 +972,7 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off,
return err;
}
-static int check_xadd(struct bpf_verifier_env *env, struct bpf_insn *insn)
+static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_insn *insn)
{
struct bpf_reg_state *regs = env->cur_state.regs;
int err;
@@ -995,13 +999,13 @@ static int check_xadd(struct bpf_verifier_env *env, struct bpf_insn *insn)
}
/* check whether atomic_add can read the memory */
- err = check_mem_access(env, insn->dst_reg, insn->off,
+ err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
BPF_SIZE(insn->code), BPF_READ, -1);
if (err)
return err;
/* check whether atomic_add can write into the same memory */
- return check_mem_access(env, insn->dst_reg, insn->off,
+ return check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
BPF_SIZE(insn->code), BPF_WRITE, -1);
}
@@ -1037,6 +1041,9 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
return -EACCES;
}
+ if (env->prog->aux->stack_depth < -off)
+ env->prog->aux->stack_depth = -off;
+
if (meta && meta->raw_mode) {
meta->access_size = access_size;
meta->regno = regno;
@@ -1344,8 +1351,8 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
if (reg->type != PTR_TO_PACKET &&
reg->type != PTR_TO_PACKET_END)
continue;
- reg->type = UNKNOWN_VALUE;
- reg->imm = 0;
+ __mark_reg_unknown_value(state->spilled_regs,
+ i / BPF_REG_SIZE);
}
}
@@ -1414,7 +1421,7 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
* is inferred from register state.
*/
for (i = 0; i < meta.access_size; i++) {
- err = check_mem_access(env, meta.regno, i, BPF_B, BPF_WRITE, -1);
+ err = check_mem_access(env, insn_idx, meta.regno, i, BPF_B, BPF_WRITE, -1);
if (err)
return err;
}
@@ -1650,6 +1657,65 @@ static int evaluate_reg_alu(struct bpf_verifier_env *env, struct bpf_insn *insn)
return 0;
}
+static int evaluate_reg_imm_alu_unknown(struct bpf_verifier_env *env,
+ struct bpf_insn *insn)
+{
+ struct bpf_reg_state *regs = env->cur_state.regs;
+ struct bpf_reg_state *dst_reg = &regs[insn->dst_reg];
+ struct bpf_reg_state *src_reg = &regs[insn->src_reg];
+ u8 opcode = BPF_OP(insn->code);
+ s64 imm_log2 = __ilog2_u64((long long)dst_reg->imm);
+
+ /* BPF_X code with src_reg->type UNKNOWN_VALUE here. */
+ if (src_reg->imm > 0 && dst_reg->imm) {
+ switch (opcode) {
+ case BPF_ADD:
+ /* dreg += sreg
+ * where both have zero upper bits. Adding them
+ * can only result in making one more bit non-zero
+ * in the larger value.
+ * Ex. 0xffff (imm=48) + 1 (imm=63) = 0x10000 (imm=47)
+ * 0xffff (imm=48) + 0xffff = 0x1fffe (imm=47)
+ */
+ dst_reg->imm = min(src_reg->imm, 63 - imm_log2);
+ dst_reg->imm--;
+ break;
+ case BPF_AND:
+ /* dreg &= sreg
+ * AND can not extend zero bits only shrink
+ * Ex. 0x00..00ffffff
+ * & 0x0f..ffffffff
+ * ----------------
+ * 0x00..00ffffff
+ */
+ dst_reg->imm = max(src_reg->imm, 63 - imm_log2);
+ break;
+ case BPF_OR:
+ /* dreg |= sreg
+ * OR can only extend zero bits
+ * Ex. 0x00..00ffffff
+ * | 0x0f..ffffffff
+ * ----------------
+ * 0x0f..00ffffff
+ */
+ dst_reg->imm = min(src_reg->imm, 63 - imm_log2);
+ break;
+ case BPF_SUB:
+ case BPF_MUL:
+ case BPF_RSH:
+ case BPF_LSH:
+ /* These may be flushed out later */
+ default:
+ mark_reg_unknown_value(regs, insn->dst_reg);
+ }
+ } else {
+ mark_reg_unknown_value(regs, insn->dst_reg);
+ }
+
+ dst_reg->type = UNKNOWN_VALUE;
+ return 0;
+}
+
static int evaluate_reg_imm_alu(struct bpf_verifier_env *env,
struct bpf_insn *insn)
{
@@ -1659,6 +1725,9 @@ static int evaluate_reg_imm_alu(struct bpf_verifier_env *env,
u8 opcode = BPF_OP(insn->code);
u64 dst_imm = dst_reg->imm;
+ if (BPF_SRC(insn->code) == BPF_X && src_reg->type == UNKNOWN_VALUE)
+ return evaluate_reg_imm_alu_unknown(env, insn);
+
/* dst_reg->type == CONST_IMM here. Simulate execution of insns
* containing ALU ops. Don't care about overflow or negative
* values, just add/sub/... them; registers are in u64.
@@ -1950,6 +2019,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
*/
regs[insn->dst_reg].type = CONST_IMM;
regs[insn->dst_reg].imm = insn->imm;
+ regs[insn->dst_reg].id = 0;
regs[insn->dst_reg].max_value = insn->imm;
regs[insn->dst_reg].min_value = insn->imm;
regs[insn->dst_reg].min_align = calc_align(insn->imm);
@@ -2407,6 +2477,7 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
regs[insn->dst_reg].type = CONST_IMM;
regs[insn->dst_reg].imm = imm;
+ regs[insn->dst_reg].id = 0;
return 0;
}
@@ -2826,6 +2897,8 @@ static bool states_equal(struct bpf_verifier_env *env,
return false;
if (i % BPF_REG_SIZE)
continue;
+ if (old->stack_slot_type[i] != STACK_SPILL)
+ continue;
if (memcmp(&old->spilled_regs[i / BPF_REG_SIZE],
&cur->spilled_regs[i / BPF_REG_SIZE],
sizeof(old->spilled_regs[0])))
@@ -2987,18 +3060,12 @@ static int do_check(struct bpf_verifier_env *env)
/* check that memory (src_reg + off) is readable,
* the state of dst_reg will be updated by this func
*/
- err = check_mem_access(env, insn->src_reg, insn->off,
+ err = check_mem_access(env, insn_idx, insn->src_reg, insn->off,
BPF_SIZE(insn->code), BPF_READ,
insn->dst_reg);
if (err)
return err;
- if (BPF_SIZE(insn->code) != BPF_W &&
- BPF_SIZE(insn->code) != BPF_DW) {
- insn_idx++;
- continue;
- }
-
prev_src_type = &env->insn_aux_data[insn_idx].ptr_type;
if (*prev_src_type == NOT_INIT) {
@@ -3026,7 +3093,7 @@ static int do_check(struct bpf_verifier_env *env)
enum bpf_reg_type *prev_dst_type, dst_reg_type;
if (BPF_MODE(insn->code) == BPF_XADD) {
- err = check_xadd(env, insn);
+ err = check_xadd(env, insn_idx, insn);
if (err)
return err;
insn_idx++;
@@ -3045,7 +3112,7 @@ static int do_check(struct bpf_verifier_env *env)
dst_reg_type = regs[insn->dst_reg].type;
/* check that memory (dst_reg + off) is writeable */
- err = check_mem_access(env, insn->dst_reg, insn->off,
+ err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
BPF_SIZE(insn->code), BPF_WRITE,
insn->src_reg);
if (err)
@@ -3074,7 +3141,7 @@ static int do_check(struct bpf_verifier_env *env)
return err;
/* check that memory (dst_reg + off) is writeable */
- err = check_mem_access(env, insn->dst_reg, insn->off,
+ err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
BPF_SIZE(insn->code), BPF_WRITE,
-1);
if (err)
@@ -3172,7 +3239,8 @@ process_bpf_exit:
insn_idx++;
}
- verbose("processed %d insns\n", insn_processed);
+ verbose("processed %d insns, stack depth %d\n",
+ insn_processed, env->prog->aux->stack_depth);
return 0;
}
@@ -3372,11 +3440,13 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of
static int convert_ctx_accesses(struct bpf_verifier_env *env)
{
const struct bpf_verifier_ops *ops = env->prog->aux->ops;
+ int i, cnt, size, ctx_field_size, delta = 0;
const int insn_cnt = env->prog->len;
struct bpf_insn insn_buf[16], *insn;
struct bpf_prog *new_prog;
enum bpf_access_type type;
- int i, cnt, delta = 0;
+ bool is_narrower_load;
+ u32 target_size;
if (ops->gen_prologue) {
cnt = ops->gen_prologue(insn_buf, env->seen_direct_write,
@@ -3416,12 +3486,52 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
if (env->insn_aux_data[i + delta].ptr_type != PTR_TO_CTX)
continue;
- cnt = ops->convert_ctx_access(type, insn, insn_buf, env->prog);
- if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
+ ctx_field_size = env->insn_aux_data[i + delta].ctx_field_size;
+ size = BPF_LDST_BYTES(insn);
+
+ /* If the read access is a narrower load of the field,
+ * convert to a 4/8-byte load, to minimize program-type-specific
+ * convert_ctx_access changes. If conversion is successful,
+ * we will apply proper mask to the result.
+ */
+ is_narrower_load = size < ctx_field_size;
+ if (is_narrower_load) {
+ u32 off = insn->off;
+ u8 size_code;
+
+ if (type == BPF_WRITE) {
+ verbose("bpf verifier narrow ctx access misconfigured\n");
+ return -EINVAL;
+ }
+
+ size_code = BPF_H;
+ if (ctx_field_size == 4)
+ size_code = BPF_W;
+ else if (ctx_field_size == 8)
+ size_code = BPF_DW;
+
+ insn->off = off & ~(ctx_field_size - 1);
+ insn->code = BPF_LDX | BPF_MEM | size_code;
+ }
+
+ target_size = 0;
+ cnt = ops->convert_ctx_access(type, insn, insn_buf, env->prog,
+ &target_size);
+ if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf) ||
+ (ctx_field_size && !target_size)) {
verbose("bpf verifier is misconfigured\n");
return -EINVAL;
}
+ if (is_narrower_load && size < target_size) {
+ if (ctx_field_size <= 4)
+ insn_buf[cnt++] = BPF_ALU32_IMM(BPF_AND, insn->dst_reg,
+ (1 << size * 8) - 1);
+ else
+ insn_buf[cnt++] = BPF_ALU64_IMM(BPF_AND, insn->dst_reg,
+ (1 << size * 8) - 1);
+ }
+
new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
if (!new_prog)
return -ENOMEM;
@@ -3467,6 +3577,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
* the program array.
*/
prog->cb_access = 1;
+ env->prog->aux->stack_depth = MAX_BPF_STACK;
/* mark bpf_tail_call as different opcode to avoid
* conditional branch in the interpreter for every normal
@@ -3474,7 +3585,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
* that doesn't support bpf_tail_call yet
*/
insn->imm = 0;
- insn->code |= BPF_X;
+ insn->code = BPF_JMP | BPF_TAIL_CALL;
continue;
}
diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile
index 387348a40c64..ce693ccb8c58 100644
--- a/kernel/cgroup/Makefile
+++ b/kernel/cgroup/Makefile
@@ -4,3 +4,4 @@ obj-$(CONFIG_CGROUP_FREEZER) += freezer.o
obj-$(CONFIG_CGROUP_PIDS) += pids.o
obj-$(CONFIG_CGROUP_RDMA) += rdma.o
obj-$(CONFIG_CPUSETS) += cpuset.o
+obj-$(CONFIG_CGROUP_DEBUG) += debug.o
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index 00f4d6bf048f..793565c05742 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -192,6 +192,8 @@ int cgroup_rmdir(struct kernfs_node *kn);
int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
struct kernfs_root *kf_root);
+int cgroup_task_count(const struct cgroup *cgrp);
+
/*
* namespace.c
*/
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 85d75152402d..7bf4b1533f34 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -334,19 +334,15 @@ static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,
/**
* cgroup_task_count - count the number of tasks in a cgroup.
* @cgrp: the cgroup in question
- *
- * Return the number of tasks in the cgroup. The returned number can be
- * higher than the actual number of tasks due to css_set references from
- * namespace roots and temporary usages.
*/
-static int cgroup_task_count(const struct cgroup *cgrp)
+int cgroup_task_count(const struct cgroup *cgrp)
{
int count = 0;
struct cgrp_cset_link *link;
spin_lock_irq(&css_set_lock);
list_for_each_entry(link, &cgrp->cset_links, cset_link)
- count += refcount_read(&link->cset->refcount);
+ count += link->cset->nr_tasks;
spin_unlock_irq(&css_set_lock);
return count;
}
@@ -1263,150 +1259,3 @@ static int __init cgroup_no_v1(char *str)
return 1;
}
__setup("cgroup_no_v1=", cgroup_no_v1);
-
-
-#ifdef CONFIG_CGROUP_DEBUG
-static struct cgroup_subsys_state *
-debug_css_alloc(struct cgroup_subsys_state *parent_css)
-{
- struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
-
- if (!css)
- return ERR_PTR(-ENOMEM);
-
- return css;
-}
-
-static void debug_css_free(struct cgroup_subsys_state *css)
-{
- kfree(css);
-}
-
-static u64 debug_taskcount_read(struct cgroup_subsys_state *css,
- struct cftype *cft)
-{
- return cgroup_task_count(css->cgroup);
-}
-
-static u64 current_css_set_read(struct cgroup_subsys_state *css,
- struct cftype *cft)
-{
- return (u64)(unsigned long)current->cgroups;
-}
-
-static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
- struct cftype *cft)
-{
- u64 count;
-
- rcu_read_lock();
- count = refcount_read(&task_css_set(current)->refcount);
- rcu_read_unlock();
- return count;
-}
-
-static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
-{
- struct cgrp_cset_link *link;
- struct css_set *cset;
- char *name_buf;
-
- name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
- if (!name_buf)
- return -ENOMEM;
-
- spin_lock_irq(&css_set_lock);
- rcu_read_lock();
- cset = rcu_dereference(current->cgroups);
- list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
- struct cgroup *c = link->cgrp;
-
- cgroup_name(c, name_buf, NAME_MAX + 1);
- seq_printf(seq, "Root %d group %s\n",
- c->root->hierarchy_id, name_buf);
- }
- rcu_read_unlock();
- spin_unlock_irq(&css_set_lock);
- kfree(name_buf);
- return 0;
-}
-
-#define MAX_TASKS_SHOWN_PER_CSS 25
-static int cgroup_css_links_read(struct seq_file *seq, void *v)
-{
- struct cgroup_subsys_state *css = seq_css(seq);
- struct cgrp_cset_link *link;
-
- spin_lock_irq(&css_set_lock);
- list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
- struct css_set *cset = link->cset;
- struct task_struct *task;
- int count = 0;
-
- seq_printf(seq, "css_set %pK\n", cset);
-
- list_for_each_entry(task, &cset->tasks, cg_list) {
- if (count++ > MAX_TASKS_SHOWN_PER_CSS)
- goto overflow;
- seq_printf(seq, " task %d\n", task_pid_vnr(task));
- }
-
- list_for_each_entry(task, &cset->mg_tasks, cg_list) {
- if (count++ > MAX_TASKS_SHOWN_PER_CSS)
- goto overflow;
- seq_printf(seq, " task %d\n", task_pid_vnr(task));
- }
- continue;
- overflow:
- seq_puts(seq, " ...\n");
- }
- spin_unlock_irq(&css_set_lock);
- return 0;
-}
-
-static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
-{
- return (!cgroup_is_populated(css->cgroup) &&
- !css_has_online_children(&css->cgroup->self));
-}
-
-static struct cftype debug_files[] = {
- {
- .name = "taskcount",
- .read_u64 = debug_taskcount_read,
- },
-
- {
- .name = "current_css_set",
- .read_u64 = current_css_set_read,
- },
-
- {
- .name = "current_css_set_refcount",
- .read_u64 = current_css_set_refcount_read,
- },
-
- {
- .name = "current_css_set_cg_links",
- .seq_show = current_css_set_cg_links_read,
- },
-
- {
- .name = "cgroup_css_links",
- .seq_show = cgroup_css_links_read,
- },
-
- {
- .name = "releasable",
- .read_u64 = releasable_read,
- },
-
- { } /* terminate */
-};
-
-struct cgroup_subsys debug_cgrp_subsys = {
- .css_alloc = debug_css_alloc,
- .css_free = debug_css_free,
- .legacy_cftypes = debug_files,
-};
-#endif /* CONFIG_CGROUP_DEBUG */
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 8d4e85eae42c..620794a20a33 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -573,6 +573,11 @@ static int css_set_count = 1; /* 1 for init_css_set */
/**
* css_set_populated - does a css_set contain any tasks?
* @cset: target css_set
+ *
+ * css_set_populated() should be the same as !!cset->nr_tasks at steady
+ * state. However, css_set_populated() can be called while a task is being
+ * added to or removed from the linked list before the nr_tasks is
+ * properly updated. Hence, we can't just look at ->nr_tasks here.
*/
static bool css_set_populated(struct css_set *cset)
{
@@ -1542,10 +1547,56 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
return len;
}
+static int parse_cgroup_root_flags(char *data, unsigned int *root_flags)
+{
+ char *token;
+
+ *root_flags = 0;
+
+ if (!data)
+ return 0;
+
+ while ((token = strsep(&data, ",")) != NULL) {
+ if (!strcmp(token, "nsdelegate")) {
+ *root_flags |= CGRP_ROOT_NS_DELEGATE;
+ continue;
+ }
+
+ pr_err("cgroup2: unknown option \"%s\"\n", token);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static void apply_cgroup_root_flags(unsigned int root_flags)
+{
+ if (current->nsproxy->cgroup_ns == &init_cgroup_ns) {
+ if (root_flags & CGRP_ROOT_NS_DELEGATE)
+ cgrp_dfl_root.flags |= CGRP_ROOT_NS_DELEGATE;
+ else
+ cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE;
+ }
+}
+
+static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root)
+{
+ if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE)
+ seq_puts(seq, ",nsdelegate");
+ return 0;
+}
+
static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
{
- pr_err("remount is not allowed\n");
- return -EINVAL;
+ unsigned int root_flags;
+ int ret;
+
+ ret = parse_cgroup_root_flags(data, &root_flags);
+ if (ret)
+ return ret;
+
+ apply_cgroup_root_flags(root_flags);
+ return 0;
}
/*
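[Editorial note: the new nsdelegate handling is driven entirely by mount options. As a point of reference, a minimal userspace sketch of toggling the flag on an existing cgroup2 mount; the mount point and the need to run privileged in the init cgroup namespace are assumptions, not part of the patch.]

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* Enable namespace delegation; parse_cgroup_root_flags() accepts
	 * only the "nsdelegate" token. */
	if (mount(NULL, "/sys/fs/cgroup", NULL, MS_REMOUNT, "nsdelegate"))
		perror("remount nsdelegate");

	/* A remount without options clears CGRP_ROOT_NS_DELEGATE again;
	 * apply_cgroup_root_flags() only acts from the init cgroup ns. */
	if (mount(NULL, "/sys/fs/cgroup", NULL, MS_REMOUNT, ""))
		perror("remount clear");
	return 0;
}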
@@ -1598,6 +1649,7 @@ static void cgroup_enable_task_cg_lists(void)
css_set_update_populated(cset, true);
list_add_tail(&p->cg_list, &cset->tasks);
get_css_set(cset);
+ cset->nr_tasks++;
}
spin_unlock(&p->sighand->siglock);
} while_each_thread(g, p);
@@ -1784,6 +1836,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
{
struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
struct dentry *dentry;
+ int ret;
get_cgroup_ns(ns);
@@ -1801,16 +1854,21 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
cgroup_enable_task_cg_lists();
if (fs_type == &cgroup2_fs_type) {
- if (data) {
- pr_err("cgroup2: unknown option \"%s\"\n", (char *)data);
+ unsigned int root_flags;
+
+ ret = parse_cgroup_root_flags(data, &root_flags);
+ if (ret) {
put_cgroup_ns(ns);
- return ERR_PTR(-EINVAL);
+ return ERR_PTR(ret);
}
+
cgrp_dfl_visible = true;
cgroup_get_live(&cgrp_dfl_root.cgrp);
dentry = cgroup_do_mount(&cgroup2_fs_type, flags, &cgrp_dfl_root,
CGROUP2_SUPER_MAGIC, ns);
+ if (!IS_ERR(dentry))
+ apply_cgroup_root_flags(root_flags);
} else {
dentry = cgroup1_mount(&cgroup_fs_type, flags, data,
CGROUP_SUPER_MAGIC, ns);
@@ -2064,8 +2122,10 @@ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx)
struct css_set *to_cset = cset->mg_dst_cset;
get_css_set(to_cset);
+ to_cset->nr_tasks++;
css_set_move_task(task, from_cset, to_cset, true);
put_css_set_locked(from_cset);
+ from_cset->nr_tasks--;
}
}
spin_unlock_irq(&css_set_lock);
@@ -2355,27 +2415,14 @@ static int cgroup_procs_write_permission(struct task_struct *task,
struct cgroup *dst_cgrp,
struct kernfs_open_file *of)
{
- int ret = 0;
-
- if (cgroup_on_dfl(dst_cgrp)) {
- struct super_block *sb = of->file->f_path.dentry->d_sb;
- struct cgroup *cgrp;
- struct inode *inode;
-
- spin_lock_irq(&css_set_lock);
- cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
- spin_unlock_irq(&css_set_lock);
-
- while (!cgroup_is_descendant(dst_cgrp, cgrp))
- cgrp = cgroup_parent(cgrp);
+ struct super_block *sb = of->file->f_path.dentry->d_sb;
+ struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
+ struct cgroup *root_cgrp = ns->root_cset->dfl_cgrp;
+ struct cgroup *src_cgrp, *com_cgrp;
+ struct inode *inode;
+ int ret;
- ret = -ENOMEM;
- inode = kernfs_get_inode(sb, cgrp->procs_file.kn);
- if (inode) {
- ret = inode_permission(inode, MAY_WRITE);
- iput(inode);
- }
- } else {
+ if (!cgroup_on_dfl(dst_cgrp)) {
const struct cred *cred = current_cred();
const struct cred *tcred = get_task_cred(task);
@@ -2383,14 +2430,47 @@ static int cgroup_procs_write_permission(struct task_struct *task,
* even if we're attaching all tasks in the thread group,
* we only need to check permissions on one of them.
*/
- if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
- !uid_eq(cred->euid, tcred->uid) &&
- !uid_eq(cred->euid, tcred->suid))
+ if (uid_eq(cred->euid, GLOBAL_ROOT_UID) ||
+ uid_eq(cred->euid, tcred->uid) ||
+ uid_eq(cred->euid, tcred->suid))
+ ret = 0;
+ else
ret = -EACCES;
+
put_cred(tcred);
+ return ret;
}
- return ret;
+ /* find the source cgroup */
+ spin_lock_irq(&css_set_lock);
+ src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
+ spin_unlock_irq(&css_set_lock);
+
+ /* and the common ancestor */
+ com_cgrp = src_cgrp;
+ while (!cgroup_is_descendant(dst_cgrp, com_cgrp))
+ com_cgrp = cgroup_parent(com_cgrp);
+
+ /* %current should be authorized to migrate to the common ancestor */
+ inode = kernfs_get_inode(sb, com_cgrp->procs_file.kn);
+ if (!inode)
+ return -ENOMEM;
+
+ ret = inode_permission(inode, MAY_WRITE);
+ iput(inode);
+ if (ret)
+ return ret;
+
+ /*
+ * If namespaces are delegation boundaries, %current must be able
+ * to see both source and destination cgroups from its namespace.
+ */
+ if ((cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) &&
+ (!cgroup_is_descendant(src_cgrp, root_cgrp) ||
+ !cgroup_is_descendant(dst_cgrp, root_cgrp)))
+ return -ENOENT;
+
+ return 0;
}
/*
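[Editorial note: taken together, the checks above implement the cgroup2 delegation rule — a task may be migrated if the writer has write access to cgroup.procs of the nearest common ancestor of the source and destination cgroups, and, with nsdelegate, can see both from its namespace. A hedged userspace sketch of a delegated move; it assumes root has already created and chowned a /sys/fs/cgroup/user subtree to the delegatee, so the paths and layout are illustrative only.]

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* Move ourselves from .../user/a to .../user/b.  The kernel checks
	 * write permission on cgroup.procs of the common ancestor
	 * (.../user), which the delegatee owns. */
	int fd = open("/sys/fs/cgroup/user/b/cgroup.procs", O_WRONLY);

	if (fd < 0 || dprintf(fd, "%d\n", (int)getpid()) < 0)
		perror("migrate");
	if (fd >= 0)
		close(fd);
	return 0;
}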
@@ -2954,11 +3034,23 @@ static void cgroup_file_release(struct kernfs_open_file *of)
static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
size_t nbytes, loff_t off)
{
+ struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
struct cgroup *cgrp = of->kn->parent->priv;
struct cftype *cft = of->kn->priv;
struct cgroup_subsys_state *css;
int ret;
+ /*
+ * If namespaces are delegation boundaries, disallow writes to
+ * files in a non-init namespace root from inside the namespace
+ * except for the files explicitly marked delegatable -
+ * cgroup.procs and cgroup.subtree_control.
+ */
+ if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) &&
+ !(cft->flags & CFTYPE_NS_DELEGATABLE) &&
+ ns != &init_cgroup_ns && ns->root_cset->dfl_cgrp == cgrp)
+ return -EPERM;
+
if (cft->write)
return cft->write(of, buf, nbytes, off);
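[Editorial note: seen from userspace, this check means that inside a non-init cgroup namespace whose root sits on an nsdelegate mount, only the delegatable files remain writable at the namespace root. A sketch; it assumes such a namespace is already set up and that the memory controller is enabled, both of which are assumptions rather than part of the patch.]

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static void try_write(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0 || write(fd, val, strlen(val)) < 0)
		perror(path);
	if (fd >= 0)
		close(fd);
}

int main(void)
{
	/* cgroup.procs is marked CFTYPE_NS_DELEGATABLE, so it is not
	 * blocked by the namespace-root check above (it is still subject
	 * to the migration permission checks). */
	try_write("/sys/fs/cgroup/cgroup.procs", "0\n");

	/* A non-delegatable file at the namespace root now fails with
	 * -EPERM. */
	try_write("/sys/fs/cgroup/memory.max", "max\n");
	return 0;
}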
@@ -3792,6 +3884,7 @@ static int cgroup_procs_show(struct seq_file *s, void *v)
static struct cftype cgroup_base_files[] = {
{
.name = "cgroup.procs",
+ .flags = CFTYPE_NS_DELEGATABLE,
.file_offset = offsetof(struct cgroup, procs_file),
.release = cgroup_procs_release,
.seq_start = cgroup_procs_start,
@@ -3805,6 +3898,7 @@ static struct cftype cgroup_base_files[] = {
},
{
.name = "cgroup.subtree_control",
+ .flags = CFTYPE_NS_DELEGATABLE,
.seq_show = cgroup_subtree_control_show,
.write = cgroup_subtree_control_write,
},
@@ -4393,6 +4487,7 @@ int cgroup_rmdir(struct kernfs_node *kn)
}
static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
+ .show_options = cgroup_show_options,
.remount_fs = cgroup_remount,
.mkdir = cgroup_mkdir,
.rmdir = cgroup_rmdir,
@@ -4789,6 +4884,7 @@ void cgroup_post_fork(struct task_struct *child)
cset = task_css_set(current);
if (list_empty(&child->cg_list)) {
get_css_set(cset);
+ cset->nr_tasks++;
css_set_move_task(child, NULL, cset, false);
}
spin_unlock_irq(&css_set_lock);
@@ -4838,6 +4934,7 @@ void cgroup_exit(struct task_struct *tsk)
if (!list_empty(&tsk->cg_list)) {
spin_lock_irq(&css_set_lock);
css_set_move_task(tsk, cset, NULL, false);
+ cset->nr_tasks--;
spin_unlock_irq(&css_set_lock);
} else {
get_css_set(cset);
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index ae643412948a..ca8376e5008c 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -1038,40 +1038,25 @@ static void cpuset_post_attach(void)
* @tsk: the task to change
* @newmems: new nodes that the task will be set
*
- * In order to avoid seeing no nodes if the old and new nodes are disjoint,
- * we structure updates as setting all new allowed nodes, then clearing newly
- * disallowed ones.
+ * We use the mems_allowed_seq seqlock to safely update both tsk->mems_allowed
+ * and rebind the task's mempolicy, if it has one. If the task is allocating in
+ * parallel, it might temporarily see an empty intersection, which results in
+ * a seqlock check and retry before OOM or allocation failure.
*/
static void cpuset_change_task_nodemask(struct task_struct *tsk,
nodemask_t *newmems)
{
- bool need_loop;
-
task_lock(tsk);
- /*
- * Determine if a loop is necessary if another thread is doing
- * read_mems_allowed_begin(). If at least one node remains unchanged and
- * tsk does not have a mempolicy, then an empty nodemask will not be
- * possible when mems_allowed is larger than a word.
- */
- need_loop = task_has_mempolicy(tsk) ||
- !nodes_intersects(*newmems, tsk->mems_allowed);
- if (need_loop) {
- local_irq_disable();
- write_seqcount_begin(&tsk->mems_allowed_seq);
- }
+ local_irq_disable();
+ write_seqcount_begin(&tsk->mems_allowed_seq);
nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
- mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
-
- mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2);
+ mpol_rebind_task(tsk, newmems);
tsk->mems_allowed = *newmems;
- if (need_loop) {
- write_seqcount_end(&tsk->mems_allowed_seq);
- local_irq_enable();
- }
+ write_seqcount_end(&tsk->mems_allowed_seq);
+ local_irq_enable();
task_unlock(tsk);
}
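[Editorial note: for context, the reader side that this comment relies on is the existing seqcount retry pattern used by the page allocator; the sketch below shows that pattern and is not code introduced by this patch.]

	nodemask_t nodes;
	unsigned int seq;

	do {
		seq = read_mems_allowed_begin();
		nodes = current->mems_allowed;
		/* ... try to allocate from 'nodes' ... */
	} while (read_mems_allowed_retry(seq));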
diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c
new file mode 100644
index 000000000000..dac46af22782
--- /dev/null
+++ b/kernel/cgroup/debug.c
@@ -0,0 +1,357 @@
+/*
+ * Debug controller
+ *
+ * WARNING: This controller is for cgroup core debugging only.
+ * Its interfaces are unstable and subject to changes at any time.
+ */
+#include <linux/ctype.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+
+#include "cgroup-internal.h"
+
+static struct cgroup_subsys_state *
+debug_css_alloc(struct cgroup_subsys_state *parent_css)
+{
+ struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
+
+ if (!css)
+ return ERR_PTR(-ENOMEM);
+
+ return css;
+}
+
+static void debug_css_free(struct cgroup_subsys_state *css)
+{
+ kfree(css);
+}
+
+/*
+ * debug_taskcount_read - return the number of tasks in a cgroup.
+ * @cgrp: the cgroup in question
+ */
+static u64 debug_taskcount_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ return cgroup_task_count(css->cgroup);
+}
+
+static int current_css_set_read(struct seq_file *seq, void *v)
+{
+ struct kernfs_open_file *of = seq->private;
+ struct css_set *cset;
+ struct cgroup_subsys *ss;
+ struct cgroup_subsys_state *css;
+ int i, refcnt;
+
+ if (!cgroup_kn_lock_live(of->kn, false))
+ return -ENODEV;
+
+ spin_lock_irq(&css_set_lock);
+ rcu_read_lock();
+ cset = rcu_dereference(current->cgroups);
+ refcnt = refcount_read(&cset->refcount);
+ seq_printf(seq, "css_set %pK %d", cset, refcnt);
+ if (refcnt > cset->nr_tasks)
+ seq_printf(seq, " +%d", refcnt - cset->nr_tasks);
+ seq_puts(seq, "\n");
+
+ /*
+ * Print the css'es stored in the current css_set.
+ */
+ for_each_subsys(ss, i) {
+ css = cset->subsys[ss->id];
+ if (!css)
+ continue;
+ seq_printf(seq, "%2d: %-4s\t- %lx[%d]\n", ss->id, ss->name,
+ (unsigned long)css, css->id);
+ }
+ rcu_read_unlock();
+ spin_unlock_irq(&css_set_lock);
+ cgroup_kn_unlock(of->kn);
+ return 0;
+}
+
+static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ u64 count;
+
+ rcu_read_lock();
+ count = refcount_read(&task_css_set(current)->refcount);
+ rcu_read_unlock();
+ return count;
+}
+
+static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
+{
+ struct cgrp_cset_link *link;
+ struct css_set *cset;
+ char *name_buf;
+
+ name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
+ if (!name_buf)
+ return -ENOMEM;
+
+ spin_lock_irq(&css_set_lock);
+ rcu_read_lock();
+ cset = rcu_dereference(current->cgroups);
+ list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
+ struct cgroup *c = link->cgrp;
+
+ cgroup_name(c, name_buf, NAME_MAX + 1);
+ seq_printf(seq, "Root %d group %s\n",
+ c->root->hierarchy_id, name_buf);
+ }
+ rcu_read_unlock();
+ spin_unlock_irq(&css_set_lock);
+ kfree(name_buf);
+ return 0;
+}
+
+#define MAX_TASKS_SHOWN_PER_CSS 25
+static int cgroup_css_links_read(struct seq_file *seq, void *v)
+{
+ struct cgroup_subsys_state *css = seq_css(seq);
+ struct cgrp_cset_link *link;
+ int dead_cnt = 0, extra_refs = 0;
+
+ spin_lock_irq(&css_set_lock);
+ list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
+ struct css_set *cset = link->cset;
+ struct task_struct *task;
+ int count = 0;
+ int refcnt = refcount_read(&cset->refcount);
+
+ seq_printf(seq, " %d", refcnt);
+ if (refcnt - cset->nr_tasks > 0) {
+ int extra = refcnt - cset->nr_tasks;
+
+ seq_printf(seq, " +%d", extra);
+ /*
+ * Take out the one additional reference in
+ * init_css_set.
+ */
+ if (cset == &init_css_set)
+ extra--;
+ extra_refs += extra;
+ }
+ seq_puts(seq, "\n");
+
+ list_for_each_entry(task, &cset->tasks, cg_list) {
+ if (count++ <= MAX_TASKS_SHOWN_PER_CSS)
+ seq_printf(seq, " task %d\n",
+ task_pid_vnr(task));
+ }
+
+ list_for_each_entry(task, &cset->mg_tasks, cg_list) {
+ if (count++ <= MAX_TASKS_SHOWN_PER_CSS)
+ seq_printf(seq, " task %d\n",
+ task_pid_vnr(task));
+ }
+ /* show # of overflowed tasks */
+ if (count > MAX_TASKS_SHOWN_PER_CSS)
+ seq_printf(seq, " ... (%d)\n",
+ count - MAX_TASKS_SHOWN_PER_CSS);
+
+ if (cset->dead) {
+ seq_puts(seq, " [dead]\n");
+ dead_cnt++;
+ }
+
+ WARN_ON(count != cset->nr_tasks);
+ }
+ spin_unlock_irq(&css_set_lock);
+
+ if (!dead_cnt && !extra_refs)
+ return 0;
+
+ seq_puts(seq, "\n");
+ if (extra_refs)
+ seq_printf(seq, "extra references = %d\n", extra_refs);
+ if (dead_cnt)
+ seq_printf(seq, "dead css_sets = %d\n", dead_cnt);
+
+ return 0;
+}
+
+static int cgroup_subsys_states_read(struct seq_file *seq, void *v)
+{
+ struct kernfs_open_file *of = seq->private;
+ struct cgroup *cgrp;
+ struct cgroup_subsys *ss;
+ struct cgroup_subsys_state *css;
+ char pbuf[16];
+ int i;
+
+ cgrp = cgroup_kn_lock_live(of->kn, false);
+ if (!cgrp)
+ return -ENODEV;
+
+ for_each_subsys(ss, i) {
+ css = rcu_dereference_check(cgrp->subsys[ss->id], true);
+ if (!css)
+ continue;
+
+ pbuf[0] = '\0';
+
+ /* Show the parent CSS if applicable */
+ if (css->parent)
+ snprintf(pbuf, sizeof(pbuf) - 1, " P=%d",
+ css->parent->id);
+ seq_printf(seq, "%2d: %-4s\t- %lx[%d] %d%s\n", ss->id, ss->name,
+ (unsigned long)css, css->id,
+ atomic_read(&css->online_cnt), pbuf);
+ }
+
+ cgroup_kn_unlock(of->kn);
+ return 0;
+}
+
+static void cgroup_masks_read_one(struct seq_file *seq, const char *name,
+ u16 mask)
+{
+ struct cgroup_subsys *ss;
+ int ssid;
+ bool first = true;
+
+ seq_printf(seq, "%-17s: ", name);
+ for_each_subsys(ss, ssid) {
+ if (!(mask & (1 << ssid)))
+ continue;
+ if (!first)
+ seq_puts(seq, ", ");
+ seq_puts(seq, ss->name);
+ first = false;
+ }
+ seq_putc(seq, '\n');
+}
+
+static int cgroup_masks_read(struct seq_file *seq, void *v)
+{
+ struct kernfs_open_file *of = seq->private;
+ struct cgroup *cgrp;
+
+ cgrp = cgroup_kn_lock_live(of->kn, false);
+ if (!cgrp)
+ return -ENODEV;
+
+ cgroup_masks_read_one(seq, "subtree_control", cgrp->subtree_control);
+ cgroup_masks_read_one(seq, "subtree_ss_mask", cgrp->subtree_ss_mask);
+
+ cgroup_kn_unlock(of->kn);
+ return 0;
+}
+
+static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
+{
+ return (!cgroup_is_populated(css->cgroup) &&
+ !css_has_online_children(&css->cgroup->self));
+}
+
+static struct cftype debug_legacy_files[] = {
+ {
+ .name = "taskcount",
+ .read_u64 = debug_taskcount_read,
+ },
+
+ {
+ .name = "current_css_set",
+ .seq_show = current_css_set_read,
+ .flags = CFTYPE_ONLY_ON_ROOT,
+ },
+
+ {
+ .name = "current_css_set_refcount",
+ .read_u64 = current_css_set_refcount_read,
+ .flags = CFTYPE_ONLY_ON_ROOT,
+ },
+
+ {
+ .name = "current_css_set_cg_links",
+ .seq_show = current_css_set_cg_links_read,
+ .flags = CFTYPE_ONLY_ON_ROOT,
+ },
+
+ {
+ .name = "cgroup_css_links",
+ .seq_show = cgroup_css_links_read,
+ },
+
+ {
+ .name = "cgroup_subsys_states",
+ .seq_show = cgroup_subsys_states_read,
+ },
+
+ {
+ .name = "cgroup_masks",
+ .seq_show = cgroup_masks_read,
+ },
+
+ {
+ .name = "releasable",
+ .read_u64 = releasable_read,
+ },
+
+ { } /* terminate */
+};
+
+static struct cftype debug_files[] = {
+ {
+ .name = "taskcount",
+ .read_u64 = debug_taskcount_read,
+ },
+
+ {
+ .name = "current_css_set",
+ .seq_show = current_css_set_read,
+ .flags = CFTYPE_ONLY_ON_ROOT,
+ },
+
+ {
+ .name = "current_css_set_refcount",
+ .read_u64 = current_css_set_refcount_read,
+ .flags = CFTYPE_ONLY_ON_ROOT,
+ },
+
+ {
+ .name = "current_css_set_cg_links",
+ .seq_show = current_css_set_cg_links_read,
+ .flags = CFTYPE_ONLY_ON_ROOT,
+ },
+
+ {
+ .name = "css_links",
+ .seq_show = cgroup_css_links_read,
+ },
+
+ {
+ .name = "csses",
+ .seq_show = cgroup_subsys_states_read,
+ },
+
+ {
+ .name = "masks",
+ .seq_show = cgroup_masks_read,
+ },
+
+ { } /* terminate */
+};
+
+struct cgroup_subsys debug_cgrp_subsys = {
+ .css_alloc = debug_css_alloc,
+ .css_free = debug_css_free,
+ .legacy_cftypes = debug_legacy_files,
+};
+
+/*
+ * On v2, debug is an implicit controller enabled by the "cgroup_debug" boot
+ * parameter.
+ */
+static int __init enable_cgroup_debug(char *str)
+{
+ debug_cgrp_subsys.dfl_cftypes = debug_files;
+ debug_cgrp_subsys.implicit_on_dfl = true;
+ return 1;
+}
+__setup("cgroup_debug", enable_cgroup_debug);
diff --git a/kernel/compat.c b/kernel/compat.c
index ebd8bdc3fd68..6f0a0e723a06 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -120,6 +120,50 @@ static int __compat_put_timespec(const struct timespec *ts, struct compat_timesp
__put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0;
}
+static int __compat_get_timespec64(struct timespec64 *ts64,
+ const struct compat_timespec __user *cts)
+{
+ struct compat_timespec ts;
+ int ret;
+
+ ret = copy_from_user(&ts, cts, sizeof(ts));
+ if (ret)
+ return -EFAULT;
+
+ ts64->tv_sec = ts.tv_sec;
+ ts64->tv_nsec = ts.tv_nsec;
+
+ return 0;
+}
+
+static int __compat_put_timespec64(const struct timespec64 *ts64,
+ struct compat_timespec __user *cts)
+{
+ struct compat_timespec ts = {
+ .tv_sec = ts64->tv_sec,
+ .tv_nsec = ts64->tv_nsec
+ };
+ return copy_to_user(cts, &ts, sizeof(ts)) ? -EFAULT : 0;
+}
+
+int compat_get_timespec64(struct timespec64 *ts, const void __user *uts)
+{
+ if (COMPAT_USE_64BIT_TIME)
+ return copy_from_user(ts, uts, sizeof(*ts)) ? -EFAULT : 0;
+ else
+ return __compat_get_timespec64(ts, uts);
+}
+EXPORT_SYMBOL_GPL(compat_get_timespec64);
+
+int compat_put_timespec64(const struct timespec64 *ts, void __user *uts)
+{
+ if (COMPAT_USE_64BIT_TIME)
+ return copy_to_user(uts, ts, sizeof(*ts)) ? -EFAULT : 0;
+ else
+ return __compat_put_timespec64(ts, uts);
+}
+EXPORT_SYMBOL_GPL(compat_put_timespec64);
+
int compat_get_timeval(struct timeval *tv, const void __user *utv)
{
if (COMPAT_USE_64BIT_TIME)
@@ -203,53 +247,6 @@ int put_compat_itimerval(struct compat_itimerval __user *o, const struct itimerv
return copy_to_user(o, &v32, sizeof(struct compat_itimerval)) ? -EFAULT : 0;
}
-static compat_clock_t clock_t_to_compat_clock_t(clock_t x)
-{
- return compat_jiffies_to_clock_t(clock_t_to_jiffies(x));
-}
-
-COMPAT_SYSCALL_DEFINE1(times, struct compat_tms __user *, tbuf)
-{
- if (tbuf) {
- struct tms tms;
- struct compat_tms tmp;
-
- do_sys_times(&tms);
- /* Convert our struct tms to the compat version. */
- tmp.tms_utime = clock_t_to_compat_clock_t(tms.tms_utime);
- tmp.tms_stime = clock_t_to_compat_clock_t(tms.tms_stime);
- tmp.tms_cutime = clock_t_to_compat_clock_t(tms.tms_cutime);
- tmp.tms_cstime = clock_t_to_compat_clock_t(tms.tms_cstime);
- if (copy_to_user(tbuf, &tmp, sizeof(tmp)))
- return -EFAULT;
- }
- force_successful_syscall_return();
- return compat_jiffies_to_clock_t(jiffies);
-}
-
-#ifdef __ARCH_WANT_SYS_SIGPENDING
-
-/*
- * Assumption: old_sigset_t and compat_old_sigset_t are both
- * types that can be passed to put_user()/get_user().
- */
-
-COMPAT_SYSCALL_DEFINE1(sigpending, compat_old_sigset_t __user *, set)
-{
- old_sigset_t s;
- long ret;
- mm_segment_t old_fs = get_fs();
-
- set_fs(KERNEL_DS);
- ret = sys_sigpending((old_sigset_t __user *) &s);
- set_fs(old_fs);
- if (ret == 0)
- ret = put_user(s, set);
- return ret;
-}
-
-#endif
-
#ifdef __ARCH_WANT_SYS_SIGPROCMASK
/*
@@ -304,164 +301,33 @@ COMPAT_SYSCALL_DEFINE3(sigprocmask, int, how,
#endif
-COMPAT_SYSCALL_DEFINE2(setrlimit, unsigned int, resource,
- struct compat_rlimit __user *, rlim)
-{
- struct rlimit r;
-
- if (!access_ok(VERIFY_READ, rlim, sizeof(*rlim)) ||
- __get_user(r.rlim_cur, &rlim->rlim_cur) ||
- __get_user(r.rlim_max, &rlim->rlim_max))
- return -EFAULT;
-
- if (r.rlim_cur == COMPAT_RLIM_INFINITY)
- r.rlim_cur = RLIM_INFINITY;
- if (r.rlim_max == COMPAT_RLIM_INFINITY)
- r.rlim_max = RLIM_INFINITY;
- return do_prlimit(current, resource, &r, NULL);
-}
-
-#ifdef COMPAT_RLIM_OLD_INFINITY
-
-COMPAT_SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource,
- struct compat_rlimit __user *, rlim)
-{
- struct rlimit r;
- int ret;
- mm_segment_t old_fs = get_fs();
-
- set_fs(KERNEL_DS);
- ret = sys_old_getrlimit(resource, (struct rlimit __user *)&r);
- set_fs(old_fs);
-
- if (!ret) {
- if (r.rlim_cur > COMPAT_RLIM_OLD_INFINITY)
- r.rlim_cur = COMPAT_RLIM_INFINITY;
- if (r.rlim_max > COMPAT_RLIM_OLD_INFINITY)
- r.rlim_max = COMPAT_RLIM_INFINITY;
-
- if (!access_ok(VERIFY_WRITE, rlim, sizeof(*rlim)) ||
- __put_user(r.rlim_cur, &rlim->rlim_cur) ||
- __put_user(r.rlim_max, &rlim->rlim_max))
- return -EFAULT;
- }
- return ret;
-}
-
-#endif
-
-COMPAT_SYSCALL_DEFINE2(getrlimit, unsigned int, resource,
- struct compat_rlimit __user *, rlim)
-{
- struct rlimit r;
- int ret;
-
- ret = do_prlimit(current, resource, NULL, &r);
- if (!ret) {
- if (r.rlim_cur > COMPAT_RLIM_INFINITY)
- r.rlim_cur = COMPAT_RLIM_INFINITY;
- if (r.rlim_max > COMPAT_RLIM_INFINITY)
- r.rlim_max = COMPAT_RLIM_INFINITY;
-
- if (!access_ok(VERIFY_WRITE, rlim, sizeof(*rlim)) ||
- __put_user(r.rlim_cur, &rlim->rlim_cur) ||
- __put_user(r.rlim_max, &rlim->rlim_max))
- return -EFAULT;
- }
- return ret;
-}
-
int put_compat_rusage(const struct rusage *r, struct compat_rusage __user *ru)
{
- if (!access_ok(VERIFY_WRITE, ru, sizeof(*ru)) ||
- __put_user(r->ru_utime.tv_sec, &ru->ru_utime.tv_sec) ||
- __put_user(r->ru_utime.tv_usec, &ru->ru_utime.tv_usec) ||
- __put_user(r->ru_stime.tv_sec, &ru->ru_stime.tv_sec) ||
- __put_user(r->ru_stime.tv_usec, &ru->ru_stime.tv_usec) ||
- __put_user(r->ru_maxrss, &ru->ru_maxrss) ||
- __put_user(r->ru_ixrss, &ru->ru_ixrss) ||
- __put_user(r->ru_idrss, &ru->ru_idrss) ||
- __put_user(r->ru_isrss, &ru->ru_isrss) ||
- __put_user(r->ru_minflt, &ru->ru_minflt) ||
- __put_user(r->ru_majflt, &ru->ru_majflt) ||
- __put_user(r->ru_nswap, &ru->ru_nswap) ||
- __put_user(r->ru_inblock, &ru->ru_inblock) ||
- __put_user(r->ru_oublock, &ru->ru_oublock) ||
- __put_user(r->ru_msgsnd, &ru->ru_msgsnd) ||
- __put_user(r->ru_msgrcv, &ru->ru_msgrcv) ||
- __put_user(r->ru_nsignals, &ru->ru_nsignals) ||
- __put_user(r->ru_nvcsw, &ru->ru_nvcsw) ||
- __put_user(r->ru_nivcsw, &ru->ru_nivcsw))
+ struct compat_rusage r32;
+ memset(&r32, 0, sizeof(r32));
+ r32.ru_utime.tv_sec = r->ru_utime.tv_sec;
+ r32.ru_utime.tv_usec = r->ru_utime.tv_usec;
+ r32.ru_stime.tv_sec = r->ru_stime.tv_sec;
+ r32.ru_stime.tv_usec = r->ru_stime.tv_usec;
+ r32.ru_maxrss = r->ru_maxrss;
+ r32.ru_ixrss = r->ru_ixrss;
+ r32.ru_idrss = r->ru_idrss;
+ r32.ru_isrss = r->ru_isrss;
+ r32.ru_minflt = r->ru_minflt;
+ r32.ru_majflt = r->ru_majflt;
+ r32.ru_nswap = r->ru_nswap;
+ r32.ru_inblock = r->ru_inblock;
+ r32.ru_oublock = r->ru_oublock;
+ r32.ru_msgsnd = r->ru_msgsnd;
+ r32.ru_msgrcv = r->ru_msgrcv;
+ r32.ru_nsignals = r->ru_nsignals;
+ r32.ru_nvcsw = r->ru_nvcsw;
+ r32.ru_nivcsw = r->ru_nivcsw;
+ if (copy_to_user(ru, &r32, sizeof(r32)))
return -EFAULT;
return 0;
}
-COMPAT_SYSCALL_DEFINE4(wait4,
- compat_pid_t, pid,
- compat_uint_t __user *, stat_addr,
- int, options,
- struct compat_rusage __user *, ru)
-{
- if (!ru) {
- return sys_wait4(pid, stat_addr, options, NULL);
- } else {
- struct rusage r;
- int ret;
- unsigned int status;
- mm_segment_t old_fs = get_fs();
-
- set_fs (KERNEL_DS);
- ret = sys_wait4(pid,
- (stat_addr ?
- (unsigned int __user *) &status : NULL),
- options, (struct rusage __user *) &r);
- set_fs (old_fs);
-
- if (ret > 0) {
- if (put_compat_rusage(&r, ru))
- return -EFAULT;
- if (stat_addr && put_user(status, stat_addr))
- return -EFAULT;
- }
- return ret;
- }
-}
-
-COMPAT_SYSCALL_DEFINE5(waitid,
- int, which, compat_pid_t, pid,
- struct compat_siginfo __user *, uinfo, int, options,
- struct compat_rusage __user *, uru)
-{
- siginfo_t info;
- struct rusage ru;
- long ret;
- mm_segment_t old_fs = get_fs();
-
- memset(&info, 0, sizeof(info));
-
- set_fs(KERNEL_DS);
- ret = sys_waitid(which, pid, (siginfo_t __user *)&info, options,
- uru ? (struct rusage __user *)&ru : NULL);
- set_fs(old_fs);
-
- if ((ret < 0) || (info.si_signo == 0))
- return ret;
-
- if (uru) {
- /* sys_waitid() overwrites everything in ru */
- if (COMPAT_USE_64BIT_TIME)
- ret = copy_to_user(uru, &ru, sizeof(ru));
- else
- ret = put_compat_rusage(&ru, uru);
- if (ret)
- return -EFAULT;
- }
-
- BUG_ON(info.si_code & __SI_MASK);
- info.si_code |= __SI_CHLD;
- return copy_siginfo_to_user32(uinfo, &info);
-}
-
static int compat_get_user_cpu_mask(compat_ulong_t __user *user_mask_ptr,
unsigned len, struct cpumask *new_mask)
{
@@ -542,6 +408,27 @@ int put_compat_itimerspec(struct compat_itimerspec __user *dst,
return 0;
}
+int get_compat_itimerspec64(struct itimerspec64 *its,
+ const struct compat_itimerspec __user *uits)
+{
+
+ if (__compat_get_timespec64(&its->it_interval, &uits->it_interval) ||
+ __compat_get_timespec64(&its->it_value, &uits->it_value))
+ return -EFAULT;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(get_compat_itimerspec64);
+
+int put_compat_itimerspec64(const struct itimerspec64 *its,
+ struct compat_itimerspec __user *uits)
+{
+ if (__compat_put_timespec64(&its->it_interval, &uits->it_interval) ||
+ __compat_put_timespec64(&its->it_value, &uits->it_value))
+ return -EFAULT;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(put_compat_itimerspec64);
+
/*
* We currently only need the following fields from the sigevent
* structure: sigev_value, sigev_signo, sig_notify and (sometimes
@@ -566,84 +453,59 @@ int get_compat_sigevent(struct sigevent *event,
long compat_get_bitmap(unsigned long *mask, const compat_ulong_t __user *umask,
unsigned long bitmap_size)
{
- int i, j;
- unsigned long m;
- compat_ulong_t um;
unsigned long nr_compat_longs;
/* align bitmap up to nearest compat_long_t boundary */
bitmap_size = ALIGN(bitmap_size, BITS_PER_COMPAT_LONG);
+ nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size);
if (!access_ok(VERIFY_READ, umask, bitmap_size / 8))
return -EFAULT;
- nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size);
-
- for (i = 0; i < BITS_TO_LONGS(bitmap_size); i++) {
- m = 0;
-
- for (j = 0; j < sizeof(m)/sizeof(um); j++) {
- /*
- * We dont want to read past the end of the userspace
- * bitmap. We must however ensure the end of the
- * kernel bitmap is zeroed.
- */
- if (nr_compat_longs) {
- nr_compat_longs--;
- if (__get_user(um, umask))
- return -EFAULT;
- } else {
- um = 0;
- }
-
- umask++;
- m |= (long)um << (j * BITS_PER_COMPAT_LONG);
- }
- *mask++ = m;
+ user_access_begin();
+ while (nr_compat_longs > 1) {
+ compat_ulong_t l1, l2;
+ unsafe_get_user(l1, umask++, Efault);
+ unsafe_get_user(l2, umask++, Efault);
+ *mask++ = ((unsigned long)l2 << BITS_PER_COMPAT_LONG) | l1;
+ nr_compat_longs -= 2;
}
-
+ if (nr_compat_longs)
+ unsafe_get_user(*mask, umask++, Efault);
+ user_access_end();
return 0;
+
+Efault:
+ user_access_end();
+ return -EFAULT;
}
long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask,
unsigned long bitmap_size)
{
- int i, j;
- unsigned long m;
- compat_ulong_t um;
unsigned long nr_compat_longs;
/* align bitmap up to nearest compat_long_t boundary */
bitmap_size = ALIGN(bitmap_size, BITS_PER_COMPAT_LONG);
+ nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size);
if (!access_ok(VERIFY_WRITE, umask, bitmap_size / 8))
return -EFAULT;
- nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size);
-
- for (i = 0; i < BITS_TO_LONGS(bitmap_size); i++) {
- m = *mask++;
-
- for (j = 0; j < sizeof(m)/sizeof(um); j++) {
- um = m;
-
- /*
- * We dont want to write past the end of the userspace
- * bitmap.
- */
- if (nr_compat_longs) {
- nr_compat_longs--;
- if (__put_user(um, umask))
- return -EFAULT;
- }
-
- umask++;
- m >>= 4*sizeof(um);
- m >>= 4*sizeof(um);
- }
+ user_access_begin();
+ while (nr_compat_longs > 1) {
+ unsigned long m = *mask++;
+ unsafe_put_user((compat_ulong_t)m, umask++, Efault);
+ unsafe_put_user(m >> BITS_PER_COMPAT_LONG, umask++, Efault);
+ nr_compat_longs -= 2;
}
-
+ if (nr_compat_longs)
+ unsafe_put_user((compat_ulong_t)*mask, umask++, Efault);
+ user_access_end();
return 0;
+Efault:
+ user_access_end();
+ return -EFAULT;
}
void
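[Editorial note: the word packing used by the rewritten loops is easier to see in isolation. A standalone sketch in plain userspace C, assuming 64-bit unsigned long and 32-bit compat words — the case the code above is written for.]

#include <stdint.h>
#include <stdio.h>

#define BITS_PER_COMPAT_LONG 32

int main(void)
{
	uint32_t l1 = 0xdeadbeef, l2 = 0x01234567;	/* two compat words */

	/* compat_get_bitmap(): low word first, then the high word */
	uint64_t m = ((uint64_t)l2 << BITS_PER_COMPAT_LONG) | l1;

	/* compat_put_bitmap(): split a native word back into two halves */
	printf("packed   %#018llx\n", (unsigned long long)m);
	printf("unpacked %#010x %#010x\n",
	       (unsigned int)(uint32_t)m,
	       (unsigned int)(m >> BITS_PER_COMPAT_LONG));
	return 0;
}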
@@ -669,38 +531,6 @@ sigset_to_compat(compat_sigset_t *compat, const sigset_t *set)
}
}
-COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese,
- struct compat_siginfo __user *, uinfo,
- struct compat_timespec __user *, uts, compat_size_t, sigsetsize)
-{
- compat_sigset_t s32;
- sigset_t s;
- struct timespec t;
- siginfo_t info;
- long ret;
-
- if (sigsetsize != sizeof(sigset_t))
- return -EINVAL;
-
- if (copy_from_user(&s32, uthese, sizeof(compat_sigset_t)))
- return -EFAULT;
- sigset_from_compat(&s, &s32);
-
- if (uts) {
- if (compat_get_timespec(&t, uts))
- return -EFAULT;
- }
-
- ret = do_sigtimedwait(&s, &info, uts ? &t : NULL);
-
- if (ret > 0 && uinfo) {
- if (copy_siginfo_to_user32(uinfo, &info))
- ret = -EFAULT;
- }
-
- return ret;
-}
-
#ifdef CONFIG_NUMA
COMPAT_SYSCALL_DEFINE6(move_pages, pid_t, pid, compat_ulong_t, nr_pages,
compat_uptr_t __user *, pages32,
diff --git a/kernel/cpu.c b/kernel/cpu.c
index b03a32595cfe..ab860453841d 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -271,11 +271,25 @@ void cpu_hotplug_enable(void)
EXPORT_SYMBOL_GPL(cpu_hotplug_enable);
#endif /* CONFIG_HOTPLUG_CPU */
+static void __cpuhp_kick_ap_work(struct cpuhp_cpu_state *st);
+
static int bringup_wait_for_ap(unsigned int cpu)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
+ /* Wait for the CPU to reach CPUHP_AP_ONLINE_IDLE */
wait_for_completion(&st->done);
+ BUG_ON(!cpu_online(cpu));
+
+ /* Unpark the stopper thread and the hotplug thread of the target cpu */
+ stop_machine_unpark(cpu);
+ kthread_unpark(st->thread);
+
+ /* Should we go further up? */
+ if (st->target > CPUHP_AP_ONLINE_IDLE) {
+ __cpuhp_kick_ap_work(st);
+ wait_for_completion(&st->done);
+ }
return st->result;
}
@@ -296,9 +310,7 @@ static int bringup_cpu(unsigned int cpu)
irq_unlock_sparse();
if (ret)
return ret;
- ret = bringup_wait_for_ap(cpu);
- BUG_ON(!cpu_online(cpu));
- return ret;
+ return bringup_wait_for_ap(cpu);
}
/*
@@ -767,31 +779,20 @@ void notify_cpu_starting(unsigned int cpu)
}
/*
- * Called from the idle task. We need to set active here, so we can kick off
- * the stopper thread and unpark the smpboot threads. If the target state is
- * beyond CPUHP_AP_ONLINE_IDLE we kick cpuhp thread and let it bring up the
- * cpu further.
+ * Called from the idle task. Wake up the controlling task which brings the
+ * stopper and the hotplug thread of the upcoming CPU up and then delegates
+ * the rest of the online bringup to the hotplug thread.
*/
void cpuhp_online_idle(enum cpuhp_state state)
{
struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
- unsigned int cpu = smp_processor_id();
/* Happens for the boot cpu */
if (state != CPUHP_AP_ONLINE_IDLE)
return;
st->state = CPUHP_AP_ONLINE_IDLE;
-
- /* Unpark the stopper thread and the hotplug thread of this cpu */
- stop_machine_unpark(cpu);
- kthread_unpark(st->thread);
-
- /* Should we go further up ? */
- if (st->target > CPUHP_AP_ONLINE_IDLE)
- __cpuhp_kick_ap_work(st);
- else
- complete(&st->done);
+ complete(&st->done);
}
/* Requires cpu_add_remove_lock to be held */
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 4d2c32f98482..1538df9b2b65 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3632,10 +3632,10 @@ static inline u64 perf_event_count(struct perf_event *event)
* will not be local and we cannot read them atomically
* - must not have a pmu::count method
*/
-u64 perf_event_read_local(struct perf_event *event)
+int perf_event_read_local(struct perf_event *event, u64 *value)
{
unsigned long flags;
- u64 val;
+ int ret = 0;
/*
* Disabling interrupts avoids all counter scheduling (context
@@ -3643,25 +3643,37 @@ u64 perf_event_read_local(struct perf_event *event)
*/
local_irq_save(flags);
- /* If this is a per-task event, it must be for current */
- WARN_ON_ONCE((event->attach_state & PERF_ATTACH_TASK) &&
- event->hw.target != current);
-
- /* If this is a per-CPU event, it must be for this CPU */
- WARN_ON_ONCE(!(event->attach_state & PERF_ATTACH_TASK) &&
- event->cpu != smp_processor_id());
-
/*
* It must not be an event with inherit set, we cannot read
* all child counters from atomic context.
*/
- WARN_ON_ONCE(event->attr.inherit);
+ if (event->attr.inherit) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
/*
* It must not have a pmu::count method, those are not
* NMI safe.
*/
- WARN_ON_ONCE(event->pmu->count);
+ if (event->pmu->count) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
+ /* If this is a per-task event, it must be for current */
+ if ((event->attach_state & PERF_ATTACH_TASK) &&
+ event->hw.target != current) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* If this is a per-CPU event, it must be for this CPU */
+ if (!(event->attach_state & PERF_ATTACH_TASK) &&
+ event->cpu != smp_processor_id()) {
+ ret = -EINVAL;
+ goto out;
+ }
/*
* If the event is currently on this CPU, its either a per-task event,
@@ -3671,10 +3683,11 @@ u64 perf_event_read_local(struct perf_event *event)
if (event->oncpu == smp_processor_id())
event->pmu->read(event);
- val = local64_read(&event->count);
+ *value = local64_read(&event->count);
+out:
local_irq_restore(flags);
- return val;
+ return ret;
}
static int perf_event_read(struct perf_event *event, bool group)
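[Editorial note: since the function now reports failures instead of warning and returning a stale value, callers have to adopt the new convention. A hedged sketch of a caller; the surrounding code is illustrative and not taken from this patch.]

	u64 value;
	int err;

	err = perf_event_read_local(event, &value);
	if (err)
		return err;	/* -EOPNOTSUPP or -EINVAL, per the checks above */
	/* ... use 'value' ... */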
@@ -8049,12 +8062,8 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
bool is_kprobe, is_tracepoint;
struct bpf_prog *prog;
- if (event->attr.type == PERF_TYPE_HARDWARE ||
- event->attr.type == PERF_TYPE_SOFTWARE)
- return perf_event_set_bpf_handler(event, prog_fd);
-
if (event->attr.type != PERF_TYPE_TRACEPOINT)
- return -EINVAL;
+ return perf_event_set_bpf_handler(event, prog_fd);
if (event->tp_event->prog)
return -EEXIST;
diff --git a/kernel/exit.c b/kernel/exit.c
index c63226283aef..608c9775a37b 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -51,7 +51,6 @@
#include <linux/task_io_accounting_ops.h>
#include <linux/tracehook.h>
#include <linux/fs_struct.h>
-#include <linux/userfaultfd_k.h>
#include <linux/init_task.h>
#include <linux/perf_event.h>
#include <trace/events/sched.h>
@@ -62,6 +61,7 @@
#include <linux/kcov.h>
#include <linux/random.h>
#include <linux/rcuwait.h>
+#include <linux/compat.h>
#include <linux/uaccess.h>
#include <asm/unistd.h>
@@ -982,14 +982,21 @@ SYSCALL_DEFINE1(exit_group, int, error_code)
return 0;
}
+struct waitid_info {
+ pid_t pid;
+ uid_t uid;
+ int status;
+ int cause;
+};
+
struct wait_opts {
enum pid_type wo_type;
int wo_flags;
struct pid *wo_pid;
- struct siginfo __user *wo_info;
- int __user *wo_stat;
- struct rusage __user *wo_rusage;
+ struct waitid_info *wo_info;
+ int wo_stat;
+ struct rusage *wo_rusage;
wait_queue_entry_t child_wait;
int notask_error;
@@ -1036,34 +1043,6 @@ eligible_child(struct wait_opts *wo, bool ptrace, struct task_struct *p)
return 1;
}
-static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p,
- pid_t pid, uid_t uid, int why, int status)
-{
- struct siginfo __user *infop;
- int retval = wo->wo_rusage
- ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
-
- put_task_struct(p);
- infop = wo->wo_info;
- if (infop) {
- if (!retval)
- retval = put_user(SIGCHLD, &infop->si_signo);
- if (!retval)
- retval = put_user(0, &infop->si_errno);
- if (!retval)
- retval = put_user((short)why, &infop->si_code);
- if (!retval)
- retval = put_user(pid, &infop->si_pid);
- if (!retval)
- retval = put_user(uid, &infop->si_uid);
- if (!retval)
- retval = put_user(status, &infop->si_status);
- }
- if (!retval)
- retval = pid;
- return retval;
-}
-
/*
* Handle sys_wait4 work for one task in state EXIT_ZOMBIE. We hold
* read_lock(&tasklist_lock) on entry. If we return zero, we still hold
@@ -1072,30 +1051,23 @@ static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p,
*/
static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
{
- int state, retval, status;
+ int state, status;
pid_t pid = task_pid_vnr(p);
uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p));
- struct siginfo __user *infop;
+ struct waitid_info *infop;
if (!likely(wo->wo_flags & WEXITED))
return 0;
if (unlikely(wo->wo_flags & WNOWAIT)) {
- int exit_code = p->exit_code;
- int why;
-
+ status = p->exit_code;
get_task_struct(p);
read_unlock(&tasklist_lock);
sched_annotate_sleep();
-
- if ((exit_code & 0x7f) == 0) {
- why = CLD_EXITED;
- status = exit_code >> 8;
- } else {
- why = (exit_code & 0x80) ? CLD_DUMPED : CLD_KILLED;
- status = exit_code & 0x7f;
- }
- return wait_noreap_copyout(wo, p, pid, uid, why, status);
+ if (wo->wo_rusage)
+ getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
+ put_task_struct(p);
+ goto out_info;
}
/*
* Move the task's state to DEAD/TRACE, only one thread can do this.
@@ -1168,38 +1140,11 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
spin_unlock_irq(&current->sighand->siglock);
}
- retval = wo->wo_rusage
- ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
+ if (wo->wo_rusage)
+ getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
status = (p->signal->flags & SIGNAL_GROUP_EXIT)
? p->signal->group_exit_code : p->exit_code;
- if (!retval && wo->wo_stat)
- retval = put_user(status, wo->wo_stat);
-
- infop = wo->wo_info;
- if (!retval && infop)
- retval = put_user(SIGCHLD, &infop->si_signo);
- if (!retval && infop)
- retval = put_user(0, &infop->si_errno);
- if (!retval && infop) {
- int why;
-
- if ((status & 0x7f) == 0) {
- why = CLD_EXITED;
- status >>= 8;
- } else {
- why = (status & 0x80) ? CLD_DUMPED : CLD_KILLED;
- status &= 0x7f;
- }
- retval = put_user((short)why, &infop->si_code);
- if (!retval)
- retval = put_user(status, &infop->si_status);
- }
- if (!retval && infop)
- retval = put_user(pid, &infop->si_pid);
- if (!retval && infop)
- retval = put_user(uid, &infop->si_uid);
- if (!retval)
- retval = pid;
+ wo->wo_stat = status;
if (state == EXIT_TRACE) {
write_lock_irq(&tasklist_lock);
@@ -1216,7 +1161,21 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
if (state == EXIT_DEAD)
release_task(p);
- return retval;
+out_info:
+ infop = wo->wo_info;
+ if (infop) {
+ if ((status & 0x7f) == 0) {
+ infop->cause = CLD_EXITED;
+ infop->status = status >> 8;
+ } else {
+ infop->cause = (status & 0x80) ? CLD_DUMPED : CLD_KILLED;
+ infop->status = status & 0x7f;
+ }
+ infop->pid = pid;
+ infop->uid = uid;
+ }
+
+ return pid;
}
static int *task_stopped_code(struct task_struct *p, bool ptrace)
@@ -1252,8 +1211,8 @@ static int *task_stopped_code(struct task_struct *p, bool ptrace)
static int wait_task_stopped(struct wait_opts *wo,
int ptrace, struct task_struct *p)
{
- struct siginfo __user *infop;
- int retval, exit_code, *p_code, why;
+ struct waitid_info *infop;
+ int exit_code, *p_code, why;
uid_t uid = 0; /* unneeded, required by compiler */
pid_t pid;
@@ -1298,34 +1257,21 @@ unlock_sig:
why = ptrace ? CLD_TRAPPED : CLD_STOPPED;
read_unlock(&tasklist_lock);
sched_annotate_sleep();
+ if (wo->wo_rusage)
+ getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
+ put_task_struct(p);
- if (unlikely(wo->wo_flags & WNOWAIT))
- return wait_noreap_copyout(wo, p, pid, uid, why, exit_code);
-
- retval = wo->wo_rusage
- ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
- if (!retval && wo->wo_stat)
- retval = put_user((exit_code << 8) | 0x7f, wo->wo_stat);
+ if (likely(!(wo->wo_flags & WNOWAIT)))
+ wo->wo_stat = (exit_code << 8) | 0x7f;
infop = wo->wo_info;
- if (!retval && infop)
- retval = put_user(SIGCHLD, &infop->si_signo);
- if (!retval && infop)
- retval = put_user(0, &infop->si_errno);
- if (!retval && infop)
- retval = put_user((short)why, &infop->si_code);
- if (!retval && infop)
- retval = put_user(exit_code, &infop->si_status);
- if (!retval && infop)
- retval = put_user(pid, &infop->si_pid);
- if (!retval && infop)
- retval = put_user(uid, &infop->si_uid);
- if (!retval)
- retval = pid;
- put_task_struct(p);
-
- BUG_ON(!retval);
- return retval;
+ if (infop) {
+ infop->cause = why;
+ infop->status = exit_code;
+ infop->pid = pid;
+ infop->uid = uid;
+ }
+ return pid;
}
/*
@@ -1336,7 +1282,7 @@ unlock_sig:
*/
static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
{
- int retval;
+ struct waitid_info *infop;
pid_t pid;
uid_t uid;
@@ -1361,22 +1307,20 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
get_task_struct(p);
read_unlock(&tasklist_lock);
sched_annotate_sleep();
+ if (wo->wo_rusage)
+ getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
+ put_task_struct(p);
- if (!wo->wo_info) {
- retval = wo->wo_rusage
- ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
- put_task_struct(p);
- if (!retval && wo->wo_stat)
- retval = put_user(0xffff, wo->wo_stat);
- if (!retval)
- retval = pid;
+ infop = wo->wo_info;
+ if (!infop) {
+ wo->wo_stat = 0xffff;
} else {
- retval = wait_noreap_copyout(wo, p, pid, uid,
- CLD_CONTINUED, SIGCONT);
- BUG_ON(retval == 0);
+ infop->cause = CLD_CONTINUED;
+ infop->pid = pid;
+ infop->uid = uid;
+ infop->status = SIGCONT;
}
-
- return retval;
+ return pid;
}
/*
@@ -1604,8 +1548,8 @@ end:
return retval;
}
-SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
- infop, int, options, struct rusage __user *, ru)
+static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop,
+ int options, struct rusage *ru)
{
struct wait_opts wo;
struct pid *pid = NULL;
@@ -1643,38 +1587,48 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
wo.wo_pid = pid;
wo.wo_flags = options;
wo.wo_info = infop;
- wo.wo_stat = NULL;
wo.wo_rusage = ru;
ret = do_wait(&wo);
- if (ret > 0) {
- ret = 0;
- } else if (infop) {
- /*
- * For a WNOHANG return, clear out all the fields
- * we would set so the user can easily tell the
- * difference.
- */
- if (!ret)
- ret = put_user(0, &infop->si_signo);
- if (!ret)
- ret = put_user(0, &infop->si_errno);
- if (!ret)
- ret = put_user(0, &infop->si_code);
- if (!ret)
- ret = put_user(0, &infop->si_pid);
- if (!ret)
- ret = put_user(0, &infop->si_uid);
- if (!ret)
- ret = put_user(0, &infop->si_status);
- }
-
put_pid(pid);
return ret;
}
-SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
- int, options, struct rusage __user *, ru)
+SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
+ infop, int, options, struct rusage __user *, ru)
+{
+ struct rusage r;
+ struct waitid_info info = {.status = 0};
+ long err = kernel_waitid(which, upid, &info, options, ru ? &r : NULL);
+ int signo = 0;
+ if (err > 0) {
+ signo = SIGCHLD;
+ err = 0;
+ }
+
+ if (!err) {
+ if (ru && copy_to_user(ru, &r, sizeof(struct rusage)))
+ return -EFAULT;
+ }
+ if (!infop)
+ return err;
+
+ user_access_begin();
+ unsafe_put_user(signo, &infop->si_signo, Efault);
+ unsafe_put_user(0, &infop->si_errno, Efault);
+ unsafe_put_user((short)info.cause, &infop->si_code, Efault);
+ unsafe_put_user(info.pid, &infop->si_pid, Efault);
+ unsafe_put_user(info.uid, &infop->si_uid, Efault);
+ unsafe_put_user(info.status, &infop->si_status, Efault);
+ user_access_end();
+ return err;
+Efault:
+ user_access_end();
+ return -EFAULT;
+}
+
+long kernel_wait4(pid_t upid, int __user *stat_addr, int options,
+ struct rusage *ru)
{
struct wait_opts wo;
struct pid *pid = NULL;
@@ -1702,14 +1656,29 @@ SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
wo.wo_pid = pid;
wo.wo_flags = options | WEXITED;
wo.wo_info = NULL;
- wo.wo_stat = stat_addr;
+ wo.wo_stat = 0;
wo.wo_rusage = ru;
ret = do_wait(&wo);
put_pid(pid);
+ if (ret > 0 && stat_addr && put_user(wo.wo_stat, stat_addr))
+ ret = -EFAULT;
return ret;
}
+SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
+ int, options, struct rusage __user *, ru)
+{
+ struct rusage r;
+ long err = kernel_wait4(upid, stat_addr, options, ru ? &r : NULL);
+
+ if (err > 0) {
+ if (ru && copy_to_user(ru, &r, sizeof(struct rusage)))
+ return -EFAULT;
+ }
+ return err;
+}
+
#ifdef __ARCH_WANT_SYS_WAITPID
/*
@@ -1722,3 +1691,61 @@ SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options)
}
#endif
+
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE4(wait4,
+ compat_pid_t, pid,
+ compat_uint_t __user *, stat_addr,
+ int, options,
+ struct compat_rusage __user *, ru)
+{
+ struct rusage r;
+ long err = kernel_wait4(pid, stat_addr, options, ru ? &r : NULL);
+ if (err > 0) {
+ if (ru && put_compat_rusage(&r, ru))
+ return -EFAULT;
+ }
+ return err;
+}
+
+COMPAT_SYSCALL_DEFINE5(waitid,
+ int, which, compat_pid_t, pid,
+ struct compat_siginfo __user *, infop, int, options,
+ struct compat_rusage __user *, uru)
+{
+ struct rusage ru;
+ struct waitid_info info = {.status = 0};
+ long err = kernel_waitid(which, pid, &info, options, uru ? &ru : NULL);
+ int signo = 0;
+ if (err > 0) {
+ signo = SIGCHLD;
+ err = 0;
+ }
+
+ if (!err && uru) {
+ /* kernel_waitid() overwrites everything in ru */
+ if (COMPAT_USE_64BIT_TIME)
+ err = copy_to_user(uru, &ru, sizeof(ru));
+ else
+ err = put_compat_rusage(&ru, uru);
+ if (err)
+ return -EFAULT;
+ }
+
+ if (!infop)
+ return err;
+
+ user_access_begin();
+ unsafe_put_user(signo, &infop->si_signo, Efault);
+ unsafe_put_user(0, &infop->si_errno, Efault);
+ unsafe_put_user((short)info.cause, &infop->si_code, Efault);
+ unsafe_put_user(info.pid, &infop->si_pid, Efault);
+ unsafe_put_user(info.uid, &infop->si_uid, Efault);
+ unsafe_put_user(info.status, &infop->si_status, Efault);
+ user_access_end();
+ return err;
+Efault:
+ user_access_end();
+ return -EFAULT;
+}
+#endif
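[Editorial note: the fields written with unsafe_put_user() above correspond one-to-one to what userspace reads back from waitid(); a small self-contained example in ordinary userspace C, independent of the patch.]

#include <stdio.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	siginfo_t info = { 0 };

	if (fork() == 0)
		_exit(7);

	if (waitid(P_ALL, 0, &info, WEXITED) == 0)
		printf("signo=%d pid=%d uid=%d code=%d status=%d\n",
		       info.si_signo, (int)info.si_pid, (int)info.si_uid,
		       info.si_code, info.si_status);
	return 0;
}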
diff --git a/kernel/extable.c b/kernel/extable.c
index 0fbdd8582f08..223df4a328a4 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -69,7 +69,7 @@ static inline int init_kernel_text(unsigned long addr)
return 0;
}
-int core_kernel_text(unsigned long addr)
+int notrace core_kernel_text(unsigned long addr)
{
if (addr >= (unsigned long)_stext &&
addr < (unsigned long)_etext)
diff --git a/kernel/fork.c b/kernel/fork.c
index e53770d2bf95..0f69a3e5281e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -326,8 +326,8 @@ static void account_kernel_stack(struct task_struct *tsk, int account)
}
/* All stack pages belong to the same memcg. */
- memcg_kmem_update_page_stat(vm->pages[0], MEMCG_KERNEL_STACK_KB,
- account * (THREAD_SIZE / 1024));
+ mod_memcg_page_state(vm->pages[0], MEMCG_KERNEL_STACK_KB,
+ account * (THREAD_SIZE / 1024));
} else {
/*
* All stack pages are in the same zone and belong to the
@@ -338,8 +338,8 @@ static void account_kernel_stack(struct task_struct *tsk, int account)
mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB,
THREAD_SIZE / 1024 * account);
- memcg_kmem_update_page_stat(first_page, MEMCG_KERNEL_STACK_KB,
- account * (THREAD_SIZE / 1024));
+ mod_memcg_page_state(first_page, MEMCG_KERNEL_STACK_KB,
+ account * (THREAD_SIZE / 1024));
}
}
@@ -1637,9 +1637,9 @@ static __latent_entropy struct task_struct *copy_process(
prev_cputime_init(&p->prev_cputime);
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
- seqcount_init(&p->vtime_seqcount);
- p->vtime_snap = 0;
- p->vtime_snap_whence = VTIME_INACTIVE;
+ seqcount_init(&p->vtime.seqcount);
+ p->vtime.starttime = 0;
+ p->vtime.state = VTIME_INACTIVE;
#endif
#if defined(SPLIT_RSS_COUNTING)
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index d2747f9c5707..d69bd77252a7 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -110,6 +110,13 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
struct cpumask *masks;
cpumask_var_t nmsk, *node_to_present_cpumask;
+ /*
+ * If there aren't any vectors left after applying the pre/post
+ * vectors, don't bother with assigning affinity.
+ */
+ if (!affv)
+ return NULL;
+
if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL))
return NULL;
@@ -192,15 +199,19 @@ out:
/**
* irq_calc_affinity_vectors - Calculate the optimal number of vectors
+ * @minvec: The minimum number of vectors available
* @maxvec: The maximum number of vectors available
* @affd: Description of the affinity requirements
*/
-int irq_calc_affinity_vectors(int maxvec, const struct irq_affinity *affd)
+int irq_calc_affinity_vectors(int minvec, int maxvec, const struct irq_affinity *affd)
{
int resv = affd->pre_vectors + affd->post_vectors;
int vecs = maxvec - resv;
int ret;
+ if (resv > minvec)
+ return 0;
+
get_online_cpus();
ret = min_t(int, cpumask_weight(cpu_present_mask), vecs) + resv;
put_online_cpus();
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index ad43468e89f0..d171bc57e1e0 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -234,7 +234,7 @@ __irq_startup_managed(struct irq_desc *desc, struct cpumask *aff, bool force)
return IRQ_STARTUP_MANAGED;
}
#else
-static int
+static __always_inline int
__irq_startup_managed(struct irq_desc *desc, struct cpumask *aff, bool force)
{
return IRQ_STARTUP_NORMAL;
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 9da14d125df4..dbfba9933ed2 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -437,7 +437,9 @@ static inline void irq_remove_debugfs_entry(struct irq_desc *desc)
# ifdef CONFIG_IRQ_DOMAIN
void irq_domain_debugfs_init(struct dentry *root);
# else
-static inline void irq_domain_debugfs_init(struct dentry *root);
+static inline void irq_domain_debugfs_init(struct dentry *root)
+{
+}
# endif
#else /* CONFIG_GENERIC_IRQ_DEBUGFS */
static inline void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *d)
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 8bbd06405e60..73be2b3909bd 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -373,6 +373,7 @@ static struct irq_desc *alloc_desc(int irq, int node, unsigned int flags,
raw_spin_lock_init(&desc->lock);
lockdep_set_class(&desc->lock, &irq_desc_lock_class);
+ mutex_init(&desc->request_mutex);
init_rcu_head(&desc->rcu);
desc_set_defaults(irq, desc, node, affinity, owner);
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 14fe862aa2e3..f1f251479aa6 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -1,5 +1,6 @@
#define pr_fmt(fmt) "irq: " fmt
+#include <linux/acpi.h>
#include <linux/debugfs.h>
#include <linux/hardirq.h>
#include <linux/interrupt.h>
@@ -155,6 +156,21 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size,
domain->name = fwid->name;
break;
}
+#ifdef CONFIG_ACPI
+ } else if (is_acpi_device_node(fwnode)) {
+ struct acpi_buffer buf = {
+ .length = ACPI_ALLOCATE_BUFFER,
+ };
+ acpi_handle handle;
+
+ handle = acpi_device_handle(to_acpi_device_node(fwnode));
+ if (acpi_get_name(handle, ACPI_FULL_PATHNAME, &buf) == AE_OK) {
+ domain->name = buf.pointer;
+ domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED;
+ }
+
+ domain->fwnode = fwnode;
+#endif
} else if (of_node) {
char *name;
@@ -1667,8 +1683,7 @@ static void debugfs_add_domain_dir(struct irq_domain *d)
static void debugfs_remove_domain_dir(struct irq_domain *d)
{
- if (d->debugfs_file)
- debugfs_remove(d->debugfs_file);
+ debugfs_remove(d->debugfs_file);
}
void __init irq_domain_debugfs_init(struct dentry *root)
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 5c11c1730ba5..5624b2dd6b58 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1167,6 +1167,18 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
if (desc->irq_data.chip->flags & IRQCHIP_ONESHOT_SAFE)
new->flags &= ~IRQF_ONESHOT;
+ mutex_lock(&desc->request_mutex);
+ if (!desc->action) {
+ ret = irq_request_resources(desc);
+ if (ret) {
+ pr_err("Failed to request resources for %s (irq %d) on irqchip %s\n",
+ new->name, irq, desc->irq_data.chip->name);
+ goto out_mutex;
+ }
+ }
+
+ chip_bus_lock(desc);
+
/*
* The following block of code has to be executed atomically
*/
@@ -1267,13 +1279,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
}
if (!shared) {
- ret = irq_request_resources(desc);
- if (ret) {
- pr_err("Failed to request resources for %s (irq %d) on irqchip %s\n",
- new->name, irq, desc->irq_data.chip->name);
- goto out_unlock;
- }
-
init_waitqueue_head(&desc->wait_for_threads);
/* Setup the type (level, edge polarity) if configured: */
@@ -1347,6 +1352,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
}
raw_spin_unlock_irqrestore(&desc->lock, flags);
+ chip_bus_sync_unlock(desc);
+ mutex_unlock(&desc->request_mutex);
irq_setup_timings(desc, new);
@@ -1378,6 +1385,14 @@ mismatch:
out_unlock:
raw_spin_unlock_irqrestore(&desc->lock, flags);
+ chip_bus_sync_unlock(desc);
+
+ if (!desc->action)
+ irq_release_resources(desc);
+
+out_mutex:
+ mutex_unlock(&desc->request_mutex);
+
out_thread:
if (new->thread) {
struct task_struct *t = new->thread;
@@ -1417,9 +1432,7 @@ int setup_irq(unsigned int irq, struct irqaction *act)
if (retval < 0)
return retval;
- chip_bus_lock(desc);
retval = __setup_irq(irq, desc, act);
- chip_bus_sync_unlock(desc);
if (retval)
irq_chip_pm_put(&desc->irq_data);
@@ -1443,6 +1456,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
if (!desc)
return NULL;
+ mutex_lock(&desc->request_mutex);
chip_bus_lock(desc);
raw_spin_lock_irqsave(&desc->lock, flags);
@@ -1475,8 +1489,6 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
if (!desc->action) {
irq_settings_clr_disable_unlazy(desc);
irq_shutdown(desc);
- irq_release_resources(desc);
- irq_remove_timings(desc);
}
#ifdef CONFIG_SMP
@@ -1518,6 +1530,13 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
}
}
+ if (!desc->action) {
+ irq_release_resources(desc);
+ irq_remove_timings(desc);
+ }
+
+ mutex_unlock(&desc->request_mutex);
+
irq_chip_pm_put(&desc->irq_data);
module_put(desc->owner);
kfree(action->secondary);
@@ -1674,9 +1693,7 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
return retval;
}
- chip_bus_lock(desc);
retval = __setup_irq(irq, desc, action);
- chip_bus_sync_unlock(desc);
if (retval) {
irq_chip_pm_put(&desc->irq_data);
@@ -1924,9 +1941,7 @@ int setup_percpu_irq(unsigned int irq, struct irqaction *act)
if (retval < 0)
return retval;
- chip_bus_lock(desc);
retval = __setup_irq(irq, desc, act);
- chip_bus_sync_unlock(desc);
if (retval)
irq_chip_pm_put(&desc->irq_data);
@@ -1935,9 +1950,10 @@ int setup_percpu_irq(unsigned int irq, struct irqaction *act)
}
/**
- * request_percpu_irq - allocate a percpu interrupt line
+ * __request_percpu_irq - allocate a percpu interrupt line
* @irq: Interrupt line to allocate
* @handler: Function to be called when the IRQ occurs.
+ * @flags: Interrupt type flags (IRQF_TIMER only)
* @devname: An ascii name for the claiming device
* @dev_id: A percpu cookie passed back to the handler function
*
@@ -1950,8 +1966,9 @@ int setup_percpu_irq(unsigned int irq, struct irqaction *act)
* the handler gets called with the interrupted CPU's instance of
* that variable.
*/
-int request_percpu_irq(unsigned int irq, irq_handler_t handler,
- const char *devname, void __percpu *dev_id)
+int __request_percpu_irq(unsigned int irq, irq_handler_t handler,
+ unsigned long flags, const char *devname,
+ void __percpu *dev_id)
{
struct irqaction *action;
struct irq_desc *desc;
@@ -1965,12 +1982,15 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler,
!irq_settings_is_per_cpu_devid(desc))
return -EINVAL;
+ if (flags && flags != IRQF_TIMER)
+ return -EINVAL;
+
action = kzalloc(sizeof(struct irqaction), GFP_KERNEL);
if (!action)
return -ENOMEM;
action->handler = handler;
- action->flags = IRQF_PERCPU | IRQF_NO_SUSPEND;
+ action->flags = flags | IRQF_PERCPU | IRQF_NO_SUSPEND;
action->name = devname;
action->percpu_dev_id = dev_id;
@@ -1980,9 +2000,7 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler,
return retval;
}
- chip_bus_lock(desc);
retval = __setup_irq(irq, desc, action);
- chip_bus_sync_unlock(desc);
if (retval) {
irq_chip_pm_put(&desc->irq_data);
@@ -1991,7 +2009,7 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler,
return retval;
}
-EXPORT_SYMBOL_GPL(request_percpu_irq);
+EXPORT_SYMBOL_GPL(__request_percpu_irq);
/**
* irq_get_irqchip_state - returns the irqchip state of a interrupt.
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index b118735fea9d..766e7e4d3ad9 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -162,16 +162,10 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
}
if (cmdline_len) {
- image->cmdline_buf = kzalloc(cmdline_len, GFP_KERNEL);
- if (!image->cmdline_buf) {
- ret = -ENOMEM;
- goto out;
- }
-
- ret = copy_from_user(image->cmdline_buf, cmdline_ptr,
- cmdline_len);
- if (ret) {
- ret = -EFAULT;
+ image->cmdline_buf = memdup_user(cmdline_ptr, cmdline_len);
+ if (IS_ERR(image->cmdline_buf)) {
+ ret = PTR_ERR(image->cmdline_buf);
+ image->cmdline_buf = NULL;
goto out;
}
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 6756d750b31b..a1606a4224e1 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1771,24 +1771,13 @@ unsigned long __weak arch_deref_entry_point(void *entry)
int register_jprobes(struct jprobe **jps, int num)
{
- struct jprobe *jp;
int ret = 0, i;
if (num <= 0)
return -EINVAL;
+
for (i = 0; i < num; i++) {
- unsigned long addr, offset;
- jp = jps[i];
- addr = arch_deref_entry_point(jp->entry);
-
- /* Verify probepoint is a function entry point */
- if (kallsyms_lookup_size_offset(addr, NULL, &offset) &&
- offset == 0) {
- jp->kp.pre_handler = setjmp_pre_handler;
- jp->kp.break_handler = longjmp_break_handler;
- ret = register_kprobe(&jp->kp);
- } else
- ret = -EINVAL;
+ ret = register_jprobe(jps[i]);
if (ret < 0) {
if (i > 0)
@@ -1796,13 +1785,30 @@ int register_jprobes(struct jprobe **jps, int num)
break;
}
}
+
return ret;
}
EXPORT_SYMBOL_GPL(register_jprobes);
int register_jprobe(struct jprobe *jp)
{
- return register_jprobes(&jp, 1);
+ unsigned long addr, offset;
+ struct kprobe *kp = &jp->kp;
+
+ /*
+ * Verify that both the probepoint and the jprobe handler are
+ * valid function entry points.
+ */
+ addr = arch_deref_entry_point(jp->entry);
+
+ if (kallsyms_lookup_size_offset(addr, NULL, &offset) && offset == 0 &&
+ kprobe_on_func_entry(kp->addr, kp->symbol_name, kp->offset)) {
+ kp->pre_handler = setjmp_pre_handler;
+ kp->break_handler = longjmp_break_handler;
+ return register_kprobe(kp);
+ }
+
+ return -EINVAL;
}
EXPORT_SYMBOL_GPL(register_jprobe);
@@ -1888,12 +1894,12 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
}
NOKPROBE_SYMBOL(pre_handler_kretprobe);
-bool __weak arch_function_offset_within_entry(unsigned long offset)
+bool __weak arch_kprobe_on_func_entry(unsigned long offset)
{
return !offset;
}
-bool function_offset_within_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset)
+bool kprobe_on_func_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset)
{
kprobe_opcode_t *kp_addr = _kprobe_addr(addr, sym, offset);
@@ -1901,7 +1907,7 @@ bool function_offset_within_entry(kprobe_opcode_t *addr, const char *sym, unsign
return false;
if (!kallsyms_lookup_size_offset((unsigned long)kp_addr, NULL, &offset) ||
- !arch_function_offset_within_entry(offset))
+ !arch_kprobe_on_func_entry(offset))
return false;
return true;
@@ -1914,7 +1920,7 @@ int register_kretprobe(struct kretprobe *rp)
int i;
void *addr;
- if (!function_offset_within_entry(rp->kp.addr, rp->kp.symbol_name, rp->kp.offset))
+ if (!kprobe_on_func_entry(rp->kp.addr, rp->kp.symbol_name, rp->kp.offset))
return -EINVAL;
if (kretprobe_blacklist_size) {
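register_kretprobe() now refuses probe points that are not function entries, enforced through kprobe_on_func_entry(). A minimal module sketch of a registration that satisfies the check; the target symbol is a made-up example and kernel build context is assumed:

#include <linux/kprobes.h>
#include <linux/module.h>

static int my_ret_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
{
        /* runs when the probed function returns */
        return 0;
}

static struct kretprobe my_krp = {
        .kp.symbol_name = "do_fork",    /* hypothetical target, offset 0 */
        .handler        = my_ret_handler,
};

static int __init krp_example_init(void)
{
        /* would now fail with -EINVAL if .kp.offset pointed into the body */
        return register_kretprobe(&my_krp);
}

static void __exit krp_example_exit(void)
{
        unregister_kretprobe(&my_krp);
}

module_init(krp_example_init);
module_exit(krp_example_exit);
MODULE_LICENSE("GPL");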
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
index cc3ed0ccdfa2..2655f26ec882 100644
--- a/kernel/locking/qrwlock.c
+++ b/kernel/locking/qrwlock.c
@@ -20,6 +20,7 @@
#include <linux/cpumask.h>
#include <linux/percpu.h>
#include <linux/hardirq.h>
+#include <linux/spinlock.h>
#include <asm/qrwlock.h>
/*
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index b2caec7315af..fd24153e8a48 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -28,6 +28,7 @@
#include <linux/percpu.h>
#include <linux/hardirq.h>
#include <linux/mutex.h>
+#include <linux/prefetch.h>
#include <asm/byteorder.h>
#include <asm/qspinlock.h>
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index e6b2f7ad3e51..4ccfcaae5b89 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -193,7 +193,8 @@ void __init __pv_init_lock_hash(void)
*/
pv_lock_hash = alloc_large_system_hash("PV qspinlock",
sizeof(struct pv_hash_entry),
- pv_hash_size, 0, HASH_EARLY,
+ pv_hash_size, 0,
+ HASH_EARLY | HASH_ZERO,
&pv_lock_hash_bits, NULL,
pv_hash_size, pv_hash_size);
}
diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c
index c65f7989f850..20819df98125 100644
--- a/kernel/locking/rwsem-spinlock.c
+++ b/kernel/locking/rwsem-spinlock.c
@@ -231,8 +231,8 @@ int __sched __down_write_common(struct rw_semaphore *sem, int state)
out_nolock:
list_del(&waiter.list);
- if (!list_empty(&sem->wait_list))
- __rwsem_do_wake(sem, 1);
+ if (!list_empty(&sem->wait_list) && sem->count >= 0)
+ __rwsem_do_wake(sem, 0);
raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
return -EINTR;
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 23a6483c3666..124bed776532 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -358,7 +358,11 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
goto err_pfn_remap;
mem_hotplug_begin();
- error = arch_add_memory(nid, align_start, align_size, true);
+ error = arch_add_memory(nid, align_start, align_size, false);
+ if (!error)
+ move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
+ align_start >> PAGE_SHIFT,
+ align_size >> PAGE_SHIFT);
mem_hotplug_done();
if (error)
goto err_add_memory;
diff --git a/kernel/module.c b/kernel/module.c
index d7eb41d772c4..b3dbdde82e80 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -49,9 +49,7 @@
#include <linux/rculist.h>
#include <linux/uaccess.h>
#include <asm/cacheflush.h>
-#ifdef CONFIG_STRICT_MODULE_RWX
-#include <asm/set_memory.h>
-#endif
+#include <linux/set_memory.h>
#include <asm/mmu_context.h>
#include <linux/license.h>
#include <asm/sections.h>
@@ -3074,9 +3072,9 @@ static int find_module_sections(struct module *mod, struct load_info *info)
mod->trace_events = section_objs(info, "_ftrace_events",
sizeof(*mod->trace_events),
&mod->num_trace_events);
- mod->trace_enums = section_objs(info, "_ftrace_enum_map",
- sizeof(*mod->trace_enums),
- &mod->num_trace_enums);
+ mod->trace_evals = section_objs(info, "_ftrace_eval_map",
+ sizeof(*mod->trace_evals),
+ &mod->num_trace_evals);
#endif
#ifdef CONFIG_TRACING
mod->trace_bprintk_fmt_start = section_objs(info, "__trace_printk_fmt",
diff --git a/kernel/pid.c b/kernel/pid.c
index fd1cde1e4576..731c4e528f4e 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -575,16 +575,13 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
*/
void __init pidhash_init(void)
{
- unsigned int i, pidhash_size;
+ unsigned int pidhash_size;
pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18,
- HASH_EARLY | HASH_SMALL,
+ HASH_EARLY | HASH_SMALL | HASH_ZERO,
&pidhash_shift, NULL,
0, 4096);
pidhash_size = 1U << pidhash_shift;
-
- for (i = 0; i < pidhash_size; i++)
- INIT_HLIST_HEAD(&pid_hash[i]);
}
void __init pidmap_init(void)
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index b7708e319941..222317721c5a 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -30,15 +30,13 @@
#include <linux/slab.h>
#include <linux/compiler.h>
#include <linux/ktime.h>
+#include <linux/set_memory.h>
#include <linux/uaccess.h>
#include <asm/mmu_context.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/io.h>
-#ifdef CONFIG_ARCH_HAS_SET_MEMORY
-#include <asm/set_memory.h>
-#endif
#include "power.h"
diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
index 1db044f808b7..2a7d04049af4 100644
--- a/kernel/printk/internal.h
+++ b/kernel/printk/internal.h
@@ -18,12 +18,14 @@
#ifdef CONFIG_PRINTK
-#define PRINTK_SAFE_CONTEXT_MASK 0x7fffffff
-#define PRINTK_NMI_CONTEXT_MASK 0x80000000
+#define PRINTK_SAFE_CONTEXT_MASK 0x3fffffff
+#define PRINTK_NMI_DEFERRED_CONTEXT_MASK 0x40000000
+#define PRINTK_NMI_CONTEXT_MASK 0x80000000
extern raw_spinlock_t logbuf_lock;
__printf(1, 0) int vprintk_default(const char *fmt, va_list args);
+__printf(1, 0) int vprintk_deferred(const char *fmt, va_list args);
__printf(1, 0) int vprintk_func(const char *fmt, va_list args);
void __printk_safe_enter(void);
void __printk_safe_exit(void);
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index bd53ea579dc8..fc47863f629c 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -2720,16 +2720,13 @@ void wake_up_klogd(void)
preempt_enable();
}
-int printk_deferred(const char *fmt, ...)
+int vprintk_deferred(const char *fmt, va_list args)
{
- va_list args;
int r;
- preempt_disable();
- va_start(args, fmt);
r = vprintk_emit(0, LOGLEVEL_SCHED, NULL, 0, fmt, args);
- va_end(args);
+ preempt_disable();
__this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT);
irq_work_queue(this_cpu_ptr(&wake_up_klogd_work));
preempt_enable();
@@ -2737,6 +2734,18 @@ int printk_deferred(const char *fmt, ...)
return r;
}
+int printk_deferred(const char *fmt, ...)
+{
+ va_list args;
+ int r;
+
+ va_start(args, fmt);
+ r = vprintk_deferred(fmt, args);
+ va_end(args);
+
+ return r;
+}
+
/*
* printk rate limiting, lifted from the networking subsystem.
*
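The printk_deferred()/vprintk_deferred() split is the standard C idiom of a va_list worker plus a thin variadic wrapper that owns va_start()/va_end(). A self-contained userspace sketch of the same shape, with made-up names:

#include <stdarg.h>
#include <stdio.h>

/* worker takes a va_list, like vprintk_deferred() */
static int log_v(const char *fmt, va_list args)
{
        return vfprintf(stderr, fmt, args);
}

/* variadic wrapper owns va_start()/va_end(), like printk_deferred() */
static int log_f(const char *fmt, ...)
{
        va_list args;
        int r;

        va_start(args, fmt);
        r = log_v(fmt, args);
        va_end(args);

        return r;
}

int main(void)
{
        return log_f("deferred value=%d\n", 42) < 0;
}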
diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c
index 033e50a7d706..3cdaeaef9ce1 100644
--- a/kernel/printk/printk_safe.c
+++ b/kernel/printk/printk_safe.c
@@ -80,8 +80,8 @@ static void queue_flush_work(struct printk_safe_seq_buf *s)
* happen, printk_safe_log_store() will notice the buffer->len mismatch
* and repeat the write.
*/
-static int printk_safe_log_store(struct printk_safe_seq_buf *s,
- const char *fmt, va_list args)
+static __printf(2, 0) int printk_safe_log_store(struct printk_safe_seq_buf *s,
+ const char *fmt, va_list args)
{
int add;
size_t len;
@@ -299,7 +299,7 @@ void printk_safe_flush_on_panic(void)
* one writer running. But the buffer might get flushed from another
* CPU, so we need to be careful.
*/
-static int vprintk_nmi(const char *fmt, va_list args)
+static __printf(1, 0) int vprintk_nmi(const char *fmt, va_list args)
{
struct printk_safe_seq_buf *s = this_cpu_ptr(&nmi_print_seq);
@@ -308,17 +308,29 @@ static int vprintk_nmi(const char *fmt, va_list args)
void printk_nmi_enter(void)
{
- this_cpu_or(printk_context, PRINTK_NMI_CONTEXT_MASK);
+ /*
+ * The size of the extra per-CPU buffer is limited. Use it only when
+ * the main one is locked. If this CPU is not in the safe context,
+ * the lock must be taken on another CPU and we could wait for it.
+ */
+ if ((this_cpu_read(printk_context) & PRINTK_SAFE_CONTEXT_MASK) &&
+ raw_spin_is_locked(&logbuf_lock)) {
+ this_cpu_or(printk_context, PRINTK_NMI_CONTEXT_MASK);
+ } else {
+ this_cpu_or(printk_context, PRINTK_NMI_DEFERRED_CONTEXT_MASK);
+ }
}
void printk_nmi_exit(void)
{
- this_cpu_and(printk_context, ~PRINTK_NMI_CONTEXT_MASK);
+ this_cpu_and(printk_context,
+ ~(PRINTK_NMI_CONTEXT_MASK |
+ PRINTK_NMI_DEFERRED_CONTEXT_MASK));
}
#else
-static int vprintk_nmi(const char *fmt, va_list args)
+static __printf(1, 0) int vprintk_nmi(const char *fmt, va_list args)
{
return 0;
}
@@ -330,7 +342,7 @@ static int vprintk_nmi(const char *fmt, va_list args)
* into itself. It uses a per-CPU buffer to store the message, just like
* NMI.
*/
-static int vprintk_safe(const char *fmt, va_list args)
+static __printf(1, 0) int vprintk_safe(const char *fmt, va_list args)
{
struct printk_safe_seq_buf *s = this_cpu_ptr(&safe_print_seq);
@@ -351,12 +363,22 @@ void __printk_safe_exit(void)
__printf(1, 0) int vprintk_func(const char *fmt, va_list args)
{
+ /* Use extra buffer in NMI when logbuf_lock is taken or in safe mode. */
if (this_cpu_read(printk_context) & PRINTK_NMI_CONTEXT_MASK)
return vprintk_nmi(fmt, args);
+ /* Use extra buffer to prevent a recursion deadlock in safe mode. */
if (this_cpu_read(printk_context) & PRINTK_SAFE_CONTEXT_MASK)
return vprintk_safe(fmt, args);
+ /*
+ * Use the main logbuf when logbuf_lock is available in NMI.
+ * But avoid calling console drivers that might have their own locks.
+ */
+ if (this_cpu_read(printk_context) & PRINTK_NMI_DEFERRED_CONTEXT_MASK)
+ return vprintk_deferred(fmt, args);
+
+ /* No obstacles. */
return vprintk_default(fmt, args);
}
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 67c70e287647..6e3ea4ac1bda 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -611,17 +611,23 @@ static void cputime_adjust(struct task_cputime *curr,
utime = curr->utime;
/*
- * If either stime or both stime and utime are 0, assume all runtime is
- * userspace. Once a task gets some ticks, the monotonicy code at
- * 'update' will ensure things converge to the observed ratio.
+ * If either stime or utime is 0, assume all runtime is userspace.
+ * Once a task gets some ticks, the monotonicity code at 'update:'
+ * will ensure things converge to the observed ratio.
*/
- if (stime != 0) {
- if (utime == 0)
- stime = rtime;
- else
- stime = scale_stime(stime, rtime, stime + utime);
+ if (stime == 0) {
+ utime = rtime;
+ goto update;
}
+ if (utime == 0) {
+ stime = rtime;
+ goto update;
+ }
+
+ stime = scale_stime(stime, rtime, stime + utime);
+
+update:
/*
* Make sure stime doesn't go backwards; this preserves monotonicity
* for utime because rtime is monotonic.
@@ -673,20 +679,21 @@ void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
#endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
-static u64 vtime_delta(struct task_struct *tsk)
+static u64 vtime_delta(struct vtime *vtime)
{
- unsigned long now = READ_ONCE(jiffies);
+ unsigned long long clock;
- if (time_before(now, (unsigned long)tsk->vtime_snap))
+ clock = sched_clock_cpu(smp_processor_id());
+ if (clock < vtime->starttime)
return 0;
- return jiffies_to_nsecs(now - tsk->vtime_snap);
+ return clock - vtime->starttime;
}
-static u64 get_vtime_delta(struct task_struct *tsk)
+static u64 get_vtime_delta(struct vtime *vtime)
{
- unsigned long now = READ_ONCE(jiffies);
- u64 delta, other;
+ u64 delta = vtime_delta(vtime);
+ u64 other;
/*
* Unlike tick based timing, vtime based timing never has lost
@@ -695,104 +702,138 @@ static u64 get_vtime_delta(struct task_struct *tsk)
* elapsed time. Limit account_other_time to prevent rounding
* errors from causing elapsed vtime to go negative.
*/
- delta = jiffies_to_nsecs(now - tsk->vtime_snap);
other = account_other_time(delta);
- WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE);
- tsk->vtime_snap = now;
+ WARN_ON_ONCE(vtime->state == VTIME_INACTIVE);
+ vtime->starttime += delta;
return delta - other;
}
-static void __vtime_account_system(struct task_struct *tsk)
+static void __vtime_account_system(struct task_struct *tsk,
+ struct vtime *vtime)
+{
+ vtime->stime += get_vtime_delta(vtime);
+ if (vtime->stime >= TICK_NSEC) {
+ account_system_time(tsk, irq_count(), vtime->stime);
+ vtime->stime = 0;
+ }
+}
+
+static void vtime_account_guest(struct task_struct *tsk,
+ struct vtime *vtime)
{
- account_system_time(tsk, irq_count(), get_vtime_delta(tsk));
+ vtime->gtime += get_vtime_delta(vtime);
+ if (vtime->gtime >= TICK_NSEC) {
+ account_guest_time(tsk, vtime->gtime);
+ vtime->gtime = 0;
+ }
}
void vtime_account_system(struct task_struct *tsk)
{
- if (!vtime_delta(tsk))
+ struct vtime *vtime = &tsk->vtime;
+
+ if (!vtime_delta(vtime))
return;
- write_seqcount_begin(&tsk->vtime_seqcount);
- __vtime_account_system(tsk);
- write_seqcount_end(&tsk->vtime_seqcount);
+ write_seqcount_begin(&vtime->seqcount);
+ /* We might have scheduled out from guest path */
+ if (current->flags & PF_VCPU)
+ vtime_account_guest(tsk, vtime);
+ else
+ __vtime_account_system(tsk, vtime);
+ write_seqcount_end(&vtime->seqcount);
}
-void vtime_account_user(struct task_struct *tsk)
+void vtime_user_enter(struct task_struct *tsk)
{
- write_seqcount_begin(&tsk->vtime_seqcount);
- tsk->vtime_snap_whence = VTIME_SYS;
- if (vtime_delta(tsk))
- account_user_time(tsk, get_vtime_delta(tsk));
- write_seqcount_end(&tsk->vtime_seqcount);
+ struct vtime *vtime = &tsk->vtime;
+
+ write_seqcount_begin(&vtime->seqcount);
+ __vtime_account_system(tsk, vtime);
+ vtime->state = VTIME_USER;
+ write_seqcount_end(&vtime->seqcount);
}
-void vtime_user_enter(struct task_struct *tsk)
+void vtime_user_exit(struct task_struct *tsk)
{
- write_seqcount_begin(&tsk->vtime_seqcount);
- if (vtime_delta(tsk))
- __vtime_account_system(tsk);
- tsk->vtime_snap_whence = VTIME_USER;
- write_seqcount_end(&tsk->vtime_seqcount);
+ struct vtime *vtime = &tsk->vtime;
+
+ write_seqcount_begin(&vtime->seqcount);
+ vtime->utime += get_vtime_delta(vtime);
+ if (vtime->utime >= TICK_NSEC) {
+ account_user_time(tsk, vtime->utime);
+ vtime->utime = 0;
+ }
+ vtime->state = VTIME_SYS;
+ write_seqcount_end(&vtime->seqcount);
}
void vtime_guest_enter(struct task_struct *tsk)
{
+ struct vtime *vtime = &tsk->vtime;
/*
* The flags must be updated under the lock with
- * the vtime_snap flush and update.
+ * the vtime_starttime flush and update.
* That enforces a right ordering and update sequence
* synchronization against the reader (task_gtime())
* that can thus safely catch up with a tickless delta.
*/
- write_seqcount_begin(&tsk->vtime_seqcount);
- if (vtime_delta(tsk))
- __vtime_account_system(tsk);
+ write_seqcount_begin(&vtime->seqcount);
+ __vtime_account_system(tsk, vtime);
current->flags |= PF_VCPU;
- write_seqcount_end(&tsk->vtime_seqcount);
+ write_seqcount_end(&vtime->seqcount);
}
EXPORT_SYMBOL_GPL(vtime_guest_enter);
void vtime_guest_exit(struct task_struct *tsk)
{
- write_seqcount_begin(&tsk->vtime_seqcount);
- __vtime_account_system(tsk);
+ struct vtime *vtime = &tsk->vtime;
+
+ write_seqcount_begin(&vtime->seqcount);
+ vtime_account_guest(tsk, vtime);
current->flags &= ~PF_VCPU;
- write_seqcount_end(&tsk->vtime_seqcount);
+ write_seqcount_end(&vtime->seqcount);
}
EXPORT_SYMBOL_GPL(vtime_guest_exit);
void vtime_account_idle(struct task_struct *tsk)
{
- account_idle_time(get_vtime_delta(tsk));
+ account_idle_time(get_vtime_delta(&tsk->vtime));
}
void arch_vtime_task_switch(struct task_struct *prev)
{
- write_seqcount_begin(&prev->vtime_seqcount);
- prev->vtime_snap_whence = VTIME_INACTIVE;
- write_seqcount_end(&prev->vtime_seqcount);
+ struct vtime *vtime = &prev->vtime;
+
+ write_seqcount_begin(&vtime->seqcount);
+ vtime->state = VTIME_INACTIVE;
+ write_seqcount_end(&vtime->seqcount);
+
+ vtime = &current->vtime;
- write_seqcount_begin(&current->vtime_seqcount);
- current->vtime_snap_whence = VTIME_SYS;
- current->vtime_snap = jiffies;
- write_seqcount_end(&current->vtime_seqcount);
+ write_seqcount_begin(&vtime->seqcount);
+ vtime->state = VTIME_SYS;
+ vtime->starttime = sched_clock_cpu(smp_processor_id());
+ write_seqcount_end(&vtime->seqcount);
}
void vtime_init_idle(struct task_struct *t, int cpu)
{
+ struct vtime *vtime = &t->vtime;
unsigned long flags;
local_irq_save(flags);
- write_seqcount_begin(&t->vtime_seqcount);
- t->vtime_snap_whence = VTIME_SYS;
- t->vtime_snap = jiffies;
- write_seqcount_end(&t->vtime_seqcount);
+ write_seqcount_begin(&vtime->seqcount);
+ vtime->state = VTIME_SYS;
+ vtime->starttime = sched_clock_cpu(cpu);
+ write_seqcount_end(&vtime->seqcount);
local_irq_restore(flags);
}
u64 task_gtime(struct task_struct *t)
{
+ struct vtime *vtime = &t->vtime;
unsigned int seq;
u64 gtime;
@@ -800,13 +841,13 @@ u64 task_gtime(struct task_struct *t)
return t->gtime;
do {
- seq = read_seqcount_begin(&t->vtime_seqcount);
+ seq = read_seqcount_begin(&vtime->seqcount);
gtime = t->gtime;
- if (t->vtime_snap_whence == VTIME_SYS && t->flags & PF_VCPU)
- gtime += vtime_delta(t);
+ if (vtime->state == VTIME_SYS && t->flags & PF_VCPU)
+ gtime += vtime->gtime + vtime_delta(vtime);
- } while (read_seqcount_retry(&t->vtime_seqcount, seq));
+ } while (read_seqcount_retry(&vtime->seqcount, seq));
return gtime;
}
@@ -818,8 +859,9 @@ u64 task_gtime(struct task_struct *t)
*/
void task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
{
- u64 delta;
+ struct vtime *vtime = &t->vtime;
unsigned int seq;
+ u64 delta;
if (!vtime_accounting_enabled()) {
*utime = t->utime;
@@ -828,25 +870,25 @@ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
}
do {
- seq = read_seqcount_begin(&t->vtime_seqcount);
+ seq = read_seqcount_begin(&vtime->seqcount);
*utime = t->utime;
*stime = t->stime;
/* Task is sleeping, nothing to add */
- if (t->vtime_snap_whence == VTIME_INACTIVE || is_idle_task(t))
+ if (vtime->state == VTIME_INACTIVE || is_idle_task(t))
continue;
- delta = vtime_delta(t);
+ delta = vtime_delta(vtime);
/*
* Task runs either in user or kernel space, add pending nohz time to
* the right place.
*/
- if (t->vtime_snap_whence == VTIME_USER || t->flags & PF_VCPU)
- *utime += delta;
- else if (t->vtime_snap_whence == VTIME_SYS)
- *stime += delta;
- } while (read_seqcount_retry(&t->vtime_seqcount, seq));
+ if (vtime->state == VTIME_USER || t->flags & PF_VCPU)
+ *utime += vtime->utime + delta;
+ else if (vtime->state == VTIME_SYS)
+ *stime += vtime->stime + delta;
+ } while (read_seqcount_retry(&vtime->seqcount, seq));
}
#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */
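The heart of the cputime rework is that nanosecond deltas are now parked in vtime->utime/stime/gtime and only handed to the accounting functions once at least a full tick's worth has built up. A stand-alone userspace illustration of that batching; the 1 ms tick is an assumption of the example, the real TICK_NSEC follows HZ:

#include <stdint.h>
#include <stdio.h>

#define TICK_NSEC 1000000ULL            /* pretend 1 ms tick, illustration only */

struct acct {
        uint64_t pending;               /* plays the role of vtime->stime */
        uint64_t accounted;             /* what account_system_time() would get */
};

/* mirrors the shape of __vtime_account_system() above */
static void account_delta(struct acct *a, uint64_t delta_ns)
{
        a->pending += delta_ns;
        if (a->pending >= TICK_NSEC) {
                a->accounted += a->pending;
                a->pending = 0;
        }
}

int main(void)
{
        struct acct a = { 0, 0 };
        const uint64_t deltas[] = { 300000, 400000, 500000, 200000 };
        unsigned int i;

        for (i = 0; i < sizeof(deltas) / sizeof(deltas[0]); i++)
                account_delta(&a, deltas[i]);

        /* 1.2 ms was flushed after the third delta; 0.2 ms is still pending */
        printf("accounted=%llu ns, pending=%llu ns\n",
               (unsigned long long)a.accounted, (unsigned long long)a.pending);
        return 0;
}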
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 008c514dc241..c95880e216f6 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6646,10 +6646,10 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
* our sched_group. We may want to revisit it if we couldn't
* meet load balance goals by pulling other tasks on src_cpu.
*
- * Also avoid computing new_dst_cpu if we have already computed
- * one in current iteration.
+ * Avoid computing new_dst_cpu for NEWLY_IDLE or if we have
+ * already computed one in current iteration.
*/
- if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED))
+ if (env->idle == CPU_NEWLY_IDLE || (env->flags & LBF_DST_PINNED))
return 0;
/* Prevent to re-select dst_cpu via env's cpus */
@@ -8022,14 +8022,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
.tasks = LIST_HEAD_INIT(env.tasks),
};
- /*
- * For NEWLY_IDLE load_balancing, we don't need to consider
- * other cpus in our group
- */
- if (idle == CPU_NEWLY_IDLE)
- env.dst_grpmask = NULL;
-
- cpumask_copy(cpus, cpu_active_mask);
+ cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask);
schedstat_inc(sd->lb_count[idle]);
@@ -8151,7 +8144,15 @@ more_balance:
/* All tasks on this runqueue were pinned by CPU affinity */
if (unlikely(env.flags & LBF_ALL_PINNED)) {
cpumask_clear_cpu(cpu_of(busiest), cpus);
- if (!cpumask_empty(cpus)) {
+ /*
+ * Attempting to continue load balancing at the current
+ * sched_domain level only makes sense if there are
+ * active CPUs remaining as possible busiest CPUs to
+ * pull load from, which are not contained within the
+ * destination group that is receiving any migrated
+ * load.
+ */
+ if (!cpumask_subset(cpus, env.dst_grpmask)) {
env.loop = 0;
env.loop_break = sched_nr_migrate_break;
goto redo;
@@ -8447,6 +8448,13 @@ static int active_load_balance_cpu_stop(void *data)
.src_cpu = busiest_rq->cpu,
.src_rq = busiest_rq,
.idle = CPU_IDLE,
+ /*
+ * can_migrate_task() doesn't need to compute new_dst_cpu
+ * for active balancing. Since we have CPU_IDLE but no
+ * @dst_grpmask, we make that test go away by lying
+ * about DST_PINNED.
+ */
+ .flags = LBF_DST_PINNED,
};
schedstat_inc(sd->alb_count);
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 65f61077ad50..98b59b5db90b 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -13,7 +13,7 @@
* of Berkeley Packet Filters/Linux Socket Filters.
*/
-#include <linux/atomic.h>
+#include <linux/refcount.h>
#include <linux/audit.h>
#include <linux/compat.h>
#include <linux/coredump.h>
@@ -56,7 +56,7 @@
* to a task_struct (other than @usage).
*/
struct seccomp_filter {
- atomic_t usage;
+ refcount_t usage;
struct seccomp_filter *prev;
struct bpf_prog *prog;
};
@@ -378,7 +378,7 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
return ERR_PTR(ret);
}
- atomic_set(&sfilter->usage, 1);
+ refcount_set(&sfilter->usage, 1);
return sfilter;
}
@@ -465,7 +465,7 @@ void get_seccomp_filter(struct task_struct *tsk)
if (!orig)
return;
/* Reference count is bounded by the number of total processes. */
- atomic_inc(&orig->usage);
+ refcount_inc(&orig->usage);
}
static inline void seccomp_filter_free(struct seccomp_filter *filter)
@@ -481,7 +481,7 @@ void put_seccomp_filter(struct task_struct *tsk)
{
struct seccomp_filter *orig = tsk->seccomp.filter;
/* Clean up single-reference branches iteratively. */
- while (orig && atomic_dec_and_test(&orig->usage)) {
+ while (orig && refcount_dec_and_test(&orig->usage)) {
struct seccomp_filter *freeme = orig;
orig = orig->prev;
seccomp_filter_free(freeme);
@@ -641,11 +641,12 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
return 0;
case SECCOMP_RET_KILL:
- default: {
- siginfo_t info;
+ default:
audit_seccomp(this_syscall, SIGSYS, action);
/* Dump core only if this is the last remaining thread. */
if (get_nr_threads(current) == 1) {
+ siginfo_t info;
+
/* Show the original registers in the dump. */
syscall_rollback(current, task_pt_regs(current));
/* Trigger a manual coredump since do_exit skips it. */
@@ -654,7 +655,6 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
}
do_exit(SIGSYS);
}
- }
unreachable();
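The atomic_t to refcount_t switch follows the generic conversion recipe: the saturating refcount_* helpers replace the raw atomic ops one for one. A minimal sketch of the pattern on a hypothetical object, kernel context assumed:

#include <linux/refcount.h>
#include <linux/slab.h>

struct foo {
        refcount_t usage;
        /* ... payload ... */
};

static struct foo *foo_alloc(void)
{
        struct foo *f = kzalloc(sizeof(*f), GFP_KERNEL);

        if (f)
                refcount_set(&f->usage, 1);     /* was atomic_set() */
        return f;
}

static void foo_get(struct foo *f)
{
        refcount_inc(&f->usage);                /* was atomic_inc(); saturates on overflow */
}

static void foo_put(struct foo *f)
{
        if (refcount_dec_and_test(&f->usage))   /* was atomic_dec_and_test() */
                kfree(f);
}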
diff --git a/kernel/signal.c b/kernel/signal.c
index 35a570f71f07..48a59eefd8ad 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2776,7 +2776,7 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from)
* @info: if non-null, the signal's siginfo is returned here
* @ts: upper bound on process time suspension
*/
-int do_sigtimedwait(const sigset_t *which, siginfo_t *info,
+static int do_sigtimedwait(const sigset_t *which, siginfo_t *info,
const struct timespec *ts)
{
ktime_t *to = NULL, timeout = KTIME_MAX;
@@ -2865,6 +2865,40 @@ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese,
return ret;
}
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese,
+ struct compat_siginfo __user *, uinfo,
+ struct compat_timespec __user *, uts, compat_size_t, sigsetsize)
+{
+ compat_sigset_t s32;
+ sigset_t s;
+ struct timespec t;
+ siginfo_t info;
+ long ret;
+
+ if (sigsetsize != sizeof(sigset_t))
+ return -EINVAL;
+
+ if (copy_from_user(&s32, uthese, sizeof(compat_sigset_t)))
+ return -EFAULT;
+ sigset_from_compat(&s, &s32);
+
+ if (uts) {
+ if (compat_get_timespec(&t, uts))
+ return -EFAULT;
+ }
+
+ ret = do_sigtimedwait(&s, &info, uts ? &t : NULL);
+
+ if (ret > 0 && uinfo) {
+ if (copy_siginfo_to_user32(uinfo, &info))
+ ret = -EFAULT;
+ }
+
+ return ret;
+}
+#endif
+
/**
* sys_kill - send a signal to a process
* @pid: the PID of the process
@@ -3121,78 +3155,68 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
}
static int
-do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long sp)
+do_sigaltstack (const stack_t *ss, stack_t *oss, unsigned long sp)
{
- stack_t oss;
- int error;
+ struct task_struct *t = current;
- oss.ss_sp = (void __user *) current->sas_ss_sp;
- oss.ss_size = current->sas_ss_size;
- oss.ss_flags = sas_ss_flags(sp) |
- (current->sas_ss_flags & SS_FLAG_BITS);
+ if (oss) {
+ memset(oss, 0, sizeof(stack_t));
+ oss->ss_sp = (void __user *) t->sas_ss_sp;
+ oss->ss_size = t->sas_ss_size;
+ oss->ss_flags = sas_ss_flags(sp) |
+ (current->sas_ss_flags & SS_FLAG_BITS);
+ }
- if (uss) {
- void __user *ss_sp;
- size_t ss_size;
- unsigned ss_flags;
+ if (ss) {
+ void __user *ss_sp = ss->ss_sp;
+ size_t ss_size = ss->ss_size;
+ unsigned ss_flags = ss->ss_flags;
int ss_mode;
- error = -EFAULT;
- if (!access_ok(VERIFY_READ, uss, sizeof(*uss)))
- goto out;
- error = __get_user(ss_sp, &uss->ss_sp) |
- __get_user(ss_flags, &uss->ss_flags) |
- __get_user(ss_size, &uss->ss_size);
- if (error)
- goto out;
-
- error = -EPERM;
- if (on_sig_stack(sp))
- goto out;
+ if (unlikely(on_sig_stack(sp)))
+ return -EPERM;
ss_mode = ss_flags & ~SS_FLAG_BITS;
- error = -EINVAL;
- if (ss_mode != SS_DISABLE && ss_mode != SS_ONSTACK &&
- ss_mode != 0)
- goto out;
+ if (unlikely(ss_mode != SS_DISABLE && ss_mode != SS_ONSTACK &&
+ ss_mode != 0))
+ return -EINVAL;
if (ss_mode == SS_DISABLE) {
ss_size = 0;
ss_sp = NULL;
} else {
- error = -ENOMEM;
- if (ss_size < MINSIGSTKSZ)
- goto out;
+ if (unlikely(ss_size < MINSIGSTKSZ))
+ return -ENOMEM;
}
- current->sas_ss_sp = (unsigned long) ss_sp;
- current->sas_ss_size = ss_size;
- current->sas_ss_flags = ss_flags;
+ t->sas_ss_sp = (unsigned long) ss_sp;
+ t->sas_ss_size = ss_size;
+ t->sas_ss_flags = ss_flags;
}
-
- error = 0;
- if (uoss) {
- error = -EFAULT;
- if (!access_ok(VERIFY_WRITE, uoss, sizeof(*uoss)))
- goto out;
- error = __put_user(oss.ss_sp, &uoss->ss_sp) |
- __put_user(oss.ss_size, &uoss->ss_size) |
- __put_user(oss.ss_flags, &uoss->ss_flags);
- }
-
-out:
- return error;
+ return 0;
}
+
SYSCALL_DEFINE2(sigaltstack,const stack_t __user *,uss, stack_t __user *,uoss)
{
- return do_sigaltstack(uss, uoss, current_user_stack_pointer());
+ stack_t new, old;
+ int err;
+ if (uss && copy_from_user(&new, uss, sizeof(stack_t)))
+ return -EFAULT;
+ err = do_sigaltstack(uss ? &new : NULL, uoss ? &old : NULL,
+ current_user_stack_pointer());
+ if (!err && uoss && copy_to_user(uoss, &old, sizeof(stack_t)))
+ err = -EFAULT;
+ return err;
}
int restore_altstack(const stack_t __user *uss)
{
- int err = do_sigaltstack(uss, NULL, current_user_stack_pointer());
+ stack_t new;
+ if (copy_from_user(&new, uss, sizeof(stack_t)))
+ return -EFAULT;
+ (void)do_sigaltstack(&new, NULL, current_user_stack_pointer());
/* squash all but EFAULT for now */
- return err == -EFAULT ? err : 0;
+ return 0;
}
int __save_altstack(stack_t __user *uss, unsigned long sp)
@@ -3215,29 +3239,24 @@ COMPAT_SYSCALL_DEFINE2(sigaltstack,
{
stack_t uss, uoss;
int ret;
- mm_segment_t seg;
if (uss_ptr) {
compat_stack_t uss32;
-
- memset(&uss, 0, sizeof(stack_t));
if (copy_from_user(&uss32, uss_ptr, sizeof(compat_stack_t)))
return -EFAULT;
uss.ss_sp = compat_ptr(uss32.ss_sp);
uss.ss_flags = uss32.ss_flags;
uss.ss_size = uss32.ss_size;
}
- seg = get_fs();
- set_fs(KERNEL_DS);
- ret = do_sigaltstack((stack_t __force __user *) (uss_ptr ? &uss : NULL),
- (stack_t __force __user *) &uoss,
+ ret = do_sigaltstack(uss_ptr ? &uss : NULL, &uoss,
compat_user_stack_pointer());
- set_fs(seg);
if (ret >= 0 && uoss_ptr) {
- if (!access_ok(VERIFY_WRITE, uoss_ptr, sizeof(compat_stack_t)) ||
- __put_user(ptr_to_compat(uoss.ss_sp), &uoss_ptr->ss_sp) ||
- __put_user(uoss.ss_flags, &uoss_ptr->ss_flags) ||
- __put_user(uoss.ss_size, &uoss_ptr->ss_size))
+ compat_stack_t old;
+ memset(&old, 0, sizeof(old));
+ old.ss_sp = ptr_to_compat(uoss.ss_sp);
+ old.ss_flags = uoss.ss_flags;
+ old.ss_size = uoss.ss_size;
+ if (copy_to_user(uoss_ptr, &old, sizeof(compat_stack_t)))
ret = -EFAULT;
}
return ret;
@@ -3277,6 +3296,18 @@ SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set)
return sys_rt_sigpending((sigset_t __user *)set, sizeof(old_sigset_t));
}
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE1(sigpending, compat_old_sigset_t __user *, set32)
+{
+ sigset_t set;
+ int err = do_sigpending(&set, sizeof(old_sigset_t));
+ if (err == 0)
+ if (copy_to_user(set32, &set, sizeof(old_sigset_t)))
+ err = -EFAULT;
+ return err;
+}
+#endif
+
#endif
#ifdef __ARCH_WANT_SYS_SIGPROCMASK
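For context, the userspace contract the reworked do_sigaltstack() copy path still has to honour: EPERM while running on the alternate stack, ENOMEM for undersized stacks, SS_DISABLE to tear the stack down. Ordinary usage from a program:

#include <signal.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
        stack_t ss, old;

        ss.ss_sp = malloc(SIGSTKSZ);
        ss.ss_size = SIGSTKSZ;
        ss.ss_flags = 0;
        if (!ss.ss_sp)
                return EXIT_FAILURE;

        /* EPERM if invoked while on the alternate stack,
         * ENOMEM if ss_size < MINSIGSTKSZ - the checks kept above */
        if (sigaltstack(&ss, &old) == -1) {
                perror("sigaltstack");
                return EXIT_FAILURE;
        }
        printf("previous ss_flags=%d (SS_DISABLE=%d)\n", old.ss_flags, SS_DISABLE);
        return 0;
}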
diff --git a/kernel/sys.c b/kernel/sys.c
index 8a94b4eabcaa..47d901586b4e 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -886,7 +886,7 @@ SYSCALL_DEFINE0(getegid)
return from_kgid_munged(current_user_ns(), current_egid());
}
-void do_sys_times(struct tms *tms)
+static void do_sys_times(struct tms *tms)
{
u64 tgutime, tgstime, cutime, cstime;
@@ -912,6 +912,32 @@ SYSCALL_DEFINE1(times, struct tms __user *, tbuf)
return (long) jiffies_64_to_clock_t(get_jiffies_64());
}
+#ifdef CONFIG_COMPAT
+static compat_clock_t clock_t_to_compat_clock_t(clock_t x)
+{
+ return compat_jiffies_to_clock_t(clock_t_to_jiffies(x));
+}
+
+COMPAT_SYSCALL_DEFINE1(times, struct compat_tms __user *, tbuf)
+{
+ if (tbuf) {
+ struct tms tms;
+ struct compat_tms tmp;
+
+ do_sys_times(&tms);
+ /* Convert our struct tms to the compat version. */
+ tmp.tms_utime = clock_t_to_compat_clock_t(tms.tms_utime);
+ tmp.tms_stime = clock_t_to_compat_clock_t(tms.tms_stime);
+ tmp.tms_cutime = clock_t_to_compat_clock_t(tms.tms_cutime);
+ tmp.tms_cstime = clock_t_to_compat_clock_t(tms.tms_cstime);
+ if (copy_to_user(tbuf, &tmp, sizeof(tmp)))
+ return -EFAULT;
+ }
+ force_successful_syscall_return();
+ return compat_jiffies_to_clock_t(jiffies);
+}
+#endif
+
/*
* This needs some heavy checking ...
* I just haven't the stomach for it. I also don't fully
@@ -1306,6 +1332,54 @@ SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct rlimit __user *, rlim)
return ret;
}
+#ifdef CONFIG_COMPAT
+
+COMPAT_SYSCALL_DEFINE2(setrlimit, unsigned int, resource,
+ struct compat_rlimit __user *, rlim)
+{
+ struct rlimit r;
+ struct compat_rlimit r32;
+
+ if (copy_from_user(&r32, rlim, sizeof(struct compat_rlimit)))
+ return -EFAULT;
+
+ if (r32.rlim_cur == COMPAT_RLIM_INFINITY)
+ r.rlim_cur = RLIM_INFINITY;
+ else
+ r.rlim_cur = r32.rlim_cur;
+ if (r32.rlim_max == COMPAT_RLIM_INFINITY)
+ r.rlim_max = RLIM_INFINITY;
+ else
+ r.rlim_max = r32.rlim_max;
+ return do_prlimit(current, resource, &r, NULL);
+}
+
+COMPAT_SYSCALL_DEFINE2(getrlimit, unsigned int, resource,
+ struct compat_rlimit __user *, rlim)
+{
+ struct rlimit r;
+ int ret;
+
+ ret = do_prlimit(current, resource, NULL, &r);
+ if (!ret) {
+ struct rlimit r32;
+ if (r.rlim_cur > COMPAT_RLIM_INFINITY)
+ r32.rlim_cur = COMPAT_RLIM_INFINITY;
+ else
+ r32.rlim_cur = r.rlim_cur;
+ if (r.rlim_max > COMPAT_RLIM_INFINITY)
+ r32.rlim_max = COMPAT_RLIM_INFINITY;
+ else
+ r32.rlim_max = r.rlim_max;
+
+ if (copy_to_user(rlim, &r32, sizeof(struct compat_rlimit)))
+ return -EFAULT;
+ }
+ return ret;
+}
+
+#endif
+
#ifdef __ARCH_WANT_SYS_OLD_GETRLIMIT
/*
@@ -1328,6 +1402,30 @@ SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource,
return copy_to_user(rlim, &x, sizeof(x)) ? -EFAULT : 0;
}
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource,
+ struct compat_rlimit __user *, rlim)
+{
+ struct rlimit r;
+
+ if (resource >= RLIM_NLIMITS)
+ return -EINVAL;
+
+ task_lock(current->group_leader);
+ r = current->signal->rlim[resource];
+ task_unlock(current->group_leader);
+ if (r.rlim_cur > 0x7FFFFFFF)
+ r.rlim_cur = 0x7FFFFFFF;
+ if (r.rlim_max > 0x7FFFFFFF)
+ r.rlim_max = 0x7FFFFFFF;
+
+ if (put_user(r.rlim_cur, &rlim->rlim_cur) ||
+ put_user(r.rlim_max, &rlim->rlim_max))
+ return -EFAULT;
+ return 0;
+}
+#endif
+
#endif
static inline bool rlim64_is_infinity(__u64 rlim64)
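Both new compat paths narrow 64-bit limits the same way: anything that does not fit is clamped to the 32-bit ceiling, COMPAT_RLIM_INFINITY for compat getrlimit and 0x7FFFFFFF for old_getrlimit. A stand-alone illustration, with the macro values restated locally for the example:

#include <stdint.h>
#include <stdio.h>

#define RLIM64_INFINITY         (~0ULL)
#define COMPAT_RLIM_INFINITY    0xffffffffULL

static uint64_t clamp_rlim(uint64_t lim, uint64_t max)
{
        return lim > max ? max : lim;
}

int main(void)
{
        /* unlimited clamps to the respective 32-bit ceiling */
        printf("%#llx\n", (unsigned long long)clamp_rlim(RLIM64_INFINITY, COMPAT_RLIM_INFINITY));
        printf("%#llx\n", (unsigned long long)clamp_rlim(RLIM64_INFINITY, 0x7FFFFFFF));
        /* small limits pass through untouched */
        printf("%#llx\n", (unsigned long long)clamp_rlim(8 << 20, COMPAT_RLIM_INFINITY));
        return 0;
}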
@@ -1552,7 +1650,7 @@ static void accumulate_thread_rusage(struct task_struct *t, struct rusage *r)
r->ru_oublock += task_io_get_oublock(t);
}
-static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
+void getrusage(struct task_struct *p, int who, struct rusage *r)
{
struct task_struct *t;
unsigned long flags;
@@ -1626,20 +1724,16 @@ out:
r->ru_maxrss = maxrss * (PAGE_SIZE / 1024); /* convert pages to KBs */
}
-int getrusage(struct task_struct *p, int who, struct rusage __user *ru)
+SYSCALL_DEFINE2(getrusage, int, who, struct rusage __user *, ru)
{
struct rusage r;
- k_getrusage(p, who, &r);
- return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0;
-}
-
-SYSCALL_DEFINE2(getrusage, int, who, struct rusage __user *, ru)
-{
if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN &&
who != RUSAGE_THREAD)
return -EINVAL;
- return getrusage(current, who, ru);
+
+ getrusage(current, who, &r);
+ return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0;
}
#ifdef CONFIG_COMPAT
@@ -1651,7 +1745,7 @@ COMPAT_SYSCALL_DEFINE2(getrusage, int, who, struct compat_rusage __user *, ru)
who != RUSAGE_THREAD)
return -EINVAL;
- k_getrusage(current, who, &r);
+ getrusage(current, who, &r);
return put_compat_rusage(&r, ru);
}
#endif
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index c991cf212c6d..0b8ff7d257ea 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -712,14 +712,14 @@ static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp,
alarmtimer_freezerset(absexp, type);
restart = &current->restart_block;
if (restart->nanosleep.type != TT_NONE) {
- struct timespec rmt;
+ struct timespec64 rmt;
ktime_t rem;
rem = ktime_sub(absexp, alarm_bases[type].gettime());
if (rem <= 0)
return 0;
- rmt = ktime_to_timespec(rem);
+ rmt = ktime_to_timespec64(rem);
return nanosleep_copyout(restart, &rmt);
}
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 81da124f1115..88f75f92ef36 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1440,17 +1440,17 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
}
EXPORT_SYMBOL_GPL(hrtimer_init_sleeper);
-int nanosleep_copyout(struct restart_block *restart, struct timespec *ts)
+int nanosleep_copyout(struct restart_block *restart, struct timespec64 *ts)
{
switch(restart->nanosleep.type) {
#ifdef CONFIG_COMPAT
case TT_COMPAT:
- if (compat_put_timespec(ts, restart->nanosleep.compat_rmtp))
+ if (compat_put_timespec64(ts, restart->nanosleep.compat_rmtp))
return -EFAULT;
break;
#endif
case TT_NATIVE:
- if (copy_to_user(restart->nanosleep.rmtp, ts, sizeof(struct timespec)))
+ if (put_timespec64(ts, restart->nanosleep.rmtp))
return -EFAULT;
break;
default:
@@ -1485,11 +1485,11 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod
restart = &current->restart_block;
if (restart->nanosleep.type != TT_NONE) {
ktime_t rem = hrtimer_expires_remaining(&t->timer);
- struct timespec rmt;
+ struct timespec64 rmt;
if (rem <= 0)
return 0;
- rmt = ktime_to_timespec(rem);
+ rmt = ktime_to_timespec64(rem);
return nanosleep_copyout(restart, &rmt);
}
@@ -1546,19 +1546,17 @@ out:
SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp,
struct timespec __user *, rmtp)
{
- struct timespec64 tu64;
- struct timespec tu;
+ struct timespec64 tu;
- if (copy_from_user(&tu, rqtp, sizeof(tu)))
+ if (get_timespec64(&tu, rqtp))
return -EFAULT;
- tu64 = timespec_to_timespec64(tu);
- if (!timespec64_valid(&tu64))
+ if (!timespec64_valid(&tu))
return -EINVAL;
current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE;
current->restart_block.nanosleep.rmtp = rmtp;
- return hrtimer_nanosleep(&tu64, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
+ return hrtimer_nanosleep(&tu, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
}
#ifdef CONFIG_COMPAT
@@ -1566,19 +1564,17 @@ SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp,
COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp,
struct compat_timespec __user *, rmtp)
{
- struct timespec64 tu64;
- struct timespec tu;
+ struct timespec64 tu;
- if (compat_get_timespec(&tu, rqtp))
+ if (compat_get_timespec64(&tu, rqtp))
return -EFAULT;
- tu64 = timespec_to_timespec64(tu);
- if (!timespec64_valid(&tu64))
+ if (!timespec64_valid(&tu))
return -EINVAL;
current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE;
current->restart_block.nanosleep.compat_rmtp = rmtp;
- return hrtimer_nanosleep(&tu64, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
+ return hrtimer_nanosleep(&tu, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
}
#endif
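The rmtp plumbing converted here backs the familiar userspace behaviour: a nanosleep() interrupted by a signal reports the unslept time through its second argument. A small program exercising that path:

#include <errno.h>
#include <signal.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

static void on_alarm(int sig)
{
        (void)sig;                      /* only here to interrupt the sleep */
}

int main(void)
{
        struct timespec req = { .tv_sec = 5, .tv_nsec = 0 };
        struct timespec rem = { 0, 0 };
        struct sigaction sa;

        sa.sa_handler = on_alarm;
        sa.sa_flags = 0;                /* no SA_RESTART: let nanosleep() fail with EINTR */
        sigemptyset(&sa.sa_mask);
        sigaction(SIGALRM, &sa, NULL);
        alarm(1);

        if (nanosleep(&req, &rem) == -1 && errno == EINTR)
                printf("interrupted, about %lld.%09ld s left\n",
                       (long long)rem.tv_sec, rem.tv_nsec);
        return 0;
}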
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 60cb24ac9ebc..a3bd5dbe0dc4 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -1318,12 +1318,8 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
*/
restart = &current->restart_block;
restart->nanosleep.expires = expires;
- if (restart->nanosleep.type != TT_NONE) {
- struct timespec ts;
-
- ts = timespec64_to_timespec(it.it_value);
- error = nanosleep_copyout(restart, &ts);
- }
+ if (restart->nanosleep.type != TT_NONE)
+ error = nanosleep_copyout(restart, &it.it_value);
}
return error;
diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c
index 38f3b20efa29..06f34feb635e 100644
--- a/kernel/time/posix-stubs.c
+++ b/kernel/time/posix-stubs.c
@@ -41,12 +41,6 @@ SYS_NI(setitimer);
#ifdef __ARCH_WANT_SYS_ALARM
SYS_NI(alarm);
#endif
-COMPAT_SYS_NI(timer_create);
-COMPAT_SYS_NI(clock_adjtime);
-COMPAT_SYS_NI(timer_settime);
-COMPAT_SYS_NI(timer_gettime);
-COMPAT_SYS_NI(getitimer);
-COMPAT_SYS_NI(setitimer);
/*
* We preserve minimal support for CLOCK_REALTIME and CLOCK_MONOTONIC
@@ -57,40 +51,52 @@ COMPAT_SYS_NI(setitimer);
SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
const struct timespec __user *, tp)
{
- struct timespec64 new_tp64;
- struct timespec new_tp;
+ struct timespec64 new_tp;
if (which_clock != CLOCK_REALTIME)
return -EINVAL;
- if (copy_from_user(&new_tp, tp, sizeof (*tp)))
+ if (get_timespec64(&new_tp, tp))
return -EFAULT;
- new_tp64 = timespec_to_timespec64(new_tp);
- return do_sys_settimeofday64(&new_tp64, NULL);
+ return do_sys_settimeofday64(&new_tp, NULL);
}
-SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
- struct timespec __user *,tp)
+int do_clock_gettime(clockid_t which_clock, struct timespec64 *tp)
{
- struct timespec64 kernel_tp64;
- struct timespec kernel_tp;
-
switch (which_clock) {
- case CLOCK_REALTIME: ktime_get_real_ts64(&kernel_tp64); break;
- case CLOCK_MONOTONIC: ktime_get_ts64(&kernel_tp64); break;
- case CLOCK_BOOTTIME: get_monotonic_boottime64(&kernel_tp64); break;
- default: return -EINVAL;
+ case CLOCK_REALTIME:
+ ktime_get_real_ts64(tp);
+ break;
+ case CLOCK_MONOTONIC:
+ ktime_get_ts64(tp);
+ break;
+ case CLOCK_BOOTTIME:
+ get_monotonic_boottime64(tp);
+ break;
+ default:
+ return -EINVAL;
}
- kernel_tp = timespec64_to_timespec(kernel_tp64);
- if (copy_to_user(tp, &kernel_tp, sizeof (kernel_tp)))
+ return 0;
+}
+SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
+ struct timespec __user *, tp)
+{
+ int ret;
+ struct timespec64 kernel_tp;
+
+ ret = do_clock_gettime(which_clock, &kernel_tp);
+ if (ret)
+ return ret;
+
+ if (put_timespec64(&kernel_tp, tp))
return -EFAULT;
return 0;
}
SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, struct timespec __user *, tp)
{
- struct timespec rtn_tp = {
+ struct timespec64 rtn_tp = {
.tv_sec = 0,
.tv_nsec = hrtimer_resolution,
};
@@ -99,7 +105,7 @@ SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, struct timespec __us
case CLOCK_REALTIME:
case CLOCK_MONOTONIC:
case CLOCK_BOOTTIME:
- if (copy_to_user(tp, &rtn_tp, sizeof(rtn_tp)))
+ if (put_timespec64(&rtn_tp, tp))
return -EFAULT;
return 0;
default:
@@ -138,44 +144,45 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
}
#ifdef CONFIG_COMPAT
+COMPAT_SYS_NI(timer_create);
+COMPAT_SYS_NI(clock_adjtime);
+COMPAT_SYS_NI(timer_settime);
+COMPAT_SYS_NI(timer_gettime);
+COMPAT_SYS_NI(getitimer);
+COMPAT_SYS_NI(setitimer);
+
COMPAT_SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
struct compat_timespec __user *, tp)
{
- struct timespec64 new_tp64;
- struct timespec new_tp;
+ struct timespec64 new_tp;
if (which_clock != CLOCK_REALTIME)
return -EINVAL;
- if (compat_get_timespec(&new_tp, tp))
+ if (compat_get_timespec64(&new_tp, tp))
return -EFAULT;
- new_tp64 = timespec_to_timespec64(new_tp);
- return do_sys_settimeofday64(&new_tp64, NULL);
+ return do_sys_settimeofday64(&new_tp, NULL);
}
-COMPAT_SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
- struct compat_timespec __user *,tp)
+COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock,
+ struct compat_timespec __user *, tp)
{
- struct timespec64 kernel_tp64;
- struct timespec kernel_tp;
+ int ret;
+ struct timespec64 kernel_tp;
- switch (which_clock) {
- case CLOCK_REALTIME: ktime_get_real_ts64(&kernel_tp64); break;
- case CLOCK_MONOTONIC: ktime_get_ts64(&kernel_tp64); break;
- case CLOCK_BOOTTIME: get_monotonic_boottime64(&kernel_tp64); break;
- default: return -EINVAL;
- }
+ ret = do_clock_gettime(which_clock, &kernel_tp);
+ if (ret)
+ return ret;
- kernel_tp = timespec64_to_timespec(kernel_tp64);
- if (compat_put_timespec(&kernel_tp, tp))
+ if (compat_put_timespec64(&kernel_tp, tp))
return -EFAULT;
return 0;
}
-COMPAT_SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock,
+COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock,
struct compat_timespec __user *, tp)
{
- struct timespec rtn_tp = {
+ struct timespec64 rtn_tp = {
.tv_sec = 0,
.tv_nsec = hrtimer_resolution,
};
@@ -184,13 +191,14 @@ COMPAT_SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock,
case CLOCK_REALTIME:
case CLOCK_MONOTONIC:
case CLOCK_BOOTTIME:
- if (compat_put_timespec(&rtn_tp, tp))
+ if (compat_put_timespec64(&rtn_tp, tp))
return -EFAULT;
return 0;
default:
return -EINVAL;
}
}
+
COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags,
struct compat_timespec __user *, rqtp,
struct compat_timespec __user *, rmtp)
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 82d67be7d9d1..13d6881f908b 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -739,13 +739,11 @@ static int do_timer_gettime(timer_t timer_id, struct itimerspec64 *setting)
SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
struct itimerspec __user *, setting)
{
- struct itimerspec64 cur_setting64;
+ struct itimerspec64 cur_setting;
- int ret = do_timer_gettime(timer_id, &cur_setting64);
+ int ret = do_timer_gettime(timer_id, &cur_setting);
if (!ret) {
- struct itimerspec cur_setting;
- cur_setting = itimerspec64_to_itimerspec(&cur_setting64);
- if (copy_to_user(setting, &cur_setting, sizeof (cur_setting)))
+ if (put_itimerspec64(&cur_setting, setting))
ret = -EFAULT;
}
return ret;
@@ -755,13 +753,11 @@ SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
COMPAT_SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
struct compat_itimerspec __user *, setting)
{
- struct itimerspec64 cur_setting64;
+ struct itimerspec64 cur_setting;
- int ret = do_timer_gettime(timer_id, &cur_setting64);
+ int ret = do_timer_gettime(timer_id, &cur_setting);
if (!ret) {
- struct itimerspec cur_setting;
- cur_setting = itimerspec64_to_itimerspec(&cur_setting64);
- if (put_compat_itimerspec(setting, &cur_setting))
+ if (put_compat_itimerspec64(&cur_setting, setting))
ret = -EFAULT;
}
return ret;
@@ -907,23 +903,19 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
const struct itimerspec __user *, new_setting,
struct itimerspec __user *, old_setting)
{
- struct itimerspec64 new_spec64, old_spec64;
- struct itimerspec64 *rtn = old_setting ? &old_spec64 : NULL;
- struct itimerspec new_spec;
+ struct itimerspec64 new_spec, old_spec;
+ struct itimerspec64 *rtn = old_setting ? &old_spec : NULL;
int error = 0;
if (!new_setting)
return -EINVAL;
- if (copy_from_user(&new_spec, new_setting, sizeof (new_spec)))
+ if (get_itimerspec64(&new_spec, new_setting))
return -EFAULT;
- new_spec64 = itimerspec_to_itimerspec64(&new_spec);
- error = do_timer_settime(timer_id, flags, &new_spec64, rtn);
+ error = do_timer_settime(timer_id, flags, &new_spec, rtn);
if (!error && old_setting) {
- struct itimerspec old_spec;
- old_spec = itimerspec64_to_itimerspec(&old_spec64);
- if (copy_to_user(old_setting, &old_spec, sizeof (old_spec)))
+ if (put_itimerspec64(&old_spec, old_setting))
error = -EFAULT;
}
return error;
@@ -934,22 +926,18 @@ COMPAT_SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
struct compat_itimerspec __user *, new,
struct compat_itimerspec __user *, old)
{
- struct itimerspec64 new_spec64, old_spec64;
- struct itimerspec64 *rtn = old ? &old_spec64 : NULL;
- struct itimerspec new_spec;
+ struct itimerspec64 new_spec, old_spec;
+ struct itimerspec64 *rtn = old ? &old_spec : NULL;
int error = 0;
if (!new)
return -EINVAL;
- if (get_compat_itimerspec(&new_spec, new))
+ if (get_compat_itimerspec64(&new_spec, new))
return -EFAULT;
- new_spec64 = itimerspec_to_itimerspec64(&new_spec);
- error = do_timer_settime(timer_id, flags, &new_spec64, rtn);
+ error = do_timer_settime(timer_id, flags, &new_spec, rtn);
if (!error && old) {
- struct itimerspec old_spec;
- old_spec = itimerspec64_to_itimerspec(&old_spec64);
- if (put_compat_itimerspec(old, &old_spec))
+ if (put_compat_itimerspec64(&old_spec, old))
error = -EFAULT;
}
return error;
@@ -1049,34 +1037,30 @@ SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
const struct timespec __user *, tp)
{
const struct k_clock *kc = clockid_to_kclock(which_clock);
- struct timespec64 new_tp64;
- struct timespec new_tp;
+ struct timespec64 new_tp;
if (!kc || !kc->clock_set)
return -EINVAL;
- if (copy_from_user(&new_tp, tp, sizeof (*tp)))
+ if (get_timespec64(&new_tp, tp))
return -EFAULT;
- new_tp64 = timespec_to_timespec64(new_tp);
- return kc->clock_set(which_clock, &new_tp64);
+ return kc->clock_set(which_clock, &new_tp);
}
SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
struct timespec __user *,tp)
{
const struct k_clock *kc = clockid_to_kclock(which_clock);
- struct timespec64 kernel_tp64;
- struct timespec kernel_tp;
+ struct timespec64 kernel_tp;
int error;
if (!kc)
return -EINVAL;
- error = kc->clock_get(which_clock, &kernel_tp64);
- kernel_tp = timespec64_to_timespec(kernel_tp64);
+ error = kc->clock_get(which_clock, &kernel_tp);
- if (!error && copy_to_user(tp, &kernel_tp, sizeof (kernel_tp)))
+ if (!error && put_timespec64(&kernel_tp, tp))
error = -EFAULT;
return error;
@@ -1109,17 +1093,15 @@ SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock,
struct timespec __user *, tp)
{
const struct k_clock *kc = clockid_to_kclock(which_clock);
- struct timespec64 rtn_tp64;
- struct timespec rtn_tp;
+ struct timespec64 rtn_tp;
int error;
if (!kc)
return -EINVAL;
- error = kc->clock_getres(which_clock, &rtn_tp64);
- rtn_tp = timespec64_to_timespec(rtn_tp64);
+ error = kc->clock_getres(which_clock, &rtn_tp);
- if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp)))
+ if (!error && tp && put_timespec64(&rtn_tp, tp))
error = -EFAULT;
return error;
@@ -1131,38 +1113,33 @@ COMPAT_SYSCALL_DEFINE2(clock_settime, clockid_t, which_clock,
struct compat_timespec __user *, tp)
{
const struct k_clock *kc = clockid_to_kclock(which_clock);
- struct timespec64 new_tp64;
- struct timespec new_tp;
+ struct timespec64 ts;
if (!kc || !kc->clock_set)
return -EINVAL;
- if (compat_get_timespec(&new_tp, tp))
+ if (compat_get_timespec64(&ts, tp))
return -EFAULT;
- new_tp64 = timespec_to_timespec64(new_tp);
-
- return kc->clock_set(which_clock, &new_tp64);
+ return kc->clock_set(which_clock, &ts);
}
COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock,
struct compat_timespec __user *, tp)
{
const struct k_clock *kc = clockid_to_kclock(which_clock);
- struct timespec64 kernel_tp64;
- struct timespec kernel_tp;
- int error;
+ struct timespec64 ts;
+ int err;
if (!kc)
return -EINVAL;
- error = kc->clock_get(which_clock, &kernel_tp64);
- kernel_tp = timespec64_to_timespec(kernel_tp64);
+ err = kc->clock_get(which_clock, &ts);
- if (!error && compat_put_timespec(&kernel_tp, tp))
- error = -EFAULT;
+ if (!err && compat_put_timespec64(&ts, tp))
+ err = -EFAULT;
- return error;
+ return err;
}
COMPAT_SYSCALL_DEFINE2(clock_adjtime, clockid_t, which_clock,
@@ -1193,21 +1170,19 @@ COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock,
struct compat_timespec __user *, tp)
{
const struct k_clock *kc = clockid_to_kclock(which_clock);
- struct timespec64 rtn_tp64;
- struct timespec rtn_tp;
- int error;
+ struct timespec64 ts;
+ int err;
if (!kc)
return -EINVAL;
- error = kc->clock_getres(which_clock, &rtn_tp64);
- rtn_tp = timespec64_to_timespec(rtn_tp64);
-
- if (!error && tp && compat_put_timespec(&rtn_tp, tp))
- error = -EFAULT;
+ err = kc->clock_getres(which_clock, &ts);
+ if (!err && tp && compat_put_timespec64(&ts, tp))
+ return -EFAULT;
- return error;
+ return err;
}
+
#endif
/*
@@ -1226,26 +1201,24 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
struct timespec __user *, rmtp)
{
const struct k_clock *kc = clockid_to_kclock(which_clock);
- struct timespec64 t64;
- struct timespec t;
+ struct timespec64 t;
if (!kc)
return -EINVAL;
if (!kc->nsleep)
return -ENANOSLEEP_NOTSUP;
- if (copy_from_user(&t, rqtp, sizeof (struct timespec)))
+ if (get_timespec64(&t, rqtp))
return -EFAULT;
- t64 = timespec_to_timespec64(t);
- if (!timespec64_valid(&t64))
+ if (!timespec64_valid(&t))
return -EINVAL;
if (flags & TIMER_ABSTIME)
rmtp = NULL;
current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE;
current->restart_block.nanosleep.rmtp = rmtp;
- return kc->nsleep(which_clock, flags, &t64);
+ return kc->nsleep(which_clock, flags, &t);
}
#ifdef CONFIG_COMPAT
@@ -1254,26 +1227,24 @@ COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags,
struct compat_timespec __user *, rmtp)
{
const struct k_clock *kc = clockid_to_kclock(which_clock);
- struct timespec64 t64;
- struct timespec t;
+ struct timespec64 t;
if (!kc)
return -EINVAL;
if (!kc->nsleep)
return -ENANOSLEEP_NOTSUP;
- if (compat_get_timespec(&t, rqtp))
+ if (compat_get_timespec64(&t, rqtp))
return -EFAULT;
- t64 = timespec_to_timespec64(t);
- if (!timespec64_valid(&t64))
+ if (!timespec64_valid(&t))
return -EINVAL;
if (flags & TIMER_ABSTIME)
rmtp = NULL;
current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE;
current->restart_block.nanosleep.compat_rmtp = rmtp;
- return kc->nsleep(which_clock, flags, &t64);
+ return kc->nsleep(which_clock, flags, &t);
}
#endif
diff --git a/kernel/time/time.c b/kernel/time/time.c
index 7c89e437c4d7..44a8c1402133 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -890,3 +890,61 @@ struct timespec64 timespec64_add_safe(const struct timespec64 lhs,
return res;
}
+
+int get_timespec64(struct timespec64 *ts,
+ const struct timespec __user *uts)
+{
+ struct timespec kts;
+ int ret;
+
+ ret = copy_from_user(&kts, uts, sizeof(kts));
+ if (ret)
+ return -EFAULT;
+
+ ts->tv_sec = kts.tv_sec;
+ ts->tv_nsec = kts.tv_nsec;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(get_timespec64);
+
+int put_timespec64(const struct timespec64 *ts,
+ struct timespec __user *uts)
+{
+ struct timespec kts = {
+ .tv_sec = ts->tv_sec,
+ .tv_nsec = ts->tv_nsec
+ };
+ return copy_to_user(uts, &kts, sizeof(kts)) ? -EFAULT : 0;
+}
+EXPORT_SYMBOL_GPL(put_timespec64);
+
+int get_itimerspec64(struct itimerspec64 *it,
+ const struct itimerspec __user *uit)
+{
+ int ret;
+
+ ret = get_timespec64(&it->it_interval, &uit->it_interval);
+ if (ret)
+ return ret;
+
+ ret = get_timespec64(&it->it_value, &uit->it_value);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(get_itimerspec64);
+
+int put_itimerspec64(const struct itimerspec64 *it,
+ struct itimerspec __user *uit)
+{
+ int ret;
+
+ ret = put_timespec64(&it->it_interval, &uit->it_interval);
+ if (ret)
+ return ret;
+
+ ret = put_timespec64(&it->it_value, &uit->it_value);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(put_itimerspec64);
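The helpers copy field by field because struct timespec and struct timespec64 need not share a layout; on 32-bit architectures tv_sec is only 32 bits wide in the former. A userspace analogue of the widening, with locally defined stand-ins for the two layouts:

#include <stdint.h>
#include <stdio.h>

struct ts32 { int32_t tv_sec; int32_t tv_nsec; };       /* 32-bit style layout */
struct ts64 { int64_t tv_sec; int64_t tv_nsec; };       /* timespec64 style layout */

/* same shape as get_timespec64(): widen one field at a time */
static void widen(struct ts64 *dst, const struct ts32 *src)
{
        dst->tv_sec = src->tv_sec;
        dst->tv_nsec = src->tv_nsec;
}

int main(void)
{
        struct ts32 in = { .tv_sec = INT32_MAX, .tv_nsec = 999999999 };
        struct ts64 out;

        widen(&out, &in);
        printf("%lld.%09lld\n", (long long)out.tv_sec, (long long)out.tv_nsec);
        return 0;
}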
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 7e06f04e98fe..434c840e2d82 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -667,30 +667,30 @@ config RING_BUFFER_STARTUP_TEST
If unsure, say N
-config TRACE_ENUM_MAP_FILE
- bool "Show enum mappings for trace events"
+config TRACE_EVAL_MAP_FILE
+ bool "Show eval mappings for trace events"
depends on TRACING
help
- The "print fmt" of the trace events will show the enum names instead
- of their values. This can cause problems for user space tools that
- use this string to parse the raw data as user space does not know
+ The "print fmt" of the trace events will show the enum/sizeof names
+ instead of their values. This can cause problems for user space tools
+ that use this string to parse the raw data as user space does not know
how to convert the string to its value.
To fix this, there's a special macro in the kernel that can be used
- to convert the enum into its value. If this macro is used, then the
- print fmt strings will have the enums converted to their values.
+ to convert an enum/sizeof into its value. If this macro is used, then
+ the print fmt strings will be converted to their values.
If something does not get converted properly, this option can be
- used to show what enums the kernel tried to convert.
+ used to show which enum/sizeof expressions the kernel tried to convert.
- This option is for debugging the enum conversions. A file is created
- in the tracing directory called "enum_map" that will show the enum
+ This option is for debugging the conversions. A file is created
+ in the tracing directory called "eval_map" that will show the
names matched with their values and what trace event system they
belong to.
Normally, the mapping of the strings to values will be freed after
boot up or module load. With this option, they will not be freed, as
- they are needed for the "enum_map" file. Enabling this option will
+ they are needed for the "eval_map" file. Enabling this option will
increase the memory footprint of the running kernel.
If unsure, say N
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 460a031c77e5..37385193a608 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -122,8 +122,8 @@ static const struct bpf_func_proto *bpf_get_probe_write_proto(void)
}
/*
- * limited trace_printk()
- * only %d %u %x %ld %lu %lx %lld %llu %llx %p %s conversion specifiers allowed
+ * Only limited trace_printk() conversion specifiers allowed:
+ * %d %i %u %x %ld %li %lu %lx %lld %lli %llu %llx %p %s
*/
BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1,
u64, arg2, u64, arg3)
@@ -198,7 +198,8 @@ BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1,
i++;
}
- if (fmt[i] != 'd' && fmt[i] != 'u' && fmt[i] != 'x')
+ if (fmt[i] != 'i' && fmt[i] != 'd' &&
+ fmt[i] != 'u' && fmt[i] != 'x')
return -EINVAL;
fmt_cnt++;
}
@@ -234,7 +235,8 @@ BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags)
unsigned int cpu = smp_processor_id();
u64 index = flags & BPF_F_INDEX_MASK;
struct bpf_event_entry *ee;
- struct perf_event *event;
+ u64 value = 0;
+ int err;
if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
return -EINVAL;
@@ -247,21 +249,14 @@ BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags)
if (!ee)
return -ENOENT;
- event = ee->event;
- if (unlikely(event->attr.type != PERF_TYPE_HARDWARE &&
- event->attr.type != PERF_TYPE_RAW))
- return -EINVAL;
-
- /* make sure event is local and doesn't have pmu::count */
- if (unlikely(event->oncpu != cpu || event->pmu->count))
- return -EINVAL;
-
+ err = perf_event_read_local(ee->event, &value);
/*
- * we don't know if the function is run successfully by the
- * return value. It can be judged in other places, such as
- * eBPF programs.
+ * this api is ugly since we miss [-22..-2] range of valid
+ * counter values, but that's uapi
*/
- return perf_event_read_local(event);
+ if (err)
+ return err;
+ return value;
}
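A small illustration of the uapi wart the comment above mentions: errors and counter values share one u64 return, so from the BPF program's side these two cases are indistinguishable (kernel-side types, values hypothetical):

	u64 err_path  = (u64)(s64)-EINVAL;	/* helper failed with -22           */
	u64 data_path = 0xffffffffffffffeaULL;	/* counter genuinely read (u64)-22  */

	/* err_path == data_path, hence only the [-22..-2] window is ambiguous;
	 * anything outside it is an unambiguous counter reading. */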
static const struct bpf_func_proto bpf_perf_event_read_proto = {
@@ -272,14 +267,16 @@ static const struct bpf_func_proto bpf_perf_event_read_proto = {
.arg2_type = ARG_ANYTHING,
};
+static DEFINE_PER_CPU(struct perf_sample_data, bpf_sd);
+
static __always_inline u64
__bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
u64 flags, struct perf_raw_record *raw)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
+ struct perf_sample_data *sd = this_cpu_ptr(&bpf_sd);
unsigned int cpu = smp_processor_id();
u64 index = flags & BPF_F_INDEX_MASK;
- struct perf_sample_data sample_data;
struct bpf_event_entry *ee;
struct perf_event *event;
@@ -300,9 +297,9 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
if (unlikely(event->oncpu != cpu))
return -EOPNOTSUPP;
- perf_sample_data_init(&sample_data, 0, 0);
- sample_data.raw = raw;
- perf_event_output(event, &sample_data, regs);
+ perf_sample_data_init(sd, 0, 0);
+ sd->raw = raw;
+ perf_event_output(event, sd, regs);
return 0;
}
@@ -483,7 +480,7 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
/* bpf+kprobe programs can access fields of 'struct pt_regs' */
static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type,
- enum bpf_reg_type *reg_type)
+ struct bpf_insn_access_aux *info)
{
if (off < 0 || off >= sizeof(struct pt_regs))
return false;
@@ -566,7 +563,7 @@ static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id)
}
static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type,
- enum bpf_reg_type *reg_type)
+ struct bpf_insn_access_aux *info)
{
if (off < sizeof(void *) || off >= PERF_MAX_TRACE_SIZE)
return false;
@@ -585,40 +582,47 @@ const struct bpf_verifier_ops tracepoint_prog_ops = {
};
static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type,
- enum bpf_reg_type *reg_type)
+ struct bpf_insn_access_aux *info)
{
+ const int size_sp = FIELD_SIZEOF(struct bpf_perf_event_data,
+ sample_period);
+
if (off < 0 || off >= sizeof(struct bpf_perf_event_data))
return false;
if (type != BPF_READ)
return false;
if (off % size != 0)
return false;
- if (off == offsetof(struct bpf_perf_event_data, sample_period)) {
- if (size != sizeof(u64))
+
+ switch (off) {
+ case bpf_ctx_range(struct bpf_perf_event_data, sample_period):
+ bpf_ctx_record_field_size(info, size_sp);
+ if (!bpf_ctx_narrow_access_ok(off, size, size_sp))
return false;
- } else {
+ break;
+ default:
if (size != sizeof(long))
return false;
}
+
return true;
}
static u32 pe_prog_convert_ctx_access(enum bpf_access_type type,
const struct bpf_insn *si,
struct bpf_insn *insn_buf,
- struct bpf_prog *prog)
+ struct bpf_prog *prog, u32 *target_size)
{
struct bpf_insn *insn = insn_buf;
switch (si->off) {
case offsetof(struct bpf_perf_event_data, sample_period):
- BUILD_BUG_ON(FIELD_SIZEOF(struct perf_sample_data, period) != sizeof(u64));
-
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern,
data), si->dst_reg, si->src_reg,
offsetof(struct bpf_perf_event_data_kern, data));
*insn++ = BPF_LDX_MEM(BPF_DW, si->dst_reg, si->dst_reg,
- offsetof(struct perf_sample_data, period));
+ bpf_target_off(struct perf_sample_data, period, 8,
+ target_size));
break;
default:
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern,
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index b308be30dfb9..2953d558bbee 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1293,6 +1293,28 @@ static void ftrace_hash_clear(struct ftrace_hash *hash)
FTRACE_WARN_ON(hash->count);
}
+static void free_ftrace_mod(struct ftrace_mod_load *ftrace_mod)
+{
+ list_del(&ftrace_mod->list);
+ kfree(ftrace_mod->module);
+ kfree(ftrace_mod->func);
+ kfree(ftrace_mod);
+}
+
+static void clear_ftrace_mod_list(struct list_head *head)
+{
+ struct ftrace_mod_load *p, *n;
+
+ /* stack tracer isn't supported yet */
+ if (!head)
+ return;
+
+ mutex_lock(&ftrace_lock);
+ list_for_each_entry_safe(p, n, head, list)
+ free_ftrace_mod(p);
+ mutex_unlock(&ftrace_lock);
+}
+
static void free_ftrace_hash(struct ftrace_hash *hash)
{
if (!hash || hash == EMPTY_HASH)
@@ -1346,6 +1368,35 @@ static struct ftrace_hash *alloc_ftrace_hash(int size_bits)
return hash;
}
+
+static int ftrace_add_mod(struct trace_array *tr,
+ const char *func, const char *module,
+ int enable)
+{
+ struct ftrace_mod_load *ftrace_mod;
+ struct list_head *mod_head = enable ? &tr->mod_trace : &tr->mod_notrace;
+
+ ftrace_mod = kzalloc(sizeof(*ftrace_mod), GFP_KERNEL);
+ if (!ftrace_mod)
+ return -ENOMEM;
+
+ ftrace_mod->func = kstrdup(func, GFP_KERNEL);
+ ftrace_mod->module = kstrdup(module, GFP_KERNEL);
+ ftrace_mod->enable = enable;
+
+ if (!ftrace_mod->func || !ftrace_mod->module)
+ goto out_free;
+
+ list_add(&ftrace_mod->list, mod_head);
+
+ return 0;
+
+ out_free:
+ free_ftrace_mod(ftrace_mod);
+
+ return -ENOMEM;
+}
+
static struct ftrace_hash *
alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash)
{
@@ -1359,6 +1410,9 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash)
if (!new_hash)
return NULL;
+ if (hash)
+ new_hash->flags = hash->flags;
+
/* Empty hash? */
if (ftrace_hash_empty(hash))
return new_hash;
@@ -1403,7 +1457,7 @@ __ftrace_hash_move(struct ftrace_hash *src)
/*
* If the new source is empty, just return the empty_hash.
*/
- if (!src->count)
+ if (ftrace_hash_empty(src))
return EMPTY_HASH;
/*
@@ -1420,6 +1474,8 @@ __ftrace_hash_move(struct ftrace_hash *src)
if (!new_hash)
return NULL;
+ new_hash->flags = src->flags;
+
size = 1 << src->size_bits;
for (i = 0; i < size; i++) {
hhd = &src->buckets[i];
@@ -1650,7 +1706,7 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops,
struct dyn_ftrace *rec;
bool update = false;
int count = 0;
- int all = 0;
+ int all = false;
/* Only update if the ops has been registered */
if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
@@ -1671,7 +1727,7 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops,
hash = ops->func_hash->filter_hash;
other_hash = ops->func_hash->notrace_hash;
if (ftrace_hash_empty(hash))
- all = 1;
+ all = true;
} else {
inc = !inc;
hash = ops->func_hash->notrace_hash;
@@ -3061,6 +3117,7 @@ ftrace_allocate_pages(unsigned long num_to_init)
struct ftrace_iterator {
loff_t pos;
loff_t func_pos;
+ loff_t mod_pos;
struct ftrace_page *pg;
struct dyn_ftrace *func;
struct ftrace_func_probe *probe;
@@ -3068,6 +3125,8 @@ struct ftrace_iterator {
struct trace_parser parser;
struct ftrace_hash *hash;
struct ftrace_ops *ops;
+ struct trace_array *tr;
+ struct list_head *mod_list;
int pidx;
int idx;
unsigned flags;
@@ -3152,13 +3211,13 @@ static void *t_probe_start(struct seq_file *m, loff_t *pos)
if (!(iter->flags & FTRACE_ITER_DO_PROBES))
return NULL;
- if (iter->func_pos > *pos)
+ if (iter->mod_pos > *pos)
return NULL;
iter->probe = NULL;
iter->probe_entry = NULL;
iter->pidx = 0;
- for (l = 0; l <= (*pos - iter->func_pos); ) {
+ for (l = 0; l <= (*pos - iter->mod_pos); ) {
p = t_probe_next(m, &l);
if (!p)
break;
@@ -3197,6 +3256,82 @@ t_probe_show(struct seq_file *m, struct ftrace_iterator *iter)
}
static void *
+t_mod_next(struct seq_file *m, loff_t *pos)
+{
+ struct ftrace_iterator *iter = m->private;
+ struct trace_array *tr = iter->tr;
+
+ (*pos)++;
+ iter->pos = *pos;
+
+ iter->mod_list = iter->mod_list->next;
+
+ if (iter->mod_list == &tr->mod_trace ||
+ iter->mod_list == &tr->mod_notrace) {
+ iter->flags &= ~FTRACE_ITER_MOD;
+ return NULL;
+ }
+
+ iter->mod_pos = *pos;
+
+ return iter;
+}
+
+static void *t_mod_start(struct seq_file *m, loff_t *pos)
+{
+ struct ftrace_iterator *iter = m->private;
+ void *p = NULL;
+ loff_t l;
+
+ if (iter->func_pos > *pos)
+ return NULL;
+
+ iter->mod_pos = iter->func_pos;
+
+ /* mod entries (and probes) are only available if tr is set */
+ if (!iter->tr)
+ return NULL;
+
+ for (l = 0; l <= (*pos - iter->func_pos); ) {
+ p = t_mod_next(m, &l);
+ if (!p)
+ break;
+ }
+ if (!p) {
+ iter->flags &= ~FTRACE_ITER_MOD;
+ return t_probe_start(m, pos);
+ }
+
+ /* Only set this if we have an item */
+ iter->flags |= FTRACE_ITER_MOD;
+
+ return iter;
+}
+
+static int
+t_mod_show(struct seq_file *m, struct ftrace_iterator *iter)
+{
+ struct ftrace_mod_load *ftrace_mod;
+ struct trace_array *tr = iter->tr;
+
+ if (WARN_ON_ONCE(!iter->mod_list) ||
+ iter->mod_list == &tr->mod_trace ||
+ iter->mod_list == &tr->mod_notrace)
+ return -EIO;
+
+ ftrace_mod = list_entry(iter->mod_list, struct ftrace_mod_load, list);
+
+ if (ftrace_mod->func)
+ seq_printf(m, "%s", ftrace_mod->func);
+ else
+ seq_putc(m, '*');
+
+ seq_printf(m, ":mod:%s\n", ftrace_mod->module);
+
+ return 0;
+}
+
+static void *
t_func_next(struct seq_file *m, loff_t *pos)
{
struct ftrace_iterator *iter = m->private;
@@ -3237,7 +3372,7 @@ static void *
t_next(struct seq_file *m, void *v, loff_t *pos)
{
struct ftrace_iterator *iter = m->private;
- loff_t l = *pos; /* t_hash_start() must use original pos */
+ loff_t l = *pos; /* t_probe_start() must use original pos */
void *ret;
if (unlikely(ftrace_disabled))
@@ -3246,16 +3381,19 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
if (iter->flags & FTRACE_ITER_PROBE)
return t_probe_next(m, pos);
+ if (iter->flags & FTRACE_ITER_MOD)
+ return t_mod_next(m, pos);
+
if (iter->flags & FTRACE_ITER_PRINTALL) {
/* next must increment pos, and t_probe_start does not */
(*pos)++;
- return t_probe_start(m, &l);
+ return t_mod_start(m, &l);
}
ret = t_func_next(m, pos);
if (!ret)
- return t_probe_start(m, &l);
+ return t_mod_start(m, &l);
return ret;
}
@@ -3264,7 +3402,7 @@ static void reset_iter_read(struct ftrace_iterator *iter)
{
iter->pos = 0;
iter->func_pos = 0;
- iter->flags &= ~(FTRACE_ITER_PRINTALL | FTRACE_ITER_PROBE);
+ iter->flags &= ~(FTRACE_ITER_PRINTALL | FTRACE_ITER_PROBE | FTRACE_ITER_MOD);
}
static void *t_start(struct seq_file *m, loff_t *pos)
@@ -3293,15 +3431,15 @@ static void *t_start(struct seq_file *m, loff_t *pos)
ftrace_hash_empty(iter->hash)) {
iter->func_pos = 1; /* Account for the message */
if (*pos > 0)
- return t_probe_start(m, pos);
+ return t_mod_start(m, pos);
iter->flags |= FTRACE_ITER_PRINTALL;
/* reset in case of seek/pread */
iter->flags &= ~FTRACE_ITER_PROBE;
return iter;
}
- if (iter->flags & FTRACE_ITER_PROBE)
- return t_probe_start(m, pos);
+ if (iter->flags & FTRACE_ITER_MOD)
+ return t_mod_start(m, pos);
/*
* Unfortunately, we need to restart at ftrace_pages_start
@@ -3317,7 +3455,7 @@ static void *t_start(struct seq_file *m, loff_t *pos)
}
if (!p)
- return t_probe_start(m, pos);
+ return t_mod_start(m, pos);
return iter;
}
@@ -3351,6 +3489,9 @@ static int t_show(struct seq_file *m, void *v)
if (iter->flags & FTRACE_ITER_PROBE)
return t_probe_show(m, iter);
+ if (iter->flags & FTRACE_ITER_MOD)
+ return t_mod_show(m, iter);
+
if (iter->flags & FTRACE_ITER_PRINTALL) {
if (iter->flags & FTRACE_ITER_NOTRACE)
seq_puts(m, "#### no functions disabled ####\n");
@@ -3457,6 +3598,8 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag,
{
struct ftrace_iterator *iter;
struct ftrace_hash *hash;
+ struct list_head *mod_head;
+ struct trace_array *tr = ops->private;
int ret = 0;
ftrace_ops_init(ops);
@@ -3475,21 +3618,29 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag,
iter->ops = ops;
iter->flags = flag;
+ iter->tr = tr;
mutex_lock(&ops->func_hash->regex_lock);
- if (flag & FTRACE_ITER_NOTRACE)
+ if (flag & FTRACE_ITER_NOTRACE) {
hash = ops->func_hash->notrace_hash;
- else
+ mod_head = tr ? &tr->mod_notrace : NULL;
+ } else {
hash = ops->func_hash->filter_hash;
+ mod_head = tr ? &tr->mod_trace : NULL;
+ }
+
+ iter->mod_list = mod_head;
if (file->f_mode & FMODE_WRITE) {
const int size_bits = FTRACE_HASH_DEFAULT_BITS;
- if (file->f_flags & O_TRUNC)
+ if (file->f_flags & O_TRUNC) {
iter->hash = alloc_ftrace_hash(size_bits);
- else
+ clear_ftrace_mod_list(mod_head);
+ } else {
iter->hash = alloc_and_copy_ftrace_hash(size_bits, hash);
+ }
if (!iter->hash) {
trace_parser_put(&iter->parser);
@@ -3761,6 +3912,163 @@ static int ftrace_hash_move_and_update_ops(struct ftrace_ops *ops,
return ret;
}
+static bool module_exists(const char *module)
+{
+ /* All modules have the symbol __this_module */
+ const char this_mod[] = "__this_module";
+ const int modname_size = MAX_PARAM_PREFIX_LEN + sizeof(this_mod) + 1;
+ char modname[modname_size + 1];
+ unsigned long val;
+ int n;
+
+ n = snprintf(modname, modname_size + 1, "%s:%s", module, this_mod);
+
+ if (n > modname_size)
+ return false;
+
+ val = module_kallsyms_lookup_name(modname);
+ return val != 0;
+}
+
+static int cache_mod(struct trace_array *tr,
+ const char *func, char *module, int enable)
+{
+ struct ftrace_mod_load *ftrace_mod, *n;
+ struct list_head *head = enable ? &tr->mod_trace : &tr->mod_notrace;
+ int ret;
+
+ mutex_lock(&ftrace_lock);
+
+ /* We do not cache inverse filters */
+ if (func[0] == '!') {
+ func++;
+ ret = -EINVAL;
+
+ /* Look to remove this hash */
+ list_for_each_entry_safe(ftrace_mod, n, head, list) {
+ if (strcmp(ftrace_mod->module, module) != 0)
+ continue;
+
+ /* no func matches all */
+ if (!func || strcmp(func, "*") == 0 ||
+ (ftrace_mod->func &&
+ strcmp(ftrace_mod->func, func) == 0)) {
+ ret = 0;
+ free_ftrace_mod(ftrace_mod);
+ continue;
+ }
+ }
+ goto out;
+ }
+
+ ret = -EINVAL;
+ /* We only care about modules that have not been loaded yet */
+ if (module_exists(module))
+ goto out;
+
+ /* Save this string off, and execute it when the module is loaded */
+ ret = ftrace_add_mod(tr, func, module, enable);
+ out:
+ mutex_unlock(&ftrace_lock);
+
+ return ret;
+}
+
+static int
+ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
+ int reset, int enable);
+
+static void process_mod_list(struct list_head *head, struct ftrace_ops *ops,
+ char *mod, bool enable)
+{
+ struct ftrace_mod_load *ftrace_mod, *n;
+ struct ftrace_hash **orig_hash, *new_hash;
+ LIST_HEAD(process_mods);
+ char *func;
+ int ret;
+
+ mutex_lock(&ops->func_hash->regex_lock);
+
+ if (enable)
+ orig_hash = &ops->func_hash->filter_hash;
+ else
+ orig_hash = &ops->func_hash->notrace_hash;
+
+ new_hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS,
+ *orig_hash);
+ if (!new_hash)
+ goto out; /* warn? */
+
+ mutex_lock(&ftrace_lock);
+
+ list_for_each_entry_safe(ftrace_mod, n, head, list) {
+
+ if (strcmp(ftrace_mod->module, mod) != 0)
+ continue;
+
+ if (ftrace_mod->func)
+ func = kstrdup(ftrace_mod->func, GFP_KERNEL);
+ else
+ func = kstrdup("*", GFP_KERNEL);
+
+ if (!func) /* warn? */
+ continue;
+
+ list_del(&ftrace_mod->list);
+ list_add(&ftrace_mod->list, &process_mods);
+
+ /* Use the newly allocated func, as it may be "*" */
+ kfree(ftrace_mod->func);
+ ftrace_mod->func = func;
+ }
+
+ mutex_unlock(&ftrace_lock);
+
+ list_for_each_entry_safe(ftrace_mod, n, &process_mods, list) {
+
+ func = ftrace_mod->func;
+
+ /* Grabs ftrace_lock, which is why we have this extra step */
+ match_records(new_hash, func, strlen(func), mod);
+ free_ftrace_mod(ftrace_mod);
+ }
+
+ if (enable && list_empty(head))
+ new_hash->flags &= ~FTRACE_HASH_FL_MOD;
+
+ mutex_lock(&ftrace_lock);
+
+ ret = ftrace_hash_move_and_update_ops(ops, orig_hash,
+ new_hash, enable);
+ mutex_unlock(&ftrace_lock);
+
+ out:
+ mutex_unlock(&ops->func_hash->regex_lock);
+
+ free_ftrace_hash(new_hash);
+}
+
+static void process_cached_mods(const char *mod_name)
+{
+ struct trace_array *tr;
+ char *mod;
+
+ mod = kstrdup(mod_name, GFP_KERNEL);
+ if (!mod)
+ return;
+
+ mutex_lock(&trace_types_lock);
+ list_for_each_entry(tr, &ftrace_trace_arrays, list) {
+ if (!list_empty(&tr->mod_trace))
+ process_mod_list(&tr->mod_trace, tr->ops, mod, true);
+ if (!list_empty(&tr->mod_notrace))
+ process_mod_list(&tr->mod_notrace, tr->ops, mod, false);
+ }
+ mutex_unlock(&trace_types_lock);
+
+ kfree(mod);
+}
+
/*
* We register the module command as a template to show others how
* to register a command as well.
@@ -3768,10 +4076,16 @@ static int ftrace_hash_move_and_update_ops(struct ftrace_ops *ops,
static int
ftrace_mod_callback(struct trace_array *tr, struct ftrace_hash *hash,
- char *func, char *cmd, char *module, int enable)
+ char *func_orig, char *cmd, char *module, int enable)
{
+ char *func;
int ret;
+ /* match_records() modifies func, and we need the original */
+ func = kstrdup(func_orig, GFP_KERNEL);
+ if (!func)
+ return -ENOMEM;
+
/*
* cmd == 'mod' because we only registered this func
* for the 'mod' ftrace_func_command.
@@ -3780,8 +4094,10 @@ ftrace_mod_callback(struct trace_array *tr, struct ftrace_hash *hash,
* parameter.
*/
ret = match_records(hash, func, strlen(func), module);
+ kfree(func);
+
if (!ret)
- return -EINVAL;
+ return cache_mod(tr, func_orig, module, enable);
if (ret < 0)
return ret;
return 0;
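To make the caching path concrete: with this change, writing a filter such as "wakeup*:mod:mymodule" into set_ftrace_filter before mymodule is loaded is stored by cache_mod() instead of failing with -EINVAL. A hedged sketch of the cached entry (names illustrative; the real code kstrdup()s the strings):

struct ftrace_mod_load cached = {
	.func	= "wakeup*",	/* pattern to apply once the module exists */
	.module	= "mymodule",
	.enable	= 1,		/* 1 = filter_hash, 0 = notrace_hash       */
};

/*
 * Until the module appears, "cat set_ftrace_filter" lists the entry via
 * t_mod_show() as "wakeup*:mod:mymodule"; once mymodule loads,
 * ftrace_module_enable() -> process_cached_mods("mymodule") resolves it
 * into real hash entries through match_records().
 */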
@@ -4725,9 +5041,11 @@ int ftrace_regex_release(struct inode *inode, struct file *file)
if (file->f_mode & FMODE_WRITE) {
filter_hash = !!(iter->flags & FTRACE_ITER_FILTER);
- if (filter_hash)
+ if (filter_hash) {
orig_hash = &iter->ops->func_hash->filter_hash;
- else
+ if (iter->tr && !list_empty(&iter->tr->mod_trace))
+ iter->hash->flags |= FTRACE_HASH_FL_MOD;
+ } else
orig_hash = &iter->ops->func_hash->notrace_hash;
mutex_lock(&ftrace_lock);
@@ -5385,6 +5703,7 @@ void ftrace_release_mod(struct module *mod)
if (pg == ftrace_pages)
ftrace_pages = next_to_ftrace_page(last_pg);
+ ftrace_update_tot_cnt -= pg->index;
*last_pg = pg->next;
order = get_count_order(pg->size / ENTRIES_PER_PAGE);
free_pages((unsigned long)pg->records, order);
@@ -5463,6 +5782,8 @@ void ftrace_module_enable(struct module *mod)
out_unlock:
mutex_unlock(&ftrace_lock);
+
+ process_cached_mods(mod->name);
}
void ftrace_module_init(struct module *mod)
@@ -5501,6 +5822,7 @@ void __init ftrace_free_init_mem(void)
if (!rec)
continue;
pg->index--;
+ ftrace_update_tot_cnt--;
if (!pg->index) {
*last_pg = pg->next;
order = get_count_order(pg->size / ENTRIES_PER_PAGE);
@@ -5567,6 +5889,8 @@ static void ftrace_update_trampoline(struct ftrace_ops *ops)
void ftrace_init_trace_array(struct trace_array *tr)
{
INIT_LIST_HEAD(&tr->func_probes);
+ INIT_LIST_HEAD(&tr->mod_trace);
+ INIT_LIST_HEAD(&tr->mod_notrace);
}
#else
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 091e801145c9..948ec32e0c27 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -87,7 +87,7 @@ dummy_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
* tracing is active, only save the comm when a trace event
* occurred.
*/
-static DEFINE_PER_CPU(bool, trace_cmdline_save);
+static DEFINE_PER_CPU(bool, trace_taskinfo_save);
/*
* Kill all tracing for good (never come back).
@@ -120,41 +120,41 @@ enum ftrace_dump_mode ftrace_dump_on_oops;
/* When set, tracing will stop when a WARN*() is hit */
int __disable_trace_on_warning;
-#ifdef CONFIG_TRACE_ENUM_MAP_FILE
-/* Map of enums to their values, for "enum_map" file */
-struct trace_enum_map_head {
+#ifdef CONFIG_TRACE_EVAL_MAP_FILE
+/* Map of enums to their values, for "eval_map" file */
+struct trace_eval_map_head {
struct module *mod;
unsigned long length;
};
-union trace_enum_map_item;
+union trace_eval_map_item;
-struct trace_enum_map_tail {
+struct trace_eval_map_tail {
/*
* "end" is first and points to NULL as it must be different
- * than "mod" or "enum_string"
+ * than "mod" or "eval_string"
*/
- union trace_enum_map_item *next;
+ union trace_eval_map_item *next;
const char *end; /* points to NULL */
};
-static DEFINE_MUTEX(trace_enum_mutex);
+static DEFINE_MUTEX(trace_eval_mutex);
/*
- * The trace_enum_maps are saved in an array with two extra elements,
+ * The trace_eval_maps are saved in an array with two extra elements,
* one at the beginning, and one at the end. The beginning item contains
* the count of the saved maps (head.length), and the module they
* belong to if not built in (head.mod). The ending item contains a
- * pointer to the next array of saved enum_map items.
+ * pointer to the next array of saved eval_map items.
*/
-union trace_enum_map_item {
- struct trace_enum_map map;
- struct trace_enum_map_head head;
- struct trace_enum_map_tail tail;
+union trace_eval_map_item {
+ struct trace_eval_map map;
+ struct trace_eval_map_head head;
+ struct trace_eval_map_tail tail;
};
-static union trace_enum_map_item *trace_enum_maps;
-#endif /* CONFIG_TRACE_ENUM_MAP_FILE */
+static union trace_eval_map_item *trace_eval_maps;
+#endif /* CONFIG_TRACE_EVAL_MAP_FILE */
static int tracing_set_tracer(struct trace_array *tr, const char *buf);
@@ -790,7 +790,7 @@ EXPORT_SYMBOL_GPL(tracing_on);
static __always_inline void
__buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event)
{
- __this_cpu_write(trace_cmdline_save, true);
+ __this_cpu_write(trace_taskinfo_save, true);
/* If this is the temp buffer, we need to commit fully */
if (this_cpu_read(trace_buffered_event) == event) {
@@ -1141,9 +1141,9 @@ unsigned long nsecs_to_usecs(unsigned long nsecs)
/*
* TRACE_FLAGS is defined as a tuple matching bit masks with strings.
- * It uses C(a, b) where 'a' is the enum name and 'b' is the string that
+ * It uses C(a, b) where 'a' is the eval (enum) name and 'b' is the string that
* matches it. By defining "C(a, b) b", TRACE_FLAGS becomes a list
- * of strings in the order that the enums were defined.
+ * of strings in the order that the evals (enum) were defined.
*/
#undef C
#define C(a, b) b
@@ -1709,6 +1709,8 @@ void tracing_reset_all_online_cpus(void)
}
}
+static int *tgid_map;
+
#define SAVED_CMDLINES_DEFAULT 128
#define NO_CMDLINE_MAP UINT_MAX
static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED;
@@ -1722,7 +1724,7 @@ struct saved_cmdlines_buffer {
static struct saved_cmdlines_buffer *savedcmd;
/* temporary disable recording */
-static atomic_t trace_record_cmdline_disabled __read_mostly;
+static atomic_t trace_record_taskinfo_disabled __read_mostly;
static inline char *get_saved_cmdlines(int idx)
{
@@ -1910,8 +1912,6 @@ static void tracing_stop_tr(struct trace_array *tr)
raw_spin_unlock_irqrestore(&tr->start_lock, flags);
}
-void trace_stop_cmdline_recording(void);
-
static int trace_save_cmdline(struct task_struct *tsk)
{
unsigned pid, idx;
@@ -1992,16 +1992,87 @@ void trace_find_cmdline(int pid, char comm[])
preempt_enable();
}
-void tracing_record_cmdline(struct task_struct *tsk)
+int trace_find_tgid(int pid)
+{
+ if (unlikely(!tgid_map || !pid || pid > PID_MAX_DEFAULT))
+ return 0;
+
+ return tgid_map[pid];
+}
+
+static int trace_save_tgid(struct task_struct *tsk)
+{
+ if (unlikely(!tgid_map || !tsk->pid || tsk->pid > PID_MAX_DEFAULT))
+ return 0;
+
+ tgid_map[tsk->pid] = tsk->tgid;
+ return 1;
+}
+
+static bool tracing_record_taskinfo_skip(int flags)
+{
+ if (unlikely(!(flags & (TRACE_RECORD_CMDLINE | TRACE_RECORD_TGID))))
+ return true;
+ if (atomic_read(&trace_record_taskinfo_disabled) || !tracing_is_on())
+ return true;
+ if (!__this_cpu_read(trace_taskinfo_save))
+ return true;
+ return false;
+}
+
+/**
+ * tracing_record_taskinfo - record the task info of a task
+ *
+ * @task - task to record
+ * @flags - TRACE_RECORD_CMDLINE for recording comm
+ * - TRACE_RECORD_TGID for recording tgid
+ */
+void tracing_record_taskinfo(struct task_struct *task, int flags)
+{
+ if (tracing_record_taskinfo_skip(flags))
+ return;
+ if ((flags & TRACE_RECORD_CMDLINE) && !trace_save_cmdline(task))
+ return;
+ if ((flags & TRACE_RECORD_TGID) && !trace_save_tgid(task))
+ return;
+
+ __this_cpu_write(trace_taskinfo_save, false);
+}
+
+/**
+ * tracing_record_taskinfo_sched_switch - record task info for sched_switch
+ *
+ * @prev - previous task during sched_switch
+ * @next - next task during sched_switch
+ * @flags - TRACE_RECORD_CMDLINE for recording comm
+ * TRACE_RECORD_TGID for recording tgid
+ */
+void tracing_record_taskinfo_sched_switch(struct task_struct *prev,
+ struct task_struct *next, int flags)
{
- if (atomic_read(&trace_record_cmdline_disabled) || !tracing_is_on())
+ if (tracing_record_taskinfo_skip(flags))
+ return;
+
+ if ((flags & TRACE_RECORD_CMDLINE) &&
+ (!trace_save_cmdline(prev) || !trace_save_cmdline(next)))
return;
- if (!__this_cpu_read(trace_cmdline_save))
+ if ((flags & TRACE_RECORD_TGID) &&
+ (!trace_save_tgid(prev) || !trace_save_tgid(next)))
return;
- if (trace_save_cmdline(tsk))
- __this_cpu_write(trace_cmdline_save, false);
+ __this_cpu_write(trace_taskinfo_save, false);
+}
+
+/* Helpers to record specific task information */
+void tracing_record_cmdline(struct task_struct *task)
+{
+ tracing_record_taskinfo(task, TRACE_RECORD_CMDLINE);
+}
+
+void tracing_record_tgid(struct task_struct *task)
+{
+ tracing_record_taskinfo(task, TRACE_RECORD_TGID);
}
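A hedged usage sketch for the two entry points above; the flag names are the ones this patch introduces, while the call sites are illustrative (the real consumers are the sched_switch/sched_wakeup probes updated later in this series):

	/* cache both comm and tgid for the two tasks of a context switch */
	tracing_record_taskinfo_sched_switch(prev, next,
					     TRACE_RECORD_CMDLINE | TRACE_RECORD_TGID);

	/* cache only the tgid of the currently running task */
	tracing_record_taskinfo(current, TRACE_RECORD_TGID);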
/*
@@ -3146,7 +3217,7 @@ static void *s_start(struct seq_file *m, loff_t *pos)
#endif
if (!iter->snapshot)
- atomic_inc(&trace_record_cmdline_disabled);
+ atomic_inc(&trace_record_taskinfo_disabled);
if (*pos != iter->pos) {
iter->ent = NULL;
@@ -3191,7 +3262,7 @@ static void s_stop(struct seq_file *m, void *p)
#endif
if (!iter->snapshot)
- atomic_dec(&trace_record_cmdline_disabled);
+ atomic_dec(&trace_record_taskinfo_disabled);
trace_access_unlock(iter->cpu_file);
trace_event_read_unlock();
@@ -3248,23 +3319,29 @@ static void print_event_info(struct trace_buffer *buf, struct seq_file *m)
seq_puts(m, "#\n");
}
-static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m)
+static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m,
+ unsigned int flags)
{
+ bool tgid = flags & TRACE_ITER_RECORD_TGID;
+
print_event_info(buf, m);
- seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n"
- "# | | | | |\n");
+
+ seq_printf(m, "# TASK-PID CPU# %s TIMESTAMP FUNCTION\n", tgid ? "TGID " : "");
+ seq_printf(m, "# | | | %s | |\n", tgid ? " | " : "");
}
-static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m)
+static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m,
+ unsigned int flags)
{
- print_event_info(buf, m);
- seq_puts(m, "# _-----=> irqs-off\n"
- "# / _----=> need-resched\n"
- "# | / _---=> hardirq/softirq\n"
- "# || / _--=> preempt-depth\n"
- "# ||| / delay\n"
- "# TASK-PID CPU# |||| TIMESTAMP FUNCTION\n"
- "# | | | |||| | |\n");
+ bool tgid = flags & TRACE_ITER_RECORD_TGID;
+
+ seq_printf(m, "# %s _-----=> irqs-off\n", tgid ? " " : "");
+ seq_printf(m, "# %s / _----=> need-resched\n", tgid ? " " : "");
+ seq_printf(m, "# %s| / _---=> hardirq/softirq\n", tgid ? " " : "");
+ seq_printf(m, "# %s|| / _--=> preempt-depth\n", tgid ? " " : "");
+ seq_printf(m, "# %s||| / delay\n", tgid ? " " : "");
+ seq_printf(m, "# TASK-PID CPU#%s|||| TIMESTAMP FUNCTION\n", tgid ? " TGID " : "");
+ seq_printf(m, "# | | | %s|||| | |\n", tgid ? " | " : "");
}
void
@@ -3580,9 +3657,11 @@ void trace_default_header(struct seq_file *m)
} else {
if (!(trace_flags & TRACE_ITER_VERBOSE)) {
if (trace_flags & TRACE_ITER_IRQ_INFO)
- print_func_help_header_irq(iter->trace_buffer, m);
+ print_func_help_header_irq(iter->trace_buffer,
+ m, trace_flags);
else
- print_func_help_header(iter->trace_buffer, m);
+ print_func_help_header(iter->trace_buffer, m,
+ trace_flags);
}
}
}
@@ -4238,6 +4317,18 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
if (mask == TRACE_ITER_RECORD_CMD)
trace_event_enable_cmd_record(enabled);
+ if (mask == TRACE_ITER_RECORD_TGID) {
+ if (!tgid_map)
+ tgid_map = kzalloc((PID_MAX_DEFAULT + 1) * sizeof(*tgid_map),
+ GFP_KERNEL);
+ if (!tgid_map) {
+ tr->trace_flags &= ~TRACE_ITER_RECORD_TGID;
+ return -ENOMEM;
+ }
+
+ trace_event_enable_tgid_record(enabled);
+ }
+
if (mask == TRACE_ITER_EVENT_FORK)
trace_event_follow_fork(tr, enabled);
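A quick cost estimate for the tgid_map allocated above, assuming the default PID_MAX_DEFAULT of 0x8000 and a 4-byte int (both typical, but stated here as assumptions):

/*
 *	(PID_MAX_DEFAULT + 1) * sizeof(int) = 32769 * 4 bytes ~= 128 KiB
 *
 * allocated once, the first time record-tgid is enabled, and kept for the
 * lifetime of the kernel (clearing the flag does not free it).
 */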
@@ -4473,7 +4564,8 @@ static const char readme_msg[] =
#endif
#if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS)
"\t accepts: event-definitions (one definition per line)\n"
- "\t Format: p|r[:[<group>/]<event>] <place> [<args>]\n"
+ "\t Format: p[:[<group>/]<event>] <place> [<args>]\n"
+ "\t r[maxactive][:[<group>/]<event>] <place> [<args>]\n"
"\t -:[<group>/]<event>\n"
#ifdef CONFIG_KPROBE_EVENTS
"\t place: [<module>:]<symbol>[+<offset>]|<memaddr>\n"
@@ -4746,11 +4838,11 @@ static const struct file_operations tracing_saved_cmdlines_size_fops = {
.write = tracing_saved_cmdlines_size_write,
};
-#ifdef CONFIG_TRACE_ENUM_MAP_FILE
-static union trace_enum_map_item *
-update_enum_map(union trace_enum_map_item *ptr)
+#ifdef CONFIG_TRACE_EVAL_MAP_FILE
+static union trace_eval_map_item *
+update_eval_map(union trace_eval_map_item *ptr)
{
- if (!ptr->map.enum_string) {
+ if (!ptr->map.eval_string) {
if (ptr->tail.next) {
ptr = ptr->tail.next;
/* Set ptr to the next real item (skip head) */
@@ -4761,15 +4853,15 @@ update_enum_map(union trace_enum_map_item *ptr)
return ptr;
}
-static void *enum_map_next(struct seq_file *m, void *v, loff_t *pos)
+static void *eval_map_next(struct seq_file *m, void *v, loff_t *pos)
{
- union trace_enum_map_item *ptr = v;
+ union trace_eval_map_item *ptr = v;
/*
* Paranoid! If ptr points to end, we don't want to increment past it.
* This really should never happen.
*/
- ptr = update_enum_map(ptr);
+ ptr = update_eval_map(ptr);
if (WARN_ON_ONCE(!ptr))
return NULL;
@@ -4777,104 +4869,104 @@ static void *enum_map_next(struct seq_file *m, void *v, loff_t *pos)
(*pos)++;
- ptr = update_enum_map(ptr);
+ ptr = update_eval_map(ptr);
return ptr;
}
-static void *enum_map_start(struct seq_file *m, loff_t *pos)
+static void *eval_map_start(struct seq_file *m, loff_t *pos)
{
- union trace_enum_map_item *v;
+ union trace_eval_map_item *v;
loff_t l = 0;
- mutex_lock(&trace_enum_mutex);
+ mutex_lock(&trace_eval_mutex);
- v = trace_enum_maps;
+ v = trace_eval_maps;
if (v)
v++;
while (v && l < *pos) {
- v = enum_map_next(m, v, &l);
+ v = eval_map_next(m, v, &l);
}
return v;
}
-static void enum_map_stop(struct seq_file *m, void *v)
+static void eval_map_stop(struct seq_file *m, void *v)
{
- mutex_unlock(&trace_enum_mutex);
+ mutex_unlock(&trace_eval_mutex);
}
-static int enum_map_show(struct seq_file *m, void *v)
+static int eval_map_show(struct seq_file *m, void *v)
{
- union trace_enum_map_item *ptr = v;
+ union trace_eval_map_item *ptr = v;
seq_printf(m, "%s %ld (%s)\n",
- ptr->map.enum_string, ptr->map.enum_value,
+ ptr->map.eval_string, ptr->map.eval_value,
ptr->map.system);
return 0;
}
-static const struct seq_operations tracing_enum_map_seq_ops = {
- .start = enum_map_start,
- .next = enum_map_next,
- .stop = enum_map_stop,
- .show = enum_map_show,
+static const struct seq_operations tracing_eval_map_seq_ops = {
+ .start = eval_map_start,
+ .next = eval_map_next,
+ .stop = eval_map_stop,
+ .show = eval_map_show,
};
-static int tracing_enum_map_open(struct inode *inode, struct file *filp)
+static int tracing_eval_map_open(struct inode *inode, struct file *filp)
{
if (tracing_disabled)
return -ENODEV;
- return seq_open(filp, &tracing_enum_map_seq_ops);
+ return seq_open(filp, &tracing_eval_map_seq_ops);
}
-static const struct file_operations tracing_enum_map_fops = {
- .open = tracing_enum_map_open,
+static const struct file_operations tracing_eval_map_fops = {
+ .open = tracing_eval_map_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release,
};
-static inline union trace_enum_map_item *
-trace_enum_jmp_to_tail(union trace_enum_map_item *ptr)
+static inline union trace_eval_map_item *
+trace_eval_jmp_to_tail(union trace_eval_map_item *ptr)
{
/* Return tail of array given the head */
return ptr + ptr->head.length + 1;
}
static void
-trace_insert_enum_map_file(struct module *mod, struct trace_enum_map **start,
+trace_insert_eval_map_file(struct module *mod, struct trace_eval_map **start,
int len)
{
- struct trace_enum_map **stop;
- struct trace_enum_map **map;
- union trace_enum_map_item *map_array;
- union trace_enum_map_item *ptr;
+ struct trace_eval_map **stop;
+ struct trace_eval_map **map;
+ union trace_eval_map_item *map_array;
+ union trace_eval_map_item *ptr;
stop = start + len;
/*
- * The trace_enum_maps contains the map plus a head and tail item,
+ * The trace_eval_maps contains the map plus a head and tail item,
* where the head holds the module and length of array, and the
* tail holds a pointer to the next list.
*/
map_array = kmalloc(sizeof(*map_array) * (len + 2), GFP_KERNEL);
if (!map_array) {
- pr_warn("Unable to allocate trace enum mapping\n");
+ pr_warn("Unable to allocate trace eval mapping\n");
return;
}
- mutex_lock(&trace_enum_mutex);
+ mutex_lock(&trace_eval_mutex);
- if (!trace_enum_maps)
- trace_enum_maps = map_array;
+ if (!trace_eval_maps)
+ trace_eval_maps = map_array;
else {
- ptr = trace_enum_maps;
+ ptr = trace_eval_maps;
for (;;) {
- ptr = trace_enum_jmp_to_tail(ptr);
+ ptr = trace_eval_jmp_to_tail(ptr);
if (!ptr->tail.next)
break;
ptr = ptr->tail.next;
@@ -4892,34 +4984,34 @@ trace_insert_enum_map_file(struct module *mod, struct trace_enum_map **start,
}
memset(map_array, 0, sizeof(*map_array));
- mutex_unlock(&trace_enum_mutex);
+ mutex_unlock(&trace_eval_mutex);
}
-static void trace_create_enum_file(struct dentry *d_tracer)
+static void trace_create_eval_file(struct dentry *d_tracer)
{
- trace_create_file("enum_map", 0444, d_tracer,
- NULL, &tracing_enum_map_fops);
+ trace_create_file("eval_map", 0444, d_tracer,
+ NULL, &tracing_eval_map_fops);
}
-#else /* CONFIG_TRACE_ENUM_MAP_FILE */
-static inline void trace_create_enum_file(struct dentry *d_tracer) { }
-static inline void trace_insert_enum_map_file(struct module *mod,
- struct trace_enum_map **start, int len) { }
-#endif /* !CONFIG_TRACE_ENUM_MAP_FILE */
+#else /* CONFIG_TRACE_EVAL_MAP_FILE */
+static inline void trace_create_eval_file(struct dentry *d_tracer) { }
+static inline void trace_insert_eval_map_file(struct module *mod,
+ struct trace_eval_map **start, int len) { }
+#endif /* !CONFIG_TRACE_EVAL_MAP_FILE */
-static void trace_insert_enum_map(struct module *mod,
- struct trace_enum_map **start, int len)
+static void trace_insert_eval_map(struct module *mod,
+ struct trace_eval_map **start, int len)
{
- struct trace_enum_map **map;
+ struct trace_eval_map **map;
if (len <= 0)
return;
map = start;
- trace_event_enum_update(map, len);
+ trace_event_eval_update(map, len);
- trace_insert_enum_map_file(mod, start, len);
+ trace_insert_eval_map_file(mod, start, len);
}
static ssize_t
@@ -6739,33 +6831,18 @@ static const struct file_operations tracing_stats_fops = {
#ifdef CONFIG_DYNAMIC_FTRACE
-int __weak ftrace_arch_read_dyn_info(char *buf, int size)
-{
- return 0;
-}
-
static ssize_t
tracing_read_dyn_info(struct file *filp, char __user *ubuf,
size_t cnt, loff_t *ppos)
{
- static char ftrace_dyn_info_buffer[1024];
- static DEFINE_MUTEX(dyn_info_mutex);
unsigned long *p = filp->private_data;
- char *buf = ftrace_dyn_info_buffer;
- int size = ARRAY_SIZE(ftrace_dyn_info_buffer);
+ char buf[64]; /* Not too big for a shallow stack */
int r;
- mutex_lock(&dyn_info_mutex);
- r = sprintf(buf, "%ld ", *p);
-
- r += ftrace_arch_read_dyn_info(buf+r, (size-1)-r);
+ r = scnprintf(buf, 63, "%ld", *p);
buf[r++] = '\n';
- r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
-
- mutex_unlock(&dyn_info_mutex);
-
- return r;
+ return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
}
static const struct file_operations tracing_dyn_info_fops = {
@@ -7737,21 +7814,21 @@ struct dentry *tracing_init_dentry(void)
return NULL;
}
-extern struct trace_enum_map *__start_ftrace_enum_maps[];
-extern struct trace_enum_map *__stop_ftrace_enum_maps[];
+extern struct trace_eval_map *__start_ftrace_eval_maps[];
+extern struct trace_eval_map *__stop_ftrace_eval_maps[];
-static void __init trace_enum_init(void)
+static void __init trace_eval_init(void)
{
int len;
- len = __stop_ftrace_enum_maps - __start_ftrace_enum_maps;
- trace_insert_enum_map(NULL, __start_ftrace_enum_maps, len);
+ len = __stop_ftrace_eval_maps - __start_ftrace_eval_maps;
+ trace_insert_eval_map(NULL, __start_ftrace_eval_maps, len);
}
#ifdef CONFIG_MODULES
-static void trace_module_add_enums(struct module *mod)
+static void trace_module_add_evals(struct module *mod)
{
- if (!mod->num_trace_enums)
+ if (!mod->num_trace_evals)
return;
/*
@@ -7761,40 +7838,40 @@ static void trace_module_add_enums(struct module *mod)
if (trace_module_has_bad_taint(mod))
return;
- trace_insert_enum_map(mod, mod->trace_enums, mod->num_trace_enums);
+ trace_insert_eval_map(mod, mod->trace_evals, mod->num_trace_evals);
}
-#ifdef CONFIG_TRACE_ENUM_MAP_FILE
-static void trace_module_remove_enums(struct module *mod)
+#ifdef CONFIG_TRACE_EVAL_MAP_FILE
+static void trace_module_remove_evals(struct module *mod)
{
- union trace_enum_map_item *map;
- union trace_enum_map_item **last = &trace_enum_maps;
+ union trace_eval_map_item *map;
+ union trace_eval_map_item **last = &trace_eval_maps;
- if (!mod->num_trace_enums)
+ if (!mod->num_trace_evals)
return;
- mutex_lock(&trace_enum_mutex);
+ mutex_lock(&trace_eval_mutex);
- map = trace_enum_maps;
+ map = trace_eval_maps;
while (map) {
if (map->head.mod == mod)
break;
- map = trace_enum_jmp_to_tail(map);
+ map = trace_eval_jmp_to_tail(map);
last = &map->tail.next;
map = map->tail.next;
}
if (!map)
goto out;
- *last = trace_enum_jmp_to_tail(map)->tail.next;
+ *last = trace_eval_jmp_to_tail(map)->tail.next;
kfree(map);
out:
- mutex_unlock(&trace_enum_mutex);
+ mutex_unlock(&trace_eval_mutex);
}
#else
-static inline void trace_module_remove_enums(struct module *mod) { }
-#endif /* CONFIG_TRACE_ENUM_MAP_FILE */
+static inline void trace_module_remove_evals(struct module *mod) { }
+#endif /* CONFIG_TRACE_EVAL_MAP_FILE */
static int trace_module_notify(struct notifier_block *self,
unsigned long val, void *data)
@@ -7803,10 +7880,10 @@ static int trace_module_notify(struct notifier_block *self,
switch (val) {
case MODULE_STATE_COMING:
- trace_module_add_enums(mod);
+ trace_module_add_evals(mod);
break;
case MODULE_STATE_GOING:
- trace_module_remove_enums(mod);
+ trace_module_remove_evals(mod);
break;
}
@@ -7844,9 +7921,9 @@ static __init int tracer_init_tracefs(void)
trace_create_file("saved_cmdlines_size", 0644, d_tracer,
NULL, &tracing_saved_cmdlines_size_fops);
- trace_enum_init();
+ trace_eval_init();
- trace_create_enum_file(d_tracer);
+ trace_create_eval_file(d_tracer);
#ifdef CONFIG_MODULES
register_module_notifier(&trace_module_nb);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 39fd77330aab..6ade1c55cc3a 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -263,7 +263,10 @@ struct trace_array {
struct ftrace_ops *ops;
struct trace_pid_list __rcu *function_pids;
#ifdef CONFIG_DYNAMIC_FTRACE
+ /* All of these are protected by the ftrace_lock */
struct list_head func_probes;
+ struct list_head mod_trace;
+ struct list_head mod_notrace;
#endif
/* function tracing enabled */
int function_enabled;
@@ -637,6 +640,9 @@ void set_graph_array(struct trace_array *tr);
void tracing_start_cmdline_record(void);
void tracing_stop_cmdline_record(void);
+void tracing_start_tgid_record(void);
+void tracing_stop_tgid_record(void);
+
int register_tracer(struct tracer *type);
int is_tracing_stopped(void);
@@ -697,6 +703,7 @@ static inline void __trace_stack(struct trace_array *tr, unsigned long flags,
extern u64 ftrace_now(int cpu);
extern void trace_find_cmdline(int pid, char comm[]);
+extern int trace_find_tgid(int pid);
extern void trace_event_follow_fork(struct trace_array *tr, bool enable);
#ifdef CONFIG_DYNAMIC_FTRACE
@@ -761,10 +768,24 @@ enum print_line_t print_trace_line(struct trace_iterator *iter);
extern char trace_find_mark(unsigned long long duration);
+struct ftrace_hash;
+
+struct ftrace_mod_load {
+ struct list_head list;
+ char *func;
+ char *module;
+ int enable;
+};
+
+enum {
+ FTRACE_HASH_FL_MOD = (1 << 0),
+};
+
struct ftrace_hash {
unsigned long size_bits;
struct hlist_head *buckets;
unsigned long count;
+ unsigned long flags;
struct rcu_head rcu;
};
@@ -773,7 +794,7 @@ ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip);
static __always_inline bool ftrace_hash_empty(struct ftrace_hash *hash)
{
- return !hash || !hash->count;
+ return !hash || !(hash->count || (hash->flags & FTRACE_HASH_FL_MOD));
}
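A hedged reading of why FTRACE_HASH_FL_MOD counts as "not empty": a filter hash whose only content is a cached ":mod:" pattern has count == 0, yet it must keep acting as a restrictive filter until the module shows up. Illustrative only:

struct ftrace_hash pending = {
	.count	= 0,			/* no resolved functions yet      */
	.flags	= FTRACE_HASH_FL_MOD,	/* but a ":mod:" entry is cached  */
};

/* ftrace_hash_empty(&pending) is now false, so ftrace does not fall back
 * to "trace every function" while the module filter is still pending. */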
/* Standard output formatting function used for function return traces */
@@ -1107,6 +1128,7 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
C(CONTEXT_INFO, "context-info"), /* Print pid/cpu/time */ \
C(LATENCY_FMT, "latency-format"), \
C(RECORD_CMD, "record-cmd"), \
+ C(RECORD_TGID, "record-tgid"), \
C(OVERWRITE, "overwrite"), \
C(STOP_ON_FREE, "disable_on_free"), \
C(IRQ_INFO, "irq-info"), \
@@ -1423,6 +1445,8 @@ struct ftrace_event_field *
trace_find_event_field(struct trace_event_call *call, char *name);
extern void trace_event_enable_cmd_record(bool enable);
+extern void trace_event_enable_tgid_record(bool enable);
+
extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr);
extern int event_trace_del_tracer(struct trace_array *tr);
@@ -1773,10 +1797,10 @@ static inline const char *get_syscall_name(int syscall)
#ifdef CONFIG_EVENT_TRACING
void trace_event_init(void);
-void trace_event_enum_update(struct trace_enum_map **map, int len);
+void trace_event_eval_update(struct trace_eval_map **map, int len);
#else
static inline void __init trace_event_init(void) { }
-static inline void trace_event_enum_update(struct trace_enum_map **map, int len) { }
+static inline void trace_event_eval_update(struct trace_eval_map **map, int len) { }
#endif
extern struct trace_iterator *tracepoint_print_iter;
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index e7973e10398c..36132f9280e6 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -343,6 +343,28 @@ void trace_event_enable_cmd_record(bool enable)
mutex_unlock(&event_mutex);
}
+void trace_event_enable_tgid_record(bool enable)
+{
+ struct trace_event_file *file;
+ struct trace_array *tr;
+
+ mutex_lock(&event_mutex);
+ do_for_each_event_file(tr, file) {
+ if (!(file->flags & EVENT_FILE_FL_ENABLED))
+ continue;
+
+ if (enable) {
+ tracing_start_tgid_record();
+ set_bit(EVENT_FILE_FL_RECORDED_TGID_BIT, &file->flags);
+ } else {
+ tracing_stop_tgid_record();
+ clear_bit(EVENT_FILE_FL_RECORDED_TGID_BIT,
+ &file->flags);
+ }
+ } while_for_each_event_file();
+ mutex_unlock(&event_mutex);
+}
+
static int __ftrace_event_enable_disable(struct trace_event_file *file,
int enable, int soft_disable)
{
@@ -381,6 +403,12 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
tracing_stop_cmdline_record();
clear_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags);
}
+
+ if (file->flags & EVENT_FILE_FL_RECORDED_TGID) {
+ tracing_stop_tgid_record();
+ clear_bit(EVENT_FILE_FL_RECORDED_TGID_BIT,
+ }
+
call->class->reg(call, TRACE_REG_UNREGISTER, file);
}
/* If in SOFT_MODE, just set the SOFT_DISABLE_BIT, else clear it */
@@ -407,18 +435,30 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
}
if (!(file->flags & EVENT_FILE_FL_ENABLED)) {
+ bool cmd = false, tgid = false;
/* Keep the event disabled, when going to SOFT_MODE. */
if (soft_disable)
set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags);
if (tr->trace_flags & TRACE_ITER_RECORD_CMD) {
+ cmd = true;
tracing_start_cmdline_record();
set_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags);
}
+
+ if (tr->trace_flags & TRACE_ITER_RECORD_TGID) {
+ tgid = true;
+ tracing_start_tgid_record();
+ set_bit(EVENT_FILE_FL_RECORDED_TGID_BIT, &file->flags);
+ }
+
ret = call->class->reg(call, TRACE_REG_REGISTER, file);
if (ret) {
- tracing_stop_cmdline_record();
+ if (cmd)
+ tracing_stop_cmdline_record();
+ if (tgid)
+ tracing_stop_tgid_record();
pr_info("event trace: Could not enable event "
"%s\n", trace_event_name(call));
break;
@@ -2067,18 +2107,18 @@ __register_event(struct trace_event_call *call, struct module *mod)
return 0;
}
-static char *enum_replace(char *ptr, struct trace_enum_map *map, int len)
+static char *eval_replace(char *ptr, struct trace_eval_map *map, int len)
{
int rlen;
int elen;
- /* Find the length of the enum value as a string */
- elen = snprintf(ptr, 0, "%ld", map->enum_value);
+ /* Find the length of the eval value as a string */
+ elen = snprintf(ptr, 0, "%ld", map->eval_value);
/* Make sure there's enough room to replace the string with the value */
if (len < elen)
return NULL;
- snprintf(ptr, elen + 1, "%ld", map->enum_value);
+ snprintf(ptr, elen + 1, "%ld", map->eval_value);
/* Get the rest of the string of ptr */
rlen = strlen(ptr + len);
@@ -2090,11 +2130,11 @@ static char *enum_replace(char *ptr, struct trace_enum_map *map, int len)
}
static void update_event_printk(struct trace_event_call *call,
- struct trace_enum_map *map)
+ struct trace_eval_map *map)
{
char *ptr;
int quote = 0;
- int len = strlen(map->enum_string);
+ int len = strlen(map->eval_string);
for (ptr = call->print_fmt; *ptr; ptr++) {
if (*ptr == '\\') {
@@ -2125,16 +2165,16 @@ static void update_event_printk(struct trace_event_call *call,
continue;
}
if (isalpha(*ptr) || *ptr == '_') {
- if (strncmp(map->enum_string, ptr, len) == 0 &&
+ if (strncmp(map->eval_string, ptr, len) == 0 &&
!isalnum(ptr[len]) && ptr[len] != '_') {
- ptr = enum_replace(ptr, map, len);
- /* Hmm, enum string smaller than value */
+ ptr = eval_replace(ptr, map, len);
+ /* enum/sizeof string smaller than value */
if (WARN_ON_ONCE(!ptr))
return;
/*
- * No need to decrement here, as enum_replace()
+ * No need to decrement here, as eval_replace()
* returns the pointer to the character past
- * the enum, and two enums can not be placed
+ * the eval, and two evals can not be placed
* back to back without something in between.
* We can skip that something in between.
*/
@@ -2165,7 +2205,7 @@ static void update_event_printk(struct trace_event_call *call,
}
}
-void trace_event_enum_update(struct trace_enum_map **map, int len)
+void trace_event_eval_update(struct trace_eval_map **map, int len)
{
struct trace_event_call *call, *p;
const char *last_system = NULL;
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index b53c8d369163..2c5221819be5 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -720,7 +720,7 @@ static int create_trace_kprobe(int argc, char **argv)
return ret;
}
if (offset && is_return &&
- !function_offset_within_entry(NULL, symbol, offset)) {
+ !kprobe_on_func_entry(NULL, symbol, offset)) {
pr_info("Given offset is not valid for return probe.\n");
return -EINVAL;
}
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 08f9bab8089e..bac629af2285 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -340,31 +340,41 @@ static inline const char *kretprobed(const char *name)
static void
seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address)
{
-#ifdef CONFIG_KALLSYMS
char str[KSYM_SYMBOL_LEN];
+#ifdef CONFIG_KALLSYMS
const char *name;
kallsyms_lookup(address, NULL, NULL, NULL, str);
name = kretprobed(str);
- trace_seq_printf(s, fmt, name);
+ if (name && strlen(name)) {
+ trace_seq_printf(s, fmt, name);
+ return;
+ }
#endif
+ snprintf(str, KSYM_SYMBOL_LEN, "0x%08lx", address);
+ trace_seq_printf(s, fmt, str);
}
static void
seq_print_sym_offset(struct trace_seq *s, const char *fmt,
unsigned long address)
{
-#ifdef CONFIG_KALLSYMS
char str[KSYM_SYMBOL_LEN];
+#ifdef CONFIG_KALLSYMS
const char *name;
sprint_symbol(str, address);
name = kretprobed(str);
- trace_seq_printf(s, fmt, name);
+ if (name && strlen(name)) {
+ trace_seq_printf(s, fmt, name);
+ return;
+ }
#endif
+ snprintf(str, KSYM_SYMBOL_LEN, "0x%08lx", address);
+ trace_seq_printf(s, fmt, str);
}
#ifndef CONFIG_64BIT
@@ -587,6 +597,15 @@ int trace_print_context(struct trace_iterator *iter)
trace_seq_printf(s, "%16s-%-5d [%03d] ",
comm, entry->pid, iter->cpu);
+ if (tr->trace_flags & TRACE_ITER_RECORD_TGID) {
+ unsigned int tgid = trace_find_tgid(entry->pid);
+
+ if (!tgid)
+ trace_seq_printf(s, "(-----) ");
+ else
+ trace_seq_printf(s, "(%5d) ", tgid);
+ }
+
if (tr->trace_flags & TRACE_ITER_IRQ_INFO)
trace_print_lat_fmt(s, entry);
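For reference, with record-tgid enabled the per-line output gains a parenthesised TGID column between the CPU and the timestamp, and "(-----)" is printed when no tgid has been recorded for that pid. Roughly (spacing approximate, values made up):

            bash-1773  [000] ( 1773)   123.456789: ...
     kworker/0:1-17    [000] (-----)   123.456790: ...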
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 4c896a0101bd..b341c02730be 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -12,27 +12,38 @@
#include "trace.h"
-static int sched_ref;
+#define RECORD_CMDLINE 1
+#define RECORD_TGID 2
+
+static int sched_cmdline_ref;
+static int sched_tgid_ref;
static DEFINE_MUTEX(sched_register_mutex);
static void
probe_sched_switch(void *ignore, bool preempt,
struct task_struct *prev, struct task_struct *next)
{
- if (unlikely(!sched_ref))
- return;
+ int flags;
+
+ flags = (RECORD_TGID * !!sched_tgid_ref) +
+ (RECORD_CMDLINE * !!sched_cmdline_ref);
- tracing_record_cmdline(prev);
- tracing_record_cmdline(next);
+ if (!flags)
+ return;
+ tracing_record_taskinfo_sched_switch(prev, next, flags);
}
static void
probe_sched_wakeup(void *ignore, struct task_struct *wakee)
{
- if (unlikely(!sched_ref))
- return;
+ int flags;
+
+ flags = (RECORD_TGID * !!sched_tgid_ref) +
+ (RECORD_CMDLINE * !!sched_cmdline_ref);
- tracing_record_cmdline(current);
+ if (!flags)
+ return;
+ tracing_record_taskinfo(current, flags);
}
static int tracing_sched_register(void)
@@ -75,28 +86,61 @@ static void tracing_sched_unregister(void)
unregister_trace_sched_wakeup(probe_sched_wakeup, NULL);
}
-static void tracing_start_sched_switch(void)
+static void tracing_start_sched_switch(int ops)
{
+ bool sched_register = (!sched_cmdline_ref && !sched_tgid_ref);
mutex_lock(&sched_register_mutex);
- if (!(sched_ref++))
+
+ switch (ops) {
+ case RECORD_CMDLINE:
+ sched_cmdline_ref++;
+ break;
+
+ case RECORD_TGID:
+ sched_tgid_ref++;
+ break;
+ }
+
+ if (sched_register && (sched_cmdline_ref || sched_tgid_ref))
tracing_sched_register();
mutex_unlock(&sched_register_mutex);
}
-static void tracing_stop_sched_switch(void)
+static void tracing_stop_sched_switch(int ops)
{
mutex_lock(&sched_register_mutex);
- if (!(--sched_ref))
+
+ switch (ops) {
+ case RECORD_CMDLINE:
+ sched_cmdline_ref--;
+ break;
+
+ case RECORD_TGID:
+ sched_tgid_ref--;
+ break;
+ }
+
+ if (!sched_cmdline_ref && !sched_tgid_ref)
tracing_sched_unregister();
mutex_unlock(&sched_register_mutex);
}
void tracing_start_cmdline_record(void)
{
- tracing_start_sched_switch();
+ tracing_start_sched_switch(RECORD_CMDLINE);
}
void tracing_stop_cmdline_record(void)
{
- tracing_stop_sched_switch();
+ tracing_stop_sched_switch(RECORD_CMDLINE);
+}
+
+void tracing_start_tgid_record(void)
+{
+ tracing_start_sched_switch(RECORD_TGID);
+}
+
+void tracing_stop_tgid_record(void)
+{
+ tracing_stop_sched_switch(RECORD_TGID);
}
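A hedged walk-through of the split reference counting above; the sequence is illustrative, the functions are the ones defined in this file:

	tracing_start_cmdline_record();	/* sched_cmdline_ref 0->1, probes registered  */
	tracing_start_tgid_record();	/* sched_tgid_ref    0->1, already registered */
	tracing_stop_cmdline_record();	/* sched_cmdline_ref 1->0, still registered   */
	tracing_stop_tgid_record();	/* sched_tgid_ref    1->0, probes unregistered */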