From 069545997510833281f45f83e097017b9fef19b7 Mon Sep 17 00:00:00 2001 From: Ankur Arora Date: Tue, 27 Sep 2022 15:59:42 -0700 Subject: audit: cache ctx->major in audit_filter_syscall() ctx->major contains the current syscall number. This is, of course, a constant for the duration of the syscall. Unfortunately, GCC's alias analysis cannot prove that it is not modified via a pointer in the audit_filter_syscall() loop, and so always loads it from memory. In and of itself the load isn't very expensive (ops dependent on the ctx->major load are only used to determine the direction of control flow and have short dependence chains and, in any case the related branches get predicted perfectly in the fastpath) but still cache ctx->major in a local for two reasons: * ctx->major is in the first cacheline of struct audit_context and has similar alignment as audit_entry::list audit_entry. For cases with a lot of audit rules, doing this reduces one source of contention from a potentially busy cache-set. * audit_in_mask() (called in the hot loop in audit_filter_syscall()) does cast manipulation and error checking on ctx->major: audit_in_mask(const struct audit_krule *rule, unsigned long val): if (val > 0xffffffff) return false; word = AUDIT_WORD(val); if (word >= AUDIT_BITMASK_SIZE) return false; bit = AUDIT_BIT(val); return rule->mask[word] & bit; The clauses related to the rule need to be evaluated in the loop, but the rest is unnecessarily re-evaluated for every loop iteration. (Note, however, that most of these are cheap ALU ops and the branches are perfectly predicted. However, see discussion on cycles improvement below for more on why it is still worth hoisting.) On a Skylakex system change in getpid() latency (aggregated over 12 boot cycles): Min Mean Median Max pstdev (ns) (ns) (ns) (ns) - 201.30 216.14 216.22 228.46 (+- 1.45%) + 196.63 207.86 206.60 230.98 (+- 3.92%) Performance counter stats for 'bin/getpid' (3 runs) go from: cycles 836.89 ( +- .80% ) instructions 2000.19 ( +- .03% ) IPC 2.39 ( +- .83% ) branches 430.14 ( +- .03% ) branch-misses 1.48 ( +- 3.37% ) L1-dcache-loads 471.11 ( +- .05% ) L1-dcache-load-misses 7.62 ( +- 46.98% ) to: cycles 805.58 ( +- 4.11% ) instructions 1654.11 ( +- .05% ) IPC 2.06 ( +- 3.39% ) branches 430.02 ( +- .05% ) branch-misses 1.55 ( +- 7.09% ) L1-dcache-loads 440.01 ( +- .09% ) L1-dcache-load-misses 9.05 ( +- 74.03% ) (Both aggregated over 12 boot cycles.) instructions: we reduce around 8 instructions/iteration because some of the computation is now hoisted out of the loop (branch count does not change because GCC, for reasons unclear, only hoists the computations while keeping the basic-blocks.) cycles: improve by about 5% (in aggregate and looking at individual run numbers.) This is likely because we now waste fewer pipeline resources on unnecessary instructions which allows the control flow to speculatively execute further ahead shortening the execution of the loop a little. The final gating factor on the performance of this loop remains the long dependence chain due to the linked-list load. Signed-off-by: Ankur Arora Signed-off-by: Paul Moore --- kernel/auditsc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 9f8c05228d6d..4c11a7181bd4 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -843,13 +843,14 @@ static void audit_filter_syscall(struct task_struct *tsk, { struct audit_entry *e; enum audit_state state; + unsigned long major = ctx->major; if (auditd_test_task(tsk)) return; rcu_read_lock(); list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_EXIT], list) { - if (audit_in_mask(&e->rule, ctx->major) && + if (audit_in_mask(&e->rule, major) && audit_filter_rules(tsk, &e->rule, ctx, NULL, &state, false)) { rcu_read_unlock(); -- cgit v1.2.3 From 50979953c0c41e929e5f955800da68e1bb24c7ab Mon Sep 17 00:00:00 2001 From: Ankur Arora Date: Thu, 6 Oct 2022 17:49:43 -0700 Subject: audit: unify audit_filter_{uring(), inode_name(), syscall()} audit_filter_uring(), audit_filter_inode_name() are substantially similar to audit_filter_syscall(). Move the core logic to __audit_filter_op() which can be parametrized for all three. On a Skylakex system, getpid() latency (all results aggregated across 12 boot cycles): Min Mean Median Max pstdev (ns) (ns) (ns) (ns) - 196.63 207.86 206.60 230.98 (+- 3.92%) + 183.73 196.95 192.31 232.49 (+- 6.04%) Performance counter stats for 'bin/getpid' (3 runs) go from: cycles 805.58 ( +- 4.11% ) instructions 1654.11 ( +- .05% ) IPC 2.06 ( +- 3.39% ) branches 430.02 ( +- .05% ) branch-misses 1.55 ( +- 7.09% ) L1-dcache-loads 440.01 ( +- .09% ) L1-dcache-load-misses 9.05 ( +- 74.03% ) to: cycles 765.37 ( +- 6.66% ) instructions 1677.07 ( +- 0.04% ) IPC 2.20 ( +- 5.90% ) branches 431.10 ( +- 0.04% ) branch-misses 1.60 ( +- 11.25% ) L1-dcache-loads 521.04 ( +- 0.05% ) L1-dcache-load-misses 6.92 ( +- 77.60% ) (Both aggregated over 12 boot cycles.) The increased L1-dcache-loads are due to some intermediate values now coming from the stack. The improvement in cycles is due to a slightly denser loop (the list parameter in the list_for_each_entry_rcu() exit check now comes from a register rather than a constant as before.) Signed-off-by: Ankur Arora Signed-off-by: Paul Moore --- kernel/auditsc.c | 76 +++++++++++++++++++++++++++++--------------------------- 1 file changed, 39 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 4c11a7181bd4..547c88be8a28 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -806,30 +806,53 @@ static int audit_in_mask(const struct audit_krule *rule, unsigned long val) } /** - * audit_filter_uring - apply filters to an io_uring operation + * __audit_filter_op - common filter helper for operations (syscall/uring/etc) * @tsk: associated task * @ctx: audit context + * @list: audit filter list + * @name: audit_name (can be NULL) + * @op: current syscall/uring_op + * + * Run the udit filters specified in @list against @tsk using @ctx, + * @name, and @op, as necessary; the caller is responsible for ensuring + * that the call is made while the RCU read lock is held. The @name + * parameter can be NULL, but all others must be specified. + * Returns 1/true if the filter finds a match, 0/false if none are found. */ -static void audit_filter_uring(struct task_struct *tsk, - struct audit_context *ctx) +static int __audit_filter_op(struct task_struct *tsk, + struct audit_context *ctx, + struct list_head *list, + struct audit_names *name, + unsigned long op) { struct audit_entry *e; enum audit_state state; + list_for_each_entry_rcu(e, list, list) { + if (audit_in_mask(&e->rule, op) && + audit_filter_rules(tsk, &e->rule, ctx, name, + &state, false)) { + ctx->current_state = state; + return 1; + } + } + return 0; +} + +/** + * audit_filter_uring - apply filters to an io_uring operation + * @tsk: associated task + * @ctx: audit context + */ +static void audit_filter_uring(struct task_struct *tsk, + struct audit_context *ctx) +{ if (auditd_test_task(tsk)) return; rcu_read_lock(); - list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_URING_EXIT], - list) { - if (audit_in_mask(&e->rule, ctx->uring_op) && - audit_filter_rules(tsk, &e->rule, ctx, NULL, &state, - false)) { - rcu_read_unlock(); - ctx->current_state = state; - return; - } - } + __audit_filter_op(tsk, ctx, &audit_filter_list[AUDIT_FILTER_URING_EXIT], + NULL, ctx->uring_op); rcu_read_unlock(); } @@ -841,25 +864,13 @@ static void audit_filter_uring(struct task_struct *tsk, static void audit_filter_syscall(struct task_struct *tsk, struct audit_context *ctx) { - struct audit_entry *e; - enum audit_state state; - unsigned long major = ctx->major; - if (auditd_test_task(tsk)) return; rcu_read_lock(); - list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_EXIT], list) { - if (audit_in_mask(&e->rule, major) && - audit_filter_rules(tsk, &e->rule, ctx, NULL, - &state, false)) { - rcu_read_unlock(); - ctx->current_state = state; - return; - } - } + __audit_filter_op(tsk, ctx, &audit_filter_list[AUDIT_FILTER_EXIT], + NULL, ctx->major); rcu_read_unlock(); - return; } /* @@ -871,17 +882,8 @@ static int audit_filter_inode_name(struct task_struct *tsk, struct audit_context *ctx) { int h = audit_hash_ino((u32)n->ino); struct list_head *list = &audit_inode_hash[h]; - struct audit_entry *e; - enum audit_state state; - list_for_each_entry_rcu(e, list, list) { - if (audit_in_mask(&e->rule, ctx->major) && - audit_filter_rules(tsk, &e->rule, ctx, n, &state, false)) { - ctx->current_state = state; - return 1; - } - } - return 0; + return __audit_filter_op(tsk, ctx, list, n, ctx->major); } /* At syscall exit time, this filter is called if any audit_names have been -- cgit v1.2.3