diff options
Diffstat (limited to 'net')
-rw-r--r-- | net/core/filter.c | 303 | ||||
-rw-r--r-- | net/ipv4/tcp.c | 26 | ||||
-rw-r--r-- | net/ipv4/tcp_nv.c | 2 | ||||
-rw-r--r-- | net/ipv4/tcp_output.c | 6 | ||||
-rw-r--r-- | net/ipv4/tcp_timer.c | 7 |
5 files changed, 317 insertions, 27 deletions
diff --git a/net/core/filter.c b/net/core/filter.c index 18da42a81d0c..08ab4c65a998 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -401,8 +401,8 @@ do_pass: /* Classic BPF expects A and X to be reset first. These need * to be guaranteed to be the first two instructions. */ - *new_insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_A, BPF_REG_A); - *new_insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_X, BPF_REG_X); + *new_insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A); + *new_insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_X, BPF_REG_X); /* All programs must keep CTX in callee saved BPF_REG_CTX. * In eBPF case it's done by the compiler, here we need to @@ -459,8 +459,15 @@ do_pass: break; if (fp->code == (BPF_ALU | BPF_DIV | BPF_X) || - fp->code == (BPF_ALU | BPF_MOD | BPF_X)) + fp->code == (BPF_ALU | BPF_MOD | BPF_X)) { *insn++ = BPF_MOV32_REG(BPF_REG_X, BPF_REG_X); + /* Error with exception code on div/mod by 0. + * For cBPF programs, this was always return 0. + */ + *insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_X, 0, 2); + *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A); + *insn++ = BPF_EXIT_INSN(); + } *insn = BPF_RAW_INSN(fp->code, BPF_REG_A, BPF_REG_X, 0, fp->k); break; @@ -3232,6 +3239,29 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, ret = -EINVAL; } #ifdef CONFIG_INET +#if IS_ENABLED(CONFIG_IPV6) + } else if (level == SOL_IPV6) { + if (optlen != sizeof(int) || sk->sk_family != AF_INET6) + return -EINVAL; + + val = *((int *)optval); + /* Only some options are supported */ + switch (optname) { + case IPV6_TCLASS: + if (val < -1 || val > 0xff) { + ret = -EINVAL; + } else { + struct ipv6_pinfo *np = inet6_sk(sk); + + if (val == -1) + val = 0; + np->tclass = val; + } + break; + default: + ret = -EINVAL; + } +#endif } else if (level == SOL_TCP && sk->sk_prot->setsockopt == tcp_setsockopt) { if (optname == TCP_CONGESTION) { @@ -3241,7 +3271,8 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, strncpy(name, optval, min_t(long, optlen, TCP_CA_NAME_MAX-1)); name[TCP_CA_NAME_MAX-1] = 0; - ret = tcp_set_congestion_control(sk, name, false, reinit); + ret = tcp_set_congestion_control(sk, name, false, + reinit); } else { struct tcp_sock *tp = tcp_sk(sk); @@ -3307,6 +3338,22 @@ BPF_CALL_5(bpf_getsockopt, struct bpf_sock_ops_kern *, bpf_sock, } else { goto err_clear; } +#if IS_ENABLED(CONFIG_IPV6) + } else if (level == SOL_IPV6) { + struct ipv6_pinfo *np = inet6_sk(sk); + + if (optlen != sizeof(int) || sk->sk_family != AF_INET6) + goto err_clear; + + /* Only some options are supported */ + switch (optname) { + case IPV6_TCLASS: + *((int *)optval) = (int)np->tclass; + break; + default: + goto err_clear; + } +#endif } else { goto err_clear; } @@ -3328,6 +3375,33 @@ static const struct bpf_func_proto bpf_getsockopt_proto = { .arg5_type = ARG_CONST_SIZE, }; +BPF_CALL_2(bpf_sock_ops_cb_flags_set, struct bpf_sock_ops_kern *, bpf_sock, + int, argval) +{ + struct sock *sk = bpf_sock->sk; + int val = argval & BPF_SOCK_OPS_ALL_CB_FLAGS; + + if (!sk_fullsock(sk)) + return -EINVAL; + +#ifdef CONFIG_INET + if (val) + tcp_sk(sk)->bpf_sock_ops_cb_flags = val; + + return argval & (~BPF_SOCK_OPS_ALL_CB_FLAGS); +#else + return -EINVAL; +#endif +} + +static const struct bpf_func_proto bpf_sock_ops_cb_flags_set_proto = { + .func = bpf_sock_ops_cb_flags_set, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + static const struct bpf_func_proto * bpf_base_func_proto(enum bpf_func_id func_id) { @@ -3510,6 +3584,8 @@ static const struct bpf_func_proto * return &bpf_setsockopt_proto; case BPF_FUNC_getsockopt: return &bpf_getsockopt_proto; + case BPF_FUNC_sock_ops_cb_flags_set: + return &bpf_sock_ops_cb_flags_set_proto; case BPF_FUNC_sock_map_update: return &bpf_sock_map_update_proto; default: @@ -3826,34 +3902,44 @@ void bpf_warn_invalid_xdp_action(u32 act) } EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action); -static bool __is_valid_sock_ops_access(int off, int size) +static bool sock_ops_is_valid_access(int off, int size, + enum bpf_access_type type, + struct bpf_insn_access_aux *info) { + const int size_default = sizeof(__u32); + if (off < 0 || off >= sizeof(struct bpf_sock_ops)) return false; + /* The verifier guarantees that size > 0. */ if (off % size != 0) return false; - if (size != sizeof(__u32)) - return false; - - return true; -} -static bool sock_ops_is_valid_access(int off, int size, - enum bpf_access_type type, - struct bpf_insn_access_aux *info) -{ if (type == BPF_WRITE) { switch (off) { - case offsetof(struct bpf_sock_ops, op) ... - offsetof(struct bpf_sock_ops, replylong[3]): + case offsetof(struct bpf_sock_ops, reply): + case offsetof(struct bpf_sock_ops, sk_txhash): + if (size != size_default) + return false; break; default: return false; } + } else { + switch (off) { + case bpf_ctx_range_till(struct bpf_sock_ops, bytes_received, + bytes_acked): + if (size != sizeof(__u64)) + return false; + break; + default: + if (size != size_default) + return false; + break; + } } - return __is_valid_sock_ops_access(off, size); + return true; } static int sk_skb_prologue(struct bpf_insn *insn_buf, bool direct_write, @@ -4470,10 +4556,37 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, is_fullsock)); break; -/* Helper macro for adding read access to tcp_sock fields. */ -#define SOCK_OPS_GET_TCP32(FIELD_NAME) \ + case offsetof(struct bpf_sock_ops, state): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_state) != 1); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct bpf_sock_ops_kern, sk), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, sk)); + *insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, skc_state)); + break; + + case offsetof(struct bpf_sock_ops, rtt_min): + BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, rtt_min) != + sizeof(struct minmax)); + BUILD_BUG_ON(sizeof(struct minmax) < + sizeof(struct minmax_sample)); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct bpf_sock_ops_kern, sk), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, sk)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct tcp_sock, rtt_min) + + FIELD_SIZEOF(struct minmax_sample, t)); + break; + +/* Helper macro for adding read access to tcp_sock or sock fields. */ +#define SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \ do { \ - BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, FIELD_NAME) != 4); \ + BUILD_BUG_ON(FIELD_SIZEOF(OBJ, OBJ_FIELD) > \ + FIELD_SIZEOF(struct bpf_sock_ops, BPF_FIELD)); \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ struct bpf_sock_ops_kern, \ is_fullsock), \ @@ -4485,17 +4598,159 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, struct bpf_sock_ops_kern, sk),\ si->dst_reg, si->src_reg, \ offsetof(struct bpf_sock_ops_kern, sk));\ - *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, \ - offsetof(struct tcp_sock, FIELD_NAME)); \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(OBJ, \ + OBJ_FIELD), \ + si->dst_reg, si->dst_reg, \ + offsetof(OBJ, OBJ_FIELD)); \ + } while (0) + +/* Helper macro for adding write access to tcp_sock or sock fields. + * The macro is called with two registers, dst_reg which contains a pointer + * to ctx (context) and src_reg which contains the value that should be + * stored. However, we need an additional register since we cannot overwrite + * dst_reg because it may be used later in the program. + * Instead we "borrow" one of the other register. We first save its value + * into a new (temp) field in bpf_sock_ops_kern, use it, and then restore + * it at the end of the macro. + */ +#define SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \ + do { \ + int reg = BPF_REG_9; \ + BUILD_BUG_ON(FIELD_SIZEOF(OBJ, OBJ_FIELD) > \ + FIELD_SIZEOF(struct bpf_sock_ops, BPF_FIELD)); \ + if (si->dst_reg == reg || si->src_reg == reg) \ + reg--; \ + if (si->dst_reg == reg || si->src_reg == reg) \ + reg--; \ + *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, reg, \ + offsetof(struct bpf_sock_ops_kern, \ + temp)); \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ + struct bpf_sock_ops_kern, \ + is_fullsock), \ + reg, si->dst_reg, \ + offsetof(struct bpf_sock_ops_kern, \ + is_fullsock)); \ + *insn++ = BPF_JMP_IMM(BPF_JEQ, reg, 0, 2); \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ + struct bpf_sock_ops_kern, sk),\ + reg, si->dst_reg, \ + offsetof(struct bpf_sock_ops_kern, sk));\ + *insn++ = BPF_STX_MEM(BPF_FIELD_SIZEOF(OBJ, OBJ_FIELD), \ + reg, si->src_reg, \ + offsetof(OBJ, OBJ_FIELD)); \ + *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->dst_reg, \ + offsetof(struct bpf_sock_ops_kern, \ + temp)); \ + } while (0) + +#define SOCK_OPS_GET_OR_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ, TYPE) \ + do { \ + if (TYPE == BPF_WRITE) \ + SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ); \ + else \ + SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ); \ } while (0) case offsetof(struct bpf_sock_ops, snd_cwnd): - SOCK_OPS_GET_TCP32(snd_cwnd); + SOCK_OPS_GET_FIELD(snd_cwnd, snd_cwnd, struct tcp_sock); break; case offsetof(struct bpf_sock_ops, srtt_us): - SOCK_OPS_GET_TCP32(srtt_us); + SOCK_OPS_GET_FIELD(srtt_us, srtt_us, struct tcp_sock); break; + + case offsetof(struct bpf_sock_ops, bpf_sock_ops_cb_flags): + SOCK_OPS_GET_FIELD(bpf_sock_ops_cb_flags, bpf_sock_ops_cb_flags, + struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, snd_ssthresh): + SOCK_OPS_GET_FIELD(snd_ssthresh, snd_ssthresh, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, rcv_nxt): + SOCK_OPS_GET_FIELD(rcv_nxt, rcv_nxt, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, snd_nxt): + SOCK_OPS_GET_FIELD(snd_nxt, snd_nxt, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, snd_una): + SOCK_OPS_GET_FIELD(snd_una, snd_una, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, mss_cache): + SOCK_OPS_GET_FIELD(mss_cache, mss_cache, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, ecn_flags): + SOCK_OPS_GET_FIELD(ecn_flags, ecn_flags, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, rate_delivered): + SOCK_OPS_GET_FIELD(rate_delivered, rate_delivered, + struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, rate_interval_us): + SOCK_OPS_GET_FIELD(rate_interval_us, rate_interval_us, + struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, packets_out): + SOCK_OPS_GET_FIELD(packets_out, packets_out, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, retrans_out): + SOCK_OPS_GET_FIELD(retrans_out, retrans_out, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, total_retrans): + SOCK_OPS_GET_FIELD(total_retrans, total_retrans, + struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, segs_in): + SOCK_OPS_GET_FIELD(segs_in, segs_in, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, data_segs_in): + SOCK_OPS_GET_FIELD(data_segs_in, data_segs_in, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, segs_out): + SOCK_OPS_GET_FIELD(segs_out, segs_out, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, data_segs_out): + SOCK_OPS_GET_FIELD(data_segs_out, data_segs_out, + struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, lost_out): + SOCK_OPS_GET_FIELD(lost_out, lost_out, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, sacked_out): + SOCK_OPS_GET_FIELD(sacked_out, sacked_out, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, sk_txhash): + SOCK_OPS_GET_OR_SET_FIELD(sk_txhash, sk_txhash, + struct sock, type); + break; + + case offsetof(struct bpf_sock_ops, bytes_received): + SOCK_OPS_GET_FIELD(bytes_received, bytes_received, + struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, bytes_acked): + SOCK_OPS_GET_FIELD(bytes_acked, bytes_acked, struct tcp_sock); + break; + } return insn - insn_buf; } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index d7cf861bf699..f013ddc191e0 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -463,7 +463,7 @@ void tcp_init_transfer(struct sock *sk, int bpf_op) tcp_mtup_init(sk); icsk->icsk_af_ops->rebuild_header(sk); tcp_init_metrics(sk); - tcp_call_bpf(sk, bpf_op); + tcp_call_bpf(sk, bpf_op, 0, NULL); tcp_init_congestion_control(sk); tcp_init_buffer_space(sk); } @@ -2042,6 +2042,30 @@ void tcp_set_state(struct sock *sk, int state) { int oldstate = sk->sk_state; + /* We defined a new enum for TCP states that are exported in BPF + * so as not force the internal TCP states to be frozen. The + * following checks will detect if an internal state value ever + * differs from the BPF value. If this ever happens, then we will + * need to remap the internal value to the BPF value before calling + * tcp_call_bpf_2arg. + */ + BUILD_BUG_ON((int)BPF_TCP_ESTABLISHED != (int)TCP_ESTABLISHED); + BUILD_BUG_ON((int)BPF_TCP_SYN_SENT != (int)TCP_SYN_SENT); + BUILD_BUG_ON((int)BPF_TCP_SYN_RECV != (int)TCP_SYN_RECV); + BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT1 != (int)TCP_FIN_WAIT1); + BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT2 != (int)TCP_FIN_WAIT2); + BUILD_BUG_ON((int)BPF_TCP_TIME_WAIT != (int)TCP_TIME_WAIT); + BUILD_BUG_ON((int)BPF_TCP_CLOSE != (int)TCP_CLOSE); + BUILD_BUG_ON((int)BPF_TCP_CLOSE_WAIT != (int)TCP_CLOSE_WAIT); + BUILD_BUG_ON((int)BPF_TCP_LAST_ACK != (int)TCP_LAST_ACK); + BUILD_BUG_ON((int)BPF_TCP_LISTEN != (int)TCP_LISTEN); + BUILD_BUG_ON((int)BPF_TCP_CLOSING != (int)TCP_CLOSING); + BUILD_BUG_ON((int)BPF_TCP_NEW_SYN_RECV != (int)TCP_NEW_SYN_RECV); + BUILD_BUG_ON((int)BPF_TCP_MAX_STATES != (int)TCP_MAX_STATES); + + if (BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_STATE_CB_FLAG)) + tcp_call_bpf_2arg(sk, BPF_SOCK_OPS_STATE_CB, oldstate, state); + switch (state) { case TCP_ESTABLISHED: if (oldstate != TCP_ESTABLISHED) diff --git a/net/ipv4/tcp_nv.c b/net/ipv4/tcp_nv.c index 0b5a05bd82e3..ddbce73edae8 100644 --- a/net/ipv4/tcp_nv.c +++ b/net/ipv4/tcp_nv.c @@ -146,7 +146,7 @@ static void tcpnv_init(struct sock *sk) * within a datacenter, where we have reasonable estimates of * RTTs */ - base_rtt = tcp_call_bpf(sk, BPF_SOCK_OPS_BASE_RTT); + base_rtt = tcp_call_bpf(sk, BPF_SOCK_OPS_BASE_RTT, 0, NULL); if (base_rtt > 0) { ca->nv_base_rtt = base_rtt; ca->nv_lower_bound_rtt = (base_rtt * 205) >> 8; /* 80% */ diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 95461f02ac9a..e9f985e42405 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2905,6 +2905,10 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); } + if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RETRANS_CB_FLAG)) + tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RETRANS_CB, + TCP_SKB_CB(skb)->seq, segs, err); + if (likely(!err)) { TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS; trace_tcp_retransmit_skb(sk, skb); @@ -3469,7 +3473,7 @@ int tcp_connect(struct sock *sk) struct sk_buff *buff; int err; - tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_CONNECT_CB); + tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_CONNECT_CB, 0, NULL); if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk)) return -EHOSTUNREACH; /* Routing failure or similar. */ diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 6db3124cdbda..257abdde23b0 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -213,11 +213,18 @@ static int tcp_write_timeout(struct sock *sk) icsk->icsk_user_timeout); } tcp_fastopen_active_detect_blackhole(sk, expired); + + if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RTO_CB_FLAG)) + tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RTO_CB, + icsk->icsk_retransmits, + icsk->icsk_rto, (int)expired); + if (expired) { /* Has it gone just too far? */ tcp_write_err(sk); return 1; } + return 0; } |