diff options
Diffstat (limited to 'net/core')
-rw-r--r-- | net/core/Makefile | 2 | ||||
-rw-r--r-- | net/core/dev.c | 148 | ||||
-rw-r--r-- | net/core/devlink.c | 111 | ||||
-rw-r--r-- | net/core/dst.c | 1 | ||||
-rw-r--r-- | net/core/ethtool.c | 63 | ||||
-rw-r--r-- | net/core/failover.c | 315 | ||||
-rw-r--r-- | net/core/fib_rules.c | 495 | ||||
-rw-r--r-- | net/core/filter.c | 1423 | ||||
-rw-r--r-- | net/core/flow_dissector.c | 19 | ||||
-rw-r--r-- | net/core/neighbour.c | 8 | ||||
-rw-r--r-- | net/core/net-traces.c | 4 | ||||
-rw-r--r-- | net/core/page_pool.c | 317 | ||||
-rw-r--r-- | net/core/rtnetlink.c | 34 | ||||
-rw-r--r-- | net/core/skbuff.c | 25 | ||||
-rw-r--r-- | net/core/sock.c | 40 | ||||
-rw-r--r-- | net/core/xdp.c | 299 |
16 files changed, 2784 insertions, 520 deletions
diff --git a/net/core/Makefile b/net/core/Makefile index 6dbbba8c57ae..80175e6a2eb8 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -14,6 +14,7 @@ obj-y += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \ fib_notifier.o xdp.o obj-y += net-sysfs.o +obj-$(CONFIG_PAGE_POOL) += page_pool.o obj-$(CONFIG_PROC_FS) += net-procfs.o obj-$(CONFIG_NET_PKTGEN) += pktgen.o obj-$(CONFIG_NETPOLL) += netpoll.o @@ -30,3 +31,4 @@ obj-$(CONFIG_DST_CACHE) += dst_cache.o obj-$(CONFIG_HWBM) += hwbm.o obj-$(CONFIG_NET_DEVLINK) += devlink.o obj-$(CONFIG_GRO_CELLS) += gro_cells.o +obj-$(CONFIG_FAILOVER) += failover.o diff --git a/net/core/dev.c b/net/core/dev.c index 9c149238a4ce..6e18242a1cae 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1285,6 +1285,7 @@ int dev_set_alias(struct net_device *dev, const char *alias, size_t len) return len; } +EXPORT_SYMBOL(dev_set_alias); /** * dev_get_alias - get ifalias of a device @@ -1586,7 +1587,7 @@ const char *netdev_cmd_to_name(enum netdev_cmd cmd) N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN) N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO) N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO) - }; + } #undef N return "UNKNOWN_NETDEV_EVENT"; } @@ -1754,38 +1755,38 @@ int call_netdevice_notifiers(unsigned long val, struct net_device *dev) EXPORT_SYMBOL(call_netdevice_notifiers); #ifdef CONFIG_NET_INGRESS -static struct static_key ingress_needed __read_mostly; +static DEFINE_STATIC_KEY_FALSE(ingress_needed_key); void net_inc_ingress_queue(void) { - static_key_slow_inc(&ingress_needed); + static_branch_inc(&ingress_needed_key); } EXPORT_SYMBOL_GPL(net_inc_ingress_queue); void net_dec_ingress_queue(void) { - static_key_slow_dec(&ingress_needed); + static_branch_dec(&ingress_needed_key); } EXPORT_SYMBOL_GPL(net_dec_ingress_queue); #endif #ifdef CONFIG_NET_EGRESS -static struct static_key egress_needed __read_mostly; +static DEFINE_STATIC_KEY_FALSE(egress_needed_key); void net_inc_egress_queue(void) { - static_key_slow_inc(&egress_needed); + static_branch_inc(&egress_needed_key); } EXPORT_SYMBOL_GPL(net_inc_egress_queue); void net_dec_egress_queue(void) { - static_key_slow_dec(&egress_needed); + static_branch_dec(&egress_needed_key); } EXPORT_SYMBOL_GPL(net_dec_egress_queue); #endif -static struct static_key netstamp_needed __read_mostly; +static DEFINE_STATIC_KEY_FALSE(netstamp_needed_key); #ifdef HAVE_JUMP_LABEL static atomic_t netstamp_needed_deferred; static atomic_t netstamp_wanted; @@ -1796,9 +1797,9 @@ static void netstamp_clear(struct work_struct *work) wanted = atomic_add_return(deferred, &netstamp_wanted); if (wanted > 0) - static_key_enable(&netstamp_needed); + static_branch_enable(&netstamp_needed_key); else - static_key_disable(&netstamp_needed); + static_branch_disable(&netstamp_needed_key); } static DECLARE_WORK(netstamp_work, netstamp_clear); #endif @@ -1818,7 +1819,7 @@ void net_enable_timestamp(void) atomic_inc(&netstamp_needed_deferred); schedule_work(&netstamp_work); #else - static_key_slow_inc(&netstamp_needed); + static_branch_inc(&netstamp_needed_key); #endif } EXPORT_SYMBOL(net_enable_timestamp); @@ -1838,7 +1839,7 @@ void net_disable_timestamp(void) atomic_dec(&netstamp_needed_deferred); schedule_work(&netstamp_work); #else - static_key_slow_dec(&netstamp_needed); + static_branch_dec(&netstamp_needed_key); #endif } EXPORT_SYMBOL(net_disable_timestamp); @@ -1846,15 +1847,15 @@ EXPORT_SYMBOL(net_disable_timestamp); static inline void net_timestamp_set(struct sk_buff *skb) { skb->tstamp = 0; - if (static_key_false(&netstamp_needed)) + if (static_branch_unlikely(&netstamp_needed_key)) __net_timestamp(skb); } -#define net_timestamp_check(COND, SKB) \ - if (static_key_false(&netstamp_needed)) { \ - if ((COND) && !(SKB)->tstamp) \ - __net_timestamp(SKB); \ - } \ +#define net_timestamp_check(COND, SKB) \ + if (static_branch_unlikely(&netstamp_needed_key)) { \ + if ((COND) && !(SKB)->tstamp) \ + __net_timestamp(SKB); \ + } \ bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb) { @@ -2614,17 +2615,16 @@ EXPORT_SYMBOL(netif_device_attach); * Returns a Tx hash based on the given packet descriptor a Tx queues' number * to be used as a distribution range. */ -u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb, - unsigned int num_tx_queues) +static u16 skb_tx_hash(const struct net_device *dev, struct sk_buff *skb) { u32 hash; u16 qoffset = 0; - u16 qcount = num_tx_queues; + u16 qcount = dev->real_num_tx_queues; if (skb_rx_queue_recorded(skb)) { hash = skb_get_rx_queue(skb); - while (unlikely(hash >= num_tx_queues)) - hash -= num_tx_queues; + while (unlikely(hash >= qcount)) + hash -= qcount; return hash; } @@ -2637,7 +2637,6 @@ u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb, return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset; } -EXPORT_SYMBOL(__skb_tx_hash); static void skb_warn_bad_offload(const struct sk_buff *skb) { @@ -3095,6 +3094,10 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device if (unlikely(!skb)) goto out_null; + skb = sk_validate_xmit_skb(skb, dev); + if (unlikely(!skb)) + goto out_null; + if (netif_needs_gso(skb, features)) { struct sk_buff *segs; @@ -3223,7 +3226,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, rc = NET_XMIT_DROP; } else { rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK; - __qdisc_run(q); + qdisc_run(q); } if (unlikely(to_free)) @@ -3511,7 +3514,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) #ifdef CONFIG_NET_CLS_ACT skb->tc_at_ingress = 0; # ifdef CONFIG_NET_EGRESS - if (static_key_false(&egress_needed)) { + if (static_branch_unlikely(&egress_needed_key)) { skb = sch_handle_egress(skb, &rc, dev); if (!skb) goto out; @@ -3606,6 +3609,44 @@ int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv) } EXPORT_SYMBOL(dev_queue_xmit_accel); +int dev_direct_xmit(struct sk_buff *skb, u16 queue_id) +{ + struct net_device *dev = skb->dev; + struct sk_buff *orig_skb = skb; + struct netdev_queue *txq; + int ret = NETDEV_TX_BUSY; + bool again = false; + + if (unlikely(!netif_running(dev) || + !netif_carrier_ok(dev))) + goto drop; + + skb = validate_xmit_skb_list(skb, dev, &again); + if (skb != orig_skb) + goto drop; + + skb_set_queue_mapping(skb, queue_id); + txq = skb_get_tx_queue(dev, skb); + + local_bh_disable(); + + HARD_TX_LOCK(dev, txq, smp_processor_id()); + if (!netif_xmit_frozen_or_drv_stopped(txq)) + ret = netdev_start_xmit(skb, dev, txq, false); + HARD_TX_UNLOCK(dev, txq); + + local_bh_enable(); + + if (!dev_xmit_complete(ret)) + kfree_skb(skb); + + return ret; +drop: + atomic_long_inc(&dev->tx_dropped); + kfree_skb_list(skb); + return NET_XMIT_DROP; +} +EXPORT_SYMBOL(dev_direct_xmit); /************************************************************************* * Receiver routines @@ -3975,12 +4016,12 @@ static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb) } static u32 netif_receive_generic_xdp(struct sk_buff *skb, + struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { struct netdev_rx_queue *rxqueue; + void *orig_data, *orig_data_end; u32 metalen, act = XDP_DROP; - struct xdp_buff xdp; - void *orig_data; int hlen, off; u32 mac_len; @@ -4015,31 +4056,42 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, */ mac_len = skb->data - skb_mac_header(skb); hlen = skb_headlen(skb) + mac_len; - xdp.data = skb->data - mac_len; - xdp.data_meta = xdp.data; - xdp.data_end = xdp.data + hlen; - xdp.data_hard_start = skb->data - skb_headroom(skb); - orig_data = xdp.data; + xdp->data = skb->data - mac_len; + xdp->data_meta = xdp->data; + xdp->data_end = xdp->data + hlen; + xdp->data_hard_start = skb->data - skb_headroom(skb); + orig_data_end = xdp->data_end; + orig_data = xdp->data; rxqueue = netif_get_rxqueue(skb); - xdp.rxq = &rxqueue->xdp_rxq; + xdp->rxq = &rxqueue->xdp_rxq; - act = bpf_prog_run_xdp(xdp_prog, &xdp); + act = bpf_prog_run_xdp(xdp_prog, xdp); - off = xdp.data - orig_data; + off = xdp->data - orig_data; if (off > 0) __skb_pull(skb, off); else if (off < 0) __skb_push(skb, -off); skb->mac_header += off; + /* check if bpf_xdp_adjust_tail was used. it can only "shrink" + * pckt. + */ + off = orig_data_end - xdp->data_end; + if (off != 0) { + skb_set_tail_pointer(skb, xdp->data_end - xdp->data); + skb->len -= off; + + } + switch (act) { case XDP_REDIRECT: case XDP_TX: __skb_push(skb, mac_len); break; case XDP_PASS: - metalen = xdp.data - xdp.data_meta; + metalen = xdp->data - xdp->data_meta; if (metalen) skb_metadata_set(skb, metalen); break; @@ -4084,22 +4136,24 @@ void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog) } EXPORT_SYMBOL_GPL(generic_xdp_tx); -static struct static_key generic_xdp_needed __read_mostly; +static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key); int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb) { if (xdp_prog) { - u32 act = netif_receive_generic_xdp(skb, xdp_prog); + struct xdp_buff xdp; + u32 act; int err; + act = netif_receive_generic_xdp(skb, &xdp, xdp_prog); if (act != XDP_PASS) { switch (act) { case XDP_REDIRECT: err = xdp_do_generic_redirect(skb->dev, skb, - xdp_prog); + &xdp, xdp_prog); if (err) goto out_redir; - /* fallthru to submit skb */ + break; case XDP_TX: generic_xdp_tx(skb, xdp_prog); break; @@ -4122,7 +4176,7 @@ static int netif_rx_internal(struct sk_buff *skb) trace_netif_rx(skb); - if (static_key_false(&generic_xdp_needed)) { + if (static_branch_unlikely(&generic_xdp_needed_key)) { int ret; preempt_disable(); @@ -4494,7 +4548,7 @@ another_round: skip_taps: #ifdef CONFIG_NET_INGRESS - if (static_key_false(&ingress_needed)) { + if (static_branch_unlikely(&ingress_needed_key)) { skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev); if (!skb) goto out; @@ -4654,9 +4708,9 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp) bpf_prog_put(old); if (old && !new) { - static_key_slow_dec(&generic_xdp_needed); + static_branch_dec(&generic_xdp_needed_key); } else if (new && !old) { - static_key_slow_inc(&generic_xdp_needed); + static_branch_inc(&generic_xdp_needed_key); dev_disable_lro(dev); dev_disable_gro_hw(dev); } @@ -4684,7 +4738,7 @@ static int netif_receive_skb_internal(struct sk_buff *skb) if (skb_defer_rx_timestamp(skb)) return NET_RX_SUCCESS; - if (static_key_false(&generic_xdp_needed)) { + if (static_branch_unlikely(&generic_xdp_needed_key)) { int ret; preempt_disable(); @@ -7852,6 +7906,8 @@ int register_netdevice(struct net_device *dev) int ret; struct net *net = dev_net(dev); + BUILD_BUG_ON(sizeof(netdev_features_t) * BITS_PER_BYTE < + NETDEV_FEATURE_COUNT); BUG_ON(dev_boot_phase); ASSERT_RTNL(); diff --git a/net/core/devlink.c b/net/core/devlink.c index ad1317376798..22099705cc41 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -453,6 +453,27 @@ static void devlink_notify(struct devlink *devlink, enum devlink_command cmd) msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); } +static int devlink_nl_port_attrs_put(struct sk_buff *msg, + struct devlink_port *devlink_port) +{ + struct devlink_port_attrs *attrs = &devlink_port->attrs; + + if (!attrs->set) + return 0; + if (nla_put_u16(msg, DEVLINK_ATTR_PORT_FLAVOUR, attrs->flavour)) + return -EMSGSIZE; + if (nla_put_u32(msg, DEVLINK_ATTR_PORT_NUMBER, attrs->port_number)) + return -EMSGSIZE; + if (!attrs->split) + return 0; + if (nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_GROUP, attrs->port_number)) + return -EMSGSIZE; + if (nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_SUBPORT_NUMBER, + attrs->split_subport_number)) + return -EMSGSIZE; + return 0; +} + static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink, struct devlink_port *devlink_port, enum devlink_command cmd, u32 portid, @@ -492,9 +513,7 @@ static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink, ibdev->name)) goto nla_put_failure; } - if (devlink_port->split && - nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_GROUP, - devlink_port->split_group)) + if (devlink_nl_port_attrs_put(msg, devlink_port)) goto nla_put_failure; genlmsg_end(msg, hdr); @@ -683,12 +702,13 @@ static int devlink_nl_cmd_port_set_doit(struct sk_buff *skb, return 0; } -static int devlink_port_split(struct devlink *devlink, - u32 port_index, u32 count) +static int devlink_port_split(struct devlink *devlink, u32 port_index, + u32 count, struct netlink_ext_ack *extack) { if (devlink->ops && devlink->ops->port_split) - return devlink->ops->port_split(devlink, port_index, count); + return devlink->ops->port_split(devlink, port_index, count, + extack); return -EOPNOTSUPP; } @@ -705,14 +725,15 @@ static int devlink_nl_cmd_port_split_doit(struct sk_buff *skb, port_index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]); count = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_SPLIT_COUNT]); - return devlink_port_split(devlink, port_index, count); + return devlink_port_split(devlink, port_index, count, info->extack); } -static int devlink_port_unsplit(struct devlink *devlink, u32 port_index) +static int devlink_port_unsplit(struct devlink *devlink, u32 port_index, + struct netlink_ext_ack *extack) { if (devlink->ops && devlink->ops->port_unsplit) - return devlink->ops->port_unsplit(devlink, port_index); + return devlink->ops->port_unsplit(devlink, port_index, extack); return -EOPNOTSUPP; } @@ -726,7 +747,7 @@ static int devlink_nl_cmd_port_unsplit_doit(struct sk_buff *skb, return -EINVAL; port_index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]); - return devlink_port_unsplit(devlink, port_index); + return devlink_port_unsplit(devlink, port_index, info->extack); } static int devlink_nl_sb_fill(struct sk_buff *msg, struct devlink *devlink, @@ -1807,7 +1828,6 @@ send_done: nla_put_failure: err = -EMSGSIZE; err_table_put: - genlmsg_cancel(skb, hdr); nlmsg_free(skb); return err; } @@ -2013,7 +2033,6 @@ int devlink_dpipe_entry_ctx_prepare(struct devlink_dpipe_dump_ctx *dump_ctx) return 0; nla_put_failure: - genlmsg_cancel(dump_ctx->skb, dump_ctx->hdr); nlmsg_free(dump_ctx->skb); return -EMSGSIZE; } @@ -2230,7 +2249,6 @@ send_done: nla_put_failure: err = -EMSGSIZE; err_table_put: - genlmsg_cancel(skb, hdr); nlmsg_free(skb); return err; } @@ -2532,7 +2550,6 @@ nla_put_failure: err = -EMSGSIZE; err_resource_put: err_skb_send_alloc: - genlmsg_cancel(skb, hdr); nlmsg_free(skb); return err; } @@ -2584,7 +2601,7 @@ static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info) NL_SET_ERR_MSG_MOD(info->extack, "resources size validation failed"); return err; } - return devlink->ops->reload(devlink); + return devlink->ops->reload(devlink, info->extack); } static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { @@ -2737,7 +2754,8 @@ static const struct genl_ops devlink_nl_ops[] = { .doit = devlink_nl_cmd_eswitch_set_doit, .policy = devlink_nl_policy, .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | + DEVLINK_NL_FLAG_NO_LOCK, }, { .cmd = DEVLINK_CMD_DPIPE_TABLE_GET, @@ -2971,19 +2989,64 @@ void devlink_port_type_clear(struct devlink_port *devlink_port) EXPORT_SYMBOL_GPL(devlink_port_type_clear); /** - * devlink_port_split_set - Set port is split + * devlink_port_attrs_set - Set port attributes * * @devlink_port: devlink port - * @split_group: split group - identifies group split port is part of + * @flavour: flavour of the port + * @port_number: number of the port that is facing user, for example + * the front panel port number + * @split: indicates if this is split port + * @split_subport_number: if the port is split, this is the number + * of subport. */ -void devlink_port_split_set(struct devlink_port *devlink_port, - u32 split_group) -{ - devlink_port->split = true; - devlink_port->split_group = split_group; +void devlink_port_attrs_set(struct devlink_port *devlink_port, + enum devlink_port_flavour flavour, + u32 port_number, bool split, + u32 split_subport_number) +{ + struct devlink_port_attrs *attrs = &devlink_port->attrs; + + attrs->set = true; + attrs->flavour = flavour; + attrs->port_number = port_number; + attrs->split = split; + attrs->split_subport_number = split_subport_number; devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); } -EXPORT_SYMBOL_GPL(devlink_port_split_set); +EXPORT_SYMBOL_GPL(devlink_port_attrs_set); + +int devlink_port_get_phys_port_name(struct devlink_port *devlink_port, + char *name, size_t len) +{ + struct devlink_port_attrs *attrs = &devlink_port->attrs; + int n = 0; + + if (!attrs->set) + return -EOPNOTSUPP; + + switch (attrs->flavour) { + case DEVLINK_PORT_FLAVOUR_PHYSICAL: + if (!attrs->split) + n = snprintf(name, len, "p%u", attrs->port_number); + else + n = snprintf(name, len, "p%us%u", attrs->port_number, + attrs->split_subport_number); + break; + case DEVLINK_PORT_FLAVOUR_CPU: + case DEVLINK_PORT_FLAVOUR_DSA: + /* As CPU and DSA ports do not have a netdevice associated + * case should not ever happen. + */ + WARN_ON(1); + return -EINVAL; + } + + if (n >= len) + return -EINVAL; + + return 0; +} +EXPORT_SYMBOL_GPL(devlink_port_get_phys_port_name); int devlink_sb_register(struct devlink *devlink, unsigned int sb_index, u32 size, u16 ingress_pools_count, diff --git a/net/core/dst.c b/net/core/dst.c index 007aa0b08291..2d9b37f8944a 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -58,6 +58,7 @@ const struct dst_metrics dst_default_metrics = { */ .refcnt = REFCOUNT_INIT(1), }; +EXPORT_SYMBOL(dst_default_metrics); void dst_init(struct dst_entry *dst, struct dst_ops *ops, struct net_device *dev, int initial_ref, int initial_obsolete, diff --git a/net/core/ethtool.c b/net/core/ethtool.c index ba02f0dfe85c..c15075dc7572 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -92,6 +92,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] [NETIF_F_GSO_PARTIAL_BIT] = "tx-gso-partial", [NETIF_F_GSO_SCTP_BIT] = "tx-sctp-segmentation", [NETIF_F_GSO_ESP_BIT] = "tx-esp-segmentation", + [NETIF_F_GSO_UDP_L4_BIT] = "tx-udp-segmentation", [NETIF_F_FCOE_CRC_BIT] = "tx-checksum-fcoe-crc", [NETIF_F_SCTP_CRC_BIT] = "tx-checksum-sctp", @@ -109,6 +110,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] [NETIF_F_HW_ESP_TX_CSUM_BIT] = "esp-tx-csum-hw-offload", [NETIF_F_RX_UDP_TUNNEL_PORT_BIT] = "rx-udp_tunnel-port-offload", [NETIF_F_HW_TLS_RECORD_BIT] = "tls-hw-record", + [NETIF_F_HW_TLS_TX_BIT] = "tls-hw-tx-offload", }; static const char @@ -210,23 +212,6 @@ static int ethtool_set_features(struct net_device *dev, void __user *useraddr) return ret; } -static int phy_get_sset_count(struct phy_device *phydev) -{ - int ret; - - if (phydev->drv->get_sset_count && - phydev->drv->get_strings && - phydev->drv->get_stats) { - mutex_lock(&phydev->lock); - ret = phydev->drv->get_sset_count(phydev); - mutex_unlock(&phydev->lock); - - return ret; - } - - return -EOPNOTSUPP; -} - static int __ethtool_get_sset_count(struct net_device *dev, int sset) { const struct ethtool_ops *ops = dev->ethtool_ops; @@ -243,12 +228,9 @@ static int __ethtool_get_sset_count(struct net_device *dev, int sset) if (sset == ETH_SS_PHY_TUNABLES) return ARRAY_SIZE(phy_tunable_strings); - if (sset == ETH_SS_PHY_STATS) { - if (dev->phydev) - return phy_get_sset_count(dev->phydev); - else - return -EOPNOTSUPP; - } + if (sset == ETH_SS_PHY_STATS && dev->phydev && + !ops->get_ethtool_phy_stats) + return phy_ethtool_get_sset_count(dev->phydev); if (ops->get_sset_count && ops->get_strings) return ops->get_sset_count(dev, sset); @@ -271,17 +253,10 @@ static void __ethtool_get_strings(struct net_device *dev, memcpy(data, tunable_strings, sizeof(tunable_strings)); else if (stringset == ETH_SS_PHY_TUNABLES) memcpy(data, phy_tunable_strings, sizeof(phy_tunable_strings)); - else if (stringset == ETH_SS_PHY_STATS) { - struct phy_device *phydev = dev->phydev; - - if (phydev) { - mutex_lock(&phydev->lock); - phydev->drv->get_strings(phydev, data); - mutex_unlock(&phydev->lock); - } else { - return; - } - } else + else if (stringset == ETH_SS_PHY_STATS && dev->phydev && + !ops->get_ethtool_phy_stats) + phy_ethtool_get_strings(dev->phydev, data); + else /* ops->get_strings is valid because checked earlier */ ops->get_strings(dev, stringset, data); } @@ -1998,15 +1973,19 @@ static int ethtool_get_stats(struct net_device *dev, void __user *useraddr) static int ethtool_get_phy_stats(struct net_device *dev, void __user *useraddr) { - struct ethtool_stats stats; + const struct ethtool_ops *ops = dev->ethtool_ops; struct phy_device *phydev = dev->phydev; + struct ethtool_stats stats; u64 *data; int ret, n_stats; - if (!phydev) + if (!phydev && (!ops->get_ethtool_phy_stats || !ops->get_sset_count)) return -EOPNOTSUPP; - n_stats = phy_get_sset_count(phydev); + if (dev->phydev && !ops->get_ethtool_phy_stats) + n_stats = phy_ethtool_get_sset_count(dev->phydev); + else + n_stats = ops->get_sset_count(dev, ETH_SS_PHY_STATS); if (n_stats < 0) return n_stats; if (n_stats > S32_MAX / sizeof(u64)) @@ -2021,9 +2000,13 @@ static int ethtool_get_phy_stats(struct net_device *dev, void __user *useraddr) if (n_stats && !data) return -ENOMEM; - mutex_lock(&phydev->lock); - phydev->drv->get_stats(phydev, &stats, data); - mutex_unlock(&phydev->lock); + if (dev->phydev && !ops->get_ethtool_phy_stats) { + ret = phy_ethtool_get_stats(dev->phydev, &stats, data); + if (ret < 0) + return ret; + } else { + ops->get_ethtool_phy_stats(dev, &stats, data); + } ret = -EFAULT; if (copy_to_user(useraddr, &stats, sizeof(stats))) diff --git a/net/core/failover.c b/net/core/failover.c new file mode 100644 index 000000000000..4a92a98ccce9 --- /dev/null +++ b/net/core/failover.c @@ -0,0 +1,315 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2018, Intel Corporation. */ + +/* A common module to handle registrations and notifications for paravirtual + * drivers to enable accelerated datapath and support VF live migration. + * + * The notifier and event handling code is based on netvsc driver. + */ + +#include <linux/module.h> +#include <linux/etherdevice.h> +#include <uapi/linux/if_arp.h> +#include <linux/rtnetlink.h> +#include <linux/if_vlan.h> +#include <net/failover.h> + +static LIST_HEAD(failover_list); +static DEFINE_SPINLOCK(failover_lock); + +static struct net_device *failover_get_bymac(u8 *mac, struct failover_ops **ops) +{ + struct net_device *failover_dev; + struct failover *failover; + + spin_lock(&failover_lock); + list_for_each_entry(failover, &failover_list, list) { + failover_dev = rtnl_dereference(failover->failover_dev); + if (ether_addr_equal(failover_dev->perm_addr, mac)) { + *ops = rtnl_dereference(failover->ops); + spin_unlock(&failover_lock); + return failover_dev; + } + } + spin_unlock(&failover_lock); + return NULL; +} + +/** + * failover_slave_register - Register a slave netdev + * + * @slave_dev: slave netdev that is being registered + * + * Registers a slave device to a failover instance. Only ethernet devices + * are supported. + */ +static int failover_slave_register(struct net_device *slave_dev) +{ + struct netdev_lag_upper_info lag_upper_info; + struct net_device *failover_dev; + struct failover_ops *fops; + int err; + + if (slave_dev->type != ARPHRD_ETHER) + goto done; + + ASSERT_RTNL(); + + failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops); + if (!failover_dev) + goto done; + + if (fops && fops->slave_pre_register && + fops->slave_pre_register(slave_dev, failover_dev)) + goto done; + + err = netdev_rx_handler_register(slave_dev, fops->slave_handle_frame, + failover_dev); + if (err) { + netdev_err(slave_dev, "can not register failover rx handler (err = %d)\n", + err); + goto done; + } + + lag_upper_info.tx_type = NETDEV_LAG_TX_TYPE_ACTIVEBACKUP; + err = netdev_master_upper_dev_link(slave_dev, failover_dev, NULL, + &lag_upper_info, NULL); + if (err) { + netdev_err(slave_dev, "can not set failover device %s (err = %d)\n", + failover_dev->name, err); + goto err_upper_link; + } + + slave_dev->priv_flags |= IFF_FAILOVER_SLAVE; + + if (fops && fops->slave_register && + !fops->slave_register(slave_dev, failover_dev)) + return NOTIFY_OK; + + netdev_upper_dev_unlink(slave_dev, failover_dev); + slave_dev->priv_flags &= ~IFF_FAILOVER_SLAVE; +err_upper_link: + netdev_rx_handler_unregister(slave_dev); +done: + return NOTIFY_DONE; +} + +/** + * failover_slave_unregister - Unregister a slave netdev + * + * @slave_dev: slave netdev that is being unregistered + * + * Unregisters a slave device from a failover instance. + */ +int failover_slave_unregister(struct net_device *slave_dev) +{ + struct net_device *failover_dev; + struct failover_ops *fops; + + if (!netif_is_failover_slave(slave_dev)) + goto done; + + ASSERT_RTNL(); + + failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops); + if (!failover_dev) + goto done; + + if (fops && fops->slave_pre_unregister && + fops->slave_pre_unregister(slave_dev, failover_dev)) + goto done; + + netdev_rx_handler_unregister(slave_dev); + netdev_upper_dev_unlink(slave_dev, failover_dev); + slave_dev->priv_flags &= ~IFF_FAILOVER_SLAVE; + + if (fops && fops->slave_unregister && + !fops->slave_unregister(slave_dev, failover_dev)) + return NOTIFY_OK; + +done: + return NOTIFY_DONE; +} +EXPORT_SYMBOL_GPL(failover_slave_unregister); + +static int failover_slave_link_change(struct net_device *slave_dev) +{ + struct net_device *failover_dev; + struct failover_ops *fops; + + if (!netif_is_failover_slave(slave_dev)) + goto done; + + ASSERT_RTNL(); + + failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops); + if (!failover_dev) + goto done; + + if (!netif_running(failover_dev)) + goto done; + + if (fops && fops->slave_link_change && + !fops->slave_link_change(slave_dev, failover_dev)) + return NOTIFY_OK; + +done: + return NOTIFY_DONE; +} + +static int failover_slave_name_change(struct net_device *slave_dev) +{ + struct net_device *failover_dev; + struct failover_ops *fops; + + if (!netif_is_failover_slave(slave_dev)) + goto done; + + ASSERT_RTNL(); + + failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops); + if (!failover_dev) + goto done; + + if (!netif_running(failover_dev)) + goto done; + + if (fops && fops->slave_name_change && + !fops->slave_name_change(slave_dev, failover_dev)) + return NOTIFY_OK; + +done: + return NOTIFY_DONE; +} + +static int +failover_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct net_device *event_dev = netdev_notifier_info_to_dev(ptr); + + /* Skip parent events */ + if (netif_is_failover(event_dev)) + return NOTIFY_DONE; + + switch (event) { + case NETDEV_REGISTER: + return failover_slave_register(event_dev); + case NETDEV_UNREGISTER: + return failover_slave_unregister(event_dev); + case NETDEV_UP: + case NETDEV_DOWN: + case NETDEV_CHANGE: + return failover_slave_link_change(event_dev); + case NETDEV_CHANGENAME: + return failover_slave_name_change(event_dev); + default: + return NOTIFY_DONE; + } +} + +static struct notifier_block failover_notifier = { + .notifier_call = failover_event, +}; + +static void +failover_existing_slave_register(struct net_device *failover_dev) +{ + struct net *net = dev_net(failover_dev); + struct net_device *dev; + + rtnl_lock(); + for_each_netdev(net, dev) { + if (netif_is_failover(dev)) + continue; + if (ether_addr_equal(failover_dev->perm_addr, dev->perm_addr)) + failover_slave_register(dev); + } + rtnl_unlock(); +} + +/** + * failover_register - Register a failover instance + * + * @dev: failover netdev + * @ops: failover ops + * + * Allocate and register a failover instance for a failover netdev. ops + * provides handlers for slave device register/unregister/link change/ + * name change events. + * + * Return: pointer to failover instance + */ +struct failover *failover_register(struct net_device *dev, + struct failover_ops *ops) +{ + struct failover *failover; + + if (dev->type != ARPHRD_ETHER) + return ERR_PTR(-EINVAL); + + failover = kzalloc(sizeof(*failover), GFP_KERNEL); + if (!failover) + return ERR_PTR(-ENOMEM); + + rcu_assign_pointer(failover->ops, ops); + dev_hold(dev); + dev->priv_flags |= IFF_FAILOVER; + rcu_assign_pointer(failover->failover_dev, dev); + + spin_lock(&failover_lock); + list_add_tail(&failover->list, &failover_list); + spin_unlock(&failover_lock); + + netdev_info(dev, "failover master:%s registered\n", dev->name); + + failover_existing_slave_register(dev); + + return failover; +} +EXPORT_SYMBOL_GPL(failover_register); + +/** + * failover_unregister - Unregister a failover instance + * + * @failover: pointer to failover instance + * + * Unregisters and frees a failover instance. + */ +void failover_unregister(struct failover *failover) +{ + struct net_device *failover_dev; + + failover_dev = rcu_dereference(failover->failover_dev); + + netdev_info(failover_dev, "failover master:%s unregistered\n", + failover_dev->name); + + failover_dev->priv_flags &= ~IFF_FAILOVER; + dev_put(failover_dev); + + spin_lock(&failover_lock); + list_del(&failover->list); + spin_unlock(&failover_lock); + + kfree(failover); +} +EXPORT_SYMBOL_GPL(failover_unregister); + +static __init int +failover_init(void) +{ + register_netdevice_notifier(&failover_notifier); + + return 0; +} +module_init(failover_init); + +static __exit +void failover_exit(void) +{ + unregister_netdevice_notifier(&failover_notifier); +} +module_exit(failover_exit); + +MODULE_DESCRIPTION("Generic failover infrastructure/interface"); +MODULE_LICENSE("GPL v2"); diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 33958f84c173..126ffc5bc630 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -387,247 +387,304 @@ unsigned int fib_rules_seq_read(struct net *net, int family) } EXPORT_SYMBOL_GPL(fib_rules_seq_read); -static int validate_rulemsg(struct fib_rule_hdr *frh, struct nlattr **tb, - struct fib_rules_ops *ops) -{ - int err = -EINVAL; - - if (frh->src_len) - if (tb[FRA_SRC] == NULL || - frh->src_len > (ops->addr_size * 8) || - nla_len(tb[FRA_SRC]) != ops->addr_size) - goto errout; - - if (frh->dst_len) - if (tb[FRA_DST] == NULL || - frh->dst_len > (ops->addr_size * 8) || - nla_len(tb[FRA_DST]) != ops->addr_size) - goto errout; - - err = 0; -errout: - return err; -} - -static int rule_exists(struct fib_rules_ops *ops, struct fib_rule_hdr *frh, - struct nlattr **tb, struct fib_rule *rule) +static struct fib_rule *rule_find(struct fib_rules_ops *ops, + struct fib_rule_hdr *frh, + struct nlattr **tb, + struct fib_rule *rule, + bool user_priority) { struct fib_rule *r; list_for_each_entry(r, &ops->rules_list, list) { - if (r->action != rule->action) + if (rule->action && r->action != rule->action) continue; - if (r->table != rule->table) + if (rule->table && r->table != rule->table) continue; - if (r->pref != rule->pref) + if (user_priority && r->pref != rule->pref) continue; - if (memcmp(r->iifname, rule->iifname, IFNAMSIZ)) + if (rule->iifname[0] && + memcmp(r->iifname, rule->iifname, IFNAMSIZ)) continue; - if (memcmp(r->oifname, rule->oifname, IFNAMSIZ)) + if (rule->oifname[0] && + memcmp(r->oifname, rule->oifname, IFNAMSIZ)) continue; - if (r->mark != rule->mark) + if (rule->mark && r->mark != rule->mark) continue; - if (r->mark_mask != rule->mark_mask) + if (rule->mark_mask && r->mark_mask != rule->mark_mask) continue; - if (r->tun_id != rule->tun_id) + if (rule->tun_id && r->tun_id != rule->tun_id) continue; if (r->fr_net != rule->fr_net) continue; - if (r->l3mdev != rule->l3mdev) + if (rule->l3mdev && r->l3mdev != rule->l3mdev) continue; - if (!uid_eq(r->uid_range.start, rule->uid_range.start) || - !uid_eq(r->uid_range.end, rule->uid_range.end)) + if (uid_range_set(&rule->uid_range) && + (!uid_eq(r->uid_range.start, rule->uid_range.start) || + !uid_eq(r->uid_range.end, rule->uid_range.end))) continue; - if (r->ip_proto != rule->ip_proto) + if (rule->ip_proto && r->ip_proto != rule->ip_proto) continue; - if (!fib_rule_port_range_compare(&r->sport_range, + if (fib_rule_port_range_set(&rule->sport_range) && + !fib_rule_port_range_compare(&r->sport_range, &rule->sport_range)) continue; - if (!fib_rule_port_range_compare(&r->dport_range, + if (fib_rule_port_range_set(&rule->dport_range) && + !fib_rule_port_range_compare(&r->dport_range, &rule->dport_range)) continue; if (!ops->compare(r, frh, tb)) continue; - return 1; + return r; + } + + return NULL; +} + +#ifdef CONFIG_NET_L3_MASTER_DEV +static int fib_nl2rule_l3mdev(struct nlattr *nla, struct fib_rule *nlrule, + struct netlink_ext_ack *extack) +{ + nlrule->l3mdev = nla_get_u8(nla); + if (nlrule->l3mdev != 1) { + NL_SET_ERR_MSG(extack, "Invalid l3mdev attribute"); + return -1; } + return 0; } +#else +static int fib_nl2rule_l3mdev(struct nlattr *nla, struct fib_rule *nlrule, + struct netlink_ext_ack *extack) +{ + NL_SET_ERR_MSG(extack, "l3mdev support is not enabled in kernel"); + return -1; +} +#endif -int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, - struct netlink_ext_ack *extack) +static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack, + struct fib_rules_ops *ops, + struct nlattr *tb[], + struct fib_rule **rule, + bool *user_priority) { struct net *net = sock_net(skb->sk); struct fib_rule_hdr *frh = nlmsg_data(nlh); - struct fib_rules_ops *ops = NULL; - struct fib_rule *rule, *r, *last = NULL; - struct nlattr *tb[FRA_MAX+1]; - int err = -EINVAL, unresolved = 0; - - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) - goto errout; + struct fib_rule *nlrule = NULL; + int err = -EINVAL; - ops = lookup_rules_ops(net, frh->family); - if (ops == NULL) { - err = -EAFNOSUPPORT; - goto errout; + if (frh->src_len) + if (!tb[FRA_SRC] || + frh->src_len > (ops->addr_size * 8) || + nla_len(tb[FRA_SRC]) != ops->addr_size) { + NL_SET_ERR_MSG(extack, "Invalid source address"); + goto errout; } - err = nlmsg_parse(nlh, sizeof(*frh), tb, FRA_MAX, ops->policy, extack); - if (err < 0) - goto errout; - - err = validate_rulemsg(frh, tb, ops); - if (err < 0) - goto errout; + if (frh->dst_len) + if (!tb[FRA_DST] || + frh->dst_len > (ops->addr_size * 8) || + nla_len(tb[FRA_DST]) != ops->addr_size) { + NL_SET_ERR_MSG(extack, "Invalid dst address"); + goto errout; + } - rule = kzalloc(ops->rule_size, GFP_KERNEL); - if (rule == NULL) { + nlrule = kzalloc(ops->rule_size, GFP_KERNEL); + if (!nlrule) { err = -ENOMEM; goto errout; } - refcount_set(&rule->refcnt, 1); - rule->fr_net = net; + refcount_set(&nlrule->refcnt, 1); + nlrule->fr_net = net; - rule->pref = tb[FRA_PRIORITY] ? nla_get_u32(tb[FRA_PRIORITY]) - : fib_default_rule_pref(ops); + if (tb[FRA_PRIORITY]) { + nlrule->pref = nla_get_u32(tb[FRA_PRIORITY]); + *user_priority = true; + } else { + nlrule->pref = fib_default_rule_pref(ops); + } - rule->proto = tb[FRA_PROTOCOL] ? + nlrule->proto = tb[FRA_PROTOCOL] ? nla_get_u8(tb[FRA_PROTOCOL]) : RTPROT_UNSPEC; if (tb[FRA_IIFNAME]) { struct net_device *dev; - rule->iifindex = -1; - nla_strlcpy(rule->iifname, tb[FRA_IIFNAME], IFNAMSIZ); - dev = __dev_get_by_name(net, rule->iifname); + nlrule->iifindex = -1; + nla_strlcpy(nlrule->iifname, tb[FRA_IIFNAME], IFNAMSIZ); + dev = __dev_get_by_name(net, nlrule->iifname); if (dev) - rule->iifindex = dev->ifindex; + nlrule->iifindex = dev->ifindex; } if (tb[FRA_OIFNAME]) { struct net_device *dev; - rule->oifindex = -1; - nla_strlcpy(rule->oifname, tb[FRA_OIFNAME], IFNAMSIZ); - dev = __dev_get_by_name(net, rule->oifname); + nlrule->oifindex = -1; + nla_strlcpy(nlrule->oifname, tb[FRA_OIFNAME], IFNAMSIZ); + dev = __dev_get_by_name(net, nlrule->oifname); if (dev) - rule->oifindex = dev->ifindex; + nlrule->oifindex = dev->ifindex; } if (tb[FRA_FWMARK]) { - rule->mark = nla_get_u32(tb[FRA_FWMARK]); - if (rule->mark) + nlrule->mark = nla_get_u32(tb[FRA_FWMARK]); + if (nlrule->mark) /* compatibility: if the mark value is non-zero all bits * are compared unless a mask is explicitly specified. */ - rule->mark_mask = 0xFFFFFFFF; + nlrule->mark_mask = 0xFFFFFFFF; } if (tb[FRA_FWMASK]) - rule->mark_mask = nla_get_u32(tb[FRA_FWMASK]); + nlrule->mark_mask = nla_get_u32(tb[FRA_FWMASK]); if (tb[FRA_TUN_ID]) - rule->tun_id = nla_get_be64(tb[FRA_TUN_ID]); + nlrule->tun_id = nla_get_be64(tb[FRA_TUN_ID]); err = -EINVAL; - if (tb[FRA_L3MDEV]) { -#ifdef CONFIG_NET_L3_MASTER_DEV - rule->l3mdev = nla_get_u8(tb[FRA_L3MDEV]); - if (rule->l3mdev != 1) -#endif - goto errout_free; - } + if (tb[FRA_L3MDEV] && + fib_nl2rule_l3mdev(tb[FRA_L3MDEV], nlrule, extack) < 0) + goto errout_free; - rule->action = frh->action; - rule->flags = frh->flags; - rule->table = frh_get_table(frh, tb); + nlrule->action = frh->action; + nlrule->flags = frh->flags; + nlrule->table = frh_get_table(frh, tb); if (tb[FRA_SUPPRESS_PREFIXLEN]) - rule->suppress_prefixlen = nla_get_u32(tb[FRA_SUPPRESS_PREFIXLEN]); + nlrule->suppress_prefixlen = nla_get_u32(tb[FRA_SUPPRESS_PREFIXLEN]); else - rule->suppress_prefixlen = -1; + nlrule->suppress_prefixlen = -1; if (tb[FRA_SUPPRESS_IFGROUP]) - rule->suppress_ifgroup = nla_get_u32(tb[FRA_SUPPRESS_IFGROUP]); + nlrule->suppress_ifgroup = nla_get_u32(tb[FRA_SUPPRESS_IFGROUP]); else - rule->suppress_ifgroup = -1; + nlrule->suppress_ifgroup = -1; if (tb[FRA_GOTO]) { - if (rule->action != FR_ACT_GOTO) + if (nlrule->action != FR_ACT_GOTO) { + NL_SET_ERR_MSG(extack, "Unexpected goto"); goto errout_free; + } - rule->target = nla_get_u32(tb[FRA_GOTO]); + nlrule->target = nla_get_u32(tb[FRA_GOTO]); /* Backward jumps are prohibited to avoid endless loops */ - if (rule->target <= rule->pref) + if (nlrule->target <= nlrule->pref) { + NL_SET_ERR_MSG(extack, "Backward goto not supported"); goto errout_free; - - list_for_each_entry(r, &ops->rules_list, list) { - if (r->pref == rule->target) { - RCU_INIT_POINTER(rule->ctarget, r); - break; - } } - - if (rcu_dereference_protected(rule->ctarget, 1) == NULL) - unresolved = 1; - } else if (rule->action == FR_ACT_GOTO) + } else if (nlrule->action == FR_ACT_GOTO) { + NL_SET_ERR_MSG(extack, "Missing goto target for action goto"); goto errout_free; + } - if (rule->l3mdev && rule->table) + if (nlrule->l3mdev && nlrule->table) { + NL_SET_ERR_MSG(extack, "l3mdev and table are mutually exclusive"); goto errout_free; + } if (tb[FRA_UID_RANGE]) { if (current_user_ns() != net->user_ns) { err = -EPERM; + NL_SET_ERR_MSG(extack, "No permission to set uid"); goto errout_free; } - rule->uid_range = nla_get_kuid_range(tb); + nlrule->uid_range = nla_get_kuid_range(tb); - if (!uid_range_set(&rule->uid_range) || - !uid_lte(rule->uid_range.start, rule->uid_range.end)) + if (!uid_range_set(&nlrule->uid_range) || + !uid_lte(nlrule->uid_range.start, nlrule->uid_range.end)) { + NL_SET_ERR_MSG(extack, "Invalid uid range"); goto errout_free; + } } else { - rule->uid_range = fib_kuid_range_unset; + nlrule->uid_range = fib_kuid_range_unset; } if (tb[FRA_IP_PROTO]) - rule->ip_proto = nla_get_u8(tb[FRA_IP_PROTO]); + nlrule->ip_proto = nla_get_u8(tb[FRA_IP_PROTO]); if (tb[FRA_SPORT_RANGE]) { err = nla_get_port_range(tb[FRA_SPORT_RANGE], - &rule->sport_range); - if (err) + &nlrule->sport_range); + if (err) { + NL_SET_ERR_MSG(extack, "Invalid sport range"); goto errout_free; + } } if (tb[FRA_DPORT_RANGE]) { err = nla_get_port_range(tb[FRA_DPORT_RANGE], - &rule->dport_range); - if (err) + &nlrule->dport_range); + if (err) { + NL_SET_ERR_MSG(extack, "Invalid dport range"); goto errout_free; + } } + *rule = nlrule; + + return 0; + +errout_free: + kfree(nlrule); +errout: + return err; +} + +int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct net *net = sock_net(skb->sk); + struct fib_rule_hdr *frh = nlmsg_data(nlh); + struct fib_rules_ops *ops = NULL; + struct fib_rule *rule = NULL, *r, *last = NULL; + struct nlattr *tb[FRA_MAX + 1]; + int err = -EINVAL, unresolved = 0; + bool user_priority = false; + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) { + NL_SET_ERR_MSG(extack, "Invalid msg length"); + goto errout; + } + + ops = lookup_rules_ops(net, frh->family); + if (!ops) { + err = -EAFNOSUPPORT; + NL_SET_ERR_MSG(extack, "Rule family not supported"); + goto errout; + } + + err = nlmsg_parse(nlh, sizeof(*frh), tb, FRA_MAX, ops->policy, extack); + if (err < 0) { + NL_SET_ERR_MSG(extack, "Error parsing msg"); + goto errout; + } + + err = fib_nl2rule(skb, nlh, extack, ops, tb, &rule, &user_priority); + if (err) + goto errout; + if ((nlh->nlmsg_flags & NLM_F_EXCL) && - rule_exists(ops, frh, tb, rule)) { + rule_find(ops, frh, tb, rule, user_priority)) { err = -EEXIST; goto errout_free; } - err = ops->configure(rule, skb, frh, tb); + err = ops->configure(rule, skb, frh, tb, extack); if (err < 0) goto errout_free; @@ -637,6 +694,16 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, goto errout_free; list_for_each_entry(r, &ops->rules_list, list) { + if (r->pref == rule->target) { + RCU_INIT_POINTER(rule->ctarget, r); + break; + } + } + + if (rcu_dereference_protected(rule->ctarget, 1) == NULL) + unresolved = 1; + + list_for_each_entry(r, &ops->rules_list, list) { if (r->pref > rule->pref) break; last = r; @@ -690,171 +757,97 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh, { struct net *net = sock_net(skb->sk); struct fib_rule_hdr *frh = nlmsg_data(nlh); - struct fib_rule_port_range sprange = {0, 0}; - struct fib_rule_port_range dprange = {0, 0}; struct fib_rules_ops *ops = NULL; - struct fib_rule *rule, *r; + struct fib_rule *rule = NULL, *r, *nlrule = NULL; struct nlattr *tb[FRA_MAX+1]; - struct fib_kuid_range range; int err = -EINVAL; + bool user_priority = false; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) { + NL_SET_ERR_MSG(extack, "Invalid msg length"); goto errout; + } ops = lookup_rules_ops(net, frh->family); if (ops == NULL) { err = -EAFNOSUPPORT; + NL_SET_ERR_MSG(extack, "Rule family not supported"); goto errout; } err = nlmsg_parse(nlh, sizeof(*frh), tb, FRA_MAX, ops->policy, extack); - if (err < 0) + if (err < 0) { + NL_SET_ERR_MSG(extack, "Error parsing msg"); goto errout; + } - err = validate_rulemsg(frh, tb, ops); - if (err < 0) + err = fib_nl2rule(skb, nlh, extack, ops, tb, &nlrule, &user_priority); + if (err) goto errout; - if (tb[FRA_UID_RANGE]) { - range = nla_get_kuid_range(tb); - if (!uid_range_set(&range)) { - err = -EINVAL; - goto errout; - } - } else { - range = fib_kuid_range_unset; + rule = rule_find(ops, frh, tb, nlrule, user_priority); + if (!rule) { + err = -ENOENT; + goto errout; } - if (tb[FRA_SPORT_RANGE]) { - err = nla_get_port_range(tb[FRA_SPORT_RANGE], - &sprange); - if (err) - goto errout; + if (rule->flags & FIB_RULE_PERMANENT) { + err = -EPERM; + goto errout; } - if (tb[FRA_DPORT_RANGE]) { - err = nla_get_port_range(tb[FRA_DPORT_RANGE], - &dprange); + if (ops->delete) { + err = ops->delete(rule); if (err) goto errout; } - list_for_each_entry(rule, &ops->rules_list, list) { - if (tb[FRA_PROTOCOL] && - (rule->proto != nla_get_u8(tb[FRA_PROTOCOL]))) - continue; - - if (frh->action && (frh->action != rule->action)) - continue; - - if (frh_get_table(frh, tb) && - (frh_get_table(frh, tb) != rule->table)) - continue; - - if (tb[FRA_PRIORITY] && - (rule->pref != nla_get_u32(tb[FRA_PRIORITY]))) - continue; - - if (tb[FRA_IIFNAME] && - nla_strcmp(tb[FRA_IIFNAME], rule->iifname)) - continue; - - if (tb[FRA_OIFNAME] && - nla_strcmp(tb[FRA_OIFNAME], rule->oifname)) - continue; - - if (tb[FRA_FWMARK] && - (rule->mark != nla_get_u32(tb[FRA_FWMARK]))) - continue; - - if (tb[FRA_FWMASK] && - (rule->mark_mask != nla_get_u32(tb[FRA_FWMASK]))) - continue; - - if (tb[FRA_TUN_ID] && - (rule->tun_id != nla_get_be64(tb[FRA_TUN_ID]))) - continue; - - if (tb[FRA_L3MDEV] && - (rule->l3mdev != nla_get_u8(tb[FRA_L3MDEV]))) - continue; - - if (uid_range_set(&range) && - (!uid_eq(rule->uid_range.start, range.start) || - !uid_eq(rule->uid_range.end, range.end))) - continue; - - if (tb[FRA_IP_PROTO] && - (rule->ip_proto != nla_get_u8(tb[FRA_IP_PROTO]))) - continue; - - if (fib_rule_port_range_set(&sprange) && - !fib_rule_port_range_compare(&rule->sport_range, &sprange)) - continue; - - if (fib_rule_port_range_set(&dprange) && - !fib_rule_port_range_compare(&rule->dport_range, &dprange)) - continue; - - if (!ops->compare(rule, frh, tb)) - continue; - - if (rule->flags & FIB_RULE_PERMANENT) { - err = -EPERM; - goto errout; - } - - if (ops->delete) { - err = ops->delete(rule); - if (err) - goto errout; - } + if (rule->tun_id) + ip_tunnel_unneed_metadata(); - if (rule->tun_id) - ip_tunnel_unneed_metadata(); + list_del_rcu(&rule->list); - list_del_rcu(&rule->list); - - if (rule->action == FR_ACT_GOTO) { - ops->nr_goto_rules--; - if (rtnl_dereference(rule->ctarget) == NULL) - ops->unresolved_rules--; - } + if (rule->action == FR_ACT_GOTO) { + ops->nr_goto_rules--; + if (rtnl_dereference(rule->ctarget) == NULL) + ops->unresolved_rules--; + } - /* - * Check if this rule is a target to any of them. If so, - * adjust to the next one with the same preference or - * disable them. As this operation is eventually very - * expensive, it is only performed if goto rules, except - * current if it is goto rule, have actually been added. - */ - if (ops->nr_goto_rules > 0) { - struct fib_rule *n; - - n = list_next_entry(rule, list); - if (&n->list == &ops->rules_list || n->pref != rule->pref) - n = NULL; - list_for_each_entry(r, &ops->rules_list, list) { - if (rtnl_dereference(r->ctarget) != rule) - continue; - rcu_assign_pointer(r->ctarget, n); - if (!n) - ops->unresolved_rules++; - } + /* + * Check if this rule is a target to any of them. If so, + * adjust to the next one with the same preference or + * disable them. As this operation is eventually very + * expensive, it is only performed if goto rules, except + * current if it is goto rule, have actually been added. + */ + if (ops->nr_goto_rules > 0) { + struct fib_rule *n; + + n = list_next_entry(rule, list); + if (&n->list == &ops->rules_list || n->pref != rule->pref) + n = NULL; + list_for_each_entry(r, &ops->rules_list, list) { + if (rtnl_dereference(r->ctarget) != rule) + continue; + rcu_assign_pointer(r->ctarget, n); + if (!n) + ops->unresolved_rules++; } - - call_fib_rule_notifiers(net, FIB_EVENT_RULE_DEL, rule, ops, - NULL); - notify_rule_change(RTM_DELRULE, rule, ops, nlh, - NETLINK_CB(skb).portid); - fib_rule_put(rule); - flush_route_cache(ops); - rules_ops_put(ops); - return 0; } - err = -ENOENT; + call_fib_rule_notifiers(net, FIB_EVENT_RULE_DEL, rule, ops, + NULL); + notify_rule_change(RTM_DELRULE, rule, ops, nlh, + NETLINK_CB(skb).portid); + fib_rule_put(rule); + flush_route_cache(ops); + rules_ops_put(ops); + kfree(nlrule); + return 0; + errout: + if (nlrule) + kfree(nlrule); rules_ops_put(ops); return err; } diff --git a/net/core/filter.c b/net/core/filter.c index 201ff36b17a8..3d9ba7e5965a 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -57,7 +57,17 @@ #include <net/sock_reuseport.h> #include <net/busy_poll.h> #include <net/tcp.h> +#include <net/xfrm.h> #include <linux/bpf_trace.h> +#include <net/xdp_sock.h> +#include <linux/inetdevice.h> +#include <net/ip_fib.h> +#include <net/flow.h> +#include <net/arp.h> +#include <net/ipv6.h> +#include <linux/seg6_local.h> +#include <net/seg6.h> +#include <net/seg6_local.h> /** * sk_filter_trim_cap - run a packet through a socket filter @@ -111,12 +121,12 @@ int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap) } EXPORT_SYMBOL(sk_filter_trim_cap); -BPF_CALL_1(__skb_get_pay_offset, struct sk_buff *, skb) +BPF_CALL_1(bpf_skb_get_pay_offset, struct sk_buff *, skb) { return skb_get_poff(skb); } -BPF_CALL_3(__skb_get_nlattr, struct sk_buff *, skb, u32, a, u32, x) +BPF_CALL_3(bpf_skb_get_nlattr, struct sk_buff *, skb, u32, a, u32, x) { struct nlattr *nla; @@ -136,7 +146,7 @@ BPF_CALL_3(__skb_get_nlattr, struct sk_buff *, skb, u32, a, u32, x) return 0; } -BPF_CALL_3(__skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x) +BPF_CALL_3(bpf_skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x) { struct nlattr *nla; @@ -160,13 +170,94 @@ BPF_CALL_3(__skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x) return 0; } -BPF_CALL_0(__get_raw_cpu_id) +BPF_CALL_4(bpf_skb_load_helper_8, const struct sk_buff *, skb, const void *, + data, int, headlen, int, offset) +{ + u8 tmp, *ptr; + const int len = sizeof(tmp); + + if (offset >= 0) { + if (headlen - offset >= len) + return *(u8 *)(data + offset); + if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) + return tmp; + } else { + ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len); + if (likely(ptr)) + return *(u8 *)ptr; + } + + return -EFAULT; +} + +BPF_CALL_2(bpf_skb_load_helper_8_no_cache, const struct sk_buff *, skb, + int, offset) +{ + return ____bpf_skb_load_helper_8(skb, skb->data, skb->len - skb->data_len, + offset); +} + +BPF_CALL_4(bpf_skb_load_helper_16, const struct sk_buff *, skb, const void *, + data, int, headlen, int, offset) +{ + u16 tmp, *ptr; + const int len = sizeof(tmp); + + if (offset >= 0) { + if (headlen - offset >= len) + return get_unaligned_be16(data + offset); + if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) + return be16_to_cpu(tmp); + } else { + ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len); + if (likely(ptr)) + return get_unaligned_be16(ptr); + } + + return -EFAULT; +} + +BPF_CALL_2(bpf_skb_load_helper_16_no_cache, const struct sk_buff *, skb, + int, offset) +{ + return ____bpf_skb_load_helper_16(skb, skb->data, skb->len - skb->data_len, + offset); +} + +BPF_CALL_4(bpf_skb_load_helper_32, const struct sk_buff *, skb, const void *, + data, int, headlen, int, offset) +{ + u32 tmp, *ptr; + const int len = sizeof(tmp); + + if (likely(offset >= 0)) { + if (headlen - offset >= len) + return get_unaligned_be32(data + offset); + if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) + return be32_to_cpu(tmp); + } else { + ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len); + if (likely(ptr)) + return get_unaligned_be32(ptr); + } + + return -EFAULT; +} + +BPF_CALL_2(bpf_skb_load_helper_32_no_cache, const struct sk_buff *, skb, + int, offset) +{ + return ____bpf_skb_load_helper_32(skb, skb->data, skb->len - skb->data_len, + offset); +} + +BPF_CALL_0(bpf_get_raw_cpu_id) { return raw_smp_processor_id(); } static const struct bpf_func_proto bpf_get_raw_smp_processor_id_proto = { - .func = __get_raw_cpu_id, + .func = bpf_get_raw_cpu_id, .gpl_only = false, .ret_type = RET_INTEGER, }; @@ -316,16 +407,16 @@ static bool convert_bpf_extensions(struct sock_filter *fp, /* Emit call(arg1=CTX, arg2=A, arg3=X) */ switch (fp->k) { case SKF_AD_OFF + SKF_AD_PAY_OFFSET: - *insn = BPF_EMIT_CALL(__skb_get_pay_offset); + *insn = BPF_EMIT_CALL(bpf_skb_get_pay_offset); break; case SKF_AD_OFF + SKF_AD_NLATTR: - *insn = BPF_EMIT_CALL(__skb_get_nlattr); + *insn = BPF_EMIT_CALL(bpf_skb_get_nlattr); break; case SKF_AD_OFF + SKF_AD_NLATTR_NEST: - *insn = BPF_EMIT_CALL(__skb_get_nlattr_nest); + *insn = BPF_EMIT_CALL(bpf_skb_get_nlattr_nest); break; case SKF_AD_OFF + SKF_AD_CPU: - *insn = BPF_EMIT_CALL(__get_raw_cpu_id); + *insn = BPF_EMIT_CALL(bpf_get_raw_cpu_id); break; case SKF_AD_OFF + SKF_AD_RANDOM: *insn = BPF_EMIT_CALL(bpf_user_rnd_u32); @@ -352,26 +443,87 @@ static bool convert_bpf_extensions(struct sock_filter *fp, return true; } +static bool convert_bpf_ld_abs(struct sock_filter *fp, struct bpf_insn **insnp) +{ + const bool unaligned_ok = IS_BUILTIN(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS); + int size = bpf_size_to_bytes(BPF_SIZE(fp->code)); + bool endian = BPF_SIZE(fp->code) == BPF_H || + BPF_SIZE(fp->code) == BPF_W; + bool indirect = BPF_MODE(fp->code) == BPF_IND; + const int ip_align = NET_IP_ALIGN; + struct bpf_insn *insn = *insnp; + int offset = fp->k; + + if (!indirect && + ((unaligned_ok && offset >= 0) || + (!unaligned_ok && offset >= 0 && + offset + ip_align >= 0 && + offset + ip_align % size == 0))) { + *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_H); + *insn++ = BPF_ALU64_IMM(BPF_SUB, BPF_REG_TMP, offset); + *insn++ = BPF_JMP_IMM(BPF_JSLT, BPF_REG_TMP, size, 2 + endian); + *insn++ = BPF_LDX_MEM(BPF_SIZE(fp->code), BPF_REG_A, BPF_REG_D, + offset); + if (endian) + *insn++ = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, size * 8); + *insn++ = BPF_JMP_A(8); + } + + *insn++ = BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_CTX); + *insn++ = BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_D); + *insn++ = BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_H); + if (!indirect) { + *insn++ = BPF_MOV64_IMM(BPF_REG_ARG4, offset); + } else { + *insn++ = BPF_MOV64_REG(BPF_REG_ARG4, BPF_REG_X); + if (fp->k) + *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_ARG4, offset); + } + + switch (BPF_SIZE(fp->code)) { + case BPF_B: + *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_8); + break; + case BPF_H: + *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_16); + break; + case BPF_W: + *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_32); + break; + default: + return false; + } + + *insn++ = BPF_JMP_IMM(BPF_JSGE, BPF_REG_A, 0, 2); + *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A); + *insn = BPF_EXIT_INSN(); + + *insnp = insn; + return true; +} + /** * bpf_convert_filter - convert filter program * @prog: the user passed filter program * @len: the length of the user passed filter program * @new_prog: allocated 'struct bpf_prog' or NULL * @new_len: pointer to store length of converted program + * @seen_ld_abs: bool whether we've seen ld_abs/ind * * Remap 'sock_filter' style classic BPF (cBPF) instruction set to 'bpf_insn' * style extended BPF (eBPF). * Conversion workflow: * * 1) First pass for calculating the new program length: - * bpf_convert_filter(old_prog, old_len, NULL, &new_len) + * bpf_convert_filter(old_prog, old_len, NULL, &new_len, &seen_ld_abs) * * 2) 2nd pass to remap in two passes: 1st pass finds new * jump offsets, 2nd pass remapping: - * bpf_convert_filter(old_prog, old_len, new_prog, &new_len); + * bpf_convert_filter(old_prog, old_len, new_prog, &new_len, &seen_ld_abs) */ static int bpf_convert_filter(struct sock_filter *prog, int len, - struct bpf_prog *new_prog, int *new_len) + struct bpf_prog *new_prog, int *new_len, + bool *seen_ld_abs) { int new_flen = 0, pass = 0, target, i, stack_off; struct bpf_insn *new_insn, *first_insn = NULL; @@ -410,12 +562,27 @@ do_pass: * do this ourself. Initial CTX is present in BPF_REG_ARG1. */ *new_insn++ = BPF_MOV64_REG(BPF_REG_CTX, BPF_REG_ARG1); + if (*seen_ld_abs) { + /* For packet access in classic BPF, cache skb->data + * in callee-saved BPF R8 and skb->len - skb->data_len + * (headlen) in BPF R9. Since classic BPF is read-only + * on CTX, we only need to cache it once. + */ + *new_insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data), + BPF_REG_D, BPF_REG_CTX, + offsetof(struct sk_buff, data)); + *new_insn++ = BPF_LDX_MEM(BPF_W, BPF_REG_H, BPF_REG_CTX, + offsetof(struct sk_buff, len)); + *new_insn++ = BPF_LDX_MEM(BPF_W, BPF_REG_TMP, BPF_REG_CTX, + offsetof(struct sk_buff, data_len)); + *new_insn++ = BPF_ALU32_REG(BPF_SUB, BPF_REG_H, BPF_REG_TMP); + } } else { new_insn += 3; } for (i = 0; i < len; fp++, i++) { - struct bpf_insn tmp_insns[6] = { }; + struct bpf_insn tmp_insns[32] = { }; struct bpf_insn *insn = tmp_insns; if (addrs) @@ -458,6 +625,11 @@ do_pass: BPF_MODE(fp->code) == BPF_ABS && convert_bpf_extensions(fp, &insn)) break; + if (BPF_CLASS(fp->code) == BPF_LD && + convert_bpf_ld_abs(fp, &insn)) { + *seen_ld_abs = true; + break; + } if (fp->code == (BPF_ALU | BPF_DIV | BPF_X) || fp->code == (BPF_ALU | BPF_MOD | BPF_X)) { @@ -567,21 +739,31 @@ jmp_rest: break; /* ldxb 4 * ([14] & 0xf) is remaped into 6 insns. */ - case BPF_LDX | BPF_MSH | BPF_B: - /* tmp = A */ - *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_A); + case BPF_LDX | BPF_MSH | BPF_B: { + struct sock_filter tmp = { + .code = BPF_LD | BPF_ABS | BPF_B, + .k = fp->k, + }; + + *seen_ld_abs = true; + + /* X = A */ + *insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A); /* A = BPF_R0 = *(u8 *) (skb->data + K) */ - *insn++ = BPF_LD_ABS(BPF_B, fp->k); + convert_bpf_ld_abs(&tmp, &insn); + insn++; /* A &= 0xf */ *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, 0xf); /* A <<= 2 */ *insn++ = BPF_ALU32_IMM(BPF_LSH, BPF_REG_A, 2); + /* tmp = X */ + *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_X); /* X = A */ *insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A); /* A = tmp */ *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_TMP); break; - + } /* RET_K is remaped into 2 insns. RET_A case doesn't need an * extra mov as BPF_REG_0 is already mapped into BPF_REG_A. */ @@ -663,6 +845,8 @@ jmp_rest: if (!new_prog) { /* Only calculating new length. */ *new_len = new_insn - first_insn; + if (*seen_ld_abs) + *new_len += 4; /* Prologue bits. */ return 0; } @@ -1024,6 +1208,7 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp) struct sock_filter *old_prog; struct bpf_prog *old_fp; int err, new_len, old_len = fp->len; + bool seen_ld_abs = false; /* We are free to overwrite insns et al right here as it * won't be used at this point in time anymore internally @@ -1045,7 +1230,8 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp) } /* 1st pass: calculate the new program length. */ - err = bpf_convert_filter(old_prog, old_len, NULL, &new_len); + err = bpf_convert_filter(old_prog, old_len, NULL, &new_len, + &seen_ld_abs); if (err) goto out_err_free; @@ -1064,7 +1250,8 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp) fp->len = new_len; /* 2nd pass: remap sock_filter insns into bpf_insn insns. */ - err = bpf_convert_filter(old_prog, old_len, fp, &new_len); + err = bpf_convert_filter(old_prog, old_len, fp, &new_len, + &seen_ld_abs); if (err) /* 2nd bpf_convert_filter() can fail only if it fails * to allocate memory, remapping must succeed. Note, @@ -1512,6 +1699,47 @@ static const struct bpf_func_proto bpf_skb_load_bytes_proto = { .arg4_type = ARG_CONST_SIZE, }; +BPF_CALL_5(bpf_skb_load_bytes_relative, const struct sk_buff *, skb, + u32, offset, void *, to, u32, len, u32, start_header) +{ + u8 *ptr; + + if (unlikely(offset > 0xffff || len > skb_headlen(skb))) + goto err_clear; + + switch (start_header) { + case BPF_HDR_START_MAC: + ptr = skb_mac_header(skb) + offset; + break; + case BPF_HDR_START_NET: + ptr = skb_network_header(skb) + offset; + break; + default: + goto err_clear; + } + + if (likely(ptr >= skb_mac_header(skb) && + ptr + len <= skb_tail_pointer(skb))) { + memcpy(to, ptr, len); + return 0; + } + +err_clear: + memset(to, 0, len); + return -EFAULT; +} + +static const struct bpf_func_proto bpf_skb_load_bytes_relative_proto = { + .func = bpf_skb_load_bytes_relative, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_UNINIT_MEM, + .arg4_type = ARG_CONST_SIZE, + .arg5_type = ARG_ANYTHING, +}; + BPF_CALL_2(bpf_skb_pull_data, struct sk_buff *, skb, u32, len) { /* Idea is the following: should the needed direct read/write @@ -1857,6 +2085,33 @@ static const struct bpf_func_proto bpf_redirect_proto = { .arg2_type = ARG_ANYTHING, }; +BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb, + struct bpf_map *, map, void *, key, u64, flags) +{ + struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); + + /* If user passes invalid input drop the packet. */ + if (unlikely(flags & ~(BPF_F_INGRESS))) + return SK_DROP; + + tcb->bpf.flags = flags; + tcb->bpf.sk_redir = __sock_hash_lookup_elem(map, key); + if (!tcb->bpf.sk_redir) + return SK_DROP; + + return SK_PASS; +} + +static const struct bpf_func_proto bpf_sk_redirect_hash_proto = { + .func = bpf_sk_redirect_hash, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_PTR_TO_MAP_KEY, + .arg4_type = ARG_ANYTHING, +}; + BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb, struct bpf_map *, map, u32, key, u64, flags) { @@ -1866,9 +2121,10 @@ BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb, if (unlikely(flags & ~(BPF_F_INGRESS))) return SK_DROP; - tcb->bpf.key = key; tcb->bpf.flags = flags; - tcb->bpf.map = map; + tcb->bpf.sk_redir = __sock_map_lookup_elem(map, key); + if (!tcb->bpf.sk_redir) + return SK_DROP; return SK_PASS; } @@ -1876,16 +2132,8 @@ BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb, struct sock *do_sk_redirect_map(struct sk_buff *skb) { struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); - struct sock *sk = NULL; - - if (tcb->bpf.map) { - sk = __sock_map_lookup_elem(tcb->bpf.map, tcb->bpf.key); - tcb->bpf.key = 0; - tcb->bpf.map = NULL; - } - - return sk; + return tcb->bpf.sk_redir; } static const struct bpf_func_proto bpf_sk_redirect_map_proto = { @@ -1898,32 +2146,49 @@ static const struct bpf_func_proto bpf_sk_redirect_map_proto = { .arg4_type = ARG_ANYTHING, }; -BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg_buff *, msg, - struct bpf_map *, map, u32, key, u64, flags) +BPF_CALL_4(bpf_msg_redirect_hash, struct sk_msg_buff *, msg, + struct bpf_map *, map, void *, key, u64, flags) { /* If user passes invalid input drop the packet. */ if (unlikely(flags & ~(BPF_F_INGRESS))) return SK_DROP; - msg->key = key; msg->flags = flags; - msg->map = map; + msg->sk_redir = __sock_hash_lookup_elem(map, key); + if (!msg->sk_redir) + return SK_DROP; return SK_PASS; } -struct sock *do_msg_redirect_map(struct sk_msg_buff *msg) +static const struct bpf_func_proto bpf_msg_redirect_hash_proto = { + .func = bpf_msg_redirect_hash, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_PTR_TO_MAP_KEY, + .arg4_type = ARG_ANYTHING, +}; + +BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg_buff *, msg, + struct bpf_map *, map, u32, key, u64, flags) { - struct sock *sk = NULL; + /* If user passes invalid input drop the packet. */ + if (unlikely(flags & ~(BPF_F_INGRESS))) + return SK_DROP; - if (msg->map) { - sk = __sock_map_lookup_elem(msg->map, msg->key); + msg->flags = flags; + msg->sk_redir = __sock_map_lookup_elem(map, key); + if (!msg->sk_redir) + return SK_DROP; - msg->key = 0; - msg->map = NULL; - } + return SK_PASS; +} - return sk; +struct sock *do_msg_redirect_map(struct sk_msg_buff *msg) +{ + return msg->sk_redir; } static const struct bpf_func_proto bpf_msg_redirect_map_proto = { @@ -2186,7 +2451,7 @@ BPF_CALL_3(bpf_skb_vlan_push, struct sk_buff *, skb, __be16, vlan_proto, return ret; } -const struct bpf_func_proto bpf_skb_vlan_push_proto = { +static const struct bpf_func_proto bpf_skb_vlan_push_proto = { .func = bpf_skb_vlan_push, .gpl_only = false, .ret_type = RET_INTEGER, @@ -2194,7 +2459,6 @@ const struct bpf_func_proto bpf_skb_vlan_push_proto = { .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, }; -EXPORT_SYMBOL_GPL(bpf_skb_vlan_push_proto); BPF_CALL_1(bpf_skb_vlan_pop, struct sk_buff *, skb) { @@ -2208,13 +2472,12 @@ BPF_CALL_1(bpf_skb_vlan_pop, struct sk_buff *, skb) return ret; } -const struct bpf_func_proto bpf_skb_vlan_pop_proto = { +static const struct bpf_func_proto bpf_skb_vlan_pop_proto = { .func = bpf_skb_vlan_pop, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; -EXPORT_SYMBOL_GPL(bpf_skb_vlan_pop_proto); static int bpf_skb_generic_push(struct sk_buff *skb, u32 off, u32 len) { @@ -2699,8 +2962,9 @@ static unsigned long xdp_get_metalen(const struct xdp_buff *xdp) BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset) { + void *xdp_frame_end = xdp->data_hard_start + sizeof(struct xdp_frame); unsigned long metalen = xdp_get_metalen(xdp); - void *data_start = xdp->data_hard_start + metalen; + void *data_start = xdp_frame_end + metalen; void *data = xdp->data + offset; if (unlikely(data < data_start || @@ -2724,14 +2988,39 @@ static const struct bpf_func_proto bpf_xdp_adjust_head_proto = { .arg2_type = ARG_ANYTHING, }; +BPF_CALL_2(bpf_xdp_adjust_tail, struct xdp_buff *, xdp, int, offset) +{ + void *data_end = xdp->data_end + offset; + + /* only shrinking is allowed for now. */ + if (unlikely(offset >= 0)) + return -EINVAL; + + if (unlikely(data_end < xdp->data + ETH_HLEN)) + return -EINVAL; + + xdp->data_end = data_end; + + return 0; +} + +static const struct bpf_func_proto bpf_xdp_adjust_tail_proto = { + .func = bpf_xdp_adjust_tail, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + BPF_CALL_2(bpf_xdp_adjust_meta, struct xdp_buff *, xdp, int, offset) { + void *xdp_frame_end = xdp->data_hard_start + sizeof(struct xdp_frame); void *meta = xdp->data_meta + offset; unsigned long metalen = xdp->data - meta; if (xdp_data_meta_unsupported(xdp)) return -ENOTSUPP; - if (unlikely(meta < xdp->data_hard_start || + if (unlikely(meta < xdp_frame_end || meta > xdp->data)) return -EINVAL; if (unlikely((metalen & (sizeof(__u32) - 1)) || @@ -2756,16 +3045,20 @@ static int __bpf_tx_xdp(struct net_device *dev, struct xdp_buff *xdp, u32 index) { - int err; + struct xdp_frame *xdpf; + int sent; if (!dev->netdev_ops->ndo_xdp_xmit) { return -EOPNOTSUPP; } - err = dev->netdev_ops->ndo_xdp_xmit(dev, xdp); - if (err) - return err; - dev->netdev_ops->ndo_xdp_flush(dev); + xdpf = convert_to_xdp_frame(xdp); + if (unlikely(!xdpf)) + return -EOVERFLOW; + + sent = dev->netdev_ops->ndo_xdp_xmit(dev, 1, &xdpf, XDP_XMIT_FLUSH); + if (sent <= 0) + return sent; return 0; } @@ -2776,24 +3069,33 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, { int err; - if (map->map_type == BPF_MAP_TYPE_DEVMAP) { - struct net_device *dev = fwd; - - if (!dev->netdev_ops->ndo_xdp_xmit) - return -EOPNOTSUPP; + switch (map->map_type) { + case BPF_MAP_TYPE_DEVMAP: { + struct bpf_dtab_netdev *dst = fwd; - err = dev->netdev_ops->ndo_xdp_xmit(dev, xdp); + err = dev_map_enqueue(dst, xdp, dev_rx); if (err) return err; __dev_map_insert_ctx(map, index); - - } else if (map->map_type == BPF_MAP_TYPE_CPUMAP) { + break; + } + case BPF_MAP_TYPE_CPUMAP: { struct bpf_cpu_map_entry *rcpu = fwd; err = cpu_map_enqueue(rcpu, xdp, dev_rx); if (err) return err; __cpu_map_insert_ctx(map, index); + break; + } + case BPF_MAP_TYPE_XSKMAP: { + struct xdp_sock *xs = fwd; + + err = __xsk_map_redirect(map, xdp, xs); + return err; + } + default: + break; } return 0; } @@ -2812,6 +3114,9 @@ void xdp_do_flush_map(void) case BPF_MAP_TYPE_CPUMAP: __cpu_map_flush(map); break; + case BPF_MAP_TYPE_XSKMAP: + __xsk_map_flush(map); + break; default: break; } @@ -2826,6 +3131,8 @@ static void *__xdp_map_lookup_elem(struct bpf_map *map, u32 index) return __dev_map_lookup_elem(map, index); case BPF_MAP_TYPE_CPUMAP: return __cpu_map_lookup_elem(map, index); + case BPF_MAP_TYPE_XSKMAP: + return __xsk_map_lookup_elem(map, index); default: return NULL; } @@ -2923,13 +3230,14 @@ static int __xdp_generic_ok_fwd_dev(struct sk_buff *skb, struct net_device *fwd) static int xdp_do_generic_redirect_map(struct net_device *dev, struct sk_buff *skb, + struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { struct redirect_info *ri = this_cpu_ptr(&redirect_info); unsigned long map_owner = ri->map_owner; struct bpf_map *map = ri->map; - struct net_device *fwd = NULL; u32 index = ri->ifindex; + void *fwd = NULL; int err = 0; ri->ifindex = 0; @@ -2951,6 +3259,14 @@ static int xdp_do_generic_redirect_map(struct net_device *dev, if (unlikely((err = __xdp_generic_ok_fwd_dev(skb, fwd)))) goto err; skb->dev = fwd; + generic_xdp_tx(skb, xdp_prog); + } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) { + struct xdp_sock *xs = fwd; + + err = xsk_generic_rcv(xs, xdp); + if (err) + goto err; + consume_skb(skb); } else { /* TODO: Handle BPF_MAP_TYPE_CPUMAP */ err = -EBADRQC; @@ -2965,7 +3281,7 @@ err: } int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, - struct bpf_prog *xdp_prog) + struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { struct redirect_info *ri = this_cpu_ptr(&redirect_info); u32 index = ri->ifindex; @@ -2973,7 +3289,7 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, int err = 0; if (ri->map) - return xdp_do_generic_redirect_map(dev, skb, xdp_prog); + return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog); ri->ifindex = 0; fwd = dev_get_by_index_rcu(dev_net(dev), index); @@ -2987,6 +3303,7 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, skb->dev = fwd; _trace_xdp_redirect(dev, xdp_prog, index); + generic_xdp_tx(skb, xdp_prog); return 0; err: _trace_xdp_redirect_err(dev, xdp_prog, index, err); @@ -3045,27 +3362,6 @@ static const struct bpf_func_proto bpf_xdp_redirect_map_proto = { .arg3_type = ARG_ANYTHING, }; -bool bpf_helper_changes_pkt_data(void *func) -{ - if (func == bpf_skb_vlan_push || - func == bpf_skb_vlan_pop || - func == bpf_skb_store_bytes || - func == bpf_skb_change_proto || - func == bpf_skb_change_head || - func == bpf_skb_change_tail || - func == bpf_skb_adjust_room || - func == bpf_skb_pull_data || - func == bpf_clone_redirect || - func == bpf_l3_csum_replace || - func == bpf_l4_csum_replace || - func == bpf_xdp_adjust_head || - func == bpf_xdp_adjust_meta || - func == bpf_msg_pull_data) - return true; - - return false; -} - static unsigned long bpf_skb_copy(void *dst_buff, const void *skb, unsigned long off, unsigned long len) { @@ -3148,6 +3444,7 @@ set_compat: to->tunnel_id = be64_to_cpu(info->key.tun_id); to->tunnel_tos = info->key.tos; to->tunnel_ttl = info->key.ttl; + to->tunnel_ext = 0; if (flags & BPF_F_TUNINFO_IPV6) { memcpy(to->remote_ipv6, &info->key.u.ipv6.src, @@ -3155,6 +3452,8 @@ set_compat: to->tunnel_label = be32_to_cpu(info->key.label); } else { to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src); + memset(&to->remote_ipv6[1], 0, sizeof(__u32) * 3); + to->tunnel_label = 0; } if (unlikely(size != sizeof(struct bpf_tunnel_key))) @@ -3364,6 +3663,27 @@ static const struct bpf_func_proto bpf_skb_under_cgroup_proto = { .arg3_type = ARG_ANYTHING, }; +#ifdef CONFIG_SOCK_CGROUP_DATA +BPF_CALL_1(bpf_skb_cgroup_id, const struct sk_buff *, skb) +{ + struct sock *sk = skb_to_full_sk(skb); + struct cgroup *cgrp; + + if (!sk || !sk_fullsock(sk)) + return 0; + + cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + return cgrp->kn->id.id; +} + +static const struct bpf_func_proto bpf_skb_cgroup_id_proto = { + .func = bpf_skb_cgroup_id, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; +#endif + static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff, unsigned long off, unsigned long len) { @@ -3711,6 +4031,603 @@ static const struct bpf_func_proto bpf_bind_proto = { .arg3_type = ARG_CONST_SIZE, }; +#ifdef CONFIG_XFRM +BPF_CALL_5(bpf_skb_get_xfrm_state, struct sk_buff *, skb, u32, index, + struct bpf_xfrm_state *, to, u32, size, u64, flags) +{ + const struct sec_path *sp = skb_sec_path(skb); + const struct xfrm_state *x; + + if (!sp || unlikely(index >= sp->len || flags)) + goto err_clear; + + x = sp->xvec[index]; + + if (unlikely(size != sizeof(struct bpf_xfrm_state))) + goto err_clear; + + to->reqid = x->props.reqid; + to->spi = x->id.spi; + to->family = x->props.family; + to->ext = 0; + + if (to->family == AF_INET6) { + memcpy(to->remote_ipv6, x->props.saddr.a6, + sizeof(to->remote_ipv6)); + } else { + to->remote_ipv4 = x->props.saddr.a4; + memset(&to->remote_ipv6[1], 0, sizeof(__u32) * 3); + } + + return 0; +err_clear: + memset(to, 0, size); + return -EINVAL; +} + +static const struct bpf_func_proto bpf_skb_get_xfrm_state_proto = { + .func = bpf_skb_get_xfrm_state, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_UNINIT_MEM, + .arg4_type = ARG_CONST_SIZE, + .arg5_type = ARG_ANYTHING, +}; +#endif + +#if IS_ENABLED(CONFIG_INET) || IS_ENABLED(CONFIG_IPV6) +static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params, + const struct neighbour *neigh, + const struct net_device *dev) +{ + memcpy(params->dmac, neigh->ha, ETH_ALEN); + memcpy(params->smac, dev->dev_addr, ETH_ALEN); + params->h_vlan_TCI = 0; + params->h_vlan_proto = 0; + + return dev->ifindex; +} +#endif + +#if IS_ENABLED(CONFIG_INET) +static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, + u32 flags, bool check_mtu) +{ + struct in_device *in_dev; + struct neighbour *neigh; + struct net_device *dev; + struct fib_result res; + struct fib_nh *nh; + struct flowi4 fl4; + int err; + u32 mtu; + + dev = dev_get_by_index_rcu(net, params->ifindex); + if (unlikely(!dev)) + return -ENODEV; + + /* verify forwarding is enabled on this interface */ + in_dev = __in_dev_get_rcu(dev); + if (unlikely(!in_dev || !IN_DEV_FORWARD(in_dev))) + return 0; + + if (flags & BPF_FIB_LOOKUP_OUTPUT) { + fl4.flowi4_iif = 1; + fl4.flowi4_oif = params->ifindex; + } else { + fl4.flowi4_iif = params->ifindex; + fl4.flowi4_oif = 0; + } + fl4.flowi4_tos = params->tos & IPTOS_RT_MASK; + fl4.flowi4_scope = RT_SCOPE_UNIVERSE; + fl4.flowi4_flags = 0; + + fl4.flowi4_proto = params->l4_protocol; + fl4.daddr = params->ipv4_dst; + fl4.saddr = params->ipv4_src; + fl4.fl4_sport = params->sport; + fl4.fl4_dport = params->dport; + + if (flags & BPF_FIB_LOOKUP_DIRECT) { + u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN; + struct fib_table *tb; + + tb = fib_get_table(net, tbid); + if (unlikely(!tb)) + return 0; + + err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF); + } else { + fl4.flowi4_mark = 0; + fl4.flowi4_secid = 0; + fl4.flowi4_tun_key.tun_id = 0; + fl4.flowi4_uid = sock_net_uid(net, NULL); + + err = fib_lookup(net, &fl4, &res, FIB_LOOKUP_NOREF); + } + + if (err || res.type != RTN_UNICAST) + return 0; + + if (res.fi->fib_nhs > 1) + fib_select_path(net, &res, &fl4, NULL); + + if (check_mtu) { + mtu = ip_mtu_from_fib_result(&res, params->ipv4_dst); + if (params->tot_len > mtu) + return 0; + } + + nh = &res.fi->fib_nh[res.nh_sel]; + + /* do not handle lwt encaps right now */ + if (nh->nh_lwtstate) + return 0; + + dev = nh->nh_dev; + if (unlikely(!dev)) + return 0; + + if (nh->nh_gw) + params->ipv4_dst = nh->nh_gw; + + params->rt_metric = res.fi->fib_priority; + + /* xdp and cls_bpf programs are run in RCU-bh so + * rcu_read_lock_bh is not needed here + */ + neigh = __ipv4_neigh_lookup_noref(dev, (__force u32)params->ipv4_dst); + if (neigh) + return bpf_fib_set_fwd_params(params, neigh, dev); + + return 0; +} +#endif + +#if IS_ENABLED(CONFIG_IPV6) +static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, + u32 flags, bool check_mtu) +{ + struct in6_addr *src = (struct in6_addr *) params->ipv6_src; + struct in6_addr *dst = (struct in6_addr *) params->ipv6_dst; + struct neighbour *neigh; + struct net_device *dev; + struct inet6_dev *idev; + struct fib6_info *f6i; + struct flowi6 fl6; + int strict = 0; + int oif; + u32 mtu; + + /* link local addresses are never forwarded */ + if (rt6_need_strict(dst) || rt6_need_strict(src)) + return 0; + + dev = dev_get_by_index_rcu(net, params->ifindex); + if (unlikely(!dev)) + return -ENODEV; + + idev = __in6_dev_get_safely(dev); + if (unlikely(!idev || !net->ipv6.devconf_all->forwarding)) + return 0; + + if (flags & BPF_FIB_LOOKUP_OUTPUT) { + fl6.flowi6_iif = 1; + oif = fl6.flowi6_oif = params->ifindex; + } else { + oif = fl6.flowi6_iif = params->ifindex; + fl6.flowi6_oif = 0; + strict = RT6_LOOKUP_F_HAS_SADDR; + } + fl6.flowlabel = params->flowinfo; + fl6.flowi6_scope = 0; + fl6.flowi6_flags = 0; + fl6.mp_hash = 0; + + fl6.flowi6_proto = params->l4_protocol; + fl6.daddr = *dst; + fl6.saddr = *src; + fl6.fl6_sport = params->sport; + fl6.fl6_dport = params->dport; + + if (flags & BPF_FIB_LOOKUP_DIRECT) { + u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN; + struct fib6_table *tb; + + tb = ipv6_stub->fib6_get_table(net, tbid); + if (unlikely(!tb)) + return 0; + + f6i = ipv6_stub->fib6_table_lookup(net, tb, oif, &fl6, strict); + } else { + fl6.flowi6_mark = 0; + fl6.flowi6_secid = 0; + fl6.flowi6_tun_key.tun_id = 0; + fl6.flowi6_uid = sock_net_uid(net, NULL); + + f6i = ipv6_stub->fib6_lookup(net, oif, &fl6, strict); + } + + if (unlikely(IS_ERR_OR_NULL(f6i) || f6i == net->ipv6.fib6_null_entry)) + return 0; + + if (unlikely(f6i->fib6_flags & RTF_REJECT || + f6i->fib6_type != RTN_UNICAST)) + return 0; + + if (f6i->fib6_nsiblings && fl6.flowi6_oif == 0) + f6i = ipv6_stub->fib6_multipath_select(net, f6i, &fl6, + fl6.flowi6_oif, NULL, + strict); + + if (check_mtu) { + mtu = ipv6_stub->ip6_mtu_from_fib6(f6i, dst, src); + if (params->tot_len > mtu) + return 0; + } + + if (f6i->fib6_nh.nh_lwtstate) + return 0; + + if (f6i->fib6_flags & RTF_GATEWAY) + *dst = f6i->fib6_nh.nh_gw; + + dev = f6i->fib6_nh.nh_dev; + params->rt_metric = f6i->fib6_metric; + + /* xdp and cls_bpf programs are run in RCU-bh so rcu_read_lock_bh is + * not needed here. Can not use __ipv6_neigh_lookup_noref here + * because we need to get nd_tbl via the stub + */ + neigh = ___neigh_lookup_noref(ipv6_stub->nd_tbl, neigh_key_eq128, + ndisc_hashfn, dst, dev); + if (neigh) + return bpf_fib_set_fwd_params(params, neigh, dev); + + return 0; +} +#endif + +BPF_CALL_4(bpf_xdp_fib_lookup, struct xdp_buff *, ctx, + struct bpf_fib_lookup *, params, int, plen, u32, flags) +{ + if (plen < sizeof(*params)) + return -EINVAL; + + if (flags & ~(BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT)) + return -EINVAL; + + switch (params->family) { +#if IS_ENABLED(CONFIG_INET) + case AF_INET: + return bpf_ipv4_fib_lookup(dev_net(ctx->rxq->dev), params, + flags, true); +#endif +#if IS_ENABLED(CONFIG_IPV6) + case AF_INET6: + return bpf_ipv6_fib_lookup(dev_net(ctx->rxq->dev), params, + flags, true); +#endif + } + return -EAFNOSUPPORT; +} + +static const struct bpf_func_proto bpf_xdp_fib_lookup_proto = { + .func = bpf_xdp_fib_lookup, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, +}; + +BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb, + struct bpf_fib_lookup *, params, int, plen, u32, flags) +{ + struct net *net = dev_net(skb->dev); + int index = -EAFNOSUPPORT; + + if (plen < sizeof(*params)) + return -EINVAL; + + if (flags & ~(BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT)) + return -EINVAL; + + switch (params->family) { +#if IS_ENABLED(CONFIG_INET) + case AF_INET: + index = bpf_ipv4_fib_lookup(net, params, flags, false); + break; +#endif +#if IS_ENABLED(CONFIG_IPV6) + case AF_INET6: + index = bpf_ipv6_fib_lookup(net, params, flags, false); + break; +#endif + } + + if (index > 0) { + struct net_device *dev; + + dev = dev_get_by_index_rcu(net, index); + if (!is_skb_forwardable(dev, skb)) + index = 0; + } + + return index; +} + +static const struct bpf_func_proto bpf_skb_fib_lookup_proto = { + .func = bpf_skb_fib_lookup, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, +}; + +#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) +static int bpf_push_seg6_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len) +{ + int err; + struct ipv6_sr_hdr *srh = (struct ipv6_sr_hdr *)hdr; + + if (!seg6_validate_srh(srh, len)) + return -EINVAL; + + switch (type) { + case BPF_LWT_ENCAP_SEG6_INLINE: + if (skb->protocol != htons(ETH_P_IPV6)) + return -EBADMSG; + + err = seg6_do_srh_inline(skb, srh); + break; + case BPF_LWT_ENCAP_SEG6: + skb_reset_inner_headers(skb); + skb->encapsulation = 1; + err = seg6_do_srh_encap(skb, srh, IPPROTO_IPV6); + break; + default: + return -EINVAL; + } + + bpf_compute_data_pointers(skb); + if (err) + return err; + + ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); + skb_set_transport_header(skb, sizeof(struct ipv6hdr)); + + return seg6_lookup_nexthop(skb, NULL, 0); +} +#endif /* CONFIG_IPV6_SEG6_BPF */ + +BPF_CALL_4(bpf_lwt_push_encap, struct sk_buff *, skb, u32, type, void *, hdr, + u32, len) +{ + switch (type) { +#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) + case BPF_LWT_ENCAP_SEG6: + case BPF_LWT_ENCAP_SEG6_INLINE: + return bpf_push_seg6_encap(skb, type, hdr, len); +#endif + default: + return -EINVAL; + } +} + +static const struct bpf_func_proto bpf_lwt_push_encap_proto = { + .func = bpf_lwt_push_encap, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_CONST_SIZE +}; + +BPF_CALL_4(bpf_lwt_seg6_store_bytes, struct sk_buff *, skb, u32, offset, + const void *, from, u32, len) +{ +#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) + struct seg6_bpf_srh_state *srh_state = + this_cpu_ptr(&seg6_bpf_srh_states); + void *srh_tlvs, *srh_end, *ptr; + struct ipv6_sr_hdr *srh; + int srhoff = 0; + + if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) + return -EINVAL; + + srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); + srh_tlvs = (void *)((char *)srh + ((srh->first_segment + 1) << 4)); + srh_end = (void *)((char *)srh + sizeof(*srh) + srh_state->hdrlen); + + ptr = skb->data + offset; + if (ptr >= srh_tlvs && ptr + len <= srh_end) + srh_state->valid = 0; + else if (ptr < (void *)&srh->flags || + ptr + len > (void *)&srh->segments) + return -EFAULT; + + if (unlikely(bpf_try_make_writable(skb, offset + len))) + return -EFAULT; + + memcpy(skb->data + offset, from, len); + return 0; +#else /* CONFIG_IPV6_SEG6_BPF */ + return -EOPNOTSUPP; +#endif +} + +static const struct bpf_func_proto bpf_lwt_seg6_store_bytes_proto = { + .func = bpf_lwt_seg6_store_bytes, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_CONST_SIZE +}; + +BPF_CALL_4(bpf_lwt_seg6_action, struct sk_buff *, skb, + u32, action, void *, param, u32, param_len) +{ +#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) + struct seg6_bpf_srh_state *srh_state = + this_cpu_ptr(&seg6_bpf_srh_states); + struct ipv6_sr_hdr *srh; + int srhoff = 0; + int err; + + if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) + return -EINVAL; + srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); + + if (!srh_state->valid) { + if (unlikely((srh_state->hdrlen & 7) != 0)) + return -EBADMSG; + + srh->hdrlen = (u8)(srh_state->hdrlen >> 3); + if (unlikely(!seg6_validate_srh(srh, (srh->hdrlen + 1) << 3))) + return -EBADMSG; + + srh_state->valid = 1; + } + + switch (action) { + case SEG6_LOCAL_ACTION_END_X: + if (param_len != sizeof(struct in6_addr)) + return -EINVAL; + return seg6_lookup_nexthop(skb, (struct in6_addr *)param, 0); + case SEG6_LOCAL_ACTION_END_T: + if (param_len != sizeof(int)) + return -EINVAL; + return seg6_lookup_nexthop(skb, NULL, *(int *)param); + case SEG6_LOCAL_ACTION_END_B6: + err = bpf_push_seg6_encap(skb, BPF_LWT_ENCAP_SEG6_INLINE, + param, param_len); + if (!err) + srh_state->hdrlen = + ((struct ipv6_sr_hdr *)param)->hdrlen << 3; + return err; + case SEG6_LOCAL_ACTION_END_B6_ENCAP: + err = bpf_push_seg6_encap(skb, BPF_LWT_ENCAP_SEG6, + param, param_len); + if (!err) + srh_state->hdrlen = + ((struct ipv6_sr_hdr *)param)->hdrlen << 3; + return err; + default: + return -EINVAL; + } +#else /* CONFIG_IPV6_SEG6_BPF */ + return -EOPNOTSUPP; +#endif +} + +static const struct bpf_func_proto bpf_lwt_seg6_action_proto = { + .func = bpf_lwt_seg6_action, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_CONST_SIZE +}; + +BPF_CALL_3(bpf_lwt_seg6_adjust_srh, struct sk_buff *, skb, u32, offset, + s32, len) +{ +#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) + struct seg6_bpf_srh_state *srh_state = + this_cpu_ptr(&seg6_bpf_srh_states); + void *srh_end, *srh_tlvs, *ptr; + struct ipv6_sr_hdr *srh; + struct ipv6hdr *hdr; + int srhoff = 0; + int ret; + + if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) + return -EINVAL; + srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); + + srh_tlvs = (void *)((unsigned char *)srh + sizeof(*srh) + + ((srh->first_segment + 1) << 4)); + srh_end = (void *)((unsigned char *)srh + sizeof(*srh) + + srh_state->hdrlen); + ptr = skb->data + offset; + + if (unlikely(ptr < srh_tlvs || ptr > srh_end)) + return -EFAULT; + if (unlikely(len < 0 && (void *)((char *)ptr - len) > srh_end)) + return -EFAULT; + + if (len > 0) { + ret = skb_cow_head(skb, len); + if (unlikely(ret < 0)) + return ret; + + ret = bpf_skb_net_hdr_push(skb, offset, len); + } else { + ret = bpf_skb_net_hdr_pop(skb, offset, -1 * len); + } + + bpf_compute_data_pointers(skb); + if (unlikely(ret < 0)) + return ret; + + hdr = (struct ipv6hdr *)skb->data; + hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); + + srh_state->hdrlen += len; + srh_state->valid = 0; + return 0; +#else /* CONFIG_IPV6_SEG6_BPF */ + return -EOPNOTSUPP; +#endif +} + +static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = { + .func = bpf_lwt_seg6_adjust_srh, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, +}; + +bool bpf_helper_changes_pkt_data(void *func) +{ + if (func == bpf_skb_vlan_push || + func == bpf_skb_vlan_pop || + func == bpf_skb_store_bytes || + func == bpf_skb_change_proto || + func == bpf_skb_change_head || + func == bpf_skb_change_tail || + func == bpf_skb_adjust_room || + func == bpf_skb_pull_data || + func == bpf_clone_redirect || + func == bpf_l3_csum_replace || + func == bpf_l4_csum_replace || + func == bpf_xdp_adjust_head || + func == bpf_xdp_adjust_meta || + func == bpf_msg_pull_data || + func == bpf_xdp_adjust_tail || + func == bpf_lwt_push_encap || + func == bpf_lwt_seg6_store_bytes || + func == bpf_lwt_seg6_adjust_srh || + func == bpf_lwt_seg6_action + ) + return true; + + return false; +} + static const struct bpf_func_proto * bpf_base_func_proto(enum bpf_func_id func_id) { @@ -3781,6 +4698,8 @@ sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) switch (func_id) { case BPF_FUNC_skb_load_bytes: return &bpf_skb_load_bytes_proto; + case BPF_FUNC_skb_load_bytes_relative: + return &bpf_skb_load_bytes_relative_proto; case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_proto; case BPF_FUNC_get_socket_uid: @@ -3798,6 +4717,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_skb_store_bytes_proto; case BPF_FUNC_skb_load_bytes: return &bpf_skb_load_bytes_proto; + case BPF_FUNC_skb_load_bytes_relative: + return &bpf_skb_load_bytes_relative_proto; case BPF_FUNC_skb_pull_data: return &bpf_skb_pull_data_proto; case BPF_FUNC_csum_diff: @@ -3852,6 +4773,16 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_socket_cookie_proto; case BPF_FUNC_get_socket_uid: return &bpf_get_socket_uid_proto; + case BPF_FUNC_fib_lookup: + return &bpf_skb_fib_lookup_proto; +#ifdef CONFIG_XFRM + case BPF_FUNC_skb_get_xfrm_state: + return &bpf_skb_get_xfrm_state_proto; +#endif +#ifdef CONFIG_SOCK_CGROUP_DATA + case BPF_FUNC_skb_cgroup_id: + return &bpf_skb_cgroup_id_proto; +#endif default: return bpf_base_func_proto(func_id); } @@ -3875,33 +4806,10 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_xdp_redirect_proto; case BPF_FUNC_redirect_map: return &bpf_xdp_redirect_map_proto; - default: - return bpf_base_func_proto(func_id); - } -} - -static const struct bpf_func_proto * -lwt_inout_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) -{ - switch (func_id) { - case BPF_FUNC_skb_load_bytes: - return &bpf_skb_load_bytes_proto; - case BPF_FUNC_skb_pull_data: - return &bpf_skb_pull_data_proto; - case BPF_FUNC_csum_diff: - return &bpf_csum_diff_proto; - case BPF_FUNC_get_cgroup_classid: - return &bpf_get_cgroup_classid_proto; - case BPF_FUNC_get_route_realm: - return &bpf_get_route_realm_proto; - case BPF_FUNC_get_hash_recalc: - return &bpf_get_hash_recalc_proto; - case BPF_FUNC_perf_event_output: - return &bpf_skb_event_output_proto; - case BPF_FUNC_get_smp_processor_id: - return &bpf_get_smp_processor_id_proto; - case BPF_FUNC_skb_under_cgroup: - return &bpf_skb_under_cgroup_proto; + case BPF_FUNC_xdp_adjust_tail: + return &bpf_xdp_adjust_tail_proto; + case BPF_FUNC_fib_lookup: + return &bpf_xdp_fib_lookup_proto; default: return bpf_base_func_proto(func_id); } @@ -3919,6 +4827,8 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sock_ops_cb_flags_set_proto; case BPF_FUNC_sock_map_update: return &bpf_sock_map_update_proto; + case BPF_FUNC_sock_hash_update: + return &bpf_sock_hash_update_proto; default: return bpf_base_func_proto(func_id); } @@ -3930,6 +4840,8 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) switch (func_id) { case BPF_FUNC_msg_redirect_map: return &bpf_msg_redirect_map_proto; + case BPF_FUNC_msg_redirect_hash: + return &bpf_msg_redirect_hash_proto; case BPF_FUNC_msg_apply_bytes: return &bpf_msg_apply_bytes_proto; case BPF_FUNC_msg_cork_bytes: @@ -3961,12 +4873,52 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_socket_uid_proto; case BPF_FUNC_sk_redirect_map: return &bpf_sk_redirect_map_proto; + case BPF_FUNC_sk_redirect_hash: + return &bpf_sk_redirect_hash_proto; default: return bpf_base_func_proto(func_id); } } static const struct bpf_func_proto * +lwt_out_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_skb_load_bytes: + return &bpf_skb_load_bytes_proto; + case BPF_FUNC_skb_pull_data: + return &bpf_skb_pull_data_proto; + case BPF_FUNC_csum_diff: + return &bpf_csum_diff_proto; + case BPF_FUNC_get_cgroup_classid: + return &bpf_get_cgroup_classid_proto; + case BPF_FUNC_get_route_realm: + return &bpf_get_route_realm_proto; + case BPF_FUNC_get_hash_recalc: + return &bpf_get_hash_recalc_proto; + case BPF_FUNC_perf_event_output: + return &bpf_skb_event_output_proto; + case BPF_FUNC_get_smp_processor_id: + return &bpf_get_smp_processor_id_proto; + case BPF_FUNC_skb_under_cgroup: + return &bpf_skb_under_cgroup_proto; + default: + return bpf_base_func_proto(func_id); + } +} + +static const struct bpf_func_proto * +lwt_in_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_lwt_push_encap: + return &bpf_lwt_push_encap_proto; + default: + return lwt_out_func_proto(func_id, prog); + } +} + +static const struct bpf_func_proto * lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { @@ -3997,7 +4949,22 @@ lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_set_hash_invalid: return &bpf_set_hash_invalid_proto; default: - return lwt_inout_func_proto(func_id, prog); + return lwt_out_func_proto(func_id, prog); + } +} + +static const struct bpf_func_proto * +lwt_seg6local_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_lwt_seg6_store_bytes: + return &bpf_lwt_seg6_store_bytes_proto; + case BPF_FUNC_lwt_seg6_action: + return &bpf_lwt_seg6_action_proto; + case BPF_FUNC_lwt_seg6_adjust_srh: + return &bpf_lwt_seg6_adjust_srh_proto; + default: + return lwt_out_func_proto(func_id, prog); } } @@ -4105,7 +5072,6 @@ static bool lwt_is_valid_access(int off, int size, return bpf_skb_is_valid_access(off, size, type, prog, info); } - /* Attach type specific accesses */ static bool __sock_filter_check_attach_type(int off, enum bpf_access_type access_type, @@ -4221,6 +5187,41 @@ static int bpf_unclone_prologue(struct bpf_insn *insn_buf, bool direct_write, return insn - insn_buf; } +static int bpf_gen_ld_abs(const struct bpf_insn *orig, + struct bpf_insn *insn_buf) +{ + bool indirect = BPF_MODE(orig->code) == BPF_IND; + struct bpf_insn *insn = insn_buf; + + /* We're guaranteed here that CTX is in R6. */ + *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_CTX); + if (!indirect) { + *insn++ = BPF_MOV64_IMM(BPF_REG_2, orig->imm); + } else { + *insn++ = BPF_MOV64_REG(BPF_REG_2, orig->src_reg); + if (orig->imm) + *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, orig->imm); + } + + switch (BPF_SIZE(orig->code)) { + case BPF_B: + *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_8_no_cache); + break; + case BPF_H: + *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_16_no_cache); + break; + case BPF_W: + *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_32_no_cache); + break; + } + + *insn++ = BPF_JMP_IMM(BPF_JSGE, BPF_REG_0, 0, 2); + *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_0, BPF_REG_0); + *insn++ = BPF_EXIT_INSN(); + + return insn - insn_buf; +} + static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write, const struct bpf_prog *prog) { @@ -4279,8 +5280,15 @@ static bool xdp_is_valid_access(int off, int size, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { - if (type == BPF_WRITE) + if (type == BPF_WRITE) { + if (bpf_prog_is_dev_bound(prog->aux)) { + switch (off) { + case offsetof(struct xdp_md, rx_queue_index): + return __is_valid_xdp_access(off, size); + } + } return false; + } switch (off) { case offsetof(struct xdp_md, data): @@ -4327,6 +5335,7 @@ static bool sock_addr_is_valid_access(int off, int size, switch (prog->expected_attach_type) { case BPF_CGROUP_INET4_BIND: case BPF_CGROUP_INET4_CONNECT: + case BPF_CGROUP_UDP4_SENDMSG: break; default: return false; @@ -4336,6 +5345,24 @@ static bool sock_addr_is_valid_access(int off, int size, switch (prog->expected_attach_type) { case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_UDP6_SENDMSG: + break; + default: + return false; + } + break; + case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4): + switch (prog->expected_attach_type) { + case BPF_CGROUP_UDP4_SENDMSG: + break; + default: + return false; + } + break; + case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0], + msg_src_ip6[3]): + switch (prog->expected_attach_type) { + case BPF_CGROUP_UDP6_SENDMSG: break; default: return false; @@ -4346,6 +5373,9 @@ static bool sock_addr_is_valid_access(int off, int size, switch (off) { case bpf_ctx_range(struct bpf_sock_addr, user_ip4): case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]): + case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4): + case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0], + msg_src_ip6[3]): /* Only narrow read access allowed for now. */ if (type == BPF_READ) { bpf_ctx_record_field_size(info, size_default); @@ -4465,18 +5495,23 @@ static bool sk_msg_is_valid_access(int off, int size, switch (off) { case offsetof(struct sk_msg_md, data): info->reg_type = PTR_TO_PACKET; + if (size != sizeof(__u64)) + return false; break; case offsetof(struct sk_msg_md, data_end): info->reg_type = PTR_TO_PACKET_END; + if (size != sizeof(__u64)) + return false; break; + default: + if (size != sizeof(__u32)) + return false; } if (off < 0 || off >= sizeof(struct sk_msg_md)) return false; if (off % size != 0) return false; - if (size != sizeof(__u64)) - return false; return true; } @@ -5095,6 +6130,23 @@ static u32 sock_addr_convert_ctx_access(enum bpf_access_type type, *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_PROTO_SHIFT); break; + + case offsetof(struct bpf_sock_addr, msg_src_ip4): + /* Treat t_ctx as struct in_addr for msg_src_ip4. */ + SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( + struct bpf_sock_addr_kern, struct in_addr, t_ctx, + s_addr, BPF_SIZE(si->code), 0, tmp_reg); + break; + + case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0], + msg_src_ip6[3]): + off = si->off; + off -= offsetof(struct bpf_sock_addr, msg_src_ip6[0]); + /* Treat t_ctx as struct in6_addr for msg_src_ip6. */ + SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( + struct bpf_sock_addr_kern, struct in6_addr, t_ctx, + s6_addr32[0], BPF_SIZE(si->code), off, tmp_reg); + break; } return insn - insn_buf; @@ -5152,7 +6204,8 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, break; case offsetof(struct bpf_sock_ops, local_ip4): - BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_rcv_saddr) != 4); + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, + skc_rcv_saddr) != 4); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct bpf_sock_ops_kern, sk), @@ -5469,6 +6522,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; +#if IS_ENABLED(CONFIG_IPV6) + int off; +#endif switch (si->off) { case offsetof(struct sk_msg_md, data): @@ -5481,6 +6537,107 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type, si->dst_reg, si->src_reg, offsetof(struct sk_msg_buff, data_end)); break; + case offsetof(struct sk_msg_md, family): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_family) != 2); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct sk_msg_buff, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_msg_buff, sk)); + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, skc_family)); + break; + + case offsetof(struct sk_msg_md, remote_ip4): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_daddr) != 4); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct sk_msg_buff, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_msg_buff, sk)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, skc_daddr)); + break; + + case offsetof(struct sk_msg_md, local_ip4): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, + skc_rcv_saddr) != 4); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct sk_msg_buff, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_msg_buff, sk)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, + skc_rcv_saddr)); + break; + + case offsetof(struct sk_msg_md, remote_ip6[0]) ... + offsetof(struct sk_msg_md, remote_ip6[3]): +#if IS_ENABLED(CONFIG_IPV6) + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, + skc_v6_daddr.s6_addr32[0]) != 4); + + off = si->off; + off -= offsetof(struct sk_msg_md, remote_ip6[0]); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct sk_msg_buff, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_msg_buff, sk)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, + skc_v6_daddr.s6_addr32[0]) + + off); +#else + *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); +#endif + break; + + case offsetof(struct sk_msg_md, local_ip6[0]) ... + offsetof(struct sk_msg_md, local_ip6[3]): +#if IS_ENABLED(CONFIG_IPV6) + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, + skc_v6_rcv_saddr.s6_addr32[0]) != 4); + + off = si->off; + off -= offsetof(struct sk_msg_md, local_ip6[0]); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct sk_msg_buff, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_msg_buff, sk)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, + skc_v6_rcv_saddr.s6_addr32[0]) + + off); +#else + *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); +#endif + break; + + case offsetof(struct sk_msg_md, remote_port): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_dport) != 2); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct sk_msg_buff, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_msg_buff, sk)); + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, skc_dport)); +#ifndef __BIG_ENDIAN_BITFIELD + *insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16); +#endif + break; + + case offsetof(struct sk_msg_md, local_port): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_num) != 2); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct sk_msg_buff, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_msg_buff, sk)); + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, skc_num)); + break; } return insn - insn_buf; @@ -5490,6 +6647,7 @@ const struct bpf_verifier_ops sk_filter_verifier_ops = { .get_func_proto = sk_filter_func_proto, .is_valid_access = sk_filter_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, + .gen_ld_abs = bpf_gen_ld_abs, }; const struct bpf_prog_ops sk_filter_prog_ops = { @@ -5501,6 +6659,7 @@ const struct bpf_verifier_ops tc_cls_act_verifier_ops = { .is_valid_access = tc_cls_act_is_valid_access, .convert_ctx_access = tc_cls_act_convert_ctx_access, .gen_prologue = tc_cls_act_prologue, + .gen_ld_abs = bpf_gen_ld_abs, }; const struct bpf_prog_ops tc_cls_act_prog_ops = { @@ -5527,13 +6686,23 @@ const struct bpf_prog_ops cg_skb_prog_ops = { .test_run = bpf_prog_test_run_skb, }; -const struct bpf_verifier_ops lwt_inout_verifier_ops = { - .get_func_proto = lwt_inout_func_proto, +const struct bpf_verifier_ops lwt_in_verifier_ops = { + .get_func_proto = lwt_in_func_proto, .is_valid_access = lwt_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, }; -const struct bpf_prog_ops lwt_inout_prog_ops = { +const struct bpf_prog_ops lwt_in_prog_ops = { + .test_run = bpf_prog_test_run_skb, +}; + +const struct bpf_verifier_ops lwt_out_verifier_ops = { + .get_func_proto = lwt_out_func_proto, + .is_valid_access = lwt_is_valid_access, + .convert_ctx_access = bpf_convert_ctx_access, +}; + +const struct bpf_prog_ops lwt_out_prog_ops = { .test_run = bpf_prog_test_run_skb, }; @@ -5548,6 +6717,16 @@ const struct bpf_prog_ops lwt_xmit_prog_ops = { .test_run = bpf_prog_test_run_skb, }; +const struct bpf_verifier_ops lwt_seg6local_verifier_ops = { + .get_func_proto = lwt_seg6local_func_proto, + .is_valid_access = lwt_is_valid_access, + .convert_ctx_access = bpf_convert_ctx_access, +}; + +const struct bpf_prog_ops lwt_seg6local_prog_ops = { + .test_run = bpf_prog_test_run_skb, +}; + const struct bpf_verifier_ops cg_sock_verifier_ops = { .get_func_proto = sock_filter_func_proto, .is_valid_access = sock_filter_is_valid_access, diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index d29f09bc5ff9..53f96e4f7bf5 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -1253,7 +1253,7 @@ __u32 skb_get_hash_perturb(const struct sk_buff *skb, u32 perturb) EXPORT_SYMBOL(skb_get_hash_perturb); u32 __skb_get_poff(const struct sk_buff *skb, void *data, - const struct flow_keys *keys, int hlen) + const struct flow_keys_basic *keys, int hlen) { u32 poff = keys->control.thoff; @@ -1314,9 +1314,9 @@ u32 __skb_get_poff(const struct sk_buff *skb, void *data, */ u32 skb_get_poff(const struct sk_buff *skb) { - struct flow_keys keys; + struct flow_keys_basic keys; - if (!skb_flow_dissect_flow_keys(skb, &keys, 0)) + if (!skb_flow_dissect_flow_keys_basic(skb, &keys, NULL, 0, 0, 0, 0)) return 0; return __skb_get_poff(skb, skb->data, &keys, skb_headlen(skb)); @@ -1334,7 +1334,7 @@ __u32 __get_hash_from_flowi6(const struct flowi6 *fl6, struct flow_keys *keys) keys->ports.src = fl6->fl6_sport; keys->ports.dst = fl6->fl6_dport; keys->keyid.keyid = fl6->fl6_gre_key; - keys->tags.flow_label = (__force u32)fl6->flowlabel; + keys->tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6); keys->basic.ip_proto = fl6->flowi6_proto; return flow_hash_from_keys(keys); @@ -1403,7 +1403,7 @@ static const struct flow_dissector_key flow_keys_dissector_symmetric_keys[] = { }, }; -static const struct flow_dissector_key flow_keys_buf_dissector_keys[] = { +static const struct flow_dissector_key flow_keys_basic_dissector_keys[] = { { .key_id = FLOW_DISSECTOR_KEY_CONTROL, .offset = offsetof(struct flow_keys, control), @@ -1417,7 +1417,8 @@ static const struct flow_dissector_key flow_keys_buf_dissector_keys[] = { struct flow_dissector flow_keys_dissector __read_mostly; EXPORT_SYMBOL(flow_keys_dissector); -struct flow_dissector flow_keys_buf_dissector __read_mostly; +struct flow_dissector flow_keys_basic_dissector __read_mostly; +EXPORT_SYMBOL(flow_keys_basic_dissector); static int __init init_default_flow_dissectors(void) { @@ -1427,9 +1428,9 @@ static int __init init_default_flow_dissectors(void) skb_flow_dissector_init(&flow_keys_dissector_symmetric, flow_keys_dissector_symmetric_keys, ARRAY_SIZE(flow_keys_dissector_symmetric_keys)); - skb_flow_dissector_init(&flow_keys_buf_dissector, - flow_keys_buf_dissector_keys, - ARRAY_SIZE(flow_keys_buf_dissector_keys)); + skb_flow_dissector_init(&flow_keys_basic_dissector, + flow_keys_basic_dissector_keys, + ARRAY_SIZE(flow_keys_basic_dissector_keys)); return 0; } diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 1fb43bff417d..a7a9c3d738ba 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -820,7 +820,8 @@ static void neigh_periodic_work(struct work_struct *work) write_lock(&n->lock); state = n->nud_state; - if (state & (NUD_PERMANENT | NUD_IN_TIMER)) { + if ((state & (NUD_PERMANENT | NUD_IN_TIMER)) || + (n->flags & NTF_EXT_LEARNED)) { write_unlock(&n->lock); goto next_elt; } @@ -1136,6 +1137,8 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, if (neigh->dead) goto out; + neigh_update_ext_learned(neigh, flags, ¬ify); + if (!(new & NUD_VALID)) { neigh_del_timer(neigh); if (old & NUD_CONNECTED) @@ -1781,6 +1784,9 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, flags &= ~NEIGH_UPDATE_F_OVERRIDE; } + if (ndm->ndm_flags & NTF_EXT_LEARNED) + flags |= NEIGH_UPDATE_F_EXT_LEARNED; + if (ndm->ndm_flags & NTF_USE) { neigh_event_send(neigh, NULL); err = 0; diff --git a/net/core/net-traces.c b/net/core/net-traces.c index 380934580fa1..419af6dfe29f 100644 --- a/net/core/net-traces.c +++ b/net/core/net-traces.c @@ -35,10 +35,6 @@ #include <trace/events/tcp.h> #include <trace/events/fib.h> #include <trace/events/qdisc.h> -#if IS_ENABLED(CONFIG_IPV6) -#include <trace/events/fib6.h> -EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup); -#endif #if IS_ENABLED(CONFIG_BRIDGE) #include <trace/events/bridge.h> EXPORT_TRACEPOINT_SYMBOL_GPL(br_fdb_add); diff --git a/net/core/page_pool.c b/net/core/page_pool.c new file mode 100644 index 000000000000..68bf07206744 --- /dev/null +++ b/net/core/page_pool.c @@ -0,0 +1,317 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * page_pool.c + * Author: Jesper Dangaard Brouer <netoptimizer@brouer.com> + * Copyright (C) 2016 Red Hat, Inc. + */ +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/slab.h> + +#include <net/page_pool.h> +#include <linux/dma-direction.h> +#include <linux/dma-mapping.h> +#include <linux/page-flags.h> +#include <linux/mm.h> /* for __put_page() */ + +static int page_pool_init(struct page_pool *pool, + const struct page_pool_params *params) +{ + unsigned int ring_qsize = 1024; /* Default */ + + memcpy(&pool->p, params, sizeof(pool->p)); + + /* Validate only known flags were used */ + if (pool->p.flags & ~(PP_FLAG_ALL)) + return -EINVAL; + + if (pool->p.pool_size) + ring_qsize = pool->p.pool_size; + + /* Sanity limit mem that can be pinned down */ + if (ring_qsize > 32768) + return -E2BIG; + + /* DMA direction is either DMA_FROM_DEVICE or DMA_BIDIRECTIONAL. + * DMA_BIDIRECTIONAL is for allowing page used for DMA sending, + * which is the XDP_TX use-case. + */ + if ((pool->p.dma_dir != DMA_FROM_DEVICE) && + (pool->p.dma_dir != DMA_BIDIRECTIONAL)) + return -EINVAL; + + if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0) + return -ENOMEM; + + return 0; +} + +struct page_pool *page_pool_create(const struct page_pool_params *params) +{ + struct page_pool *pool; + int err = 0; + + pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, params->nid); + if (!pool) + return ERR_PTR(-ENOMEM); + + err = page_pool_init(pool, params); + if (err < 0) { + pr_warn("%s() gave up with errno %d\n", __func__, err); + kfree(pool); + return ERR_PTR(err); + } + return pool; +} +EXPORT_SYMBOL(page_pool_create); + +/* fast path */ +static struct page *__page_pool_get_cached(struct page_pool *pool) +{ + struct ptr_ring *r = &pool->ring; + struct page *page; + + /* Quicker fallback, avoid locks when ring is empty */ + if (__ptr_ring_empty(r)) + return NULL; + + /* Test for safe-context, caller should provide this guarantee */ + if (likely(in_serving_softirq())) { + if (likely(pool->alloc.count)) { + /* Fast-path */ + page = pool->alloc.cache[--pool->alloc.count]; + return page; + } + /* Slower-path: Alloc array empty, time to refill + * + * Open-coded bulk ptr_ring consumer. + * + * Discussion: the ring consumer lock is not really + * needed due to the softirq/NAPI protection, but + * later need the ability to reclaim pages on the + * ring. Thus, keeping the locks. + */ + spin_lock(&r->consumer_lock); + while ((page = __ptr_ring_consume(r))) { + if (pool->alloc.count == PP_ALLOC_CACHE_REFILL) + break; + pool->alloc.cache[pool->alloc.count++] = page; + } + spin_unlock(&r->consumer_lock); + return page; + } + + /* Slow-path: Get page from locked ring queue */ + page = ptr_ring_consume(&pool->ring); + return page; +} + +/* slow path */ +noinline +static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool, + gfp_t _gfp) +{ + struct page *page; + gfp_t gfp = _gfp; + dma_addr_t dma; + + /* We could always set __GFP_COMP, and avoid this branch, as + * prep_new_page() can handle order-0 with __GFP_COMP. + */ + if (pool->p.order) + gfp |= __GFP_COMP; + + /* FUTURE development: + * + * Current slow-path essentially falls back to single page + * allocations, which doesn't improve performance. This code + * need bulk allocation support from the page allocator code. + */ + + /* Cache was empty, do real allocation */ + page = alloc_pages_node(pool->p.nid, gfp, pool->p.order); + if (!page) + return NULL; + + if (!(pool->p.flags & PP_FLAG_DMA_MAP)) + goto skip_dma_map; + + /* Setup DMA mapping: use page->private for DMA-addr + * This mapping is kept for lifetime of page, until leaving pool. + */ + dma = dma_map_page(pool->p.dev, page, 0, + (PAGE_SIZE << pool->p.order), + pool->p.dma_dir); + if (dma_mapping_error(pool->p.dev, dma)) { + put_page(page); + return NULL; + } + set_page_private(page, dma); /* page->private = dma; */ + +skip_dma_map: + /* When page just alloc'ed is should/must have refcnt 1. */ + return page; +} + +/* For using page_pool replace: alloc_pages() API calls, but provide + * synchronization guarantee for allocation side. + */ +struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp) +{ + struct page *page; + + /* Fast-path: Get a page from cache */ + page = __page_pool_get_cached(pool); + if (page) + return page; + + /* Slow-path: cache empty, do real allocation */ + page = __page_pool_alloc_pages_slow(pool, gfp); + return page; +} +EXPORT_SYMBOL(page_pool_alloc_pages); + +/* Cleanup page_pool state from page */ +static void __page_pool_clean_page(struct page_pool *pool, + struct page *page) +{ + if (!(pool->p.flags & PP_FLAG_DMA_MAP)) + return; + + /* DMA unmap */ + dma_unmap_page(pool->p.dev, page_private(page), + PAGE_SIZE << pool->p.order, pool->p.dma_dir); + set_page_private(page, 0); +} + +/* Return a page to the page allocator, cleaning up our state */ +static void __page_pool_return_page(struct page_pool *pool, struct page *page) +{ + __page_pool_clean_page(pool, page); + put_page(page); + /* An optimization would be to call __free_pages(page, pool->p.order) + * knowing page is not part of page-cache (thus avoiding a + * __page_cache_release() call). + */ +} + +static bool __page_pool_recycle_into_ring(struct page_pool *pool, + struct page *page) +{ + int ret; + /* BH protection not needed if current is serving softirq */ + if (in_serving_softirq()) + ret = ptr_ring_produce(&pool->ring, page); + else + ret = ptr_ring_produce_bh(&pool->ring, page); + + return (ret == 0) ? true : false; +} + +/* Only allow direct recycling in special circumstances, into the + * alloc side cache. E.g. during RX-NAPI processing for XDP_DROP use-case. + * + * Caller must provide appropriate safe context. + */ +static bool __page_pool_recycle_direct(struct page *page, + struct page_pool *pool) +{ + if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE)) + return false; + + /* Caller MUST have verified/know (page_ref_count(page) == 1) */ + pool->alloc.cache[pool->alloc.count++] = page; + return true; +} + +void __page_pool_put_page(struct page_pool *pool, + struct page *page, bool allow_direct) +{ + /* This allocator is optimized for the XDP mode that uses + * one-frame-per-page, but have fallbacks that act like the + * regular page allocator APIs. + * + * refcnt == 1 means page_pool owns page, and can recycle it. + */ + if (likely(page_ref_count(page) == 1)) { + /* Read barrier done in page_ref_count / READ_ONCE */ + + if (allow_direct && in_serving_softirq()) + if (__page_pool_recycle_direct(page, pool)) + return; + + if (!__page_pool_recycle_into_ring(pool, page)) { + /* Cache full, fallback to free pages */ + __page_pool_return_page(pool, page); + } + return; + } + /* Fallback/non-XDP mode: API user have elevated refcnt. + * + * Many drivers split up the page into fragments, and some + * want to keep doing this to save memory and do refcnt based + * recycling. Support this use case too, to ease drivers + * switching between XDP/non-XDP. + * + * In-case page_pool maintains the DMA mapping, API user must + * call page_pool_put_page once. In this elevated refcnt + * case, the DMA is unmapped/released, as driver is likely + * doing refcnt based recycle tricks, meaning another process + * will be invoking put_page. + */ + __page_pool_clean_page(pool, page); + put_page(page); +} +EXPORT_SYMBOL(__page_pool_put_page); + +static void __page_pool_empty_ring(struct page_pool *pool) +{ + struct page *page; + + /* Empty recycle ring */ + while ((page = ptr_ring_consume(&pool->ring))) { + /* Verify the refcnt invariant of cached pages */ + if (!(page_ref_count(page) == 1)) + pr_crit("%s() page_pool refcnt %d violation\n", + __func__, page_ref_count(page)); + + __page_pool_return_page(pool, page); + } +} + +static void __page_pool_destroy_rcu(struct rcu_head *rcu) +{ + struct page_pool *pool; + + pool = container_of(rcu, struct page_pool, rcu); + + WARN(pool->alloc.count, "API usage violation"); + + __page_pool_empty_ring(pool); + ptr_ring_cleanup(&pool->ring, NULL); + kfree(pool); +} + +/* Cleanup and release resources */ +void page_pool_destroy(struct page_pool *pool) +{ + struct page *page; + + /* Empty alloc cache, assume caller made sure this is + * no-longer in use, and page_pool_alloc_pages() cannot be + * call concurrently. + */ + while (pool->alloc.count) { + page = pool->alloc.cache[--pool->alloc.count]; + __page_pool_return_page(pool, page); + } + + /* No more consumers should exist, but producers could still + * be in-flight. + */ + __page_pool_empty_ring(pool); + + /* An xdp_mem_allocator can still ref page_pool pointer */ + call_rcu(&pool->rcu, __page_pool_destroy_rcu); +} +EXPORT_SYMBOL(page_pool_destroy); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 45936922d7e2..5ef61222fdef 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -59,6 +59,9 @@ #include <net/rtnetlink.h> #include <net/net_namespace.h> +#define RTNL_MAX_TYPE 48 +#define RTNL_SLAVE_MAX_TYPE 36 + struct rtnl_link { rtnl_doit_func doit; rtnl_dumpit_func dumpit; @@ -389,6 +392,11 @@ int rtnl_link_register(struct rtnl_link_ops *ops) { int err; + /* Sanity-check max sizes to avoid stack buffer overflow. */ + if (WARN_ON(ops->maxtype > RTNL_MAX_TYPE || + ops->slave_maxtype > RTNL_SLAVE_MAX_TYPE)) + return -EINVAL; + rtnl_lock(); err = __rtnl_link_register(ops); rtnl_unlock(); @@ -785,13 +793,15 @@ int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id, long expires, u32 error) { struct rta_cacheinfo ci = { - .rta_lastuse = jiffies_delta_to_clock_t(jiffies - dst->lastuse), - .rta_used = dst->__use, - .rta_clntref = atomic_read(&(dst->__refcnt)), .rta_error = error, .rta_id = id, }; + if (dst) { + ci.rta_lastuse = jiffies_delta_to_clock_t(jiffies - dst->lastuse); + ci.rta_used = dst->__use; + ci.rta_clntref = atomic_read(&dst->__refcnt); + } if (expires) { unsigned long clock; @@ -2256,6 +2266,10 @@ static int do_setlink(const struct sk_buff *skb, const struct net_device_ops *ops = dev->netdev_ops; int err; + err = validate_linkmsg(dev, tb); + if (err < 0) + return err; + if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD] || tb[IFLA_IF_NETNSID]) { struct net *net = rtnl_link_get_net_capable(skb, dev_net(dev), tb, CAP_NET_ADMIN); @@ -2619,10 +2633,6 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, goto errout; } - err = validate_linkmsg(dev, tb); - if (err < 0) - goto errout; - err = do_setlink(skb, dev, ifm, extack, tb, ifname, 0); errout: return err; @@ -2900,13 +2910,16 @@ replay: } if (1) { - struct nlattr *attr[ops ? ops->maxtype + 1 : 1]; - struct nlattr *slave_attr[m_ops ? m_ops->slave_maxtype + 1 : 1]; + struct nlattr *attr[RTNL_MAX_TYPE + 1]; + struct nlattr *slave_attr[RTNL_SLAVE_MAX_TYPE + 1]; struct nlattr **data = NULL; struct nlattr **slave_data = NULL; struct net *dest_net, *link_net = NULL; if (ops) { + if (ops->maxtype > RTNL_MAX_TYPE) + return -EINVAL; + if (ops->maxtype && linkinfo[IFLA_INFO_DATA]) { err = nla_parse_nested(attr, ops->maxtype, linkinfo[IFLA_INFO_DATA], @@ -2923,6 +2936,9 @@ replay: } if (m_ops) { + if (m_ops->slave_maxtype > RTNL_SLAVE_MAX_TYPE) + return -EINVAL; + if (m_ops->slave_maxtype && linkinfo[IFLA_INFO_SLAVE_DATA]) { err = nla_parse_nested(slave_attr, diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 345b51837ca8..c642304f178c 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1305,7 +1305,7 @@ static void skb_headers_offset_update(struct sk_buff *skb, int off) skb->inner_mac_header += off; } -static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) +void skb_copy_header(struct sk_buff *new, const struct sk_buff *old) { __copy_skb_header(new, old); @@ -1313,6 +1313,7 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs; skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type; } +EXPORT_SYMBOL(skb_copy_header); static inline int skb_alloc_rx_flag(const struct sk_buff *skb) { @@ -1355,7 +1356,7 @@ struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask) BUG_ON(skb_copy_bits(skb, -headerlen, n->head, headerlen + skb->len)); - copy_skb_header(n, skb); + skb_copy_header(n, skb); return n; } EXPORT_SYMBOL(skb_copy); @@ -1419,7 +1420,7 @@ struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom, skb_clone_fraglist(n); } - copy_skb_header(n, skb); + skb_copy_header(n, skb); out: return n; } @@ -1599,7 +1600,7 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb, BUG_ON(skb_copy_bits(skb, -head_copy_len, n->head + head_copy_off, skb->len + head_copy_len)); - copy_skb_header(n, skb); + skb_copy_header(n, skb); skb_headers_offset_update(n, newheadroom - oldheadroom); @@ -1839,6 +1840,20 @@ done: } EXPORT_SYMBOL(___pskb_trim); +/* Note : use pskb_trim_rcsum() instead of calling this directly + */ +int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len) +{ + if (skb->ip_summed == CHECKSUM_COMPLETE) { + int delta = skb->len - len; + + skb->csum = csum_sub(skb->csum, + skb_checksum(skb, len, delta, 0)); + } + return __pskb_trim(skb, len); +} +EXPORT_SYMBOL(pskb_trim_rcsum_slow); + /** * __pskb_pull_tail - advance tail of skb header * @skb: buffer to reallocate @@ -4926,6 +4941,8 @@ static unsigned int skb_gso_transport_seglen(const struct sk_buff *skb) thlen = tcp_hdrlen(skb); } else if (unlikely(skb_is_gso_sctp(skb))) { thlen = sizeof(struct sctphdr); + } else if (shinfo->gso_type & SKB_GSO_UDP_L4) { + thlen = sizeof(struct udphdr); } /* UFO sets gso_size to the size of the fragmentation * payload, i.e. the size of the L4 (UDP) header is already diff --git a/net/core/sock.c b/net/core/sock.c index 2aed99a541d5..f333d75ef1a9 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -226,7 +226,8 @@ static struct lock_class_key af_family_kern_slock_keys[AF_MAX]; x "AF_RXRPC" , x "AF_ISDN" , x "AF_PHONET" , \ x "AF_IEEE802154", x "AF_CAIF" , x "AF_ALG" , \ x "AF_NFC" , x "AF_VSOCK" , x "AF_KCM" , \ - x "AF_QIPCRTR", x "AF_SMC" , x "AF_MAX" + x "AF_QIPCRTR", x "AF_SMC" , x "AF_XDP" , \ + x "AF_MAX" static const char *const af_family_key_strings[AF_MAX+1] = { _sock_locks("sk_lock-") @@ -262,7 +263,8 @@ static const char *const af_family_rlock_key_strings[AF_MAX+1] = { "rlock-AF_RXRPC" , "rlock-AF_ISDN" , "rlock-AF_PHONET" , "rlock-AF_IEEE802154", "rlock-AF_CAIF" , "rlock-AF_ALG" , "rlock-AF_NFC" , "rlock-AF_VSOCK" , "rlock-AF_KCM" , - "rlock-AF_QIPCRTR", "rlock-AF_SMC" , "rlock-AF_MAX" + "rlock-AF_QIPCRTR", "rlock-AF_SMC" , "rlock-AF_XDP" , + "rlock-AF_MAX" }; static const char *const af_family_wlock_key_strings[AF_MAX+1] = { "wlock-AF_UNSPEC", "wlock-AF_UNIX" , "wlock-AF_INET" , @@ -279,7 +281,8 @@ static const char *const af_family_wlock_key_strings[AF_MAX+1] = { "wlock-AF_RXRPC" , "wlock-AF_ISDN" , "wlock-AF_PHONET" , "wlock-AF_IEEE802154", "wlock-AF_CAIF" , "wlock-AF_ALG" , "wlock-AF_NFC" , "wlock-AF_VSOCK" , "wlock-AF_KCM" , - "wlock-AF_QIPCRTR", "wlock-AF_SMC" , "wlock-AF_MAX" + "wlock-AF_QIPCRTR", "wlock-AF_SMC" , "wlock-AF_XDP" , + "wlock-AF_MAX" }; static const char *const af_family_elock_key_strings[AF_MAX+1] = { "elock-AF_UNSPEC", "elock-AF_UNIX" , "elock-AF_INET" , @@ -296,7 +299,8 @@ static const char *const af_family_elock_key_strings[AF_MAX+1] = { "elock-AF_RXRPC" , "elock-AF_ISDN" , "elock-AF_PHONET" , "elock-AF_IEEE802154", "elock-AF_CAIF" , "elock-AF_ALG" , "elock-AF_NFC" , "elock-AF_VSOCK" , "elock-AF_KCM" , - "elock-AF_QIPCRTR", "elock-AF_SMC" , "elock-AF_MAX" + "elock-AF_QIPCRTR", "elock-AF_SMC" , "elock-AF_XDP" , + "elock-AF_MAX" }; /* @@ -323,8 +327,8 @@ EXPORT_SYMBOL(sysctl_optmem_max); int sysctl_tstamp_allow_data __read_mostly = 1; -struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE; -EXPORT_SYMBOL_GPL(memalloc_socks); +DEFINE_STATIC_KEY_FALSE(memalloc_socks_key); +EXPORT_SYMBOL_GPL(memalloc_socks_key); /** * sk_set_memalloc - sets %SOCK_MEMALLOC @@ -338,7 +342,7 @@ void sk_set_memalloc(struct sock *sk) { sock_set_flag(sk, SOCK_MEMALLOC); sk->sk_allocation |= __GFP_MEMALLOC; - static_key_slow_inc(&memalloc_socks); + static_branch_inc(&memalloc_socks_key); } EXPORT_SYMBOL_GPL(sk_set_memalloc); @@ -346,7 +350,7 @@ void sk_clear_memalloc(struct sock *sk) { sock_reset_flag(sk, SOCK_MEMALLOC); sk->sk_allocation &= ~__GFP_MEMALLOC; - static_key_slow_dec(&memalloc_socks); + static_branch_dec(&memalloc_socks_key); /* * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward @@ -724,9 +728,22 @@ int sock_setsockopt(struct socket *sock, int level, int optname, sock_valbool_flag(sk, SOCK_DBG, valbool); break; case SO_REUSEADDR: - sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE); + val = (valbool ? SK_CAN_REUSE : SK_NO_REUSE); + if ((sk->sk_family == PF_INET || sk->sk_family == PF_INET6) && + inet_sk(sk)->inet_num && + (sk->sk_reuse != val)) { + ret = (sk->sk_state == TCP_ESTABLISHED) ? -EISCONN : -EUCLEAN; + break; + } + sk->sk_reuse = val; break; case SO_REUSEPORT: + if ((sk->sk_family == PF_INET || sk->sk_family == PF_INET6) && + inet_sk(sk)->inet_num && + (sk->sk_reuseport != valbool)) { + ret = (sk->sk_state == TCP_ESTABLISHED) ? -EISCONN : -EUCLEAN; + break; + } sk->sk_reuseport = valbool; break; case SO_TYPE: @@ -905,7 +922,10 @@ set_rcvbuf: case SO_RCVLOWAT: if (val < 0) val = INT_MAX; - sk->sk_rcvlowat = val ? : 1; + if (sock->ops->set_rcvlowat) + ret = sock->ops->set_rcvlowat(sk, val); + else + sk->sk_rcvlowat = val ? : 1; break; case SO_RCVTIMEO: diff --git a/net/core/xdp.c b/net/core/xdp.c index 097a0f74e004..9d1f22072d5d 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -5,6 +5,10 @@ */ #include <linux/types.h> #include <linux/mm.h> +#include <linux/slab.h> +#include <linux/idr.h> +#include <linux/rhashtable.h> +#include <net/page_pool.h> #include <net/xdp.h> @@ -13,6 +17,105 @@ #define REG_STATE_UNREGISTERED 0x2 #define REG_STATE_UNUSED 0x3 +static DEFINE_IDA(mem_id_pool); +static DEFINE_MUTEX(mem_id_lock); +#define MEM_ID_MAX 0xFFFE +#define MEM_ID_MIN 1 +static int mem_id_next = MEM_ID_MIN; + +static bool mem_id_init; /* false */ +static struct rhashtable *mem_id_ht; + +struct xdp_mem_allocator { + struct xdp_mem_info mem; + union { + void *allocator; + struct page_pool *page_pool; + struct zero_copy_allocator *zc_alloc; + }; + struct rhash_head node; + struct rcu_head rcu; +}; + +static u32 xdp_mem_id_hashfn(const void *data, u32 len, u32 seed) +{ + const u32 *k = data; + const u32 key = *k; + + BUILD_BUG_ON(FIELD_SIZEOF(struct xdp_mem_allocator, mem.id) + != sizeof(u32)); + + /* Use cyclic increasing ID as direct hash key, see rht_bucket_index */ + return key << RHT_HASH_RESERVED_SPACE; +} + +static int xdp_mem_id_cmp(struct rhashtable_compare_arg *arg, + const void *ptr) +{ + const struct xdp_mem_allocator *xa = ptr; + u32 mem_id = *(u32 *)arg->key; + + return xa->mem.id != mem_id; +} + +static const struct rhashtable_params mem_id_rht_params = { + .nelem_hint = 64, + .head_offset = offsetof(struct xdp_mem_allocator, node), + .key_offset = offsetof(struct xdp_mem_allocator, mem.id), + .key_len = FIELD_SIZEOF(struct xdp_mem_allocator, mem.id), + .max_size = MEM_ID_MAX, + .min_size = 8, + .automatic_shrinking = true, + .hashfn = xdp_mem_id_hashfn, + .obj_cmpfn = xdp_mem_id_cmp, +}; + +static void __xdp_mem_allocator_rcu_free(struct rcu_head *rcu) +{ + struct xdp_mem_allocator *xa; + + xa = container_of(rcu, struct xdp_mem_allocator, rcu); + + /* Allow this ID to be reused */ + ida_simple_remove(&mem_id_pool, xa->mem.id); + + /* Notice, driver is expected to free the *allocator, + * e.g. page_pool, and MUST also use RCU free. + */ + + /* Poison memory */ + xa->mem.id = 0xFFFF; + xa->mem.type = 0xF0F0; + xa->allocator = (void *)0xDEAD9001; + + kfree(xa); +} + +static void __xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq) +{ + struct xdp_mem_allocator *xa; + int id = xdp_rxq->mem.id; + int err; + + if (id == 0) + return; + + mutex_lock(&mem_id_lock); + + xa = rhashtable_lookup(mem_id_ht, &id, mem_id_rht_params); + if (!xa) { + mutex_unlock(&mem_id_lock); + return; + } + + err = rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params); + WARN_ON(err); + + call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free); + + mutex_unlock(&mem_id_lock); +} + void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq) { /* Simplify driver cleanup code paths, allow unreg "unused" */ @@ -21,8 +124,14 @@ void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq) WARN(!(xdp_rxq->reg_state == REG_STATE_REGISTERED), "Driver BUG"); + __xdp_rxq_info_unreg_mem_model(xdp_rxq); + xdp_rxq->reg_state = REG_STATE_UNREGISTERED; xdp_rxq->dev = NULL; + + /* Reset mem info to defaults */ + xdp_rxq->mem.id = 0; + xdp_rxq->mem.type = 0; } EXPORT_SYMBOL_GPL(xdp_rxq_info_unreg); @@ -71,3 +180,193 @@ bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq) return (xdp_rxq->reg_state == REG_STATE_REGISTERED); } EXPORT_SYMBOL_GPL(xdp_rxq_info_is_reg); + +static int __mem_id_init_hash_table(void) +{ + struct rhashtable *rht; + int ret; + + if (unlikely(mem_id_init)) + return 0; + + rht = kzalloc(sizeof(*rht), GFP_KERNEL); + if (!rht) + return -ENOMEM; + + ret = rhashtable_init(rht, &mem_id_rht_params); + if (ret < 0) { + kfree(rht); + return ret; + } + mem_id_ht = rht; + smp_mb(); /* mutex lock should provide enough pairing */ + mem_id_init = true; + + return 0; +} + +/* Allocate a cyclic ID that maps to allocator pointer. + * See: https://www.kernel.org/doc/html/latest/core-api/idr.html + * + * Caller must lock mem_id_lock. + */ +static int __mem_id_cyclic_get(gfp_t gfp) +{ + int retries = 1; + int id; + +again: + id = ida_simple_get(&mem_id_pool, mem_id_next, MEM_ID_MAX, gfp); + if (id < 0) { + if (id == -ENOSPC) { + /* Cyclic allocator, reset next id */ + if (retries--) { + mem_id_next = MEM_ID_MIN; + goto again; + } + } + return id; /* errno */ + } + mem_id_next = id + 1; + + return id; +} + +static bool __is_supported_mem_type(enum xdp_mem_type type) +{ + if (type == MEM_TYPE_PAGE_POOL) + return is_page_pool_compiled_in(); + + if (type >= MEM_TYPE_MAX) + return false; + + return true; +} + +int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, + enum xdp_mem_type type, void *allocator) +{ + struct xdp_mem_allocator *xdp_alloc; + gfp_t gfp = GFP_KERNEL; + int id, errno, ret; + void *ptr; + + if (xdp_rxq->reg_state != REG_STATE_REGISTERED) { + WARN(1, "Missing register, driver bug"); + return -EFAULT; + } + + if (!__is_supported_mem_type(type)) + return -EOPNOTSUPP; + + xdp_rxq->mem.type = type; + + if (!allocator) { + if (type == MEM_TYPE_PAGE_POOL || type == MEM_TYPE_ZERO_COPY) + return -EINVAL; /* Setup time check page_pool req */ + return 0; + } + + /* Delay init of rhashtable to save memory if feature isn't used */ + if (!mem_id_init) { + mutex_lock(&mem_id_lock); + ret = __mem_id_init_hash_table(); + mutex_unlock(&mem_id_lock); + if (ret < 0) { + WARN_ON(1); + return ret; + } + } + + xdp_alloc = kzalloc(sizeof(*xdp_alloc), gfp); + if (!xdp_alloc) + return -ENOMEM; + + mutex_lock(&mem_id_lock); + id = __mem_id_cyclic_get(gfp); + if (id < 0) { + errno = id; + goto err; + } + xdp_rxq->mem.id = id; + xdp_alloc->mem = xdp_rxq->mem; + xdp_alloc->allocator = allocator; + + /* Insert allocator into ID lookup table */ + ptr = rhashtable_insert_slow(mem_id_ht, &id, &xdp_alloc->node); + if (IS_ERR(ptr)) { + errno = PTR_ERR(ptr); + goto err; + } + + mutex_unlock(&mem_id_lock); + + return 0; +err: + mutex_unlock(&mem_id_lock); + kfree(xdp_alloc); + return errno; +} +EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model); + +/* XDP RX runs under NAPI protection, and in different delivery error + * scenarios (e.g. queue full), it is possible to return the xdp_frame + * while still leveraging this protection. The @napi_direct boolian + * is used for those calls sites. Thus, allowing for faster recycling + * of xdp_frames/pages in those cases. + */ +static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, + unsigned long handle) +{ + struct xdp_mem_allocator *xa; + struct page *page; + + switch (mem->type) { + case MEM_TYPE_PAGE_POOL: + rcu_read_lock(); + /* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */ + xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); + page = virt_to_head_page(data); + if (xa) + page_pool_put_page(xa->page_pool, page, napi_direct); + else + put_page(page); + rcu_read_unlock(); + break; + case MEM_TYPE_PAGE_SHARED: + page_frag_free(data); + break; + case MEM_TYPE_PAGE_ORDER0: + page = virt_to_page(data); /* Assumes order0 page*/ + put_page(page); + break; + case MEM_TYPE_ZERO_COPY: + /* NB! Only valid from an xdp_buff! */ + rcu_read_lock(); + /* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */ + xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); + xa->zc_alloc->free(xa->zc_alloc, handle); + rcu_read_unlock(); + default: + /* Not possible, checked in xdp_rxq_info_reg_mem_model() */ + break; + } +} + +void xdp_return_frame(struct xdp_frame *xdpf) +{ + __xdp_return(xdpf->data, &xdpf->mem, false, 0); +} +EXPORT_SYMBOL_GPL(xdp_return_frame); + +void xdp_return_frame_rx_napi(struct xdp_frame *xdpf) +{ + __xdp_return(xdpf->data, &xdpf->mem, true, 0); +} +EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi); + +void xdp_return_buff(struct xdp_buff *xdp) +{ + __xdp_return(xdp->data, &xdp->rxq->mem, true, xdp->handle); +} +EXPORT_SYMBOL_GPL(xdp_return_buff); |