diff options
Diffstat (limited to 'net/mctp/route.c')
-rw-r--r-- | net/mctp/route.c | 362 |
1 files changed, 293 insertions, 69 deletions
diff --git a/net/mctp/route.c b/net/mctp/route.c index 5ca186d53cb0..46c44823edb7 100644 --- a/net/mctp/route.c +++ b/net/mctp/route.c @@ -11,6 +11,7 @@ */ #include <linux/idr.h> +#include <linux/kconfig.h> #include <linux/mctp.h> #include <linux/netdevice.h> #include <linux/rtnetlink.h> @@ -23,7 +24,12 @@ #include <net/netlink.h> #include <net/sock.h> +#include <trace/events/mctp.h> + static const unsigned int mctp_message_maxlen = 64 * 1024; +static const unsigned long mctp_key_lifetime = 6 * CONFIG_HZ; + +static void mctp_flow_prepare_output(struct sk_buff *skb, struct mctp_dev *dev); /* route output callbacks */ static int mctp_route_discard(struct mctp_route *route, struct sk_buff *skb) @@ -83,25 +89,43 @@ static bool mctp_key_match(struct mctp_sk_key *key, mctp_eid_t local, return true; } +/* returns a key (with key->lock held, and refcounted), or NULL if no such + * key exists. + */ static struct mctp_sk_key *mctp_lookup_key(struct net *net, struct sk_buff *skb, - mctp_eid_t peer) + mctp_eid_t peer, + unsigned long *irqflags) + __acquires(&key->lock) { struct mctp_sk_key *key, *ret; + unsigned long flags; struct mctp_hdr *mh; u8 tag; - WARN_ON(!rcu_read_lock_held()); - mh = mctp_hdr(skb); tag = mh->flags_seq_tag & (MCTP_HDR_TAG_MASK | MCTP_HDR_FLAG_TO); ret = NULL; + spin_lock_irqsave(&net->mctp.keys_lock, flags); - hlist_for_each_entry_rcu(key, &net->mctp.keys, hlist) { - if (mctp_key_match(key, mh->dest, peer, tag)) { + hlist_for_each_entry(key, &net->mctp.keys, hlist) { + if (!mctp_key_match(key, mh->dest, peer, tag)) + continue; + + spin_lock(&key->lock); + if (key->valid) { + refcount_inc(&key->refs); ret = key; break; } + spin_unlock(&key->lock); + } + + if (ret) { + spin_unlock(&net->mctp.keys_lock); + *irqflags = flags; + } else { + spin_unlock_irqrestore(&net->mctp.keys_lock, flags); } return ret; @@ -121,11 +145,30 @@ static struct mctp_sk_key *mctp_key_alloc(struct mctp_sock *msk, key->local_addr = local; key->tag = tag; key->sk = &msk->sk; - spin_lock_init(&key->reasm_lock); + key->valid = true; + spin_lock_init(&key->lock); + refcount_set(&key->refs, 1); return key; } +void mctp_key_unref(struct mctp_sk_key *key) +{ + unsigned long flags; + + if (!refcount_dec_and_test(&key->refs)) + return; + + /* even though no refs exist here, the lock allows us to stay + * consistent with the locking requirement of mctp_dev_release_key + */ + spin_lock_irqsave(&key->lock, flags); + mctp_dev_release_key(key->dev, key); + spin_unlock_irqrestore(&key->lock, flags); + + kfree(key); +} + static int mctp_key_add(struct mctp_sk_key *key, struct mctp_sock *msk) { struct net *net = sock_net(&msk->sk); @@ -138,12 +181,20 @@ static int mctp_key_add(struct mctp_sk_key *key, struct mctp_sock *msk) hlist_for_each_entry(tmp, &net->mctp.keys, hlist) { if (mctp_key_match(tmp, key->local_addr, key->peer_addr, key->tag)) { - rc = -EEXIST; - break; + spin_lock(&tmp->lock); + if (tmp->valid) + rc = -EEXIST; + spin_unlock(&tmp->lock); + if (rc) + break; } } if (!rc) { + refcount_inc(&key->refs); + key->expiry = jiffies + mctp_key_lifetime; + timer_reduce(&msk->key_expiry, key->expiry); + hlist_add_head(&key->hlist, &net->mctp.keys); hlist_add_head(&key->sklist, &msk->keys); } @@ -153,30 +204,72 @@ static int mctp_key_add(struct mctp_sk_key *key, struct mctp_sock *msk) return rc; } -/* Must be called with key->reasm_lock, which it will release. Will schedule - * the key for an RCU free. +/* We're done with the key; unset valid and remove from lists. There may still + * be outstanding refs on the key though... */ static void __mctp_key_unlock_drop(struct mctp_sk_key *key, struct net *net, unsigned long flags) - __releases(&key->reasm_lock) + __releases(&key->lock) { struct sk_buff *skb; skb = key->reasm_head; key->reasm_head = NULL; key->reasm_dead = true; - spin_unlock_irqrestore(&key->reasm_lock, flags); + key->valid = false; + mctp_dev_release_key(key->dev, key); + spin_unlock_irqrestore(&key->lock, flags); spin_lock_irqsave(&net->mctp.keys_lock, flags); - hlist_del_rcu(&key->hlist); - hlist_del_rcu(&key->sklist); + hlist_del(&key->hlist); + hlist_del(&key->sklist); spin_unlock_irqrestore(&net->mctp.keys_lock, flags); - kfree_rcu(key, rcu); + + /* one unref for the lists */ + mctp_key_unref(key); + + /* and one for the local reference */ + mctp_key_unref(key); if (skb) kfree_skb(skb); + } +#ifdef CONFIG_MCTP_FLOWS +static void mctp_skb_set_flow(struct sk_buff *skb, struct mctp_sk_key *key) +{ + struct mctp_flow *flow; + + flow = skb_ext_add(skb, SKB_EXT_MCTP); + if (!flow) + return; + + refcount_inc(&key->refs); + flow->key = key; +} + +static void mctp_flow_prepare_output(struct sk_buff *skb, struct mctp_dev *dev) +{ + struct mctp_sk_key *key; + struct mctp_flow *flow; + + flow = skb_ext_find(skb, SKB_EXT_MCTP); + if (!flow) + return; + + key = flow->key; + + if (WARN_ON(key->dev && key->dev != dev)) + return; + + mctp_dev_set_key(dev, key); +} +#else +static void mctp_skb_set_flow(struct sk_buff *skb, struct mctp_sk_key *key) {} +static void mctp_flow_prepare_output(struct sk_buff *skb, struct mctp_dev *dev) {} +#endif + static int mctp_frag_queue(struct mctp_sk_key *key, struct sk_buff *skb) { struct mctp_hdr *hdr = mctp_hdr(skb); @@ -248,8 +341,10 @@ static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb) rcu_read_lock(); - /* lookup socket / reasm context, exactly matching (src,dest,tag) */ - key = mctp_lookup_key(net, skb, mh->src); + /* lookup socket / reasm context, exactly matching (src,dest,tag). + * we hold a ref on the key, and key->lock held. + */ + key = mctp_lookup_key(net, skb, mh->src, &f); if (flags & MCTP_HDR_FLAG_SOM) { if (key) { @@ -260,10 +355,12 @@ static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb) * key for reassembly - we'll create a more specific * one for future packets if required (ie, !EOM). */ - key = mctp_lookup_key(net, skb, MCTP_ADDR_ANY); + key = mctp_lookup_key(net, skb, MCTP_ADDR_ANY, &f); if (key) { msk = container_of(key->sk, struct mctp_sock, sk); + spin_unlock_irqrestore(&key->lock, f); + mctp_key_unref(key); key = NULL; } } @@ -282,11 +379,13 @@ static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb) if (flags & MCTP_HDR_FLAG_EOM) { sock_queue_rcv_skb(&msk->sk, skb); if (key) { - spin_lock_irqsave(&key->reasm_lock, f); /* we've hit a pending reassembly; not much we * can do but drop it */ + trace_mctp_key_release(key, + MCTP_TRACE_KEY_REPLIED); __mctp_key_unlock_drop(key, net, f); + key = NULL; } rc = 0; goto out_unlock; @@ -303,7 +402,7 @@ static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb) goto out_unlock; } - /* we can queue without the reasm lock here, as the + /* we can queue without the key lock here, as the * key isn't observable yet */ mctp_frag_queue(key, skb); @@ -318,17 +417,22 @@ static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb) if (rc) kfree(key); - } else { - /* existing key: start reassembly */ - spin_lock_irqsave(&key->reasm_lock, f); + trace_mctp_key_acquire(key); + + /* we don't need to release key->lock on exit */ + mctp_key_unref(key); + key = NULL; + } else { if (key->reasm_head || key->reasm_dead) { /* duplicate start? drop everything */ + trace_mctp_key_release(key, + MCTP_TRACE_KEY_INVALIDATED); __mctp_key_unlock_drop(key, net, f); rc = -EEXIST; + key = NULL; } else { rc = mctp_frag_queue(key, skb); - spin_unlock_irqrestore(&key->reasm_lock, f); } } @@ -337,8 +441,6 @@ static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb) * using the message-specific key */ - spin_lock_irqsave(&key->reasm_lock, f); - /* we need to be continuing an existing reassembly... */ if (!key->reasm_head) rc = -EINVAL; @@ -351,9 +453,9 @@ static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb) if (!rc && flags & MCTP_HDR_FLAG_EOM) { sock_queue_rcv_skb(key->sk, key->reasm_head); key->reasm_head = NULL; + trace_mctp_key_release(key, MCTP_TRACE_KEY_REPLIED); __mctp_key_unlock_drop(key, net, f); - } else { - spin_unlock_irqrestore(&key->reasm_lock, f); + key = NULL; } } else { @@ -363,6 +465,10 @@ static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb) out_unlock: rcu_read_unlock(); + if (key) { + spin_unlock_irqrestore(&key->lock, f); + mctp_key_unref(key); + } out: if (rc) kfree_skb(skb); @@ -376,6 +482,7 @@ static unsigned int mctp_route_mtu(struct mctp_route *rt) static int mctp_route_output(struct mctp_route *route, struct sk_buff *skb) { + struct mctp_skb_cb *cb = mctp_cb(skb); struct mctp_hdr *hdr = mctp_hdr(skb); char daddr_buf[MAX_ADDR_LEN]; char *daddr = NULL; @@ -390,9 +497,14 @@ static int mctp_route_output(struct mctp_route *route, struct sk_buff *skb) return -EMSGSIZE; } - /* If lookup fails let the device handle daddr==NULL */ - if (mctp_neigh_lookup(route->dev, hdr->dest, daddr_buf) == 0) - daddr = daddr_buf; + if (cb->ifindex) { + /* direct route; use the hwaddr we stashed in sendmsg */ + daddr = cb->haddr; + } else { + /* If lookup fails let the device handle daddr==NULL */ + if (mctp_neigh_lookup(route->dev, hdr->dest, daddr_buf) == 0) + daddr = daddr_buf; + } rc = dev_hard_header(skb, skb->dev, ntohs(skb->protocol), daddr, skb->dev->dev_addr, skb->len); @@ -401,6 +513,8 @@ static int mctp_route_output(struct mctp_route *route, struct sk_buff *skb) return -EHOSTUNREACH; } + mctp_flow_prepare_output(skb, route->dev); + rc = dev_queue_xmit(skb); if (rc) rc = net_xmit_errno(rc); @@ -412,7 +526,7 @@ static int mctp_route_output(struct mctp_route *route, struct sk_buff *skb) static void mctp_route_release(struct mctp_route *rt) { if (refcount_dec_and_test(&rt->refs)) { - dev_put(rt->dev->dev); + mctp_dev_put(rt->dev); kfree_rcu(rt, rcu); } } @@ -454,30 +568,38 @@ static void mctp_reserve_tag(struct net *net, struct mctp_sk_key *key, lockdep_assert_held(&mns->keys_lock); + key->expiry = jiffies + mctp_key_lifetime; + timer_reduce(&msk->key_expiry, key->expiry); + /* we hold the net->key_lock here, allowing updates to both * then net and sk */ hlist_add_head_rcu(&key->hlist, &mns->keys); hlist_add_head_rcu(&key->sklist, &msk->keys); + refcount_inc(&key->refs); } /* Allocate a locally-owned tag value for (saddr, daddr), and reserve * it for the socket msk */ -static int mctp_alloc_local_tag(struct mctp_sock *msk, - mctp_eid_t saddr, mctp_eid_t daddr, u8 *tagp) +static struct mctp_sk_key *mctp_alloc_local_tag(struct mctp_sock *msk, + mctp_eid_t saddr, + mctp_eid_t daddr, u8 *tagp) { struct net *net = sock_net(&msk->sk); struct netns_mctp *mns = &net->mctp; struct mctp_sk_key *key, *tmp; unsigned long flags; - int rc = -EAGAIN; u8 tagbits; + /* for NULL destination EIDs, we may get a response from any peer */ + if (daddr == MCTP_ADDR_NULL) + daddr = MCTP_ADDR_ANY; + /* be optimistic, alloc now */ key = mctp_key_alloc(msk, saddr, daddr, 0, GFP_KERNEL); if (!key) - return -ENOMEM; + return ERR_PTR(-ENOMEM); /* 8 possible tag values */ tagbits = 0xff; @@ -488,14 +610,26 @@ static int mctp_alloc_local_tag(struct mctp_sock *msk, * tags. If we find a conflict, clear that bit from tagbits */ hlist_for_each_entry(tmp, &mns->keys, hlist) { + /* We can check the lookup fields (*_addr, tag) without the + * lock held, they don't change over the lifetime of the key. + */ + /* if we don't own the tag, it can't conflict */ if (tmp->tag & MCTP_HDR_FLAG_TO) continue; - if ((tmp->peer_addr == daddr || - tmp->peer_addr == MCTP_ADDR_ANY) && - tmp->local_addr == saddr) + if (!((tmp->peer_addr == daddr || + tmp->peer_addr == MCTP_ADDR_ANY) && + tmp->local_addr == saddr)) + continue; + + spin_lock(&tmp->lock); + /* key must still be valid. If we find a match, clear the + * potential tag value + */ + if (tmp->valid) tagbits &= ~(1 << tmp->tag); + spin_unlock(&tmp->lock); if (!tagbits) break; @@ -504,16 +638,19 @@ static int mctp_alloc_local_tag(struct mctp_sock *msk, if (tagbits) { key->tag = __ffs(tagbits); mctp_reserve_tag(net, key, msk); + trace_mctp_key_acquire(key); + *tagp = key->tag; - rc = 0; } spin_unlock_irqrestore(&mns->keys_lock, flags); - if (!tagbits) + if (!tagbits) { kfree(key); + return ERR_PTR(-EBUSY); + } - return rc; + return key; } /* routing lookups */ @@ -552,14 +689,18 @@ struct mctp_route *mctp_route_lookup(struct net *net, unsigned int dnet, return rt; } -/* sends a skb to rt and releases the route. */ -int mctp_do_route(struct mctp_route *rt, struct sk_buff *skb) +static struct mctp_route *mctp_route_lookup_null(struct net *net, + struct net_device *dev) { - int rc; + struct mctp_route *rt; - rc = rt->output(rt, skb); - mctp_route_release(rt); - return rc; + list_for_each_entry_rcu(rt, &net->mctp.routes, list) { + if (rt->dev->dev == dev && rt->type == RTN_LOCAL && + refcount_inc_not_zero(&rt->refs)) + return rt; + } + + return NULL; } static int mctp_do_fragment_route(struct mctp_route *rt, struct sk_buff *skb, @@ -628,7 +769,7 @@ static int mctp_do_fragment_route(struct mctp_route *rt, struct sk_buff *skb, /* copy message payload */ skb_copy_bits(skb, pos, skb_transport_header(skb2), size); - /* do route, but don't drop the rt reference */ + /* do route */ rc = rt->output(rt, skb2); if (rc) break; @@ -637,7 +778,6 @@ static int mctp_do_fragment_route(struct mctp_route *rt, struct sk_buff *skb, pos += size; } - mctp_route_release(rt); consume_skb(skb); return rc; } @@ -647,15 +787,52 @@ int mctp_local_output(struct sock *sk, struct mctp_route *rt, { struct mctp_sock *msk = container_of(sk, struct mctp_sock, sk); struct mctp_skb_cb *cb = mctp_cb(skb); + struct mctp_route tmp_rt; + struct mctp_sk_key *key; + struct net_device *dev; struct mctp_hdr *hdr; unsigned long flags; unsigned int mtu; mctp_eid_t saddr; + bool ext_rt; int rc; u8 tag; - if (WARN_ON(!rt->dev)) + rc = -ENODEV; + + if (rt) { + ext_rt = false; + dev = NULL; + + if (WARN_ON(!rt->dev)) + goto out_release; + + } else if (cb->ifindex) { + ext_rt = true; + rt = &tmp_rt; + + rcu_read_lock(); + dev = dev_get_by_index_rcu(sock_net(sk), cb->ifindex); + if (!dev) { + rcu_read_unlock(); + return rc; + } + + rt->dev = __mctp_dev_get(dev); + rcu_read_unlock(); + + if (!rt->dev) + goto out_release; + + /* establish temporary route - we set up enough to keep + * mctp_route_output happy + */ + rt->output = mctp_route_output; + rt->mtu = 0; + + } else { return -EINVAL; + } spin_lock_irqsave(&rt->dev->addrs_lock, flags); if (rt->dev->num_addrs == 0) { @@ -668,18 +845,23 @@ int mctp_local_output(struct sock *sk, struct mctp_route *rt, spin_unlock_irqrestore(&rt->dev->addrs_lock, flags); if (rc) - return rc; + goto out_release; if (req_tag & MCTP_HDR_FLAG_TO) { - rc = mctp_alloc_local_tag(msk, saddr, daddr, &tag); - if (rc) - return rc; + key = mctp_alloc_local_tag(msk, saddr, daddr, &tag); + if (IS_ERR(key)) { + rc = PTR_ERR(key); + goto out_release; + } + mctp_skb_set_flow(skb, key); + /* done with the key in this scope */ + mctp_key_unref(key); tag |= MCTP_HDR_FLAG_TO; } else { + key = NULL; tag = req_tag; } - skb->protocol = htons(ETH_P_MCTP); skb->priority = 0; skb_reset_transport_header(skb); @@ -699,12 +881,22 @@ int mctp_local_output(struct sock *sk, struct mctp_route *rt, mtu = mctp_route_mtu(rt); if (skb->len + sizeof(struct mctp_hdr) <= mtu) { - hdr->flags_seq_tag = MCTP_HDR_FLAG_SOM | MCTP_HDR_FLAG_EOM | - tag; - return mctp_do_route(rt, skb); + hdr->flags_seq_tag = MCTP_HDR_FLAG_SOM | + MCTP_HDR_FLAG_EOM | tag; + rc = rt->output(rt, skb); } else { - return mctp_do_fragment_route(rt, skb, mtu, tag); + rc = mctp_do_fragment_route(rt, skb, mtu, tag); } + +out_release: + if (!ext_rt) + mctp_route_release(rt); + + if (dev) + dev_put(dev); + + return rc; + } /* route management */ @@ -741,7 +933,7 @@ static int mctp_route_add(struct mctp_dev *mdev, mctp_eid_t daddr_start, rt->max = daddr_start + daddr_extent; rt->mtu = mtu; rt->dev = mdev; - dev_hold(rt->dev->dev); + mctp_dev_hold(rt->dev); rt->type = type; rt->output = rtfn; @@ -821,13 +1013,18 @@ static int mctp_pkttype_receive(struct sk_buff *skb, struct net_device *dev, struct net_device *orig_dev) { struct net *net = dev_net(dev); + struct mctp_dev *mdev; struct mctp_skb_cb *cb; struct mctp_route *rt; struct mctp_hdr *mh; - /* basic non-data sanity checks */ - if (dev->type != ARPHRD_MCTP) + rcu_read_lock(); + mdev = __mctp_dev_get(dev); + rcu_read_unlock(); + if (!mdev) { + /* basic non-data sanity checks */ goto err_drop; + } if (!pskb_may_pull(skb, sizeof(struct mctp_hdr))) goto err_drop; @@ -840,16 +1037,27 @@ static int mctp_pkttype_receive(struct sk_buff *skb, struct net_device *dev, if (mh->ver < MCTP_VER_MIN || mh->ver > MCTP_VER_MAX) goto err_drop; - cb = __mctp_cb(skb); - rcu_read_lock(); - cb->net = READ_ONCE(__mctp_dev_get(dev)->net); - rcu_read_unlock(); + /* MCTP drivers must populate halen/haddr */ + if (dev->type == ARPHRD_MCTP) { + cb = mctp_cb(skb); + } else { + cb = __mctp_cb(skb); + cb->halen = 0; + } + cb->net = READ_ONCE(mdev->net); + cb->ifindex = dev->ifindex; rt = mctp_route_lookup(net, cb->net, mh->dest); + + /* NULL EID, but addressed to our physical address */ + if (!rt && mh->dest == MCTP_ADDR_NULL && skb->pkt_type == PACKET_HOST) + rt = mctp_route_lookup_null(net, dev); + if (!rt) goto err_drop; - mctp_do_route(rt, skb); + rt->output(rt, skb); + mctp_route_release(rt); return NET_RX_SUCCESS; @@ -926,10 +1134,15 @@ static int mctp_route_nlparse(struct sk_buff *skb, struct nlmsghdr *nlh, return 0; } +static const struct nla_policy rta_metrics_policy[RTAX_MAX + 1] = { + [RTAX_MTU] = { .type = NLA_U32 }, +}; + static int mctp_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct nlattr *tb[RTA_MAX + 1]; + struct nlattr *tbx[RTAX_MAX + 1]; mctp_eid_t daddr_start; struct mctp_dev *mdev; struct rtmsg *rtm; @@ -946,8 +1159,15 @@ static int mctp_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, return -EINVAL; } - /* TODO: parse mtu from nlparse */ mtu = 0; + if (tb[RTA_METRICS]) { + rc = nla_parse_nested(tbx, RTAX_MAX, tb[RTA_METRICS], + rta_metrics_policy, NULL); + if (rc < 0) + return rc; + if (tbx[RTAX_MTU]) + mtu = nla_get_u32(tbx[RTAX_MTU]); + } if (rtm->rtm_type != RTN_UNICAST) return -EINVAL; @@ -1116,3 +1336,7 @@ void __exit mctp_routes_exit(void) rtnl_unregister(PF_MCTP, RTM_GETROUTE); dev_remove_pack(&mctp_packet_type); } + +#if IS_ENABLED(CONFIG_MCTP_TEST) +#include "test/route-test.c" +#endif |