diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2015-02-11 05:01:30 +0100 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2015-02-11 05:01:30 +0100 |
commit | c5ce28df0e7c01a1de23c36ebdefcd803f2b6cbb (patch) | |
tree | 9830baf38832769e1cf621708889111bbe3c93df /net/openvswitch | |
parent | Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/jik... (diff) | |
parent | crypto: fix af_alg_make_sg() conversion to iov_iter (diff) | |
download | linux-c5ce28df0e7c01a1de23c36ebdefcd803f2b6cbb.tar.xz linux-c5ce28df0e7c01a1de23c36ebdefcd803f2b6cbb.zip |
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Pull networking updates from David Miller:
1) More iov_iter conversion work from Al Viro.
[ The "crypto: switch af_alg_make_sg() to iov_iter" commit was
wrong, and this pull actually adds an extra commit on top of the
branch I'm pulling to fix that up, so that the pre-merge state is
ok. - Linus ]
2) Various optimizations to the ipv4 forwarding information base trie
lookup implementation. From Alexander Duyck.
3) Remove sock_iocb altogether, from CHristoph Hellwig.
4) Allow congestion control algorithm selection via routing metrics.
From Daniel Borkmann.
5) Make ipv4 uncached route list per-cpu, from Eric Dumazet.
6) Handle rfs hash collisions more gracefully, also from Eric Dumazet.
7) Add xmit_more support to r8169, e1000, and e1000e drivers. From
Florian Westphal.
8) Transparent Ethernet Bridging support for GRO, from Jesse Gross.
9) Add BPF packet actions to packet scheduler, from Jiri Pirko.
10) Add support for uniqu flow IDs to openvswitch, from Joe Stringer.
11) New NetCP ethernet driver, from Muralidharan Karicheri and Wingman
Kwok.
12) More sanely handle out-of-window dupacks, which can result in
serious ACK storms. From Neal Cardwell.
13) Various rhashtable bug fixes and enhancements, from Herbert Xu,
Patrick McHardy, and Thomas Graf.
14) Support xmit_more in be2net, from Sathya Perla.
15) Group Policy extensions for vxlan, from Thomas Graf.
16) Remove Checksum Offload support for vxlan, from Tom Herbert.
17) Like ipv4, support lockless transmit over ipv6 UDP sockets. From
Vlad Yasevich.
* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (1494+1 commits)
crypto: fix af_alg_make_sg() conversion to iov_iter
ipv4: Namespecify TCP PMTU mechanism
i40e: Fix for stats init function call in Rx setup
tcp: don't include Fast Open option in SYN-ACK on pure SYN-data
openvswitch: Only set TUNNEL_VXLAN_OPT if VXLAN-GBP metadata is set
ipv6: Make __ipv6_select_ident static
ipv6: Fix fragment id assignment on LE arches.
bridge: Fix inability to add non-vlan fdb entry
net: Mellanox: Delete unnecessary checks before the function call "vunmap"
cxgb4: Add support in cxgb4 to get expansion rom version via ethtool
ethtool: rename reserved1 memeber in ethtool_drvinfo for expansion ROM version
net: dsa: Remove redundant phy_attach()
IB/mlx4: Reset flow support for IB kernel ULPs
IB/mlx4: Always use the correct port for mirrored multicast attachments
net/bonding: Fix potential bad memory access during bonding events
tipc: remove tipc_snprintf
tipc: nl compat add noop and remove legacy nl framework
tipc: convert legacy nl stats show to nl compat
tipc: convert legacy nl net id get to nl compat
tipc: convert legacy nl net id set to nl compat
...
Diffstat (limited to 'net/openvswitch')
-rw-r--r-- | net/openvswitch/actions.c | 377 | ||||
-rw-r--r-- | net/openvswitch/datapath.c | 237 | ||||
-rw-r--r-- | net/openvswitch/flow.c | 6 | ||||
-rw-r--r-- | net/openvswitch/flow.h | 42 | ||||
-rw-r--r-- | net/openvswitch/flow_netlink.c | 547 | ||||
-rw-r--r-- | net/openvswitch/flow_netlink.h | 13 | ||||
-rw-r--r-- | net/openvswitch/flow_table.c | 228 | ||||
-rw-r--r-- | net/openvswitch/flow_table.h | 8 | ||||
-rw-r--r-- | net/openvswitch/vport-geneve.c | 32 | ||||
-rw-r--r-- | net/openvswitch/vport-gre.c | 14 | ||||
-rw-r--r-- | net/openvswitch/vport-vxlan.c | 110 | ||||
-rw-r--r-- | net/openvswitch/vport-vxlan.h | 11 | ||||
-rw-r--r-- | net/openvswitch/vport.c | 12 | ||||
-rw-r--r-- | net/openvswitch/vport.h | 18 |
14 files changed, 1185 insertions, 470 deletions
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 770064c83711..b491c1c296fe 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -185,10 +185,15 @@ static int pop_mpls(struct sk_buff *skb, struct sw_flow_key *key, return 0; } -static int set_mpls(struct sk_buff *skb, struct sw_flow_key *key, - const __be32 *mpls_lse) +/* 'KEY' must not have any bits set outside of the 'MASK' */ +#define MASKED(OLD, KEY, MASK) ((KEY) | ((OLD) & ~(MASK))) +#define SET_MASKED(OLD, KEY, MASK) ((OLD) = MASKED(OLD, KEY, MASK)) + +static int set_mpls(struct sk_buff *skb, struct sw_flow_key *flow_key, + const __be32 *mpls_lse, const __be32 *mask) { __be32 *stack; + __be32 lse; int err; err = skb_ensure_writable(skb, skb->mac_len + MPLS_HLEN); @@ -196,14 +201,16 @@ static int set_mpls(struct sk_buff *skb, struct sw_flow_key *key, return err; stack = (__be32 *)skb_mpls_header(skb); + lse = MASKED(*stack, *mpls_lse, *mask); if (skb->ip_summed == CHECKSUM_COMPLETE) { - __be32 diff[] = { ~(*stack), *mpls_lse }; + __be32 diff[] = { ~(*stack), lse }; + skb->csum = ~csum_partial((char *)diff, sizeof(diff), ~skb->csum); } - *stack = *mpls_lse; - key->mpls.top_lse = *mpls_lse; + *stack = lse; + flow_key->mpls.top_lse = lse; return 0; } @@ -212,7 +219,7 @@ static int pop_vlan(struct sk_buff *skb, struct sw_flow_key *key) int err; err = skb_vlan_pop(skb); - if (vlan_tx_tag_present(skb)) + if (skb_vlan_tag_present(skb)) invalidate_flow_key(key); else key->eth.tci = 0; @@ -222,7 +229,7 @@ static int pop_vlan(struct sk_buff *skb, struct sw_flow_key *key) static int push_vlan(struct sk_buff *skb, struct sw_flow_key *key, const struct ovs_action_push_vlan *vlan) { - if (vlan_tx_tag_present(skb)) + if (skb_vlan_tag_present(skb)) invalidate_flow_key(key); else key->eth.tci = vlan->vlan_tci; @@ -230,23 +237,39 @@ static int push_vlan(struct sk_buff *skb, struct sw_flow_key *key, ntohs(vlan->vlan_tci) & ~VLAN_TAG_PRESENT); } -static int set_eth_addr(struct sk_buff *skb, struct sw_flow_key *key, - const struct ovs_key_ethernet *eth_key) +/* 'src' is already properly masked. */ +static void ether_addr_copy_masked(u8 *dst_, const u8 *src_, const u8 *mask_) +{ + u16 *dst = (u16 *)dst_; + const u16 *src = (const u16 *)src_; + const u16 *mask = (const u16 *)mask_; + + SET_MASKED(dst[0], src[0], mask[0]); + SET_MASKED(dst[1], src[1], mask[1]); + SET_MASKED(dst[2], src[2], mask[2]); +} + +static int set_eth_addr(struct sk_buff *skb, struct sw_flow_key *flow_key, + const struct ovs_key_ethernet *key, + const struct ovs_key_ethernet *mask) { int err; + err = skb_ensure_writable(skb, ETH_HLEN); if (unlikely(err)) return err; skb_postpull_rcsum(skb, eth_hdr(skb), ETH_ALEN * 2); - ether_addr_copy(eth_hdr(skb)->h_source, eth_key->eth_src); - ether_addr_copy(eth_hdr(skb)->h_dest, eth_key->eth_dst); + ether_addr_copy_masked(eth_hdr(skb)->h_source, key->eth_src, + mask->eth_src); + ether_addr_copy_masked(eth_hdr(skb)->h_dest, key->eth_dst, + mask->eth_dst); ovs_skb_postpush_rcsum(skb, eth_hdr(skb), ETH_ALEN * 2); - ether_addr_copy(key->eth.src, eth_key->eth_src); - ether_addr_copy(key->eth.dst, eth_key->eth_dst); + ether_addr_copy(flow_key->eth.src, eth_hdr(skb)->h_source); + ether_addr_copy(flow_key->eth.dst, eth_hdr(skb)->h_dest); return 0; } @@ -304,6 +327,15 @@ static void update_ipv6_checksum(struct sk_buff *skb, u8 l4_proto, } } +static void mask_ipv6_addr(const __be32 old[4], const __be32 addr[4], + const __be32 mask[4], __be32 masked[4]) +{ + masked[0] = MASKED(old[0], addr[0], mask[0]); + masked[1] = MASKED(old[1], addr[1], mask[1]); + masked[2] = MASKED(old[2], addr[2], mask[2]); + masked[3] = MASKED(old[3], addr[3], mask[3]); +} + static void set_ipv6_addr(struct sk_buff *skb, u8 l4_proto, __be32 addr[4], const __be32 new_addr[4], bool recalculate_csum) @@ -315,29 +347,29 @@ static void set_ipv6_addr(struct sk_buff *skb, u8 l4_proto, memcpy(addr, new_addr, sizeof(__be32[4])); } -static void set_ipv6_tc(struct ipv6hdr *nh, u8 tc) +static void set_ipv6_fl(struct ipv6hdr *nh, u32 fl, u32 mask) { - nh->priority = tc >> 4; - nh->flow_lbl[0] = (nh->flow_lbl[0] & 0x0F) | ((tc & 0x0F) << 4); + /* Bits 21-24 are always unmasked, so this retains their values. */ + SET_MASKED(nh->flow_lbl[0], (u8)(fl >> 16), (u8)(mask >> 16)); + SET_MASKED(nh->flow_lbl[1], (u8)(fl >> 8), (u8)(mask >> 8)); + SET_MASKED(nh->flow_lbl[2], (u8)fl, (u8)mask); } -static void set_ipv6_fl(struct ipv6hdr *nh, u32 fl) +static void set_ip_ttl(struct sk_buff *skb, struct iphdr *nh, u8 new_ttl, + u8 mask) { - nh->flow_lbl[0] = (nh->flow_lbl[0] & 0xF0) | (fl & 0x000F0000) >> 16; - nh->flow_lbl[1] = (fl & 0x0000FF00) >> 8; - nh->flow_lbl[2] = fl & 0x000000FF; -} + new_ttl = MASKED(nh->ttl, new_ttl, mask); -static void set_ip_ttl(struct sk_buff *skb, struct iphdr *nh, u8 new_ttl) -{ csum_replace2(&nh->check, htons(nh->ttl << 8), htons(new_ttl << 8)); nh->ttl = new_ttl; } -static int set_ipv4(struct sk_buff *skb, struct sw_flow_key *key, - const struct ovs_key_ipv4 *ipv4_key) +static int set_ipv4(struct sk_buff *skb, struct sw_flow_key *flow_key, + const struct ovs_key_ipv4 *key, + const struct ovs_key_ipv4 *mask) { struct iphdr *nh; + __be32 new_addr; int err; err = skb_ensure_writable(skb, skb_network_offset(skb) + @@ -347,36 +379,49 @@ static int set_ipv4(struct sk_buff *skb, struct sw_flow_key *key, nh = ip_hdr(skb); - if (ipv4_key->ipv4_src != nh->saddr) { - set_ip_addr(skb, nh, &nh->saddr, ipv4_key->ipv4_src); - key->ipv4.addr.src = ipv4_key->ipv4_src; - } + /* Setting an IP addresses is typically only a side effect of + * matching on them in the current userspace implementation, so it + * makes sense to check if the value actually changed. + */ + if (mask->ipv4_src) { + new_addr = MASKED(nh->saddr, key->ipv4_src, mask->ipv4_src); - if (ipv4_key->ipv4_dst != nh->daddr) { - set_ip_addr(skb, nh, &nh->daddr, ipv4_key->ipv4_dst); - key->ipv4.addr.dst = ipv4_key->ipv4_dst; + if (unlikely(new_addr != nh->saddr)) { + set_ip_addr(skb, nh, &nh->saddr, new_addr); + flow_key->ipv4.addr.src = new_addr; + } } + if (mask->ipv4_dst) { + new_addr = MASKED(nh->daddr, key->ipv4_dst, mask->ipv4_dst); - if (ipv4_key->ipv4_tos != nh->tos) { - ipv4_change_dsfield(nh, 0, ipv4_key->ipv4_tos); - key->ip.tos = nh->tos; + if (unlikely(new_addr != nh->daddr)) { + set_ip_addr(skb, nh, &nh->daddr, new_addr); + flow_key->ipv4.addr.dst = new_addr; + } } - - if (ipv4_key->ipv4_ttl != nh->ttl) { - set_ip_ttl(skb, nh, ipv4_key->ipv4_ttl); - key->ip.ttl = ipv4_key->ipv4_ttl; + if (mask->ipv4_tos) { + ipv4_change_dsfield(nh, ~mask->ipv4_tos, key->ipv4_tos); + flow_key->ip.tos = nh->tos; + } + if (mask->ipv4_ttl) { + set_ip_ttl(skb, nh, key->ipv4_ttl, mask->ipv4_ttl); + flow_key->ip.ttl = nh->ttl; } return 0; } -static int set_ipv6(struct sk_buff *skb, struct sw_flow_key *key, - const struct ovs_key_ipv6 *ipv6_key) +static bool is_ipv6_mask_nonzero(const __be32 addr[4]) +{ + return !!(addr[0] | addr[1] | addr[2] | addr[3]); +} + +static int set_ipv6(struct sk_buff *skb, struct sw_flow_key *flow_key, + const struct ovs_key_ipv6 *key, + const struct ovs_key_ipv6 *mask) { struct ipv6hdr *nh; int err; - __be32 *saddr; - __be32 *daddr; err = skb_ensure_writable(skb, skb_network_offset(skb) + sizeof(struct ipv6hdr)); @@ -384,71 +429,77 @@ static int set_ipv6(struct sk_buff *skb, struct sw_flow_key *key, return err; nh = ipv6_hdr(skb); - saddr = (__be32 *)&nh->saddr; - daddr = (__be32 *)&nh->daddr; - - if (memcmp(ipv6_key->ipv6_src, saddr, sizeof(ipv6_key->ipv6_src))) { - set_ipv6_addr(skb, ipv6_key->ipv6_proto, saddr, - ipv6_key->ipv6_src, true); - memcpy(&key->ipv6.addr.src, ipv6_key->ipv6_src, - sizeof(ipv6_key->ipv6_src)); - } - if (memcmp(ipv6_key->ipv6_dst, daddr, sizeof(ipv6_key->ipv6_dst))) { + /* Setting an IP addresses is typically only a side effect of + * matching on them in the current userspace implementation, so it + * makes sense to check if the value actually changed. + */ + if (is_ipv6_mask_nonzero(mask->ipv6_src)) { + __be32 *saddr = (__be32 *)&nh->saddr; + __be32 masked[4]; + + mask_ipv6_addr(saddr, key->ipv6_src, mask->ipv6_src, masked); + + if (unlikely(memcmp(saddr, masked, sizeof(masked)))) { + set_ipv6_addr(skb, key->ipv6_proto, saddr, masked, + true); + memcpy(&flow_key->ipv6.addr.src, masked, + sizeof(flow_key->ipv6.addr.src)); + } + } + if (is_ipv6_mask_nonzero(mask->ipv6_dst)) { unsigned int offset = 0; int flags = IP6_FH_F_SKIP_RH; bool recalc_csum = true; - - if (ipv6_ext_hdr(nh->nexthdr)) - recalc_csum = ipv6_find_hdr(skb, &offset, - NEXTHDR_ROUTING, NULL, - &flags) != NEXTHDR_ROUTING; - - set_ipv6_addr(skb, ipv6_key->ipv6_proto, daddr, - ipv6_key->ipv6_dst, recalc_csum); - memcpy(&key->ipv6.addr.dst, ipv6_key->ipv6_dst, - sizeof(ipv6_key->ipv6_dst)); + __be32 *daddr = (__be32 *)&nh->daddr; + __be32 masked[4]; + + mask_ipv6_addr(daddr, key->ipv6_dst, mask->ipv6_dst, masked); + + if (unlikely(memcmp(daddr, masked, sizeof(masked)))) { + if (ipv6_ext_hdr(nh->nexthdr)) + recalc_csum = (ipv6_find_hdr(skb, &offset, + NEXTHDR_ROUTING, + NULL, &flags) + != NEXTHDR_ROUTING); + + set_ipv6_addr(skb, key->ipv6_proto, daddr, masked, + recalc_csum); + memcpy(&flow_key->ipv6.addr.dst, masked, + sizeof(flow_key->ipv6.addr.dst)); + } + } + if (mask->ipv6_tclass) { + ipv6_change_dsfield(nh, ~mask->ipv6_tclass, key->ipv6_tclass); + flow_key->ip.tos = ipv6_get_dsfield(nh); + } + if (mask->ipv6_label) { + set_ipv6_fl(nh, ntohl(key->ipv6_label), + ntohl(mask->ipv6_label)); + flow_key->ipv6.label = + *(__be32 *)nh & htonl(IPV6_FLOWINFO_FLOWLABEL); + } + if (mask->ipv6_hlimit) { + SET_MASKED(nh->hop_limit, key->ipv6_hlimit, mask->ipv6_hlimit); + flow_key->ip.ttl = nh->hop_limit; } - - set_ipv6_tc(nh, ipv6_key->ipv6_tclass); - key->ip.tos = ipv6_get_dsfield(nh); - - set_ipv6_fl(nh, ntohl(ipv6_key->ipv6_label)); - key->ipv6.label = *(__be32 *)nh & htonl(IPV6_FLOWINFO_FLOWLABEL); - - nh->hop_limit = ipv6_key->ipv6_hlimit; - key->ip.ttl = ipv6_key->ipv6_hlimit; return 0; } /* Must follow skb_ensure_writable() since that can move the skb data. */ static void set_tp_port(struct sk_buff *skb, __be16 *port, - __be16 new_port, __sum16 *check) + __be16 new_port, __sum16 *check) { inet_proto_csum_replace2(check, skb, *port, new_port, 0); *port = new_port; - skb_clear_hash(skb); -} - -static void set_udp_port(struct sk_buff *skb, __be16 *port, __be16 new_port) -{ - struct udphdr *uh = udp_hdr(skb); - - if (uh->check && skb->ip_summed != CHECKSUM_PARTIAL) { - set_tp_port(skb, port, new_port, &uh->check); - - if (!uh->check) - uh->check = CSUM_MANGLED_0; - } else { - *port = new_port; - skb_clear_hash(skb); - } } -static int set_udp(struct sk_buff *skb, struct sw_flow_key *key, - const struct ovs_key_udp *udp_port_key) +static int set_udp(struct sk_buff *skb, struct sw_flow_key *flow_key, + const struct ovs_key_udp *key, + const struct ovs_key_udp *mask) { struct udphdr *uh; + __be16 src, dst; int err; err = skb_ensure_writable(skb, skb_transport_offset(skb) + @@ -457,23 +508,40 @@ static int set_udp(struct sk_buff *skb, struct sw_flow_key *key, return err; uh = udp_hdr(skb); - if (udp_port_key->udp_src != uh->source) { - set_udp_port(skb, &uh->source, udp_port_key->udp_src); - key->tp.src = udp_port_key->udp_src; - } + /* Either of the masks is non-zero, so do not bother checking them. */ + src = MASKED(uh->source, key->udp_src, mask->udp_src); + dst = MASKED(uh->dest, key->udp_dst, mask->udp_dst); - if (udp_port_key->udp_dst != uh->dest) { - set_udp_port(skb, &uh->dest, udp_port_key->udp_dst); - key->tp.dst = udp_port_key->udp_dst; + if (uh->check && skb->ip_summed != CHECKSUM_PARTIAL) { + if (likely(src != uh->source)) { + set_tp_port(skb, &uh->source, src, &uh->check); + flow_key->tp.src = src; + } + if (likely(dst != uh->dest)) { + set_tp_port(skb, &uh->dest, dst, &uh->check); + flow_key->tp.dst = dst; + } + + if (unlikely(!uh->check)) + uh->check = CSUM_MANGLED_0; + } else { + uh->source = src; + uh->dest = dst; + flow_key->tp.src = src; + flow_key->tp.dst = dst; } + skb_clear_hash(skb); + return 0; } -static int set_tcp(struct sk_buff *skb, struct sw_flow_key *key, - const struct ovs_key_tcp *tcp_port_key) +static int set_tcp(struct sk_buff *skb, struct sw_flow_key *flow_key, + const struct ovs_key_tcp *key, + const struct ovs_key_tcp *mask) { struct tcphdr *th; + __be16 src, dst; int err; err = skb_ensure_writable(skb, skb_transport_offset(skb) + @@ -482,50 +550,49 @@ static int set_tcp(struct sk_buff *skb, struct sw_flow_key *key, return err; th = tcp_hdr(skb); - if (tcp_port_key->tcp_src != th->source) { - set_tp_port(skb, &th->source, tcp_port_key->tcp_src, &th->check); - key->tp.src = tcp_port_key->tcp_src; + src = MASKED(th->source, key->tcp_src, mask->tcp_src); + if (likely(src != th->source)) { + set_tp_port(skb, &th->source, src, &th->check); + flow_key->tp.src = src; } - - if (tcp_port_key->tcp_dst != th->dest) { - set_tp_port(skb, &th->dest, tcp_port_key->tcp_dst, &th->check); - key->tp.dst = tcp_port_key->tcp_dst; + dst = MASKED(th->dest, key->tcp_dst, mask->tcp_dst); + if (likely(dst != th->dest)) { + set_tp_port(skb, &th->dest, dst, &th->check); + flow_key->tp.dst = dst; } + skb_clear_hash(skb); return 0; } -static int set_sctp(struct sk_buff *skb, struct sw_flow_key *key, - const struct ovs_key_sctp *sctp_port_key) +static int set_sctp(struct sk_buff *skb, struct sw_flow_key *flow_key, + const struct ovs_key_sctp *key, + const struct ovs_key_sctp *mask) { + unsigned int sctphoff = skb_transport_offset(skb); struct sctphdr *sh; + __le32 old_correct_csum, new_csum, old_csum; int err; - unsigned int sctphoff = skb_transport_offset(skb); err = skb_ensure_writable(skb, sctphoff + sizeof(struct sctphdr)); if (unlikely(err)) return err; sh = sctp_hdr(skb); - if (sctp_port_key->sctp_src != sh->source || - sctp_port_key->sctp_dst != sh->dest) { - __le32 old_correct_csum, new_csum, old_csum; + old_csum = sh->checksum; + old_correct_csum = sctp_compute_cksum(skb, sctphoff); - old_csum = sh->checksum; - old_correct_csum = sctp_compute_cksum(skb, sctphoff); + sh->source = MASKED(sh->source, key->sctp_src, mask->sctp_src); + sh->dest = MASKED(sh->dest, key->sctp_dst, mask->sctp_dst); - sh->source = sctp_port_key->sctp_src; - sh->dest = sctp_port_key->sctp_dst; + new_csum = sctp_compute_cksum(skb, sctphoff); - new_csum = sctp_compute_cksum(skb, sctphoff); + /* Carry any checksum errors through. */ + sh->checksum = old_csum ^ old_correct_csum ^ new_csum; - /* Carry any checksum errors through. */ - sh->checksum = old_csum ^ old_correct_csum ^ new_csum; - - skb_clear_hash(skb); - key->tp.src = sctp_port_key->sctp_src; - key->tp.dst = sctp_port_key->sctp_dst; - } + skb_clear_hash(skb); + flow_key->tp.src = sh->source; + flow_key->tp.dst = sh->dest; return 0; } @@ -653,52 +720,77 @@ static void execute_hash(struct sk_buff *skb, struct sw_flow_key *key, key->ovs_flow_hash = hash; } -static int execute_set_action(struct sk_buff *skb, struct sw_flow_key *key, - const struct nlattr *nested_attr) +static int execute_set_action(struct sk_buff *skb, + struct sw_flow_key *flow_key, + const struct nlattr *a) +{ + /* Only tunnel set execution is supported without a mask. */ + if (nla_type(a) == OVS_KEY_ATTR_TUNNEL_INFO) { + OVS_CB(skb)->egress_tun_info = nla_data(a); + return 0; + } + + return -EINVAL; +} + +/* Mask is at the midpoint of the data. */ +#define get_mask(a, type) ((const type)nla_data(a) + 1) + +static int execute_masked_set_action(struct sk_buff *skb, + struct sw_flow_key *flow_key, + const struct nlattr *a) { int err = 0; - switch (nla_type(nested_attr)) { + switch (nla_type(a)) { case OVS_KEY_ATTR_PRIORITY: - skb->priority = nla_get_u32(nested_attr); - key->phy.priority = skb->priority; + SET_MASKED(skb->priority, nla_get_u32(a), *get_mask(a, u32 *)); + flow_key->phy.priority = skb->priority; break; case OVS_KEY_ATTR_SKB_MARK: - skb->mark = nla_get_u32(nested_attr); - key->phy.skb_mark = skb->mark; + SET_MASKED(skb->mark, nla_get_u32(a), *get_mask(a, u32 *)); + flow_key->phy.skb_mark = skb->mark; break; case OVS_KEY_ATTR_TUNNEL_INFO: - OVS_CB(skb)->egress_tun_info = nla_data(nested_attr); + /* Masked data not supported for tunnel. */ + err = -EINVAL; break; case OVS_KEY_ATTR_ETHERNET: - err = set_eth_addr(skb, key, nla_data(nested_attr)); + err = set_eth_addr(skb, flow_key, nla_data(a), + get_mask(a, struct ovs_key_ethernet *)); break; case OVS_KEY_ATTR_IPV4: - err = set_ipv4(skb, key, nla_data(nested_attr)); + err = set_ipv4(skb, flow_key, nla_data(a), + get_mask(a, struct ovs_key_ipv4 *)); break; case OVS_KEY_ATTR_IPV6: - err = set_ipv6(skb, key, nla_data(nested_attr)); + err = set_ipv6(skb, flow_key, nla_data(a), + get_mask(a, struct ovs_key_ipv6 *)); break; case OVS_KEY_ATTR_TCP: - err = set_tcp(skb, key, nla_data(nested_attr)); + err = set_tcp(skb, flow_key, nla_data(a), + get_mask(a, struct ovs_key_tcp *)); break; case OVS_KEY_ATTR_UDP: - err = set_udp(skb, key, nla_data(nested_attr)); + err = set_udp(skb, flow_key, nla_data(a), + get_mask(a, struct ovs_key_udp *)); break; case OVS_KEY_ATTR_SCTP: - err = set_sctp(skb, key, nla_data(nested_attr)); + err = set_sctp(skb, flow_key, nla_data(a), + get_mask(a, struct ovs_key_sctp *)); break; case OVS_KEY_ATTR_MPLS: - err = set_mpls(skb, key, nla_data(nested_attr)); + err = set_mpls(skb, flow_key, nla_data(a), get_mask(a, + __be32 *)); break; } @@ -818,6 +910,11 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, err = execute_set_action(skb, key, nla_data(a)); break; + case OVS_ACTION_ATTR_SET_MASKED: + case OVS_ACTION_ATTR_SET_TO_MASKED: + err = execute_masked_set_action(skb, key, nla_data(a)); + break; + case OVS_ACTION_ATTR_SAMPLE: err = sample(dp, skb, key, a); break; diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index b07349e82d78..ae5e77cdc0ca 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -65,6 +65,8 @@ static struct genl_family dp_packet_genl_family; static struct genl_family dp_flow_genl_family; static struct genl_family dp_datapath_genl_family; +static const struct nla_policy flow_policy[]; + static const struct genl_multicast_group ovs_dp_flow_multicast_group = { .name = OVS_FLOW_MCGROUP, }; @@ -419,7 +421,7 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, if (!dp_ifindex) return -ENODEV; - if (vlan_tx_tag_present(skb)) { + if (skb_vlan_tag_present(skb)) { nskb = skb_clone(skb, GFP_ATOMIC); if (!nskb) return -ENOMEM; @@ -461,10 +463,8 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, 0, upcall_info->cmd); upcall->dp_ifindex = dp_ifindex; - nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_KEY); - err = ovs_nla_put_flow(key, key, user_skb); + err = ovs_nla_put_key(key, key, OVS_PACKET_ATTR_KEY, false, user_skb); BUG_ON(err); - nla_nest_end(user_skb, nla); if (upcall_info->userdata) __nla_put(user_skb, OVS_PACKET_ATTR_USERDATA, @@ -664,46 +664,48 @@ static void get_dp_stats(const struct datapath *dp, struct ovs_dp_stats *stats, } } -static size_t ovs_flow_cmd_msg_size(const struct sw_flow_actions *acts) +static bool should_fill_key(const struct sw_flow_id *sfid, uint32_t ufid_flags) { - return NLMSG_ALIGN(sizeof(struct ovs_header)) - + nla_total_size(ovs_key_attr_size()) /* OVS_FLOW_ATTR_KEY */ - + nla_total_size(ovs_key_attr_size()) /* OVS_FLOW_ATTR_MASK */ - + nla_total_size(sizeof(struct ovs_flow_stats)) /* OVS_FLOW_ATTR_STATS */ - + nla_total_size(1) /* OVS_FLOW_ATTR_TCP_FLAGS */ - + nla_total_size(8) /* OVS_FLOW_ATTR_USED */ - + nla_total_size(acts->actions_len); /* OVS_FLOW_ATTR_ACTIONS */ + return ovs_identifier_is_ufid(sfid) && + !(ufid_flags & OVS_UFID_F_OMIT_KEY); } -/* Called with ovs_mutex or RCU read lock. */ -static int ovs_flow_cmd_fill_match(const struct sw_flow *flow, - struct sk_buff *skb) +static bool should_fill_mask(uint32_t ufid_flags) { - struct nlattr *nla; - int err; + return !(ufid_flags & OVS_UFID_F_OMIT_MASK); +} - /* Fill flow key. */ - nla = nla_nest_start(skb, OVS_FLOW_ATTR_KEY); - if (!nla) - return -EMSGSIZE; +static bool should_fill_actions(uint32_t ufid_flags) +{ + return !(ufid_flags & OVS_UFID_F_OMIT_ACTIONS); +} - err = ovs_nla_put_flow(&flow->unmasked_key, &flow->unmasked_key, skb); - if (err) - return err; +static size_t ovs_flow_cmd_msg_size(const struct sw_flow_actions *acts, + const struct sw_flow_id *sfid, + uint32_t ufid_flags) +{ + size_t len = NLMSG_ALIGN(sizeof(struct ovs_header)); - nla_nest_end(skb, nla); + /* OVS_FLOW_ATTR_UFID */ + if (sfid && ovs_identifier_is_ufid(sfid)) + len += nla_total_size(sfid->ufid_len); - /* Fill flow mask. */ - nla = nla_nest_start(skb, OVS_FLOW_ATTR_MASK); - if (!nla) - return -EMSGSIZE; + /* OVS_FLOW_ATTR_KEY */ + if (!sfid || should_fill_key(sfid, ufid_flags)) + len += nla_total_size(ovs_key_attr_size()); - err = ovs_nla_put_flow(&flow->key, &flow->mask->key, skb); - if (err) - return err; + /* OVS_FLOW_ATTR_MASK */ + if (should_fill_mask(ufid_flags)) + len += nla_total_size(ovs_key_attr_size()); - nla_nest_end(skb, nla); - return 0; + /* OVS_FLOW_ATTR_ACTIONS */ + if (should_fill_actions(ufid_flags)) + len += nla_total_size(acts->actions_len); + + return len + + nla_total_size(sizeof(struct ovs_flow_stats)) /* OVS_FLOW_ATTR_STATS */ + + nla_total_size(1) /* OVS_FLOW_ATTR_TCP_FLAGS */ + + nla_total_size(8); /* OVS_FLOW_ATTR_USED */ } /* Called with ovs_mutex or RCU read lock. */ @@ -774,7 +776,7 @@ static int ovs_flow_cmd_fill_actions(const struct sw_flow *flow, /* Called with ovs_mutex or RCU read lock. */ static int ovs_flow_cmd_fill_info(const struct sw_flow *flow, int dp_ifindex, struct sk_buff *skb, u32 portid, - u32 seq, u32 flags, u8 cmd) + u32 seq, u32 flags, u8 cmd, u32 ufid_flags) { const int skb_orig_len = skb->len; struct ovs_header *ovs_header; @@ -787,19 +789,34 @@ static int ovs_flow_cmd_fill_info(const struct sw_flow *flow, int dp_ifindex, ovs_header->dp_ifindex = dp_ifindex; - err = ovs_flow_cmd_fill_match(flow, skb); + err = ovs_nla_put_identifier(flow, skb); if (err) goto error; + if (should_fill_key(&flow->id, ufid_flags)) { + err = ovs_nla_put_masked_key(flow, skb); + if (err) + goto error; + } + + if (should_fill_mask(ufid_flags)) { + err = ovs_nla_put_mask(flow, skb); + if (err) + goto error; + } + err = ovs_flow_cmd_fill_stats(flow, skb); if (err) goto error; - err = ovs_flow_cmd_fill_actions(flow, skb, skb_orig_len); - if (err) - goto error; + if (should_fill_actions(ufid_flags)) { + err = ovs_flow_cmd_fill_actions(flow, skb, skb_orig_len); + if (err) + goto error; + } - return genlmsg_end(skb, ovs_header); + genlmsg_end(skb, ovs_header); + return 0; error: genlmsg_cancel(skb, ovs_header); @@ -808,15 +825,19 @@ error: /* May not be called with RCU read lock. */ static struct sk_buff *ovs_flow_cmd_alloc_info(const struct sw_flow_actions *acts, + const struct sw_flow_id *sfid, struct genl_info *info, - bool always) + bool always, + uint32_t ufid_flags) { struct sk_buff *skb; + size_t len; if (!always && !ovs_must_notify(&dp_flow_genl_family, info, 0)) return NULL; - skb = genlmsg_new_unicast(ovs_flow_cmd_msg_size(acts), info, GFP_KERNEL); + len = ovs_flow_cmd_msg_size(acts, sfid, ufid_flags); + skb = genlmsg_new_unicast(len, info, GFP_KERNEL); if (!skb) return ERR_PTR(-ENOMEM); @@ -827,19 +848,19 @@ static struct sk_buff *ovs_flow_cmd_alloc_info(const struct sw_flow_actions *act static struct sk_buff *ovs_flow_cmd_build_info(const struct sw_flow *flow, int dp_ifindex, struct genl_info *info, u8 cmd, - bool always) + bool always, u32 ufid_flags) { struct sk_buff *skb; int retval; - skb = ovs_flow_cmd_alloc_info(ovsl_dereference(flow->sf_acts), info, - always); + skb = ovs_flow_cmd_alloc_info(ovsl_dereference(flow->sf_acts), + &flow->id, info, always, ufid_flags); if (IS_ERR_OR_NULL(skb)) return skb; retval = ovs_flow_cmd_fill_info(flow, dp_ifindex, skb, info->snd_portid, info->snd_seq, 0, - cmd); + cmd, ufid_flags); BUG_ON(retval < 0); return skb; } @@ -848,12 +869,14 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) { struct nlattr **a = info->attrs; struct ovs_header *ovs_header = info->userhdr; - struct sw_flow *flow, *new_flow; + struct sw_flow *flow = NULL, *new_flow; struct sw_flow_mask mask; struct sk_buff *reply; struct datapath *dp; + struct sw_flow_key key; struct sw_flow_actions *acts; struct sw_flow_match match; + u32 ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]); int error; bool log = !a[OVS_FLOW_ATTR_PROBE]; @@ -878,13 +901,19 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) } /* Extract key. */ - ovs_match_init(&match, &new_flow->unmasked_key, &mask); + ovs_match_init(&match, &key, &mask); error = ovs_nla_get_match(&match, a[OVS_FLOW_ATTR_KEY], a[OVS_FLOW_ATTR_MASK], log); if (error) goto err_kfree_flow; - ovs_flow_mask_key(&new_flow->key, &new_flow->unmasked_key, &mask); + ovs_flow_mask_key(&new_flow->key, &key, &mask); + + /* Extract flow identifier. */ + error = ovs_nla_get_identifier(&new_flow->id, a[OVS_FLOW_ATTR_UFID], + &key, log); + if (error) + goto err_kfree_flow; /* Validate actions. */ error = ovs_nla_copy_actions(a[OVS_FLOW_ATTR_ACTIONS], &new_flow->key, @@ -894,7 +923,8 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) goto err_kfree_flow; } - reply = ovs_flow_cmd_alloc_info(acts, info, false); + reply = ovs_flow_cmd_alloc_info(acts, &new_flow->id, info, false, + ufid_flags); if (IS_ERR(reply)) { error = PTR_ERR(reply); goto err_kfree_acts; @@ -906,8 +936,12 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) error = -ENODEV; goto err_unlock_ovs; } + /* Check if this is a duplicate flow */ - flow = ovs_flow_tbl_lookup(&dp->table, &new_flow->unmasked_key); + if (ovs_identifier_is_ufid(&new_flow->id)) + flow = ovs_flow_tbl_lookup_ufid(&dp->table, &new_flow->id); + if (!flow) + flow = ovs_flow_tbl_lookup(&dp->table, &key); if (likely(!flow)) { rcu_assign_pointer(new_flow->sf_acts, acts); @@ -923,7 +957,8 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) ovs_header->dp_ifindex, reply, info->snd_portid, info->snd_seq, 0, - OVS_FLOW_CMD_NEW); + OVS_FLOW_CMD_NEW, + ufid_flags); BUG_ON(error < 0); } ovs_unlock(); @@ -941,10 +976,15 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) error = -EEXIST; goto err_unlock_ovs; } - /* The unmasked key has to be the same for flow updates. */ - if (unlikely(!ovs_flow_cmp_unmasked_key(flow, &match))) { - /* Look for any overlapping flow. */ - flow = ovs_flow_tbl_lookup_exact(&dp->table, &match); + /* The flow identifier has to be the same for flow updates. + * Look for any overlapping flow. + */ + if (unlikely(!ovs_flow_cmp(flow, &match))) { + if (ovs_identifier_is_key(&flow->id)) + flow = ovs_flow_tbl_lookup_exact(&dp->table, + &match); + else /* UFID matches but key is different */ + flow = NULL; if (!flow) { error = -ENOENT; goto err_unlock_ovs; @@ -959,7 +999,8 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) ovs_header->dp_ifindex, reply, info->snd_portid, info->snd_seq, 0, - OVS_FLOW_CMD_NEW); + OVS_FLOW_CMD_NEW, + ufid_flags); BUG_ON(error < 0); } ovs_unlock(); @@ -1015,8 +1056,11 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info) struct datapath *dp; struct sw_flow_actions *old_acts = NULL, *acts = NULL; struct sw_flow_match match; + struct sw_flow_id sfid; + u32 ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]); int error; bool log = !a[OVS_FLOW_ATTR_PROBE]; + bool ufid_present; /* Extract key. */ error = -EINVAL; @@ -1025,6 +1069,7 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info) goto error; } + ufid_present = ovs_nla_get_ufid(&sfid, a[OVS_FLOW_ATTR_UFID], log); ovs_match_init(&match, &key, &mask); error = ovs_nla_get_match(&match, a[OVS_FLOW_ATTR_KEY], a[OVS_FLOW_ATTR_MASK], log); @@ -1041,7 +1086,8 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info) } /* Can allocate before locking if have acts. */ - reply = ovs_flow_cmd_alloc_info(acts, info, false); + reply = ovs_flow_cmd_alloc_info(acts, &sfid, info, false, + ufid_flags); if (IS_ERR(reply)) { error = PTR_ERR(reply); goto err_kfree_acts; @@ -1055,7 +1101,10 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info) goto err_unlock_ovs; } /* Check that the flow exists. */ - flow = ovs_flow_tbl_lookup_exact(&dp->table, &match); + if (ufid_present) + flow = ovs_flow_tbl_lookup_ufid(&dp->table, &sfid); + else + flow = ovs_flow_tbl_lookup_exact(&dp->table, &match); if (unlikely(!flow)) { error = -ENOENT; goto err_unlock_ovs; @@ -1071,13 +1120,16 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info) ovs_header->dp_ifindex, reply, info->snd_portid, info->snd_seq, 0, - OVS_FLOW_CMD_NEW); + OVS_FLOW_CMD_NEW, + ufid_flags); BUG_ON(error < 0); } } else { /* Could not alloc without acts before locking. */ reply = ovs_flow_cmd_build_info(flow, ovs_header->dp_ifindex, - info, OVS_FLOW_CMD_NEW, false); + info, OVS_FLOW_CMD_NEW, false, + ufid_flags); + if (unlikely(IS_ERR(reply))) { error = PTR_ERR(reply); goto err_unlock_ovs; @@ -1114,17 +1166,22 @@ static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info) struct sw_flow *flow; struct datapath *dp; struct sw_flow_match match; - int err; + struct sw_flow_id ufid; + u32 ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]); + int err = 0; bool log = !a[OVS_FLOW_ATTR_PROBE]; + bool ufid_present; - if (!a[OVS_FLOW_ATTR_KEY]) { + ufid_present = ovs_nla_get_ufid(&ufid, a[OVS_FLOW_ATTR_UFID], log); + if (a[OVS_FLOW_ATTR_KEY]) { + ovs_match_init(&match, &key, NULL); + err = ovs_nla_get_match(&match, a[OVS_FLOW_ATTR_KEY], NULL, + log); + } else if (!ufid_present) { OVS_NLERR(log, "Flow get message rejected, Key attribute missing."); - return -EINVAL; + err = -EINVAL; } - - ovs_match_init(&match, &key, NULL); - err = ovs_nla_get_match(&match, a[OVS_FLOW_ATTR_KEY], NULL, log); if (err) return err; @@ -1135,14 +1192,17 @@ static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info) goto unlock; } - flow = ovs_flow_tbl_lookup_exact(&dp->table, &match); + if (ufid_present) + flow = ovs_flow_tbl_lookup_ufid(&dp->table, &ufid); + else + flow = ovs_flow_tbl_lookup_exact(&dp->table, &match); if (!flow) { err = -ENOENT; goto unlock; } reply = ovs_flow_cmd_build_info(flow, ovs_header->dp_ifindex, info, - OVS_FLOW_CMD_NEW, true); + OVS_FLOW_CMD_NEW, true, ufid_flags); if (IS_ERR(reply)) { err = PTR_ERR(reply); goto unlock; @@ -1161,13 +1221,17 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info) struct ovs_header *ovs_header = info->userhdr; struct sw_flow_key key; struct sk_buff *reply; - struct sw_flow *flow; + struct sw_flow *flow = NULL; struct datapath *dp; struct sw_flow_match match; + struct sw_flow_id ufid; + u32 ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]); int err; bool log = !a[OVS_FLOW_ATTR_PROBE]; + bool ufid_present; - if (likely(a[OVS_FLOW_ATTR_KEY])) { + ufid_present = ovs_nla_get_ufid(&ufid, a[OVS_FLOW_ATTR_UFID], log); + if (a[OVS_FLOW_ATTR_KEY]) { ovs_match_init(&match, &key, NULL); err = ovs_nla_get_match(&match, a[OVS_FLOW_ATTR_KEY], NULL, log); @@ -1182,12 +1246,15 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info) goto unlock; } - if (unlikely(!a[OVS_FLOW_ATTR_KEY])) { + if (unlikely(!a[OVS_FLOW_ATTR_KEY] && !ufid_present)) { err = ovs_flow_tbl_flush(&dp->table); goto unlock; } - flow = ovs_flow_tbl_lookup_exact(&dp->table, &match); + if (ufid_present) + flow = ovs_flow_tbl_lookup_ufid(&dp->table, &ufid); + else + flow = ovs_flow_tbl_lookup_exact(&dp->table, &match); if (unlikely(!flow)) { err = -ENOENT; goto unlock; @@ -1197,14 +1264,15 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info) ovs_unlock(); reply = ovs_flow_cmd_alloc_info((const struct sw_flow_actions __force *) flow->sf_acts, - info, false); + &flow->id, info, false, ufid_flags); if (likely(reply)) { if (likely(!IS_ERR(reply))) { rcu_read_lock(); /*To keep RCU checker happy. */ err = ovs_flow_cmd_fill_info(flow, ovs_header->dp_ifindex, reply, info->snd_portid, info->snd_seq, 0, - OVS_FLOW_CMD_DEL); + OVS_FLOW_CMD_DEL, + ufid_flags); rcu_read_unlock(); BUG_ON(err < 0); @@ -1223,9 +1291,18 @@ unlock: static int ovs_flow_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb) { + struct nlattr *a[__OVS_FLOW_ATTR_MAX]; struct ovs_header *ovs_header = genlmsg_data(nlmsg_data(cb->nlh)); struct table_instance *ti; struct datapath *dp; + u32 ufid_flags; + int err; + + err = genlmsg_parse(cb->nlh, &dp_flow_genl_family, a, + OVS_FLOW_ATTR_MAX, flow_policy); + if (err) + return err; + ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]); rcu_read_lock(); dp = get_dp_rcu(sock_net(skb->sk), ovs_header->dp_ifindex); @@ -1248,7 +1325,7 @@ static int ovs_flow_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb) if (ovs_flow_cmd_fill_info(flow, ovs_header->dp_ifindex, skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, - OVS_FLOW_CMD_NEW) < 0) + OVS_FLOW_CMD_NEW, ufid_flags) < 0) break; cb->args[0] = bucket; @@ -1264,6 +1341,8 @@ static const struct nla_policy flow_policy[OVS_FLOW_ATTR_MAX + 1] = { [OVS_FLOW_ATTR_ACTIONS] = { .type = NLA_NESTED }, [OVS_FLOW_ATTR_CLEAR] = { .type = NLA_FLAG }, [OVS_FLOW_ATTR_PROBE] = { .type = NLA_FLAG }, + [OVS_FLOW_ATTR_UFID] = { .type = NLA_UNSPEC, .len = 1 }, + [OVS_FLOW_ATTR_UFID_FLAGS] = { .type = NLA_U32 }, }; static const struct genl_ops dp_flow_genl_ops[] = { @@ -1349,7 +1428,8 @@ static int ovs_dp_cmd_fill_info(struct datapath *dp, struct sk_buff *skb, if (nla_put_u32(skb, OVS_DP_ATTR_USER_FEATURES, dp->user_features)) goto nla_put_failure; - return genlmsg_end(skb, ovs_header); + genlmsg_end(skb, ovs_header); + return 0; nla_put_failure: genlmsg_cancel(skb, ovs_header); @@ -1723,7 +1803,8 @@ static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb, if (err == -EMSGSIZE) goto error; - return genlmsg_end(skb, ovs_header); + genlmsg_end(skb, ovs_header); + return 0; nla_put_failure: err = -EMSGSIZE; diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index da2fae0873a5..e2c348b8baca 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -70,7 +70,7 @@ void ovs_flow_stats_update(struct sw_flow *flow, __be16 tcp_flags, { struct flow_stats *stats; int node = numa_node_id(); - int len = skb->len + (vlan_tx_tag_present(skb) ? VLAN_HLEN : 0); + int len = skb->len + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0); stats = rcu_dereference(flow->stats[node]); @@ -472,7 +472,7 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) */ key->eth.tci = 0; - if (vlan_tx_tag_present(skb)) + if (skb_vlan_tag_present(skb)) key->eth.tci = htons(skb->vlan_tci); else if (eth->h_proto == htons(ETH_P_8021Q)) if (unlikely(parse_vlan(skb, key))) @@ -691,7 +691,7 @@ int ovs_flow_key_extract(const struct ovs_tunnel_info *tun_info, BUILD_BUG_ON((1 << (sizeof(tun_info->options_len) * 8)) - 1 > sizeof(key->tun_opts)); - memcpy(GENEVE_OPTS(key, tun_info->options_len), + memcpy(TUN_METADATA_OPTS(key, tun_info->options_len), tun_info->options, tun_info->options_len); key->tun_opts_len = tun_info->options_len; } else { diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index a8b30f334388..a076e445ccc2 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -53,7 +53,7 @@ struct ovs_key_ipv4_tunnel { struct ovs_tunnel_info { struct ovs_key_ipv4_tunnel tunnel; - const struct geneve_opt *options; + const void *options; u8 options_len; }; @@ -61,10 +61,10 @@ struct ovs_tunnel_info { * maximum size. This allows us to get the benefits of variable length * matching for small options. */ -#define GENEVE_OPTS(flow_key, opt_len) \ - ((struct geneve_opt *)((flow_key)->tun_opts + \ - FIELD_SIZEOF(struct sw_flow_key, tun_opts) - \ - opt_len)) +#define TUN_METADATA_OFFSET(opt_len) \ + (FIELD_SIZEOF(struct sw_flow_key, tun_opts) - opt_len) +#define TUN_METADATA_OPTS(flow_key, opt_len) \ + ((void *)((flow_key)->tun_opts + TUN_METADATA_OFFSET(opt_len))) static inline void __ovs_flow_tun_info_init(struct ovs_tunnel_info *tun_info, __be32 saddr, __be32 daddr, @@ -73,7 +73,7 @@ static inline void __ovs_flow_tun_info_init(struct ovs_tunnel_info *tun_info, __be16 tp_dst, __be64 tun_id, __be16 tun_flags, - const struct geneve_opt *opts, + const void *opts, u8 opts_len) { tun_info->tunnel.tun_id = tun_id; @@ -105,7 +105,7 @@ static inline void ovs_flow_tun_info_init(struct ovs_tunnel_info *tun_info, __be16 tp_dst, __be64 tun_id, __be16 tun_flags, - const struct geneve_opt *opts, + const void *opts, u8 opts_len) { __ovs_flow_tun_info_init(tun_info, iph->saddr, iph->daddr, @@ -197,6 +197,16 @@ struct sw_flow_match { struct sw_flow_mask *mask; }; +#define MAX_UFID_LENGTH 16 /* 128 bits */ + +struct sw_flow_id { + u32 ufid_len; + union { + u32 ufid[MAX_UFID_LENGTH / 4]; + struct sw_flow_key *unmasked_key; + }; +}; + struct sw_flow_actions { struct rcu_head rcu; u32 actions_len; @@ -213,13 +223,15 @@ struct flow_stats { struct sw_flow { struct rcu_head rcu; - struct hlist_node hash_node[2]; - u32 hash; + struct { + struct hlist_node node[2]; + u32 hash; + } flow_table, ufid_table; int stats_last_writer; /* NUMA-node id of the last writer on * 'stats[0]'. */ struct sw_flow_key key; - struct sw_flow_key unmasked_key; + struct sw_flow_id id; struct sw_flow_mask *mask; struct sw_flow_actions __rcu *sf_acts; struct flow_stats __rcu *stats[]; /* One for each NUMA node. First one @@ -243,6 +255,16 @@ struct arp_eth_header { unsigned char ar_tip[4]; /* target IP address */ } __packed; +static inline bool ovs_identifier_is_ufid(const struct sw_flow_id *sfid) +{ + return sfid->ufid_len; +} + +static inline bool ovs_identifier_is_key(const struct sw_flow_id *sfid) +{ + return !ovs_identifier_is_ufid(sfid); +} + void ovs_flow_stats_update(struct sw_flow *, __be16 tcp_flags, const struct sk_buff *); void ovs_flow_stats_get(const struct sw_flow *, struct ovs_flow_stats *, diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index d1eecf707613..993281e6278d 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -49,6 +49,14 @@ #include <net/mpls.h> #include "flow_netlink.h" +#include "vport-vxlan.h" + +struct ovs_len_tbl { + int len; + const struct ovs_len_tbl *next; +}; + +#define OVS_ATTR_NESTED -1 static void update_range(struct sw_flow_match *match, size_t offset, size_t size, bool is_mask) @@ -261,6 +269,9 @@ size_t ovs_tun_key_attr_size(void) + nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_CSUM */ + nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_OAM */ + nla_total_size(256) /* OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS */ + /* OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS is mutually exclusive with + * OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS and covered by it. + */ + nla_total_size(2) /* OVS_TUNNEL_KEY_ATTR_TP_SRC */ + nla_total_size(2); /* OVS_TUNNEL_KEY_ATTR_TP_DST */ } @@ -289,29 +300,45 @@ size_t ovs_key_attr_size(void) + nla_total_size(28); /* OVS_KEY_ATTR_ND */ } +static const struct ovs_len_tbl ovs_tunnel_key_lens[OVS_TUNNEL_KEY_ATTR_MAX + 1] = { + [OVS_TUNNEL_KEY_ATTR_ID] = { .len = sizeof(u64) }, + [OVS_TUNNEL_KEY_ATTR_IPV4_SRC] = { .len = sizeof(u32) }, + [OVS_TUNNEL_KEY_ATTR_IPV4_DST] = { .len = sizeof(u32) }, + [OVS_TUNNEL_KEY_ATTR_TOS] = { .len = 1 }, + [OVS_TUNNEL_KEY_ATTR_TTL] = { .len = 1 }, + [OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT] = { .len = 0 }, + [OVS_TUNNEL_KEY_ATTR_CSUM] = { .len = 0 }, + [OVS_TUNNEL_KEY_ATTR_TP_SRC] = { .len = sizeof(u16) }, + [OVS_TUNNEL_KEY_ATTR_TP_DST] = { .len = sizeof(u16) }, + [OVS_TUNNEL_KEY_ATTR_OAM] = { .len = 0 }, + [OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS] = { .len = OVS_ATTR_NESTED }, + [OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS] = { .len = OVS_ATTR_NESTED }, +}; + /* The size of the argument for each %OVS_KEY_ATTR_* Netlink attribute. */ -static const int ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = { - [OVS_KEY_ATTR_ENCAP] = -1, - [OVS_KEY_ATTR_PRIORITY] = sizeof(u32), - [OVS_KEY_ATTR_IN_PORT] = sizeof(u32), - [OVS_KEY_ATTR_SKB_MARK] = sizeof(u32), - [OVS_KEY_ATTR_ETHERNET] = sizeof(struct ovs_key_ethernet), - [OVS_KEY_ATTR_VLAN] = sizeof(__be16), - [OVS_KEY_ATTR_ETHERTYPE] = sizeof(__be16), - [OVS_KEY_ATTR_IPV4] = sizeof(struct ovs_key_ipv4), - [OVS_KEY_ATTR_IPV6] = sizeof(struct ovs_key_ipv6), - [OVS_KEY_ATTR_TCP] = sizeof(struct ovs_key_tcp), - [OVS_KEY_ATTR_TCP_FLAGS] = sizeof(__be16), - [OVS_KEY_ATTR_UDP] = sizeof(struct ovs_key_udp), - [OVS_KEY_ATTR_SCTP] = sizeof(struct ovs_key_sctp), - [OVS_KEY_ATTR_ICMP] = sizeof(struct ovs_key_icmp), - [OVS_KEY_ATTR_ICMPV6] = sizeof(struct ovs_key_icmpv6), - [OVS_KEY_ATTR_ARP] = sizeof(struct ovs_key_arp), - [OVS_KEY_ATTR_ND] = sizeof(struct ovs_key_nd), - [OVS_KEY_ATTR_RECIRC_ID] = sizeof(u32), - [OVS_KEY_ATTR_DP_HASH] = sizeof(u32), - [OVS_KEY_ATTR_TUNNEL] = -1, - [OVS_KEY_ATTR_MPLS] = sizeof(struct ovs_key_mpls), +static const struct ovs_len_tbl ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = { + [OVS_KEY_ATTR_ENCAP] = { .len = OVS_ATTR_NESTED }, + [OVS_KEY_ATTR_PRIORITY] = { .len = sizeof(u32) }, + [OVS_KEY_ATTR_IN_PORT] = { .len = sizeof(u32) }, + [OVS_KEY_ATTR_SKB_MARK] = { .len = sizeof(u32) }, + [OVS_KEY_ATTR_ETHERNET] = { .len = sizeof(struct ovs_key_ethernet) }, + [OVS_KEY_ATTR_VLAN] = { .len = sizeof(__be16) }, + [OVS_KEY_ATTR_ETHERTYPE] = { .len = sizeof(__be16) }, + [OVS_KEY_ATTR_IPV4] = { .len = sizeof(struct ovs_key_ipv4) }, + [OVS_KEY_ATTR_IPV6] = { .len = sizeof(struct ovs_key_ipv6) }, + [OVS_KEY_ATTR_TCP] = { .len = sizeof(struct ovs_key_tcp) }, + [OVS_KEY_ATTR_TCP_FLAGS] = { .len = sizeof(__be16) }, + [OVS_KEY_ATTR_UDP] = { .len = sizeof(struct ovs_key_udp) }, + [OVS_KEY_ATTR_SCTP] = { .len = sizeof(struct ovs_key_sctp) }, + [OVS_KEY_ATTR_ICMP] = { .len = sizeof(struct ovs_key_icmp) }, + [OVS_KEY_ATTR_ICMPV6] = { .len = sizeof(struct ovs_key_icmpv6) }, + [OVS_KEY_ATTR_ARP] = { .len = sizeof(struct ovs_key_arp) }, + [OVS_KEY_ATTR_ND] = { .len = sizeof(struct ovs_key_nd) }, + [OVS_KEY_ATTR_RECIRC_ID] = { .len = sizeof(u32) }, + [OVS_KEY_ATTR_DP_HASH] = { .len = sizeof(u32) }, + [OVS_KEY_ATTR_TUNNEL] = { .len = OVS_ATTR_NESTED, + .next = ovs_tunnel_key_lens, }, + [OVS_KEY_ATTR_MPLS] = { .len = sizeof(struct ovs_key_mpls) }, }; static bool is_all_zero(const u8 *fp, size_t size) @@ -352,8 +379,8 @@ static int __parse_flow_nlattrs(const struct nlattr *attr, return -EINVAL; } - expected_len = ovs_key_lens[type]; - if (nla_len(nla) != expected_len && expected_len != -1) { + expected_len = ovs_key_lens[type].len; + if (nla_len(nla) != expected_len && expected_len != OVS_ATTR_NESTED) { OVS_NLERR(log, "Key %d has unexpected len %d expected %d", type, nla_len(nla), expected_len); return -EINVAL; @@ -432,13 +459,47 @@ static int genev_tun_opt_from_nlattr(const struct nlattr *a, SW_FLOW_KEY_PUT(match, tun_opts_len, 0xff, true); } - opt_key_offset = (unsigned long)GENEVE_OPTS((struct sw_flow_key *)0, - nla_len(a)); + opt_key_offset = TUN_METADATA_OFFSET(nla_len(a)); SW_FLOW_KEY_MEMCPY_OFFSET(match, opt_key_offset, nla_data(a), nla_len(a), is_mask); return 0; } +static const struct nla_policy vxlan_opt_policy[OVS_VXLAN_EXT_MAX + 1] = { + [OVS_VXLAN_EXT_GBP] = { .type = NLA_U32 }, +}; + +static int vxlan_tun_opt_from_nlattr(const struct nlattr *a, + struct sw_flow_match *match, bool is_mask, + bool log) +{ + struct nlattr *tb[OVS_VXLAN_EXT_MAX+1]; + unsigned long opt_key_offset; + struct ovs_vxlan_opts opts; + int err; + + BUILD_BUG_ON(sizeof(opts) > sizeof(match->key->tun_opts)); + + err = nla_parse_nested(tb, OVS_VXLAN_EXT_MAX, a, vxlan_opt_policy); + if (err < 0) + return err; + + memset(&opts, 0, sizeof(opts)); + + if (tb[OVS_VXLAN_EXT_GBP]) + opts.gbp = nla_get_u32(tb[OVS_VXLAN_EXT_GBP]); + + if (!is_mask) + SW_FLOW_KEY_PUT(match, tun_opts_len, sizeof(opts), false); + else + SW_FLOW_KEY_PUT(match, tun_opts_len, 0xff, true); + + opt_key_offset = TUN_METADATA_OFFSET(sizeof(opts)); + SW_FLOW_KEY_MEMCPY_OFFSET(match, opt_key_offset, &opts, sizeof(opts), + is_mask); + return 0; +} + static int ipv4_tun_from_nlattr(const struct nlattr *attr, struct sw_flow_match *match, bool is_mask, bool log) @@ -447,35 +508,22 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr, int rem; bool ttl = false; __be16 tun_flags = 0; + int opts_type = 0; nla_for_each_nested(a, attr, rem) { int type = nla_type(a); int err; - static const u32 ovs_tunnel_key_lens[OVS_TUNNEL_KEY_ATTR_MAX + 1] = { - [OVS_TUNNEL_KEY_ATTR_ID] = sizeof(u64), - [OVS_TUNNEL_KEY_ATTR_IPV4_SRC] = sizeof(u32), - [OVS_TUNNEL_KEY_ATTR_IPV4_DST] = sizeof(u32), - [OVS_TUNNEL_KEY_ATTR_TOS] = 1, - [OVS_TUNNEL_KEY_ATTR_TTL] = 1, - [OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT] = 0, - [OVS_TUNNEL_KEY_ATTR_CSUM] = 0, - [OVS_TUNNEL_KEY_ATTR_TP_SRC] = sizeof(u16), - [OVS_TUNNEL_KEY_ATTR_TP_DST] = sizeof(u16), - [OVS_TUNNEL_KEY_ATTR_OAM] = 0, - [OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS] = -1, - }; - if (type > OVS_TUNNEL_KEY_ATTR_MAX) { OVS_NLERR(log, "Tunnel attr %d out of range max %d", type, OVS_TUNNEL_KEY_ATTR_MAX); return -EINVAL; } - if (ovs_tunnel_key_lens[type] != nla_len(a) && - ovs_tunnel_key_lens[type] != -1) { + if (ovs_tunnel_key_lens[type].len != nla_len(a) && + ovs_tunnel_key_lens[type].len != OVS_ATTR_NESTED) { OVS_NLERR(log, "Tunnel attr %d has unexpected len %d expected %d", - type, nla_len(a), ovs_tunnel_key_lens[type]); + type, nla_len(a), ovs_tunnel_key_lens[type].len); return -EINVAL; } @@ -520,11 +568,30 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr, tun_flags |= TUNNEL_OAM; break; case OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS: + if (opts_type) { + OVS_NLERR(log, "Multiple metadata blocks provided"); + return -EINVAL; + } + err = genev_tun_opt_from_nlattr(a, match, is_mask, log); if (err) return err; - tun_flags |= TUNNEL_OPTIONS_PRESENT; + tun_flags |= TUNNEL_GENEVE_OPT; + opts_type = type; + break; + case OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS: + if (opts_type) { + OVS_NLERR(log, "Multiple metadata blocks provided"); + return -EINVAL; + } + + err = vxlan_tun_opt_from_nlattr(a, match, is_mask, log); + if (err) + return err; + + tun_flags |= TUNNEL_VXLAN_OPT; + opts_type = type; break; default: OVS_NLERR(log, "Unknown IPv4 tunnel attribute %d", @@ -553,13 +620,29 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr, } } + return opts_type; +} + +static int vxlan_opt_to_nlattr(struct sk_buff *skb, + const void *tun_opts, int swkey_tun_opts_len) +{ + const struct ovs_vxlan_opts *opts = tun_opts; + struct nlattr *nla; + + nla = nla_nest_start(skb, OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS); + if (!nla) + return -EMSGSIZE; + + if (nla_put_u32(skb, OVS_VXLAN_EXT_GBP, opts->gbp) < 0) + return -EMSGSIZE; + + nla_nest_end(skb, nla); return 0; } static int __ipv4_tun_to_nlattr(struct sk_buff *skb, const struct ovs_key_ipv4_tunnel *output, - const struct geneve_opt *tun_opts, - int swkey_tun_opts_len) + const void *tun_opts, int swkey_tun_opts_len) { if (output->tun_flags & TUNNEL_KEY && nla_put_be64(skb, OVS_TUNNEL_KEY_ATTR_ID, output->tun_id)) @@ -590,18 +673,22 @@ static int __ipv4_tun_to_nlattr(struct sk_buff *skb, if ((output->tun_flags & TUNNEL_OAM) && nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_OAM)) return -EMSGSIZE; - if (tun_opts && - nla_put(skb, OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS, - swkey_tun_opts_len, tun_opts)) - return -EMSGSIZE; + if (tun_opts) { + if (output->tun_flags & TUNNEL_GENEVE_OPT && + nla_put(skb, OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS, + swkey_tun_opts_len, tun_opts)) + return -EMSGSIZE; + else if (output->tun_flags & TUNNEL_VXLAN_OPT && + vxlan_opt_to_nlattr(skb, tun_opts, swkey_tun_opts_len)) + return -EMSGSIZE; + } return 0; } static int ipv4_tun_to_nlattr(struct sk_buff *skb, const struct ovs_key_ipv4_tunnel *output, - const struct geneve_opt *tun_opts, - int swkey_tun_opts_len) + const void *tun_opts, int swkey_tun_opts_len) { struct nlattr *nla; int err; @@ -675,7 +762,7 @@ static int metadata_from_nlattrs(struct sw_flow_match *match, u64 *attrs, } if (*attrs & (1 << OVS_KEY_ATTR_TUNNEL)) { if (ipv4_tun_from_nlattr(a[OVS_KEY_ATTR_TUNNEL], match, - is_mask, log)) + is_mask, log) < 0) return -EINVAL; *attrs &= ~(1 << OVS_KEY_ATTR_TUNNEL); } @@ -915,18 +1002,16 @@ static int ovs_key_from_nlattrs(struct sw_flow_match *match, u64 attrs, return 0; } -static void nlattr_set(struct nlattr *attr, u8 val, bool is_attr_mask_key) +static void nlattr_set(struct nlattr *attr, u8 val, + const struct ovs_len_tbl *tbl) { struct nlattr *nla; int rem; /* The nlattr stream should already have been validated */ nla_for_each_nested(nla, attr, rem) { - /* We assume that ovs_key_lens[type] == -1 means that type is a - * nested attribute - */ - if (is_attr_mask_key && ovs_key_lens[nla_type(nla)] == -1) - nlattr_set(nla, val, false); + if (tbl && tbl[nla_type(nla)].len == OVS_ATTR_NESTED) + nlattr_set(nla, val, tbl[nla_type(nla)].next); else memset(nla_data(nla), val, nla_len(nla)); } @@ -934,7 +1019,7 @@ static void nlattr_set(struct nlattr *attr, u8 val, bool is_attr_mask_key) static void mask_set_nlattr(struct nlattr *attr, u8 val) { - nlattr_set(attr, val, true); + nlattr_set(attr, val, ovs_key_lens); } /** @@ -1095,6 +1180,59 @@ free_newmask: return err; } +static size_t get_ufid_len(const struct nlattr *attr, bool log) +{ + size_t len; + + if (!attr) + return 0; + + len = nla_len(attr); + if (len < 1 || len > MAX_UFID_LENGTH) { + OVS_NLERR(log, "ufid size %u bytes exceeds the range (1, %d)", + nla_len(attr), MAX_UFID_LENGTH); + return 0; + } + + return len; +} + +/* Initializes 'flow->ufid', returning true if 'attr' contains a valid UFID, + * or false otherwise. + */ +bool ovs_nla_get_ufid(struct sw_flow_id *sfid, const struct nlattr *attr, + bool log) +{ + sfid->ufid_len = get_ufid_len(attr, log); + if (sfid->ufid_len) + memcpy(sfid->ufid, nla_data(attr), sfid->ufid_len); + + return sfid->ufid_len; +} + +int ovs_nla_get_identifier(struct sw_flow_id *sfid, const struct nlattr *ufid, + const struct sw_flow_key *key, bool log) +{ + struct sw_flow_key *new_key; + + if (ovs_nla_get_ufid(sfid, ufid, log)) + return 0; + + /* If UFID was not provided, use unmasked key. */ + new_key = kmalloc(sizeof(*new_key), GFP_KERNEL); + if (!new_key) + return -ENOMEM; + memcpy(new_key, key, sizeof(*key)); + sfid->unmasked_key = new_key; + + return 0; +} + +u32 ovs_nla_get_ufid_flags(const struct nlattr *attr) +{ + return attr ? nla_get_u32(attr) : 0; +} + /** * ovs_nla_get_flow_metadata - parses Netlink attributes into a flow key. * @key: Receives extracted in_port, priority, tun_key and skb_mark. @@ -1131,12 +1269,12 @@ int ovs_nla_get_flow_metadata(const struct nlattr *attr, return metadata_from_nlattrs(&match, &attrs, a, false, log); } -int ovs_nla_put_flow(const struct sw_flow_key *swkey, - const struct sw_flow_key *output, struct sk_buff *skb) +static int __ovs_nla_put_key(const struct sw_flow_key *swkey, + const struct sw_flow_key *output, bool is_mask, + struct sk_buff *skb) { struct ovs_key_ethernet *eth_key; struct nlattr *nla, *encap; - bool is_mask = (swkey != output); if (nla_put_u32(skb, OVS_KEY_ATTR_RECIRC_ID, output->recirc_id)) goto nla_put_failure; @@ -1148,10 +1286,10 @@ int ovs_nla_put_flow(const struct sw_flow_key *swkey, goto nla_put_failure; if ((swkey->tun_key.ipv4_dst || is_mask)) { - const struct geneve_opt *opts = NULL; + const void *opts = NULL; if (output->tun_key.tun_flags & TUNNEL_OPTIONS_PRESENT) - opts = GENEVE_OPTS(output, swkey->tun_opts_len); + opts = TUN_METADATA_OPTS(output, swkey->tun_opts_len); if (ipv4_tun_to_nlattr(skb, &output->tun_key, opts, swkey->tun_opts_len)) @@ -1346,6 +1484,49 @@ nla_put_failure: return -EMSGSIZE; } +int ovs_nla_put_key(const struct sw_flow_key *swkey, + const struct sw_flow_key *output, int attr, bool is_mask, + struct sk_buff *skb) +{ + int err; + struct nlattr *nla; + + nla = nla_nest_start(skb, attr); + if (!nla) + return -EMSGSIZE; + err = __ovs_nla_put_key(swkey, output, is_mask, skb); + if (err) + return err; + nla_nest_end(skb, nla); + + return 0; +} + +/* Called with ovs_mutex or RCU read lock. */ +int ovs_nla_put_identifier(const struct sw_flow *flow, struct sk_buff *skb) +{ + if (ovs_identifier_is_ufid(&flow->id)) + return nla_put(skb, OVS_FLOW_ATTR_UFID, flow->id.ufid_len, + flow->id.ufid); + + return ovs_nla_put_key(flow->id.unmasked_key, flow->id.unmasked_key, + OVS_FLOW_ATTR_KEY, false, skb); +} + +/* Called with ovs_mutex or RCU read lock. */ +int ovs_nla_put_masked_key(const struct sw_flow *flow, struct sk_buff *skb) +{ + return ovs_nla_put_key(&flow->mask->key, &flow->key, + OVS_FLOW_ATTR_KEY, false, skb); +} + +/* Called with ovs_mutex or RCU read lock. */ +int ovs_nla_put_mask(const struct sw_flow *flow, struct sk_buff *skb) +{ + return ovs_nla_put_key(&flow->key, &flow->mask->key, + OVS_FLOW_ATTR_MASK, true, skb); +} + #define MAX_ACTIONS_BUFSIZE (32 * 1024) static struct sw_flow_actions *nla_alloc_flow_actions(int size, bool log) @@ -1514,16 +1695,6 @@ static int validate_and_copy_sample(const struct nlattr *attr, return 0; } -static int validate_tp_port(const struct sw_flow_key *flow_key, - __be16 eth_type) -{ - if ((eth_type == htons(ETH_P_IP) || eth_type == htons(ETH_P_IPV6)) && - (flow_key->tp.src || flow_key->tp.dst)) - return 0; - - return -EINVAL; -} - void ovs_match_init(struct sw_flow_match *match, struct sw_flow_key *key, struct sw_flow_mask *mask) @@ -1540,6 +1711,34 @@ void ovs_match_init(struct sw_flow_match *match, } } +static int validate_geneve_opts(struct sw_flow_key *key) +{ + struct geneve_opt *option; + int opts_len = key->tun_opts_len; + bool crit_opt = false; + + option = (struct geneve_opt *)TUN_METADATA_OPTS(key, key->tun_opts_len); + while (opts_len > 0) { + int len; + + if (opts_len < sizeof(*option)) + return -EINVAL; + + len = sizeof(*option) + option->length * 4; + if (len > opts_len) + return -EINVAL; + + crit_opt |= !!(option->type & GENEVE_CRIT_OPT_TYPE); + + option = (struct geneve_opt *)((u8 *)option + len); + opts_len -= len; + }; + + key->tun_key.tun_flags |= crit_opt ? TUNNEL_CRIT_OPT : 0; + + return 0; +} + static int validate_and_copy_set_tun(const struct nlattr *attr, struct sw_flow_actions **sfa, bool log) { @@ -1547,36 +1746,23 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, struct sw_flow_key key; struct ovs_tunnel_info *tun_info; struct nlattr *a; - int err, start; + int err, start, opts_type; ovs_match_init(&match, &key, NULL); - err = ipv4_tun_from_nlattr(nla_data(attr), &match, false, log); - if (err) - return err; + opts_type = ipv4_tun_from_nlattr(nla_data(attr), &match, false, log); + if (opts_type < 0) + return opts_type; if (key.tun_opts_len) { - struct geneve_opt *option = GENEVE_OPTS(&key, - key.tun_opts_len); - int opts_len = key.tun_opts_len; - bool crit_opt = false; - - while (opts_len > 0) { - int len; - - if (opts_len < sizeof(*option)) - return -EINVAL; - - len = sizeof(*option) + option->length * 4; - if (len > opts_len) - return -EINVAL; - - crit_opt |= !!(option->type & GENEVE_CRIT_OPT_TYPE); - - option = (struct geneve_opt *)((u8 *)option + len); - opts_len -= len; - }; - - key.tun_key.tun_flags |= crit_opt ? TUNNEL_CRIT_OPT : 0; + switch (opts_type) { + case OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS: + err = validate_geneve_opts(&key); + if (err < 0) + return err; + break; + case OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS: + break; + } }; start = add_nested_action_start(sfa, OVS_ACTION_ATTR_SET, log); @@ -1597,9 +1783,9 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, * everything else will go away after flow setup. We can append * it to tun_info and then point there. */ - memcpy((tun_info + 1), GENEVE_OPTS(&key, key.tun_opts_len), - key.tun_opts_len); - tun_info->options = (struct geneve_opt *)(tun_info + 1); + memcpy((tun_info + 1), + TUN_METADATA_OPTS(&key, key.tun_opts_len), key.tun_opts_len); + tun_info->options = (tun_info + 1); } else { tun_info->options = NULL; } @@ -1609,21 +1795,43 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, return err; } +/* Return false if there are any non-masked bits set. + * Mask follows data immediately, before any netlink padding. + */ +static bool validate_masked(u8 *data, int len) +{ + u8 *mask = data + len; + + while (len--) + if (*data++ & ~*mask++) + return false; + + return true; +} + static int validate_set(const struct nlattr *a, const struct sw_flow_key *flow_key, struct sw_flow_actions **sfa, - bool *set_tun, __be16 eth_type, bool log) + bool *skip_copy, __be16 eth_type, bool masked, bool log) { const struct nlattr *ovs_key = nla_data(a); int key_type = nla_type(ovs_key); + size_t key_len; /* There can be only one key in a action */ if (nla_total_size(nla_len(ovs_key)) != nla_len(a)) return -EINVAL; + key_len = nla_len(ovs_key); + if (masked) + key_len /= 2; + if (key_type > OVS_KEY_ATTR_MAX || - (ovs_key_lens[key_type] != nla_len(ovs_key) && - ovs_key_lens[key_type] != -1)) + (ovs_key_lens[key_type].len != key_len && + ovs_key_lens[key_type].len != OVS_ATTR_NESTED)) + return -EINVAL; + + if (masked && !validate_masked(nla_data(ovs_key), key_len)) return -EINVAL; switch (key_type) { @@ -1640,7 +1848,10 @@ static int validate_set(const struct nlattr *a, if (eth_p_mpls(eth_type)) return -EINVAL; - *set_tun = true; + if (masked) + return -EINVAL; /* Masked tunnel set not supported. */ + + *skip_copy = true; err = validate_and_copy_set_tun(a, sfa, log); if (err) return err; @@ -1650,48 +1861,66 @@ static int validate_set(const struct nlattr *a, if (eth_type != htons(ETH_P_IP)) return -EINVAL; - if (!flow_key->ip.proto) - return -EINVAL; - ipv4_key = nla_data(ovs_key); - if (ipv4_key->ipv4_proto != flow_key->ip.proto) - return -EINVAL; - if (ipv4_key->ipv4_frag != flow_key->ip.frag) - return -EINVAL; + if (masked) { + const struct ovs_key_ipv4 *mask = ipv4_key + 1; + + /* Non-writeable fields. */ + if (mask->ipv4_proto || mask->ipv4_frag) + return -EINVAL; + } else { + if (ipv4_key->ipv4_proto != flow_key->ip.proto) + return -EINVAL; + if (ipv4_key->ipv4_frag != flow_key->ip.frag) + return -EINVAL; + } break; case OVS_KEY_ATTR_IPV6: if (eth_type != htons(ETH_P_IPV6)) return -EINVAL; - if (!flow_key->ip.proto) - return -EINVAL; - ipv6_key = nla_data(ovs_key); - if (ipv6_key->ipv6_proto != flow_key->ip.proto) - return -EINVAL; - if (ipv6_key->ipv6_frag != flow_key->ip.frag) - return -EINVAL; + if (masked) { + const struct ovs_key_ipv6 *mask = ipv6_key + 1; + /* Non-writeable fields. */ + if (mask->ipv6_proto || mask->ipv6_frag) + return -EINVAL; + + /* Invalid bits in the flow label mask? */ + if (ntohl(mask->ipv6_label) & 0xFFF00000) + return -EINVAL; + } else { + if (ipv6_key->ipv6_proto != flow_key->ip.proto) + return -EINVAL; + + if (ipv6_key->ipv6_frag != flow_key->ip.frag) + return -EINVAL; + } if (ntohl(ipv6_key->ipv6_label) & 0xFFF00000) return -EINVAL; break; case OVS_KEY_ATTR_TCP: - if (flow_key->ip.proto != IPPROTO_TCP) + if ((eth_type != htons(ETH_P_IP) && + eth_type != htons(ETH_P_IPV6)) || + flow_key->ip.proto != IPPROTO_TCP) return -EINVAL; - return validate_tp_port(flow_key, eth_type); + break; case OVS_KEY_ATTR_UDP: - if (flow_key->ip.proto != IPPROTO_UDP) + if ((eth_type != htons(ETH_P_IP) && + eth_type != htons(ETH_P_IPV6)) || + flow_key->ip.proto != IPPROTO_UDP) return -EINVAL; - return validate_tp_port(flow_key, eth_type); + break; case OVS_KEY_ATTR_MPLS: if (!eth_p_mpls(eth_type)) @@ -1699,15 +1928,45 @@ static int validate_set(const struct nlattr *a, break; case OVS_KEY_ATTR_SCTP: - if (flow_key->ip.proto != IPPROTO_SCTP) + if ((eth_type != htons(ETH_P_IP) && + eth_type != htons(ETH_P_IPV6)) || + flow_key->ip.proto != IPPROTO_SCTP) return -EINVAL; - return validate_tp_port(flow_key, eth_type); + break; default: return -EINVAL; } + /* Convert non-masked non-tunnel set actions to masked set actions. */ + if (!masked && key_type != OVS_KEY_ATTR_TUNNEL) { + int start, len = key_len * 2; + struct nlattr *at; + + *skip_copy = true; + + start = add_nested_action_start(sfa, + OVS_ACTION_ATTR_SET_TO_MASKED, + log); + if (start < 0) + return start; + + at = __add_action(sfa, key_type, NULL, len, log); + if (IS_ERR(at)) + return PTR_ERR(at); + + memcpy(nla_data(at), nla_data(ovs_key), key_len); /* Key. */ + memset(nla_data(at) + key_len, 0xff, key_len); /* Mask. */ + /* Clear non-writeable bits from otherwise writeable fields. */ + if (key_type == OVS_KEY_ATTR_IPV6) { + struct ovs_key_ipv6 *mask = nla_data(at) + key_len; + + mask->ipv6_label &= htonl(0x000FFFFF); + } + add_nested_action_end(*sfa, start); + } + return 0; } @@ -1769,6 +2028,7 @@ static int __ovs_nla_copy_actions(const struct nlattr *attr, [OVS_ACTION_ATTR_PUSH_VLAN] = sizeof(struct ovs_action_push_vlan), [OVS_ACTION_ATTR_POP_VLAN] = 0, [OVS_ACTION_ATTR_SET] = (u32)-1, + [OVS_ACTION_ATTR_SET_MASKED] = (u32)-1, [OVS_ACTION_ATTR_SAMPLE] = (u32)-1, [OVS_ACTION_ATTR_HASH] = sizeof(struct ovs_action_hash) }; @@ -1864,7 +2124,14 @@ static int __ovs_nla_copy_actions(const struct nlattr *attr, case OVS_ACTION_ATTR_SET: err = validate_set(a, key, sfa, - &skip_copy, eth_type, log); + &skip_copy, eth_type, false, log); + if (err) + return err; + break; + + case OVS_ACTION_ATTR_SET_MASKED: + err = validate_set(a, key, sfa, + &skip_copy, eth_type, true, log); if (err) return err; break; @@ -1894,6 +2161,7 @@ static int __ovs_nla_copy_actions(const struct nlattr *attr, return 0; } +/* 'key' must be the masked key. */ int ovs_nla_copy_actions(const struct nlattr *attr, const struct sw_flow_key *key, struct sw_flow_actions **sfa, bool log) @@ -1981,6 +2249,21 @@ static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb) return 0; } +static int masked_set_action_to_set_action_attr(const struct nlattr *a, + struct sk_buff *skb) +{ + const struct nlattr *ovs_key = nla_data(a); + size_t key_len = nla_len(ovs_key) / 2; + + /* Revert the conversion we did from a non-masked set action to + * masked set action. + */ + if (nla_put(skb, OVS_ACTION_ATTR_SET, nla_len(a) - key_len, ovs_key)) + return -EMSGSIZE; + + return 0; +} + int ovs_nla_put_actions(const struct nlattr *attr, int len, struct sk_buff *skb) { const struct nlattr *a; @@ -1996,6 +2279,12 @@ int ovs_nla_put_actions(const struct nlattr *attr, int len, struct sk_buff *skb) return err; break; + case OVS_ACTION_ATTR_SET_TO_MASKED: + err = masked_set_action_to_set_action_attr(a, skb); + if (err) + return err; + break; + case OVS_ACTION_ATTR_SAMPLE: err = sample_action_to_attr(a, skb); if (err) diff --git a/net/openvswitch/flow_netlink.h b/net/openvswitch/flow_netlink.h index 577f12be3459..5c3d75bff310 100644 --- a/net/openvswitch/flow_netlink.h +++ b/net/openvswitch/flow_netlink.h @@ -43,16 +43,25 @@ size_t ovs_key_attr_size(void); void ovs_match_init(struct sw_flow_match *match, struct sw_flow_key *key, struct sw_flow_mask *mask); -int ovs_nla_put_flow(const struct sw_flow_key *, - const struct sw_flow_key *, struct sk_buff *); +int ovs_nla_put_key(const struct sw_flow_key *, const struct sw_flow_key *, + int attr, bool is_mask, struct sk_buff *); int ovs_nla_get_flow_metadata(const struct nlattr *, struct sw_flow_key *, bool log); +int ovs_nla_put_identifier(const struct sw_flow *flow, struct sk_buff *skb); +int ovs_nla_put_masked_key(const struct sw_flow *flow, struct sk_buff *skb); +int ovs_nla_put_mask(const struct sw_flow *flow, struct sk_buff *skb); + int ovs_nla_get_match(struct sw_flow_match *, const struct nlattr *key, const struct nlattr *mask, bool log); int ovs_nla_put_egress_tunnel_key(struct sk_buff *, const struct ovs_tunnel_info *); +bool ovs_nla_get_ufid(struct sw_flow_id *, const struct nlattr *, bool log); +int ovs_nla_get_identifier(struct sw_flow_id *sfid, const struct nlattr *ufid, + const struct sw_flow_key *key, bool log); +u32 ovs_nla_get_ufid_flags(const struct nlattr *attr); + int ovs_nla_copy_actions(const struct nlattr *attr, const struct sw_flow_key *key, struct sw_flow_actions **sfa, bool log); diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c index 5899bf161c61..4613df8c8290 100644 --- a/net/openvswitch/flow_table.c +++ b/net/openvswitch/flow_table.c @@ -85,6 +85,8 @@ struct sw_flow *ovs_flow_alloc(void) flow->sf_acts = NULL; flow->mask = NULL; + flow->id.unmasked_key = NULL; + flow->id.ufid_len = 0; flow->stats_last_writer = NUMA_NO_NODE; /* Initialize the default stat node. */ @@ -139,6 +141,8 @@ static void flow_free(struct sw_flow *flow) { int node; + if (ovs_identifier_is_key(&flow->id)) + kfree(flow->id.unmasked_key); kfree((struct sw_flow_actions __force *)flow->sf_acts); for_each_node(node) if (flow->stats[node]) @@ -200,18 +204,28 @@ static struct table_instance *table_instance_alloc(int new_size) int ovs_flow_tbl_init(struct flow_table *table) { - struct table_instance *ti; + struct table_instance *ti, *ufid_ti; ti = table_instance_alloc(TBL_MIN_BUCKETS); if (!ti) return -ENOMEM; + ufid_ti = table_instance_alloc(TBL_MIN_BUCKETS); + if (!ufid_ti) + goto free_ti; + rcu_assign_pointer(table->ti, ti); + rcu_assign_pointer(table->ufid_ti, ufid_ti); INIT_LIST_HEAD(&table->mask_list); table->last_rehash = jiffies; table->count = 0; + table->ufid_count = 0; return 0; + +free_ti: + __table_instance_destroy(ti); + return -ENOMEM; } static void flow_tbl_destroy_rcu_cb(struct rcu_head *rcu) @@ -221,13 +235,16 @@ static void flow_tbl_destroy_rcu_cb(struct rcu_head *rcu) __table_instance_destroy(ti); } -static void table_instance_destroy(struct table_instance *ti, bool deferred) +static void table_instance_destroy(struct table_instance *ti, + struct table_instance *ufid_ti, + bool deferred) { int i; if (!ti) return; + BUG_ON(!ufid_ti); if (ti->keep_flows) goto skip_flows; @@ -236,18 +253,24 @@ static void table_instance_destroy(struct table_instance *ti, bool deferred) struct hlist_head *head = flex_array_get(ti->buckets, i); struct hlist_node *n; int ver = ti->node_ver; + int ufid_ver = ufid_ti->node_ver; - hlist_for_each_entry_safe(flow, n, head, hash_node[ver]) { - hlist_del_rcu(&flow->hash_node[ver]); + hlist_for_each_entry_safe(flow, n, head, flow_table.node[ver]) { + hlist_del_rcu(&flow->flow_table.node[ver]); + if (ovs_identifier_is_ufid(&flow->id)) + hlist_del_rcu(&flow->ufid_table.node[ufid_ver]); ovs_flow_free(flow, deferred); } } skip_flows: - if (deferred) + if (deferred) { call_rcu(&ti->rcu, flow_tbl_destroy_rcu_cb); - else + call_rcu(&ufid_ti->rcu, flow_tbl_destroy_rcu_cb); + } else { __table_instance_destroy(ti); + __table_instance_destroy(ufid_ti); + } } /* No need for locking this function is called from RCU callback or @@ -256,8 +279,9 @@ skip_flows: void ovs_flow_tbl_destroy(struct flow_table *table) { struct table_instance *ti = rcu_dereference_raw(table->ti); + struct table_instance *ufid_ti = rcu_dereference_raw(table->ufid_ti); - table_instance_destroy(ti, false); + table_instance_destroy(ti, ufid_ti, false); } struct sw_flow *ovs_flow_tbl_dump_next(struct table_instance *ti, @@ -272,7 +296,7 @@ struct sw_flow *ovs_flow_tbl_dump_next(struct table_instance *ti, while (*bucket < ti->n_buckets) { i = 0; head = flex_array_get(ti->buckets, *bucket); - hlist_for_each_entry_rcu(flow, head, hash_node[ver]) { + hlist_for_each_entry_rcu(flow, head, flow_table.node[ver]) { if (i < *last) { i++; continue; @@ -294,16 +318,26 @@ static struct hlist_head *find_bucket(struct table_instance *ti, u32 hash) (hash & (ti->n_buckets - 1))); } -static void table_instance_insert(struct table_instance *ti, struct sw_flow *flow) +static void table_instance_insert(struct table_instance *ti, + struct sw_flow *flow) +{ + struct hlist_head *head; + + head = find_bucket(ti, flow->flow_table.hash); + hlist_add_head_rcu(&flow->flow_table.node[ti->node_ver], head); +} + +static void ufid_table_instance_insert(struct table_instance *ti, + struct sw_flow *flow) { struct hlist_head *head; - head = find_bucket(ti, flow->hash); - hlist_add_head_rcu(&flow->hash_node[ti->node_ver], head); + head = find_bucket(ti, flow->ufid_table.hash); + hlist_add_head_rcu(&flow->ufid_table.node[ti->node_ver], head); } static void flow_table_copy_flows(struct table_instance *old, - struct table_instance *new) + struct table_instance *new, bool ufid) { int old_ver; int i; @@ -318,15 +352,21 @@ static void flow_table_copy_flows(struct table_instance *old, head = flex_array_get(old->buckets, i); - hlist_for_each_entry(flow, head, hash_node[old_ver]) - table_instance_insert(new, flow); + if (ufid) + hlist_for_each_entry(flow, head, + ufid_table.node[old_ver]) + ufid_table_instance_insert(new, flow); + else + hlist_for_each_entry(flow, head, + flow_table.node[old_ver]) + table_instance_insert(new, flow); } old->keep_flows = true; } static struct table_instance *table_instance_rehash(struct table_instance *ti, - int n_buckets) + int n_buckets, bool ufid) { struct table_instance *new_ti; @@ -334,32 +374,45 @@ static struct table_instance *table_instance_rehash(struct table_instance *ti, if (!new_ti) return NULL; - flow_table_copy_flows(ti, new_ti); + flow_table_copy_flows(ti, new_ti, ufid); return new_ti; } int ovs_flow_tbl_flush(struct flow_table *flow_table) { - struct table_instance *old_ti; - struct table_instance *new_ti; + struct table_instance *old_ti, *new_ti; + struct table_instance *old_ufid_ti, *new_ufid_ti; - old_ti = ovsl_dereference(flow_table->ti); new_ti = table_instance_alloc(TBL_MIN_BUCKETS); if (!new_ti) return -ENOMEM; + new_ufid_ti = table_instance_alloc(TBL_MIN_BUCKETS); + if (!new_ufid_ti) + goto err_free_ti; + + old_ti = ovsl_dereference(flow_table->ti); + old_ufid_ti = ovsl_dereference(flow_table->ufid_ti); rcu_assign_pointer(flow_table->ti, new_ti); + rcu_assign_pointer(flow_table->ufid_ti, new_ufid_ti); flow_table->last_rehash = jiffies; flow_table->count = 0; + flow_table->ufid_count = 0; - table_instance_destroy(old_ti, true); + table_instance_destroy(old_ti, old_ufid_ti, true); return 0; + +err_free_ti: + __table_instance_destroy(new_ti); + return -ENOMEM; } -static u32 flow_hash(const struct sw_flow_key *key, int key_start, - int key_end) +static u32 flow_hash(const struct sw_flow_key *key, + const struct sw_flow_key_range *range) { + int key_start = range->start; + int key_end = range->end; const u32 *hash_key = (const u32 *)((const u8 *)key + key_start); int hash_u32s = (key_end - key_start) >> 2; @@ -395,19 +448,20 @@ static bool cmp_key(const struct sw_flow_key *key1, static bool flow_cmp_masked_key(const struct sw_flow *flow, const struct sw_flow_key *key, - int key_start, int key_end) + const struct sw_flow_key_range *range) { - return cmp_key(&flow->key, key, key_start, key_end); + return cmp_key(&flow->key, key, range->start, range->end); } -bool ovs_flow_cmp_unmasked_key(const struct sw_flow *flow, - const struct sw_flow_match *match) +static bool ovs_flow_cmp_unmasked_key(const struct sw_flow *flow, + const struct sw_flow_match *match) { struct sw_flow_key *key = match->key; int key_start = flow_key_start(key); int key_end = match->range.end; - return cmp_key(&flow->unmasked_key, key, key_start, key_end); + BUG_ON(ovs_identifier_is_ufid(&flow->id)); + return cmp_key(flow->id.unmasked_key, key, key_start, key_end); } static struct sw_flow *masked_flow_lookup(struct table_instance *ti, @@ -416,18 +470,15 @@ static struct sw_flow *masked_flow_lookup(struct table_instance *ti, { struct sw_flow *flow; struct hlist_head *head; - int key_start = mask->range.start; - int key_end = mask->range.end; u32 hash; struct sw_flow_key masked_key; ovs_flow_mask_key(&masked_key, unmasked, mask); - hash = flow_hash(&masked_key, key_start, key_end); + hash = flow_hash(&masked_key, &mask->range); head = find_bucket(ti, hash); - hlist_for_each_entry_rcu(flow, head, hash_node[ti->node_ver]) { - if (flow->mask == mask && flow->hash == hash && - flow_cmp_masked_key(flow, &masked_key, - key_start, key_end)) + hlist_for_each_entry_rcu(flow, head, flow_table.node[ti->node_ver]) { + if (flow->mask == mask && flow->flow_table.hash == hash && + flow_cmp_masked_key(flow, &masked_key, &mask->range)) return flow; } return NULL; @@ -469,7 +520,48 @@ struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table *tbl, /* Always called under ovs-mutex. */ list_for_each_entry(mask, &tbl->mask_list, list) { flow = masked_flow_lookup(ti, match->key, mask); - if (flow && ovs_flow_cmp_unmasked_key(flow, match)) /* Found */ + if (flow && ovs_identifier_is_key(&flow->id) && + ovs_flow_cmp_unmasked_key(flow, match)) + return flow; + } + return NULL; +} + +static u32 ufid_hash(const struct sw_flow_id *sfid) +{ + return jhash(sfid->ufid, sfid->ufid_len, 0); +} + +static bool ovs_flow_cmp_ufid(const struct sw_flow *flow, + const struct sw_flow_id *sfid) +{ + if (flow->id.ufid_len != sfid->ufid_len) + return false; + + return !memcmp(flow->id.ufid, sfid->ufid, sfid->ufid_len); +} + +bool ovs_flow_cmp(const struct sw_flow *flow, const struct sw_flow_match *match) +{ + if (ovs_identifier_is_ufid(&flow->id)) + return flow_cmp_masked_key(flow, match->key, &match->range); + + return ovs_flow_cmp_unmasked_key(flow, match); +} + +struct sw_flow *ovs_flow_tbl_lookup_ufid(struct flow_table *tbl, + const struct sw_flow_id *ufid) +{ + struct table_instance *ti = rcu_dereference_ovsl(tbl->ufid_ti); + struct sw_flow *flow; + struct hlist_head *head; + u32 hash; + + hash = ufid_hash(ufid); + head = find_bucket(ti, hash); + hlist_for_each_entry_rcu(flow, head, ufid_table.node[ti->node_ver]) { + if (flow->ufid_table.hash == hash && + ovs_flow_cmp_ufid(flow, ufid)) return flow; } return NULL; @@ -486,9 +578,10 @@ int ovs_flow_tbl_num_masks(const struct flow_table *table) return num; } -static struct table_instance *table_instance_expand(struct table_instance *ti) +static struct table_instance *table_instance_expand(struct table_instance *ti, + bool ufid) { - return table_instance_rehash(ti, ti->n_buckets * 2); + return table_instance_rehash(ti, ti->n_buckets * 2, ufid); } /* Remove 'mask' from the mask list, if it is not needed any more. */ @@ -513,10 +606,15 @@ static void flow_mask_remove(struct flow_table *tbl, struct sw_flow_mask *mask) void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow) { struct table_instance *ti = ovsl_dereference(table->ti); + struct table_instance *ufid_ti = ovsl_dereference(table->ufid_ti); BUG_ON(table->count == 0); - hlist_del_rcu(&flow->hash_node[ti->node_ver]); + hlist_del_rcu(&flow->flow_table.node[ti->node_ver]); table->count--; + if (ovs_identifier_is_ufid(&flow->id)) { + hlist_del_rcu(&flow->ufid_table.node[ufid_ti->node_ver]); + table->ufid_count--; + } /* RCU delete the mask. 'flow->mask' is not NULLed, as it should be * accessible as long as the RCU read lock is held. @@ -585,34 +683,64 @@ static int flow_mask_insert(struct flow_table *tbl, struct sw_flow *flow, } /* Must be called with OVS mutex held. */ -int ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow, - const struct sw_flow_mask *mask) +static void flow_key_insert(struct flow_table *table, struct sw_flow *flow) { struct table_instance *new_ti = NULL; struct table_instance *ti; - int err; - - err = flow_mask_insert(table, flow, mask); - if (err) - return err; - flow->hash = flow_hash(&flow->key, flow->mask->range.start, - flow->mask->range.end); + flow->flow_table.hash = flow_hash(&flow->key, &flow->mask->range); ti = ovsl_dereference(table->ti); table_instance_insert(ti, flow); table->count++; /* Expand table, if necessary, to make room. */ if (table->count > ti->n_buckets) - new_ti = table_instance_expand(ti); + new_ti = table_instance_expand(ti, false); else if (time_after(jiffies, table->last_rehash + REHASH_INTERVAL)) - new_ti = table_instance_rehash(ti, ti->n_buckets); + new_ti = table_instance_rehash(ti, ti->n_buckets, false); if (new_ti) { rcu_assign_pointer(table->ti, new_ti); - table_instance_destroy(ti, true); + call_rcu(&ti->rcu, flow_tbl_destroy_rcu_cb); table->last_rehash = jiffies; } +} + +/* Must be called with OVS mutex held. */ +static void flow_ufid_insert(struct flow_table *table, struct sw_flow *flow) +{ + struct table_instance *ti; + + flow->ufid_table.hash = ufid_hash(&flow->id); + ti = ovsl_dereference(table->ufid_ti); + ufid_table_instance_insert(ti, flow); + table->ufid_count++; + + /* Expand table, if necessary, to make room. */ + if (table->ufid_count > ti->n_buckets) { + struct table_instance *new_ti; + + new_ti = table_instance_expand(ti, true); + if (new_ti) { + rcu_assign_pointer(table->ufid_ti, new_ti); + call_rcu(&ti->rcu, flow_tbl_destroy_rcu_cb); + } + } +} + +/* Must be called with OVS mutex held. */ +int ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow, + const struct sw_flow_mask *mask) +{ + int err; + + err = flow_mask_insert(table, flow, mask); + if (err) + return err; + flow_key_insert(table, flow); + if (ovs_identifier_is_ufid(&flow->id)) + flow_ufid_insert(table, flow); + return 0; } diff --git a/net/openvswitch/flow_table.h b/net/openvswitch/flow_table.h index 309fa6415689..616eda10d955 100644 --- a/net/openvswitch/flow_table.h +++ b/net/openvswitch/flow_table.h @@ -47,9 +47,11 @@ struct table_instance { struct flow_table { struct table_instance __rcu *ti; + struct table_instance __rcu *ufid_ti; struct list_head mask_list; unsigned long last_rehash; unsigned int count; + unsigned int ufid_count; }; extern struct kmem_cache *flow_stats_cache; @@ -78,8 +80,10 @@ struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *, const struct sw_flow_key *); struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table *tbl, const struct sw_flow_match *match); -bool ovs_flow_cmp_unmasked_key(const struct sw_flow *flow, - const struct sw_flow_match *match); +struct sw_flow *ovs_flow_tbl_lookup_ufid(struct flow_table *, + const struct sw_flow_id *); + +bool ovs_flow_cmp(const struct sw_flow *, const struct sw_flow_match *); void ovs_flow_mask_key(struct sw_flow_key *dst, const struct sw_flow_key *src, const struct sw_flow_mask *mask); diff --git a/net/openvswitch/vport-geneve.c b/net/openvswitch/vport-geneve.c index 484864dd0e68..bf02fd5808c9 100644 --- a/net/openvswitch/vport-geneve.c +++ b/net/openvswitch/vport-geneve.c @@ -9,8 +9,6 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include <linux/version.h> - #include <linux/in.h> #include <linux/ip.h> #include <linux/net.h> @@ -90,7 +88,7 @@ static void geneve_rcv(struct geneve_sock *gs, struct sk_buff *skb) opts_len = geneveh->opt_len * 4; - flags = TUNNEL_KEY | TUNNEL_OPTIONS_PRESENT | + flags = TUNNEL_KEY | TUNNEL_GENEVE_OPT | (udp_hdr(skb)->check != 0 ? TUNNEL_CSUM : 0) | (geneveh->oam ? TUNNEL_OAM : 0) | (geneveh->critical ? TUNNEL_CRIT_OPT : 0); @@ -172,7 +170,7 @@ error: static int geneve_tnl_send(struct vport *vport, struct sk_buff *skb) { - struct ovs_key_ipv4_tunnel *tun_key; + const struct ovs_key_ipv4_tunnel *tun_key; struct ovs_tunnel_info *tun_info; struct net *net = ovs_dp_get_net(vport->dp); struct geneve_port *geneve_port = geneve_vport(vport); @@ -180,7 +178,7 @@ static int geneve_tnl_send(struct vport *vport, struct sk_buff *skb) __be16 sport; struct rtable *rt; struct flowi4 fl; - u8 vni[3]; + u8 vni[3], opts_len, *opts; __be16 df; int err; @@ -191,16 +189,7 @@ static int geneve_tnl_send(struct vport *vport, struct sk_buff *skb) } tun_key = &tun_info->tunnel; - - /* Route lookup */ - memset(&fl, 0, sizeof(fl)); - fl.daddr = tun_key->ipv4_dst; - fl.saddr = tun_key->ipv4_src; - fl.flowi4_tos = RT_TOS(tun_key->ipv4_tos); - fl.flowi4_mark = skb->mark; - fl.flowi4_proto = IPPROTO_UDP; - - rt = ip_route_output_key(net, &fl); + rt = ovs_tunnel_route_lookup(net, tun_key, skb->mark, &fl, IPPROTO_UDP); if (IS_ERR(rt)) { err = PTR_ERR(rt); goto error; @@ -211,12 +200,19 @@ static int geneve_tnl_send(struct vport *vport, struct sk_buff *skb) tunnel_id_to_vni(tun_key->tun_id, vni); skb->ignore_df = 1; + if (tun_key->tun_flags & TUNNEL_GENEVE_OPT) { + opts = (u8 *)tun_info->options; + opts_len = tun_info->options_len; + } else { + opts = NULL; + opts_len = 0; + } + err = geneve_xmit_skb(geneve_port->gs, rt, skb, fl.saddr, tun_key->ipv4_dst, tun_key->ipv4_tos, tun_key->ipv4_ttl, df, sport, dport, - tun_key->tun_flags, vni, - tun_info->options_len, (u8 *)tun_info->options, - false); + tun_key->tun_flags, vni, opts_len, opts, + !!(tun_key->tun_flags & TUNNEL_CSUM), false); if (err < 0) ip_rt_put(rt); return err; diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c index d4168c442db5..f17ac9642f4e 100644 --- a/net/openvswitch/vport-gre.c +++ b/net/openvswitch/vport-gre.c @@ -134,7 +134,7 @@ static int gre_err(struct sk_buff *skb, u32 info, static int gre_tnl_send(struct vport *vport, struct sk_buff *skb) { struct net *net = ovs_dp_get_net(vport->dp); - struct ovs_key_ipv4_tunnel *tun_key; + const struct ovs_key_ipv4_tunnel *tun_key; struct flowi4 fl; struct rtable *rt; int min_headroom; @@ -148,15 +148,7 @@ static int gre_tnl_send(struct vport *vport, struct sk_buff *skb) } tun_key = &OVS_CB(skb)->egress_tun_info->tunnel; - /* Route lookup */ - memset(&fl, 0, sizeof(fl)); - fl.daddr = tun_key->ipv4_dst; - fl.saddr = tun_key->ipv4_src; - fl.flowi4_tos = RT_TOS(tun_key->ipv4_tos); - fl.flowi4_mark = skb->mark; - fl.flowi4_proto = IPPROTO_GRE; - - rt = ip_route_output_key(net, &fl); + rt = ovs_tunnel_route_lookup(net, tun_key, skb->mark, &fl, IPPROTO_GRE); if (IS_ERR(rt)) { err = PTR_ERR(rt); goto err_free_skb; @@ -166,7 +158,7 @@ static int gre_tnl_send(struct vport *vport, struct sk_buff *skb) min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len + tunnel_hlen + sizeof(struct iphdr) - + (vlan_tx_tag_present(skb) ? VLAN_HLEN : 0); + + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0); if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) { int head_delta = SKB_DATA_ALIGN(min_headroom - skb_headroom(skb) + diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c index d7c46b301024..3277a7520e31 100644 --- a/net/openvswitch/vport-vxlan.c +++ b/net/openvswitch/vport-vxlan.c @@ -40,6 +40,7 @@ #include "datapath.h" #include "vport.h" +#include "vport-vxlan.h" /** * struct vxlan_port - Keeps track of open UDP ports @@ -49,6 +50,7 @@ struct vxlan_port { struct vxlan_sock *vs; char name[IFNAMSIZ]; + u32 exts; /* VXLAN_F_* in <net/vxlan.h> */ }; static struct vport_ops ovs_vxlan_vport_ops; @@ -59,19 +61,30 @@ static inline struct vxlan_port *vxlan_vport(const struct vport *vport) } /* Called with rcu_read_lock and BH disabled. */ -static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, __be32 vx_vni) +static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, + struct vxlan_metadata *md) { struct ovs_tunnel_info tun_info; + struct vxlan_port *vxlan_port; struct vport *vport = vs->data; struct iphdr *iph; + struct ovs_vxlan_opts opts = { + .gbp = md->gbp, + }; __be64 key; + __be16 flags; + + flags = TUNNEL_KEY | (udp_hdr(skb)->check != 0 ? TUNNEL_CSUM : 0); + vxlan_port = vxlan_vport(vport); + if (vxlan_port->exts & VXLAN_F_GBP && md->gbp) + flags |= TUNNEL_VXLAN_OPT; /* Save outer tunnel values */ iph = ip_hdr(skb); - key = cpu_to_be64(ntohl(vx_vni) >> 8); + key = cpu_to_be64(ntohl(md->vni) >> 8); ovs_flow_tun_info_init(&tun_info, iph, udp_hdr(skb)->source, udp_hdr(skb)->dest, - key, TUNNEL_KEY, NULL, 0); + key, flags, &opts, sizeof(opts)); ovs_vport_receive(vport, skb, &tun_info); } @@ -83,6 +96,21 @@ static int vxlan_get_options(const struct vport *vport, struct sk_buff *skb) if (nla_put_u16(skb, OVS_TUNNEL_ATTR_DST_PORT, ntohs(dst_port))) return -EMSGSIZE; + + if (vxlan_port->exts) { + struct nlattr *exts; + + exts = nla_nest_start(skb, OVS_TUNNEL_ATTR_EXTENSION); + if (!exts) + return -EMSGSIZE; + + if (vxlan_port->exts & VXLAN_F_GBP && + nla_put_flag(skb, OVS_VXLAN_EXT_GBP)) + return -EMSGSIZE; + + nla_nest_end(skb, exts); + } + return 0; } @@ -95,6 +123,31 @@ static void vxlan_tnl_destroy(struct vport *vport) ovs_vport_deferred_free(vport); } +static const struct nla_policy exts_policy[OVS_VXLAN_EXT_MAX+1] = { + [OVS_VXLAN_EXT_GBP] = { .type = NLA_FLAG, }, +}; + +static int vxlan_configure_exts(struct vport *vport, struct nlattr *attr) +{ + struct nlattr *exts[OVS_VXLAN_EXT_MAX+1]; + struct vxlan_port *vxlan_port; + int err; + + if (nla_len(attr) < sizeof(struct nlattr)) + return -EINVAL; + + err = nla_parse_nested(exts, OVS_VXLAN_EXT_MAX, attr, exts_policy); + if (err < 0) + return err; + + vxlan_port = vxlan_vport(vport); + + if (exts[OVS_VXLAN_EXT_GBP]) + vxlan_port->exts |= VXLAN_F_GBP; + + return 0; +} + static struct vport *vxlan_tnl_create(const struct vport_parms *parms) { struct net *net = ovs_dp_get_net(parms->dp); @@ -127,7 +180,17 @@ static struct vport *vxlan_tnl_create(const struct vport_parms *parms) vxlan_port = vxlan_vport(vport); strncpy(vxlan_port->name, parms->name, IFNAMSIZ); - vs = vxlan_sock_add(net, htons(dst_port), vxlan_rcv, vport, true, 0); + a = nla_find_nested(options, OVS_TUNNEL_ATTR_EXTENSION); + if (a) { + err = vxlan_configure_exts(vport, a); + if (err) { + ovs_vport_free(vport); + goto error; + } + } + + vs = vxlan_sock_add(net, htons(dst_port), vxlan_rcv, vport, true, + vxlan_port->exts); if (IS_ERR(vs)) { ovs_vport_free(vport); return (void *)vs; @@ -140,17 +203,34 @@ error: return ERR_PTR(err); } +static int vxlan_ext_gbp(struct sk_buff *skb) +{ + const struct ovs_tunnel_info *tun_info; + const struct ovs_vxlan_opts *opts; + + tun_info = OVS_CB(skb)->egress_tun_info; + opts = tun_info->options; + + if (tun_info->tunnel.tun_flags & TUNNEL_VXLAN_OPT && + tun_info->options_len >= sizeof(*opts)) + return opts->gbp; + else + return 0; +} + static int vxlan_tnl_send(struct vport *vport, struct sk_buff *skb) { struct net *net = ovs_dp_get_net(vport->dp); struct vxlan_port *vxlan_port = vxlan_vport(vport); __be16 dst_port = inet_sk(vxlan_port->vs->sock->sk)->inet_sport; - struct ovs_key_ipv4_tunnel *tun_key; + const struct ovs_key_ipv4_tunnel *tun_key; + struct vxlan_metadata md = {0}; struct rtable *rt; struct flowi4 fl; __be16 src_port; __be16 df; int err; + u32 vxflags; if (unlikely(!OVS_CB(skb)->egress_tun_info)) { err = -EINVAL; @@ -158,15 +238,7 @@ static int vxlan_tnl_send(struct vport *vport, struct sk_buff *skb) } tun_key = &OVS_CB(skb)->egress_tun_info->tunnel; - /* Route lookup */ - memset(&fl, 0, sizeof(fl)); - fl.daddr = tun_key->ipv4_dst; - fl.saddr = tun_key->ipv4_src; - fl.flowi4_tos = RT_TOS(tun_key->ipv4_tos); - fl.flowi4_mark = skb->mark; - fl.flowi4_proto = IPPROTO_UDP; - - rt = ip_route_output_key(net, &fl); + rt = ovs_tunnel_route_lookup(net, tun_key, skb->mark, &fl, IPPROTO_UDP); if (IS_ERR(rt)) { err = PTR_ERR(rt); goto error; @@ -178,13 +250,15 @@ static int vxlan_tnl_send(struct vport *vport, struct sk_buff *skb) skb->ignore_df = 1; src_port = udp_flow_src_port(net, skb, 0, 0, true); + md.vni = htonl(be64_to_cpu(tun_key->tun_id) << 8); + md.gbp = vxlan_ext_gbp(skb); + vxflags = vxlan_port->exts | + (tun_key->tun_flags & TUNNEL_CSUM ? VXLAN_F_UDP_CSUM : 0); - err = vxlan_xmit_skb(vxlan_port->vs, rt, skb, - fl.saddr, tun_key->ipv4_dst, + err = vxlan_xmit_skb(rt, skb, fl.saddr, tun_key->ipv4_dst, tun_key->ipv4_tos, tun_key->ipv4_ttl, df, src_port, dst_port, - htonl(be64_to_cpu(tun_key->tun_id) << 8), - false); + &md, false, vxflags); if (err < 0) ip_rt_put(rt); return err; diff --git a/net/openvswitch/vport-vxlan.h b/net/openvswitch/vport-vxlan.h new file mode 100644 index 000000000000..4b08233e73d5 --- /dev/null +++ b/net/openvswitch/vport-vxlan.h @@ -0,0 +1,11 @@ +#ifndef VPORT_VXLAN_H +#define VPORT_VXLAN_H 1 + +#include <linux/kernel.h> +#include <linux/types.h> + +struct ovs_vxlan_opts { + __u32 gbp; +}; + +#endif diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index 2034c6d9cb5a..ec2954ffc690 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -480,7 +480,8 @@ void ovs_vport_receive(struct vport *vport, struct sk_buff *skb, stats = this_cpu_ptr(vport->percpu_stats); u64_stats_update_begin(&stats->syncp); stats->rx_packets++; - stats->rx_bytes += skb->len + (vlan_tx_tag_present(skb) ? VLAN_HLEN : 0); + stats->rx_bytes += skb->len + + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0); u64_stats_update_end(&stats->syncp); OVS_CB(skb)->input_vport = vport; @@ -594,14 +595,7 @@ int ovs_tunnel_get_egress_info(struct ovs_tunnel_info *egress_tun_info, * The process may need to be changed if the corresponding process * in vports ops changed. */ - memset(&fl, 0, sizeof(fl)); - fl.daddr = tun_key->ipv4_dst; - fl.saddr = tun_key->ipv4_src; - fl.flowi4_tos = RT_TOS(tun_key->ipv4_tos); - fl.flowi4_mark = skb_mark; - fl.flowi4_proto = ipproto; - - rt = ip_route_output_key(net, &fl); + rt = ovs_tunnel_route_lookup(net, tun_key, skb_mark, &fl, ipproto); if (IS_ERR(rt)) return PTR_ERR(rt); diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h index 99c8e71d9e6c..f8ae295fb001 100644 --- a/net/openvswitch/vport.h +++ b/net/openvswitch/vport.h @@ -236,4 +236,22 @@ static inline void ovs_skb_postpush_rcsum(struct sk_buff *skb, int ovs_vport_ops_register(struct vport_ops *ops); void ovs_vport_ops_unregister(struct vport_ops *ops); +static inline struct rtable *ovs_tunnel_route_lookup(struct net *net, + const struct ovs_key_ipv4_tunnel *key, + u32 mark, + struct flowi4 *fl, + u8 protocol) +{ + struct rtable *rt; + + memset(fl, 0, sizeof(*fl)); + fl->daddr = key->ipv4_dst; + fl->saddr = key->ipv4_src; + fl->flowi4_tos = RT_TOS(key->ipv4_tos); + fl->flowi4_mark = mark; + fl->flowi4_proto = protocol; + + rt = ip_route_output_key(net, fl); + return rt; +} #endif /* vport.h */ |