summaryrefslogtreecommitdiffstats
path: root/net
diff options
context:
space:
mode:
Diffstat (limited to 'net')
-rw-r--r--net/8021q/vlan_dev.c2
-rw-r--r--net/bridge/br_if.c3
-rw-r--r--net/core/dev.c345
-rw-r--r--net/core/neighbour.c2
-rw-r--r--net/core/sysctl_net_core.c30
-rw-r--r--net/dsa/slave.c2
-rw-r--r--net/ipv4/Kconfig16
-rw-r--r--net/ipv4/af_inet.c12
-rw-r--r--net/ipv4/arp.c2
-rw-r--r--net/ipv4/sysctl_net_ipv4.c10
-rw-r--r--net/ipv4/tcp.c28
-rw-r--r--net/ipv4/tcp_input.c32
-rw-r--r--net/ipv4/tcp_metrics.c44
-rw-r--r--net/ipv4/tcp_output.c2
-rw-r--r--net/ipv4/udp.c13
-rw-r--r--net/ipv6/addrconf.c59
-rw-r--r--net/ipv6/addrconf_core.c50
-rw-r--r--net/ipv6/af_inet6.c14
-rw-r--r--net/ipv6/ip6_offload.c4
-rw-r--r--net/ipv6/ip6_output.c25
-rw-r--r--net/ipv6/ndisc.c28
-rw-r--r--net/ipv6/output_core.c48
-rw-r--r--net/ipv6/route.c19
-rw-r--r--net/ipv6/udp_offload.c105
-rw-r--r--net/openvswitch/vport-vxlan.c2
-rw-r--r--net/packet/af_packet.c15
-rw-r--r--net/sched/Kconfig14
-rw-r--r--net/sched/Makefile1
-rw-r--r--net/sched/sch_api.c53
-rw-r--r--net/sched/sch_fq.c793
-rw-r--r--net/sched/sch_generic.c20
-rw-r--r--net/sched/sch_mq.c2
-rw-r--r--net/sched/sch_mqprio.c2
-rw-r--r--net/sctp/probe.c18
-rw-r--r--net/sctp/sm_make_chunk.c26
35 files changed, 1527 insertions, 314 deletions
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index 9ab8a7ed99c0..09bf1c38805b 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -582,7 +582,7 @@ static int vlan_dev_init(struct net_device *dev)
dev->dev_id = real_dev->dev_id;
if (is_zero_ether_addr(dev->dev_addr))
- memcpy(dev->dev_addr, real_dev->dev_addr, dev->addr_len);
+ eth_hw_addr_inherit(dev, real_dev);
if (is_zero_ether_addr(dev->broadcast))
memcpy(dev->broadcast, real_dev->broadcast, dev->addr_len);
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index aa6c9a8ba32a..c41d5fbb91d0 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -383,6 +383,9 @@ int br_add_if(struct net_bridge *br, struct net_device *dev)
netdev_update_features(br->dev);
+ if (br->dev->needed_headroom < dev->needed_headroom)
+ br->dev->needed_headroom = dev->needed_headroom;
+
spin_lock_bh(&br->lock);
changed_addr = br_stp_recalculate_bridge_id(br);
diff --git a/net/core/dev.c b/net/core/dev.c
index 1ed2b66a10a6..6fbb0c90849b 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4367,57 +4367,48 @@ softnet_break:
goto out;
}
-struct netdev_upper {
+struct netdev_adjacent {
struct net_device *dev;
+
+ /* upper master flag, there can only be one master device per list */
bool master;
+
+ /* indicates that this dev is our first-level lower/upper device */
+ bool neighbour;
+
+ /* counter for the number of times this device was added to us */
+ u16 ref_nr;
+
struct list_head list;
struct rcu_head rcu;
- struct list_head search_list;
};
-static void __append_search_uppers(struct list_head *search_list,
- struct net_device *dev)
+static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev,
+ struct net_device *adj_dev,
+ bool upper)
{
- struct netdev_upper *upper;
+ struct netdev_adjacent *adj;
+ struct list_head *dev_list;
- list_for_each_entry(upper, &dev->upper_dev_list, list) {
- /* check if this upper is not already in search list */
- if (list_empty(&upper->search_list))
- list_add_tail(&upper->search_list, search_list);
+ dev_list = upper ? &dev->upper_dev_list : &dev->lower_dev_list;
+
+ list_for_each_entry(adj, dev_list, list) {
+ if (adj->dev == adj_dev)
+ return adj;
}
+ return NULL;
}
-static bool __netdev_search_upper_dev(struct net_device *dev,
- struct net_device *upper_dev)
+static inline struct netdev_adjacent *__netdev_find_upper(struct net_device *dev,
+ struct net_device *udev)
{
- LIST_HEAD(search_list);
- struct netdev_upper *upper;
- struct netdev_upper *tmp;
- bool ret = false;
-
- __append_search_uppers(&search_list, dev);
- list_for_each_entry(upper, &search_list, search_list) {
- if (upper->dev == upper_dev) {
- ret = true;
- break;
- }
- __append_search_uppers(&search_list, upper->dev);
- }
- list_for_each_entry_safe(upper, tmp, &search_list, search_list)
- INIT_LIST_HEAD(&upper->search_list);
- return ret;
+ return __netdev_find_adj(dev, udev, true);
}
-static struct netdev_upper *__netdev_find_upper(struct net_device *dev,
- struct net_device *upper_dev)
+static inline struct netdev_adjacent *__netdev_find_lower(struct net_device *dev,
+ struct net_device *ldev)
{
- struct netdev_upper *upper;
-
- list_for_each_entry(upper, &dev->upper_dev_list, list) {
- if (upper->dev == upper_dev)
- return upper;
- }
- return NULL;
+ return __netdev_find_adj(dev, ldev, false);
}
/**
@@ -4462,7 +4453,7 @@ EXPORT_SYMBOL(netdev_has_any_upper_dev);
*/
struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
{
- struct netdev_upper *upper;
+ struct netdev_adjacent *upper;
ASSERT_RTNL();
@@ -4470,13 +4461,38 @@ struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
return NULL;
upper = list_first_entry(&dev->upper_dev_list,
- struct netdev_upper, list);
+ struct netdev_adjacent, list);
if (likely(upper->master))
return upper->dev;
return NULL;
}
EXPORT_SYMBOL(netdev_master_upper_dev_get);
+/* netdev_upper_get_next_dev_rcu - Get the next dev from upper list
+ * @dev: device
+ * @iter: list_head ** of the current position
+ *
+ * Gets the next device from the dev's upper list, starting from iter
+ * position. The caller must hold RCU read lock.
+ */
+struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
+ struct list_head **iter)
+{
+ struct netdev_adjacent *upper;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
+
+ if (&upper->list == &dev->upper_dev_list)
+ return NULL;
+
+ *iter = &upper->list;
+
+ return upper->dev;
+}
+EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
+
/**
* netdev_master_upper_dev_get_rcu - Get master upper device
* @dev: device
@@ -4486,20 +4502,158 @@ EXPORT_SYMBOL(netdev_master_upper_dev_get);
*/
struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
{
- struct netdev_upper *upper;
+ struct netdev_adjacent *upper;
upper = list_first_or_null_rcu(&dev->upper_dev_list,
- struct netdev_upper, list);
+ struct netdev_adjacent, list);
if (upper && likely(upper->master))
return upper->dev;
return NULL;
}
EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
+static int __netdev_adjacent_dev_insert(struct net_device *dev,
+ struct net_device *adj_dev,
+ bool neighbour, bool master,
+ bool upper)
+{
+ struct netdev_adjacent *adj;
+
+ adj = __netdev_find_adj(dev, adj_dev, upper);
+
+ if (adj) {
+ BUG_ON(neighbour);
+ adj->ref_nr++;
+ return 0;
+ }
+
+ adj = kmalloc(sizeof(*adj), GFP_KERNEL);
+ if (!adj)
+ return -ENOMEM;
+
+ adj->dev = adj_dev;
+ adj->master = master;
+ adj->neighbour = neighbour;
+ adj->ref_nr = 1;
+
+ dev_hold(adj_dev);
+ pr_debug("dev_hold for %s, because of %s link added from %s to %s\n",
+ adj_dev->name, upper ? "upper" : "lower", dev->name,
+ adj_dev->name);
+
+ if (!upper) {
+ list_add_tail_rcu(&adj->list, &dev->lower_dev_list);
+ return 0;
+ }
+
+ /* Ensure that master upper link is always the first item in list. */
+ if (master)
+ list_add_rcu(&adj->list, &dev->upper_dev_list);
+ else
+ list_add_tail_rcu(&adj->list, &dev->upper_dev_list);
+
+ return 0;
+}
+
+static inline int __netdev_upper_dev_insert(struct net_device *dev,
+ struct net_device *udev,
+ bool master, bool neighbour)
+{
+ return __netdev_adjacent_dev_insert(dev, udev, neighbour, master,
+ true);
+}
+
+static inline int __netdev_lower_dev_insert(struct net_device *dev,
+ struct net_device *ldev,
+ bool neighbour)
+{
+ return __netdev_adjacent_dev_insert(dev, ldev, neighbour, false,
+ false);
+}
+
+void __netdev_adjacent_dev_remove(struct net_device *dev,
+ struct net_device *adj_dev, bool upper)
+{
+ struct netdev_adjacent *adj;
+
+ if (upper)
+ adj = __netdev_find_upper(dev, adj_dev);
+ else
+ adj = __netdev_find_lower(dev, adj_dev);
+
+ if (!adj)
+ BUG();
+
+ if (adj->ref_nr > 1) {
+ adj->ref_nr--;
+ return;
+ }
+
+ list_del_rcu(&adj->list);
+ pr_debug("dev_put for %s, because of %s link removed from %s to %s\n",
+ adj_dev->name, upper ? "upper" : "lower", dev->name,
+ adj_dev->name);
+ dev_put(adj_dev);
+ kfree_rcu(adj, rcu);
+}
+
+static inline void __netdev_upper_dev_remove(struct net_device *dev,
+ struct net_device *udev)
+{
+ return __netdev_adjacent_dev_remove(dev, udev, true);
+}
+
+static inline void __netdev_lower_dev_remove(struct net_device *dev,
+ struct net_device *ldev)
+{
+ return __netdev_adjacent_dev_remove(dev, ldev, false);
+}
+
+int __netdev_adjacent_dev_insert_link(struct net_device *dev,
+ struct net_device *upper_dev,
+ bool master, bool neighbour)
+{
+ int ret;
+
+ ret = __netdev_upper_dev_insert(dev, upper_dev, master, neighbour);
+ if (ret)
+ return ret;
+
+ ret = __netdev_lower_dev_insert(upper_dev, dev, neighbour);
+ if (ret) {
+ __netdev_upper_dev_remove(dev, upper_dev);
+ return ret;
+ }
+
+ return 0;
+}
+
+static inline int __netdev_adjacent_dev_link(struct net_device *dev,
+ struct net_device *udev)
+{
+ return __netdev_adjacent_dev_insert_link(dev, udev, false, false);
+}
+
+static inline int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
+ struct net_device *udev,
+ bool master)
+{
+ return __netdev_adjacent_dev_insert_link(dev, udev, master, true);
+}
+
+void __netdev_adjacent_dev_unlink(struct net_device *dev,
+ struct net_device *upper_dev)
+{
+ __netdev_upper_dev_remove(dev, upper_dev);
+ __netdev_lower_dev_remove(upper_dev, dev);
+}
+
+
static int __netdev_upper_dev_link(struct net_device *dev,
struct net_device *upper_dev, bool master)
{
- struct netdev_upper *upper;
+ struct netdev_adjacent *i, *j, *to_i, *to_j;
+ int ret = 0;
ASSERT_RTNL();
@@ -4507,7 +4661,7 @@ static int __netdev_upper_dev_link(struct net_device *dev,
return -EBUSY;
/* To prevent loops, check if dev is not upper device to upper_dev. */
- if (__netdev_search_upper_dev(upper_dev, dev))
+ if (__netdev_find_upper(upper_dev, dev))
return -EBUSY;
if (__netdev_find_upper(dev, upper_dev))
@@ -4516,22 +4670,76 @@ static int __netdev_upper_dev_link(struct net_device *dev,
if (master && netdev_master_upper_dev_get(dev))
return -EBUSY;
- upper = kmalloc(sizeof(*upper), GFP_KERNEL);
- if (!upper)
- return -ENOMEM;
+ ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, master);
+ if (ret)
+ return ret;
- upper->dev = upper_dev;
- upper->master = master;
- INIT_LIST_HEAD(&upper->search_list);
+ /* Now that we linked these devs, make all the upper_dev's
+ * upper_dev_list visible to every dev's lower_dev_list and vice
+ * versa, and don't forget the devices themselves. All of these
+ * links are non-neighbours.
+ */
+ list_for_each_entry(i, &upper_dev->upper_dev_list, list) {
+ list_for_each_entry(j, &dev->lower_dev_list, list) {
+ ret = __netdev_adjacent_dev_link(i->dev, j->dev);
+ if (ret)
+ goto rollback_mesh;
+ }
+ }
+
+ /* add dev to every upper_dev's upper device */
+ list_for_each_entry(i, &upper_dev->upper_dev_list, list) {
+ ret = __netdev_adjacent_dev_link(dev, i->dev);
+ if (ret)
+ goto rollback_upper_mesh;
+ }
+
+ /* add upper_dev to every dev's lower device */
+ list_for_each_entry(i, &dev->lower_dev_list, list) {
+ ret = __netdev_adjacent_dev_link(i->dev, upper_dev);
+ if (ret)
+ goto rollback_lower_mesh;
+ }
- /* Ensure that master upper link is always the first item in list. */
- if (master)
- list_add_rcu(&upper->list, &dev->upper_dev_list);
- else
- list_add_tail_rcu(&upper->list, &dev->upper_dev_list);
- dev_hold(upper_dev);
call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
return 0;
+
+rollback_lower_mesh:
+ to_i = i;
+ list_for_each_entry(i, &dev->lower_dev_list, list) {
+ if (i == to_i)
+ break;
+ __netdev_adjacent_dev_unlink(i->dev, upper_dev);
+ }
+
+ i = NULL;
+
+rollback_upper_mesh:
+ to_i = i;
+ list_for_each_entry(i, &upper_dev->upper_dev_list, list) {
+ if (i == to_i)
+ break;
+ __netdev_adjacent_dev_unlink(dev, i->dev);
+ }
+
+ i = j = NULL;
+
+rollback_mesh:
+ to_i = i;
+ to_j = j;
+ list_for_each_entry(i, &dev->lower_dev_list, list) {
+ list_for_each_entry(j, &upper_dev->upper_dev_list, list) {
+ if (i == to_i && j == to_j)
+ break;
+ __netdev_adjacent_dev_unlink(i->dev, j->dev);
+ }
+ if (i == to_i)
+ break;
+ }
+
+ __netdev_adjacent_dev_unlink(dev, upper_dev);
+
+ return ret;
}
/**
@@ -4580,16 +4788,28 @@ EXPORT_SYMBOL(netdev_master_upper_dev_link);
void netdev_upper_dev_unlink(struct net_device *dev,
struct net_device *upper_dev)
{
- struct netdev_upper *upper;
-
+ struct netdev_adjacent *i, *j;
ASSERT_RTNL();
- upper = __netdev_find_upper(dev, upper_dev);
- if (!upper)
- return;
- list_del_rcu(&upper->list);
- dev_put(upper_dev);
- kfree_rcu(upper, rcu);
+ __netdev_adjacent_dev_unlink(dev, upper_dev);
+
+ /* Here is the tricky part. We must remove all dev's lower
+ * devices from all upper_dev's upper devices and vice
+ * versa, to maintain the graph relationship.
+ */
+ list_for_each_entry(i, &dev->lower_dev_list, list)
+ list_for_each_entry(j, &upper_dev->upper_dev_list, list)
+ __netdev_adjacent_dev_unlink(i->dev, j->dev);
+
+ /* also remove the devices themselves from lower/upper device
+ * list
+ */
+ list_for_each_entry(i, &dev->lower_dev_list, list)
+ __netdev_adjacent_dev_unlink(i->dev, upper_dev);
+
+ list_for_each_entry(i, &upper_dev->upper_dev_list, list)
+ __netdev_adjacent_dev_unlink(dev, i->dev);
+
call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
}
EXPORT_SYMBOL(netdev_upper_dev_unlink);
@@ -5850,6 +6070,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
INIT_LIST_HEAD(&dev->unreg_list);
INIT_LIST_HEAD(&dev->link_watch_list);
INIT_LIST_HEAD(&dev->upper_dev_list);
+ INIT_LIST_HEAD(&dev->lower_dev_list);
dev->priv_flags = IFF_XMIT_DST_RELEASE;
setup(dev);
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 60533db8b72d..6072610a8672 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -2759,13 +2759,11 @@ errout:
rtnl_set_sk_err(net, RTNLGRP_NEIGH, err);
}
-#ifdef CONFIG_ARPD
void neigh_app_ns(struct neighbour *n)
{
__neigh_notify(n, RTM_GETNEIGH, NLM_F_REQUEST);
}
EXPORT_SYMBOL(neigh_app_ns);
-#endif /* CONFIG_ARPD */
#ifdef CONFIG_SYSCTL
static int zero;
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 31107abd2783..cca444190907 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -20,6 +20,7 @@
#include <net/sock.h>
#include <net/net_ratelimit.h>
#include <net/busy_poll.h>
+#include <net/pkt_sched.h>
static int zero = 0;
static int one = 1;
@@ -193,6 +194,26 @@ static int flow_limit_table_len_sysctl(struct ctl_table *table, int write,
}
#endif /* CONFIG_NET_FLOW_LIMIT */
+#ifdef CONFIG_NET_SCHED
+static int set_default_qdisc(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ char id[IFNAMSIZ];
+ struct ctl_table tbl = {
+ .data = id,
+ .maxlen = IFNAMSIZ,
+ };
+ int ret;
+
+ qdisc_get_default(id, IFNAMSIZ);
+
+ ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
+ if (write && ret == 0)
+ ret = qdisc_set_default(id);
+ return ret;
+}
+#endif
+
static struct ctl_table net_core_table[] = {
#ifdef CONFIG_NET
{
@@ -315,7 +336,14 @@ static struct ctl_table net_core_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec
},
-#
+#endif
+#ifdef CONFIG_NET_SCHED
+ {
+ .procname = "default_qdisc",
+ .mode = 0644,
+ .maxlen = IFNAMSIZ,
+ .proc_handler = set_default_qdisc
+ },
#endif
#endif /* CONFIG_NET */
{
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 6ebd8fbd9285..29d684ebca6a 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -347,7 +347,7 @@ dsa_slave_create(struct dsa_switch *ds, struct device *parent,
slave_dev->features = master->vlan_features;
SET_ETHTOOL_OPS(slave_dev, &dsa_slave_ethtool_ops);
- memcpy(slave_dev->dev_addr, master->dev_addr, ETH_ALEN);
+ eth_hw_addr_inherit(slave_dev, master);
slave_dev->tx_queue_len = 0;
switch (ds->dst->tag_protocol) {
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 37cf1a6ea3ad..05c57f0fcabe 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -259,22 +259,6 @@ config IP_PIMSM_V2
gated-5). This routing protocol is not used widely, so say N unless
you want to play with it.
-config ARPD
- bool "IP: ARP daemon support"
- ---help---
- The kernel maintains an internal cache which maps IP addresses to
- hardware addresses on the local network, so that Ethernet
- frames are sent to the proper address on the physical networking
- layer. Normally, kernel uses the ARP protocol to resolve these
- mappings.
-
- Saying Y here adds support to have an user space daemon to do this
- resolution instead. This is useful for implementing an alternate
- address resolution protocol (e.g. NHRP on mGRE tunnels) and also for
- testing purposes.
-
- If unsure, say N.
-
config SYN_COOKIES
bool "IP: TCP syncookie support"
---help---
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index b4d0be2b7ce9..7a1874b7b8fd 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1532,18 +1532,6 @@ int snmp_mib_init(void __percpu *ptr[2], size_t mibsize, size_t align)
}
EXPORT_SYMBOL_GPL(snmp_mib_init);
-void snmp_mib_free(void __percpu *ptr[SNMP_ARRAY_SZ])
-{
- int i;
-
- BUG_ON(ptr == NULL);
- for (i = 0; i < SNMP_ARRAY_SZ; i++) {
- free_percpu(ptr[i]);
- ptr[i] = NULL;
- }
-}
-EXPORT_SYMBOL_GPL(snmp_mib_free);
-
#ifdef CONFIG_IP_MULTICAST
static const struct net_protocol igmp_protocol = {
.handler = igmp_rcv,
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 4429b013f269..7808093cede6 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -368,9 +368,7 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
} else {
probes -= neigh->parms->app_probes;
if (probes < 0) {
-#ifdef CONFIG_ARPD
neigh_app_ns(neigh);
-#endif
return;
}
}
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 8ed7c32ae28e..540279f4c531 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -29,6 +29,7 @@
static int zero;
static int one = 1;
static int four = 4;
+static int gso_max_segs = GSO_MAX_SEGS;
static int tcp_retr1_max = 255;
static int ip_local_port_range_min[] = { 1, 1 };
static int ip_local_port_range_max[] = { 65535, 65535 };
@@ -761,6 +762,15 @@ static struct ctl_table ipv4_table[] = {
.extra2 = &four,
},
{
+ .procname = "tcp_min_tso_segs",
+ .data = &sysctl_tcp_min_tso_segs,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &gso_max_segs,
+ },
+ {
.procname = "udp_mem",
.data = &sysctl_udp_mem,
.maxlen = sizeof(sysctl_udp_mem),
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 4e42c03859f4..fdf74090a001 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -283,6 +283,8 @@
int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT;
+int sysctl_tcp_min_tso_segs __read_mostly = 2;
+
struct percpu_counter tcp_orphan_count;
EXPORT_SYMBOL_GPL(tcp_orphan_count);
@@ -785,12 +787,28 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
xmit_size_goal = mss_now;
if (large_allowed && sk_can_gso(sk)) {
- xmit_size_goal = ((sk->sk_gso_max_size - 1) -
- inet_csk(sk)->icsk_af_ops->net_header_len -
- inet_csk(sk)->icsk_ext_hdr_len -
- tp->tcp_header_len);
+ u32 gso_size, hlen;
+
+ /* Maybe we should/could use sk->sk_prot->max_header here ? */
+ hlen = inet_csk(sk)->icsk_af_ops->net_header_len +
+ inet_csk(sk)->icsk_ext_hdr_len +
+ tp->tcp_header_len;
+
+ /* Goal is to send at least one packet per ms,
+ * not one big TSO packet every 100 ms.
+ * This preserves ACK clocking and is consistent
+ * with tcp_tso_should_defer() heuristic.
+ */
+ gso_size = sk->sk_pacing_rate / (2 * MSEC_PER_SEC);
+ gso_size = max_t(u32, gso_size,
+ sysctl_tcp_min_tso_segs * mss_now);
+
+ xmit_size_goal = min_t(u32, gso_size,
+ sk->sk_gso_max_size - 1 - hlen);
- /* TSQ : try to have two TSO segments in flight */
+ /* TSQ : try to have at least two segments in flight
+ * (one in NIC TX ring, another in Qdisc)
+ */
xmit_size_goal = min_t(u32, xmit_size_goal,
sysctl_tcp_limit_output_bytes >> 1);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index ec492eae0cd7..1a84fffe6993 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -688,6 +688,34 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt)
}
}
+/* Set the sk_pacing_rate to allow proper sizing of TSO packets.
+ * Note: TCP stack does not yet implement pacing.
+ * FQ packet scheduler can be used to implement cheap but effective
+ * TCP pacing, to smooth the burst on large writes when packets
+ * in flight is significantly lower than cwnd (or rwin)
+ */
+static void tcp_update_pacing_rate(struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ u64 rate;
+
+ /* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */
+ rate = (u64)tp->mss_cache * 2 * (HZ << 3);
+
+ rate *= max(tp->snd_cwnd, tp->packets_out);
+
+ /* Correction for small srtt : minimum srtt being 8 (1 jiffy << 3),
+ * be conservative and assume srtt = 1 (125 us instead of 1.25 ms)
+ * We probably need usec resolution in the future.
+ * Note: This also takes care of possible srtt=0 case,
+ * when tcp_rtt_estimator() was not yet called.
+ */
+ if (tp->srtt > 8 + 2)
+ do_div(rate, tp->srtt);
+
+ sk->sk_pacing_rate = min_t(u64, rate, ~0U);
+}
+
/* Calculate rto without backoff. This is the second half of Van Jacobson's
* routine referred to above.
*/
@@ -3278,7 +3306,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
u32 ack_seq = TCP_SKB_CB(skb)->seq;
u32 ack = TCP_SKB_CB(skb)->ack_seq;
bool is_dupack = false;
- u32 prior_in_flight;
+ u32 prior_in_flight, prior_cwnd = tp->snd_cwnd, prior_rtt = tp->srtt;
u32 prior_fackets;
int prior_packets = tp->packets_out;
const int prior_unsacked = tp->packets_out - tp->sacked_out;
@@ -3383,6 +3411,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
if (icsk->icsk_pending == ICSK_TIME_RETRANS)
tcp_schedule_loss_probe(sk);
+ if (tp->srtt != prior_rtt || tp->snd_cwnd != prior_cwnd)
+ tcp_update_pacing_rate(sk);
return 1;
no_queue:
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index f6a005c485a9..273ed735cca2 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -443,7 +443,7 @@ void tcp_init_metrics(struct sock *sk)
struct dst_entry *dst = __sk_dst_get(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct tcp_metrics_block *tm;
- u32 val;
+ u32 val, crtt = 0; /* cached RTT scaled by 8 */
if (dst == NULL)
goto reset;
@@ -478,40 +478,18 @@ void tcp_init_metrics(struct sock *sk)
tp->reordering = val;
}
- val = tcp_metric_get(tm, TCP_METRIC_RTT);
- if (val == 0 || tp->srtt == 0) {
- rcu_read_unlock();
- goto reset;
- }
- /* Initial rtt is determined from SYN,SYN-ACK.
- * The segment is small and rtt may appear much
- * less than real one. Use per-dst memory
- * to make it more realistic.
- *
- * A bit of theory. RTT is time passed after "normal" sized packet
- * is sent until it is ACKed. In normal circumstances sending small
- * packets force peer to delay ACKs and calculation is correct too.
- * The algorithm is adaptive and, provided we follow specs, it
- * NEVER underestimate RTT. BUT! If peer tries to make some clever
- * tricks sort of "quick acks" for time long enough to decrease RTT
- * to low value, and then abruptly stops to do it and starts to delay
- * ACKs, wait for troubles.
- */
- val = msecs_to_jiffies(val);
- if (val > tp->srtt) {
- tp->srtt = val;
- tp->rtt_seq = tp->snd_nxt;
- }
- val = tcp_metric_get_jiffies(tm, TCP_METRIC_RTTVAR);
- if (val > tp->mdev) {
- tp->mdev = val;
- tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk));
- }
+ crtt = tcp_metric_get_jiffies(tm, TCP_METRIC_RTT);
rcu_read_unlock();
-
- tcp_set_rto(sk);
reset:
- if (tp->srtt == 0) {
+ if (crtt > tp->srtt) {
+ /* Initial RTT (tp->srtt) from SYN usually doesn't measure
+ * serialization delay on low BW links well so RTO may be
+ * under-estimated. Stay conservative and seed RTO with
+ * the RTTs from past data exchanges, using the same seeding
+ * formula in tcp_rtt_estimator().
+ */
+ inet_csk(sk)->icsk_rto = crtt + max(crtt >> 2, tcp_rto_min(sk));
+ } else if (tp->srtt == 0) {
/* RFC6298: 5.7 We've failed to get a valid RTT sample from
* 3WHS. This is most likely due to retransmission,
* including spurious one. Reset the RTO back to 3secs
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 884efff5b531..e63ae4c9691d 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1631,7 +1631,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
/* If a full-sized TSO skb can be sent, do it. */
if (limit >= min_t(unsigned int, sk->sk_gso_max_size,
- sk->sk_gso_max_segs * tp->mss_cache))
+ tp->xmit_size_goal_segs * tp->mss_cache))
goto send_now;
/* Middle in queue won't get any more data, full sendable already? */
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 0b24508bcdc4..74d2c95db57f 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2337,7 +2337,7 @@ struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb,
uh->len = htons(skb->len - udp_offset);
/* csum segment if tunnel sets skb with csum. */
- if (unlikely(uh->check)) {
+ if (protocol == htons(ETH_P_IP) && unlikely(uh->check)) {
struct iphdr *iph = ip_hdr(skb);
uh->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
@@ -2348,7 +2348,18 @@ struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb,
if (uh->check == 0)
uh->check = CSUM_MANGLED_0;
+ } else if (protocol == htons(ETH_P_IPV6)) {
+ struct ipv6hdr *ipv6h = ipv6_hdr(skb);
+ u32 len = skb->len - udp_offset;
+
+ uh->check = ~csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr,
+ len, IPPROTO_UDP, 0);
+ uh->check = csum_fold(skb_checksum(skb, udp_offset, len, 0));
+ if (uh->check == 0)
+ uh->check = CSUM_MANGLED_0;
+ skb->ip_summed = CHECKSUM_NONE;
}
+
skb->protocol = protocol;
} while ((skb = skb->next));
out:
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 2d6d1793bbfe..2a66eaad047f 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -204,6 +204,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = {
.accept_source_route = 0, /* we do not accept RH0 by default. */
.disable_ipv6 = 0,
.accept_dad = 1,
+ .suppress_frag_ndisc = 1,
};
static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
@@ -241,17 +242,9 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
.accept_source_route = 0, /* we do not accept RH0 by default. */
.disable_ipv6 = 0,
.accept_dad = 1,
+ .suppress_frag_ndisc = 1,
};
-/* IPv6 Wildcard Address and Loopback Address defined by RFC2553 */
-const struct in6_addr in6addr_any = IN6ADDR_ANY_INIT;
-const struct in6_addr in6addr_loopback = IN6ADDR_LOOPBACK_INIT;
-const struct in6_addr in6addr_linklocal_allnodes = IN6ADDR_LINKLOCAL_ALLNODES_INIT;
-const struct in6_addr in6addr_linklocal_allrouters = IN6ADDR_LINKLOCAL_ALLROUTERS_INIT;
-const struct in6_addr in6addr_interfacelocal_allnodes = IN6ADDR_INTERFACELOCAL_ALLNODES_INIT;
-const struct in6_addr in6addr_interfacelocal_allrouters = IN6ADDR_INTERFACELOCAL_ALLROUTERS_INIT;
-const struct in6_addr in6addr_sitelocal_allrouters = IN6ADDR_SITELOCAL_ALLROUTERS_INIT;
-
/* Check if a valid qdisc is available */
static inline bool addrconf_qdisc_ok(const struct net_device *dev)
{
@@ -311,36 +304,6 @@ err_ip:
return -ENOMEM;
}
-static void snmp6_free_dev(struct inet6_dev *idev)
-{
- kfree(idev->stats.icmpv6msgdev);
- kfree(idev->stats.icmpv6dev);
- snmp_mib_free((void __percpu **)idev->stats.ipv6);
-}
-
-/* Nobody refers to this device, we may destroy it. */
-
-void in6_dev_finish_destroy(struct inet6_dev *idev)
-{
- struct net_device *dev = idev->dev;
-
- WARN_ON(!list_empty(&idev->addr_list));
- WARN_ON(idev->mc_list != NULL);
- WARN_ON(timer_pending(&idev->rs_timer));
-
-#ifdef NET_REFCNT_DEBUG
- pr_debug("%s: %s\n", __func__, dev ? dev->name : "NIL");
-#endif
- dev_put(dev);
- if (!idev->dead) {
- pr_warn("Freeing alive inet6 device %p\n", idev);
- return;
- }
- snmp6_free_dev(idev);
- kfree_rcu(idev, rcu);
-}
-EXPORT_SYMBOL(in6_dev_finish_destroy);
-
static struct inet6_dev *ipv6_add_dev(struct net_device *dev)
{
struct inet6_dev *ndev;
@@ -3097,6 +3060,7 @@ static int addrconf_ifdown(struct net_device *dev, int how)
static void addrconf_rs_timer(unsigned long data)
{
struct inet6_dev *idev = (struct inet6_dev *)data;
+ struct net_device *dev = idev->dev;
struct in6_addr lladdr;
write_lock(&idev->lock);
@@ -3111,12 +3075,14 @@ static void addrconf_rs_timer(unsigned long data)
goto out;
if (idev->rs_probes++ < idev->cnf.rtr_solicits) {
- if (!__ipv6_get_lladdr(idev, &lladdr, IFA_F_TENTATIVE))
- ndisc_send_rs(idev->dev, &lladdr,
+ write_unlock(&idev->lock);
+ if (!ipv6_get_lladdr(dev, &lladdr, IFA_F_TENTATIVE))
+ ndisc_send_rs(dev, &lladdr,
&in6addr_linklocal_allrouters);
else
- goto out;
+ goto put;
+ write_lock(&idev->lock);
/* The wait after the last probe can be shorter */
addrconf_mod_rs_timer(idev, (idev->rs_probes ==
idev->cnf.rtr_solicits) ?
@@ -3132,6 +3098,7 @@ static void addrconf_rs_timer(unsigned long data)
out:
write_unlock(&idev->lock);
+put:
in6_dev_put(idev);
}
@@ -4188,6 +4155,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf,
array[DEVCONF_ACCEPT_DAD] = cnf->accept_dad;
array[DEVCONF_FORCE_TLLAO] = cnf->force_tllao;
array[DEVCONF_NDISC_NOTIFY] = cnf->ndisc_notify;
+ array[DEVCONF_SUPPRESS_FRAG_NDISC] = cnf->suppress_frag_ndisc;
}
static inline size_t inet6_ifla6_size(void)
@@ -5002,6 +4970,13 @@ static struct addrconf_sysctl_table
.proc_handler = proc_dointvec
},
{
+ .procname = "suppress_frag_ndisc",
+ .data = &ipv6_devconf.suppress_frag_ndisc,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
/* sentinel */
}
},
diff --git a/net/ipv6/addrconf_core.c b/net/ipv6/addrconf_core.c
index d2f87427244b..4c11cbcf8308 100644
--- a/net/ipv6/addrconf_core.c
+++ b/net/ipv6/addrconf_core.c
@@ -6,6 +6,7 @@
#include <linux/export.h>
#include <net/ipv6.h>
#include <net/addrconf.h>
+#include <net/ip.h>
#define IPV6_ADDR_SCOPE_TYPE(scope) ((scope) << 16)
@@ -98,3 +99,52 @@ int inet6addr_notifier_call_chain(unsigned long val, void *v)
return atomic_notifier_call_chain(&inet6addr_chain, val, v);
}
EXPORT_SYMBOL(inet6addr_notifier_call_chain);
+
+const struct ipv6_stub *ipv6_stub __read_mostly;
+EXPORT_SYMBOL_GPL(ipv6_stub);
+
+/* IPv6 Wildcard Address and Loopback Address defined by RFC2553 */
+const struct in6_addr in6addr_loopback = IN6ADDR_LOOPBACK_INIT;
+EXPORT_SYMBOL(in6addr_loopback);
+const struct in6_addr in6addr_any = IN6ADDR_ANY_INIT;
+EXPORT_SYMBOL(in6addr_any);
+const struct in6_addr in6addr_linklocal_allnodes = IN6ADDR_LINKLOCAL_ALLNODES_INIT;
+EXPORT_SYMBOL(in6addr_linklocal_allnodes);
+const struct in6_addr in6addr_linklocal_allrouters = IN6ADDR_LINKLOCAL_ALLROUTERS_INIT;
+EXPORT_SYMBOL(in6addr_linklocal_allrouters);
+const struct in6_addr in6addr_interfacelocal_allnodes = IN6ADDR_INTERFACELOCAL_ALLNODES_INIT;
+EXPORT_SYMBOL(in6addr_interfacelocal_allnodes);
+const struct in6_addr in6addr_interfacelocal_allrouters = IN6ADDR_INTERFACELOCAL_ALLROUTERS_INIT;
+EXPORT_SYMBOL(in6addr_interfacelocal_allrouters);
+const struct in6_addr in6addr_sitelocal_allrouters = IN6ADDR_SITELOCAL_ALLROUTERS_INIT;
+EXPORT_SYMBOL(in6addr_sitelocal_allrouters);
+
+static void snmp6_free_dev(struct inet6_dev *idev)
+{
+ kfree(idev->stats.icmpv6msgdev);
+ kfree(idev->stats.icmpv6dev);
+ snmp_mib_free((void __percpu **)idev->stats.ipv6);
+}
+
+/* Nobody refers to this device, we may destroy it. */
+
+void in6_dev_finish_destroy(struct inet6_dev *idev)
+{
+ struct net_device *dev = idev->dev;
+
+ WARN_ON(!list_empty(&idev->addr_list));
+ WARN_ON(idev->mc_list != NULL);
+ WARN_ON(timer_pending(&idev->rs_timer));
+
+#ifdef NET_REFCNT_DEBUG
+ pr_debug("%s: %s\n", __func__, dev ? dev->name : "NIL");
+#endif
+ dev_put(dev);
+ if (!idev->dead) {
+ pr_warn("Freeing alive inet6 device %p\n", idev);
+ return;
+ }
+ snmp6_free_dev(idev);
+ kfree_rcu(idev, rcu);
+}
+EXPORT_SYMBOL(in6_dev_finish_destroy);
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 0d1a9b153fbb..136fe55c1a47 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -56,6 +56,7 @@
#include <net/transp_v6.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
+#include <net/ndisc.h>
#ifdef CONFIG_IPV6_TUNNEL
#include <net/ip6_tunnel.h>
#endif
@@ -810,6 +811,15 @@ static struct pernet_operations inet6_net_ops = {
.exit = inet6_net_exit,
};
+static const struct ipv6_stub ipv6_stub_impl = { /* published through ipv6_stub by inet6_init() */
+ .ipv6_sock_mc_join = ipv6_sock_mc_join,
+ .ipv6_sock_mc_drop = ipv6_sock_mc_drop,
+ .ipv6_dst_lookup = ip6_dst_lookup,
+ .udpv6_encap_enable = udpv6_encap_enable,
+ .ndisc_send_na = ndisc_send_na,
+ .nd_tbl = &nd_tbl,
+};
+
static int __init inet6_init(void)
{
struct list_head *r;
@@ -884,6 +894,9 @@ static int __init inet6_init(void)
err = igmp6_init();
if (err)
goto igmp_fail;
+
+ ipv6_stub = &ipv6_stub_impl;
+
err = ipv6_netfilter_init();
if (err)
goto netfilter_fail;
@@ -1040,6 +1053,7 @@ static void __exit inet6_exit(void)
raw6_proc_exit();
#endif
ipv6_netfilter_fini();
+ ipv6_stub = NULL;
igmp6_cleanup();
ndisc_cleanup();
ip6_mr_cleanup();
diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index a263b990ee11..d82de7228100 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -91,6 +91,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
unsigned int unfrag_ip6hlen;
u8 *prevhdr;
int offset = 0;
+ bool tunnel;
if (unlikely(skb_shinfo(skb)->gso_type &
~(SKB_GSO_UDP |
@@ -106,6 +107,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
if (unlikely(!pskb_may_pull(skb, sizeof(*ipv6h))))
goto out;
+ tunnel = skb->encapsulation;
ipv6h = ipv6_hdr(skb);
__skb_pull(skb, sizeof(*ipv6h));
segs = ERR_PTR(-EPROTONOSUPPORT);
@@ -126,7 +128,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
ipv6h = ipv6_hdr(skb);
ipv6h->payload_len = htons(skb->len - skb->mac_len -
sizeof(*ipv6h));
- if (proto == IPPROTO_UDP) {
+ if (!tunnel && proto == IPPROTO_UDP) {
unfrag_ip6hlen = ip6_find_1stfragopt(skb, &prevhdr);
fptr = (struct frag_hdr *)(skb_network_header(skb) +
unfrag_ip6hlen);
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 6e3ddf806ec2..dd08cfd8999e 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -56,31 +56,6 @@
#include <net/checksum.h>
#include <linux/mroute6.h>
-int __ip6_local_out(struct sk_buff *skb)
-{
- int len;
-
- len = skb->len - sizeof(struct ipv6hdr);
- if (len > IPV6_MAXPLEN)
- len = 0;
- ipv6_hdr(skb)->payload_len = htons(len);
-
- return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
- skb_dst(skb)->dev, dst_output);
-}
-
-int ip6_local_out(struct sk_buff *skb)
-{
- int err;
-
- err = __ip6_local_out(skb);
- if (likely(err == 1))
- err = dst_output(skb);
-
- return err;
-}
-EXPORT_SYMBOL_GPL(ip6_local_out);
-
static int ip6_finish_output2(struct sk_buff *skb)
{
struct dst_entry *dst = skb_dst(skb);
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 04d31c2fbef1..22210650596f 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -428,7 +428,6 @@ static void ndisc_send_skb(struct sk_buff *skb,
type = icmp6h->icmp6_type;
if (!dst) {
- struct sock *sk = net->ipv6.ndisc_sk;
struct flowi6 fl6;
icmpv6_flow_init(sk, &fl6, type, saddr, daddr, skb->dev->ifindex);
@@ -462,10 +461,10 @@ static void ndisc_send_skb(struct sk_buff *skb,
rcu_read_unlock();
}
-static void ndisc_send_na(struct net_device *dev, struct neighbour *neigh,
- const struct in6_addr *daddr,
- const struct in6_addr *solicited_addr,
- bool router, bool solicited, bool override, bool inc_opt)
+void ndisc_send_na(struct net_device *dev, struct neighbour *neigh,
+ const struct in6_addr *daddr,
+ const struct in6_addr *solicited_addr,
+ bool router, bool solicited, bool override, bool inc_opt)
{
struct sk_buff *skb;
struct in6_addr tmpaddr;
@@ -663,9 +662,7 @@ static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb)
}
ndisc_send_ns(dev, neigh, target, target, saddr);
} else if ((probes -= neigh->parms->app_probes) < 0) {
-#ifdef CONFIG_ARPD
neigh_app_ns(neigh);
-#endif
} else {
addrconf_addr_solict_mult(target, &mcaddr);
ndisc_send_ns(dev, NULL, target, &mcaddr, saddr);
@@ -1519,10 +1516,27 @@ static void pndisc_redo(struct sk_buff *skb)
kfree_skb(skb);
}
+static bool ndisc_suppress_frag_ndisc(struct sk_buff *skb)
+{ /* Tells ndisc_rcv() whether skb must be ignored (true = ignore). */
+ struct inet6_dev *idev = __in6_dev_get(skb->dev);
+
+ if (!idev) /* receiving device has no inet6_dev: ignore the packet */
+ return true;
+ if (IP6CB(skb)->flags & IP6SKB_FRAGMENTED && /* skb carries the FRAGMENTED flag */
+ idev->cnf.suppress_frag_ndisc) { /* and the per-device setting is on */
+ net_warn_ratelimited("Received fragmented ndisc packet. Carefully consider disabling suppress_frag_ndisc.\n");
+ return true;
+ }
+ return false;
+}
+
int ndisc_rcv(struct sk_buff *skb)
{
struct nd_msg *msg;
+ if (ndisc_suppress_frag_ndisc(skb))
+ return 0;
+
if (skb_linearize(skb))
return 0;
diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c
index ab92a3673fbb..827f795209cf 100644
--- a/net/ipv6/output_core.c
+++ b/net/ipv6/output_core.c
@@ -5,6 +5,7 @@
#include <linux/export.h>
#include <net/ipv6.h>
#include <net/ip6_fib.h>
+#include <net/addrconf.h>
void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt)
{
@@ -75,3 +76,50 @@ int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
return offset;
}
EXPORT_SYMBOL(ip6_find_1stfragopt);
+/* Hop limit to use for dst: the RTAX_HOPLIMIT metric when it is non-zero,
+ * otherwise the hop_limit configured on dst->dev, or the namespace-wide
+ * devconf_all value when that device has no inet6_dev. The inet6_dev is
+ * looked up under rcu_read_lock().
+ */
+#if IS_ENABLED(CONFIG_IPV6)
+int ip6_dst_hoplimit(struct dst_entry *dst)
+{
+ int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
+ if (hoplimit == 0) {
+ struct net_device *dev = dst->dev;
+ struct inet6_dev *idev;
+
+ rcu_read_lock();
+ idev = __in6_dev_get(dev);
+ if (idev)
+ hoplimit = idev->cnf.hop_limit;
+ else
+ hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
+ rcu_read_unlock();
+ }
+ return hoplimit;
+}
+EXPORT_SYMBOL(ip6_dst_hoplimit);
+#endif
+/* Fill in the IPv6 payload length (0 when it would exceed IPV6_MAXPLEN) and
+ * pass the packet to the NF_INET_LOCAL_OUT netfilter hook with dst_output
+ * as the continuation. Returns whatever nf_hook() returns.
+ */
+int __ip6_local_out(struct sk_buff *skb)
+{
+ int len;
+
+ len = skb->len - sizeof(struct ipv6hdr);
+ if (len > IPV6_MAXPLEN)
+ len = 0;
+ ipv6_hdr(skb)->payload_len = htons(len);
+
+ return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
+ skb_dst(skb)->dev, dst_output);
+}
+EXPORT_SYMBOL_GPL(__ip6_local_out);
+
+/* Run __ip6_local_out() and finish the transmission.
+ * Only a return value of 1 from __ip6_local_out() leads to dst_output()
+ * being called here; any other value is handed back unchanged.
+ */
+int ip6_local_out(struct sk_buff *skb)
+{
+	int verdict = __ip6_local_out(skb);
+
+	if (unlikely(verdict != 1))
+		return verdict;
+
+	return dst_output(skb);
+}
+EXPORT_SYMBOL_GPL(ip6_local_out);
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 55236a84c748..b770085ae36d 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1354,25 +1354,6 @@ out:
return entries > rt_max_size;
}
-int ip6_dst_hoplimit(struct dst_entry *dst)
-{
- int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
- if (hoplimit == 0) {
- struct net_device *dev = dst->dev;
- struct inet6_dev *idev;
-
- rcu_read_lock();
- idev = __in6_dev_get(dev);
- if (idev)
- hoplimit = idev->cnf.hop_limit;
- else
- hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
- rcu_read_unlock();
- }
- return hoplimit;
-}
-EXPORT_SYMBOL(ip6_dst_hoplimit);
-
/*
*
*/
diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c
index 5d1b8d7ac993..60559511bd9c 100644
--- a/net/ipv6/udp_offload.c
+++ b/net/ipv6/udp_offload.c
@@ -21,26 +21,25 @@ static int udp6_ufo_send_check(struct sk_buff *skb)
const struct ipv6hdr *ipv6h;
struct udphdr *uh;
- /* UDP Tunnel offload on ipv6 is not yet supported. */
- if (skb->encapsulation)
- return -EINVAL;
-
if (!pskb_may_pull(skb, sizeof(*uh)))
return -EINVAL;
- ipv6h = ipv6_hdr(skb);
- uh = udp_hdr(skb);
+ if (likely(!skb->encapsulation)) {
+ ipv6h = ipv6_hdr(skb);
+ uh = udp_hdr(skb);
+
+ uh->check = ~csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr, skb->len,
+ IPPROTO_UDP, 0);
+ skb->csum_start = skb_transport_header(skb) - skb->head;
+ skb->csum_offset = offsetof(struct udphdr, check);
+ skb->ip_summed = CHECKSUM_PARTIAL;
+ }
- uh->check = ~csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr, skb->len,
- IPPROTO_UDP, 0);
- skb->csum_start = skb_transport_header(skb) - skb->head;
- skb->csum_offset = offsetof(struct udphdr, check);
- skb->ip_summed = CHECKSUM_PARTIAL;
return 0;
}
static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb,
- netdev_features_t features)
+ netdev_features_t features)
{
struct sk_buff *segs = ERR_PTR(-EINVAL);
unsigned int mss;
@@ -75,47 +74,51 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb,
goto out;
}
- /* Do software UFO. Complete and fill in the UDP checksum as HW cannot
- * do checksum of UDP packets sent as multiple IP fragments.
- */
- offset = skb_checksum_start_offset(skb);
- csum = skb_checksum(skb, offset, skb->len - offset, 0);
- offset += skb->csum_offset;
- *(__sum16 *)(skb->data + offset) = csum_fold(csum);
- skb->ip_summed = CHECKSUM_NONE;
-
- /* Check if there is enough headroom to insert fragment header. */
- tnl_hlen = skb_tnl_header_len(skb);
- if (skb_headroom(skb) < (tnl_hlen + frag_hdr_sz)) {
- if (gso_pskb_expand_head(skb, tnl_hlen + frag_hdr_sz))
- goto out;
+ if (skb->encapsulation && skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL)
+ segs = skb_udp_tunnel_segment(skb, features);
+ else {
+ /* Do software UFO. Complete and fill in the UDP checksum as HW cannot
+ * do checksum of UDP packets sent as multiple IP fragments.
+ */
+ offset = skb_checksum_start_offset(skb);
+ csum = skb_checksum(skb, offset, skb->len - offset, 0);
+ offset += skb->csum_offset;
+ *(__sum16 *)(skb->data + offset) = csum_fold(csum);
+ skb->ip_summed = CHECKSUM_NONE;
+
+ /* Check if there is enough headroom to insert fragment header. */
+ tnl_hlen = skb_tnl_header_len(skb);
+ if (skb_headroom(skb) < (tnl_hlen + frag_hdr_sz)) {
+ if (gso_pskb_expand_head(skb, tnl_hlen + frag_hdr_sz))
+ goto out;
+ }
+
+ /* Find the unfragmentable header and shift it left by frag_hdr_sz
+ * bytes to insert fragment header.
+ */
+ unfrag_ip6hlen = ip6_find_1stfragopt(skb, &prevhdr);
+ nexthdr = *prevhdr;
+ *prevhdr = NEXTHDR_FRAGMENT;
+ unfrag_len = (skb_network_header(skb) - skb_mac_header(skb)) +
+ unfrag_ip6hlen + tnl_hlen;
+ packet_start = (u8 *) skb->head + SKB_GSO_CB(skb)->mac_offset;
+ memmove(packet_start-frag_hdr_sz, packet_start, unfrag_len);
+
+ SKB_GSO_CB(skb)->mac_offset -= frag_hdr_sz;
+ skb->mac_header -= frag_hdr_sz;
+ skb->network_header -= frag_hdr_sz;
+
+ fptr = (struct frag_hdr *)(skb_network_header(skb) + unfrag_ip6hlen);
+ fptr->nexthdr = nexthdr;
+ fptr->reserved = 0;
+ ipv6_select_ident(fptr, (struct rt6_info *)skb_dst(skb));
+
+ /* Fragment the skb. ipv6 header and the remaining fields of the
+ * fragment header are updated in ipv6_gso_segment()
+ */
+ segs = skb_segment(skb, features);
}
- /* Find the unfragmentable header and shift it left by frag_hdr_sz
- * bytes to insert fragment header.
- */
- unfrag_ip6hlen = ip6_find_1stfragopt(skb, &prevhdr);
- nexthdr = *prevhdr;
- *prevhdr = NEXTHDR_FRAGMENT;
- unfrag_len = (skb_network_header(skb) - skb_mac_header(skb)) +
- unfrag_ip6hlen + tnl_hlen;
- packet_start = (u8 *) skb->head + SKB_GSO_CB(skb)->mac_offset;
- memmove(packet_start-frag_hdr_sz, packet_start, unfrag_len);
-
- SKB_GSO_CB(skb)->mac_offset -= frag_hdr_sz;
- skb->mac_header -= frag_hdr_sz;
- skb->network_header -= frag_hdr_sz;
-
- fptr = (struct frag_hdr *)(skb_network_header(skb) + unfrag_ip6hlen);
- fptr->nexthdr = nexthdr;
- fptr->reserved = 0;
- ipv6_select_ident(fptr, (struct rt6_info *)skb_dst(skb));
-
- /* Fragment the skb. ipv6 header and the remaining fields of the
- * fragment header are updated in ipv6_gso_segment()
- */
- segs = skb_segment(skb, features);
-
out:
return segs;
}
diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c
index 36848bd54a77..a0060245b4e1 100644
--- a/net/openvswitch/vport-vxlan.c
+++ b/net/openvswitch/vport-vxlan.c
@@ -123,7 +123,7 @@ static struct vport *vxlan_tnl_create(const struct vport_parms *parms)
vxlan_port = vxlan_vport(vport);
strncpy(vxlan_port->name, parms->name, IFNAMSIZ);
- vs = vxlan_sock_add(net, htons(dst_port), vxlan_rcv, vport, true);
+ vs = vxlan_sock_add(net, htons(dst_port), vxlan_rcv, vport, true, false);
if (IS_ERR(vs)) {
ovs_vport_free(vport);
return (void *)vs;
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 1fdf9ab91c3f..2e8286b47c28 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -88,7 +88,7 @@
#include <linux/virtio_net.h>
#include <linux/errqueue.h>
#include <linux/net_tstamp.h>
-
+#include <linux/reciprocal_div.h>
#ifdef CONFIG_INET
#include <net/inet_common.h>
#endif
@@ -1135,7 +1135,7 @@ static unsigned int fanout_demux_hash(struct packet_fanout *f,
struct sk_buff *skb,
unsigned int num)
{
- return (((u64)skb->rxhash) * num) >> 32;
+ return reciprocal_divide(skb->rxhash, num);
}
static unsigned int fanout_demux_lb(struct packet_fanout *f,
@@ -1158,6 +1158,13 @@ static unsigned int fanout_demux_cpu(struct packet_fanout *f,
return smp_processor_id() % num;
}
+static unsigned int fanout_demux_rnd(struct packet_fanout *f,
+ struct sk_buff *skb,
+ unsigned int num)
+{ /* Derive the index from a fresh prandom_u32() value; f and skb are unused. */
+ return reciprocal_divide(prandom_u32(), num);
+}
+
static unsigned int fanout_demux_rollover(struct packet_fanout *f,
struct sk_buff *skb,
unsigned int idx, unsigned int skip,
@@ -1215,6 +1222,9 @@ static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
case PACKET_FANOUT_CPU:
idx = fanout_demux_cpu(f, skb, num);
break;
+ case PACKET_FANOUT_RND:
+ idx = fanout_demux_rnd(f, skb, num);
+ break;
case PACKET_FANOUT_ROLLOVER:
idx = fanout_demux_rollover(f, skb, 0, (unsigned int) -1, num);
break;
@@ -1284,6 +1294,7 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
case PACKET_FANOUT_HASH:
case PACKET_FANOUT_LB:
case PACKET_FANOUT_CPU:
+ case PACKET_FANOUT_RND:
break;
default:
return -EINVAL;
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 235e01acac51..c03a32a0418e 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -272,6 +272,20 @@ config NET_SCH_FQ_CODEL
If unsure, say N.
+config NET_SCH_FQ
+ tristate "Fair Queue"
+ help
+ Say Y here if you want to use the FQ packet scheduling algorithm.
+
+ FQ does flow separation, and is able to respect pacing requirements
+ set by TCP stack into sk->sk_pacing_rate (for locally generated
+ traffic).
+
+ To compile this driver as a module, choose M here: the module
+ will be called sch_fq.
+
+ If unsure, say N.
+
config NET_SCH_INGRESS
tristate "Ingress Qdisc"
depends on NET_CLS_ACT
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 978cbf004e80..e5f9abe9a5db 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -39,6 +39,7 @@ obj-$(CONFIG_NET_SCH_CHOKE) += sch_choke.o
obj-$(CONFIG_NET_SCH_QFQ) += sch_qfq.o
obj-$(CONFIG_NET_SCH_CODEL) += sch_codel.o
obj-$(CONFIG_NET_SCH_FQ_CODEL) += sch_fq_codel.o
+obj-$(CONFIG_NET_SCH_FQ) += sch_fq.o
obj-$(CONFIG_NET_CLS_U32) += cls_u32.o
obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 51b968d3febb..2adda7fa2d39 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -200,6 +200,58 @@ int unregister_qdisc(struct Qdisc_ops *qops)
}
EXPORT_SYMBOL(unregister_qdisc);
+/* Get default qdisc if not otherwise specified.
+ * Copies the id of default_qdisc_ops into name with strlcpy() (len is the
+ * size of the destination) while holding qdisc_mod_lock for reading.
+ */
+void qdisc_get_default(char *name, size_t len)
+{
+ read_lock(&qdisc_mod_lock);
+ strlcpy(name, default_qdisc_ops->id, len);
+ read_unlock(&qdisc_mod_lock);
+}
+
+/* Find the registered qdisc whose id equals name and take a reference on
+ * its module. Returns NULL when no such qdisc is on the qdisc_base list or
+ * when the module reference cannot be taken.
+ * qdisc_set_default() calls this with qdisc_mod_lock write-held.
+ */
+static struct Qdisc_ops *qdisc_lookup_default(const char *name)
+{
+	struct Qdisc_ops *ops = qdisc_base;
+
+	while (ops && strcmp(name, ops->id) != 0)
+		ops = ops->next;
+
+	if (ops && !try_module_get(ops->owner))
+		return NULL;
+
+	return ops;
+}
+
+/* Set new default qdisc to use.
+ * Needs CAP_NET_ADMIN (-EPERM otherwise). The name is looked up among the
+ * registered qdiscs; when absent, module "sch_<name>" is requested with the
+ * lock released and the lookup is repeated. On success the module
+ * reference of the previous default is dropped and 0 is returned; if the
+ * qdisc still cannot be found the result is -ENOENT.
+ */
+int qdisc_set_default(const char *name)
+{
+ const struct Qdisc_ops *ops;
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ write_lock(&qdisc_mod_lock);
+ ops = qdisc_lookup_default(name);
+ if (!ops) {
+ /* Not found, drop lock and try to load module */
+ write_unlock(&qdisc_mod_lock);
+ request_module("sch_%s", name);
+ write_lock(&qdisc_mod_lock);
+
+ ops = qdisc_lookup_default(name);
+ }
+
+ if (ops) {
+ /* Set new default */
+ module_put(default_qdisc_ops->owner);
+ default_qdisc_ops = ops;
+ }
+ write_unlock(&qdisc_mod_lock);
+
+ return ops ? 0 : -ENOENT;
+}
+
/* We know handle. Find qdisc among all qdisc's attached to device
(root qdisc, all its children, children of children etc.)
*/
@@ -1854,6 +1906,7 @@ static int __init pktsched_init(void)
return err;
}
+ register_qdisc(&pfifo_fast_ops);
register_qdisc(&pfifo_qdisc_ops);
register_qdisc(&bfifo_qdisc_ops);
register_qdisc(&pfifo_head_drop_qdisc_ops);
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
new file mode 100644
index 000000000000..32ad015ee8ce
--- /dev/null
+++ b/net/sched/sch_fq.c
@@ -0,0 +1,793 @@
+/*
+ * net/sched/sch_fq.c Fair Queue Packet Scheduler (per flow pacing)
+ *
+ * Copyright (C) 2013 Eric Dumazet <edumazet@google.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Meant to be mostly used for locally generated traffic :
+ * Fast classification depends on skb->sk being set before reaching us.
+ * If not, (router workload), we use rxhash as fallback, with 32 bits wide hash.
+ * All packets belonging to a socket are considered as a 'flow'.
+ *
+ * Flows are dynamically allocated and stored in a hash table of RB trees
+ * They are also part of one Round Robin 'queues' (new or old flows)
+ *
+ * Burst avoidance (aka pacing) capability :
+ *
+ * Transport (eg TCP) can set in sk->sk_pacing_rate a rate, enqueue a
+ * bunch of packets, and this packet scheduler adds delay between
+ * packets to respect rate limitation.
+ *
+ * enqueue() :
+ * - lookup one RB tree (out of 1024 or more) to find the flow.
+ * If non existent flow, create it, add it to the tree.
+ * Add skb to the per flow list of skb (fifo).
+ * - Use a special fifo for high prio packets
+ *
+ * dequeue() : serves flows in Round Robin
+ * Note : When a flow becomes empty, we do not immediately remove it from
+ * rb trees, for performance reasons (it is expected to send additional
+ * packets, or SLAB cache will reuse socket for another flow)
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include <linux/string.h>
+#include <linux/in.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/rbtree.h>
+#include <linux/hash.h>
+#include <linux/prefetch.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+
+/*
+ * Per flow structure, dynamically allocated
+ * (from fq_flow_cachep, in fq_classify()). tail and age share storage:
+ * tail is used while skbs are queued, age once the flow has been emptied.
+ */
+struct fq_flow {
+ struct sk_buff *head; /* list of skbs for this flow : first skb */
+ union {
+ struct sk_buff *tail; /* last skb in the list */
+ unsigned long age; /* jiffies when flow was emptied, for gc */
+ };
+ struct rb_node fq_node; /* anchor in fq_root[] trees */
+ struct sock *sk; /* tree key: skb->sk, or (rxhash | 1) when the skb has no socket */
+ int qlen; /* number of packets in flow queue */
+ int credit; /* bytes left to send; fq_dequeue() adds quantum once it is <= 0 */
+ u32 socket_hash; /* sk_hash */
+ struct fq_flow *next; /* next pointer in RR lists, or &detached / &throttled */
+
+ struct rb_node rate_node; /* anchor in q->delayed tree */
+ u64 time_next_packet; /* ktime in ns before which fq_dequeue() throttles the flow */
+};
+/* First and last element of a list of flows chained through ->next */
+struct fq_flow_head {
+ struct fq_flow *first;
+ struct fq_flow *last;
+};
+/* Private data of one fq qdisc instance (returned by qdisc_priv()) */
+struct fq_sched_data {
+ struct fq_flow_head new_flows; /* flows that just left the detached state; served first */
+
+ struct fq_flow_head old_flows; /* served when new_flows is empty */
+
+ struct rb_root delayed; /* for rate limited flows */
+ u64 time_next_delayed_flow; /* earliest deadline seen in delayed; ~0ULL after a scan found none pending */
+
+ struct fq_flow internal; /* for non classified or high prio packets */
+ u32 quantum; /* credit added to a flow whose credit ran out */
+ u32 initial_quantum; /* credit given to a newly created flow */
+ u32 flow_default_rate;/* rate per flow : bytes per second */
+ u32 flow_max_rate; /* optional max rate per flow */
+ u32 flow_plimit; /* max packets per flow */
+ struct rb_root *fq_root; /* array of (1 << fq_trees_log) rb-tree roots */
+ u8 rate_enable; /* 0 or 1: pacing delay computed in fq_dequeue() when set */
+ u8 fq_trees_log; /* log2 of the number of trees in fq_root */
+
+ u32 flows; /* flows currently linked in fq_root trees */
+ u32 inactive_flows; /* of which detached */
+ u32 throttled_flows; /* flows currently in delayed */
+
+ u64 stat_gc_flows;
+ u64 stat_internal_packets;
+ u64 stat_tcp_retrans;
+ u64 stat_throttled;
+ u64 stat_flows_plimit;
+ u64 stat_pkts_too_long;
+ u64 stat_allocation_errors;
+ struct qdisc_watchdog watchdog;
+};
+
+/* special value to mark a detached flow (not on old/new list);
+ * &throttled is the equivalent marker stored by fq_flow_set_throttled().
+ * Only the addresses of these two objects are used.
+ */
+static struct fq_flow detached, throttled;
+/* Mark f as detached by pointing its ->next at the detached sentinel */
+static void fq_flow_set_detached(struct fq_flow *f)
+{
+ f->next = &detached;
+}
+/* True when f->next is the detached sentinel */
+static bool fq_flow_is_detached(const struct fq_flow *f)
+{
+ return f->next == &detached;
+}
+/* Park f in q->delayed, an rb-tree ordered by time_next_packet (a flow with
+ * an equal key goes to the right of the existing one), mark it with the
+ * throttled sentinel and lower q->time_next_delayed_flow when f's deadline
+ * is earlier than the one recorded so far.
+ */
+static void fq_flow_set_throttled(struct fq_sched_data *q, struct fq_flow *f)
+{
+ struct rb_node **p = &q->delayed.rb_node, *parent = NULL;
+
+ while (*p) {
+ struct fq_flow *aux;
+
+ parent = *p;
+ aux = container_of(parent, struct fq_flow, rate_node);
+ if (f->time_next_packet >= aux->time_next_packet)
+ p = &parent->rb_right;
+ else
+ p = &parent->rb_left;
+ }
+ rb_link_node(&f->rate_node, parent, p);
+ rb_insert_color(&f->rate_node, &q->delayed);
+ q->throttled_flows++;
+ q->stat_throttled++;
+
+ f->next = &throttled;
+ if (q->time_next_delayed_flow > f->time_next_packet)
+ q->time_next_delayed_flow = f->time_next_packet;
+}
+
+/* Slab cache for struct fq_flow, created in fq_module_init() */
+static struct kmem_cache *fq_flow_cachep __read_mostly;
+
+/* Append flow to the end of the list described by head */
+static void fq_flow_add_tail(struct fq_flow_head *head, struct fq_flow *flow)
+{
+	/* slot to fill: the list head when empty, else the old tail's ->next */
+	struct fq_flow **link = head->first ? &head->last->next : &head->first;
+
+	*link = flow;
+	head->last = flow;
+	flow->next = NULL;
+}
+
+/* limit number of collected flows per round */
+#define FQ_GC_MAX 8
+#define FQ_GC_AGE (3*HZ) /* jiffies a flow must have been detached before fq_gc_candidate() accepts it */
+
+/* A flow may be garbage collected once it is detached and more than
+ * FQ_GC_AGE jiffies have passed since f->age was recorded.
+ */
+static bool fq_gc_candidate(const struct fq_flow *f)
+{
+	if (!fq_flow_is_detached(f))
+		return false;
+
+	return time_after(jiffies, f->age + FQ_GC_AGE);
+}
+
+/* Opportunistic garbage collection of one rb-tree.
+ * Walks root along the search path for sk, remembering up to FQ_GC_MAX
+ * flows accepted by fq_gc_candidate(). The walk stops early when the flow
+ * for sk itself is met or when the batch is full. The remembered flows are
+ * then erased from the tree and returned to fq_flow_cachep, and the flow
+ * counters in q are adjusted.
+ */
+static void fq_gc(struct fq_sched_data *q,
+		  struct rb_root *root,
+		  struct sock *sk)
+{
+	struct fq_flow *f, *tofree[FQ_GC_MAX];
+	struct rb_node **p, *parent;
+	int fcnt = 0;
+
+	p = &root->rb_node;
+	parent = NULL;
+	while (*p) {
+		parent = *p;
+
+		f = container_of(parent, struct fq_flow, fq_node);
+		if (f->sk == sk)
+			break;
+
+		if (fq_gc_candidate(f)) {
+			tofree[fcnt++] = f;
+			if (fcnt == FQ_GC_MAX)
+				break;
+		}
+
+		if (f->sk > sk)
+			p = &parent->rb_right;
+		else
+			p = &parent->rb_left;
+	}
+
+	q->flows -= fcnt;
+	q->inactive_flows -= fcnt;
+	q->stat_gc_flows += fcnt;
+	while (fcnt) {
+		/* reuse the outer cursor instead of declaring a shadowing one */
+		f = tofree[--fcnt];
+
+		rb_erase(&f->fq_node, root);
+		kmem_cache_free(fq_flow_cachep, f);
+	}
+}
+/* Band for each value of (skb->priority & TC_PRIO_MAX); fq_classify() sends band 0 to q->internal */
+static const u8 prio2band[TC_PRIO_MAX + 1] = {
+ 1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1
+};
+/* Find, or create, the flow an skb belongs to.
+ * Packets whose priority maps to band 0 go to q->internal. Otherwise the
+ * key is skb->sk, or (rxhash | 1) when the skb has no socket; the key
+ * selects one rb-tree of q->fq_root by hash and is then searched in it.
+ * A missing flow is allocated with GFP_ATOMIC, starts detached with
+ * initial_quantum credit and is counted as inactive. If the allocation
+ * fails the packet falls back to q->internal.
+ */
+static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
+{
+ struct rb_node **p, *parent;
+ struct sock *sk = skb->sk;
+ struct rb_root *root;
+ struct fq_flow *f;
+ int band;
+
+ /* warning: no starvation prevention... */
+ band = prio2band[skb->priority & TC_PRIO_MAX];
+ if (unlikely(band == 0))
+ return &q->internal;
+
+ if (unlikely(!sk)) {
+ /* By forcing low order bit to 1, we make sure to not
+ * collide with a local flow (socket pointers are word aligned)
+ */
+ sk = (struct sock *)(skb_get_rxhash(skb) | 1L);
+ }
+
+ root = &q->fq_root[hash_32((u32)(long)sk, q->fq_trees_log)];
+
+ if (q->flows >= (2U << q->fq_trees_log) && /* at least two flows per tree on average */
+ q->inactive_flows > q->flows/2) /* and more than half of them detached */
+ fq_gc(q, root, sk);
+
+ p = &root->rb_node;
+ parent = NULL;
+ while (*p) {
+ parent = *p;
+
+ f = container_of(parent, struct fq_flow, fq_node);
+ if (f->sk == sk) {
+ /* socket might have been reallocated, so check
+ * if its sk_hash is the same.
+ * If not, we need to refill credit with
+ * initial quantum
+ */
+ if (unlikely(skb->sk &&
+ f->socket_hash != sk->sk_hash)) {
+ f->credit = q->initial_quantum;
+ f->socket_hash = sk->sk_hash;
+ }
+ return f;
+ }
+ if (f->sk > sk)
+ p = &parent->rb_right;
+ else
+ p = &parent->rb_left;
+ }
+
+ f = kmem_cache_zalloc(fq_flow_cachep, GFP_ATOMIC | __GFP_NOWARN);
+ if (unlikely(!f)) {
+ q->stat_allocation_errors++;
+ return &q->internal;
+ }
+ fq_flow_set_detached(f);
+ f->sk = sk;
+ if (skb->sk) /* sk is a real socket only in this case, see the rxhash fallback above */
+ f->socket_hash = sk->sk_hash;
+ f->credit = q->initial_quantum;
+
+ rb_link_node(&f->fq_node, parent, p);
+ rb_insert_color(&f->fq_node, root);
+
+ q->flows++;
+ q->inactive_flows++;
+ return f;
+}
+
+
+/* Detach and return the first skb queued on the flow, or NULL when the
+ * flow holds none. The flow's packet count is decremented.
+ */
+static struct sk_buff *fq_dequeue_head(struct fq_flow *flow)
+{
+	struct sk_buff *first = flow->head;
+
+	if (!first)
+		return NULL;
+
+	flow->head = first->next;
+	first->next = NULL;
+	flow->qlen--;
+	return first;
+}
+
+/* We might add in the future detection of retransmits
+ * For the time being, just return false
+ * (the skb argument is not looked at).
+ */
+static bool skb_is_retransmit(struct sk_buff *skb)
+{
+ return false;
+}
+
+/* add skb to flow queue
+ * flow queue is a linked list, kind of FIFO, except for TCP retransmits
+ * We special case tcp retransmits to be transmitted before other packets.
+ * We rely on fact that TCP retransmits are unlikely, so we do not waste
+ * a separate queue or a pointer.
+ * head-> [retrans pkt 1]
+ * [retrans pkt 2]
+ * [ normal pkt 1]
+ * [ normal pkt 2]
+ * [ normal pkt 3]
+ * tail-> [ normal pkt 4]
+ *
+ * As long as skb_is_retransmit() always returns false, only the first two
+ * branches below (empty queue, append at tail) are ever taken.
+ * flow->qlen is not touched here; fq_enqueue() maintains it.
+ */
+static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb)
+{
+ struct sk_buff *prev, *head = flow->head;
+
+ skb->next = NULL;
+ if (!head) {
+ flow->head = skb;
+ flow->tail = skb;
+ return;
+ }
+ if (likely(!skb_is_retransmit(skb))) {
+ flow->tail->next = skb;
+ flow->tail = skb;
+ return;
+ }
+
+ /* This skb is a tcp retransmit,
+ * find the last retrans packet in the queue
+ */
+ prev = NULL;
+ while (skb_is_retransmit(head)) {
+ prev = head;
+ head = head->next;
+ if (!head)
+ break;
+ }
+ if (!prev) { /* no rtx packet in queue, become the new head */
+ skb->next = flow->head;
+ flow->head = skb;
+ } else {
+ if (prev == flow->tail) /* queue holds only retransmits: skb becomes the tail */
+ flow->tail = skb;
+ else
+ skb->next = prev->next;
+ prev->next = skb;
+ }
+}
+/* Queue skb on its flow.
+ * The packet is dropped when the qdisc already holds sch->limit packets,
+ * or when its flow (other than q->internal) already holds flow_plimit
+ * packets. A detached flow is put at the tail of new_flows with at least
+ * quantum credit. Returns NET_XMIT_SUCCESS or the result of qdisc_drop().
+ */
+static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+ struct fq_sched_data *q = qdisc_priv(sch);
+ struct fq_flow *f;
+
+ if (unlikely(sch->q.qlen >= sch->limit))
+ return qdisc_drop(skb, sch);
+
+ f = fq_classify(skb, q);
+ if (unlikely(f->qlen >= q->flow_plimit && f != &q->internal)) {
+ q->stat_flows_plimit++;
+ return qdisc_drop(skb, sch);
+ }
+
+ f->qlen++;
+ flow_queue_add(f, skb);
+ if (skb_is_retransmit(skb))
+ q->stat_tcp_retrans++;
+ sch->qstats.backlog += qdisc_pkt_len(skb);
+ if (fq_flow_is_detached(f)) {
+ fq_flow_add_tail(&q->new_flows, f);
+ if (q->quantum > f->credit) /* top the credit up to one quantum */
+ f->credit = q->quantum;
+ q->inactive_flows--;
+ qdisc_unthrottled(sch);
+ }
+ if (unlikely(f == &q->internal)) {
+ q->stat_internal_packets++;
+ qdisc_unthrottled(sch);
+ }
+ sch->q.qlen++;
+
+ return NET_XMIT_SUCCESS;
+}
+/* Move every flow of q->delayed whose time_next_packet is not after now to
+ * the tail of old_flows. Does nothing while the recorded earliest deadline
+ * is still in the future. On return q->time_next_delayed_flow is the
+ * deadline of the first flow left in the tree, or ~0ULL if none is left.
+ */
+static void fq_check_throttled(struct fq_sched_data *q, u64 now)
+{
+ struct rb_node *p;
+
+ if (q->time_next_delayed_flow > now)
+ return;
+
+ q->time_next_delayed_flow = ~0ULL;
+ while ((p = rb_first(&q->delayed)) != NULL) {
+ struct fq_flow *f = container_of(p, struct fq_flow, rate_node);
+
+ if (f->time_next_packet > now) {
+ q->time_next_delayed_flow = f->time_next_packet;
+ break;
+ }
+ rb_erase(p, &q->delayed);
+ q->throttled_flows--;
+ fq_flow_add_tail(&q->old_flows, f);
+ }
+}
+/* Pick the next packet to transmit, or return NULL.
+ * Order of service: q->internal first, then the head of new_flows, then
+ * the head of old_flows. NULL is returned when both lists are empty, even
+ * if flows are still parked in q->delayed; in that case the watchdog is
+ * armed for q->time_next_delayed_flow (unless that is ~0ULL).
+ * A flow at the head of a list is, in this order: moved to old_flows with
+ * one more quantum if its credit is used up; throttled if it has packets
+ * but its time_next_packet is in the future; moved or detached if empty;
+ * otherwise served.
+ */
+static struct sk_buff *fq_dequeue(struct Qdisc *sch)
+{
+ struct fq_sched_data *q = qdisc_priv(sch);
+ u64 now = ktime_to_ns(ktime_get());
+ struct fq_flow_head *head;
+ struct sk_buff *skb;
+ struct fq_flow *f;
+
+ skb = fq_dequeue_head(&q->internal);
+ if (skb)
+ goto out;
+ fq_check_throttled(q, now);
+begin:
+ head = &q->new_flows;
+ if (!head->first) {
+ head = &q->old_flows;
+ if (!head->first) {
+ if (q->time_next_delayed_flow != ~0ULL)
+ qdisc_watchdog_schedule_ns(&q->watchdog,
+ q->time_next_delayed_flow);
+ return NULL;
+ }
+ }
+ f = head->first;
+
+ if (f->credit <= 0) {
+ f->credit += q->quantum;
+ head->first = f->next;
+ fq_flow_add_tail(&q->old_flows, f);
+ goto begin;
+ }
+
+ if (unlikely(f->head && now < f->time_next_packet)) {
+ head->first = f->next;
+ fq_flow_set_throttled(q, f);
+ goto begin;
+ }
+
+ skb = fq_dequeue_head(f);
+ if (!skb) {
+ head->first = f->next;
+ /* force a pass through old_flows to prevent starvation */
+ if ((head == &q->new_flows) && q->old_flows.first) {
+ fq_flow_add_tail(&q->old_flows, f);
+ } else {
+ fq_flow_set_detached(f);
+ f->age = jiffies;
+ q->inactive_flows++;
+ }
+ goto begin;
+ }
+ prefetch(&skb->end);
+ f->time_next_packet = now;
+ f->credit -= qdisc_pkt_len(skb);
+
+ if (f->credit <= 0 && /* pacing delay is only computed once the credit is spent */
+ q->rate_enable &&
+ skb->sk && skb->sk->sk_state != TCP_TIME_WAIT) {
+ u32 rate = skb->sk->sk_pacing_rate ?: q->flow_default_rate;
+
+ rate = min(rate, q->flow_max_rate);
+ if (rate) {
+ u64 len = (u64)qdisc_pkt_len(skb) * NSEC_PER_SEC; /* becomes the delay in ns after do_div() */
+
+ do_div(len, rate);
+ /* Since socket rate can change later,
+ * clamp the delay to 125 ms.
+ * TODO: maybe segment the too big skb, as in commit
+ * e43ac79a4bc ("sch_tbf: segment too big GSO packets")
+ */
+ if (unlikely(len > 125 * NSEC_PER_MSEC)) {
+ len = 125 * NSEC_PER_MSEC;
+ q->stat_pkts_too_long++;
+ }
+
+ f->time_next_packet = now + len;
+ }
+ }
+out:
+ sch->qstats.backlog -= qdisc_pkt_len(skb);
+ qdisc_bstats_update(sch, skb);
+ sch->q.qlen--;
+ qdisc_unthrottled(sch);
+ return skb;
+}
+
+/* Drop every queued packet and forget every flow.
+ * Draining through fq_dequeue() is not enough: it returns NULL as soon as
+ * new_flows and old_flows are empty, while flows parked in q->delayed may
+ * still own packets. So the internal flow is emptied directly, then every
+ * rb-tree of fq_root is walked, freeing the skbs of each flow and the flow
+ * itself, and finally the lists, the delayed tree and the counters are
+ * reinitialised.
+ */
+static void fq_reset(struct Qdisc *sch)
+{
+	struct fq_sched_data *q = qdisc_priv(sch);
+	struct rb_root *root;
+	struct sk_buff *skb;
+	struct rb_node *p;
+	struct fq_flow *f;
+	unsigned int idx;
+
+	while ((skb = fq_dequeue_head(&q->internal)) != NULL)
+		kfree_skb(skb);
+
+	if (q->fq_root) {
+		for (idx = 0; idx < (1U << q->fq_trees_log); idx++) {
+			root = &q->fq_root[idx];
+			while ((p = rb_first(root)) != NULL) {
+				f = container_of(p, struct fq_flow, fq_node);
+				rb_erase(p, root);
+
+				while ((skb = fq_dequeue_head(f)) != NULL)
+					kfree_skb(skb);
+
+				kmem_cache_free(fq_flow_cachep, f);
+			}
+		}
+	}
+
+	/* every flow is gone: none may remain referenced from these */
+	q->new_flows.first = NULL;
+	q->old_flows.first = NULL;
+	q->delayed = RB_ROOT;
+	q->time_next_delayed_flow = ~0ULL;
+	q->flows = 0;
+	q->inactive_flows = 0;
+	q->throttled_flows = 0;
+
+	/* fq_dequeue_head() does not maintain the qdisc level accounting */
+	sch->qstats.backlog = 0;
+	sch->q.qlen = 0;
+}
+/* Move all flows from old_array (1 << old_log trees) to new_array, where
+ * the tree is chosen by hashing the flow key with new_log. Flows accepted
+ * by fq_gc_candidate() are freed instead of being moved and the counters
+ * in q are adjusted for them. Meeting the same key twice in a destination
+ * tree triggers BUG_ON().
+ */
+static void fq_rehash(struct fq_sched_data *q,
+ struct rb_root *old_array, u32 old_log,
+ struct rb_root *new_array, u32 new_log)
+{
+ struct rb_node *op, **np, *parent;
+ struct rb_root *oroot, *nroot;
+ struct fq_flow *of, *nf;
+ int fcnt = 0;
+ u32 idx;
+
+ for (idx = 0; idx < (1U << old_log); idx++) {
+ oroot = &old_array[idx];
+ while ((op = rb_first(oroot)) != NULL) {
+ rb_erase(op, oroot);
+ of = container_of(op, struct fq_flow, fq_node);
+ if (fq_gc_candidate(of)) {
+ fcnt++;
+ kmem_cache_free(fq_flow_cachep, of);
+ continue;
+ }
+ nroot = &new_array[hash_32((u32)(long)of->sk, new_log)];
+
+ np = &nroot->rb_node;
+ parent = NULL;
+ while (*np) {
+ parent = *np;
+
+ nf = container_of(parent, struct fq_flow, fq_node);
+ BUG_ON(nf->sk == of->sk);
+
+ if (nf->sk > of->sk) /* same ordering as the search in fq_classify() */
+ np = &parent->rb_right;
+ else
+ np = &parent->rb_left;
+ }
+
+ rb_link_node(&of->fq_node, parent, np);
+ rb_insert_color(&of->fq_node, nroot);
+ }
+ }
+ q->flows -= fcnt;
+ q->inactive_flows -= fcnt;
+ q->stat_gc_flows += fcnt;
+}
+/* Give the qdisc a table of (1 << log) rb-trees.
+ * Nothing is done when a table of that size already exists. Otherwise a
+ * new array is allocated and initialised, existing flows are moved over
+ * with fq_rehash(), and the old array is freed. Returns 0 or -ENOMEM.
+ * NOTE(review): the allocation uses GFP_KERNEL, yet fq_change() calls this
+ * between sch_tree_lock() and sch_tree_unlock() - confirm that a sleeping
+ * allocation is acceptable on that path.
+ */
+static int fq_resize(struct fq_sched_data *q, u32 log)
+{
+ struct rb_root *array;
+ u32 idx;
+
+ if (q->fq_root && log == q->fq_trees_log)
+ return 0;
+
+ array = kmalloc(sizeof(struct rb_root) << log, GFP_KERNEL);
+ if (!array)
+ return -ENOMEM;
+
+ for (idx = 0; idx < (1U << log); idx++)
+ array[idx] = RB_ROOT;
+
+ if (q->fq_root) {
+ fq_rehash(q, q->fq_root, q->fq_trees_log, array, log);
+ kfree(q->fq_root);
+ }
+ q->fq_root = array;
+ q->fq_trees_log = log;
+
+ return 0;
+}
+/* Netlink attribute policy used by fq_change(): every TCA_FQ_* option is a u32 */
+static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = {
+ [TCA_FQ_PLIMIT] = { .type = NLA_U32 },
+ [TCA_FQ_FLOW_PLIMIT] = { .type = NLA_U32 },
+ [TCA_FQ_QUANTUM] = { .type = NLA_U32 },
+ [TCA_FQ_INITIAL_QUANTUM] = { .type = NLA_U32 },
+ [TCA_FQ_RATE_ENABLE] = { .type = NLA_U32 },
+ [TCA_FQ_FLOW_DEFAULT_RATE] = { .type = NLA_U32 },
+ [TCA_FQ_FLOW_MAX_RATE] = { .type = NLA_U32 },
+ [TCA_FQ_BUCKETS_LOG] = { .type = NLA_U32 },
+};
+
+/* Apply the TCA_FQ_* attributes nested in opt to the qdisc.
+ * Returns -EINVAL when opt is missing, when TCA_FQ_BUCKETS_LOG is outside
+ * 1..ilog2(256*1024) or when TCA_FQ_RATE_ENABLE is greater than 1, the
+ * error from nla_parse_nested() or fq_resize() if one occurs, else 0.
+ * Attributes are stored as they are read, so those preceding a rejected
+ * one have already been applied when -EINVAL is returned.
+ * After the update, packets above the new sch->limit are dequeued and
+ * dropped.
+ */
+static int fq_change(struct Qdisc *sch, struct nlattr *opt)
+{
+	struct fq_sched_data *q = qdisc_priv(sch);
+	struct nlattr *tb[TCA_FQ_MAX + 1];
+	int err, drop_count = 0;
+	u32 fq_log;
+
+	if (!opt)
+		return -EINVAL;
+
+	err = nla_parse_nested(tb, TCA_FQ_MAX, opt, fq_policy);
+	if (err < 0)
+		return err;
+
+	sch_tree_lock(sch);
+
+	fq_log = q->fq_trees_log;
+
+	if (tb[TCA_FQ_BUCKETS_LOG]) {
+		u32 nval = nla_get_u32(tb[TCA_FQ_BUCKETS_LOG]);
+
+		if (nval >= 1 && nval <= ilog2(256*1024))
+			fq_log = nval;
+		else
+			err = -EINVAL;
+	}
+	if (tb[TCA_FQ_PLIMIT])
+		sch->limit = nla_get_u32(tb[TCA_FQ_PLIMIT]);
+
+	if (tb[TCA_FQ_FLOW_PLIMIT])
+		q->flow_plimit = nla_get_u32(tb[TCA_FQ_FLOW_PLIMIT]);
+
+	if (tb[TCA_FQ_QUANTUM])
+		q->quantum = nla_get_u32(tb[TCA_FQ_QUANTUM]);
+
+	/* must land in initial_quantum, not overwrite quantum */
+	if (tb[TCA_FQ_INITIAL_QUANTUM])
+		q->initial_quantum = nla_get_u32(tb[TCA_FQ_INITIAL_QUANTUM]);
+
+	if (tb[TCA_FQ_FLOW_DEFAULT_RATE])
+		q->flow_default_rate = nla_get_u32(tb[TCA_FQ_FLOW_DEFAULT_RATE]);
+
+	if (tb[TCA_FQ_FLOW_MAX_RATE])
+		q->flow_max_rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]);
+
+	if (tb[TCA_FQ_RATE_ENABLE]) {
+		u32 enable = nla_get_u32(tb[TCA_FQ_RATE_ENABLE]);
+
+		if (enable <= 1)
+			q->rate_enable = enable;
+		else
+			err = -EINVAL;
+	}
+
+	/* NOTE(review): fq_resize() allocates with GFP_KERNEL while the tree
+	 * lock is held here - confirm this is acceptable.
+	 */
+	if (!err)
+		err = fq_resize(q, fq_log);
+
+	while (sch->q.qlen > sch->limit) {
+		struct sk_buff *skb = fq_dequeue(sch);
+
+		/* fq_dequeue() returns NULL while the remaining flows are
+		 * throttled; stop instead of spinning on a NULL skb
+		 */
+		if (!skb)
+			break;
+		kfree_skb(skb);
+		drop_count++;
+	}
+	qdisc_tree_decrease_qlen(sch, drop_count);
+
+	sch_tree_unlock(sch);
+	return err;
+}
+
+/* Release everything owned by the qdisc: the skbs still queued on the
+ * internal flow and on every flow of the fq_root trees, the flows
+ * themselves, the tree array, and the watchdog timer.
+ * The skbs are freed explicitly because a flow can still hold packets
+ * here: fq_dequeue() stops returning packets while the remaining flows are
+ * throttled, so a drain based on it may leave some behind.
+ */
+static void fq_destroy(struct Qdisc *sch)
+{
+	struct fq_sched_data *q = qdisc_priv(sch);
+	struct rb_root *root;
+	struct sk_buff *skb;
+	struct fq_flow *f;
+	struct rb_node *p;
+	unsigned int idx;
+
+	while ((skb = fq_dequeue_head(&q->internal)) != NULL)
+		kfree_skb(skb);
+
+	if (q->fq_root) {
+		for (idx = 0; idx < (1U << q->fq_trees_log); idx++) {
+			root = &q->fq_root[idx];
+			while ((p = rb_first(root)) != NULL) {
+				f = container_of(p, struct fq_flow, fq_node);
+				rb_erase(p, root);
+
+				/* do not leak packets along with their flow */
+				while ((skb = fq_dequeue_head(f)) != NULL)
+					kfree_skb(skb);
+
+				kmem_cache_free(fq_flow_cachep, f);
+			}
+		}
+		kfree(q->fq_root);
+	}
+	qdisc_watchdog_cancel(&q->watchdog);
+}
+/* Set the defaults of a new fq qdisc and allocate its flow table.
+ * With options, fq_change() applies them and sizes the table; without,
+ * fq_resize() creates the default table of 1024 trees. Returns the result
+ * of whichever of the two was called.
+ */
+static int fq_init(struct Qdisc *sch, struct nlattr *opt)
+{
+ struct fq_sched_data *q = qdisc_priv(sch);
+ int err;
+
+ sch->limit = 10000; /* packets in the whole qdisc */
+ q->flow_plimit = 100; /* packets per flow */
+ q->quantum = 2 * psched_mtu(qdisc_dev(sch));
+ q->initial_quantum = 10 * psched_mtu(qdisc_dev(sch));
+ q->flow_default_rate = 0;
+ q->flow_max_rate = ~0U; /* largest u32: min() in fq_dequeue() never lowers the rate */
+ q->rate_enable = 1;
+ q->new_flows.first = NULL;
+ q->old_flows.first = NULL;
+ q->delayed = RB_ROOT;
+ q->fq_root = NULL;
+ q->fq_trees_log = ilog2(1024);
+ qdisc_watchdog_init(&q->watchdog, sch);
+
+ if (opt)
+ err = fq_change(sch, opt);
+ else
+ err = fq_resize(q, q->fq_trees_log);
+
+ return err;
+}
+/* Write the current configuration into skb as TCA_FQ_* attributes nested
+ * in TCA_OPTIONS. Returns skb->len on success, or -1 when the nest could
+ * not be started or an attribute could not be added.
+ */
+static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct fq_sched_data *q = qdisc_priv(sch);
+ struct nlattr *opts;
+
+ opts = nla_nest_start(skb, TCA_OPTIONS);
+ if (opts == NULL)
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, TCA_FQ_PLIMIT, sch->limit) ||
+ nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT, q->flow_plimit) ||
+ nla_put_u32(skb, TCA_FQ_QUANTUM, q->quantum) ||
+ nla_put_u32(skb, TCA_FQ_INITIAL_QUANTUM, q->initial_quantum) ||
+ nla_put_u32(skb, TCA_FQ_RATE_ENABLE, q->rate_enable) ||
+ nla_put_u32(skb, TCA_FQ_FLOW_DEFAULT_RATE, q->flow_default_rate) ||
+ nla_put_u32(skb, TCA_FQ_FLOW_MAX_RATE, q->flow_max_rate) ||
+ nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, opts);
+ return skb->len;
+
+nla_put_failure:
+ return -1;
+}
+/* Copy the fq counters into a struct tc_fq_qd_stats and hand it to
+ * gnet_stats_copy_app(), whose result is returned.
+ */
+static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
+{
+ struct fq_sched_data *q = qdisc_priv(sch);
+ u64 now = ktime_to_ns(ktime_get());
+ struct tc_fq_qd_stats st = {
+ .gc_flows = q->stat_gc_flows,
+ .highprio_packets = q->stat_internal_packets,
+ .tcp_retrans = q->stat_tcp_retrans,
+ .throttled = q->stat_throttled,
+ .flows_plimit = q->stat_flows_plimit,
+ .pkts_too_long = q->stat_pkts_too_long,
+ .allocation_errors = q->stat_allocation_errors,
+ .flows = q->flows,
+ .inactive_flows = q->inactive_flows,
+ .throttled_flows = q->throttled_flows,
+ .time_next_delayed_flow = q->time_next_delayed_flow - now, /* reported relative to now, in ns */
+ };
+
+ return gnet_stats_copy_app(d, &st, sizeof(st));
+}
+/* Operations table registered under the id "fq" by fq_module_init() */
+static struct Qdisc_ops fq_qdisc_ops __read_mostly = {
+ .id = "fq",
+ .priv_size = sizeof(struct fq_sched_data),
+
+ .enqueue = fq_enqueue,
+ .dequeue = fq_dequeue,
+ .peek = qdisc_peek_dequeued,
+ .init = fq_init,
+ .reset = fq_reset,
+ .destroy = fq_destroy,
+ .change = fq_change,
+ .dump = fq_dump,
+ .dump_stats = fq_dump_stats,
+ .owner = THIS_MODULE,
+};
+/* Module entry point: create the fq_flow slab cache, then register the
+ * qdisc. The cache is destroyed again if registration fails. Returns
+ * -ENOMEM when the cache cannot be created, else the result of
+ * register_qdisc().
+ */
+static int __init fq_module_init(void)
+{
+ int ret;
+
+ fq_flow_cachep = kmem_cache_create("fq_flow_cache",
+ sizeof(struct fq_flow),
+ 0, 0, NULL);
+ if (!fq_flow_cachep)
+ return -ENOMEM;
+
+ ret = register_qdisc(&fq_qdisc_ops);
+ if (ret)
+ kmem_cache_destroy(fq_flow_cachep);
+ return ret;
+}
+/* Module exit: unregister the qdisc first, then destroy the flow cache */
+static void __exit fq_module_exit(void)
+{
+ unregister_qdisc(&fq_qdisc_ops);
+ kmem_cache_destroy(fq_flow_cachep);
+}
+
+module_init(fq_module_init)
+module_exit(fq_module_exit)
+MODULE_AUTHOR("Eric Dumazet");
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 48be3d5c0d92..a74e278654aa 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -30,6 +30,10 @@
#include <net/pkt_sched.h>
#include <net/dst.h>
+/* Qdisc to use by default */
+const struct Qdisc_ops *default_qdisc_ops = &pfifo_fast_ops;
+EXPORT_SYMBOL(default_qdisc_ops);
+
/* Main transmission queue. */
/* Modifications to data participating in scheduling must be protected with
@@ -530,12 +534,11 @@ struct Qdisc_ops pfifo_fast_ops __read_mostly = {
.dump = pfifo_fast_dump,
.owner = THIS_MODULE,
};
-EXPORT_SYMBOL(pfifo_fast_ops);
static struct lock_class_key qdisc_tx_busylock;
struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
- struct Qdisc_ops *ops)
+ const struct Qdisc_ops *ops)
{
void *p;
struct Qdisc *sch;
@@ -579,10 +582,14 @@ errout:
}
struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue,
- struct Qdisc_ops *ops, unsigned int parentid)
+ const struct Qdisc_ops *ops,
+ unsigned int parentid)
{
struct Qdisc *sch;
+ if (!try_module_get(ops->owner))
+ goto errout;
+
sch = qdisc_alloc(dev_queue, ops);
if (IS_ERR(sch))
goto errout;
@@ -686,7 +693,7 @@ static void attach_one_default_qdisc(struct net_device *dev,
if (dev->tx_queue_len) {
qdisc = qdisc_create_dflt(dev_queue,
- &pfifo_fast_ops, TC_H_ROOT);
+ default_qdisc_ops, TC_H_ROOT);
if (!qdisc) {
netdev_info(dev, "activation failed\n");
return;
@@ -739,9 +746,8 @@ void dev_activate(struct net_device *dev)
int need_watchdog;
/* No queueing discipline is attached to device;
- create default one i.e. pfifo_fast for devices,
- which need queueing and noqueue_qdisc for
- virtual interfaces
+ * create default one for devices, which need queueing
+ * and noqueue_qdisc for virtual interfaces
*/
if (dev->qdisc == &noop_qdisc)
diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c
index 5da78a19ac9a..2e56185736d6 100644
--- a/net/sched/sch_mq.c
+++ b/net/sched/sch_mq.c
@@ -57,7 +57,7 @@ static int mq_init(struct Qdisc *sch, struct nlattr *opt)
for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
dev_queue = netdev_get_tx_queue(dev, ntx);
- qdisc = qdisc_create_dflt(dev_queue, &pfifo_fast_ops,
+ qdisc = qdisc_create_dflt(dev_queue, default_qdisc_ops,
TC_H_MAKE(TC_H_MAJ(sch->handle),
TC_H_MIN(ntx + 1)));
if (qdisc == NULL)
diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c
index accec33c454c..d44c868cb537 100644
--- a/net/sched/sch_mqprio.c
+++ b/net/sched/sch_mqprio.c
@@ -124,7 +124,7 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt)
for (i = 0; i < dev->num_tx_queues; i++) {
dev_queue = netdev_get_tx_queue(dev, i);
- qdisc = qdisc_create_dflt(dev_queue, &pfifo_fast_ops,
+ qdisc = qdisc_create_dflt(dev_queue, default_qdisc_ops,
TC_H_MAKE(TC_H_MAJ(sch->handle),
TC_H_MIN(i + 1)));
if (qdisc == NULL) {
diff --git a/net/sctp/probe.c b/net/sctp/probe.c
index cd72ae57aff1..53c452efb40b 100644
--- a/net/sctp/probe.c
+++ b/net/sctp/probe.c
@@ -46,6 +46,10 @@ static int port __read_mostly = 0;
MODULE_PARM_DESC(port, "Port to match (0=all)");
module_param(port, int, 0);
+static unsigned int fwmark __read_mostly = 0;
+MODULE_PARM_DESC(fwmark, "skb mark to match (0=no mark)");
+module_param(fwmark, uint, 0);
+
static int bufsize __read_mostly = 64 * 1024;
MODULE_PARM_DESC(bufsize, "Log buffer size (default 64k)");
module_param(bufsize, int, 0);
@@ -129,15 +133,19 @@ static sctp_disposition_t jsctp_sf_eat_sack(struct net *net,
void *arg,
sctp_cmd_seq_t *commands)
{
+ struct sctp_chunk *chunk = arg;
+ struct sk_buff *skb = chunk->skb;
struct sctp_transport *sp;
static __u32 lcwnd = 0;
struct timespec now;
sp = asoc->peer.primary_path;
- if ((full || sp->cwnd != lcwnd) &&
- (!port || asoc->peer.port == port ||
- ep->base.bind_addr.port == port)) {
+ if (((port == 0 && fwmark == 0) ||
+ asoc->peer.port == port ||
+ ep->base.bind_addr.port == port ||
+ (fwmark > 0 && skb->mark == fwmark)) &&
+ (full || sp->cwnd != lcwnd)) {
lcwnd = sp->cwnd;
getnstimeofday(&now);
@@ -198,8 +206,8 @@ static __init int sctpprobe_init(void)
if (ret)
goto remove_proc;
- pr_info("probe registered (port=%d)\n", port);
-
+ pr_info("probe registered (port=%d/fwmark=%u) bufsize=%u\n",
+ port, fwmark, bufsize);
return 0;
remove_proc:
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 01e97836ca6c..d244a23ab8d3 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -2240,25 +2240,23 @@ int sctp_verify_init(struct net *net, const struct sctp_association *asoc,
struct sctp_chunk **errp)
{
union sctp_params param;
- int has_cookie = 0;
+ bool has_cookie = false;
int result;
- /* Verify stream values are non-zero. */
- if ((0 == peer_init->init_hdr.num_outbound_streams) ||
- (0 == peer_init->init_hdr.num_inbound_streams) ||
- (0 == peer_init->init_hdr.init_tag) ||
- (SCTP_DEFAULT_MINWINDOW > ntohl(peer_init->init_hdr.a_rwnd))) {
-
+ /* Check for missing mandatory parameters. Note: Initial TSN is
+ * also mandatory, but is not checked here since the valid range
+ * is 0..2**32-1. RFC4960, section 3.3.3.
+ */
+ if (peer_init->init_hdr.num_outbound_streams == 0 ||
+ peer_init->init_hdr.num_inbound_streams == 0 ||
+ peer_init->init_hdr.init_tag == 0 ||
+ ntohl(peer_init->init_hdr.a_rwnd) < SCTP_DEFAULT_MINWINDOW)
return sctp_process_inv_mandatory(asoc, chunk, errp);
- }
- /* Check for missing mandatory parameters. */
sctp_walk_params(param, peer_init, init_hdr.params) {
-
- if (SCTP_PARAM_STATE_COOKIE == param.p->type)
- has_cookie = 1;
-
- } /* for (loop through all parameters) */
+ if (param.p->type == SCTP_PARAM_STATE_COOKIE)
+ has_cookie = true;
+ }
/* There is a possibility that a parameter length was bad and
* in that case we would have stoped walking the parameters.