summaryrefslogtreecommitdiffstats
path: root/net
diff options
context:
space:
mode:
Diffstat (limited to 'net')
-rw-r--r--net/Kconfig3
-rw-r--r--net/bpf/test_run.c3
-rw-r--r--net/bridge/br.c16
-rw-r--r--net/bridge/br_fdb.c69
-rw-r--r--net/bridge/br_forward.c3
-rw-r--r--net/bridge/br_if.c15
-rw-r--r--net/bridge/br_private.h19
-rw-r--r--net/bridge/br_switchdev.c12
-rw-r--r--net/bridge/br_vlan.c39
-rw-r--r--net/bridge/netfilter/ebtables.c11
-rw-r--r--net/caif/chnl_net.c2
-rw-r--r--net/ceph/messenger.c7
-rw-r--r--net/ceph/mon_client.c14
-rw-r--r--net/compat.c6
-rw-r--r--net/core/dev.c32
-rw-r--r--net/core/dev_addr_lists.c2
-rw-r--r--net/core/ethtool.c68
-rw-r--r--net/core/fib_rules.c495
-rw-r--r--net/core/filter.c90
-rw-r--r--net/core/neighbour.c48
-rw-r--r--net/core/skbuff.c25
-rw-r--r--net/dccp/ccids/ccid2.c14
-rw-r--r--net/dccp/timer.c2
-rw-r--r--net/decnet/dn_rules.c7
-rw-r--r--net/dsa/master.c62
-rw-r--r--net/dsa/port.c90
-rw-r--r--net/dsa/slave.c10
-rw-r--r--net/ife/ife.c38
-rw-r--r--net/ipv4/af_inet.c2
-rw-r--r--net/ipv4/fib_rules.c7
-rw-r--r--net/ipv4/ip_gre.c6
-rw-r--r--net/ipv4/ip_output.c49
-rw-r--r--net/ipv4/ipconfig.c151
-rw-r--r--net/ipv4/ipmr.c3
-rw-r--r--net/ipv4/netfilter/Makefile5
-rw-r--r--net/ipv4/netfilter/nf_nat_snmp_basic_main.c2
-rw-r--r--net/ipv4/proc.c2
-rw-r--r--net/ipv4/route.c118
-rw-r--r--net/ipv4/tcp.c260
-rw-r--r--net/ipv4/tcp_bbr.c4
-rw-r--r--net/ipv4/tcp_input.c67
-rw-r--r--net/ipv4/tcp_output.c34
-rw-r--r--net/ipv4/udp.c85
-rw-r--r--net/ipv4/udp_offload.c67
-rw-r--r--net/ipv6/addrconf.c82
-rw-r--r--net/ipv6/af_inet6.c2
-rw-r--r--net/ipv6/anycast.c24
-rw-r--r--net/ipv6/fib6_rules.c7
-rw-r--r--net/ipv6/ip6_fib.c239
-rw-r--r--net/ipv6/ip6_gre.c8
-rw-r--r--net/ipv6/ip6_offload.c6
-rw-r--r--net/ipv6/ip6_output.c54
-rw-r--r--net/ipv6/ip6mr.c3
-rw-r--r--net/ipv6/ndisc.c2
-rw-r--r--net/ipv6/netfilter/Kconfig55
-rw-r--r--net/ipv6/route.c535
-rw-r--r--net/ipv6/seg6_iptunnel.c26
-rw-r--r--net/ipv6/sysctl_net_ipv6.c8
-rw-r--r--net/ipv6/udp.c35
-rw-r--r--net/ipv6/udp_offload.c19
-rw-r--r--net/l2tp/l2tp_core.c40
-rw-r--r--net/l2tp/l2tp_core.h3
-rw-r--r--net/l2tp/l2tp_debugfs.c34
-rw-r--r--net/l2tp/l2tp_netlink.c11
-rw-r--r--net/l2tp/l2tp_ppp.c51
-rw-r--r--net/llc/af_llc.c14
-rw-r--r--net/llc/llc_c_ac.c9
-rw-r--r--net/llc/llc_conn.c22
-rw-r--r--net/netfilter/Kconfig1
-rw-r--r--net/netfilter/ipvs/ip_vs_ctl.c8
-rw-r--r--net/netfilter/ipvs/ip_vs_sync.c155
-rw-r--r--net/netfilter/nf_conntrack_expect.c5
-rw-r--r--net/netfilter/nf_conntrack_extend.c2
-rw-r--r--net/netfilter/nf_conntrack_sip.c16
-rw-r--r--net/netfilter/nf_tables_api.c69
-rw-r--r--net/netfilter/xt_connmark.c49
-rw-r--r--net/packet/af_packet.c83
-rw-r--r--net/packet/internal.h10
-rw-r--r--net/qrtr/Kconfig7
-rw-r--r--net/qrtr/Makefile2
-rw-r--r--net/qrtr/qrtr.c1
-rw-r--r--net/qrtr/tun.c161
-rw-r--r--net/rds/ib_cm.c3
-rw-r--r--net/rds/recv.c1
-rw-r--r--net/sched/act_csum.c6
-rw-r--r--net/sched/act_ife.c9
-rw-r--r--net/sched/cls_flower.c275
-rw-r--r--net/sched/sch_fq.c37
-rw-r--r--net/sctp/associola.c85
-rw-r--r--net/sctp/chunk.c12
-rw-r--r--net/sctp/inqueue.c2
-rw-r--r--net/sctp/ipv6.c61
-rw-r--r--net/sctp/output.c28
-rw-r--r--net/sctp/outqueue.c48
-rw-r--r--net/sctp/sm_make_chunk.c143
-rw-r--r--net/sctp/sm_statefuns.c8
-rw-r--r--net/sctp/socket.c43
-rw-r--r--net/sctp/stream.c2
-rw-r--r--net/sctp/transport.c37
-rw-r--r--net/smc/af_smc.c246
-rw-r--r--net/smc/smc.h7
-rw-r--r--net/smc/smc_cdc.c2
-rw-r--r--net/smc/smc_cdc.h2
-rw-r--r--net/smc/smc_core.c68
-rw-r--r--net/smc/smc_core.h8
-rw-r--r--net/smc/smc_diag.c39
-rw-r--r--net/smc/smc_llc.c62
-rw-r--r--net/smc/smc_llc.h3
-rw-r--r--net/smc/smc_rx.c200
-rw-r--r--net/smc/smc_rx.h11
-rw-r--r--net/smc/smc_tx.c24
-rw-r--r--net/smc/smc_wr.c1
-rw-r--r--net/strparser/strparser.c9
-rw-r--r--net/sunrpc/clnt.c8
-rw-r--r--net/sunrpc/rpc_pipe.c1
-rw-r--r--net/sunrpc/sched.c10
-rw-r--r--net/sunrpc/stats.c16
-rw-r--r--net/sunrpc/sunrpc.h6
-rw-r--r--net/sunrpc/xdr.c82
-rw-r--r--net/sunrpc/xprt.c34
-rw-r--r--net/sunrpc/xprtrdma/backchannel.c7
-rw-r--r--net/sunrpc/xprtrdma/fmr_ops.c13
-rw-r--r--net/sunrpc/xprtrdma/frwr_ops.c53
-rw-r--r--net/sunrpc/xprtrdma/rpc_rdma.c32
-rw-r--r--net/sunrpc/xprtrdma/transport.c43
-rw-r--r--net/sunrpc/xprtrdma/verbs.c44
-rw-r--r--net/sunrpc/xprtrdma/xprt_rdma.h4
-rw-r--r--net/sunrpc/xprtsock.c4
-rw-r--r--net/tipc/bearer.c29
-rw-r--r--net/tipc/bearer.h3
-rw-r--r--net/tipc/monitor.c2
-rw-r--r--net/tipc/name_table.c34
-rw-r--r--net/tipc/name_table.h2
-rw-r--r--net/tipc/net.c2
-rw-r--r--net/tipc/netlink.c5
-rw-r--r--net/tipc/node.c46
-rw-r--r--net/tipc/node.h3
-rw-r--r--net/tipc/socket.c17
-rw-r--r--net/tipc/subscr.c5
-rw-r--r--net/tipc/udp_media.c4
-rw-r--r--net/tipc/udp_media.h14
-rw-r--r--net/tls/Kconfig10
-rw-r--r--net/tls/Makefile2
-rw-r--r--net/tls/tls_device.c764
-rw-r--r--net/tls/tls_device_fallback.c450
-rw-r--r--net/tls/tls_main.c150
-rw-r--r--net/tls/tls_sw.c151
-rw-r--r--net/vmw_vsock/af_vsock.c6
148 files changed, 5342 insertions, 2118 deletions
diff --git a/net/Kconfig b/net/Kconfig
index 6fa1a4493b8c..b62089fb1332 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -407,6 +407,9 @@ config GRO_CELLS
bool
default n
+config SOCK_VALIDATE_XMIT
+ bool
+
config NET_DEVLINK
tristate "Network physical/parent device Netlink interface"
help
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index 2ced48662c1f..68c3578343b4 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -170,7 +170,8 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr,
xdp.rxq = &rxqueue->xdp_rxq;
retval = bpf_test_run(prog, &xdp, repeat, &duration);
- if (xdp.data != data + XDP_PACKET_HEADROOM + NET_IP_ALIGN)
+ if (xdp.data != data + XDP_PACKET_HEADROOM + NET_IP_ALIGN ||
+ xdp.data_end != xdp.data + size)
size = xdp.data_end - xdp.data;
ret = bpf_test_finish(kattr, uattr, xdp.data, size, retval, duration);
kfree(data);
diff --git a/net/bridge/br.c b/net/bridge/br.c
index 671d13c10f6f..b0a0b82e2d91 100644
--- a/net/bridge/br.c
+++ b/net/bridge/br.c
@@ -34,6 +34,7 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
struct net_bridge_port *p;
struct net_bridge *br;
+ bool notified = false;
bool changed_addr;
int err;
@@ -67,7 +68,7 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v
break;
case NETDEV_CHANGE:
- br_port_carrier_check(p);
+ br_port_carrier_check(p, &notified);
break;
case NETDEV_FEAT_CHANGE:
@@ -76,8 +77,10 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v
case NETDEV_DOWN:
spin_lock_bh(&br->lock);
- if (br->dev->flags & IFF_UP)
+ if (br->dev->flags & IFF_UP) {
br_stp_disable_port(p);
+ notified = true;
+ }
spin_unlock_bh(&br->lock);
break;
@@ -85,6 +88,7 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v
if (netif_running(br->dev) && netif_oper_up(dev)) {
spin_lock_bh(&br->lock);
br_stp_enable_port(p);
+ notified = true;
spin_unlock_bh(&br->lock);
}
break;
@@ -110,8 +114,8 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v
}
/* Events that may cause spanning tree to refresh */
- if (event == NETDEV_CHANGEADDR || event == NETDEV_UP ||
- event == NETDEV_CHANGE || event == NETDEV_DOWN)
+ if (!notified && (event == NETDEV_CHANGEADDR || event == NETDEV_UP ||
+ event == NETDEV_CHANGE || event == NETDEV_DOWN))
br_ifinfo_notify(RTM_NEWLINK, NULL, p);
return NOTIFY_DONE;
@@ -141,7 +145,7 @@ static int br_switchdev_event(struct notifier_block *unused,
case SWITCHDEV_FDB_ADD_TO_BRIDGE:
fdb_info = ptr;
err = br_fdb_external_learn_add(br, p, fdb_info->addr,
- fdb_info->vid);
+ fdb_info->vid, false);
if (err) {
err = notifier_from_errno(err);
break;
@@ -152,7 +156,7 @@ static int br_switchdev_event(struct notifier_block *unused,
case SWITCHDEV_FDB_DEL_TO_BRIDGE:
fdb_info = ptr;
err = br_fdb_external_learn_del(br, p, fdb_info->addr,
- fdb_info->vid);
+ fdb_info->vid, false);
if (err)
err = notifier_from_errno(err);
break;
diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index d9e69e4514be..b19e3104afd6 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -40,7 +40,7 @@ static struct kmem_cache *br_fdb_cache __read_mostly;
static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
const unsigned char *addr, u16 vid);
static void fdb_notify(struct net_bridge *br,
- const struct net_bridge_fdb_entry *, int);
+ const struct net_bridge_fdb_entry *, int, bool);
int __init br_fdb_init(void)
{
@@ -121,6 +121,28 @@ static struct net_bridge_fdb_entry *br_fdb_find(struct net_bridge *br,
return fdb;
}
+struct net_device *br_fdb_find_port(const struct net_device *br_dev,
+ const unsigned char *addr,
+ __u16 vid)
+{
+ struct net_bridge_fdb_entry *f;
+ struct net_device *dev = NULL;
+ struct net_bridge *br;
+
+ ASSERT_RTNL();
+
+ if (!netif_is_bridge_master(br_dev))
+ return NULL;
+
+ br = netdev_priv(br_dev);
+ f = br_fdb_find(br, addr, vid);
+ if (f && f->dst)
+ dev = f->dst->dev;
+
+ return dev;
+}
+EXPORT_SYMBOL_GPL(br_fdb_find_port);
+
struct net_bridge_fdb_entry *br_fdb_find_rcu(struct net_bridge *br,
const unsigned char *addr,
__u16 vid)
@@ -173,7 +195,8 @@ static void fdb_del_hw_addr(struct net_bridge *br, const unsigned char *addr)
}
}
-static void fdb_delete(struct net_bridge *br, struct net_bridge_fdb_entry *f)
+static void fdb_delete(struct net_bridge *br, struct net_bridge_fdb_entry *f,
+ bool swdev_notify)
{
trace_fdb_delete(br, f);
@@ -183,7 +206,7 @@ static void fdb_delete(struct net_bridge *br, struct net_bridge_fdb_entry *f)
hlist_del_init_rcu(&f->fdb_node);
rhashtable_remove_fast(&br->fdb_hash_tbl, &f->rhnode,
br_fdb_rht_params);
- fdb_notify(br, f, RTM_DELNEIGH);
+ fdb_notify(br, f, RTM_DELNEIGH, swdev_notify);
call_rcu(&f->rcu, fdb_rcu_free);
}
@@ -219,7 +242,7 @@ static void fdb_delete_local(struct net_bridge *br,
return;
}
- fdb_delete(br, f);
+ fdb_delete(br, f, true);
}
void br_fdb_find_delete_local(struct net_bridge *br,
@@ -334,7 +357,7 @@ void br_fdb_cleanup(struct work_struct *work)
} else {
spin_lock_bh(&br->hash_lock);
if (!hlist_unhashed(&f->fdb_node))
- fdb_delete(br, f);
+ fdb_delete(br, f, true);
spin_unlock_bh(&br->hash_lock);
}
}
@@ -354,7 +377,7 @@ void br_fdb_flush(struct net_bridge *br)
spin_lock_bh(&br->hash_lock);
hlist_for_each_entry_safe(f, tmp, &br->fdb_list, fdb_node) {
if (!f->is_static)
- fdb_delete(br, f);
+ fdb_delete(br, f, true);
}
spin_unlock_bh(&br->hash_lock);
}
@@ -383,7 +406,7 @@ void br_fdb_delete_by_port(struct net_bridge *br,
if (f->is_local)
fdb_delete_local(br, p, f);
else
- fdb_delete(br, f);
+ fdb_delete(br, f, true);
}
spin_unlock_bh(&br->hash_lock);
}
@@ -509,7 +532,7 @@ static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
return 0;
br_warn(br, "adding interface %s with same address as a received packet (addr:%pM, vlan:%u)\n",
source ? source->dev->name : br->dev->name, addr, vid);
- fdb_delete(br, fdb);
+ fdb_delete(br, fdb, true);
}
fdb = fdb_create(br, source, addr, vid, 1, 1);
@@ -517,7 +540,7 @@ static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
return -ENOMEM;
fdb_add_hw_addr(br, addr);
- fdb_notify(br, fdb, RTM_NEWNEIGH);
+ fdb_notify(br, fdb, RTM_NEWNEIGH, true);
return 0;
}
@@ -572,7 +595,7 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source,
fdb->added_by_user = 1;
if (unlikely(fdb_modified)) {
trace_br_fdb_update(br, source, addr, vid, added_by_user);
- fdb_notify(br, fdb, RTM_NEWNEIGH);
+ fdb_notify(br, fdb, RTM_NEWNEIGH, true);
}
}
} else {
@@ -583,7 +606,7 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source,
fdb->added_by_user = 1;
trace_br_fdb_update(br, source, addr, vid,
added_by_user);
- fdb_notify(br, fdb, RTM_NEWNEIGH);
+ fdb_notify(br, fdb, RTM_NEWNEIGH, true);
}
/* else we lose race and someone else inserts
* it first, don't bother updating
@@ -665,13 +688,15 @@ static inline size_t fdb_nlmsg_size(void)
}
static void fdb_notify(struct net_bridge *br,
- const struct net_bridge_fdb_entry *fdb, int type)
+ const struct net_bridge_fdb_entry *fdb, int type,
+ bool swdev_notify)
{
struct net *net = dev_net(br->dev);
struct sk_buff *skb;
int err = -ENOBUFS;
- br_switchdev_fdb_notify(fdb, type);
+ if (swdev_notify)
+ br_switchdev_fdb_notify(fdb, type);
skb = nlmsg_new(fdb_nlmsg_size(), GFP_ATOMIC);
if (skb == NULL)
@@ -810,7 +835,7 @@ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source,
fdb->used = jiffies;
if (modified) {
fdb->updated = jiffies;
- fdb_notify(br, fdb, RTM_NEWNEIGH);
+ fdb_notify(br, fdb, RTM_NEWNEIGH, true);
}
return 0;
@@ -834,7 +859,7 @@ static int __br_fdb_add(struct ndmsg *ndm, struct net_bridge *br,
rcu_read_unlock();
local_bh_enable();
} else if (ndm->ndm_flags & NTF_EXT_LEARNED) {
- err = br_fdb_external_learn_add(br, p, addr, vid);
+ err = br_fdb_external_learn_add(br, p, addr, vid, true);
} else {
spin_lock_bh(&br->hash_lock);
err = fdb_add_entry(br, p, addr, ndm->ndm_state,
@@ -923,7 +948,7 @@ static int fdb_delete_by_addr_and_port(struct net_bridge *br,
if (!fdb || fdb->dst != p)
return -ENOENT;
- fdb_delete(br, fdb);
+ fdb_delete(br, fdb, true);
return 0;
}
@@ -1043,7 +1068,8 @@ void br_fdb_unsync_static(struct net_bridge *br, struct net_bridge_port *p)
}
int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p,
- const unsigned char *addr, u16 vid)
+ const unsigned char *addr, u16 vid,
+ bool swdev_notify)
{
struct net_bridge_fdb_entry *fdb;
bool modified = false;
@@ -1061,7 +1087,7 @@ int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p,
goto err_unlock;
}
fdb->added_by_external_learn = 1;
- fdb_notify(br, fdb, RTM_NEWNEIGH);
+ fdb_notify(br, fdb, RTM_NEWNEIGH, swdev_notify);
} else {
fdb->updated = jiffies;
@@ -1080,7 +1106,7 @@ int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p,
}
if (modified)
- fdb_notify(br, fdb, RTM_NEWNEIGH);
+ fdb_notify(br, fdb, RTM_NEWNEIGH, swdev_notify);
}
err_unlock:
@@ -1090,7 +1116,8 @@ err_unlock:
}
int br_fdb_external_learn_del(struct net_bridge *br, struct net_bridge_port *p,
- const unsigned char *addr, u16 vid)
+ const unsigned char *addr, u16 vid,
+ bool swdev_notify)
{
struct net_bridge_fdb_entry *fdb;
int err = 0;
@@ -1099,7 +1126,7 @@ int br_fdb_external_learn_del(struct net_bridge *br, struct net_bridge_port *p,
fdb = br_fdb_find(br, addr, vid);
if (fdb && fdb->added_by_external_learn)
- fdb_delete(br, fdb);
+ fdb_delete(br, fdb, swdev_notify);
else
err = -ENOENT;
diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c
index b4eed113d2ec..7a7fd672ccf2 100644
--- a/net/bridge/br_forward.c
+++ b/net/bridge/br_forward.c
@@ -274,8 +274,7 @@ void br_multicast_flood(struct net_bridge_mdb_entry *mdst,
struct net_bridge_port *port, *lport, *rport;
lport = p ? p->port : NULL;
- rport = rp ? hlist_entry(rp, struct net_bridge_port, rlist) :
- NULL;
+ rport = hlist_entry_safe(rp, struct net_bridge_port, rlist);
if ((unsigned long)lport > (unsigned long)rport) {
port = lport;
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index 82c1a6f430b3..05e42d86882d 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -64,7 +64,7 @@ static int port_cost(struct net_device *dev)
/* Check for port carrier transitions. */
-void br_port_carrier_check(struct net_bridge_port *p)
+void br_port_carrier_check(struct net_bridge_port *p, bool *notified)
{
struct net_device *dev = p->dev;
struct net_bridge *br = p->br;
@@ -73,16 +73,21 @@ void br_port_carrier_check(struct net_bridge_port *p)
netif_running(dev) && netif_oper_up(dev))
p->path_cost = port_cost(dev);
+ *notified = false;
if (!netif_running(br->dev))
return;
spin_lock_bh(&br->lock);
if (netif_running(dev) && netif_oper_up(dev)) {
- if (p->state == BR_STATE_DISABLED)
+ if (p->state == BR_STATE_DISABLED) {
br_stp_enable_port(p);
+ *notified = true;
+ }
} else {
- if (p->state != BR_STATE_DISABLED)
+ if (p->state != BR_STATE_DISABLED) {
br_stp_disable_port(p);
+ *notified = true;
+ }
}
spin_unlock_bh(&br->lock);
}
@@ -518,8 +523,8 @@ int br_add_if(struct net_bridge *br, struct net_device *dev,
return -ELOOP;
}
- /* Device is already being bridged */
- if (br_port_exists(dev))
+ /* Device has master upper dev */
+ if (netdev_master_upper_dev_get(dev))
return -EBUSY;
/* No bridging devices that dislike that (e.g. wireless) */
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index a7cb3ece5031..742f40aefdaf 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -553,9 +553,11 @@ int br_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
int br_fdb_sync_static(struct net_bridge *br, struct net_bridge_port *p);
void br_fdb_unsync_static(struct net_bridge *br, struct net_bridge_port *p);
int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p,
- const unsigned char *addr, u16 vid);
+ const unsigned char *addr, u16 vid,
+ bool swdev_notify);
int br_fdb_external_learn_del(struct net_bridge *br, struct net_bridge_port *p,
- const unsigned char *addr, u16 vid);
+ const unsigned char *addr, u16 vid,
+ bool swdev_notify);
void br_fdb_offloaded_set(struct net_bridge *br, struct net_bridge_port *p,
const unsigned char *addr, u16 vid);
@@ -573,7 +575,7 @@ void br_flood(struct net_bridge *br, struct sk_buff *skb,
enum br_pkt_type pkt_type, bool local_rcv, bool local_orig);
/* br_if.c */
-void br_port_carrier_check(struct net_bridge_port *p);
+void br_port_carrier_check(struct net_bridge_port *p, bool *notified);
int br_add_bridge(struct net *net, const char *name);
int br_del_bridge(struct net *net, const char *name);
int br_add_if(struct net_bridge *br, struct net_device *dev,
@@ -594,11 +596,22 @@ static inline bool br_rx_handler_check_rcu(const struct net_device *dev)
return rcu_dereference(dev->rx_handler) == br_handle_frame;
}
+static inline bool br_rx_handler_check_rtnl(const struct net_device *dev)
+{
+ return rcu_dereference_rtnl(dev->rx_handler) == br_handle_frame;
+}
+
static inline struct net_bridge_port *br_port_get_check_rcu(const struct net_device *dev)
{
return br_rx_handler_check_rcu(dev) ? br_port_get_rcu(dev) : NULL;
}
+static inline struct net_bridge_port *
+br_port_get_check_rtnl(const struct net_device *dev)
+{
+ return br_rx_handler_check_rtnl(dev) ? br_port_get_rtnl_rcu(dev) : NULL;
+}
+
/* br_ioctl.c */
int br_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd);
int br_ioctl_deviceless_stub(struct net *net, unsigned int cmd,
diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c
index ee775f4ff76c..35474d49555d 100644
--- a/net/bridge/br_switchdev.c
+++ b/net/bridge/br_switchdev.c
@@ -102,13 +102,15 @@ int br_switchdev_set_port_flag(struct net_bridge_port *p,
static void
br_switchdev_fdb_call_notifiers(bool adding, const unsigned char *mac,
- u16 vid, struct net_device *dev)
+ u16 vid, struct net_device *dev,
+ bool added_by_user)
{
struct switchdev_notifier_fdb_info info;
unsigned long notifier_type;
info.addr = mac;
info.vid = vid;
+ info.added_by_user = added_by_user;
notifier_type = adding ? SWITCHDEV_FDB_ADD_TO_DEVICE : SWITCHDEV_FDB_DEL_TO_DEVICE;
call_switchdev_notifiers(notifier_type, dev, &info.info);
}
@@ -116,19 +118,21 @@ br_switchdev_fdb_call_notifiers(bool adding, const unsigned char *mac,
void
br_switchdev_fdb_notify(const struct net_bridge_fdb_entry *fdb, int type)
{
- if (!fdb->added_by_user || !fdb->dst)
+ if (!fdb->dst)
return;
switch (type) {
case RTM_DELNEIGH:
br_switchdev_fdb_call_notifiers(false, fdb->key.addr.addr,
fdb->key.vlan_id,
- fdb->dst->dev);
+ fdb->dst->dev,
+ fdb->added_by_user);
break;
case RTM_NEWNEIGH:
br_switchdev_fdb_call_notifiers(true, fdb->key.addr.addr,
fdb->key.vlan_id,
- fdb->dst->dev);
+ fdb->dst->dev,
+ fdb->added_by_user);
break;
}
}
diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c
index 9896f4975353..df37a5137c25 100644
--- a/net/bridge/br_vlan.c
+++ b/net/bridge/br_vlan.c
@@ -1149,3 +1149,42 @@ void br_vlan_get_stats(const struct net_bridge_vlan *v,
stats->tx_packets += txpackets;
}
}
+
+int br_vlan_get_pvid(const struct net_device *dev, u16 *p_pvid)
+{
+ struct net_bridge_vlan_group *vg;
+
+ ASSERT_RTNL();
+ if (netif_is_bridge_master(dev))
+ vg = br_vlan_group(netdev_priv(dev));
+ else
+ return -EINVAL;
+
+ *p_pvid = br_get_pvid(vg);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(br_vlan_get_pvid);
+
+int br_vlan_get_info(const struct net_device *dev, u16 vid,
+ struct bridge_vlan_info *p_vinfo)
+{
+ struct net_bridge_vlan_group *vg;
+ struct net_bridge_vlan *v;
+ struct net_bridge_port *p;
+
+ ASSERT_RTNL();
+ p = br_port_get_check_rtnl(dev);
+ if (p)
+ vg = nbp_vlan_group(p);
+ else
+ return -EINVAL;
+
+ v = br_vlan_find(vg, vid);
+ if (!v)
+ return -ENOENT;
+
+ p_vinfo->vid = vid;
+ p_vinfo->flags = v->flags;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(br_vlan_get_info);
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index 9be240129448..b286ed5596c3 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -1820,13 +1820,14 @@ static int compat_table_info(const struct ebt_table_info *info,
{
unsigned int size = info->entries_size;
const void *entries = info->entries;
- int ret;
newinfo->entries_size = size;
-
- ret = xt_compat_init_offsets(NFPROTO_BRIDGE, info->nentries);
- if (ret)
- return ret;
+ if (info->nentries) {
+ int ret = xt_compat_init_offsets(NFPROTO_BRIDGE,
+ info->nentries);
+ if (ret)
+ return ret;
+ }
return EBT_ENTRY_ITERATE(entries, size, compat_calc_entry, info,
entries, newinfo);
diff --git a/net/caif/chnl_net.c b/net/caif/chnl_net.c
index 53ecda10b790..13e2ae6be620 100644
--- a/net/caif/chnl_net.c
+++ b/net/caif/chnl_net.c
@@ -174,7 +174,7 @@ static void chnl_flowctrl_cb(struct cflayer *layr, enum caif_ctrlcmd flow,
flow == CAIF_CTRLCMD_DEINIT_RSP ? "CLOSE/DEINIT" :
flow == CAIF_CTRLCMD_INIT_FAIL_RSP ? "OPEN_FAIL" :
flow == CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND ?
- "REMOTE_SHUTDOWN" : "UKNOWN CTRL COMMAND");
+ "REMOTE_SHUTDOWN" : "UNKNOWN CTRL COMMAND");
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index fcb40c12b1f8..3b3d33ea9ed8 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -2569,6 +2569,11 @@ static int try_write(struct ceph_connection *con)
int ret = 1;
dout("try_write start %p state %lu\n", con, con->state);
+ if (con->state != CON_STATE_PREOPEN &&
+ con->state != CON_STATE_CONNECTING &&
+ con->state != CON_STATE_NEGOTIATING &&
+ con->state != CON_STATE_OPEN)
+ return 0;
more:
dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes);
@@ -2594,6 +2599,8 @@ more:
}
more_kvec:
+ BUG_ON(!con->sock);
+
/* kvec data queued? */
if (con->out_kvec_left) {
ret = write_partial_kvec(con);
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index b3dac24412d3..21ac6e3b96bb 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -209,6 +209,14 @@ static void reopen_session(struct ceph_mon_client *monc)
__open_session(monc);
}
+static void un_backoff(struct ceph_mon_client *monc)
+{
+ monc->hunt_mult /= 2; /* reduce by 50% */
+ if (monc->hunt_mult < 1)
+ monc->hunt_mult = 1;
+ dout("%s hunt_mult now %d\n", __func__, monc->hunt_mult);
+}
+
/*
* Reschedule delayed work timer.
*/
@@ -963,6 +971,7 @@ static void delayed_work(struct work_struct *work)
if (!monc->hunting) {
ceph_con_keepalive(&monc->con);
__validate_auth(monc);
+ un_backoff(monc);
}
if (is_auth &&
@@ -1123,9 +1132,8 @@ static void finish_hunting(struct ceph_mon_client *monc)
dout("%s found mon%d\n", __func__, monc->cur_mon);
monc->hunting = false;
monc->had_a_connection = true;
- monc->hunt_mult /= 2; /* reduce by 50% */
- if (monc->hunt_mult < 1)
- monc->hunt_mult = 1;
+ un_backoff(monc);
+ __schedule_delayed(monc);
}
}
diff --git a/net/compat.c b/net/compat.c
index 5ae7437d3853..7242cce5631b 100644
--- a/net/compat.c
+++ b/net/compat.c
@@ -377,7 +377,8 @@ static int compat_sock_setsockopt(struct socket *sock, int level, int optname,
optname == SO_ATTACH_REUSEPORT_CBPF)
return do_set_attach_filter(sock, level, optname,
optval, optlen);
- if (optname == SO_RCVTIMEO || optname == SO_SNDTIMEO)
+ if (!COMPAT_USE_64BIT_TIME &&
+ (optname == SO_RCVTIMEO || optname == SO_SNDTIMEO))
return do_set_sock_timeout(sock, level, optname, optval, optlen);
return sock_setsockopt(sock, level, optname, optval, optlen);
@@ -448,7 +449,8 @@ static int do_get_sock_timeout(struct socket *sock, int level, int optname,
static int compat_sock_getsockopt(struct socket *sock, int level, int optname,
char __user *optval, int __user *optlen)
{
- if (optname == SO_RCVTIMEO || optname == SO_SNDTIMEO)
+ if (!COMPAT_USE_64BIT_TIME &&
+ (optname == SO_RCVTIMEO || optname == SO_SNDTIMEO))
return do_get_sock_timeout(sock, level, optname, optval, optlen);
return sock_getsockopt(sock, level, optname, optval, optlen);
}
diff --git a/net/core/dev.c b/net/core/dev.c
index a490ef643586..bb81a6e1d354 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1587,7 +1587,7 @@ const char *netdev_cmd_to_name(enum netdev_cmd cmd)
N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN)
N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO)
N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO)
- };
+ }
#undef N
return "UNKNOWN_NETDEV_EVENT";
}
@@ -2615,17 +2615,16 @@ EXPORT_SYMBOL(netif_device_attach);
* Returns a Tx hash based on the given packet descriptor a Tx queues' number
* to be used as a distribution range.
*/
-u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
- unsigned int num_tx_queues)
+static u16 skb_tx_hash(const struct net_device *dev, struct sk_buff *skb)
{
u32 hash;
u16 qoffset = 0;
- u16 qcount = num_tx_queues;
+ u16 qcount = dev->real_num_tx_queues;
if (skb_rx_queue_recorded(skb)) {
hash = skb_get_rx_queue(skb);
- while (unlikely(hash >= num_tx_queues))
- hash -= num_tx_queues;
+ while (unlikely(hash >= qcount))
+ hash -= qcount;
return hash;
}
@@ -2638,7 +2637,6 @@ u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
}
-EXPORT_SYMBOL(__skb_tx_hash);
static void skb_warn_bad_offload(const struct sk_buff *skb)
{
@@ -2970,7 +2968,7 @@ netdev_features_t passthru_features_check(struct sk_buff *skb,
}
EXPORT_SYMBOL(passthru_features_check);
-static netdev_features_t dflt_features_check(const struct sk_buff *skb,
+static netdev_features_t dflt_features_check(struct sk_buff *skb,
struct net_device *dev,
netdev_features_t features)
{
@@ -3114,6 +3112,10 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device
if (unlikely(!skb))
goto out_null;
+ skb = sk_validate_xmit_skb(skb, dev);
+ if (unlikely(!skb))
+ goto out_null;
+
if (netif_needs_gso(skb, features)) {
struct sk_buff *segs;
@@ -3997,9 +3999,9 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
struct bpf_prog *xdp_prog)
{
struct netdev_rx_queue *rxqueue;
+ void *orig_data, *orig_data_end;
u32 metalen, act = XDP_DROP;
struct xdp_buff xdp;
- void *orig_data;
int hlen, off;
u32 mac_len;
@@ -4038,6 +4040,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
xdp.data_meta = xdp.data;
xdp.data_end = xdp.data + hlen;
xdp.data_hard_start = skb->data - skb_headroom(skb);
+ orig_data_end = xdp.data_end;
orig_data = xdp.data;
rxqueue = netif_get_rxqueue(skb);
@@ -4052,6 +4055,15 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
__skb_push(skb, -off);
skb->mac_header += off;
+ /* check if bpf_xdp_adjust_tail was used. it can only "shrink"
+ * pckt.
+ */
+ off = orig_data_end - xdp.data_end;
+ if (off != 0) {
+ skb_set_tail_pointer(skb, xdp.data_end - xdp.data);
+ skb->len -= off;
+ }
+
switch (act) {
case XDP_REDIRECT:
case XDP_TX:
@@ -7871,6 +7883,8 @@ int register_netdevice(struct net_device *dev)
int ret;
struct net *net = dev_net(dev);
+ BUILD_BUG_ON(sizeof(netdev_features_t) * BITS_PER_BYTE <
+ NETDEV_FEATURE_COUNT);
BUG_ON(dev_boot_phase);
ASSERT_RTNL();
diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c
index e3e6a3e2ca22..d884d8f5f0e5 100644
--- a/net/core/dev_addr_lists.c
+++ b/net/core/dev_addr_lists.c
@@ -839,7 +839,7 @@ void dev_mc_flush(struct net_device *dev)
EXPORT_SYMBOL(dev_mc_flush);
/**
- * dev_mc_flush - Init multicast address list
+ * dev_mc_init - Init multicast address list
* @dev: device
*
* Init multicast address list.
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 03416e6dd5d7..c15075dc7572 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -92,6 +92,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]
[NETIF_F_GSO_PARTIAL_BIT] = "tx-gso-partial",
[NETIF_F_GSO_SCTP_BIT] = "tx-sctp-segmentation",
[NETIF_F_GSO_ESP_BIT] = "tx-esp-segmentation",
+ [NETIF_F_GSO_UDP_L4_BIT] = "tx-udp-segmentation",
[NETIF_F_FCOE_CRC_BIT] = "tx-checksum-fcoe-crc",
[NETIF_F_SCTP_CRC_BIT] = "tx-checksum-sctp",
@@ -109,6 +110,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]
[NETIF_F_HW_ESP_TX_CSUM_BIT] = "esp-tx-csum-hw-offload",
[NETIF_F_RX_UDP_TUNNEL_PORT_BIT] = "rx-udp_tunnel-port-offload",
[NETIF_F_HW_TLS_RECORD_BIT] = "tls-hw-record",
+ [NETIF_F_HW_TLS_TX_BIT] = "tls-hw-tx-offload",
};
static const char
@@ -210,23 +212,6 @@ static int ethtool_set_features(struct net_device *dev, void __user *useraddr)
return ret;
}
-static int phy_get_sset_count(struct phy_device *phydev)
-{
- int ret;
-
- if (phydev->drv->get_sset_count &&
- phydev->drv->get_strings &&
- phydev->drv->get_stats) {
- mutex_lock(&phydev->lock);
- ret = phydev->drv->get_sset_count(phydev);
- mutex_unlock(&phydev->lock);
-
- return ret;
- }
-
- return -EOPNOTSUPP;
-}
-
static int __ethtool_get_sset_count(struct net_device *dev, int sset)
{
const struct ethtool_ops *ops = dev->ethtool_ops;
@@ -243,12 +228,9 @@ static int __ethtool_get_sset_count(struct net_device *dev, int sset)
if (sset == ETH_SS_PHY_TUNABLES)
return ARRAY_SIZE(phy_tunable_strings);
- if (sset == ETH_SS_PHY_STATS) {
- if (dev->phydev)
- return phy_get_sset_count(dev->phydev);
- else
- return -EOPNOTSUPP;
- }
+ if (sset == ETH_SS_PHY_STATS && dev->phydev &&
+ !ops->get_ethtool_phy_stats)
+ return phy_ethtool_get_sset_count(dev->phydev);
if (ops->get_sset_count && ops->get_strings)
return ops->get_sset_count(dev, sset);
@@ -271,17 +253,10 @@ static void __ethtool_get_strings(struct net_device *dev,
memcpy(data, tunable_strings, sizeof(tunable_strings));
else if (stringset == ETH_SS_PHY_TUNABLES)
memcpy(data, phy_tunable_strings, sizeof(phy_tunable_strings));
- else if (stringset == ETH_SS_PHY_STATS) {
- struct phy_device *phydev = dev->phydev;
-
- if (phydev) {
- mutex_lock(&phydev->lock);
- phydev->drv->get_strings(phydev, data);
- mutex_unlock(&phydev->lock);
- } else {
- return;
- }
- } else
+ else if (stringset == ETH_SS_PHY_STATS && dev->phydev &&
+ !ops->get_ethtool_phy_stats)
+ phy_ethtool_get_strings(dev->phydev, data);
+ else
/* ops->get_strings is valid because checked earlier */
ops->get_strings(dev, stringset, data);
}
@@ -1032,6 +1007,11 @@ static noinline_for_stack int ethtool_get_rxnfc(struct net_device *dev,
info_size = sizeof(info);
if (copy_from_user(&info, useraddr, info_size))
return -EFAULT;
+ /* Since malicious users may modify the original data,
+ * we need to check whether FLOW_RSS is still requested.
+ */
+ if (!(info.flow_type & FLOW_RSS))
+ return -EINVAL;
}
if (info.cmd == ETHTOOL_GRXCLSRLALL) {
@@ -1993,15 +1973,19 @@ static int ethtool_get_stats(struct net_device *dev, void __user *useraddr)
static int ethtool_get_phy_stats(struct net_device *dev, void __user *useraddr)
{
- struct ethtool_stats stats;
+ const struct ethtool_ops *ops = dev->ethtool_ops;
struct phy_device *phydev = dev->phydev;
+ struct ethtool_stats stats;
u64 *data;
int ret, n_stats;
- if (!phydev)
+ if (!phydev && (!ops->get_ethtool_phy_stats || !ops->get_sset_count))
return -EOPNOTSUPP;
- n_stats = phy_get_sset_count(phydev);
+ if (dev->phydev && !ops->get_ethtool_phy_stats)
+ n_stats = phy_ethtool_get_sset_count(dev->phydev);
+ else
+ n_stats = ops->get_sset_count(dev, ETH_SS_PHY_STATS);
if (n_stats < 0)
return n_stats;
if (n_stats > S32_MAX / sizeof(u64))
@@ -2016,9 +2000,13 @@ static int ethtool_get_phy_stats(struct net_device *dev, void __user *useraddr)
if (n_stats && !data)
return -ENOMEM;
- mutex_lock(&phydev->lock);
- phydev->drv->get_stats(phydev, &stats, data);
- mutex_unlock(&phydev->lock);
+ if (dev->phydev && !ops->get_ethtool_phy_stats) {
+ ret = phy_ethtool_get_stats(dev->phydev, &stats, data);
+ if (ret < 0)
+ return ret;
+ } else {
+ ops->get_ethtool_phy_stats(dev, &stats, data);
+ }
ret = -EFAULT;
if (copy_to_user(useraddr, &stats, sizeof(stats)))
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 33958f84c173..126ffc5bc630 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -387,247 +387,304 @@ unsigned int fib_rules_seq_read(struct net *net, int family)
}
EXPORT_SYMBOL_GPL(fib_rules_seq_read);
-static int validate_rulemsg(struct fib_rule_hdr *frh, struct nlattr **tb,
- struct fib_rules_ops *ops)
-{
- int err = -EINVAL;
-
- if (frh->src_len)
- if (tb[FRA_SRC] == NULL ||
- frh->src_len > (ops->addr_size * 8) ||
- nla_len(tb[FRA_SRC]) != ops->addr_size)
- goto errout;
-
- if (frh->dst_len)
- if (tb[FRA_DST] == NULL ||
- frh->dst_len > (ops->addr_size * 8) ||
- nla_len(tb[FRA_DST]) != ops->addr_size)
- goto errout;
-
- err = 0;
-errout:
- return err;
-}
-
-static int rule_exists(struct fib_rules_ops *ops, struct fib_rule_hdr *frh,
- struct nlattr **tb, struct fib_rule *rule)
+static struct fib_rule *rule_find(struct fib_rules_ops *ops,
+ struct fib_rule_hdr *frh,
+ struct nlattr **tb,
+ struct fib_rule *rule,
+ bool user_priority)
{
struct fib_rule *r;
list_for_each_entry(r, &ops->rules_list, list) {
- if (r->action != rule->action)
+ if (rule->action && r->action != rule->action)
continue;
- if (r->table != rule->table)
+ if (rule->table && r->table != rule->table)
continue;
- if (r->pref != rule->pref)
+ if (user_priority && r->pref != rule->pref)
continue;
- if (memcmp(r->iifname, rule->iifname, IFNAMSIZ))
+ if (rule->iifname[0] &&
+ memcmp(r->iifname, rule->iifname, IFNAMSIZ))
continue;
- if (memcmp(r->oifname, rule->oifname, IFNAMSIZ))
+ if (rule->oifname[0] &&
+ memcmp(r->oifname, rule->oifname, IFNAMSIZ))
continue;
- if (r->mark != rule->mark)
+ if (rule->mark && r->mark != rule->mark)
continue;
- if (r->mark_mask != rule->mark_mask)
+ if (rule->mark_mask && r->mark_mask != rule->mark_mask)
continue;
- if (r->tun_id != rule->tun_id)
+ if (rule->tun_id && r->tun_id != rule->tun_id)
continue;
if (r->fr_net != rule->fr_net)
continue;
- if (r->l3mdev != rule->l3mdev)
+ if (rule->l3mdev && r->l3mdev != rule->l3mdev)
continue;
- if (!uid_eq(r->uid_range.start, rule->uid_range.start) ||
- !uid_eq(r->uid_range.end, rule->uid_range.end))
+ if (uid_range_set(&rule->uid_range) &&
+ (!uid_eq(r->uid_range.start, rule->uid_range.start) ||
+ !uid_eq(r->uid_range.end, rule->uid_range.end)))
continue;
- if (r->ip_proto != rule->ip_proto)
+ if (rule->ip_proto && r->ip_proto != rule->ip_proto)
continue;
- if (!fib_rule_port_range_compare(&r->sport_range,
+ if (fib_rule_port_range_set(&rule->sport_range) &&
+ !fib_rule_port_range_compare(&r->sport_range,
&rule->sport_range))
continue;
- if (!fib_rule_port_range_compare(&r->dport_range,
+ if (fib_rule_port_range_set(&rule->dport_range) &&
+ !fib_rule_port_range_compare(&r->dport_range,
&rule->dport_range))
continue;
if (!ops->compare(r, frh, tb))
continue;
- return 1;
+ return r;
+ }
+
+ return NULL;
+}
+
+#ifdef CONFIG_NET_L3_MASTER_DEV
+static int fib_nl2rule_l3mdev(struct nlattr *nla, struct fib_rule *nlrule,
+ struct netlink_ext_ack *extack)
+{
+ nlrule->l3mdev = nla_get_u8(nla);
+ if (nlrule->l3mdev != 1) {
+ NL_SET_ERR_MSG(extack, "Invalid l3mdev attribute");
+ return -1;
}
+
return 0;
}
+#else
+static int fib_nl2rule_l3mdev(struct nlattr *nla, struct fib_rule *nlrule,
+ struct netlink_ext_ack *extack)
+{
+ NL_SET_ERR_MSG(extack, "l3mdev support is not enabled in kernel");
+ return -1;
+}
+#endif
-int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh,
- struct netlink_ext_ack *extack)
+static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack,
+ struct fib_rules_ops *ops,
+ struct nlattr *tb[],
+ struct fib_rule **rule,
+ bool *user_priority)
{
struct net *net = sock_net(skb->sk);
struct fib_rule_hdr *frh = nlmsg_data(nlh);
- struct fib_rules_ops *ops = NULL;
- struct fib_rule *rule, *r, *last = NULL;
- struct nlattr *tb[FRA_MAX+1];
- int err = -EINVAL, unresolved = 0;
-
- if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh)))
- goto errout;
+ struct fib_rule *nlrule = NULL;
+ int err = -EINVAL;
- ops = lookup_rules_ops(net, frh->family);
- if (ops == NULL) {
- err = -EAFNOSUPPORT;
- goto errout;
+ if (frh->src_len)
+ if (!tb[FRA_SRC] ||
+ frh->src_len > (ops->addr_size * 8) ||
+ nla_len(tb[FRA_SRC]) != ops->addr_size) {
+ NL_SET_ERR_MSG(extack, "Invalid source address");
+ goto errout;
}
- err = nlmsg_parse(nlh, sizeof(*frh), tb, FRA_MAX, ops->policy, extack);
- if (err < 0)
- goto errout;
-
- err = validate_rulemsg(frh, tb, ops);
- if (err < 0)
- goto errout;
+ if (frh->dst_len)
+ if (!tb[FRA_DST] ||
+ frh->dst_len > (ops->addr_size * 8) ||
+ nla_len(tb[FRA_DST]) != ops->addr_size) {
+ NL_SET_ERR_MSG(extack, "Invalid dst address");
+ goto errout;
+ }
- rule = kzalloc(ops->rule_size, GFP_KERNEL);
- if (rule == NULL) {
+ nlrule = kzalloc(ops->rule_size, GFP_KERNEL);
+ if (!nlrule) {
err = -ENOMEM;
goto errout;
}
- refcount_set(&rule->refcnt, 1);
- rule->fr_net = net;
+ refcount_set(&nlrule->refcnt, 1);
+ nlrule->fr_net = net;
- rule->pref = tb[FRA_PRIORITY] ? nla_get_u32(tb[FRA_PRIORITY])
- : fib_default_rule_pref(ops);
+ if (tb[FRA_PRIORITY]) {
+ nlrule->pref = nla_get_u32(tb[FRA_PRIORITY]);
+ *user_priority = true;
+ } else {
+ nlrule->pref = fib_default_rule_pref(ops);
+ }
- rule->proto = tb[FRA_PROTOCOL] ?
+ nlrule->proto = tb[FRA_PROTOCOL] ?
nla_get_u8(tb[FRA_PROTOCOL]) : RTPROT_UNSPEC;
if (tb[FRA_IIFNAME]) {
struct net_device *dev;
- rule->iifindex = -1;
- nla_strlcpy(rule->iifname, tb[FRA_IIFNAME], IFNAMSIZ);
- dev = __dev_get_by_name(net, rule->iifname);
+ nlrule->iifindex = -1;
+ nla_strlcpy(nlrule->iifname, tb[FRA_IIFNAME], IFNAMSIZ);
+ dev = __dev_get_by_name(net, nlrule->iifname);
if (dev)
- rule->iifindex = dev->ifindex;
+ nlrule->iifindex = dev->ifindex;
}
if (tb[FRA_OIFNAME]) {
struct net_device *dev;
- rule->oifindex = -1;
- nla_strlcpy(rule->oifname, tb[FRA_OIFNAME], IFNAMSIZ);
- dev = __dev_get_by_name(net, rule->oifname);
+ nlrule->oifindex = -1;
+ nla_strlcpy(nlrule->oifname, tb[FRA_OIFNAME], IFNAMSIZ);
+ dev = __dev_get_by_name(net, nlrule->oifname);
if (dev)
- rule->oifindex = dev->ifindex;
+ nlrule->oifindex = dev->ifindex;
}
if (tb[FRA_FWMARK]) {
- rule->mark = nla_get_u32(tb[FRA_FWMARK]);
- if (rule->mark)
+ nlrule->mark = nla_get_u32(tb[FRA_FWMARK]);
+ if (nlrule->mark)
/* compatibility: if the mark value is non-zero all bits
* are compared unless a mask is explicitly specified.
*/
- rule->mark_mask = 0xFFFFFFFF;
+ nlrule->mark_mask = 0xFFFFFFFF;
}
if (tb[FRA_FWMASK])
- rule->mark_mask = nla_get_u32(tb[FRA_FWMASK]);
+ nlrule->mark_mask = nla_get_u32(tb[FRA_FWMASK]);
if (tb[FRA_TUN_ID])
- rule->tun_id = nla_get_be64(tb[FRA_TUN_ID]);
+ nlrule->tun_id = nla_get_be64(tb[FRA_TUN_ID]);
err = -EINVAL;
- if (tb[FRA_L3MDEV]) {
-#ifdef CONFIG_NET_L3_MASTER_DEV
- rule->l3mdev = nla_get_u8(tb[FRA_L3MDEV]);
- if (rule->l3mdev != 1)
-#endif
- goto errout_free;
- }
+ if (tb[FRA_L3MDEV] &&
+ fib_nl2rule_l3mdev(tb[FRA_L3MDEV], nlrule, extack) < 0)
+ goto errout_free;
- rule->action = frh->action;
- rule->flags = frh->flags;
- rule->table = frh_get_table(frh, tb);
+ nlrule->action = frh->action;
+ nlrule->flags = frh->flags;
+ nlrule->table = frh_get_table(frh, tb);
if (tb[FRA_SUPPRESS_PREFIXLEN])
- rule->suppress_prefixlen = nla_get_u32(tb[FRA_SUPPRESS_PREFIXLEN]);
+ nlrule->suppress_prefixlen = nla_get_u32(tb[FRA_SUPPRESS_PREFIXLEN]);
else
- rule->suppress_prefixlen = -1;
+ nlrule->suppress_prefixlen = -1;
if (tb[FRA_SUPPRESS_IFGROUP])
- rule->suppress_ifgroup = nla_get_u32(tb[FRA_SUPPRESS_IFGROUP]);
+ nlrule->suppress_ifgroup = nla_get_u32(tb[FRA_SUPPRESS_IFGROUP]);
else
- rule->suppress_ifgroup = -1;
+ nlrule->suppress_ifgroup = -1;
if (tb[FRA_GOTO]) {
- if (rule->action != FR_ACT_GOTO)
+ if (nlrule->action != FR_ACT_GOTO) {
+ NL_SET_ERR_MSG(extack, "Unexpected goto");
goto errout_free;
+ }
- rule->target = nla_get_u32(tb[FRA_GOTO]);
+ nlrule->target = nla_get_u32(tb[FRA_GOTO]);
/* Backward jumps are prohibited to avoid endless loops */
- if (rule->target <= rule->pref)
+ if (nlrule->target <= nlrule->pref) {
+ NL_SET_ERR_MSG(extack, "Backward goto not supported");
goto errout_free;
-
- list_for_each_entry(r, &ops->rules_list, list) {
- if (r->pref == rule->target) {
- RCU_INIT_POINTER(rule->ctarget, r);
- break;
- }
}
-
- if (rcu_dereference_protected(rule->ctarget, 1) == NULL)
- unresolved = 1;
- } else if (rule->action == FR_ACT_GOTO)
+ } else if (nlrule->action == FR_ACT_GOTO) {
+ NL_SET_ERR_MSG(extack, "Missing goto target for action goto");
goto errout_free;
+ }
- if (rule->l3mdev && rule->table)
+ if (nlrule->l3mdev && nlrule->table) {
+ NL_SET_ERR_MSG(extack, "l3mdev and table are mutually exclusive");
goto errout_free;
+ }
if (tb[FRA_UID_RANGE]) {
if (current_user_ns() != net->user_ns) {
err = -EPERM;
+ NL_SET_ERR_MSG(extack, "No permission to set uid");
goto errout_free;
}
- rule->uid_range = nla_get_kuid_range(tb);
+ nlrule->uid_range = nla_get_kuid_range(tb);
- if (!uid_range_set(&rule->uid_range) ||
- !uid_lte(rule->uid_range.start, rule->uid_range.end))
+ if (!uid_range_set(&nlrule->uid_range) ||
+ !uid_lte(nlrule->uid_range.start, nlrule->uid_range.end)) {
+ NL_SET_ERR_MSG(extack, "Invalid uid range");
goto errout_free;
+ }
} else {
- rule->uid_range = fib_kuid_range_unset;
+ nlrule->uid_range = fib_kuid_range_unset;
}
if (tb[FRA_IP_PROTO])
- rule->ip_proto = nla_get_u8(tb[FRA_IP_PROTO]);
+ nlrule->ip_proto = nla_get_u8(tb[FRA_IP_PROTO]);
if (tb[FRA_SPORT_RANGE]) {
err = nla_get_port_range(tb[FRA_SPORT_RANGE],
- &rule->sport_range);
- if (err)
+ &nlrule->sport_range);
+ if (err) {
+ NL_SET_ERR_MSG(extack, "Invalid sport range");
goto errout_free;
+ }
}
if (tb[FRA_DPORT_RANGE]) {
err = nla_get_port_range(tb[FRA_DPORT_RANGE],
- &rule->dport_range);
- if (err)
+ &nlrule->dport_range);
+ if (err) {
+ NL_SET_ERR_MSG(extack, "Invalid dport range");
goto errout_free;
+ }
}
+ *rule = nlrule;
+
+ return 0;
+
+errout_free:
+ kfree(nlrule);
+errout:
+ return err;
+}
+
+int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct net *net = sock_net(skb->sk);
+ struct fib_rule_hdr *frh = nlmsg_data(nlh);
+ struct fib_rules_ops *ops = NULL;
+ struct fib_rule *rule = NULL, *r, *last = NULL;
+ struct nlattr *tb[FRA_MAX + 1];
+ int err = -EINVAL, unresolved = 0;
+ bool user_priority = false;
+
+ if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) {
+ NL_SET_ERR_MSG(extack, "Invalid msg length");
+ goto errout;
+ }
+
+ ops = lookup_rules_ops(net, frh->family);
+ if (!ops) {
+ err = -EAFNOSUPPORT;
+ NL_SET_ERR_MSG(extack, "Rule family not supported");
+ goto errout;
+ }
+
+ err = nlmsg_parse(nlh, sizeof(*frh), tb, FRA_MAX, ops->policy, extack);
+ if (err < 0) {
+ NL_SET_ERR_MSG(extack, "Error parsing msg");
+ goto errout;
+ }
+
+ err = fib_nl2rule(skb, nlh, extack, ops, tb, &rule, &user_priority);
+ if (err)
+ goto errout;
+
if ((nlh->nlmsg_flags & NLM_F_EXCL) &&
- rule_exists(ops, frh, tb, rule)) {
+ rule_find(ops, frh, tb, rule, user_priority)) {
err = -EEXIST;
goto errout_free;
}
- err = ops->configure(rule, skb, frh, tb);
+ err = ops->configure(rule, skb, frh, tb, extack);
if (err < 0)
goto errout_free;
@@ -637,6 +694,16 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh,
goto errout_free;
list_for_each_entry(r, &ops->rules_list, list) {
+ if (r->pref == rule->target) {
+ RCU_INIT_POINTER(rule->ctarget, r);
+ break;
+ }
+ }
+
+ if (rcu_dereference_protected(rule->ctarget, 1) == NULL)
+ unresolved = 1;
+
+ list_for_each_entry(r, &ops->rules_list, list) {
if (r->pref > rule->pref)
break;
last = r;
@@ -690,171 +757,97 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh,
{
struct net *net = sock_net(skb->sk);
struct fib_rule_hdr *frh = nlmsg_data(nlh);
- struct fib_rule_port_range sprange = {0, 0};
- struct fib_rule_port_range dprange = {0, 0};
struct fib_rules_ops *ops = NULL;
- struct fib_rule *rule, *r;
+ struct fib_rule *rule = NULL, *r, *nlrule = NULL;
struct nlattr *tb[FRA_MAX+1];
- struct fib_kuid_range range;
int err = -EINVAL;
+ bool user_priority = false;
- if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh)))
+ if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) {
+ NL_SET_ERR_MSG(extack, "Invalid msg length");
goto errout;
+ }
ops = lookup_rules_ops(net, frh->family);
if (ops == NULL) {
err = -EAFNOSUPPORT;
+ NL_SET_ERR_MSG(extack, "Rule family not supported");
goto errout;
}
err = nlmsg_parse(nlh, sizeof(*frh), tb, FRA_MAX, ops->policy, extack);
- if (err < 0)
+ if (err < 0) {
+ NL_SET_ERR_MSG(extack, "Error parsing msg");
goto errout;
+ }
- err = validate_rulemsg(frh, tb, ops);
- if (err < 0)
+ err = fib_nl2rule(skb, nlh, extack, ops, tb, &nlrule, &user_priority);
+ if (err)
goto errout;
- if (tb[FRA_UID_RANGE]) {
- range = nla_get_kuid_range(tb);
- if (!uid_range_set(&range)) {
- err = -EINVAL;
- goto errout;
- }
- } else {
- range = fib_kuid_range_unset;
+ rule = rule_find(ops, frh, tb, nlrule, user_priority);
+ if (!rule) {
+ err = -ENOENT;
+ goto errout;
}
- if (tb[FRA_SPORT_RANGE]) {
- err = nla_get_port_range(tb[FRA_SPORT_RANGE],
- &sprange);
- if (err)
- goto errout;
+ if (rule->flags & FIB_RULE_PERMANENT) {
+ err = -EPERM;
+ goto errout;
}
- if (tb[FRA_DPORT_RANGE]) {
- err = nla_get_port_range(tb[FRA_DPORT_RANGE],
- &dprange);
+ if (ops->delete) {
+ err = ops->delete(rule);
if (err)
goto errout;
}
- list_for_each_entry(rule, &ops->rules_list, list) {
- if (tb[FRA_PROTOCOL] &&
- (rule->proto != nla_get_u8(tb[FRA_PROTOCOL])))
- continue;
-
- if (frh->action && (frh->action != rule->action))
- continue;
-
- if (frh_get_table(frh, tb) &&
- (frh_get_table(frh, tb) != rule->table))
- continue;
-
- if (tb[FRA_PRIORITY] &&
- (rule->pref != nla_get_u32(tb[FRA_PRIORITY])))
- continue;
-
- if (tb[FRA_IIFNAME] &&
- nla_strcmp(tb[FRA_IIFNAME], rule->iifname))
- continue;
-
- if (tb[FRA_OIFNAME] &&
- nla_strcmp(tb[FRA_OIFNAME], rule->oifname))
- continue;
-
- if (tb[FRA_FWMARK] &&
- (rule->mark != nla_get_u32(tb[FRA_FWMARK])))
- continue;
-
- if (tb[FRA_FWMASK] &&
- (rule->mark_mask != nla_get_u32(tb[FRA_FWMASK])))
- continue;
-
- if (tb[FRA_TUN_ID] &&
- (rule->tun_id != nla_get_be64(tb[FRA_TUN_ID])))
- continue;
-
- if (tb[FRA_L3MDEV] &&
- (rule->l3mdev != nla_get_u8(tb[FRA_L3MDEV])))
- continue;
-
- if (uid_range_set(&range) &&
- (!uid_eq(rule->uid_range.start, range.start) ||
- !uid_eq(rule->uid_range.end, range.end)))
- continue;
-
- if (tb[FRA_IP_PROTO] &&
- (rule->ip_proto != nla_get_u8(tb[FRA_IP_PROTO])))
- continue;
-
- if (fib_rule_port_range_set(&sprange) &&
- !fib_rule_port_range_compare(&rule->sport_range, &sprange))
- continue;
-
- if (fib_rule_port_range_set(&dprange) &&
- !fib_rule_port_range_compare(&rule->dport_range, &dprange))
- continue;
-
- if (!ops->compare(rule, frh, tb))
- continue;
-
- if (rule->flags & FIB_RULE_PERMANENT) {
- err = -EPERM;
- goto errout;
- }
-
- if (ops->delete) {
- err = ops->delete(rule);
- if (err)
- goto errout;
- }
+ if (rule->tun_id)
+ ip_tunnel_unneed_metadata();
- if (rule->tun_id)
- ip_tunnel_unneed_metadata();
+ list_del_rcu(&rule->list);
- list_del_rcu(&rule->list);
-
- if (rule->action == FR_ACT_GOTO) {
- ops->nr_goto_rules--;
- if (rtnl_dereference(rule->ctarget) == NULL)
- ops->unresolved_rules--;
- }
+ if (rule->action == FR_ACT_GOTO) {
+ ops->nr_goto_rules--;
+ if (rtnl_dereference(rule->ctarget) == NULL)
+ ops->unresolved_rules--;
+ }
- /*
- * Check if this rule is a target to any of them. If so,
- * adjust to the next one with the same preference or
- * disable them. As this operation is eventually very
- * expensive, it is only performed if goto rules, except
- * current if it is goto rule, have actually been added.
- */
- if (ops->nr_goto_rules > 0) {
- struct fib_rule *n;
-
- n = list_next_entry(rule, list);
- if (&n->list == &ops->rules_list || n->pref != rule->pref)
- n = NULL;
- list_for_each_entry(r, &ops->rules_list, list) {
- if (rtnl_dereference(r->ctarget) != rule)
- continue;
- rcu_assign_pointer(r->ctarget, n);
- if (!n)
- ops->unresolved_rules++;
- }
+ /*
+ * Check if this rule is a target to any of them. If so,
+ * adjust to the next one with the same preference or
+ * disable them. As this operation is eventually very
+ * expensive, it is only performed if goto rules, except
+ * current if it is goto rule, have actually been added.
+ */
+ if (ops->nr_goto_rules > 0) {
+ struct fib_rule *n;
+
+ n = list_next_entry(rule, list);
+ if (&n->list == &ops->rules_list || n->pref != rule->pref)
+ n = NULL;
+ list_for_each_entry(r, &ops->rules_list, list) {
+ if (rtnl_dereference(r->ctarget) != rule)
+ continue;
+ rcu_assign_pointer(r->ctarget, n);
+ if (!n)
+ ops->unresolved_rules++;
}
-
- call_fib_rule_notifiers(net, FIB_EVENT_RULE_DEL, rule, ops,
- NULL);
- notify_rule_change(RTM_DELRULE, rule, ops, nlh,
- NETLINK_CB(skb).portid);
- fib_rule_put(rule);
- flush_route_cache(ops);
- rules_ops_put(ops);
- return 0;
}
- err = -ENOENT;
+ call_fib_rule_notifiers(net, FIB_EVENT_RULE_DEL, rule, ops,
+ NULL);
+ notify_rule_change(RTM_DELRULE, rule, ops, nlh,
+ NETLINK_CB(skb).portid);
+ fib_rule_put(rule);
+ flush_route_cache(ops);
+ rules_ops_put(ops);
+ kfree(nlrule);
+ return 0;
+
errout:
+ if (nlrule)
+ kfree(nlrule);
rules_ops_put(ops);
return err;
}
diff --git a/net/core/filter.c b/net/core/filter.c
index a374b8560bc4..d3781daa26ab 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -57,6 +57,7 @@
#include <net/sock_reuseport.h>
#include <net/busy_poll.h>
#include <net/tcp.h>
+#include <net/xfrm.h>
#include <linux/bpf_trace.h>
/**
@@ -2694,20 +2695,13 @@ BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset)
{
void *xdp_frame_end = xdp->data_hard_start + sizeof(struct xdp_frame);
unsigned long metalen = xdp_get_metalen(xdp);
- void *data_start = xdp->data_hard_start + metalen;
+ void *data_start = xdp_frame_end + metalen;
void *data = xdp->data + offset;
if (unlikely(data < data_start ||
data > xdp->data_end - ETH_HLEN))
return -EINVAL;
- /* Avoid info leak, when reusing area prev used by xdp_frame */
- if (data < xdp_frame_end) {
- unsigned long clearlen = xdp_frame_end - data;
-
- memset(data, 0, clearlen);
- }
-
if (metalen)
memmove(xdp->data_meta + offset,
xdp->data_meta, metalen);
@@ -2725,14 +2719,39 @@ static const struct bpf_func_proto bpf_xdp_adjust_head_proto = {
.arg2_type = ARG_ANYTHING,
};
+BPF_CALL_2(bpf_xdp_adjust_tail, struct xdp_buff *, xdp, int, offset)
+{
+ void *data_end = xdp->data_end + offset;
+
+ /* only shrinking is allowed for now. */
+ if (unlikely(offset >= 0))
+ return -EINVAL;
+
+ if (unlikely(data_end < xdp->data + ETH_HLEN))
+ return -EINVAL;
+
+ xdp->data_end = data_end;
+
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_xdp_adjust_tail_proto = {
+ .func = bpf_xdp_adjust_tail,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+};
+
BPF_CALL_2(bpf_xdp_adjust_meta, struct xdp_buff *, xdp, int, offset)
{
+ void *xdp_frame_end = xdp->data_hard_start + sizeof(struct xdp_frame);
void *meta = xdp->data_meta + offset;
unsigned long metalen = xdp->data - meta;
if (xdp_data_meta_unsupported(xdp))
return -ENOTSUPP;
- if (unlikely(meta < xdp->data_hard_start ||
+ if (unlikely(meta < xdp_frame_end ||
meta > xdp->data))
return -EINVAL;
if (unlikely((metalen & (sizeof(__u32) - 1)) ||
@@ -3074,7 +3093,8 @@ bool bpf_helper_changes_pkt_data(void *func)
func == bpf_l4_csum_replace ||
func == bpf_xdp_adjust_head ||
func == bpf_xdp_adjust_meta ||
- func == bpf_msg_pull_data)
+ func == bpf_msg_pull_data ||
+ func == bpf_xdp_adjust_tail)
return true;
return false;
@@ -3261,6 +3281,7 @@ BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb,
skb_dst_set(skb, (struct dst_entry *) md);
info = &md->u.tun_info;
+ memset(info, 0, sizeof(*info));
info->mode = IP_TUNNEL_INFO_TX;
info->key.tun_flags = TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_NOCACHE;
@@ -3724,6 +3745,49 @@ static const struct bpf_func_proto bpf_bind_proto = {
.arg3_type = ARG_CONST_SIZE,
};
+#ifdef CONFIG_XFRM
+BPF_CALL_5(bpf_skb_get_xfrm_state, struct sk_buff *, skb, u32, index,
+ struct bpf_xfrm_state *, to, u32, size, u64, flags)
+{
+ const struct sec_path *sp = skb_sec_path(skb);
+ const struct xfrm_state *x;
+
+ if (!sp || unlikely(index >= sp->len || flags))
+ goto err_clear;
+
+ x = sp->xvec[index];
+
+ if (unlikely(size != sizeof(struct bpf_xfrm_state)))
+ goto err_clear;
+
+ to->reqid = x->props.reqid;
+ to->spi = x->id.spi;
+ to->family = x->props.family;
+ if (to->family == AF_INET6) {
+ memcpy(to->remote_ipv6, x->props.saddr.a6,
+ sizeof(to->remote_ipv6));
+ } else {
+ to->remote_ipv4 = x->props.saddr.a4;
+ }
+
+ return 0;
+err_clear:
+ memset(to, 0, size);
+ return -EINVAL;
+}
+
+static const struct bpf_func_proto bpf_skb_get_xfrm_state_proto = {
+ .func = bpf_skb_get_xfrm_state,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg4_type = ARG_CONST_SIZE,
+ .arg5_type = ARG_ANYTHING,
+};
+#endif
+
static const struct bpf_func_proto *
bpf_base_func_proto(enum bpf_func_id func_id)
{
@@ -3865,6 +3929,10 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_get_socket_cookie_proto;
case BPF_FUNC_get_socket_uid:
return &bpf_get_socket_uid_proto;
+#ifdef CONFIG_XFRM
+ case BPF_FUNC_skb_get_xfrm_state:
+ return &bpf_skb_get_xfrm_state_proto;
+#endif
default:
return bpf_base_func_proto(func_id);
}
@@ -3888,6 +3956,8 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_xdp_redirect_proto;
case BPF_FUNC_redirect_map:
return &bpf_xdp_redirect_map_proto;
+ case BPF_FUNC_xdp_adjust_tail:
+ return &bpf_xdp_adjust_tail_proto;
default:
return bpf_base_func_proto(func_id);
}
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 7b7a14abba28..5afae29367c1 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -55,7 +55,8 @@ static void neigh_timer_handler(struct timer_list *t);
static void __neigh_notify(struct neighbour *n, int type, int flags,
u32 pid);
static void neigh_update_notify(struct neighbour *neigh, u32 nlmsg_pid);
-static int pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev);
+static int pneigh_ifdown_and_unlock(struct neigh_table *tbl,
+ struct net_device *dev);
#ifdef CONFIG_PROC_FS
static const struct file_operations neigh_stat_seq_fops;
@@ -291,8 +292,7 @@ int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev)
{
write_lock_bh(&tbl->lock);
neigh_flush_dev(tbl, dev);
- pneigh_ifdown(tbl, dev);
- write_unlock_bh(&tbl->lock);
+ pneigh_ifdown_and_unlock(tbl, dev);
del_timer_sync(&tbl->proxy_timer);
pneigh_queue_purge(&tbl->proxy_queue);
@@ -681,9 +681,10 @@ int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *pkey,
return -ENOENT;
}
-static int pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev)
+static int pneigh_ifdown_and_unlock(struct neigh_table *tbl,
+ struct net_device *dev)
{
- struct pneigh_entry *n, **np;
+ struct pneigh_entry *n, **np, *freelist = NULL;
u32 h;
for (h = 0; h <= PNEIGH_HASHMASK; h++) {
@@ -691,16 +692,23 @@ static int pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev)
while ((n = *np) != NULL) {
if (!dev || n->dev == dev) {
*np = n->next;
- if (tbl->pdestructor)
- tbl->pdestructor(n);
- if (n->dev)
- dev_put(n->dev);
- kfree(n);
+ n->next = freelist;
+ freelist = n;
continue;
}
np = &n->next;
}
}
+ write_unlock_bh(&tbl->lock);
+ while ((n = freelist)) {
+ freelist = n->next;
+ n->next = NULL;
+ if (tbl->pdestructor)
+ tbl->pdestructor(n);
+ if (n->dev)
+ dev_put(n->dev);
+ kfree(n);
+ }
return -ENOENT;
}
@@ -812,7 +820,8 @@ static void neigh_periodic_work(struct work_struct *work)
write_lock(&n->lock);
state = n->nud_state;
- if (state & (NUD_PERMANENT | NUD_IN_TIMER)) {
+ if ((state & (NUD_PERMANENT | NUD_IN_TIMER)) ||
+ (n->flags & NTF_EXT_LEARNED)) {
write_unlock(&n->lock);
goto next_elt;
}
@@ -1128,6 +1137,8 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
if (neigh->dead)
goto out;
+ neigh_update_ext_learned(neigh, flags, &notify);
+
if (!(new & NUD_VALID)) {
neigh_del_timer(neigh);
if (old & NUD_CONNECTED)
@@ -1773,6 +1784,9 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh,
flags &= ~NEIGH_UPDATE_F_OVERRIDE;
}
+ if (ndm->ndm_flags & NTF_EXT_LEARNED)
+ flags |= NEIGH_UPDATE_F_EXT_LEARNED;
+
if (ndm->ndm_flags & NTF_USE) {
neigh_event_send(neigh, NULL);
err = 0;
@@ -2323,12 +2337,16 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
err = nlmsg_parse(nlh, sizeof(struct ndmsg), tb, NDA_MAX, NULL, NULL);
if (!err) {
- if (tb[NDA_IFINDEX])
+ if (tb[NDA_IFINDEX]) {
+ if (nla_len(tb[NDA_IFINDEX]) != sizeof(u32))
+ return -EINVAL;
filter_idx = nla_get_u32(tb[NDA_IFINDEX]);
-
- if (tb[NDA_MASTER])
+ }
+ if (tb[NDA_MASTER]) {
+ if (nla_len(tb[NDA_MASTER]) != sizeof(u32))
+ return -EINVAL;
filter_master_idx = nla_get_u32(tb[NDA_MASTER]);
-
+ }
if (filter_idx || filter_master_idx)
flags |= NLM_F_DUMP_FILTERED;
}
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 345b51837ca8..c642304f178c 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1305,7 +1305,7 @@ static void skb_headers_offset_update(struct sk_buff *skb, int off)
skb->inner_mac_header += off;
}
-static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
+void skb_copy_header(struct sk_buff *new, const struct sk_buff *old)
{
__copy_skb_header(new, old);
@@ -1313,6 +1313,7 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs;
skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type;
}
+EXPORT_SYMBOL(skb_copy_header);
static inline int skb_alloc_rx_flag(const struct sk_buff *skb)
{
@@ -1355,7 +1356,7 @@ struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
BUG_ON(skb_copy_bits(skb, -headerlen, n->head, headerlen + skb->len));
- copy_skb_header(n, skb);
+ skb_copy_header(n, skb);
return n;
}
EXPORT_SYMBOL(skb_copy);
@@ -1419,7 +1420,7 @@ struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom,
skb_clone_fraglist(n);
}
- copy_skb_header(n, skb);
+ skb_copy_header(n, skb);
out:
return n;
}
@@ -1599,7 +1600,7 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
BUG_ON(skb_copy_bits(skb, -head_copy_len, n->head + head_copy_off,
skb->len + head_copy_len));
- copy_skb_header(n, skb);
+ skb_copy_header(n, skb);
skb_headers_offset_update(n, newheadroom - oldheadroom);
@@ -1839,6 +1840,20 @@ done:
}
EXPORT_SYMBOL(___pskb_trim);
+/* Note : use pskb_trim_rcsum() instead of calling this directly
+ */
+int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len)
+{
+ if (skb->ip_summed == CHECKSUM_COMPLETE) {
+ int delta = skb->len - len;
+
+ skb->csum = csum_sub(skb->csum,
+ skb_checksum(skb, len, delta, 0));
+ }
+ return __pskb_trim(skb, len);
+}
+EXPORT_SYMBOL(pskb_trim_rcsum_slow);
+
/**
* __pskb_pull_tail - advance tail of skb header
* @skb: buffer to reallocate
@@ -4926,6 +4941,8 @@ static unsigned int skb_gso_transport_seglen(const struct sk_buff *skb)
thlen = tcp_hdrlen(skb);
} else if (unlikely(skb_is_gso_sctp(skb))) {
thlen = sizeof(struct sctphdr);
+ } else if (shinfo->gso_type & SKB_GSO_UDP_L4) {
+ thlen = sizeof(struct udphdr);
}
/* UFO sets gso_size to the size of the fragmentation
* payload, i.e. the size of the L4 (UDP) header is already
diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c
index 92d016e87816..385f153fe031 100644
--- a/net/dccp/ccids/ccid2.c
+++ b/net/dccp/ccids/ccid2.c
@@ -126,6 +126,16 @@ static void ccid2_change_l_seq_window(struct sock *sk, u64 val)
DCCPF_SEQ_WMAX));
}
+static void dccp_tasklet_schedule(struct sock *sk)
+{
+ struct tasklet_struct *t = &dccp_sk(sk)->dccps_xmitlet;
+
+ if (!test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) {
+ sock_hold(sk);
+ __tasklet_schedule(t);
+ }
+}
+
static void ccid2_hc_tx_rto_expire(struct timer_list *t)
{
struct ccid2_hc_tx_sock *hc = from_timer(hc, t, tx_rtotimer);
@@ -166,7 +176,7 @@ static void ccid2_hc_tx_rto_expire(struct timer_list *t)
/* if we were blocked before, we may now send cwnd=1 packet */
if (sender_was_blocked)
- tasklet_schedule(&dccp_sk(sk)->dccps_xmitlet);
+ dccp_tasklet_schedule(sk);
/* restart backed-off timer */
sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto);
out:
@@ -706,7 +716,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
done:
/* check if incoming Acks allow pending packets to be sent */
if (sender_was_blocked && !ccid2_cwnd_network_limited(hc))
- tasklet_schedule(&dccp_sk(sk)->dccps_xmitlet);
+ dccp_tasklet_schedule(sk);
dccp_ackvec_parsed_cleanup(&hc->tx_av_chunks);
}
diff --git a/net/dccp/timer.c b/net/dccp/timer.c
index b50a8732ff43..1501a20a94ca 100644
--- a/net/dccp/timer.c
+++ b/net/dccp/timer.c
@@ -232,6 +232,7 @@ static void dccp_write_xmitlet(unsigned long data)
else
dccp_write_xmit(sk);
bh_unlock_sock(sk);
+ sock_put(sk);
}
static void dccp_write_xmit_timer(struct timer_list *t)
@@ -240,7 +241,6 @@ static void dccp_write_xmit_timer(struct timer_list *t)
struct sock *sk = &dp->dccps_inet_connection.icsk_inet.sk;
dccp_write_xmitlet((unsigned long)sk);
- sock_put(sk);
}
void dccp_init_xmit_timers(struct sock *sk)
diff --git a/net/decnet/dn_rules.c b/net/decnet/dn_rules.c
index c795c3f509c9..72236695db3d 100644
--- a/net/decnet/dn_rules.c
+++ b/net/decnet/dn_rules.c
@@ -121,13 +121,16 @@ static int dn_fib_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)
static int dn_fib_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
struct fib_rule_hdr *frh,
- struct nlattr **tb)
+ struct nlattr **tb,
+ struct netlink_ext_ack *extack)
{
int err = -EINVAL;
struct dn_fib_rule *r = (struct dn_fib_rule *)rule;
- if (frh->tos)
+ if (frh->tos) {
+ NL_SET_ERR_MSG(extack, "Invalid tos value");
goto errout;
+ }
if (rule->table == RT_TABLE_UNSPEC) {
if (rule->action == FR_ACT_TO_TBL) {
diff --git a/net/dsa/master.c b/net/dsa/master.c
index 90e6df0351eb..c90ee3227dea 100644
--- a/net/dsa/master.c
+++ b/net/dsa/master.c
@@ -22,7 +22,7 @@ static void dsa_master_get_ethtool_stats(struct net_device *dev,
int port = cpu_dp->index;
int count = 0;
- if (ops && ops->get_sset_count && ops->get_ethtool_stats) {
+ if (ops->get_sset_count && ops->get_ethtool_stats) {
count = ops->get_sset_count(dev, ETH_SS_STATS);
ops->get_ethtool_stats(dev, stats, data);
}
@@ -31,6 +31,32 @@ static void dsa_master_get_ethtool_stats(struct net_device *dev,
ds->ops->get_ethtool_stats(ds, port, data + count);
}
+static void dsa_master_get_ethtool_phy_stats(struct net_device *dev,
+ struct ethtool_stats *stats,
+ uint64_t *data)
+{
+ struct dsa_port *cpu_dp = dev->dsa_ptr;
+ const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops;
+ struct dsa_switch *ds = cpu_dp->ds;
+ int port = cpu_dp->index;
+ int count = 0;
+
+ if (dev->phydev && !ops->get_ethtool_phy_stats) {
+ count = phy_ethtool_get_sset_count(dev->phydev);
+ if (count >= 0)
+ phy_ethtool_get_stats(dev->phydev, stats, data);
+ } else if (ops->get_sset_count && ops->get_ethtool_phy_stats) {
+ count = ops->get_sset_count(dev, ETH_SS_PHY_STATS);
+ ops->get_ethtool_phy_stats(dev, stats, data);
+ }
+
+ if (count < 0)
+ count = 0;
+
+ if (ds->ops->get_ethtool_phy_stats)
+ ds->ops->get_ethtool_phy_stats(ds, port, data + count);
+}
+
static int dsa_master_get_sset_count(struct net_device *dev, int sset)
{
struct dsa_port *cpu_dp = dev->dsa_ptr;
@@ -38,11 +64,17 @@ static int dsa_master_get_sset_count(struct net_device *dev, int sset)
struct dsa_switch *ds = cpu_dp->ds;
int count = 0;
- if (ops && ops->get_sset_count)
- count += ops->get_sset_count(dev, sset);
+ if (sset == ETH_SS_PHY_STATS && dev->phydev &&
+ !ops->get_ethtool_phy_stats)
+ count = phy_ethtool_get_sset_count(dev->phydev);
+ else if (ops->get_sset_count)
+ count = ops->get_sset_count(dev, sset);
+
+ if (count < 0)
+ count = 0;
- if (sset == ETH_SS_STATS && ds->ops->get_sset_count)
- count += ds->ops->get_sset_count(ds, cpu_dp->index);
+ if (ds->ops->get_sset_count)
+ count += ds->ops->get_sset_count(ds, cpu_dp->index, sset);
return count;
}
@@ -64,19 +96,28 @@ static void dsa_master_get_strings(struct net_device *dev, uint32_t stringset,
/* We do not want to be NULL-terminated, since this is a prefix */
pfx[sizeof(pfx) - 1] = '_';
- if (ops && ops->get_sset_count && ops->get_strings) {
- mcount = ops->get_sset_count(dev, ETH_SS_STATS);
+ if (stringset == ETH_SS_PHY_STATS && dev->phydev &&
+ !ops->get_ethtool_phy_stats) {
+ mcount = phy_ethtool_get_sset_count(dev->phydev);
+ if (mcount < 0)
+ mcount = 0;
+ else
+ phy_ethtool_get_strings(dev->phydev, data);
+ } else if (ops->get_sset_count && ops->get_strings) {
+ mcount = ops->get_sset_count(dev, stringset);
+ if (mcount < 0)
+ mcount = 0;
ops->get_strings(dev, stringset, data);
}
- if (stringset == ETH_SS_STATS && ds->ops->get_strings) {
+ if (ds->ops->get_strings) {
ndata = data + mcount * len;
/* This function copies ETH_GSTRINGS_LEN bytes, we will mangle
* the output after to prepend our CPU port prefix we
* constructed earlier
*/
- ds->ops->get_strings(ds, port, ndata);
- count = ds->ops->get_sset_count(ds, port);
+ ds->ops->get_strings(ds, port, stringset, ndata);
+ count = ds->ops->get_sset_count(ds, port, stringset);
for (i = 0; i < count; i++) {
memmove(ndata + (i * len + sizeof(pfx)),
ndata + i * len, len - sizeof(pfx));
@@ -102,6 +143,7 @@ static int dsa_master_ethtool_setup(struct net_device *dev)
ops->get_sset_count = dsa_master_get_sset_count;
ops->get_ethtool_stats = dsa_master_get_ethtool_stats;
ops->get_strings = dsa_master_get_strings;
+ ops->get_ethtool_phy_stats = dsa_master_get_ethtool_phy_stats;
dev->ethtool_ops = ops;
diff --git a/net/dsa/port.c b/net/dsa/port.c
index 7acc1169d75e..2413beb995be 100644
--- a/net/dsa/port.c
+++ b/net/dsa/port.c
@@ -273,25 +273,38 @@ int dsa_port_vlan_del(struct dsa_port *dp,
return 0;
}
-static int dsa_port_setup_phy_of(struct dsa_port *dp, bool enable)
+static struct phy_device *dsa_port_get_phy_device(struct dsa_port *dp)
{
- struct device_node *port_dn = dp->dn;
struct device_node *phy_dn;
- struct dsa_switch *ds = dp->ds;
struct phy_device *phydev;
- int port = dp->index;
- int err = 0;
- phy_dn = of_parse_phandle(port_dn, "phy-handle", 0);
+ phy_dn = of_parse_phandle(dp->dn, "phy-handle", 0);
if (!phy_dn)
- return 0;
+ return NULL;
phydev = of_phy_find_device(phy_dn);
if (!phydev) {
- err = -EPROBE_DEFER;
- goto err_put_of;
+ of_node_put(phy_dn);
+ return ERR_PTR(-EPROBE_DEFER);
}
+ return phydev;
+}
+
+static int dsa_port_setup_phy_of(struct dsa_port *dp, bool enable)
+{
+ struct dsa_switch *ds = dp->ds;
+ struct phy_device *phydev;
+ int port = dp->index;
+ int err = 0;
+
+ phydev = dsa_port_get_phy_device(dp);
+ if (!phydev)
+ return 0;
+
+ if (IS_ERR(phydev))
+ return PTR_ERR(phydev);
+
if (enable) {
err = genphy_config_init(phydev);
if (err < 0)
@@ -317,8 +330,6 @@ static int dsa_port_setup_phy_of(struct dsa_port *dp, bool enable)
err_put_dev:
put_device(&phydev->mdio.dev);
-err_put_of:
- of_node_put(phy_dn);
return err;
}
@@ -372,3 +383,60 @@ void dsa_port_link_unregister_of(struct dsa_port *dp)
else
dsa_port_setup_phy_of(dp, false);
}
+
+int dsa_port_get_phy_strings(struct dsa_port *dp, uint8_t *data)
+{
+ struct phy_device *phydev;
+ int ret = -EOPNOTSUPP;
+
+ if (of_phy_is_fixed_link(dp->dn))
+ return ret;
+
+ phydev = dsa_port_get_phy_device(dp);
+ if (IS_ERR_OR_NULL(phydev))
+ return ret;
+
+ ret = phy_ethtool_get_strings(phydev, data);
+ put_device(&phydev->mdio.dev);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(dsa_port_get_phy_strings);
+
+int dsa_port_get_ethtool_phy_stats(struct dsa_port *dp, uint64_t *data)
+{
+ struct phy_device *phydev;
+ int ret = -EOPNOTSUPP;
+
+ if (of_phy_is_fixed_link(dp->dn))
+ return ret;
+
+ phydev = dsa_port_get_phy_device(dp);
+ if (IS_ERR_OR_NULL(phydev))
+ return ret;
+
+ ret = phy_ethtool_get_stats(phydev, NULL, data);
+ put_device(&phydev->mdio.dev);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(dsa_port_get_ethtool_phy_stats);
+
+int dsa_port_get_phy_sset_count(struct dsa_port *dp)
+{
+ struct phy_device *phydev;
+ int ret = -EOPNOTSUPP;
+
+ if (of_phy_is_fixed_link(dp->dn))
+ return ret;
+
+ phydev = dsa_port_get_phy_device(dp);
+ if (IS_ERR_OR_NULL(phydev))
+ return ret;
+
+ ret = phy_ethtool_get_sset_count(phydev);
+ put_device(&phydev->mdio.dev);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(dsa_port_get_phy_sset_count);
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 18561af7a8f1..c287f1ef964c 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -560,7 +560,8 @@ static void dsa_slave_get_strings(struct net_device *dev,
strncpy(data + 2 * len, "rx_packets", len);
strncpy(data + 3 * len, "rx_bytes", len);
if (ds->ops->get_strings)
- ds->ops->get_strings(ds, dp->index, data + 4 * len);
+ ds->ops->get_strings(ds, dp->index, stringset,
+ data + 4 * len);
}
}
@@ -605,7 +606,7 @@ static int dsa_slave_get_sset_count(struct net_device *dev, int sset)
count = 4;
if (ds->ops->get_sset_count)
- count += ds->ops->get_sset_count(ds, dp->index);
+ count += ds->ops->get_sset_count(ds, dp->index, sset);
return count;
}
@@ -1440,6 +1441,7 @@ static int dsa_slave_switchdev_event(struct notifier_block *unused,
unsigned long event, void *ptr)
{
struct net_device *dev = switchdev_notifier_info_to_dev(ptr);
+ struct switchdev_notifier_fdb_info *fdb_info = ptr;
struct dsa_switchdev_event_work *switchdev_work;
if (!dsa_slave_dev_check(dev))
@@ -1457,8 +1459,10 @@ static int dsa_slave_switchdev_event(struct notifier_block *unused,
switch (event) {
case SWITCHDEV_FDB_ADD_TO_DEVICE: /* fall through */
case SWITCHDEV_FDB_DEL_TO_DEVICE:
+ if (!fdb_info->added_by_user)
+ break;
if (dsa_slave_switchdev_fdb_work_init(switchdev_work,
- ptr))
+ fdb_info))
goto err_fdb_work_init;
dev_hold(dev);
break;
diff --git a/net/ife/ife.c b/net/ife/ife.c
index 7d1ec76e7f43..13bbf8cb6a39 100644
--- a/net/ife/ife.c
+++ b/net/ife/ife.c
@@ -69,6 +69,9 @@ void *ife_decode(struct sk_buff *skb, u16 *metalen)
int total_pull;
u16 ifehdrln;
+ if (!pskb_may_pull(skb, skb->dev->hard_header_len + IFE_METAHDRLEN))
+ return NULL;
+
ifehdr = (struct ifeheadr *) (skb->data + skb->dev->hard_header_len);
ifehdrln = ntohs(ifehdr->metalen);
total_pull = skb->dev->hard_header_len + ifehdrln;
@@ -92,12 +95,43 @@ struct meta_tlvhdr {
__be16 len;
};
+static bool __ife_tlv_meta_valid(const unsigned char *skbdata,
+ const unsigned char *ifehdr_end)
+{
+ const struct meta_tlvhdr *tlv;
+ u16 tlvlen;
+
+ if (unlikely(skbdata + sizeof(*tlv) > ifehdr_end))
+ return false;
+
+ tlv = (const struct meta_tlvhdr *)skbdata;
+ tlvlen = ntohs(tlv->len);
+
+ /* tlv length field is inc header, check on minimum */
+ if (tlvlen < NLA_HDRLEN)
+ return false;
+
+ /* overflow by NLA_ALIGN check */
+ if (NLA_ALIGN(tlvlen) < tlvlen)
+ return false;
+
+ if (unlikely(skbdata + NLA_ALIGN(tlvlen) > ifehdr_end))
+ return false;
+
+ return true;
+}
+
/* Caller takes care of presenting data in network order
*/
-void *ife_tlv_meta_decode(void *skbdata, u16 *attrtype, u16 *dlen, u16 *totlen)
+void *ife_tlv_meta_decode(void *skbdata, const void *ifehdr_end, u16 *attrtype,
+ u16 *dlen, u16 *totlen)
{
- struct meta_tlvhdr *tlv = (struct meta_tlvhdr *) skbdata;
+ struct meta_tlvhdr *tlv;
+
+ if (!__ife_tlv_meta_valid(skbdata, ifehdr_end))
+ return NULL;
+ tlv = (struct meta_tlvhdr *)skbdata;
*dlen = ntohs(tlv->len) - NLA_HDRLEN;
*attrtype = ntohs(tlv->type);
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 3ebf599cebae..b403499fdabe 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -994,7 +994,9 @@ const struct proto_ops inet_stream_ops = {
.getsockopt = sock_common_getsockopt,
.sendmsg = inet_sendmsg,
.recvmsg = inet_recvmsg,
+#ifdef CONFIG_MMU
.mmap = tcp_mmap,
+#endif
.sendpage = inet_sendpage,
.splice_read = tcp_splice_read,
.read_sock = tcp_read_sock,
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 737d11bc8838..f8eb78d042a4 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -213,14 +213,17 @@ static const struct nla_policy fib4_rule_policy[FRA_MAX+1] = {
static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
struct fib_rule_hdr *frh,
- struct nlattr **tb)
+ struct nlattr **tb,
+ struct netlink_ext_ack *extack)
{
struct net *net = sock_net(skb->sk);
int err = -EINVAL;
struct fib4_rule *rule4 = (struct fib4_rule *) rule;
- if (frh->tos & ~IPTOS_TOS_MASK)
+ if (frh->tos & ~IPTOS_TOS_MASK) {
+ NL_SET_ERR_MSG(extack, "Invalid tos");
goto errout;
+ }
/* split local/main if they are not already split */
err = fib_unmerge(net);
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 9c169bb2444d..dfe5b22f6ed4 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -578,6 +578,7 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev,
int tunnel_hlen;
int version;
__be16 df;
+ int nhoff;
tun_info = skb_tunnel_info(skb);
if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
@@ -605,6 +606,11 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev,
truncate = true;
}
+ nhoff = skb_network_header(skb) - skb_mac_header(skb);
+ if (skb->protocol == htons(ETH_P_IP) &&
+ (ntohs(ip_hdr(skb)->tot_len) > skb->len - nhoff))
+ truncate = true;
+
if (version == 1) {
erspan_build_header(skb, ntohl(tunnel_id_to_key32(key->tun_id)),
ntohl(md->u.index), truncate, true);
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 4c11b810a447..f2338e40c37d 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -878,11 +878,14 @@ static int __ip_append_data(struct sock *sk,
struct rtable *rt = (struct rtable *)cork->dst;
unsigned int wmem_alloc_delta = 0;
u32 tskey = 0;
+ bool paged;
skb = skb_peek_tail(queue);
exthdrlen = !skb ? rt->dst.header_len : 0;
- mtu = cork->fragsize;
+ mtu = cork->gso_size ? IP_MAX_MTU : cork->fragsize;
+ paged = !!cork->gso_size;
+
if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
tskey = sk->sk_tskey++;
@@ -906,7 +909,7 @@ static int __ip_append_data(struct sock *sk,
if (transhdrlen &&
length + fragheaderlen <= mtu &&
rt->dst.dev->features & (NETIF_F_HW_CSUM | NETIF_F_IP_CSUM) &&
- !(flags & MSG_MORE) &&
+ (!(flags & MSG_MORE) || cork->gso_size) &&
!exthdrlen)
csummode = CHECKSUM_PARTIAL;
@@ -933,6 +936,7 @@ static int __ip_append_data(struct sock *sk,
unsigned int fraglen;
unsigned int fraggap;
unsigned int alloclen;
+ unsigned int pagedlen = 0;
struct sk_buff *skb_prev;
alloc_new_skb:
skb_prev = skb;
@@ -953,8 +957,12 @@ alloc_new_skb:
if ((flags & MSG_MORE) &&
!(rt->dst.dev->features&NETIF_F_SG))
alloclen = mtu;
- else
+ else if (!paged)
alloclen = fraglen;
+ else {
+ alloclen = min_t(int, fraglen, MAX_HEADER);
+ pagedlen = fraglen - alloclen;
+ }
alloclen += exthdrlen;
@@ -998,7 +1006,7 @@ alloc_new_skb:
/*
* Find where to start putting bytes.
*/
- data = skb_put(skb, fraglen + exthdrlen);
+ data = skb_put(skb, fraglen + exthdrlen - pagedlen);
skb_set_network_header(skb, exthdrlen);
skb->transport_header = (skb->network_header +
fragheaderlen);
@@ -1014,7 +1022,7 @@ alloc_new_skb:
pskb_trim_unique(skb_prev, maxfraglen);
}
- copy = datalen - transhdrlen - fraggap;
+ copy = datalen - transhdrlen - fraggap - pagedlen;
if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
err = -EFAULT;
kfree_skb(skb);
@@ -1022,7 +1030,7 @@ alloc_new_skb:
}
offset += copy;
- length -= datalen - fraggap;
+ length -= copy + transhdrlen;
transhdrlen = 0;
exthdrlen = 0;
csummode = CHECKSUM_NONE;
@@ -1109,6 +1117,10 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
struct ip_options_rcu *opt;
struct rtable *rt;
+ rt = *rtp;
+ if (unlikely(!rt))
+ return -EFAULT;
+
/*
* setup for corking.
*/
@@ -1124,15 +1136,15 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
cork->flags |= IPCORK_OPT;
cork->addr = ipc->addr;
}
- rt = *rtp;
- if (unlikely(!rt))
- return -EFAULT;
+
/*
* We steal reference to this route, caller should not release it
*/
*rtp = NULL;
cork->fragsize = ip_sk_use_pmtu(sk) ?
dst_mtu(&rt->dst) : rt->dst.dev->mtu;
+
+ cork->gso_size = sk->sk_type == SOCK_DGRAM ? ipc->gso_size : 0;
cork->dst = &rt->dst;
cork->length = 0;
cork->ttl = ipc->ttl;
@@ -1212,7 +1224,7 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
return -EOPNOTSUPP;
hh_len = LL_RESERVED_SPACE(rt->dst.dev);
- mtu = cork->fragsize;
+ mtu = cork->gso_size ? IP_MAX_MTU : cork->fragsize;
fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
@@ -1468,9 +1480,8 @@ struct sk_buff *ip_make_skb(struct sock *sk,
int len, int odd, struct sk_buff *skb),
void *from, int length, int transhdrlen,
struct ipcm_cookie *ipc, struct rtable **rtp,
- unsigned int flags)
+ struct inet_cork *cork, unsigned int flags)
{
- struct inet_cork cork;
struct sk_buff_head queue;
int err;
@@ -1479,22 +1490,22 @@ struct sk_buff *ip_make_skb(struct sock *sk,
__skb_queue_head_init(&queue);
- cork.flags = 0;
- cork.addr = 0;
- cork.opt = NULL;
- err = ip_setup_cork(sk, &cork, ipc, rtp);
+ cork->flags = 0;
+ cork->addr = 0;
+ cork->opt = NULL;
+ err = ip_setup_cork(sk, cork, ipc, rtp);
if (err)
return ERR_PTR(err);
- err = __ip_append_data(sk, fl4, &queue, &cork,
+ err = __ip_append_data(sk, fl4, &queue, cork,
&current->task_frag, getfrag,
from, length, transhdrlen, flags);
if (err) {
- __ip_flush_pending_frames(sk, &queue, &cork);
+ __ip_flush_pending_frames(sk, &queue, cork);
return ERR_PTR(err);
}
- return __ip_make_skb(sk, fl4, &queue, &cork);
+ return __ip_make_skb(sk, fl4, &queue, cork);
}
/*
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index 43f620feb1c4..d839d74853fc 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -28,6 +28,9 @@
*
* Multiple Nameservers in /proc/net/pnp
* -- Josef Siemes <jsiemes@web.de>, Aug 2002
+ *
+ * NTP servers in /proc/net/ipconfig/ntp_servers
+ * -- Chris Novakovic <chris@chrisn.me.uk>, April 2018
*/
#include <linux/types.h>
@@ -93,6 +96,7 @@
#define CONF_TIMEOUT_MAX (HZ*30) /* Maximum allowed timeout */
#define CONF_NAMESERVERS_MAX 3 /* Maximum number of nameservers
- '3' from resolv.h */
+#define CONF_NTP_SERVERS_MAX 3 /* Maximum number of NTP servers */
#define NONE cpu_to_be32(INADDR_NONE)
#define ANY cpu_to_be32(INADDR_ANY)
@@ -152,12 +156,16 @@ static int ic_proto_used; /* Protocol used, if any */
#define ic_proto_used 0
#endif
static __be32 ic_nameservers[CONF_NAMESERVERS_MAX]; /* DNS Server IP addresses */
+static __be32 ic_ntp_servers[CONF_NTP_SERVERS_MAX]; /* NTP server IP addresses */
static u8 ic_domain[64]; /* DNS (not NIS) domain name */
/*
* Private state.
*/
+/* proc_dir_entry for /proc/net/ipconfig */
+static struct proc_dir_entry *ipconfig_dir;
+
/* Name of user-selected boot device */
static char user_dev_name[IFNAMSIZ] __initdata = { 0, };
@@ -576,6 +584,15 @@ static inline void __init ic_nameservers_predef(void)
ic_nameservers[i] = NONE;
}
+/* Predefine NTP servers */
+static inline void __init ic_ntp_servers_predef(void)
+{
+ int i;
+
+ for (i = 0; i < CONF_NTP_SERVERS_MAX; i++)
+ ic_ntp_servers[i] = NONE;
+}
+
/*
* DHCP/BOOTP support.
*/
@@ -671,6 +688,7 @@ ic_dhcp_init_options(u8 *options, struct ic_device *d)
17, /* Boot path */
26, /* MTU */
40, /* NIS domain name */
+ 42, /* NTP servers */
};
*e++ = 55; /* Parameter request list */
@@ -721,9 +739,11 @@ static void __init ic_bootp_init_ext(u8 *e)
*e++ = 3; /* Default gateway request */
*e++ = 4;
e += 4;
- *e++ = 5; /* Name server request */
- *e++ = 8;
- e += 8;
+#if CONF_NAMESERVERS_MAX > 0
+ *e++ = 6; /* (DNS) name server request */
+ *e++ = 4 * CONF_NAMESERVERS_MAX;
+ e += 4 * CONF_NAMESERVERS_MAX;
+#endif
*e++ = 12; /* Host name request */
*e++ = 32;
e += 32;
@@ -748,7 +768,13 @@ static void __init ic_bootp_init_ext(u8 *e)
*/
static inline void __init ic_bootp_init(void)
{
+ /* Re-initialise all name servers and NTP servers to NONE, in case any
+ * were set via the "ip=" or "nfsaddrs=" kernel command line parameters:
+ * any IP addresses specified there will already have been decoded but
+ * are no longer needed
+ */
ic_nameservers_predef();
+ ic_ntp_servers_predef();
dev_add_pack(&bootp_packet_type);
}
@@ -912,6 +938,15 @@ static void __init ic_do_bootp_ext(u8 *ext)
ic_bootp_string(utsname()->domainname, ext+1, *ext,
__NEW_UTS_LEN);
break;
+ case 42: /* NTP servers */
+ servers = *ext / 4;
+ if (servers > CONF_NTP_SERVERS_MAX)
+ servers = CONF_NTP_SERVERS_MAX;
+ for (i = 0; i < servers; i++) {
+ if (ic_ntp_servers[i] == NONE)
+ memcpy(&ic_ntp_servers[i], ext+1+4*i, 4);
+ }
+ break;
}
}
@@ -1258,6 +1293,7 @@ static int __init ic_dynamic(void)
#ifdef CONFIG_PROC_FS
+/* Name servers: */
static int pnp_seq_show(struct seq_file *seq, void *v)
{
int i;
@@ -1294,6 +1330,62 @@ static const struct file_operations pnp_seq_fops = {
.llseek = seq_lseek,
.release = single_release,
};
+
+/* Create the /proc/net/ipconfig directory */
+static int __init ipconfig_proc_net_init(void)
+{
+ ipconfig_dir = proc_net_mkdir(&init_net, "ipconfig", init_net.proc_net);
+ if (!ipconfig_dir)
+ return -ENOMEM;
+
+ return 0;
+}
+
+/* Create a new file under /proc/net/ipconfig */
+static int ipconfig_proc_net_create(const char *name,
+ const struct file_operations *fops)
+{
+ char *pname;
+ struct proc_dir_entry *p;
+
+ if (!ipconfig_dir)
+ return -ENOMEM;
+
+ pname = kasprintf(GFP_KERNEL, "%s%s", "ipconfig/", name);
+ if (!pname)
+ return -ENOMEM;
+
+ p = proc_create(pname, 0444, init_net.proc_net, fops);
+ kfree(pname);
+ if (!p)
+ return -ENOMEM;
+
+ return 0;
+}
+
+/* Write NTP server IP addresses to /proc/net/ipconfig/ntp_servers */
+static int ntp_servers_seq_show(struct seq_file *seq, void *v)
+{
+ int i;
+
+ for (i = 0; i < CONF_NTP_SERVERS_MAX; i++) {
+ if (ic_ntp_servers[i] != NONE)
+ seq_printf(seq, "%pI4\n", &ic_ntp_servers[i]);
+ }
+ return 0;
+}
+
+static int ntp_servers_seq_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, ntp_servers_seq_show, NULL);
+}
+
+static const struct file_operations ntp_servers_seq_fops = {
+ .open = ntp_servers_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
#endif /* CONFIG_PROC_FS */
/*
@@ -1368,8 +1460,20 @@ static int __init ip_auto_config(void)
int err;
unsigned int i;
+ /* Initialise all name servers and NTP servers to NONE (but only if the
+ * "ip=" or "nfsaddrs=" kernel command line parameters weren't decoded,
+ * otherwise we'll overwrite the IP addresses specified there)
+ */
+ if (ic_set_manually == 0) {
+ ic_nameservers_predef();
+ ic_ntp_servers_predef();
+ }
+
#ifdef CONFIG_PROC_FS
proc_create("pnp", 0444, init_net.proc_net, &pnp_seq_fops);
+
+ if (ipconfig_proc_net_init() == 0)
+ ipconfig_proc_net_create("ntp_servers", &ntp_servers_seq_fops);
#endif /* CONFIG_PROC_FS */
if (!ic_enable)
@@ -1481,16 +1585,32 @@ static int __init ip_auto_config(void)
&ic_servaddr, &root_server_addr, root_server_path);
if (ic_dev_mtu)
pr_cont(", mtu=%d", ic_dev_mtu);
- for (i = 0; i < CONF_NAMESERVERS_MAX; i++)
+ /* Name servers (if any): */
+ for (i = 0; i < CONF_NAMESERVERS_MAX; i++) {
if (ic_nameservers[i] != NONE) {
- pr_cont(" nameserver%u=%pI4",
- i, &ic_nameservers[i]);
- break;
+ if (i == 0)
+ pr_info(" nameserver%u=%pI4",
+ i, &ic_nameservers[i]);
+ else
+ pr_cont(", nameserver%u=%pI4",
+ i, &ic_nameservers[i]);
}
- for (i++; i < CONF_NAMESERVERS_MAX; i++)
- if (ic_nameservers[i] != NONE)
- pr_cont(", nameserver%u=%pI4", i, &ic_nameservers[i]);
- pr_cont("\n");
+ if (i + 1 == CONF_NAMESERVERS_MAX)
+ pr_cont("\n");
+ }
+ /* NTP servers (if any): */
+ for (i = 0; i < CONF_NTP_SERVERS_MAX; i++) {
+ if (ic_ntp_servers[i] != NONE) {
+ if (i == 0)
+ pr_info(" ntpserver%u=%pI4",
+ i, &ic_ntp_servers[i]);
+ else
+ pr_cont(", ntpserver%u=%pI4",
+ i, &ic_ntp_servers[i]);
+ }
+ if (i + 1 == CONF_NTP_SERVERS_MAX)
+ pr_cont("\n");
+ }
#endif /* !SILENT */
/*
@@ -1588,7 +1708,9 @@ static int __init ip_auto_config_setup(char *addrs)
return 1;
}
+ /* Initialise all name servers and NTP servers to NONE */
ic_nameservers_predef();
+ ic_ntp_servers_predef();
/* Parse string for static IP assignment. */
ip = addrs;
@@ -1647,6 +1769,13 @@ static int __init ip_auto_config_setup(char *addrs)
ic_nameservers[1] = NONE;
}
break;
+ case 9:
+ if (CONF_NTP_SERVERS_MAX >= 1) {
+ ic_ntp_servers[0] = in_aton(ip);
+ if (ic_ntp_servers[0] == ANY)
+ ic_ntp_servers[0] = NONE;
+ }
+ break;
}
}
ip = cp;
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 2fb4de3f7f66..38e092eafc97 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -201,7 +201,8 @@ static const struct nla_policy ipmr_rule_policy[FRA_MAX + 1] = {
};
static int ipmr_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
- struct fib_rule_hdr *frh, struct nlattr **tb)
+ struct fib_rule_hdr *frh, struct nlattr **tb,
+ struct netlink_ext_ack *extack)
{
return 0;
}
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 7523ddb2566b..0e5edd0c7926 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -28,10 +28,9 @@ obj-$(CONFIG_NF_REJECT_IPV4) += nf_reject_ipv4.o
obj-$(CONFIG_NF_NAT_H323) += nf_nat_h323.o
obj-$(CONFIG_NF_NAT_PPTP) += nf_nat_pptp.o
-nf_nat_snmp_basic-y := nf_nat_snmp_basic-asn1.o nf_nat_snmp_basic_main.o
-$(obj)/nf_nat_snmp_basic_main.o: $(obj)/nf_nat_snmp_basic-asn1.h
+nf_nat_snmp_basic-y := nf_nat_snmp_basic.asn1.o nf_nat_snmp_basic_main.o
+$(obj)/nf_nat_snmp_basic_main.o: $(obj)/nf_nat_snmp_basic.asn1.h
obj-$(CONFIG_NF_NAT_SNMP_BASIC) += nf_nat_snmp_basic.o
-clean-files := nf_nat_snmp_basic-asn1.c nf_nat_snmp_basic-asn1.h
obj-$(CONFIG_NF_NAT_MASQUERADE_IPV4) += nf_nat_masquerade_ipv4.o
diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic_main.c b/net/ipv4/netfilter/nf_nat_snmp_basic_main.c
index b6e277093e7e..ac110c1d55b5 100644
--- a/net/ipv4/netfilter/nf_nat_snmp_basic_main.c
+++ b/net/ipv4/netfilter/nf_nat_snmp_basic_main.c
@@ -54,7 +54,7 @@
#include <net/netfilter/nf_conntrack_expect.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <linux/netfilter/nf_conntrack_snmp.h>
-#include "nf_nat_snmp_basic-asn1.h"
+#include "nf_nat_snmp_basic.asn1.h"
MODULE_LICENSE("GPL");
MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>");
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index a058de677e94..261b71d0ccc5 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -296,6 +296,8 @@ static const struct snmp_mib snmp4_net_list[] = {
SNMP_MIB_ITEM("TCPKeepAlive", LINUX_MIB_TCPKEEPALIVE),
SNMP_MIB_ITEM("TCPMTUPFail", LINUX_MIB_TCPMTUPFAIL),
SNMP_MIB_ITEM("TCPMTUPSuccess", LINUX_MIB_TCPMTUPSUCCESS),
+ SNMP_MIB_ITEM("TCPDelivered", LINUX_MIB_TCPDELIVERED),
+ SNMP_MIB_ITEM("TCPDeliveredCE", LINUX_MIB_TCPDELIVEREDCE),
SNMP_MIB_SENTINEL
};
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index ccb25d80f679..1412a7baf0b9 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -709,7 +709,7 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
fnhe->fnhe_gw = gw;
fnhe->fnhe_pmtu = pmtu;
fnhe->fnhe_mtu_locked = lock;
- fnhe->fnhe_expires = expires;
+ fnhe->fnhe_expires = max(1UL, expires);
/* Exception created; mark the cached routes for the nexthop
* stale, so anyone caching it rechecks if this exception
@@ -1297,6 +1297,36 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst)
return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
}
+static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
+{
+ struct fnhe_hash_bucket *hash;
+ struct fib_nh_exception *fnhe, __rcu **fnhe_p;
+ u32 hval = fnhe_hashfun(daddr);
+
+ spin_lock_bh(&fnhe_lock);
+
+ hash = rcu_dereference_protected(nh->nh_exceptions,
+ lockdep_is_held(&fnhe_lock));
+ hash += hval;
+
+ fnhe_p = &hash->chain;
+ fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
+ while (fnhe) {
+ if (fnhe->fnhe_daddr == daddr) {
+ rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
+ fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
+ fnhe_flush_routes(fnhe);
+ kfree_rcu(fnhe, rcu);
+ break;
+ }
+ fnhe_p = &fnhe->fnhe_next;
+ fnhe = rcu_dereference_protected(fnhe->fnhe_next,
+ lockdep_is_held(&fnhe_lock));
+ }
+
+ spin_unlock_bh(&fnhe_lock);
+}
+
static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
{
struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
@@ -1310,8 +1340,14 @@ static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
fnhe = rcu_dereference(fnhe->fnhe_next)) {
- if (fnhe->fnhe_daddr == daddr)
+ if (fnhe->fnhe_daddr == daddr) {
+ if (fnhe->fnhe_expires &&
+ time_after(jiffies, fnhe->fnhe_expires)) {
+ ip_del_fnhe(nh, daddr);
+ break;
+ }
return fnhe;
+ }
}
return NULL;
}
@@ -1636,36 +1672,6 @@ static void ip_handle_martian_source(struct net_device *dev,
#endif
}
-static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
-{
- struct fnhe_hash_bucket *hash;
- struct fib_nh_exception *fnhe, __rcu **fnhe_p;
- u32 hval = fnhe_hashfun(daddr);
-
- spin_lock_bh(&fnhe_lock);
-
- hash = rcu_dereference_protected(nh->nh_exceptions,
- lockdep_is_held(&fnhe_lock));
- hash += hval;
-
- fnhe_p = &hash->chain;
- fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
- while (fnhe) {
- if (fnhe->fnhe_daddr == daddr) {
- rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
- fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
- fnhe_flush_routes(fnhe);
- kfree_rcu(fnhe, rcu);
- break;
- }
- fnhe_p = &fnhe->fnhe_next;
- fnhe = rcu_dereference_protected(fnhe->fnhe_next,
- lockdep_is_held(&fnhe_lock));
- }
-
- spin_unlock_bh(&fnhe_lock);
-}
-
/* called in rcu_read_lock() section */
static int __mkroute_input(struct sk_buff *skb,
const struct fib_result *res,
@@ -1719,20 +1725,10 @@ static int __mkroute_input(struct sk_buff *skb,
fnhe = find_exception(&FIB_RES_NH(*res), daddr);
if (do_cache) {
- if (fnhe) {
+ if (fnhe)
rth = rcu_dereference(fnhe->fnhe_rth_input);
- if (rth && rth->dst.expires &&
- time_after(jiffies, rth->dst.expires)) {
- ip_del_fnhe(&FIB_RES_NH(*res), daddr);
- fnhe = NULL;
- } else {
- goto rt_cache;
- }
- }
-
- rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
-
-rt_cache:
+ else
+ rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
if (rt_cache_valid(rth)) {
skb_dst_set_noref(skb, &rth->dst);
goto out;
@@ -2216,39 +2212,31 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
* the loopback interface and the IP_PKTINFO ipi_ifindex will
* be set to the loopback interface as well.
*/
- fi = NULL;
+ do_cache = false;
}
fnhe = NULL;
do_cache &= fi != NULL;
- if (do_cache) {
+ if (fi) {
struct rtable __rcu **prth;
struct fib_nh *nh = &FIB_RES_NH(*res);
fnhe = find_exception(nh, fl4->daddr);
+ if (!do_cache)
+ goto add;
if (fnhe) {
prth = &fnhe->fnhe_rth_output;
- rth = rcu_dereference(*prth);
- if (rth && rth->dst.expires &&
- time_after(jiffies, rth->dst.expires)) {
- ip_del_fnhe(nh, fl4->daddr);
- fnhe = NULL;
- } else {
- goto rt_cache;
+ } else {
+ if (unlikely(fl4->flowi4_flags &
+ FLOWI_FLAG_KNOWN_NH &&
+ !(nh->nh_gw &&
+ nh->nh_scope == RT_SCOPE_LINK))) {
+ do_cache = false;
+ goto add;
}
+ prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
}
-
- if (unlikely(fl4->flowi4_flags &
- FLOWI_FLAG_KNOWN_NH &&
- !(nh->nh_gw &&
- nh->nh_scope == RT_SCOPE_LINK))) {
- do_cache = false;
- goto add;
- }
- prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
rth = rcu_dereference(*prth);
-
-rt_cache:
if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
return rth;
}
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 438fbca96cd3..62b776f90037 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -697,7 +697,7 @@ static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb,
{
return skb->len < size_goal &&
sock_net(sk)->ipv4.sysctl_tcp_autocorking &&
- skb != tcp_write_queue_head(sk) &&
+ !tcp_rtx_queue_empty(sk) &&
refcount_read(&sk->sk_wmem_alloc) > skb->truesize;
}
@@ -1204,7 +1204,8 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
uarg->zerocopy = 0;
}
- if (unlikely(flags & MSG_FASTOPEN || inet_sk(sk)->defer_connect)) {
+ if (unlikely(flags & MSG_FASTOPEN || inet_sk(sk)->defer_connect) &&
+ !tp->repair) {
err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size);
if (err == -EINPROGRESS && copied_syn > 0)
goto out;
@@ -1726,118 +1727,113 @@ int tcp_set_rcvlowat(struct sock *sk, int val)
}
EXPORT_SYMBOL(tcp_set_rcvlowat);
-/* When user wants to mmap X pages, we first need to perform the mapping
- * before freeing any skbs in receive queue, otherwise user would be unable
- * to fallback to standard recvmsg(). This happens if some data in the
- * requested block is not exactly fitting in a page.
- *
- * We only support order-0 pages for the moment.
- * mmap() on TCP is very strict, there is no point
- * trying to accommodate with pathological layouts.
- */
+#ifdef CONFIG_MMU
+static const struct vm_operations_struct tcp_vm_ops = {
+};
+
int tcp_mmap(struct file *file, struct socket *sock,
struct vm_area_struct *vma)
{
- unsigned long size = vma->vm_end - vma->vm_start;
- unsigned int nr_pages = size >> PAGE_SHIFT;
- struct page **pages_array = NULL;
- u32 seq, len, offset, nr = 0;
- struct sock *sk = sock->sk;
- const skb_frag_t *frags;
+ if (vma->vm_flags & (VM_WRITE | VM_EXEC))
+ return -EPERM;
+ vma->vm_flags &= ~(VM_MAYWRITE | VM_MAYEXEC);
+
+ /* Instruct vm_insert_page() to not down_read(mmap_sem) */
+ vma->vm_flags |= VM_MIXEDMAP;
+
+ vma->vm_ops = &tcp_vm_ops;
+ return 0;
+}
+EXPORT_SYMBOL(tcp_mmap);
+
+static int tcp_zerocopy_receive(struct sock *sk,
+ struct tcp_zerocopy_receive *zc)
+{
+ unsigned long address = (unsigned long)zc->address;
+ const skb_frag_t *frags = NULL;
+ u32 length = 0, seq, offset;
+ struct vm_area_struct *vma;
+ struct sk_buff *skb = NULL;
struct tcp_sock *tp;
- struct sk_buff *skb;
int ret;
- if (vma->vm_pgoff || !nr_pages)
+ if (address & (PAGE_SIZE - 1) || address != zc->address)
return -EINVAL;
- if (vma->vm_flags & VM_WRITE)
- return -EPERM;
- /* TODO: Maybe the following is not needed if pages are COW */
- vma->vm_flags &= ~VM_MAYWRITE;
-
- lock_sock(sk);
-
- ret = -ENOTCONN;
if (sk->sk_state == TCP_LISTEN)
- goto out;
+ return -ENOTCONN;
sock_rps_record_flow(sk);
- if (tcp_inq(sk) < size) {
- ret = sock_flag(sk, SOCK_DONE) ? -EIO : -EAGAIN;
+ down_read(&current->mm->mmap_sem);
+
+ ret = -EINVAL;
+ vma = find_vma(current->mm, address);
+ if (!vma || vma->vm_start > address || vma->vm_ops != &tcp_vm_ops)
goto out;
- }
+ zc->length = min_t(unsigned long, zc->length, vma->vm_end - address);
+
tp = tcp_sk(sk);
seq = tp->copied_seq;
- /* Abort if urgent data is in the area */
- if (unlikely(tp->urg_data)) {
- u32 urg_offset = tp->urg_seq - seq;
+ zc->length = min_t(u32, zc->length, tcp_inq(sk));
+ zc->length &= ~(PAGE_SIZE - 1);
- ret = -EINVAL;
- if (urg_offset < size)
- goto out;
- }
- ret = -ENOMEM;
- pages_array = kvmalloc_array(nr_pages, sizeof(struct page *),
- GFP_KERNEL);
- if (!pages_array)
- goto out;
- skb = tcp_recv_skb(sk, seq, &offset);
- ret = -EINVAL;
-skb_start:
- /* We do not support anything not in page frags */
- offset -= skb_headlen(skb);
- if ((int)offset < 0)
- goto out;
- if (skb_has_frag_list(skb))
- goto out;
- len = skb->data_len - offset;
- frags = skb_shinfo(skb)->frags;
- while (offset) {
- if (frags->size > offset)
- goto out;
- offset -= frags->size;
- frags++;
- }
- while (nr < nr_pages) {
- if (len) {
- if (len < PAGE_SIZE)
- goto out;
- if (frags->size != PAGE_SIZE || frags->page_offset)
- goto out;
- pages_array[nr++] = skb_frag_page(frags);
- frags++;
- len -= PAGE_SIZE;
- seq += PAGE_SIZE;
- continue;
+ zap_page_range(vma, address, zc->length);
+
+ zc->recv_skip_hint = 0;
+ ret = 0;
+ while (length + PAGE_SIZE <= zc->length) {
+ if (zc->recv_skip_hint < PAGE_SIZE) {
+ if (skb) {
+ skb = skb->next;
+ offset = seq - TCP_SKB_CB(skb)->seq;
+ } else {
+ skb = tcp_recv_skb(sk, seq, &offset);
+ }
+
+ zc->recv_skip_hint = skb->len - offset;
+ offset -= skb_headlen(skb);
+ if ((int)offset < 0 || skb_has_frag_list(skb))
+ break;
+ frags = skb_shinfo(skb)->frags;
+ while (offset) {
+ if (frags->size > offset)
+ goto out;
+ offset -= frags->size;
+ frags++;
+ }
}
- skb = skb->next;
- offset = seq - TCP_SKB_CB(skb)->seq;
- goto skb_start;
- }
- /* OK, we have a full set of pages ready to be inserted into vma */
- for (nr = 0; nr < nr_pages; nr++) {
- ret = vm_insert_page(vma, vma->vm_start + (nr << PAGE_SHIFT),
- pages_array[nr]);
+ if (frags->size != PAGE_SIZE || frags->page_offset)
+ break;
+ ret = vm_insert_page(vma, address + length,
+ skb_frag_page(frags));
if (ret)
- goto out;
+ break;
+ length += PAGE_SIZE;
+ seq += PAGE_SIZE;
+ zc->recv_skip_hint -= PAGE_SIZE;
+ frags++;
}
- /* operation is complete, we can 'consume' all skbs */
- tp->copied_seq = seq;
- tcp_rcv_space_adjust(sk);
-
- /* Clean up data we have read: This will do ACK frames. */
- tcp_recv_skb(sk, seq, &offset);
- tcp_cleanup_rbuf(sk, size);
-
- ret = 0;
out:
- release_sock(sk);
- kvfree(pages_array);
+ up_read(&current->mm->mmap_sem);
+ if (length) {
+ tp->copied_seq = seq;
+ tcp_rcv_space_adjust(sk);
+
+ /* Clean up data we have read: This will do ACK frames. */
+ tcp_recv_skb(sk, seq, &offset);
+ tcp_cleanup_rbuf(sk, length);
+ ret = 0;
+ if (length == zc->length)
+ zc->recv_skip_hint = 0;
+ } else {
+ if (!zc->recv_skip_hint && sock_flag(sk, SOCK_DONE))
+ ret = -EIO;
+ }
+ zc->length = length;
return ret;
}
-EXPORT_SYMBOL(tcp_mmap);
+#endif
static void tcp_update_recv_tstamps(struct sk_buff *skb,
struct scm_timestamping *tss)
@@ -1894,6 +1890,22 @@ static void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk,
}
}
+static int tcp_inq_hint(struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ u32 copied_seq = READ_ONCE(tp->copied_seq);
+ u32 rcv_nxt = READ_ONCE(tp->rcv_nxt);
+ int inq;
+
+ inq = rcv_nxt - copied_seq;
+ if (unlikely(inq < 0 || copied_seq != READ_ONCE(tp->copied_seq))) {
+ lock_sock(sk);
+ inq = tp->rcv_nxt - tp->copied_seq;
+ release_sock(sk);
+ }
+ return inq;
+}
+
/*
* This routine copies from a sock struct into the user buffer.
*
@@ -1910,13 +1922,14 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
u32 peek_seq;
u32 *seq;
unsigned long used;
- int err;
+ int err, inq;
int target; /* Read at least this many bytes */
long timeo;
struct sk_buff *skb, *last;
u32 urg_hole = 0;
struct scm_timestamping tss;
bool has_tss = false;
+ bool has_cmsg;
if (unlikely(flags & MSG_ERRQUEUE))
return inet_recv_error(sk, msg, len, addr_len);
@@ -1931,6 +1944,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
if (sk->sk_state == TCP_LISTEN)
goto out;
+ has_cmsg = tp->recvmsg_inq;
timeo = sock_rcvtimeo(sk, nonblock);
/* Urgent data needs to be handled specially. */
@@ -2117,6 +2131,7 @@ skip_copy:
if (TCP_SKB_CB(skb)->has_rxtstamp) {
tcp_update_recv_tstamps(skb, &tss);
has_tss = true;
+ has_cmsg = true;
}
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
goto found_fin_ok;
@@ -2136,13 +2151,20 @@ skip_copy:
* on connected socket. I was just happy when found this 8) --ANK
*/
- if (has_tss)
- tcp_recv_timestamp(msg, sk, &tss);
-
/* Clean up data we have read: This will do ACK frames. */
tcp_cleanup_rbuf(sk, copied);
release_sock(sk);
+
+ if (has_cmsg) {
+ if (has_tss)
+ tcp_recv_timestamp(msg, sk, &tss);
+ if (tp->recvmsg_inq) {
+ inq = tcp_inq_hint(sk);
+ put_cmsg(msg, SOL_TCP, TCP_CM_INQ, sizeof(inq), &inq);
+ }
+ }
+
return copied;
out:
@@ -2506,6 +2528,7 @@ void tcp_write_queue_purge(struct sock *sk)
INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue);
sk_mem_reclaim(sk);
tcp_clear_all_retrans_hints(tcp_sk(sk));
+ tcp_sk(sk)->packets_out = 0;
}
int tcp_disconnect(struct sock *sk, int flags)
@@ -2555,10 +2578,10 @@ int tcp_disconnect(struct sock *sk, int flags)
icsk->icsk_backoff = 0;
tp->snd_cwnd = 2;
icsk->icsk_probes_out = 0;
- tp->packets_out = 0;
tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
tp->snd_cwnd_cnt = 0;
tp->window_clamp = 0;
+ tp->delivered_ce = 0;
tcp_set_ca_state(sk, TCP_CA_Open);
tp->is_sack_reneg = 0;
tcp_clear_retrans(tp);
@@ -2811,7 +2834,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
case TCP_REPAIR_QUEUE:
if (!tp->repair)
err = -EPERM;
- else if (val < TCP_QUEUES_NR)
+ else if ((unsigned int)val < TCP_QUEUES_NR)
tp->repair_queue = val;
else
err = -EINVAL;
@@ -2951,8 +2974,10 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
#ifdef CONFIG_TCP_MD5SIG
case TCP_MD5SIG:
case TCP_MD5SIG_EXT:
- /* Read the IP->Key mappings from userspace */
- err = tp->af_specific->md5_parse(sk, optname, optval, optlen);
+ if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))
+ err = tp->af_specific->md5_parse(sk, optname, optval, optlen);
+ else
+ err = -EINVAL;
break;
#endif
case TCP_USER_TIMEOUT:
@@ -3008,6 +3033,12 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
tp->notsent_lowat = val;
sk->sk_write_space(sk);
break;
+ case TCP_INQ:
+ if (val > 1 || val < 0)
+ err = -EINVAL;
+ else
+ tp->recvmsg_inq = val;
+ break;
default:
err = -ENOPROTOOPT;
break;
@@ -3166,6 +3197,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
rate64 = tcp_compute_delivery_rate(tp);
if (rate64)
info->tcpi_delivery_rate = rate64;
+ info->tcpi_delivered = tp->delivered;
+ info->tcpi_delivered_ce = tp->delivered_ce;
unlock_sock_fast(sk, slow);
}
EXPORT_SYMBOL_GPL(tcp_get_info);
@@ -3179,7 +3212,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
u32 rate;
stats = alloc_skb(7 * nla_total_size_64bit(sizeof(u64)) +
- 5 * nla_total_size(sizeof(u32)) +
+ 7 * nla_total_size(sizeof(u32)) +
3 * nla_total_size(sizeof(u8)), GFP_ATOMIC);
if (!stats)
return NULL;
@@ -3210,9 +3243,12 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
nla_put_u8(stats, TCP_NLA_RECUR_RETRANS, inet_csk(sk)->icsk_retransmits);
nla_put_u8(stats, TCP_NLA_DELIVERY_RATE_APP_LMT, !!tp->rate_app_limited);
nla_put_u32(stats, TCP_NLA_SND_SSTHRESH, tp->snd_ssthresh);
+ nla_put_u32(stats, TCP_NLA_DELIVERED, tp->delivered);
+ nla_put_u32(stats, TCP_NLA_DELIVERED_CE, tp->delivered_ce);
nla_put_u32(stats, TCP_NLA_SNDQ_SIZE, tp->write_seq - tp->snd_una);
nla_put_u8(stats, TCP_NLA_CA_STATE, inet_csk(sk)->icsk_ca_state);
+
return stats;
}
@@ -3428,6 +3464,9 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
case TCP_NOTSENT_LOWAT:
val = tp->notsent_lowat;
break;
+ case TCP_INQ:
+ val = tp->recvmsg_inq;
+ break;
case TCP_SAVE_SYN:
val = tp->save_syn;
break;
@@ -3464,6 +3503,25 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
}
return 0;
}
+#ifdef CONFIG_MMU
+ case TCP_ZEROCOPY_RECEIVE: {
+ struct tcp_zerocopy_receive zc;
+ int err;
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+ if (len != sizeof(zc))
+ return -EINVAL;
+ if (copy_from_user(&zc, optval, len))
+ return -EFAULT;
+ lock_sock(sk);
+ err = tcp_zerocopy_receive(sk, &zc);
+ release_sock(sk);
+ if (!err && copy_to_user(optval, &zc, len))
+ err = -EFAULT;
+ return err;
+ }
+#endif
default:
return -ENOPROTOOPT;
}
diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
index 158d105e76da..58e2f479ffb4 100644
--- a/net/ipv4/tcp_bbr.c
+++ b/net/ipv4/tcp_bbr.c
@@ -806,7 +806,9 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs)
}
}
}
- bbr->idle_restart = 0;
+ /* Restart after idle ends only once we process a new S/ACK for data */
+ if (rs->delivered > 0)
+ bbr->idle_restart = 0;
}
static void bbr_update_model(struct sock *sk, const struct rate_sample *rs)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index f93687f97d80..b188e0d75edd 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -111,6 +111,25 @@ int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
#define REXMIT_LOST 1 /* retransmit packets marked lost */
#define REXMIT_NEW 2 /* FRTO-style transmit of unsent/new packets */
+#if IS_ENABLED(CONFIG_TLS_DEVICE)
+static DEFINE_STATIC_KEY_FALSE(clean_acked_data_enabled);
+
+void clean_acked_data_enable(struct inet_connection_sock *icsk,
+ void (*cad)(struct sock *sk, u32 ack_seq))
+{
+ icsk->icsk_clean_acked = cad;
+ static_branch_inc(&clean_acked_data_enabled);
+}
+EXPORT_SYMBOL_GPL(clean_acked_data_enable);
+
+void clean_acked_data_disable(struct inet_connection_sock *icsk)
+{
+ static_branch_dec(&clean_acked_data_enabled);
+ icsk->icsk_clean_acked = NULL;
+}
+EXPORT_SYMBOL_GPL(clean_acked_data_disable);
+#endif
+
static void tcp_gro_dev_warn(struct sock *sk, const struct sk_buff *skb,
unsigned int len)
{
@@ -582,6 +601,8 @@ void tcp_rcv_space_adjust(struct sock *sk)
u32 copied;
int time;
+ trace_tcp_rcv_space_adjust(sk);
+
tcp_mstamp_refresh(tp);
time = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcvq_space.time);
if (time < (tp->rcv_rtt_est.rtt_us >> 3) || tp->rcv_rtt_est.rtt_us == 0)
@@ -3496,6 +3517,22 @@ static void tcp_xmit_recovery(struct sock *sk, int rexmit)
tcp_xmit_retransmit_queue(sk);
}
+/* Returns the number of packets newly acked or sacked by the current ACK */
+static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, int flag)
+{
+ const struct net *net = sock_net(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 delivered;
+
+ delivered = tp->delivered - prior_delivered;
+ NET_ADD_STATS(net, LINUX_MIB_TCPDELIVERED, delivered);
+ if (flag & FLAG_ECE) {
+ tp->delivered_ce += delivered;
+ NET_ADD_STATS(net, LINUX_MIB_TCPDELIVEREDCE, delivered);
+ }
+ return delivered;
+}
+
/* This routine deals with incoming acks, but not outgoing ones. */
static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
{
@@ -3542,6 +3579,12 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
if (after(ack, prior_snd_una)) {
flag |= FLAG_SND_UNA_ADVANCED;
icsk->icsk_retransmits = 0;
+
+#if IS_ENABLED(CONFIG_TLS_DEVICE)
+ if (static_branch_unlikely(&clean_acked_data_enabled))
+ if (icsk->icsk_clean_acked)
+ icsk->icsk_clean_acked(sk, ack);
+#endif
}
prior_fack = tcp_is_sack(tp) ? tcp_highest_sack_seq(tp) : tp->snd_una;
@@ -3619,7 +3662,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP))
sk_dst_confirm(sk);
- delivered = tp->delivered - delivered; /* freshly ACKed or SACKed */
+ delivered = tcp_newly_delivered(sk, delivered, flag);
lost = tp->lost - lost; /* freshly marked lost */
rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED);
tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate);
@@ -3629,9 +3672,11 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
no_queue:
/* If data was DSACKed, see if we can undo a cwnd reduction. */
- if (flag & FLAG_DSACKING_ACK)
+ if (flag & FLAG_DSACKING_ACK) {
tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag,
&rexmit);
+ tcp_newly_delivered(sk, delivered, flag);
+ }
/* If this ack opens up a zero window, clear backoff. It was
* being used to time the probes, and is probably far higher than
* it needs to be for normal retransmission.
@@ -3655,6 +3700,7 @@ old_ack:
&sack_state);
tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag,
&rexmit);
+ tcp_newly_delivered(sk, delivered, flag);
tcp_xmit_recovery(sk, rexmit);
}
@@ -3868,11 +3914,8 @@ const u8 *tcp_parse_md5sig_option(const struct tcphdr *th)
int length = (th->doff << 2) - sizeof(*th);
const u8 *ptr = (const u8 *)(th + 1);
- /* If the TCP option is too short, we can short cut */
- if (length < TCPOLEN_MD5SIG)
- return NULL;
-
- while (length > 0) {
+ /* If not enough data remaining, we can short cut */
+ while (length >= TCPOLEN_MD5SIG) {
int opcode = *ptr++;
int opsize;
@@ -5567,9 +5610,12 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
return true;
}
tp->syn_data_acked = tp->syn_data;
- if (tp->syn_data_acked)
- NET_INC_STATS(sock_net(sk),
- LINUX_MIB_TCPFASTOPENACTIVE);
+ if (tp->syn_data_acked) {
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE);
+ /* SYN-data is counted as two separate packets in tcp_ack() */
+ if (tp->delivered > 1)
+ --tp->delivered;
+ }
tcp_fastopen_add_skb(sk, synack);
@@ -5901,6 +5947,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
}
switch (sk->sk_state) {
case TCP_SYN_RECV:
+ tp->delivered++; /* SYN-ACK delivery isn't tracked in tcp_ack */
if (!tp->srtt_us)
tcp_synack_rtt_meas(sk, req);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 383cac0ff0ec..d07c0dcc99aa 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -229,11 +229,9 @@ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss,
}
}
- if (mss > (1 << *rcv_wscale)) {
- if (!init_rcv_wnd) /* Use default unless specified otherwise */
- init_rcv_wnd = tcp_default_init_rwnd(mss);
- *rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss);
- }
+ if (!init_rcv_wnd) /* Use default unless specified otherwise */
+ init_rcv_wnd = tcp_default_init_rwnd(mss);
+ *rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss);
/* Set the clamp no higher than max representable value */
(*window_clamp) = min_t(__u32, U16_MAX << (*rcv_wscale), *window_clamp);
@@ -585,14 +583,15 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
unsigned int remaining = MAX_TCP_OPTION_SPACE;
struct tcp_fastopen_request *fastopen = tp->fastopen_req;
+ *md5 = NULL;
#ifdef CONFIG_TCP_MD5SIG
- *md5 = tp->af_specific->md5_lookup(sk, sk);
- if (*md5) {
- opts->options |= OPTION_MD5;
- remaining -= TCPOLEN_MD5SIG_ALIGNED;
+ if (unlikely(rcu_access_pointer(tp->md5sig_info))) {
+ *md5 = tp->af_specific->md5_lookup(sk, sk);
+ if (*md5) {
+ opts->options |= OPTION_MD5;
+ remaining -= TCPOLEN_MD5SIG_ALIGNED;
+ }
}
-#else
- *md5 = NULL;
#endif
/* We always get an MSS option. The option bytes which will be seen in
@@ -720,14 +719,15 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
opts->options = 0;
+ *md5 = NULL;
#ifdef CONFIG_TCP_MD5SIG
- *md5 = tp->af_specific->md5_lookup(sk, sk);
- if (unlikely(*md5)) {
- opts->options |= OPTION_MD5;
- size += TCPOLEN_MD5SIG_ALIGNED;
+ if (unlikely(rcu_access_pointer(tp->md5sig_info))) {
+ *md5 = tp->af_specific->md5_lookup(sk, sk);
+ if (*md5) {
+ opts->options |= OPTION_MD5;
+ size += TCPOLEN_MD5SIG_ALIGNED;
+ }
}
-#else
- *md5 = NULL;
#endif
if (likely(tp->rx_opt.tstamp_ok)) {
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 24b5c59b1c53..dd3102a37ef9 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -757,7 +757,8 @@ void udp_set_csum(bool nocheck, struct sk_buff *skb,
}
EXPORT_SYMBOL(udp_set_csum);
-static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4)
+static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4,
+ struct inet_cork *cork)
{
struct sock *sk = skb->sk;
struct inet_sock *inet = inet_sk(sk);
@@ -777,6 +778,24 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4)
uh->len = htons(len);
uh->check = 0;
+ if (cork->gso_size) {
+ const int hlen = skb_network_header_len(skb) +
+ sizeof(struct udphdr);
+
+ if (hlen + cork->gso_size > cork->fragsize)
+ return -EINVAL;
+ if (skb->len > cork->gso_size * UDP_MAX_SEGMENTS)
+ return -EINVAL;
+ if (sk->sk_no_check_tx)
+ return -EINVAL;
+ if (skb->ip_summed != CHECKSUM_PARTIAL || is_udplite)
+ return -EIO;
+
+ skb_shinfo(skb)->gso_size = cork->gso_size;
+ skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4;
+ goto csum_partial;
+ }
+
if (is_udplite) /* UDP-Lite */
csum = udplite_csum(skb);
@@ -786,6 +805,7 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4)
goto send;
} else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */
+csum_partial:
udp4_hwcsum(skb, fl4->saddr, fl4->daddr);
goto send;
@@ -828,7 +848,7 @@ int udp_push_pending_frames(struct sock *sk)
if (!skb)
goto out;
- err = udp_send_skb(skb, fl4);
+ err = udp_send_skb(skb, fl4, &inet->cork.base);
out:
up->len = 0;
@@ -837,6 +857,43 @@ out:
}
EXPORT_SYMBOL(udp_push_pending_frames);
+static int __udp_cmsg_send(struct cmsghdr *cmsg, u16 *gso_size)
+{
+ switch (cmsg->cmsg_type) {
+ case UDP_SEGMENT:
+ if (cmsg->cmsg_len != CMSG_LEN(sizeof(__u16)))
+ return -EINVAL;
+ *gso_size = *(__u16 *)CMSG_DATA(cmsg);
+ return 0;
+ default:
+ return -EINVAL;
+ }
+}
+
+int udp_cmsg_send(struct sock *sk, struct msghdr *msg, u16 *gso_size)
+{
+ struct cmsghdr *cmsg;
+ bool need_ip = false;
+ int err;
+
+ for_each_cmsghdr(cmsg, msg) {
+ if (!CMSG_OK(msg, cmsg))
+ return -EINVAL;
+
+ if (cmsg->cmsg_level != SOL_UDP) {
+ need_ip = true;
+ continue;
+ }
+
+ err = __udp_cmsg_send(cmsg, gso_size);
+ if (err)
+ return err;
+ }
+
+ return need_ip;
+}
+EXPORT_SYMBOL_GPL(udp_cmsg_send);
+
int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
struct inet_sock *inet = inet_sk(sk);
@@ -922,10 +979,14 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
ipc.sockc.tsflags = sk->sk_tsflags;
ipc.addr = inet->inet_saddr;
ipc.oif = sk->sk_bound_dev_if;
+ ipc.gso_size = up->gso_size;
if (msg->msg_controllen) {
- err = ip_cmsg_send(sk, msg, &ipc, sk->sk_family == AF_INET6);
- if (unlikely(err)) {
+ err = udp_cmsg_send(sk, msg, &ipc.gso_size);
+ if (err > 0)
+ err = ip_cmsg_send(sk, msg, &ipc,
+ sk->sk_family == AF_INET6);
+ if (unlikely(err < 0)) {
kfree(ipc.opt);
return err;
}
@@ -1030,12 +1091,14 @@ back_from_confirm:
/* Lockless fast path for the non-corking case. */
if (!corkreq) {
+ struct inet_cork cork;
+
skb = ip_make_skb(sk, fl4, getfrag, msg, ulen,
sizeof(struct udphdr), &ipc, &rt,
- msg->msg_flags);
+ &cork, msg->msg_flags);
err = PTR_ERR(skb);
if (!IS_ERR_OR_NULL(skb))
- err = udp_send_skb(skb, fl4);
+ err = udp_send_skb(skb, fl4, &cork);
goto out;
}
@@ -2365,6 +2428,12 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
up->no_check6_rx = valbool;
break;
+ case UDP_SEGMENT:
+ if (val < 0 || val > USHRT_MAX)
+ return -EINVAL;
+ up->gso_size = val;
+ break;
+
/*
* UDP-Lite's partial checksum coverage (RFC 3828).
*/
@@ -2455,6 +2524,10 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname,
val = up->no_check6_rx;
break;
+ case UDP_SEGMENT:
+ val = up->gso_size;
+ break;
+
/* The following two cannot be changed on UDP sockets, the return is
* always 0 (which corresponds to the full checksum coverage of UDP). */
case UDPLITE_SEND_CSCOV:
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index ea6e6e7df0ee..006257092f06 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -187,6 +187,68 @@ out_unlock:
}
EXPORT_SYMBOL(skb_udp_tunnel_segment);
+struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
+ netdev_features_t features,
+ unsigned int mss, __sum16 check)
+{
+ struct sock *sk = gso_skb->sk;
+ unsigned int sum_truesize = 0;
+ struct sk_buff *segs, *seg;
+ unsigned int hdrlen;
+ struct udphdr *uh;
+
+ if (gso_skb->len <= sizeof(*uh) + mss)
+ return ERR_PTR(-EINVAL);
+
+ hdrlen = gso_skb->data - skb_mac_header(gso_skb);
+ skb_pull(gso_skb, sizeof(*uh));
+
+ /* clear destructor to avoid skb_segment assigning it to tail */
+ WARN_ON_ONCE(gso_skb->destructor != sock_wfree);
+ gso_skb->destructor = NULL;
+
+ segs = skb_segment(gso_skb, features);
+ if (unlikely(IS_ERR_OR_NULL(segs))) {
+ gso_skb->destructor = sock_wfree;
+ return segs;
+ }
+
+ for (seg = segs; seg; seg = seg->next) {
+ uh = udp_hdr(seg);
+ uh->len = htons(seg->len - hdrlen);
+ uh->check = check;
+
+ /* last packet can be partial gso_size */
+ if (!seg->next)
+ csum_replace2(&uh->check, htons(mss),
+ htons(seg->len - hdrlen - sizeof(*uh)));
+
+ uh->check = ~uh->check;
+ seg->destructor = sock_wfree;
+ seg->sk = sk;
+ sum_truesize += seg->truesize;
+ }
+
+ refcount_add(sum_truesize - gso_skb->truesize, &sk->sk_wmem_alloc);
+
+ return segs;
+}
+EXPORT_SYMBOL_GPL(__udp_gso_segment);
+
+static struct sk_buff *__udp4_gso_segment(struct sk_buff *gso_skb,
+ netdev_features_t features)
+{
+ const struct iphdr *iph = ip_hdr(gso_skb);
+ unsigned int mss = skb_shinfo(gso_skb)->gso_size;
+
+ if (!can_checksum_protocol(features, htons(ETH_P_IP)))
+ return ERR_PTR(-EIO);
+
+ return __udp_gso_segment(gso_skb, features, mss,
+ udp_v4_check(sizeof(struct udphdr) + mss,
+ iph->saddr, iph->daddr, 0));
+}
+
static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
netdev_features_t features)
{
@@ -203,12 +265,15 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
goto out;
}
- if (!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP))
+ if (!(skb_shinfo(skb)->gso_type & (SKB_GSO_UDP | SKB_GSO_UDP_L4)))
goto out;
if (!pskb_may_pull(skb, sizeof(struct udphdr)))
goto out;
+ if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4)
+ return __udp4_gso_segment(skb, features);
+
mss = skb_shinfo(skb)->gso_size;
if (unlikely(skb->len <= mss))
goto out;
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index a294e86a9b25..fbfd71a2d9c8 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -994,7 +994,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr,
gfp_t gfp_flags = can_block ? GFP_KERNEL : GFP_ATOMIC;
struct net *net = dev_net(idev->dev);
struct inet6_ifaddr *ifa = NULL;
- struct fib6_info *rt = NULL;
+ struct fib6_info *f6i = NULL;
int err = 0;
int addr_type = ipv6_addr_type(addr);
@@ -1036,16 +1036,16 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr,
goto out;
}
- rt = addrconf_dst_alloc(net, idev, addr, false, gfp_flags);
- if (IS_ERR(rt)) {
- err = PTR_ERR(rt);
- rt = NULL;
+ f6i = addrconf_f6i_alloc(net, idev, addr, false, gfp_flags);
+ if (IS_ERR(f6i)) {
+ err = PTR_ERR(f6i);
+ f6i = NULL;
goto out;
}
if (net->ipv6.devconf_all->disable_policy ||
idev->cnf.disable_policy)
- rt->dst_nopolicy = true;
+ f6i->dst_nopolicy = true;
neigh_parms_data_state_setall(idev->nd_parms);
@@ -1067,7 +1067,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr,
ifa->cstamp = ifa->tstamp = jiffies;
ifa->tokenized = false;
- ifa->rt = rt;
+ ifa->rt = f6i;
ifa->idev = idev;
in6_dev_hold(idev);
@@ -1101,7 +1101,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr,
inet6addr_notifier_call_chain(NETDEV_UP, ifa);
out:
if (unlikely(err < 0)) {
- fib6_info_release(rt);
+ fib6_info_release(f6i);
if (ifa) {
if (ifa->idev)
@@ -1178,19 +1178,19 @@ check_cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long *expires)
static void
cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long expires, bool del_rt)
{
- struct fib6_info *rt;
+ struct fib6_info *f6i;
- rt = addrconf_get_prefix_route(&ifp->addr,
+ f6i = addrconf_get_prefix_route(&ifp->addr,
ifp->prefix_len,
ifp->idev->dev,
0, RTF_GATEWAY | RTF_DEFAULT);
- if (rt) {
+ if (f6i) {
if (del_rt)
- ip6_del_rt(dev_net(ifp->idev->dev), rt);
+ ip6_del_rt(dev_net(ifp->idev->dev), f6i);
else {
- if (!(rt->rt6i_flags & RTF_EXPIRES))
- fib6_set_expires(rt, expires);
- fib6_info_release(rt);
+ if (!(f6i->fib6_flags & RTF_EXPIRES))
+ fib6_set_expires(f6i, expires);
+ fib6_info_release(f6i);
}
}
}
@@ -2370,9 +2370,9 @@ static struct fib6_info *addrconf_get_prefix_route(const struct in6_addr *pfx,
for_each_fib6_node_rt_rcu(fn) {
if (rt->fib6_nh.nh_dev->ifindex != dev->ifindex)
continue;
- if ((rt->rt6i_flags & flags) != flags)
+ if ((rt->fib6_flags & flags) != flags)
continue;
- if ((rt->rt6i_flags & noflags) != 0)
+ if ((rt->fib6_flags & noflags) != 0)
continue;
fib6_info_hold(rt);
break;
@@ -3245,7 +3245,7 @@ static void addrconf_addr_gen(struct inet6_dev *idev, bool prefix_route)
addrconf_add_linklocal(idev, &addr, 0);
else if (prefix_route)
addrconf_prefix_route(&addr, 64, idev->dev,
- 0, 0, GFP_ATOMIC);
+ 0, 0, GFP_KERNEL);
break;
case IN6_ADDR_GEN_MODE_NONE:
default:
@@ -3341,22 +3341,22 @@ static int fixup_permanent_addr(struct net *net,
struct inet6_dev *idev,
struct inet6_ifaddr *ifp)
{
- /* !rt6i_node means the host route was removed from the
+ /* !fib6_node means the host route was removed from the
* FIB, for example, if 'lo' device is taken down. In that
* case regenerate the host route.
*/
- if (!ifp->rt || !ifp->rt->rt6i_node) {
- struct fib6_info *rt, *prev;
+ if (!ifp->rt || !ifp->rt->fib6_node) {
+ struct fib6_info *f6i, *prev;
- rt = addrconf_dst_alloc(net, idev, &ifp->addr, false,
- GFP_ATOMIC);
- if (IS_ERR(rt))
- return PTR_ERR(rt);
+ f6i = addrconf_f6i_alloc(net, idev, &ifp->addr, false,
+ GFP_ATOMIC);
+ if (IS_ERR(f6i))
+ return PTR_ERR(f6i);
/* ifp->rt can be accessed outside of rtnl */
spin_lock(&ifp->lock);
prev = ifp->rt;
- ifp->rt = rt;
+ ifp->rt = f6i;
spin_unlock(&ifp->lock);
fib6_info_release(prev);
@@ -3622,8 +3622,7 @@ static int addrconf_ifdown(struct net_device *dev, int how)
struct net *net = dev_net(dev);
struct inet6_dev *idev;
struct inet6_ifaddr *ifa, *tmp;
- int _keep_addr;
- bool keep_addr;
+ bool keep_addr = false;
int state, i;
ASSERT_RTNL();
@@ -3649,15 +3648,18 @@ static int addrconf_ifdown(struct net_device *dev, int how)
}
- /* aggregate the system setting and interface setting */
- _keep_addr = net->ipv6.devconf_all->keep_addr_on_down;
- if (!_keep_addr)
- _keep_addr = idev->cnf.keep_addr_on_down;
-
/* combine the user config with event to determine if permanent
* addresses are to be removed from address hash table
*/
- keep_addr = !(how || _keep_addr <= 0 || idev->cnf.disable_ipv6);
+ if (!how && !idev->cnf.disable_ipv6) {
+ /* aggregate the system setting and interface setting */
+ int _keep_addr = net->ipv6.devconf_all->keep_addr_on_down;
+
+ if (!_keep_addr)
+ _keep_addr = idev->cnf.keep_addr_on_down;
+
+ keep_addr = (_keep_addr > 0);
+ }
/* Step 2: clear hash table */
for (i = 0; i < IN6_ADDR_HSIZE; i++) {
@@ -3707,11 +3709,6 @@ restart:
write_lock_bh(&idev->lock);
}
- /* re-combine the user config with event to determine if permanent
- * addresses are to be removed from the interface list
- */
- keep_addr = (!how && _keep_addr > 0 && !idev->cnf.disable_ipv6);
-
list_for_each_entry_safe(ifa, tmp, &idev->addr_list, if_list) {
struct fib6_info *rt = NULL;
bool keep;
@@ -4817,9 +4814,10 @@ static int inet6_fill_ifmcaddr(struct sk_buff *skb, struct ifmcaddr6 *ifmca,
static int inet6_fill_ifacaddr(struct sk_buff *skb, struct ifacaddr6 *ifaca,
u32 portid, u32 seq, int event, unsigned int flags)
{
+ struct net_device *dev = fib6_info_nh_dev(ifaca->aca_rt);
+ int ifindex = dev ? dev->ifindex : 1;
struct nlmsghdr *nlh;
u8 scope = RT_SCOPE_UNIVERSE;
- int ifindex = ifaca->aca_idev->dev->ifindex;
if (ipv6_addr_scope(&ifaca->aca_addr) & IFA_SITE)
scope = RT_SCOPE_SITE;
@@ -5612,14 +5610,14 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp)
* our DAD process, so we don't need
* to do it again
*/
- if (!rcu_access_pointer(ifp->rt->rt6i_node))
+ if (!rcu_access_pointer(ifp->rt->fib6_node))
ip6_ins_rt(net, ifp->rt);
if (ifp->idev->cnf.forwarding)
addrconf_join_anycast(ifp);
if (!ipv6_addr_any(&ifp->peer_addr))
addrconf_prefix_route(&ifp->peer_addr, 128,
ifp->idev->dev, 0, 0,
- GFP_KERNEL);
+ GFP_ATOMIC);
break;
case RTM_DELADDR:
if (ifp->idev->cnf.forwarding)
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 36d622c477b1..d0af96e0d109 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -578,7 +578,9 @@ const struct proto_ops inet6_stream_ops = {
.getsockopt = sock_common_getsockopt, /* ok */
.sendmsg = inet_sendmsg, /* ok */
.recvmsg = inet_recvmsg, /* ok */
+#ifdef CONFIG_MMU
.mmap = tcp_mmap,
+#endif
.sendpage = inet_sendpage,
.sendmsg_locked = tcp_sendmsg_locked,
.sendpage_locked = tcp_sendpage_locked,
diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c
index 0e35657501a7..0250d199e527 100644
--- a/net/ipv6/anycast.c
+++ b/net/ipv6/anycast.c
@@ -212,16 +212,14 @@ static void aca_get(struct ifacaddr6 *aca)
static void aca_put(struct ifacaddr6 *ac)
{
if (refcount_dec_and_test(&ac->aca_refcnt)) {
- in6_dev_put(ac->aca_idev);
fib6_info_release(ac->aca_rt);
kfree(ac);
}
}
-static struct ifacaddr6 *aca_alloc(struct fib6_info *rt,
+static struct ifacaddr6 *aca_alloc(struct fib6_info *f6i,
const struct in6_addr *addr)
{
- struct inet6_dev *idev = rt->rt6i_idev;
struct ifacaddr6 *aca;
aca = kzalloc(sizeof(*aca), GFP_ATOMIC);
@@ -229,10 +227,8 @@ static struct ifacaddr6 *aca_alloc(struct fib6_info *rt,
return NULL;
aca->aca_addr = *addr;
- in6_dev_hold(idev);
- aca->aca_idev = idev;
- fib6_info_hold(rt);
- aca->aca_rt = rt;
+ fib6_info_hold(f6i);
+ aca->aca_rt = f6i;
aca->aca_users = 1;
/* aca_tstamp should be updated upon changes */
aca->aca_cstamp = aca->aca_tstamp = jiffies;
@@ -247,7 +243,7 @@ static struct ifacaddr6 *aca_alloc(struct fib6_info *rt,
int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr)
{
struct ifacaddr6 *aca;
- struct fib6_info *rt;
+ struct fib6_info *f6i;
struct net *net;
int err;
@@ -268,14 +264,14 @@ int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr)
}
net = dev_net(idev->dev);
- rt = addrconf_dst_alloc(net, idev, addr, true, GFP_ATOMIC);
- if (IS_ERR(rt)) {
- err = PTR_ERR(rt);
+ f6i = addrconf_f6i_alloc(net, idev, addr, true, GFP_ATOMIC);
+ if (IS_ERR(f6i)) {
+ err = PTR_ERR(f6i);
goto out;
}
- aca = aca_alloc(rt, addr);
+ aca = aca_alloc(f6i, addr);
if (!aca) {
- fib6_info_release(rt);
+ fib6_info_release(f6i);
err = -ENOMEM;
goto out;
}
@@ -289,7 +285,7 @@ int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr)
aca_get(aca);
write_unlock_bh(&idev->lock);
- ip6_ins_rt(net, rt);
+ ip6_ins_rt(net, f6i);
addrconf_join_solict(idev->dev, &aca->aca_addr);
diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c
index df113c7b5fc8..6547fc6491a6 100644
--- a/net/ipv6/fib6_rules.c
+++ b/net/ipv6/fib6_rules.c
@@ -245,15 +245,18 @@ static const struct nla_policy fib6_rule_policy[FRA_MAX+1] = {
static int fib6_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
struct fib_rule_hdr *frh,
- struct nlattr **tb)
+ struct nlattr **tb,
+ struct netlink_ext_ack *extack)
{
int err = -EINVAL;
struct net *net = sock_net(skb->sk);
struct fib6_rule *rule6 = (struct fib6_rule *) rule;
if (rule->action == FR_ACT_TO_TBL && !rule->l3mdev) {
- if (rule->table == RT6_TABLE_UNSPEC)
+ if (rule->table == RT6_TABLE_UNSPEC) {
+ NL_SET_ERR_MSG(extack, "Invalid table");
goto errout;
+ }
if (fib6_new_table(net, rule->table) == NULL) {
err = -ENOBUFS;
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 2ab49b7cac22..f0a4262a4789 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -105,12 +105,12 @@ enum {
FIB6_NO_SERNUM_CHANGE = 0,
};
-void fib6_update_sernum(struct net *net, struct fib6_info *rt)
+void fib6_update_sernum(struct net *net, struct fib6_info *f6i)
{
struct fib6_node *fn;
- fn = rcu_dereference_protected(rt->rt6i_node,
- lockdep_is_held(&rt->rt6i_table->tb6_lock));
+ fn = rcu_dereference_protected(f6i->fib6_node,
+ lockdep_is_held(&f6i->fib6_table->tb6_lock));
if (fn)
fn->fn_sernum = fib6_new_sernum(net);
}
@@ -159,10 +159,10 @@ struct fib6_info *fib6_info_alloc(gfp_t gfp_flags)
return NULL;
}
- INIT_LIST_HEAD(&f6i->rt6i_siblings);
+ INIT_LIST_HEAD(&f6i->fib6_siblings);
f6i->fib6_metrics = (struct dst_metrics *)&dst_default_metrics;
- atomic_inc(&f6i->rt6i_ref);
+ atomic_inc(&f6i->fib6_ref);
return f6i;
}
@@ -172,7 +172,7 @@ void fib6_info_destroy(struct fib6_info *f6i)
struct rt6_exception_bucket *bucket;
struct dst_metrics *m;
- WARN_ON(f6i->rt6i_node);
+ WARN_ON(f6i->fib6_node);
bucket = rcu_dereference_protected(f6i->rt6i_exception_bucket, 1);
if (bucket) {
@@ -197,8 +197,6 @@ void fib6_info_destroy(struct fib6_info *f6i)
}
}
- if (f6i->rt6i_idev)
- in6_dev_put(f6i->rt6i_idev);
if (f6i->fib6_nh.nh_dev)
dev_put(f6i->fib6_nh.nh_dev);
@@ -401,7 +399,7 @@ static int call_fib6_entry_notifiers(struct net *net,
.rt = rt,
};
- rt->rt6i_table->fib_seq++;
+ rt->fib6_table->fib_seq++;
return call_fib6_notifiers(net, event_type, &info.info);
}
@@ -483,10 +481,10 @@ static int fib6_dump_node(struct fib6_walker *w)
* last sibling of this route (no need to dump the
* sibling routes again)
*/
- if (rt->rt6i_nsiblings)
- rt = list_last_entry(&rt->rt6i_siblings,
+ if (rt->fib6_nsiblings)
+ rt = list_last_entry(&rt->fib6_siblings,
struct fib6_info,
- rt6i_siblings);
+ fib6_siblings);
}
w->leaf = NULL;
return 0;
@@ -810,7 +808,7 @@ insert_above:
RCU_INIT_POINTER(in->parent, pn);
in->leaf = fn->leaf;
atomic_inc(&rcu_dereference_protected(in->leaf,
- lockdep_is_held(&table->tb6_lock))->rt6i_ref);
+ lockdep_is_held(&table->tb6_lock))->fib6_ref);
/* update parent pointer */
if (dir)
@@ -862,12 +860,37 @@ insert_above:
return ln;
}
+static void fib6_drop_pcpu_from(struct fib6_info *f6i,
+ const struct fib6_table *table)
+{
+ int cpu;
+
+ /* release the reference to this fib entry from
+ * all of its cached pcpu routes
+ */
+ for_each_possible_cpu(cpu) {
+ struct rt6_info **ppcpu_rt;
+ struct rt6_info *pcpu_rt;
+
+ ppcpu_rt = per_cpu_ptr(f6i->rt6i_pcpu, cpu);
+ pcpu_rt = *ppcpu_rt;
+ if (pcpu_rt) {
+ struct fib6_info *from;
+
+ from = rcu_dereference_protected(pcpu_rt->from,
+ lockdep_is_held(&table->tb6_lock));
+ rcu_assign_pointer(pcpu_rt->from, NULL);
+ fib6_info_release(from);
+ }
+ }
+}
+
static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn,
struct net *net)
{
- struct fib6_table *table = rt->rt6i_table;
+ struct fib6_table *table = rt->fib6_table;
- if (atomic_read(&rt->rt6i_ref) != 1) {
+ if (atomic_read(&rt->fib6_ref) != 1) {
/* This route is used as dummy address holder in some split
* nodes. It is not leaked, but it still holds other resources,
* which must be released in time. So, scan ascendant nodes
@@ -880,7 +903,7 @@ static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn,
struct fib6_info *new_leaf;
if (!(fn->fn_flags & RTN_RTINFO) && leaf == rt) {
new_leaf = fib6_find_prefix(net, table, fn);
- atomic_inc(&new_leaf->rt6i_ref);
+ atomic_inc(&new_leaf->fib6_ref);
rcu_assign_pointer(fn->leaf, new_leaf);
fib6_info_release(rt);
@@ -889,24 +912,8 @@ static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn,
lockdep_is_held(&table->tb6_lock));
}
- if (rt->rt6i_pcpu) {
- int cpu;
-
- /* release the reference to this fib entry from
- * all of its cached pcpu routes
- */
- for_each_possible_cpu(cpu) {
- struct rt6_info **ppcpu_rt;
- struct rt6_info *pcpu_rt;
-
- ppcpu_rt = per_cpu_ptr(rt->rt6i_pcpu, cpu);
- pcpu_rt = *ppcpu_rt;
- if (pcpu_rt) {
- fib6_info_release(pcpu_rt->from);
- pcpu_rt->from = NULL;
- }
- }
- }
+ if (rt->rt6i_pcpu)
+ fib6_drop_pcpu_from(rt, table);
}
}
@@ -919,7 +926,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
struct netlink_ext_ack *extack)
{
struct fib6_info *leaf = rcu_dereference_protected(fn->leaf,
- lockdep_is_held(&rt->rt6i_table->tb6_lock));
+ lockdep_is_held(&rt->fib6_table->tb6_lock));
struct fib6_info *iter = NULL;
struct fib6_info __rcu **ins;
struct fib6_info __rcu **fallback_ins = NULL;
@@ -938,13 +945,13 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
ins = &fn->leaf;
for (iter = leaf; iter;
- iter = rcu_dereference_protected(iter->rt6_next,
- lockdep_is_held(&rt->rt6i_table->tb6_lock))) {
+ iter = rcu_dereference_protected(iter->fib6_next,
+ lockdep_is_held(&rt->fib6_table->tb6_lock))) {
/*
* Search for duplicates
*/
- if (iter->rt6i_metric == rt->rt6i_metric) {
+ if (iter->fib6_metric == rt->fib6_metric) {
/*
* Same priority level
*/
@@ -964,11 +971,11 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
}
if (rt6_duplicate_nexthop(iter, rt)) {
- if (rt->rt6i_nsiblings)
- rt->rt6i_nsiblings = 0;
- if (!(iter->rt6i_flags & RTF_EXPIRES))
+ if (rt->fib6_nsiblings)
+ rt->fib6_nsiblings = 0;
+ if (!(iter->fib6_flags & RTF_EXPIRES))
return -EEXIST;
- if (!(rt->rt6i_flags & RTF_EXPIRES))
+ if (!(rt->fib6_flags & RTF_EXPIRES))
fib6_clean_expires(iter);
else
fib6_set_expires(iter, rt->expires);
@@ -988,21 +995,21 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
*/
if (rt_can_ecmp &&
rt6_qualify_for_ecmp(iter))
- rt->rt6i_nsiblings++;
+ rt->fib6_nsiblings++;
}
- if (iter->rt6i_metric > rt->rt6i_metric)
+ if (iter->fib6_metric > rt->fib6_metric)
break;
next_iter:
- ins = &iter->rt6_next;
+ ins = &iter->fib6_next;
}
if (fallback_ins && !found) {
/* No ECMP-able route found, replace first non-ECMP one */
ins = fallback_ins;
iter = rcu_dereference_protected(*ins,
- lockdep_is_held(&rt->rt6i_table->tb6_lock));
+ lockdep_is_held(&rt->fib6_table->tb6_lock));
found++;
}
@@ -1011,34 +1018,34 @@ next_iter:
fn->rr_ptr = NULL;
/* Link this route to others same route. */
- if (rt->rt6i_nsiblings) {
- unsigned int rt6i_nsiblings;
+ if (rt->fib6_nsiblings) {
+ unsigned int fib6_nsiblings;
struct fib6_info *sibling, *temp_sibling;
/* Find the first route that have the same metric */
sibling = leaf;
while (sibling) {
- if (sibling->rt6i_metric == rt->rt6i_metric &&
+ if (sibling->fib6_metric == rt->fib6_metric &&
rt6_qualify_for_ecmp(sibling)) {
- list_add_tail(&rt->rt6i_siblings,
- &sibling->rt6i_siblings);
+ list_add_tail(&rt->fib6_siblings,
+ &sibling->fib6_siblings);
break;
}
- sibling = rcu_dereference_protected(sibling->rt6_next,
- lockdep_is_held(&rt->rt6i_table->tb6_lock));
+ sibling = rcu_dereference_protected(sibling->fib6_next,
+ lockdep_is_held(&rt->fib6_table->tb6_lock));
}
/* For each sibling in the list, increment the counter of
* siblings. BUG() if counters does not match, list of siblings
* is broken!
*/
- rt6i_nsiblings = 0;
+ fib6_nsiblings = 0;
list_for_each_entry_safe(sibling, temp_sibling,
- &rt->rt6i_siblings, rt6i_siblings) {
- sibling->rt6i_nsiblings++;
- BUG_ON(sibling->rt6i_nsiblings != rt->rt6i_nsiblings);
- rt6i_nsiblings++;
+ &rt->fib6_siblings, fib6_siblings) {
+ sibling->fib6_nsiblings++;
+ BUG_ON(sibling->fib6_nsiblings != rt->fib6_nsiblings);
+ fib6_nsiblings++;
}
- BUG_ON(rt6i_nsiblings != rt->rt6i_nsiblings);
+ BUG_ON(fib6_nsiblings != rt->fib6_nsiblings);
rt6_multipath_rebalance(temp_sibling);
}
@@ -1058,9 +1065,9 @@ add:
if (err)
return err;
- rcu_assign_pointer(rt->rt6_next, iter);
- atomic_inc(&rt->rt6i_ref);
- rcu_assign_pointer(rt->rt6i_node, fn);
+ rcu_assign_pointer(rt->fib6_next, iter);
+ atomic_inc(&rt->fib6_ref);
+ rcu_assign_pointer(rt->fib6_node, fn);
rcu_assign_pointer(*ins, rt);
if (!info->skip_notify)
inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
@@ -1087,9 +1094,9 @@ add:
if (err)
return err;
- atomic_inc(&rt->rt6i_ref);
- rcu_assign_pointer(rt->rt6i_node, fn);
- rt->rt6_next = iter->rt6_next;
+ atomic_inc(&rt->fib6_ref);
+ rcu_assign_pointer(rt->fib6_node, fn);
+ rt->fib6_next = iter->fib6_next;
rcu_assign_pointer(*ins, rt);
if (!info->skip_notify)
inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE);
@@ -1097,8 +1104,8 @@ add:
info->nl_net->ipv6.rt6_stats->fib_route_nodes++;
fn->fn_flags |= RTN_RTINFO;
}
- nsiblings = iter->rt6i_nsiblings;
- iter->rt6i_node = NULL;
+ nsiblings = iter->fib6_nsiblings;
+ iter->fib6_node = NULL;
fib6_purge_rt(iter, fn, info->nl_net);
if (rcu_access_pointer(fn->rr_ptr) == iter)
fn->rr_ptr = NULL;
@@ -1106,15 +1113,15 @@ add:
if (nsiblings) {
/* Replacing an ECMP route, remove all siblings */
- ins = &rt->rt6_next;
+ ins = &rt->fib6_next;
iter = rcu_dereference_protected(*ins,
- lockdep_is_held(&rt->rt6i_table->tb6_lock));
+ lockdep_is_held(&rt->fib6_table->tb6_lock));
while (iter) {
- if (iter->rt6i_metric > rt->rt6i_metric)
+ if (iter->fib6_metric > rt->fib6_metric)
break;
if (rt6_qualify_for_ecmp(iter)) {
- *ins = iter->rt6_next;
- iter->rt6i_node = NULL;
+ *ins = iter->fib6_next;
+ iter->fib6_node = NULL;
fib6_purge_rt(iter, fn, info->nl_net);
if (rcu_access_pointer(fn->rr_ptr) == iter)
fn->rr_ptr = NULL;
@@ -1122,10 +1129,10 @@ add:
nsiblings--;
info->nl_net->ipv6.rt6_stats->fib_rt_entries--;
} else {
- ins = &iter->rt6_next;
+ ins = &iter->fib6_next;
}
iter = rcu_dereference_protected(*ins,
- lockdep_is_held(&rt->rt6i_table->tb6_lock));
+ lockdep_is_held(&rt->fib6_table->tb6_lock));
}
WARN_ON(nsiblings != 0);
}
@@ -1137,7 +1144,7 @@ add:
static void fib6_start_gc(struct net *net, struct fib6_info *rt)
{
if (!timer_pending(&net->ipv6.ip6_fib_timer) &&
- (rt->rt6i_flags & RTF_EXPIRES))
+ (rt->fib6_flags & RTF_EXPIRES))
mod_timer(&net->ipv6.ip6_fib_timer,
jiffies + net->ipv6.sysctl.ip6_rt_gc_interval);
}
@@ -1152,15 +1159,15 @@ void fib6_force_start_gc(struct net *net)
static void __fib6_update_sernum_upto_root(struct fib6_info *rt,
int sernum)
{
- struct fib6_node *fn = rcu_dereference_protected(rt->rt6i_node,
- lockdep_is_held(&rt->rt6i_table->tb6_lock));
+ struct fib6_node *fn = rcu_dereference_protected(rt->fib6_node,
+ lockdep_is_held(&rt->fib6_table->tb6_lock));
/* paired with smp_rmb() in rt6_get_cookie_safe() */
smp_wmb();
while (fn) {
fn->fn_sernum = sernum;
fn = rcu_dereference_protected(fn->parent,
- lockdep_is_held(&rt->rt6i_table->tb6_lock));
+ lockdep_is_held(&rt->fib6_table->tb6_lock));
}
}
@@ -1179,7 +1186,7 @@ void fib6_update_sernum_upto_root(struct net *net, struct fib6_info *rt)
int fib6_add(struct fib6_node *root, struct fib6_info *rt,
struct nl_info *info, struct netlink_ext_ack *extack)
{
- struct fib6_table *table = rt->rt6i_table;
+ struct fib6_table *table = rt->fib6_table;
struct fib6_node *fn, *pn = NULL;
int err = -ENOMEM;
int allow_create = 1;
@@ -1196,8 +1203,8 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt,
pr_warn("RTM_NEWROUTE with no NLM_F_CREATE or NLM_F_REPLACE\n");
fn = fib6_add_1(info->nl_net, table, root,
- &rt->rt6i_dst.addr, rt->rt6i_dst.plen,
- offsetof(struct fib6_info, rt6i_dst), allow_create,
+ &rt->fib6_dst.addr, rt->fib6_dst.plen,
+ offsetof(struct fib6_info, fib6_dst), allow_create,
replace_required, extack);
if (IS_ERR(fn)) {
err = PTR_ERR(fn);
@@ -1208,7 +1215,7 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt,
pn = fn;
#ifdef CONFIG_IPV6_SUBTREES
- if (rt->rt6i_src.plen) {
+ if (rt->fib6_src.plen) {
struct fib6_node *sn;
if (!rcu_access_pointer(fn->subtree)) {
@@ -1229,7 +1236,7 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt,
if (!sfn)
goto failure;
- atomic_inc(&info->nl_net->ipv6.fib6_null_entry->rt6i_ref);
+ atomic_inc(&info->nl_net->ipv6.fib6_null_entry->fib6_ref);
rcu_assign_pointer(sfn->leaf,
info->nl_net->ipv6.fib6_null_entry);
sfn->fn_flags = RTN_ROOT;
@@ -1237,8 +1244,8 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt,
/* Now add the first leaf node to new subtree */
sn = fib6_add_1(info->nl_net, table, sfn,
- &rt->rt6i_src.addr, rt->rt6i_src.plen,
- offsetof(struct fib6_info, rt6i_src),
+ &rt->fib6_src.addr, rt->fib6_src.plen,
+ offsetof(struct fib6_info, fib6_src),
allow_create, replace_required, extack);
if (IS_ERR(sn)) {
@@ -1256,8 +1263,8 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt,
rcu_assign_pointer(fn->subtree, sfn);
} else {
sn = fib6_add_1(info->nl_net, table, FIB6_SUBTREE(fn),
- &rt->rt6i_src.addr, rt->rt6i_src.plen,
- offsetof(struct fib6_info, rt6i_src),
+ &rt->fib6_src.addr, rt->fib6_src.plen,
+ offsetof(struct fib6_info, fib6_src),
allow_create, replace_required, extack);
if (IS_ERR(sn)) {
@@ -1272,7 +1279,7 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt,
rcu_assign_pointer(fn->leaf,
info->nl_net->ipv6.fib6_null_entry);
} else {
- atomic_inc(&rt->rt6i_ref);
+ atomic_inc(&rt->fib6_ref);
rcu_assign_pointer(fn->leaf, rt);
}
}
@@ -1421,12 +1428,12 @@ struct fib6_node *fib6_lookup(struct fib6_node *root, const struct in6_addr *dad
struct fib6_node *fn;
struct lookup_args args[] = {
{
- .offset = offsetof(struct fib6_info, rt6i_dst),
+ .offset = offsetof(struct fib6_info, fib6_dst),
.addr = daddr,
},
#ifdef CONFIG_IPV6_SUBTREES
{
- .offset = offsetof(struct fib6_info, rt6i_src),
+ .offset = offsetof(struct fib6_info, fib6_src),
.addr = saddr,
},
#endif
@@ -1511,7 +1518,7 @@ struct fib6_node *fib6_locate(struct fib6_node *root,
struct fib6_node *fn;
fn = fib6_locate_1(root, daddr, dst_len,
- offsetof(struct fib6_info, rt6i_dst),
+ offsetof(struct fib6_info, fib6_dst),
exact_match);
#ifdef CONFIG_IPV6_SUBTREES
@@ -1522,7 +1529,7 @@ struct fib6_node *fib6_locate(struct fib6_node *root,
if (subtree) {
fn = fib6_locate_1(subtree, saddr, src_len,
- offsetof(struct fib6_info, rt6i_src),
+ offsetof(struct fib6_info, fib6_src),
exact_match);
}
}
@@ -1705,8 +1712,8 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn,
RT6_TRACE("fib6_del_route\n");
/* Unlink it */
- *rtp = rt->rt6_next;
- rt->rt6i_node = NULL;
+ *rtp = rt->fib6_next;
+ rt->fib6_node = NULL;
net->ipv6.rt6_stats->fib_rt_entries--;
net->ipv6.rt6_stats->fib_discarded_routes++;
@@ -1718,14 +1725,14 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn,
fn->rr_ptr = NULL;
/* Remove this entry from other siblings */
- if (rt->rt6i_nsiblings) {
+ if (rt->fib6_nsiblings) {
struct fib6_info *sibling, *next_sibling;
list_for_each_entry_safe(sibling, next_sibling,
- &rt->rt6i_siblings, rt6i_siblings)
- sibling->rt6i_nsiblings--;
- rt->rt6i_nsiblings = 0;
- list_del_init(&rt->rt6i_siblings);
+ &rt->fib6_siblings, fib6_siblings)
+ sibling->fib6_nsiblings--;
+ rt->fib6_nsiblings = 0;
+ list_del_init(&rt->fib6_siblings);
rt6_multipath_rebalance(next_sibling);
}
@@ -1734,7 +1741,7 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn,
FOR_WALKERS(net, w) {
if (w->state == FWS_C && w->leaf == rt) {
RT6_TRACE("walker %p adjusted by delroute\n", w);
- w->leaf = rcu_dereference_protected(rt->rt6_next,
+ w->leaf = rcu_dereference_protected(rt->fib6_next,
lockdep_is_held(&table->tb6_lock));
if (!w->leaf)
w->state = FWS_U;
@@ -1765,9 +1772,9 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn,
/* Need to own table->tb6_lock */
int fib6_del(struct fib6_info *rt, struct nl_info *info)
{
- struct fib6_node *fn = rcu_dereference_protected(rt->rt6i_node,
- lockdep_is_held(&rt->rt6i_table->tb6_lock));
- struct fib6_table *table = rt->rt6i_table;
+ struct fib6_node *fn = rcu_dereference_protected(rt->fib6_node,
+ lockdep_is_held(&rt->fib6_table->tb6_lock));
+ struct fib6_table *table = rt->fib6_table;
struct net *net = info->nl_net;
struct fib6_info __rcu **rtp;
struct fib6_info __rcu **rtp_next;
@@ -1788,7 +1795,7 @@ int fib6_del(struct fib6_info *rt, struct nl_info *info)
fib6_del_route(table, fn, rtp, info);
return 0;
}
- rtp_next = &cur->rt6_next;
+ rtp_next = &cur->fib6_next;
}
return -ENOENT;
}
@@ -1951,17 +1958,17 @@ static int fib6_clean_node(struct fib6_walker *w)
#if RT6_DEBUG >= 2
pr_debug("%s: del failed: rt=%p@%p err=%d\n",
__func__, rt,
- rcu_access_pointer(rt->rt6i_node),
+ rcu_access_pointer(rt->fib6_node),
res);
#endif
continue;
}
return 0;
} else if (res == -2) {
- if (WARN_ON(!rt->rt6i_nsiblings))
+ if (WARN_ON(!rt->fib6_nsiblings))
continue;
- rt = list_last_entry(&rt->rt6i_siblings,
- struct fib6_info, rt6i_siblings);
+ rt = list_last_entry(&rt->fib6_siblings,
+ struct fib6_info, fib6_siblings);
continue;
}
WARN_ON(res != 0);
@@ -2045,7 +2052,7 @@ static int fib6_age(struct fib6_info *rt, void *arg)
* Routes are expired even if they are in use.
*/
- if (rt->rt6i_flags & RTF_EXPIRES && rt->expires) {
+ if (rt->fib6_flags & RTF_EXPIRES && rt->expires) {
if (time_after(now, rt->expires)) {
RT6_TRACE("expiring %p\n", rt);
return -1;
@@ -2243,22 +2250,22 @@ static int ipv6_route_seq_show(struct seq_file *seq, void *v)
struct ipv6_route_iter *iter = seq->private;
const struct net_device *dev;
- seq_printf(seq, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
+ seq_printf(seq, "%pi6 %02x ", &rt->fib6_dst.addr, rt->fib6_dst.plen);
#ifdef CONFIG_IPV6_SUBTREES
- seq_printf(seq, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
+ seq_printf(seq, "%pi6 %02x ", &rt->fib6_src.addr, rt->fib6_src.plen);
#else
seq_puts(seq, "00000000000000000000000000000000 00 ");
#endif
- if (rt->rt6i_flags & RTF_GATEWAY)
+ if (rt->fib6_flags & RTF_GATEWAY)
seq_printf(seq, "%pi6", &rt->fib6_nh.nh_gw);
else
seq_puts(seq, "00000000000000000000000000000000");
dev = rt->fib6_nh.nh_dev;
seq_printf(seq, " %08x %08x %08x %08x %8s\n",
- rt->rt6i_metric, atomic_read(&rt->rt6i_ref), 0,
- rt->rt6i_flags, dev ? dev->name : "");
+ rt->fib6_metric, atomic_read(&rt->fib6_ref), 0,
+ rt->fib6_flags, dev ? dev->name : "");
iter->w.leaf = NULL;
return 0;
}
@@ -2272,7 +2279,7 @@ static int ipv6_route_yield(struct fib6_walker *w)
do {
iter->w.leaf = rcu_dereference_protected(
- iter->w.leaf->rt6_next,
+ iter->w.leaf->fib6_next,
lockdep_is_held(&iter->tbl->tb6_lock));
iter->skip--;
if (!iter->skip && iter->w.leaf)
@@ -2338,7 +2345,7 @@ static void *ipv6_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
if (!v)
goto iter_table;
- n = rcu_dereference_bh(((struct fib6_info *)v)->rt6_next);
+ n = rcu_dereference_bh(((struct fib6_info *)v)->fib6_next);
if (n) {
++*pos;
return n;
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 69727bc168cb..04c69e0c84b3 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -807,7 +807,7 @@ static inline int ip6gre_xmit_ipv6(struct sk_buff *skb, struct net_device *dev)
}
/**
- * ip6_tnl_addr_conflict - compare packet addresses to tunnel's own
+ * ip6gre_tnl_addr_conflict - compare packet addresses to tunnel's own
* @t: the outgoing tunnel device
* @hdr: IPv6 header from the incoming packet
*
@@ -896,6 +896,7 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb,
struct flowi6 fl6;
int err = -EINVAL;
__u32 mtu;
+ int nhoff;
if (!ip6_tnl_xmit_ctl(t, &t->parms.laddr, &t->parms.raddr))
goto tx_err;
@@ -908,6 +909,11 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb,
truncate = true;
}
+ nhoff = skb_network_header(skb) - skb_mac_header(skb);
+ if (skb->protocol == htons(ETH_P_IP) &&
+ (ntohs(ip_hdr(skb)->tot_len) > skb->len - nhoff))
+ truncate = true;
+
if (skb_cow_head(skb, dev->needed_headroom))
goto tx_err;
diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index 4a87f9428ca5..5b3f2f89ef41 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -88,9 +88,11 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
if (skb->encapsulation &&
skb_shinfo(skb)->gso_type & (SKB_GSO_IPXIP4 | SKB_GSO_IPXIP6))
- udpfrag = proto == IPPROTO_UDP && encap;
+ udpfrag = proto == IPPROTO_UDP && encap &&
+ (skb_shinfo(skb)->gso_type & SKB_GSO_UDP);
else
- udpfrag = proto == IPPROTO_UDP && !skb->encapsulation;
+ udpfrag = proto == IPPROTO_UDP && !skb->encapsulation &&
+ (skb_shinfo(skb)->gso_type & SKB_GSO_UDP);
ops = rcu_dereference(inet6_offloads[proto]);
if (likely(ops && ops->callbacks.gso_segment)) {
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index cec49e137dbb..7f4493080df6 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -940,16 +940,21 @@ static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
* that's why we try it again later.
*/
if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
+ struct fib6_info *from;
struct rt6_info *rt;
bool had_dst = *dst != NULL;
if (!had_dst)
*dst = ip6_route_output(net, sk, fl6);
rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
- err = ip6_route_get_saddr(net, rt ? rt->from : NULL,
- &fl6->daddr,
+
+ rcu_read_lock();
+ from = rt ? rcu_dereference(rt->from) : NULL;
+ err = ip6_route_get_saddr(net, from, &fl6->daddr,
sk ? inet6_sk(sk)->srcprefs : 0,
&fl6->saddr);
+ rcu_read_unlock();
+
if (err)
goto out_err_release;
@@ -1213,6 +1218,8 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
if (mtu < IPV6_MIN_MTU)
return -EINVAL;
cork->base.fragsize = mtu;
+ cork->base.gso_size = sk->sk_type == SOCK_DGRAM ? ipc6->gso_size : 0;
+
if (dst_allfrag(xfrm_dst_path(&rt->dst)))
cork->base.flags |= IPCORK_ALLFRAG;
cork->base.length = 0;
@@ -1247,6 +1254,7 @@ static int __ip6_append_data(struct sock *sk,
int csummode = CHECKSUM_NONE;
unsigned int maxnonfragsize, headersize;
unsigned int wmem_alloc_delta = 0;
+ bool paged;
skb = skb_peek_tail(queue);
if (!skb) {
@@ -1254,7 +1262,8 @@ static int __ip6_append_data(struct sock *sk,
dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
}
- mtu = cork->fragsize;
+ paged = !!cork->gso_size;
+ mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
orig_mtu = mtu;
hh_len = LL_RESERVED_SPACE(rt->dst.dev);
@@ -1302,7 +1311,7 @@ emsgsize:
if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
headersize == sizeof(struct ipv6hdr) &&
length <= mtu - headersize &&
- !(flags & MSG_MORE) &&
+ (!(flags & MSG_MORE) || cork->gso_size) &&
rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
csummode = CHECKSUM_PARTIAL;
@@ -1345,6 +1354,7 @@ emsgsize:
unsigned int fraglen;
unsigned int fraggap;
unsigned int alloclen;
+ unsigned int pagedlen = 0;
alloc_new_skb:
/* There's no room in the current skb */
if (skb)
@@ -1367,11 +1377,17 @@ alloc_new_skb:
if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
+ fraglen = datalen + fragheaderlen;
+
if ((flags & MSG_MORE) &&
!(rt->dst.dev->features&NETIF_F_SG))
alloclen = mtu;
- else
- alloclen = datalen + fragheaderlen;
+ else if (!paged)
+ alloclen = fraglen;
+ else {
+ alloclen = min_t(int, fraglen, MAX_HEADER);
+ pagedlen = fraglen - alloclen;
+ }
alloclen += dst_exthdrlen;
@@ -1393,7 +1409,7 @@ alloc_new_skb:
*/
alloclen += sizeof(struct frag_hdr);
- copy = datalen - transhdrlen - fraggap;
+ copy = datalen - transhdrlen - fraggap - pagedlen;
if (copy < 0) {
err = -EINVAL;
goto error;
@@ -1432,7 +1448,7 @@ alloc_new_skb:
/*
* Find where to start putting bytes
*/
- data = skb_put(skb, fraglen);
+ data = skb_put(skb, fraglen - pagedlen);
skb_set_network_header(skb, exthdrlen);
data += fragheaderlen;
skb->transport_header = (skb->network_header +
@@ -1455,7 +1471,7 @@ alloc_new_skb:
}
offset += copy;
- length -= datalen - fraggap;
+ length -= copy + transhdrlen;
transhdrlen = 0;
exthdrlen = 0;
dst_exthdrlen = 0;
@@ -1728,9 +1744,9 @@ struct sk_buff *ip6_make_skb(struct sock *sk,
void *from, int length, int transhdrlen,
struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
struct rt6_info *rt, unsigned int flags,
+ struct inet_cork_full *cork,
const struct sockcm_cookie *sockc)
{
- struct inet_cork_full cork;
struct inet6_cork v6_cork;
struct sk_buff_head queue;
int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
@@ -1741,27 +1757,27 @@ struct sk_buff *ip6_make_skb(struct sock *sk,
__skb_queue_head_init(&queue);
- cork.base.flags = 0;
- cork.base.addr = 0;
- cork.base.opt = NULL;
- cork.base.dst = NULL;
+ cork->base.flags = 0;
+ cork->base.addr = 0;
+ cork->base.opt = NULL;
+ cork->base.dst = NULL;
v6_cork.opt = NULL;
- err = ip6_setup_cork(sk, &cork, &v6_cork, ipc6, rt, fl6);
+ err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6);
if (err) {
- ip6_cork_release(&cork, &v6_cork);
+ ip6_cork_release(cork, &v6_cork);
return ERR_PTR(err);
}
if (ipc6->dontfrag < 0)
ipc6->dontfrag = inet6_sk(sk)->dontfrag;
- err = __ip6_append_data(sk, fl6, &queue, &cork.base, &v6_cork,
+ err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork,
&current->task_frag, getfrag, from,
length + exthdrlen, transhdrlen + exthdrlen,
flags, ipc6, sockc);
if (err) {
- __ip6_flush_pending_frames(sk, &queue, &cork, &v6_cork);
+ __ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
return ERR_PTR(err);
}
- return __ip6_make_skb(sk, &queue, &cork, &v6_cork);
+ return __ip6_make_skb(sk, &queue, cork, &v6_cork);
}
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 298fd8b6ed17..20a419ee8000 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -180,7 +180,8 @@ static const struct nla_policy ip6mr_rule_policy[FRA_MAX + 1] = {
};
static int ip6mr_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
- struct fib_rule_hdr *frh, struct nlattr **tb)
+ struct fib_rule_hdr *frh, struct nlattr **tb,
+ struct netlink_ext_ack *extack)
{
return 0;
}
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 102645298692..9ac5366064e3 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1318,7 +1318,7 @@ static void ndisc_router_discovery(struct sk_buff *skb)
}
neigh->flags |= NTF_ROUTER;
} else if (rt) {
- rt->rt6i_flags = (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
+ rt->fib6_flags = (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
}
if (rt)
diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig
index ccbfa83e4bb0..ce77bcc2490c 100644
--- a/net/ipv6/netfilter/Kconfig
+++ b/net/ipv6/netfilter/Kconfig
@@ -48,6 +48,34 @@ config NFT_CHAIN_ROUTE_IPV6
fields such as the source, destination, flowlabel, hop-limit and
the packet mark.
+if NF_NAT_IPV6
+
+config NFT_CHAIN_NAT_IPV6
+ tristate "IPv6 nf_tables nat chain support"
+ help
+ This option enables the "nat" chain for IPv6 in nf_tables. This
+ chain type is used to perform Network Address Translation (NAT)
+ packet transformations such as the source, destination address and
+ source and destination ports.
+
+config NFT_MASQ_IPV6
+ tristate "IPv6 masquerade support for nf_tables"
+ depends on NFT_MASQ
+ select NF_NAT_MASQUERADE_IPV6
+ help
+ This is the expression that provides IPv4 masquerading support for
+ nf_tables.
+
+config NFT_REDIR_IPV6
+ tristate "IPv6 redirect support for nf_tables"
+ depends on NFT_REDIR
+ select NF_NAT_REDIRECT
+ help
+ This is the expression that provides IPv4 redirect support for
+ nf_tables.
+
+endif # NF_NAT_IPV6
+
config NFT_REJECT_IPV6
select NF_REJECT_IPV6
default NFT_REJECT
@@ -107,39 +135,12 @@ config NF_NAT_IPV6
if NF_NAT_IPV6
-config NFT_CHAIN_NAT_IPV6
- depends on NF_TABLES_IPV6
- tristate "IPv6 nf_tables nat chain support"
- help
- This option enables the "nat" chain for IPv6 in nf_tables. This
- chain type is used to perform Network Address Translation (NAT)
- packet transformations such as the source, destination address and
- source and destination ports.
-
config NF_NAT_MASQUERADE_IPV6
tristate "IPv6 masquerade support"
help
This is the kernel functionality to provide NAT in the masquerade
flavour (automatic source address selection) for IPv6.
-config NFT_MASQ_IPV6
- tristate "IPv6 masquerade support for nf_tables"
- depends on NF_TABLES_IPV6
- depends on NFT_MASQ
- select NF_NAT_MASQUERADE_IPV6
- help
- This is the expression that provides IPv4 masquerading support for
- nf_tables.
-
-config NFT_REDIR_IPV6
- tristate "IPv6 redirect support for nf_tables"
- depends on NF_TABLES_IPV6
- depends on NFT_REDIR
- select NF_NAT_REDIRECT
- help
- This is the expression that provides IPv4 redirect support for
- nf_tables.
-
endif # NF_NAT_IPV6
config IP6_NF_IPTABLES
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index f9c363327d62..daa3662da0ee 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -284,10 +284,10 @@ static const u32 ip6_template_metrics[RTAX_MAX] = {
};
static const struct fib6_info fib6_null_entry_template = {
- .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
- .rt6i_protocol = RTPROT_KERNEL,
- .rt6i_metric = ~(u32)0,
- .rt6i_ref = ATOMIC_INIT(1),
+ .fib6_flags = (RTF_REJECT | RTF_NONEXTHOP),
+ .fib6_protocol = RTPROT_KERNEL,
+ .fib6_metric = ~(u32)0,
+ .fib6_ref = ATOMIC_INIT(1),
.fib6_type = RTN_UNREACHABLE,
.fib6_metrics = (struct dst_metrics *)&dst_default_metrics,
};
@@ -359,7 +359,7 @@ EXPORT_SYMBOL(ip6_dst_alloc);
static void ip6_dst_destroy(struct dst_entry *dst)
{
struct rt6_info *rt = (struct rt6_info *)dst;
- struct fib6_info *from = rt->from;
+ struct fib6_info *from;
struct inet6_dev *idev;
dst_destroy_metrics_generic(dst);
@@ -371,8 +371,11 @@ static void ip6_dst_destroy(struct dst_entry *dst)
in6_dev_put(idev);
}
- rt->from = NULL;
+ rcu_read_lock();
+ from = rcu_dereference(rt->from);
+ rcu_assign_pointer(rt->from, NULL);
fib6_info_release(from);
+ rcu_read_unlock();
}
static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
@@ -402,12 +405,16 @@ static bool __rt6_check_expired(const struct rt6_info *rt)
static bool rt6_check_expired(const struct rt6_info *rt)
{
+ struct fib6_info *from;
+
+ from = rcu_dereference(rt->from);
+
if (rt->rt6i_flags & RTF_EXPIRES) {
if (time_after(jiffies, rt->dst.expires))
return true;
- } else if (rt->from) {
+ } else if (from) {
return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
- fib6_check_expired(rt->from);
+ fib6_check_expired(from);
}
return false;
}
@@ -429,8 +436,8 @@ static struct fib6_info *rt6_multipath_select(const struct net *net,
if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound))
return match;
- list_for_each_entry_safe(sibling, next_sibling, &match->rt6i_siblings,
- rt6i_siblings) {
+ list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
+ fib6_siblings) {
int nh_upper_bound;
nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound);
@@ -455,14 +462,13 @@ static inline struct fib6_info *rt6_device_match(struct net *net,
int oif,
int flags)
{
- struct fib6_info *local = NULL;
struct fib6_info *sprt;
if (!oif && ipv6_addr_any(saddr) &&
!(rt->fib6_nh.nh_flags & RTNH_F_DEAD))
return rt;
- for (sprt = rt; sprt; sprt = rcu_dereference(sprt->rt6_next)) {
+ for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) {
const struct net_device *dev = sprt->fib6_nh.nh_dev;
if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD)
@@ -471,17 +477,6 @@ static inline struct fib6_info *rt6_device_match(struct net *net,
if (oif) {
if (dev->ifindex == oif)
return sprt;
- if (dev->flags & IFF_LOOPBACK) {
- if (!sprt->rt6i_idev ||
- sprt->rt6i_idev->dev->ifindex != oif) {
- if (flags & RT6_LOOKUP_F_IFACE)
- continue;
- if (local &&
- local->rt6i_idev->dev->ifindex == oif)
- continue;
- }
- local = sprt;
- }
} else {
if (ipv6_chk_addr(net, saddr, dev,
flags & RT6_LOOKUP_F_IFACE))
@@ -489,13 +484,8 @@ static inline struct fib6_info *rt6_device_match(struct net *net,
}
}
- if (oif) {
- if (local)
- return local;
-
- if (flags & RT6_LOOKUP_F_IFACE)
- return net->ipv6.fib6_null_entry;
- }
+ if (oif && flags & RT6_LOOKUP_F_IFACE)
+ return net->ipv6.fib6_null_entry;
return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
}
@@ -534,7 +524,7 @@ static void rt6_probe(struct fib6_info *rt)
* Router Reachability Probe MUST be rate-limited
* to no more than one per minute.
*/
- if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
+ if (!rt || !(rt->fib6_flags & RTF_GATEWAY))
return;
nh_gw = &rt->fib6_nh.nh_gw;
@@ -542,15 +532,17 @@ static void rt6_probe(struct fib6_info *rt)
rcu_read_lock_bh();
neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
if (neigh) {
+ struct inet6_dev *idev;
+
if (neigh->nud_state & NUD_VALID)
goto out;
+ idev = __in6_dev_get(dev);
work = NULL;
write_lock(&neigh->lock);
if (!(neigh->nud_state & NUD_VALID) &&
time_after(jiffies,
- neigh->updated +
- rt->rt6i_idev->cnf.rtr_probe_interval)) {
+ neigh->updated + idev->cnf.rtr_probe_interval)) {
work = kmalloc(sizeof(*work), GFP_ATOMIC);
if (work)
__neigh_set_probe_once(neigh);
@@ -586,9 +578,6 @@ static inline int rt6_check_dev(struct fib6_info *rt, int oif)
if (!oif || dev->ifindex == oif)
return 2;
- if ((dev->flags & IFF_LOOPBACK) &&
- rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
- return 1;
return 0;
}
@@ -597,8 +586,8 @@ static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt)
enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
struct neighbour *neigh;
- if (rt->rt6i_flags & RTF_NONEXTHOP ||
- !(rt->rt6i_flags & RTF_GATEWAY))
+ if (rt->fib6_flags & RTF_NONEXTHOP ||
+ !(rt->fib6_flags & RTF_GATEWAY))
return RT6_NUD_SUCCEED;
rcu_read_lock_bh();
@@ -632,7 +621,7 @@ static int rt6_score_route(struct fib6_info *rt, int oif, int strict)
if (!m && (strict & RT6_LOOKUP_F_IFACE))
return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
- m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
+ m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2;
#endif
if (strict & RT6_LOOKUP_F_REACHABLE) {
int n = rt6_check_neigh(rt);
@@ -642,18 +631,32 @@ static int rt6_score_route(struct fib6_info *rt, int oif, int strict)
return m;
}
+/* called with rc_read_lock held */
+static inline bool fib6_ignore_linkdown(const struct fib6_info *f6i)
+{
+ const struct net_device *dev = fib6_info_nh_dev(f6i);
+ bool rc = false;
+
+ if (dev) {
+ const struct inet6_dev *idev = __in6_dev_get(dev);
+
+ rc = !!idev->cnf.ignore_routes_with_linkdown;
+ }
+
+ return rc;
+}
+
static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict,
int *mpri, struct fib6_info *match,
bool *do_rr)
{
int m;
bool match_do_rr = false;
- struct inet6_dev *idev = rt->rt6i_idev;
if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
goto out;
- if (idev->cnf.ignore_routes_with_linkdown &&
+ if (fib6_ignore_linkdown(rt) &&
rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
!(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
goto out;
@@ -693,8 +696,8 @@ static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
match = NULL;
cont = NULL;
- for (rt = rr_head; rt; rt = rcu_dereference(rt->rt6_next)) {
- if (rt->rt6i_metric != metric) {
+ for (rt = rr_head; rt; rt = rcu_dereference(rt->fib6_next)) {
+ if (rt->fib6_metric != metric) {
cont = rt;
break;
}
@@ -703,8 +706,8 @@ static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
}
for (rt = leaf; rt && rt != rr_head;
- rt = rcu_dereference(rt->rt6_next)) {
- if (rt->rt6i_metric != metric) {
+ rt = rcu_dereference(rt->fib6_next)) {
+ if (rt->fib6_metric != metric) {
cont = rt;
break;
}
@@ -715,7 +718,7 @@ static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
if (match || !cont)
return match;
- for (rt = cont; rt; rt = rcu_dereference(rt->rt6_next))
+ for (rt = cont; rt; rt = rcu_dereference(rt->fib6_next))
match = find_match(rt, oif, strict, &mpri, match, do_rr);
return match;
@@ -741,30 +744,30 @@ static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
* (This might happen if all routes under fn are deleted from
* the tree and fib6_repair_tree() is called on the node.)
*/
- key_plen = rt0->rt6i_dst.plen;
+ key_plen = rt0->fib6_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
- if (rt0->rt6i_src.plen)
- key_plen = rt0->rt6i_src.plen;
+ if (rt0->fib6_src.plen)
+ key_plen = rt0->fib6_src.plen;
#endif
if (fn->fn_bit != key_plen)
return net->ipv6.fib6_null_entry;
- match = find_rr_leaf(fn, leaf, rt0, rt0->rt6i_metric, oif, strict,
+ match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
&do_rr);
if (do_rr) {
- struct fib6_info *next = rcu_dereference(rt0->rt6_next);
+ struct fib6_info *next = rcu_dereference(rt0->fib6_next);
/* no entries matched; do round-robin */
- if (!next || next->rt6i_metric != rt0->rt6i_metric)
+ if (!next || next->fib6_metric != rt0->fib6_metric)
next = leaf;
if (next != rt0) {
- spin_lock_bh(&leaf->rt6i_table->tb6_lock);
+ spin_lock_bh(&leaf->fib6_table->tb6_lock);
/* make sure next is not being deleted from the tree */
- if (next->rt6i_node)
+ if (next->fib6_node)
rcu_assign_pointer(fn->rr_ptr, next);
- spin_unlock_bh(&leaf->rt6i_table->tb6_lock);
+ spin_unlock_bh(&leaf->fib6_table->tb6_lock);
}
}
@@ -773,7 +776,7 @@ static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
{
- return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
+ return (rt->fib6_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
}
#ifdef CONFIG_IPV6_ROUTE_INFO
@@ -837,8 +840,8 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
dev, pref);
else if (rt)
- rt->rt6i_flags = RTF_ROUTEINFO |
- (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
+ rt->fib6_flags = RTF_ROUTEINFO |
+ (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
if (rt) {
if (!addrconf_finite_timeout(lifetime))
@@ -861,13 +864,13 @@ static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
{
struct net_device *dev = rt->fib6_nh.nh_dev;
- if (rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) {
+ if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
/* for copies of local routes, dst->dev needs to be the
* device if it is a master device, the master device if
* device is enslaved, and the loopback as the default
*/
if (netif_is_l3_slave(dev) &&
- !rt6_need_strict(&rt->rt6i_dst.addr))
+ !rt6_need_strict(&rt->fib6_dst.addr))
dev = l3mdev_master_dev_rcu(dev);
else if (!netif_is_l3_master(dev))
dev = dev_net(dev)->loopback_dev;
@@ -939,7 +942,7 @@ static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
{
rt->dst.flags |= fib6_info_dst_flags(ort);
- if (ort->rt6i_flags & RTF_REJECT) {
+ if (ort->fib6_flags & RTF_REJECT) {
ip6_rt_init_dst_reject(rt, ort);
return;
}
@@ -949,7 +952,7 @@ static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
if (ort->fib6_type == RTN_LOCAL) {
rt->dst.input = ip6_input;
- } else if (ipv6_addr_type(&ort->rt6i_dst.addr) & IPV6_ADDR_MULTICAST) {
+ } else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
rt->dst.input = ip6_mc_input;
} else {
rt->dst.input = ip6_forward;
@@ -967,7 +970,7 @@ static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
{
rt->rt6i_flags &= ~RTF_EXPIRES;
fib6_info_hold(from);
- rt->from = from;
+ rcu_assign_pointer(rt->from, from);
dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true);
if (from->fib6_metrics != &dst_default_metrics) {
rt->dst._metrics |= DST_METRICS_REFCOUNTED;
@@ -977,19 +980,19 @@ static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
{
+ struct net_device *dev = fib6_info_nh_dev(ort);
+
ip6_rt_init_dst(rt, ort);
- rt->rt6i_dst = ort->rt6i_dst;
- rt->rt6i_idev = ort->rt6i_idev;
- if (rt->rt6i_idev)
- in6_dev_hold(rt->rt6i_idev);
+ rt->rt6i_dst = ort->fib6_dst;
+ rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
rt->rt6i_gateway = ort->fib6_nh.nh_gw;
- rt->rt6i_flags = ort->rt6i_flags;
+ rt->rt6i_flags = ort->fib6_flags;
rt6_set_from(rt, ort);
#ifdef CONFIG_IPV6_SUBTREES
- rt->rt6i_src = ort->rt6i_src;
+ rt->rt6i_src = ort->fib6_src;
#endif
- rt->rt6i_prefsrc = ort->rt6i_prefsrc;
+ rt->rt6i_prefsrc = ort->fib6_prefsrc;
rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
}
@@ -1064,7 +1067,7 @@ restart:
} else {
f6i = rt6_device_match(net, f6i, &fl6->saddr,
fl6->flowi6_oif, flags);
- if (f6i->rt6i_nsiblings && fl6->flowi6_oif == 0)
+ if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
f6i = rt6_multipath_select(net, f6i, fl6,
fl6->flowi6_oif, skb, flags);
}
@@ -1142,7 +1145,7 @@ static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
int err;
struct fib6_table *table;
- table = rt->rt6i_table;
+ table = rt->fib6_table;
spin_lock_bh(&table->tb6_lock);
err = fib6_add(&table->tb6_root, rt, info, extack);
spin_unlock_bh(&table->tb6_lock);
@@ -1168,10 +1171,8 @@ static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
* Clone the route.
*/
- rcu_read_lock();
dev = ip6_rt_get_dev_rcu(ort);
rt = ip6_dst_alloc(dev_net(dev), dev, 0);
- rcu_read_unlock();
if (!rt)
return NULL;
@@ -1182,8 +1183,8 @@ static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
rt->rt6i_dst.plen = 128;
if (!rt6_is_gw_or_nonexthop(ort)) {
- if (ort->rt6i_dst.plen != 128 &&
- ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
+ if (ort->fib6_dst.plen != 128 &&
+ ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
if (rt->rt6i_src.plen && saddr) {
@@ -1375,7 +1376,18 @@ static unsigned int fib6_mtu(const struct fib6_info *rt)
{
unsigned int mtu;
- mtu = rt->fib6_pmtu ? : rt->rt6i_idev->cnf.mtu6;
+ if (rt->fib6_pmtu) {
+ mtu = rt->fib6_pmtu;
+ } else {
+ struct net_device *dev = fib6_info_nh_dev(rt);
+ struct inet6_dev *idev;
+
+ rcu_read_lock();
+ idev = __in6_dev_get(dev);
+ mtu = idev->cnf.mtu6;
+ rcu_read_unlock();
+ }
+
mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu);
@@ -1416,14 +1428,14 @@ static int rt6_insert_exception(struct rt6_info *nrt,
* Otherwise, the exception table is indexed by
* a hash of only rt6i_dst.
*/
- if (ort->rt6i_src.plen)
+ if (ort->fib6_src.plen)
src_key = &nrt->rt6i_src.addr;
#endif
/* Update rt6i_prefsrc as it could be changed
* in rt6_remove_prefsrc()
*/
- nrt->rt6i_prefsrc = ort->rt6i_prefsrc;
+ nrt->rt6i_prefsrc = ort->fib6_prefsrc;
/* rt6_mtu_change() might lower mtu on ort.
* Only insert this exception route if its mtu
* is less than ort's mtu value.
@@ -1457,9 +1469,9 @@ out:
/* Update fn->fn_sernum to invalidate all cached dst */
if (!err) {
- spin_lock_bh(&ort->rt6i_table->tb6_lock);
+ spin_lock_bh(&ort->fib6_table->tb6_lock);
fib6_update_sernum(net, ort);
- spin_unlock_bh(&ort->rt6i_table->tb6_lock);
+ spin_unlock_bh(&ort->fib6_table->tb6_lock);
fib6_force_start_gc(net);
}
@@ -1514,7 +1526,7 @@ static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
* Otherwise, the exception table is indexed by
* a hash of only rt6i_dst.
*/
- if (rt->rt6i_src.plen)
+ if (rt->fib6_src.plen)
src_key = saddr;
#endif
rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
@@ -1529,11 +1541,12 @@ static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
static int rt6_remove_exception_rt(struct rt6_info *rt)
{
struct rt6_exception_bucket *bucket;
- struct fib6_info *from = rt->from;
struct in6_addr *src_key = NULL;
struct rt6_exception *rt6_ex;
+ struct fib6_info *from;
int err;
+ from = rcu_dereference(rt->from);
if (!from ||
!(rt->rt6i_flags & RTF_CACHE))
return -EINVAL;
@@ -1551,7 +1564,7 @@ static int rt6_remove_exception_rt(struct rt6_info *rt)
* Otherwise, the exception table is indexed by
* a hash of only rt6i_dst.
*/
- if (from->rt6i_src.plen)
+ if (from->fib6_src.plen)
src_key = &rt->rt6i_src.addr;
#endif
rt6_ex = __rt6_find_exception_spinlock(&bucket,
@@ -1592,7 +1605,7 @@ static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
* Otherwise, the exception table is indexed by
* a hash of only rt6i_dst.
*/
- if (from->rt6i_src.plen)
+ if (from->fib6_src.plen)
src_key = &rt->rt6i_src.addr;
#endif
rt6_ex = __rt6_find_exception_rcu(&bucket,
@@ -1810,7 +1823,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
redo_rt6_select:
f6i = rt6_select(net, fn, oif, strict);
- if (f6i->rt6i_nsiblings)
+ if (f6i->fib6_nsiblings)
f6i = rt6_multipath_select(net, f6i, fl6, oif, skb, strict);
if (f6i == net->ipv6.fib6_null_entry) {
fn = fib6_backtrack(fn, &fl6->saddr);
@@ -1842,20 +1855,17 @@ redo_rt6_select:
trace_fib6_table_lookup(net, rt, table, fl6);
return rt;
} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
- !(f6i->rt6i_flags & RTF_GATEWAY))) {
+ !(f6i->fib6_flags & RTF_GATEWAY))) {
/* Create a RTF_CACHE clone which will not be
* owned by the fib6 tree. It is for the special case where
* the daddr in the skb during the neighbor look-up is different
* from the fl6->daddr used to look-up route here.
*/
-
struct rt6_info *uncached_rt;
- fib6_info_hold(f6i);
- rcu_read_unlock();
-
uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);
- fib6_info_release(f6i);
+
+ rcu_read_unlock();
if (uncached_rt) {
/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
@@ -1922,11 +1932,16 @@ static void ip6_multipath_l3_keys(const struct sk_buff *skb,
const struct ipv6hdr *inner_iph;
const struct icmp6hdr *icmph;
struct ipv6hdr _inner_iph;
+ struct icmp6hdr _icmph;
if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
goto out;
- icmph = icmp6_hdr(skb);
+ icmph = skb_header_pointer(skb, skb_transport_offset(skb),
+ sizeof(_icmph), &_icmph);
+ if (!icmph)
+ goto out;
+
if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
@@ -2121,8 +2136,7 @@ static bool fib6_check(struct fib6_info *f6i, u32 cookie)
{
u32 rt_cookie = 0;
- if ((f6i && !rt6_get_cookie_safe(f6i, &rt_cookie)) ||
- rt_cookie != cookie)
+ if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
return false;
if (fib6_check_expired(f6i))
@@ -2131,11 +2145,13 @@ static bool fib6_check(struct fib6_info *f6i, u32 cookie)
return true;
}
-static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
+static struct dst_entry *rt6_check(struct rt6_info *rt,
+ struct fib6_info *from,
+ u32 cookie)
{
u32 rt_cookie = 0;
- if ((rt->from && !rt6_get_cookie_safe(rt->from, &rt_cookie)) ||
+ if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
rt_cookie != cookie)
return NULL;
@@ -2145,11 +2161,13 @@ static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
return &rt->dst;
}
-static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
+static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
+ struct fib6_info *from,
+ u32 cookie)
{
if (!__rt6_check_expired(rt) &&
rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
- fib6_check(rt->from, cookie))
+ fib6_check(from, cookie))
return &rt->dst;
else
return NULL;
@@ -2157,20 +2175,30 @@ static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
{
+ struct dst_entry *dst_ret;
+ struct fib6_info *from;
struct rt6_info *rt;
- rt = (struct rt6_info *) dst;
+ rt = container_of(dst, struct rt6_info, dst);
+
+ rcu_read_lock();
/* All IPV6 dsts are created with ->obsolete set to the value
* DST_OBSOLETE_FORCE_CHK which forces validation calls down
* into this function always.
*/
- if (rt->rt6i_flags & RTF_PCPU ||
- (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->from))
- return rt6_dst_from_check(rt, cookie);
+ from = rcu_dereference(rt->from);
+
+ if (from && (rt->rt6i_flags & RTF_PCPU ||
+ unlikely(!list_empty(&rt->rt6i_uncached))))
+ dst_ret = rt6_dst_from_check(rt, from, cookie);
else
- return rt6_check(rt, cookie);
+ dst_ret = rt6_check(rt, from, cookie);
+
+ rcu_read_unlock();
+
+ return dst_ret;
}
static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
@@ -2179,10 +2207,12 @@ static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
if (rt) {
if (rt->rt6i_flags & RTF_CACHE) {
+ rcu_read_lock();
if (rt6_check_expired(rt)) {
rt6_remove_exception_rt(rt);
dst = NULL;
}
+ rcu_read_unlock();
} else {
dst_release(dst);
dst = NULL;
@@ -2199,21 +2229,41 @@ static void ip6_link_failure(struct sk_buff *skb)
rt = (struct rt6_info *) skb_dst(skb);
if (rt) {
+ rcu_read_lock();
if (rt->rt6i_flags & RTF_CACHE) {
if (dst_hold_safe(&rt->dst))
rt6_remove_exception_rt(rt);
- } else if (rt->from) {
+ } else {
+ struct fib6_info *from;
struct fib6_node *fn;
- rcu_read_lock();
- fn = rcu_dereference(rt->from->rt6i_node);
- if (fn && (rt->rt6i_flags & RTF_DEFAULT))
- fn->fn_sernum = -1;
- rcu_read_unlock();
+ from = rcu_dereference(rt->from);
+ if (from) {
+ fn = rcu_dereference(from->fib6_node);
+ if (fn && (rt->rt6i_flags & RTF_DEFAULT))
+ fn->fn_sernum = -1;
+ }
}
+ rcu_read_unlock();
}
}
+static void rt6_update_expires(struct rt6_info *rt0, int timeout)
+{
+ if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
+ struct fib6_info *from;
+
+ rcu_read_lock();
+ from = rcu_dereference(rt0->from);
+ if (from)
+ rt0->dst.expires = from->expires;
+ rcu_read_unlock();
+ }
+
+ dst_set_expires(&rt0->dst, timeout);
+ rt0->rt6i_flags |= RTF_EXPIRES;
+}
+
static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
{
struct net *net = dev_net(rt->dst.dev);
@@ -2225,8 +2275,14 @@ static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
{
+ bool from_set;
+
+ rcu_read_lock();
+ from_set = !!rcu_dereference(rt->from);
+ rcu_read_unlock();
+
return !(rt->rt6i_flags & RTF_CACHE) &&
- (rt->rt6i_flags & RTF_PCPU || rt->from);
+ (rt->rt6i_flags & RTF_PCPU || from_set);
}
static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
@@ -2262,14 +2318,18 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
if (rt6->rt6i_flags & RTF_CACHE)
rt6_update_exception_stamp_rt(rt6);
} else if (daddr) {
+ struct fib6_info *from;
struct rt6_info *nrt6;
- nrt6 = ip6_rt_cache_alloc(rt6->from, daddr, saddr);
+ rcu_read_lock();
+ from = rcu_dereference(rt6->from);
+ nrt6 = ip6_rt_cache_alloc(from, daddr, saddr);
if (nrt6) {
rt6_do_update_pmtu(nrt6, mtu);
- if (rt6_insert_exception(nrt6, rt6->from))
+ if (rt6_insert_exception(nrt6, from))
dst_release_immediate(&nrt6->dst);
}
+ rcu_read_unlock();
}
}
@@ -2372,9 +2432,9 @@ restart:
continue;
if (fib6_check_expired(rt))
continue;
- if (rt->rt6i_flags & RTF_REJECT)
+ if (rt->fib6_flags & RTF_REJECT)
break;
- if (!(rt->rt6i_flags & RTF_GATEWAY))
+ if (!(rt->fib6_flags & RTF_GATEWAY))
continue;
if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex)
continue;
@@ -2400,7 +2460,7 @@ restart:
if (!rt)
rt = net->ipv6.fib6_null_entry;
- else if (rt->rt6i_flags & RTF_REJECT) {
+ else if (rt->fib6_flags & RTF_REJECT) {
ret = net->ipv6.ip6_null_entry;
goto out;
}
@@ -2600,21 +2660,19 @@ out:
static int ip6_convert_metrics(struct net *net, struct fib6_info *rt,
struct fib6_config *cfg)
{
- int err = 0;
+ struct dst_metrics *p;
- if (cfg->fc_mx) {
- rt->fib6_metrics = kzalloc(sizeof(*rt->fib6_metrics),
- GFP_KERNEL);
- if (unlikely(!rt->fib6_metrics))
- return -ENOMEM;
+ if (!cfg->fc_mx)
+ return 0;
- refcount_set(&rt->fib6_metrics->refcnt, 1);
+ p = kzalloc(sizeof(*rt->fib6_metrics), GFP_KERNEL);
+ if (unlikely(!p))
+ return -ENOMEM;
- err = ip_metrics_convert(net, cfg->fc_mx, cfg->fc_mx_len,
- rt->fib6_metrics->metrics);
- }
+ refcount_set(&p->refcnt, 1);
+ rt->fib6_metrics = p;
- return err;
+ return ip_metrics_convert(net, cfg->fc_mx, cfg->fc_mx_len, p->metrics);
}
static struct rt6_info *ip6_nh_lookup_table(struct net *net,
@@ -2907,7 +2965,7 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
if (cfg->fc_protocol == RTPROT_UNSPEC)
cfg->fc_protocol = RTPROT_BOOT;
- rt->rt6i_protocol = cfg->fc_protocol;
+ rt->fib6_protocol = cfg->fc_protocol;
addr_type = ipv6_addr_type(&cfg->fc_dst);
@@ -2922,17 +2980,17 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate);
}
- ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
- rt->rt6i_dst.plen = cfg->fc_dst_len;
- if (rt->rt6i_dst.plen == 128)
+ ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
+ rt->fib6_dst.plen = cfg->fc_dst_len;
+ if (rt->fib6_dst.plen == 128)
rt->dst_host = true;
#ifdef CONFIG_IPV6_SUBTREES
- ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
- rt->rt6i_src.plen = cfg->fc_src_len;
+ ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
+ rt->fib6_src.plen = cfg->fc_src_len;
#endif
- rt->rt6i_metric = cfg->fc_metric;
+ rt->fib6_metric = cfg->fc_metric;
rt->fib6_nh.nh_weight = 1;
rt->fib6_type = cfg->fc_type;
@@ -2958,7 +3016,7 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
goto out;
}
}
- rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
+ rt->fib6_flags = RTF_REJECT|RTF_NONEXTHOP;
goto install_route;
}
@@ -2992,24 +3050,26 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
err = -EINVAL;
goto out;
}
- rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
- rt->rt6i_prefsrc.plen = 128;
+ rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
+ rt->fib6_prefsrc.plen = 128;
} else
- rt->rt6i_prefsrc.plen = 0;
+ rt->fib6_prefsrc.plen = 0;
- rt->rt6i_flags = cfg->fc_flags;
+ rt->fib6_flags = cfg->fc_flags;
install_route:
- if (!(rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
+ if (!(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
!netif_carrier_ok(dev))
rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
rt->fib6_nh.nh_dev = dev;
- rt->rt6i_idev = idev;
- rt->rt6i_table = table;
+ rt->fib6_table = table;
cfg->fc_nlinfo.nl_net = dev_net(dev);
+ if (idev)
+ in6_dev_put(idev);
+
return rt;
out:
if (dev)
@@ -3048,7 +3108,7 @@ static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
goto out;
}
- table = rt->rt6i_table;
+ table = rt->fib6_table;
spin_lock_bh(&table->tb6_lock);
err = fib6_del(rt, info);
spin_unlock_bh(&table->tb6_lock);
@@ -3075,10 +3135,10 @@ static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
if (rt == net->ipv6.fib6_null_entry)
goto out_put;
- table = rt->rt6i_table;
+ table = rt->fib6_table;
spin_lock_bh(&table->tb6_lock);
- if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) {
+ if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
struct fib6_info *sibling, *next_sibling;
/* prefer to send a single notification with all hops */
@@ -3096,8 +3156,8 @@ static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
}
list_for_each_entry_safe(sibling, next_sibling,
- &rt->rt6i_siblings,
- rt6i_siblings) {
+ &rt->fib6_siblings,
+ fib6_siblings) {
err = fib6_del(sibling, info);
if (err)
goto out_unlock;
@@ -3176,9 +3236,9 @@ static int ip6_route_del(struct fib6_config *cfg,
if (cfg->fc_flags & RTF_GATEWAY &&
!ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw))
continue;
- if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
+ if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
continue;
- if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol)
+ if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
continue;
fib6_info_hold(rt);
rcu_read_unlock();
@@ -3202,6 +3262,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
struct ndisc_options ndopts;
struct inet6_dev *in6_dev;
struct neighbour *neigh;
+ struct fib6_info *from;
struct rd_msg *msg;
int optlen, on_link;
u8 *lladdr;
@@ -3283,7 +3344,12 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
NEIGH_UPDATE_F_ISROUTER)),
NDISC_REDIRECT, &ndopts);
- nrt = ip6_rt_cache_alloc(rt->from, &msg->dest, NULL);
+ rcu_read_lock();
+ from = rcu_dereference(rt->from);
+ fib6_info_hold(from);
+ rcu_read_unlock();
+
+ nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL);
if (!nrt)
goto out;
@@ -3297,7 +3363,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
* a cached route because rt6_insert_exception() will
* takes care of it
*/
- if (rt6_insert_exception(nrt, rt->from)) {
+ if (rt6_insert_exception(nrt, from)) {
dst_release_immediate(&nrt->dst);
goto out;
}
@@ -3309,6 +3375,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
out:
+ fib6_info_release(from);
neigh_release(neigh);
}
@@ -3336,7 +3403,7 @@ static struct fib6_info *rt6_get_route_info(struct net *net,
for_each_fib6_node_rt_rcu(fn) {
if (rt->fib6_nh.nh_dev->ifindex != ifindex)
continue;
- if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
+ if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
continue;
if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr))
continue;
@@ -3396,7 +3463,7 @@ struct fib6_info *rt6_get_dflt_router(struct net *net,
rcu_read_lock();
for_each_fib6_node_rt_rcu(&table->tb6_root) {
if (dev == rt->fib6_nh.nh_dev &&
- ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
+ ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr))
break;
}
@@ -3445,8 +3512,11 @@ static void __rt6_purge_dflt_routers(struct net *net,
restart:
rcu_read_lock();
for_each_fib6_node_rt_rcu(&table->tb6_root) {
- if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
- (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
+ struct net_device *dev = fib6_info_nh_dev(rt);
+ struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;
+
+ if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
+ (!idev || idev->cnf.accept_ra != 2)) {
fib6_info_hold(rt);
rcu_read_unlock();
ip6_del_rt(net, rt);
@@ -3591,44 +3661,40 @@ static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff
* Allocate a dst for local (unicast / anycast) address.
*/
-struct fib6_info *addrconf_dst_alloc(struct net *net,
- struct inet6_dev *idev,
- const struct in6_addr *addr,
- bool anycast, gfp_t gfp_flags)
+struct fib6_info *addrconf_f6i_alloc(struct net *net,
+ struct inet6_dev *idev,
+ const struct in6_addr *addr,
+ bool anycast, gfp_t gfp_flags)
{
u32 tb_id;
struct net_device *dev = idev->dev;
- struct fib6_info *rt;
+ struct fib6_info *f6i;
- rt = fib6_info_alloc(gfp_flags);
- if (!rt)
+ f6i = fib6_info_alloc(gfp_flags);
+ if (!f6i)
return ERR_PTR(-ENOMEM);
- rt->dst_nocount = true;
-
- in6_dev_hold(idev);
- rt->rt6i_idev = idev;
-
- rt->dst_host = true;
- rt->rt6i_protocol = RTPROT_KERNEL;
- rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
+ f6i->dst_nocount = true;
+ f6i->dst_host = true;
+ f6i->fib6_protocol = RTPROT_KERNEL;
+ f6i->fib6_flags = RTF_UP | RTF_NONEXTHOP;
if (anycast) {
- rt->fib6_type = RTN_ANYCAST;
- rt->rt6i_flags |= RTF_ANYCAST;
+ f6i->fib6_type = RTN_ANYCAST;
+ f6i->fib6_flags |= RTF_ANYCAST;
} else {
- rt->fib6_type = RTN_LOCAL;
- rt->rt6i_flags |= RTF_LOCAL;
+ f6i->fib6_type = RTN_LOCAL;
+ f6i->fib6_flags |= RTF_LOCAL;
}
- rt->fib6_nh.nh_gw = *addr;
+ f6i->fib6_nh.nh_gw = *addr;
dev_hold(dev);
- rt->fib6_nh.nh_dev = dev;
- rt->rt6i_dst.addr = *addr;
- rt->rt6i_dst.plen = 128;
+ f6i->fib6_nh.nh_dev = dev;
+ f6i->fib6_dst.addr = *addr;
+ f6i->fib6_dst.plen = 128;
tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
- rt->rt6i_table = fib6_get_table(net, tb_id);
+ f6i->fib6_table = fib6_get_table(net, tb_id);
- return rt;
+ return f6i;
}
/* remove deleted ip from prefsrc entries */
@@ -3646,10 +3712,10 @@ static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
if (((void *)rt->fib6_nh.nh_dev == dev || !dev) &&
rt != net->ipv6.fib6_null_entry &&
- ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
+ ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
spin_lock_bh(&rt6_exception_lock);
/* remove prefsrc entry */
- rt->rt6i_prefsrc.plen = 0;
+ rt->fib6_prefsrc.plen = 0;
/* need to update cache as well */
rt6_exceptions_remove_prefsrc(rt);
spin_unlock_bh(&rt6_exception_lock);
@@ -3675,7 +3741,7 @@ static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
{
struct in6_addr *gateway = (struct in6_addr *)arg;
- if (((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
+ if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) {
return -1;
}
@@ -3707,16 +3773,16 @@ static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
struct fib6_info *iter;
struct fib6_node *fn;
- fn = rcu_dereference_protected(rt->rt6i_node,
- lockdep_is_held(&rt->rt6i_table->tb6_lock));
+ fn = rcu_dereference_protected(rt->fib6_node,
+ lockdep_is_held(&rt->fib6_table->tb6_lock));
iter = rcu_dereference_protected(fn->leaf,
- lockdep_is_held(&rt->rt6i_table->tb6_lock));
+ lockdep_is_held(&rt->fib6_table->tb6_lock));
while (iter) {
- if (iter->rt6i_metric == rt->rt6i_metric &&
+ if (iter->fib6_metric == rt->fib6_metric &&
rt6_qualify_for_ecmp(iter))
return iter;
- iter = rcu_dereference_protected(iter->rt6_next,
- lockdep_is_held(&rt->rt6i_table->tb6_lock));
+ iter = rcu_dereference_protected(iter->fib6_next,
+ lockdep_is_held(&rt->fib6_table->tb6_lock));
}
return NULL;
@@ -3726,7 +3792,7 @@ static bool rt6_is_dead(const struct fib6_info *rt)
{
if (rt->fib6_nh.nh_flags & RTNH_F_DEAD ||
(rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
- rt->rt6i_idev->cnf.ignore_routes_with_linkdown))
+ fib6_ignore_linkdown(rt)))
return true;
return false;
@@ -3740,7 +3806,7 @@ static int rt6_multipath_total_weight(const struct fib6_info *rt)
if (!rt6_is_dead(rt))
total += rt->fib6_nh.nh_weight;
- list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) {
+ list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
if (!rt6_is_dead(iter))
total += iter->fib6_nh.nh_weight;
}
@@ -3767,7 +3833,7 @@ static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
rt6_upper_bound_set(rt, &weight, total);
- list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
+ list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
rt6_upper_bound_set(iter, &weight, total);
}
@@ -3780,7 +3846,7 @@ void rt6_multipath_rebalance(struct fib6_info *rt)
* then there is no need to rebalance upon the removal of every
* sibling route.
*/
- if (!rt->rt6i_nsiblings || rt->should_flush)
+ if (!rt->fib6_nsiblings || rt->should_flush)
return;
/* During lookup routes are evaluated in order, so we need to
@@ -3831,7 +3897,7 @@ static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
if (rt->fib6_nh.nh_dev == dev)
return true;
- list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
+ list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
if (iter->fib6_nh.nh_dev == dev)
return true;
@@ -3843,7 +3909,7 @@ static void rt6_multipath_flush(struct fib6_info *rt)
struct fib6_info *iter;
rt->should_flush = 1;
- list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
+ list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
iter->should_flush = 1;
}
@@ -3856,7 +3922,7 @@ static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
if (rt->fib6_nh.nh_dev == down_dev ||
rt->fib6_nh.nh_flags & RTNH_F_DEAD)
dead++;
- list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
+ list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
if (iter->fib6_nh.nh_dev == down_dev ||
iter->fib6_nh.nh_flags & RTNH_F_DEAD)
dead++;
@@ -3872,7 +3938,7 @@ static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
if (rt->fib6_nh.nh_dev == dev)
rt->fib6_nh.nh_flags |= nh_flags;
- list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
+ list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
if (iter->fib6_nh.nh_dev == dev)
iter->fib6_nh.nh_flags |= nh_flags;
}
@@ -3893,13 +3959,13 @@ static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
case NETDEV_DOWN:
if (rt->should_flush)
return -1;
- if (!rt->rt6i_nsiblings)
+ if (!rt->fib6_nsiblings)
return rt->fib6_nh.nh_dev == dev ? -1 : 0;
if (rt6_multipath_uses_dev(rt, dev)) {
unsigned int count;
count = rt6_multipath_dead_count(rt, dev);
- if (rt->rt6i_nsiblings + 1 == count) {
+ if (rt->fib6_nsiblings + 1 == count) {
rt6_multipath_flush(rt);
return -1;
}
@@ -3911,7 +3977,7 @@ static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
return -2;
case NETDEV_CHANGE:
if (rt->fib6_nh.nh_dev != dev ||
- rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST))
+ rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
break;
rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
rt6_multipath_rebalance(rt);
@@ -3992,6 +4058,7 @@ void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
[RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
+ [RTA_PREFSRC] = { .len = sizeof(struct in6_addr) },
[RTA_OIF] = { .type = NLA_U32 },
[RTA_IIF] = { .type = NLA_U32 },
[RTA_PRIORITY] = { .type = NLA_U32 },
@@ -4003,6 +4070,7 @@ static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
[RTA_EXPIRES] = { .type = NLA_U32 },
[RTA_UID] = { .type = NLA_U32 },
[RTA_MARK] = { .type = NLA_U32 },
+ [RTA_TABLE] = { .type = NLA_U32 },
};
static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
@@ -4188,10 +4256,10 @@ static void ip6_route_mpath_notify(struct fib6_info *rt,
* nexthop. Since sibling routes are always added at the end of
* the list, find the first sibling of the last route appended
*/
- if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) {
- rt = list_first_entry(&rt_last->rt6i_siblings,
+ if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
+ rt = list_first_entry(&rt_last->fib6_siblings,
struct fib6_info,
- rt6i_siblings);
+ fib6_siblings);
}
if (rt)
@@ -4410,13 +4478,13 @@ static size_t rt6_nlmsg_size(struct fib6_info *rt)
{
int nexthop_len = 0;
- if (rt->rt6i_nsiblings) {
+ if (rt->fib6_nsiblings) {
nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */
+ NLA_ALIGN(sizeof(struct rtnexthop))
+ nla_total_size(16) /* RTA_GATEWAY */
+ lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate);
- nexthop_len *= rt->rt6i_nsiblings;
+ nexthop_len *= rt->fib6_nsiblings;
}
return NLMSG_ALIGN(sizeof(struct rtmsg))
@@ -4444,11 +4512,14 @@ static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt,
if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) {
*flags |= RTNH_F_LINKDOWN;
- if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
+
+ rcu_read_lock();
+ if (fib6_ignore_linkdown(rt))
*flags |= RTNH_F_DEAD;
+ rcu_read_unlock();
}
- if (rt->rt6i_flags & RTF_GATEWAY) {
+ if (rt->fib6_flags & RTF_GATEWAY) {
if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0)
goto nla_put_failure;
}
@@ -4518,11 +4589,11 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
rtm = nlmsg_data(nlh);
rtm->rtm_family = AF_INET6;
- rtm->rtm_dst_len = rt->rt6i_dst.plen;
- rtm->rtm_src_len = rt->rt6i_src.plen;
+ rtm->rtm_dst_len = rt->fib6_dst.plen;
+ rtm->rtm_src_len = rt->fib6_src.plen;
rtm->rtm_tos = 0;
- if (rt->rt6i_table)
- table = rt->rt6i_table->tb6_id;
+ if (rt->fib6_table)
+ table = rt->fib6_table->tb6_id;
else
table = RT6_TABLE_UNSPEC;
rtm->rtm_table = table;
@@ -4532,9 +4603,9 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
rtm->rtm_type = rt->fib6_type;
rtm->rtm_flags = 0;
rtm->rtm_scope = RT_SCOPE_UNIVERSE;
- rtm->rtm_protocol = rt->rt6i_protocol;
+ rtm->rtm_protocol = rt->fib6_protocol;
- if (rt->rt6i_flags & RTF_CACHE)
+ if (rt->fib6_flags & RTF_CACHE)
rtm->rtm_flags |= RTM_F_CLONED;
if (dest) {
@@ -4542,7 +4613,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
goto nla_put_failure;
rtm->rtm_dst_len = 128;
} else if (rtm->rtm_dst_len)
- if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
+ if (nla_put_in6_addr(skb, RTA_DST, &rt->fib6_dst.addr))
goto nla_put_failure;
#ifdef CONFIG_IPV6_SUBTREES
if (src) {
@@ -4550,12 +4621,12 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
goto nla_put_failure;
rtm->rtm_src_len = 128;
} else if (rtm->rtm_src_len &&
- nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
+ nla_put_in6_addr(skb, RTA_SRC, &rt->fib6_src.addr))
goto nla_put_failure;
#endif
if (iif) {
#ifdef CONFIG_IPV6_MROUTE
- if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
+ if (ipv6_addr_is_multicast(&rt->fib6_dst.addr)) {
int err = ip6mr_get_route(net, skb, rtm, portid);
if (err == 0)
@@ -4573,9 +4644,9 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
goto nla_put_failure;
}
- if (rt->rt6i_prefsrc.plen) {
+ if (rt->fib6_prefsrc.plen) {
struct in6_addr saddr_buf;
- saddr_buf = rt->rt6i_prefsrc.addr;
+ saddr_buf = rt->fib6_prefsrc.addr;
if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
goto nla_put_failure;
}
@@ -4584,13 +4655,13 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
if (rtnetlink_put_metrics(skb, pmetrics) < 0)
goto nla_put_failure;
- if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
+ if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
goto nla_put_failure;
/* For multipath routes, walk the siblings list and add
* each as a nexthop within RTA_MULTIPATH.
*/
- if (rt->rt6i_nsiblings) {
+ if (rt->fib6_nsiblings) {
struct fib6_info *sibling, *next_sibling;
struct nlattr *mp;
@@ -4602,7 +4673,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
goto nla_put_failure;
list_for_each_entry_safe(sibling, next_sibling,
- &rt->rt6i_siblings, rt6i_siblings) {
+ &rt->fib6_siblings, fib6_siblings) {
if (rt6_add_nexthop(skb, sibling) < 0)
goto nla_put_failure;
}
@@ -4613,7 +4684,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
goto nla_put_failure;
}
- if (rt->rt6i_flags & RTF_EXPIRES) {
+ if (rt->fib6_flags & RTF_EXPIRES) {
expires = dst ? dst->expires : rt->expires;
expires -= jiffies;
}
@@ -4621,7 +4692,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
goto nla_put_failure;
- if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
+ if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->fib6_flags)))
goto nla_put_failure;
@@ -4646,7 +4717,7 @@ int rt6_dump_route(struct fib6_info *rt, void *p_arg)
/* user wants prefix routes only */
if (rtm->rtm_flags & RTM_F_PREFIX &&
- !(rt->rt6i_flags & RTF_PREFIX_RT)) {
+ !(rt->fib6_flags & RTF_PREFIX_RT)) {
/* success since this is not a prefix route */
return 1;
}
@@ -4663,6 +4734,7 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
struct net *net = sock_net(in_skb->sk);
struct nlattr *tb[RTA_MAX+1];
int err, iif = 0, oif = 0;
+ struct fib6_info *from;
struct dst_entry *dst;
struct rt6_info *rt;
struct sk_buff *skb;
@@ -4759,15 +4831,21 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
}
skb_dst_set(skb, &rt->dst);
+
+ rcu_read_lock();
+ from = rcu_dereference(rt->from);
+
if (fibmatch)
- err = rt6_fill_node(net, skb, rt->from, NULL, NULL, NULL, iif,
+ err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif,
RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
nlh->nlmsg_seq, 0);
else
- err = rt6_fill_node(net, skb, rt->from, dst,
- &fl6.daddr, &fl6.saddr, iif, RTM_NEWROUTE,
+ err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
+ &fl6.saddr, iif, RTM_NEWROUTE,
NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
0);
+ rcu_read_unlock();
+
if (err < 0) {
kfree_skb(skb);
goto errout;
@@ -4820,7 +4898,6 @@ static int ip6_route_dev_notify(struct notifier_block *this,
if (event == NETDEV_REGISTER) {
net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev;
- net->ipv6.fib6_null_entry->rt6i_idev = in6_dev_get(dev);
net->ipv6.ip6_null_entry->dst.dev = dev;
net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
@@ -4834,7 +4911,6 @@ static int ip6_route_dev_notify(struct notifier_block *this,
/* NETDEV_UNREGISTER could be fired for multiple times by
* netdev_wait_allrefs(). Make sure we only call this once.
*/
- in6_dev_put_clear(&net->ipv6.fib6_null_entry->rt6i_idev);
in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
@@ -5157,7 +5233,6 @@ void __init ip6_route_init_special_entries(void)
* the loopback reference in rt6_info will not be taken, do it
* manually for init_net */
init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev;
- init_net.ipv6.fib6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c
index f343e6f0fc95..eab39bd91548 100644
--- a/net/ipv6/seg6_iptunnel.c
+++ b/net/ipv6/seg6_iptunnel.c
@@ -91,6 +91,24 @@ static void set_tun_src(struct net *net, struct net_device *dev,
rcu_read_unlock();
}
+/* Compute flowlabel for outer IPv6 header */
+static __be32 seg6_make_flowlabel(struct net *net, struct sk_buff *skb,
+ struct ipv6hdr *inner_hdr)
+{
+ int do_flowlabel = net->ipv6.sysctl.seg6_flowlabel;
+ __be32 flowlabel = 0;
+ u32 hash;
+
+ if (do_flowlabel > 0) {
+ hash = skb_get_hash(skb);
+ rol32(hash, 16);
+ flowlabel = (__force __be32)hash & IPV6_FLOWLABEL_MASK;
+ } else if (!do_flowlabel && skb->protocol == htons(ETH_P_IPV6)) {
+ flowlabel = ip6_flowlabel(inner_hdr);
+ }
+ return flowlabel;
+}
+
/* encapsulate an IPv6 packet within an outer IPv6 header with a given SRH */
int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto)
{
@@ -99,6 +117,7 @@ int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto)
struct ipv6hdr *hdr, *inner_hdr;
struct ipv6_sr_hdr *isrh;
int hdrlen, tot_len, err;
+ __be32 flowlabel;
hdrlen = (osrh->hdrlen + 1) << 3;
tot_len = hdrlen + sizeof(*hdr);
@@ -108,6 +127,7 @@ int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto)
return err;
inner_hdr = ipv6_hdr(skb);
+ flowlabel = seg6_make_flowlabel(net, skb, inner_hdr);
skb_push(skb, tot_len);
skb_reset_network_header(skb);
@@ -121,10 +141,10 @@ int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto)
if (skb->protocol == htons(ETH_P_IPV6)) {
ip6_flow_hdr(hdr, ip6_tclass(ip6_flowinfo(inner_hdr)),
- ip6_flowlabel(inner_hdr));
+ flowlabel);
hdr->hop_limit = inner_hdr->hop_limit;
} else {
- ip6_flow_hdr(hdr, 0, 0);
+ ip6_flow_hdr(hdr, 0, flowlabel);
hdr->hop_limit = ip6_dst_hoplimit(skb_dst(skb));
}
@@ -136,7 +156,7 @@ int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto)
isrh->nexthdr = proto;
hdr->daddr = isrh->segments[isrh->first_segment];
- set_tun_src(net, ip6_dst_idev(dst)->dev, &hdr->daddr, &hdr->saddr);
+ set_tun_src(net, dst->dev, &hdr->daddr, &hdr->saddr);
#ifdef CONFIG_IPV6_SEG6_HMAC
if (sr_has_hmac(isrh)) {
diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c
index 6fbdef630152..e15cd37024fd 100644
--- a/net/ipv6/sysctl_net_ipv6.c
+++ b/net/ipv6/sysctl_net_ipv6.c
@@ -152,6 +152,13 @@ static struct ctl_table ipv6_table_template[] = {
.extra1 = &zero,
.extra2 = &one,
},
+ {
+ .procname = "seg6_flowlabel",
+ .data = &init_net.ipv6.sysctl.seg6_flowlabel,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
{ }
};
@@ -217,6 +224,7 @@ static int __net_init ipv6_sysctl_net_init(struct net *net)
ipv6_table[12].data = &net->ipv6.sysctl.max_dst_opts_len;
ipv6_table[13].data = &net->ipv6.sysctl.max_hbh_opts_len;
ipv6_table[14].data = &net->ipv6.sysctl.multipath_hash_policy,
+ ipv6_table[15].data = &net->ipv6.sysctl.seg6_flowlabel;
ipv6_route_table = ipv6_route_sysctl_init(net);
if (!ipv6_route_table)
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 4ec76a87aeb8..a34e28ac03a7 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1023,7 +1023,8 @@ static void udp6_hwcsum_outgoing(struct sock *sk, struct sk_buff *skb,
* Sending
*/
-static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6)
+static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6,
+ struct inet_cork *cork)
{
struct sock *sk = skb->sk;
struct udphdr *uh;
@@ -1042,12 +1043,31 @@ static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6)
uh->len = htons(len);
uh->check = 0;
+ if (cork->gso_size) {
+ const int hlen = skb_network_header_len(skb) +
+ sizeof(struct udphdr);
+
+ if (hlen + cork->gso_size > cork->fragsize)
+ return -EINVAL;
+ if (skb->len > cork->gso_size * UDP_MAX_SEGMENTS)
+ return -EINVAL;
+ if (udp_sk(sk)->no_check6_tx)
+ return -EINVAL;
+ if (skb->ip_summed != CHECKSUM_PARTIAL || is_udplite)
+ return -EIO;
+
+ skb_shinfo(skb)->gso_size = cork->gso_size;
+ skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4;
+ goto csum_partial;
+ }
+
if (is_udplite)
csum = udplite_csum(skb);
else if (udp_sk(sk)->no_check6_tx) { /* UDP csum disabled */
skb->ip_summed = CHECKSUM_NONE;
goto send;
} else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */
+csum_partial:
udp6_hwcsum_outgoing(sk, skb, &fl6->saddr, &fl6->daddr, len);
goto send;
} else
@@ -1093,7 +1113,7 @@ static int udp_v6_push_pending_frames(struct sock *sk)
if (!skb)
goto out;
- err = udp_v6_send_skb(skb, &fl6);
+ err = udp_v6_send_skb(skb, &fl6, &inet_sk(sk)->cork.base);
out:
up->len = 0;
@@ -1127,6 +1147,7 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
ipc6.hlimit = -1;
ipc6.tclass = -1;
ipc6.dontfrag = -1;
+ ipc6.gso_size = up->gso_size;
sockc.tsflags = sk->sk_tsflags;
/* destination address check */
@@ -1259,7 +1280,10 @@ do_udp_sendmsg:
opt->tot_len = sizeof(*opt);
ipc6.opt = opt;
- err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, &ipc6, &sockc);
+ err = udp_cmsg_send(sk, msg, &ipc6.gso_size);
+ if (err > 0)
+ err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6,
+ &ipc6, &sockc);
if (err < 0) {
fl6_sock_release(flowlabel);
return err;
@@ -1324,15 +1348,16 @@ back_from_confirm:
/* Lockless fast path for the non-corking case */
if (!corkreq) {
+ struct inet_cork_full cork;
struct sk_buff *skb;
skb = ip6_make_skb(sk, getfrag, msg, ulen,
sizeof(struct udphdr), &ipc6,
&fl6, (struct rt6_info *)dst,
- msg->msg_flags, &sockc);
+ msg->msg_flags, &cork, &sockc);
err = PTR_ERR(skb);
if (!IS_ERR_OR_NULL(skb))
- err = udp_v6_send_skb(skb, &fl6);
+ err = udp_v6_send_skb(skb, &fl6, &cork.base);
goto out;
}
diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c
index 2a04dc9c781b..f7b85b1e6b3e 100644
--- a/net/ipv6/udp_offload.c
+++ b/net/ipv6/udp_offload.c
@@ -17,6 +17,20 @@
#include <net/ip6_checksum.h>
#include "ip6_offload.h"
+static struct sk_buff *__udp6_gso_segment(struct sk_buff *gso_skb,
+ netdev_features_t features)
+{
+ const struct ipv6hdr *ip6h = ipv6_hdr(gso_skb);
+ unsigned int mss = skb_shinfo(gso_skb)->gso_size;
+
+ if (!can_checksum_protocol(features, htons(ETH_P_IPV6)))
+ return ERR_PTR(-EIO);
+
+ return __udp_gso_segment(gso_skb, features, mss,
+ udp_v6_check(sizeof(struct udphdr) + mss,
+ &ip6h->saddr, &ip6h->daddr, 0));
+}
+
static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb,
netdev_features_t features)
{
@@ -42,12 +56,15 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb,
const struct ipv6hdr *ipv6h;
struct udphdr *uh;
- if (!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP))
+ if (!(skb_shinfo(skb)->gso_type & (SKB_GSO_UDP | SKB_GSO_UDP_L4)))
goto out;
if (!pskb_may_pull(skb, sizeof(struct udphdr)))
goto out;
+ if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4)
+ return __udp6_gso_segment(skb, features);
+
/* Do software UFO. Complete and fill in the UDP checksum as HW cannot
* do checksum of UDP packets sent as multiple IP fragments.
*/
diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c
index 0fbd3ee26165..40261cb68e83 100644
--- a/net/l2tp/l2tp_core.c
+++ b/net/l2tp/l2tp_core.c
@@ -183,6 +183,26 @@ struct l2tp_tunnel *l2tp_tunnel_get(const struct net *net, u32 tunnel_id)
}
EXPORT_SYMBOL_GPL(l2tp_tunnel_get);
+struct l2tp_tunnel *l2tp_tunnel_get_nth(const struct net *net, int nth)
+{
+ const struct l2tp_net *pn = l2tp_pernet(net);
+ struct l2tp_tunnel *tunnel;
+ int count = 0;
+
+ rcu_read_lock_bh();
+ list_for_each_entry_rcu(tunnel, &pn->l2tp_tunnel_list, list) {
+ if (++count > nth) {
+ l2tp_tunnel_inc_refcount(tunnel);
+ rcu_read_unlock_bh();
+ return tunnel;
+ }
+ }
+ rcu_read_unlock_bh();
+
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(l2tp_tunnel_get_nth);
+
/* Lookup a session. A new reference is held on the returned session. */
struct l2tp_session *l2tp_session_get(const struct net *net,
struct l2tp_tunnel *tunnel,
@@ -335,26 +355,6 @@ err_tlock:
}
EXPORT_SYMBOL_GPL(l2tp_session_register);
-struct l2tp_tunnel *l2tp_tunnel_find_nth(const struct net *net, int nth)
-{
- struct l2tp_net *pn = l2tp_pernet(net);
- struct l2tp_tunnel *tunnel;
- int count = 0;
-
- rcu_read_lock_bh();
- list_for_each_entry_rcu(tunnel, &pn->l2tp_tunnel_list, list) {
- if (++count > nth) {
- rcu_read_unlock_bh();
- return tunnel;
- }
- }
-
- rcu_read_unlock_bh();
-
- return NULL;
-}
-EXPORT_SYMBOL_GPL(l2tp_tunnel_find_nth);
-
/*****************************************************************************
* Receive data handling
*****************************************************************************/
diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h
index ba33cbec71eb..c199020f8a8a 100644
--- a/net/l2tp/l2tp_core.h
+++ b/net/l2tp/l2tp_core.h
@@ -212,6 +212,8 @@ static inline void *l2tp_session_priv(struct l2tp_session *session)
}
struct l2tp_tunnel *l2tp_tunnel_get(const struct net *net, u32 tunnel_id);
+struct l2tp_tunnel *l2tp_tunnel_get_nth(const struct net *net, int nth);
+
void l2tp_tunnel_free(struct l2tp_tunnel *tunnel);
struct l2tp_session *l2tp_session_get(const struct net *net,
@@ -220,7 +222,6 @@ struct l2tp_session *l2tp_session_get(const struct net *net,
struct l2tp_session *l2tp_session_get_nth(struct l2tp_tunnel *tunnel, int nth);
struct l2tp_session *l2tp_session_get_by_ifname(const struct net *net,
const char *ifname);
-struct l2tp_tunnel *l2tp_tunnel_find_nth(const struct net *net, int nth);
int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id,
u32 peer_tunnel_id, struct l2tp_tunnel_cfg *cfg,
diff --git a/net/l2tp/l2tp_debugfs.c b/net/l2tp/l2tp_debugfs.c
index 72e713da4733..e87686f7d63c 100644
--- a/net/l2tp/l2tp_debugfs.c
+++ b/net/l2tp/l2tp_debugfs.c
@@ -47,12 +47,20 @@ struct l2tp_dfs_seq_data {
static void l2tp_dfs_next_tunnel(struct l2tp_dfs_seq_data *pd)
{
- pd->tunnel = l2tp_tunnel_find_nth(pd->net, pd->tunnel_idx);
+ /* Drop reference taken during previous invocation */
+ if (pd->tunnel)
+ l2tp_tunnel_dec_refcount(pd->tunnel);
+
+ pd->tunnel = l2tp_tunnel_get_nth(pd->net, pd->tunnel_idx);
pd->tunnel_idx++;
}
static void l2tp_dfs_next_session(struct l2tp_dfs_seq_data *pd)
{
+ /* Drop reference taken during previous invocation */
+ if (pd->session)
+ l2tp_session_dec_refcount(pd->session);
+
pd->session = l2tp_session_get_nth(pd->tunnel, pd->session_idx);
pd->session_idx++;
@@ -96,7 +104,22 @@ static void *l2tp_dfs_seq_next(struct seq_file *m, void *v, loff_t *pos)
static void l2tp_dfs_seq_stop(struct seq_file *p, void *v)
{
- /* nothing to do */
+ struct l2tp_dfs_seq_data *pd = v;
+
+ if (!pd || pd == SEQ_START_TOKEN)
+ return;
+
+ /* Drop reference taken by last invocation of l2tp_dfs_next_session()
+ * or l2tp_dfs_next_tunnel().
+ */
+ if (pd->session) {
+ l2tp_session_dec_refcount(pd->session);
+ pd->session = NULL;
+ }
+ if (pd->tunnel) {
+ l2tp_tunnel_dec_refcount(pd->tunnel);
+ pd->tunnel = NULL;
+ }
}
static void l2tp_dfs_seq_tunnel_show(struct seq_file *m, void *v)
@@ -236,13 +259,10 @@ static int l2tp_dfs_seq_show(struct seq_file *m, void *v)
goto out;
}
- /* Show the tunnel or session context */
- if (!pd->session) {
+ if (!pd->session)
l2tp_dfs_seq_tunnel_show(m, pd->tunnel);
- } else {
+ else
l2tp_dfs_seq_session_show(m, pd->session);
- l2tp_session_dec_refcount(pd->session);
- }
out:
return 0;
diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c
index b05dbd9ffcb2..6616c9fd292f 100644
--- a/net/l2tp/l2tp_netlink.c
+++ b/net/l2tp/l2tp_netlink.c
@@ -487,14 +487,17 @@ static int l2tp_nl_cmd_tunnel_dump(struct sk_buff *skb, struct netlink_callback
struct net *net = sock_net(skb->sk);
for (;;) {
- tunnel = l2tp_tunnel_find_nth(net, ti);
+ tunnel = l2tp_tunnel_get_nth(net, ti);
if (tunnel == NULL)
goto out;
if (l2tp_nl_tunnel_send(skb, NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, NLM_F_MULTI,
- tunnel, L2TP_CMD_TUNNEL_GET) < 0)
+ tunnel, L2TP_CMD_TUNNEL_GET) < 0) {
+ l2tp_tunnel_dec_refcount(tunnel);
goto out;
+ }
+ l2tp_tunnel_dec_refcount(tunnel);
ti++;
}
@@ -848,7 +851,7 @@ static int l2tp_nl_cmd_session_dump(struct sk_buff *skb, struct netlink_callback
for (;;) {
if (tunnel == NULL) {
- tunnel = l2tp_tunnel_find_nth(net, ti);
+ tunnel = l2tp_tunnel_get_nth(net, ti);
if (tunnel == NULL)
goto out;
}
@@ -856,6 +859,7 @@ static int l2tp_nl_cmd_session_dump(struct sk_buff *skb, struct netlink_callback
session = l2tp_session_get_nth(tunnel, si);
if (session == NULL) {
ti++;
+ l2tp_tunnel_dec_refcount(tunnel);
tunnel = NULL;
si = 0;
continue;
@@ -865,6 +869,7 @@ static int l2tp_nl_cmd_session_dump(struct sk_buff *skb, struct netlink_callback
cb->nlh->nlmsg_seq, NLM_F_MULTI,
session, L2TP_CMD_SESSION_GET) < 0) {
l2tp_session_dec_refcount(session);
+ l2tp_tunnel_dec_refcount(tunnel);
break;
}
l2tp_session_dec_refcount(session);
diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c
index 896bbca9bdaa..f951c768dcf2 100644
--- a/net/l2tp/l2tp_ppp.c
+++ b/net/l2tp/l2tp_ppp.c
@@ -619,6 +619,13 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr,
lock_sock(sk);
error = -EINVAL;
+
+ if (sockaddr_len != sizeof(struct sockaddr_pppol2tp) &&
+ sockaddr_len != sizeof(struct sockaddr_pppol2tpv3) &&
+ sockaddr_len != sizeof(struct sockaddr_pppol2tpin6) &&
+ sockaddr_len != sizeof(struct sockaddr_pppol2tpv3in6))
+ goto end;
+
if (sp->sa_protocol != PX_PROTO_OL2TP)
goto end;
@@ -1551,21 +1558,28 @@ struct pppol2tp_seq_data {
static void pppol2tp_next_tunnel(struct net *net, struct pppol2tp_seq_data *pd)
{
+ /* Drop reference taken during previous invocation */
+ if (pd->tunnel)
+ l2tp_tunnel_dec_refcount(pd->tunnel);
+
for (;;) {
- pd->tunnel = l2tp_tunnel_find_nth(net, pd->tunnel_idx);
+ pd->tunnel = l2tp_tunnel_get_nth(net, pd->tunnel_idx);
pd->tunnel_idx++;
- if (pd->tunnel == NULL)
- break;
+ /* Only accept L2TPv2 tunnels */
+ if (!pd->tunnel || pd->tunnel->version == 2)
+ return;
- /* Ignore L2TPv3 tunnels */
- if (pd->tunnel->version < 3)
- break;
+ l2tp_tunnel_dec_refcount(pd->tunnel);
}
}
static void pppol2tp_next_session(struct net *net, struct pppol2tp_seq_data *pd)
{
+ /* Drop reference taken during previous invocation */
+ if (pd->session)
+ l2tp_session_dec_refcount(pd->session);
+
pd->session = l2tp_session_get_nth(pd->tunnel, pd->session_idx);
pd->session_idx++;
@@ -1609,7 +1623,22 @@ static void *pppol2tp_seq_next(struct seq_file *m, void *v, loff_t *pos)
static void pppol2tp_seq_stop(struct seq_file *p, void *v)
{
- /* nothing to do */
+ struct pppol2tp_seq_data *pd = v;
+
+ if (!pd || pd == SEQ_START_TOKEN)
+ return;
+
+ /* Drop reference taken by last invocation of pppol2tp_next_session()
+ * or pppol2tp_next_tunnel().
+ */
+ if (pd->session) {
+ l2tp_session_dec_refcount(pd->session);
+ pd->session = NULL;
+ }
+ if (pd->tunnel) {
+ l2tp_tunnel_dec_refcount(pd->tunnel);
+ pd->tunnel = NULL;
+ }
}
static void pppol2tp_seq_tunnel_show(struct seq_file *m, void *v)
@@ -1703,14 +1732,10 @@ static int pppol2tp_seq_show(struct seq_file *m, void *v)
goto out;
}
- /* Show the tunnel or session context.
- */
- if (!pd->session) {
+ if (!pd->session)
pppol2tp_seq_tunnel_show(m, pd->tunnel);
- } else {
+ else
pppol2tp_seq_session_show(m, pd->session);
- l2tp_session_dec_refcount(pd->session);
- }
out:
return 0;
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index 01dcc0823d1f..cb80ebb38311 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -199,9 +199,19 @@ static int llc_ui_release(struct socket *sock)
llc->laddr.lsap, llc->daddr.lsap);
if (!llc_send_disc(sk))
llc_ui_wait_for_disc(sk, sk->sk_rcvtimeo);
- if (!sock_flag(sk, SOCK_ZAPPED))
+ if (!sock_flag(sk, SOCK_ZAPPED)) {
+ struct llc_sap *sap = llc->sap;
+
+ /* Hold this for release_sock(), so that llc_backlog_rcv()
+ * could still use it.
+ */
+ llc_sap_hold(sap);
llc_sap_remove_socket(llc->sap, sk);
- release_sock(sk);
+ release_sock(sk);
+ llc_sap_put(sap);
+ } else {
+ release_sock(sk);
+ }
if (llc->dev)
dev_put(llc->dev);
sock_put(sk);
diff --git a/net/llc/llc_c_ac.c b/net/llc/llc_c_ac.c
index 163121192aca..4d78375f9872 100644
--- a/net/llc/llc_c_ac.c
+++ b/net/llc/llc_c_ac.c
@@ -1099,14 +1099,7 @@ int llc_conn_ac_inc_tx_win_size(struct sock *sk, struct sk_buff *skb)
int llc_conn_ac_stop_all_timers(struct sock *sk, struct sk_buff *skb)
{
- struct llc_sock *llc = llc_sk(sk);
-
- del_timer(&llc->pf_cycle_timer.timer);
- del_timer(&llc->ack_timer.timer);
- del_timer(&llc->rej_sent_timer.timer);
- del_timer(&llc->busy_state_timer.timer);
- llc->ack_must_be_send = 0;
- llc->ack_pf = 0;
+ llc_sk_stop_all_timers(sk, false);
return 0;
}
diff --git a/net/llc/llc_conn.c b/net/llc/llc_conn.c
index 110e32bcb399..c0ac522b48a1 100644
--- a/net/llc/llc_conn.c
+++ b/net/llc/llc_conn.c
@@ -961,6 +961,26 @@ out:
return sk;
}
+void llc_sk_stop_all_timers(struct sock *sk, bool sync)
+{
+ struct llc_sock *llc = llc_sk(sk);
+
+ if (sync) {
+ del_timer_sync(&llc->pf_cycle_timer.timer);
+ del_timer_sync(&llc->ack_timer.timer);
+ del_timer_sync(&llc->rej_sent_timer.timer);
+ del_timer_sync(&llc->busy_state_timer.timer);
+ } else {
+ del_timer(&llc->pf_cycle_timer.timer);
+ del_timer(&llc->ack_timer.timer);
+ del_timer(&llc->rej_sent_timer.timer);
+ del_timer(&llc->busy_state_timer.timer);
+ }
+
+ llc->ack_must_be_send = 0;
+ llc->ack_pf = 0;
+}
+
/**
* llc_sk_free - Frees a LLC socket
* @sk - socket to free
@@ -973,7 +993,7 @@ void llc_sk_free(struct sock *sk)
llc->state = LLC_CONN_OUT_OF_SVC;
/* Stop all (possibly) running timers */
- llc_conn_ac_stop_all_timers(sk, NULL);
+ llc_sk_stop_all_timers(sk, true);
#ifdef DEBUG_LLC_CONN_ALLOC
printk(KERN_INFO "%s: unackq=%d, txq=%d\n", __func__,
skb_queue_len(&llc->pdu_unack_q),
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index e25cdb3d51cb..e57c9d479503 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -579,6 +579,7 @@ config NFT_QUOTA
config NFT_REJECT
default m if NETFILTER_ADVANCED=n
tristate "Netfilter nf_tables reject support"
+ depends on !NF_TABLES_INET || (IPV6!=m || m)
help
This option adds the "reject" expression that you can use to
explicitly deny and notify via TCP reset/ICMP informational errors
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index b91bb70ece92..d4f68d0f7df7 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -2388,11 +2388,7 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
strlcpy(cfg.mcast_ifn, dm->mcast_ifn,
sizeof(cfg.mcast_ifn));
cfg.syncid = dm->syncid;
- rtnl_lock();
- mutex_lock(&ipvs->sync_mutex);
ret = start_sync_thread(ipvs, &cfg, dm->state);
- mutex_unlock(&ipvs->sync_mutex);
- rtnl_unlock();
} else {
mutex_lock(&ipvs->sync_mutex);
ret = stop_sync_thread(ipvs, dm->state);
@@ -3485,12 +3481,8 @@ static int ip_vs_genl_new_daemon(struct netns_ipvs *ipvs, struct nlattr **attrs)
if (ipvs->mixed_address_family_dests > 0)
return -EINVAL;
- rtnl_lock();
- mutex_lock(&ipvs->sync_mutex);
ret = start_sync_thread(ipvs, &c,
nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]));
- mutex_unlock(&ipvs->sync_mutex);
- rtnl_unlock();
return ret;
}
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index fbaf3bd05b2e..001501e25625 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -49,6 +49,7 @@
#include <linux/kthread.h>
#include <linux/wait.h>
#include <linux/kernel.h>
+#include <linux/sched/signal.h>
#include <asm/unaligned.h> /* Used for ntoh_seq and hton_seq */
@@ -1360,15 +1361,9 @@ static void set_mcast_pmtudisc(struct sock *sk, int val)
/*
* Specifiy default interface for outgoing multicasts
*/
-static int set_mcast_if(struct sock *sk, char *ifname)
+static int set_mcast_if(struct sock *sk, struct net_device *dev)
{
- struct net_device *dev;
struct inet_sock *inet = inet_sk(sk);
- struct net *net = sock_net(sk);
-
- dev = __dev_get_by_name(net, ifname);
- if (!dev)
- return -ENODEV;
if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
return -EINVAL;
@@ -1396,19 +1391,14 @@ static int set_mcast_if(struct sock *sk, char *ifname)
* in the in_addr structure passed in as a parameter.
*/
static int
-join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname)
+join_mcast_group(struct sock *sk, struct in_addr *addr, struct net_device *dev)
{
- struct net *net = sock_net(sk);
struct ip_mreqn mreq;
- struct net_device *dev;
int ret;
memset(&mreq, 0, sizeof(mreq));
memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr));
- dev = __dev_get_by_name(net, ifname);
- if (!dev)
- return -ENODEV;
if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
return -EINVAL;
@@ -1423,15 +1413,10 @@ join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname)
#ifdef CONFIG_IP_VS_IPV6
static int join_mcast_group6(struct sock *sk, struct in6_addr *addr,
- char *ifname)
+ struct net_device *dev)
{
- struct net *net = sock_net(sk);
- struct net_device *dev;
int ret;
- dev = __dev_get_by_name(net, ifname);
- if (!dev)
- return -ENODEV;
if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
return -EINVAL;
@@ -1443,24 +1428,18 @@ static int join_mcast_group6(struct sock *sk, struct in6_addr *addr,
}
#endif
-static int bind_mcastif_addr(struct socket *sock, char *ifname)
+static int bind_mcastif_addr(struct socket *sock, struct net_device *dev)
{
- struct net *net = sock_net(sock->sk);
- struct net_device *dev;
__be32 addr;
struct sockaddr_in sin;
- dev = __dev_get_by_name(net, ifname);
- if (!dev)
- return -ENODEV;
-
addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
if (!addr)
pr_err("You probably need to specify IP address on "
"multicast interface.\n");
IP_VS_DBG(7, "binding socket with (%s) %pI4\n",
- ifname, &addr);
+ dev->name, &addr);
/* Now bind the socket with the address of multicast interface */
sin.sin_family = AF_INET;
@@ -1493,7 +1472,8 @@ static void get_mcast_sockaddr(union ipvs_sockaddr *sa, int *salen,
/*
* Set up sending multicast socket over UDP
*/
-static struct socket *make_send_sock(struct netns_ipvs *ipvs, int id)
+static int make_send_sock(struct netns_ipvs *ipvs, int id,
+ struct net_device *dev, struct socket **sock_ret)
{
/* multicast addr */
union ipvs_sockaddr mcast_addr;
@@ -1505,9 +1485,10 @@ static struct socket *make_send_sock(struct netns_ipvs *ipvs, int id)
IPPROTO_UDP, &sock);
if (result < 0) {
pr_err("Error during creation of socket; terminating\n");
- return ERR_PTR(result);
+ goto error;
}
- result = set_mcast_if(sock->sk, ipvs->mcfg.mcast_ifn);
+ *sock_ret = sock;
+ result = set_mcast_if(sock->sk, dev);
if (result < 0) {
pr_err("Error setting outbound mcast interface\n");
goto error;
@@ -1522,7 +1503,7 @@ static struct socket *make_send_sock(struct netns_ipvs *ipvs, int id)
set_sock_size(sock->sk, 1, result);
if (AF_INET == ipvs->mcfg.mcast_af)
- result = bind_mcastif_addr(sock, ipvs->mcfg.mcast_ifn);
+ result = bind_mcastif_addr(sock, dev);
else
result = 0;
if (result < 0) {
@@ -1538,19 +1519,18 @@ static struct socket *make_send_sock(struct netns_ipvs *ipvs, int id)
goto error;
}
- return sock;
+ return 0;
error:
- sock_release(sock);
- return ERR_PTR(result);
+ return result;
}
/*
* Set up receiving multicast socket over UDP
*/
-static struct socket *make_receive_sock(struct netns_ipvs *ipvs, int id,
- int ifindex)
+static int make_receive_sock(struct netns_ipvs *ipvs, int id,
+ struct net_device *dev, struct socket **sock_ret)
{
/* multicast addr */
union ipvs_sockaddr mcast_addr;
@@ -1562,8 +1542,9 @@ static struct socket *make_receive_sock(struct netns_ipvs *ipvs, int id,
IPPROTO_UDP, &sock);
if (result < 0) {
pr_err("Error during creation of socket; terminating\n");
- return ERR_PTR(result);
+ goto error;
}
+ *sock_ret = sock;
/* it is equivalent to the REUSEADDR option in user-space */
sock->sk->sk_reuse = SK_CAN_REUSE;
result = sysctl_sync_sock_size(ipvs);
@@ -1571,7 +1552,7 @@ static struct socket *make_receive_sock(struct netns_ipvs *ipvs, int id,
set_sock_size(sock->sk, 0, result);
get_mcast_sockaddr(&mcast_addr, &salen, &ipvs->bcfg, id);
- sock->sk->sk_bound_dev_if = ifindex;
+ sock->sk->sk_bound_dev_if = dev->ifindex;
result = sock->ops->bind(sock, (struct sockaddr *)&mcast_addr, salen);
if (result < 0) {
pr_err("Error binding to the multicast addr\n");
@@ -1582,21 +1563,20 @@ static struct socket *make_receive_sock(struct netns_ipvs *ipvs, int id,
#ifdef CONFIG_IP_VS_IPV6
if (ipvs->bcfg.mcast_af == AF_INET6)
result = join_mcast_group6(sock->sk, &mcast_addr.in6.sin6_addr,
- ipvs->bcfg.mcast_ifn);
+ dev);
else
#endif
result = join_mcast_group(sock->sk, &mcast_addr.in.sin_addr,
- ipvs->bcfg.mcast_ifn);
+ dev);
if (result < 0) {
pr_err("Error joining to the multicast group\n");
goto error;
}
- return sock;
+ return 0;
error:
- sock_release(sock);
- return ERR_PTR(result);
+ return result;
}
@@ -1778,13 +1758,12 @@ static int sync_thread_backup(void *data)
int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c,
int state)
{
- struct ip_vs_sync_thread_data *tinfo;
+ struct ip_vs_sync_thread_data *tinfo = NULL;
struct task_struct **array = NULL, *task;
- struct socket *sock;
struct net_device *dev;
char *name;
int (*threadfn)(void *data);
- int id, count, hlen;
+ int id = 0, count, hlen;
int result = -ENOMEM;
u16 mtu, min_mtu;
@@ -1792,6 +1771,18 @@ int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c,
IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %zd bytes\n",
sizeof(struct ip_vs_sync_conn_v0));
+ /* Do not hold one mutex and then to block on another */
+ for (;;) {
+ rtnl_lock();
+ if (mutex_trylock(&ipvs->sync_mutex))
+ break;
+ rtnl_unlock();
+ mutex_lock(&ipvs->sync_mutex);
+ if (rtnl_trylock())
+ break;
+ mutex_unlock(&ipvs->sync_mutex);
+ }
+
if (!ipvs->sync_state) {
count = clamp(sysctl_sync_ports(ipvs), 1, IPVS_SYNC_PORTS_MAX);
ipvs->threads_mask = count - 1;
@@ -1810,7 +1801,8 @@ int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c,
dev = __dev_get_by_name(ipvs->net, c->mcast_ifn);
if (!dev) {
pr_err("Unknown mcast interface: %s\n", c->mcast_ifn);
- return -ENODEV;
+ result = -ENODEV;
+ goto out_early;
}
hlen = (AF_INET6 == c->mcast_af) ?
sizeof(struct ipv6hdr) + sizeof(struct udphdr) :
@@ -1827,26 +1819,30 @@ int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c,
c->sync_maxlen = mtu - hlen;
if (state == IP_VS_STATE_MASTER) {
+ result = -EEXIST;
if (ipvs->ms)
- return -EEXIST;
+ goto out_early;
ipvs->mcfg = *c;
name = "ipvs-m:%d:%d";
threadfn = sync_thread_master;
} else if (state == IP_VS_STATE_BACKUP) {
+ result = -EEXIST;
if (ipvs->backup_threads)
- return -EEXIST;
+ goto out_early;
ipvs->bcfg = *c;
name = "ipvs-b:%d:%d";
threadfn = sync_thread_backup;
} else {
- return -EINVAL;
+ result = -EINVAL;
+ goto out_early;
}
if (state == IP_VS_STATE_MASTER) {
struct ipvs_master_sync_state *ms;
+ result = -ENOMEM;
ipvs->ms = kcalloc(count, sizeof(ipvs->ms[0]), GFP_KERNEL);
if (!ipvs->ms)
goto out;
@@ -1862,39 +1858,38 @@ int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c,
} else {
array = kcalloc(count, sizeof(struct task_struct *),
GFP_KERNEL);
+ result = -ENOMEM;
if (!array)
goto out;
}
- tinfo = NULL;
for (id = 0; id < count; id++) {
- if (state == IP_VS_STATE_MASTER)
- sock = make_send_sock(ipvs, id);
- else
- sock = make_receive_sock(ipvs, id, dev->ifindex);
- if (IS_ERR(sock)) {
- result = PTR_ERR(sock);
- goto outtinfo;
- }
+ result = -ENOMEM;
tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL);
if (!tinfo)
- goto outsocket;
+ goto out;
tinfo->ipvs = ipvs;
- tinfo->sock = sock;
+ tinfo->sock = NULL;
if (state == IP_VS_STATE_BACKUP) {
tinfo->buf = kmalloc(ipvs->bcfg.sync_maxlen,
GFP_KERNEL);
if (!tinfo->buf)
- goto outtinfo;
+ goto out;
} else {
tinfo->buf = NULL;
}
tinfo->id = id;
+ if (state == IP_VS_STATE_MASTER)
+ result = make_send_sock(ipvs, id, dev, &tinfo->sock);
+ else
+ result = make_receive_sock(ipvs, id, dev, &tinfo->sock);
+ if (result < 0)
+ goto out;
task = kthread_run(threadfn, tinfo, name, ipvs->gen, id);
if (IS_ERR(task)) {
result = PTR_ERR(task);
- goto outtinfo;
+ goto out;
}
tinfo = NULL;
if (state == IP_VS_STATE_MASTER)
@@ -1911,20 +1906,20 @@ int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c,
ipvs->sync_state |= state;
spin_unlock_bh(&ipvs->sync_buff_lock);
+ mutex_unlock(&ipvs->sync_mutex);
+ rtnl_unlock();
+
/* increase the module use count */
ip_vs_use_count_inc();
return 0;
-outsocket:
- sock_release(sock);
-
-outtinfo:
- if (tinfo) {
- sock_release(tinfo->sock);
- kfree(tinfo->buf);
- kfree(tinfo);
- }
+out:
+ /* We do not need RTNL lock anymore, release it here so that
+ * sock_release below and in the kthreads can use rtnl_lock
+ * to leave the mcast group.
+ */
+ rtnl_unlock();
count = id;
while (count-- > 0) {
if (state == IP_VS_STATE_MASTER)
@@ -1932,13 +1927,23 @@ outtinfo:
else
kthread_stop(array[count]);
}
- kfree(array);
-
-out:
if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) {
kfree(ipvs->ms);
ipvs->ms = NULL;
}
+ mutex_unlock(&ipvs->sync_mutex);
+ if (tinfo) {
+ if (tinfo->sock)
+ sock_release(tinfo->sock);
+ kfree(tinfo->buf);
+ kfree(tinfo);
+ }
+ kfree(array);
+ return result;
+
+out_early:
+ mutex_unlock(&ipvs->sync_mutex);
+ rtnl_unlock();
return result;
}
diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c
index 8ef21d9f9a00..4b2b3d53acfc 100644
--- a/net/netfilter/nf_conntrack_expect.c
+++ b/net/netfilter/nf_conntrack_expect.c
@@ -252,7 +252,7 @@ static inline int expect_clash(const struct nf_conntrack_expect *a,
static inline int expect_matches(const struct nf_conntrack_expect *a,
const struct nf_conntrack_expect *b)
{
- return a->master == b->master && a->class == b->class &&
+ return a->master == b->master &&
nf_ct_tuple_equal(&a->tuple, &b->tuple) &&
nf_ct_tuple_mask_equal(&a->mask, &b->mask) &&
net_eq(nf_ct_net(a->master), nf_ct_net(b->master)) &&
@@ -421,6 +421,9 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect)
h = nf_ct_expect_dst_hash(net, &expect->tuple);
hlist_for_each_entry_safe(i, next, &nf_ct_expect_hash[h], hnode) {
if (expect_matches(i, expect)) {
+ if (i->class != expect->class)
+ return -EALREADY;
+
if (nf_ct_remove_expect(i))
break;
} else if (expect_clash(i, expect)) {
diff --git a/net/netfilter/nf_conntrack_extend.c b/net/netfilter/nf_conntrack_extend.c
index 9fe0ddc333fb..277bbfe26478 100644
--- a/net/netfilter/nf_conntrack_extend.c
+++ b/net/netfilter/nf_conntrack_extend.c
@@ -9,6 +9,7 @@
* 2 of the License, or (at your option) any later version.
*/
#include <linux/kernel.h>
+#include <linux/kmemleak.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/rcupdate.h>
@@ -71,6 +72,7 @@ void *nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp)
rcu_read_unlock();
alloc = max(newlen, NF_CT_EXT_PREALLOC);
+ kmemleak_not_leak(old);
new = __krealloc(old, alloc, gfp);
if (!new)
return NULL;
diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c
index 148ce1a52cc7..c8d2b6688a2a 100644
--- a/net/netfilter/nf_conntrack_sip.c
+++ b/net/netfilter/nf_conntrack_sip.c
@@ -938,11 +938,19 @@ static int set_expected_rtp_rtcp(struct sk_buff *skb, unsigned int protoff,
datalen, rtp_exp, rtcp_exp,
mediaoff, medialen, daddr);
else {
- if (nf_ct_expect_related(rtp_exp) == 0) {
- if (nf_ct_expect_related(rtcp_exp) != 0)
- nf_ct_unexpect_related(rtp_exp);
- else
+ /* -EALREADY handling works around end-points that send
+ * SDP messages with identical port but different media type,
+ * we pretend expectation was set up.
+ */
+ int errp = nf_ct_expect_related(rtp_exp);
+
+ if (errp == 0 || errp == -EALREADY) {
+ int errcp = nf_ct_expect_related(rtcp_exp);
+
+ if (errcp == 0 || errcp == -EALREADY)
ret = NF_ACCEPT;
+ else if (errp == 0)
+ nf_ct_unexpect_related(rtp_exp);
}
}
nf_ct_expect_put(rtcp_exp);
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index d57aeea89a79..18bd584fadda 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -2374,41 +2374,46 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
}
if (nlh->nlmsg_flags & NLM_F_REPLACE) {
- if (nft_is_active_next(net, old_rule)) {
- trans = nft_trans_rule_add(&ctx, NFT_MSG_DELRULE,
- old_rule);
- if (trans == NULL) {
- err = -ENOMEM;
- goto err2;
- }
- nft_deactivate_next(net, old_rule);
- chain->use--;
- list_add_tail_rcu(&rule->list, &old_rule->list);
- } else {
+ if (!nft_is_active_next(net, old_rule)) {
err = -ENOENT;
goto err2;
}
- } else if (nlh->nlmsg_flags & NLM_F_APPEND)
- if (old_rule)
- list_add_rcu(&rule->list, &old_rule->list);
- else
- list_add_tail_rcu(&rule->list, &chain->rules);
- else {
- if (old_rule)
- list_add_tail_rcu(&rule->list, &old_rule->list);
- else
- list_add_rcu(&rule->list, &chain->rules);
- }
+ trans = nft_trans_rule_add(&ctx, NFT_MSG_DELRULE,
+ old_rule);
+ if (trans == NULL) {
+ err = -ENOMEM;
+ goto err2;
+ }
+ nft_deactivate_next(net, old_rule);
+ chain->use--;
- if (nft_trans_rule_add(&ctx, NFT_MSG_NEWRULE, rule) == NULL) {
- err = -ENOMEM;
- goto err3;
+ if (nft_trans_rule_add(&ctx, NFT_MSG_NEWRULE, rule) == NULL) {
+ err = -ENOMEM;
+ goto err2;
+ }
+
+ list_add_tail_rcu(&rule->list, &old_rule->list);
+ } else {
+ if (nft_trans_rule_add(&ctx, NFT_MSG_NEWRULE, rule) == NULL) {
+ err = -ENOMEM;
+ goto err2;
+ }
+
+ if (nlh->nlmsg_flags & NLM_F_APPEND) {
+ if (old_rule)
+ list_add_rcu(&rule->list, &old_rule->list);
+ else
+ list_add_tail_rcu(&rule->list, &chain->rules);
+ } else {
+ if (old_rule)
+ list_add_tail_rcu(&rule->list, &old_rule->list);
+ else
+ list_add_rcu(&rule->list, &chain->rules);
+ }
}
chain->use++;
return 0;
-err3:
- list_del_rcu(&rule->list);
err2:
nf_tables_rule_destroy(&ctx, rule);
err1:
@@ -3248,18 +3253,20 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
err = ops->init(set, &desc, nla);
if (err < 0)
- goto err2;
+ goto err3;
err = nft_trans_set_add(&ctx, NFT_MSG_NEWSET, set);
if (err < 0)
- goto err3;
+ goto err4;
list_add_tail_rcu(&set->list, &table->sets);
table->use++;
return 0;
-err3:
+err4:
ops->destroy(set);
+err3:
+ kfree(set->name);
err2:
kvfree(set);
err1:
@@ -5798,7 +5805,7 @@ static void nft_chain_commit_update(struct nft_trans *trans)
struct nft_base_chain *basechain;
if (nft_trans_chain_name(trans))
- strcpy(trans->ctx.chain->name, nft_trans_chain_name(trans));
+ swap(trans->ctx.chain->name, nft_trans_chain_name(trans));
if (!nft_is_base_chain(trans->ctx.chain))
return;
diff --git a/net/netfilter/xt_connmark.c b/net/netfilter/xt_connmark.c
index 773da82190dc..94df000abb92 100644
--- a/net/netfilter/xt_connmark.c
+++ b/net/netfilter/xt_connmark.c
@@ -36,11 +36,10 @@ MODULE_ALIAS("ipt_connmark");
MODULE_ALIAS("ip6t_connmark");
static unsigned int
-connmark_tg_shift(struct sk_buff *skb,
- const struct xt_connmark_tginfo1 *info,
- u8 shift_bits, u8 shift_dir)
+connmark_tg_shift(struct sk_buff *skb, const struct xt_connmark_tginfo2 *info)
{
enum ip_conntrack_info ctinfo;
+ u_int32_t new_targetmark;
struct nf_conn *ct;
u_int32_t newmark;
@@ -51,34 +50,39 @@ connmark_tg_shift(struct sk_buff *skb,
switch (info->mode) {
case XT_CONNMARK_SET:
newmark = (ct->mark & ~info->ctmask) ^ info->ctmark;
- if (shift_dir == D_SHIFT_RIGHT)
- newmark >>= shift_bits;
+ if (info->shift_dir == D_SHIFT_RIGHT)
+ newmark >>= info->shift_bits;
else
- newmark <<= shift_bits;
+ newmark <<= info->shift_bits;
+
if (ct->mark != newmark) {
ct->mark = newmark;
nf_conntrack_event_cache(IPCT_MARK, ct);
}
break;
case XT_CONNMARK_SAVE:
- newmark = (ct->mark & ~info->ctmask) ^
- (skb->mark & info->nfmask);
- if (shift_dir == D_SHIFT_RIGHT)
- newmark >>= shift_bits;
+ new_targetmark = (skb->mark & info->nfmask);
+ if (info->shift_dir == D_SHIFT_RIGHT)
+ new_targetmark >>= info->shift_bits;
else
- newmark <<= shift_bits;
+ new_targetmark <<= info->shift_bits;
+
+ newmark = (ct->mark & ~info->ctmask) ^
+ new_targetmark;
if (ct->mark != newmark) {
ct->mark = newmark;
nf_conntrack_event_cache(IPCT_MARK, ct);
}
break;
case XT_CONNMARK_RESTORE:
- newmark = (skb->mark & ~info->nfmask) ^
- (ct->mark & info->ctmask);
- if (shift_dir == D_SHIFT_RIGHT)
- newmark >>= shift_bits;
+ new_targetmark = (ct->mark & info->ctmask);
+ if (info->shift_dir == D_SHIFT_RIGHT)
+ new_targetmark >>= info->shift_bits;
else
- newmark <<= shift_bits;
+ new_targetmark <<= info->shift_bits;
+
+ newmark = (skb->mark & ~info->nfmask) ^
+ new_targetmark;
skb->mark = newmark;
break;
}
@@ -89,8 +93,14 @@ static unsigned int
connmark_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct xt_connmark_tginfo1 *info = par->targinfo;
-
- return connmark_tg_shift(skb, info, 0, 0);
+ const struct xt_connmark_tginfo2 info2 = {
+ .ctmark = info->ctmark,
+ .ctmask = info->ctmask,
+ .nfmask = info->nfmask,
+ .mode = info->mode,
+ };
+
+ return connmark_tg_shift(skb, &info2);
}
static unsigned int
@@ -98,8 +108,7 @@ connmark_tg_v2(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct xt_connmark_tginfo2 *info = par->targinfo;
- return connmark_tg_shift(skb, (const struct xt_connmark_tginfo1 *)info,
- info->shift_bits, info->shift_dir);
+ return connmark_tg_shift(skb, info);
}
static int connmark_tg_check(const struct xt_tgchk_param *par)
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 616cb9c18f88..01f3515cada0 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -329,11 +329,11 @@ static void packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb)
skb_set_queue_mapping(skb, queue_index);
}
-/* register_prot_hook must be invoked with the po->bind_lock held,
+/* __register_prot_hook must be invoked through register_prot_hook
* or from a context in which asynchronous accesses to the packet
* socket is not possible (packet_create()).
*/
-static void register_prot_hook(struct sock *sk)
+static void __register_prot_hook(struct sock *sk)
{
struct packet_sock *po = pkt_sk(sk);
@@ -348,8 +348,13 @@ static void register_prot_hook(struct sock *sk)
}
}
-/* {,__}unregister_prot_hook() must be invoked with the po->bind_lock
- * held. If the sync parameter is true, we will temporarily drop
+static void register_prot_hook(struct sock *sk)
+{
+ lockdep_assert_held_once(&pkt_sk(sk)->bind_lock);
+ __register_prot_hook(sk);
+}
+
+/* If the sync parameter is true, we will temporarily drop
* the po->bind_lock and do a synchronize_net to make sure no
* asynchronous packet processing paths still refer to the elements
* of po->prot_hook. If the sync parameter is false, it is the
@@ -359,6 +364,8 @@ static void __unregister_prot_hook(struct sock *sk, bool sync)
{
struct packet_sock *po = pkt_sk(sk);
+ lockdep_assert_held_once(&po->bind_lock);
+
po->running = 0;
if (po->fanout)
@@ -3008,6 +3015,7 @@ static int packet_release(struct socket *sock)
packet_flush_mclist(sk);
+ lock_sock(sk);
if (po->rx_ring.pg_vec) {
memset(&req_u, 0, sizeof(req_u));
packet_set_ring(sk, &req_u, 1, 0);
@@ -3017,6 +3025,7 @@ static int packet_release(struct socket *sock)
memset(&req_u, 0, sizeof(req_u));
packet_set_ring(sk, &req_u, 1, 1);
}
+ release_sock(sk);
f = fanout_release(sk);
@@ -3250,7 +3259,7 @@ static int packet_create(struct net *net, struct socket *sock, int protocol,
if (proto) {
po->prot_hook.type = proto;
- register_prot_hook(sk);
+ __register_prot_hook(sk);
}
mutex_lock(&net->packet.sklist_lock);
@@ -3643,6 +3652,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
union tpacket_req_u req_u;
int len;
+ lock_sock(sk);
switch (po->tp_version) {
case TPACKET_V1:
case TPACKET_V2:
@@ -3653,12 +3663,17 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
len = sizeof(req_u.req3);
break;
}
- if (optlen < len)
- return -EINVAL;
- if (copy_from_user(&req_u.req, optval, len))
- return -EFAULT;
- return packet_set_ring(sk, &req_u, 0,
- optname == PACKET_TX_RING);
+ if (optlen < len) {
+ ret = -EINVAL;
+ } else {
+ if (copy_from_user(&req_u.req, optval, len))
+ ret = -EFAULT;
+ else
+ ret = packet_set_ring(sk, &req_u, 0,
+ optname == PACKET_TX_RING);
+ }
+ release_sock(sk);
+ return ret;
}
case PACKET_COPY_THRESH:
{
@@ -3724,12 +3739,18 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
if (optlen != sizeof(val))
return -EINVAL;
- if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
- return -EBUSY;
if (copy_from_user(&val, optval, sizeof(val)))
return -EFAULT;
- po->tp_loss = !!val;
- return 0;
+
+ lock_sock(sk);
+ if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
+ ret = -EBUSY;
+ } else {
+ po->tp_loss = !!val;
+ ret = 0;
+ }
+ release_sock(sk);
+ return ret;
}
case PACKET_AUXDATA:
{
@@ -3740,7 +3761,9 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
if (copy_from_user(&val, optval, sizeof(val)))
return -EFAULT;
+ lock_sock(sk);
po->auxdata = !!val;
+ release_sock(sk);
return 0;
}
case PACKET_ORIGDEV:
@@ -3752,7 +3775,9 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
if (copy_from_user(&val, optval, sizeof(val)))
return -EFAULT;
+ lock_sock(sk);
po->origdev = !!val;
+ release_sock(sk);
return 0;
}
case PACKET_VNET_HDR:
@@ -3761,15 +3786,20 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
if (sock->type != SOCK_RAW)
return -EINVAL;
- if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
- return -EBUSY;
if (optlen < sizeof(val))
return -EINVAL;
if (copy_from_user(&val, optval, sizeof(val)))
return -EFAULT;
- po->has_vnet_hdr = !!val;
- return 0;
+ lock_sock(sk);
+ if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
+ ret = -EBUSY;
+ } else {
+ po->has_vnet_hdr = !!val;
+ ret = 0;
+ }
+ release_sock(sk);
+ return ret;
}
case PACKET_TIMESTAMP:
{
@@ -3807,11 +3837,17 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
if (optlen != sizeof(val))
return -EINVAL;
- if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
- return -EBUSY;
if (copy_from_user(&val, optval, sizeof(val)))
return -EFAULT;
- po->tp_tx_has_off = !!val;
+
+ lock_sock(sk);
+ if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
+ ret = -EBUSY;
+ } else {
+ po->tp_tx_has_off = !!val;
+ ret = 0;
+ }
+ release_sock(sk);
return 0;
}
case PACKET_QDISC_BYPASS:
@@ -4208,8 +4244,6 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
/* Added to avoid minimal code churn */
struct tpacket_req *req = &req_u->req;
- lock_sock(sk);
-
rb = tx_ring ? &po->tx_ring : &po->rx_ring;
rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
@@ -4347,7 +4381,6 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
if (pg_vec)
free_pg_vec(pg_vec, order, req->tp_block_nr);
out:
- release_sock(sk);
return err;
}
diff --git a/net/packet/internal.h b/net/packet/internal.h
index a1d2b2319ae9..3bb7c5fb3bff 100644
--- a/net/packet/internal.h
+++ b/net/packet/internal.h
@@ -112,10 +112,12 @@ struct packet_sock {
int copy_thresh;
spinlock_t bind_lock;
struct mutex pg_vec_lock;
- unsigned int running:1, /* prot_hook is attached*/
- auxdata:1,
+ unsigned int running; /* bind_lock must be held */
+ unsigned int auxdata:1, /* writer must hold sock lock */
origdev:1,
- has_vnet_hdr:1;
+ has_vnet_hdr:1,
+ tp_loss:1,
+ tp_tx_has_off:1;
int pressure;
int ifindex; /* bound device */
__be16 num;
@@ -125,8 +127,6 @@ struct packet_sock {
enum tpacket_versions tp_version;
unsigned int tp_hdrlen;
unsigned int tp_reserve;
- unsigned int tp_loss:1;
- unsigned int tp_tx_has_off:1;
unsigned int tp_tstamp;
struct net_device __rcu *cached_dev;
int (*xmit)(struct sk_buff *skb);
diff --git a/net/qrtr/Kconfig b/net/qrtr/Kconfig
index 326fd97444f5..1944834d225c 100644
--- a/net/qrtr/Kconfig
+++ b/net/qrtr/Kconfig
@@ -21,4 +21,11 @@ config QRTR_SMD
Say Y here to support SMD based ipcrouter channels. SMD is the
most common transport for IPC Router.
+config QRTR_TUN
+ tristate "TUN device for Qualcomm IPC Router"
+ ---help---
+ Say Y here to expose a character device that allows user space to
+ implement endpoints of QRTR, for purpose of tunneling data to other
+ hosts or testing purposes.
+
endif # QRTR
diff --git a/net/qrtr/Makefile b/net/qrtr/Makefile
index ab09e40f7c74..be012bfd3e52 100644
--- a/net/qrtr/Makefile
+++ b/net/qrtr/Makefile
@@ -2,3 +2,5 @@ obj-$(CONFIG_QRTR) := qrtr.o
obj-$(CONFIG_QRTR_SMD) += qrtr-smd.o
qrtr-smd-y := smd.o
+obj-$(CONFIG_QRTR_TUN) += qrtr-tun.o
+qrtr-tun-y := tun.o
diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c
index b33e5aeb4c06..2aa07b547b16 100644
--- a/net/qrtr/qrtr.c
+++ b/net/qrtr/qrtr.c
@@ -1135,3 +1135,4 @@ module_exit(qrtr_proto_fini);
MODULE_DESCRIPTION("Qualcomm IPC-router driver");
MODULE_LICENSE("GPL v2");
+MODULE_ALIAS_NETPROTO(PF_QIPCRTR);
diff --git a/net/qrtr/tun.c b/net/qrtr/tun.c
new file mode 100644
index 000000000000..ccff1e544c21
--- /dev/null
+++ b/net/qrtr/tun.c
@@ -0,0 +1,161 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2018, Linaro Ltd */
+
+#include <linux/miscdevice.h>
+#include <linux/module.h>
+#include <linux/poll.h>
+#include <linux/skbuff.h>
+#include <linux/uaccess.h>
+
+#include "qrtr.h"
+
+struct qrtr_tun {
+ struct qrtr_endpoint ep;
+
+ struct sk_buff_head queue;
+ wait_queue_head_t readq;
+};
+
+static int qrtr_tun_send(struct qrtr_endpoint *ep, struct sk_buff *skb)
+{
+ struct qrtr_tun *tun = container_of(ep, struct qrtr_tun, ep);
+
+ skb_queue_tail(&tun->queue, skb);
+
+ /* wake up any blocking processes, waiting for new data */
+ wake_up_interruptible(&tun->readq);
+
+ return 0;
+}
+
+static int qrtr_tun_open(struct inode *inode, struct file *filp)
+{
+ struct qrtr_tun *tun;
+
+ tun = kzalloc(sizeof(*tun), GFP_KERNEL);
+ if (!tun)
+ return -ENOMEM;
+
+ skb_queue_head_init(&tun->queue);
+ init_waitqueue_head(&tun->readq);
+
+ tun->ep.xmit = qrtr_tun_send;
+
+ filp->private_data = tun;
+
+ return qrtr_endpoint_register(&tun->ep, QRTR_EP_NID_AUTO);
+}
+
+static ssize_t qrtr_tun_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+ struct file *filp = iocb->ki_filp;
+ struct qrtr_tun *tun = filp->private_data;
+ struct sk_buff *skb;
+ int count;
+
+ while (!(skb = skb_dequeue(&tun->queue))) {
+ if (filp->f_flags & O_NONBLOCK)
+ return -EAGAIN;
+
+ /* Wait until we get data or the endpoint goes away */
+ if (wait_event_interruptible(tun->readq,
+ !skb_queue_empty(&tun->queue)))
+ return -ERESTARTSYS;
+ }
+
+ count = min_t(size_t, iov_iter_count(to), skb->len);
+ if (copy_to_iter(skb->data, count, to) != count)
+ count = -EFAULT;
+
+ kfree_skb(skb);
+
+ return count;
+}
+
+static ssize_t qrtr_tun_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct file *filp = iocb->ki_filp;
+ struct qrtr_tun *tun = filp->private_data;
+ size_t len = iov_iter_count(from);
+ ssize_t ret;
+ void *kbuf;
+
+ kbuf = kzalloc(len, GFP_KERNEL);
+ if (!kbuf)
+ return -ENOMEM;
+
+ if (!copy_from_iter_full(kbuf, len, from))
+ return -EFAULT;
+
+ ret = qrtr_endpoint_post(&tun->ep, kbuf, len);
+
+ return ret < 0 ? ret : len;
+}
+
+static __poll_t qrtr_tun_poll(struct file *filp, poll_table *wait)
+{
+ struct qrtr_tun *tun = filp->private_data;
+ __poll_t mask = 0;
+
+ poll_wait(filp, &tun->readq, wait);
+
+ if (!skb_queue_empty(&tun->queue))
+ mask |= EPOLLIN | EPOLLRDNORM;
+
+ return mask;
+}
+
+static int qrtr_tun_release(struct inode *inode, struct file *filp)
+{
+ struct qrtr_tun *tun = filp->private_data;
+ struct sk_buff *skb;
+
+ qrtr_endpoint_unregister(&tun->ep);
+
+ /* Discard all SKBs */
+ while (!skb_queue_empty(&tun->queue)) {
+ skb = skb_dequeue(&tun->queue);
+ kfree_skb(skb);
+ }
+
+ kfree(tun);
+
+ return 0;
+}
+
+static const struct file_operations qrtr_tun_ops = {
+ .owner = THIS_MODULE,
+ .open = qrtr_tun_open,
+ .poll = qrtr_tun_poll,
+ .read_iter = qrtr_tun_read_iter,
+ .write_iter = qrtr_tun_write_iter,
+ .release = qrtr_tun_release,
+};
+
+static struct miscdevice qrtr_tun_miscdev = {
+ MISC_DYNAMIC_MINOR,
+ "qrtr-tun",
+ &qrtr_tun_ops,
+};
+
+static int __init qrtr_tun_init(void)
+{
+ int ret;
+
+ ret = misc_register(&qrtr_tun_miscdev);
+ if (ret)
+ pr_err("failed to register Qualcomm IPC Router tun device\n");
+
+ return ret;
+}
+
+static void __exit qrtr_tun_exit(void)
+{
+ misc_deregister(&qrtr_tun_miscdev);
+}
+
+module_init(qrtr_tun_init);
+module_exit(qrtr_tun_exit);
+
+MODULE_DESCRIPTION("Qualcomm IPC Router TUN device");
+MODULE_LICENSE("GPL v2");
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index eea1d8611b20..13b38ad0fa4a 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -547,7 +547,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
rdsdebug("conn %p pd %p cq %p %p\n", conn, ic->i_pd,
ic->i_send_cq, ic->i_recv_cq);
- return ret;
+ goto out;
sends_out:
vfree(ic->i_sends);
@@ -572,6 +572,7 @@ send_cq_out:
ic->i_send_cq = NULL;
rds_ibdev_out:
rds_ib_remove_conn(rds_ibdev, conn);
+out:
rds_ib_dev_put(rds_ibdev);
return ret;
diff --git a/net/rds/recv.c b/net/rds/recv.c
index de50e2126e40..dc67458b52f0 100644
--- a/net/rds/recv.c
+++ b/net/rds/recv.c
@@ -558,6 +558,7 @@ static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg,
struct rds_cmsg_rx_trace t;
int i, j;
+ memset(&t, 0, sizeof(t));
inc->i_rx_lat_trace[RDS_MSG_RX_CMSG] = local_clock();
t.rx_traces = rs->rs_rx_traces;
for (i = 0; i < rs->rs_rx_traces; i++) {
diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
index 7e28b2ce1437..526a8e491626 100644
--- a/net/sched/act_csum.c
+++ b/net/sched/act_csum.c
@@ -648,6 +648,11 @@ static int tcf_csum_search(struct net *net, struct tc_action **a, u32 index,
return tcf_idr_search(tn, a, index);
}
+static size_t tcf_csum_get_fill_size(const struct tc_action *act)
+{
+ return nla_total_size(sizeof(struct tc_csum));
+}
+
static struct tc_action_ops act_csum_ops = {
.kind = "csum",
.type = TCA_ACT_CSUM,
@@ -658,6 +663,7 @@ static struct tc_action_ops act_csum_ops = {
.cleanup = tcf_csum_cleanup,
.walk = tcf_csum_walker,
.lookup = tcf_csum_search,
+ .get_fill_size = tcf_csum_get_fill_size,
.size = sizeof(struct tcf_csum),
};
diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c
index a5994cf0512b..8527cfdc446d 100644
--- a/net/sched/act_ife.c
+++ b/net/sched/act_ife.c
@@ -652,7 +652,7 @@ static int find_decode_metaid(struct sk_buff *skb, struct tcf_ife_info *ife,
}
}
- return 0;
+ return -ENOENT;
}
static int tcf_ife_decode(struct sk_buff *skb, const struct tc_action *a,
@@ -682,7 +682,12 @@ static int tcf_ife_decode(struct sk_buff *skb, const struct tc_action *a,
u16 mtype;
u16 dlen;
- curr_data = ife_tlv_meta_decode(tlv_data, &mtype, &dlen, NULL);
+ curr_data = ife_tlv_meta_decode(tlv_data, ifehdr_end, &mtype,
+ &dlen, NULL);
+ if (!curr_data) {
+ qstats_drop_inc(this_cpu_ptr(ife->common.cpu_qstats));
+ return TC_ACT_SHOT;
+ }
if (find_decode_metaid(skb, ife, mtype, dlen, curr_data)) {
/* abuse overlimits to count when we receive metadata
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index d964e60c730e..eacaaf803914 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -61,16 +61,18 @@ struct fl_flow_mask_range {
struct fl_flow_mask {
struct fl_flow_key key;
struct fl_flow_mask_range range;
- struct rcu_head rcu;
+ struct rhash_head ht_node;
+ struct rhashtable ht;
+ struct rhashtable_params filter_ht_params;
+ struct flow_dissector dissector;
+ struct list_head filters;
+ struct rcu_head rcu;
+ struct list_head list;
};
struct cls_fl_head {
struct rhashtable ht;
- struct fl_flow_mask mask;
- struct flow_dissector dissector;
- bool mask_assigned;
- struct list_head filters;
- struct rhashtable_params ht_params;
+ struct list_head masks;
union {
struct work_struct work;
struct rcu_head rcu;
@@ -79,6 +81,7 @@ struct cls_fl_head {
};
struct cls_fl_filter {
+ struct fl_flow_mask *mask;
struct rhash_head ht_node;
struct fl_flow_key mkey;
struct tcf_exts exts;
@@ -94,6 +97,13 @@ struct cls_fl_filter {
struct net_device *hw_dev;
};
+static const struct rhashtable_params mask_ht_params = {
+ .key_offset = offsetof(struct fl_flow_mask, key),
+ .key_len = sizeof(struct fl_flow_key),
+ .head_offset = offsetof(struct fl_flow_mask, ht_node),
+ .automatic_shrinking = true,
+};
+
static unsigned short int fl_mask_range(const struct fl_flow_mask *mask)
{
return mask->range.end - mask->range.start;
@@ -103,13 +113,19 @@ static void fl_mask_update_range(struct fl_flow_mask *mask)
{
const u8 *bytes = (const u8 *) &mask->key;
size_t size = sizeof(mask->key);
- size_t i, first = 0, last = size - 1;
+ size_t i, first = 0, last;
- for (i = 0; i < sizeof(mask->key); i++) {
+ for (i = 0; i < size; i++) {
+ if (bytes[i]) {
+ first = i;
+ break;
+ }
+ }
+ last = first;
+ for (i = size - 1; i != first; i--) {
if (bytes[i]) {
- if (!first && i)
- first = i;
last = i;
+ break;
}
}
mask->range.start = rounddown(first, sizeof(long));
@@ -140,12 +156,11 @@ static void fl_clear_masked_range(struct fl_flow_key *key,
memset(fl_key_get_start(key, mask), 0, fl_mask_range(mask));
}
-static struct cls_fl_filter *fl_lookup(struct cls_fl_head *head,
+static struct cls_fl_filter *fl_lookup(struct fl_flow_mask *mask,
struct fl_flow_key *mkey)
{
- return rhashtable_lookup_fast(&head->ht,
- fl_key_get_start(mkey, &head->mask),
- head->ht_params);
+ return rhashtable_lookup_fast(&mask->ht, fl_key_get_start(mkey, mask),
+ mask->filter_ht_params);
}
static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp,
@@ -153,28 +168,28 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp,
{
struct cls_fl_head *head = rcu_dereference_bh(tp->root);
struct cls_fl_filter *f;
+ struct fl_flow_mask *mask;
struct fl_flow_key skb_key;
struct fl_flow_key skb_mkey;
- if (!atomic_read(&head->ht.nelems))
- return -1;
-
- fl_clear_masked_range(&skb_key, &head->mask);
+ list_for_each_entry_rcu(mask, &head->masks, list) {
+ fl_clear_masked_range(&skb_key, mask);
- skb_key.indev_ifindex = skb->skb_iif;
- /* skb_flow_dissect() does not set n_proto in case an unknown protocol,
- * so do it rather here.
- */
- skb_key.basic.n_proto = skb->protocol;
- skb_flow_dissect_tunnel_info(skb, &head->dissector, &skb_key);
- skb_flow_dissect(skb, &head->dissector, &skb_key, 0);
+ skb_key.indev_ifindex = skb->skb_iif;
+ /* skb_flow_dissect() does not set n_proto in case an unknown
+ * protocol, so do it rather here.
+ */
+ skb_key.basic.n_proto = skb->protocol;
+ skb_flow_dissect_tunnel_info(skb, &mask->dissector, &skb_key);
+ skb_flow_dissect(skb, &mask->dissector, &skb_key, 0);
- fl_set_masked_key(&skb_mkey, &skb_key, &head->mask);
+ fl_set_masked_key(&skb_mkey, &skb_key, mask);
- f = fl_lookup(head, &skb_mkey);
- if (f && !tc_skip_sw(f->flags)) {
- *res = f->res;
- return tcf_exts_exec(skb, &f->exts, res);
+ f = fl_lookup(mask, &skb_mkey);
+ if (f && !tc_skip_sw(f->flags)) {
+ *res = f->res;
+ return tcf_exts_exec(skb, &f->exts, res);
+ }
}
return -1;
}
@@ -187,11 +202,28 @@ static int fl_init(struct tcf_proto *tp)
if (!head)
return -ENOBUFS;
- INIT_LIST_HEAD_RCU(&head->filters);
+ INIT_LIST_HEAD_RCU(&head->masks);
rcu_assign_pointer(tp->root, head);
idr_init(&head->handle_idr);
- return 0;
+ return rhashtable_init(&head->ht, &mask_ht_params);
+}
+
+static bool fl_mask_put(struct cls_fl_head *head, struct fl_flow_mask *mask,
+ bool async)
+{
+ if (!list_empty(&mask->filters))
+ return false;
+
+ rhashtable_remove_fast(&head->ht, &mask->ht_node, mask_ht_params);
+ rhashtable_destroy(&mask->ht);
+ list_del_rcu(&mask->list);
+ if (async)
+ kfree_rcu(mask, rcu);
+ else
+ kfree(mask);
+
+ return true;
}
static void __fl_destroy_filter(struct cls_fl_filter *f)
@@ -234,8 +266,6 @@ static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f,
}
static int fl_hw_replace_filter(struct tcf_proto *tp,
- struct flow_dissector *dissector,
- struct fl_flow_key *mask,
struct cls_fl_filter *f,
struct netlink_ext_ack *extack)
{
@@ -247,8 +277,8 @@ static int fl_hw_replace_filter(struct tcf_proto *tp,
tc_cls_common_offload_init(&cls_flower.common, tp, f->flags, extack);
cls_flower.command = TC_CLSFLOWER_REPLACE;
cls_flower.cookie = (unsigned long) f;
- cls_flower.dissector = dissector;
- cls_flower.mask = mask;
+ cls_flower.dissector = &f->mask->dissector;
+ cls_flower.mask = &f->mask->key;
cls_flower.key = &f->mkey;
cls_flower.exts = &f->exts;
cls_flower.classid = f->res.classid;
@@ -283,28 +313,31 @@ static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f)
&cls_flower, false);
}
-static void __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f,
+static bool __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f,
struct netlink_ext_ack *extack)
{
struct cls_fl_head *head = rtnl_dereference(tp->root);
+ bool async = tcf_exts_get_net(&f->exts);
+ bool last;
idr_remove(&head->handle_idr, f->handle);
list_del_rcu(&f->list);
+ last = fl_mask_put(head, f->mask, async);
if (!tc_skip_hw(f->flags))
fl_hw_destroy_filter(tp, f, extack);
tcf_unbind_filter(tp, &f->res);
- if (tcf_exts_get_net(&f->exts))
+ if (async)
call_rcu(&f->rcu, fl_destroy_filter);
else
__fl_destroy_filter(f);
+
+ return last;
}
static void fl_destroy_sleepable(struct work_struct *work)
{
struct cls_fl_head *head = container_of(work, struct cls_fl_head,
work);
- if (head->mask_assigned)
- rhashtable_destroy(&head->ht);
kfree(head);
module_put(THIS_MODULE);
}
@@ -320,10 +353,15 @@ static void fl_destroy_rcu(struct rcu_head *rcu)
static void fl_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack)
{
struct cls_fl_head *head = rtnl_dereference(tp->root);
+ struct fl_flow_mask *mask, *next_mask;
struct cls_fl_filter *f, *next;
- list_for_each_entry_safe(f, next, &head->filters, list)
- __fl_delete(tp, f, extack);
+ list_for_each_entry_safe(mask, next_mask, &head->masks, list) {
+ list_for_each_entry_safe(f, next, &mask->filters, list) {
+ if (__fl_delete(tp, f, extack))
+ break;
+ }
+ }
idr_destroy(&head->handle_idr);
__module_get(THIS_MODULE);
@@ -715,14 +753,14 @@ static int fl_set_key(struct net *net, struct nlattr **tb,
return ret;
}
-static bool fl_mask_eq(struct fl_flow_mask *mask1,
- struct fl_flow_mask *mask2)
+static void fl_mask_copy(struct fl_flow_mask *dst,
+ struct fl_flow_mask *src)
{
- const long *lmask1 = fl_key_get_start(&mask1->key, mask1);
- const long *lmask2 = fl_key_get_start(&mask2->key, mask2);
+ const void *psrc = fl_key_get_start(&src->key, src);
+ void *pdst = fl_key_get_start(&dst->key, src);
- return !memcmp(&mask1->range, &mask2->range, sizeof(mask1->range)) &&
- !memcmp(lmask1, lmask2, fl_mask_range(mask1));
+ memcpy(pdst, psrc, fl_mask_range(src));
+ dst->range = src->range;
}
static const struct rhashtable_params fl_ht_params = {
@@ -731,14 +769,13 @@ static const struct rhashtable_params fl_ht_params = {
.automatic_shrinking = true,
};
-static int fl_init_hashtable(struct cls_fl_head *head,
- struct fl_flow_mask *mask)
+static int fl_init_mask_hashtable(struct fl_flow_mask *mask)
{
- head->ht_params = fl_ht_params;
- head->ht_params.key_len = fl_mask_range(mask);
- head->ht_params.key_offset += mask->range.start;
+ mask->filter_ht_params = fl_ht_params;
+ mask->filter_ht_params.key_len = fl_mask_range(mask);
+ mask->filter_ht_params.key_offset += mask->range.start;
- return rhashtable_init(&head->ht, &head->ht_params);
+ return rhashtable_init(&mask->ht, &mask->filter_ht_params);
}
#define FL_KEY_MEMBER_OFFSET(member) offsetof(struct fl_flow_key, member)
@@ -761,8 +798,7 @@ static int fl_init_hashtable(struct cls_fl_head *head,
FL_KEY_SET(keys, cnt, id, member); \
} while(0);
-static void fl_init_dissector(struct cls_fl_head *head,
- struct fl_flow_mask *mask)
+static void fl_init_dissector(struct fl_flow_mask *mask)
{
struct flow_dissector_key keys[FLOW_DISSECTOR_KEY_MAX];
size_t cnt = 0;
@@ -802,31 +838,66 @@ static void fl_init_dissector(struct cls_fl_head *head,
FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
FLOW_DISSECTOR_KEY_ENC_PORTS, enc_tp);
- skb_flow_dissector_init(&head->dissector, keys, cnt);
+ skb_flow_dissector_init(&mask->dissector, keys, cnt);
+}
+
+static struct fl_flow_mask *fl_create_new_mask(struct cls_fl_head *head,
+ struct fl_flow_mask *mask)
+{
+ struct fl_flow_mask *newmask;
+ int err;
+
+ newmask = kzalloc(sizeof(*newmask), GFP_KERNEL);
+ if (!newmask)
+ return ERR_PTR(-ENOMEM);
+
+ fl_mask_copy(newmask, mask);
+
+ err = fl_init_mask_hashtable(newmask);
+ if (err)
+ goto errout_free;
+
+ fl_init_dissector(newmask);
+
+ INIT_LIST_HEAD_RCU(&newmask->filters);
+
+ err = rhashtable_insert_fast(&head->ht, &newmask->ht_node,
+ mask_ht_params);
+ if (err)
+ goto errout_destroy;
+
+ list_add_tail_rcu(&newmask->list, &head->masks);
+
+ return newmask;
+
+errout_destroy:
+ rhashtable_destroy(&newmask->ht);
+errout_free:
+ kfree(newmask);
+
+ return ERR_PTR(err);
}
static int fl_check_assign_mask(struct cls_fl_head *head,
+ struct cls_fl_filter *fnew,
+ struct cls_fl_filter *fold,
struct fl_flow_mask *mask)
{
- int err;
+ struct fl_flow_mask *newmask;
- if (head->mask_assigned) {
- if (!fl_mask_eq(&head->mask, mask))
+ fnew->mask = rhashtable_lookup_fast(&head->ht, mask, mask_ht_params);
+ if (!fnew->mask) {
+ if (fold)
return -EINVAL;
- else
- return 0;
- }
- /* Mask is not assigned yet. So assign it and init hashtable
- * according to that.
- */
- err = fl_init_hashtable(head, mask);
- if (err)
- return err;
- memcpy(&head->mask, mask, sizeof(head->mask));
- head->mask_assigned = true;
+ newmask = fl_create_new_mask(head, mask);
+ if (IS_ERR(newmask))
+ return PTR_ERR(newmask);
- fl_init_dissector(head, mask);
+ fnew->mask = newmask;
+ } else if (fold && fold->mask == fnew->mask) {
+ return -EINVAL;
+ }
return 0;
}
@@ -924,30 +995,26 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
if (err)
goto errout_idr;
- err = fl_check_assign_mask(head, &mask);
+ err = fl_check_assign_mask(head, fnew, fold, &mask);
if (err)
goto errout_idr;
if (!tc_skip_sw(fnew->flags)) {
- if (!fold && fl_lookup(head, &fnew->mkey)) {
+ if (!fold && fl_lookup(fnew->mask, &fnew->mkey)) {
err = -EEXIST;
- goto errout_idr;
+ goto errout_mask;
}
- err = rhashtable_insert_fast(&head->ht, &fnew->ht_node,
- head->ht_params);
+ err = rhashtable_insert_fast(&fnew->mask->ht, &fnew->ht_node,
+ fnew->mask->filter_ht_params);
if (err)
- goto errout_idr;
+ goto errout_mask;
}
if (!tc_skip_hw(fnew->flags)) {
- err = fl_hw_replace_filter(tp,
- &head->dissector,
- &mask.key,
- fnew,
- extack);
+ err = fl_hw_replace_filter(tp, fnew, extack);
if (err)
- goto errout_idr;
+ goto errout_mask;
}
if (!tc_in_hw(fnew->flags))
@@ -955,8 +1022,9 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
if (fold) {
if (!tc_skip_sw(fold->flags))
- rhashtable_remove_fast(&head->ht, &fold->ht_node,
- head->ht_params);
+ rhashtable_remove_fast(&fold->mask->ht,
+ &fold->ht_node,
+ fold->mask->filter_ht_params);
if (!tc_skip_hw(fold->flags))
fl_hw_destroy_filter(tp, fold, NULL);
}
@@ -970,12 +1038,15 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
tcf_exts_get_net(&fold->exts);
call_rcu(&fold->rcu, fl_destroy_filter);
} else {
- list_add_tail_rcu(&fnew->list, &head->filters);
+ list_add_tail_rcu(&fnew->list, &fnew->mask->filters);
}
kfree(tb);
return 0;
+errout_mask:
+ fl_mask_put(head, fnew->mask, false);
+
errout_idr:
if (fnew->handle)
idr_remove(&head->handle_idr, fnew->handle);
@@ -994,10 +1065,10 @@ static int fl_delete(struct tcf_proto *tp, void *arg, bool *last,
struct cls_fl_filter *f = arg;
if (!tc_skip_sw(f->flags))
- rhashtable_remove_fast(&head->ht, &f->ht_node,
- head->ht_params);
+ rhashtable_remove_fast(&f->mask->ht, &f->ht_node,
+ f->mask->filter_ht_params);
__fl_delete(tp, f, extack);
- *last = list_empty(&head->filters);
+ *last = list_empty(&head->masks);
return 0;
}
@@ -1005,16 +1076,19 @@ static void fl_walk(struct tcf_proto *tp, struct tcf_walker *arg)
{
struct cls_fl_head *head = rtnl_dereference(tp->root);
struct cls_fl_filter *f;
-
- list_for_each_entry_rcu(f, &head->filters, list) {
- if (arg->count < arg->skip)
- goto skip;
- if (arg->fn(tp, f, arg) < 0) {
- arg->stop = 1;
- break;
- }
+ struct fl_flow_mask *mask;
+
+ list_for_each_entry_rcu(mask, &head->masks, list) {
+ list_for_each_entry_rcu(f, &mask->filters, list) {
+ if (arg->count < arg->skip)
+ goto skip;
+ if (arg->fn(tp, f, arg) < 0) {
+ arg->stop = 1;
+ break;
+ }
skip:
- arg->count++;
+ arg->count++;
+ }
}
}
@@ -1150,7 +1224,6 @@ static int fl_dump_key_flags(struct sk_buff *skb, u32 flags_key, u32 flags_mask)
static int fl_dump(struct net *net, struct tcf_proto *tp, void *fh,
struct sk_buff *skb, struct tcmsg *t)
{
- struct cls_fl_head *head = rtnl_dereference(tp->root);
struct cls_fl_filter *f = fh;
struct nlattr *nest;
struct fl_flow_key *key, *mask;
@@ -1169,7 +1242,7 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, void *fh,
goto nla_put_failure;
key = &f->key;
- mask = &head->mask.key;
+ mask = &f->mask->key;
if (mask->indev_ifindex) {
struct net_device *dev;
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index a366e4c9413a..4808713c73b9 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -128,6 +128,28 @@ static bool fq_flow_is_detached(const struct fq_flow *f)
return f->next == &detached;
}
+static bool fq_flow_is_throttled(const struct fq_flow *f)
+{
+ return f->next == &throttled;
+}
+
+static void fq_flow_add_tail(struct fq_flow_head *head, struct fq_flow *flow)
+{
+ if (head->first)
+ head->last->next = flow;
+ else
+ head->first = flow;
+ head->last = flow;
+ flow->next = NULL;
+}
+
+static void fq_flow_unset_throttled(struct fq_sched_data *q, struct fq_flow *f)
+{
+ rb_erase(&f->rate_node, &q->delayed);
+ q->throttled_flows--;
+ fq_flow_add_tail(&q->old_flows, f);
+}
+
static void fq_flow_set_throttled(struct fq_sched_data *q, struct fq_flow *f)
{
struct rb_node **p = &q->delayed.rb_node, *parent = NULL;
@@ -155,15 +177,6 @@ static void fq_flow_set_throttled(struct fq_sched_data *q, struct fq_flow *f)
static struct kmem_cache *fq_flow_cachep __read_mostly;
-static void fq_flow_add_tail(struct fq_flow_head *head, struct fq_flow *flow)
-{
- if (head->first)
- head->last->next = flow;
- else
- head->first = flow;
- head->last = flow;
- flow->next = NULL;
-}
/* limit number of collected flows per round */
#define FQ_GC_MAX 8
@@ -267,6 +280,8 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
f->socket_hash != sk->sk_hash)) {
f->credit = q->initial_quantum;
f->socket_hash = sk->sk_hash;
+ if (fq_flow_is_throttled(f))
+ fq_flow_unset_throttled(q, f);
f->time_next_packet = 0ULL;
}
return f;
@@ -438,9 +453,7 @@ static void fq_check_throttled(struct fq_sched_data *q, u64 now)
q->time_next_delayed_flow = f->time_next_packet;
break;
}
- rb_erase(p, &q->delayed);
- q->throttled_flows--;
- fq_flow_add_tail(&q->old_flows, f);
+ fq_flow_unset_throttled(q, f);
}
}
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 837806dd5799..039fdb862b17 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -652,33 +652,20 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
*/
peer->param_flags = asoc->param_flags;
- sctp_transport_route(peer, NULL, sp);
-
/* Initialize the pmtu of the transport. */
- if (peer->param_flags & SPP_PMTUD_DISABLE) {
- if (asoc->pathmtu)
- peer->pathmtu = asoc->pathmtu;
- else
- peer->pathmtu = SCTP_DEFAULT_MAXSEGMENT;
- }
+ sctp_transport_route(peer, NULL, sp);
/* If this is the first transport addr on this association,
* initialize the association PMTU to the peer's PMTU.
* If not and the current association PMTU is higher than the new
* peer's PMTU, reset the association PMTU to the new peer's PMTU.
*/
- if (asoc->pathmtu)
- asoc->pathmtu = min_t(int, peer->pathmtu, asoc->pathmtu);
- else
- asoc->pathmtu = peer->pathmtu;
-
- pr_debug("%s: association:%p PMTU set to %d\n", __func__, asoc,
- asoc->pathmtu);
+ sctp_assoc_set_pmtu(asoc, asoc->pathmtu ?
+ min_t(int, peer->pathmtu, asoc->pathmtu) :
+ peer->pathmtu);
peer->pmtu_pending = 0;
- asoc->frag_point = sctp_frag_point(asoc, asoc->pathmtu);
-
/* The asoc->peer.port might not be meaningful yet, but
* initialize the packet structure anyway.
*/
@@ -988,31 +975,6 @@ out:
return match;
}
-/* Is this the association we are looking for? */
-struct sctp_transport *sctp_assoc_is_match(struct sctp_association *asoc,
- struct net *net,
- const union sctp_addr *laddr,
- const union sctp_addr *paddr)
-{
- struct sctp_transport *transport;
-
- if ((htons(asoc->base.bind_addr.port) == laddr->v4.sin_port) &&
- (htons(asoc->peer.port) == paddr->v4.sin_port) &&
- net_eq(sock_net(asoc->base.sk), net)) {
- transport = sctp_assoc_lookup_paddr(asoc, paddr);
- if (!transport)
- goto out;
-
- if (sctp_bind_addr_match(&asoc->base.bind_addr, laddr,
- sctp_sk(asoc->base.sk)))
- goto out;
- }
- transport = NULL;
-
-out:
- return transport;
-}
-
/* Do delayed input processing. This is scheduled by sctp_rcv(). */
static void sctp_assoc_bh_rcv(struct work_struct *work)
{
@@ -1406,6 +1368,31 @@ sctp_assoc_choose_alter_transport(struct sctp_association *asoc,
}
}
+void sctp_assoc_update_frag_point(struct sctp_association *asoc)
+{
+ int frag = sctp_mtu_payload(sctp_sk(asoc->base.sk), asoc->pathmtu,
+ sctp_datachk_len(&asoc->stream));
+
+ if (asoc->user_frag)
+ frag = min_t(int, frag, asoc->user_frag);
+
+ frag = min_t(int, frag, SCTP_MAX_CHUNK_LEN -
+ sctp_datachk_len(&asoc->stream));
+
+ asoc->frag_point = SCTP_TRUNC4(frag);
+}
+
+void sctp_assoc_set_pmtu(struct sctp_association *asoc, __u32 pmtu)
+{
+ if (asoc->pathmtu != pmtu) {
+ asoc->pathmtu = pmtu;
+ sctp_assoc_update_frag_point(asoc);
+ }
+
+ pr_debug("%s: asoc:%p, pmtu:%d, frag_point:%d\n", __func__, asoc,
+ asoc->pathmtu, asoc->frag_point);
+}
+
/* Update the association's pmtu and frag_point by going through all the
* transports. This routine is called when a transport's PMTU has changed.
*/
@@ -1418,24 +1405,16 @@ void sctp_assoc_sync_pmtu(struct sctp_association *asoc)
return;
/* Get the lowest pmtu of all the transports. */
- list_for_each_entry(t, &asoc->peer.transport_addr_list,
- transports) {
+ list_for_each_entry(t, &asoc->peer.transport_addr_list, transports) {
if (t->pmtu_pending && t->dst) {
- sctp_transport_update_pmtu(
- t, SCTP_TRUNC4(dst_mtu(t->dst)));
+ sctp_transport_update_pmtu(t, sctp_dst_mtu(t->dst));
t->pmtu_pending = 0;
}
if (!pmtu || (t->pathmtu < pmtu))
pmtu = t->pathmtu;
}
- if (pmtu) {
- asoc->pathmtu = pmtu;
- asoc->frag_point = sctp_frag_point(asoc, pmtu);
- }
-
- pr_debug("%s: asoc:%p, pmtu:%d, frag_point:%d\n", __func__, asoc,
- asoc->pathmtu, asoc->frag_point);
+ sctp_assoc_set_pmtu(asoc, pmtu);
}
/* Should we send a SACK to update our peer? */
diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
index be296d633e95..79daa98208c3 100644
--- a/net/sctp/chunk.c
+++ b/net/sctp/chunk.c
@@ -172,8 +172,6 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
struct list_head *pos, *temp;
struct sctp_chunk *chunk;
struct sctp_datamsg *msg;
- struct sctp_sock *sp;
- struct sctp_af *af;
int err;
msg = sctp_datamsg_new(GFP_KERNEL);
@@ -192,12 +190,7 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
/* This is the biggest possible DATA chunk that can fit into
* the packet
*/
- sp = sctp_sk(asoc->base.sk);
- af = sp->pf->af;
- max_data = asoc->pathmtu - af->net_header_len -
- sizeof(struct sctphdr) - sctp_datachk_len(&asoc->stream) -
- af->ip_options_len(asoc->base.sk);
- max_data = SCTP_TRUNC4(max_data);
+ max_data = asoc->frag_point;
/* If the the peer requested that we authenticate DATA chunks
* we need to account for bundling of the AUTH chunks along with
@@ -222,9 +215,6 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
}
}
- /* Check what's our max considering the above */
- max_data = min_t(size_t, max_data, asoc->frag_point);
-
/* Set first_len and then account for possible bundles on first frag */
first_len = max_data;
diff --git a/net/sctp/inqueue.c b/net/sctp/inqueue.c
index 23ebc5318edc..eb93ffe2408b 100644
--- a/net/sctp/inqueue.c
+++ b/net/sctp/inqueue.c
@@ -217,7 +217,7 @@ new_skb:
skb_pull(chunk->skb, sizeof(*ch));
chunk->subh.v = NULL; /* Subheader is no longer valid. */
- if (chunk->chunk_end + sizeof(*ch) < skb_tail_pointer(chunk->skb)) {
+ if (chunk->chunk_end + sizeof(*ch) <= skb_tail_pointer(chunk->skb)) {
/* This is not a singleton */
chunk->singleton = 0;
} else if (chunk->chunk_end > skb_tail_pointer(chunk->skb)) {
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index 31083b5035ec..42247110d842 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -556,46 +556,49 @@ static void sctp_v6_to_addr(union sctp_addr *addr, struct in6_addr *saddr,
addr->v6.sin6_scope_id = 0;
}
-/* Compare addresses exactly.
- * v4-mapped-v6 is also in consideration.
- */
-static int sctp_v6_cmp_addr(const union sctp_addr *addr1,
- const union sctp_addr *addr2)
+static int __sctp_v6_cmp_addr(const union sctp_addr *addr1,
+ const union sctp_addr *addr2)
{
if (addr1->sa.sa_family != addr2->sa.sa_family) {
if (addr1->sa.sa_family == AF_INET &&
addr2->sa.sa_family == AF_INET6 &&
- ipv6_addr_v4mapped(&addr2->v6.sin6_addr)) {
- if (addr2->v6.sin6_port == addr1->v4.sin_port &&
- addr2->v6.sin6_addr.s6_addr32[3] ==
- addr1->v4.sin_addr.s_addr)
- return 1;
- }
+ ipv6_addr_v4mapped(&addr2->v6.sin6_addr) &&
+ addr2->v6.sin6_addr.s6_addr32[3] ==
+ addr1->v4.sin_addr.s_addr)
+ return 1;
+
if (addr2->sa.sa_family == AF_INET &&
addr1->sa.sa_family == AF_INET6 &&
- ipv6_addr_v4mapped(&addr1->v6.sin6_addr)) {
- if (addr1->v6.sin6_port == addr2->v4.sin_port &&
- addr1->v6.sin6_addr.s6_addr32[3] ==
- addr2->v4.sin_addr.s_addr)
- return 1;
- }
+ ipv6_addr_v4mapped(&addr1->v6.sin6_addr) &&
+ addr1->v6.sin6_addr.s6_addr32[3] ==
+ addr2->v4.sin_addr.s_addr)
+ return 1;
+
return 0;
}
- if (addr1->v6.sin6_port != addr2->v6.sin6_port)
- return 0;
+
if (!ipv6_addr_equal(&addr1->v6.sin6_addr, &addr2->v6.sin6_addr))
return 0;
+
/* If this is a linklocal address, compare the scope_id. */
- if (ipv6_addr_type(&addr1->v6.sin6_addr) & IPV6_ADDR_LINKLOCAL) {
- if (addr1->v6.sin6_scope_id && addr2->v6.sin6_scope_id &&
- (addr1->v6.sin6_scope_id != addr2->v6.sin6_scope_id)) {
- return 0;
- }
- }
+ if ((ipv6_addr_type(&addr1->v6.sin6_addr) & IPV6_ADDR_LINKLOCAL) &&
+ addr1->v6.sin6_scope_id && addr2->v6.sin6_scope_id &&
+ addr1->v6.sin6_scope_id != addr2->v6.sin6_scope_id)
+ return 0;
return 1;
}
+/* Compare addresses exactly.
+ * v4-mapped-v6 is also in consideration.
+ */
+static int sctp_v6_cmp_addr(const union sctp_addr *addr1,
+ const union sctp_addr *addr2)
+{
+ return __sctp_v6_cmp_addr(addr1, addr2) &&
+ addr1->v6.sin6_port == addr2->v6.sin6_port;
+}
+
/* Initialize addr struct to INADDR_ANY. */
static void sctp_v6_inaddr_any(union sctp_addr *addr, __be16 port)
{
@@ -875,8 +878,8 @@ static int sctp_inet6_cmp_addr(const union sctp_addr *addr1,
const union sctp_addr *addr2,
struct sctp_sock *opt)
{
- struct sctp_af *af1, *af2;
struct sock *sk = sctp_opt2sk(opt);
+ struct sctp_af *af1, *af2;
af1 = sctp_get_af_specific(addr1->sa.sa_family);
af2 = sctp_get_af_specific(addr2->sa.sa_family);
@@ -892,10 +895,10 @@ static int sctp_inet6_cmp_addr(const union sctp_addr *addr1,
if (sctp_is_any(sk, addr1) || sctp_is_any(sk, addr2))
return 1;
- if (addr1->sa.sa_family != addr2->sa.sa_family)
- return 0;
+ if (addr1->sa.sa_family == AF_INET && addr2->sa.sa_family == AF_INET)
+ return addr1->v4.sin_addr.s_addr == addr2->v4.sin_addr.s_addr;
- return af1->cmp_addr(addr1, addr2);
+ return __sctp_v6_cmp_addr(addr1, addr2);
}
/* Verify that the provided sockaddr looks bindable. Common verification,
diff --git a/net/sctp/output.c b/net/sctp/output.c
index 690d8557bb7b..e672dee302c7 100644
--- a/net/sctp/output.c
+++ b/net/sctp/output.c
@@ -90,8 +90,8 @@ void sctp_packet_config(struct sctp_packet *packet, __u32 vtag,
{
struct sctp_transport *tp = packet->transport;
struct sctp_association *asoc = tp->asoc;
+ struct sctp_sock *sp = NULL;
struct sock *sk;
- size_t overhead = sizeof(struct ipv6hdr) + sizeof(struct sctphdr);
pr_debug("%s: packet:%p vtag:0x%x\n", __func__, packet, vtag);
packet->vtag = vtag;
@@ -102,28 +102,20 @@ void sctp_packet_config(struct sctp_packet *packet, __u32 vtag,
/* set packet max_size with pathmtu, then calculate overhead */
packet->max_size = tp->pathmtu;
+
if (asoc) {
- struct sctp_sock *sp = sctp_sk(asoc->base.sk);
- struct sctp_af *af = sp->pf->af;
-
- overhead = af->net_header_len +
- af->ip_options_len(asoc->base.sk);
- overhead += sizeof(struct sctphdr);
- packet->overhead = overhead;
- packet->size = overhead;
- } else {
- packet->overhead = overhead;
- packet->size = overhead;
- return;
+ sk = asoc->base.sk;
+ sp = sctp_sk(sk);
}
+ packet->overhead = sctp_mtu_payload(sp, 0, 0);
+ packet->size = packet->overhead;
+
+ if (!asoc)
+ return;
/* update dst or transport pathmtu if in need */
- sk = asoc->base.sk;
if (!sctp_transport_dst_check(tp)) {
- sctp_transport_route(tp, NULL, sctp_sk(sk));
- if (asoc->param_flags & SPP_PMTUD_ENABLE)
- sctp_assoc_sync_pmtu(asoc);
- } else if (!sctp_transport_pmtu_check(tp)) {
+ sctp_transport_route(tp, NULL, sp);
if (asoc->param_flags & SPP_PMTUD_ENABLE)
sctp_assoc_sync_pmtu(asoc);
}
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index f211b3db6a35..dee7cbd54831 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -1457,7 +1457,7 @@ static void sctp_check_transmitted(struct sctp_outq *q,
* the outstanding bytes for this chunk, so only
* count bytes associated with a transport.
*/
- if (transport) {
+ if (transport && !tchunk->tsn_gap_acked) {
/* If this chunk is being used for RTT
* measurement, calculate the RTT and update
* the RTO using this value.
@@ -1469,14 +1469,34 @@ static void sctp_check_transmitted(struct sctp_outq *q,
* first instance of the packet or a later
* instance).
*/
- if (!tchunk->tsn_gap_acked &&
- !sctp_chunk_retransmitted(tchunk) &&
+ if (!sctp_chunk_retransmitted(tchunk) &&
tchunk->rtt_in_progress) {
tchunk->rtt_in_progress = 0;
rtt = jiffies - tchunk->sent_at;
sctp_transport_update_rto(transport,
rtt);
}
+
+ if (TSN_lte(tsn, sack_ctsn)) {
+ /*
+ * SFR-CACC algorithm:
+ * 2) If the SACK contains gap acks
+ * and the flag CHANGEOVER_ACTIVE is
+ * set the receiver of the SACK MUST
+ * take the following action:
+ *
+ * B) For each TSN t being acked that
+ * has not been acked in any SACK so
+ * far, set cacc_saw_newack to 1 for
+ * the destination that the TSN was
+ * sent to.
+ */
+ if (sack->num_gap_ack_blocks &&
+ q->asoc->peer.primary_path->cacc.
+ changeover_active)
+ transport->cacc.cacc_saw_newack
+ = 1;
+ }
}
/* If the chunk hasn't been marked as ACKED,
@@ -1508,28 +1528,6 @@ static void sctp_check_transmitted(struct sctp_outq *q,
restart_timer = 1;
forward_progress = true;
- if (!tchunk->tsn_gap_acked) {
- /*
- * SFR-CACC algorithm:
- * 2) If the SACK contains gap acks
- * and the flag CHANGEOVER_ACTIVE is
- * set the receiver of the SACK MUST
- * take the following action:
- *
- * B) For each TSN t being acked that
- * has not been acked in any SACK so
- * far, set cacc_saw_newack to 1 for
- * the destination that the TSN was
- * sent to.
- */
- if (transport &&
- sack->num_gap_ack_blocks &&
- q->asoc->peer.primary_path->cacc.
- changeover_active)
- transport->cacc.cacc_saw_newack
- = 1;
- }
-
list_add_tail(&tchunk->transmitted_list,
&q->sacked);
} else {
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 5a4fb1dc8400..4d7b3ccea078 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -81,8 +81,6 @@ static int sctp_process_param(struct sctp_association *asoc,
gfp_t gfp);
static void *sctp_addto_param(struct sctp_chunk *chunk, int len,
const void *data);
-static void *sctp_addto_chunk_fixed(struct sctp_chunk *, int len,
- const void *data);
/* Control chunk destructor */
static void sctp_control_release_owner(struct sk_buff *skb)
@@ -154,12 +152,11 @@ static const struct sctp_paramhdr prsctp_param = {
cpu_to_be16(sizeof(struct sctp_paramhdr)),
};
-/* A helper to initialize an op error inside a
- * provided chunk, as most cause codes will be embedded inside an
- * abort chunk.
+/* A helper to initialize an op error inside a provided chunk, as most
+ * cause codes will be embedded inside an abort chunk.
*/
-void sctp_init_cause(struct sctp_chunk *chunk, __be16 cause_code,
- size_t paylen)
+int sctp_init_cause(struct sctp_chunk *chunk, __be16 cause_code,
+ size_t paylen)
{
struct sctp_errhdr err;
__u16 len;
@@ -167,33 +164,16 @@ void sctp_init_cause(struct sctp_chunk *chunk, __be16 cause_code,
/* Cause code constants are now defined in network order. */
err.cause = cause_code;
len = sizeof(err) + paylen;
- err.length = htons(len);
- chunk->subh.err_hdr = sctp_addto_chunk(chunk, sizeof(err), &err);
-}
-
-/* A helper to initialize an op error inside a
- * provided chunk, as most cause codes will be embedded inside an
- * abort chunk. Differs from sctp_init_cause in that it won't oops
- * if there isn't enough space in the op error chunk
- */
-static int sctp_init_cause_fixed(struct sctp_chunk *chunk, __be16 cause_code,
- size_t paylen)
-{
- struct sctp_errhdr err;
- __u16 len;
-
- /* Cause code constants are now defined in network order. */
- err.cause = cause_code;
- len = sizeof(err) + paylen;
- err.length = htons(len);
+ err.length = htons(len);
if (skb_tailroom(chunk->skb) < len)
return -ENOSPC;
- chunk->subh.err_hdr = sctp_addto_chunk_fixed(chunk, sizeof(err), &err);
+ chunk->subh.err_hdr = sctp_addto_chunk(chunk, sizeof(err), &err);
return 0;
}
+
/* 3.3.2 Initiation (INIT) (1)
*
* This chunk is used to initiate a SCTP association between two
@@ -779,10 +759,9 @@ struct sctp_chunk *sctp_make_datafrag_empty(const struct sctp_association *asoc,
* association. This reports on which TSN's we've seen to date,
* including duplicates and gaps.
*/
-struct sctp_chunk *sctp_make_sack(const struct sctp_association *asoc)
+struct sctp_chunk *sctp_make_sack(struct sctp_association *asoc)
{
struct sctp_tsnmap *map = (struct sctp_tsnmap *)&asoc->peer.tsn_map;
- struct sctp_association *aptr = (struct sctp_association *)asoc;
struct sctp_gap_ack_block gabs[SCTP_MAX_GABS];
__u16 num_gabs, num_dup_tsns;
struct sctp_transport *trans;
@@ -857,7 +836,7 @@ struct sctp_chunk *sctp_make_sack(const struct sctp_association *asoc)
/* Add the duplicate TSN information. */
if (num_dup_tsns) {
- aptr->stats.idupchunks += num_dup_tsns;
+ asoc->stats.idupchunks += num_dup_tsns;
sctp_addto_chunk(retval, sizeof(__u32) * num_dup_tsns,
sctp_tsnmap_get_dups(map));
}
@@ -869,11 +848,11 @@ struct sctp_chunk *sctp_make_sack(const struct sctp_association *asoc)
* association so no transport will match after a wrap event like this,
* Until the next sack
*/
- if (++aptr->peer.sack_generation == 0) {
+ if (++asoc->peer.sack_generation == 0) {
list_for_each_entry(trans, &asoc->peer.transport_addr_list,
transports)
trans->sack_generation = 0;
- aptr->peer.sack_generation = 1;
+ asoc->peer.sack_generation = 1;
}
nodata:
return retval;
@@ -1258,20 +1237,26 @@ nodata:
return retval;
}
-/* Create an Operation Error chunk of a fixed size,
- * specifically, max(asoc->pathmtu, SCTP_DEFAULT_MAXSEGMENT)
- * This is a helper function to allocate an error chunk for
- * for those invalid parameter codes in which we may not want
- * to report all the errors, if the incoming chunk is large
+/* Create an Operation Error chunk of a fixed size, specifically,
+ * min(asoc->pathmtu, SCTP_DEFAULT_MAXSEGMENT) - overheads.
+ * This is a helper function to allocate an error chunk for for those
+ * invalid parameter codes in which we may not want to report all the
+ * errors, if the incoming chunk is large. If it can't fit in a single
+ * packet, we ignore it.
*/
-static inline struct sctp_chunk *sctp_make_op_error_fixed(
+static inline struct sctp_chunk *sctp_make_op_error_limited(
const struct sctp_association *asoc,
const struct sctp_chunk *chunk)
{
- size_t size = asoc ? asoc->pathmtu : 0;
+ size_t size = SCTP_DEFAULT_MAXSEGMENT;
+ struct sctp_sock *sp = NULL;
- if (!size)
- size = SCTP_DEFAULT_MAXSEGMENT;
+ if (asoc) {
+ size = min_t(size_t, size, asoc->pathmtu);
+ sp = sctp_sk(asoc->base.sk);
+ }
+
+ size = sctp_mtu_payload(sp, size, sizeof(struct sctp_errhdr));
return sctp_make_op_error_space(asoc, chunk, size);
}
@@ -1523,18 +1508,6 @@ void *sctp_addto_chunk(struct sctp_chunk *chunk, int len, const void *data)
return target;
}
-/* Append bytes to the end of a chunk. Returns NULL if there isn't sufficient
- * space in the chunk
- */
-static void *sctp_addto_chunk_fixed(struct sctp_chunk *chunk,
- int len, const void *data)
-{
- if (skb_tailroom(chunk->skb) >= len)
- return sctp_addto_chunk(chunk, len, data);
- else
- return NULL;
-}
-
/* Append bytes from user space to the end of a chunk. Will panic if
* chunk is not big enough.
* Returns a kernel err value.
@@ -1829,6 +1802,9 @@ no_hmac:
kt = ktime_get_real();
if (!asoc && ktime_before(bear_cookie->expiration, kt)) {
+ suseconds_t usecs = ktime_to_us(ktime_sub(kt, bear_cookie->expiration));
+ __be32 n = htonl(usecs);
+
/*
* Section 3.3.10.3 Stale Cookie Error (3)
*
@@ -1837,17 +1813,12 @@ no_hmac:
* Stale Cookie Error: Indicates the receipt of a valid State
* Cookie that has expired.
*/
- len = ntohs(chunk->chunk_hdr->length);
- *errp = sctp_make_op_error_space(asoc, chunk, len);
- if (*errp) {
- suseconds_t usecs = ktime_to_us(ktime_sub(kt, bear_cookie->expiration));
- __be32 n = htonl(usecs);
-
- sctp_init_cause(*errp, SCTP_ERROR_STALE_COOKIE,
- sizeof(n));
- sctp_addto_chunk(*errp, sizeof(n), &n);
+ *errp = sctp_make_op_error(asoc, chunk,
+ SCTP_ERROR_STALE_COOKIE, &n,
+ sizeof(n), 0);
+ if (*errp)
*error = -SCTP_IERROR_STALE_COOKIE;
- } else
+ else
*error = -SCTP_IERROR_NOMEM;
goto fail;
@@ -1998,12 +1969,8 @@ static int sctp_process_hn_param(const struct sctp_association *asoc,
if (*errp)
sctp_chunk_free(*errp);
- *errp = sctp_make_op_error_space(asoc, chunk, len);
-
- if (*errp) {
- sctp_init_cause(*errp, SCTP_ERROR_DNS_FAILED, len);
- sctp_addto_chunk(*errp, len, param.v);
- }
+ *errp = sctp_make_op_error(asoc, chunk, SCTP_ERROR_DNS_FAILED,
+ param.v, len, 0);
/* Stop processing this chunk. */
return 0;
@@ -2128,23 +2095,23 @@ static enum sctp_ierror sctp_process_unk_param(
/* Make an ERROR chunk, preparing enough room for
* returning multiple unknown parameters.
*/
- if (NULL == *errp)
- *errp = sctp_make_op_error_fixed(asoc, chunk);
-
- if (*errp) {
- if (!sctp_init_cause_fixed(*errp, SCTP_ERROR_UNKNOWN_PARAM,
- SCTP_PAD4(ntohs(param.p->length))))
- sctp_addto_chunk_fixed(*errp,
- SCTP_PAD4(ntohs(param.p->length)),
- param.v);
- } else {
- /* If there is no memory for generating the ERROR
- * report as specified, an ABORT will be triggered
- * to the peer and the association won't be
- * established.
- */
- retval = SCTP_IERROR_NOMEM;
+ if (!*errp) {
+ *errp = sctp_make_op_error_limited(asoc, chunk);
+ if (!*errp) {
+ /* If there is no memory for generating the
+ * ERROR report as specified, an ABORT will be
+ * triggered to the peer and the association
+ * won't be established.
+ */
+ retval = SCTP_IERROR_NOMEM;
+ break;
+ }
}
+
+ if (!sctp_init_cause(*errp, SCTP_ERROR_UNKNOWN_PARAM,
+ ntohs(param.p->length)))
+ sctp_addto_chunk(*errp, ntohs(param.p->length),
+ param.v);
break;
default:
break;
@@ -2220,10 +2187,10 @@ static enum sctp_ierror sctp_verify_param(struct net *net,
* MUST be aborted. The ABORT chunk SHOULD contain the error
* cause 'Protocol Violation'.
*/
- if (SCTP_AUTH_RANDOM_LENGTH !=
- ntohs(param.p->length) - sizeof(struct sctp_paramhdr)) {
+ if (SCTP_AUTH_RANDOM_LENGTH != ntohs(param.p->length) -
+ sizeof(struct sctp_paramhdr)) {
sctp_process_inv_paramlength(asoc, param.p,
- chunk, err_chunk);
+ chunk, err_chunk);
retval = SCTP_IERROR_ABORT;
}
break;
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index dd0594a10961..28c070e187c2 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -1794,6 +1794,9 @@ static enum sctp_disposition sctp_sf_do_dupcook_a(
GFP_ATOMIC))
goto nomem;
+ if (sctp_auth_asoc_init_active_key(new_asoc, GFP_ATOMIC))
+ goto nomem;
+
/* Make sure no new addresses are being added during the
* restart. Though this is a pretty complicated attack
* since you'd have to get inside the cookie.
@@ -1906,6 +1909,9 @@ static enum sctp_disposition sctp_sf_do_dupcook_b(
GFP_ATOMIC))
goto nomem;
+ if (sctp_auth_asoc_init_active_key(new_asoc, GFP_ATOMIC))
+ goto nomem;
+
/* Update the content of current association. */
sctp_add_cmd_sf(commands, SCTP_CMD_UPDATE_ASSOC, SCTP_ASOC(new_asoc));
sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE,
@@ -2050,7 +2056,7 @@ static enum sctp_disposition sctp_sf_do_dupcook_d(
}
}
- repl = sctp_make_cookie_ack(new_asoc, chunk);
+ repl = sctp_make_cookie_ack(asoc, chunk);
if (!repl)
goto nomem;
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 80835ac26d2c..1b4593b842b0 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -644,16 +644,15 @@ static int sctp_send_asconf_add_ip(struct sock *sk,
list_for_each_entry(trans,
&asoc->peer.transport_addr_list, transports) {
- /* Clear the source and route cache */
- sctp_transport_dst_release(trans);
trans->cwnd = min(4*asoc->pathmtu, max_t(__u32,
2*asoc->pathmtu, 4380));
trans->ssthresh = asoc->peer.i.a_rwnd;
trans->rto = asoc->rto_initial;
sctp_max_rto(asoc, trans);
trans->rtt = trans->srtt = trans->rttvar = 0;
+ /* Clear the source and route cache */
sctp_transport_route(trans, NULL,
- sctp_sk(asoc->base.sk));
+ sctp_sk(asoc->base.sk));
}
}
retval = sctp_send_asconf(asoc, chunk);
@@ -896,7 +895,6 @@ skip_mkasconf:
*/
list_for_each_entry(transport, &asoc->peer.transport_addr_list,
transports) {
- sctp_transport_dst_release(transport);
sctp_transport_route(transport, NULL,
sctp_sk(asoc->base.sk));
}
@@ -1895,6 +1893,7 @@ static int sctp_sendmsg_to_asoc(struct sctp_association *asoc,
struct sctp_sndrcvinfo *sinfo)
{
struct sock *sk = asoc->base.sk;
+ struct sctp_sock *sp = sctp_sk(sk);
struct net *net = sock_net(sk);
struct sctp_datamsg *datamsg;
bool wait_connect = false;
@@ -1913,13 +1912,16 @@ static int sctp_sendmsg_to_asoc(struct sctp_association *asoc,
goto err;
}
- if (sctp_sk(sk)->disable_fragments && msg_len > asoc->frag_point) {
+ if (sp->disable_fragments && msg_len > asoc->frag_point) {
err = -EMSGSIZE;
goto err;
}
- if (asoc->pmtu_pending)
- sctp_assoc_pending_pmtu(asoc);
+ if (asoc->pmtu_pending) {
+ if (sp->param_flags & SPP_PMTUD_ENABLE)
+ sctp_assoc_sync_pmtu(asoc);
+ asoc->pmtu_pending = 0;
+ }
if (sctp_wspace(asoc) < msg_len)
sctp_prsctp_prune(asoc, sinfo, msg_len - sctp_wspace(asoc));
@@ -1936,7 +1938,7 @@ static int sctp_sendmsg_to_asoc(struct sctp_association *asoc,
if (err)
goto err;
- if (sctp_sk(sk)->strm_interleave) {
+ if (sp->strm_interleave) {
timeo = sock_sndtimeo(sk, 0);
err = sctp_wait_for_connect(asoc, &timeo);
if (err)
@@ -2539,7 +2541,7 @@ static int sctp_apply_peer_addr_params(struct sctp_paddrparams *params,
trans->pathmtu = params->spp_pathmtu;
sctp_assoc_sync_pmtu(asoc);
} else if (asoc) {
- asoc->pathmtu = params->spp_pathmtu;
+ sctp_assoc_set_pmtu(asoc, params->spp_pathmtu);
} else {
sp->pathmtu = params->spp_pathmtu;
}
@@ -3209,7 +3211,6 @@ static int sctp_setsockopt_mappedv4(struct sock *sk, char __user *optval, unsign
static int sctp_setsockopt_maxseg(struct sock *sk, char __user *optval, unsigned int optlen)
{
struct sctp_sock *sp = sctp_sk(sk);
- struct sctp_af *af = sp->pf->af;
struct sctp_assoc_value params;
struct sctp_association *asoc;
int val;
@@ -3231,30 +3232,24 @@ static int sctp_setsockopt_maxseg(struct sock *sk, char __user *optval, unsigned
return -EINVAL;
}
+ asoc = sctp_id2assoc(sk, params.assoc_id);
+
if (val) {
int min_len, max_len;
+ __u16 datasize = asoc ? sctp_datachk_len(&asoc->stream) :
+ sizeof(struct sctp_data_chunk);
- min_len = SCTP_DEFAULT_MINSEGMENT - af->net_header_len;
- min_len -= af->ip_options_len(sk);
- min_len -= sizeof(struct sctphdr) +
- sizeof(struct sctp_data_chunk);
-
- max_len = SCTP_MAX_CHUNK_LEN - sizeof(struct sctp_data_chunk);
+ min_len = sctp_mtu_payload(sp, SCTP_DEFAULT_MINSEGMENT,
+ datasize);
+ max_len = SCTP_MAX_CHUNK_LEN - datasize;
if (val < min_len || val > max_len)
return -EINVAL;
}
- asoc = sctp_id2assoc(sk, params.assoc_id);
if (asoc) {
- if (val == 0) {
- val = asoc->pathmtu - af->net_header_len;
- val -= af->ip_options_len(sk);
- val -= sizeof(struct sctphdr) +
- sctp_datachk_len(&asoc->stream);
- }
asoc->user_frag = val;
- asoc->frag_point = sctp_frag_point(asoc, asoc->pathmtu);
+ sctp_assoc_update_frag_point(asoc);
} else {
if (params.assoc_id && sctp_style(sk, UDP))
return -EINVAL;
diff --git a/net/sctp/stream.c b/net/sctp/stream.c
index f799043abec9..f1f1d1b232ba 100644
--- a/net/sctp/stream.c
+++ b/net/sctp/stream.c
@@ -240,6 +240,8 @@ void sctp_stream_update(struct sctp_stream *stream, struct sctp_stream *new)
new->out = NULL;
new->in = NULL;
+ new->outcnt = 0;
+ new->incnt = 0;
}
static int sctp_send_reconf(struct sctp_association *asoc,
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index 47f82bd794d9..4a95e260b674 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -242,9 +242,18 @@ void sctp_transport_pmtu(struct sctp_transport *transport, struct sock *sk)
&transport->fl, sk);
}
- if (transport->dst) {
- transport->pathmtu = SCTP_TRUNC4(dst_mtu(transport->dst));
- } else
+ if (transport->param_flags & SPP_PMTUD_DISABLE) {
+ struct sctp_association *asoc = transport->asoc;
+
+ if (!transport->pathmtu && asoc && asoc->pathmtu)
+ transport->pathmtu = asoc->pathmtu;
+ if (transport->pathmtu)
+ return;
+ }
+
+ if (transport->dst)
+ transport->pathmtu = sctp_dst_mtu(transport->dst);
+ else
transport->pathmtu = SCTP_DEFAULT_MAXSEGMENT;
}
@@ -290,6 +299,7 @@ void sctp_transport_route(struct sctp_transport *transport,
struct sctp_association *asoc = transport->asoc;
struct sctp_af *af = transport->af_specific;
+ sctp_transport_dst_release(transport);
af->get_dst(transport, saddr, &transport->fl, sctp_opt2sk(opt));
if (saddr)
@@ -297,21 +307,14 @@ void sctp_transport_route(struct sctp_transport *transport,
else
af->get_saddr(opt, transport, &transport->fl);
- if ((transport->param_flags & SPP_PMTUD_DISABLE) && transport->pathmtu) {
- return;
- }
- if (transport->dst) {
- transport->pathmtu = SCTP_TRUNC4(dst_mtu(transport->dst));
+ sctp_transport_pmtu(transport, sctp_opt2sk(opt));
- /* Initialize sk->sk_rcv_saddr, if the transport is the
- * association's active path for getsockname().
- */
- if (asoc && (!asoc->peer.primary_path ||
- (transport == asoc->peer.active_path)))
- opt->pf->to_sk_saddr(&transport->saddr,
- asoc->base.sk);
- } else
- transport->pathmtu = SCTP_DEFAULT_MAXSEGMENT;
+ /* Initialize sk->sk_rcv_saddr, if the transport is the
+ * association's active path for getsockname().
+ */
+ if (transport->dst && asoc &&
+ (!asoc->peer.primary_path || transport == asoc->peer.active_path))
+ opt->pf->to_sk_saddr(&transport->saddr, asoc->base.sk);
}
/* Hold a reference to a transport. */
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 5f8046c62d90..17688a02035b 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -29,6 +29,7 @@
#include <net/sock.h>
#include <net/tcp.h>
#include <net/smc.h>
+#include <asm/ioctls.h>
#include "smc.h"
#include "smc_clc.h"
@@ -292,8 +293,20 @@ static void smc_copy_sock_settings_to_smc(struct smc_sock *smc)
smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC);
}
+/* register a new rmb */
+static int smc_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc)
+{
+ /* register memory region for new rmb */
+ if (smc_wr_reg_send(link, rmb_desc->mr_rx[SMC_SINGLE_LINK])) {
+ rmb_desc->regerr = 1;
+ return -EFAULT;
+ }
+ return 0;
+}
+
static int smc_clnt_conf_first_link(struct smc_sock *smc)
{
+ struct net *net = sock_net(smc->clcsock->sk);
struct smc_link_group *lgr = smc->conn.lgr;
struct smc_link *link;
int rest;
@@ -321,9 +334,7 @@ static int smc_clnt_conf_first_link(struct smc_sock *smc)
smc_wr_remember_qp_attr(link);
- rc = smc_wr_reg_send(link,
- smc->conn.rmb_desc->mr_rx[SMC_SINGLE_LINK]);
- if (rc)
+ if (smc_reg_rmb(link, smc->conn.rmb_desc))
return SMC_CLC_DECL_INTERR;
/* send CONFIRM LINK response over RoCE fabric */
@@ -353,7 +364,7 @@ static int smc_clnt_conf_first_link(struct smc_sock *smc)
if (rc < 0)
return SMC_CLC_DECL_TCL;
- link->state = SMC_LNK_ACTIVE;
+ smc_llc_link_active(link, net->ipv4.sysctl_tcp_keepalive_time);
return 0;
}
@@ -391,6 +402,9 @@ static int smc_connect_rdma(struct smc_sock *smc)
sock_hold(&smc->sk); /* sock put in passive closing */
+ if (smc->use_fallback)
+ goto out_connected;
+
if (!tcp_sk(smc->clcsock->sk)->syn_smc) {
/* peer has not signalled SMC-capability */
smc->use_fallback = true;
@@ -473,13 +487,8 @@ static int smc_connect_rdma(struct smc_sock *smc)
goto decline_rdma_unlock;
}
} else {
- struct smc_buf_desc *buf_desc = smc->conn.rmb_desc;
-
- if (!buf_desc->reused) {
- /* register memory region for new rmb */
- rc = smc_wr_reg_send(link,
- buf_desc->mr_rx[SMC_SINGLE_LINK]);
- if (rc) {
+ if (!smc->conn.rmb_desc->reused) {
+ if (smc_reg_rmb(link, smc->conn.rmb_desc)) {
reason_code = SMC_CLC_DECL_INTERR;
goto decline_rdma_unlock;
}
@@ -712,6 +721,7 @@ void smc_close_non_accepted(struct sock *sk)
static int smc_serv_conf_first_link(struct smc_sock *smc)
{
+ struct net *net = sock_net(smc->clcsock->sk);
struct smc_link_group *lgr = smc->conn.lgr;
struct smc_link *link;
int rest;
@@ -719,9 +729,7 @@ static int smc_serv_conf_first_link(struct smc_sock *smc)
link = &lgr->lnk[SMC_SINGLE_LINK];
- rc = smc_wr_reg_send(link,
- smc->conn.rmb_desc->mr_rx[SMC_SINGLE_LINK]);
- if (rc)
+ if (smc_reg_rmb(link, smc->conn.rmb_desc))
return SMC_CLC_DECL_INTERR;
/* send CONFIRM LINK request to client over the RoCE fabric */
@@ -766,7 +774,7 @@ static int smc_serv_conf_first_link(struct smc_sock *smc)
return rc;
}
- link->state = SMC_LNK_ACTIVE;
+ smc_llc_link_active(link, net->ipv4.sysctl_tcp_keepalive_time);
return 0;
}
@@ -790,6 +798,9 @@ static void smc_listen_work(struct work_struct *work)
int rc = 0;
u8 ibport;
+ if (new_smc->use_fallback)
+ goto out_connected;
+
/* check if peer is smc capable */
if (!tcp_sk(newclcsock->sk)->syn_smc) {
new_smc->use_fallback = true;
@@ -854,13 +865,8 @@ static void smc_listen_work(struct work_struct *work)
smc_rx_init(new_smc);
if (local_contact != SMC_FIRST_CONTACT) {
- struct smc_buf_desc *buf_desc = new_smc->conn.rmb_desc;
-
- if (!buf_desc->reused) {
- /* register memory region for new rmb */
- rc = smc_wr_reg_send(link,
- buf_desc->mr_rx[SMC_SINGLE_LINK]);
- if (rc) {
+ if (!new_smc->conn.rmb_desc->reused) {
+ if (smc_reg_rmb(link, new_smc->conn.rmb_desc)) {
reason_code = SMC_CLC_DECL_INTERR;
goto decline_rdma_unlock;
}
@@ -968,7 +974,7 @@ static void smc_tcp_listen_work(struct work_struct *work)
continue;
new_smc->listen_smc = lsmc;
- new_smc->use_fallback = false; /* assume rdma capability first*/
+ new_smc->use_fallback = lsmc->use_fallback;
sock_hold(lsk); /* sock_put in smc_listen_work */
INIT_WORK(&new_smc->smc_listen_work, smc_listen_work);
smc_copy_sock_settings_to_smc(new_smc);
@@ -978,10 +984,6 @@ static void smc_tcp_listen_work(struct work_struct *work)
}
out:
- if (lsmc->clcsock) {
- sock_release(lsmc->clcsock);
- lsmc->clcsock = NULL;
- }
release_sock(lsk);
sock_put(&lsmc->sk); /* sock_hold in smc_listen */
}
@@ -1008,7 +1010,8 @@ static int smc_listen(struct socket *sock, int backlog)
* them to the clc socket -- copy smc socket options to clc socket
*/
smc_copy_sock_settings_to_clc(smc);
- tcp_sk(smc->clcsock->sk)->syn_smc = 1;
+ if (!smc->use_fallback)
+ tcp_sk(smc->clcsock->sk)->syn_smc = 1;
rc = kernel_listen(smc->clcsock, backlog);
if (rc)
@@ -1041,6 +1044,7 @@ static int smc_accept(struct socket *sock, struct socket *new_sock,
if (lsmc->sk.sk_state != SMC_LISTEN) {
rc = -EINVAL;
+ release_sock(sk);
goto out;
}
@@ -1068,9 +1072,29 @@ static int smc_accept(struct socket *sock, struct socket *new_sock,
if (!rc)
rc = sock_error(nsk);
+ release_sock(sk);
+ if (rc)
+ goto out;
+
+ if (lsmc->sockopt_defer_accept && !(flags & O_NONBLOCK)) {
+ /* wait till data arrives on the socket */
+ timeo = msecs_to_jiffies(lsmc->sockopt_defer_accept *
+ MSEC_PER_SEC);
+ if (smc_sk(nsk)->use_fallback) {
+ struct sock *clcsk = smc_sk(nsk)->clcsock->sk;
+
+ lock_sock(clcsk);
+ if (skb_queue_empty(&clcsk->sk_receive_queue))
+ sk_wait_data(clcsk, &timeo, NULL);
+ release_sock(clcsk);
+ } else if (!atomic_read(&smc_sk(nsk)->conn.bytes_to_rcv)) {
+ lock_sock(nsk);
+ smc_rx_wait(smc_sk(nsk), &timeo, smc_rx_data_available);
+ release_sock(nsk);
+ }
+ }
out:
- release_sock(sk);
sock_put(sk); /* sock_hold above */
return rc;
}
@@ -1101,6 +1125,16 @@ static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
(sk->sk_state != SMC_APPCLOSEWAIT1) &&
(sk->sk_state != SMC_INIT))
goto out;
+
+ if (msg->msg_flags & MSG_FASTOPEN) {
+ if (sk->sk_state == SMC_INIT) {
+ smc->use_fallback = true;
+ } else {
+ rc = -EINVAL;
+ goto out;
+ }
+ }
+
if (smc->use_fallback)
rc = smc->clcsock->ops->sendmsg(smc->clcsock, msg, len);
else
@@ -1129,10 +1163,12 @@ static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
goto out;
}
- if (smc->use_fallback)
+ if (smc->use_fallback) {
rc = smc->clcsock->ops->recvmsg(smc->clcsock, msg, len, flags);
- else
- rc = smc_rx_recvmsg(smc, msg, len, flags);
+ } else {
+ msg->msg_namelen = 0;
+ rc = smc_rx_recvmsg(smc, msg, NULL, len, flags);
+ }
out:
release_sock(sk);
@@ -1170,13 +1206,15 @@ static __poll_t smc_poll(struct file *file, struct socket *sock,
/* delegate to CLC child sock */
release_sock(sk);
mask = smc->clcsock->ops->poll(file, smc->clcsock, wait);
- /* if non-blocking connect finished ... */
lock_sock(sk);
- if ((sk->sk_state == SMC_INIT) && (mask & EPOLLOUT)) {
- sk->sk_err = smc->clcsock->sk->sk_err;
- if (sk->sk_err) {
- mask |= EPOLLERR;
- } else {
+ sk->sk_err = smc->clcsock->sk->sk_err;
+ if (sk->sk_err) {
+ mask |= EPOLLERR;
+ } else {
+ /* if non-blocking connect finished ... */
+ if (sk->sk_state == SMC_INIT &&
+ mask & EPOLLOUT &&
+ smc->clcsock->sk->sk_state != TCP_CLOSE) {
rc = smc_connect_rdma(smc);
if (rc < 0)
mask |= EPOLLERR;
@@ -1259,14 +1297,12 @@ static int smc_shutdown(struct socket *sock, int how)
rc = smc_close_shutdown_write(smc);
break;
case SHUT_RD:
- if (sk->sk_state == SMC_LISTEN)
- rc = smc_close_active(smc);
- else
- rc = 0;
- /* nothing more to do because peer is not involved */
+ rc = 0;
+ /* nothing more to do because peer is not involved */
break;
}
- rc1 = kernel_sock_shutdown(smc->clcsock, how);
+ if (smc->clcsock)
+ rc1 = kernel_sock_shutdown(smc->clcsock, how);
/* map sock_shutdown_cmd constants to sk_shutdown value range */
sk->sk_shutdown |= how + 1;
@@ -1280,14 +1316,64 @@ static int smc_setsockopt(struct socket *sock, int level, int optname,
{
struct sock *sk = sock->sk;
struct smc_sock *smc;
+ int val, rc;
smc = smc_sk(sk);
/* generic setsockopts reaching us here always apply to the
* CLC socket
*/
- return smc->clcsock->ops->setsockopt(smc->clcsock, level, optname,
- optval, optlen);
+ rc = smc->clcsock->ops->setsockopt(smc->clcsock, level, optname,
+ optval, optlen);
+ if (smc->clcsock->sk->sk_err) {
+ sk->sk_err = smc->clcsock->sk->sk_err;
+ sk->sk_error_report(sk);
+ }
+ if (rc)
+ return rc;
+
+ if (optlen < sizeof(int))
+ return rc;
+ get_user(val, (int __user *)optval);
+
+ lock_sock(sk);
+ switch (optname) {
+ case TCP_ULP:
+ case TCP_FASTOPEN:
+ case TCP_FASTOPEN_CONNECT:
+ case TCP_FASTOPEN_KEY:
+ case TCP_FASTOPEN_NO_COOKIE:
+ /* option not supported by SMC */
+ if (sk->sk_state == SMC_INIT) {
+ smc->use_fallback = true;
+ } else {
+ if (!smc->use_fallback)
+ rc = -EINVAL;
+ }
+ break;
+ case TCP_NODELAY:
+ if (sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) {
+ if (val)
+ mod_delayed_work(system_wq, &smc->conn.tx_work,
+ 0);
+ }
+ break;
+ case TCP_CORK:
+ if (sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) {
+ if (!val)
+ mod_delayed_work(system_wq, &smc->conn.tx_work,
+ 0);
+ }
+ break;
+ case TCP_DEFER_ACCEPT:
+ smc->sockopt_defer_accept = val;
+ break;
+ default:
+ break;
+ }
+ release_sock(sk);
+
+ return rc;
}
static int smc_getsockopt(struct socket *sock, int level, int optname,
@@ -1305,12 +1391,38 @@ static int smc_ioctl(struct socket *sock, unsigned int cmd,
unsigned long arg)
{
struct smc_sock *smc;
+ int answ;
smc = smc_sk(sock->sk);
- if (smc->use_fallback)
+ if (smc->use_fallback) {
+ if (!smc->clcsock)
+ return -EBADF;
return smc->clcsock->ops->ioctl(smc->clcsock, cmd, arg);
- else
- return sock_no_ioctl(sock, cmd, arg);
+ }
+ switch (cmd) {
+ case SIOCINQ: /* same as FIONREAD */
+ if (smc->sk.sk_state == SMC_LISTEN)
+ return -EINVAL;
+ answ = atomic_read(&smc->conn.bytes_to_rcv);
+ break;
+ case SIOCOUTQ:
+ /* output queue size (not send + not acked) */
+ if (smc->sk.sk_state == SMC_LISTEN)
+ return -EINVAL;
+ answ = smc->conn.sndbuf_size -
+ atomic_read(&smc->conn.sndbuf_space);
+ break;
+ case SIOCOUTQNSD:
+ /* output queue size (not send only) */
+ if (smc->sk.sk_state == SMC_LISTEN)
+ return -EINVAL;
+ answ = smc_tx_prepared_sends(&smc->conn);
+ break;
+ default:
+ return -ENOIOCTLCMD;
+ }
+
+ return put_user(answ, (int __user *)arg);
}
static ssize_t smc_sendpage(struct socket *sock, struct page *page,
@@ -1322,8 +1434,11 @@ static ssize_t smc_sendpage(struct socket *sock, struct page *page,
smc = smc_sk(sk);
lock_sock(sk);
- if (sk->sk_state != SMC_ACTIVE)
+ if (sk->sk_state != SMC_ACTIVE) {
+ release_sock(sk);
goto out;
+ }
+ release_sock(sk);
if (smc->use_fallback)
rc = kernel_sendpage(smc->clcsock, page, offset,
size, flags);
@@ -1331,13 +1446,18 @@ static ssize_t smc_sendpage(struct socket *sock, struct page *page,
rc = sock_no_sendpage(sock, page, offset, size, flags);
out:
- release_sock(sk);
return rc;
}
+/* Map the affected portions of the rmbe into an spd, note the number of bytes
+ * to splice in conn->splice_pending, and press 'go'. Delays consumer cursor
+ * updates till whenever a respective page has been fully processed.
+ * Note that subsequent recv() calls have to wait till all splice() processing
+ * completed.
+ */
static ssize_t smc_splice_read(struct socket *sock, loff_t *ppos,
struct pipe_inode_info *pipe, size_t len,
- unsigned int flags)
+ unsigned int flags)
{
struct sock *sk = sock->sk;
struct smc_sock *smc;
@@ -1345,16 +1465,34 @@ static ssize_t smc_splice_read(struct socket *sock, loff_t *ppos,
smc = smc_sk(sk);
lock_sock(sk);
- if ((sk->sk_state != SMC_ACTIVE) && (sk->sk_state != SMC_CLOSED))
+
+ if (sk->sk_state == SMC_INIT ||
+ sk->sk_state == SMC_LISTEN ||
+ sk->sk_state == SMC_CLOSED)
+ goto out;
+
+ if (sk->sk_state == SMC_PEERFINCLOSEWAIT) {
+ rc = 0;
goto out;
+ }
+
if (smc->use_fallback) {
rc = smc->clcsock->ops->splice_read(smc->clcsock, ppos,
pipe, len, flags);
} else {
- rc = -EOPNOTSUPP;
+ if (*ppos) {
+ rc = -ESPIPE;
+ goto out;
+ }
+ if (flags & SPLICE_F_NONBLOCK)
+ flags = MSG_DONTWAIT;
+ else
+ flags = 0;
+ rc = smc_rx_recvmsg(smc, NULL, pipe, len, flags);
}
out:
release_sock(sk);
+
return rc;
}
diff --git a/net/smc/smc.h b/net/smc/smc.h
index e4829a2f46ba..ec209cd48d42 100644
--- a/net/smc/smc.h
+++ b/net/smc/smc.h
@@ -164,6 +164,9 @@ struct smc_connection {
atomic_t bytes_to_rcv; /* arrived data,
* not yet received
*/
+ atomic_t splice_pending; /* number of spliced bytes
+ * pending processing
+ */
#ifndef KERNEL_HAS_ATOMIC64
spinlock_t acurs_lock; /* protect cursors */
#endif
@@ -180,6 +183,10 @@ struct smc_sock { /* smc sock container */
struct list_head accept_q; /* sockets to be accepted */
spinlock_t accept_q_lock; /* protects accept_q */
bool use_fallback; /* fallback to tcp */
+ int sockopt_defer_accept;
+ /* sockopt TCP_DEFER_ACCEPT
+ * value
+ */
u8 wait_close_tx_prepared : 1;
/* shutdown wr or close
* started, waiting for unsent
diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c
index b42395d24cba..42ad57365eca 100644
--- a/net/smc/smc_cdc.c
+++ b/net/smc/smc_cdc.c
@@ -82,7 +82,7 @@ static inline void smc_cdc_add_pending_send(struct smc_connection *conn,
sizeof(struct smc_cdc_msg) > SMC_WR_BUF_SIZE,
"must increase SMC_WR_BUF_SIZE to at least sizeof(struct smc_cdc_msg)");
BUILD_BUG_ON_MSG(
- offsetof(struct smc_cdc_msg, reserved) > SMC_WR_TX_SIZE,
+ sizeof(struct smc_cdc_msg) != SMC_WR_TX_SIZE,
"must adapt SMC_WR_TX_SIZE to sizeof(struct smc_cdc_msg); if not all smc_wr upper layer protocols use the same message size any more, must start to set link->wr_tx_sges[i].length on each individual smc_wr_tx_send()");
BUILD_BUG_ON_MSG(
sizeof(struct smc_cdc_tx_pend) > SMC_WR_TX_PEND_PRIV_SIZE,
diff --git a/net/smc/smc_cdc.h b/net/smc/smc_cdc.h
index ab240b37ad11..d2012fd22100 100644
--- a/net/smc/smc_cdc.h
+++ b/net/smc/smc_cdc.h
@@ -48,7 +48,7 @@ struct smc_cdc_msg {
struct smc_cdc_producer_flags prod_flags;
struct smc_cdc_conn_state_flags conn_state_flags;
u8 reserved[18];
-} __aligned(8);
+} __packed; /* format defined in RFC7609 */
static inline bool smc_cdc_rxed_any_close(struct smc_connection *conn)
{
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index f44f6803f7ff..9c74844e2008 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -32,6 +32,9 @@
static u32 smc_lgr_num; /* unique link group number */
+static void smc_buf_free(struct smc_buf_desc *buf_desc, struct smc_link *lnk,
+ bool is_rmb);
+
static void smc_lgr_schedule_free_work(struct smc_link_group *lgr)
{
/* client link group creation always follows the server link group
@@ -234,9 +237,22 @@ static void smc_buf_unuse(struct smc_connection *conn)
conn->sndbuf_size = 0;
}
if (conn->rmb_desc) {
- conn->rmb_desc->reused = true;
- conn->rmb_desc->used = 0;
- conn->rmbe_size = 0;
+ if (!conn->rmb_desc->regerr) {
+ conn->rmb_desc->reused = 1;
+ conn->rmb_desc->used = 0;
+ conn->rmbe_size = 0;
+ } else {
+ /* buf registration failed, reuse not possible */
+ struct smc_link_group *lgr = conn->lgr;
+ struct smc_link *lnk;
+
+ write_lock_bh(&lgr->rmbs_lock);
+ list_del(&conn->rmb_desc->list);
+ write_unlock_bh(&lgr->rmbs_lock);
+
+ lnk = &lgr->lnk[SMC_SINGLE_LINK];
+ smc_buf_free(conn->rmb_desc, lnk, true);
+ }
}
}
@@ -274,8 +290,8 @@ static void smc_buf_free(struct smc_buf_desc *buf_desc, struct smc_link *lnk,
DMA_TO_DEVICE);
}
sg_free_table(&buf_desc->sgt[SMC_SINGLE_LINK]);
- if (buf_desc->cpu_addr)
- free_pages((unsigned long)buf_desc->cpu_addr, buf_desc->order);
+ if (buf_desc->pages)
+ __free_pages(buf_desc->pages, buf_desc->order);
kfree(buf_desc);
}
@@ -310,6 +326,7 @@ static void smc_lgr_free_bufs(struct smc_link_group *lgr)
/* remove a link group */
void smc_lgr_free(struct smc_link_group *lgr)
{
+ smc_llc_link_flush(&lgr->lnk[SMC_SINGLE_LINK]);
smc_lgr_free_bufs(lgr);
smc_link_clear(&lgr->lnk[SMC_SINGLE_LINK]);
kfree(lgr);
@@ -332,6 +349,7 @@ void smc_lgr_terminate(struct smc_link_group *lgr)
struct rb_node *node;
smc_lgr_forget(lgr);
+ smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]);
write_lock_bh(&lgr->conns_lock);
node = rb_first(&lgr->conns_all);
@@ -358,7 +376,8 @@ void smc_lgr_terminate(struct smc_link_group *lgr)
static int smc_vlan_by_tcpsk(struct socket *clcsock, unsigned short *vlan_id)
{
struct dst_entry *dst = sk_dst_get(clcsock->sk);
- int rc = 0;
+ struct net_device *ndev;
+ int i, nest_lvl, rc = 0;
*vlan_id = 0;
if (!dst) {
@@ -370,8 +389,27 @@ static int smc_vlan_by_tcpsk(struct socket *clcsock, unsigned short *vlan_id)
goto out_rel;
}
- if (is_vlan_dev(dst->dev))
- *vlan_id = vlan_dev_vlan_id(dst->dev);
+ ndev = dst->dev;
+ if (is_vlan_dev(ndev)) {
+ *vlan_id = vlan_dev_vlan_id(ndev);
+ goto out_rel;
+ }
+
+ rtnl_lock();
+ nest_lvl = dev_get_nest_level(ndev);
+ for (i = 0; i < nest_lvl; i++) {
+ struct list_head *lower = &ndev->adj_list.lower;
+
+ if (list_empty(lower))
+ break;
+ lower = lower->next;
+ ndev = (struct net_device *)netdev_lower_get_next(ndev, &lower);
+ if (is_vlan_dev(ndev)) {
+ *vlan_id = vlan_dev_vlan_id(ndev);
+ break;
+ }
+ }
+ rtnl_unlock();
out_rel:
dst_release(dst);
@@ -528,16 +566,16 @@ static struct smc_buf_desc *smc_new_buf_create(struct smc_link_group *lgr,
if (!buf_desc)
return ERR_PTR(-ENOMEM);
- buf_desc->cpu_addr =
- (void *)__get_free_pages(GFP_KERNEL | __GFP_NOWARN |
- __GFP_NOMEMALLOC |
- __GFP_NORETRY | __GFP_ZERO,
- get_order(bufsize));
- if (!buf_desc->cpu_addr) {
+ buf_desc->order = get_order(bufsize);
+ buf_desc->pages = alloc_pages(GFP_KERNEL | __GFP_NOWARN |
+ __GFP_NOMEMALLOC | __GFP_COMP |
+ __GFP_NORETRY | __GFP_ZERO,
+ buf_desc->order);
+ if (!buf_desc->pages) {
kfree(buf_desc);
return ERR_PTR(-EAGAIN);
}
- buf_desc->order = get_order(bufsize);
+ buf_desc->cpu_addr = (void *)page_address(buf_desc->pages);
/* build the sg table from the pages */
lnk = &lgr->lnk[SMC_SINGLE_LINK];
diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
index 07e2a393e6d9..fca8624e5e71 100644
--- a/net/smc/smc_core.h
+++ b/net/smc/smc_core.h
@@ -79,6 +79,7 @@ struct smc_link {
dma_addr_t wr_rx_dma_addr; /* DMA address of wr_rx_bufs */
u64 wr_rx_id; /* seq # of last recv WR */
u32 wr_rx_cnt; /* number of WR recv buffers */
+ unsigned long wr_rx_tstamp; /* jiffies when last buf rx */
struct ib_reg_wr wr_reg; /* WR register memory region */
wait_queue_head_t wr_reg_wait; /* wait for wr_reg result */
@@ -101,6 +102,9 @@ struct smc_link {
int llc_confirm_resp_rc; /* rc from conf_resp msg */
struct completion llc_add; /* wait for rx of add link */
struct completion llc_add_resp; /* wait for rx of add link rsp*/
+ struct delayed_work llc_testlink_wrk; /* testlink worker */
+ struct completion llc_testlink_resp; /* wait for rx of testlink */
+ int llc_testlink_time; /* testlink interval */
};
/* For now we just allow one parallel link per link group. The SMC protocol
@@ -116,6 +120,7 @@ struct smc_link {
struct smc_buf_desc {
struct list_head list;
void *cpu_addr; /* virtual address of buffer */
+ struct page *pages;
struct sg_table sgt[SMC_LINKS_PER_LGR_MAX];/* virtual buffer */
struct ib_mr *mr_rx[SMC_LINKS_PER_LGR_MAX];
/* for rmb only: memory region
@@ -123,7 +128,8 @@ struct smc_buf_desc {
*/
u32 order; /* allocation order */
u32 used; /* currently used / unused */
- bool reused; /* new created / reused */
+ u8 reused : 1; /* new created / reused */
+ u8 regerr : 1; /* err during registration */
};
struct smc_rtoken { /* address/key of remote RMB */
diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c
index 427b91c1c964..05dd7e6d314d 100644
--- a/net/smc/smc_diag.c
+++ b/net/smc/smc_diag.c
@@ -38,17 +38,27 @@ static void smc_diag_msg_common_fill(struct smc_diag_msg *r, struct sock *sk)
{
struct smc_sock *smc = smc_sk(sk);
- r->diag_family = sk->sk_family;
if (!smc->clcsock)
return;
r->id.idiag_sport = htons(smc->clcsock->sk->sk_num);
r->id.idiag_dport = smc->clcsock->sk->sk_dport;
r->id.idiag_if = smc->clcsock->sk->sk_bound_dev_if;
sock_diag_save_cookie(sk, r->id.idiag_cookie);
- memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src));
- memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst));
- r->id.idiag_src[0] = smc->clcsock->sk->sk_rcv_saddr;
- r->id.idiag_dst[0] = smc->clcsock->sk->sk_daddr;
+ if (sk->sk_protocol == SMCPROTO_SMC) {
+ r->diag_family = PF_INET;
+ memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src));
+ memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst));
+ r->id.idiag_src[0] = smc->clcsock->sk->sk_rcv_saddr;
+ r->id.idiag_dst[0] = smc->clcsock->sk->sk_daddr;
+#if IS_ENABLED(CONFIG_IPV6)
+ } else if (sk->sk_protocol == SMCPROTO_SMC6) {
+ r->diag_family = PF_INET6;
+ memcpy(&r->id.idiag_src, &smc->clcsock->sk->sk_v6_rcv_saddr,
+ sizeof(smc->clcsock->sk->sk_v6_rcv_saddr));
+ memcpy(&r->id.idiag_dst, &smc->clcsock->sk->sk_v6_daddr,
+ sizeof(smc->clcsock->sk->sk_v6_daddr));
+#endif
+ }
}
static int smc_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb,
@@ -153,7 +163,8 @@ errout:
return -EMSGSIZE;
}
-static int smc_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
+static int smc_diag_dump_proto(struct proto *prot, struct sk_buff *skb,
+ struct netlink_callback *cb)
{
struct net *net = sock_net(skb->sk);
struct nlattr *bc = NULL;
@@ -161,8 +172,8 @@ static int smc_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
struct sock *sk;
int rc = 0;
- read_lock(&smc_proto.h.smc_hash->lock);
- head = &smc_proto.h.smc_hash->ht;
+ read_lock(&prot->h.smc_hash->lock);
+ head = &prot->h.smc_hash->ht;
if (hlist_empty(head))
goto out;
@@ -175,7 +186,17 @@ static int smc_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
}
out:
- read_unlock(&smc_proto.h.smc_hash->lock);
+ read_unlock(&prot->h.smc_hash->lock);
+ return rc;
+}
+
+static int smc_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ int rc = 0;
+
+ rc = smc_diag_dump_proto(&smc_proto, skb, cb);
+ if (!rc)
+ rc = smc_diag_dump_proto(&smc_proto6, skb, cb);
return rc;
}
diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c
index ea4b21981b4b..33b4d856f4c6 100644
--- a/net/smc/smc_llc.c
+++ b/net/smc/smc_llc.c
@@ -397,7 +397,8 @@ static void smc_llc_rx_test_link(struct smc_link *link,
struct smc_llc_msg_test_link *llc)
{
if (llc->hd.flags & SMC_LLC_FLAG_RESP) {
- /* unused as long as we don't send this type of msg */
+ if (link->state == SMC_LNK_ACTIVE)
+ complete(&link->llc_testlink_resp);
} else {
smc_llc_send_test_link(link, llc->user_data, SMC_LLC_RESP);
}
@@ -502,6 +503,65 @@ static void smc_llc_rx_handler(struct ib_wc *wc, void *buf)
}
}
+/***************************** worker ****************************************/
+
+static void smc_llc_testlink_work(struct work_struct *work)
+{
+ struct smc_link *link = container_of(to_delayed_work(work),
+ struct smc_link, llc_testlink_wrk);
+ unsigned long next_interval;
+ struct smc_link_group *lgr;
+ unsigned long expire_time;
+ u8 user_data[16] = { 0 };
+ int rc;
+
+ lgr = container_of(link, struct smc_link_group, lnk[SMC_SINGLE_LINK]);
+ if (link->state != SMC_LNK_ACTIVE)
+ return; /* don't reschedule worker */
+ expire_time = link->wr_rx_tstamp + link->llc_testlink_time;
+ if (time_is_after_jiffies(expire_time)) {
+ next_interval = expire_time - jiffies;
+ goto out;
+ }
+ reinit_completion(&link->llc_testlink_resp);
+ smc_llc_send_test_link(link, user_data, SMC_LLC_REQ);
+ /* receive TEST LINK response over RoCE fabric */
+ rc = wait_for_completion_interruptible_timeout(&link->llc_testlink_resp,
+ SMC_LLC_WAIT_TIME);
+ if (rc <= 0) {
+ smc_lgr_terminate(lgr);
+ return;
+ }
+ next_interval = link->llc_testlink_time;
+out:
+ schedule_delayed_work(&link->llc_testlink_wrk, next_interval);
+}
+
+void smc_llc_link_active(struct smc_link *link, int testlink_time)
+{
+ init_completion(&link->llc_testlink_resp);
+ INIT_DELAYED_WORK(&link->llc_testlink_wrk, smc_llc_testlink_work);
+ link->state = SMC_LNK_ACTIVE;
+ if (testlink_time) {
+ link->llc_testlink_time = testlink_time * HZ;
+ schedule_delayed_work(&link->llc_testlink_wrk,
+ link->llc_testlink_time);
+ }
+}
+
+/* called in tasklet context */
+void smc_llc_link_inactive(struct smc_link *link)
+{
+ link->state = SMC_LNK_INACTIVE;
+ cancel_delayed_work(&link->llc_testlink_wrk);
+}
+
+/* called in worker context */
+void smc_llc_link_flush(struct smc_link *link)
+{
+ cancel_delayed_work_sync(&link->llc_testlink_wrk);
+}
+
/***************************** init, exit, misc ******************************/
static struct smc_wr_rx_handler smc_llc_rx_handlers[] = {
diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h
index e4a7d5e234d5..d6e42116485e 100644
--- a/net/smc/smc_llc.h
+++ b/net/smc/smc_llc.h
@@ -44,6 +44,9 @@ int smc_llc_send_delete_link(struct smc_link *link,
enum smc_llc_reqresp reqresp);
int smc_llc_send_test_link(struct smc_link *lnk, u8 user_data[16],
enum smc_llc_reqresp reqresp);
+void smc_llc_link_active(struct smc_link *link, int testlink_time);
+void smc_llc_link_inactive(struct smc_link *link);
+void smc_llc_link_flush(struct smc_link *link);
int smc_llc_init(void) __init;
#endif /* SMC_LLC_H */
diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c
index eff4e0d0bb31..ed45569289f5 100644
--- a/net/smc/smc_rx.c
+++ b/net/smc/smc_rx.c
@@ -22,11 +22,10 @@
#include "smc_tx.h" /* smc_tx_consumer_update() */
#include "smc_rx.h"
-/* callback implementation for sk.sk_data_ready()
- * to wakeup rcvbuf consumers that blocked with smc_rx_wait_data().
+/* callback implementation to wakeup consumers blocked with smc_rx_wait().
* indirectly called by smc_cdc_msg_recv_action().
*/
-static void smc_rx_data_ready(struct sock *sk)
+static void smc_rx_wake_up(struct sock *sk)
{
struct socket_wq *wq;
@@ -44,28 +43,140 @@ static void smc_rx_data_ready(struct sock *sk)
rcu_read_unlock();
}
+/* Update consumer cursor
+ * @conn connection to update
+ * @cons consumer cursor
+ * @len number of Bytes consumed
+ */
+static void smc_rx_update_consumer(struct smc_connection *conn,
+ union smc_host_cursor cons, size_t len)
+{
+ smc_curs_add(conn->rmbe_size, &cons, len);
+ smc_curs_write(&conn->local_tx_ctrl.cons, smc_curs_read(&cons, conn),
+ conn);
+ /* send consumer cursor update if required */
+ /* similar to advertising new TCP rcv_wnd if required */
+ smc_tx_consumer_update(conn);
+}
+
+struct smc_spd_priv {
+ struct smc_sock *smc;
+ size_t len;
+};
+
+static void smc_rx_pipe_buf_release(struct pipe_inode_info *pipe,
+ struct pipe_buffer *buf)
+{
+ struct smc_spd_priv *priv = (struct smc_spd_priv *)buf->private;
+ struct smc_sock *smc = priv->smc;
+ struct smc_connection *conn;
+ union smc_host_cursor cons;
+ struct sock *sk = &smc->sk;
+
+ if (sk->sk_state == SMC_CLOSED ||
+ sk->sk_state == SMC_PEERFINCLOSEWAIT ||
+ sk->sk_state == SMC_APPFINCLOSEWAIT)
+ goto out;
+ conn = &smc->conn;
+ lock_sock(sk);
+ smc_curs_write(&cons, smc_curs_read(&conn->local_tx_ctrl.cons, conn),
+ conn);
+ smc_rx_update_consumer(conn, cons, priv->len);
+ release_sock(sk);
+ if (atomic_sub_and_test(priv->len, &conn->splice_pending))
+ smc_rx_wake_up(sk);
+out:
+ kfree(priv);
+ put_page(buf->page);
+ sock_put(sk);
+}
+
+static int smc_rx_pipe_buf_nosteal(struct pipe_inode_info *pipe,
+ struct pipe_buffer *buf)
+{
+ return 1;
+}
+
+static const struct pipe_buf_operations smc_pipe_ops = {
+ .can_merge = 0,
+ .confirm = generic_pipe_buf_confirm,
+ .release = smc_rx_pipe_buf_release,
+ .steal = smc_rx_pipe_buf_nosteal,
+ .get = generic_pipe_buf_get
+};
+
+static void smc_rx_spd_release(struct splice_pipe_desc *spd,
+ unsigned int i)
+{
+ put_page(spd->pages[i]);
+}
+
+static int smc_rx_splice(struct pipe_inode_info *pipe, char *src, size_t len,
+ struct smc_sock *smc)
+{
+ struct splice_pipe_desc spd;
+ struct partial_page partial;
+ struct smc_spd_priv *priv;
+ struct page *page;
+ int bytes;
+
+ page = virt_to_page(smc->conn.rmb_desc->cpu_addr);
+ priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+ if (!priv)
+ return -ENOMEM;
+ priv->len = len;
+ priv->smc = smc;
+ partial.offset = src - (char *)smc->conn.rmb_desc->cpu_addr;
+ partial.len = len;
+ partial.private = (unsigned long)priv;
+
+ spd.nr_pages_max = 1;
+ spd.nr_pages = 1;
+ spd.pages = &page;
+ spd.partial = &partial;
+ spd.ops = &smc_pipe_ops;
+ spd.spd_release = smc_rx_spd_release;
+
+ bytes = splice_to_pipe(pipe, &spd);
+ if (bytes > 0) {
+ sock_hold(&smc->sk);
+ get_page(smc->conn.rmb_desc->pages);
+ atomic_add(bytes, &smc->conn.splice_pending);
+ }
+
+ return bytes;
+}
+
+static int smc_rx_data_available_and_no_splice_pend(struct smc_connection *conn)
+{
+ return atomic_read(&conn->bytes_to_rcv) &&
+ !atomic_read(&conn->splice_pending);
+}
+
/* blocks rcvbuf consumer until >=len bytes available or timeout or interrupted
* @smc smc socket
* @timeo pointer to max seconds to wait, pointer to value 0 for no timeout
+ * @fcrit add'l criterion to evaluate as function pointer
* Returns:
* 1 if at least 1 byte available in rcvbuf or if socket error/shutdown.
* 0 otherwise (nothing in rcvbuf nor timeout, e.g. interrupted).
*/
-static int smc_rx_wait_data(struct smc_sock *smc, long *timeo)
+int smc_rx_wait(struct smc_sock *smc, long *timeo,
+ int (*fcrit)(struct smc_connection *conn))
{
DEFINE_WAIT_FUNC(wait, woken_wake_function);
struct smc_connection *conn = &smc->conn;
struct sock *sk = &smc->sk;
int rc;
- if (atomic_read(&conn->bytes_to_rcv))
+ if (fcrit(conn))
return 1;
sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
add_wait_queue(sk_sleep(sk), &wait);
rc = sk_wait_event(sk, timeo,
sk->sk_err ||
sk->sk_shutdown & RCV_SHUTDOWN ||
- atomic_read(&conn->bytes_to_rcv) ||
+ fcrit(conn) ||
smc_cdc_rxed_any_close_or_senddone(conn),
&wait);
remove_wait_queue(sk_sleep(sk), &wait);
@@ -73,19 +184,25 @@ static int smc_rx_wait_data(struct smc_sock *smc, long *timeo)
return rc;
}
-/* rcvbuf consumer: main API called by socket layer.
- * called under sk lock.
+/* smc_rx_recvmsg - receive data from RMBE
+ * @msg: copy data to receive buffer
+ * @pipe: copy data to pipe if set - indicates splice() call
+ *
+ * rcvbuf consumer: main API called by socket layer.
+ * Called under sk lock.
*/
-int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, size_t len,
- int flags)
+int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg,
+ struct pipe_inode_info *pipe, size_t len, int flags)
{
size_t copylen, read_done = 0, read_remaining = len;
size_t chunk_len, chunk_off, chunk_len_sum;
struct smc_connection *conn = &smc->conn;
+ int (*func)(struct smc_connection *conn);
union smc_host_cursor cons;
int readable, chunk;
char *rcvbuf_base;
struct sock *sk;
+ int splbytes;
long timeo;
int target; /* Read at least these many bytes */
int rc;
@@ -101,37 +218,32 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, size_t len,
timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
- msg->msg_namelen = 0;
/* we currently use 1 RMBE per RMB, so RMBE == RMB base addr */
rcvbuf_base = conn->rmb_desc->cpu_addr;
do { /* while (read_remaining) */
- if (read_done >= target)
+ if (read_done >= target || (pipe && read_done))
break;
if (atomic_read(&conn->bytes_to_rcv))
goto copy;
+ if (sk->sk_shutdown & RCV_SHUTDOWN ||
+ smc_cdc_rxed_any_close_or_senddone(conn) ||
+ conn->local_tx_ctrl.conn_state_flags.peer_conn_abort)
+ break;
+
if (read_done) {
if (sk->sk_err ||
sk->sk_state == SMC_CLOSED ||
- sk->sk_shutdown & RCV_SHUTDOWN ||
!timeo ||
- signal_pending(current) ||
- smc_cdc_rxed_any_close_or_senddone(conn) ||
- conn->local_tx_ctrl.conn_state_flags.
- peer_conn_abort)
+ signal_pending(current))
break;
} else {
if (sk->sk_err) {
read_done = sock_error(sk);
break;
}
- if (sk->sk_shutdown & RCV_SHUTDOWN ||
- smc_cdc_rxed_any_close_or_senddone(conn) ||
- conn->local_tx_ctrl.conn_state_flags.
- peer_conn_abort)
- break;
if (sk->sk_state == SMC_CLOSED) {
if (!sock_flag(sk, SOCK_DONE)) {
/* This occurs when user tries to read
@@ -150,20 +262,33 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, size_t len,
return -EAGAIN;
}
- if (!atomic_read(&conn->bytes_to_rcv)) {
- smc_rx_wait_data(smc, &timeo);
+ if (!smc_rx_data_available(conn)) {
+ smc_rx_wait(smc, &timeo, smc_rx_data_available);
continue;
}
copy:
/* initialize variables for 1st iteration of subsequent loop */
- /* could be just 1 byte, even after smc_rx_wait_data above */
+ /* could be just 1 byte, even after waiting on data above */
readable = atomic_read(&conn->bytes_to_rcv);
+ splbytes = atomic_read(&conn->splice_pending);
+ if (!readable || (msg && splbytes)) {
+ if (splbytes)
+ func = smc_rx_data_available_and_no_splice_pend;
+ else
+ func = smc_rx_data_available;
+ smc_rx_wait(smc, &timeo, func);
+ continue;
+ }
+
/* not more than what user space asked for */
copylen = min_t(size_t, read_remaining, readable);
smc_curs_write(&cons,
smc_curs_read(&conn->local_tx_ctrl.cons, conn),
conn);
+ /* subsequent splice() calls pick up where previous left */
+ if (splbytes)
+ smc_curs_add(conn->rmbe_size, &cons, splbytes);
/* determine chunks where to read from rcvbuf */
/* either unwrapped case, or 1st chunk of wrapped case */
chunk_len = min_t(size_t,
@@ -173,9 +298,16 @@ copy:
smc_rmb_sync_sg_for_cpu(conn);
for (chunk = 0; chunk < 2; chunk++) {
if (!(flags & MSG_TRUNC)) {
- rc = memcpy_to_msg(msg, rcvbuf_base + chunk_off,
- chunk_len);
- if (rc) {
+ if (msg) {
+ rc = memcpy_to_msg(msg, rcvbuf_base +
+ chunk_off,
+ chunk_len);
+ } else {
+ rc = smc_rx_splice(pipe, rcvbuf_base +
+ chunk_off, chunk_len,
+ smc);
+ }
+ if (rc < 0) {
if (!read_done)
read_done = -EFAULT;
smc_rmb_sync_sg_for_device(conn);
@@ -196,18 +328,13 @@ copy:
/* update cursors */
if (!(flags & MSG_PEEK)) {
- smc_curs_add(conn->rmbe_size, &cons, copylen);
/* increased in recv tasklet smc_cdc_msg_rcv() */
smp_mb__before_atomic();
atomic_sub(copylen, &conn->bytes_to_rcv);
/* guarantee 0 <= bytes_to_rcv <= rmbe_size */
smp_mb__after_atomic();
- smc_curs_write(&conn->local_tx_ctrl.cons,
- smc_curs_read(&cons, conn),
- conn);
- /* send consumer cursor update if required */
- /* similar to advertising new TCP rcv_wnd if required */
- smc_tx_consumer_update(conn);
+ if (msg)
+ smc_rx_update_consumer(conn, cons, copylen);
}
} while (read_remaining);
out:
@@ -217,5 +344,6 @@ out:
/* Initialize receive properties on connection establishment. NB: not __init! */
void smc_rx_init(struct smc_sock *smc)
{
- smc->sk.sk_data_ready = smc_rx_data_ready;
+ smc->sk.sk_data_ready = smc_rx_wake_up;
+ atomic_set(&smc->conn.splice_pending, 0);
}
diff --git a/net/smc/smc_rx.h b/net/smc/smc_rx.h
index 3a32b59bf06c..db823c97d824 100644
--- a/net/smc/smc_rx.h
+++ b/net/smc/smc_rx.h
@@ -18,7 +18,14 @@
#include "smc.h"
void smc_rx_init(struct smc_sock *smc);
-int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, size_t len,
- int flags);
+
+int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg,
+ struct pipe_inode_info *pipe, size_t len, int flags);
+int smc_rx_wait(struct smc_sock *smc, long *timeo,
+ int (*fcrit)(struct smc_connection *conn));
+static inline int smc_rx_data_available(struct smc_connection *conn)
+{
+ return atomic_read(&conn->bytes_to_rcv);
+}
#endif /* SMC_RX_H */
diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c
index 72f004c9c9b1..58dfe0bd9d60 100644
--- a/net/smc/smc_tx.c
+++ b/net/smc/smc_tx.c
@@ -19,6 +19,7 @@
#include <linux/sched/signal.h>
#include <net/sock.h>
+#include <net/tcp.h>
#include "smc.h"
#include "smc_wr.h"
@@ -26,6 +27,7 @@
#include "smc_tx.h"
#define SMC_TX_WORK_DELAY HZ
+#define SMC_TX_CORK_DELAY (HZ >> 2) /* 250 ms */
/***************************** sndbuf producer *******************************/
@@ -115,6 +117,13 @@ static int smc_tx_wait_memory(struct smc_sock *smc, int flags)
return rc;
}
+static bool smc_tx_is_corked(struct smc_sock *smc)
+{
+ struct tcp_sock *tp = tcp_sk(smc->clcsock->sk);
+
+ return (tp->nonagle & TCP_NAGLE_CORK) ? true : false;
+}
+
/* sndbuf producer: main API called by socket layer.
* called under sock lock.
*/
@@ -209,7 +218,16 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len)
/* since we just produced more new data into sndbuf,
* trigger sndbuf consumer: RDMA write into peer RMBE and CDC
*/
- smc_tx_sndbuf_nonempty(conn);
+ if ((msg->msg_flags & MSG_MORE || smc_tx_is_corked(smc)) &&
+ (atomic_read(&conn->sndbuf_space) >
+ (conn->sndbuf_size >> 1)))
+ /* for a corked socket defer the RDMA writes if there
+ * is still sufficient sndbuf_space available
+ */
+ schedule_delayed_work(&conn->tx_work,
+ SMC_TX_CORK_DELAY);
+ else
+ smc_tx_sndbuf_nonempty(conn);
} /* while (msg_data_left(msg)) */
return send_done;
@@ -409,8 +427,8 @@ int smc_tx_sndbuf_nonempty(struct smc_connection *conn)
}
rc = 0;
if (conn->alert_token_local) /* connection healthy */
- schedule_delayed_work(&conn->tx_work,
- SMC_TX_WORK_DELAY);
+ mod_delayed_work(system_wq, &conn->tx_work,
+ SMC_TX_WORK_DELAY);
}
goto out_unlock;
}
diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c
index 1b8af23e6e2b..cc7c1bb60fe8 100644
--- a/net/smc/smc_wr.c
+++ b/net/smc/smc_wr.c
@@ -376,6 +376,7 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num)
for (i = 0; i < num; i++) {
link = wc[i].qp->qp_context;
if (wc[i].status == IB_WC_SUCCESS) {
+ link->wr_rx_tstamp = jiffies;
smc_wr_rx_demultiplex(&wc[i]);
smc_wr_rx_post(link); /* refill WR RX */
} else {
diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c
index b9283ce5cd85..092bebc70048 100644
--- a/net/strparser/strparser.c
+++ b/net/strparser/strparser.c
@@ -67,7 +67,7 @@ static void strp_abort_strp(struct strparser *strp, int err)
static void strp_start_timer(struct strparser *strp, long timeo)
{
- if (timeo)
+ if (timeo && timeo != LONG_MAX)
mod_delayed_work(strp_wq, &strp->msg_timer_work, timeo);
}
@@ -296,9 +296,9 @@ static int __strp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb,
strp_start_timer(strp, timeo);
}
+ stm->accum_len += cand_len;
strp->need_bytes = stm->strp.full_len -
stm->accum_len;
- stm->accum_len += cand_len;
stm->early_eaten = cand_len;
STRP_STATS_ADD(strp->stats.bytes, cand_len);
desc->count = 0; /* Stop reading socket */
@@ -321,6 +321,7 @@ static int __strp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb,
/* Hurray, we have a new message! */
cancel_delayed_work(&strp->msg_timer_work);
strp->skb_head = NULL;
+ strp->need_bytes = 0;
STRP_STATS_INCR(strp->stats.msgs);
/* Give skb to upper layer */
@@ -410,9 +411,7 @@ void strp_data_ready(struct strparser *strp)
return;
if (strp->need_bytes) {
- if (strp_peek_len(strp) >= strp->need_bytes)
- strp->need_bytes = 0;
- else
+ if (strp_peek_len(strp) < strp->need_bytes)
return;
}
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 806395687bb6..c2266f387213 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1887,7 +1887,7 @@ call_connect_status(struct rpc_task *task)
dprint_status(task);
- trace_rpc_connect_status(task, status);
+ trace_rpc_connect_status(task);
task->tk_status = 0;
switch (status) {
case -ECONNREFUSED:
@@ -2014,6 +2014,9 @@ call_transmit_status(struct rpc_task *task)
case -EPERM:
if (RPC_IS_SOFTCONN(task)) {
xprt_end_transmit(task);
+ if (!task->tk_msg.rpc_proc->p_proc)
+ trace_xprt_ping(task->tk_xprt,
+ task->tk_status);
rpc_exit(task, task->tk_status);
break;
}
@@ -2112,6 +2115,9 @@ call_status(struct rpc_task *task)
struct rpc_rqst *req = task->tk_rqstp;
int status;
+ if (!task->tk_msg.rpc_proc->p_proc)
+ trace_xprt_ping(task->tk_xprt, task->tk_status);
+
if (req->rq_reply_bytes_recvd > 0 && !req->rq_bytes_sent)
task->tk_status = req->rq_reply_bytes_recvd;
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index 0f08934b2cea..c81ef5e6c981 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -1375,6 +1375,7 @@ rpc_gssd_dummy_depopulate(struct dentry *pipe_dentry)
struct dentry *clnt_dir = pipe_dentry->d_parent;
struct dentry *gssd_dir = clnt_dir->d_parent;
+ dget(pipe_dentry);
__rpc_rmpipe(d_inode(clnt_dir), pipe_dentry);
__rpc_depopulate(clnt_dir, gssd_dummy_info_file, 0, 1);
__rpc_depopulate(gssd_dir, gssd_dummy_clnt_dir, 0, 1);
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index d9db2eab3a8d..3fe5d60ab0e2 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -276,7 +276,7 @@ static void rpc_set_active(struct rpc_task *task)
{
rpc_task_set_debuginfo(task);
set_bit(RPC_TASK_ACTIVE, &task->tk_runstate);
- trace_rpc_task_begin(task->tk_client, task, NULL);
+ trace_rpc_task_begin(task, NULL);
}
/*
@@ -291,7 +291,7 @@ static int rpc_complete_task(struct rpc_task *task)
unsigned long flags;
int ret;
- trace_rpc_task_complete(task->tk_client, task, NULL);
+ trace_rpc_task_complete(task, NULL);
spin_lock_irqsave(&wq->lock, flags);
clear_bit(RPC_TASK_ACTIVE, &task->tk_runstate);
@@ -358,7 +358,7 @@ static void __rpc_sleep_on_priority(struct rpc_wait_queue *q,
dprintk("RPC: %5u sleep_on(queue \"%s\" time %lu)\n",
task->tk_pid, rpc_qname(q), jiffies);
- trace_rpc_task_sleep(task->tk_client, task, q);
+ trace_rpc_task_sleep(task, q);
__rpc_add_wait_queue(q, task, queue_priority);
@@ -428,7 +428,7 @@ static void __rpc_do_wake_up_task_on_wq(struct workqueue_struct *wq,
return;
}
- trace_rpc_task_wakeup(task->tk_client, task, queue);
+ trace_rpc_task_wakeup(task, queue);
__rpc_remove_wait_queue(queue, task);
@@ -780,7 +780,7 @@ static void __rpc_execute(struct rpc_task *task)
}
if (!do_action)
break;
- trace_rpc_task_run_action(task->tk_client, task, do_action);
+ trace_rpc_task_run_action(task, do_action);
do_action(task);
/*
diff --git a/net/sunrpc/stats.c b/net/sunrpc/stats.c
index 1e671333c3d5..f68aa46c9dd7 100644
--- a/net/sunrpc/stats.c
+++ b/net/sunrpc/stats.c
@@ -24,6 +24,8 @@
#include <linux/sunrpc/metrics.h>
#include <linux/rcupdate.h>
+#include <trace/events/sunrpc.h>
+
#include "netns.h"
#define RPCDBG_FACILITY RPCDBG_MISC
@@ -148,7 +150,7 @@ void rpc_count_iostats_metrics(const struct rpc_task *task,
struct rpc_iostats *op_metrics)
{
struct rpc_rqst *req = task->tk_rqstp;
- ktime_t delta, now;
+ ktime_t backlog, execute, now;
if (!op_metrics || !req)
return;
@@ -164,16 +166,20 @@ void rpc_count_iostats_metrics(const struct rpc_task *task,
op_metrics->om_bytes_sent += req->rq_xmit_bytes_sent;
op_metrics->om_bytes_recv += req->rq_reply_bytes_recvd;
+ backlog = 0;
if (ktime_to_ns(req->rq_xtime)) {
- delta = ktime_sub(req->rq_xtime, task->tk_start);
- op_metrics->om_queue = ktime_add(op_metrics->om_queue, delta);
+ backlog = ktime_sub(req->rq_xtime, task->tk_start);
+ op_metrics->om_queue = ktime_add(op_metrics->om_queue, backlog);
}
+
op_metrics->om_rtt = ktime_add(op_metrics->om_rtt, req->rq_rtt);
- delta = ktime_sub(now, task->tk_start);
- op_metrics->om_execute = ktime_add(op_metrics->om_execute, delta);
+ execute = ktime_sub(now, task->tk_start);
+ op_metrics->om_execute = ktime_add(op_metrics->om_execute, execute);
spin_unlock(&op_metrics->om_lock);
+
+ trace_rpc_stats_latency(req->rq_task, backlog, req->rq_rtt, execute);
}
EXPORT_SYMBOL_GPL(rpc_count_iostats_metrics);
diff --git a/net/sunrpc/sunrpc.h b/net/sunrpc/sunrpc.h
index f2b7cb540e61..09a0315ea77b 100644
--- a/net/sunrpc/sunrpc.h
+++ b/net/sunrpc/sunrpc.h
@@ -37,12 +37,6 @@ struct rpc_buffer {
char data[];
};
-static inline int rpc_reply_expected(struct rpc_task *task)
-{
- return (task->tk_msg.rpc_proc != NULL) &&
- (task->tk_msg.rpc_proc->p_decode != NULL);
-}
-
static inline int sock_is_loopback(struct sock *sk)
{
struct dst_entry *dst;
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index e34f4ee7f2b6..30afbd236656 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -1519,6 +1519,88 @@ out:
EXPORT_SYMBOL_GPL(xdr_process_buf);
/**
+ * xdr_stream_decode_opaque - Decode variable length opaque
+ * @xdr: pointer to xdr_stream
+ * @ptr: location to store opaque data
+ * @size: size of storage buffer @ptr
+ *
+ * Return values:
+ * On success, returns size of object stored in *@ptr
+ * %-EBADMSG on XDR buffer overflow
+ * %-EMSGSIZE on overflow of storage buffer @ptr
+ */
+ssize_t xdr_stream_decode_opaque(struct xdr_stream *xdr, void *ptr, size_t size)
+{
+ ssize_t ret;
+ void *p;
+
+ ret = xdr_stream_decode_opaque_inline(xdr, &p, size);
+ if (ret <= 0)
+ return ret;
+ memcpy(ptr, p, ret);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(xdr_stream_decode_opaque);
+
+/**
+ * xdr_stream_decode_opaque_dup - Decode and duplicate variable length opaque
+ * @xdr: pointer to xdr_stream
+ * @ptr: location to store pointer to opaque data
+ * @maxlen: maximum acceptable object size
+ * @gfp_flags: GFP mask to use
+ *
+ * Return values:
+ * On success, returns size of object stored in *@ptr
+ * %-EBADMSG on XDR buffer overflow
+ * %-EMSGSIZE if the size of the object would exceed @maxlen
+ * %-ENOMEM on memory allocation failure
+ */
+ssize_t xdr_stream_decode_opaque_dup(struct xdr_stream *xdr, void **ptr,
+ size_t maxlen, gfp_t gfp_flags)
+{
+ ssize_t ret;
+ void *p;
+
+ ret = xdr_stream_decode_opaque_inline(xdr, &p, maxlen);
+ if (ret > 0) {
+ *ptr = kmemdup(p, ret, gfp_flags);
+ if (*ptr != NULL)
+ return ret;
+ ret = -ENOMEM;
+ }
+ *ptr = NULL;
+ return ret;
+}
+EXPORT_SYMBOL_GPL(xdr_stream_decode_opaque_dup);
+
+/**
+ * xdr_stream_decode_string - Decode variable length string
+ * @xdr: pointer to xdr_stream
+ * @str: location to store string
+ * @size: size of storage buffer @str
+ *
+ * Return values:
+ * On success, returns length of NUL-terminated string stored in *@str
+ * %-EBADMSG on XDR buffer overflow
+ * %-EMSGSIZE on overflow of storage buffer @str
+ */
+ssize_t xdr_stream_decode_string(struct xdr_stream *xdr, char *str, size_t size)
+{
+ ssize_t ret;
+ void *p;
+
+ ret = xdr_stream_decode_opaque_inline(xdr, &p, size);
+ if (ret > 0) {
+ memcpy(str, p, ret);
+ str[ret] = '\0';
+ return strlen(str);
+ }
+ *str = '\0';
+ return ret;
+}
+EXPORT_SYMBOL_GPL(xdr_stream_decode_string);
+
+/**
* xdr_stream_decode_string_dup - Decode and duplicate variable length string
* @xdr: pointer to xdr_stream
* @str: location to store pointer to string
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 8f0ad4f268da..70f005044f06 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -826,6 +826,7 @@ static void xprt_connect_status(struct rpc_task *task)
* @xprt: transport on which the original request was transmitted
* @xid: RPC XID of incoming reply
*
+ * Caller holds xprt->recv_lock.
*/
struct rpc_rqst *xprt_lookup_rqst(struct rpc_xprt *xprt, __be32 xid)
{
@@ -834,6 +835,7 @@ struct rpc_rqst *xprt_lookup_rqst(struct rpc_xprt *xprt, __be32 xid)
list_for_each_entry(entry, &xprt->recv, rq_list)
if (entry->rq_xid == xid) {
trace_xprt_lookup_rqst(xprt, xid, 0);
+ entry->rq_rtt = ktime_sub(ktime_get(), entry->rq_xtime);
return entry;
}
@@ -889,7 +891,13 @@ __must_hold(&req->rq_xprt->recv_lock)
}
}
-static void xprt_update_rtt(struct rpc_task *task)
+/**
+ * xprt_update_rtt - Update RPC RTT statistics
+ * @task: RPC request that recently completed
+ *
+ * Caller holds xprt->recv_lock.
+ */
+void xprt_update_rtt(struct rpc_task *task)
{
struct rpc_rqst *req = task->tk_rqstp;
struct rpc_rtt *rtt = task->tk_client->cl_rtt;
@@ -902,13 +910,14 @@ static void xprt_update_rtt(struct rpc_task *task)
rpc_set_timeo(rtt, timer, req->rq_ntrans - 1);
}
}
+EXPORT_SYMBOL_GPL(xprt_update_rtt);
/**
* xprt_complete_rqst - called when reply processing is complete
* @task: RPC request that recently completed
* @copied: actual number of bytes received from the transport
*
- * Caller holds transport lock.
+ * Caller holds xprt->recv_lock.
*/
void xprt_complete_rqst(struct rpc_task *task, int copied)
{
@@ -920,9 +929,6 @@ void xprt_complete_rqst(struct rpc_task *task, int copied)
trace_xprt_complete_rqst(xprt, req->rq_xid, copied);
xprt->stat.recvs++;
- req->rq_rtt = ktime_sub(ktime_get(), req->rq_xtime);
- if (xprt->ops->timer != NULL)
- xprt_update_rtt(task);
list_del_init(&req->rq_list);
req->rq_private_buf.len = copied;
@@ -1003,7 +1009,7 @@ void xprt_transmit(struct rpc_task *task)
struct rpc_rqst *req = task->tk_rqstp;
struct rpc_xprt *xprt = req->rq_xprt;
unsigned int connect_cookie;
- int status, numreqs;
+ int status;
dprintk("RPC: %5u xprt_transmit(%u)\n", task->tk_pid, req->rq_slen);
@@ -1027,7 +1033,6 @@ void xprt_transmit(struct rpc_task *task)
return;
connect_cookie = xprt->connect_cookie;
- req->rq_xtime = ktime_get();
status = xprt->ops->send_request(task);
trace_xprt_transmit(xprt, req->rq_xid, status);
if (status != 0) {
@@ -1042,9 +1047,6 @@ void xprt_transmit(struct rpc_task *task)
xprt->ops->set_retrans_timeout(task);
- numreqs = atomic_read(&xprt->num_reqs);
- if (numreqs > xprt->stat.max_slots)
- xprt->stat.max_slots = numreqs;
xprt->stat.sends++;
xprt->stat.req_u += xprt->stat.sends - xprt->stat.recvs;
xprt->stat.bklog_u += xprt->backlog.qlen;
@@ -1106,14 +1108,15 @@ static struct rpc_rqst *xprt_dynamic_alloc_slot(struct rpc_xprt *xprt)
{
struct rpc_rqst *req = ERR_PTR(-EAGAIN);
- if (!atomic_add_unless(&xprt->num_reqs, 1, xprt->max_reqs))
+ if (xprt->num_reqs >= xprt->max_reqs)
goto out;
+ ++xprt->num_reqs;
spin_unlock(&xprt->reserve_lock);
req = kzalloc(sizeof(struct rpc_rqst), GFP_NOFS);
spin_lock(&xprt->reserve_lock);
if (req != NULL)
goto out;
- atomic_dec(&xprt->num_reqs);
+ --xprt->num_reqs;
req = ERR_PTR(-ENOMEM);
out:
return req;
@@ -1121,7 +1124,8 @@ out:
static bool xprt_dynamic_free_slot(struct rpc_xprt *xprt, struct rpc_rqst *req)
{
- if (atomic_add_unless(&xprt->num_reqs, -1, xprt->min_reqs)) {
+ if (xprt->num_reqs > xprt->min_reqs) {
+ --xprt->num_reqs;
kfree(req);
return true;
}
@@ -1157,6 +1161,8 @@ void xprt_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task)
spin_unlock(&xprt->reserve_lock);
return;
out_init_req:
+ xprt->stat.max_slots = max_t(unsigned int, xprt->stat.max_slots,
+ xprt->num_reqs);
task->tk_status = 0;
task->tk_rqstp = req;
xprt_request_init(task, xprt);
@@ -1224,7 +1230,7 @@ struct rpc_xprt *xprt_alloc(struct net *net, size_t size,
else
xprt->max_reqs = num_prealloc;
xprt->min_reqs = num_prealloc;
- atomic_set(&xprt->num_reqs, num_prealloc);
+ xprt->num_reqs = num_prealloc;
return xprt;
diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c
index ed1a4a3065ee..47ebac949769 100644
--- a/net/sunrpc/xprtrdma/backchannel.c
+++ b/net/sunrpc/xprtrdma/backchannel.c
@@ -44,13 +44,6 @@ static int rpcrdma_bc_setup_rqst(struct rpcrdma_xprt *r_xprt,
if (IS_ERR(req))
return PTR_ERR(req);
- rb = rpcrdma_alloc_regbuf(RPCRDMA_HDRBUF_SIZE,
- DMA_TO_DEVICE, GFP_KERNEL);
- if (IS_ERR(rb))
- goto out_fail;
- req->rl_rdmabuf = rb;
- xdr_buf_init(&req->rl_hdrbuf, rb->rg_base, rdmab_length(rb));
-
size = r_xprt->rx_data.inline_rsize;
rb = rpcrdma_alloc_regbuf(size, DMA_TO_DEVICE, GFP_KERNEL);
if (IS_ERR(rb))
diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c
index d5f95bb39300..5cc68a824f45 100644
--- a/net/sunrpc/xprtrdma/fmr_ops.c
+++ b/net/sunrpc/xprtrdma/fmr_ops.c
@@ -191,7 +191,7 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
mr = rpcrdma_mr_get(r_xprt);
if (!mr)
- return ERR_PTR(-ENOBUFS);
+ return ERR_PTR(-EAGAIN);
pageoff = offset_in_page(seg1->mr_offset);
seg1->mr_offset -= pageoff; /* start of page */
@@ -251,6 +251,16 @@ out_maperr:
return ERR_PTR(-EIO);
}
+/* Post Send WR containing the RPC Call message.
+ */
+static int
+fmr_op_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
+{
+ struct ib_send_wr *bad_wr;
+
+ return ib_post_send(ia->ri_id->qp, &req->rl_sendctx->sc_wr, &bad_wr);
+}
+
/* Invalidate all memory regions that were registered for "req".
*
* Sleeps until it is safe for the host CPU to access the
@@ -305,6 +315,7 @@ out_reset:
const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = {
.ro_map = fmr_op_map,
+ .ro_send = fmr_op_send,
.ro_unmap_sync = fmr_op_unmap_sync,
.ro_recover_mr = fmr_op_recover_mr,
.ro_open = fmr_op_open,
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index 90f688f19783..c5743a0960be 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -357,8 +357,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
struct rpcrdma_mr *mr;
struct ib_mr *ibmr;
struct ib_reg_wr *reg_wr;
- struct ib_send_wr *bad_wr;
- int rc, i, n;
+ int i, n;
u8 key;
mr = NULL;
@@ -367,7 +366,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
rpcrdma_mr_defer_recovery(mr);
mr = rpcrdma_mr_get(r_xprt);
if (!mr)
- return ERR_PTR(-ENOBUFS);
+ return ERR_PTR(-EAGAIN);
} while (mr->frwr.fr_state != FRWR_IS_INVALID);
frwr = &mr->frwr;
frwr->fr_state = FRWR_IS_VALID;
@@ -407,22 +406,12 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
ib_update_fast_reg_key(ibmr, ++key);
reg_wr = &frwr->fr_regwr;
- reg_wr->wr.next = NULL;
- reg_wr->wr.opcode = IB_WR_REG_MR;
- frwr->fr_cqe.done = frwr_wc_fastreg;
- reg_wr->wr.wr_cqe = &frwr->fr_cqe;
- reg_wr->wr.num_sge = 0;
- reg_wr->wr.send_flags = 0;
reg_wr->mr = ibmr;
reg_wr->key = ibmr->rkey;
reg_wr->access = writing ?
IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
IB_ACCESS_REMOTE_READ;
- rc = ib_post_send(ia->ri_id->qp, &reg_wr->wr, &bad_wr);
- if (rc)
- goto out_senderr;
-
mr->mr_handle = ibmr->rkey;
mr->mr_length = ibmr->length;
mr->mr_offset = ibmr->iova;
@@ -442,11 +431,40 @@ out_mapmr_err:
frwr->fr_mr, n, mr->mr_nents);
rpcrdma_mr_defer_recovery(mr);
return ERR_PTR(-EIO);
+}
-out_senderr:
- pr_err("rpcrdma: FRWR registration ib_post_send returned %i\n", rc);
- rpcrdma_mr_defer_recovery(mr);
- return ERR_PTR(-ENOTCONN);
+/* Post Send WR containing the RPC Call message.
+ *
+ * For FRMR, chain any FastReg WRs to the Send WR. Only a
+ * single ib_post_send call is needed to register memory
+ * and then post the Send WR.
+ */
+static int
+frwr_op_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
+{
+ struct ib_send_wr *post_wr, *bad_wr;
+ struct rpcrdma_mr *mr;
+
+ post_wr = &req->rl_sendctx->sc_wr;
+ list_for_each_entry(mr, &req->rl_registered, mr_list) {
+ struct rpcrdma_frwr *frwr;
+
+ frwr = &mr->frwr;
+
+ frwr->fr_cqe.done = frwr_wc_fastreg;
+ frwr->fr_regwr.wr.next = post_wr;
+ frwr->fr_regwr.wr.wr_cqe = &frwr->fr_cqe;
+ frwr->fr_regwr.wr.num_sge = 0;
+ frwr->fr_regwr.wr.opcode = IB_WR_REG_MR;
+ frwr->fr_regwr.wr.send_flags = 0;
+
+ post_wr = &frwr->fr_regwr.wr;
+ }
+
+ /* If ib_post_send fails, the next ->send_request for
+ * @req will queue these MWs for recovery.
+ */
+ return ib_post_send(ia->ri_id->qp, post_wr, &bad_wr);
}
/* Handle a remotely invalidated mr on the @mrs list
@@ -561,6 +579,7 @@ reset_mrs:
const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = {
.ro_map = frwr_op_map,
+ .ro_send = frwr_op_send,
.ro_reminv = frwr_op_reminv,
.ro_unmap_sync = frwr_op_unmap_sync,
.ro_recover_mr = frwr_op_recover_mr,
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index f0855a959a27..e8adad33d0bb 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -365,7 +365,7 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
seg = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs,
false, &mr);
if (IS_ERR(seg))
- return PTR_ERR(seg);
+ goto out_maperr;
rpcrdma_mr_push(mr, &req->rl_registered);
if (encode_read_segment(xdr, mr, pos) < 0)
@@ -377,6 +377,11 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
} while (nsegs);
return 0;
+
+out_maperr:
+ if (PTR_ERR(seg) == -EAGAIN)
+ xprt_wait_for_buffer_space(rqst->rq_task, NULL);
+ return PTR_ERR(seg);
}
/* Register and XDR encode the Write list. Supports encoding a list
@@ -423,7 +428,7 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
seg = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs,
true, &mr);
if (IS_ERR(seg))
- return PTR_ERR(seg);
+ goto out_maperr;
rpcrdma_mr_push(mr, &req->rl_registered);
if (encode_rdma_segment(xdr, mr) < 0)
@@ -440,6 +445,11 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
*segcount = cpu_to_be32(nchunks);
return 0;
+
+out_maperr:
+ if (PTR_ERR(seg) == -EAGAIN)
+ xprt_wait_for_buffer_space(rqst->rq_task, NULL);
+ return PTR_ERR(seg);
}
/* Register and XDR encode the Reply chunk. Supports encoding an array
@@ -481,7 +491,7 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
seg = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs,
true, &mr);
if (IS_ERR(seg))
- return PTR_ERR(seg);
+ goto out_maperr;
rpcrdma_mr_push(mr, &req->rl_registered);
if (encode_rdma_segment(xdr, mr) < 0)
@@ -498,6 +508,11 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
*segcount = cpu_to_be32(nchunks);
return 0;
+
+out_maperr:
+ if (PTR_ERR(seg) == -EAGAIN)
+ xprt_wait_for_buffer_space(rqst->rq_task, NULL);
+ return PTR_ERR(seg);
}
/**
@@ -724,8 +739,8 @@ rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt,
* Returns:
* %0 if the RPC was sent successfully,
* %-ENOTCONN if the connection was lost,
- * %-EAGAIN if not enough pages are available for on-demand reply buffer,
- * %-ENOBUFS if no MRs are available to register chunks,
+ * %-EAGAIN if the caller should call again with the same arguments,
+ * %-ENOBUFS if the caller should call again after a delay,
* %-EMSGSIZE if the transport header is too small,
* %-EIO if a permanent problem occurred while marshaling.
*/
@@ -868,10 +883,7 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst)
return 0;
out_err:
- if (ret != -ENOBUFS) {
- pr_err("rpcrdma: header marshaling failed (%d)\n", ret);
- r_xprt->rx_stats.failed_marshal_count++;
- }
+ r_xprt->rx_stats.failed_marshal_count++;
return ret;
}
@@ -1366,7 +1378,7 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep)
trace_xprtrdma_reply(rqst->rq_task, rep, req, credits);
- queue_work_on(req->rl_cpu, rpcrdma_receive_wq, &rep->rr_work);
+ queue_work(rpcrdma_receive_wq, &rep->rr_work);
return;
out_badstatus:
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 4b1ecfe979cf..cc1aad325496 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -52,7 +52,6 @@
#include <linux/slab.h>
#include <linux/seq_file.h>
#include <linux/sunrpc/addr.h>
-#include <linux/smp.h>
#include "xprt_rdma.h"
@@ -237,8 +236,6 @@ rpcrdma_connect_worker(struct work_struct *work)
struct rpc_xprt *xprt = &r_xprt->rx_xprt;
spin_lock_bh(&xprt->transport_lock);
- if (++xprt->connect_cookie == 0) /* maintain a reserved value */
- ++xprt->connect_cookie;
if (ep->rep_connected > 0) {
if (!xprt_test_and_set_connected(xprt))
xprt_wake_pending_tasks(xprt, 0);
@@ -540,29 +537,6 @@ xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task)
}
}
-/* Allocate a fixed-size buffer in which to construct and send the
- * RPC-over-RDMA header for this request.
- */
-static bool
-rpcrdma_get_rdmabuf(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
- gfp_t flags)
-{
- size_t size = RPCRDMA_HDRBUF_SIZE;
- struct rpcrdma_regbuf *rb;
-
- if (req->rl_rdmabuf)
- return true;
-
- rb = rpcrdma_alloc_regbuf(size, DMA_TO_DEVICE, flags);
- if (IS_ERR(rb))
- return false;
-
- r_xprt->rx_stats.hardway_register_count += size;
- req->rl_rdmabuf = rb;
- xdr_buf_init(&req->rl_hdrbuf, rb->rg_base, rdmab_length(rb));
- return true;
-}
-
static bool
rpcrdma_get_sendbuf(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
size_t size, gfp_t flags)
@@ -644,15 +618,11 @@ xprt_rdma_allocate(struct rpc_task *task)
if (RPC_IS_SWAPPER(task))
flags = __GFP_MEMALLOC | GFP_NOWAIT | __GFP_NOWARN;
- if (!rpcrdma_get_rdmabuf(r_xprt, req, flags))
- goto out_fail;
if (!rpcrdma_get_sendbuf(r_xprt, req, rqst->rq_callsize, flags))
goto out_fail;
if (!rpcrdma_get_recvbuf(r_xprt, req, rqst->rq_rcvsize, flags))
goto out_fail;
- req->rl_cpu = smp_processor_id();
- req->rl_connect_cookie = 0; /* our reserved value */
rpcrdma_set_xprtdata(rqst, req);
rqst->rq_buffer = req->rl_sendbuf->rg_base;
rqst->rq_rbuffer = req->rl_recvbuf->rg_base;
@@ -694,7 +664,8 @@ xprt_rdma_free(struct rpc_task *task)
* Returns:
* %0 if the RPC message has been sent
* %-ENOTCONN if the caller should reconnect and call again
- * %-ENOBUFS if the caller should call again later
+ * %-EAGAIN if the caller should call again
+ * %-ENOBUFS if the caller should call again after a delay
* %-EIO if a permanent error occurred and the request was not
* sent. Do not try to send this message again.
*/
@@ -723,9 +694,9 @@ xprt_rdma_send_request(struct rpc_task *task)
rpcrdma_recv_buffer_get(req);
/* Must suppress retransmit to maintain credits */
- if (req->rl_connect_cookie == xprt->connect_cookie)
+ if (rqst->rq_connect_cookie == xprt->connect_cookie)
goto drop_connection;
- req->rl_connect_cookie = xprt->connect_cookie;
+ rqst->rq_xtime = ktime_get();
__set_bit(RPCRDMA_REQ_F_PENDING, &req->rl_flags);
if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req))
@@ -733,6 +704,12 @@ xprt_rdma_send_request(struct rpc_task *task)
rqst->rq_xmit_bytes_sent += rqst->rq_snd_buf.len;
rqst->rq_bytes_sent = 0;
+
+ /* An RPC with no reply will throw off credit accounting,
+ * so drop the connection to reset the credit grant.
+ */
+ if (!rpc_reply_expected(task))
+ goto drop_connection;
return 0;
failed_marshal:
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index e6f84a6434a0..fe5eaca2d197 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -250,11 +250,11 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
wait_for_completion(&ia->ri_remove_done);
ia->ri_id = NULL;
- ia->ri_pd = NULL;
ia->ri_device = NULL;
/* Return 1 to ensure the core destroys the id. */
return 1;
case RDMA_CM_EVENT_ESTABLISHED:
+ ++xprt->rx_xprt.connect_cookie;
connstate = 1;
rpcrdma_update_connect_private(xprt, &event->param.conn);
goto connected;
@@ -273,6 +273,7 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
connstate = -EAGAIN;
goto connected;
case RDMA_CM_EVENT_DISCONNECTED:
+ ++xprt->rx_xprt.connect_cookie;
connstate = -ECONNABORTED;
connected:
xprt->rx_buf.rb_credits = 1;
@@ -445,7 +446,9 @@ rpcrdma_ia_remove(struct rpcrdma_ia *ia)
ia->ri_id->qp = NULL;
}
ib_free_cq(ep->rep_attr.recv_cq);
+ ep->rep_attr.recv_cq = NULL;
ib_free_cq(ep->rep_attr.send_cq);
+ ep->rep_attr.send_cq = NULL;
/* The ULP is responsible for ensuring all DMA
* mappings and MRs are gone.
@@ -458,6 +461,8 @@ rpcrdma_ia_remove(struct rpcrdma_ia *ia)
rpcrdma_dma_unmap_regbuf(req->rl_recvbuf);
}
rpcrdma_mrs_destroy(buf);
+ ib_dealloc_pd(ia->ri_pd);
+ ia->ri_pd = NULL;
/* Allow waiters to continue */
complete(&ia->ri_remove_done);
@@ -589,11 +594,8 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
/* Client offers RDMA Read but does not initiate */
ep->rep_remote_cma.initiator_depth = 0;
- if (ia->ri_device->attrs.max_qp_rd_atom > 32) /* arbitrary but <= 255 */
- ep->rep_remote_cma.responder_resources = 32;
- else
- ep->rep_remote_cma.responder_resources =
- ia->ri_device->attrs.max_qp_rd_atom;
+ ep->rep_remote_cma.responder_resources =
+ min_t(int, U8_MAX, ia->ri_device->attrs.max_qp_rd_atom);
/* Limit transport retries so client can detect server
* GID changes quickly. RPC layer handles re-establishing
@@ -628,14 +630,16 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
cancel_delayed_work_sync(&ep->rep_connect_worker);
- if (ia->ri_id->qp) {
+ if (ia->ri_id && ia->ri_id->qp) {
rpcrdma_ep_disconnect(ep, ia);
rdma_destroy_qp(ia->ri_id);
ia->ri_id->qp = NULL;
}
- ib_free_cq(ep->rep_attr.recv_cq);
- ib_free_cq(ep->rep_attr.send_cq);
+ if (ep->rep_attr.recv_cq)
+ ib_free_cq(ep->rep_attr.recv_cq);
+ if (ep->rep_attr.send_cq)
+ ib_free_cq(ep->rep_attr.send_cq);
}
/* Re-establish a connection after a device removal event.
@@ -1024,7 +1028,7 @@ rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt)
LIST_HEAD(free);
LIST_HEAD(all);
- for (count = 0; count < 32; count++) {
+ for (count = 0; count < 3; count++) {
struct rpcrdma_mr *mr;
int rc;
@@ -1049,8 +1053,9 @@ rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt)
list_splice(&all, &buf->rb_all);
r_xprt->rx_stats.mrs_allocated += count;
spin_unlock(&buf->rb_mrlock);
-
trace_xprtrdma_createmrs(r_xprt, count);
+
+ xprt_write_space(&r_xprt->rx_xprt);
}
static void
@@ -1068,17 +1073,27 @@ struct rpcrdma_req *
rpcrdma_create_req(struct rpcrdma_xprt *r_xprt)
{
struct rpcrdma_buffer *buffer = &r_xprt->rx_buf;
+ struct rpcrdma_regbuf *rb;
struct rpcrdma_req *req;
req = kzalloc(sizeof(*req), GFP_KERNEL);
if (req == NULL)
return ERR_PTR(-ENOMEM);
+ rb = rpcrdma_alloc_regbuf(RPCRDMA_HDRBUF_SIZE,
+ DMA_TO_DEVICE, GFP_KERNEL);
+ if (IS_ERR(rb)) {
+ kfree(req);
+ return ERR_PTR(-ENOMEM);
+ }
+ req->rl_rdmabuf = rb;
+ xdr_buf_init(&req->rl_hdrbuf, rb->rg_base, rdmab_length(rb));
+ req->rl_buffer = buffer;
+ INIT_LIST_HEAD(&req->rl_registered);
+
spin_lock(&buffer->rb_reqslock);
list_add(&req->rl_all, &buffer->rb_allreqs);
spin_unlock(&buffer->rb_reqslock);
- req->rl_buffer = &r_xprt->rx_buf;
- INIT_LIST_HEAD(&req->rl_registered);
return req;
}
@@ -1535,7 +1550,6 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia,
struct rpcrdma_req *req)
{
struct ib_send_wr *send_wr = &req->rl_sendctx->sc_wr;
- struct ib_send_wr *send_wr_fail;
int rc;
if (req->rl_reply) {
@@ -1554,7 +1568,7 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia,
--ep->rep_send_count;
}
- rc = ib_post_send(ia->ri_id->qp, send_wr, &send_wr_fail);
+ rc = ia->ri_ops->ro_send(ia, req);
trace_xprtrdma_post_send(req, rc);
if (rc)
return -ENOTCONN;
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 69883a960a3f..3d3b423fa9c1 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -334,8 +334,6 @@ enum {
struct rpcrdma_buffer;
struct rpcrdma_req {
struct list_head rl_list;
- int rl_cpu;
- unsigned int rl_connect_cookie;
struct rpcrdma_buffer *rl_buffer;
struct rpcrdma_rep *rl_reply;
struct xdr_stream rl_stream;
@@ -474,6 +472,8 @@ struct rpcrdma_memreg_ops {
(*ro_map)(struct rpcrdma_xprt *,
struct rpcrdma_mr_seg *, int, bool,
struct rpcrdma_mr **);
+ int (*ro_send)(struct rpcrdma_ia *ia,
+ struct rpcrdma_req *req);
void (*ro_reminv)(struct rpcrdma_rep *rep,
struct list_head *mrs);
void (*ro_unmap_sync)(struct rpcrdma_xprt *,
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 956e29c1438d..c8902f11efdd 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -527,6 +527,7 @@ static int xs_local_send_request(struct rpc_task *task)
xs_pktdump("packet data:",
req->rq_svec->iov_base, req->rq_svec->iov_len);
+ req->rq_xtime = ktime_get();
status = xs_sendpages(transport->sock, NULL, 0, xdr, req->rq_bytes_sent,
true, &sent);
dprintk("RPC: %s(%u) = %d\n",
@@ -589,6 +590,7 @@ static int xs_udp_send_request(struct rpc_task *task)
if (!xprt_bound(xprt))
return -ENOTCONN;
+ req->rq_xtime = ktime_get();
status = xs_sendpages(transport->sock, xs_addr(xprt), xprt->addrlen,
xdr, req->rq_bytes_sent, true, &sent);
@@ -678,6 +680,7 @@ static int xs_tcp_send_request(struct rpc_task *task)
/* Continue transmitting the packet/record. We must be careful
* to cope with writespace callbacks arriving _after_ we have
* called sendmsg(). */
+ req->rq_xtime = ktime_get();
while (1) {
sent = 0;
status = xs_sendpages(transport->sock, NULL, 0, xdr,
@@ -1060,6 +1063,7 @@ static void xs_udp_data_read_skb(struct rpc_xprt *xprt,
if (!rovr)
goto out_unlock;
xprt_pin_rqst(rovr);
+ xprt_update_rtt(rovr->rq_task);
spin_unlock(&xprt->recv_lock);
task = rovr->rq_task;
diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c
index f7d47c89d658..2dfb492a7c94 100644
--- a/net/tipc/bearer.c
+++ b/net/tipc/bearer.c
@@ -697,6 +697,9 @@ static int __tipc_nl_add_bearer(struct tipc_nl_msg *msg,
goto prop_msg_full;
if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN, bearer->window))
goto prop_msg_full;
+ if (bearer->media->type_id == TIPC_MEDIA_TYPE_UDP)
+ if (nla_put_u32(msg->skb, TIPC_NLA_PROP_MTU, bearer->mtu))
+ goto prop_msg_full;
nla_nest_end(msg->skb, prop);
@@ -979,12 +982,23 @@ int __tipc_nl_bearer_set(struct sk_buff *skb, struct genl_info *info)
if (props[TIPC_NLA_PROP_TOL]) {
b->tolerance = nla_get_u32(props[TIPC_NLA_PROP_TOL]);
- tipc_node_apply_tolerance(net, b);
+ tipc_node_apply_property(net, b, TIPC_NLA_PROP_TOL);
}
if (props[TIPC_NLA_PROP_PRIO])
b->priority = nla_get_u32(props[TIPC_NLA_PROP_PRIO]);
if (props[TIPC_NLA_PROP_WIN])
b->window = nla_get_u32(props[TIPC_NLA_PROP_WIN]);
+ if (props[TIPC_NLA_PROP_MTU]) {
+ if (b->media->type_id != TIPC_MEDIA_TYPE_UDP)
+ return -EINVAL;
+#ifdef CONFIG_TIPC_MEDIA_UDP
+ if (tipc_udp_mtu_bad(nla_get_u32
+ (props[TIPC_NLA_PROP_MTU])))
+ return -EINVAL;
+ b->mtu = nla_get_u32(props[TIPC_NLA_PROP_MTU]);
+ tipc_node_apply_property(net, b, TIPC_NLA_PROP_MTU);
+#endif
+ }
}
return 0;
@@ -1029,6 +1043,9 @@ static int __tipc_nl_add_media(struct tipc_nl_msg *msg,
goto prop_msg_full;
if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN, media->window))
goto prop_msg_full;
+ if (media->type_id == TIPC_MEDIA_TYPE_UDP)
+ if (nla_put_u32(msg->skb, TIPC_NLA_PROP_MTU, media->mtu))
+ goto prop_msg_full;
nla_nest_end(msg->skb, prop);
nla_nest_end(msg->skb, attrs);
@@ -1158,6 +1175,16 @@ int __tipc_nl_media_set(struct sk_buff *skb, struct genl_info *info)
m->priority = nla_get_u32(props[TIPC_NLA_PROP_PRIO]);
if (props[TIPC_NLA_PROP_WIN])
m->window = nla_get_u32(props[TIPC_NLA_PROP_WIN]);
+ if (props[TIPC_NLA_PROP_MTU]) {
+ if (m->type_id != TIPC_MEDIA_TYPE_UDP)
+ return -EINVAL;
+#ifdef CONFIG_TIPC_MEDIA_UDP
+ if (tipc_udp_mtu_bad(nla_get_u32
+ (props[TIPC_NLA_PROP_MTU])))
+ return -EINVAL;
+ m->mtu = nla_get_u32(props[TIPC_NLA_PROP_MTU]);
+#endif
+ }
}
return 0;
diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h
index 6efcee63a381..394290cbbb1d 100644
--- a/net/tipc/bearer.h
+++ b/net/tipc/bearer.h
@@ -94,6 +94,8 @@ struct tipc_bearer;
* @priority: default link (and bearer) priority
* @tolerance: default time (in ms) before declaring link failure
* @window: default window (in packets) before declaring link congestion
+ * @mtu: max packet size bearer can support for media type not dependent on
+ * underlying device MTU
* @type_id: TIPC media identifier
* @hwaddr_len: TIPC media address len
* @name: media name
@@ -118,6 +120,7 @@ struct tipc_media {
u32 priority;
u32 tolerance;
u32 window;
+ u32 mtu;
u32 type_id;
u32 hwaddr_len;
char name[TIPC_MAX_MEDIA_NAME];
diff --git a/net/tipc/monitor.c b/net/tipc/monitor.c
index 32dc33a94bc7..5453e564da82 100644
--- a/net/tipc/monitor.c
+++ b/net/tipc/monitor.c
@@ -777,7 +777,7 @@ int __tipc_nl_add_monitor(struct net *net, struct tipc_nl_msg *msg,
ret = tipc_bearer_get_name(net, bearer_name, bearer_id);
if (ret || !mon)
- return -EINVAL;
+ return 0;
hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family,
NLM_F_MULTI, TIPC_NL_MON_GET);
diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c
index b1fe20972aa9..dd1c4fa2eb78 100644
--- a/net/tipc/name_table.c
+++ b/net/tipc/name_table.c
@@ -241,7 +241,8 @@ err:
static struct publication *tipc_service_remove_publ(struct net *net,
struct tipc_service *sc,
u32 lower, u32 upper,
- u32 node, u32 key)
+ u32 node, u32 key,
+ struct service_range **rng)
{
struct tipc_subscription *sub, *tmp;
struct service_range *sr;
@@ -275,19 +276,15 @@ static struct publication *tipc_service_remove_publ(struct net *net,
list_del(&p->all_publ);
list_del(&p->local_publ);
-
- /* Remove service range item if this was its last publication */
- if (list_empty(&sr->all_publ)) {
+ if (list_empty(&sr->all_publ))
last = true;
- rb_erase(&sr->tree_node, &sc->ranges);
- kfree(sr);
- }
/* Notify any waiting subscriptions */
list_for_each_entry_safe(sub, tmp, &sc->subscriptions, service_list) {
tipc_sub_report_overlap(sub, p->lower, p->upper, TIPC_WITHDRAWN,
p->port, p->node, p->scope, last);
}
+ *rng = sr;
return p;
}
@@ -379,13 +376,20 @@ struct publication *tipc_nametbl_remove_publ(struct net *net, u32 type,
u32 node, u32 key)
{
struct tipc_service *sc = tipc_service_find(net, type);
+ struct service_range *sr = NULL;
struct publication *p = NULL;
if (!sc)
return NULL;
spin_lock_bh(&sc->lock);
- p = tipc_service_remove_publ(net, sc, lower, upper, node, key);
+ p = tipc_service_remove_publ(net, sc, lower, upper, node, key, &sr);
+
+ /* Remove service range item if this was its last publication */
+ if (sr && list_empty(&sr->all_publ)) {
+ rb_erase(&sr->tree_node, &sc->ranges);
+ kfree(sr);
+ }
/* Delete service item if this no more publications and subscriptions */
if (RB_EMPTY_ROOT(&sc->ranges) && list_empty(&sc->subscriptions)) {
@@ -665,13 +669,14 @@ int tipc_nametbl_withdraw(struct net *net, u32 type, u32 lower,
/**
* tipc_nametbl_subscribe - add a subscription object to the name table
*/
-void tipc_nametbl_subscribe(struct tipc_subscription *sub)
+bool tipc_nametbl_subscribe(struct tipc_subscription *sub)
{
struct name_table *nt = tipc_name_table(sub->net);
struct tipc_net *tn = tipc_net(sub->net);
struct tipc_subscr *s = &sub->evt.s;
u32 type = tipc_sub_read(s, seq.type);
struct tipc_service *sc;
+ bool res = true;
spin_lock_bh(&tn->nametbl_lock);
sc = tipc_service_find(sub->net, type);
@@ -685,8 +690,10 @@ void tipc_nametbl_subscribe(struct tipc_subscription *sub)
pr_warn("Failed to subscribe for {%u,%u,%u}\n", type,
tipc_sub_read(s, seq.lower),
tipc_sub_read(s, seq.upper));
+ res = false;
}
spin_unlock_bh(&tn->nametbl_lock);
+ return res;
}
/**
@@ -744,16 +751,17 @@ int tipc_nametbl_init(struct net *net)
static void tipc_service_delete(struct net *net, struct tipc_service *sc)
{
struct service_range *sr, *tmpr;
- struct publication *p, *tmpb;
+ struct publication *p, *tmp;
spin_lock_bh(&sc->lock);
rbtree_postorder_for_each_entry_safe(sr, tmpr, &sc->ranges, tree_node) {
- list_for_each_entry_safe(p, tmpb,
- &sr->all_publ, all_publ) {
+ list_for_each_entry_safe(p, tmp, &sr->all_publ, all_publ) {
tipc_service_remove_publ(net, sc, p->lower, p->upper,
- p->node, p->key);
+ p->node, p->key, &sr);
kfree_rcu(p, rcu);
}
+ rb_erase(&sr->tree_node, &sc->ranges);
+ kfree(sr);
}
hlist_del_init_rcu(&sc->service_list);
spin_unlock_bh(&sc->lock);
diff --git a/net/tipc/name_table.h b/net/tipc/name_table.h
index 4b14fc28d9e2..0febba41da86 100644
--- a/net/tipc/name_table.h
+++ b/net/tipc/name_table.h
@@ -126,7 +126,7 @@ struct publication *tipc_nametbl_insert_publ(struct net *net, u32 type,
struct publication *tipc_nametbl_remove_publ(struct net *net, u32 type,
u32 lower, u32 upper,
u32 node, u32 key);
-void tipc_nametbl_subscribe(struct tipc_subscription *s);
+bool tipc_nametbl_subscribe(struct tipc_subscription *s);
void tipc_nametbl_unsubscribe(struct tipc_subscription *s);
int tipc_nametbl_init(struct net *net);
void tipc_nametbl_stop(struct net *net);
diff --git a/net/tipc/net.c b/net/tipc/net.c
index 856f9e97ea29..4fbaa0464405 100644
--- a/net/tipc/net.c
+++ b/net/tipc/net.c
@@ -252,6 +252,8 @@ int __tipc_nl_net_set(struct sk_buff *skb, struct genl_info *info)
u64 *w0 = (u64 *)&node_id[0];
u64 *w1 = (u64 *)&node_id[8];
+ if (!attrs[TIPC_NLA_NET_NODEID_W1])
+ return -EINVAL;
*w0 = nla_get_u64(attrs[TIPC_NLA_NET_NODEID]);
*w1 = nla_get_u64(attrs[TIPC_NLA_NET_NODEID_W1]);
tipc_net_init(net, node_id, 0);
diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c
index b76f13f6fea1..6ff2254088f6 100644
--- a/net/tipc/netlink.c
+++ b/net/tipc/netlink.c
@@ -79,7 +79,10 @@ const struct nla_policy tipc_nl_sock_policy[TIPC_NLA_SOCK_MAX + 1] = {
const struct nla_policy tipc_nl_net_policy[TIPC_NLA_NET_MAX + 1] = {
[TIPC_NLA_NET_UNSPEC] = { .type = NLA_UNSPEC },
- [TIPC_NLA_NET_ID] = { .type = NLA_U32 }
+ [TIPC_NLA_NET_ID] = { .type = NLA_U32 },
+ [TIPC_NLA_NET_ADDR] = { .type = NLA_U32 },
+ [TIPC_NLA_NET_NODEID] = { .type = NLA_U64 },
+ [TIPC_NLA_NET_NODEID_W1] = { .type = NLA_U64 },
};
const struct nla_policy tipc_nl_link_policy[TIPC_NLA_LINK_MAX + 1] = {
diff --git a/net/tipc/node.c b/net/tipc/node.c
index c77dd2f3c589..b0b840084f0a 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -195,6 +195,27 @@ int tipc_node_get_mtu(struct net *net, u32 addr, u32 sel)
return mtu;
}
+bool tipc_node_get_id(struct net *net, u32 addr, u8 *id)
+{
+ u8 *own_id = tipc_own_id(net);
+ struct tipc_node *n;
+
+ if (!own_id)
+ return true;
+
+ if (addr == tipc_own_addr(net)) {
+ memcpy(id, own_id, TIPC_NODEID_LEN);
+ return true;
+ }
+ n = tipc_node_find(net, addr);
+ if (!n)
+ return false;
+
+ memcpy(id, &n->peer_id, TIPC_NODEID_LEN);
+ tipc_node_put(n);
+ return true;
+}
+
u16 tipc_node_get_capabilities(struct net *net, u32 addr)
{
struct tipc_node *n;
@@ -1681,7 +1702,8 @@ discard:
kfree_skb(skb);
}
-void tipc_node_apply_tolerance(struct net *net, struct tipc_bearer *b)
+void tipc_node_apply_property(struct net *net, struct tipc_bearer *b,
+ int prop)
{
struct tipc_net *tn = tipc_net(net);
int bearer_id = b->identity;
@@ -1696,8 +1718,13 @@ void tipc_node_apply_tolerance(struct net *net, struct tipc_bearer *b)
list_for_each_entry_rcu(n, &tn->node_list, list) {
tipc_node_write_lock(n);
e = &n->links[bearer_id];
- if (e->link)
- tipc_link_set_tolerance(e->link, b->tolerance, &xmitq);
+ if (e->link) {
+ if (prop == TIPC_NLA_PROP_TOL)
+ tipc_link_set_tolerance(e->link, b->tolerance,
+ &xmitq);
+ else if (prop == TIPC_NLA_PROP_MTU)
+ tipc_link_set_mtu(e->link, b->mtu);
+ }
tipc_node_write_unlock(n);
tipc_bearer_xmit(net, bearer_id, &xmitq, &e->maddr);
}
@@ -2232,8 +2259,8 @@ int tipc_nl_node_dump_monitor(struct sk_buff *skb, struct netlink_callback *cb)
struct net *net = sock_net(skb->sk);
u32 prev_bearer = cb->args[0];
struct tipc_nl_msg msg;
+ int bearer_id;
int err;
- int i;
if (prev_bearer == MAX_BEARERS)
return 0;
@@ -2243,16 +2270,13 @@ int tipc_nl_node_dump_monitor(struct sk_buff *skb, struct netlink_callback *cb)
msg.seq = cb->nlh->nlmsg_seq;
rtnl_lock();
- for (i = prev_bearer; i < MAX_BEARERS; i++) {
- prev_bearer = i;
- err = __tipc_nl_add_monitor(net, &msg, prev_bearer);
+ for (bearer_id = prev_bearer; bearer_id < MAX_BEARERS; bearer_id++) {
+ err = __tipc_nl_add_monitor(net, &msg, bearer_id);
if (err)
- goto out;
+ break;
}
-
-out:
rtnl_unlock();
- cb->args[0] = prev_bearer;
+ cb->args[0] = bearer_id;
return skb->len;
}
diff --git a/net/tipc/node.h b/net/tipc/node.h
index f24b83500df1..846c8f240872 100644
--- a/net/tipc/node.h
+++ b/net/tipc/node.h
@@ -60,6 +60,7 @@ enum {
#define INVALID_BEARER_ID -1
void tipc_node_stop(struct net *net);
+bool tipc_node_get_id(struct net *net, u32 addr, u8 *id);
u32 tipc_node_try_addr(struct net *net, u8 *id, u32 addr);
void tipc_node_check_dest(struct net *net, u32 onode, u8 *peer_id128,
struct tipc_bearer *bearer,
@@ -67,7 +68,7 @@ void tipc_node_check_dest(struct net *net, u32 onode, u8 *peer_id128,
struct tipc_media_addr *maddr,
bool *respond, bool *dupl_addr);
void tipc_node_delete_links(struct net *net, int bearer_id);
-void tipc_node_apply_tolerance(struct net *net, struct tipc_bearer *b);
+void tipc_node_apply_property(struct net *net, struct tipc_bearer *b, int prop);
int tipc_node_get_linkname(struct net *net, u32 bearer_id, u32 node,
char *linkname, size_t len);
int tipc_node_xmit(struct net *net, struct sk_buff_head *list, u32 dnode,
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 1fd1c8b5ce03..c4992002fd68 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -1278,7 +1278,7 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen)
struct tipc_msg *hdr = &tsk->phdr;
struct tipc_name_seq *seq;
struct sk_buff_head pkts;
- u32 dnode, dport;
+ u32 dport, dnode = 0;
u32 type, inst;
int mtu, rc;
@@ -1348,6 +1348,8 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen)
msg_set_destnode(hdr, dnode);
msg_set_destport(hdr, dest->addr.id.ref);
msg_set_hdr_sz(hdr, BASIC_H_SIZE);
+ } else {
+ return -EINVAL;
}
/* Block or return if destination link is congested */
@@ -2971,7 +2973,8 @@ static int tipc_getsockopt(struct socket *sock, int lvl, int opt,
static int tipc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
- struct sock *sk = sock->sk;
+ struct net *net = sock_net(sock->sk);
+ struct tipc_sioc_nodeid_req nr = {0};
struct tipc_sioc_ln_req lnr;
void __user *argp = (void __user *)arg;
@@ -2979,7 +2982,7 @@ static int tipc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
case SIOCGETLINKNAME:
if (copy_from_user(&lnr, argp, sizeof(lnr)))
return -EFAULT;
- if (!tipc_node_get_linkname(sock_net(sk),
+ if (!tipc_node_get_linkname(net,
lnr.bearer_id & 0xffff, lnr.peer,
lnr.linkname, TIPC_MAX_LINK_NAME)) {
if (copy_to_user(argp, &lnr, sizeof(lnr)))
@@ -2987,6 +2990,14 @@ static int tipc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
return 0;
}
return -EADDRNOTAVAIL;
+ case SIOCGETNODEID:
+ if (copy_from_user(&nr, argp, sizeof(nr)))
+ return -EFAULT;
+ if (!tipc_node_get_id(net, nr.peer, nr.node_id))
+ return -EADDRNOTAVAIL;
+ if (copy_to_user(argp, &nr, sizeof(nr)))
+ return -EFAULT;
+ return 0;
default:
return -ENOIOCTLCMD;
}
diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c
index b7d80bc5f4ab..f340e53da625 100644
--- a/net/tipc/subscr.c
+++ b/net/tipc/subscr.c
@@ -153,7 +153,10 @@ struct tipc_subscription *tipc_sub_subscribe(struct net *net,
memcpy(&sub->evt.s, s, sizeof(*s));
spin_lock_init(&sub->lock);
kref_init(&sub->kref);
- tipc_nametbl_subscribe(sub);
+ if (!tipc_nametbl_subscribe(sub)) {
+ kfree(sub);
+ return NULL;
+ }
timer_setup(&sub->timer, tipc_sub_timeout, 0);
timeout = tipc_sub_read(&sub->evt.s, timeout);
if (timeout != TIPC_WAIT_FOREVER)
diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c
index e7d91f5d5cae..9783101bc4a9 100644
--- a/net/tipc/udp_media.c
+++ b/net/tipc/udp_media.c
@@ -713,8 +713,7 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b,
err = -EINVAL;
goto err;
}
- b->mtu = dev->mtu - sizeof(struct iphdr)
- - sizeof(struct udphdr);
+ b->mtu = b->media->mtu;
#if IS_ENABLED(CONFIG_IPV6)
} else if (local.proto == htons(ETH_P_IPV6)) {
udp_conf.family = AF_INET6;
@@ -803,6 +802,7 @@ struct tipc_media udp_media_info = {
.priority = TIPC_DEF_LINK_PRI,
.tolerance = TIPC_DEF_LINK_TOL,
.window = TIPC_DEF_LINK_WIN,
+ .mtu = TIPC_DEF_LINK_UDP_MTU,
.type_id = TIPC_MEDIA_TYPE_UDP,
.hwaddr_len = 0,
.name = "udp"
diff --git a/net/tipc/udp_media.h b/net/tipc/udp_media.h
index 281bbae87726..e7455cc73e16 100644
--- a/net/tipc/udp_media.h
+++ b/net/tipc/udp_media.h
@@ -38,9 +38,23 @@
#ifndef _TIPC_UDP_MEDIA_H
#define _TIPC_UDP_MEDIA_H
+#include <linux/ip.h>
+#include <linux/udp.h>
+
int tipc_udp_nl_bearer_add(struct tipc_bearer *b, struct nlattr *attr);
int tipc_udp_nl_add_bearer_data(struct tipc_nl_msg *msg, struct tipc_bearer *b);
int tipc_udp_nl_dump_remoteip(struct sk_buff *skb, struct netlink_callback *cb);
+/* check if configured MTU is too low for tipc headers */
+static inline bool tipc_udp_mtu_bad(u32 mtu)
+{
+ if (mtu >= (TIPC_MIN_BEARER_MTU + sizeof(struct iphdr) +
+ sizeof(struct udphdr)))
+ return false;
+
+ pr_warn("MTU too low for tipc bearer\n");
+ return true;
+}
+
#endif
#endif
diff --git a/net/tls/Kconfig b/net/tls/Kconfig
index 89b8745a986f..73f05ece53d0 100644
--- a/net/tls/Kconfig
+++ b/net/tls/Kconfig
@@ -14,3 +14,13 @@ config TLS
encryption handling of the TLS protocol to be done in-kernel.
If unsure, say N.
+
+config TLS_DEVICE
+ bool "Transport Layer Security HW offload"
+ depends on TLS
+ select SOCK_VALIDATE_XMIT
+ default n
+ help
+ Enable kernel support for HW offload of the TLS protocol.
+
+ If unsure, say N.
diff --git a/net/tls/Makefile b/net/tls/Makefile
index a930fd1c4f7b..4d6b728a67d0 100644
--- a/net/tls/Makefile
+++ b/net/tls/Makefile
@@ -5,3 +5,5 @@
obj-$(CONFIG_TLS) += tls.o
tls-y := tls_main.o tls_sw.o
+
+tls-$(CONFIG_TLS_DEVICE) += tls_device.o tls_device_fallback.o
diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
new file mode 100644
index 000000000000..ac45d6224e41
--- /dev/null
+++ b/net/tls/tls_device.c
@@ -0,0 +1,764 @@
+/* Copyright (c) 2018, Mellanox Technologies All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <crypto/aead.h>
+#include <linux/highmem.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <net/dst.h>
+#include <net/inet_connection_sock.h>
+#include <net/tcp.h>
+#include <net/tls.h>
+
+/* device_offload_lock is used to synchronize tls_dev_add
+ * against NETDEV_DOWN notifications.
+ */
+static DECLARE_RWSEM(device_offload_lock);
+
+static void tls_device_gc_task(struct work_struct *work);
+
+static DECLARE_WORK(tls_device_gc_work, tls_device_gc_task);
+static LIST_HEAD(tls_device_gc_list);
+static LIST_HEAD(tls_device_list);
+static DEFINE_SPINLOCK(tls_device_lock);
+
+static void tls_device_free_ctx(struct tls_context *ctx)
+{
+ struct tls_offload_context *offload_ctx = tls_offload_ctx(ctx);
+
+ kfree(offload_ctx);
+ kfree(ctx);
+}
+
+static void tls_device_gc_task(struct work_struct *work)
+{
+ struct tls_context *ctx, *tmp;
+ unsigned long flags;
+ LIST_HEAD(gc_list);
+
+ spin_lock_irqsave(&tls_device_lock, flags);
+ list_splice_init(&tls_device_gc_list, &gc_list);
+ spin_unlock_irqrestore(&tls_device_lock, flags);
+
+ list_for_each_entry_safe(ctx, tmp, &gc_list, list) {
+ struct net_device *netdev = ctx->netdev;
+
+ if (netdev) {
+ netdev->tlsdev_ops->tls_dev_del(netdev, ctx,
+ TLS_OFFLOAD_CTX_DIR_TX);
+ dev_put(netdev);
+ }
+
+ list_del(&ctx->list);
+ tls_device_free_ctx(ctx);
+ }
+}
+
+static void tls_device_queue_ctx_destruction(struct tls_context *ctx)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&tls_device_lock, flags);
+ list_move_tail(&ctx->list, &tls_device_gc_list);
+
+ /* schedule_work inside the spinlock
+ * to make sure tls_device_down waits for that work.
+ */
+ schedule_work(&tls_device_gc_work);
+
+ spin_unlock_irqrestore(&tls_device_lock, flags);
+}
+
+/* We assume that the socket is already connected */
+static struct net_device *get_netdev_for_sock(struct sock *sk)
+{
+ struct dst_entry *dst = sk_dst_get(sk);
+ struct net_device *netdev = NULL;
+
+ if (likely(dst)) {
+ netdev = dst->dev;
+ dev_hold(netdev);
+ }
+
+ dst_release(dst);
+
+ return netdev;
+}
+
+static void destroy_record(struct tls_record_info *record)
+{
+ int nr_frags = record->num_frags;
+ skb_frag_t *frag;
+
+ while (nr_frags-- > 0) {
+ frag = &record->frags[nr_frags];
+ __skb_frag_unref(frag);
+ }
+ kfree(record);
+}
+
+static void delete_all_records(struct tls_offload_context *offload_ctx)
+{
+ struct tls_record_info *info, *temp;
+
+ list_for_each_entry_safe(info, temp, &offload_ctx->records_list, list) {
+ list_del(&info->list);
+ destroy_record(info);
+ }
+
+ offload_ctx->retransmit_hint = NULL;
+}
+
+static void tls_icsk_clean_acked(struct sock *sk, u32 acked_seq)
+{
+ struct tls_context *tls_ctx = tls_get_ctx(sk);
+ struct tls_record_info *info, *temp;
+ struct tls_offload_context *ctx;
+ u64 deleted_records = 0;
+ unsigned long flags;
+
+ if (!tls_ctx)
+ return;
+
+ ctx = tls_offload_ctx(tls_ctx);
+
+ spin_lock_irqsave(&ctx->lock, flags);
+ info = ctx->retransmit_hint;
+ if (info && !before(acked_seq, info->end_seq)) {
+ ctx->retransmit_hint = NULL;
+ list_del(&info->list);
+ destroy_record(info);
+ deleted_records++;
+ }
+
+ list_for_each_entry_safe(info, temp, &ctx->records_list, list) {
+ if (before(acked_seq, info->end_seq))
+ break;
+ list_del(&info->list);
+
+ destroy_record(info);
+ deleted_records++;
+ }
+
+ ctx->unacked_record_sn += deleted_records;
+ spin_unlock_irqrestore(&ctx->lock, flags);
+}
+
+/* At this point, there should be no references on this
+ * socket and no in-flight SKBs associated with this
+ * socket, so it is safe to free all the resources.
+ */
+void tls_device_sk_destruct(struct sock *sk)
+{
+ struct tls_context *tls_ctx = tls_get_ctx(sk);
+ struct tls_offload_context *ctx = tls_offload_ctx(tls_ctx);
+
+ if (ctx->open_record)
+ destroy_record(ctx->open_record);
+
+ delete_all_records(ctx);
+ crypto_free_aead(ctx->aead_send);
+ ctx->sk_destruct(sk);
+ clean_acked_data_disable(inet_csk(sk));
+
+ if (refcount_dec_and_test(&tls_ctx->refcount))
+ tls_device_queue_ctx_destruction(tls_ctx);
+}
+EXPORT_SYMBOL(tls_device_sk_destruct);
+
+static void tls_append_frag(struct tls_record_info *record,
+ struct page_frag *pfrag,
+ int size)
+{
+ skb_frag_t *frag;
+
+ frag = &record->frags[record->num_frags - 1];
+ if (frag->page.p == pfrag->page &&
+ frag->page_offset + frag->size == pfrag->offset) {
+ frag->size += size;
+ } else {
+ ++frag;
+ frag->page.p = pfrag->page;
+ frag->page_offset = pfrag->offset;
+ frag->size = size;
+ ++record->num_frags;
+ get_page(pfrag->page);
+ }
+
+ pfrag->offset += size;
+ record->len += size;
+}
+
+static int tls_push_record(struct sock *sk,
+ struct tls_context *ctx,
+ struct tls_offload_context *offload_ctx,
+ struct tls_record_info *record,
+ struct page_frag *pfrag,
+ int flags,
+ unsigned char record_type)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct page_frag dummy_tag_frag;
+ skb_frag_t *frag;
+ int i;
+
+ /* fill prepend */
+ frag = &record->frags[0];
+ tls_fill_prepend(ctx,
+ skb_frag_address(frag),
+ record->len - ctx->tx.prepend_size,
+ record_type);
+
+ /* HW doesn't care about the data in the tag, because it fills it. */
+ dummy_tag_frag.page = skb_frag_page(frag);
+ dummy_tag_frag.offset = 0;
+
+ tls_append_frag(record, &dummy_tag_frag, ctx->tx.tag_size);
+ record->end_seq = tp->write_seq + record->len;
+ spin_lock_irq(&offload_ctx->lock);
+ list_add_tail(&record->list, &offload_ctx->records_list);
+ spin_unlock_irq(&offload_ctx->lock);
+ offload_ctx->open_record = NULL;
+ set_bit(TLS_PENDING_CLOSED_RECORD, &ctx->flags);
+ tls_advance_record_sn(sk, &ctx->tx);
+
+ for (i = 0; i < record->num_frags; i++) {
+ frag = &record->frags[i];
+ sg_unmark_end(&offload_ctx->sg_tx_data[i]);
+ sg_set_page(&offload_ctx->sg_tx_data[i], skb_frag_page(frag),
+ frag->size, frag->page_offset);
+ sk_mem_charge(sk, frag->size);
+ get_page(skb_frag_page(frag));
+ }
+ sg_mark_end(&offload_ctx->sg_tx_data[record->num_frags - 1]);
+
+ /* all ready, send */
+ return tls_push_sg(sk, ctx, offload_ctx->sg_tx_data, 0, flags);
+}
+
+static int tls_create_new_record(struct tls_offload_context *offload_ctx,
+ struct page_frag *pfrag,
+ size_t prepend_size)
+{
+ struct tls_record_info *record;
+ skb_frag_t *frag;
+
+ record = kmalloc(sizeof(*record), GFP_KERNEL);
+ if (!record)
+ return -ENOMEM;
+
+ frag = &record->frags[0];
+ __skb_frag_set_page(frag, pfrag->page);
+ frag->page_offset = pfrag->offset;
+ skb_frag_size_set(frag, prepend_size);
+
+ get_page(pfrag->page);
+ pfrag->offset += prepend_size;
+
+ record->num_frags = 1;
+ record->len = prepend_size;
+ offload_ctx->open_record = record;
+ return 0;
+}
+
+static int tls_do_allocation(struct sock *sk,
+ struct tls_offload_context *offload_ctx,
+ struct page_frag *pfrag,
+ size_t prepend_size)
+{
+ int ret;
+
+ if (!offload_ctx->open_record) {
+ if (unlikely(!skb_page_frag_refill(prepend_size, pfrag,
+ sk->sk_allocation))) {
+ sk->sk_prot->enter_memory_pressure(sk);
+ sk_stream_moderate_sndbuf(sk);
+ return -ENOMEM;
+ }
+
+ ret = tls_create_new_record(offload_ctx, pfrag, prepend_size);
+ if (ret)
+ return ret;
+
+ if (pfrag->size > pfrag->offset)
+ return 0;
+ }
+
+ if (!sk_page_frag_refill(sk, pfrag))
+ return -ENOMEM;
+
+ return 0;
+}
+
+static int tls_push_data(struct sock *sk,
+ struct iov_iter *msg_iter,
+ size_t size, int flags,
+ unsigned char record_type)
+{
+ struct tls_context *tls_ctx = tls_get_ctx(sk);
+ struct tls_offload_context *ctx = tls_offload_ctx(tls_ctx);
+ int tls_push_record_flags = flags | MSG_SENDPAGE_NOTLAST;
+ int more = flags & (MSG_SENDPAGE_NOTLAST | MSG_MORE);
+ struct tls_record_info *record = ctx->open_record;
+ struct page_frag *pfrag;
+ size_t orig_size = size;
+ u32 max_open_record_len;
+ int copy, rc = 0;
+ bool done = false;
+ long timeo;
+
+ if (flags &
+ ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL | MSG_SENDPAGE_NOTLAST))
+ return -ENOTSUPP;
+
+ if (sk->sk_err)
+ return -sk->sk_err;
+
+ timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
+ rc = tls_complete_pending_work(sk, tls_ctx, flags, &timeo);
+ if (rc < 0)
+ return rc;
+
+ pfrag = sk_page_frag(sk);
+
+ /* TLS_HEADER_SIZE is not counted as part of the TLS record, and
+ * we need to leave room for an authentication tag.
+ */
+ max_open_record_len = TLS_MAX_PAYLOAD_SIZE +
+ tls_ctx->tx.prepend_size;
+ do {
+ rc = tls_do_allocation(sk, ctx, pfrag,
+ tls_ctx->tx.prepend_size);
+ if (rc) {
+ rc = sk_stream_wait_memory(sk, &timeo);
+ if (!rc)
+ continue;
+
+ record = ctx->open_record;
+ if (!record)
+ break;
+handle_error:
+ if (record_type != TLS_RECORD_TYPE_DATA) {
+ /* avoid sending partial
+ * record with type !=
+ * application_data
+ */
+ size = orig_size;
+ destroy_record(record);
+ ctx->open_record = NULL;
+ } else if (record->len > tls_ctx->tx.prepend_size) {
+ goto last_record;
+ }
+
+ break;
+ }
+
+ record = ctx->open_record;
+ copy = min_t(size_t, size, (pfrag->size - pfrag->offset));
+ copy = min_t(size_t, copy, (max_open_record_len - record->len));
+
+ if (copy_from_iter_nocache(page_address(pfrag->page) +
+ pfrag->offset,
+ copy, msg_iter) != copy) {
+ rc = -EFAULT;
+ goto handle_error;
+ }
+ tls_append_frag(record, pfrag, copy);
+
+ size -= copy;
+ if (!size) {
+last_record:
+ tls_push_record_flags = flags;
+ if (more) {
+ tls_ctx->pending_open_record_frags =
+ record->num_frags;
+ break;
+ }
+
+ done = true;
+ }
+
+ if (done || record->len >= max_open_record_len ||
+ (record->num_frags >= MAX_SKB_FRAGS - 1)) {
+ rc = tls_push_record(sk,
+ tls_ctx,
+ ctx,
+ record,
+ pfrag,
+ tls_push_record_flags,
+ record_type);
+ if (rc < 0)
+ break;
+ }
+ } while (!done);
+
+ if (orig_size - size > 0)
+ rc = orig_size - size;
+
+ return rc;
+}
+
+int tls_device_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
+{
+ unsigned char record_type = TLS_RECORD_TYPE_DATA;
+ int rc;
+
+ lock_sock(sk);
+
+ if (unlikely(msg->msg_controllen)) {
+ rc = tls_proccess_cmsg(sk, msg, &record_type);
+ if (rc)
+ goto out;
+ }
+
+ rc = tls_push_data(sk, &msg->msg_iter, size,
+ msg->msg_flags, record_type);
+
+out:
+ release_sock(sk);
+ return rc;
+}
+
+int tls_device_sendpage(struct sock *sk, struct page *page,
+ int offset, size_t size, int flags)
+{
+ struct iov_iter msg_iter;
+ char *kaddr = kmap(page);
+ struct kvec iov;
+ int rc;
+
+ if (flags & MSG_SENDPAGE_NOTLAST)
+ flags |= MSG_MORE;
+
+ lock_sock(sk);
+
+ if (flags & MSG_OOB) {
+ rc = -ENOTSUPP;
+ goto out;
+ }
+
+ iov.iov_base = kaddr + offset;
+ iov.iov_len = size;
+ iov_iter_kvec(&msg_iter, WRITE | ITER_KVEC, &iov, 1, size);
+ rc = tls_push_data(sk, &msg_iter, size,
+ flags, TLS_RECORD_TYPE_DATA);
+ kunmap(page);
+
+out:
+ release_sock(sk);
+ return rc;
+}
+
+struct tls_record_info *tls_get_record(struct tls_offload_context *context,
+ u32 seq, u64 *p_record_sn)
+{
+ u64 record_sn = context->hint_record_sn;
+ struct tls_record_info *info;
+
+ info = context->retransmit_hint;
+ if (!info ||
+ before(seq, info->end_seq - info->len)) {
+ /* if retransmit_hint is irrelevant start
+ * from the beggining of the list
+ */
+ info = list_first_entry(&context->records_list,
+ struct tls_record_info, list);
+ record_sn = context->unacked_record_sn;
+ }
+
+ list_for_each_entry_from(info, &context->records_list, list) {
+ if (before(seq, info->end_seq)) {
+ if (!context->retransmit_hint ||
+ after(info->end_seq,
+ context->retransmit_hint->end_seq)) {
+ context->hint_record_sn = record_sn;
+ context->retransmit_hint = info;
+ }
+ *p_record_sn = record_sn;
+ return info;
+ }
+ record_sn++;
+ }
+
+ return NULL;
+}
+EXPORT_SYMBOL(tls_get_record);
+
+static int tls_device_push_pending_record(struct sock *sk, int flags)
+{
+ struct iov_iter msg_iter;
+
+ iov_iter_kvec(&msg_iter, WRITE | ITER_KVEC, NULL, 0, 0);
+ return tls_push_data(sk, &msg_iter, 0, flags, TLS_RECORD_TYPE_DATA);
+}
+
+int tls_set_device_offload(struct sock *sk, struct tls_context *ctx)
+{
+ u16 nonce_size, tag_size, iv_size, rec_seq_size;
+ struct tls_record_info *start_marker_record;
+ struct tls_offload_context *offload_ctx;
+ struct tls_crypto_info *crypto_info;
+ struct net_device *netdev;
+ char *iv, *rec_seq;
+ struct sk_buff *skb;
+ int rc = -EINVAL;
+ __be64 rcd_sn;
+
+ if (!ctx)
+ goto out;
+
+ if (ctx->priv_ctx_tx) {
+ rc = -EEXIST;
+ goto out;
+ }
+
+ start_marker_record = kmalloc(sizeof(*start_marker_record), GFP_KERNEL);
+ if (!start_marker_record) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ offload_ctx = kzalloc(TLS_OFFLOAD_CONTEXT_SIZE, GFP_KERNEL);
+ if (!offload_ctx) {
+ rc = -ENOMEM;
+ goto free_marker_record;
+ }
+
+ crypto_info = &ctx->crypto_send;
+ switch (crypto_info->cipher_type) {
+ case TLS_CIPHER_AES_GCM_128:
+ nonce_size = TLS_CIPHER_AES_GCM_128_IV_SIZE;
+ tag_size = TLS_CIPHER_AES_GCM_128_TAG_SIZE;
+ iv_size = TLS_CIPHER_AES_GCM_128_IV_SIZE;
+ iv = ((struct tls12_crypto_info_aes_gcm_128 *)crypto_info)->iv;
+ rec_seq_size = TLS_CIPHER_AES_GCM_128_REC_SEQ_SIZE;
+ rec_seq =
+ ((struct tls12_crypto_info_aes_gcm_128 *)crypto_info)->rec_seq;
+ break;
+ default:
+ rc = -EINVAL;
+ goto free_offload_ctx;
+ }
+
+ ctx->tx.prepend_size = TLS_HEADER_SIZE + nonce_size;
+ ctx->tx.tag_size = tag_size;
+ ctx->tx.overhead_size = ctx->tx.prepend_size + ctx->tx.tag_size;
+ ctx->tx.iv_size = iv_size;
+ ctx->tx.iv = kmalloc(iv_size + TLS_CIPHER_AES_GCM_128_SALT_SIZE,
+ GFP_KERNEL);
+ if (!ctx->tx.iv) {
+ rc = -ENOMEM;
+ goto free_offload_ctx;
+ }
+
+ memcpy(ctx->tx.iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, iv, iv_size);
+
+ ctx->tx.rec_seq_size = rec_seq_size;
+ ctx->tx.rec_seq = kmalloc(rec_seq_size, GFP_KERNEL);
+ if (!ctx->tx.rec_seq) {
+ rc = -ENOMEM;
+ goto free_iv;
+ }
+ memcpy(ctx->tx.rec_seq, rec_seq, rec_seq_size);
+
+ rc = tls_sw_fallback_init(sk, offload_ctx, crypto_info);
+ if (rc)
+ goto free_rec_seq;
+
+ /* start at rec_seq - 1 to account for the start marker record */
+ memcpy(&rcd_sn, ctx->tx.rec_seq, sizeof(rcd_sn));
+ offload_ctx->unacked_record_sn = be64_to_cpu(rcd_sn) - 1;
+
+ start_marker_record->end_seq = tcp_sk(sk)->write_seq;
+ start_marker_record->len = 0;
+ start_marker_record->num_frags = 0;
+
+ INIT_LIST_HEAD(&offload_ctx->records_list);
+ list_add_tail(&start_marker_record->list, &offload_ctx->records_list);
+ spin_lock_init(&offload_ctx->lock);
+
+ clean_acked_data_enable(inet_csk(sk), &tls_icsk_clean_acked);
+ ctx->push_pending_record = tls_device_push_pending_record;
+ offload_ctx->sk_destruct = sk->sk_destruct;
+
+ /* TLS offload is greatly simplified if we don't send
+ * SKBs where only part of the payload needs to be encrypted.
+ * So mark the last skb in the write queue as end of record.
+ */
+ skb = tcp_write_queue_tail(sk);
+ if (skb)
+ TCP_SKB_CB(skb)->eor = 1;
+
+ refcount_set(&ctx->refcount, 1);
+
+ /* We support starting offload on multiple sockets
+ * concurrently, so we only need a read lock here.
+ * This lock must precede get_netdev_for_sock to prevent races between
+ * NETDEV_DOWN and setsockopt.
+ */
+ down_read(&device_offload_lock);
+ netdev = get_netdev_for_sock(sk);
+ if (!netdev) {
+ pr_err_ratelimited("%s: netdev not found\n", __func__);
+ rc = -EINVAL;
+ goto release_lock;
+ }
+
+ if (!(netdev->features & NETIF_F_HW_TLS_TX)) {
+ rc = -ENOTSUPP;
+ goto release_netdev;
+ }
+
+ /* Avoid offloading if the device is down
+ * We don't want to offload new flows after
+ * the NETDEV_DOWN event
+ */
+ if (!(netdev->flags & IFF_UP)) {
+ rc = -EINVAL;
+ goto release_netdev;
+ }
+
+ ctx->priv_ctx_tx = offload_ctx;
+ rc = netdev->tlsdev_ops->tls_dev_add(netdev, sk, TLS_OFFLOAD_CTX_DIR_TX,
+ &ctx->crypto_send,
+ tcp_sk(sk)->write_seq);
+ if (rc)
+ goto release_netdev;
+
+ ctx->netdev = netdev;
+
+ spin_lock_irq(&tls_device_lock);
+ list_add_tail(&ctx->list, &tls_device_list);
+ spin_unlock_irq(&tls_device_lock);
+
+ sk->sk_validate_xmit_skb = tls_validate_xmit_skb;
+ /* following this assignment tls_is_sk_tx_device_offloaded
+ * will return true and the context might be accessed
+ * by the netdev's xmit function.
+ */
+ smp_store_release(&sk->sk_destruct,
+ &tls_device_sk_destruct);
+ up_read(&device_offload_lock);
+ goto out;
+
+release_netdev:
+ dev_put(netdev);
+release_lock:
+ up_read(&device_offload_lock);
+ clean_acked_data_disable(inet_csk(sk));
+ crypto_free_aead(offload_ctx->aead_send);
+free_rec_seq:
+ kfree(ctx->tx.rec_seq);
+free_iv:
+ kfree(ctx->tx.iv);
+free_offload_ctx:
+ kfree(offload_ctx);
+ ctx->priv_ctx_tx = NULL;
+free_marker_record:
+ kfree(start_marker_record);
+out:
+ return rc;
+}
+
+static int tls_device_down(struct net_device *netdev)
+{
+ struct tls_context *ctx, *tmp;
+ unsigned long flags;
+ LIST_HEAD(list);
+
+ /* Request a write lock to block new offload attempts */
+ down_write(&device_offload_lock);
+
+ spin_lock_irqsave(&tls_device_lock, flags);
+ list_for_each_entry_safe(ctx, tmp, &tls_device_list, list) {
+ if (ctx->netdev != netdev ||
+ !refcount_inc_not_zero(&ctx->refcount))
+ continue;
+
+ list_move(&ctx->list, &list);
+ }
+ spin_unlock_irqrestore(&tls_device_lock, flags);
+
+ list_for_each_entry_safe(ctx, tmp, &list, list) {
+ netdev->tlsdev_ops->tls_dev_del(netdev, ctx,
+ TLS_OFFLOAD_CTX_DIR_TX);
+ ctx->netdev = NULL;
+ dev_put(netdev);
+ list_del_init(&ctx->list);
+
+ if (refcount_dec_and_test(&ctx->refcount))
+ tls_device_free_ctx(ctx);
+ }
+
+ up_write(&device_offload_lock);
+
+ flush_work(&tls_device_gc_work);
+
+ return NOTIFY_DONE;
+}
+
+static int tls_dev_event(struct notifier_block *this, unsigned long event,
+ void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+
+ if (!(dev->features & NETIF_F_HW_TLS_TX))
+ return NOTIFY_DONE;
+
+ switch (event) {
+ case NETDEV_REGISTER:
+ case NETDEV_FEAT_CHANGE:
+ if (dev->tlsdev_ops &&
+ dev->tlsdev_ops->tls_dev_add &&
+ dev->tlsdev_ops->tls_dev_del)
+ return NOTIFY_DONE;
+ else
+ return NOTIFY_BAD;
+ case NETDEV_DOWN:
+ return tls_device_down(dev);
+ }
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block tls_dev_notifier = {
+ .notifier_call = tls_dev_event,
+};
+
+void __init tls_device_init(void)
+{
+ register_netdevice_notifier(&tls_dev_notifier);
+}
+
+void __exit tls_device_cleanup(void)
+{
+ unregister_netdevice_notifier(&tls_dev_notifier);
+ flush_work(&tls_device_gc_work);
+}
diff --git a/net/tls/tls_device_fallback.c b/net/tls/tls_device_fallback.c
new file mode 100644
index 000000000000..748914abdb60
--- /dev/null
+++ b/net/tls/tls_device_fallback.c
@@ -0,0 +1,450 @@
+/* Copyright (c) 2018, Mellanox Technologies All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <net/tls.h>
+#include <crypto/aead.h>
+#include <crypto/scatterwalk.h>
+#include <net/ip6_checksum.h>
+
+static void chain_to_walk(struct scatterlist *sg, struct scatter_walk *walk)
+{
+ struct scatterlist *src = walk->sg;
+ int diff = walk->offset - src->offset;
+
+ sg_set_page(sg, sg_page(src),
+ src->length - diff, walk->offset);
+
+ scatterwalk_crypto_chain(sg, sg_next(src), 0, 2);
+}
+
+static int tls_enc_record(struct aead_request *aead_req,
+ struct crypto_aead *aead, char *aad,
+ char *iv, __be64 rcd_sn,
+ struct scatter_walk *in,
+ struct scatter_walk *out, int *in_len)
+{
+ unsigned char buf[TLS_HEADER_SIZE + TLS_CIPHER_AES_GCM_128_IV_SIZE];
+ struct scatterlist sg_in[3];
+ struct scatterlist sg_out[3];
+ u16 len;
+ int rc;
+
+ len = min_t(int, *in_len, ARRAY_SIZE(buf));
+
+ scatterwalk_copychunks(buf, in, len, 0);
+ scatterwalk_copychunks(buf, out, len, 1);
+
+ *in_len -= len;
+ if (!*in_len)
+ return 0;
+
+ scatterwalk_pagedone(in, 0, 1);
+ scatterwalk_pagedone(out, 1, 1);
+
+ len = buf[4] | (buf[3] << 8);
+ len -= TLS_CIPHER_AES_GCM_128_IV_SIZE;
+
+ tls_make_aad(aad, len - TLS_CIPHER_AES_GCM_128_TAG_SIZE,
+ (char *)&rcd_sn, sizeof(rcd_sn), buf[0]);
+
+ memcpy(iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, buf + TLS_HEADER_SIZE,
+ TLS_CIPHER_AES_GCM_128_IV_SIZE);
+
+ sg_init_table(sg_in, ARRAY_SIZE(sg_in));
+ sg_init_table(sg_out, ARRAY_SIZE(sg_out));
+ sg_set_buf(sg_in, aad, TLS_AAD_SPACE_SIZE);
+ sg_set_buf(sg_out, aad, TLS_AAD_SPACE_SIZE);
+ chain_to_walk(sg_in + 1, in);
+ chain_to_walk(sg_out + 1, out);
+
+ *in_len -= len;
+ if (*in_len < 0) {
+ *in_len += TLS_CIPHER_AES_GCM_128_TAG_SIZE;
+ /* the input buffer doesn't contain the entire record.
+ * trim len accordingly. The resulting authentication tag
+ * will contain garbage, but we don't care, so we won't
+ * include any of it in the output skb
+ * Note that we assume the output buffer length
+ * is larger then input buffer length + tag size
+ */
+ if (*in_len < 0)
+ len += *in_len;
+
+ *in_len = 0;
+ }
+
+ if (*in_len) {
+ scatterwalk_copychunks(NULL, in, len, 2);
+ scatterwalk_pagedone(in, 0, 1);
+ scatterwalk_copychunks(NULL, out, len, 2);
+ scatterwalk_pagedone(out, 1, 1);
+ }
+
+ len -= TLS_CIPHER_AES_GCM_128_TAG_SIZE;
+ aead_request_set_crypt(aead_req, sg_in, sg_out, len, iv);
+
+ rc = crypto_aead_encrypt(aead_req);
+
+ return rc;
+}
+
+static void tls_init_aead_request(struct aead_request *aead_req,
+ struct crypto_aead *aead)
+{
+ aead_request_set_tfm(aead_req, aead);
+ aead_request_set_ad(aead_req, TLS_AAD_SPACE_SIZE);
+}
+
+static struct aead_request *tls_alloc_aead_request(struct crypto_aead *aead,
+ gfp_t flags)
+{
+ unsigned int req_size = sizeof(struct aead_request) +
+ crypto_aead_reqsize(aead);
+ struct aead_request *aead_req;
+
+ aead_req = kzalloc(req_size, flags);
+ if (aead_req)
+ tls_init_aead_request(aead_req, aead);
+ return aead_req;
+}
+
+static int tls_enc_records(struct aead_request *aead_req,
+ struct crypto_aead *aead, struct scatterlist *sg_in,
+ struct scatterlist *sg_out, char *aad, char *iv,
+ u64 rcd_sn, int len)
+{
+ struct scatter_walk out, in;
+ int rc;
+
+ scatterwalk_start(&in, sg_in);
+ scatterwalk_start(&out, sg_out);
+
+ do {
+ rc = tls_enc_record(aead_req, aead, aad, iv,
+ cpu_to_be64(rcd_sn), &in, &out, &len);
+ rcd_sn++;
+
+ } while (rc == 0 && len);
+
+ scatterwalk_done(&in, 0, 0);
+ scatterwalk_done(&out, 1, 0);
+
+ return rc;
+}
+
+/* Can't use icsk->icsk_af_ops->send_check here because the ip addresses
+ * might have been changed by NAT.
+ */
+static void update_chksum(struct sk_buff *skb, int headln)
+{
+ struct tcphdr *th = tcp_hdr(skb);
+ int datalen = skb->len - headln;
+ const struct ipv6hdr *ipv6h;
+ const struct iphdr *iph;
+
+ /* We only changed the payload so if we are using partial we don't
+ * need to update anything.
+ */
+ if (likely(skb->ip_summed == CHECKSUM_PARTIAL))
+ return;
+
+ skb->ip_summed = CHECKSUM_PARTIAL;
+ skb->csum_start = skb_transport_header(skb) - skb->head;
+ skb->csum_offset = offsetof(struct tcphdr, check);
+
+ if (skb->sk->sk_family == AF_INET6) {
+ ipv6h = ipv6_hdr(skb);
+ th->check = ~csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr,
+ datalen, IPPROTO_TCP, 0);
+ } else {
+ iph = ip_hdr(skb);
+ th->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, datalen,
+ IPPROTO_TCP, 0);
+ }
+}
+
+static void complete_skb(struct sk_buff *nskb, struct sk_buff *skb, int headln)
+{
+ skb_copy_header(nskb, skb);
+
+ skb_put(nskb, skb->len);
+ memcpy(nskb->data, skb->data, headln);
+ update_chksum(nskb, headln);
+
+ nskb->destructor = skb->destructor;
+ nskb->sk = skb->sk;
+ skb->destructor = NULL;
+ skb->sk = NULL;
+ refcount_add(nskb->truesize - skb->truesize,
+ &nskb->sk->sk_wmem_alloc);
+}
+
+/* This function may be called after the user socket is already
+ * closed so make sure we don't use anything freed during
+ * tls_sk_proto_close here
+ */
+
+static int fill_sg_in(struct scatterlist *sg_in,
+ struct sk_buff *skb,
+ struct tls_offload_context *ctx,
+ u64 *rcd_sn,
+ s32 *sync_size,
+ int *resync_sgs)
+{
+ int tcp_payload_offset = skb_transport_offset(skb) + tcp_hdrlen(skb);
+ int payload_len = skb->len - tcp_payload_offset;
+ u32 tcp_seq = ntohl(tcp_hdr(skb)->seq);
+ struct tls_record_info *record;
+ unsigned long flags;
+ int remaining;
+ int i;
+
+ spin_lock_irqsave(&ctx->lock, flags);
+ record = tls_get_record(ctx, tcp_seq, rcd_sn);
+ if (!record) {
+ spin_unlock_irqrestore(&ctx->lock, flags);
+ WARN(1, "Record not found for seq %u\n", tcp_seq);
+ return -EINVAL;
+ }
+
+ *sync_size = tcp_seq - tls_record_start_seq(record);
+ if (*sync_size < 0) {
+ int is_start_marker = tls_record_is_start_marker(record);
+
+ spin_unlock_irqrestore(&ctx->lock, flags);
+ /* This should only occur if the relevant record was
+ * already acked. In that case it should be ok
+ * to drop the packet and avoid retransmission.
+ *
+ * There is a corner case where the packet contains
+ * both an acked and a non-acked record.
+ * We currently don't handle that case and rely
+ * on TCP to retranmit a packet that doesn't contain
+ * already acked payload.
+ */
+ if (!is_start_marker)
+ *sync_size = 0;
+ return -EINVAL;
+ }
+
+ remaining = *sync_size;
+ for (i = 0; remaining > 0; i++) {
+ skb_frag_t *frag = &record->frags[i];
+
+ __skb_frag_ref(frag);
+ sg_set_page(sg_in + i, skb_frag_page(frag),
+ skb_frag_size(frag), frag->page_offset);
+
+ remaining -= skb_frag_size(frag);
+
+ if (remaining < 0)
+ sg_in[i].length += remaining;
+ }
+ *resync_sgs = i;
+
+ spin_unlock_irqrestore(&ctx->lock, flags);
+ if (skb_to_sgvec(skb, &sg_in[i], tcp_payload_offset, payload_len) < 0)
+ return -EINVAL;
+
+ return 0;
+}
+
+static void fill_sg_out(struct scatterlist sg_out[3], void *buf,
+ struct tls_context *tls_ctx,
+ struct sk_buff *nskb,
+ int tcp_payload_offset,
+ int payload_len,
+ int sync_size,
+ void *dummy_buf)
+{
+ sg_set_buf(&sg_out[0], dummy_buf, sync_size);
+ sg_set_buf(&sg_out[1], nskb->data + tcp_payload_offset, payload_len);
+ /* Add room for authentication tag produced by crypto */
+ dummy_buf += sync_size;
+ sg_set_buf(&sg_out[2], dummy_buf, TLS_CIPHER_AES_GCM_128_TAG_SIZE);
+}
+
+static struct sk_buff *tls_enc_skb(struct tls_context *tls_ctx,
+ struct scatterlist sg_out[3],
+ struct scatterlist *sg_in,
+ struct sk_buff *skb,
+ s32 sync_size, u64 rcd_sn)
+{
+ int tcp_payload_offset = skb_transport_offset(skb) + tcp_hdrlen(skb);
+ struct tls_offload_context *ctx = tls_offload_ctx(tls_ctx);
+ int payload_len = skb->len - tcp_payload_offset;
+ void *buf, *iv, *aad, *dummy_buf;
+ struct aead_request *aead_req;
+ struct sk_buff *nskb = NULL;
+ int buf_len;
+
+ aead_req = tls_alloc_aead_request(ctx->aead_send, GFP_ATOMIC);
+ if (!aead_req)
+ return NULL;
+
+ buf_len = TLS_CIPHER_AES_GCM_128_SALT_SIZE +
+ TLS_CIPHER_AES_GCM_128_IV_SIZE +
+ TLS_AAD_SPACE_SIZE +
+ sync_size +
+ TLS_CIPHER_AES_GCM_128_TAG_SIZE;
+ buf = kmalloc(buf_len, GFP_ATOMIC);
+ if (!buf)
+ goto free_req;
+
+ iv = buf;
+ memcpy(iv, tls_ctx->crypto_send_aes_gcm_128.salt,
+ TLS_CIPHER_AES_GCM_128_SALT_SIZE);
+ aad = buf + TLS_CIPHER_AES_GCM_128_SALT_SIZE +
+ TLS_CIPHER_AES_GCM_128_IV_SIZE;
+ dummy_buf = aad + TLS_AAD_SPACE_SIZE;
+
+ nskb = alloc_skb(skb_headroom(skb) + skb->len, GFP_ATOMIC);
+ if (!nskb)
+ goto free_buf;
+
+ skb_reserve(nskb, skb_headroom(skb));
+
+ fill_sg_out(sg_out, buf, tls_ctx, nskb, tcp_payload_offset,
+ payload_len, sync_size, dummy_buf);
+
+ if (tls_enc_records(aead_req, ctx->aead_send, sg_in, sg_out, aad, iv,
+ rcd_sn, sync_size + payload_len) < 0)
+ goto free_nskb;
+
+ complete_skb(nskb, skb, tcp_payload_offset);
+
+ /* validate_xmit_skb_list assumes that if the skb wasn't segmented
+ * nskb->prev will point to the skb itself
+ */
+ nskb->prev = nskb;
+
+free_buf:
+ kfree(buf);
+free_req:
+ kfree(aead_req);
+ return nskb;
+free_nskb:
+ kfree_skb(nskb);
+ nskb = NULL;
+ goto free_buf;
+}
+
+static struct sk_buff *tls_sw_fallback(struct sock *sk, struct sk_buff *skb)
+{
+ int tcp_payload_offset = skb_transport_offset(skb) + tcp_hdrlen(skb);
+ struct tls_context *tls_ctx = tls_get_ctx(sk);
+ struct tls_offload_context *ctx = tls_offload_ctx(tls_ctx);
+ int payload_len = skb->len - tcp_payload_offset;
+ struct scatterlist *sg_in, sg_out[3];
+ struct sk_buff *nskb = NULL;
+ int sg_in_max_elements;
+ int resync_sgs = 0;
+ s32 sync_size = 0;
+ u64 rcd_sn;
+
+ /* worst case is:
+ * MAX_SKB_FRAGS in tls_record_info
+ * MAX_SKB_FRAGS + 1 in SKB head and frags.
+ */
+ sg_in_max_elements = 2 * MAX_SKB_FRAGS + 1;
+
+ if (!payload_len)
+ return skb;
+
+ sg_in = kmalloc_array(sg_in_max_elements, sizeof(*sg_in), GFP_ATOMIC);
+ if (!sg_in)
+ goto free_orig;
+
+ sg_init_table(sg_in, sg_in_max_elements);
+ sg_init_table(sg_out, ARRAY_SIZE(sg_out));
+
+ if (fill_sg_in(sg_in, skb, ctx, &rcd_sn, &sync_size, &resync_sgs)) {
+ /* bypass packets before kernel TLS socket option was set */
+ if (sync_size < 0 && payload_len <= -sync_size)
+ nskb = skb_get(skb);
+ goto put_sg;
+ }
+
+ nskb = tls_enc_skb(tls_ctx, sg_out, sg_in, skb, sync_size, rcd_sn);
+
+put_sg:
+ while (resync_sgs)
+ put_page(sg_page(&sg_in[--resync_sgs]));
+ kfree(sg_in);
+free_orig:
+ kfree_skb(skb);
+ return nskb;
+}
+
+struct sk_buff *tls_validate_xmit_skb(struct sock *sk,
+ struct net_device *dev,
+ struct sk_buff *skb)
+{
+ if (dev == tls_get_ctx(sk)->netdev)
+ return skb;
+
+ return tls_sw_fallback(sk, skb);
+}
+
+int tls_sw_fallback_init(struct sock *sk,
+ struct tls_offload_context *offload_ctx,
+ struct tls_crypto_info *crypto_info)
+{
+ const u8 *key;
+ int rc;
+
+ offload_ctx->aead_send =
+ crypto_alloc_aead("gcm(aes)", 0, CRYPTO_ALG_ASYNC);
+ if (IS_ERR(offload_ctx->aead_send)) {
+ rc = PTR_ERR(offload_ctx->aead_send);
+ pr_err_ratelimited("crypto_alloc_aead failed rc=%d\n", rc);
+ offload_ctx->aead_send = NULL;
+ goto err_out;
+ }
+
+ key = ((struct tls12_crypto_info_aes_gcm_128 *)crypto_info)->key;
+
+ rc = crypto_aead_setkey(offload_ctx->aead_send, key,
+ TLS_CIPHER_AES_GCM_128_KEY_SIZE);
+ if (rc)
+ goto free_aead;
+
+ rc = crypto_aead_setauthsize(offload_ctx->aead_send,
+ TLS_CIPHER_AES_GCM_128_TAG_SIZE);
+ if (rc)
+ goto free_aead;
+
+ return 0;
+free_aead:
+ crypto_free_aead(offload_ctx->aead_send);
+err_out:
+ return rc;
+}
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index 0d379970960e..4b57ddd72f34 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -51,12 +51,12 @@ enum {
TLSV6,
TLS_NUM_PROTS,
};
-
enum {
TLS_BASE,
- TLS_SW_TX,
- TLS_SW_RX,
- TLS_SW_RXTX,
+ TLS_SW,
+#ifdef CONFIG_TLS_DEVICE
+ TLS_HW,
+#endif
TLS_HW_RECORD,
TLS_NUM_CONFIG,
};
@@ -65,14 +65,14 @@ static struct proto *saved_tcpv6_prot;
static DEFINE_MUTEX(tcpv6_prot_mutex);
static LIST_HEAD(device_list);
static DEFINE_MUTEX(device_mutex);
-static struct proto tls_prots[TLS_NUM_PROTS][TLS_NUM_CONFIG];
+static struct proto tls_prots[TLS_NUM_PROTS][TLS_NUM_CONFIG][TLS_NUM_CONFIG];
static struct proto_ops tls_sw_proto_ops;
-static inline void update_sk_prot(struct sock *sk, struct tls_context *ctx)
+static void update_sk_prot(struct sock *sk, struct tls_context *ctx)
{
int ip_ver = sk->sk_family == AF_INET6 ? TLSV6 : TLSV4;
- sk->sk_prot = &tls_prots[ip_ver][ctx->conf];
+ sk->sk_prot = &tls_prots[ip_ver][ctx->tx_conf][ctx->rx_conf];
}
int wait_on_pending_writer(struct sock *sk, long *timeo)
@@ -114,6 +114,7 @@ int tls_push_sg(struct sock *sk,
size = sg->length - offset;
offset += sg->offset;
+ ctx->in_tcp_sendpages = true;
while (1) {
if (sg_is_last(sg))
sendpage_flags = flags;
@@ -148,6 +149,8 @@ retry:
}
clear_bit(TLS_PENDING_CLOSED_RECORD, &ctx->flags);
+ ctx->in_tcp_sendpages = false;
+ ctx->sk_write_space(sk);
return 0;
}
@@ -217,6 +220,10 @@ static void tls_write_space(struct sock *sk)
{
struct tls_context *ctx = tls_get_ctx(sk);
+ /* We are already sending pages, ignore notification */
+ if (ctx->in_tcp_sendpages)
+ return;
+
if (!sk->sk_write_pending && tls_is_pending_closed_record(ctx)) {
gfp_t sk_allocation = sk->sk_allocation;
int rc;
@@ -245,10 +252,10 @@ static void tls_sk_proto_close(struct sock *sk, long timeout)
lock_sock(sk);
sk_proto_close = ctx->sk_proto_close;
- if (ctx->conf == TLS_HW_RECORD)
+ if (ctx->tx_conf == TLS_HW_RECORD && ctx->rx_conf == TLS_HW_RECORD)
goto skip_tx_cleanup;
- if (ctx->conf == TLS_BASE) {
+ if (ctx->tx_conf == TLS_BASE && ctx->rx_conf == TLS_BASE) {
kfree(ctx);
ctx = NULL;
goto skip_tx_cleanup;
@@ -270,15 +277,26 @@ static void tls_sk_proto_close(struct sock *sk, long timeout)
}
}
- kfree(ctx->tx.rec_seq);
- kfree(ctx->tx.iv);
- kfree(ctx->rx.rec_seq);
- kfree(ctx->rx.iv);
+ /* We need these for tls_sw_fallback handling of other packets */
+ if (ctx->tx_conf == TLS_SW) {
+ kfree(ctx->tx.rec_seq);
+ kfree(ctx->tx.iv);
+ tls_sw_free_resources_tx(sk);
+ }
+
+ if (ctx->rx_conf == TLS_SW) {
+ kfree(ctx->rx.rec_seq);
+ kfree(ctx->rx.iv);
+ tls_sw_free_resources_rx(sk);
+ }
- if (ctx->conf == TLS_SW_TX ||
- ctx->conf == TLS_SW_RX ||
- ctx->conf == TLS_SW_RXTX) {
- tls_sw_free_resources(sk);
+#ifdef CONFIG_TLS_DEVICE
+ if (ctx->tx_conf != TLS_HW) {
+#else
+ {
+#endif
+ kfree(ctx);
+ ctx = NULL;
}
skip_tx_cleanup:
@@ -287,7 +305,8 @@ skip_tx_cleanup:
/* free ctx for TLS_HW_RECORD, used by tcp_set_state
* for sk->sk_prot->unhash [tls_hw_unhash]
*/
- if (ctx && ctx->conf == TLS_HW_RECORD)
+ if (ctx && ctx->tx_conf == TLS_HW_RECORD &&
+ ctx->rx_conf == TLS_HW_RECORD)
kfree(ctx);
}
@@ -441,25 +460,29 @@ static int do_tls_setsockopt_conf(struct sock *sk, char __user *optval,
goto err_crypto_info;
}
- /* currently SW is default, we will have ethtool in future */
if (tx) {
- rc = tls_set_sw_offload(sk, ctx, 1);
- if (ctx->conf == TLS_SW_RX)
- conf = TLS_SW_RXTX;
- else
- conf = TLS_SW_TX;
+#ifdef CONFIG_TLS_DEVICE
+ rc = tls_set_device_offload(sk, ctx);
+ conf = TLS_HW;
+ if (rc) {
+#else
+ {
+#endif
+ rc = tls_set_sw_offload(sk, ctx, 1);
+ conf = TLS_SW;
+ }
} else {
rc = tls_set_sw_offload(sk, ctx, 0);
- if (ctx->conf == TLS_SW_TX)
- conf = TLS_SW_RXTX;
- else
- conf = TLS_SW_RX;
+ conf = TLS_SW;
}
if (rc)
goto err_crypto_info;
- ctx->conf = conf;
+ if (tx)
+ ctx->tx_conf = conf;
+ else
+ ctx->rx_conf = conf;
update_sk_prot(sk, ctx);
if (tx) {
ctx->sk_write_space = sk->sk_write_space;
@@ -535,7 +558,8 @@ static int tls_hw_prot(struct sock *sk)
ctx->hash = sk->sk_prot->hash;
ctx->unhash = sk->sk_prot->unhash;
ctx->sk_proto_close = sk->sk_prot->close;
- ctx->conf = TLS_HW_RECORD;
+ ctx->rx_conf = TLS_HW_RECORD;
+ ctx->tx_conf = TLS_HW_RECORD;
update_sk_prot(sk, ctx);
rc = 1;
break;
@@ -579,29 +603,40 @@ static int tls_hw_hash(struct sock *sk)
return err;
}
-static void build_protos(struct proto *prot, struct proto *base)
+static void build_protos(struct proto prot[TLS_NUM_CONFIG][TLS_NUM_CONFIG],
+ struct proto *base)
{
- prot[TLS_BASE] = *base;
- prot[TLS_BASE].setsockopt = tls_setsockopt;
- prot[TLS_BASE].getsockopt = tls_getsockopt;
- prot[TLS_BASE].close = tls_sk_proto_close;
-
- prot[TLS_SW_TX] = prot[TLS_BASE];
- prot[TLS_SW_TX].sendmsg = tls_sw_sendmsg;
- prot[TLS_SW_TX].sendpage = tls_sw_sendpage;
-
- prot[TLS_SW_RX] = prot[TLS_BASE];
- prot[TLS_SW_RX].recvmsg = tls_sw_recvmsg;
- prot[TLS_SW_RX].close = tls_sk_proto_close;
-
- prot[TLS_SW_RXTX] = prot[TLS_SW_TX];
- prot[TLS_SW_RXTX].recvmsg = tls_sw_recvmsg;
- prot[TLS_SW_RXTX].close = tls_sk_proto_close;
-
- prot[TLS_HW_RECORD] = *base;
- prot[TLS_HW_RECORD].hash = tls_hw_hash;
- prot[TLS_HW_RECORD].unhash = tls_hw_unhash;
- prot[TLS_HW_RECORD].close = tls_sk_proto_close;
+ prot[TLS_BASE][TLS_BASE] = *base;
+ prot[TLS_BASE][TLS_BASE].setsockopt = tls_setsockopt;
+ prot[TLS_BASE][TLS_BASE].getsockopt = tls_getsockopt;
+ prot[TLS_BASE][TLS_BASE].close = tls_sk_proto_close;
+
+ prot[TLS_SW][TLS_BASE] = prot[TLS_BASE][TLS_BASE];
+ prot[TLS_SW][TLS_BASE].sendmsg = tls_sw_sendmsg;
+ prot[TLS_SW][TLS_BASE].sendpage = tls_sw_sendpage;
+
+ prot[TLS_BASE][TLS_SW] = prot[TLS_BASE][TLS_BASE];
+ prot[TLS_BASE][TLS_SW].recvmsg = tls_sw_recvmsg;
+ prot[TLS_BASE][TLS_SW].close = tls_sk_proto_close;
+
+ prot[TLS_SW][TLS_SW] = prot[TLS_SW][TLS_BASE];
+ prot[TLS_SW][TLS_SW].recvmsg = tls_sw_recvmsg;
+ prot[TLS_SW][TLS_SW].close = tls_sk_proto_close;
+
+#ifdef CONFIG_TLS_DEVICE
+ prot[TLS_HW][TLS_BASE] = prot[TLS_BASE][TLS_BASE];
+ prot[TLS_HW][TLS_BASE].sendmsg = tls_device_sendmsg;
+ prot[TLS_HW][TLS_BASE].sendpage = tls_device_sendpage;
+
+ prot[TLS_HW][TLS_SW] = prot[TLS_BASE][TLS_SW];
+ prot[TLS_HW][TLS_SW].sendmsg = tls_device_sendmsg;
+ prot[TLS_HW][TLS_SW].sendpage = tls_device_sendpage;
+#endif
+
+ prot[TLS_HW_RECORD][TLS_HW_RECORD] = *base;
+ prot[TLS_HW_RECORD][TLS_HW_RECORD].hash = tls_hw_hash;
+ prot[TLS_HW_RECORD][TLS_HW_RECORD].unhash = tls_hw_unhash;
+ prot[TLS_HW_RECORD][TLS_HW_RECORD].close = tls_sk_proto_close;
}
static int tls_init(struct sock *sk)
@@ -632,7 +667,7 @@ static int tls_init(struct sock *sk)
ctx->getsockopt = sk->sk_prot->getsockopt;
ctx->sk_proto_close = sk->sk_prot->close;
- /* Build IPv6 TLS whenever the address of tcpv6_prot changes */
+ /* Build IPv6 TLS whenever the address of tcpv6 _prot changes */
if (ip_ver == TLSV6 &&
unlikely(sk->sk_prot != smp_load_acquire(&saved_tcpv6_prot))) {
mutex_lock(&tcpv6_prot_mutex);
@@ -643,7 +678,8 @@ static int tls_init(struct sock *sk)
mutex_unlock(&tcpv6_prot_mutex);
}
- ctx->conf = TLS_BASE;
+ ctx->tx_conf = TLS_BASE;
+ ctx->rx_conf = TLS_BASE;
update_sk_prot(sk, ctx);
out:
return rc;
@@ -681,6 +717,9 @@ static int __init tls_register(void)
tls_sw_proto_ops.poll = tls_sw_poll;
tls_sw_proto_ops.splice_read = tls_sw_splice_read;
+#ifdef CONFIG_TLS_DEVICE
+ tls_device_init();
+#endif
tcp_register_ulp(&tcp_tls_ulp_ops);
return 0;
@@ -689,6 +728,9 @@ static int __init tls_register(void)
static void __exit tls_unregister(void)
{
tcp_unregister_ulp(&tcp_tls_ulp_ops);
+#ifdef CONFIG_TLS_DEVICE
+ tls_device_cleanup();
+#endif
}
module_init(tls_register);
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index 4dc766b03f00..5c3909c311f1 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -41,6 +41,8 @@
#include <net/strparser.h>
#include <net/tls.h>
+#define MAX_IV_SIZE TLS_CIPHER_AES_GCM_128_IV_SIZE
+
static int tls_do_decryption(struct sock *sk,
struct scatterlist *sgin,
struct scatterlist *sgout,
@@ -50,7 +52,7 @@ static int tls_do_decryption(struct sock *sk,
gfp_t flags)
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
- struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
+ struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
struct strp_msg *rxm = strp_msg(skb);
struct aead_request *aead_req;
@@ -120,7 +122,7 @@ out:
static void trim_both_sgl(struct sock *sk, int target_size)
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
- struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
+ struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
trim_sg(sk, ctx->sg_plaintext_data,
&ctx->sg_plaintext_num_elem,
@@ -139,7 +141,7 @@ static void trim_both_sgl(struct sock *sk, int target_size)
static int alloc_encrypted_sg(struct sock *sk, int len)
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
- struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
+ struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
int rc = 0;
rc = sk_alloc_sg(sk, len,
@@ -153,7 +155,7 @@ static int alloc_encrypted_sg(struct sock *sk, int len)
static int alloc_plaintext_sg(struct sock *sk, int len)
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
- struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
+ struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
int rc = 0;
rc = sk_alloc_sg(sk, len, ctx->sg_plaintext_data, 0,
@@ -179,7 +181,7 @@ static void free_sg(struct sock *sk, struct scatterlist *sg,
static void tls_free_both_sg(struct sock *sk)
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
- struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
+ struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
free_sg(sk, ctx->sg_encrypted_data, &ctx->sg_encrypted_num_elem,
&ctx->sg_encrypted_size);
@@ -189,7 +191,7 @@ static void tls_free_both_sg(struct sock *sk)
}
static int tls_do_encryption(struct tls_context *tls_ctx,
- struct tls_sw_context *ctx, size_t data_len,
+ struct tls_sw_context_tx *ctx, size_t data_len,
gfp_t flags)
{
unsigned int req_size = sizeof(struct aead_request) +
@@ -225,7 +227,7 @@ static int tls_push_record(struct sock *sk, int flags,
unsigned char record_type)
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
- struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
+ struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
int rc;
sg_mark_end(ctx->sg_plaintext_data + ctx->sg_plaintext_num_elem - 1);
@@ -337,7 +339,7 @@ static int memcopy_from_iter(struct sock *sk, struct iov_iter *from,
int bytes)
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
- struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
+ struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
struct scatterlist *sg = ctx->sg_plaintext_data;
int copy, i, rc = 0;
@@ -365,7 +367,7 @@ out:
int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
- struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
+ struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
int ret = 0;
int required_size;
long timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
@@ -520,7 +522,7 @@ int tls_sw_sendpage(struct sock *sk, struct page *page,
int offset, size_t size, int flags)
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
- struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
+ struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
int ret = 0;
long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
bool eor;
@@ -634,7 +636,7 @@ static struct sk_buff *tls_wait_data(struct sock *sk, int flags,
long timeo, int *err)
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
- struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
+ struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
struct sk_buff *skb;
DEFINE_WAIT_FUNC(wait, woken_wake_function);
@@ -672,8 +674,8 @@ static int decrypt_skb(struct sock *sk, struct sk_buff *skb,
struct scatterlist *sgout)
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
- struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
- char iv[TLS_CIPHER_AES_GCM_128_SALT_SIZE + tls_ctx->rx.iv_size];
+ struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
+ char iv[TLS_CIPHER_AES_GCM_128_SALT_SIZE + MAX_IV_SIZE];
struct scatterlist sgin_arr[MAX_SKB_FRAGS + 2];
struct scatterlist *sgin = &sgin_arr[0];
struct strp_msg *rxm = strp_msg(skb);
@@ -691,8 +693,7 @@ static int decrypt_skb(struct sock *sk, struct sk_buff *skb,
if (!sgout) {
nsg = skb_cow_data(skb, 0, &unused) + 1;
sgin = kmalloc_array(nsg, sizeof(*sgin), sk->sk_allocation);
- if (!sgout)
- sgout = sgin;
+ sgout = sgin;
}
sg_init_table(sgin, nsg);
@@ -722,7 +723,7 @@ static bool tls_sw_advance_skb(struct sock *sk, struct sk_buff *skb,
unsigned int len)
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
- struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
+ struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
struct strp_msg *rxm = strp_msg(skb);
if (len < rxm->full_len) {
@@ -748,7 +749,7 @@ int tls_sw_recvmsg(struct sock *sk,
int *addr_len)
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
- struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
+ struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
unsigned char control;
struct strp_msg *rxm;
struct sk_buff *skb;
@@ -868,7 +869,7 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos,
size_t len, unsigned int flags)
{
struct tls_context *tls_ctx = tls_get_ctx(sock->sk);
- struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
+ struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
struct strp_msg *rxm = NULL;
struct sock *sk = sock->sk;
struct sk_buff *skb;
@@ -921,7 +922,7 @@ unsigned int tls_sw_poll(struct file *file, struct socket *sock,
unsigned int ret;
struct sock *sk = sock->sk;
struct tls_context *tls_ctx = tls_get_ctx(sk);
- struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
+ struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
/* Grab POLLOUT and POLLHUP from the underlying socket */
ret = ctx->sk_poll(file, sock, wait);
@@ -937,7 +938,7 @@ unsigned int tls_sw_poll(struct file *file, struct socket *sock,
static int tls_read_size(struct strparser *strp, struct sk_buff *skb)
{
struct tls_context *tls_ctx = tls_get_ctx(strp->sk);
- struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
+ struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
char header[tls_ctx->rx.prepend_size];
struct strp_msg *rxm = strp_msg(skb);
size_t cipher_overhead;
@@ -986,7 +987,7 @@ read_failure:
static void tls_queue(struct strparser *strp, struct sk_buff *skb)
{
struct tls_context *tls_ctx = tls_get_ctx(strp->sk);
- struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
+ struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
struct strp_msg *rxm;
rxm = strp_msg(skb);
@@ -1002,18 +1003,28 @@ static void tls_queue(struct strparser *strp, struct sk_buff *skb)
static void tls_data_ready(struct sock *sk)
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
- struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
+ struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
strp_data_ready(&ctx->strp);
}
-void tls_sw_free_resources(struct sock *sk)
+void tls_sw_free_resources_tx(struct sock *sk)
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
- struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
+ struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
if (ctx->aead_send)
crypto_free_aead(ctx->aead_send);
+ tls_free_both_sg(sk);
+
+ kfree(ctx);
+}
+
+void tls_sw_free_resources_rx(struct sock *sk)
+{
+ struct tls_context *tls_ctx = tls_get_ctx(sk);
+ struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
+
if (ctx->aead_recv) {
if (ctx->recv_pkt) {
kfree_skb(ctx->recv_pkt);
@@ -1029,10 +1040,7 @@ void tls_sw_free_resources(struct sock *sk)
lock_sock(sk);
}
- tls_free_both_sg(sk);
-
kfree(ctx);
- kfree(tls_ctx);
}
int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
@@ -1040,7 +1048,8 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
char keyval[TLS_CIPHER_AES_GCM_128_KEY_SIZE];
struct tls_crypto_info *crypto_info;
struct tls12_crypto_info_aes_gcm_128 *gcm_128_info;
- struct tls_sw_context *sw_ctx;
+ struct tls_sw_context_tx *sw_ctx_tx = NULL;
+ struct tls_sw_context_rx *sw_ctx_rx = NULL;
struct cipher_context *cctx;
struct crypto_aead **aead;
struct strp_callbacks cb;
@@ -1053,27 +1062,32 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
goto out;
}
- if (!ctx->priv_ctx) {
- sw_ctx = kzalloc(sizeof(*sw_ctx), GFP_KERNEL);
- if (!sw_ctx) {
+ if (tx) {
+ sw_ctx_tx = kzalloc(sizeof(*sw_ctx_tx), GFP_KERNEL);
+ if (!sw_ctx_tx) {
rc = -ENOMEM;
goto out;
}
- crypto_init_wait(&sw_ctx->async_wait);
+ crypto_init_wait(&sw_ctx_tx->async_wait);
+ ctx->priv_ctx_tx = sw_ctx_tx;
} else {
- sw_ctx = ctx->priv_ctx;
+ sw_ctx_rx = kzalloc(sizeof(*sw_ctx_rx), GFP_KERNEL);
+ if (!sw_ctx_rx) {
+ rc = -ENOMEM;
+ goto out;
+ }
+ crypto_init_wait(&sw_ctx_rx->async_wait);
+ ctx->priv_ctx_rx = sw_ctx_rx;
}
- ctx->priv_ctx = (struct tls_offload_context *)sw_ctx;
-
if (tx) {
crypto_info = &ctx->crypto_send;
cctx = &ctx->tx;
- aead = &sw_ctx->aead_send;
+ aead = &sw_ctx_tx->aead_send;
} else {
crypto_info = &ctx->crypto_recv;
cctx = &ctx->rx;
- aead = &sw_ctx->aead_recv;
+ aead = &sw_ctx_rx->aead_recv;
}
switch (crypto_info->cipher_type) {
@@ -1094,6 +1108,12 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
goto free_priv;
}
+ /* Sanity-check the IV size for stack allocations. */
+ if (iv_size > MAX_IV_SIZE) {
+ rc = -EINVAL;
+ goto free_priv;
+ }
+
cctx->prepend_size = TLS_HEADER_SIZE + nonce_size;
cctx->tag_size = tag_size;
cctx->overhead_size = cctx->prepend_size + cctx->tag_size;
@@ -1114,22 +1134,24 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
}
memcpy(cctx->rec_seq, rec_seq, rec_seq_size);
- if (tx) {
- sg_init_table(sw_ctx->sg_encrypted_data,
- ARRAY_SIZE(sw_ctx->sg_encrypted_data));
- sg_init_table(sw_ctx->sg_plaintext_data,
- ARRAY_SIZE(sw_ctx->sg_plaintext_data));
-
- sg_init_table(sw_ctx->sg_aead_in, 2);
- sg_set_buf(&sw_ctx->sg_aead_in[0], sw_ctx->aad_space,
- sizeof(sw_ctx->aad_space));
- sg_unmark_end(&sw_ctx->sg_aead_in[1]);
- sg_chain(sw_ctx->sg_aead_in, 2, sw_ctx->sg_plaintext_data);
- sg_init_table(sw_ctx->sg_aead_out, 2);
- sg_set_buf(&sw_ctx->sg_aead_out[0], sw_ctx->aad_space,
- sizeof(sw_ctx->aad_space));
- sg_unmark_end(&sw_ctx->sg_aead_out[1]);
- sg_chain(sw_ctx->sg_aead_out, 2, sw_ctx->sg_encrypted_data);
+ if (sw_ctx_tx) {
+ sg_init_table(sw_ctx_tx->sg_encrypted_data,
+ ARRAY_SIZE(sw_ctx_tx->sg_encrypted_data));
+ sg_init_table(sw_ctx_tx->sg_plaintext_data,
+ ARRAY_SIZE(sw_ctx_tx->sg_plaintext_data));
+
+ sg_init_table(sw_ctx_tx->sg_aead_in, 2);
+ sg_set_buf(&sw_ctx_tx->sg_aead_in[0], sw_ctx_tx->aad_space,
+ sizeof(sw_ctx_tx->aad_space));
+ sg_unmark_end(&sw_ctx_tx->sg_aead_in[1]);
+ sg_chain(sw_ctx_tx->sg_aead_in, 2,
+ sw_ctx_tx->sg_plaintext_data);
+ sg_init_table(sw_ctx_tx->sg_aead_out, 2);
+ sg_set_buf(&sw_ctx_tx->sg_aead_out[0], sw_ctx_tx->aad_space,
+ sizeof(sw_ctx_tx->aad_space));
+ sg_unmark_end(&sw_ctx_tx->sg_aead_out[1]);
+ sg_chain(sw_ctx_tx->sg_aead_out, 2,
+ sw_ctx_tx->sg_encrypted_data);
}
if (!*aead) {
@@ -1154,22 +1176,22 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
if (rc)
goto free_aead;
- if (!tx) {
+ if (sw_ctx_rx) {
/* Set up strparser */
memset(&cb, 0, sizeof(cb));
cb.rcv_msg = tls_queue;
cb.parse_msg = tls_read_size;
- strp_init(&sw_ctx->strp, sk, &cb);
+ strp_init(&sw_ctx_rx->strp, sk, &cb);
write_lock_bh(&sk->sk_callback_lock);
- sw_ctx->saved_data_ready = sk->sk_data_ready;
+ sw_ctx_rx->saved_data_ready = sk->sk_data_ready;
sk->sk_data_ready = tls_data_ready;
write_unlock_bh(&sk->sk_callback_lock);
- sw_ctx->sk_poll = sk->sk_socket->ops->poll;
+ sw_ctx_rx->sk_poll = sk->sk_socket->ops->poll;
- strp_check_rcv(&sw_ctx->strp);
+ strp_check_rcv(&sw_ctx_rx->strp);
}
goto out;
@@ -1181,11 +1203,16 @@ free_rec_seq:
kfree(cctx->rec_seq);
cctx->rec_seq = NULL;
free_iv:
- kfree(ctx->tx.iv);
- ctx->tx.iv = NULL;
+ kfree(cctx->iv);
+ cctx->iv = NULL;
free_priv:
- kfree(ctx->priv_ctx);
- ctx->priv_ctx = NULL;
+ if (tx) {
+ kfree(ctx->priv_ctx_tx);
+ ctx->priv_ctx_tx = NULL;
+ } else {
+ kfree(ctx->priv_ctx_rx);
+ ctx->priv_ctx_rx = NULL;
+ }
out:
return rc;
}
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index aac9b8f6552e..c1076c19b858 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -2018,7 +2018,13 @@ const struct vsock_transport *vsock_core_get_transport(void)
}
EXPORT_SYMBOL_GPL(vsock_core_get_transport);
+static void __exit vsock_exit(void)
+{
+ /* Do nothing. This function makes this module removable. */
+}
+
module_init(vsock_init_tables);
+module_exit(vsock_exit);
MODULE_AUTHOR("VMware, Inc.");
MODULE_DESCRIPTION("VMware Virtual Socket Family");