Diffstat (limited to 'net')
-rw-r--r--net/9p/client.c11
-rw-r--r--net/atm/lec.c9
-rw-r--r--net/bridge/br_if.c4
-rw-r--r--net/bridge/netfilter/ebtables.c11
-rw-r--r--net/caif/chnl_net.c2
-rw-r--r--net/ceph/Makefile1
-rw-r--r--net/ceph/ceph_common.c8
-rw-r--r--net/ceph/crypto.c6
-rw-r--r--net/ceph/debugfs.c17
-rw-r--r--net/ceph/messenger.c195
-rw-r--r--net/ceph/mon_client.c16
-rw-r--r--net/ceph/osd_client.c67
-rw-r--r--net/ceph/osdmap.c71
-rw-r--r--net/ceph/striper.c261
-rw-r--r--net/compat.c6
-rw-r--r--net/core/dev.c2
-rw-r--r--net/core/dev_addr_lists.c2
-rw-r--r--net/core/ethtool.c5
-rw-r--r--net/core/filter.c1
-rw-r--r--net/core/neighbour.c40
-rw-r--r--net/core/sysctl_net_core.c1
-rw-r--r--net/core/utils.c23
-rw-r--r--net/dccp/ccids/ccid2.c14
-rw-r--r--net/dccp/timer.c2
-rw-r--r--net/dns_resolver/dns_key.c12
-rw-r--r--net/ife/ife.c38
-rw-r--r--net/ipv4/inetpeer.c1
-rw-r--r--net/ipv4/ip_gre.c6
-rw-r--r--net/ipv4/ip_output.c8
-rw-r--r--net/ipv4/netfilter/Makefile5
-rw-r--r--net/ipv4/netfilter/nf_nat_snmp_basic_main.c2
-rw-r--r--net/ipv4/route.c119
-rw-r--r--net/ipv4/tcp.c15
-rw-r--r--net/ipv4/tcp_bbr.c4
-rw-r--r--net/ipv4/tcp_input.c7
-rw-r--r--net/ipv6/netfilter/Kconfig55
-rw-r--r--net/ipv6/route.c9
-rw-r--r--net/ipv6/seg6_iptunnel.c2
-rw-r--r--net/l2tp/l2tp_core.c265
-rw-r--r--net/l2tp/l2tp_core.h7
-rw-r--r--net/l2tp/l2tp_debugfs.c18
-rw-r--r--net/l2tp/l2tp_netlink.c33
-rw-r--r--net/l2tp/l2tp_ppp.c43
-rw-r--r--net/llc/af_llc.c14
-rw-r--r--net/llc/llc_c_ac.c9
-rw-r--r--net/llc/llc_conn.c22
-rw-r--r--net/netfilter/Kconfig1
-rw-r--r--net/netfilter/ipvs/ip_vs_ctl.c8
-rw-r--r--net/netfilter/ipvs/ip_vs_sync.c155
-rw-r--r--net/netfilter/nf_conntrack_expect.c5
-rw-r--r--net/netfilter/nf_conntrack_extend.c2
-rw-r--r--net/netfilter/nf_conntrack_sip.c16
-rw-r--r--net/netfilter/nf_tables_api.c69
-rw-r--r--net/netfilter/xt_connmark.c49
-rw-r--r--net/netlabel/netlabel_unlabeled.c10
-rw-r--r--net/netlink/af_netlink.c6
-rw-r--r--net/nsh/nsh.c4
-rw-r--r--net/openvswitch/flow_netlink.c9
-rw-r--r--net/packet/af_packet.c83
-rw-r--r--net/packet/internal.h10
-rw-r--r--net/qrtr/qrtr.c1
-rw-r--r--net/rds/ib_cm.c3
-rw-r--r--net/rds/recv.c1
-rw-r--r--net/rds/send.c15
-rw-r--r--net/sched/act_ife.c9
-rw-r--r--net/sched/sch_fq.c37
-rw-r--r--net/sctp/associola.c30
-rw-r--r--net/sctp/chunk.c10
-rw-r--r--net/sctp/inqueue.c2
-rw-r--r--net/sctp/ipv6.c106
-rw-r--r--net/sctp/output.c34
-rw-r--r--net/sctp/protocol.c43
-rw-r--r--net/sctp/sm_make_chunk.c12
-rw-r--r--net/sctp/sm_statefuns.c112
-rw-r--r--net/sctp/socket.c70
-rw-r--r--net/sctp/stream.c2
-rw-r--r--net/smc/af_smc.c71
-rw-r--r--net/smc/smc_core.c22
-rw-r--r--net/smc/smc_core.h3
-rw-r--r--net/socket.c2
-rw-r--r--net/strparser/strparser.c9
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_crypto.c3
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_seal.c5
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_unseal.c4
-rw-r--r--net/sunrpc/cache.c32
-rw-r--r--net/sunrpc/clnt.c8
-rw-r--r--net/sunrpc/rpc_pipe.c1
-rw-r--r--net/sunrpc/sched.c10
-rw-r--r--net/sunrpc/stats.c16
-rw-r--r--net/sunrpc/sunrpc.h6
-rw-r--r--net/sunrpc/svc.c118
-rw-r--r--net/sunrpc/svc_xprt.c34
-rw-r--r--net/sunrpc/svcsock.c8
-rw-r--r--net/sunrpc/xdr.c82
-rw-r--r--net/sunrpc/xprt.c34
-rw-r--r--net/sunrpc/xprtrdma/backchannel.c7
-rw-r--r--net/sunrpc/xprtrdma/fmr_ops.c13
-rw-r--r--net/sunrpc/xprtrdma/frwr_ops.c53
-rw-r--r--net/sunrpc/xprtrdma/rpc_rdma.c32
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma.c4
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_recvfrom.c33
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_transport.c38
-rw-r--r--net/sunrpc/xprtrdma/transport.c43
-rw-r--r--net/sunrpc/xprtrdma/verbs.c44
-rw-r--r--net/sunrpc/xprtrdma/xprt_rdma.h4
-rw-r--r--net/sunrpc/xprtsock.c4
-rw-r--r--net/tipc/monitor.c2
-rw-r--r--net/tipc/name_table.c34
-rw-r--r--net/tipc/name_table.h2
-rw-r--r--net/tipc/net.c2
-rw-r--r--net/tipc/netlink.c5
-rw-r--r--net/tipc/node.c13
-rw-r--r--net/tipc/socket.c4
-rw-r--r--net/tipc/subscr.c5
-rw-r--r--net/tls/tls_main.c19
-rw-r--r--net/tls/tls_sw.c10
-rw-r--r--net/vmw_vsock/af_vsock.c6
117 files changed, 2133 insertions, 1095 deletions
diff --git a/net/9p/client.c b/net/9p/client.c
index b433aff5ff13..21e6df1cc70f 100644
--- a/net/9p/client.c
+++ b/net/9p/client.c
@@ -190,7 +190,9 @@ static int parse_opts(char *opts, struct p9_client *clnt)
p9_debug(P9_DEBUG_ERROR,
"problem allocating copy of trans arg\n");
goto free_and_return;
- }
+ }
+
+ v9fs_put_trans(clnt->trans_mod);
clnt->trans_mod = v9fs_get_trans_by_name(s);
if (clnt->trans_mod == NULL) {
pr_info("Could not find request transport: %s\n",
@@ -226,6 +228,7 @@ static int parse_opts(char *opts, struct p9_client *clnt)
}
free_and_return:
+ v9fs_put_trans(clnt->trans_mod);
kfree(tmp_options);
return ret;
}
@@ -769,7 +772,7 @@ p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...)
if (err < 0) {
if (err != -ERESTARTSYS && err != -EFAULT)
c->status = Disconnected;
- goto reterr;
+ goto recalc_sigpending;
}
again:
/* Wait for the response */
@@ -804,6 +807,7 @@ again:
if (req->status == REQ_STATUS_RCVD)
err = 0;
}
+recalc_sigpending:
if (sigpending) {
spin_lock_irqsave(&current->sighand->siglock, flags);
recalc_sigpending();
@@ -867,7 +871,7 @@ static struct p9_req_t *p9_client_zc_rpc(struct p9_client *c, int8_t type,
if (err == -EIO)
c->status = Disconnected;
if (err != -ERESTARTSYS)
- goto reterr;
+ goto recalc_sigpending;
}
if (req->status == REQ_STATUS_ERROR) {
p9_debug(P9_DEBUG_ERROR, "req_status error %d\n", req->t_err);
@@ -885,6 +889,7 @@ static struct p9_req_t *p9_client_zc_rpc(struct p9_client *c, int8_t type,
if (req->status == REQ_STATUS_RCVD)
err = 0;
}
+recalc_sigpending:
if (sigpending) {
spin_lock_irqsave(&current->sighand->siglock, flags);
recalc_sigpending();
diff --git a/net/atm/lec.c b/net/atm/lec.c
index 01d5d20a6eb1..3138a869b5c0 100644
--- a/net/atm/lec.c
+++ b/net/atm/lec.c
@@ -41,6 +41,9 @@ static unsigned char bridge_ula_lec[] = { 0x01, 0x80, 0xc2, 0x00, 0x00 };
#include <linux/module.h>
#include <linux/init.h>
+/* Hardening for Spectre-v1 */
+#include <linux/nospec.h>
+
#include "lec.h"
#include "lec_arpc.h"
#include "resources.h"
@@ -687,8 +690,10 @@ static int lec_vcc_attach(struct atm_vcc *vcc, void __user *arg)
bytes_left = copy_from_user(&ioc_data, arg, sizeof(struct atmlec_ioc));
if (bytes_left != 0)
pr_info("copy from user failed for %d bytes\n", bytes_left);
- if (ioc_data.dev_num < 0 || ioc_data.dev_num >= MAX_LEC_ITF ||
- !dev_lec[ioc_data.dev_num])
+ if (ioc_data.dev_num < 0 || ioc_data.dev_num >= MAX_LEC_ITF)
+ return -EINVAL;
+ ioc_data.dev_num = array_index_nospec(ioc_data.dev_num, MAX_LEC_ITF);
+ if (!dev_lec[ioc_data.dev_num])
return -EINVAL;
vpriv = kmalloc(sizeof(struct lec_vcc_priv), GFP_KERNEL);
if (!vpriv)
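The lec.c hunk above adopts the standard Spectre-v1 hardening sequence. A minimal sketch of the pattern, with a hypothetical lookup helper (lec_dev_lookup is not part of the patch; dev_lec and MAX_LEC_ITF are the array and bound used above):

#include <linux/nospec.h>

/* Hypothetical helper, for illustration only. */
static struct net_device *lec_dev_lookup(int dev_num)
{
	if (dev_num < 0 || dev_num >= MAX_LEC_ITF)
		return NULL;			/* architectural bounds check */
	/* clamp the index under speculation before it reaches the load */
	dev_num = array_index_nospec(dev_num, MAX_LEC_ITF);
	return dev_lec[dev_num];
}

The clamp matters because a mispredicted branch can still execute the array load with an out-of-range index; array_index_nospec() forces such an index to 0 speculatively, so no attacker-controlled offset is ever dereferenced.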
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index 82c1a6f430b3..5bb6681fa91e 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -518,8 +518,8 @@ int br_add_if(struct net_bridge *br, struct net_device *dev,
return -ELOOP;
}
- /* Device is already being bridged */
- if (br_port_exists(dev))
+ /* Device has master upper dev */
+ if (netdev_master_upper_dev_get(dev))
return -EBUSY;
/* No bridging devices that dislike that (e.g. wireless) */
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index 032e0fe45940..28a4c3490359 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -1825,13 +1825,14 @@ static int compat_table_info(const struct ebt_table_info *info,
{
unsigned int size = info->entries_size;
const void *entries = info->entries;
- int ret;
newinfo->entries_size = size;
-
- ret = xt_compat_init_offsets(NFPROTO_BRIDGE, info->nentries);
- if (ret)
- return ret;
+ if (info->nentries) {
+ int ret = xt_compat_init_offsets(NFPROTO_BRIDGE,
+ info->nentries);
+ if (ret)
+ return ret;
+ }
return EBT_ENTRY_ITERATE(entries, size, compat_calc_entry, info,
entries, newinfo);
diff --git a/net/caif/chnl_net.c b/net/caif/chnl_net.c
index 53ecda10b790..13e2ae6be620 100644
--- a/net/caif/chnl_net.c
+++ b/net/caif/chnl_net.c
@@ -174,7 +174,7 @@ static void chnl_flowctrl_cb(struct cflayer *layr, enum caif_ctrlcmd flow,
flow == CAIF_CTRLCMD_DEINIT_RSP ? "CLOSE/DEINIT" :
flow == CAIF_CTRLCMD_INIT_FAIL_RSP ? "OPEN_FAIL" :
flow == CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND ?
- "REMOTE_SHUTDOWN" : "UKNOWN CTRL COMMAND");
+ "REMOTE_SHUTDOWN" : "UNKNOWN CTRL COMMAND");
diff --git a/net/ceph/Makefile b/net/ceph/Makefile
index b4bded4b5396..12bf49772d24 100644
--- a/net/ceph/Makefile
+++ b/net/ceph/Makefile
@@ -8,6 +8,7 @@ libceph-y := ceph_common.o messenger.o msgpool.o buffer.o pagelist.o \
mon_client.o \
cls_lock_client.o \
osd_client.o osdmap.o crush/crush.o crush/mapper.o crush/hash.o \
+ striper.o \
debugfs.o \
auth.o auth_none.o \
crypto.o armor.o \
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index 4adf07826f4a..584fdbef2088 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -72,6 +72,7 @@ const char *ceph_msg_type_name(int type)
case CEPH_MSG_MON_GET_VERSION: return "mon_get_version";
case CEPH_MSG_MON_GET_VERSION_REPLY: return "mon_get_version_reply";
case CEPH_MSG_MDS_MAP: return "mds_map";
+ case CEPH_MSG_FS_MAP_USER: return "fs_map_user";
case CEPH_MSG_CLIENT_SESSION: return "client_session";
case CEPH_MSG_CLIENT_RECONNECT: return "client_reconnect";
case CEPH_MSG_CLIENT_REQUEST: return "client_request";
@@ -79,8 +80,13 @@ const char *ceph_msg_type_name(int type)
case CEPH_MSG_CLIENT_REPLY: return "client_reply";
case CEPH_MSG_CLIENT_CAPS: return "client_caps";
case CEPH_MSG_CLIENT_CAPRELEASE: return "client_cap_release";
+ case CEPH_MSG_CLIENT_QUOTA: return "client_quota";
case CEPH_MSG_CLIENT_SNAP: return "client_snap";
case CEPH_MSG_CLIENT_LEASE: return "client_lease";
+ case CEPH_MSG_POOLOP_REPLY: return "poolop_reply";
+ case CEPH_MSG_POOLOP: return "poolop";
+ case CEPH_MSG_MON_COMMAND: return "mon_command";
+ case CEPH_MSG_MON_COMMAND_ACK: return "mon_command_ack";
case CEPH_MSG_OSD_MAP: return "osd_map";
case CEPH_MSG_OSD_OP: return "osd_op";
case CEPH_MSG_OSD_OPREPLY: return "osd_opreply";
@@ -217,7 +223,7 @@ static int parse_fsid(const char *str, struct ceph_fsid *fsid)
if (i == 16)
err = 0;
- dout("parse_fsid ret %d got fsid %pU", err, fsid);
+ dout("parse_fsid ret %d got fsid %pU\n", err, fsid);
return err;
}
diff --git a/net/ceph/crypto.c b/net/ceph/crypto.c
index bf9d079cbafd..02172c408ff2 100644
--- a/net/ceph/crypto.c
+++ b/net/ceph/crypto.c
@@ -347,10 +347,12 @@ struct key_type key_type_ceph = {
.destroy = ceph_key_destroy,
};
-int ceph_crypto_init(void) {
+int __init ceph_crypto_init(void)
+{
return register_key_type(&key_type_ceph);
}
-void ceph_crypto_shutdown(void) {
+void ceph_crypto_shutdown(void)
+{
unregister_key_type(&key_type_ceph);
}
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
index 1eef6806aa1a..02952605d121 100644
--- a/net/ceph/debugfs.c
+++ b/net/ceph/debugfs.c
@@ -389,7 +389,7 @@ CEPH_DEFINE_SHOW_FUNC(monc_show)
CEPH_DEFINE_SHOW_FUNC(osdc_show)
CEPH_DEFINE_SHOW_FUNC(client_options_show)
-int ceph_debugfs_init(void)
+int __init ceph_debugfs_init(void)
{
ceph_debugfs_dir = debugfs_create_dir("ceph", NULL);
if (!ceph_debugfs_dir)
@@ -418,7 +418,7 @@ int ceph_debugfs_client_init(struct ceph_client *client)
goto out;
client->monc.debugfs_file = debugfs_create_file("monc",
- 0600,
+ 0400,
client->debugfs_dir,
client,
&monc_show_fops);
@@ -426,7 +426,7 @@ int ceph_debugfs_client_init(struct ceph_client *client)
goto out;
client->osdc.debugfs_file = debugfs_create_file("osdc",
- 0600,
+ 0400,
client->debugfs_dir,
client,
&osdc_show_fops);
@@ -434,7 +434,7 @@ int ceph_debugfs_client_init(struct ceph_client *client)
goto out;
client->debugfs_monmap = debugfs_create_file("monmap",
- 0600,
+ 0400,
client->debugfs_dir,
client,
&monmap_show_fops);
@@ -442,7 +442,7 @@ int ceph_debugfs_client_init(struct ceph_client *client)
goto out;
client->debugfs_osdmap = debugfs_create_file("osdmap",
- 0600,
+ 0400,
client->debugfs_dir,
client,
&osdmap_show_fops);
@@ -450,7 +450,7 @@ int ceph_debugfs_client_init(struct ceph_client *client)
goto out;
client->debugfs_options = debugfs_create_file("client_options",
- 0600,
+ 0400,
client->debugfs_dir,
client,
&client_options_show_fops);
@@ -477,7 +477,7 @@ void ceph_debugfs_client_cleanup(struct ceph_client *client)
#else /* CONFIG_DEBUG_FS */
-int ceph_debugfs_init(void)
+int __init ceph_debugfs_init(void)
{
return 0;
}
@@ -496,6 +496,3 @@ void ceph_debugfs_client_cleanup(struct ceph_client *client)
}
#endif /* CONFIG_DEBUG_FS */
-
-EXPORT_SYMBOL(ceph_debugfs_init);
-EXPORT_SYMBOL(ceph_debugfs_cleanup);
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 8a4d3758030b..3b3d33ea9ed8 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -277,7 +277,7 @@ static void _ceph_msgr_exit(void)
ceph_msgr_slab_exit();
}
-int ceph_msgr_init(void)
+int __init ceph_msgr_init(void)
{
if (ceph_msgr_slab_init())
return -ENOMEM;
@@ -299,7 +299,6 @@ int ceph_msgr_init(void)
return -ENOMEM;
}
-EXPORT_SYMBOL(ceph_msgr_init);
void ceph_msgr_exit(void)
{
@@ -307,7 +306,6 @@ void ceph_msgr_exit(void)
_ceph_msgr_exit();
}
-EXPORT_SYMBOL(ceph_msgr_exit);
void ceph_msgr_flush(void)
{
@@ -839,93 +837,112 @@ static void ceph_msg_data_bio_cursor_init(struct ceph_msg_data_cursor *cursor,
size_t length)
{
struct ceph_msg_data *data = cursor->data;
- struct bio *bio;
+ struct ceph_bio_iter *it = &cursor->bio_iter;
- BUG_ON(data->type != CEPH_MSG_DATA_BIO);
+ cursor->resid = min_t(size_t, length, data->bio_length);
+ *it = data->bio_pos;
+ if (cursor->resid < it->iter.bi_size)
+ it->iter.bi_size = cursor->resid;
- bio = data->bio;
- BUG_ON(!bio);
-
- cursor->resid = min(length, data->bio_length);
- cursor->bio = bio;
- cursor->bvec_iter = bio->bi_iter;
- cursor->last_piece =
- cursor->resid <= bio_iter_len(bio, cursor->bvec_iter);
+ BUG_ON(cursor->resid < bio_iter_len(it->bio, it->iter));
+ cursor->last_piece = cursor->resid == bio_iter_len(it->bio, it->iter);
}
static struct page *ceph_msg_data_bio_next(struct ceph_msg_data_cursor *cursor,
size_t *page_offset,
size_t *length)
{
- struct ceph_msg_data *data = cursor->data;
- struct bio *bio;
- struct bio_vec bio_vec;
-
- BUG_ON(data->type != CEPH_MSG_DATA_BIO);
-
- bio = cursor->bio;
- BUG_ON(!bio);
-
- bio_vec = bio_iter_iovec(bio, cursor->bvec_iter);
-
- *page_offset = (size_t) bio_vec.bv_offset;
- BUG_ON(*page_offset >= PAGE_SIZE);
- if (cursor->last_piece) /* pagelist offset is always 0 */
- *length = cursor->resid;
- else
- *length = (size_t) bio_vec.bv_len;
- BUG_ON(*length > cursor->resid);
- BUG_ON(*page_offset + *length > PAGE_SIZE);
+ struct bio_vec bv = bio_iter_iovec(cursor->bio_iter.bio,
+ cursor->bio_iter.iter);
- return bio_vec.bv_page;
+ *page_offset = bv.bv_offset;
+ *length = bv.bv_len;
+ return bv.bv_page;
}
static bool ceph_msg_data_bio_advance(struct ceph_msg_data_cursor *cursor,
size_t bytes)
{
- struct bio *bio;
- struct bio_vec bio_vec;
+ struct ceph_bio_iter *it = &cursor->bio_iter;
- BUG_ON(cursor->data->type != CEPH_MSG_DATA_BIO);
+ BUG_ON(bytes > cursor->resid);
+ BUG_ON(bytes > bio_iter_len(it->bio, it->iter));
+ cursor->resid -= bytes;
+ bio_advance_iter(it->bio, &it->iter, bytes);
- bio = cursor->bio;
- BUG_ON(!bio);
+ if (!cursor->resid) {
+ BUG_ON(!cursor->last_piece);
+ return false; /* no more data */
+ }
- bio_vec = bio_iter_iovec(bio, cursor->bvec_iter);
+ if (!bytes || (it->iter.bi_size && it->iter.bi_bvec_done))
+ return false; /* more bytes to process in this segment */
- /* Advance the cursor offset */
+ if (!it->iter.bi_size) {
+ it->bio = it->bio->bi_next;
+ it->iter = it->bio->bi_iter;
+ if (cursor->resid < it->iter.bi_size)
+ it->iter.bi_size = cursor->resid;
+ }
- BUG_ON(cursor->resid < bytes);
- cursor->resid -= bytes;
+ BUG_ON(cursor->last_piece);
+ BUG_ON(cursor->resid < bio_iter_len(it->bio, it->iter));
+ cursor->last_piece = cursor->resid == bio_iter_len(it->bio, it->iter);
+ return true;
+}
+#endif /* CONFIG_BLOCK */
- bio_advance_iter(bio, &cursor->bvec_iter, bytes);
+static void ceph_msg_data_bvecs_cursor_init(struct ceph_msg_data_cursor *cursor,
+ size_t length)
+{
+ struct ceph_msg_data *data = cursor->data;
+ struct bio_vec *bvecs = data->bvec_pos.bvecs;
- if (bytes < bio_vec.bv_len)
- return false; /* more bytes to process in this segment */
+ cursor->resid = min_t(size_t, length, data->bvec_pos.iter.bi_size);
+ cursor->bvec_iter = data->bvec_pos.iter;
+ cursor->bvec_iter.bi_size = cursor->resid;
- /* Move on to the next segment, and possibly the next bio */
+ BUG_ON(cursor->resid < bvec_iter_len(bvecs, cursor->bvec_iter));
+ cursor->last_piece =
+ cursor->resid == bvec_iter_len(bvecs, cursor->bvec_iter);
+}
- if (!cursor->bvec_iter.bi_size) {
- bio = bio->bi_next;
- cursor->bio = bio;
- if (bio)
- cursor->bvec_iter = bio->bi_iter;
- else
- memset(&cursor->bvec_iter, 0,
- sizeof(cursor->bvec_iter));
- }
+static struct page *ceph_msg_data_bvecs_next(struct ceph_msg_data_cursor *cursor,
+ size_t *page_offset,
+ size_t *length)
+{
+ struct bio_vec bv = bvec_iter_bvec(cursor->data->bvec_pos.bvecs,
+ cursor->bvec_iter);
+
+ *page_offset = bv.bv_offset;
+ *length = bv.bv_len;
+ return bv.bv_page;
+}
- if (!cursor->last_piece) {
- BUG_ON(!cursor->resid);
- BUG_ON(!bio);
- /* A short read is OK, so use <= rather than == */
- if (cursor->resid <= bio_iter_len(bio, cursor->bvec_iter))
- cursor->last_piece = true;
+static bool ceph_msg_data_bvecs_advance(struct ceph_msg_data_cursor *cursor,
+ size_t bytes)
+{
+ struct bio_vec *bvecs = cursor->data->bvec_pos.bvecs;
+
+ BUG_ON(bytes > cursor->resid);
+ BUG_ON(bytes > bvec_iter_len(bvecs, cursor->bvec_iter));
+ cursor->resid -= bytes;
+ bvec_iter_advance(bvecs, &cursor->bvec_iter, bytes);
+
+ if (!cursor->resid) {
+ BUG_ON(!cursor->last_piece);
+ return false; /* no more data */
}
+ if (!bytes || cursor->bvec_iter.bi_bvec_done)
+ return false; /* more bytes to process in this segment */
+
+ BUG_ON(cursor->last_piece);
+ BUG_ON(cursor->resid < bvec_iter_len(bvecs, cursor->bvec_iter));
+ cursor->last_piece =
+ cursor->resid == bvec_iter_len(bvecs, cursor->bvec_iter);
return true;
}
-#endif /* CONFIG_BLOCK */
/*
* For a page array, a piece comes from the first page in the array
@@ -1110,6 +1127,9 @@ static void __ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor)
ceph_msg_data_bio_cursor_init(cursor, length);
break;
#endif /* CONFIG_BLOCK */
+ case CEPH_MSG_DATA_BVECS:
+ ceph_msg_data_bvecs_cursor_init(cursor, length);
+ break;
case CEPH_MSG_DATA_NONE:
default:
/* BUG(); */
@@ -1158,14 +1178,19 @@ static struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor,
page = ceph_msg_data_bio_next(cursor, page_offset, length);
break;
#endif /* CONFIG_BLOCK */
+ case CEPH_MSG_DATA_BVECS:
+ page = ceph_msg_data_bvecs_next(cursor, page_offset, length);
+ break;
case CEPH_MSG_DATA_NONE:
default:
page = NULL;
break;
}
+
BUG_ON(!page);
BUG_ON(*page_offset + *length > PAGE_SIZE);
BUG_ON(!*length);
+ BUG_ON(*length > cursor->resid);
if (last_piece)
*last_piece = cursor->last_piece;
@@ -1194,6 +1219,9 @@ static void ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor,
new_piece = ceph_msg_data_bio_advance(cursor, bytes);
break;
#endif /* CONFIG_BLOCK */
+ case CEPH_MSG_DATA_BVECS:
+ new_piece = ceph_msg_data_bvecs_advance(cursor, bytes);
+ break;
case CEPH_MSG_DATA_NONE:
default:
BUG();
@@ -1575,13 +1603,18 @@ static int write_partial_message_data(struct ceph_connection *con)
* been revoked, so use the zero page.
*/
crc = do_datacrc ? le32_to_cpu(msg->footer.data_crc) : 0;
- while (cursor->resid) {
+ while (cursor->total_resid) {
struct page *page;
size_t page_offset;
size_t length;
bool last_piece;
int ret;
+ if (!cursor->resid) {
+ ceph_msg_data_advance(cursor, 0);
+ continue;
+ }
+
page = ceph_msg_data_next(cursor, &page_offset, &length,
&last_piece);
ret = ceph_tcp_sendpage(con->sock, page, page_offset,
@@ -2297,7 +2330,12 @@ static int read_partial_msg_data(struct ceph_connection *con)
if (do_datacrc)
crc = con->in_data_crc;
- while (cursor->resid) {
+ while (cursor->total_resid) {
+ if (!cursor->resid) {
+ ceph_msg_data_advance(cursor, 0);
+ continue;
+ }
+
page = ceph_msg_data_next(cursor, &page_offset, &length, NULL);
ret = ceph_tcp_recvpage(con->sock, page, page_offset, length);
if (ret <= 0) {
@@ -2531,6 +2569,11 @@ static int try_write(struct ceph_connection *con)
int ret = 1;
dout("try_write start %p state %lu\n", con, con->state);
+ if (con->state != CON_STATE_PREOPEN &&
+ con->state != CON_STATE_CONNECTING &&
+ con->state != CON_STATE_NEGOTIATING &&
+ con->state != CON_STATE_OPEN)
+ return 0;
more:
dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes);
@@ -2556,6 +2599,8 @@ more:
}
more_kvec:
+ BUG_ON(!con->sock);
+
/* kvec data queued? */
if (con->out_kvec_left) {
ret = write_partial_kvec(con);
@@ -3262,16 +3307,14 @@ void ceph_msg_data_add_pagelist(struct ceph_msg *msg,
EXPORT_SYMBOL(ceph_msg_data_add_pagelist);
#ifdef CONFIG_BLOCK
-void ceph_msg_data_add_bio(struct ceph_msg *msg, struct bio *bio,
- size_t length)
+void ceph_msg_data_add_bio(struct ceph_msg *msg, struct ceph_bio_iter *bio_pos,
+ u32 length)
{
struct ceph_msg_data *data;
- BUG_ON(!bio);
-
data = ceph_msg_data_create(CEPH_MSG_DATA_BIO);
BUG_ON(!data);
- data->bio = bio;
+ data->bio_pos = *bio_pos;
data->bio_length = length;
list_add_tail(&data->links, &msg->data);
@@ -3280,6 +3323,20 @@ void ceph_msg_data_add_bio(struct ceph_msg *msg, struct bio *bio,
EXPORT_SYMBOL(ceph_msg_data_add_bio);
#endif /* CONFIG_BLOCK */
+void ceph_msg_data_add_bvecs(struct ceph_msg *msg,
+ struct ceph_bvec_iter *bvec_pos)
+{
+ struct ceph_msg_data *data;
+
+ data = ceph_msg_data_create(CEPH_MSG_DATA_BVECS);
+ BUG_ON(!data);
+ data->bvec_pos = *bvec_pos;
+
+ list_add_tail(&data->links, &msg->data);
+ msg->data_length += bvec_pos->iter.bi_size;
+}
+EXPORT_SYMBOL(ceph_msg_data_add_bvecs);
+
/*
* construct a new message with given type, size
* the new msg has a ref count of 1.
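A hedged usage sketch for the new CEPH_MSG_DATA_BVECS type (bvecs, bytes and msg are assumed locals, not from the patch); it mirrors the initializer that osd_req_op_cls_request_data_bvecs() uses in the osd_client.c diff below:

struct ceph_bvec_iter it = {
	.bvecs = bvecs,			/* struct bio_vec array */
	.iter = { .bi_size = bytes },	/* start at offset 0, span the payload */
};

ceph_msg_data_add_bvecs(msg, &it);	/* the iterator is copied by value */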
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index 1547107f4854..21ac6e3b96bb 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -60,7 +60,7 @@ struct ceph_monmap *ceph_monmap_decode(void *p, void *end)
num_mon = ceph_decode_32(&p);
ceph_decode_need(&p, end, num_mon*sizeof(m->mon_inst[0]), bad);
- if (num_mon >= CEPH_MAX_MON)
+ if (num_mon > CEPH_MAX_MON)
goto bad;
m = kmalloc(sizeof(*m) + sizeof(m->mon_inst[0])*num_mon, GFP_NOFS);
if (m == NULL)
@@ -209,6 +209,14 @@ static void reopen_session(struct ceph_mon_client *monc)
__open_session(monc);
}
+static void un_backoff(struct ceph_mon_client *monc)
+{
+ monc->hunt_mult /= 2; /* reduce by 50% */
+ if (monc->hunt_mult < 1)
+ monc->hunt_mult = 1;
+ dout("%s hunt_mult now %d\n", __func__, monc->hunt_mult);
+}
+
/*
* Reschedule delayed work timer.
*/
@@ -963,6 +971,7 @@ static void delayed_work(struct work_struct *work)
if (!monc->hunting) {
ceph_con_keepalive(&monc->con);
__validate_auth(monc);
+ un_backoff(monc);
}
if (is_auth &&
@@ -1123,9 +1132,8 @@ static void finish_hunting(struct ceph_mon_client *monc)
dout("%s found mon%d\n", __func__, monc->cur_mon);
monc->hunting = false;
monc->had_a_connection = true;
- monc->hunt_mult /= 2; /* reduce by 50% */
- if (monc->hunt_mult < 1)
- monc->hunt_mult = 1;
+ un_backoff(monc);
+ __schedule_delayed(monc);
}
}
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 2814dba5902d..ea2a6c9fb7ce 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -20,6 +20,7 @@
#include <linux/ceph/decode.h>
#include <linux/ceph/auth.h>
#include <linux/ceph/pagelist.h>
+#include <linux/ceph/striper.h>
#define OSD_OPREPLY_FRONT_LEN 512
@@ -103,13 +104,12 @@ static int calc_layout(struct ceph_file_layout *layout, u64 off, u64 *plen,
u64 *objnum, u64 *objoff, u64 *objlen)
{
u64 orig_len = *plen;
- int r;
+ u32 xlen;
/* object extent? */
- r = ceph_calc_file_object_mapping(layout, off, orig_len, objnum,
- objoff, objlen);
- if (r < 0)
- return r;
+ ceph_calc_file_object_mapping(layout, off, orig_len, objnum,
+ objoff, &xlen);
+ *objlen = xlen;
if (*objlen < orig_len) {
*plen = *objlen;
dout(" skipping last %llu, final file extent %llu~%llu\n",
@@ -117,7 +117,6 @@ static int calc_layout(struct ceph_file_layout *layout, u64 off, u64 *plen,
}
dout("calc_layout objnum=%llx %llu~%llu\n", *objnum, *objoff, *objlen);
-
return 0;
}
@@ -148,14 +147,22 @@ static void ceph_osd_data_pagelist_init(struct ceph_osd_data *osd_data,
#ifdef CONFIG_BLOCK
static void ceph_osd_data_bio_init(struct ceph_osd_data *osd_data,
- struct bio *bio, size_t bio_length)
+ struct ceph_bio_iter *bio_pos,
+ u32 bio_length)
{
osd_data->type = CEPH_OSD_DATA_TYPE_BIO;
- osd_data->bio = bio;
+ osd_data->bio_pos = *bio_pos;
osd_data->bio_length = bio_length;
}
#endif /* CONFIG_BLOCK */
+static void ceph_osd_data_bvecs_init(struct ceph_osd_data *osd_data,
+ struct ceph_bvec_iter *bvec_pos)
+{
+ osd_data->type = CEPH_OSD_DATA_TYPE_BVECS;
+ osd_data->bvec_pos = *bvec_pos;
+}
+
#define osd_req_op_data(oreq, whch, typ, fld) \
({ \
struct ceph_osd_request *__oreq = (oreq); \
@@ -218,16 +225,29 @@ EXPORT_SYMBOL(osd_req_op_extent_osd_data_pagelist);
#ifdef CONFIG_BLOCK
void osd_req_op_extent_osd_data_bio(struct ceph_osd_request *osd_req,
- unsigned int which, struct bio *bio, size_t bio_length)
+ unsigned int which,
+ struct ceph_bio_iter *bio_pos,
+ u32 bio_length)
{
struct ceph_osd_data *osd_data;
osd_data = osd_req_op_data(osd_req, which, extent, osd_data);
- ceph_osd_data_bio_init(osd_data, bio, bio_length);
+ ceph_osd_data_bio_init(osd_data, bio_pos, bio_length);
}
EXPORT_SYMBOL(osd_req_op_extent_osd_data_bio);
#endif /* CONFIG_BLOCK */
+void osd_req_op_extent_osd_data_bvec_pos(struct ceph_osd_request *osd_req,
+ unsigned int which,
+ struct ceph_bvec_iter *bvec_pos)
+{
+ struct ceph_osd_data *osd_data;
+
+ osd_data = osd_req_op_data(osd_req, which, extent, osd_data);
+ ceph_osd_data_bvecs_init(osd_data, bvec_pos);
+}
+EXPORT_SYMBOL(osd_req_op_extent_osd_data_bvec_pos);
+
static void osd_req_op_cls_request_info_pagelist(
struct ceph_osd_request *osd_req,
unsigned int which, struct ceph_pagelist *pagelist)
@@ -265,6 +285,23 @@ void osd_req_op_cls_request_data_pages(struct ceph_osd_request *osd_req,
}
EXPORT_SYMBOL(osd_req_op_cls_request_data_pages);
+void osd_req_op_cls_request_data_bvecs(struct ceph_osd_request *osd_req,
+ unsigned int which,
+ struct bio_vec *bvecs, u32 bytes)
+{
+ struct ceph_osd_data *osd_data;
+ struct ceph_bvec_iter it = {
+ .bvecs = bvecs,
+ .iter = { .bi_size = bytes },
+ };
+
+ osd_data = osd_req_op_data(osd_req, which, cls, request_data);
+ ceph_osd_data_bvecs_init(osd_data, &it);
+ osd_req->r_ops[which].cls.indata_len += bytes;
+ osd_req->r_ops[which].indata_len += bytes;
+}
+EXPORT_SYMBOL(osd_req_op_cls_request_data_bvecs);
+
void osd_req_op_cls_response_data_pages(struct ceph_osd_request *osd_req,
unsigned int which, struct page **pages, u64 length,
u32 alignment, bool pages_from_pool, bool own_pages)
@@ -290,6 +327,8 @@ static u64 ceph_osd_data_length(struct ceph_osd_data *osd_data)
case CEPH_OSD_DATA_TYPE_BIO:
return (u64)osd_data->bio_length;
#endif /* CONFIG_BLOCK */
+ case CEPH_OSD_DATA_TYPE_BVECS:
+ return osd_data->bvec_pos.iter.bi_size;
default:
WARN(true, "unrecognized data type %d\n", (int)osd_data->type);
return 0;
@@ -828,8 +867,10 @@ static void ceph_osdc_msg_data_add(struct ceph_msg *msg,
ceph_msg_data_add_pagelist(msg, osd_data->pagelist);
#ifdef CONFIG_BLOCK
} else if (osd_data->type == CEPH_OSD_DATA_TYPE_BIO) {
- ceph_msg_data_add_bio(msg, osd_data->bio, length);
+ ceph_msg_data_add_bio(msg, &osd_data->bio_pos, length);
#endif
+ } else if (osd_data->type == CEPH_OSD_DATA_TYPE_BVECS) {
+ ceph_msg_data_add_bvecs(msg, &osd_data->bvec_pos);
} else {
BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_NONE);
}
@@ -5065,7 +5106,7 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
}
EXPORT_SYMBOL(ceph_osdc_writepages);
-int ceph_osdc_setup(void)
+int __init ceph_osdc_setup(void)
{
size_t size = sizeof(struct ceph_osd_request) +
CEPH_OSD_SLAB_OPS * sizeof(struct ceph_osd_req_op);
@@ -5076,7 +5117,6 @@ int ceph_osdc_setup(void)
return ceph_osd_request_cache ? 0 : -ENOMEM;
}
-EXPORT_SYMBOL(ceph_osdc_setup);
void ceph_osdc_cleanup(void)
{
@@ -5084,7 +5124,6 @@ void ceph_osdc_cleanup(void)
kmem_cache_destroy(ceph_osd_request_cache);
ceph_osd_request_cache = NULL;
}
-EXPORT_SYMBOL(ceph_osdc_cleanup);
/*
* handle incoming message
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 0da27c66349a..9645ffd6acfb 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -4,7 +4,6 @@
#include <linux/module.h>
#include <linux/slab.h>
-#include <asm/div64.h>
#include <linux/ceph/libceph.h>
#include <linux/ceph/osdmap.h>
@@ -2141,76 +2140,6 @@ bool ceph_osds_changed(const struct ceph_osds *old_acting,
}
/*
- * calculate file layout from given offset, length.
- * fill in correct oid, logical length, and object extent
- * offset, length.
- *
- * for now, we write only a single su, until we can
- * pass a stride back to the caller.
- */
-int ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
- u64 off, u64 len,
- u64 *ono,
- u64 *oxoff, u64 *oxlen)
-{
- u32 osize = layout->object_size;
- u32 su = layout->stripe_unit;
- u32 sc = layout->stripe_count;
- u32 bl, stripeno, stripepos, objsetno;
- u32 su_per_object;
- u64 t, su_offset;
-
- dout("mapping %llu~%llu osize %u fl_su %u\n", off, len,
- osize, su);
- if (su == 0 || sc == 0)
- goto invalid;
- su_per_object = osize / su;
- if (su_per_object == 0)
- goto invalid;
- dout("osize %u / su %u = su_per_object %u\n", osize, su,
- su_per_object);
-
- if ((su & ~PAGE_MASK) != 0)
- goto invalid;
-
- /* bl = *off / su; */
- t = off;
- do_div(t, su);
- bl = t;
- dout("off %llu / su %u = bl %u\n", off, su, bl);
-
- stripeno = bl / sc;
- stripepos = bl % sc;
- objsetno = stripeno / su_per_object;
-
- *ono = objsetno * sc + stripepos;
- dout("objset %u * sc %u = ono %u\n", objsetno, sc, (unsigned int)*ono);
-
- /* *oxoff = *off % layout->fl_stripe_unit; # offset in su */
- t = off;
- su_offset = do_div(t, su);
- *oxoff = su_offset + (stripeno % su_per_object) * su;
-
- /*
- * Calculate the length of the extent being written to the selected
- * object. This is the minimum of the full length requested (len) or
- * the remainder of the current stripe being written to.
- */
- *oxlen = min_t(u64, len, su - su_offset);
-
- dout(" obj extent %llu~%llu\n", *oxoff, *oxlen);
- return 0;
-
-invalid:
- dout(" invalid layout\n");
- *ono = 0;
- *oxoff = 0;
- *oxlen = 0;
- return -EINVAL;
-}
-EXPORT_SYMBOL(ceph_calc_file_object_mapping);
-
-/*
* Map an object into a PG.
*
* Should only be called with target_oid and target_oloc (as opposed to
diff --git a/net/ceph/striper.c b/net/ceph/striper.c
new file mode 100644
index 000000000000..c36462dc86b7
--- /dev/null
+++ b/net/ceph/striper.c
@@ -0,0 +1,261 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/math64.h>
+#include <linux/slab.h>
+
+#include <linux/ceph/striper.h>
+#include <linux/ceph/types.h>
+
+/*
+ * Map a file extent to a stripe unit within an object.
+ * Fill in objno, offset into object, and object extent length (i.e. the
+ * number of bytes mapped, less than or equal to @l->stripe_unit).
+ *
+ * Example for stripe_count = 3, stripes_per_object = 4:
+ *
+ * blockno | 0 3 6 9 | 1 4 7 10 | 2 5 8 11 | 12 15 18 21 | 13 16 19
+ * stripeno | 0 1 2 3 | 0 1 2 3 | 0 1 2 3 | 4 5 6 7 | 4 5 6
+ * stripepos | 0 | 1 | 2 | 0 | 1
+ * objno | 0 | 1 | 2 | 3 | 4
+ * objsetno | 0 | 1
+ */
+void ceph_calc_file_object_mapping(struct ceph_file_layout *l,
+ u64 off, u64 len,
+ u64 *objno, u64 *objoff, u32 *xlen)
+{
+ u32 stripes_per_object = l->object_size / l->stripe_unit;
+ u64 blockno; /* which su in the file (i.e. globally) */
+ u32 blockoff; /* offset into su */
+ u64 stripeno; /* which stripe */
+ u32 stripepos; /* which su in the stripe,
+ which object in the object set */
+ u64 objsetno; /* which object set */
+ u32 objsetpos; /* which stripe in the object set */
+
+ blockno = div_u64_rem(off, l->stripe_unit, &blockoff);
+ stripeno = div_u64_rem(blockno, l->stripe_count, &stripepos);
+ objsetno = div_u64_rem(stripeno, stripes_per_object, &objsetpos);
+
+ *objno = objsetno * l->stripe_count + stripepos;
+ *objoff = objsetpos * l->stripe_unit + blockoff;
+ *xlen = min_t(u64, len, l->stripe_unit - blockoff);
+}
+EXPORT_SYMBOL(ceph_calc_file_object_mapping);
+
+/*
+ * Return the last extent with given objno (@object_extents is sorted
+ * by objno). If not found, return NULL and set @add_pos so that the
+ * new extent can be added with list_add(add_pos, new_ex).
+ */
+static struct ceph_object_extent *
+lookup_last(struct list_head *object_extents, u64 objno,
+ struct list_head **add_pos)
+{
+ struct list_head *pos;
+
+ list_for_each_prev(pos, object_extents) {
+ struct ceph_object_extent *ex =
+ list_entry(pos, typeof(*ex), oe_item);
+
+ if (ex->oe_objno == objno)
+ return ex;
+
+ if (ex->oe_objno < objno)
+ break;
+ }
+
+ *add_pos = pos;
+ return NULL;
+}
+
+static struct ceph_object_extent *
+lookup_containing(struct list_head *object_extents, u64 objno,
+ u64 objoff, u32 xlen)
+{
+ struct ceph_object_extent *ex;
+
+ list_for_each_entry(ex, object_extents, oe_item) {
+ if (ex->oe_objno == objno &&
+ ex->oe_off <= objoff &&
+ ex->oe_off + ex->oe_len >= objoff + xlen) /* paranoia */
+ return ex;
+
+ if (ex->oe_objno > objno)
+ break;
+ }
+
+ return NULL;
+}
+
+/*
+ * Map a file extent to a sorted list of object extents.
+ *
+ * We want only one (or as few as possible) object extents per object.
+ * Adjacent object extents will be merged together, each returned object
+ * extent may reverse map to multiple different file extents.
+ *
+ * Call @alloc_fn for each new object extent and @action_fn for each
+ * mapped stripe unit, whether it was merged into an already allocated
+ * object extent or started a new object extent.
+ *
+ * Newly allocated object extents are added to @object_extents.
+ * To keep @object_extents sorted, successive calls to this function
+ * must map successive file extents (i.e. the list of file extents that
+ * are mapped using the same @object_extents must be sorted).
+ *
+ * The caller is responsible for @object_extents.
+ */
+int ceph_file_to_extents(struct ceph_file_layout *l, u64 off, u64 len,
+ struct list_head *object_extents,
+ struct ceph_object_extent *alloc_fn(void *arg),
+ void *alloc_arg,
+ ceph_object_extent_fn_t action_fn,
+ void *action_arg)
+{
+ struct ceph_object_extent *last_ex, *ex;
+
+ while (len) {
+ struct list_head *add_pos = NULL;
+ u64 objno, objoff;
+ u32 xlen;
+
+ ceph_calc_file_object_mapping(l, off, len, &objno, &objoff,
+ &xlen);
+
+ last_ex = lookup_last(object_extents, objno, &add_pos);
+ if (!last_ex || last_ex->oe_off + last_ex->oe_len != objoff) {
+ ex = alloc_fn(alloc_arg);
+ if (!ex)
+ return -ENOMEM;
+
+ ex->oe_objno = objno;
+ ex->oe_off = objoff;
+ ex->oe_len = xlen;
+ if (action_fn)
+ action_fn(ex, xlen, action_arg);
+
+ if (!last_ex)
+ list_add(&ex->oe_item, add_pos);
+ else
+ list_add(&ex->oe_item, &last_ex->oe_item);
+ } else {
+ last_ex->oe_len += xlen;
+ if (action_fn)
+ action_fn(last_ex, xlen, action_arg);
+ }
+
+ off += xlen;
+ len -= xlen;
+ }
+
+ for (last_ex = list_first_entry(object_extents, typeof(*ex), oe_item),
+ ex = list_next_entry(last_ex, oe_item);
+ &ex->oe_item != object_extents;
+ last_ex = ex, ex = list_next_entry(ex, oe_item)) {
+ if (last_ex->oe_objno > ex->oe_objno ||
+ (last_ex->oe_objno == ex->oe_objno &&
+ last_ex->oe_off + last_ex->oe_len >= ex->oe_off)) {
+ WARN(1, "%s: object_extents list not sorted!\n",
+ __func__);
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(ceph_file_to_extents);
+
+/*
+ * A stripped down, non-allocating version of ceph_file_to_extents(),
+ * for when @object_extents is already populated.
+ */
+int ceph_iterate_extents(struct ceph_file_layout *l, u64 off, u64 len,
+ struct list_head *object_extents,
+ ceph_object_extent_fn_t action_fn,
+ void *action_arg)
+{
+ while (len) {
+ struct ceph_object_extent *ex;
+ u64 objno, objoff;
+ u32 xlen;
+
+ ceph_calc_file_object_mapping(l, off, len, &objno, &objoff,
+ &xlen);
+
+ ex = lookup_containing(object_extents, objno, objoff, xlen);
+ if (!ex) {
+ WARN(1, "%s: objno %llu %llu~%u not found!\n",
+ __func__, objno, objoff, xlen);
+ return -EINVAL;
+ }
+
+ action_fn(ex, xlen, action_arg);
+
+ off += xlen;
+ len -= xlen;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(ceph_iterate_extents);
+
+/*
+ * Reverse map an object extent to a sorted list of file extents.
+ *
+ * On success, the caller is responsible for:
+ *
+ * kfree(file_extents)
+ */
+int ceph_extent_to_file(struct ceph_file_layout *l,
+ u64 objno, u64 objoff, u64 objlen,
+ struct ceph_file_extent **file_extents,
+ u32 *num_file_extents)
+{
+ u32 stripes_per_object = l->object_size / l->stripe_unit;
+ u64 blockno; /* which su */
+ u32 blockoff; /* offset into su */
+ u64 stripeno; /* which stripe */
+ u32 stripepos; /* which su in the stripe,
+ which object in the object set */
+ u64 objsetno; /* which object set */
+ u32 i = 0;
+
+ if (!objlen) {
+ *file_extents = NULL;
+ *num_file_extents = 0;
+ return 0;
+ }
+
+ *num_file_extents = DIV_ROUND_UP_ULL(objoff + objlen, l->stripe_unit) -
+ DIV_ROUND_DOWN_ULL(objoff, l->stripe_unit);
+ *file_extents = kmalloc_array(*num_file_extents, sizeof(**file_extents),
+ GFP_NOIO);
+ if (!*file_extents)
+ return -ENOMEM;
+
+ div_u64_rem(objoff, l->stripe_unit, &blockoff);
+ while (objlen) {
+ u64 off, len;
+
+ objsetno = div_u64_rem(objno, l->stripe_count, &stripepos);
+ stripeno = div_u64(objoff, l->stripe_unit) +
+ objsetno * stripes_per_object;
+ blockno = stripeno * l->stripe_count + stripepos;
+ off = blockno * l->stripe_unit + blockoff;
+ len = min_t(u64, objlen, l->stripe_unit - blockoff);
+
+ (*file_extents)[i].fe_off = off;
+ (*file_extents)[i].fe_len = len;
+
+ blockoff = 0;
+ objoff += len;
+ objlen -= len;
+ i++;
+ }
+
+ BUG_ON(i != *num_file_extents);
+ return 0;
+}
+EXPORT_SYMBOL(ceph_extent_to_file);
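A quick numeric check of the mapping above, as a standalone userspace sketch (the layout values are assumptions for illustration: stripe_unit = 4096, stripe_count = 3, object_size = 16384, so stripes_per_object = 4):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t su = 4096, sc = 3, spo = 4;	/* assumed layout */
	uint64_t off = 5000;			/* file offset to map */

	uint64_t blockno   = off / su;		/* global su number: 1 */
	uint64_t blockoff  = off % su;		/* offset into the su: 904 */
	uint64_t stripeno  = blockno / sc;	/* stripe 0 */
	uint64_t stripepos = blockno % sc;	/* position 1 in the stripe */
	uint64_t objsetno  = stripeno / spo;	/* object set 0 */
	uint64_t objsetpos = stripeno % spo;	/* stripe 0 of the set */

	/* objno = 0 * 3 + 1 = 1, objoff = 0 * 4096 + 904 = 904 */
	printf("objno=%llu objoff=%llu\n",
	       (unsigned long long)(objsetno * sc + stripepos),
	       (unsigned long long)(objsetpos * su + blockoff));
	return 0;
}

This agrees with the blockno diagram in the file header: block 1 lives in object 1, and running the ceph_extent_to_file() arithmetic on (objno 1, objoff 904) reverses the computation back to file offset 5000.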
diff --git a/net/compat.c b/net/compat.c
index 5ae7437d3853..7242cce5631b 100644
--- a/net/compat.c
+++ b/net/compat.c
@@ -377,7 +377,8 @@ static int compat_sock_setsockopt(struct socket *sock, int level, int optname,
optname == SO_ATTACH_REUSEPORT_CBPF)
return do_set_attach_filter(sock, level, optname,
optval, optlen);
- if (optname == SO_RCVTIMEO || optname == SO_SNDTIMEO)
+ if (!COMPAT_USE_64BIT_TIME &&
+ (optname == SO_RCVTIMEO || optname == SO_SNDTIMEO))
return do_set_sock_timeout(sock, level, optname, optval, optlen);
return sock_setsockopt(sock, level, optname, optval, optlen);
@@ -448,7 +449,8 @@ static int do_get_sock_timeout(struct socket *sock, int level, int optname,
static int compat_sock_getsockopt(struct socket *sock, int level, int optname,
char __user *optval, int __user *optlen)
{
- if (optname == SO_RCVTIMEO || optname == SO_SNDTIMEO)
+ if (!COMPAT_USE_64BIT_TIME &&
+ (optname == SO_RCVTIMEO || optname == SO_SNDTIMEO))
return do_get_sock_timeout(sock, level, optname, optval, optlen);
return sock_getsockopt(sock, level, optname, optval, optlen);
}
diff --git a/net/core/dev.c b/net/core/dev.c
index 969462ebb296..af0558b00c6c 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2969,7 +2969,7 @@ netdev_features_t passthru_features_check(struct sk_buff *skb,
}
EXPORT_SYMBOL(passthru_features_check);
-static netdev_features_t dflt_features_check(const struct sk_buff *skb,
+static netdev_features_t dflt_features_check(struct sk_buff *skb,
struct net_device *dev,
netdev_features_t features)
{
diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c
index e3e6a3e2ca22..d884d8f5f0e5 100644
--- a/net/core/dev_addr_lists.c
+++ b/net/core/dev_addr_lists.c
@@ -839,7 +839,7 @@ void dev_mc_flush(struct net_device *dev)
EXPORT_SYMBOL(dev_mc_flush);
/**
- * dev_mc_flush - Init multicast address list
+ * dev_mc_init - Init multicast address list
* @dev: device
*
* Init multicast address list.
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 03416e6dd5d7..ba02f0dfe85c 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -1032,6 +1032,11 @@ static noinline_for_stack int ethtool_get_rxnfc(struct net_device *dev,
info_size = sizeof(info);
if (copy_from_user(&info, useraddr, info_size))
return -EFAULT;
+ /* Since malicious users may modify the original data,
+ * we need to check whether FLOW_RSS is still requested.
+ */
+ if (!(info.flow_type & FLOW_RSS))
+ return -EINVAL;
}
if (info.cmd == ETHTOOL_GRXCLSRLALL) {
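The recheck above guards against a double-fetch race: flow_type was validated after the first copy_from_user(), but the structure is copied in again here, and a malicious user may have cleared FLOW_RSS in between. A minimal sketch of the pattern (struct cmd and REQUIRED_FLAG are hypothetical):

static int fetch_and_validate(struct cmd __user *uptr, struct cmd *kcmd)
{
	if (copy_from_user(kcmd, uptr, sizeof(*kcmd)))
		return -EFAULT;
	/* every fetch needs its own validation; never trust an earlier copy */
	if (!(kcmd->flags & REQUIRED_FLAG))
		return -EINVAL;
	return 0;
}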
diff --git a/net/core/filter.c b/net/core/filter.c
index d31aff93270d..e77c30ca491d 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3240,6 +3240,7 @@ BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb,
skb_dst_set(skb, (struct dst_entry *) md);
info = &md->u.tun_info;
+ memset(info, 0, sizeof(*info));
info->mode = IP_TUNNEL_INFO_TX;
info->key.tun_flags = TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_NOCACHE;
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 7b7a14abba28..ce519861be59 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -55,7 +55,8 @@ static void neigh_timer_handler(struct timer_list *t);
static void __neigh_notify(struct neighbour *n, int type, int flags,
u32 pid);
static void neigh_update_notify(struct neighbour *neigh, u32 nlmsg_pid);
-static int pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev);
+static int pneigh_ifdown_and_unlock(struct neigh_table *tbl,
+ struct net_device *dev);
#ifdef CONFIG_PROC_FS
static const struct file_operations neigh_stat_seq_fops;
@@ -291,8 +292,7 @@ int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev)
{
write_lock_bh(&tbl->lock);
neigh_flush_dev(tbl, dev);
- pneigh_ifdown(tbl, dev);
- write_unlock_bh(&tbl->lock);
+ pneigh_ifdown_and_unlock(tbl, dev);
del_timer_sync(&tbl->proxy_timer);
pneigh_queue_purge(&tbl->proxy_queue);
@@ -681,9 +681,10 @@ int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *pkey,
return -ENOENT;
}
-static int pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev)
+static int pneigh_ifdown_and_unlock(struct neigh_table *tbl,
+ struct net_device *dev)
{
- struct pneigh_entry *n, **np;
+ struct pneigh_entry *n, **np, *freelist = NULL;
u32 h;
for (h = 0; h <= PNEIGH_HASHMASK; h++) {
@@ -691,16 +692,23 @@ static int pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev)
while ((n = *np) != NULL) {
if (!dev || n->dev == dev) {
*np = n->next;
- if (tbl->pdestructor)
- tbl->pdestructor(n);
- if (n->dev)
- dev_put(n->dev);
- kfree(n);
+ n->next = freelist;
+ freelist = n;
continue;
}
np = &n->next;
}
}
+ write_unlock_bh(&tbl->lock);
+ while ((n = freelist)) {
+ freelist = n->next;
+ n->next = NULL;
+ if (tbl->pdestructor)
+ tbl->pdestructor(n);
+ if (n->dev)
+ dev_put(n->dev);
+ kfree(n);
+ }
return -ENOENT;
}
@@ -2323,12 +2331,16 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
err = nlmsg_parse(nlh, sizeof(struct ndmsg), tb, NDA_MAX, NULL, NULL);
if (!err) {
- if (tb[NDA_IFINDEX])
+ if (tb[NDA_IFINDEX]) {
+ if (nla_len(tb[NDA_IFINDEX]) != sizeof(u32))
+ return -EINVAL;
filter_idx = nla_get_u32(tb[NDA_IFINDEX]);
-
- if (tb[NDA_MASTER])
+ }
+ if (tb[NDA_MASTER]) {
+ if (nla_len(tb[NDA_MASTER]) != sizeof(u32))
+ return -EINVAL;
filter_master_idx = nla_get_u32(tb[NDA_MASTER]);
-
+ }
if (filter_idx || filter_master_idx)
flags |= NLM_F_DUMP_FILTERED;
}
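The pneigh_ifdown_and_unlock() rework above unlinks matching entries while holding tbl->lock but defers the pdestructor/dev_put/kfree work until the lock is dropped, since those callbacks can take locks of their own. The shape of the pattern, as a self-contained sketch with hypothetical types:

struct entry { struct entry *next; };
struct table { rwlock_t lock; struct entry *head; };

static void remove_matching(struct table *tbl, bool (*match)(struct entry *))
{
	struct entry *e, **ep, *freelist = NULL;

	write_lock_bh(&tbl->lock);
	for (ep = &tbl->head; (e = *ep) != NULL; ) {
		if (match(e)) {
			*ep = e->next;		/* unlink under the lock */
			e->next = freelist;	/* park on a local freelist */
			freelist = e;
		} else {
			ep = &e->next;
		}
	}
	write_unlock_bh(&tbl->lock);

	while ((e = freelist)) {		/* destructors run lock-free */
		freelist = e->next;
		kfree(e);
	}
}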
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index b3b609f0eeb5..b1a2c5e38530 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -15,7 +15,6 @@
#include <linux/vmalloc.h>
#include <linux/init.h>
#include <linux/slab.h>
-#include <linux/kmemleak.h>
#include <net/ip.h>
#include <net/sock.h>
diff --git a/net/core/utils.c b/net/core/utils.c
index 93066bd0305a..d47863b07a60 100644
--- a/net/core/utils.c
+++ b/net/core/utils.c
@@ -403,6 +403,29 @@ int inet_pton_with_scope(struct net *net, __kernel_sa_family_t af,
}
EXPORT_SYMBOL(inet_pton_with_scope);
+bool inet_addr_is_any(struct sockaddr *addr)
+{
+ if (addr->sa_family == AF_INET6) {
+ struct sockaddr_in6 *in6 = (struct sockaddr_in6 *)addr;
+ const struct sockaddr_in6 in6_any =
+ { .sin6_addr = IN6ADDR_ANY_INIT };
+
+ if (!memcmp(in6->sin6_addr.s6_addr,
+ in6_any.sin6_addr.s6_addr, 16))
+ return true;
+ } else if (addr->sa_family == AF_INET) {
+ struct sockaddr_in *in = (struct sockaddr_in *)addr;
+
+ if (in->sin_addr.s_addr == htonl(INADDR_ANY))
+ return true;
+ } else {
+ pr_warn("unexpected address family %u\n", addr->sa_family);
+ }
+
+ return false;
+}
+EXPORT_SYMBOL(inet_addr_is_any);
+
void inet_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb,
__be32 from, __be32 to, bool pseudohdr)
{
diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c
index 92d016e87816..385f153fe031 100644
--- a/net/dccp/ccids/ccid2.c
+++ b/net/dccp/ccids/ccid2.c
@@ -126,6 +126,16 @@ static void ccid2_change_l_seq_window(struct sock *sk, u64 val)
DCCPF_SEQ_WMAX));
}
+static void dccp_tasklet_schedule(struct sock *sk)
+{
+ struct tasklet_struct *t = &dccp_sk(sk)->dccps_xmitlet;
+
+ if (!test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) {
+ sock_hold(sk);
+ __tasklet_schedule(t);
+ }
+}
+
static void ccid2_hc_tx_rto_expire(struct timer_list *t)
{
struct ccid2_hc_tx_sock *hc = from_timer(hc, t, tx_rtotimer);
@@ -166,7 +176,7 @@ static void ccid2_hc_tx_rto_expire(struct timer_list *t)
/* if we were blocked before, we may now send cwnd=1 packet */
if (sender_was_blocked)
- tasklet_schedule(&dccp_sk(sk)->dccps_xmitlet);
+ dccp_tasklet_schedule(sk);
/* restart backed-off timer */
sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto);
out:
@@ -706,7 +716,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
done:
/* check if incoming Acks allow pending packets to be sent */
if (sender_was_blocked && !ccid2_cwnd_network_limited(hc))
- tasklet_schedule(&dccp_sk(sk)->dccps_xmitlet);
+ dccp_tasklet_schedule(sk);
dccp_ackvec_parsed_cleanup(&hc->tx_av_chunks);
}
diff --git a/net/dccp/timer.c b/net/dccp/timer.c
index b50a8732ff43..1501a20a94ca 100644
--- a/net/dccp/timer.c
+++ b/net/dccp/timer.c
@@ -232,6 +232,7 @@ static void dccp_write_xmitlet(unsigned long data)
else
dccp_write_xmit(sk);
bh_unlock_sock(sk);
+ sock_put(sk);
}
static void dccp_write_xmit_timer(struct timer_list *t)
@@ -240,7 +241,6 @@ static void dccp_write_xmit_timer(struct timer_list *t)
struct sock *sk = &dp->dccps_inet_connection.icsk_inet.sk;
dccp_write_xmitlet((unsigned long)sk);
- sock_put(sk);
}
void dccp_init_xmit_timers(struct sock *sk)
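Across these two dccp diffs the sock reference is rebalanced: dccp_tasklet_schedule() takes a reference only when it wins the TASKLET_STATE_SCHED bit, and the matching sock_put() moves into dccp_write_xmitlet() so it runs exactly once per scheduled execution. A hedged sketch of the pairing (names hypothetical):

static void xmitlet_body(unsigned long data)
{
	struct sock *sk = (struct sock *)data;

	bh_lock_sock(sk);
	/* ... transmit work ... */
	bh_unlock_sock(sk);
	sock_put(sk);	/* releases the ref taken when the tasklet was scheduled */
}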
diff --git a/net/dns_resolver/dns_key.c b/net/dns_resolver/dns_key.c
index 8396705deffc..40c851693f77 100644
--- a/net/dns_resolver/dns_key.c
+++ b/net/dns_resolver/dns_key.c
@@ -91,9 +91,9 @@ dns_resolver_preparse(struct key_preparsed_payload *prep)
next_opt = memchr(opt, '#', end - opt) ?: end;
opt_len = next_opt - opt;
- if (!opt_len) {
- printk(KERN_WARNING
- "Empty option to dns_resolver key\n");
+ if (opt_len <= 0 || opt_len > 128) {
+ pr_warn_ratelimited("Invalid option length (%d) for dns_resolver key\n",
+ opt_len);
return -EINVAL;
}
@@ -127,10 +127,8 @@ dns_resolver_preparse(struct key_preparsed_payload *prep)
}
bad_option_value:
- printk(KERN_WARNING
- "Option '%*.*s' to dns_resolver key:"
- " bad/missing value\n",
- opt_nlen, opt_nlen, opt);
+ pr_warn_ratelimited("Option '%*.*s' to dns_resolver key: bad/missing value\n",
+ opt_nlen, opt_nlen, opt);
return -EINVAL;
} while (opt = next_opt + 1, opt < end);
}
diff --git a/net/ife/ife.c b/net/ife/ife.c
index 7d1ec76e7f43..13bbf8cb6a39 100644
--- a/net/ife/ife.c
+++ b/net/ife/ife.c
@@ -69,6 +69,9 @@ void *ife_decode(struct sk_buff *skb, u16 *metalen)
int total_pull;
u16 ifehdrln;
+ if (!pskb_may_pull(skb, skb->dev->hard_header_len + IFE_METAHDRLEN))
+ return NULL;
+
ifehdr = (struct ifeheadr *) (skb->data + skb->dev->hard_header_len);
ifehdrln = ntohs(ifehdr->metalen);
total_pull = skb->dev->hard_header_len + ifehdrln;
@@ -92,12 +95,43 @@ struct meta_tlvhdr {
__be16 len;
};
+static bool __ife_tlv_meta_valid(const unsigned char *skbdata,
+ const unsigned char *ifehdr_end)
+{
+ const struct meta_tlvhdr *tlv;
+ u16 tlvlen;
+
+ if (unlikely(skbdata + sizeof(*tlv) > ifehdr_end))
+ return false;
+
+ tlv = (const struct meta_tlvhdr *)skbdata;
+ tlvlen = ntohs(tlv->len);
+
+ /* tlv length field is inc header, check on minimum */
+ if (tlvlen < NLA_HDRLEN)
+ return false;
+
+ /* overflow by NLA_ALIGN check */
+ if (NLA_ALIGN(tlvlen) < tlvlen)
+ return false;
+
+ if (unlikely(skbdata + NLA_ALIGN(tlvlen) > ifehdr_end))
+ return false;
+
+ return true;
+}
+
/* Caller takes care of presenting data in network order
*/
-void *ife_tlv_meta_decode(void *skbdata, u16 *attrtype, u16 *dlen, u16 *totlen)
+void *ife_tlv_meta_decode(void *skbdata, const void *ifehdr_end, u16 *attrtype,
+ u16 *dlen, u16 *totlen)
{
- struct meta_tlvhdr *tlv = (struct meta_tlvhdr *) skbdata;
+ struct meta_tlvhdr *tlv;
+
+ if (!__ife_tlv_meta_valid(skbdata, ifehdr_end))
+ return NULL;
+ tlv = (struct meta_tlvhdr *)skbdata;
*dlen = ntohs(tlv->len) - NLA_HDRLEN;
*attrtype = ntohs(tlv->type);
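With the bounds checks above, ife_tlv_meta_decode() returns NULL instead of reading past ifehdr_end on a truncated or oversized TLV, so a caller can walk the metadata safely. A hedged sketch of such a loop (the locals are assumptions, not from the patch):

void *cur = metadata;				/* first TLV */
const void *end = metadata + metalen;		/* end of the IFE header */
u16 attrtype, dlen, totlen;

while (cur < (void *)end) {
	void *data = ife_tlv_meta_decode(cur, end, &attrtype, &dlen, &totlen);
	if (!data)
		break;				/* malformed TLV: stop parsing */
	/* consume (attrtype, data[0..dlen)) here */
	cur += totlen;				/* totlen is the NLA_ALIGNed size */
}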
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index 1f04bd91fc2e..d757b9642d0d 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -211,6 +211,7 @@ struct inet_peer *inet_getpeer(struct inet_peer_base *base,
p = kmem_cache_alloc(peer_cachep, GFP_ATOMIC);
if (p) {
p->daddr = *daddr;
+ p->dtime = (__u32)jiffies;
refcount_set(&p->refcnt, 2);
atomic_set(&p->rid, 0);
p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW;
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index a8772a978224..9c169bb2444d 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -781,8 +781,14 @@ static void ipgre_link_update(struct net_device *dev, bool set_mtu)
tunnel->encap.type == TUNNEL_ENCAP_NONE) {
dev->features |= NETIF_F_GSO_SOFTWARE;
dev->hw_features |= NETIF_F_GSO_SOFTWARE;
+ } else {
+ dev->features &= ~NETIF_F_GSO_SOFTWARE;
+ dev->hw_features &= ~NETIF_F_GSO_SOFTWARE;
}
dev->features |= NETIF_F_LLTX;
+ } else {
+ dev->hw_features &= ~NETIF_F_GSO_SOFTWARE;
+ dev->features &= ~(NETIF_F_LLTX | NETIF_F_GSO_SOFTWARE);
}
}
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 4c11b810a447..83c73bab2c3d 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1109,6 +1109,10 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
struct ip_options_rcu *opt;
struct rtable *rt;
+ rt = *rtp;
+ if (unlikely(!rt))
+ return -EFAULT;
+
/*
* setup for corking.
*/
@@ -1124,9 +1128,7 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
cork->flags |= IPCORK_OPT;
cork->addr = ipc->addr;
}
- rt = *rtp;
- if (unlikely(!rt))
- return -EFAULT;
+
/*
* We steal reference to this route, caller should not release it
*/
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 7523ddb2566b..0e5edd0c7926 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -28,10 +28,9 @@ obj-$(CONFIG_NF_REJECT_IPV4) += nf_reject_ipv4.o
obj-$(CONFIG_NF_NAT_H323) += nf_nat_h323.o
obj-$(CONFIG_NF_NAT_PPTP) += nf_nat_pptp.o
-nf_nat_snmp_basic-y := nf_nat_snmp_basic-asn1.o nf_nat_snmp_basic_main.o
-$(obj)/nf_nat_snmp_basic_main.o: $(obj)/nf_nat_snmp_basic-asn1.h
+nf_nat_snmp_basic-y := nf_nat_snmp_basic.asn1.o nf_nat_snmp_basic_main.o
+$(obj)/nf_nat_snmp_basic_main.o: $(obj)/nf_nat_snmp_basic.asn1.h
obj-$(CONFIG_NF_NAT_SNMP_BASIC) += nf_nat_snmp_basic.o
-clean-files := nf_nat_snmp_basic-asn1.c nf_nat_snmp_basic-asn1.h
obj-$(CONFIG_NF_NAT_MASQUERADE_IPV4) += nf_nat_masquerade_ipv4.o
diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic_main.c b/net/ipv4/netfilter/nf_nat_snmp_basic_main.c
index b6e277093e7e..ac110c1d55b5 100644
--- a/net/ipv4/netfilter/nf_nat_snmp_basic_main.c
+++ b/net/ipv4/netfilter/nf_nat_snmp_basic_main.c
@@ -54,7 +54,7 @@
#include <net/netfilter/nf_conntrack_expect.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <linux/netfilter/nf_conntrack_snmp.h>
-#include "nf_nat_snmp_basic-asn1.h"
+#include "nf_nat_snmp_basic.asn1.h"
MODULE_LICENSE("GPL");
MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>");
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 59bc6ab1a4eb..1412a7baf0b9 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -108,7 +108,6 @@
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
-#include <linux/kmemleak.h>
#endif
#include <net/secure_seq.h>
#include <net/ip_tunnels.h>
@@ -710,7 +709,7 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
fnhe->fnhe_gw = gw;
fnhe->fnhe_pmtu = pmtu;
fnhe->fnhe_mtu_locked = lock;
- fnhe->fnhe_expires = expires;
+ fnhe->fnhe_expires = max(1UL, expires);
/* Exception created; mark the cached routes for the nexthop
* stale, so anyone caching it rechecks if this exception
@@ -1298,6 +1297,36 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst)
return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
}
+static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
+{
+ struct fnhe_hash_bucket *hash;
+ struct fib_nh_exception *fnhe, __rcu **fnhe_p;
+ u32 hval = fnhe_hashfun(daddr);
+
+ spin_lock_bh(&fnhe_lock);
+
+ hash = rcu_dereference_protected(nh->nh_exceptions,
+ lockdep_is_held(&fnhe_lock));
+ hash += hval;
+
+ fnhe_p = &hash->chain;
+ fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
+ while (fnhe) {
+ if (fnhe->fnhe_daddr == daddr) {
+ rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
+ fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
+ fnhe_flush_routes(fnhe);
+ kfree_rcu(fnhe, rcu);
+ break;
+ }
+ fnhe_p = &fnhe->fnhe_next;
+ fnhe = rcu_dereference_protected(fnhe->fnhe_next,
+ lockdep_is_held(&fnhe_lock));
+ }
+
+ spin_unlock_bh(&fnhe_lock);
+}
+
static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
{
struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
@@ -1311,8 +1340,14 @@ static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
fnhe = rcu_dereference(fnhe->fnhe_next)) {
- if (fnhe->fnhe_daddr == daddr)
+ if (fnhe->fnhe_daddr == daddr) {
+ if (fnhe->fnhe_expires &&
+ time_after(jiffies, fnhe->fnhe_expires)) {
+ ip_del_fnhe(nh, daddr);
+ break;
+ }
return fnhe;
+ }
}
return NULL;
}
@@ -1637,36 +1672,6 @@ static void ip_handle_martian_source(struct net_device *dev,
#endif
}
-static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
-{
- struct fnhe_hash_bucket *hash;
- struct fib_nh_exception *fnhe, __rcu **fnhe_p;
- u32 hval = fnhe_hashfun(daddr);
-
- spin_lock_bh(&fnhe_lock);
-
- hash = rcu_dereference_protected(nh->nh_exceptions,
- lockdep_is_held(&fnhe_lock));
- hash += hval;
-
- fnhe_p = &hash->chain;
- fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
- while (fnhe) {
- if (fnhe->fnhe_daddr == daddr) {
- rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
- fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
- fnhe_flush_routes(fnhe);
- kfree_rcu(fnhe, rcu);
- break;
- }
- fnhe_p = &fnhe->fnhe_next;
- fnhe = rcu_dereference_protected(fnhe->fnhe_next,
- lockdep_is_held(&fnhe_lock));
- }
-
- spin_unlock_bh(&fnhe_lock);
-}
-
/* called in rcu_read_lock() section */
static int __mkroute_input(struct sk_buff *skb,
const struct fib_result *res,
@@ -1720,20 +1725,10 @@ static int __mkroute_input(struct sk_buff *skb,
fnhe = find_exception(&FIB_RES_NH(*res), daddr);
if (do_cache) {
- if (fnhe) {
+ if (fnhe)
rth = rcu_dereference(fnhe->fnhe_rth_input);
- if (rth && rth->dst.expires &&
- time_after(jiffies, rth->dst.expires)) {
- ip_del_fnhe(&FIB_RES_NH(*res), daddr);
- fnhe = NULL;
- } else {
- goto rt_cache;
- }
- }
-
- rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
-
-rt_cache:
+ else
+ rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
if (rt_cache_valid(rth)) {
skb_dst_set_noref(skb, &rth->dst);
goto out;
@@ -2217,39 +2212,31 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
* the loopback interface and the IP_PKTINFO ipi_ifindex will
* be set to the loopback interface as well.
*/
- fi = NULL;
+ do_cache = false;
}
fnhe = NULL;
do_cache &= fi != NULL;
- if (do_cache) {
+ if (fi) {
struct rtable __rcu **prth;
struct fib_nh *nh = &FIB_RES_NH(*res);
fnhe = find_exception(nh, fl4->daddr);
+ if (!do_cache)
+ goto add;
if (fnhe) {
prth = &fnhe->fnhe_rth_output;
- rth = rcu_dereference(*prth);
- if (rth && rth->dst.expires &&
- time_after(jiffies, rth->dst.expires)) {
- ip_del_fnhe(nh, fl4->daddr);
- fnhe = NULL;
- } else {
- goto rt_cache;
+ } else {
+ if (unlikely(fl4->flowi4_flags &
+ FLOWI_FLAG_KNOWN_NH &&
+ !(nh->nh_gw &&
+ nh->nh_scope == RT_SCOPE_LINK))) {
+ do_cache = false;
+ goto add;
}
+ prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
}
-
- if (unlikely(fl4->flowi4_flags &
- FLOWI_FLAG_KNOWN_NH &&
- !(nh->nh_gw &&
- nh->nh_scope == RT_SCOPE_LINK))) {
- do_cache = false;
- goto add;
- }
- prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
rth = rcu_dereference(*prth);
-
-rt_cache:
if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
return rth;
}
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index bccc4c270087..c9d00ef54dec 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -697,7 +697,7 @@ static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb,
{
return skb->len < size_goal &&
sock_net(sk)->ipv4.sysctl_tcp_autocorking &&
- skb != tcp_write_queue_head(sk) &&
+ !tcp_rtx_queue_empty(sk) &&
refcount_read(&sk->sk_wmem_alloc) > skb->truesize;
}
@@ -1204,7 +1204,8 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
uarg->zerocopy = 0;
}
- if (unlikely(flags & MSG_FASTOPEN || inet_sk(sk)->defer_connect)) {
+ if (unlikely(flags & MSG_FASTOPEN || inet_sk(sk)->defer_connect) &&
+ !tp->repair) {
err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size);
if (err == -EINPROGRESS && copied_syn > 0)
goto out;
@@ -2368,6 +2369,7 @@ void tcp_write_queue_purge(struct sock *sk)
INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue);
sk_mem_reclaim(sk);
tcp_clear_all_retrans_hints(tcp_sk(sk));
+ tcp_sk(sk)->packets_out = 0;
}
int tcp_disconnect(struct sock *sk, int flags)
@@ -2417,7 +2419,6 @@ int tcp_disconnect(struct sock *sk, int flags)
icsk->icsk_backoff = 0;
tp->snd_cwnd = 2;
icsk->icsk_probes_out = 0;
- tp->packets_out = 0;
tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
tp->snd_cwnd_cnt = 0;
tp->window_clamp = 0;
@@ -2673,7 +2674,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
case TCP_REPAIR_QUEUE:
if (!tp->repair)
err = -EPERM;
- else if (val < TCP_QUEUES_NR)
+ else if ((unsigned int)val < TCP_QUEUES_NR)
tp->repair_queue = val;
else
err = -EINVAL;
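
The cast in the TCP_REPAIR_QUEUE hunk folds the missing negative-value check and the existing upper bound into a single comparison: any negative val converts to a huge unsigned number and fails the < TCP_QUEUES_NR test. A self-contained illustration (the bound is a stand-in, not the kernel constant):

#include <assert.h>

int main(void)
{
	unsigned int nr_queues = 3;	/* stand-in for TCP_QUEUES_NR */
	int val = -1;

	assert(!((unsigned int)val < nr_queues));	/* -1 wraps to UINT_MAX */
	assert((unsigned int)2 < nr_queues);		/* in-range values pass */
	return 0;
}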
@@ -2813,8 +2814,10 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
#ifdef CONFIG_TCP_MD5SIG
case TCP_MD5SIG:
case TCP_MD5SIG_EXT:
- /* Read the IP->Key mappings from userspace */
- err = tp->af_specific->md5_parse(sk, optname, optval, optlen);
+ if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))
+ err = tp->af_specific->md5_parse(sk, optname, optval, optlen);
+ else
+ err = -EINVAL;
break;
#endif
case TCP_USER_TIMEOUT:
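
The TCP_MD5SIG hunk above uses the state-bitmask idiom: each TCPF_* flag is 1 shifted by the corresponding TCP_* state, so a single AND tests whether sk_state belongs to a whole set of states. A hypothetical sketch of the idiom (enum values invented for illustration):

#include <stdio.h>

enum { ST_ESTABLISHED = 1, ST_CLOSE = 7, ST_LISTEN = 10 };
#define STF(state)	(1u << (state))	/* mirrors TCPF_* = 1 << TCP_* */

int main(void)
{
	unsigned int allowed = STF(ST_CLOSE) | STF(ST_LISTEN);

	printf("%d\n", !!(STF(ST_LISTEN) & allowed));		/* 1: accepted */
	printf("%d\n", !!(STF(ST_ESTABLISHED) & allowed));	/* 0: -EINVAL */
	return 0;
}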
diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
index 158d105e76da..58e2f479ffb4 100644
--- a/net/ipv4/tcp_bbr.c
+++ b/net/ipv4/tcp_bbr.c
@@ -806,7 +806,9 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs)
}
}
}
- bbr->idle_restart = 0;
+ /* Restart after idle ends only once we process a new S/ACK for data */
+ if (rs->delivered > 0)
+ bbr->idle_restart = 0;
}
static void bbr_update_model(struct sock *sk, const struct rate_sample *rs)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 367def6ddeda..e51c644484dc 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3868,11 +3868,8 @@ const u8 *tcp_parse_md5sig_option(const struct tcphdr *th)
int length = (th->doff << 2) - sizeof(*th);
const u8 *ptr = (const u8 *)(th + 1);
- /* If the TCP option is too short, we can short cut */
- if (length < TCPOLEN_MD5SIG)
- return NULL;
-
- while (length > 0) {
+ /* If not enough data remaining, we can short cut */
+ while (length >= TCPOLEN_MD5SIG) {
int opcode = *ptr++;
int opsize;
diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig
index ccbfa83e4bb0..ce77bcc2490c 100644
--- a/net/ipv6/netfilter/Kconfig
+++ b/net/ipv6/netfilter/Kconfig
@@ -48,6 +48,34 @@ config NFT_CHAIN_ROUTE_IPV6
fields such as the source, destination, flowlabel, hop-limit and
the packet mark.
+if NF_NAT_IPV6
+
+config NFT_CHAIN_NAT_IPV6
+ tristate "IPv6 nf_tables nat chain support"
+ help
+ This option enables the "nat" chain for IPv6 in nf_tables. This
+ chain type is used to perform Network Address Translation (NAT)
+ packet transformations, rewriting the source and destination
+ addresses and ports.
+
+config NFT_MASQ_IPV6
+ tristate "IPv6 masquerade support for nf_tables"
+ depends on NFT_MASQ
+ select NF_NAT_MASQUERADE_IPV6
+ help
+ This is the expression that provides IPv6 masquerading support for
+ nf_tables.
+
+config NFT_REDIR_IPV6
+ tristate "IPv6 redirect support for nf_tables"
+ depends on NFT_REDIR
+ select NF_NAT_REDIRECT
+ help
+ This is the expression that provides IPv6 redirect support for
+ nf_tables.
+
+endif # NF_NAT_IPV6
+
config NFT_REJECT_IPV6
select NF_REJECT_IPV6
default NFT_REJECT
@@ -107,39 +135,12 @@ config NF_NAT_IPV6
if NF_NAT_IPV6
-config NFT_CHAIN_NAT_IPV6
- depends on NF_TABLES_IPV6
- tristate "IPv6 nf_tables nat chain support"
- help
- This option enables the "nat" chain for IPv6 in nf_tables. This
- chain type is used to perform Network Address Translation (NAT)
- packet transformations such as the source, destination address and
- source and destination ports.
-
config NF_NAT_MASQUERADE_IPV6
tristate "IPv6 masquerade support"
help
This is the kernel functionality to provide NAT in the masquerade
flavour (automatic source address selection) for IPv6.
-config NFT_MASQ_IPV6
- tristate "IPv6 masquerade support for nf_tables"
- depends on NF_TABLES_IPV6
- depends on NFT_MASQ
- select NF_NAT_MASQUERADE_IPV6
- help
- This is the expression that provides IPv4 masquerading support for
- nf_tables.
-
-config NFT_REDIR_IPV6
- tristate "IPv6 redirect support for nf_tables"
- depends on NF_TABLES_IPV6
- depends on NFT_REDIR
- select NF_NAT_REDIRECT
- help
- This is the expression that provides IPv4 redirect support for
- nf_tables.
-
endif # NF_NAT_IPV6
config IP6_NF_IPTABLES
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 49b954d6d0fa..f4d61736c41a 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1835,11 +1835,16 @@ static void ip6_multipath_l3_keys(const struct sk_buff *skb,
const struct ipv6hdr *inner_iph;
const struct icmp6hdr *icmph;
struct ipv6hdr _inner_iph;
+ struct icmp6hdr _icmph;
if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
goto out;
- icmph = icmp6_hdr(skb);
+ icmph = skb_header_pointer(skb, skb_transport_offset(skb),
+ sizeof(_icmph), &_icmph);
+ if (!icmph)
+ goto out;
+
if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
@@ -3975,6 +3980,7 @@ void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
[RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
+ [RTA_PREFSRC] = { .len = sizeof(struct in6_addr) },
[RTA_OIF] = { .type = NLA_U32 },
[RTA_IIF] = { .type = NLA_U32 },
[RTA_PRIORITY] = { .type = NLA_U32 },
@@ -3986,6 +3992,7 @@ static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
[RTA_EXPIRES] = { .type = NLA_U32 },
[RTA_UID] = { .type = NLA_U32 },
[RTA_MARK] = { .type = NLA_U32 },
+ [RTA_TABLE] = { .type = NLA_U32 },
};
static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c
index f343e6f0fc95..5fe139484919 100644
--- a/net/ipv6/seg6_iptunnel.c
+++ b/net/ipv6/seg6_iptunnel.c
@@ -136,7 +136,7 @@ int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto)
isrh->nexthdr = proto;
hdr->daddr = isrh->segments[isrh->first_segment];
- set_tun_src(net, ip6_dst_idev(dst)->dev, &hdr->daddr, &hdr->saddr);
+ set_tun_src(net, dst->dev, &hdr->daddr, &hdr->saddr);
#ifdef CONFIG_IPV6_SEG6_HMAC
if (sr_has_hmac(isrh)) {
diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c
index 14b67dfacc4b..40261cb68e83 100644
--- a/net/l2tp/l2tp_core.c
+++ b/net/l2tp/l2tp_core.c
@@ -183,6 +183,26 @@ struct l2tp_tunnel *l2tp_tunnel_get(const struct net *net, u32 tunnel_id)
}
EXPORT_SYMBOL_GPL(l2tp_tunnel_get);
+struct l2tp_tunnel *l2tp_tunnel_get_nth(const struct net *net, int nth)
+{
+ const struct l2tp_net *pn = l2tp_pernet(net);
+ struct l2tp_tunnel *tunnel;
+ int count = 0;
+
+ rcu_read_lock_bh();
+ list_for_each_entry_rcu(tunnel, &pn->l2tp_tunnel_list, list) {
+ if (++count > nth) {
+ l2tp_tunnel_inc_refcount(tunnel);
+ rcu_read_unlock_bh();
+ return tunnel;
+ }
+ }
+ rcu_read_unlock_bh();
+
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(l2tp_tunnel_get_nth);
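
Unlike the l2tp_tunnel_find_nth() it replaces, l2tp_tunnel_get_nth() returns with a reference held on the tunnel, taken while the RCU read lock still protects the list walk. A minimal sketch of the resulting caller pattern, assuming only the helpers visible in this patch (illustrative, not a complete kernel module):

static void walk_tunnels(const struct net *net)
{
	struct l2tp_tunnel *tunnel;
	int i = 0;

	while ((tunnel = l2tp_tunnel_get_nth(net, i++)) != NULL) {
		/* ... use the tunnel; the reference keeps it alive ... */
		l2tp_tunnel_dec_refcount(tunnel);
	}
}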
+
/* Lookup a session. A new reference is held on the returned session. */
struct l2tp_session *l2tp_session_get(const struct net *net,
struct l2tp_tunnel *tunnel,
@@ -335,46 +355,6 @@ err_tlock:
}
EXPORT_SYMBOL_GPL(l2tp_session_register);
-/* Lookup a tunnel by id
- */
-struct l2tp_tunnel *l2tp_tunnel_find(const struct net *net, u32 tunnel_id)
-{
- struct l2tp_tunnel *tunnel;
- struct l2tp_net *pn = l2tp_pernet(net);
-
- rcu_read_lock_bh();
- list_for_each_entry_rcu(tunnel, &pn->l2tp_tunnel_list, list) {
- if (tunnel->tunnel_id == tunnel_id) {
- rcu_read_unlock_bh();
- return tunnel;
- }
- }
- rcu_read_unlock_bh();
-
- return NULL;
-}
-EXPORT_SYMBOL_GPL(l2tp_tunnel_find);
-
-struct l2tp_tunnel *l2tp_tunnel_find_nth(const struct net *net, int nth)
-{
- struct l2tp_net *pn = l2tp_pernet(net);
- struct l2tp_tunnel *tunnel;
- int count = 0;
-
- rcu_read_lock_bh();
- list_for_each_entry_rcu(tunnel, &pn->l2tp_tunnel_list, list) {
- if (++count > nth) {
- rcu_read_unlock_bh();
- return tunnel;
- }
- }
-
- rcu_read_unlock_bh();
-
- return NULL;
-}
-EXPORT_SYMBOL_GPL(l2tp_tunnel_find_nth);
-
/*****************************************************************************
* Receive data handling
*****************************************************************************/
@@ -1436,74 +1416,11 @@ int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id, u32
{
struct l2tp_tunnel *tunnel = NULL;
int err;
- struct socket *sock = NULL;
- struct sock *sk = NULL;
- struct l2tp_net *pn;
enum l2tp_encap_type encap = L2TP_ENCAPTYPE_UDP;
- /* Get the tunnel socket from the fd, which was opened by
- * the userspace L2TP daemon. If not specified, create a
- * kernel socket.
- */
- if (fd < 0) {
- err = l2tp_tunnel_sock_create(net, tunnel_id, peer_tunnel_id,
- cfg, &sock);
- if (err < 0)
- goto err;
- } else {
- sock = sockfd_lookup(fd, &err);
- if (!sock) {
- pr_err("tunl %u: sockfd_lookup(fd=%d) returned %d\n",
- tunnel_id, fd, err);
- err = -EBADF;
- goto err;
- }
-
- /* Reject namespace mismatches */
- if (!net_eq(sock_net(sock->sk), net)) {
- pr_err("tunl %u: netns mismatch\n", tunnel_id);
- err = -EINVAL;
- goto err;
- }
- }
-
- sk = sock->sk;
-
if (cfg != NULL)
encap = cfg->encap;
- /* Quick sanity checks */
- err = -EPROTONOSUPPORT;
- if (sk->sk_type != SOCK_DGRAM) {
- pr_debug("tunl %hu: fd %d wrong socket type\n",
- tunnel_id, fd);
- goto err;
- }
- switch (encap) {
- case L2TP_ENCAPTYPE_UDP:
- if (sk->sk_protocol != IPPROTO_UDP) {
- pr_err("tunl %hu: fd %d wrong protocol, got %d, expected %d\n",
- tunnel_id, fd, sk->sk_protocol, IPPROTO_UDP);
- goto err;
- }
- break;
- case L2TP_ENCAPTYPE_IP:
- if (sk->sk_protocol != IPPROTO_L2TP) {
- pr_err("tunl %hu: fd %d wrong protocol, got %d, expected %d\n",
- tunnel_id, fd, sk->sk_protocol, IPPROTO_L2TP);
- goto err;
- }
- break;
- }
-
- /* Check if this socket has already been prepped */
- tunnel = l2tp_tunnel(sk);
- if (tunnel != NULL) {
- /* This socket has already been prepped */
- err = -EBUSY;
- goto err;
- }
-
tunnel = kzalloc(sizeof(struct l2tp_tunnel), GFP_KERNEL);
if (tunnel == NULL) {
err = -ENOMEM;
@@ -1520,72 +1437,126 @@ int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id, u32
rwlock_init(&tunnel->hlist_lock);
tunnel->acpt_newsess = true;
- /* The net we belong to */
- tunnel->l2tp_net = net;
- pn = l2tp_pernet(net);
-
if (cfg != NULL)
tunnel->debug = cfg->debug;
- /* Mark socket as an encapsulation socket. See net/ipv4/udp.c */
tunnel->encap = encap;
- if (encap == L2TP_ENCAPTYPE_UDP) {
- struct udp_tunnel_sock_cfg udp_cfg = { };
-
- udp_cfg.sk_user_data = tunnel;
- udp_cfg.encap_type = UDP_ENCAP_L2TPINUDP;
- udp_cfg.encap_rcv = l2tp_udp_encap_recv;
- udp_cfg.encap_destroy = l2tp_udp_encap_destroy;
-
- setup_udp_tunnel_sock(net, sock, &udp_cfg);
- } else {
- sk->sk_user_data = tunnel;
- }
- /* Bump the reference count. The tunnel context is deleted
- * only when this drops to zero. A reference is also held on
- * the tunnel socket to ensure that it is not released while
- * the tunnel is extant. Must be done before sk_destruct is
- * set.
- */
refcount_set(&tunnel->ref_count, 1);
- sock_hold(sk);
- tunnel->sock = sk;
tunnel->fd = fd;
- /* Hook on the tunnel socket destructor so that we can cleanup
- * if the tunnel socket goes away.
- */
- tunnel->old_sk_destruct = sk->sk_destruct;
- sk->sk_destruct = &l2tp_tunnel_destruct;
- lockdep_set_class_and_name(&sk->sk_lock.slock, &l2tp_socket_class, "l2tp_sock");
-
- sk->sk_allocation = GFP_ATOMIC;
-
/* Init delete workqueue struct */
INIT_WORK(&tunnel->del_work, l2tp_tunnel_del_work);
- /* Add tunnel to our list */
INIT_LIST_HEAD(&tunnel->list);
- spin_lock_bh(&pn->l2tp_tunnel_list_lock);
- list_add_rcu(&tunnel->list, &pn->l2tp_tunnel_list);
- spin_unlock_bh(&pn->l2tp_tunnel_list_lock);
err = 0;
err:
if (tunnelp)
*tunnelp = tunnel;
- /* If tunnel's socket was created by the kernel, it doesn't
- * have a file.
- */
- if (sock && sock->file)
- sockfd_put(sock);
-
return err;
}
EXPORT_SYMBOL_GPL(l2tp_tunnel_create);
+static int l2tp_validate_socket(const struct sock *sk, const struct net *net,
+ enum l2tp_encap_type encap)
+{
+ if (!net_eq(sock_net(sk), net))
+ return -EINVAL;
+
+ if (sk->sk_type != SOCK_DGRAM)
+ return -EPROTONOSUPPORT;
+
+ if ((encap == L2TP_ENCAPTYPE_UDP && sk->sk_protocol != IPPROTO_UDP) ||
+ (encap == L2TP_ENCAPTYPE_IP && sk->sk_protocol != IPPROTO_L2TP))
+ return -EPROTONOSUPPORT;
+
+ if (sk->sk_user_data)
+ return -EBUSY;
+
+ return 0;
+}
+
+int l2tp_tunnel_register(struct l2tp_tunnel *tunnel, struct net *net,
+ struct l2tp_tunnel_cfg *cfg)
+{
+ struct l2tp_tunnel *tunnel_walk;
+ struct l2tp_net *pn;
+ struct socket *sock;
+ struct sock *sk;
+ int ret;
+
+ if (tunnel->fd < 0) {
+ ret = l2tp_tunnel_sock_create(net, tunnel->tunnel_id,
+ tunnel->peer_tunnel_id, cfg,
+ &sock);
+ if (ret < 0)
+ goto err;
+ } else {
+ sock = sockfd_lookup(tunnel->fd, &ret);
+ if (!sock)
+ goto err;
+
+ ret = l2tp_validate_socket(sock->sk, net, tunnel->encap);
+ if (ret < 0)
+ goto err_sock;
+ }
+
+ sk = sock->sk;
+
+ sock_hold(sk);
+ tunnel->sock = sk;
+ tunnel->l2tp_net = net;
+
+ pn = l2tp_pernet(net);
+
+ spin_lock_bh(&pn->l2tp_tunnel_list_lock);
+ list_for_each_entry(tunnel_walk, &pn->l2tp_tunnel_list, list) {
+ if (tunnel_walk->tunnel_id == tunnel->tunnel_id) {
+ spin_unlock_bh(&pn->l2tp_tunnel_list_lock);
+
+ ret = -EEXIST;
+ goto err_sock;
+ }
+ }
+ list_add_rcu(&tunnel->list, &pn->l2tp_tunnel_list);
+ spin_unlock_bh(&pn->l2tp_tunnel_list_lock);
+
+ if (tunnel->encap == L2TP_ENCAPTYPE_UDP) {
+ struct udp_tunnel_sock_cfg udp_cfg = {
+ .sk_user_data = tunnel,
+ .encap_type = UDP_ENCAP_L2TPINUDP,
+ .encap_rcv = l2tp_udp_encap_recv,
+ .encap_destroy = l2tp_udp_encap_destroy,
+ };
+
+ setup_udp_tunnel_sock(net, sock, &udp_cfg);
+ } else {
+ sk->sk_user_data = tunnel;
+ }
+
+ tunnel->old_sk_destruct = sk->sk_destruct;
+ sk->sk_destruct = &l2tp_tunnel_destruct;
+ lockdep_set_class_and_name(&sk->sk_lock.slock, &l2tp_socket_class,
+ "l2tp_sock");
+ sk->sk_allocation = GFP_ATOMIC;
+
+ if (tunnel->fd >= 0)
+ sockfd_put(sock);
+
+ return 0;
+
+err_sock:
+ if (tunnel->fd < 0)
+ sock_release(sock);
+ else
+ sockfd_put(sock);
+err:
+ return ret;
+}
+EXPORT_SYMBOL_GPL(l2tp_tunnel_register);
+
/* This function is used by the netlink TUNNEL_DELETE command.
*/
void l2tp_tunnel_delete(struct l2tp_tunnel *tunnel)
diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h
index 2718d0b284d0..c199020f8a8a 100644
--- a/net/l2tp/l2tp_core.h
+++ b/net/l2tp/l2tp_core.h
@@ -212,6 +212,8 @@ static inline void *l2tp_session_priv(struct l2tp_session *session)
}
struct l2tp_tunnel *l2tp_tunnel_get(const struct net *net, u32 tunnel_id);
+struct l2tp_tunnel *l2tp_tunnel_get_nth(const struct net *net, int nth);
+
void l2tp_tunnel_free(struct l2tp_tunnel *tunnel);
struct l2tp_session *l2tp_session_get(const struct net *net,
@@ -220,12 +222,13 @@ struct l2tp_session *l2tp_session_get(const struct net *net,
struct l2tp_session *l2tp_session_get_nth(struct l2tp_tunnel *tunnel, int nth);
struct l2tp_session *l2tp_session_get_by_ifname(const struct net *net,
const char *ifname);
-struct l2tp_tunnel *l2tp_tunnel_find(const struct net *net, u32 tunnel_id);
-struct l2tp_tunnel *l2tp_tunnel_find_nth(const struct net *net, int nth);
int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id,
u32 peer_tunnel_id, struct l2tp_tunnel_cfg *cfg,
struct l2tp_tunnel **tunnelp);
+int l2tp_tunnel_register(struct l2tp_tunnel *tunnel, struct net *net,
+ struct l2tp_tunnel_cfg *cfg);
+
void l2tp_tunnel_closeall(struct l2tp_tunnel *tunnel);
void l2tp_tunnel_delete(struct l2tp_tunnel *tunnel);
struct l2tp_session *l2tp_session_create(int priv_size,
diff --git a/net/l2tp/l2tp_debugfs.c b/net/l2tp/l2tp_debugfs.c
index 72e713da4733..7f1e842ef05a 100644
--- a/net/l2tp/l2tp_debugfs.c
+++ b/net/l2tp/l2tp_debugfs.c
@@ -47,7 +47,11 @@ struct l2tp_dfs_seq_data {
static void l2tp_dfs_next_tunnel(struct l2tp_dfs_seq_data *pd)
{
- pd->tunnel = l2tp_tunnel_find_nth(pd->net, pd->tunnel_idx);
+ /* Drop reference taken during previous invocation */
+ if (pd->tunnel)
+ l2tp_tunnel_dec_refcount(pd->tunnel);
+
+ pd->tunnel = l2tp_tunnel_get_nth(pd->net, pd->tunnel_idx);
pd->tunnel_idx++;
}
@@ -96,7 +100,17 @@ static void *l2tp_dfs_seq_next(struct seq_file *m, void *v, loff_t *pos)
static void l2tp_dfs_seq_stop(struct seq_file *p, void *v)
{
- /* nothing to do */
+ struct l2tp_dfs_seq_data *pd = v;
+
+ if (!pd || pd == SEQ_START_TOKEN)
+ return;
+
+ /* Drop reference taken by last invocation of l2tp_dfs_next_tunnel() */
+ if (pd->tunnel) {
+ l2tp_tunnel_dec_refcount(pd->tunnel);
+ pd->tunnel = NULL;
+ pd->session = NULL;
+ }
}
static void l2tp_dfs_seq_tunnel_show(struct seq_file *m, void *v)
diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c
index e7ea9c4b89ff..6616c9fd292f 100644
--- a/net/l2tp/l2tp_netlink.c
+++ b/net/l2tp/l2tp_netlink.c
@@ -236,12 +236,6 @@ static int l2tp_nl_cmd_tunnel_create(struct sk_buff *skb, struct genl_info *info
if (info->attrs[L2TP_ATTR_DEBUG])
cfg.debug = nla_get_u32(info->attrs[L2TP_ATTR_DEBUG]);
- tunnel = l2tp_tunnel_find(net, tunnel_id);
- if (tunnel != NULL) {
- ret = -EEXIST;
- goto out;
- }
-
ret = -EINVAL;
switch (cfg.encap) {
case L2TP_ENCAPTYPE_UDP:
@@ -251,9 +245,19 @@ static int l2tp_nl_cmd_tunnel_create(struct sk_buff *skb, struct genl_info *info
break;
}
- if (ret >= 0)
- ret = l2tp_tunnel_notify(&l2tp_nl_family, info,
- tunnel, L2TP_CMD_TUNNEL_CREATE);
+ if (ret < 0)
+ goto out;
+
+ l2tp_tunnel_inc_refcount(tunnel);
+ ret = l2tp_tunnel_register(tunnel, net, &cfg);
+ if (ret < 0) {
+ kfree(tunnel);
+ goto out;
+ }
+ ret = l2tp_tunnel_notify(&l2tp_nl_family, info, tunnel,
+ L2TP_CMD_TUNNEL_CREATE);
+ l2tp_tunnel_dec_refcount(tunnel);
+
out:
return ret;
}
@@ -483,14 +487,17 @@ static int l2tp_nl_cmd_tunnel_dump(struct sk_buff *skb, struct netlink_callback
struct net *net = sock_net(skb->sk);
for (;;) {
- tunnel = l2tp_tunnel_find_nth(net, ti);
+ tunnel = l2tp_tunnel_get_nth(net, ti);
if (tunnel == NULL)
goto out;
if (l2tp_nl_tunnel_send(skb, NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, NLM_F_MULTI,
- tunnel, L2TP_CMD_TUNNEL_GET) < 0)
+ tunnel, L2TP_CMD_TUNNEL_GET) < 0) {
+ l2tp_tunnel_dec_refcount(tunnel);
goto out;
+ }
+ l2tp_tunnel_dec_refcount(tunnel);
ti++;
}
@@ -844,7 +851,7 @@ static int l2tp_nl_cmd_session_dump(struct sk_buff *skb, struct netlink_callback
for (;;) {
if (tunnel == NULL) {
- tunnel = l2tp_tunnel_find_nth(net, ti);
+ tunnel = l2tp_tunnel_get_nth(net, ti);
if (tunnel == NULL)
goto out;
}
@@ -852,6 +859,7 @@ static int l2tp_nl_cmd_session_dump(struct sk_buff *skb, struct netlink_callback
session = l2tp_session_get_nth(tunnel, si);
if (session == NULL) {
ti++;
+ l2tp_tunnel_dec_refcount(tunnel);
tunnel = NULL;
si = 0;
continue;
@@ -861,6 +869,7 @@ static int l2tp_nl_cmd_session_dump(struct sk_buff *skb, struct netlink_callback
cb->nlh->nlmsg_seq, NLM_F_MULTI,
session, L2TP_CMD_SESSION_GET) < 0) {
l2tp_session_dec_refcount(session);
+ l2tp_tunnel_dec_refcount(tunnel);
break;
}
l2tp_session_dec_refcount(session);
diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c
index d6deca11da19..1fd9e145076a 100644
--- a/net/l2tp/l2tp_ppp.c
+++ b/net/l2tp/l2tp_ppp.c
@@ -619,6 +619,13 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr,
lock_sock(sk);
error = -EINVAL;
+
+ if (sockaddr_len != sizeof(struct sockaddr_pppol2tp) &&
+ sockaddr_len != sizeof(struct sockaddr_pppol2tpv3) &&
+ sockaddr_len != sizeof(struct sockaddr_pppol2tpin6) &&
+ sockaddr_len != sizeof(struct sockaddr_pppol2tpv3in6))
+ goto end;
+
if (sp->sa_protocol != PX_PROTO_OL2TP)
goto end;
@@ -698,6 +705,15 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr,
error = l2tp_tunnel_create(sock_net(sk), fd, ver, tunnel_id, peer_tunnel_id, &tcfg, &tunnel);
if (error < 0)
goto end;
+
+ l2tp_tunnel_inc_refcount(tunnel);
+ error = l2tp_tunnel_register(tunnel, sock_net(sk),
+ &tcfg);
+ if (error < 0) {
+ kfree(tunnel);
+ goto end;
+ }
+ drop_tunnel = true;
}
} else {
/* Error if we can't find the tunnel */
@@ -1542,16 +1558,19 @@ struct pppol2tp_seq_data {
static void pppol2tp_next_tunnel(struct net *net, struct pppol2tp_seq_data *pd)
{
+ /* Drop reference taken during previous invocation */
+ if (pd->tunnel)
+ l2tp_tunnel_dec_refcount(pd->tunnel);
+
for (;;) {
- pd->tunnel = l2tp_tunnel_find_nth(net, pd->tunnel_idx);
+ pd->tunnel = l2tp_tunnel_get_nth(net, pd->tunnel_idx);
pd->tunnel_idx++;
- if (pd->tunnel == NULL)
- break;
+ /* Only accept L2TPv2 tunnels */
+ if (!pd->tunnel || pd->tunnel->version == 2)
+ return;
- /* Ignore L2TPv3 tunnels */
- if (pd->tunnel->version < 3)
- break;
+ l2tp_tunnel_dec_refcount(pd->tunnel);
}
}
@@ -1600,7 +1619,17 @@ static void *pppol2tp_seq_next(struct seq_file *m, void *v, loff_t *pos)
static void pppol2tp_seq_stop(struct seq_file *p, void *v)
{
- /* nothing to do */
+ struct pppol2tp_seq_data *pd = v;
+
+ if (!pd || pd == SEQ_START_TOKEN)
+ return;
+
+ /* Drop reference taken by last invocation of pppol2tp_next_tunnel() */
+ if (pd->tunnel) {
+ l2tp_tunnel_dec_refcount(pd->tunnel);
+ pd->tunnel = NULL;
+ pd->session = NULL;
+ }
}
static void pppol2tp_seq_tunnel_show(struct seq_file *m, void *v)
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index 01dcc0823d1f..cb80ebb38311 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -199,9 +199,19 @@ static int llc_ui_release(struct socket *sock)
llc->laddr.lsap, llc->daddr.lsap);
if (!llc_send_disc(sk))
llc_ui_wait_for_disc(sk, sk->sk_rcvtimeo);
- if (!sock_flag(sk, SOCK_ZAPPED))
+ if (!sock_flag(sk, SOCK_ZAPPED)) {
+ struct llc_sap *sap = llc->sap;
+
+ /* Hold this for release_sock(), so that llc_backlog_rcv()
+ * can still use it.
+ */
+ llc_sap_hold(sap);
llc_sap_remove_socket(llc->sap, sk);
- release_sock(sk);
+ release_sock(sk);
+ llc_sap_put(sap);
+ } else {
+ release_sock(sk);
+ }
if (llc->dev)
dev_put(llc->dev);
sock_put(sk);
diff --git a/net/llc/llc_c_ac.c b/net/llc/llc_c_ac.c
index 163121192aca..4d78375f9872 100644
--- a/net/llc/llc_c_ac.c
+++ b/net/llc/llc_c_ac.c
@@ -1099,14 +1099,7 @@ int llc_conn_ac_inc_tx_win_size(struct sock *sk, struct sk_buff *skb)
int llc_conn_ac_stop_all_timers(struct sock *sk, struct sk_buff *skb)
{
- struct llc_sock *llc = llc_sk(sk);
-
- del_timer(&llc->pf_cycle_timer.timer);
- del_timer(&llc->ack_timer.timer);
- del_timer(&llc->rej_sent_timer.timer);
- del_timer(&llc->busy_state_timer.timer);
- llc->ack_must_be_send = 0;
- llc->ack_pf = 0;
+ llc_sk_stop_all_timers(sk, false);
return 0;
}
diff --git a/net/llc/llc_conn.c b/net/llc/llc_conn.c
index 110e32bcb399..c0ac522b48a1 100644
--- a/net/llc/llc_conn.c
+++ b/net/llc/llc_conn.c
@@ -961,6 +961,26 @@ out:
return sk;
}
+void llc_sk_stop_all_timers(struct sock *sk, bool sync)
+{
+ struct llc_sock *llc = llc_sk(sk);
+
+ if (sync) {
+ del_timer_sync(&llc->pf_cycle_timer.timer);
+ del_timer_sync(&llc->ack_timer.timer);
+ del_timer_sync(&llc->rej_sent_timer.timer);
+ del_timer_sync(&llc->busy_state_timer.timer);
+ } else {
+ del_timer(&llc->pf_cycle_timer.timer);
+ del_timer(&llc->ack_timer.timer);
+ del_timer(&llc->rej_sent_timer.timer);
+ del_timer(&llc->busy_state_timer.timer);
+ }
+
+ llc->ack_must_be_send = 0;
+ llc->ack_pf = 0;
+}
+
/**
* llc_sk_free - Frees a LLC socket
* @sk - socket to free
@@ -973,7 +993,7 @@ void llc_sk_free(struct sock *sk)
llc->state = LLC_CONN_OUT_OF_SVC;
/* Stop all (possibly) running timers */
- llc_conn_ac_stop_all_timers(sk, NULL);
+ llc_sk_stop_all_timers(sk, true);
#ifdef DEBUG_LLC_CONN_ALLOC
printk(KERN_INFO "%s: unackq=%d, txq=%d\n", __func__,
skb_queue_len(&llc->pdu_unack_q),
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 704b3832dbad..44d8a55e9721 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -594,6 +594,7 @@ config NFT_QUOTA
config NFT_REJECT
default m if NETFILTER_ADVANCED=n
tristate "Netfilter nf_tables reject support"
+ depends on !NF_TABLES_INET || (IPV6!=m || m)
help
This option adds the "reject" expression that you can use to
explicitly deny and notify via TCP reset/ICMP informational errors
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 5ebde4b15810..f36098887ad0 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -2384,11 +2384,7 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
strlcpy(cfg.mcast_ifn, dm->mcast_ifn,
sizeof(cfg.mcast_ifn));
cfg.syncid = dm->syncid;
- rtnl_lock();
- mutex_lock(&ipvs->sync_mutex);
ret = start_sync_thread(ipvs, &cfg, dm->state);
- mutex_unlock(&ipvs->sync_mutex);
- rtnl_unlock();
} else {
mutex_lock(&ipvs->sync_mutex);
ret = stop_sync_thread(ipvs, dm->state);
@@ -3481,12 +3477,8 @@ static int ip_vs_genl_new_daemon(struct netns_ipvs *ipvs, struct nlattr **attrs)
if (ipvs->mixed_address_family_dests > 0)
return -EINVAL;
- rtnl_lock();
- mutex_lock(&ipvs->sync_mutex);
ret = start_sync_thread(ipvs, &c,
nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]));
- mutex_unlock(&ipvs->sync_mutex);
- rtnl_unlock();
return ret;
}
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index fbaf3bd05b2e..001501e25625 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -49,6 +49,7 @@
#include <linux/kthread.h>
#include <linux/wait.h>
#include <linux/kernel.h>
+#include <linux/sched/signal.h>
#include <asm/unaligned.h> /* Used for ntoh_seq and hton_seq */
@@ -1360,15 +1361,9 @@ static void set_mcast_pmtudisc(struct sock *sk, int val)
/*
* Specify default interface for outgoing multicasts
*/
-static int set_mcast_if(struct sock *sk, char *ifname)
+static int set_mcast_if(struct sock *sk, struct net_device *dev)
{
- struct net_device *dev;
struct inet_sock *inet = inet_sk(sk);
- struct net *net = sock_net(sk);
-
- dev = __dev_get_by_name(net, ifname);
- if (!dev)
- return -ENODEV;
if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
return -EINVAL;
@@ -1396,19 +1391,14 @@ static int set_mcast_if(struct sock *sk, char *ifname)
* in the in_addr structure passed in as a parameter.
*/
static int
-join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname)
+join_mcast_group(struct sock *sk, struct in_addr *addr, struct net_device *dev)
{
- struct net *net = sock_net(sk);
struct ip_mreqn mreq;
- struct net_device *dev;
int ret;
memset(&mreq, 0, sizeof(mreq));
memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr));
- dev = __dev_get_by_name(net, ifname);
- if (!dev)
- return -ENODEV;
if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
return -EINVAL;
@@ -1423,15 +1413,10 @@ join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname)
#ifdef CONFIG_IP_VS_IPV6
static int join_mcast_group6(struct sock *sk, struct in6_addr *addr,
- char *ifname)
+ struct net_device *dev)
{
- struct net *net = sock_net(sk);
- struct net_device *dev;
int ret;
- dev = __dev_get_by_name(net, ifname);
- if (!dev)
- return -ENODEV;
if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
return -EINVAL;
@@ -1443,24 +1428,18 @@ static int join_mcast_group6(struct sock *sk, struct in6_addr *addr,
}
#endif
-static int bind_mcastif_addr(struct socket *sock, char *ifname)
+static int bind_mcastif_addr(struct socket *sock, struct net_device *dev)
{
- struct net *net = sock_net(sock->sk);
- struct net_device *dev;
__be32 addr;
struct sockaddr_in sin;
- dev = __dev_get_by_name(net, ifname);
- if (!dev)
- return -ENODEV;
-
addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
if (!addr)
pr_err("You probably need to specify IP address on "
"multicast interface.\n");
IP_VS_DBG(7, "binding socket with (%s) %pI4\n",
- ifname, &addr);
+ dev->name, &addr);
/* Now bind the socket with the address of multicast interface */
sin.sin_family = AF_INET;
@@ -1493,7 +1472,8 @@ static void get_mcast_sockaddr(union ipvs_sockaddr *sa, int *salen,
/*
* Set up sending multicast socket over UDP
*/
-static struct socket *make_send_sock(struct netns_ipvs *ipvs, int id)
+static int make_send_sock(struct netns_ipvs *ipvs, int id,
+ struct net_device *dev, struct socket **sock_ret)
{
/* multicast addr */
union ipvs_sockaddr mcast_addr;
@@ -1505,9 +1485,10 @@ static struct socket *make_send_sock(struct netns_ipvs *ipvs, int id)
IPPROTO_UDP, &sock);
if (result < 0) {
pr_err("Error during creation of socket; terminating\n");
- return ERR_PTR(result);
+ goto error;
}
- result = set_mcast_if(sock->sk, ipvs->mcfg.mcast_ifn);
+ *sock_ret = sock;
+ result = set_mcast_if(sock->sk, dev);
if (result < 0) {
pr_err("Error setting outbound mcast interface\n");
goto error;
@@ -1522,7 +1503,7 @@ static struct socket *make_send_sock(struct netns_ipvs *ipvs, int id)
set_sock_size(sock->sk, 1, result);
if (AF_INET == ipvs->mcfg.mcast_af)
- result = bind_mcastif_addr(sock, ipvs->mcfg.mcast_ifn);
+ result = bind_mcastif_addr(sock, dev);
else
result = 0;
if (result < 0) {
@@ -1538,19 +1519,18 @@ static struct socket *make_send_sock(struct netns_ipvs *ipvs, int id)
goto error;
}
- return sock;
+ return 0;
error:
- sock_release(sock);
- return ERR_PTR(result);
+ return result;
}
/*
* Set up receiving multicast socket over UDP
*/
-static struct socket *make_receive_sock(struct netns_ipvs *ipvs, int id,
- int ifindex)
+static int make_receive_sock(struct netns_ipvs *ipvs, int id,
+ struct net_device *dev, struct socket **sock_ret)
{
/* multicast addr */
union ipvs_sockaddr mcast_addr;
@@ -1562,8 +1542,9 @@ static struct socket *make_receive_sock(struct netns_ipvs *ipvs, int id,
IPPROTO_UDP, &sock);
if (result < 0) {
pr_err("Error during creation of socket; terminating\n");
- return ERR_PTR(result);
+ goto error;
}
+ *sock_ret = sock;
/* it is equivalent to the REUSEADDR option in user-space */
sock->sk->sk_reuse = SK_CAN_REUSE;
result = sysctl_sync_sock_size(ipvs);
@@ -1571,7 +1552,7 @@ static struct socket *make_receive_sock(struct netns_ipvs *ipvs, int id,
set_sock_size(sock->sk, 0, result);
get_mcast_sockaddr(&mcast_addr, &salen, &ipvs->bcfg, id);
- sock->sk->sk_bound_dev_if = ifindex;
+ sock->sk->sk_bound_dev_if = dev->ifindex;
result = sock->ops->bind(sock, (struct sockaddr *)&mcast_addr, salen);
if (result < 0) {
pr_err("Error binding to the multicast addr\n");
@@ -1582,21 +1563,20 @@ static struct socket *make_receive_sock(struct netns_ipvs *ipvs, int id,
#ifdef CONFIG_IP_VS_IPV6
if (ipvs->bcfg.mcast_af == AF_INET6)
result = join_mcast_group6(sock->sk, &mcast_addr.in6.sin6_addr,
- ipvs->bcfg.mcast_ifn);
+ dev);
else
#endif
result = join_mcast_group(sock->sk, &mcast_addr.in.sin_addr,
- ipvs->bcfg.mcast_ifn);
+ dev);
if (result < 0) {
pr_err("Error joining to the multicast group\n");
goto error;
}
- return sock;
+ return 0;
error:
- sock_release(sock);
- return ERR_PTR(result);
+ return result;
}
@@ -1778,13 +1758,12 @@ static int sync_thread_backup(void *data)
int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c,
int state)
{
- struct ip_vs_sync_thread_data *tinfo;
+ struct ip_vs_sync_thread_data *tinfo = NULL;
struct task_struct **array = NULL, *task;
- struct socket *sock;
struct net_device *dev;
char *name;
int (*threadfn)(void *data);
- int id, count, hlen;
+ int id = 0, count, hlen;
int result = -ENOMEM;
u16 mtu, min_mtu;
@@ -1792,6 +1771,18 @@ int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c,
IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %zd bytes\n",
sizeof(struct ip_vs_sync_conn_v0));
+ /* Do not hold one mutex and then block on another */
+ for (;;) {
+ rtnl_lock();
+ if (mutex_trylock(&ipvs->sync_mutex))
+ break;
+ rtnl_unlock();
+ mutex_lock(&ipvs->sync_mutex);
+ if (rtnl_trylock())
+ break;
+ mutex_unlock(&ipvs->sync_mutex);
+ }
+
if (!ipvs->sync_state) {
count = clamp(sysctl_sync_ports(ipvs), 1, IPVS_SYNC_PORTS_MAX);
ipvs->threads_mask = count - 1;
@@ -1810,7 +1801,8 @@ int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c,
dev = __dev_get_by_name(ipvs->net, c->mcast_ifn);
if (!dev) {
pr_err("Unknown mcast interface: %s\n", c->mcast_ifn);
- return -ENODEV;
+ result = -ENODEV;
+ goto out_early;
}
hlen = (AF_INET6 == c->mcast_af) ?
sizeof(struct ipv6hdr) + sizeof(struct udphdr) :
@@ -1827,26 +1819,30 @@ int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c,
c->sync_maxlen = mtu - hlen;
if (state == IP_VS_STATE_MASTER) {
+ result = -EEXIST;
if (ipvs->ms)
- return -EEXIST;
+ goto out_early;
ipvs->mcfg = *c;
name = "ipvs-m:%d:%d";
threadfn = sync_thread_master;
} else if (state == IP_VS_STATE_BACKUP) {
+ result = -EEXIST;
if (ipvs->backup_threads)
- return -EEXIST;
+ goto out_early;
ipvs->bcfg = *c;
name = "ipvs-b:%d:%d";
threadfn = sync_thread_backup;
} else {
- return -EINVAL;
+ result = -EINVAL;
+ goto out_early;
}
if (state == IP_VS_STATE_MASTER) {
struct ipvs_master_sync_state *ms;
+ result = -ENOMEM;
ipvs->ms = kcalloc(count, sizeof(ipvs->ms[0]), GFP_KERNEL);
if (!ipvs->ms)
goto out;
@@ -1862,39 +1858,38 @@ int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c,
} else {
array = kcalloc(count, sizeof(struct task_struct *),
GFP_KERNEL);
+ result = -ENOMEM;
if (!array)
goto out;
}
- tinfo = NULL;
for (id = 0; id < count; id++) {
- if (state == IP_VS_STATE_MASTER)
- sock = make_send_sock(ipvs, id);
- else
- sock = make_receive_sock(ipvs, id, dev->ifindex);
- if (IS_ERR(sock)) {
- result = PTR_ERR(sock);
- goto outtinfo;
- }
+ result = -ENOMEM;
tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL);
if (!tinfo)
- goto outsocket;
+ goto out;
tinfo->ipvs = ipvs;
- tinfo->sock = sock;
+ tinfo->sock = NULL;
if (state == IP_VS_STATE_BACKUP) {
tinfo->buf = kmalloc(ipvs->bcfg.sync_maxlen,
GFP_KERNEL);
if (!tinfo->buf)
- goto outtinfo;
+ goto out;
} else {
tinfo->buf = NULL;
}
tinfo->id = id;
+ if (state == IP_VS_STATE_MASTER)
+ result = make_send_sock(ipvs, id, dev, &tinfo->sock);
+ else
+ result = make_receive_sock(ipvs, id, dev, &tinfo->sock);
+ if (result < 0)
+ goto out;
task = kthread_run(threadfn, tinfo, name, ipvs->gen, id);
if (IS_ERR(task)) {
result = PTR_ERR(task);
- goto outtinfo;
+ goto out;
}
tinfo = NULL;
if (state == IP_VS_STATE_MASTER)
@@ -1911,20 +1906,20 @@ int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c,
ipvs->sync_state |= state;
spin_unlock_bh(&ipvs->sync_buff_lock);
+ mutex_unlock(&ipvs->sync_mutex);
+ rtnl_unlock();
+
/* increase the module use count */
ip_vs_use_count_inc();
return 0;
-outsocket:
- sock_release(sock);
-
-outtinfo:
- if (tinfo) {
- sock_release(tinfo->sock);
- kfree(tinfo->buf);
- kfree(tinfo);
- }
+out:
+ /* We no longer need the RTNL lock; release it here so that
+ * sock_release() below and in the kthreads can take rtnl_lock
+ * to leave the mcast group.
+ */
+ rtnl_unlock();
count = id;
while (count-- > 0) {
if (state == IP_VS_STATE_MASTER)
@@ -1932,13 +1927,23 @@ outtinfo:
else
kthread_stop(array[count]);
}
- kfree(array);
-
-out:
if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) {
kfree(ipvs->ms);
ipvs->ms = NULL;
}
+ mutex_unlock(&ipvs->sync_mutex);
+ if (tinfo) {
+ if (tinfo->sock)
+ sock_release(tinfo->sock);
+ kfree(tinfo->buf);
+ kfree(tinfo);
+ }
+ kfree(array);
+ return result;
+
+out_early:
+ mutex_unlock(&ipvs->sync_mutex);
+ rtnl_unlock();
return result;
}
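
start_sync_thread() must now take both rtnl_lock and sync_mutex itself, and other paths take the two locks in differing orders, so it acquires them with the trylock loop shown above rather than risking an ABBA deadlock. A self-contained userspace analogue of that loop using pthreads (lock names hypothetical):

#include <pthread.h>

static pthread_mutex_t lock_a = PTHREAD_MUTEX_INITIALIZER;	/* e.g. RTNL */
static pthread_mutex_t lock_b = PTHREAD_MUTEX_INITIALIZER;	/* e.g. sync_mutex */

/* Acquire both locks without ever sleeping on one while holding the
 * other: block on the first, only trylock the second, and back off
 * completely whenever the trylock fails.
 */
static void lock_both(void)
{
	for (;;) {
		pthread_mutex_lock(&lock_a);
		if (pthread_mutex_trylock(&lock_b) == 0)
			return;
		pthread_mutex_unlock(&lock_a);
		pthread_mutex_lock(&lock_b);
		if (pthread_mutex_trylock(&lock_a) == 0)
			return;
		pthread_mutex_unlock(&lock_b);
	}
}

int main(void)
{
	lock_both();
	pthread_mutex_unlock(&lock_b);
	pthread_mutex_unlock(&lock_a);
	return 0;
}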
diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c
index 8ef21d9f9a00..4b2b3d53acfc 100644
--- a/net/netfilter/nf_conntrack_expect.c
+++ b/net/netfilter/nf_conntrack_expect.c
@@ -252,7 +252,7 @@ static inline int expect_clash(const struct nf_conntrack_expect *a,
static inline int expect_matches(const struct nf_conntrack_expect *a,
const struct nf_conntrack_expect *b)
{
- return a->master == b->master && a->class == b->class &&
+ return a->master == b->master &&
nf_ct_tuple_equal(&a->tuple, &b->tuple) &&
nf_ct_tuple_mask_equal(&a->mask, &b->mask) &&
net_eq(nf_ct_net(a->master), nf_ct_net(b->master)) &&
@@ -421,6 +421,9 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect)
h = nf_ct_expect_dst_hash(net, &expect->tuple);
hlist_for_each_entry_safe(i, next, &nf_ct_expect_hash[h], hnode) {
if (expect_matches(i, expect)) {
+ if (i->class != expect->class)
+ return -EALREADY;
+
if (nf_ct_remove_expect(i))
break;
} else if (expect_clash(i, expect)) {
diff --git a/net/netfilter/nf_conntrack_extend.c b/net/netfilter/nf_conntrack_extend.c
index 9fe0ddc333fb..277bbfe26478 100644
--- a/net/netfilter/nf_conntrack_extend.c
+++ b/net/netfilter/nf_conntrack_extend.c
@@ -9,6 +9,7 @@
* 2 of the License, or (at your option) any later version.
*/
#include <linux/kernel.h>
+#include <linux/kmemleak.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/rcupdate.h>
@@ -71,6 +72,7 @@ void *nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp)
rcu_read_unlock();
alloc = max(newlen, NF_CT_EXT_PREALLOC);
+ kmemleak_not_leak(old);
new = __krealloc(old, alloc, gfp);
if (!new)
return NULL;
diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c
index 4dbb5bad4363..908e51e2dc2b 100644
--- a/net/netfilter/nf_conntrack_sip.c
+++ b/net/netfilter/nf_conntrack_sip.c
@@ -938,11 +938,19 @@ static int set_expected_rtp_rtcp(struct sk_buff *skb, unsigned int protoff,
datalen, rtp_exp, rtcp_exp,
mediaoff, medialen, daddr);
else {
- if (nf_ct_expect_related(rtp_exp) == 0) {
- if (nf_ct_expect_related(rtcp_exp) != 0)
- nf_ct_unexpect_related(rtp_exp);
- else
+ /* -EALREADY handling works around end-points that send
+ * SDP messages with identical port but different media type;
+ * we pretend the expectation was set up.
+ */
+ int errp = nf_ct_expect_related(rtp_exp);
+
+ if (errp == 0 || errp == -EALREADY) {
+ int errcp = nf_ct_expect_related(rtcp_exp);
+
+ if (errcp == 0 || errcp == -EALREADY)
ret = NF_ACCEPT;
+ else if (errp == 0)
+ nf_ct_unexpect_related(rtp_exp);
}
}
nf_ct_expect_put(rtcp_exp);
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 9134cc429ad4..04d4e3772584 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -2361,41 +2361,46 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
}
if (nlh->nlmsg_flags & NLM_F_REPLACE) {
- if (nft_is_active_next(net, old_rule)) {
- trans = nft_trans_rule_add(&ctx, NFT_MSG_DELRULE,
- old_rule);
- if (trans == NULL) {
- err = -ENOMEM;
- goto err2;
- }
- nft_deactivate_next(net, old_rule);
- chain->use--;
- list_add_tail_rcu(&rule->list, &old_rule->list);
- } else {
+ if (!nft_is_active_next(net, old_rule)) {
err = -ENOENT;
goto err2;
}
- } else if (nlh->nlmsg_flags & NLM_F_APPEND)
- if (old_rule)
- list_add_rcu(&rule->list, &old_rule->list);
- else
- list_add_tail_rcu(&rule->list, &chain->rules);
- else {
- if (old_rule)
- list_add_tail_rcu(&rule->list, &old_rule->list);
- else
- list_add_rcu(&rule->list, &chain->rules);
- }
+ trans = nft_trans_rule_add(&ctx, NFT_MSG_DELRULE,
+ old_rule);
+ if (trans == NULL) {
+ err = -ENOMEM;
+ goto err2;
+ }
+ nft_deactivate_next(net, old_rule);
+ chain->use--;
- if (nft_trans_rule_add(&ctx, NFT_MSG_NEWRULE, rule) == NULL) {
- err = -ENOMEM;
- goto err3;
+ if (nft_trans_rule_add(&ctx, NFT_MSG_NEWRULE, rule) == NULL) {
+ err = -ENOMEM;
+ goto err2;
+ }
+
+ list_add_tail_rcu(&rule->list, &old_rule->list);
+ } else {
+ if (nft_trans_rule_add(&ctx, NFT_MSG_NEWRULE, rule) == NULL) {
+ err = -ENOMEM;
+ goto err2;
+ }
+
+ if (nlh->nlmsg_flags & NLM_F_APPEND) {
+ if (old_rule)
+ list_add_rcu(&rule->list, &old_rule->list);
+ else
+ list_add_tail_rcu(&rule->list, &chain->rules);
+ } else {
+ if (old_rule)
+ list_add_tail_rcu(&rule->list, &old_rule->list);
+ else
+ list_add_rcu(&rule->list, &chain->rules);
+ }
}
chain->use++;
return 0;
-err3:
- list_del_rcu(&rule->list);
err2:
nf_tables_rule_destroy(&ctx, rule);
err1:
@@ -3207,18 +3212,20 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
err = ops->init(set, &desc, nla);
if (err < 0)
- goto err2;
+ goto err3;
err = nft_trans_set_add(&ctx, NFT_MSG_NEWSET, set);
if (err < 0)
- goto err3;
+ goto err4;
list_add_tail_rcu(&set->list, &table->sets);
table->use++;
return 0;
-err3:
+err4:
ops->destroy(set);
+err3:
+ kfree(set->name);
err2:
kvfree(set);
err1:
@@ -5738,7 +5745,7 @@ static void nft_chain_commit_update(struct nft_trans *trans)
struct nft_base_chain *basechain;
if (nft_trans_chain_name(trans))
- strcpy(trans->ctx.chain->name, nft_trans_chain_name(trans));
+ swap(trans->ctx.chain->name, nft_trans_chain_name(trans));
if (!nft_is_base_chain(trans->ctx.chain))
return;
diff --git a/net/netfilter/xt_connmark.c b/net/netfilter/xt_connmark.c
index 773da82190dc..94df000abb92 100644
--- a/net/netfilter/xt_connmark.c
+++ b/net/netfilter/xt_connmark.c
@@ -36,11 +36,10 @@ MODULE_ALIAS("ipt_connmark");
MODULE_ALIAS("ip6t_connmark");
static unsigned int
-connmark_tg_shift(struct sk_buff *skb,
- const struct xt_connmark_tginfo1 *info,
- u8 shift_bits, u8 shift_dir)
+connmark_tg_shift(struct sk_buff *skb, const struct xt_connmark_tginfo2 *info)
{
enum ip_conntrack_info ctinfo;
+ u_int32_t new_targetmark;
struct nf_conn *ct;
u_int32_t newmark;
@@ -51,34 +50,39 @@ connmark_tg_shift(struct sk_buff *skb,
switch (info->mode) {
case XT_CONNMARK_SET:
newmark = (ct->mark & ~info->ctmask) ^ info->ctmark;
- if (shift_dir == D_SHIFT_RIGHT)
- newmark >>= shift_bits;
+ if (info->shift_dir == D_SHIFT_RIGHT)
+ newmark >>= info->shift_bits;
else
- newmark <<= shift_bits;
+ newmark <<= info->shift_bits;
+
if (ct->mark != newmark) {
ct->mark = newmark;
nf_conntrack_event_cache(IPCT_MARK, ct);
}
break;
case XT_CONNMARK_SAVE:
- newmark = (ct->mark & ~info->ctmask) ^
- (skb->mark & info->nfmask);
- if (shift_dir == D_SHIFT_RIGHT)
- newmark >>= shift_bits;
+ new_targetmark = (skb->mark & info->nfmask);
+ if (info->shift_dir == D_SHIFT_RIGHT)
+ new_targetmark >>= info->shift_bits;
else
- newmark <<= shift_bits;
+ new_targetmark <<= info->shift_bits;
+
+ newmark = (ct->mark & ~info->ctmask) ^
+ new_targetmark;
if (ct->mark != newmark) {
ct->mark = newmark;
nf_conntrack_event_cache(IPCT_MARK, ct);
}
break;
case XT_CONNMARK_RESTORE:
- newmark = (skb->mark & ~info->nfmask) ^
- (ct->mark & info->ctmask);
- if (shift_dir == D_SHIFT_RIGHT)
- newmark >>= shift_bits;
+ new_targetmark = (ct->mark & info->ctmask);
+ if (info->shift_dir == D_SHIFT_RIGHT)
+ new_targetmark >>= info->shift_bits;
else
- newmark <<= shift_bits;
+ new_targetmark <<= info->shift_bits;
+
+ newmark = (skb->mark & ~info->nfmask) ^
+ new_targetmark;
skb->mark = newmark;
break;
}
@@ -89,8 +93,14 @@ static unsigned int
connmark_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct xt_connmark_tginfo1 *info = par->targinfo;
-
- return connmark_tg_shift(skb, info, 0, 0);
+ const struct xt_connmark_tginfo2 info2 = {
+ .ctmark = info->ctmark,
+ .ctmask = info->ctmask,
+ .nfmask = info->nfmask,
+ .mode = info->mode,
+ };
+
+ return connmark_tg_shift(skb, &info2);
}
static unsigned int
@@ -98,8 +108,7 @@ connmark_tg_v2(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct xt_connmark_tginfo2 *info = par->targinfo;
- return connmark_tg_shift(skb, (const struct xt_connmark_tginfo1 *)info,
- info->shift_bits, info->shift_dir);
+ return connmark_tg_shift(skb, info);
}
static int connmark_tg_check(const struct xt_tgchk_param *par)
diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c
index 22dc1b9d6362..c070dfc0190a 100644
--- a/net/netlabel/netlabel_unlabeled.c
+++ b/net/netlabel/netlabel_unlabeled.c
@@ -1472,6 +1472,16 @@ int netlbl_unlabel_getattr(const struct sk_buff *skb,
iface = rcu_dereference(netlbl_unlhsh_def);
if (iface == NULL || !iface->valid)
goto unlabel_getattr_nolabel;
+
+#if IS_ENABLED(CONFIG_IPV6)
+ /* When resolving a fallback label, check the sk_buff version as
+ * it is possible (e.g. SCTP) to have family = PF_INET6 while
+ * receiving ip_hdr(skb)->version = 4.
+ */
+ if (family == PF_INET6 && ip_hdr(skb)->version == 4)
+ family = PF_INET;
+#endif /* IPv6 */
+
switch (family) {
case PF_INET: {
struct iphdr *hdr4;
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 55342c4d5cec..2e2dd88fc79f 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -2606,13 +2606,13 @@ static int netlink_seq_show(struct seq_file *seq, void *v)
{
if (v == SEQ_START_TOKEN) {
seq_puts(seq,
- "sk Eth Pid Groups "
- "Rmem Wmem Dump Locks Drops Inode\n");
+ "sk Eth Pid Groups "
+ "Rmem Wmem Dump Locks Drops Inode\n");
} else {
struct sock *s = v;
struct netlink_sock *nlk = nlk_sk(s);
- seq_printf(seq, "%pK %-3d %-6u %08x %-8d %-8d %d %-8d %-8d %-8lu\n",
+ seq_printf(seq, "%pK %-3d %-10u %08x %-8d %-8d %-5d %-8d %-8d %-8lu\n",
s,
s->sk_protocol,
nlk->portid,
diff --git a/net/nsh/nsh.c b/net/nsh/nsh.c
index d7da99a0b0b8..9696ef96b719 100644
--- a/net/nsh/nsh.c
+++ b/net/nsh/nsh.c
@@ -57,6 +57,8 @@ int nsh_pop(struct sk_buff *skb)
return -ENOMEM;
nh = (struct nshhdr *)(skb->data);
length = nsh_hdr_len(nh);
+ if (length < NSH_BASE_HDR_LEN)
+ return -EINVAL;
inner_proto = tun_p_to_eth_p(nh->np);
if (!pskb_may_pull(skb, length))
return -ENOMEM;
@@ -90,6 +92,8 @@ static struct sk_buff *nsh_gso_segment(struct sk_buff *skb,
if (unlikely(!pskb_may_pull(skb, NSH_BASE_HDR_LEN)))
goto out;
nsh_len = nsh_hdr_len(nsh_hdr(skb));
+ if (nsh_len < NSH_BASE_HDR_LEN)
+ goto out;
if (unlikely(!pskb_may_pull(skb, nsh_len)))
goto out;
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index 7322aa1e382e..492ab0c36f7c 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -1712,13 +1712,10 @@ static void nlattr_set(struct nlattr *attr, u8 val,
/* The nlattr stream should already have been validated */
nla_for_each_nested(nla, attr, rem) {
- if (tbl[nla_type(nla)].len == OVS_ATTR_NESTED) {
- if (tbl[nla_type(nla)].next)
- tbl = tbl[nla_type(nla)].next;
- nlattr_set(nla, val, tbl);
- } else {
+ if (tbl[nla_type(nla)].len == OVS_ATTR_NESTED)
+ nlattr_set(nla, val, tbl[nla_type(nla)].next ? : tbl);
+ else
memset(nla_data(nla), val, nla_len(nla));
- }
if (nla_type(nla) == OVS_KEY_ATTR_CT_STATE)
*(u32 *)nla_data(nla) &= CT_SUPPORTED_MASK;
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 616cb9c18f88..01f3515cada0 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -329,11 +329,11 @@ static void packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb)
skb_set_queue_mapping(skb, queue_index);
}
-/* register_prot_hook must be invoked with the po->bind_lock held,
+/* __register_prot_hook must be invoked through register_prot_hook
* or from a context in which asynchronous accesses to the packet
* socket is not possible (packet_create()).
*/
-static void register_prot_hook(struct sock *sk)
+static void __register_prot_hook(struct sock *sk)
{
struct packet_sock *po = pkt_sk(sk);
@@ -348,8 +348,13 @@ static void register_prot_hook(struct sock *sk)
}
}
-/* {,__}unregister_prot_hook() must be invoked with the po->bind_lock
- * held. If the sync parameter is true, we will temporarily drop
+static void register_prot_hook(struct sock *sk)
+{
+ lockdep_assert_held_once(&pkt_sk(sk)->bind_lock);
+ __register_prot_hook(sk);
+}
+
+/* If the sync parameter is true, we will temporarily drop
* the po->bind_lock and do a synchronize_net to make sure no
* asynchronous packet processing paths still refer to the elements
* of po->prot_hook. If the sync parameter is false, it is the
@@ -359,6 +364,8 @@ static void __unregister_prot_hook(struct sock *sk, bool sync)
{
struct packet_sock *po = pkt_sk(sk);
+ lockdep_assert_held_once(&po->bind_lock);
+
po->running = 0;
if (po->fanout)
@@ -3008,6 +3015,7 @@ static int packet_release(struct socket *sock)
packet_flush_mclist(sk);
+ lock_sock(sk);
if (po->rx_ring.pg_vec) {
memset(&req_u, 0, sizeof(req_u));
packet_set_ring(sk, &req_u, 1, 0);
@@ -3017,6 +3025,7 @@ static int packet_release(struct socket *sock)
memset(&req_u, 0, sizeof(req_u));
packet_set_ring(sk, &req_u, 1, 1);
}
+ release_sock(sk);
f = fanout_release(sk);
@@ -3250,7 +3259,7 @@ static int packet_create(struct net *net, struct socket *sock, int protocol,
if (proto) {
po->prot_hook.type = proto;
- register_prot_hook(sk);
+ __register_prot_hook(sk);
}
mutex_lock(&net->packet.sklist_lock);
@@ -3643,6 +3652,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
union tpacket_req_u req_u;
int len;
+ lock_sock(sk);
switch (po->tp_version) {
case TPACKET_V1:
case TPACKET_V2:
@@ -3653,12 +3663,17 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
len = sizeof(req_u.req3);
break;
}
- if (optlen < len)
- return -EINVAL;
- if (copy_from_user(&req_u.req, optval, len))
- return -EFAULT;
- return packet_set_ring(sk, &req_u, 0,
- optname == PACKET_TX_RING);
+ if (optlen < len) {
+ ret = -EINVAL;
+ } else {
+ if (copy_from_user(&req_u.req, optval, len))
+ ret = -EFAULT;
+ else
+ ret = packet_set_ring(sk, &req_u, 0,
+ optname == PACKET_TX_RING);
+ }
+ release_sock(sk);
+ return ret;
}
case PACKET_COPY_THRESH:
{
@@ -3724,12 +3739,18 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
if (optlen != sizeof(val))
return -EINVAL;
- if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
- return -EBUSY;
if (copy_from_user(&val, optval, sizeof(val)))
return -EFAULT;
- po->tp_loss = !!val;
- return 0;
+
+ lock_sock(sk);
+ if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
+ ret = -EBUSY;
+ } else {
+ po->tp_loss = !!val;
+ ret = 0;
+ }
+ release_sock(sk);
+ return ret;
}
case PACKET_AUXDATA:
{
@@ -3740,7 +3761,9 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
if (copy_from_user(&val, optval, sizeof(val)))
return -EFAULT;
+ lock_sock(sk);
po->auxdata = !!val;
+ release_sock(sk);
return 0;
}
case PACKET_ORIGDEV:
@@ -3752,7 +3775,9 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
if (copy_from_user(&val, optval, sizeof(val)))
return -EFAULT;
+ lock_sock(sk);
po->origdev = !!val;
+ release_sock(sk);
return 0;
}
case PACKET_VNET_HDR:
@@ -3761,15 +3786,20 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
if (sock->type != SOCK_RAW)
return -EINVAL;
- if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
- return -EBUSY;
if (optlen < sizeof(val))
return -EINVAL;
if (copy_from_user(&val, optval, sizeof(val)))
return -EFAULT;
- po->has_vnet_hdr = !!val;
- return 0;
+ lock_sock(sk);
+ if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
+ ret = -EBUSY;
+ } else {
+ po->has_vnet_hdr = !!val;
+ ret = 0;
+ }
+ release_sock(sk);
+ return ret;
}
case PACKET_TIMESTAMP:
{
@@ -3807,11 +3837,17 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
if (optlen != sizeof(val))
return -EINVAL;
- if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
- return -EBUSY;
if (copy_from_user(&val, optval, sizeof(val)))
return -EFAULT;
- po->tp_tx_has_off = !!val;
+
+ lock_sock(sk);
+ if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
+ ret = -EBUSY;
+ } else {
+ po->tp_tx_has_off = !!val;
+ ret = 0;
+ }
+ release_sock(sk);
return ret;
}
case PACKET_QDISC_BYPASS:
@@ -4208,8 +4244,6 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
/* Added to avoid minimal code churn */
struct tpacket_req *req = &req_u->req;
- lock_sock(sk);
-
rb = tx_ring ? &po->tx_ring : &po->rx_ring;
rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
@@ -4347,7 +4381,6 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
if (pg_vec)
free_pg_vec(pg_vec, order, req->tp_block_nr);
out:
- release_sock(sk);
return err;
}
diff --git a/net/packet/internal.h b/net/packet/internal.h
index a1d2b2319ae9..3bb7c5fb3bff 100644
--- a/net/packet/internal.h
+++ b/net/packet/internal.h
@@ -112,10 +112,12 @@ struct packet_sock {
int copy_thresh;
spinlock_t bind_lock;
struct mutex pg_vec_lock;
- unsigned int running:1, /* prot_hook is attached*/
- auxdata:1,
+ unsigned int running; /* bind_lock must be held */
+ unsigned int auxdata:1, /* writer must hold sock lock */
origdev:1,
- has_vnet_hdr:1;
+ has_vnet_hdr:1,
+ tp_loss:1,
+ tp_tx_has_off:1;
int pressure;
int ifindex; /* bound device */
__be16 num;
@@ -125,8 +127,6 @@ struct packet_sock {
enum tpacket_versions tp_version;
unsigned int tp_hdrlen;
unsigned int tp_reserve;
- unsigned int tp_loss:1;
- unsigned int tp_tx_has_off:1;
unsigned int tp_tstamp;
struct net_device __rcu *cached_dev;
int (*xmit)(struct sk_buff *skb);
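
The regrouping in packet_sock matters because adjacent bitfields share one machine word: setting any of them is a read-modify-write of the whole word, so two writers serialized by different locks can silently undo each other's updates. A small sketch of the hazard (struct and field names hypothetical):

#include <stdio.h>

/* Both flags occupy the same unsigned int, so a write to either one
 * loads and stores the whole word.  If each flag were guarded by a
 * different lock, the two writers would still race on that word --
 * hence grouping the fields by the lock that protects them.
 */
struct flags {
	unsigned int a:1;	/* imagine: guarded by lock 1 */
	unsigned int b:1;	/* imagine: guarded by lock 2 */
};

int main(void)
{
	printf("%zu\n", sizeof(struct flags));	/* one word, typically 4 */
	return 0;
}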
diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c
index b33e5aeb4c06..2aa07b547b16 100644
--- a/net/qrtr/qrtr.c
+++ b/net/qrtr/qrtr.c
@@ -1135,3 +1135,4 @@ module_exit(qrtr_proto_fini);
MODULE_DESCRIPTION("Qualcomm IPC-router driver");
MODULE_LICENSE("GPL v2");
+MODULE_ALIAS_NETPROTO(PF_QIPCRTR);
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index eea1d8611b20..13b38ad0fa4a 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -547,7 +547,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
rdsdebug("conn %p pd %p cq %p %p\n", conn, ic->i_pd,
ic->i_send_cq, ic->i_recv_cq);
- return ret;
+ goto out;
sends_out:
vfree(ic->i_sends);
@@ -572,6 +572,7 @@ send_cq_out:
ic->i_send_cq = NULL;
rds_ibdev_out:
rds_ib_remove_conn(rds_ibdev, conn);
+out:
rds_ib_dev_put(rds_ibdev);
return ret;
diff --git a/net/rds/recv.c b/net/rds/recv.c
index de50e2126e40..dc67458b52f0 100644
--- a/net/rds/recv.c
+++ b/net/rds/recv.c
@@ -558,6 +558,7 @@ static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg,
struct rds_cmsg_rx_trace t;
int i, j;
+ memset(&t, 0, sizeof(t));
inc->i_rx_lat_trace[RDS_MSG_RX_CMSG] = local_clock();
t.rx_traces = rs->rs_rx_traces;
for (i = 0; i < rs->rs_rx_traces; i++) {
diff --git a/net/rds/send.c b/net/rds/send.c
index acad04243b41..94c7f74909be 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2006 Oracle. All rights reserved.
+ * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -1017,10 +1017,15 @@ static int rds_send_mprds_hash(struct rds_sock *rs, struct rds_connection *conn)
if (conn->c_npaths == 0 && hash != 0) {
rds_send_ping(conn, 0);
- if (conn->c_npaths == 0) {
- wait_event_interruptible(conn->c_hs_waitq,
- (conn->c_npaths != 0));
- }
+ /* The underlying connection is not up yet. Need to wait
+ * until it is up to be sure that the non-zero c_path can be
+ * used. But if we are interrupted, we have to use the zero
+ * c_path in case the connection ends up being non-MP capable.
+ */
+ if (conn->c_npaths == 0)
+ if (wait_event_interruptible(conn->c_hs_waitq,
+ conn->c_npaths != 0))
+ hash = 0;
if (conn->c_npaths == 1)
hash = 0;
}
diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c
index a5994cf0512b..8527cfdc446d 100644
--- a/net/sched/act_ife.c
+++ b/net/sched/act_ife.c
@@ -652,7 +652,7 @@ static int find_decode_metaid(struct sk_buff *skb, struct tcf_ife_info *ife,
}
}
- return 0;
+ return -ENOENT;
}
static int tcf_ife_decode(struct sk_buff *skb, const struct tc_action *a,
@@ -682,7 +682,12 @@ static int tcf_ife_decode(struct sk_buff *skb, const struct tc_action *a,
u16 mtype;
u16 dlen;
- curr_data = ife_tlv_meta_decode(tlv_data, &mtype, &dlen, NULL);
+ curr_data = ife_tlv_meta_decode(tlv_data, ifehdr_end, &mtype,
+ &dlen, NULL);
+ if (!curr_data) {
+ qstats_drop_inc(this_cpu_ptr(ife->common.cpu_qstats));
+ return TC_ACT_SHOT;
+ }
if (find_decode_metaid(skb, ife, mtype, dlen, curr_data)) {
/* abuse overlimits to count when we receive metadata
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index a366e4c9413a..4808713c73b9 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -128,6 +128,28 @@ static bool fq_flow_is_detached(const struct fq_flow *f)
return f->next == &detached;
}
+static bool fq_flow_is_throttled(const struct fq_flow *f)
+{
+ return f->next == &throttled;
+}
+
+static void fq_flow_add_tail(struct fq_flow_head *head, struct fq_flow *flow)
+{
+ if (head->first)
+ head->last->next = flow;
+ else
+ head->first = flow;
+ head->last = flow;
+ flow->next = NULL;
+}
+
+static void fq_flow_unset_throttled(struct fq_sched_data *q, struct fq_flow *f)
+{
+ rb_erase(&f->rate_node, &q->delayed);
+ q->throttled_flows--;
+ fq_flow_add_tail(&q->old_flows, f);
+}
+
static void fq_flow_set_throttled(struct fq_sched_data *q, struct fq_flow *f)
{
struct rb_node **p = &q->delayed.rb_node, *parent = NULL;
@@ -155,15 +177,6 @@ static void fq_flow_set_throttled(struct fq_sched_data *q, struct fq_flow *f)
static struct kmem_cache *fq_flow_cachep __read_mostly;
-static void fq_flow_add_tail(struct fq_flow_head *head, struct fq_flow *flow)
-{
- if (head->first)
- head->last->next = flow;
- else
- head->first = flow;
- head->last = flow;
- flow->next = NULL;
-}
/* limit number of collected flows per round */
#define FQ_GC_MAX 8
@@ -267,6 +280,8 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
f->socket_hash != sk->sk_hash)) {
f->credit = q->initial_quantum;
f->socket_hash = sk->sk_hash;
+ if (fq_flow_is_throttled(f))
+ fq_flow_unset_throttled(q, f);
f->time_next_packet = 0ULL;
}
return f;
@@ -438,9 +453,7 @@ static void fq_check_throttled(struct fq_sched_data *q, u64 now)
q->time_next_delayed_flow = f->time_next_packet;
break;
}
- rb_erase(p, &q->delayed);
- q->throttled_flows--;
- fq_flow_add_tail(&q->old_flows, f);
+ fq_flow_unset_throttled(q, f);
}
}
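
Both removal sites — the classify-time socket rehash and fq_check_throttled() — now go through fq_flow_unset_throttled(), so the rbtree membership, the throttled_flows counter, and the requeue onto old_flows can never get out of step. A schematic sketch of the invariant, with stub types standing in for the scheduler's own:

struct flow { int unused; };
struct sched { int throttled_flows; };

/* stand-ins for rb_erase() and fq_flow_add_tail() */
static void tree_del(struct sched *q, struct flow *f) { (void)q; (void)f; }
static void old_flows_add(struct sched *q, struct flow *f) { (void)q; (void)f; }

static void unset_throttled(struct sched *q, struct flow *f)
{
	tree_del(q, f);			/* leave the delayed rbtree */
	q->throttled_flows--;		/* keep the counter in step */
	old_flows_add(q, f);		/* requeue on the old-flows list */
}
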
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 837806dd5799..a47179da24e6 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -1024,8 +1024,9 @@ static void sctp_assoc_bh_rcv(struct work_struct *work)
struct sctp_endpoint *ep;
struct sctp_chunk *chunk;
struct sctp_inq *inqueue;
- int state;
+ int first_time = 1; /* is this the first time through the loop */
int error = 0;
+ int state;
/* The association should be held so we should be safe. */
ep = asoc->ep;
@@ -1036,6 +1037,30 @@ static void sctp_assoc_bh_rcv(struct work_struct *work)
state = asoc->state;
subtype = SCTP_ST_CHUNK(chunk->chunk_hdr->type);
+ /* If the first chunk in the packet is AUTH, do special
+ * processing specified in Section 6.3 of SCTP-AUTH spec
+ */
+ if (first_time && subtype.chunk == SCTP_CID_AUTH) {
+ struct sctp_chunkhdr *next_hdr;
+
+ next_hdr = sctp_inq_peek(inqueue);
+ if (!next_hdr)
+ goto normal;
+
+ /* If the next chunk is COOKIE-ECHO, skip the AUTH
+ * chunk while saving a pointer to it so we can do
+ * Authentication later (during cookie-echo
+ * processing).
+ */
+ if (next_hdr->type == SCTP_CID_COOKIE_ECHO) {
+ chunk->auth_chunk = skb_clone(chunk->skb,
+ GFP_ATOMIC);
+ chunk->auth = 1;
+ continue;
+ }
+ }
+
+normal:
/* SCTP-AUTH, Section 6.3:
* The receiver has a list of chunk types which it expects
* to be received only after an AUTH-chunk. This list has
@@ -1074,6 +1099,9 @@ static void sctp_assoc_bh_rcv(struct work_struct *work)
/* If there is an error on chunk, discard this packet. */
if (error && chunk)
chunk->pdiscard = 1;
+
+ if (first_time)
+ first_time = 0;
}
sctp_association_put(asoc);
}
diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
index f889a84f264d..be296d633e95 100644
--- a/net/sctp/chunk.c
+++ b/net/sctp/chunk.c
@@ -172,6 +172,8 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
struct list_head *pos, *temp;
struct sctp_chunk *chunk;
struct sctp_datamsg *msg;
+ struct sctp_sock *sp;
+ struct sctp_af *af;
int err;
msg = sctp_datamsg_new(GFP_KERNEL);
@@ -190,9 +192,11 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
/* This is the biggest possible DATA chunk that can fit into
* the packet
*/
- max_data = asoc->pathmtu -
- sctp_sk(asoc->base.sk)->pf->af->net_header_len -
- sizeof(struct sctphdr) - sctp_datachk_len(&asoc->stream);
+ sp = sctp_sk(asoc->base.sk);
+ af = sp->pf->af;
+ max_data = asoc->pathmtu - af->net_header_len -
+ sizeof(struct sctphdr) - sctp_datachk_len(&asoc->stream) -
+ af->ip_options_len(asoc->base.sk);
max_data = SCTP_TRUNC4(max_data);
/* If the peer requested that we authenticate DATA chunks
diff --git a/net/sctp/inqueue.c b/net/sctp/inqueue.c
index 23ebc5318edc..eb93ffe2408b 100644
--- a/net/sctp/inqueue.c
+++ b/net/sctp/inqueue.c
@@ -217,7 +217,7 @@ new_skb:
skb_pull(chunk->skb, sizeof(*ch));
chunk->subh.v = NULL; /* Subheader is no longer valid. */
- if (chunk->chunk_end + sizeof(*ch) < skb_tail_pointer(chunk->skb)) {
+ if (chunk->chunk_end + sizeof(*ch) <= skb_tail_pointer(chunk->skb)) {
/* This is not a singleton */
chunk->singleton = 0;
} else if (chunk->chunk_end > skb_tail_pointer(chunk->skb)) {
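
The '<' to '<=' change above fixes an off-by-one: a following chunk header that ends exactly at the skb tail is still a complete header, so the current chunk is not a singleton. A worked boundary check with byte offsets standing in for the skb pointers:

#include <assert.h>

#define CHUNK_HDR_LEN 4	/* on-wire size of a chunk header */

static int is_singleton(unsigned long chunk_end, unsigned long tail)
{
	/* not a singleton if another full header fits before tail */
	return !(chunk_end + CHUNK_HDR_LEN <= tail);
}

int main(void)
{
	assert(is_singleton(96, 100) == 0);	/* header ends at tail: fits */
	assert(is_singleton(97, 100) == 1);	/* header would overrun */
	return 0;
}
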
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index f1fc48e9689c..42247110d842 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -427,6 +427,41 @@ static void sctp_v6_copy_addrlist(struct list_head *addrlist,
rcu_read_unlock();
}
+/* Copy over any ip options */
+static void sctp_v6_copy_ip_options(struct sock *sk, struct sock *newsk)
+{
+ struct ipv6_pinfo *newnp, *np = inet6_sk(sk);
+ struct ipv6_txoptions *opt;
+
+ newnp = inet6_sk(newsk);
+
+ rcu_read_lock();
+ opt = rcu_dereference(np->opt);
+ if (opt) {
+ opt = ipv6_dup_options(newsk, opt);
+ if (!opt)
+ pr_err("%s: Failed to copy ip options\n", __func__);
+ }
+ RCU_INIT_POINTER(newnp->opt, opt);
+ rcu_read_unlock();
+}
+
+/* Account for the IP options */
+static int sctp_v6_ip_options_len(struct sock *sk)
+{
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ struct ipv6_txoptions *opt;
+ int len = 0;
+
+ rcu_read_lock();
+ opt = rcu_dereference(np->opt);
+ if (opt)
+ len = opt->opt_flen + opt->opt_nflen;
+
+ rcu_read_unlock();
+ return len;
+}
+
/* Initialize a sockaddr_storage from an incoming skb. */
static void sctp_v6_from_skb(union sctp_addr *addr, struct sk_buff *skb,
int is_saddr)
@@ -521,46 +556,49 @@ static void sctp_v6_to_addr(union sctp_addr *addr, struct in6_addr *saddr,
addr->v6.sin6_scope_id = 0;
}
-/* Compare addresses exactly.
- * v4-mapped-v6 is also in consideration.
- */
-static int sctp_v6_cmp_addr(const union sctp_addr *addr1,
- const union sctp_addr *addr2)
+static int __sctp_v6_cmp_addr(const union sctp_addr *addr1,
+ const union sctp_addr *addr2)
{
if (addr1->sa.sa_family != addr2->sa.sa_family) {
if (addr1->sa.sa_family == AF_INET &&
addr2->sa.sa_family == AF_INET6 &&
- ipv6_addr_v4mapped(&addr2->v6.sin6_addr)) {
- if (addr2->v6.sin6_port == addr1->v4.sin_port &&
- addr2->v6.sin6_addr.s6_addr32[3] ==
- addr1->v4.sin_addr.s_addr)
- return 1;
- }
+ ipv6_addr_v4mapped(&addr2->v6.sin6_addr) &&
+ addr2->v6.sin6_addr.s6_addr32[3] ==
+ addr1->v4.sin_addr.s_addr)
+ return 1;
+
if (addr2->sa.sa_family == AF_INET &&
addr1->sa.sa_family == AF_INET6 &&
- ipv6_addr_v4mapped(&addr1->v6.sin6_addr)) {
- if (addr1->v6.sin6_port == addr2->v4.sin_port &&
- addr1->v6.sin6_addr.s6_addr32[3] ==
- addr2->v4.sin_addr.s_addr)
- return 1;
- }
+ ipv6_addr_v4mapped(&addr1->v6.sin6_addr) &&
+ addr1->v6.sin6_addr.s6_addr32[3] ==
+ addr2->v4.sin_addr.s_addr)
+ return 1;
+
return 0;
}
- if (addr1->v6.sin6_port != addr2->v6.sin6_port)
- return 0;
+
if (!ipv6_addr_equal(&addr1->v6.sin6_addr, &addr2->v6.sin6_addr))
return 0;
+
/* If this is a linklocal address, compare the scope_id. */
- if (ipv6_addr_type(&addr1->v6.sin6_addr) & IPV6_ADDR_LINKLOCAL) {
- if (addr1->v6.sin6_scope_id && addr2->v6.sin6_scope_id &&
- (addr1->v6.sin6_scope_id != addr2->v6.sin6_scope_id)) {
- return 0;
- }
- }
+ if ((ipv6_addr_type(&addr1->v6.sin6_addr) & IPV6_ADDR_LINKLOCAL) &&
+ addr1->v6.sin6_scope_id && addr2->v6.sin6_scope_id &&
+ addr1->v6.sin6_scope_id != addr2->v6.sin6_scope_id)
+ return 0;
return 1;
}
+/* Compare addresses exactly.
+ * v4-mapped-v6 is also in consideration.
+ */
+static int sctp_v6_cmp_addr(const union sctp_addr *addr1,
+ const union sctp_addr *addr2)
+{
+ return __sctp_v6_cmp_addr(addr1, addr2) &&
+ addr1->v6.sin6_port == addr2->v6.sin6_port;
+}
+
/* Initialize addr struct to INADDR_ANY. */
static void sctp_v6_inaddr_any(union sctp_addr *addr, __be16 port)
{
@@ -666,7 +704,6 @@ static struct sock *sctp_v6_create_accept_sk(struct sock *sk,
struct sock *newsk;
struct ipv6_pinfo *newnp, *np = inet6_sk(sk);
struct sctp6_sock *newsctp6sk;
- struct ipv6_txoptions *opt;
newsk = sk_alloc(sock_net(sk), PF_INET6, GFP_KERNEL, sk->sk_prot, kern);
if (!newsk)
@@ -689,12 +726,7 @@ static struct sock *sctp_v6_create_accept_sk(struct sock *sk,
newnp->ipv6_ac_list = NULL;
newnp->ipv6_fl_list = NULL;
- rcu_read_lock();
- opt = rcu_dereference(np->opt);
- if (opt)
- opt = ipv6_dup_options(newsk, opt);
- RCU_INIT_POINTER(newnp->opt, opt);
- rcu_read_unlock();
+ sctp_v6_copy_ip_options(sk, newsk);
/* Initialize sk's sport, dport, rcv_saddr and daddr for getsockname()
* and getpeername().
@@ -846,8 +878,8 @@ static int sctp_inet6_cmp_addr(const union sctp_addr *addr1,
const union sctp_addr *addr2,
struct sctp_sock *opt)
{
- struct sctp_af *af1, *af2;
struct sock *sk = sctp_opt2sk(opt);
+ struct sctp_af *af1, *af2;
af1 = sctp_get_af_specific(addr1->sa.sa_family);
af2 = sctp_get_af_specific(addr2->sa.sa_family);
@@ -863,10 +895,10 @@ static int sctp_inet6_cmp_addr(const union sctp_addr *addr1,
if (sctp_is_any(sk, addr1) || sctp_is_any(sk, addr2))
return 1;
- if (addr1->sa.sa_family != addr2->sa.sa_family)
- return 0;
+ if (addr1->sa.sa_family == AF_INET && addr2->sa.sa_family == AF_INET)
+ return addr1->v4.sin_addr.s_addr == addr2->v4.sin_addr.s_addr;
- return af1->cmp_addr(addr1, addr2);
+ return __sctp_v6_cmp_addr(addr1, addr2);
}
/* Verify that the provided sockaddr looks bindable. Common verification,
@@ -1043,6 +1075,7 @@ static struct sctp_af sctp_af_inet6 = {
.ecn_capable = sctp_v6_ecn_capable,
.net_header_len = sizeof(struct ipv6hdr),
.sockaddr_len = sizeof(struct sockaddr_in6),
+ .ip_options_len = sctp_v6_ip_options_len,
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_ipv6_setsockopt,
.compat_getsockopt = compat_ipv6_getsockopt,
@@ -1061,6 +1094,7 @@ static struct sctp_pf sctp_pf_inet6 = {
.addr_to_user = sctp_v6_addr_to_user,
.to_sk_saddr = sctp_v6_to_sk_saddr,
.to_sk_daddr = sctp_v6_to_sk_daddr,
+ .copy_ip_options = sctp_v6_copy_ip_options,
.af = &sctp_af_inet6,
};
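
The refactoring above splits the comparison into a port-agnostic core (__sctp_v6_cmp_addr) and a thin exact-match wrapper, so sctp_inet6_cmp_addr() can reuse the core without the port check. A simplified sketch of that shape, with a stand-in address type:

#include <stdbool.h>
#include <stdint.h>

struct addr { uint32_t ip; uint16_t port; };

static bool cmp_addr_core(const struct addr *a, const struct addr *b)
{
	return a->ip == b->ip;	/* family/scope handling elided */
}

static bool cmp_addr_exact(const struct addr *a, const struct addr *b)
{
	return cmp_addr_core(a, b) && a->port == b->port;
}

int main(void)
{
	struct addr a = { 0x0a000001, 8080 }, b = { 0x0a000001, 9090 };

	/* same host, different ports: core matches, exact does not */
	return cmp_addr_core(&a, &b) && !cmp_addr_exact(&a, &b) ? 0 : 1;
}
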
diff --git a/net/sctp/output.c b/net/sctp/output.c
index d6e1c90cc09a..690d8557bb7b 100644
--- a/net/sctp/output.c
+++ b/net/sctp/output.c
@@ -69,7 +69,11 @@ static enum sctp_xmit sctp_packet_will_fit(struct sctp_packet *packet,
static void sctp_packet_reset(struct sctp_packet *packet)
{
+ /* sctp_packet_transmit() relies on this to reset size to the
+ * current overhead after sending packets.
+ */
packet->size = packet->overhead;
+
packet->has_cookie_echo = 0;
packet->has_sack = 0;
packet->has_data = 0;
@@ -87,6 +91,7 @@ void sctp_packet_config(struct sctp_packet *packet, __u32 vtag,
struct sctp_transport *tp = packet->transport;
struct sctp_association *asoc = tp->asoc;
struct sock *sk;
+ size_t overhead = sizeof(struct ipv6hdr) + sizeof(struct sctphdr);
pr_debug("%s: packet:%p vtag:0x%x\n", __func__, packet, vtag);
packet->vtag = vtag;
@@ -95,10 +100,22 @@ void sctp_packet_config(struct sctp_packet *packet, __u32 vtag,
if (!sctp_packet_empty(packet))
return;
- /* set packet max_size with pathmtu */
+ /* set packet max_size with pathmtu, then calculate overhead */
packet->max_size = tp->pathmtu;
- if (!asoc)
+ if (asoc) {
+ struct sctp_sock *sp = sctp_sk(asoc->base.sk);
+ struct sctp_af *af = sp->pf->af;
+
+ overhead = af->net_header_len +
+ af->ip_options_len(asoc->base.sk);
+ overhead += sizeof(struct sctphdr);
+ packet->overhead = overhead;
+ packet->size = overhead;
+ } else {
+ packet->overhead = overhead;
+ packet->size = overhead;
return;
+ }
/* update dst or transport pathmtu if in need */
sk = asoc->base.sk;
@@ -140,23 +157,14 @@ void sctp_packet_init(struct sctp_packet *packet,
struct sctp_transport *transport,
__u16 sport, __u16 dport)
{
- struct sctp_association *asoc = transport->asoc;
- size_t overhead;
-
pr_debug("%s: packet:%p transport:%p\n", __func__, packet, transport);
packet->transport = transport;
packet->source_port = sport;
packet->destination_port = dport;
INIT_LIST_HEAD(&packet->chunk_list);
- if (asoc) {
- struct sctp_sock *sp = sctp_sk(asoc->base.sk);
- overhead = sp->pf->af->net_header_len;
- } else {
- overhead = sizeof(struct ipv6hdr);
- }
- overhead += sizeof(struct sctphdr);
- packet->overhead = overhead;
+ /* The overhead will be calculated by sctp_packet_config() */
+ packet->overhead = 0;
sctp_packet_reset(packet);
packet->vtag = 0;
}
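
A worked example of the overhead arithmetic that sctp_packet_config() now centralizes: with a 1500-byte path MTU, an IPv4 header, the SCTP common header, and 12 bytes of IP options (say, a CIPSO label), the payload budget shrinks accordingly. The header sizes are the usual on-wire values:

#include <stdio.h>

int main(void)
{
	unsigned int pathmtu = 1500;
	unsigned int net_header_len = 20;	/* sizeof(struct iphdr) */
	unsigned int sctphdr_len = 12;		/* sizeof(struct sctphdr) */
	unsigned int ip_options_len = 12;	/* e.g. a CIPSO option */
	unsigned int overhead = net_header_len + ip_options_len + sctphdr_len;

	printf("overhead=%u payload budget=%u\n",
	       overhead, pathmtu - overhead);	/* 44 and 1456 */
	return 0;
}
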
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index a24cde236330..d685f8456762 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -187,6 +187,45 @@ int sctp_copy_local_addr_list(struct net *net, struct sctp_bind_addr *bp,
return error;
}
+/* Copy over any ip options */
+static void sctp_v4_copy_ip_options(struct sock *sk, struct sock *newsk)
+{
+ struct inet_sock *newinet, *inet = inet_sk(sk);
+ struct ip_options_rcu *inet_opt, *newopt = NULL;
+
+ newinet = inet_sk(newsk);
+
+ rcu_read_lock();
+ inet_opt = rcu_dereference(inet->inet_opt);
+ if (inet_opt) {
+ newopt = sock_kmalloc(newsk, sizeof(*inet_opt) +
+ inet_opt->opt.optlen, GFP_ATOMIC);
+ if (newopt)
+ memcpy(newopt, inet_opt, sizeof(*inet_opt) +
+ inet_opt->opt.optlen);
+ else
+ pr_err("%s: Failed to copy ip options\n", __func__);
+ }
+ RCU_INIT_POINTER(newinet->inet_opt, newopt);
+ rcu_read_unlock();
+}
+
+/* Account for the IP options */
+static int sctp_v4_ip_options_len(struct sock *sk)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct ip_options_rcu *inet_opt;
+ int len = 0;
+
+ rcu_read_lock();
+ inet_opt = rcu_dereference(inet->inet_opt);
+ if (inet_opt)
+ len = inet_opt->opt.optlen;
+
+ rcu_read_unlock();
+ return len;
+}
+
/* Initialize a sctp_addr from an incoming skb. */
static void sctp_v4_from_skb(union sctp_addr *addr, struct sk_buff *skb,
int is_saddr)
@@ -538,6 +577,8 @@ static struct sock *sctp_v4_create_accept_sk(struct sock *sk,
sctp_copy_sock(newsk, sk, asoc);
sock_reset_flag(newsk, SOCK_ZAPPED);
+ sctp_v4_copy_ip_options(sk, newsk);
+
newinet = inet_sk(newsk);
newinet->inet_daddr = asoc->peer.primary_addr.v4.sin_addr.s_addr;
@@ -956,6 +997,7 @@ static struct sctp_pf sctp_pf_inet = {
.addr_to_user = sctp_v4_addr_to_user,
.to_sk_saddr = sctp_v4_to_sk_saddr,
.to_sk_daddr = sctp_v4_to_sk_daddr,
+ .copy_ip_options = sctp_v4_copy_ip_options,
.af = &sctp_af_inet
};
@@ -1040,6 +1082,7 @@ static struct sctp_af sctp_af_inet = {
.ecn_capable = sctp_v4_ecn_capable,
.net_header_len = sizeof(struct iphdr),
.sockaddr_len = sizeof(struct sockaddr_in),
+ .ip_options_len = sctp_v4_ip_options_len,
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_ip_setsockopt,
.compat_getsockopt = compat_ip_getsockopt,
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index cc20bc39ee7c..5a4fb1dc8400 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -3098,6 +3098,12 @@ static __be16 sctp_process_asconf_param(struct sctp_association *asoc,
if (af->is_any(&addr))
memcpy(&addr, &asconf->source, sizeof(addr));
+ if (security_sctp_bind_connect(asoc->ep->base.sk,
+ SCTP_PARAM_ADD_IP,
+ (struct sockaddr *)&addr,
+ af->sockaddr_len))
+ return SCTP_ERROR_REQ_REFUSED;
+
/* ADDIP 4.3 D9) If an endpoint receives an ADD IP address
* request and does not have the local resources to add this
* new address to the association, it MUST return an Error
@@ -3164,6 +3170,12 @@ static __be16 sctp_process_asconf_param(struct sctp_association *asoc,
if (af->is_any(&addr))
memcpy(&addr.v4, sctp_source(asconf), sizeof(addr));
+ if (security_sctp_bind_connect(asoc->ep->base.sk,
+ SCTP_PARAM_SET_PRIMARY,
+ (struct sockaddr *)&addr,
+ af->sockaddr_len))
+ return SCTP_ERROR_REQ_REFUSED;
+
peer = sctp_assoc_lookup_paddr(asoc, &addr);
if (!peer)
return SCTP_ERROR_DNS_FAILED;
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index cc56a67dbb4d..c9ae3404b1bb 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -153,10 +153,7 @@ static enum sctp_disposition sctp_sf_violation_chunk(
struct sctp_cmd_seq *commands);
static enum sctp_ierror sctp_sf_authenticate(
- struct net *net,
- const struct sctp_endpoint *ep,
const struct sctp_association *asoc,
- const union sctp_subtype type,
struct sctp_chunk *chunk);
static enum sctp_disposition __sctp_sf_do_9_1_abort(
@@ -321,6 +318,11 @@ enum sctp_disposition sctp_sf_do_5_1B_init(struct net *net,
struct sctp_packet *packet;
int len;
+ /* Update socket peer label if first association. */
+ if (security_sctp_assoc_request((struct sctp_endpoint *)ep,
+ chunk->skb))
+ return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+
/* 6.10 Bundling
* An endpoint MUST NOT bundle INIT, INIT ACK or
* SHUTDOWN COMPLETE with any other chunks.
@@ -621,6 +623,38 @@ enum sctp_disposition sctp_sf_do_5_1C_ack(struct net *net,
return SCTP_DISPOSITION_CONSUME;
}
+static bool sctp_auth_chunk_verify(struct net *net, struct sctp_chunk *chunk,
+ const struct sctp_association *asoc)
+{
+ struct sctp_chunk auth;
+
+ if (!chunk->auth_chunk)
+ return true;
+
+ /* SCTP-AUTH: auth_chunk pointer is only set when the cookie-echo
+ * is supposed to be authenticated and we have to do delayed
+ * authentication. We've just recreated the association using
+ * the information in the cookie and now it's much easier to
+ * do the authentication.
+ */
+
+ /* Make sure that we and the peer are AUTH capable */
+ if (!net->sctp.auth_enable || !asoc->peer.auth_capable)
+ return false;
+
+ /* set-up our fake chunk so that we can process it */
+ auth.skb = chunk->auth_chunk;
+ auth.asoc = chunk->asoc;
+ auth.sctp_hdr = chunk->sctp_hdr;
+ auth.chunk_hdr = (struct sctp_chunkhdr *)
+ skb_push(chunk->auth_chunk,
+ sizeof(struct sctp_chunkhdr));
+ skb_pull(chunk->auth_chunk, sizeof(struct sctp_chunkhdr));
+ auth.transport = chunk->transport;
+
+ return sctp_sf_authenticate(asoc, &auth) == SCTP_IERROR_NO_ERROR;
+}
+
/*
* Respond to a normal COOKIE ECHO chunk.
* We are the side that is being asked for an association.
@@ -758,37 +792,9 @@ enum sctp_disposition sctp_sf_do_5_1D_ce(struct net *net,
if (error)
goto nomem_init;
- /* SCTP-AUTH: auth_chunk pointer is only set when the cookie-echo
- * is supposed to be authenticated and we have to do delayed
- * authentication. We've just recreated the association using
- * the information in the cookie and now it's much easier to
- * do the authentication.
- */
- if (chunk->auth_chunk) {
- struct sctp_chunk auth;
- enum sctp_ierror ret;
-
- /* Make sure that we and the peer are AUTH capable */
- if (!net->sctp.auth_enable || !new_asoc->peer.auth_capable) {
- sctp_association_free(new_asoc);
- return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
- }
-
- /* set-up our fake chunk so that we can process it */
- auth.skb = chunk->auth_chunk;
- auth.asoc = chunk->asoc;
- auth.sctp_hdr = chunk->sctp_hdr;
- auth.chunk_hdr = (struct sctp_chunkhdr *)
- skb_push(chunk->auth_chunk,
- sizeof(struct sctp_chunkhdr));
- skb_pull(chunk->auth_chunk, sizeof(struct sctp_chunkhdr));
- auth.transport = chunk->transport;
-
- ret = sctp_sf_authenticate(net, ep, new_asoc, type, &auth);
- if (ret != SCTP_IERROR_NO_ERROR) {
- sctp_association_free(new_asoc);
- return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
- }
+ if (!sctp_auth_chunk_verify(net, chunk, new_asoc)) {
+ sctp_association_free(new_asoc);
+ return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
}
repl = sctp_make_cookie_ack(new_asoc, chunk);
@@ -922,6 +928,9 @@ enum sctp_disposition sctp_sf_do_5_1E_ca(struct net *net,
*/
sctp_add_cmd_sf(commands, SCTP_CMD_INIT_COUNTER_RESET, SCTP_NULL());
+ /* Set peer label for connection. */
+ security_inet_conn_established(ep->base.sk, chunk->skb);
+
/* RFC 2960 5.1 Normal Establishment of an Association
*
* E) Upon reception of the COOKIE ACK, endpoint "A" will move
@@ -1459,6 +1468,11 @@ static enum sctp_disposition sctp_sf_do_unexpected_init(
struct sctp_packet *packet;
int len;
+ /* Update socket peer label if first association. */
+ if (security_sctp_assoc_request((struct sctp_endpoint *)ep,
+ chunk->skb))
+ return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+
/* 6.10 Bundling
* An endpoint MUST NOT bundle INIT, INIT ACK or
* SHUTDOWN COMPLETE with any other chunks.
@@ -1781,13 +1795,18 @@ static enum sctp_disposition sctp_sf_do_dupcook_a(
GFP_ATOMIC))
goto nomem;
+ if (sctp_auth_asoc_init_active_key(new_asoc, GFP_ATOMIC))
+ goto nomem;
+
+ if (!sctp_auth_chunk_verify(net, chunk, new_asoc))
+ return SCTP_DISPOSITION_DISCARD;
+
/* Make sure no new addresses are being added during the
* restart. Though this is a pretty complicated attack
* since you'd have to get inside the cookie.
*/
- if (!sctp_sf_check_restart_addrs(new_asoc, asoc, chunk, commands)) {
+ if (!sctp_sf_check_restart_addrs(new_asoc, asoc, chunk, commands))
return SCTP_DISPOSITION_CONSUME;
- }
/* If the endpoint is in the SHUTDOWN-ACK-SENT state and recognizes
* the peer has restarted (Action A), it MUST NOT setup a new
@@ -1893,6 +1912,12 @@ static enum sctp_disposition sctp_sf_do_dupcook_b(
GFP_ATOMIC))
goto nomem;
+ if (sctp_auth_asoc_init_active_key(new_asoc, GFP_ATOMIC))
+ goto nomem;
+
+ if (!sctp_auth_chunk_verify(net, chunk, new_asoc))
+ return SCTP_DISPOSITION_DISCARD;
+
/* Update the content of current association. */
sctp_add_cmd_sf(commands, SCTP_CMD_UPDATE_ASSOC, SCTP_ASOC(new_asoc));
sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE,
@@ -1990,6 +2015,9 @@ static enum sctp_disposition sctp_sf_do_dupcook_d(
* a COOKIE ACK.
*/
+ if (!sctp_auth_chunk_verify(net, chunk, asoc))
+ return SCTP_DISPOSITION_DISCARD;
+
/* Don't accidentally move back into established state. */
if (asoc->state < SCTP_STATE_ESTABLISHED) {
sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
@@ -2037,7 +2065,7 @@ static enum sctp_disposition sctp_sf_do_dupcook_d(
}
}
- repl = sctp_make_cookie_ack(new_asoc, chunk);
+ repl = sctp_make_cookie_ack(asoc, chunk);
if (!repl)
goto nomem;
@@ -2145,6 +2173,11 @@ enum sctp_disposition sctp_sf_do_5_2_4_dupcook(
}
}
+ /* Update socket peer label if first association. */
+ if (security_sctp_assoc_request((struct sctp_endpoint *)ep,
+ chunk->skb))
+ return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+
/* Set temp so that it won't be added into hashtable */
new_asoc->temp = 1;
@@ -4147,10 +4180,7 @@ gen_shutdown:
* The return value is the disposition of the chunk.
*/
static enum sctp_ierror sctp_sf_authenticate(
- struct net *net,
- const struct sctp_endpoint *ep,
const struct sctp_association *asoc,
- const union sctp_subtype type,
struct sctp_chunk *chunk)
{
struct sctp_shared_key *sh_key = NULL;
@@ -4251,7 +4281,7 @@ enum sctp_disposition sctp_sf_eat_auth(struct net *net,
commands);
auth_hdr = (struct sctp_authhdr *)chunk->skb->data;
- error = sctp_sf_authenticate(net, ep, asoc, type, chunk);
+ error = sctp_sf_authenticate(asoc, chunk);
switch (error) {
case SCTP_IERROR_AUTH_BAD_HMAC:
/* Generate the ERROR chunk and discard the rest
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index eb712df7156e..80835ac26d2c 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -1049,6 +1049,12 @@ static int sctp_setsockopt_bindx(struct sock *sk,
/* Do the work. */
switch (op) {
case SCTP_BINDX_ADD_ADDR:
+ /* Allow security module to validate bindx addresses. */
+ err = security_sctp_bind_connect(sk, SCTP_SOCKOPT_BINDX_ADD,
+ (struct sockaddr *)kaddrs,
+ addrs_size);
+ if (err)
+ goto out;
err = sctp_bindx_add(sk, kaddrs, addrcnt);
if (err)
goto out;
@@ -1258,6 +1264,7 @@ static int __sctp_connect(struct sock *sk,
if (assoc_id)
*assoc_id = asoc->assoc_id;
+
err = sctp_wait_for_connect(asoc, &timeo);
/* Note: the asoc may be freed after the return of
* sctp_wait_for_connect.
@@ -1353,7 +1360,16 @@ static int __sctp_setsockopt_connectx(struct sock *sk,
if (unlikely(IS_ERR(kaddrs)))
return PTR_ERR(kaddrs);
+ /* Allow security module to validate connectx addresses. */
+ err = security_sctp_bind_connect(sk, SCTP_SOCKOPT_CONNECTX,
+ (struct sockaddr *)kaddrs,
+ addrs_size);
+ if (err)
+ goto out_free;
+
err = __sctp_connect(sk, kaddrs, addrs_size, assoc_id);
+
+out_free:
kvfree(kaddrs);
return err;
@@ -1683,6 +1699,7 @@ static int sctp_sendmsg_new_asoc(struct sock *sk, __u16 sflags,
struct sctp_association *asoc;
enum sctp_scope scope;
struct cmsghdr *cmsg;
+ struct sctp_af *af;
int err;
*tp = NULL;
@@ -1708,6 +1725,21 @@ static int sctp_sendmsg_new_asoc(struct sock *sk, __u16 sflags,
scope = sctp_scope(daddr);
+ /* Label connection socket for first association 1-to-many
+ * style for client sequence socket()->sendmsg(). This
+ * needs to be done before sctp_assoc_add_peer() as that will
+ * set up the initial packet that needs to account for any
+ * security ip options (CIPSO/CALIPSO) added to the packet.
+ */
+ af = sctp_get_af_specific(daddr->sa.sa_family);
+ if (!af)
+ return -EINVAL;
+ err = security_sctp_bind_connect(sk, SCTP_SENDMSG_CONNECT,
+ (struct sockaddr *)daddr,
+ af->sockaddr_len);
+ if (err < 0)
+ return err;
+
asoc = sctp_association_new(ep, sk, scope, GFP_KERNEL);
if (!asoc)
return -ENOMEM;
@@ -2935,6 +2967,8 @@ static int sctp_setsockopt_primary_addr(struct sock *sk, char __user *optval,
{
struct sctp_prim prim;
struct sctp_transport *trans;
+ struct sctp_af *af;
+ int err;
if (optlen != sizeof(struct sctp_prim))
return -EINVAL;
@@ -2942,6 +2976,17 @@ static int sctp_setsockopt_primary_addr(struct sock *sk, char __user *optval,
if (copy_from_user(&prim, optval, sizeof(struct sctp_prim)))
return -EFAULT;
+ /* Allow security module to validate address but need address len. */
+ af = sctp_get_af_specific(prim.ssp_addr.ss_family);
+ if (!af)
+ return -EINVAL;
+
+ err = security_sctp_bind_connect(sk, SCTP_PRIMARY_ADDR,
+ (struct sockaddr *)&prim.ssp_addr,
+ af->sockaddr_len);
+ if (err)
+ return err;
+
trans = sctp_addr_id2transport(sk, &prim.ssp_addr, prim.ssp_assoc_id);
if (!trans)
return -EINVAL;
@@ -3164,6 +3209,7 @@ static int sctp_setsockopt_mappedv4(struct sock *sk, char __user *optval, unsign
static int sctp_setsockopt_maxseg(struct sock *sk, char __user *optval, unsigned int optlen)
{
struct sctp_sock *sp = sctp_sk(sk);
+ struct sctp_af *af = sp->pf->af;
struct sctp_assoc_value params;
struct sctp_association *asoc;
int val;
@@ -3188,7 +3234,8 @@ static int sctp_setsockopt_maxseg(struct sock *sk, char __user *optval, unsigned
if (val) {
int min_len, max_len;
- min_len = SCTP_DEFAULT_MINSEGMENT - sp->pf->af->net_header_len;
+ min_len = SCTP_DEFAULT_MINSEGMENT - af->net_header_len;
+ min_len -= af->ip_options_len(sk);
min_len -= sizeof(struct sctphdr) +
sizeof(struct sctp_data_chunk);
@@ -3201,7 +3248,8 @@ static int sctp_setsockopt_maxseg(struct sock *sk, char __user *optval, unsigned
asoc = sctp_id2assoc(sk, params.assoc_id);
if (asoc) {
if (val == 0) {
- val = asoc->pathmtu - sp->pf->af->net_header_len;
+ val = asoc->pathmtu - af->net_header_len;
+ val -= af->ip_options_len(sk);
val -= sizeof(struct sctphdr) +
sctp_datachk_len(&asoc->stream);
}
@@ -3270,6 +3318,13 @@ static int sctp_setsockopt_peer_primary_addr(struct sock *sk, char __user *optva
if (!sctp_assoc_lookup_laddr(asoc, (union sctp_addr *)&prim.sspp_addr))
return -EADDRNOTAVAIL;
+ /* Allow security module to validate address. */
+ err = security_sctp_bind_connect(sk, SCTP_SET_PEER_PRIMARY_ADDR,
+ (struct sockaddr *)&prim.sspp_addr,
+ af->sockaddr_len);
+ if (err)
+ return err;
+
/* Create an ASCONF chunk with SET_PRIMARY parameter */
chunk = sctp_make_asconf_set_prim(asoc,
(union sctp_addr *)&prim.sspp_addr);
@@ -5143,9 +5198,11 @@ int sctp_do_peeloff(struct sock *sk, sctp_assoc_t id, struct socket **sockp)
sctp_copy_sock(sock->sk, sk, asoc);
/* Make peeled-off sockets more like 1-1 accepted sockets.
- * Set the daddr and initialize id to something more random
+ * Set the daddr and initialize id to something more random and also
+ * copy over any ip options.
*/
sp->pf->to_sk_daddr(&asoc->peer.primary_addr, sk);
+ sp->pf->copy_ip_options(sk, sock->sk);
/* Populate the fields of the newsk from the oldsk and migrate the
* asoc to the newsk.
@@ -8468,6 +8525,8 @@ void sctp_copy_sock(struct sock *newsk, struct sock *sk,
{
struct inet_sock *inet = inet_sk(sk);
struct inet_sock *newinet;
+ struct sctp_sock *sp = sctp_sk(sk);
+ struct sctp_endpoint *ep = sp->ep;
newsk->sk_type = sk->sk_type;
newsk->sk_bound_dev_if = sk->sk_bound_dev_if;
@@ -8510,7 +8569,10 @@ void sctp_copy_sock(struct sock *newsk, struct sock *sk,
if (newsk->sk_flags & SK_FLAGS_TIMESTAMP)
net_enable_timestamp();
- security_sk_clone(sk, newsk);
+ /* Set newsk security attributes from original sk and connection
+ * security attribute from ep.
+ */
+ security_sctp_sk_clone(ep, sk, newsk);
}
static inline void sctp_copy_descendant(struct sock *sk_to,
diff --git a/net/sctp/stream.c b/net/sctp/stream.c
index f799043abec9..f1f1d1b232ba 100644
--- a/net/sctp/stream.c
+++ b/net/sctp/stream.c
@@ -240,6 +240,8 @@ void sctp_stream_update(struct sctp_stream *stream, struct sctp_stream *new)
new->out = NULL;
new->in = NULL;
+ new->outcnt = 0;
+ new->incnt = 0;
}
static int sctp_send_reconf(struct sctp_association *asoc,
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 5f8046c62d90..544bab42f925 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -292,6 +292,17 @@ static void smc_copy_sock_settings_to_smc(struct smc_sock *smc)
smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC);
}
+/* register a new rmb */
+static int smc_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc)
+{
+ /* register memory region for new rmb */
+ if (smc_wr_reg_send(link, rmb_desc->mr_rx[SMC_SINGLE_LINK])) {
+ rmb_desc->regerr = 1;
+ return -EFAULT;
+ }
+ return 0;
+}
+
static int smc_clnt_conf_first_link(struct smc_sock *smc)
{
struct smc_link_group *lgr = smc->conn.lgr;
@@ -321,9 +332,7 @@ static int smc_clnt_conf_first_link(struct smc_sock *smc)
smc_wr_remember_qp_attr(link);
- rc = smc_wr_reg_send(link,
- smc->conn.rmb_desc->mr_rx[SMC_SINGLE_LINK]);
- if (rc)
+ if (smc_reg_rmb(link, smc->conn.rmb_desc))
return SMC_CLC_DECL_INTERR;
/* send CONFIRM LINK response over RoCE fabric */
@@ -473,13 +482,8 @@ static int smc_connect_rdma(struct smc_sock *smc)
goto decline_rdma_unlock;
}
} else {
- struct smc_buf_desc *buf_desc = smc->conn.rmb_desc;
-
- if (!buf_desc->reused) {
- /* register memory region for new rmb */
- rc = smc_wr_reg_send(link,
- buf_desc->mr_rx[SMC_SINGLE_LINK]);
- if (rc) {
+ if (!smc->conn.rmb_desc->reused) {
+ if (smc_reg_rmb(link, smc->conn.rmb_desc)) {
reason_code = SMC_CLC_DECL_INTERR;
goto decline_rdma_unlock;
}
@@ -719,9 +723,7 @@ static int smc_serv_conf_first_link(struct smc_sock *smc)
link = &lgr->lnk[SMC_SINGLE_LINK];
- rc = smc_wr_reg_send(link,
- smc->conn.rmb_desc->mr_rx[SMC_SINGLE_LINK]);
- if (rc)
+ if (smc_reg_rmb(link, smc->conn.rmb_desc))
return SMC_CLC_DECL_INTERR;
/* send CONFIRM LINK request to client over the RoCE fabric */
@@ -854,13 +856,8 @@ static void smc_listen_work(struct work_struct *work)
smc_rx_init(new_smc);
if (local_contact != SMC_FIRST_CONTACT) {
- struct smc_buf_desc *buf_desc = new_smc->conn.rmb_desc;
-
- if (!buf_desc->reused) {
- /* register memory region for new rmb */
- rc = smc_wr_reg_send(link,
- buf_desc->mr_rx[SMC_SINGLE_LINK]);
- if (rc) {
+ if (!new_smc->conn.rmb_desc->reused) {
+ if (smc_reg_rmb(link, new_smc->conn.rmb_desc)) {
reason_code = SMC_CLC_DECL_INTERR;
goto decline_rdma_unlock;
}
@@ -978,10 +975,6 @@ static void smc_tcp_listen_work(struct work_struct *work)
}
out:
- if (lsmc->clcsock) {
- sock_release(lsmc->clcsock);
- lsmc->clcsock = NULL;
- }
release_sock(lsk);
sock_put(&lsmc->sk); /* sock_hold in smc_listen */
}
@@ -1170,13 +1163,15 @@ static __poll_t smc_poll(struct file *file, struct socket *sock,
/* delegate to CLC child sock */
release_sock(sk);
mask = smc->clcsock->ops->poll(file, smc->clcsock, wait);
- /* if non-blocking connect finished ... */
lock_sock(sk);
- if ((sk->sk_state == SMC_INIT) && (mask & EPOLLOUT)) {
- sk->sk_err = smc->clcsock->sk->sk_err;
- if (sk->sk_err) {
- mask |= EPOLLERR;
- } else {
+ sk->sk_err = smc->clcsock->sk->sk_err;
+ if (sk->sk_err) {
+ mask |= EPOLLERR;
+ } else {
+ /* if non-blocking connect finished ... */
+ if (sk->sk_state == SMC_INIT &&
+ mask & EPOLLOUT &&
+ smc->clcsock->sk->sk_state != TCP_CLOSE) {
rc = smc_connect_rdma(smc);
if (rc < 0)
mask |= EPOLLERR;
@@ -1259,14 +1254,12 @@ static int smc_shutdown(struct socket *sock, int how)
rc = smc_close_shutdown_write(smc);
break;
case SHUT_RD:
- if (sk->sk_state == SMC_LISTEN)
- rc = smc_close_active(smc);
- else
- rc = 0;
- /* nothing more to do because peer is not involved */
+ rc = 0;
+ /* nothing more to do because peer is not involved */
break;
}
- rc1 = kernel_sock_shutdown(smc->clcsock, how);
+ if (smc->clcsock)
+ rc1 = kernel_sock_shutdown(smc->clcsock, how);
/* map sock_shutdown_cmd constants to sk_shutdown value range */
sk->sk_shutdown |= how + 1;
@@ -1322,8 +1315,11 @@ static ssize_t smc_sendpage(struct socket *sock, struct page *page,
smc = smc_sk(sk);
lock_sock(sk);
- if (sk->sk_state != SMC_ACTIVE)
+ if (sk->sk_state != SMC_ACTIVE) {
+ release_sock(sk);
goto out;
+ }
+ release_sock(sk);
if (smc->use_fallback)
rc = kernel_sendpage(smc->clcsock, page, offset,
size, flags);
@@ -1331,7 +1327,6 @@ static ssize_t smc_sendpage(struct socket *sock, struct page *page,
rc = sock_no_sendpage(sock, page, offset, size, flags);
out:
- release_sock(sk);
return rc;
}
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index f44f6803f7ff..d4bd01bb44e1 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -32,6 +32,9 @@
static u32 smc_lgr_num; /* unique link group number */
+static void smc_buf_free(struct smc_buf_desc *buf_desc, struct smc_link *lnk,
+ bool is_rmb);
+
static void smc_lgr_schedule_free_work(struct smc_link_group *lgr)
{
/* client link group creation always follows the server link group
@@ -234,9 +237,22 @@ static void smc_buf_unuse(struct smc_connection *conn)
conn->sndbuf_size = 0;
}
if (conn->rmb_desc) {
- conn->rmb_desc->reused = true;
- conn->rmb_desc->used = 0;
- conn->rmbe_size = 0;
+ if (!conn->rmb_desc->regerr) {
+ conn->rmb_desc->reused = 1;
+ conn->rmb_desc->used = 0;
+ conn->rmbe_size = 0;
+ } else {
+ /* buf registration failed, reuse not possible */
+ struct smc_link_group *lgr = conn->lgr;
+ struct smc_link *lnk;
+
+ write_lock_bh(&lgr->rmbs_lock);
+ list_del(&conn->rmb_desc->list);
+ write_unlock_bh(&lgr->rmbs_lock);
+
+ lnk = &lgr->lnk[SMC_SINGLE_LINK];
+ smc_buf_free(conn->rmb_desc, lnk, true);
+ }
}
}
diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
index 07e2a393e6d9..5dfcb15d529f 100644
--- a/net/smc/smc_core.h
+++ b/net/smc/smc_core.h
@@ -123,7 +123,8 @@ struct smc_buf_desc {
*/
u32 order; /* allocation order */
u32 used; /* currently used / unused */
- bool reused; /* new created / reused */
+ u8 reused : 1; /* new created / reused */
+ u8 regerr : 1; /* err during registration */
};
struct smc_rtoken { /* address/key of remote RMB */
diff --git a/net/socket.c b/net/socket.c
index 54dcb43e25a1..f10f1d947c78 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -1536,7 +1536,7 @@ SYSCALL_DEFINE2(listen, int, fd, int, backlog)
*
* 1003.1g adds the ability to recvmsg() to query connection pending
* status to recvmsg. We need to add that support in a way that's
- * clean when we restucture accept also.
+ * clean when we restructure accept also.
*/
int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr,
diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c
index b9283ce5cd85..092bebc70048 100644
--- a/net/strparser/strparser.c
+++ b/net/strparser/strparser.c
@@ -67,7 +67,7 @@ static void strp_abort_strp(struct strparser *strp, int err)
static void strp_start_timer(struct strparser *strp, long timeo)
{
- if (timeo)
+ if (timeo && timeo != LONG_MAX)
mod_delayed_work(strp_wq, &strp->msg_timer_work, timeo);
}
@@ -296,9 +296,9 @@ static int __strp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb,
strp_start_timer(strp, timeo);
}
+ stm->accum_len += cand_len;
strp->need_bytes = stm->strp.full_len -
stm->accum_len;
- stm->accum_len += cand_len;
stm->early_eaten = cand_len;
STRP_STATS_ADD(strp->stats.bytes, cand_len);
desc->count = 0; /* Stop reading socket */
@@ -321,6 +321,7 @@ static int __strp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb,
/* Hurray, we have a new message! */
cancel_delayed_work(&strp->msg_timer_work);
strp->skb_head = NULL;
+ strp->need_bytes = 0;
STRP_STATS_INCR(strp->stats.msgs);
/* Give skb to upper layer */
@@ -410,9 +411,7 @@ void strp_data_ready(struct strparser *strp)
return;
if (strp->need_bytes) {
- if (strp_peek_len(strp) >= strp->need_bytes)
- strp->need_bytes = 0;
- else
+ if (strp_peek_len(strp) < strp->need_bytes)
return;
}
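
In the first hunk above, LONG_MAX is the value an infinite receive timeout arrives as (MAX_SCHEDULE_TIMEOUT), so the guard treats it like zero and arms no message timer. A userspace analogue of the guard:

#include <limits.h>
#include <stdio.h>

static void start_timer(long timeo)
{
	if (timeo && timeo != LONG_MAX)
		printf("arm timer for %ld ticks\n", timeo);
	else
		printf("no timer: zero or infinite timeout\n");
}

int main(void)
{
	start_timer(0);
	start_timer(LONG_MAX);
	start_timer(250);
	return 0;
}
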
diff --git a/net/sunrpc/auth_gss/gss_krb5_crypto.c b/net/sunrpc/auth_gss/gss_krb5_crypto.c
index 12649c9fedab..8654494b4d0a 100644
--- a/net/sunrpc/auth_gss/gss_krb5_crypto.c
+++ b/net/sunrpc/auth_gss/gss_krb5_crypto.c
@@ -237,9 +237,6 @@ make_checksum_hmac_md5(struct krb5_ctx *kctx, char *header, int hdrlen,
ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL);
- err = crypto_ahash_init(req);
- if (err)
- goto out;
err = crypto_ahash_setkey(hmac_md5, cksumkey, kctx->gk5e->keylength);
if (err)
goto out;
diff --git a/net/sunrpc/auth_gss/gss_krb5_seal.c b/net/sunrpc/auth_gss/gss_krb5_seal.c
index 1d74d653e6c0..94a2b3f082a8 100644
--- a/net/sunrpc/auth_gss/gss_krb5_seal.c
+++ b/net/sunrpc/auth_gss/gss_krb5_seal.c
@@ -177,6 +177,7 @@ gss_get_mic_v2(struct krb5_ctx *ctx, struct xdr_buf *text,
u64 seq_send;
u8 *cksumkey;
unsigned int cksum_usage;
+ __be64 seq_send_be64;
dprintk("RPC: %s\n", __func__);
@@ -187,7 +188,9 @@ gss_get_mic_v2(struct krb5_ctx *ctx, struct xdr_buf *text,
spin_lock(&krb5_seq_lock);
seq_send = ctx->seq_send64++;
spin_unlock(&krb5_seq_lock);
- *((__be64 *)(krb5_hdr + 8)) = cpu_to_be64(seq_send);
+
+ seq_send_be64 = cpu_to_be64(seq_send);
+ memcpy(krb5_hdr + 8, (char *) &seq_send_be64, 8);
if (ctx->initiate) {
cksumkey = ctx->initiator_sign;
diff --git a/net/sunrpc/auth_gss/gss_krb5_unseal.c b/net/sunrpc/auth_gss/gss_krb5_unseal.c
index dcf9515d9aef..b601a73cc9db 100644
--- a/net/sunrpc/auth_gss/gss_krb5_unseal.c
+++ b/net/sunrpc/auth_gss/gss_krb5_unseal.c
@@ -155,10 +155,12 @@ gss_verify_mic_v2(struct krb5_ctx *ctx,
u8 flags;
int i;
unsigned int cksum_usage;
+ __be16 be16_ptr;
dprintk("RPC: %s\n", __func__);
- if (be16_to_cpu(*((__be16 *)ptr)) != KG2_TOK_MIC)
+ memcpy(&be16_ptr, (char *) ptr, 2);
+ if (be16_to_cpu(be16_ptr) != KG2_TOK_MIC)
return GSS_S_DEFECTIVE_TOKEN;
flags = ptr[2];
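
Both GSS hunks above replace direct casts with memcpy() because the 64-bit sequence number and the 16-bit token id sit at buffer offsets that are not guaranteed to be naturally aligned; a cast-and-dereference is undefined behaviour on strict-alignment machines. A minimal sketch of the pattern (byte-order conversion, which the kernel code also does, is elided here):

#include <stdint.h>
#include <string.h>
#include <stdio.h>

int main(void)
{
	uint8_t hdr[16] = { 0 };
	uint64_t seq = 0x0102030405060708ULL;
	uint16_t tok;

	memcpy(hdr + 8, &seq, sizeof(seq));	/* unaligned store */
	memcpy(&tok, hdr + 8, sizeof(tok));	/* unaligned load */
	printf("tok=0x%04x\n", tok);
	return 0;
}
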
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index c536cc24b3d1..cdda4744c9b1 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -1450,8 +1450,8 @@ static ssize_t write_flush(struct file *file, const char __user *buf,
struct cache_detail *cd)
{
char tbuf[20];
- char *bp, *ep;
- time_t then, now;
+ char *ep;
+ time_t now;
if (*ppos || count > sizeof(tbuf)-1)
return -EINVAL;
@@ -1461,24 +1461,24 @@ static ssize_t write_flush(struct file *file, const char __user *buf,
simple_strtoul(tbuf, &ep, 0);
if (*ep && *ep != '\n')
return -EINVAL;
+ /* Note that while we check that 'buf' holds a valid number,
+ * we always ignore the value and just flush everything.
+ * Making use of the number leads to races.
+ */
- bp = tbuf;
- then = get_expiry(&bp);
now = seconds_since_boot();
- cd->nextcheck = now;
- /* Can only set flush_time to 1 second beyond "now", or
- * possibly 1 second beyond flushtime. This is because
- * flush_time never goes backwards so it mustn't get too far
- * ahead of time.
+ /* Always flush everything, so behave like cache_purge()
+ * Do this by advancing flush_time to the current time,
+ * or by one second if it has already reached the current time.
+ * Newly added cache entries will always have ->last_refresh greater
+ * than ->flush_time, so they don't get flushed prematurely.
*/
- if (then >= now) {
- /* Want to flush everything, so behave like cache_purge() */
- if (cd->flush_time >= now)
- now = cd->flush_time + 1;
- then = now;
- }
- cd->flush_time = then;
+ if (cd->flush_time >= now)
+ now = cd->flush_time + 1;
+
+ cd->flush_time = now;
+ cd->nextcheck = now;
cache_flush();
*ppos += count;
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 806395687bb6..c2266f387213 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1887,7 +1887,7 @@ call_connect_status(struct rpc_task *task)
dprint_status(task);
- trace_rpc_connect_status(task, status);
+ trace_rpc_connect_status(task);
task->tk_status = 0;
switch (status) {
case -ECONNREFUSED:
@@ -2014,6 +2014,9 @@ call_transmit_status(struct rpc_task *task)
case -EPERM:
if (RPC_IS_SOFTCONN(task)) {
xprt_end_transmit(task);
+ if (!task->tk_msg.rpc_proc->p_proc)
+ trace_xprt_ping(task->tk_xprt,
+ task->tk_status);
rpc_exit(task, task->tk_status);
break;
}
@@ -2112,6 +2115,9 @@ call_status(struct rpc_task *task)
struct rpc_rqst *req = task->tk_rqstp;
int status;
+ if (!task->tk_msg.rpc_proc->p_proc)
+ trace_xprt_ping(task->tk_xprt, task->tk_status);
+
if (req->rq_reply_bytes_recvd > 0 && !req->rq_bytes_sent)
task->tk_status = req->rq_reply_bytes_recvd;
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index 0f08934b2cea..c81ef5e6c981 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -1375,6 +1375,7 @@ rpc_gssd_dummy_depopulate(struct dentry *pipe_dentry)
struct dentry *clnt_dir = pipe_dentry->d_parent;
struct dentry *gssd_dir = clnt_dir->d_parent;
+ dget(pipe_dentry);
__rpc_rmpipe(d_inode(clnt_dir), pipe_dentry);
__rpc_depopulate(clnt_dir, gssd_dummy_info_file, 0, 1);
__rpc_depopulate(gssd_dir, gssd_dummy_clnt_dir, 0, 1);
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index d9db2eab3a8d..3fe5d60ab0e2 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -276,7 +276,7 @@ static void rpc_set_active(struct rpc_task *task)
{
rpc_task_set_debuginfo(task);
set_bit(RPC_TASK_ACTIVE, &task->tk_runstate);
- trace_rpc_task_begin(task->tk_client, task, NULL);
+ trace_rpc_task_begin(task, NULL);
}
/*
@@ -291,7 +291,7 @@ static int rpc_complete_task(struct rpc_task *task)
unsigned long flags;
int ret;
- trace_rpc_task_complete(task->tk_client, task, NULL);
+ trace_rpc_task_complete(task, NULL);
spin_lock_irqsave(&wq->lock, flags);
clear_bit(RPC_TASK_ACTIVE, &task->tk_runstate);
@@ -358,7 +358,7 @@ static void __rpc_sleep_on_priority(struct rpc_wait_queue *q,
dprintk("RPC: %5u sleep_on(queue \"%s\" time %lu)\n",
task->tk_pid, rpc_qname(q), jiffies);
- trace_rpc_task_sleep(task->tk_client, task, q);
+ trace_rpc_task_sleep(task, q);
__rpc_add_wait_queue(q, task, queue_priority);
@@ -428,7 +428,7 @@ static void __rpc_do_wake_up_task_on_wq(struct workqueue_struct *wq,
return;
}
- trace_rpc_task_wakeup(task->tk_client, task, queue);
+ trace_rpc_task_wakeup(task, queue);
__rpc_remove_wait_queue(queue, task);
@@ -780,7 +780,7 @@ static void __rpc_execute(struct rpc_task *task)
}
if (!do_action)
break;
- trace_rpc_task_run_action(task->tk_client, task, do_action);
+ trace_rpc_task_run_action(task, do_action);
do_action(task);
/*
diff --git a/net/sunrpc/stats.c b/net/sunrpc/stats.c
index 1e671333c3d5..f68aa46c9dd7 100644
--- a/net/sunrpc/stats.c
+++ b/net/sunrpc/stats.c
@@ -24,6 +24,8 @@
#include <linux/sunrpc/metrics.h>
#include <linux/rcupdate.h>
+#include <trace/events/sunrpc.h>
+
#include "netns.h"
#define RPCDBG_FACILITY RPCDBG_MISC
@@ -148,7 +150,7 @@ void rpc_count_iostats_metrics(const struct rpc_task *task,
struct rpc_iostats *op_metrics)
{
struct rpc_rqst *req = task->tk_rqstp;
- ktime_t delta, now;
+ ktime_t backlog, execute, now;
if (!op_metrics || !req)
return;
@@ -164,16 +166,20 @@ void rpc_count_iostats_metrics(const struct rpc_task *task,
op_metrics->om_bytes_sent += req->rq_xmit_bytes_sent;
op_metrics->om_bytes_recv += req->rq_reply_bytes_recvd;
+ backlog = 0;
if (ktime_to_ns(req->rq_xtime)) {
- delta = ktime_sub(req->rq_xtime, task->tk_start);
- op_metrics->om_queue = ktime_add(op_metrics->om_queue, delta);
+ backlog = ktime_sub(req->rq_xtime, task->tk_start);
+ op_metrics->om_queue = ktime_add(op_metrics->om_queue, backlog);
}
+
op_metrics->om_rtt = ktime_add(op_metrics->om_rtt, req->rq_rtt);
- delta = ktime_sub(now, task->tk_start);
- op_metrics->om_execute = ktime_add(op_metrics->om_execute, delta);
+ execute = ktime_sub(now, task->tk_start);
+ op_metrics->om_execute = ktime_add(op_metrics->om_execute, execute);
spin_unlock(&op_metrics->om_lock);
+
+ trace_rpc_stats_latency(req->rq_task, backlog, req->rq_rtt, execute);
}
EXPORT_SYMBOL_GPL(rpc_count_iostats_metrics);
diff --git a/net/sunrpc/sunrpc.h b/net/sunrpc/sunrpc.h
index f2b7cb540e61..09a0315ea77b 100644
--- a/net/sunrpc/sunrpc.h
+++ b/net/sunrpc/sunrpc.h
@@ -37,12 +37,6 @@ struct rpc_buffer {
char data[];
};
-static inline int rpc_reply_expected(struct rpc_task *task)
-{
- return (task->tk_msg.rpc_proc != NULL) &&
- (task->tk_msg.rpc_proc->p_decode != NULL);
-}
-
static inline int sock_is_loopback(struct sock *sk)
{
struct dst_entry *dst;
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 387cc4add6f6..30a4226baf03 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -1255,6 +1255,7 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv)
/* Syntactic check complete */
serv->sv_stats->rpccnt++;
+ trace_svc_process(rqstp, progp->pg_name);
/* Build the reply header. */
statp = resv->iov_base +resv->iov_len;
@@ -1431,14 +1432,10 @@ svc_process(struct svc_rqst *rqstp)
}
/* Returns 1 for send, 0 for drop */
- if (likely(svc_process_common(rqstp, argv, resv))) {
- int ret = svc_send(rqstp);
+ if (likely(svc_process_common(rqstp, argv, resv)))
+ return svc_send(rqstp);
- trace_svc_process(rqstp, ret);
- return ret;
- }
out_drop:
- trace_svc_process(rqstp, 0);
svc_drop(rqstp);
return 0;
}
@@ -1536,3 +1533,112 @@ u32 svc_max_payload(const struct svc_rqst *rqstp)
return max;
}
EXPORT_SYMBOL_GPL(svc_max_payload);
+
+/**
+ * svc_fill_write_vector - Construct data argument for VFS write call
+ * @rqstp: svc_rqst to operate on
+ * @first: buffer containing first section of write payload
+ * @total: total number of bytes of write payload
+ *
+ * Returns the number of elements populated in the data argument array.
+ */
+unsigned int svc_fill_write_vector(struct svc_rqst *rqstp, struct kvec *first,
+ size_t total)
+{
+ struct kvec *vec = rqstp->rq_vec;
+ struct page **pages;
+ unsigned int i;
+
+ /* Some types of transport can present the write payload
+ * entirely in rq_arg.pages. In this case, @first is empty.
+ */
+ i = 0;
+ if (first->iov_len) {
+ vec[i].iov_base = first->iov_base;
+ vec[i].iov_len = min_t(size_t, total, first->iov_len);
+ total -= vec[i].iov_len;
+ ++i;
+ }
+
+ WARN_ON_ONCE(rqstp->rq_arg.page_base != 0);
+ pages = rqstp->rq_arg.pages;
+ while (total) {
+ vec[i].iov_base = page_address(*pages);
+ vec[i].iov_len = min_t(size_t, total, PAGE_SIZE);
+ total -= vec[i].iov_len;
+ ++i;
+
+ ++pages;
+ }
+
+ WARN_ON_ONCE(i > ARRAY_SIZE(rqstp->rq_vec));
+ return i;
+}
+EXPORT_SYMBOL_GPL(svc_fill_write_vector);
+
+/**
+ * svc_fill_symlink_pathname - Construct pathname argument for VFS symlink call
+ * @rqstp: svc_rqst to operate on
+ * @first: buffer containing first section of pathname
+ * @total: total length of the pathname argument
+ *
+ * Returns pointer to a NUL-terminated string, or an ERR_PTR. The buffer is
+ * released automatically when @rqstp is recycled.
+ */
+char *svc_fill_symlink_pathname(struct svc_rqst *rqstp, struct kvec *first,
+ size_t total)
+{
+ struct xdr_buf *arg = &rqstp->rq_arg;
+ struct page **pages;
+ char *result;
+
+ /* VFS API demands a NUL-terminated pathname. This function
+ * uses a page from @rqstp as the pathname buffer, to enable
+ * direct placement. Thus the total buffer size is PAGE_SIZE.
+ * Space in this buffer for NUL-termination requires that we
+ * cap the size of the returned symlink pathname just a
+ * little early.
+ */
+ if (total > PAGE_SIZE - 1)
+ return ERR_PTR(-ENAMETOOLONG);
+
+ /* Some types of transport can present the pathname entirely
+ * in rq_arg.pages. If not, then copy the pathname into one
+ * page.
+ */
+ pages = arg->pages;
+ WARN_ON_ONCE(arg->page_base != 0);
+ if (first->iov_base == 0) {
+ result = page_address(*pages);
+ result[total] = '\0';
+ } else {
+ size_t len, remaining;
+ char *dst;
+
+ result = page_address(*(rqstp->rq_next_page++));
+ dst = result;
+ remaining = total;
+
+ len = min_t(size_t, total, first->iov_len);
+ memcpy(dst, first->iov_base, len);
+ dst += len;
+ remaining -= len;
+
+ /* No more than one page left */
+ if (remaining) {
+ len = min_t(size_t, remaining, PAGE_SIZE);
+ memcpy(dst, page_address(*pages), len);
+ dst += len;
+ }
+
+ *dst = '\0';
+ }
+
+ /* Sanity check: we don't allow the pathname argument to
+ * contain a NUL byte.
+ */
+ if (strlen(result) != total)
+ return ERR_PTR(-EINVAL);
+ return result;
+}
+EXPORT_SYMBOL_GPL(svc_fill_symlink_pathname);
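
A userspace analogue of what svc_fill_write_vector() does may make the loop easier to follow: build an iovec array from an optional head fragment plus whole pages until total bytes are covered. PAGE_SZ and the buffers here are local stand-ins, not the kernel's:

#include <stddef.h>
#include <stdio.h>
#include <sys/uio.h>

#define PAGE_SZ 4096

static unsigned int fill_vec(struct iovec *vec, char *head, size_t head_len,
			     char **pages, size_t total)
{
	unsigned int i = 0;

	if (head_len) {			/* head fragment first, if any */
		vec[i].iov_base = head;
		vec[i].iov_len = total < head_len ? total : head_len;
		total -= vec[i].iov_len;
		i++;
	}
	while (total) {			/* then whole pages */
		vec[i].iov_base = *pages++;
		vec[i].iov_len = total < PAGE_SZ ? total : PAGE_SZ;
		total -= vec[i].iov_len;
		i++;
	}
	return i;
}

int main(void)
{
	static char head[8] = "headdata";
	static char page0[PAGE_SZ], page1[PAGE_SZ];
	char *pages[] = { page0, page1 };
	struct iovec vec[4];

	/* 8-byte head + 4096 + 896 page bytes -> 3 segments */
	printf("%u segments\n", fill_vec(vec, head, sizeof(head), pages, 5000));
	return 0;
}
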
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index f9307bd6644b..5185efb9027b 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -173,6 +173,7 @@ void svc_xprt_init(struct net *net, struct svc_xprt_class *xcl,
set_bit(XPT_BUSY, &xprt->xpt_flags);
rpc_init_wait_queue(&xprt->xpt_bc_pending, "xpt_bc_pending");
xprt->xpt_net = get_net(net);
+ strcpy(xprt->xpt_remotebuf, "uninitialized");
}
EXPORT_SYMBOL_GPL(svc_xprt_init);
@@ -382,25 +383,21 @@ void svc_xprt_do_enqueue(struct svc_xprt *xprt)
int cpu;
if (!svc_xprt_has_something_to_do(xprt))
- goto out;
+ return;
/* Mark transport as busy. It will remain in this state until
* the provider calls svc_xprt_received. We update XPT_BUSY
* atomically because it also guards against trying to enqueue
* the transport twice.
*/
- if (test_and_set_bit(XPT_BUSY, &xprt->xpt_flags)) {
- /* Don't enqueue transport while already enqueued */
- dprintk("svc: transport %p busy, not enqueued\n", xprt);
- goto out;
- }
+ if (test_and_set_bit(XPT_BUSY, &xprt->xpt_flags))
+ return;
cpu = get_cpu();
pool = svc_pool_for_cpu(xprt->xpt_server, cpu);
atomic_long_inc(&pool->sp_stats.packets);
- dprintk("svc: transport %p put into queue\n", xprt);
spin_lock_bh(&pool->sp_lock);
list_add_tail(&xprt->xpt_ready, &pool->sp_sockets);
pool->sp_stats.sockets_queued++;
@@ -412,6 +409,7 @@ void svc_xprt_do_enqueue(struct svc_xprt *xprt)
if (test_and_set_bit(RQ_BUSY, &rqstp->rq_flags))
continue;
atomic_long_inc(&pool->sp_stats.threads_woken);
+ rqstp->rq_qtime = ktime_get();
wake_up_process(rqstp->rq_task);
goto out_unlock;
}
@@ -420,7 +418,6 @@ void svc_xprt_do_enqueue(struct svc_xprt *xprt)
out_unlock:
rcu_read_unlock();
put_cpu();
-out:
trace_svc_xprt_do_enqueue(xprt, rqstp);
}
EXPORT_SYMBOL_GPL(svc_xprt_do_enqueue);
@@ -454,13 +451,9 @@ static struct svc_xprt *svc_xprt_dequeue(struct svc_pool *pool)
struct svc_xprt, xpt_ready);
list_del_init(&xprt->xpt_ready);
svc_xprt_get(xprt);
-
- dprintk("svc: transport %p dequeued, inuse=%d\n",
- xprt, kref_read(&xprt->xpt_ref));
}
spin_unlock_bh(&pool->sp_lock);
out:
- trace_svc_xprt_dequeue(xprt);
return xprt;
}
@@ -492,7 +485,7 @@ static void svc_xprt_release(struct svc_rqst *rqstp)
{
struct svc_xprt *xprt = rqstp->rq_xprt;
- rqstp->rq_xprt->xpt_ops->xpo_release_rqst(rqstp);
+ xprt->xpt_ops->xpo_release_rqst(rqstp);
kfree(rqstp->rq_deferred);
rqstp->rq_deferred = NULL;
@@ -538,7 +531,6 @@ void svc_wake_up(struct svc_serv *serv)
if (test_bit(RQ_BUSY, &rqstp->rq_flags))
continue;
rcu_read_unlock();
- dprintk("svc: daemon %p woken up.\n", rqstp);
wake_up_process(rqstp->rq_task);
trace_svc_wake_up(rqstp->rq_task->pid);
return;
@@ -734,6 +726,7 @@ out_found:
rqstp->rq_chandle.thread_wait = 5*HZ;
else
rqstp->rq_chandle.thread_wait = 1*HZ;
+ trace_svc_xprt_dequeue(rqstp);
return rqstp->rq_xprt;
}
@@ -789,7 +782,7 @@ static int svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt)
len = svc_deferred_recv(rqstp);
else
len = xprt->xpt_ops->xpo_recvfrom(rqstp);
- dprintk("svc: got len=%d\n", len);
+ rqstp->rq_stime = ktime_get();
rqstp->rq_reserved = serv->sv_max_mesg;
atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved);
}
@@ -844,10 +837,7 @@ int svc_recv(struct svc_rqst *rqstp, long timeout)
clear_bit(XPT_OLD, &xprt->xpt_flags);
- if (xprt->xpt_ops->xpo_secure_port(rqstp))
- set_bit(RQ_SECURE, &rqstp->rq_flags);
- else
- clear_bit(RQ_SECURE, &rqstp->rq_flags);
+ xprt->xpt_ops->xpo_secure_port(rqstp);
rqstp->rq_chandle.defer = svc_defer;
rqstp->rq_xid = svc_getu32(&rqstp->rq_arg.head[0]);
@@ -859,7 +849,6 @@ out_release:
rqstp->rq_res.len = 0;
svc_xprt_release(rqstp);
out:
- trace_svc_recv(rqstp, err);
return err;
}
EXPORT_SYMBOL_GPL(svc_recv);
@@ -889,7 +878,7 @@ int svc_send(struct svc_rqst *rqstp)
goto out;
/* release the receive skb before sending the reply */
- rqstp->rq_xprt->xpt_ops->xpo_release_rqst(rqstp);
+ xprt->xpt_ops->xpo_release_rqst(rqstp);
/* calculate over-all length */
xb = &rqstp->rq_res;
@@ -899,6 +888,7 @@ int svc_send(struct svc_rqst *rqstp)
/* Grab mutex to serialize outgoing data. */
mutex_lock(&xprt->xpt_mutex);
+ trace_svc_stats_latency(rqstp);
if (test_bit(XPT_DEAD, &xprt->xpt_flags)
|| test_bit(XPT_CLOSE, &xprt->xpt_flags))
len = -ENOTCONN;
@@ -906,12 +896,12 @@ int svc_send(struct svc_rqst *rqstp)
len = xprt->xpt_ops->xpo_sendto(rqstp);
mutex_unlock(&xprt->xpt_mutex);
rpc_wake_up(&xprt->xpt_bc_pending);
+ trace_svc_send(rqstp, len);
svc_xprt_release(rqstp);
if (len == -ECONNREFUSED || len == -ENOTCONN || len == -EAGAIN)
len = 0;
out:
- trace_svc_send(rqstp, len);
return len;
}
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 08cd951aaeea..5445145e639c 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -391,9 +391,12 @@ static void svc_sock_setbufsize(struct socket *sock, unsigned int snd,
release_sock(sock->sk);
}
-static int svc_sock_secure_port(struct svc_rqst *rqstp)
+static void svc_sock_secure_port(struct svc_rqst *rqstp)
{
- return svc_port_is_privileged(svc_addr(rqstp));
+ if (svc_port_is_privileged(svc_addr(rqstp)))
+ set_bit(RQ_SECURE, &rqstp->rq_flags);
+ else
+ clear_bit(RQ_SECURE, &rqstp->rq_flags);
}
/*
@@ -1309,6 +1312,7 @@ static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv)
set_bit(XPT_CONG_CTRL, &svsk->sk_xprt.xpt_flags);
if (sk->sk_state == TCP_LISTEN) {
dprintk("setting up TCP socket for listening\n");
+ strcpy(svsk->sk_xprt.xpt_remotebuf, "listener");
set_bit(XPT_LISTENER, &svsk->sk_xprt.xpt_flags);
sk->sk_data_ready = svc_tcp_listen_data_ready;
set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags);
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index e34f4ee7f2b6..30afbd236656 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -1519,6 +1519,88 @@ out:
EXPORT_SYMBOL_GPL(xdr_process_buf);
/**
+ * xdr_stream_decode_opaque - Decode variable length opaque
+ * @xdr: pointer to xdr_stream
+ * @ptr: location to store opaque data
+ * @size: size of storage buffer @ptr
+ *
+ * Return values:
+ * On success, returns size of object stored in *@ptr
+ * %-EBADMSG on XDR buffer overflow
+ * %-EMSGSIZE on overflow of storage buffer @ptr
+ */
+ssize_t xdr_stream_decode_opaque(struct xdr_stream *xdr, void *ptr, size_t size)
+{
+ ssize_t ret;
+ void *p;
+
+ ret = xdr_stream_decode_opaque_inline(xdr, &p, size);
+ if (ret <= 0)
+ return ret;
+ memcpy(ptr, p, ret);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(xdr_stream_decode_opaque);
+
+/**
+ * xdr_stream_decode_opaque_dup - Decode and duplicate variable length opaque
+ * @xdr: pointer to xdr_stream
+ * @ptr: location to store pointer to opaque data
+ * @maxlen: maximum acceptable object size
+ * @gfp_flags: GFP mask to use
+ *
+ * Return values:
+ * On success, returns size of object stored in *@ptr
+ * %-EBADMSG on XDR buffer overflow
+ * %-EMSGSIZE if the size of the object would exceed @maxlen
+ * %-ENOMEM on memory allocation failure
+ */
+ssize_t xdr_stream_decode_opaque_dup(struct xdr_stream *xdr, void **ptr,
+ size_t maxlen, gfp_t gfp_flags)
+{
+ ssize_t ret;
+ void *p;
+
+ ret = xdr_stream_decode_opaque_inline(xdr, &p, maxlen);
+ if (ret > 0) {
+ *ptr = kmemdup(p, ret, gfp_flags);
+ if (*ptr != NULL)
+ return ret;
+ ret = -ENOMEM;
+ }
+ *ptr = NULL;
+ return ret;
+}
+EXPORT_SYMBOL_GPL(xdr_stream_decode_opaque_dup);
+
+/**
+ * xdr_stream_decode_string - Decode variable length string
+ * @xdr: pointer to xdr_stream
+ * @str: location to store string
+ * @size: size of storage buffer @str
+ *
+ * Return values:
+ * On success, returns length of NUL-terminated string stored in *@str
+ * %-EBADMSG on XDR buffer overflow
+ * %-EMSGSIZE on overflow of storage buffer @str
+ */
+ssize_t xdr_stream_decode_string(struct xdr_stream *xdr, char *str, size_t size)
+{
+ ssize_t ret;
+ void *p;
+
+ ret = xdr_stream_decode_opaque_inline(xdr, &p, size);
+ if (ret > 0) {
+ memcpy(str, p, ret);
+ str[ret] = '\0';
+ return strlen(str);
+ }
+ *str = '\0';
+ return ret;
+}
+EXPORT_SYMBOL_GPL(xdr_stream_decode_string);
+
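A companion sketch for the string variants; the _dup form allocates with the given GFP mask and the caller owns the result. Names and bounds below are illustrative:

/* Sketch: fixed-buffer and duplicating string decode. */
#include <linux/slab.h>
#include <linux/sunrpc/xdr.h>

static ssize_t example_decode_name(struct xdr_stream *xdr)
{
	char name[32];		/* hypothetical bound */
	char *copy;
	ssize_t len;

	len = xdr_stream_decode_string(xdr, name, sizeof(name));
	if (len < 0)
		return len;

	len = xdr_stream_decode_string_dup(xdr, &copy, 128, GFP_KERNEL);
	if (len < 0)
		return len;
	kfree(copy);		/* caller owns the duplicate */
	return 0;
}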
+/**
* xdr_stream_decode_string_dup - Decode and duplicate variable length string
* @xdr: pointer to xdr_stream
* @str: location to store pointer to string
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 8f0ad4f268da..70f005044f06 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -826,6 +826,7 @@ static void xprt_connect_status(struct rpc_task *task)
* @xprt: transport on which the original request was transmitted
* @xid: RPC XID of incoming reply
*
+ * Caller holds xprt->recv_lock.
*/
struct rpc_rqst *xprt_lookup_rqst(struct rpc_xprt *xprt, __be32 xid)
{
@@ -834,6 +835,7 @@ struct rpc_rqst *xprt_lookup_rqst(struct rpc_xprt *xprt, __be32 xid)
list_for_each_entry(entry, &xprt->recv, rq_list)
if (entry->rq_xid == xid) {
trace_xprt_lookup_rqst(xprt, xid, 0);
+ entry->rq_rtt = ktime_sub(ktime_get(), entry->rq_xtime);
return entry;
}
@@ -889,7 +891,13 @@ __must_hold(&req->rq_xprt->recv_lock)
}
}
-static void xprt_update_rtt(struct rpc_task *task)
+/**
+ * xprt_update_rtt - Update RPC RTT statistics
+ * @task: RPC request that recently completed
+ *
+ * Caller holds xprt->recv_lock.
+ */
+void xprt_update_rtt(struct rpc_task *task)
{
struct rpc_rqst *req = task->tk_rqstp;
struct rpc_rtt *rtt = task->tk_client->cl_rtt;
@@ -902,13 +910,14 @@ static void xprt_update_rtt(struct rpc_task *task)
rpc_set_timeo(rtt, timer, req->rq_ntrans - 1);
}
}
+EXPORT_SYMBOL_GPL(xprt_update_rtt);
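With xprt_update_rtt() exported, a socket transport can sample RTT as soon as the reply is matched, while xprt->recv_lock is still held (see the xs_udp_data_read_skb() hunk later in this patch). The pattern, condensed:

/* Condensed from this patch; error paths omitted. */
spin_lock(&xprt->recv_lock);
req = xprt_lookup_rqst(xprt, xid);	/* also stamps req->rq_rtt */
if (req) {
	xprt_pin_rqst(req);
	xprt_update_rtt(req->rq_task);	/* feed the RTO estimator */
}
spin_unlock(&xprt->recv_lock);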
/**
* xprt_complete_rqst - called when reply processing is complete
* @task: RPC request that recently completed
* @copied: actual number of bytes received from the transport
*
- * Caller holds transport lock.
+ * Caller holds xprt->recv_lock.
*/
void xprt_complete_rqst(struct rpc_task *task, int copied)
{
@@ -920,9 +929,6 @@ void xprt_complete_rqst(struct rpc_task *task, int copied)
trace_xprt_complete_rqst(xprt, req->rq_xid, copied);
xprt->stat.recvs++;
- req->rq_rtt = ktime_sub(ktime_get(), req->rq_xtime);
- if (xprt->ops->timer != NULL)
- xprt_update_rtt(task);
list_del_init(&req->rq_list);
req->rq_private_buf.len = copied;
@@ -1003,7 +1009,7 @@ void xprt_transmit(struct rpc_task *task)
struct rpc_rqst *req = task->tk_rqstp;
struct rpc_xprt *xprt = req->rq_xprt;
unsigned int connect_cookie;
- int status, numreqs;
+ int status;
dprintk("RPC: %5u xprt_transmit(%u)\n", task->tk_pid, req->rq_slen);
@@ -1027,7 +1033,6 @@ void xprt_transmit(struct rpc_task *task)
return;
connect_cookie = xprt->connect_cookie;
- req->rq_xtime = ktime_get();
status = xprt->ops->send_request(task);
trace_xprt_transmit(xprt, req->rq_xid, status);
if (status != 0) {
@@ -1042,9 +1047,6 @@ void xprt_transmit(struct rpc_task *task)
xprt->ops->set_retrans_timeout(task);
- numreqs = atomic_read(&xprt->num_reqs);
- if (numreqs > xprt->stat.max_slots)
- xprt->stat.max_slots = numreqs;
xprt->stat.sends++;
xprt->stat.req_u += xprt->stat.sends - xprt->stat.recvs;
xprt->stat.bklog_u += xprt->backlog.qlen;
@@ -1106,14 +1108,15 @@ static struct rpc_rqst *xprt_dynamic_alloc_slot(struct rpc_xprt *xprt)
{
struct rpc_rqst *req = ERR_PTR(-EAGAIN);
- if (!atomic_add_unless(&xprt->num_reqs, 1, xprt->max_reqs))
+ if (xprt->num_reqs >= xprt->max_reqs)
goto out;
+ ++xprt->num_reqs;
spin_unlock(&xprt->reserve_lock);
req = kzalloc(sizeof(struct rpc_rqst), GFP_NOFS);
spin_lock(&xprt->reserve_lock);
if (req != NULL)
goto out;
- atomic_dec(&xprt->num_reqs);
+ --xprt->num_reqs;
req = ERR_PTR(-ENOMEM);
out:
return req;
@@ -1121,7 +1124,8 @@ out:
static bool xprt_dynamic_free_slot(struct rpc_xprt *xprt, struct rpc_rqst *req)
{
- if (atomic_add_unless(&xprt->num_reqs, -1, xprt->min_reqs)) {
+ if (xprt->num_reqs > xprt->min_reqs) {
+ --xprt->num_reqs;
kfree(req);
return true;
}
@@ -1157,6 +1161,8 @@ void xprt_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task)
spin_unlock(&xprt->reserve_lock);
return;
out_init_req:
+ xprt->stat.max_slots = max_t(unsigned int, xprt->stat.max_slots,
+ xprt->num_reqs);
task->tk_status = 0;
task->tk_rqstp = req;
xprt_request_init(task, xprt);
@@ -1224,7 +1230,7 @@ struct rpc_xprt *xprt_alloc(struct net *net, size_t size,
else
xprt->max_reqs = num_prealloc;
xprt->min_reqs = num_prealloc;
- atomic_set(&xprt->num_reqs, num_prealloc);
+ xprt->num_reqs = num_prealloc;
return xprt;
diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c
index ed1a4a3065ee..47ebac949769 100644
--- a/net/sunrpc/xprtrdma/backchannel.c
+++ b/net/sunrpc/xprtrdma/backchannel.c
@@ -44,13 +44,6 @@ static int rpcrdma_bc_setup_rqst(struct rpcrdma_xprt *r_xprt,
if (IS_ERR(req))
return PTR_ERR(req);
- rb = rpcrdma_alloc_regbuf(RPCRDMA_HDRBUF_SIZE,
- DMA_TO_DEVICE, GFP_KERNEL);
- if (IS_ERR(rb))
- goto out_fail;
- req->rl_rdmabuf = rb;
- xdr_buf_init(&req->rl_hdrbuf, rb->rg_base, rdmab_length(rb));
-
size = r_xprt->rx_data.inline_rsize;
rb = rpcrdma_alloc_regbuf(size, DMA_TO_DEVICE, GFP_KERNEL);
if (IS_ERR(rb))
diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c
index d5f95bb39300..5cc68a824f45 100644
--- a/net/sunrpc/xprtrdma/fmr_ops.c
+++ b/net/sunrpc/xprtrdma/fmr_ops.c
@@ -191,7 +191,7 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
mr = rpcrdma_mr_get(r_xprt);
if (!mr)
- return ERR_PTR(-ENOBUFS);
+ return ERR_PTR(-EAGAIN);
pageoff = offset_in_page(seg1->mr_offset);
seg1->mr_offset -= pageoff; /* start of page */
@@ -251,6 +251,16 @@ out_maperr:
return ERR_PTR(-EIO);
}
+/* Post Send WR containing the RPC Call message.
+ */
+static int
+fmr_op_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
+{
+ struct ib_send_wr *bad_wr;
+
+ return ib_post_send(ia->ri_id->qp, &req->rl_sendctx->sc_wr, &bad_wr);
+}
+
/* Invalidate all memory regions that were registered for "req".
*
* Sleeps until it is safe for the host CPU to access the
@@ -305,6 +315,7 @@ out_reset:
const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = {
.ro_map = fmr_op_map,
+ .ro_send = fmr_op_send,
.ro_unmap_sync = fmr_op_unmap_sync,
.ro_recover_mr = fmr_op_recover_mr,
.ro_open = fmr_op_open,
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index 90f688f19783..c5743a0960be 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -357,8 +357,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
struct rpcrdma_mr *mr;
struct ib_mr *ibmr;
struct ib_reg_wr *reg_wr;
- struct ib_send_wr *bad_wr;
- int rc, i, n;
+ int i, n;
u8 key;
mr = NULL;
@@ -367,7 +366,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
rpcrdma_mr_defer_recovery(mr);
mr = rpcrdma_mr_get(r_xprt);
if (!mr)
- return ERR_PTR(-ENOBUFS);
+ return ERR_PTR(-EAGAIN);
} while (mr->frwr.fr_state != FRWR_IS_INVALID);
frwr = &mr->frwr;
frwr->fr_state = FRWR_IS_VALID;
@@ -407,22 +406,12 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
ib_update_fast_reg_key(ibmr, ++key);
reg_wr = &frwr->fr_regwr;
- reg_wr->wr.next = NULL;
- reg_wr->wr.opcode = IB_WR_REG_MR;
- frwr->fr_cqe.done = frwr_wc_fastreg;
- reg_wr->wr.wr_cqe = &frwr->fr_cqe;
- reg_wr->wr.num_sge = 0;
- reg_wr->wr.send_flags = 0;
reg_wr->mr = ibmr;
reg_wr->key = ibmr->rkey;
reg_wr->access = writing ?
IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
IB_ACCESS_REMOTE_READ;
- rc = ib_post_send(ia->ri_id->qp, &reg_wr->wr, &bad_wr);
- if (rc)
- goto out_senderr;
-
mr->mr_handle = ibmr->rkey;
mr->mr_length = ibmr->length;
mr->mr_offset = ibmr->iova;
@@ -442,11 +431,40 @@ out_mapmr_err:
frwr->fr_mr, n, mr->mr_nents);
rpcrdma_mr_defer_recovery(mr);
return ERR_PTR(-EIO);
+}
-out_senderr:
- pr_err("rpcrdma: FRWR registration ib_post_send returned %i\n", rc);
- rpcrdma_mr_defer_recovery(mr);
- return ERR_PTR(-ENOTCONN);
+/* Post Send WR containing the RPC Call message.
+ *
+ * For FRMR, chain any FastReg WRs to the Send WR. Only a
+ * single ib_post_send call is needed to register memory
+ * and then post the Send WR.
+ */
+static int
+frwr_op_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
+{
+ struct ib_send_wr *post_wr, *bad_wr;
+ struct rpcrdma_mr *mr;
+
+ post_wr = &req->rl_sendctx->sc_wr;
+ list_for_each_entry(mr, &req->rl_registered, mr_list) {
+ struct rpcrdma_frwr *frwr;
+
+ frwr = &mr->frwr;
+
+ frwr->fr_cqe.done = frwr_wc_fastreg;
+ frwr->fr_regwr.wr.next = post_wr;
+ frwr->fr_regwr.wr.wr_cqe = &frwr->fr_cqe;
+ frwr->fr_regwr.wr.num_sge = 0;
+ frwr->fr_regwr.wr.opcode = IB_WR_REG_MR;
+ frwr->fr_regwr.wr.send_flags = 0;
+
+ post_wr = &frwr->fr_regwr.wr;
+ }
+
+ /* If ib_post_send fails, the next ->send_request for
+ * @req will queue these MWs for recovery.
+ */
+ return ib_post_send(ia->ri_id->qp, post_wr, &bad_wr);
}
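The loop above links the chain back to front, so for two registered MRs the list handed to ib_post_send() looks like this (schematic only):

/*
 * FastReg(mr B) -> FastReg(mr A) -> Send(sc_wr)
 *
 * where A was registered before B; a single ib_post_send()
 * verb posts the registrations and the Send together.
 */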
/* Handle a remotely invalidated mr on the @mrs list
@@ -561,6 +579,7 @@ reset_mrs:
const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = {
.ro_map = frwr_op_map,
+ .ro_send = frwr_op_send,
.ro_reminv = frwr_op_reminv,
.ro_unmap_sync = frwr_op_unmap_sync,
.ro_recover_mr = frwr_op_recover_mr,
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index f0855a959a27..e8adad33d0bb 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -365,7 +365,7 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
seg = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs,
false, &mr);
if (IS_ERR(seg))
- return PTR_ERR(seg);
+ goto out_maperr;
rpcrdma_mr_push(mr, &req->rl_registered);
if (encode_read_segment(xdr, mr, pos) < 0)
@@ -377,6 +377,11 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
} while (nsegs);
return 0;
+
+out_maperr:
+ if (PTR_ERR(seg) == -EAGAIN)
+ xprt_wait_for_buffer_space(rqst->rq_task, NULL);
+ return PTR_ERR(seg);
}
/* Register and XDR encode the Write list. Supports encoding a list
@@ -423,7 +428,7 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
seg = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs,
true, &mr);
if (IS_ERR(seg))
- return PTR_ERR(seg);
+ goto out_maperr;
rpcrdma_mr_push(mr, &req->rl_registered);
if (encode_rdma_segment(xdr, mr) < 0)
@@ -440,6 +445,11 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
*segcount = cpu_to_be32(nchunks);
return 0;
+
+out_maperr:
+ if (PTR_ERR(seg) == -EAGAIN)
+ xprt_wait_for_buffer_space(rqst->rq_task, NULL);
+ return PTR_ERR(seg);
}
/* Register and XDR encode the Reply chunk. Supports encoding an array
@@ -481,7 +491,7 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
seg = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs,
true, &mr);
if (IS_ERR(seg))
- return PTR_ERR(seg);
+ goto out_maperr;
rpcrdma_mr_push(mr, &req->rl_registered);
if (encode_rdma_segment(xdr, mr) < 0)
@@ -498,6 +508,11 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
*segcount = cpu_to_be32(nchunks);
return 0;
+
+out_maperr:
+ if (PTR_ERR(seg) == -EAGAIN)
+ xprt_wait_for_buffer_space(rqst->rq_task, NULL);
+ return PTR_ERR(seg);
}
/**
@@ -724,8 +739,8 @@ rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt,
* Returns:
* %0 if the RPC was sent successfully,
* %-ENOTCONN if the connection was lost,
- * %-EAGAIN if not enough pages are available for on-demand reply buffer,
- * %-ENOBUFS if no MRs are available to register chunks,
+ * %-EAGAIN if the caller should call again with the same arguments,
+ * %-ENOBUFS if the caller should call again after a delay,
* %-EMSGSIZE if the transport header is too small,
* %-EIO if a permanent problem occurred while marshaling.
*/
@@ -868,10 +883,7 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst)
return 0;
out_err:
- if (ret != -ENOBUFS) {
- pr_err("rpcrdma: header marshaling failed (%d)\n", ret);
- r_xprt->rx_stats.failed_marshal_count++;
- }
+ r_xprt->rx_stats.failed_marshal_count++;
return ret;
}
@@ -1366,7 +1378,7 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep)
trace_xprtrdma_reply(rqst->rq_task, rep, req, credits);
- queue_work_on(req->rl_cpu, rpcrdma_receive_wq, &rep->rr_work);
+ queue_work(rpcrdma_receive_wq, &rep->rr_work);
return;
out_badstatus:
diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c
index a4a8f6989ee7..dd8a431dc2ae 100644
--- a/net/sunrpc/xprtrdma/svc_rdma.c
+++ b/net/sunrpc/xprtrdma/svc_rdma.c
@@ -51,9 +51,9 @@
#define RPCDBG_FACILITY RPCDBG_SVCXPRT
/* RPC/RDMA parameters */
-unsigned int svcrdma_ord = RPCRDMA_ORD;
+unsigned int svcrdma_ord = 16; /* historical default */
static unsigned int min_ord = 1;
-static unsigned int max_ord = 4096;
+static unsigned int max_ord = 255;
unsigned int svcrdma_max_requests = RPCRDMA_MAX_REQUESTS;
unsigned int svcrdma_max_bc_requests = RPCRDMA_MAX_BC_REQUESTS;
static unsigned int min_max_requests = 4;
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index 19e9c6b33042..3d45015dca97 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -110,15 +110,16 @@
* the RDMA_RECV completion. The SGL should contain full pages up until the
* last one.
*/
-static void rdma_build_arg_xdr(struct svc_rqst *rqstp,
- struct svc_rdma_op_ctxt *ctxt,
- u32 byte_count)
+static void svc_rdma_build_arg_xdr(struct svc_rqst *rqstp,
+ struct svc_rdma_op_ctxt *ctxt)
{
struct page *page;
- u32 bc;
int sge_no;
+ u32 len;
- /* Swap the page in the SGE with the page in argpages */
+ /* The reply path assumes the Call's transport header resides
+ * in rqstp->rq_pages[0].
+ */
page = ctxt->pages[0];
put_page(rqstp->rq_pages[0]);
rqstp->rq_pages[0] = page;
@@ -126,35 +127,35 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp,
/* Set up the XDR head */
rqstp->rq_arg.head[0].iov_base = page_address(page);
rqstp->rq_arg.head[0].iov_len =
- min_t(size_t, byte_count, ctxt->sge[0].length);
- rqstp->rq_arg.len = byte_count;
- rqstp->rq_arg.buflen = byte_count;
+ min_t(size_t, ctxt->byte_len, ctxt->sge[0].length);
+ rqstp->rq_arg.len = ctxt->byte_len;
+ rqstp->rq_arg.buflen = ctxt->byte_len;
/* Compute bytes past head in the SGL */
- bc = byte_count - rqstp->rq_arg.head[0].iov_len;
+ len = ctxt->byte_len - rqstp->rq_arg.head[0].iov_len;
/* If data remains, store it in the pagelist */
- rqstp->rq_arg.page_len = bc;
+ rqstp->rq_arg.page_len = len;
rqstp->rq_arg.page_base = 0;
sge_no = 1;
- while (bc && sge_no < ctxt->count) {
+ while (len && sge_no < ctxt->count) {
page = ctxt->pages[sge_no];
put_page(rqstp->rq_pages[sge_no]);
rqstp->rq_pages[sge_no] = page;
- bc -= min_t(u32, bc, ctxt->sge[sge_no].length);
+ len -= min_t(u32, len, ctxt->sge[sge_no].length);
sge_no++;
}
rqstp->rq_respages = &rqstp->rq_pages[sge_no];
rqstp->rq_next_page = rqstp->rq_respages + 1;
/* If not all pages were used from the SGL, free the remaining ones */
- bc = sge_no;
+ len = sge_no;
while (sge_no < ctxt->count) {
page = ctxt->pages[sge_no++];
put_page(page);
}
- ctxt->count = bc;
+ ctxt->count = len;
/* Set up tail */
rqstp->rq_arg.tail[0].iov_base = NULL;
@@ -534,10 +535,8 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
ctxt, rdma_xprt, rqstp);
atomic_inc(&rdma_stat_recv);
- /* Build up the XDR from the receive buffers. */
- rdma_build_arg_xdr(rqstp, ctxt, ctxt->byte_len);
+ svc_rdma_build_arg_xdr(rqstp, ctxt);
- /* Decode the RDMA header. */
p = (__be32 *)rqstp->rq_arg.head[0].iov_base;
ret = svc_rdma_xdr_decode_req(&rqstp->rq_arg);
if (ret < 0)
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 9ad12a215b51..96cc8f6597d3 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -69,7 +69,7 @@ static void svc_rdma_release_rqst(struct svc_rqst *);
static void svc_rdma_detach(struct svc_xprt *xprt);
static void svc_rdma_free(struct svc_xprt *xprt);
static int svc_rdma_has_wspace(struct svc_xprt *xprt);
-static int svc_rdma_secure_port(struct svc_rqst *);
+static void svc_rdma_secure_port(struct svc_rqst *);
static void svc_rdma_kill_temp_xprt(struct svc_xprt *);
static const struct svc_xprt_ops svc_rdma_ops = {
@@ -330,9 +330,9 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
flushed:
if (wc->status != IB_WC_WR_FLUSH_ERR)
- pr_warn("svcrdma: receive: %s (%u/0x%x)\n",
- ib_wc_status_msg(wc->status),
- wc->status, wc->vendor_err);
+ pr_err("svcrdma: Recv: %s (%u/0x%x)\n",
+ ib_wc_status_msg(wc->status),
+ wc->status, wc->vendor_err);
set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
svc_rdma_put_context(ctxt, 1);
@@ -401,8 +401,10 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv,
*/
set_bit(XPT_CONG_CTRL, &cma_xprt->sc_xprt.xpt_flags);
- if (listener)
+ if (listener) {
+ strcpy(cma_xprt->sc_xprt.xpt_remotebuf, "listener");
set_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags);
+ }
return cma_xprt;
}
@@ -762,13 +764,6 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
if (!svc_rdma_prealloc_ctxts(newxprt))
goto errout;
- /*
- * Limit ORD based on client limit, local device limit, and
- * configured svcrdma limit.
- */
- newxprt->sc_ord = min_t(size_t, dev->attrs.max_qp_rd_atom, newxprt->sc_ord);
- newxprt->sc_ord = min_t(size_t, svcrdma_ord, newxprt->sc_ord);
-
newxprt->sc_pd = ib_alloc_pd(dev, 0);
if (IS_ERR(newxprt->sc_pd)) {
dprintk("svcrdma: error creating PD for connect request\n");
@@ -843,15 +838,18 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
set_bit(RDMAXPRT_CONN_PENDING, &newxprt->sc_flags);
memset(&conn_param, 0, sizeof conn_param);
conn_param.responder_resources = 0;
- conn_param.initiator_depth = newxprt->sc_ord;
+ conn_param.initiator_depth = min_t(int, newxprt->sc_ord,
+ dev->attrs.max_qp_init_rd_atom);
+ if (!conn_param.initiator_depth) {
+ dprintk("svcrdma: invalid ORD setting\n");
+ ret = -EINVAL;
+ goto errout;
+ }
conn_param.private_data = &pmsg;
conn_param.private_data_len = sizeof(pmsg);
ret = rdma_accept(newxprt->sc_cm_id, &conn_param);
- if (ret) {
- dprintk("svcrdma: failed to accept new connection, ret=%d\n",
- ret);
+ if (ret)
goto errout;
- }
dprintk("svcrdma: new connection %p accepted:\n", newxprt);
sap = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.src_addr;
@@ -862,7 +860,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
dprintk(" sq_depth : %d\n", newxprt->sc_sq_depth);
dprintk(" rdma_rw_ctxs : %d\n", ctxts);
dprintk(" max_requests : %d\n", newxprt->sc_max_requests);
- dprintk(" ord : %d\n", newxprt->sc_ord);
+ dprintk(" ord : %d\n", conn_param.initiator_depth);
return &newxprt->sc_xprt;
@@ -992,9 +990,9 @@ static int svc_rdma_has_wspace(struct svc_xprt *xprt)
return 1;
}
-static int svc_rdma_secure_port(struct svc_rqst *rqstp)
+static void svc_rdma_secure_port(struct svc_rqst *rqstp)
{
- return 1;
+ set_bit(RQ_SECURE, &rqstp->rq_flags);
}
static void svc_rdma_kill_temp_xprt(struct svc_xprt *xprt)
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 4b1ecfe979cf..cc1aad325496 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -52,7 +52,6 @@
#include <linux/slab.h>
#include <linux/seq_file.h>
#include <linux/sunrpc/addr.h>
-#include <linux/smp.h>
#include "xprt_rdma.h"
@@ -237,8 +236,6 @@ rpcrdma_connect_worker(struct work_struct *work)
struct rpc_xprt *xprt = &r_xprt->rx_xprt;
spin_lock_bh(&xprt->transport_lock);
- if (++xprt->connect_cookie == 0) /* maintain a reserved value */
- ++xprt->connect_cookie;
if (ep->rep_connected > 0) {
if (!xprt_test_and_set_connected(xprt))
xprt_wake_pending_tasks(xprt, 0);
@@ -540,29 +537,6 @@ xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task)
}
}
-/* Allocate a fixed-size buffer in which to construct and send the
- * RPC-over-RDMA header for this request.
- */
-static bool
-rpcrdma_get_rdmabuf(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
- gfp_t flags)
-{
- size_t size = RPCRDMA_HDRBUF_SIZE;
- struct rpcrdma_regbuf *rb;
-
- if (req->rl_rdmabuf)
- return true;
-
- rb = rpcrdma_alloc_regbuf(size, DMA_TO_DEVICE, flags);
- if (IS_ERR(rb))
- return false;
-
- r_xprt->rx_stats.hardway_register_count += size;
- req->rl_rdmabuf = rb;
- xdr_buf_init(&req->rl_hdrbuf, rb->rg_base, rdmab_length(rb));
- return true;
-}
-
static bool
rpcrdma_get_sendbuf(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
size_t size, gfp_t flags)
@@ -644,15 +618,11 @@ xprt_rdma_allocate(struct rpc_task *task)
if (RPC_IS_SWAPPER(task))
flags = __GFP_MEMALLOC | GFP_NOWAIT | __GFP_NOWARN;
- if (!rpcrdma_get_rdmabuf(r_xprt, req, flags))
- goto out_fail;
if (!rpcrdma_get_sendbuf(r_xprt, req, rqst->rq_callsize, flags))
goto out_fail;
if (!rpcrdma_get_recvbuf(r_xprt, req, rqst->rq_rcvsize, flags))
goto out_fail;
- req->rl_cpu = smp_processor_id();
- req->rl_connect_cookie = 0; /* our reserved value */
rpcrdma_set_xprtdata(rqst, req);
rqst->rq_buffer = req->rl_sendbuf->rg_base;
rqst->rq_rbuffer = req->rl_recvbuf->rg_base;
@@ -694,7 +664,8 @@ xprt_rdma_free(struct rpc_task *task)
* Returns:
* %0 if the RPC message has been sent
* %-ENOTCONN if the caller should reconnect and call again
- * %-ENOBUFS if the caller should call again later
+ * %-EAGAIN if the caller should call again
+ * %-ENOBUFS if the caller should call again after a delay
* %-EIO if a permanent error occurred and the request was not
* sent. Do not try to send this message again.
*/
@@ -723,9 +694,9 @@ xprt_rdma_send_request(struct rpc_task *task)
rpcrdma_recv_buffer_get(req);
/* Must suppress retransmit to maintain credits */
- if (req->rl_connect_cookie == xprt->connect_cookie)
+ if (rqst->rq_connect_cookie == xprt->connect_cookie)
goto drop_connection;
- req->rl_connect_cookie = xprt->connect_cookie;
+ rqst->rq_xtime = ktime_get();
__set_bit(RPCRDMA_REQ_F_PENDING, &req->rl_flags);
if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req))
@@ -733,6 +704,12 @@ xprt_rdma_send_request(struct rpc_task *task)
rqst->rq_xmit_bytes_sent += rqst->rq_snd_buf.len;
rqst->rq_bytes_sent = 0;
+
+ /* An RPC with no reply will throw off credit accounting,
+ * so drop the connection to reset the credit grant.
+ */
+ if (!rpc_reply_expected(task))
+ goto drop_connection;
return 0;
failed_marshal:
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index e6f84a6434a0..fe5eaca2d197 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -250,11 +250,11 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
wait_for_completion(&ia->ri_remove_done);
ia->ri_id = NULL;
- ia->ri_pd = NULL;
ia->ri_device = NULL;
/* Return 1 to ensure the core destroys the id. */
return 1;
case RDMA_CM_EVENT_ESTABLISHED:
+ ++xprt->rx_xprt.connect_cookie;
connstate = 1;
rpcrdma_update_connect_private(xprt, &event->param.conn);
goto connected;
@@ -273,6 +273,7 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
connstate = -EAGAIN;
goto connected;
case RDMA_CM_EVENT_DISCONNECTED:
+ ++xprt->rx_xprt.connect_cookie;
connstate = -ECONNABORTED;
connected:
xprt->rx_buf.rb_credits = 1;
@@ -445,7 +446,9 @@ rpcrdma_ia_remove(struct rpcrdma_ia *ia)
ia->ri_id->qp = NULL;
}
ib_free_cq(ep->rep_attr.recv_cq);
+ ep->rep_attr.recv_cq = NULL;
ib_free_cq(ep->rep_attr.send_cq);
+ ep->rep_attr.send_cq = NULL;
/* The ULP is responsible for ensuring all DMA
* mappings and MRs are gone.
@@ -458,6 +461,8 @@ rpcrdma_ia_remove(struct rpcrdma_ia *ia)
rpcrdma_dma_unmap_regbuf(req->rl_recvbuf);
}
rpcrdma_mrs_destroy(buf);
+ ib_dealloc_pd(ia->ri_pd);
+ ia->ri_pd = NULL;
/* Allow waiters to continue */
complete(&ia->ri_remove_done);
@@ -589,11 +594,8 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
/* Client offers RDMA Read but does not initiate */
ep->rep_remote_cma.initiator_depth = 0;
- if (ia->ri_device->attrs.max_qp_rd_atom > 32) /* arbitrary but <= 255 */
- ep->rep_remote_cma.responder_resources = 32;
- else
- ep->rep_remote_cma.responder_resources =
- ia->ri_device->attrs.max_qp_rd_atom;
+ ep->rep_remote_cma.responder_resources =
+ min_t(int, U8_MAX, ia->ri_device->attrs.max_qp_rd_atom);
/* Limit transport retries so client can detect server
* GID changes quickly. RPC layer handles re-establishing
@@ -628,14 +630,16 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
cancel_delayed_work_sync(&ep->rep_connect_worker);
- if (ia->ri_id->qp) {
+ if (ia->ri_id && ia->ri_id->qp) {
rpcrdma_ep_disconnect(ep, ia);
rdma_destroy_qp(ia->ri_id);
ia->ri_id->qp = NULL;
}
- ib_free_cq(ep->rep_attr.recv_cq);
- ib_free_cq(ep->rep_attr.send_cq);
+ if (ep->rep_attr.recv_cq)
+ ib_free_cq(ep->rep_attr.recv_cq);
+ if (ep->rep_attr.send_cq)
+ ib_free_cq(ep->rep_attr.send_cq);
}
/* Re-establish a connection after a device removal event.
@@ -1024,7 +1028,7 @@ rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt)
LIST_HEAD(free);
LIST_HEAD(all);
- for (count = 0; count < 32; count++) {
+ for (count = 0; count < 3; count++) {
struct rpcrdma_mr *mr;
int rc;
@@ -1049,8 +1053,9 @@ rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt)
list_splice(&all, &buf->rb_all);
r_xprt->rx_stats.mrs_allocated += count;
spin_unlock(&buf->rb_mrlock);
-
trace_xprtrdma_createmrs(r_xprt, count);
+
+ xprt_write_space(&r_xprt->rx_xprt);
}
static void
@@ -1068,17 +1073,27 @@ struct rpcrdma_req *
rpcrdma_create_req(struct rpcrdma_xprt *r_xprt)
{
struct rpcrdma_buffer *buffer = &r_xprt->rx_buf;
+ struct rpcrdma_regbuf *rb;
struct rpcrdma_req *req;
req = kzalloc(sizeof(*req), GFP_KERNEL);
if (req == NULL)
return ERR_PTR(-ENOMEM);
+ rb = rpcrdma_alloc_regbuf(RPCRDMA_HDRBUF_SIZE,
+ DMA_TO_DEVICE, GFP_KERNEL);
+ if (IS_ERR(rb)) {
+ kfree(req);
+ return ERR_PTR(-ENOMEM);
+ }
+ req->rl_rdmabuf = rb;
+ xdr_buf_init(&req->rl_hdrbuf, rb->rg_base, rdmab_length(rb));
+ req->rl_buffer = buffer;
+ INIT_LIST_HEAD(&req->rl_registered);
+
spin_lock(&buffer->rb_reqslock);
list_add(&req->rl_all, &buffer->rb_allreqs);
spin_unlock(&buffer->rb_reqslock);
- req->rl_buffer = &r_xprt->rx_buf;
- INIT_LIST_HEAD(&req->rl_registered);
return req;
}
@@ -1535,7 +1550,6 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia,
struct rpcrdma_req *req)
{
struct ib_send_wr *send_wr = &req->rl_sendctx->sc_wr;
- struct ib_send_wr *send_wr_fail;
int rc;
if (req->rl_reply) {
@@ -1554,7 +1568,7 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia,
--ep->rep_send_count;
}
- rc = ib_post_send(ia->ri_id->qp, send_wr, &send_wr_fail);
+ rc = ia->ri_ops->ro_send(ia, req);
trace_xprtrdma_post_send(req, rc);
if (rc)
return -ENOTCONN;
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 69883a960a3f..3d3b423fa9c1 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -334,8 +334,6 @@ enum {
struct rpcrdma_buffer;
struct rpcrdma_req {
struct list_head rl_list;
- int rl_cpu;
- unsigned int rl_connect_cookie;
struct rpcrdma_buffer *rl_buffer;
struct rpcrdma_rep *rl_reply;
struct xdr_stream rl_stream;
@@ -474,6 +472,8 @@ struct rpcrdma_memreg_ops {
(*ro_map)(struct rpcrdma_xprt *,
struct rpcrdma_mr_seg *, int, bool,
struct rpcrdma_mr **);
+ int (*ro_send)(struct rpcrdma_ia *ia,
+ struct rpcrdma_req *req);
void (*ro_reminv)(struct rpcrdma_rep *rep,
struct list_head *mrs);
void (*ro_unmap_sync)(struct rpcrdma_xprt *,
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 956e29c1438d..c8902f11efdd 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -527,6 +527,7 @@ static int xs_local_send_request(struct rpc_task *task)
xs_pktdump("packet data:",
req->rq_svec->iov_base, req->rq_svec->iov_len);
+ req->rq_xtime = ktime_get();
status = xs_sendpages(transport->sock, NULL, 0, xdr, req->rq_bytes_sent,
true, &sent);
dprintk("RPC: %s(%u) = %d\n",
@@ -589,6 +590,7 @@ static int xs_udp_send_request(struct rpc_task *task)
if (!xprt_bound(xprt))
return -ENOTCONN;
+ req->rq_xtime = ktime_get();
status = xs_sendpages(transport->sock, xs_addr(xprt), xprt->addrlen,
xdr, req->rq_bytes_sent, true, &sent);
@@ -678,6 +680,7 @@ static int xs_tcp_send_request(struct rpc_task *task)
/* Continue transmitting the packet/record. We must be careful
* to cope with writespace callbacks arriving _after_ we have
* called sendmsg(). */
+ req->rq_xtime = ktime_get();
while (1) {
sent = 0;
status = xs_sendpages(transport->sock, NULL, 0, xdr,
@@ -1060,6 +1063,7 @@ static void xs_udp_data_read_skb(struct rpc_xprt *xprt,
if (!rovr)
goto out_unlock;
xprt_pin_rqst(rovr);
+ xprt_update_rtt(rovr->rq_task);
spin_unlock(&xprt->recv_lock);
task = rovr->rq_task;
diff --git a/net/tipc/monitor.c b/net/tipc/monitor.c
index 32dc33a94bc7..5453e564da82 100644
--- a/net/tipc/monitor.c
+++ b/net/tipc/monitor.c
@@ -777,7 +777,7 @@ int __tipc_nl_add_monitor(struct net *net, struct tipc_nl_msg *msg,
ret = tipc_bearer_get_name(net, bearer_name, bearer_id);
if (ret || !mon)
- return -EINVAL;
+ return 0;
hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family,
NLM_F_MULTI, TIPC_NL_MON_GET);
diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c
index b1fe20972aa9..dd1c4fa2eb78 100644
--- a/net/tipc/name_table.c
+++ b/net/tipc/name_table.c
@@ -241,7 +241,8 @@ err:
static struct publication *tipc_service_remove_publ(struct net *net,
struct tipc_service *sc,
u32 lower, u32 upper,
- u32 node, u32 key)
+ u32 node, u32 key,
+ struct service_range **rng)
{
struct tipc_subscription *sub, *tmp;
struct service_range *sr;
@@ -275,19 +276,15 @@ static struct publication *tipc_service_remove_publ(struct net *net,
list_del(&p->all_publ);
list_del(&p->local_publ);
-
- /* Remove service range item if this was its last publication */
- if (list_empty(&sr->all_publ)) {
+ if (list_empty(&sr->all_publ))
last = true;
- rb_erase(&sr->tree_node, &sc->ranges);
- kfree(sr);
- }
/* Notify any waiting subscriptions */
list_for_each_entry_safe(sub, tmp, &sc->subscriptions, service_list) {
tipc_sub_report_overlap(sub, p->lower, p->upper, TIPC_WITHDRAWN,
p->port, p->node, p->scope, last);
}
+ *rng = sr;
return p;
}
@@ -379,13 +376,20 @@ struct publication *tipc_nametbl_remove_publ(struct net *net, u32 type,
u32 node, u32 key)
{
struct tipc_service *sc = tipc_service_find(net, type);
+ struct service_range *sr = NULL;
struct publication *p = NULL;
if (!sc)
return NULL;
spin_lock_bh(&sc->lock);
- p = tipc_service_remove_publ(net, sc, lower, upper, node, key);
+ p = tipc_service_remove_publ(net, sc, lower, upper, node, key, &sr);
+
+ /* Remove service range item if this was its last publication */
+ if (sr && list_empty(&sr->all_publ)) {
+ rb_erase(&sr->tree_node, &sc->ranges);
+ kfree(sr);
+ }
/* Delete service item if it has no more publications or subscriptions */
if (RB_EMPTY_ROOT(&sc->ranges) && list_empty(&sc->subscriptions)) {
@@ -665,13 +669,14 @@ int tipc_nametbl_withdraw(struct net *net, u32 type, u32 lower,
/**
* tipc_nametbl_subscribe - add a subscription object to the name table
*/
-void tipc_nametbl_subscribe(struct tipc_subscription *sub)
+bool tipc_nametbl_subscribe(struct tipc_subscription *sub)
{
struct name_table *nt = tipc_name_table(sub->net);
struct tipc_net *tn = tipc_net(sub->net);
struct tipc_subscr *s = &sub->evt.s;
u32 type = tipc_sub_read(s, seq.type);
struct tipc_service *sc;
+ bool res = true;
spin_lock_bh(&tn->nametbl_lock);
sc = tipc_service_find(sub->net, type);
@@ -685,8 +690,10 @@ void tipc_nametbl_subscribe(struct tipc_subscription *sub)
pr_warn("Failed to subscribe for {%u,%u,%u}\n", type,
tipc_sub_read(s, seq.lower),
tipc_sub_read(s, seq.upper));
+ res = false;
}
spin_unlock_bh(&tn->nametbl_lock);
+ return res;
}
/**
@@ -744,16 +751,17 @@ int tipc_nametbl_init(struct net *net)
static void tipc_service_delete(struct net *net, struct tipc_service *sc)
{
struct service_range *sr, *tmpr;
- struct publication *p, *tmpb;
+ struct publication *p, *tmp;
spin_lock_bh(&sc->lock);
rbtree_postorder_for_each_entry_safe(sr, tmpr, &sc->ranges, tree_node) {
- list_for_each_entry_safe(p, tmpb,
- &sr->all_publ, all_publ) {
+ list_for_each_entry_safe(p, tmp, &sr->all_publ, all_publ) {
tipc_service_remove_publ(net, sc, p->lower, p->upper,
- p->node, p->key);
+ p->node, p->key, &sr);
kfree_rcu(p, rcu);
}
+ rb_erase(&sr->tree_node, &sc->ranges);
+ kfree(sr);
}
hlist_del_init_rcu(&sc->service_list);
spin_unlock_bh(&sc->lock);
diff --git a/net/tipc/name_table.h b/net/tipc/name_table.h
index 4b14fc28d9e2..0febba41da86 100644
--- a/net/tipc/name_table.h
+++ b/net/tipc/name_table.h
@@ -126,7 +126,7 @@ struct publication *tipc_nametbl_insert_publ(struct net *net, u32 type,
struct publication *tipc_nametbl_remove_publ(struct net *net, u32 type,
u32 lower, u32 upper,
u32 node, u32 key);
-void tipc_nametbl_subscribe(struct tipc_subscription *s);
+bool tipc_nametbl_subscribe(struct tipc_subscription *s);
void tipc_nametbl_unsubscribe(struct tipc_subscription *s);
int tipc_nametbl_init(struct net *net);
void tipc_nametbl_stop(struct net *net);
diff --git a/net/tipc/net.c b/net/tipc/net.c
index 856f9e97ea29..4fbaa0464405 100644
--- a/net/tipc/net.c
+++ b/net/tipc/net.c
@@ -252,6 +252,8 @@ int __tipc_nl_net_set(struct sk_buff *skb, struct genl_info *info)
u64 *w0 = (u64 *)&node_id[0];
u64 *w1 = (u64 *)&node_id[8];
+ if (!attrs[TIPC_NLA_NET_NODEID_W1])
+ return -EINVAL;
*w0 = nla_get_u64(attrs[TIPC_NLA_NET_NODEID]);
*w1 = nla_get_u64(attrs[TIPC_NLA_NET_NODEID_W1]);
tipc_net_init(net, node_id, 0);
diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c
index b76f13f6fea1..6ff2254088f6 100644
--- a/net/tipc/netlink.c
+++ b/net/tipc/netlink.c
@@ -79,7 +79,10 @@ const struct nla_policy tipc_nl_sock_policy[TIPC_NLA_SOCK_MAX + 1] = {
const struct nla_policy tipc_nl_net_policy[TIPC_NLA_NET_MAX + 1] = {
[TIPC_NLA_NET_UNSPEC] = { .type = NLA_UNSPEC },
- [TIPC_NLA_NET_ID] = { .type = NLA_U32 }
+ [TIPC_NLA_NET_ID] = { .type = NLA_U32 },
+ [TIPC_NLA_NET_ADDR] = { .type = NLA_U32 },
+ [TIPC_NLA_NET_NODEID] = { .type = NLA_U64 },
+ [TIPC_NLA_NET_NODEID_W1] = { .type = NLA_U64 },
};
const struct nla_policy tipc_nl_link_policy[TIPC_NLA_LINK_MAX + 1] = {
diff --git a/net/tipc/node.c b/net/tipc/node.c
index c77dd2f3c589..baaf93f12cbd 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -2232,8 +2232,8 @@ int tipc_nl_node_dump_monitor(struct sk_buff *skb, struct netlink_callback *cb)
struct net *net = sock_net(skb->sk);
u32 prev_bearer = cb->args[0];
struct tipc_nl_msg msg;
+ int bearer_id;
int err;
- int i;
if (prev_bearer == MAX_BEARERS)
return 0;
@@ -2243,16 +2243,13 @@ int tipc_nl_node_dump_monitor(struct sk_buff *skb, struct netlink_callback *cb)
msg.seq = cb->nlh->nlmsg_seq;
rtnl_lock();
- for (i = prev_bearer; i < MAX_BEARERS; i++) {
- prev_bearer = i;
- err = __tipc_nl_add_monitor(net, &msg, prev_bearer);
+ for (bearer_id = prev_bearer; bearer_id < MAX_BEARERS; bearer_id++) {
+ err = __tipc_nl_add_monitor(net, &msg, bearer_id);
if (err)
- goto out;
+ break;
}
-
-out:
rtnl_unlock();
- cb->args[0] = prev_bearer;
+ cb->args[0] = bearer_id;
return skb->len;
}
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 1fd1c8b5ce03..252a52ae0893 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -1278,7 +1278,7 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen)
struct tipc_msg *hdr = &tsk->phdr;
struct tipc_name_seq *seq;
struct sk_buff_head pkts;
- u32 dnode, dport;
+ u32 dport, dnode = 0;
u32 type, inst;
int mtu, rc;
@@ -1348,6 +1348,8 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen)
msg_set_destnode(hdr, dnode);
msg_set_destport(hdr, dest->addr.id.ref);
msg_set_hdr_sz(hdr, BASIC_H_SIZE);
+ } else {
+ return -EINVAL;
}
/* Block or return if destination link is congested */
diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c
index b7d80bc5f4ab..f340e53da625 100644
--- a/net/tipc/subscr.c
+++ b/net/tipc/subscr.c
@@ -153,7 +153,10 @@ struct tipc_subscription *tipc_sub_subscribe(struct net *net,
memcpy(&sub->evt.s, s, sizeof(*s));
spin_lock_init(&sub->lock);
kref_init(&sub->kref);
- tipc_nametbl_subscribe(sub);
+ if (!tipc_nametbl_subscribe(sub)) {
+ kfree(sub);
+ return NULL;
+ }
timer_setup(&sub->timer, tipc_sub_timeout, 0);
timeout = tipc_sub_read(&sub->evt.s, timeout);
if (timeout != TIPC_WAIT_FOREVER)
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index 0d379970960e..20cd93be6236 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -114,6 +114,7 @@ int tls_push_sg(struct sock *sk,
size = sg->length - offset;
offset += sg->offset;
+ ctx->in_tcp_sendpages = true;
while (1) {
if (sg_is_last(sg))
sendpage_flags = flags;
@@ -134,6 +135,7 @@ retry:
offset -= sg->offset;
ctx->partially_sent_offset = offset;
ctx->partially_sent_record = (void *)sg;
+ ctx->in_tcp_sendpages = false;
return ret;
}
@@ -148,6 +150,8 @@ retry:
}
clear_bit(TLS_PENDING_CLOSED_RECORD, &ctx->flags);
+ ctx->in_tcp_sendpages = false;
+ ctx->sk_write_space(sk);
return 0;
}
@@ -217,6 +221,10 @@ static void tls_write_space(struct sock *sk)
{
struct tls_context *ctx = tls_get_ctx(sk);
+ /* We are already sending pages, ignore notification */
+ if (ctx->in_tcp_sendpages)
+ return;
+
if (!sk->sk_write_pending && tls_is_pending_closed_record(ctx)) {
gfp_t sk_allocation = sk->sk_allocation;
int rc;
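The flag is a re-entrancy guard: do_tcp_sendpages() can invoke sk->sk_write_space() synchronously from inside tls_push_sg(), and that nested callback must not recurse into the TLS send path. Condensed from the hunks above:

/* tls_push_sg(), condensed: */
ctx->in_tcp_sendpages = true;
/* ... do_tcp_sendpages() loop; may trigger sk_write_space ... */
ctx->in_tcp_sendpages = false;		/* cleared on both exit paths */
ctx->sk_write_space(sk);		/* success path: replay the deferred wakeup */

/* tls_write_space(), condensed: */
if (ctx->in_tcp_sendpages)
	return;				/* ignore our own nested notification */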
@@ -241,16 +249,13 @@ static void tls_sk_proto_close(struct sock *sk, long timeout)
struct tls_context *ctx = tls_get_ctx(sk);
long timeo = sock_sndtimeo(sk, 0);
void (*sk_proto_close)(struct sock *sk, long timeout);
+ bool free_ctx = false;
lock_sock(sk);
sk_proto_close = ctx->sk_proto_close;
- if (ctx->conf == TLS_HW_RECORD)
- goto skip_tx_cleanup;
-
- if (ctx->conf == TLS_BASE) {
- kfree(ctx);
- ctx = NULL;
+ if (ctx->conf == TLS_BASE || ctx->conf == TLS_HW_RECORD) {
+ free_ctx = true;
goto skip_tx_cleanup;
}
@@ -287,7 +292,7 @@ skip_tx_cleanup:
/* free ctx for TLS_HW_RECORD, used by tcp_set_state
* for sk->sk_prot->unhash [tls_hw_unhash]
*/
- if (ctx && ctx->conf == TLS_HW_RECORD)
+ if (free_ctx)
kfree(ctx);
}
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index 4dc766b03f00..71e79597f940 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -41,6 +41,8 @@
#include <net/strparser.h>
#include <net/tls.h>
+#define MAX_IV_SIZE TLS_CIPHER_AES_GCM_128_IV_SIZE
+
static int tls_do_decryption(struct sock *sk,
struct scatterlist *sgin,
struct scatterlist *sgout,
@@ -673,7 +675,7 @@ static int decrypt_skb(struct sock *sk, struct sk_buff *skb,
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
- char iv[TLS_CIPHER_AES_GCM_128_SALT_SIZE + tls_ctx->rx.iv_size];
+ char iv[TLS_CIPHER_AES_GCM_128_SALT_SIZE + MAX_IV_SIZE];
struct scatterlist sgin_arr[MAX_SKB_FRAGS + 2];
struct scatterlist *sgin = &sgin_arr[0];
struct strp_msg *rxm = strp_msg(skb);
@@ -1094,6 +1096,12 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
goto free_priv;
}
+ /* Sanity-check the IV size for stack allocations. */
+ if (iv_size > MAX_IV_SIZE) {
+ rc = -EINVAL;
+ goto free_priv;
+ }
+
cctx->prepend_size = TLS_HEADER_SIZE + nonce_size;
cctx->tag_size = tag_size;
cctx->overhead_size = cctx->prepend_size + cctx->tag_size;
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index aac9b8f6552e..c1076c19b858 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -2018,7 +2018,13 @@ const struct vsock_transport *vsock_core_get_transport(void)
}
EXPORT_SYMBOL_GPL(vsock_core_get_transport);
+static void __exit vsock_exit(void)
+{
+ /* Do nothing. This function makes this module removable. */
+}
+
module_init(vsock_init_tables);
+module_exit(vsock_exit);
MODULE_AUTHOR("VMware, Inc.");
MODULE_DESCRIPTION("VMware Virtual Socket Family");