Diffstat (limited to 'net')
-rw-r--r--  net/8021q/vlan.c | 5
-rw-r--r--  net/8021q/vlan.h | 2
-rw-r--r--  net/8021q/vlan_dev.c | 20
-rw-r--r--  net/9p/client.c | 8
-rw-r--r--  net/ax25/af_ax25.c | 3
-rw-r--r--  net/ax25/ax25_ds_timer.c | 5
-rw-r--r--  net/ax25/ax25_std_timer.c | 5
-rw-r--r--  net/ax25/ax25_subr.c | 3
-rw-r--r--  net/batman-adv/routing.c | 1
-rw-r--r--  net/batman-adv/soft-interface.c | 9
-rw-r--r--  net/batman-adv/translation-table.c | 50
-rw-r--r--  net/batman-adv/types.h | 2
-rw-r--r--  net/bridge/br_fdb.c | 2
-rw-r--r--  net/bridge/br_input.c | 15
-rw-r--r--  net/bridge/br_multicast.c | 4
-rw-r--r--  net/bridge/br_netlink.c | 2
-rw-r--r--  net/bridge/br_private.h | 23
-rw-r--r--  net/ceph/ceph_common.c | 2
-rw-r--r--  net/ceph/ceph_strings.c | 16
-rw-r--r--  net/ceph/debugfs.c | 147
-rw-r--r--  net/ceph/mon_client.c | 393
-rw-r--r--  net/ceph/osd_client.c | 4031
-rw-r--r--  net/ceph/osdmap.c | 651
-rw-r--r--  net/compat.c | 20
-rw-r--r--  net/core/filter.c | 16
-rw-r--r--  net/core/gen_stats.c | 2
-rw-r--r--  net/core/neighbour.c | 6
-rw-r--r--  net/core/net-sysfs.c | 1
-rw-r--r--  net/core/pktgen.c | 8
-rw-r--r--  net/ipv4/esp4.c | 52
-rw-r--r--  net/ipv4/gre_demux.c | 10
-rw-r--r--  net/ipv4/ip_gre.c | 26
-rw-r--r--  net/ipv4/ipconfig.c | 4
-rw-r--r--  net/ipv4/ipmr.c | 4
-rw-r--r--  net/ipv4/tcp_output.c | 7
-rw-r--r--  net/ipv4/udp.c | 90
-rw-r--r--  net/ipv6/Kconfig | 2
-rw-r--r--  net/ipv6/Makefile | 2
-rw-r--r--  net/ipv6/icmp.c | 2
-rw-r--r--  net/ipv6/ip6_checksum.c | 7
-rw-r--r--  net/ipv6/ip6_gre.c | 5
-rw-r--r--  net/ipv6/ip6_output.c | 11
-rw-r--r--  net/ipv6/ip6mr.c | 1
-rw-r--r--  net/ipv6/netfilter/nf_dup_ipv6.c | 1
-rw-r--r--  net/ipv6/route.c | 2
-rw-r--r--  net/ipv6/sit.c | 4
-rw-r--r--  net/ipv6/tcp_ipv6.c | 8
-rw-r--r--  net/ipv6/udp.c | 83
-rw-r--r--  net/kcm/kcmproc.c | 1
-rw-r--r--  net/l2tp/l2tp_core.c | 2
-rw-r--r--  net/netfilter/ipvs/ip_vs_conn.c | 5
-rw-r--r--  net/netfilter/ipvs/ip_vs_core.c | 5
-rw-r--r--  net/netfilter/nf_conntrack_core.c | 2
-rw-r--r--  net/netfilter/nf_conntrack_ftp.c | 1
-rw-r--r--  net/netfilter/nf_conntrack_helper.c | 9
-rw-r--r--  net/netfilter/nf_conntrack_irc.c | 1
-rw-r--r--  net/netfilter/nf_conntrack_sane.c | 1
-rw-r--r--  net/netfilter/nf_conntrack_sip.c | 1
-rw-r--r--  net/netfilter/nf_conntrack_standalone.c | 2
-rw-r--r--  net/netfilter/nf_conntrack_tftp.c | 1
-rw-r--r--  net/netfilter/nf_queue.c | 17
-rw-r--r--  net/netfilter/nf_tables_api.c | 26
-rw-r--r--  net/netfilter/nf_tables_core.c | 2
-rw-r--r--  net/netfilter/nfnetlink_queue.c | 20
-rw-r--r--  net/netfilter/nft_hash.c | 3
-rw-r--r--  net/netfilter/nft_rbtree.c | 3
-rw-r--r--  net/netfilter/x_tables.c | 4
-rw-r--r--  net/openvswitch/actions.c | 20
-rw-r--r--  net/openvswitch/conntrack.c | 14
-rw-r--r--  net/packet/af_packet.c | 25
-rw-r--r--  net/rds/ib_cm.c | 2
-rw-r--r--  net/rds/loop.c | 5
-rw-r--r--  net/rds/rds.h | 2
-rw-r--r--  net/rds/recv.c | 2
-rw-r--r--  net/rds/send.c | 1
-rw-r--r--  net/rds/sysctl.c | 3
-rw-r--r--  net/rds/tcp.c | 78
-rw-r--r--  net/rds/tcp.h | 3
-rw-r--r--  net/rds/tcp_connect.c | 26
-rw-r--r--  net/rds/tcp_listen.c | 20
-rw-r--r--  net/rds/tcp_recv.c | 2
-rw-r--r--  net/rds/tcp_send.c | 14
-rw-r--r--  net/rds/threads.c | 10
-rw-r--r--  net/rds/transport.c | 3
-rw-r--r--  net/rxrpc/rxkad.c | 4
-rw-r--r--  net/sched/act_api.c | 2
-rw-r--r--  net/sched/act_ife.c | 53
-rw-r--r--  net/sched/act_ipt.c | 7
-rw-r--r--  net/sched/act_police.c | 33
-rw-r--r--  net/sched/cls_flower.c | 6
-rw-r--r--  net/sched/cls_u32.c | 72
-rw-r--r--  net/sched/sch_drr.c | 4
-rw-r--r--  net/sched/sch_fifo.c | 4
-rw-r--r--  net/sched/sch_fq_codel.c | 26
-rw-r--r--  net/sched/sch_generic.c | 2
-rw-r--r--  net/sched/sch_hfsc.c | 12
-rw-r--r--  net/sched/sch_htb.c | 2
-rw-r--r--  net/sched/sch_ingress.c | 12
-rw-r--r--  net/sched/sch_netem.c | 12
-rw-r--r--  net/sched/sch_prio.c | 71
-rw-r--r--  net/sched/sch_qfq.c | 6
-rw-r--r--  net/sched/sch_red.c | 4
-rw-r--r--  net/sched/sch_tbf.c | 4
-rw-r--r--  net/sctp/sctp_diag.c | 6
-rw-r--r--  net/sctp/socket.c | 1
-rw-r--r--  net/sunrpc/auth.c | 9
-rw-r--r--  net/sunrpc/auth_generic.c | 13
-rw-r--r--  net/sunrpc/auth_gss/auth_gss.c | 6
-rw-r--r--  net/sunrpc/auth_gss/svcauth_gss.c | 9
-rw-r--r--  net/sunrpc/auth_unix.c | 6
-rw-r--r--  net/sunrpc/clnt.c | 17
-rw-r--r--  net/sunrpc/svc_xprt.c | 23
-rw-r--r--  net/sunrpc/xdr.c | 2
-rw-r--r--  net/sunrpc/xprtrdma/backchannel.c | 16
-rw-r--r--  net/sunrpc/xprtrdma/fmr_ops.c | 134
-rw-r--r--  net/sunrpc/xprtrdma/frwr_ops.c | 214
-rw-r--r--  net/sunrpc/xprtrdma/physical_ops.c | 39
-rw-r--r--  net/sunrpc/xprtrdma/rpc_rdma.c | 517
-rw-r--r--  net/sunrpc/xprtrdma/svc_rdma_marshal.c | 32
-rw-r--r--  net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 34
-rw-r--r--  net/sunrpc/xprtrdma/svc_rdma_sendto.c | 28
-rw-r--r--  net/sunrpc/xprtrdma/svc_rdma_transport.c | 17
-rw-r--r--  net/sunrpc/xprtrdma/transport.c | 16
-rw-r--r--  net/sunrpc/xprtrdma/verbs.c | 78
-rw-r--r--  net/sunrpc/xprtrdma/xprt_rdma.h | 47
-rw-r--r--  net/sunrpc/xprtsock.c | 6
-rw-r--r--  net/tipc/bearer.c | 2
-rw-r--r--  net/tipc/link.c | 3
-rw-r--r--  net/tipc/msg.c | 6
-rw-r--r--  net/tipc/msg.h | 11
-rw-r--r--  net/tipc/netlink_compat.c | 3
-rw-r--r--  net/tipc/socket.c | 54
-rw-r--r--  net/vmw_vsock/af_vsock.c | 12
133 files changed, 5050 insertions(+), 2756 deletions(-)
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index a1e273af6fc8..82a116ba590e 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -290,6 +290,10 @@ static void vlan_sync_address(struct net_device *dev,
if (ether_addr_equal(vlan->real_dev_addr, dev->dev_addr))
return;
+ /* vlan continues to inherit address of lower device */
+ if (vlan_dev_inherit_address(vlandev, dev))
+ goto out;
+
/* vlan address was different from the old address and is equal to
* the new address */
if (!ether_addr_equal(vlandev->dev_addr, vlan->real_dev_addr) &&
@@ -302,6 +306,7 @@ static void vlan_sync_address(struct net_device *dev,
!ether_addr_equal(vlandev->dev_addr, dev->dev_addr))
dev_uc_add(dev, vlandev->dev_addr);
+out:
ether_addr_copy(vlan->real_dev_addr, dev->dev_addr);
}
diff --git a/net/8021q/vlan.h b/net/8021q/vlan.h
index 9d010a09ab98..cc1557978066 100644
--- a/net/8021q/vlan.h
+++ b/net/8021q/vlan.h
@@ -109,6 +109,8 @@ int vlan_check_real_dev(struct net_device *real_dev,
void vlan_setup(struct net_device *dev);
int register_vlan_dev(struct net_device *dev);
void unregister_vlan_dev(struct net_device *dev, struct list_head *head);
+bool vlan_dev_inherit_address(struct net_device *dev,
+ struct net_device *real_dev);
static inline u32 vlan_get_ingress_priority(struct net_device *dev,
u16 vlan_tci)
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index e7e62570bdb8..86ae75b77390 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -245,6 +245,17 @@ void vlan_dev_get_realdev_name(const struct net_device *dev, char *result)
strncpy(result, vlan_dev_priv(dev)->real_dev->name, 23);
}
+bool vlan_dev_inherit_address(struct net_device *dev,
+ struct net_device *real_dev)
+{
+ if (dev->addr_assign_type != NET_ADDR_STOLEN)
+ return false;
+
+ ether_addr_copy(dev->dev_addr, real_dev->dev_addr);
+ call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+ return true;
+}
+
static int vlan_dev_open(struct net_device *dev)
{
struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
@@ -255,7 +266,8 @@ static int vlan_dev_open(struct net_device *dev)
!(vlan->flags & VLAN_FLAG_LOOSE_BINDING))
return -ENETDOWN;
- if (!ether_addr_equal(dev->dev_addr, real_dev->dev_addr)) {
+ if (!ether_addr_equal(dev->dev_addr, real_dev->dev_addr) &&
+ !vlan_dev_inherit_address(dev, real_dev)) {
err = dev_uc_add(real_dev, dev->dev_addr);
if (err < 0)
goto out;
@@ -560,8 +572,10 @@ static int vlan_dev_init(struct net_device *dev)
/* ipv6 shared card related stuff */
dev->dev_id = real_dev->dev_id;
- if (is_zero_ether_addr(dev->dev_addr))
- eth_hw_addr_inherit(dev, real_dev);
+ if (is_zero_ether_addr(dev->dev_addr)) {
+ ether_addr_copy(dev->dev_addr, real_dev->dev_addr);
+ dev->addr_assign_type = NET_ADDR_STOLEN;
+ }
if (is_zero_ether_addr(dev->broadcast))
memcpy(dev->broadcast, real_dev->broadcast, dev->addr_len);
diff --git a/net/9p/client.c b/net/9p/client.c
index ea79ee9a7348..3fc94a49ccd5 100644
--- a/net/9p/client.c
+++ b/net/9p/client.c
@@ -518,10 +518,10 @@ static int p9_check_errors(struct p9_client *c, struct p9_req_t *req)
if (err)
goto out_err;
- if (p9_is_proto_dotu(c))
+ if (p9_is_proto_dotu(c) && ecode < 512)
err = -ecode;
- if (!err || !IS_ERR_VALUE(err)) {
+ if (!err) {
err = p9_errstr2errno(ename, strlen(ename));
p9_debug(P9_DEBUG_9P, "<<< RERROR (%d) %s\n",
@@ -605,10 +605,10 @@ static int p9_check_zc_errors(struct p9_client *c, struct p9_req_t *req,
if (err)
goto out_err;
- if (p9_is_proto_dotu(c))
+ if (p9_is_proto_dotu(c) && ecode < 512)
err = -ecode;
- if (!err || !IS_ERR_VALUE(err)) {
+ if (!err) {
err = p9_errstr2errno(ename, strlen(ename));
p9_debug(P9_DEBUG_9P, "<<< RERROR (%d) %s\n",
diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c
index fbd0acf80b13..2fdebabbfacd 100644
--- a/net/ax25/af_ax25.c
+++ b/net/ax25/af_ax25.c
@@ -976,7 +976,8 @@ static int ax25_release(struct socket *sock)
release_sock(sk);
ax25_disconnect(ax25, 0);
lock_sock(sk);
- ax25_destroy_socket(ax25);
+ if (!sock_flag(ax25->sk, SOCK_DESTROY))
+ ax25_destroy_socket(ax25);
break;
case AX25_STATE_3:
diff --git a/net/ax25/ax25_ds_timer.c b/net/ax25/ax25_ds_timer.c
index 951cd57bb07d..5237dff6941d 100644
--- a/net/ax25/ax25_ds_timer.c
+++ b/net/ax25/ax25_ds_timer.c
@@ -102,6 +102,7 @@ void ax25_ds_heartbeat_expiry(ax25_cb *ax25)
switch (ax25->state) {
case AX25_STATE_0:
+ case AX25_STATE_2:
/* Magic here: If we listen() and a new link dies before it
is accepted() it isn't 'dead' so doesn't get removed. */
if (!sk || sock_flag(sk, SOCK_DESTROY) ||
@@ -111,6 +112,7 @@ void ax25_ds_heartbeat_expiry(ax25_cb *ax25)
sock_hold(sk);
ax25_destroy_socket(ax25);
bh_unlock_sock(sk);
+ /* Ungrab socket and destroy it */
sock_put(sk);
} else
ax25_destroy_socket(ax25);
@@ -213,7 +215,8 @@ void ax25_ds_t1_timeout(ax25_cb *ax25)
case AX25_STATE_2:
if (ax25->n2count == ax25->n2) {
ax25_send_control(ax25, AX25_DISC, AX25_POLLON, AX25_COMMAND);
- ax25_disconnect(ax25, ETIMEDOUT);
+ if (!sock_flag(ax25->sk, SOCK_DESTROY))
+ ax25_disconnect(ax25, ETIMEDOUT);
return;
} else {
ax25->n2count++;
diff --git a/net/ax25/ax25_std_timer.c b/net/ax25/ax25_std_timer.c
index 004467c9e6e1..2c0d6ef66f9d 100644
--- a/net/ax25/ax25_std_timer.c
+++ b/net/ax25/ax25_std_timer.c
@@ -38,6 +38,7 @@ void ax25_std_heartbeat_expiry(ax25_cb *ax25)
switch (ax25->state) {
case AX25_STATE_0:
+ case AX25_STATE_2:
/* Magic here: If we listen() and a new link dies before it
is accepted() it isn't 'dead' so doesn't get removed. */
if (!sk || sock_flag(sk, SOCK_DESTROY) ||
@@ -47,6 +48,7 @@ void ax25_std_heartbeat_expiry(ax25_cb *ax25)
sock_hold(sk);
ax25_destroy_socket(ax25);
bh_unlock_sock(sk);
+ /* Ungrab socket and destroy it */
sock_put(sk);
} else
ax25_destroy_socket(ax25);
@@ -144,7 +146,8 @@ void ax25_std_t1timer_expiry(ax25_cb *ax25)
case AX25_STATE_2:
if (ax25->n2count == ax25->n2) {
ax25_send_control(ax25, AX25_DISC, AX25_POLLON, AX25_COMMAND);
- ax25_disconnect(ax25, ETIMEDOUT);
+ if (!sock_flag(ax25->sk, SOCK_DESTROY))
+ ax25_disconnect(ax25, ETIMEDOUT);
return;
} else {
ax25->n2count++;
diff --git a/net/ax25/ax25_subr.c b/net/ax25/ax25_subr.c
index 3b78e8473a01..655a7d4c96e1 100644
--- a/net/ax25/ax25_subr.c
+++ b/net/ax25/ax25_subr.c
@@ -264,7 +264,8 @@ void ax25_disconnect(ax25_cb *ax25, int reason)
{
ax25_clear_queues(ax25);
- ax25_stop_heartbeat(ax25);
+ if (!sock_flag(ax25->sk, SOCK_DESTROY))
+ ax25_stop_heartbeat(ax25);
ax25_stop_t1timer(ax25);
ax25_stop_t2timer(ax25);
ax25_stop_t3timer(ax25);
diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c
index e3857ed4057f..6c2901a86230 100644
--- a/net/batman-adv/routing.c
+++ b/net/batman-adv/routing.c
@@ -374,6 +374,7 @@ int batadv_recv_icmp_packet(struct sk_buff *skb,
if (skb_cow(skb, ETH_HLEN) < 0)
goto out;
+ ethhdr = eth_hdr(skb);
icmph = (struct batadv_icmp_header *)skb->data;
icmp_packet_rr = (struct batadv_icmp_packet_rr *)icmph;
if (icmp_packet_rr->rr_cur >= BATADV_RR_LEN)
diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c
index 343d2c904399..287a3879ed7e 100644
--- a/net/batman-adv/soft-interface.c
+++ b/net/batman-adv/soft-interface.c
@@ -1033,7 +1033,9 @@ void batadv_softif_destroy_sysfs(struct net_device *soft_iface)
static void batadv_softif_destroy_netlink(struct net_device *soft_iface,
struct list_head *head)
{
+ struct batadv_priv *bat_priv = netdev_priv(soft_iface);
struct batadv_hard_iface *hard_iface;
+ struct batadv_softif_vlan *vlan;
list_for_each_entry(hard_iface, &batadv_hardif_list, list) {
if (hard_iface->soft_iface == soft_iface)
@@ -1041,6 +1043,13 @@ static void batadv_softif_destroy_netlink(struct net_device *soft_iface,
BATADV_IF_CLEANUP_KEEP);
}
+ /* destroy the "untagged" VLAN */
+ vlan = batadv_softif_vlan_get(bat_priv, BATADV_NO_FLAGS);
+ if (vlan) {
+ batadv_softif_destroy_vlan(bat_priv, vlan);
+ batadv_softif_vlan_put(vlan);
+ }
+
batadv_sysfs_del_meshif(soft_iface);
unregister_netdevice_queue(soft_iface, head);
}
diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c
index feaf492b01ca..57ec87f37050 100644
--- a/net/batman-adv/translation-table.c
+++ b/net/batman-adv/translation-table.c
@@ -650,8 +650,10 @@ bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr,
/* increase the refcounter of the related vlan */
vlan = batadv_softif_vlan_get(bat_priv, vid);
- if (WARN(!vlan, "adding TT local entry %pM to non-existent VLAN %d",
- addr, BATADV_PRINT_VID(vid))) {
+ if (!vlan) {
+ net_ratelimited_function(batadv_info, soft_iface,
+ "adding TT local entry %pM to non-existent VLAN %d\n",
+ addr, BATADV_PRINT_VID(vid));
kfree(tt_local);
tt_local = NULL;
goto out;
@@ -691,7 +693,6 @@ bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr,
if (unlikely(hash_added != 0)) {
/* remove the reference for the hash */
batadv_tt_local_entry_put(tt_local);
- batadv_softif_vlan_put(vlan);
goto out;
}
@@ -2269,6 +2270,29 @@ static u32 batadv_tt_local_crc(struct batadv_priv *bat_priv,
return crc;
}
+/**
+ * batadv_tt_req_node_release - free tt_req node entry
+ * @ref: kref pointer of the tt req_node entry
+ */
+static void batadv_tt_req_node_release(struct kref *ref)
+{
+ struct batadv_tt_req_node *tt_req_node;
+
+ tt_req_node = container_of(ref, struct batadv_tt_req_node, refcount);
+
+ kfree(tt_req_node);
+}
+
+/**
+ * batadv_tt_req_node_put - decrement the tt_req_node refcounter and
+ * possibly release it
+ * @tt_req_node: tt_req_node to be free'd
+ */
+static void batadv_tt_req_node_put(struct batadv_tt_req_node *tt_req_node)
+{
+ kref_put(&tt_req_node->refcount, batadv_tt_req_node_release);
+}
+
static void batadv_tt_req_list_free(struct batadv_priv *bat_priv)
{
struct batadv_tt_req_node *node;
@@ -2278,7 +2302,7 @@ static void batadv_tt_req_list_free(struct batadv_priv *bat_priv)
hlist_for_each_entry_safe(node, safe, &bat_priv->tt.req_list, list) {
hlist_del_init(&node->list);
- kfree(node);
+ batadv_tt_req_node_put(node);
}
spin_unlock_bh(&bat_priv->tt.req_list_lock);
@@ -2315,7 +2339,7 @@ static void batadv_tt_req_purge(struct batadv_priv *bat_priv)
if (batadv_has_timed_out(node->issued_at,
BATADV_TT_REQUEST_TIMEOUT)) {
hlist_del_init(&node->list);
- kfree(node);
+ batadv_tt_req_node_put(node);
}
}
spin_unlock_bh(&bat_priv->tt.req_list_lock);
@@ -2347,9 +2371,11 @@ batadv_tt_req_node_new(struct batadv_priv *bat_priv,
if (!tt_req_node)
goto unlock;
+ kref_init(&tt_req_node->refcount);
ether_addr_copy(tt_req_node->addr, orig_node->orig);
tt_req_node->issued_at = jiffies;
+ kref_get(&tt_req_node->refcount);
hlist_add_head(&tt_req_node->list, &bat_priv->tt.req_list);
unlock:
spin_unlock_bh(&bat_priv->tt.req_list_lock);
@@ -2613,13 +2639,19 @@ static bool batadv_send_tt_request(struct batadv_priv *bat_priv,
out:
if (primary_if)
batadv_hardif_put(primary_if);
+
if (ret && tt_req_node) {
spin_lock_bh(&bat_priv->tt.req_list_lock);
- /* hlist_del_init() verifies tt_req_node still is in the list */
- hlist_del_init(&tt_req_node->list);
+ if (!hlist_unhashed(&tt_req_node->list)) {
+ hlist_del_init(&tt_req_node->list);
+ batadv_tt_req_node_put(tt_req_node);
+ }
spin_unlock_bh(&bat_priv->tt.req_list_lock);
- kfree(tt_req_node);
}
+
+ if (tt_req_node)
+ batadv_tt_req_node_put(tt_req_node);
+
kfree(tvlv_tt_data);
return ret;
}
@@ -3055,7 +3087,7 @@ static void batadv_handle_tt_response(struct batadv_priv *bat_priv,
if (!batadv_compare_eth(node->addr, resp_src))
continue;
hlist_del_init(&node->list);
- kfree(node);
+ batadv_tt_req_node_put(node);
}
spin_unlock_bh(&bat_priv->tt.req_list_lock);
diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
index 6a577f4f8ba7..ba846b078af8 100644
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -1137,11 +1137,13 @@ struct batadv_tt_change_node {
* struct batadv_tt_req_node - data to keep track of the tt requests in flight
* @addr: mac address address of the originator this request was sent to
* @issued_at: timestamp used for purging stale tt requests
+ * @refcount: number of contexts the object is used by
* @list: list node for batadv_priv_tt::req_list
*/
struct batadv_tt_req_node {
u8 addr[ETH_ALEN];
unsigned long issued_at;
+ struct kref refcount;
struct hlist_node list;
};
diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index dcea4f4c62b3..c18080ad4085 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -279,6 +279,8 @@ void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr)
* change from under us.
*/
list_for_each_entry(v, &vg->vlan_list, vlist) {
+ if (!br_vlan_should_use(v))
+ continue;
f = __br_fdb_get(br, br->dev->dev_addr, v->vid);
if (f && f->is_local && !f->dst)
fdb_delete_local(br, NULL, f);
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index 160797722228..43d2cd862bc2 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -213,8 +213,7 @@ drop:
}
EXPORT_SYMBOL_GPL(br_handle_frame_finish);
-/* note: already called with rcu_read_lock */
-static int br_handle_local_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
+static void __br_handle_local_finish(struct sk_buff *skb)
{
struct net_bridge_port *p = br_port_get_rcu(skb->dev);
u16 vid = 0;
@@ -222,6 +221,14 @@ static int br_handle_local_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
/* check if vlan is allowed, to avoid spoofing */
if (p->flags & BR_LEARNING && br_should_learn(p, skb, &vid))
br_fdb_update(p->br, p, eth_hdr(skb)->h_source, vid, false);
+}
+
+/* note: already called with rcu_read_lock */
+static int br_handle_local_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ struct net_bridge_port *p = br_port_get_rcu(skb->dev);
+
+ __br_handle_local_finish(skb);
BR_INPUT_SKB_CB(skb)->brdev = p->br->dev;
br_pass_frame_up(skb);
@@ -274,7 +281,9 @@ rx_handler_result_t br_handle_frame(struct sk_buff **pskb)
if (p->br->stp_enabled == BR_NO_STP ||
fwd_mask & (1u << dest[5]))
goto forward;
- break;
+ *pskb = skb;
+ __br_handle_local_finish(skb);
+ return RX_HANDLER_PASS;
case 0x01: /* IEEE MAC (Pause) */
goto drop;
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index 6852f3c7009c..43844144c9c4 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -464,8 +464,11 @@ static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br,
if (ipv6_dev_get_saddr(dev_net(br->dev), br->dev, &ip6h->daddr, 0,
&ip6h->saddr)) {
kfree_skb(skb);
+ br->has_ipv6_addr = 0;
return NULL;
}
+
+ br->has_ipv6_addr = 1;
ipv6_eth_mc_map(&ip6h->daddr, eth->h_dest);
hopopt = (u8 *)(ip6h + 1);
@@ -1745,6 +1748,7 @@ void br_multicast_init(struct net_bridge *br)
br->ip6_other_query.delay_time = 0;
br->ip6_querier.port = NULL;
#endif
+ br->has_ipv6_addr = 1;
spin_lock_init(&br->multicast_lock);
setup_timer(&br->multicast_router_timer,
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index a5343c7232bf..85e89f693589 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -1273,7 +1273,7 @@ static int br_fill_linkxstats(struct sk_buff *skb, const struct net_device *dev,
struct bridge_vlan_xstats vxi;
struct br_vlan_stats stats;
- if (vl_idx++ < *prividx)
+ if (++vl_idx < *prividx)
continue;
memset(&vxi, 0, sizeof(vxi));
vxi.vid = v->vid;
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index c7fb5d7a7218..52edecf3c294 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -314,6 +314,7 @@ struct net_bridge
u8 multicast_disabled:1;
u8 multicast_querier:1;
u8 multicast_query_use_ifaddr:1;
+ u8 has_ipv6_addr:1;
u32 hash_elasticity;
u32 hash_max;
@@ -588,10 +589,22 @@ static inline bool br_multicast_is_router(struct net_bridge *br)
static inline bool
__br_multicast_querier_exists(struct net_bridge *br,
- struct bridge_mcast_other_query *querier)
+ struct bridge_mcast_other_query *querier,
+ const bool is_ipv6)
{
+ bool own_querier_enabled;
+
+ if (br->multicast_querier) {
+ if (is_ipv6 && !br->has_ipv6_addr)
+ own_querier_enabled = false;
+ else
+ own_querier_enabled = true;
+ } else {
+ own_querier_enabled = false;
+ }
+
return time_is_before_jiffies(querier->delay_time) &&
- (br->multicast_querier || timer_pending(&querier->timer));
+ (own_querier_enabled || timer_pending(&querier->timer));
}
static inline bool br_multicast_querier_exists(struct net_bridge *br,
@@ -599,10 +612,12 @@ static inline bool br_multicast_querier_exists(struct net_bridge *br,
{
switch (eth->h_proto) {
case (htons(ETH_P_IP)):
- return __br_multicast_querier_exists(br, &br->ip4_other_query);
+ return __br_multicast_querier_exists(br,
+ &br->ip4_other_query, false);
#if IS_ENABLED(CONFIG_IPV6)
case (htons(ETH_P_IPV6)):
- return __br_multicast_querier_exists(br, &br->ip6_other_query);
+ return __br_multicast_querier_exists(br,
+ &br->ip6_other_query, true);
#endif
default:
return false;
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index dcc18c6f7cf9..55d2bfee16d7 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -651,7 +651,7 @@ EXPORT_SYMBOL(ceph_destroy_client);
/*
* true if we have the mon map (and have thus joined the cluster)
*/
-static int have_mon_and_osd_map(struct ceph_client *client)
+static bool have_mon_and_osd_map(struct ceph_client *client)
{
return client->monc.monmap && client->monc.monmap->epoch &&
client->osdc.osdmap && client->osdc.osdmap->epoch;
diff --git a/net/ceph/ceph_strings.c b/net/ceph/ceph_strings.c
index 139a9cb19b0c..3773a4fa11e3 100644
--- a/net/ceph/ceph_strings.c
+++ b/net/ceph/ceph_strings.c
@@ -27,6 +27,22 @@ __CEPH_FORALL_OSD_OPS(GENERATE_CASE)
}
}
+const char *ceph_osd_watch_op_name(int o)
+{
+ switch (o) {
+ case CEPH_OSD_WATCH_OP_UNWATCH:
+ return "unwatch";
+ case CEPH_OSD_WATCH_OP_WATCH:
+ return "watch";
+ case CEPH_OSD_WATCH_OP_RECONNECT:
+ return "reconnect";
+ case CEPH_OSD_WATCH_OP_PING:
+ return "ping";
+ default:
+ return "???";
+ }
+}
+
const char *ceph_osd_state_name(int s)
{
switch (s) {
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
index b902fbc7863e..e77b04ca7802 100644
--- a/net/ceph/debugfs.c
+++ b/net/ceph/debugfs.c
@@ -54,24 +54,25 @@ static int osdmap_show(struct seq_file *s, void *p)
{
int i;
struct ceph_client *client = s->private;
- struct ceph_osdmap *map = client->osdc.osdmap;
+ struct ceph_osd_client *osdc = &client->osdc;
+ struct ceph_osdmap *map = osdc->osdmap;
struct rb_node *n;
if (map == NULL)
return 0;
- seq_printf(s, "epoch %d\n", map->epoch);
- seq_printf(s, "flags%s%s\n",
- (map->flags & CEPH_OSDMAP_NEARFULL) ? " NEARFULL" : "",
- (map->flags & CEPH_OSDMAP_FULL) ? " FULL" : "");
+ down_read(&osdc->lock);
+ seq_printf(s, "epoch %d flags 0x%x\n", map->epoch, map->flags);
for (n = rb_first(&map->pg_pools); n; n = rb_next(n)) {
- struct ceph_pg_pool_info *pool =
+ struct ceph_pg_pool_info *pi =
rb_entry(n, struct ceph_pg_pool_info, node);
- seq_printf(s, "pool %lld pg_num %u (%d) read_tier %lld write_tier %lld\n",
- pool->id, pool->pg_num, pool->pg_num_mask,
- pool->read_tier, pool->write_tier);
+ seq_printf(s, "pool %lld '%s' type %d size %d min_size %d pg_num %u pg_num_mask %d flags 0x%llx lfor %u read_tier %lld write_tier %lld\n",
+ pi->id, pi->name, pi->type, pi->size, pi->min_size,
+ pi->pg_num, pi->pg_num_mask, pi->flags,
+ pi->last_force_request_resend, pi->read_tier,
+ pi->write_tier);
}
for (i = 0; i < map->max_osd; i++) {
struct ceph_entity_addr *addr = &map->osd_addr[i];
@@ -103,6 +104,7 @@ static int osdmap_show(struct seq_file *s, void *p)
pg->pgid.seed, pg->primary_temp.osd);
}
+ up_read(&osdc->lock);
return 0;
}
@@ -126,6 +128,7 @@ static int monc_show(struct seq_file *s, void *p)
CEPH_SUBSCRIBE_ONETIME ? "" : "+"));
seq_putc(s, '\n');
}
+ seq_printf(s, "fs_cluster_id %d\n", monc->fs_cluster_id);
for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) {
__u16 op;
@@ -143,43 +146,113 @@ static int monc_show(struct seq_file *s, void *p)
return 0;
}
-static int osdc_show(struct seq_file *s, void *pp)
+static void dump_target(struct seq_file *s, struct ceph_osd_request_target *t)
{
- struct ceph_client *client = s->private;
- struct ceph_osd_client *osdc = &client->osdc;
- struct rb_node *p;
+ int i;
- mutex_lock(&osdc->request_mutex);
- for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
- struct ceph_osd_request *req;
- unsigned int i;
- int opcode;
+ seq_printf(s, "osd%d\t%llu.%x\t[", t->osd, t->pgid.pool, t->pgid.seed);
+ for (i = 0; i < t->up.size; i++)
+ seq_printf(s, "%s%d", (!i ? "" : ","), t->up.osds[i]);
+ seq_printf(s, "]/%d\t[", t->up.primary);
+ for (i = 0; i < t->acting.size; i++)
+ seq_printf(s, "%s%d", (!i ? "" : ","), t->acting.osds[i]);
+ seq_printf(s, "]/%d\t%*pE\t0x%x", t->acting.primary,
+ t->target_oid.name_len, t->target_oid.name, t->flags);
+ if (t->paused)
+ seq_puts(s, "\tP");
+}
- req = rb_entry(p, struct ceph_osd_request, r_node);
+static void dump_request(struct seq_file *s, struct ceph_osd_request *req)
+{
+ int i;
- seq_printf(s, "%lld\tosd%d\t%lld.%x\t", req->r_tid,
- req->r_osd ? req->r_osd->o_osd : -1,
- req->r_pgid.pool, req->r_pgid.seed);
+ seq_printf(s, "%llu\t", req->r_tid);
+ dump_target(s, &req->r_t);
- seq_printf(s, "%.*s", req->r_base_oid.name_len,
- req->r_base_oid.name);
+ seq_printf(s, "\t%d\t%u'%llu", req->r_attempts,
+ le32_to_cpu(req->r_replay_version.epoch),
+ le64_to_cpu(req->r_replay_version.version));
- if (req->r_reassert_version.epoch)
- seq_printf(s, "\t%u'%llu",
- (unsigned int)le32_to_cpu(req->r_reassert_version.epoch),
- le64_to_cpu(req->r_reassert_version.version));
- else
- seq_printf(s, "\t");
+ for (i = 0; i < req->r_num_ops; i++) {
+ struct ceph_osd_req_op *op = &req->r_ops[i];
+
+ seq_printf(s, "%s%s", (i == 0 ? "\t" : ","),
+ ceph_osd_op_name(op->op));
+ if (op->op == CEPH_OSD_OP_WATCH)
+ seq_printf(s, "-%s",
+ ceph_osd_watch_op_name(op->watch.op));
+ }
+
+ seq_putc(s, '\n');
+}
+
+static void dump_requests(struct seq_file *s, struct ceph_osd *osd)
+{
+ struct rb_node *n;
+
+ mutex_lock(&osd->lock);
+ for (n = rb_first(&osd->o_requests); n; n = rb_next(n)) {
+ struct ceph_osd_request *req =
+ rb_entry(n, struct ceph_osd_request, r_node);
+
+ dump_request(s, req);
+ }
+
+ mutex_unlock(&osd->lock);
+}
- for (i = 0; i < req->r_num_ops; i++) {
- opcode = req->r_ops[i].op;
- seq_printf(s, "%s%s", (i == 0 ? "\t" : ","),
- ceph_osd_op_name(opcode));
- }
+static void dump_linger_request(struct seq_file *s,
+ struct ceph_osd_linger_request *lreq)
+{
+ seq_printf(s, "%llu\t", lreq->linger_id);
+ dump_target(s, &lreq->t);
+
+ seq_printf(s, "\t%u\t%s%s/%d\n", lreq->register_gen,
+ lreq->is_watch ? "W" : "N", lreq->committed ? "C" : "",
+ lreq->last_error);
+}
+
+static void dump_linger_requests(struct seq_file *s, struct ceph_osd *osd)
+{
+ struct rb_node *n;
+
+ mutex_lock(&osd->lock);
+ for (n = rb_first(&osd->o_linger_requests); n; n = rb_next(n)) {
+ struct ceph_osd_linger_request *lreq =
+ rb_entry(n, struct ceph_osd_linger_request, node);
+
+ dump_linger_request(s, lreq);
+ }
+
+ mutex_unlock(&osd->lock);
+}
- seq_printf(s, "\n");
+static int osdc_show(struct seq_file *s, void *pp)
+{
+ struct ceph_client *client = s->private;
+ struct ceph_osd_client *osdc = &client->osdc;
+ struct rb_node *n;
+
+ down_read(&osdc->lock);
+ seq_printf(s, "REQUESTS %d homeless %d\n",
+ atomic_read(&osdc->num_requests),
+ atomic_read(&osdc->num_homeless));
+ for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
+ struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
+
+ dump_requests(s, osd);
}
- mutex_unlock(&osdc->request_mutex);
+ dump_requests(s, &osdc->homeless_osd);
+
+ seq_puts(s, "LINGER REQUESTS\n");
+ for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
+ struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
+
+ dump_linger_requests(s, osd);
+ }
+ dump_linger_requests(s, &osdc->homeless_osd);
+
+ up_read(&osdc->lock);
return 0;
}
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index cf638c009cfa..37c38a7fb5c5 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -260,20 +260,26 @@ static void __send_subscribe(struct ceph_mon_client *monc)
BUG_ON(num < 1); /* monmap sub is always there */
ceph_encode_32(&p, num);
for (i = 0; i < ARRAY_SIZE(monc->subs); i++) {
- const char *s = ceph_sub_str[i];
+ char buf[32];
+ int len;
if (!monc->subs[i].want)
continue;
- dout("%s %s start %llu flags 0x%x\n", __func__, s,
+ len = sprintf(buf, "%s", ceph_sub_str[i]);
+ if (i == CEPH_SUB_MDSMAP &&
+ monc->fs_cluster_id != CEPH_FS_CLUSTER_ID_NONE)
+ len += sprintf(buf + len, ".%d", monc->fs_cluster_id);
+
+ dout("%s %s start %llu flags 0x%x\n", __func__, buf,
le64_to_cpu(monc->subs[i].item.start),
monc->subs[i].item.flags);
- ceph_encode_string(&p, end, s, strlen(s));
+ ceph_encode_string(&p, end, buf, len);
memcpy(p, &monc->subs[i].item, sizeof(monc->subs[i].item));
p += sizeof(monc->subs[i].item);
}
- BUG_ON(p != (end - 35 - (ARRAY_SIZE(monc->subs) - num) * 19));
+ BUG_ON(p > end);
msg->front.iov_len = p - msg->front.iov_base;
msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
ceph_msg_revoke(msg);
@@ -376,19 +382,13 @@ void ceph_monc_got_map(struct ceph_mon_client *monc, int sub, u32 epoch)
}
EXPORT_SYMBOL(ceph_monc_got_map);
-/*
- * Register interest in the next osdmap
- */
-void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc)
+void ceph_monc_renew_subs(struct ceph_mon_client *monc)
{
- dout("%s have %u\n", __func__, monc->subs[CEPH_SUB_OSDMAP].have);
mutex_lock(&monc->mutex);
- if (__ceph_monc_want_map(monc, CEPH_SUB_OSDMAP,
- monc->subs[CEPH_SUB_OSDMAP].have + 1, false))
- __send_subscribe(monc);
+ __send_subscribe(monc);
mutex_unlock(&monc->mutex);
}
-EXPORT_SYMBOL(ceph_monc_request_next_osdmap);
+EXPORT_SYMBOL(ceph_monc_renew_subs);
/*
* Wait for an osdmap with a given epoch.
@@ -478,51 +478,17 @@ out:
/*
* generic requests (currently statfs, mon_get_version)
*/
-static struct ceph_mon_generic_request *__lookup_generic_req(
- struct ceph_mon_client *monc, u64 tid)
-{
- struct ceph_mon_generic_request *req;
- struct rb_node *n = monc->generic_request_tree.rb_node;
-
- while (n) {
- req = rb_entry(n, struct ceph_mon_generic_request, node);
- if (tid < req->tid)
- n = n->rb_left;
- else if (tid > req->tid)
- n = n->rb_right;
- else
- return req;
- }
- return NULL;
-}
-
-static void __insert_generic_request(struct ceph_mon_client *monc,
- struct ceph_mon_generic_request *new)
-{
- struct rb_node **p = &monc->generic_request_tree.rb_node;
- struct rb_node *parent = NULL;
- struct ceph_mon_generic_request *req = NULL;
-
- while (*p) {
- parent = *p;
- req = rb_entry(parent, struct ceph_mon_generic_request, node);
- if (new->tid < req->tid)
- p = &(*p)->rb_left;
- else if (new->tid > req->tid)
- p = &(*p)->rb_right;
- else
- BUG();
- }
-
- rb_link_node(&new->node, parent, p);
- rb_insert_color(&new->node, &monc->generic_request_tree);
-}
+DEFINE_RB_FUNCS(generic_request, struct ceph_mon_generic_request, tid, node)
static void release_generic_request(struct kref *kref)
{
struct ceph_mon_generic_request *req =
container_of(kref, struct ceph_mon_generic_request, kref);
+ dout("%s greq %p request %p reply %p\n", __func__, req, req->request,
+ req->reply);
+ WARN_ON(!RB_EMPTY_NODE(&req->node));
+
if (req->reply)
ceph_msg_put(req->reply);
if (req->request)
@@ -533,7 +499,8 @@ static void release_generic_request(struct kref *kref)
static void put_generic_request(struct ceph_mon_generic_request *req)
{
- kref_put(&req->kref, release_generic_request);
+ if (req)
+ kref_put(&req->kref, release_generic_request);
}
static void get_generic_request(struct ceph_mon_generic_request *req)
@@ -541,6 +508,103 @@ static void get_generic_request(struct ceph_mon_generic_request *req)
kref_get(&req->kref);
}
+static struct ceph_mon_generic_request *
+alloc_generic_request(struct ceph_mon_client *monc, gfp_t gfp)
+{
+ struct ceph_mon_generic_request *req;
+
+ req = kzalloc(sizeof(*req), gfp);
+ if (!req)
+ return NULL;
+
+ req->monc = monc;
+ kref_init(&req->kref);
+ RB_CLEAR_NODE(&req->node);
+ init_completion(&req->completion);
+
+ dout("%s greq %p\n", __func__, req);
+ return req;
+}
+
+static void register_generic_request(struct ceph_mon_generic_request *req)
+{
+ struct ceph_mon_client *monc = req->monc;
+
+ WARN_ON(req->tid);
+
+ get_generic_request(req);
+ req->tid = ++monc->last_tid;
+ insert_generic_request(&monc->generic_request_tree, req);
+}
+
+static void send_generic_request(struct ceph_mon_client *monc,
+ struct ceph_mon_generic_request *req)
+{
+ WARN_ON(!req->tid);
+
+ dout("%s greq %p tid %llu\n", __func__, req, req->tid);
+ req->request->hdr.tid = cpu_to_le64(req->tid);
+ ceph_con_send(&monc->con, ceph_msg_get(req->request));
+}
+
+static void __finish_generic_request(struct ceph_mon_generic_request *req)
+{
+ struct ceph_mon_client *monc = req->monc;
+
+ dout("%s greq %p tid %llu\n", __func__, req, req->tid);
+ erase_generic_request(&monc->generic_request_tree, req);
+
+ ceph_msg_revoke(req->request);
+ ceph_msg_revoke_incoming(req->reply);
+}
+
+static void finish_generic_request(struct ceph_mon_generic_request *req)
+{
+ __finish_generic_request(req);
+ put_generic_request(req);
+}
+
+static void complete_generic_request(struct ceph_mon_generic_request *req)
+{
+ if (req->complete_cb)
+ req->complete_cb(req);
+ else
+ complete_all(&req->completion);
+ put_generic_request(req);
+}
+
+void cancel_generic_request(struct ceph_mon_generic_request *req)
+{
+ struct ceph_mon_client *monc = req->monc;
+ struct ceph_mon_generic_request *lookup_req;
+
+ dout("%s greq %p tid %llu\n", __func__, req, req->tid);
+
+ mutex_lock(&monc->mutex);
+ lookup_req = lookup_generic_request(&monc->generic_request_tree,
+ req->tid);
+ if (lookup_req) {
+ WARN_ON(lookup_req != req);
+ finish_generic_request(req);
+ }
+
+ mutex_unlock(&monc->mutex);
+}
+
+static int wait_generic_request(struct ceph_mon_generic_request *req)
+{
+ int ret;
+
+ dout("%s greq %p tid %llu\n", __func__, req, req->tid);
+ ret = wait_for_completion_interruptible(&req->completion);
+ if (ret)
+ cancel_generic_request(req);
+ else
+ ret = req->result; /* completed */
+
+ return ret;
+}
+
static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
struct ceph_msg_header *hdr,
int *skip)
@@ -551,7 +615,7 @@ static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
struct ceph_msg *m;
mutex_lock(&monc->mutex);
- req = __lookup_generic_req(monc, tid);
+ req = lookup_generic_request(&monc->generic_request_tree, tid);
if (!req) {
dout("get_generic_reply %lld dne\n", tid);
*skip = 1;
@@ -570,42 +634,6 @@ static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
return m;
}
-static int __do_generic_request(struct ceph_mon_client *monc, u64 tid,
- struct ceph_mon_generic_request *req)
-{
- int err;
-
- /* register request */
- req->tid = tid != 0 ? tid : ++monc->last_tid;
- req->request->hdr.tid = cpu_to_le64(req->tid);
- __insert_generic_request(monc, req);
- monc->num_generic_requests++;
- ceph_con_send(&monc->con, ceph_msg_get(req->request));
- mutex_unlock(&monc->mutex);
-
- err = wait_for_completion_interruptible(&req->completion);
-
- mutex_lock(&monc->mutex);
- rb_erase(&req->node, &monc->generic_request_tree);
- monc->num_generic_requests--;
-
- if (!err)
- err = req->result;
- return err;
-}
-
-static int do_generic_request(struct ceph_mon_client *monc,
- struct ceph_mon_generic_request *req)
-{
- int err;
-
- mutex_lock(&monc->mutex);
- err = __do_generic_request(monc, 0, req);
- mutex_unlock(&monc->mutex);
-
- return err;
-}
-
/*
* statfs
*/
@@ -616,22 +644,24 @@ static void handle_statfs_reply(struct ceph_mon_client *monc,
struct ceph_mon_statfs_reply *reply = msg->front.iov_base;
u64 tid = le64_to_cpu(msg->hdr.tid);
+ dout("%s msg %p tid %llu\n", __func__, msg, tid);
+
if (msg->front.iov_len != sizeof(*reply))
goto bad;
- dout("handle_statfs_reply %p tid %llu\n", msg, tid);
mutex_lock(&monc->mutex);
- req = __lookup_generic_req(monc, tid);
- if (req) {
- *(struct ceph_statfs *)req->buf = reply->st;
- req->result = 0;
- get_generic_request(req);
+ req = lookup_generic_request(&monc->generic_request_tree, tid);
+ if (!req) {
+ mutex_unlock(&monc->mutex);
+ return;
}
+
+ req->result = 0;
+ *req->u.st = reply->st; /* struct */
+ __finish_generic_request(req);
mutex_unlock(&monc->mutex);
- if (req) {
- complete_all(&req->completion);
- put_generic_request(req);
- }
+
+ complete_generic_request(req);
return;
bad:
@@ -646,38 +676,38 @@ int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
{
struct ceph_mon_generic_request *req;
struct ceph_mon_statfs *h;
- int err;
+ int ret = -ENOMEM;
- req = kzalloc(sizeof(*req), GFP_NOFS);
+ req = alloc_generic_request(monc, GFP_NOFS);
if (!req)
- return -ENOMEM;
-
- kref_init(&req->kref);
- req->buf = buf;
- init_completion(&req->completion);
+ goto out;
- err = -ENOMEM;
req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS,
true);
if (!req->request)
goto out;
- req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024, GFP_NOFS,
- true);
+
+ req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 64, GFP_NOFS, true);
if (!req->reply)
goto out;
+ req->u.st = buf;
+
+ mutex_lock(&monc->mutex);
+ register_generic_request(req);
/* fill out request */
h = req->request->front.iov_base;
h->monhdr.have_version = 0;
h->monhdr.session_mon = cpu_to_le16(-1);
h->monhdr.session_mon_tid = 0;
h->fsid = monc->monmap->fsid;
+ send_generic_request(monc, req);
+ mutex_unlock(&monc->mutex);
- err = do_generic_request(monc, req);
-
+ ret = wait_generic_request(req);
out:
put_generic_request(req);
- return err;
+ return ret;
}
EXPORT_SYMBOL(ceph_monc_do_statfs);
@@ -690,7 +720,7 @@ static void handle_get_version_reply(struct ceph_mon_client *monc,
void *end = p + msg->front_alloc_len;
u64 handle;
- dout("%s %p tid %llu\n", __func__, msg, tid);
+ dout("%s msg %p tid %llu\n", __func__, msg, tid);
ceph_decode_need(&p, end, 2*sizeof(u64), bad);
handle = ceph_decode_64(&p);
@@ -698,77 +728,111 @@ static void handle_get_version_reply(struct ceph_mon_client *monc,
goto bad;
mutex_lock(&monc->mutex);
- req = __lookup_generic_req(monc, handle);
- if (req) {
- *(u64 *)req->buf = ceph_decode_64(&p);
- req->result = 0;
- get_generic_request(req);
+ req = lookup_generic_request(&monc->generic_request_tree, handle);
+ if (!req) {
+ mutex_unlock(&monc->mutex);
+ return;
}
+
+ req->result = 0;
+ req->u.newest = ceph_decode_64(&p);
+ __finish_generic_request(req);
mutex_unlock(&monc->mutex);
- if (req) {
- complete_all(&req->completion);
- put_generic_request(req);
- }
+ complete_generic_request(req);
return;
+
bad:
pr_err("corrupt mon_get_version reply, tid %llu\n", tid);
ceph_msg_dump(msg);
}
-/*
- * Send MMonGetVersion and wait for the reply.
- *
- * @what: one of "mdsmap", "osdmap" or "monmap"
- */
-int ceph_monc_do_get_version(struct ceph_mon_client *monc, const char *what,
- u64 *newest)
+static struct ceph_mon_generic_request *
+__ceph_monc_get_version(struct ceph_mon_client *monc, const char *what,
+ ceph_monc_callback_t cb, u64 private_data)
{
struct ceph_mon_generic_request *req;
- void *p, *end;
- u64 tid;
- int err;
- req = kzalloc(sizeof(*req), GFP_NOFS);
+ req = alloc_generic_request(monc, GFP_NOIO);
if (!req)
- return -ENOMEM;
-
- kref_init(&req->kref);
- req->buf = newest;
- init_completion(&req->completion);
+ goto err_put_req;
req->request = ceph_msg_new(CEPH_MSG_MON_GET_VERSION,
sizeof(u64) + sizeof(u32) + strlen(what),
- GFP_NOFS, true);
- if (!req->request) {
- err = -ENOMEM;
- goto out;
- }
+ GFP_NOIO, true);
+ if (!req->request)
+ goto err_put_req;
- req->reply = ceph_msg_new(CEPH_MSG_MON_GET_VERSION_REPLY, 1024,
- GFP_NOFS, true);
- if (!req->reply) {
- err = -ENOMEM;
- goto out;
- }
+ req->reply = ceph_msg_new(CEPH_MSG_MON_GET_VERSION_REPLY, 32, GFP_NOIO,
+ true);
+ if (!req->reply)
+ goto err_put_req;
- p = req->request->front.iov_base;
- end = p + req->request->front_alloc_len;
+ req->complete_cb = cb;
+ req->private_data = private_data;
- /* fill out request */
mutex_lock(&monc->mutex);
- tid = ++monc->last_tid;
- ceph_encode_64(&p, tid); /* handle */
- ceph_encode_string(&p, end, what, strlen(what));
+ register_generic_request(req);
+ {
+ void *p = req->request->front.iov_base;
+ void *const end = p + req->request->front_alloc_len;
+
+ ceph_encode_64(&p, req->tid); /* handle */
+ ceph_encode_string(&p, end, what, strlen(what));
+ WARN_ON(p != end);
+ }
+ send_generic_request(monc, req);
+ mutex_unlock(&monc->mutex);
- err = __do_generic_request(monc, tid, req);
+ return req;
- mutex_unlock(&monc->mutex);
-out:
+err_put_req:
put_generic_request(req);
- return err;
+ return ERR_PTR(-ENOMEM);
+}
+
+/*
+ * Send MMonGetVersion and wait for the reply.
+ *
+ * @what: one of "mdsmap", "osdmap" or "monmap"
+ */
+int ceph_monc_get_version(struct ceph_mon_client *monc, const char *what,
+ u64 *newest)
+{
+ struct ceph_mon_generic_request *req;
+ int ret;
+
+ req = __ceph_monc_get_version(monc, what, NULL, 0);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+
+ ret = wait_generic_request(req);
+ if (!ret)
+ *newest = req->u.newest;
+
+ put_generic_request(req);
+ return ret;
}
-EXPORT_SYMBOL(ceph_monc_do_get_version);
+EXPORT_SYMBOL(ceph_monc_get_version);
+
+/*
+ * Send MMonGetVersion,
+ *
+ * @what: one of "mdsmap", "osdmap" or "monmap"
+ */
+int ceph_monc_get_version_async(struct ceph_mon_client *monc, const char *what,
+ ceph_monc_callback_t cb, u64 private_data)
+{
+ struct ceph_mon_generic_request *req;
+
+ req = __ceph_monc_get_version(monc, what, cb, private_data);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+
+ put_generic_request(req);
+ return 0;
+}
+EXPORT_SYMBOL(ceph_monc_get_version_async);
/*
* Resend pending generic requests.
@@ -890,7 +954,7 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
if (!monc->m_subscribe_ack)
goto out_auth;
- monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, GFP_NOFS,
+ monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 128, GFP_NOFS,
true);
if (!monc->m_subscribe)
goto out_subscribe_ack;
@@ -914,9 +978,10 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
INIT_DELAYED_WORK(&monc->delayed_work, delayed_work);
monc->generic_request_tree = RB_ROOT;
- monc->num_generic_requests = 0;
monc->last_tid = 0;
+ monc->fs_cluster_id = CEPH_FS_CLUSTER_ID_NONE;
+
return 0;
out_auth_reply:
@@ -954,6 +1019,8 @@ void ceph_monc_stop(struct ceph_mon_client *monc)
ceph_auth_destroy(monc->auth);
+ WARN_ON(!RB_EMPTY_ROOT(&monc->generic_request_tree));
+
ceph_msg_put(monc->m_auth);
ceph_msg_put(monc->m_auth_reply);
ceph_msg_put(monc->m_subscribe);
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 40a53a70efdf..89469592076c 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -19,25 +19,12 @@
#include <linux/ceph/auth.h>
#include <linux/ceph/pagelist.h>
-#define OSD_OP_FRONT_LEN 4096
#define OSD_OPREPLY_FRONT_LEN 512
static struct kmem_cache *ceph_osd_request_cache;
static const struct ceph_connection_operations osd_con_ops;
-static void __send_queued(struct ceph_osd_client *osdc);
-static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd);
-static void __register_request(struct ceph_osd_client *osdc,
- struct ceph_osd_request *req);
-static void __unregister_request(struct ceph_osd_client *osdc,
- struct ceph_osd_request *req);
-static void __unregister_linger_request(struct ceph_osd_client *osdc,
- struct ceph_osd_request *req);
-static void __enqueue_request(struct ceph_osd_request *req);
-static void __send_request(struct ceph_osd_client *osdc,
- struct ceph_osd_request *req);
-
/*
* Implement client access to distributed object storage cluster.
*
@@ -56,6 +43,52 @@ static void __send_request(struct ceph_osd_client *osdc,
* channel with an OSD is reset.
*/
+static void link_request(struct ceph_osd *osd, struct ceph_osd_request *req);
+static void unlink_request(struct ceph_osd *osd, struct ceph_osd_request *req);
+static void link_linger(struct ceph_osd *osd,
+ struct ceph_osd_linger_request *lreq);
+static void unlink_linger(struct ceph_osd *osd,
+ struct ceph_osd_linger_request *lreq);
+
+#if 1
+static inline bool rwsem_is_wrlocked(struct rw_semaphore *sem)
+{
+ bool wrlocked = true;
+
+ if (unlikely(down_read_trylock(sem))) {
+ wrlocked = false;
+ up_read(sem);
+ }
+
+ return wrlocked;
+}
+static inline void verify_osdc_locked(struct ceph_osd_client *osdc)
+{
+ WARN_ON(!rwsem_is_locked(&osdc->lock));
+}
+static inline void verify_osdc_wrlocked(struct ceph_osd_client *osdc)
+{
+ WARN_ON(!rwsem_is_wrlocked(&osdc->lock));
+}
+static inline void verify_osd_locked(struct ceph_osd *osd)
+{
+ struct ceph_osd_client *osdc = osd->o_osdc;
+
+ WARN_ON(!(mutex_is_locked(&osd->lock) &&
+ rwsem_is_locked(&osdc->lock)) &&
+ !rwsem_is_wrlocked(&osdc->lock));
+}
+static inline void verify_lreq_locked(struct ceph_osd_linger_request *lreq)
+{
+ WARN_ON(!mutex_is_locked(&lreq->lock));
+}
+#else
+static inline void verify_osdc_locked(struct ceph_osd_client *osdc) { }
+static inline void verify_osdc_wrlocked(struct ceph_osd_client *osdc) { }
+static inline void verify_osd_locked(struct ceph_osd *osd) { }
+static inline void verify_lreq_locked(struct ceph_osd_linger_request *lreq) { }
+#endif
+
/*
* calculate the mapping of a file extent onto an object, and fill out the
* request accordingly. shorten extent as necessary if it crosses an
@@ -144,14 +177,6 @@ osd_req_op_extent_osd_data(struct ceph_osd_request *osd_req,
}
EXPORT_SYMBOL(osd_req_op_extent_osd_data);
-struct ceph_osd_data *
-osd_req_op_cls_response_data(struct ceph_osd_request *osd_req,
- unsigned int which)
-{
- return osd_req_op_data(osd_req, which, cls, response_data);
-}
-EXPORT_SYMBOL(osd_req_op_cls_response_data); /* ??? */
-
void osd_req_op_raw_data_in_pages(struct ceph_osd_request *osd_req,
unsigned int which, struct page **pages,
u64 length, u32 alignment,
@@ -218,6 +243,8 @@ void osd_req_op_cls_request_data_pagelist(
osd_data = osd_req_op_data(osd_req, which, cls, request_data);
ceph_osd_data_pagelist_init(osd_data, pagelist);
+ osd_req->r_ops[which].cls.indata_len += pagelist->length;
+ osd_req->r_ops[which].indata_len += pagelist->length;
}
EXPORT_SYMBOL(osd_req_op_cls_request_data_pagelist);
@@ -230,6 +257,8 @@ void osd_req_op_cls_request_data_pages(struct ceph_osd_request *osd_req,
osd_data = osd_req_op_data(osd_req, which, cls, request_data);
ceph_osd_data_pages_init(osd_data, pages, length, alignment,
pages_from_pool, own_pages);
+ osd_req->r_ops[which].cls.indata_len += length;
+ osd_req->r_ops[which].indata_len += length;
}
EXPORT_SYMBOL(osd_req_op_cls_request_data_pages);
@@ -302,14 +331,76 @@ static void osd_req_op_data_release(struct ceph_osd_request *osd_req,
case CEPH_OSD_OP_STAT:
ceph_osd_data_release(&op->raw_data_in);
break;
+ case CEPH_OSD_OP_NOTIFY_ACK:
+ ceph_osd_data_release(&op->notify_ack.request_data);
+ break;
+ case CEPH_OSD_OP_NOTIFY:
+ ceph_osd_data_release(&op->notify.request_data);
+ ceph_osd_data_release(&op->notify.response_data);
+ break;
default:
break;
}
}
/*
+ * Assumes @t is zero-initialized.
+ */
+static void target_init(struct ceph_osd_request_target *t)
+{
+ ceph_oid_init(&t->base_oid);
+ ceph_oloc_init(&t->base_oloc);
+ ceph_oid_init(&t->target_oid);
+ ceph_oloc_init(&t->target_oloc);
+
+ ceph_osds_init(&t->acting);
+ ceph_osds_init(&t->up);
+ t->size = -1;
+ t->min_size = -1;
+
+ t->osd = CEPH_HOMELESS_OSD;
+}
+
+static void target_copy(struct ceph_osd_request_target *dest,
+ const struct ceph_osd_request_target *src)
+{
+ ceph_oid_copy(&dest->base_oid, &src->base_oid);
+ ceph_oloc_copy(&dest->base_oloc, &src->base_oloc);
+ ceph_oid_copy(&dest->target_oid, &src->target_oid);
+ ceph_oloc_copy(&dest->target_oloc, &src->target_oloc);
+
+ dest->pgid = src->pgid; /* struct */
+ dest->pg_num = src->pg_num;
+ dest->pg_num_mask = src->pg_num_mask;
+ ceph_osds_copy(&dest->acting, &src->acting);
+ ceph_osds_copy(&dest->up, &src->up);
+ dest->size = src->size;
+ dest->min_size = src->min_size;
+ dest->sort_bitwise = src->sort_bitwise;
+
+ dest->flags = src->flags;
+ dest->paused = src->paused;
+
+ dest->osd = src->osd;
+}
+
+static void target_destroy(struct ceph_osd_request_target *t)
+{
+ ceph_oid_destroy(&t->base_oid);
+ ceph_oid_destroy(&t->target_oid);
+}
+
+/*
* requests
*/
+static void request_release_checks(struct ceph_osd_request *req)
+{
+ WARN_ON(!RB_EMPTY_NODE(&req->r_node));
+ WARN_ON(!RB_EMPTY_NODE(&req->r_mc_node));
+ WARN_ON(!list_empty(&req->r_unsafe_item));
+ WARN_ON(req->r_osd);
+}
+
static void ceph_osdc_release_request(struct kref *kref)
{
struct ceph_osd_request *req = container_of(kref,
@@ -318,24 +409,19 @@ static void ceph_osdc_release_request(struct kref *kref)
dout("%s %p (r_request %p r_reply %p)\n", __func__, req,
req->r_request, req->r_reply);
- WARN_ON(!RB_EMPTY_NODE(&req->r_node));
- WARN_ON(!list_empty(&req->r_req_lru_item));
- WARN_ON(!list_empty(&req->r_osd_item));
- WARN_ON(!list_empty(&req->r_linger_item));
- WARN_ON(!list_empty(&req->r_linger_osd_item));
- WARN_ON(req->r_osd);
+ request_release_checks(req);
if (req->r_request)
ceph_msg_put(req->r_request);
- if (req->r_reply) {
- ceph_msg_revoke_incoming(req->r_reply);
+ if (req->r_reply)
ceph_msg_put(req->r_reply);
- }
for (which = 0; which < req->r_num_ops; which++)
osd_req_op_data_release(req, which);
+ target_destroy(&req->r_t);
ceph_put_snap_context(req->r_snapc);
+
if (req->r_mempool)
mempool_free(req, req->r_osdc->req_mempool);
else if (req->r_num_ops <= CEPH_OSD_SLAB_OPS)
@@ -354,12 +440,66 @@ EXPORT_SYMBOL(ceph_osdc_get_request);
void ceph_osdc_put_request(struct ceph_osd_request *req)
{
- dout("%s %p (was %d)\n", __func__, req,
- atomic_read(&req->r_kref.refcount));
- kref_put(&req->r_kref, ceph_osdc_release_request);
+ if (req) {
+ dout("%s %p (was %d)\n", __func__, req,
+ atomic_read(&req->r_kref.refcount));
+ kref_put(&req->r_kref, ceph_osdc_release_request);
+ }
}
EXPORT_SYMBOL(ceph_osdc_put_request);
+static void request_init(struct ceph_osd_request *req)
+{
+ /* req only, each op is zeroed in _osd_req_op_init() */
+ memset(req, 0, sizeof(*req));
+
+ kref_init(&req->r_kref);
+ init_completion(&req->r_completion);
+ init_completion(&req->r_safe_completion);
+ RB_CLEAR_NODE(&req->r_node);
+ RB_CLEAR_NODE(&req->r_mc_node);
+ INIT_LIST_HEAD(&req->r_unsafe_item);
+
+ target_init(&req->r_t);
+}
+
+/*
+ * This is ugly, but it allows us to reuse linger registration and ping
+ * requests, keeping the structure of the code around send_linger{_ping}()
+ * reasonable. Setting up a min_nr=2 mempool for each linger request
+ * and dealing with copying ops (this blasts req only, watch op remains
+ * intact) isn't any better.
+ */
+static void request_reinit(struct ceph_osd_request *req)
+{
+ struct ceph_osd_client *osdc = req->r_osdc;
+ bool mempool = req->r_mempool;
+ unsigned int num_ops = req->r_num_ops;
+ u64 snapid = req->r_snapid;
+ struct ceph_snap_context *snapc = req->r_snapc;
+ bool linger = req->r_linger;
+ struct ceph_msg *request_msg = req->r_request;
+ struct ceph_msg *reply_msg = req->r_reply;
+
+ dout("%s req %p\n", __func__, req);
+ WARN_ON(atomic_read(&req->r_kref.refcount) != 1);
+ request_release_checks(req);
+
+ WARN_ON(atomic_read(&request_msg->kref.refcount) != 1);
+ WARN_ON(atomic_read(&reply_msg->kref.refcount) != 1);
+ target_destroy(&req->r_t);
+
+ request_init(req);
+ req->r_osdc = osdc;
+ req->r_mempool = mempool;
+ req->r_num_ops = num_ops;
+ req->r_snapid = snapid;
+ req->r_snapc = snapc;
+ req->r_linger = linger;
+ req->r_request = request_msg;
+ req->r_reply = reply_msg;
+}
+
struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
struct ceph_snap_context *snapc,
unsigned int num_ops,
@@ -367,8 +507,6 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
gfp_t gfp_flags)
{
struct ceph_osd_request *req;
- struct ceph_msg *msg;
- size_t msg_size;
if (use_mempool) {
BUG_ON(num_ops > CEPH_OSD_SLAB_OPS);
@@ -383,73 +521,65 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
if (unlikely(!req))
return NULL;
- /* req only, each op is zeroed in _osd_req_op_init() */
- memset(req, 0, sizeof(*req));
-
+ request_init(req);
req->r_osdc = osdc;
req->r_mempool = use_mempool;
req->r_num_ops = num_ops;
+ req->r_snapid = CEPH_NOSNAP;
+ req->r_snapc = ceph_get_snap_context(snapc);
- kref_init(&req->r_kref);
- init_completion(&req->r_completion);
- init_completion(&req->r_safe_completion);
- RB_CLEAR_NODE(&req->r_node);
- INIT_LIST_HEAD(&req->r_unsafe_item);
- INIT_LIST_HEAD(&req->r_linger_item);
- INIT_LIST_HEAD(&req->r_linger_osd_item);
- INIT_LIST_HEAD(&req->r_req_lru_item);
- INIT_LIST_HEAD(&req->r_osd_item);
-
- req->r_base_oloc.pool = -1;
- req->r_target_oloc.pool = -1;
+ dout("%s req %p\n", __func__, req);
+ return req;
+}
+EXPORT_SYMBOL(ceph_osdc_alloc_request);
- msg_size = OSD_OPREPLY_FRONT_LEN;
- if (num_ops > CEPH_OSD_SLAB_OPS) {
- /* ceph_osd_op and rval */
- msg_size += (num_ops - CEPH_OSD_SLAB_OPS) *
- (sizeof(struct ceph_osd_op) + 4);
- }
+int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp)
+{
+ struct ceph_osd_client *osdc = req->r_osdc;
+ struct ceph_msg *msg;
+ int msg_size;
- /* create reply message */
- if (use_mempool)
- msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
- else
- msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, msg_size,
- gfp_flags, true);
- if (!msg) {
- ceph_osdc_put_request(req);
- return NULL;
- }
- req->r_reply = msg;
+ WARN_ON(ceph_oid_empty(&req->r_base_oid));
+ /* create request message */
msg_size = 4 + 4 + 4; /* client_inc, osdmap_epoch, flags */
msg_size += 4 + 4 + 4 + 8; /* mtime, reassert_version */
msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */
msg_size += 1 + 8 + 4 + 4; /* pgid */
- msg_size += 4 + CEPH_MAX_OID_NAME_LEN; /* oid */
- msg_size += 2 + num_ops * sizeof(struct ceph_osd_op);
+ msg_size += 4 + req->r_base_oid.name_len; /* oid */
+ msg_size += 2 + req->r_num_ops * sizeof(struct ceph_osd_op);
msg_size += 8; /* snapid */
msg_size += 8; /* snap_seq */
- msg_size += 4 + 8 * (snapc ? snapc->num_snaps : 0); /* snaps */
+ msg_size += 4 + 8 * (req->r_snapc ? req->r_snapc->num_snaps : 0);
msg_size += 4; /* retry_attempt */
- /* create request message; allow space for oid */
- if (use_mempool)
+ if (req->r_mempool)
msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
else
- msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, gfp_flags, true);
- if (!msg) {
- ceph_osdc_put_request(req);
- return NULL;
- }
+ msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, gfp, true);
+ if (!msg)
+ return -ENOMEM;
memset(msg->front.iov_base, 0, msg->front.iov_len);
-
req->r_request = msg;
- return req;
+ /* create reply message */
+ msg_size = OSD_OPREPLY_FRONT_LEN;
+ msg_size += req->r_base_oid.name_len;
+ msg_size += req->r_num_ops * sizeof(struct ceph_osd_op);
+
+ if (req->r_mempool)
+ msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
+ else
+ msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, msg_size, gfp, true);
+ if (!msg)
+ return -ENOMEM;
+
+ req->r_reply = msg;
+
+ return 0;
}
-EXPORT_SYMBOL(ceph_osdc_alloc_request);
+EXPORT_SYMBOL(ceph_osdc_alloc_messages);
static bool osd_req_opcode_valid(u16 opcode)
{
@@ -587,8 +717,6 @@ void osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which,
osd_req_op_cls_request_info_pagelist(osd_req, which, pagelist);
- op->cls.argc = 0; /* currently unused */
-
op->indata_len = payload_len;
}
EXPORT_SYMBOL(osd_req_op_cls_init);
@@ -627,21 +755,19 @@ int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which,
}
EXPORT_SYMBOL(osd_req_op_xattr_init);
-void osd_req_op_watch_init(struct ceph_osd_request *osd_req,
- unsigned int which, u16 opcode,
- u64 cookie, u64 version, int flag)
+/*
+ * @watch_opcode: CEPH_OSD_WATCH_OP_*
+ */
+static void osd_req_op_watch_init(struct ceph_osd_request *req, int which,
+ u64 cookie, u8 watch_opcode)
{
- struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which,
- opcode, 0);
-
- BUG_ON(opcode != CEPH_OSD_OP_NOTIFY_ACK && opcode != CEPH_OSD_OP_WATCH);
+ struct ceph_osd_req_op *op;
+ op = _osd_req_op_init(req, which, CEPH_OSD_OP_WATCH, 0);
op->watch.cookie = cookie;
- op->watch.ver = version;
- if (opcode == CEPH_OSD_OP_WATCH && flag)
- op->watch.flag = (u8)1;
+ op->watch.op = watch_opcode;
+ op->watch.gen = 0;
}
-EXPORT_SYMBOL(osd_req_op_watch_init);
void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req,
unsigned int which,
@@ -686,16 +812,9 @@ static void ceph_osdc_msg_data_add(struct ceph_msg *msg,
}
}
-static u64 osd_req_encode_op(struct ceph_osd_request *req,
- struct ceph_osd_op *dst, unsigned int which)
+static u32 osd_req_encode_op(struct ceph_osd_op *dst,
+ const struct ceph_osd_req_op *src)
{
- struct ceph_osd_req_op *src;
- struct ceph_osd_data *osd_data;
- u64 request_data_len = 0;
- u64 data_length;
-
- BUG_ON(which >= req->r_num_ops);
- src = &req->r_ops[which];
if (WARN_ON(!osd_req_opcode_valid(src->op))) {
pr_err("unrecognized osd opcode %d\n", src->op);
@@ -704,57 +823,36 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req,
switch (src->op) {
case CEPH_OSD_OP_STAT:
- osd_data = &src->raw_data_in;
- ceph_osdc_msg_data_add(req->r_reply, osd_data);
break;
case CEPH_OSD_OP_READ:
case CEPH_OSD_OP_WRITE:
case CEPH_OSD_OP_WRITEFULL:
case CEPH_OSD_OP_ZERO:
case CEPH_OSD_OP_TRUNCATE:
- if (src->op == CEPH_OSD_OP_WRITE ||
- src->op == CEPH_OSD_OP_WRITEFULL)
- request_data_len = src->extent.length;
dst->extent.offset = cpu_to_le64(src->extent.offset);
dst->extent.length = cpu_to_le64(src->extent.length);
dst->extent.truncate_size =
cpu_to_le64(src->extent.truncate_size);
dst->extent.truncate_seq =
cpu_to_le32(src->extent.truncate_seq);
- osd_data = &src->extent.osd_data;
- if (src->op == CEPH_OSD_OP_WRITE ||
- src->op == CEPH_OSD_OP_WRITEFULL)
- ceph_osdc_msg_data_add(req->r_request, osd_data);
- else
- ceph_osdc_msg_data_add(req->r_reply, osd_data);
break;
case CEPH_OSD_OP_CALL:
dst->cls.class_len = src->cls.class_len;
dst->cls.method_len = src->cls.method_len;
- osd_data = &src->cls.request_info;
- ceph_osdc_msg_data_add(req->r_request, osd_data);
- BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGELIST);
- request_data_len = osd_data->pagelist->length;
-
- osd_data = &src->cls.request_data;
- data_length = ceph_osd_data_length(osd_data);
- if (data_length) {
- BUG_ON(osd_data->type == CEPH_OSD_DATA_TYPE_NONE);
- dst->cls.indata_len = cpu_to_le32(data_length);
- ceph_osdc_msg_data_add(req->r_request, osd_data);
- src->indata_len += data_length;
- request_data_len += data_length;
- }
- osd_data = &src->cls.response_data;
- ceph_osdc_msg_data_add(req->r_reply, osd_data);
+ dst->cls.indata_len = cpu_to_le32(src->cls.indata_len);
break;
case CEPH_OSD_OP_STARTSYNC:
break;
- case CEPH_OSD_OP_NOTIFY_ACK:
case CEPH_OSD_OP_WATCH:
dst->watch.cookie = cpu_to_le64(src->watch.cookie);
- dst->watch.ver = cpu_to_le64(src->watch.ver);
- dst->watch.flag = src->watch.flag;
+ dst->watch.ver = cpu_to_le64(0);
+ dst->watch.op = src->watch.op;
+ dst->watch.gen = cpu_to_le32(src->watch.gen);
+ break;
+ case CEPH_OSD_OP_NOTIFY_ACK:
+ break;
+ case CEPH_OSD_OP_NOTIFY:
+ dst->notify.cookie = cpu_to_le64(src->notify.cookie);
break;
case CEPH_OSD_OP_SETALLOCHINT:
dst->alloc_hint.expected_object_size =
@@ -768,9 +866,6 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req,
dst->xattr.value_len = cpu_to_le32(src->xattr.value_len);
dst->xattr.cmp_op = src->xattr.cmp_op;
dst->xattr.cmp_mode = src->xattr.cmp_mode;
- osd_data = &src->xattr.osd_data;
- ceph_osdc_msg_data_add(req->r_request, osd_data);
- request_data_len = osd_data->pagelist->length;
break;
case CEPH_OSD_OP_CREATE:
case CEPH_OSD_OP_DELETE:
@@ -787,7 +882,7 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req,
dst->flags = cpu_to_le32(src->flags);
dst->payload_len = cpu_to_le32(src->indata_len);
- return request_data_len;
+ return src->indata_len;
}
/*
@@ -824,17 +919,15 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
req = ceph_osdc_alloc_request(osdc, snapc, num_ops, use_mempool,
GFP_NOFS);
- if (!req)
- return ERR_PTR(-ENOMEM);
-
- req->r_flags = flags;
+ if (!req) {
+ r = -ENOMEM;
+ goto fail;
+ }
/* calculate max write size */
r = calc_layout(layout, off, plen, &objnum, &objoff, &objlen);
- if (r < 0) {
- ceph_osdc_put_request(req);
- return ERR_PTR(r);
- }
+ if (r)
+ goto fail;
if (opcode == CEPH_OSD_OP_CREATE || opcode == CEPH_OSD_OP_DELETE) {
osd_req_op_init(req, which, opcode, 0);
@@ -854,194 +947,71 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
truncate_size, truncate_seq);
}
+ req->r_flags = flags;
req->r_base_oloc.pool = ceph_file_layout_pg_pool(*layout);
+ ceph_oid_printf(&req->r_base_oid, "%llx.%08llx", vino.ino, objnum);
+
+ req->r_snapid = vino.snap;
+ if (flags & CEPH_OSD_FLAG_WRITE)
+ req->r_data_offset = off;
- snprintf(req->r_base_oid.name, sizeof(req->r_base_oid.name),
- "%llx.%08llx", vino.ino, objnum);
- req->r_base_oid.name_len = strlen(req->r_base_oid.name);
+ r = ceph_osdc_alloc_messages(req, GFP_NOFS);
+ if (r)
+ goto fail;
return req;
+
+fail:
+ ceph_osdc_put_request(req);
+ return ERR_PTR(r);
}
EXPORT_SYMBOL(ceph_osdc_new_request);
/*
* We keep osd requests in an rbtree, sorted by ->r_tid.
*/
-static void __insert_request(struct ceph_osd_client *osdc,
- struct ceph_osd_request *new)
-{
- struct rb_node **p = &osdc->requests.rb_node;
- struct rb_node *parent = NULL;
- struct ceph_osd_request *req = NULL;
-
- while (*p) {
- parent = *p;
- req = rb_entry(parent, struct ceph_osd_request, r_node);
- if (new->r_tid < req->r_tid)
- p = &(*p)->rb_left;
- else if (new->r_tid > req->r_tid)
- p = &(*p)->rb_right;
- else
- BUG();
- }
-
- rb_link_node(&new->r_node, parent, p);
- rb_insert_color(&new->r_node, &osdc->requests);
-}
+DEFINE_RB_FUNCS(request, struct ceph_osd_request, r_tid, r_node)
+DEFINE_RB_FUNCS(request_mc, struct ceph_osd_request, r_tid, r_mc_node)
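/*
 * Illustrative sketch, not part of this patch: DEFINE_RB_FUNCS() is
 * assumed to generate lookup_request()/insert_request()/erase_request()
 * (and the _mc variants) keyed on r_tid, replacing open-coded walks like
 * the __lookup_request() removed just below. The lookup is roughly:
 */
static struct ceph_osd_request *
example_lookup_request(struct rb_root *root, u64 tid)
{
	struct rb_node *n = root->rb_node;

	while (n) {
		struct ceph_osd_request *req =
		    rb_entry(n, struct ceph_osd_request, r_node);

		if (tid < req->r_tid)
			n = n->rb_left;
		else if (tid > req->r_tid)
			n = n->rb_right;
		else
			return req;
	}
	return NULL;
}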
-static struct ceph_osd_request *__lookup_request(struct ceph_osd_client *osdc,
- u64 tid)
+static bool osd_homeless(struct ceph_osd *osd)
{
- struct ceph_osd_request *req;
- struct rb_node *n = osdc->requests.rb_node;
-
- while (n) {
- req = rb_entry(n, struct ceph_osd_request, r_node);
- if (tid < req->r_tid)
- n = n->rb_left;
- else if (tid > req->r_tid)
- n = n->rb_right;
- else
- return req;
- }
- return NULL;
+ return osd->o_osd == CEPH_HOMELESS_OSD;
}
-static struct ceph_osd_request *
-__lookup_request_ge(struct ceph_osd_client *osdc,
- u64 tid)
+static bool osd_registered(struct ceph_osd *osd)
{
- struct ceph_osd_request *req;
- struct rb_node *n = osdc->requests.rb_node;
-
- while (n) {
- req = rb_entry(n, struct ceph_osd_request, r_node);
- if (tid < req->r_tid) {
- if (!n->rb_left)
- return req;
- n = n->rb_left;
- } else if (tid > req->r_tid) {
- n = n->rb_right;
- } else {
- return req;
- }
- }
- return NULL;
-}
-
-static void __kick_linger_request(struct ceph_osd_request *req)
-{
- struct ceph_osd_client *osdc = req->r_osdc;
- struct ceph_osd *osd = req->r_osd;
+ verify_osdc_locked(osd->o_osdc);
- /*
- * Linger requests need to be resent with a new tid to avoid
- * the dup op detection logic on the OSDs. Achieve this with
- * a re-register dance instead of open-coding.
- */
- ceph_osdc_get_request(req);
- if (!list_empty(&req->r_linger_item))
- __unregister_linger_request(osdc, req);
- else
- __unregister_request(osdc, req);
- __register_request(osdc, req);
- ceph_osdc_put_request(req);
-
- /*
- * Unless request has been registered as both normal and
- * lingering, __unregister{,_linger}_request clears r_osd.
- * However, here we need to preserve r_osd to make sure we
- * requeue on the same OSD.
- */
- WARN_ON(req->r_osd || !osd);
- req->r_osd = osd;
-
- dout("%s requeueing %p tid %llu\n", __func__, req, req->r_tid);
- __enqueue_request(req);
+ return !RB_EMPTY_NODE(&osd->o_node);
}
/*
- * Resubmit requests pending on the given osd.
+ * Assumes @osd is zero-initialized.
*/
-static void __kick_osd_requests(struct ceph_osd_client *osdc,
- struct ceph_osd *osd)
+static void osd_init(struct ceph_osd *osd)
{
- struct ceph_osd_request *req, *nreq;
- LIST_HEAD(resend);
- LIST_HEAD(resend_linger);
- int err;
-
- dout("%s osd%d\n", __func__, osd->o_osd);
- err = __reset_osd(osdc, osd);
- if (err)
- return;
-
- /*
- * Build up a list of requests to resend by traversing the
- * osd's list of requests. Requests for a given object are
- * sent in tid order, and that is also the order they're
- * kept on this list. Therefore all requests that are in
- * flight will be found first, followed by all requests that
- * have not yet been sent. And to resend requests while
- * preserving this order we will want to put any sent
- * requests back on the front of the osd client's unsent
- * list.
- *
- * So we build a separate ordered list of already-sent
- * requests for the affected osd and splice it onto the
- * front of the osd client's unsent list. Once we've seen a
- * request that has not yet been sent we're done. Those
- * requests are already sitting right where they belong.
- */
- list_for_each_entry(req, &osd->o_requests, r_osd_item) {
- if (!req->r_sent)
- break;
-
- if (!req->r_linger) {
- dout("%s requeueing %p tid %llu\n", __func__, req,
- req->r_tid);
- list_move_tail(&req->r_req_lru_item, &resend);
- req->r_flags |= CEPH_OSD_FLAG_RETRY;
- } else {
- list_move_tail(&req->r_req_lru_item, &resend_linger);
- }
- }
- list_splice(&resend, &osdc->req_unsent);
-
- /*
- * Both registered and not yet registered linger requests are
- * enqueued with a new tid on the same OSD. We add/move them
- * to req_unsent/o_requests at the end to keep things in tid
- * order.
- */
- list_for_each_entry_safe(req, nreq, &osd->o_linger_requests,
- r_linger_osd_item) {
- WARN_ON(!list_empty(&req->r_req_lru_item));
- __kick_linger_request(req);
- }
-
- list_for_each_entry_safe(req, nreq, &resend_linger, r_req_lru_item)
- __kick_linger_request(req);
+ atomic_set(&osd->o_ref, 1);
+ RB_CLEAR_NODE(&osd->o_node);
+ osd->o_requests = RB_ROOT;
+ osd->o_linger_requests = RB_ROOT;
+ INIT_LIST_HEAD(&osd->o_osd_lru);
+ INIT_LIST_HEAD(&osd->o_keepalive_item);
+ osd->o_incarnation = 1;
+ mutex_init(&osd->lock);
}
-/*
- * If the osd connection drops, we need to resubmit all requests.
- */
-static void osd_reset(struct ceph_connection *con)
+static void osd_cleanup(struct ceph_osd *osd)
{
- struct ceph_osd *osd = con->private;
- struct ceph_osd_client *osdc;
-
- if (!osd)
- return;
- dout("osd_reset osd%d\n", osd->o_osd);
- osdc = osd->o_osdc;
- down_read(&osdc->map_sem);
- mutex_lock(&osdc->request_mutex);
- __kick_osd_requests(osdc, osd);
- __send_queued(osdc);
- mutex_unlock(&osdc->request_mutex);
- up_read(&osdc->map_sem);
+ WARN_ON(!RB_EMPTY_NODE(&osd->o_node));
+ WARN_ON(!RB_EMPTY_ROOT(&osd->o_requests));
+ WARN_ON(!RB_EMPTY_ROOT(&osd->o_linger_requests));
+ WARN_ON(!list_empty(&osd->o_osd_lru));
+ WARN_ON(!list_empty(&osd->o_keepalive_item));
+
+ if (osd->o_auth.authorizer) {
+ WARN_ON(osd_homeless(osd));
+ ceph_auth_destroy_authorizer(osd->o_auth.authorizer);
+ }
}
/*
@@ -1051,22 +1021,15 @@ static struct ceph_osd *create_osd(struct ceph_osd_client *osdc, int onum)
{
struct ceph_osd *osd;
- osd = kzalloc(sizeof(*osd), GFP_NOFS);
- if (!osd)
- return NULL;
+ WARN_ON(onum == CEPH_HOMELESS_OSD);
- atomic_set(&osd->o_ref, 1);
+ osd = kzalloc(sizeof(*osd), GFP_NOIO | __GFP_NOFAIL);
+ osd_init(osd);
osd->o_osdc = osdc;
osd->o_osd = onum;
- RB_CLEAR_NODE(&osd->o_node);
- INIT_LIST_HEAD(&osd->o_requests);
- INIT_LIST_HEAD(&osd->o_linger_requests);
- INIT_LIST_HEAD(&osd->o_osd_lru);
- osd->o_incarnation = 1;
ceph_con_init(&osd->o_con, osd, &osd_con_ops, &osdc->client->msgr);
- INIT_LIST_HEAD(&osd->o_keepalive_item);
return osd;
}
@@ -1087,114 +1050,115 @@ static void put_osd(struct ceph_osd *osd)
dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref),
atomic_read(&osd->o_ref) - 1);
if (atomic_dec_and_test(&osd->o_ref)) {
- if (osd->o_auth.authorizer)
- ceph_auth_destroy_authorizer(osd->o_auth.authorizer);
+ osd_cleanup(osd);
kfree(osd);
}
}
-/*
- * remove an osd from our map
- */
-static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
-{
- dout("%s %p osd%d\n", __func__, osd, osd->o_osd);
- WARN_ON(!list_empty(&osd->o_requests));
- WARN_ON(!list_empty(&osd->o_linger_requests));
-
- list_del_init(&osd->o_osd_lru);
- rb_erase(&osd->o_node, &osdc->osds);
- RB_CLEAR_NODE(&osd->o_node);
-}
-
-static void remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
-{
- dout("%s %p osd%d\n", __func__, osd, osd->o_osd);
-
- if (!RB_EMPTY_NODE(&osd->o_node)) {
- ceph_con_close(&osd->o_con);
- __remove_osd(osdc, osd);
- put_osd(osd);
- }
-}
+DEFINE_RB_FUNCS(osd, struct ceph_osd, o_osd, o_node)
-static void remove_all_osds(struct ceph_osd_client *osdc)
+static void __move_osd_to_lru(struct ceph_osd *osd)
{
- dout("%s %p\n", __func__, osdc);
- mutex_lock(&osdc->request_mutex);
- while (!RB_EMPTY_ROOT(&osdc->osds)) {
- struct ceph_osd *osd = rb_entry(rb_first(&osdc->osds),
- struct ceph_osd, o_node);
- remove_osd(osdc, osd);
- }
- mutex_unlock(&osdc->request_mutex);
-}
+ struct ceph_osd_client *osdc = osd->o_osdc;
-static void __move_osd_to_lru(struct ceph_osd_client *osdc,
- struct ceph_osd *osd)
-{
- dout("%s %p\n", __func__, osd);
+ dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
BUG_ON(!list_empty(&osd->o_osd_lru));
+ spin_lock(&osdc->osd_lru_lock);
list_add_tail(&osd->o_osd_lru, &osdc->osd_lru);
+ spin_unlock(&osdc->osd_lru_lock);
+
osd->lru_ttl = jiffies + osdc->client->options->osd_idle_ttl;
}
-static void maybe_move_osd_to_lru(struct ceph_osd_client *osdc,
- struct ceph_osd *osd)
+static void maybe_move_osd_to_lru(struct ceph_osd *osd)
{
- dout("%s %p\n", __func__, osd);
-
- if (list_empty(&osd->o_requests) &&
- list_empty(&osd->o_linger_requests))
- __move_osd_to_lru(osdc, osd);
+ if (RB_EMPTY_ROOT(&osd->o_requests) &&
+ RB_EMPTY_ROOT(&osd->o_linger_requests))
+ __move_osd_to_lru(osd);
}
static void __remove_osd_from_lru(struct ceph_osd *osd)
{
- dout("__remove_osd_from_lru %p\n", osd);
+ struct ceph_osd_client *osdc = osd->o_osdc;
+
+ dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
+
+ spin_lock(&osdc->osd_lru_lock);
if (!list_empty(&osd->o_osd_lru))
list_del_init(&osd->o_osd_lru);
+ spin_unlock(&osdc->osd_lru_lock);
}
-static void remove_old_osds(struct ceph_osd_client *osdc)
+/*
+ * Close the connection and assign any leftover requests to the
+ * homeless session.
+ */
+static void close_osd(struct ceph_osd *osd)
{
- struct ceph_osd *osd, *nosd;
+ struct ceph_osd_client *osdc = osd->o_osdc;
+ struct rb_node *n;
- dout("__remove_old_osds %p\n", osdc);
- mutex_lock(&osdc->request_mutex);
- list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) {
- if (time_before(jiffies, osd->lru_ttl))
- break;
- remove_osd(osdc, osd);
+ verify_osdc_wrlocked(osdc);
+ dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
+
+ ceph_con_close(&osd->o_con);
+
+ for (n = rb_first(&osd->o_requests); n; ) {
+ struct ceph_osd_request *req =
+ rb_entry(n, struct ceph_osd_request, r_node);
+
+ n = rb_next(n); /* unlink_request() */
+
+ dout(" reassigning req %p tid %llu\n", req, req->r_tid);
+ unlink_request(osd, req);
+ link_request(&osdc->homeless_osd, req);
+ }
+ for (n = rb_first(&osd->o_linger_requests); n; ) {
+ struct ceph_osd_linger_request *lreq =
+ rb_entry(n, struct ceph_osd_linger_request, node);
+
+ n = rb_next(n); /* unlink_linger() */
+
+ dout(" reassigning lreq %p linger_id %llu\n", lreq,
+ lreq->linger_id);
+ unlink_linger(osd, lreq);
+ link_linger(&osdc->homeless_osd, lreq);
}
- mutex_unlock(&osdc->request_mutex);
+
+ __remove_osd_from_lru(osd);
+ erase_osd(&osdc->osds, osd);
+ put_osd(osd);
}
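/*
 * Illustrative note (an interpretation, not stated in the patch):
 * requests and linger requests parked on osdc->homeless_osd appear to
 * serve as a holding area for work that currently has no valid target;
 * they are presumably relinked to a real OSD session once a later
 * osdmap provides one.
 */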
/*
* reset osd connect
*/
-static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
+static int reopen_osd(struct ceph_osd *osd)
{
struct ceph_entity_addr *peer_addr;
- dout("__reset_osd %p osd%d\n", osd, osd->o_osd);
- if (list_empty(&osd->o_requests) &&
- list_empty(&osd->o_linger_requests)) {
- remove_osd(osdc, osd);
+ dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
+
+ if (RB_EMPTY_ROOT(&osd->o_requests) &&
+ RB_EMPTY_ROOT(&osd->o_linger_requests)) {
+ close_osd(osd);
return -ENODEV;
}
- peer_addr = &osdc->osdmap->osd_addr[osd->o_osd];
+ peer_addr = &osd->o_osdc->osdmap->osd_addr[osd->o_osd];
if (!memcmp(peer_addr, &osd->o_con.peer_addr, sizeof (*peer_addr)) &&
!ceph_con_opened(&osd->o_con)) {
- struct ceph_osd_request *req;
+ struct rb_node *n;
dout("osd addr hasn't changed and connection never opened, "
"letting msgr retry\n");
/* touch each r_stamp for handle_timeout()'s benefit */
- list_for_each_entry(req, &osd->o_requests, r_osd_item)
+ for (n = rb_first(&osd->o_requests); n; n = rb_next(n)) {
+ struct ceph_osd_request *req =
+ rb_entry(n, struct ceph_osd_request, r_node);
req->r_stamp = jiffies;
+ }
return -EAGAIN;
}
@@ -1206,455 +1170,1369 @@ static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
return 0;
}
-static void __insert_osd(struct ceph_osd_client *osdc, struct ceph_osd *new)
+static struct ceph_osd *lookup_create_osd(struct ceph_osd_client *osdc, int o,
+ bool wrlocked)
{
- struct rb_node **p = &osdc->osds.rb_node;
- struct rb_node *parent = NULL;
- struct ceph_osd *osd = NULL;
+ struct ceph_osd *osd;
- dout("__insert_osd %p osd%d\n", new, new->o_osd);
- while (*p) {
- parent = *p;
- osd = rb_entry(parent, struct ceph_osd, o_node);
- if (new->o_osd < osd->o_osd)
- p = &(*p)->rb_left;
- else if (new->o_osd > osd->o_osd)
- p = &(*p)->rb_right;
- else
- BUG();
+ if (wrlocked)
+ verify_osdc_wrlocked(osdc);
+ else
+ verify_osdc_locked(osdc);
+
+ if (o != CEPH_HOMELESS_OSD)
+ osd = lookup_osd(&osdc->osds, o);
+ else
+ osd = &osdc->homeless_osd;
+ if (!osd) {
+ if (!wrlocked)
+ return ERR_PTR(-EAGAIN);
+
+ osd = create_osd(osdc, o);
+ insert_osd(&osdc->osds, osd);
+ ceph_con_open(&osd->o_con, CEPH_ENTITY_TYPE_OSD, osd->o_osd,
+ &osdc->osdmap->osd_addr[osd->o_osd]);
}
- rb_link_node(&new->o_node, parent, p);
- rb_insert_color(&new->o_node, &osdc->osds);
+ dout("%s osdc %p osd%d -> osd %p\n", __func__, osdc, o, osd);
+ return osd;
}
-static struct ceph_osd *__lookup_osd(struct ceph_osd_client *osdc, int o)
+/*
+ * Create request <-> OSD session relation.
+ *
+ * @req has to be assigned a tid, @osd may be homeless.
+ */
+static void link_request(struct ceph_osd *osd, struct ceph_osd_request *req)
{
- struct ceph_osd *osd;
- struct rb_node *n = osdc->osds.rb_node;
-
- while (n) {
- osd = rb_entry(n, struct ceph_osd, o_node);
- if (o < osd->o_osd)
- n = n->rb_left;
- else if (o > osd->o_osd)
- n = n->rb_right;
- else
- return osd;
- }
- return NULL;
+ verify_osd_locked(osd);
+ WARN_ON(!req->r_tid || req->r_osd);
+ dout("%s osd %p osd%d req %p tid %llu\n", __func__, osd, osd->o_osd,
+ req, req->r_tid);
+
+ if (!osd_homeless(osd))
+ __remove_osd_from_lru(osd);
+ else
+ atomic_inc(&osd->o_osdc->num_homeless);
+
+ get_osd(osd);
+ insert_request(&osd->o_requests, req);
+ req->r_osd = osd;
}
-static void __schedule_osd_timeout(struct ceph_osd_client *osdc)
+static void unlink_request(struct ceph_osd *osd, struct ceph_osd_request *req)
{
- schedule_delayed_work(&osdc->timeout_work,
- osdc->client->options->osd_keepalive_timeout);
+ verify_osd_locked(osd);
+ WARN_ON(req->r_osd != osd);
+ dout("%s osd %p osd%d req %p tid %llu\n", __func__, osd, osd->o_osd,
+ req, req->r_tid);
+
+ req->r_osd = NULL;
+ erase_request(&osd->o_requests, req);
+ put_osd(osd);
+
+ if (!osd_homeless(osd))
+ maybe_move_osd_to_lru(osd);
+ else
+ atomic_dec(&osd->o_osdc->num_homeless);
}
-static void __cancel_osd_timeout(struct ceph_osd_client *osdc)
+static bool __pool_full(struct ceph_pg_pool_info *pi)
{
- cancel_delayed_work(&osdc->timeout_work);
+ return pi->flags & CEPH_POOL_FLAG_FULL;
}
-/*
- * Register request, assign tid. If this is the first request, set up
- * the timeout event.
- */
-static void __register_request(struct ceph_osd_client *osdc,
- struct ceph_osd_request *req)
+static bool have_pool_full(struct ceph_osd_client *osdc)
{
- req->r_tid = ++osdc->last_tid;
- req->r_request->hdr.tid = cpu_to_le64(req->r_tid);
- dout("__register_request %p tid %lld\n", req, req->r_tid);
- __insert_request(osdc, req);
- ceph_osdc_get_request(req);
- osdc->num_requests++;
- if (osdc->num_requests == 1) {
- dout(" first request, scheduling timeout\n");
- __schedule_osd_timeout(osdc);
+ struct rb_node *n;
+
+ for (n = rb_first(&osdc->osdmap->pg_pools); n; n = rb_next(n)) {
+ struct ceph_pg_pool_info *pi =
+ rb_entry(n, struct ceph_pg_pool_info, node);
+
+ if (__pool_full(pi))
+ return true;
}
+
+ return false;
+}
+
+static bool pool_full(struct ceph_osd_client *osdc, s64 pool_id)
+{
+ struct ceph_pg_pool_info *pi;
+
+ pi = ceph_pg_pool_by_id(osdc->osdmap, pool_id);
+ if (!pi)
+ return false;
+
+ return __pool_full(pi);
}
/*
- * called under osdc->request_mutex
+ * Returns whether a request should be blocked from being sent
+ * based on the current osdmap and osd_client settings.
*/
-static void __unregister_request(struct ceph_osd_client *osdc,
- struct ceph_osd_request *req)
+static bool target_should_be_paused(struct ceph_osd_client *osdc,
+ const struct ceph_osd_request_target *t,
+ struct ceph_pg_pool_info *pi)
{
- if (RB_EMPTY_NODE(&req->r_node)) {
- dout("__unregister_request %p tid %lld not registered\n",
- req, req->r_tid);
- return;
+ bool pauserd = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD);
+ bool pausewr = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR) ||
+ ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) ||
+ __pool_full(pi);
+
+ WARN_ON(pi->id != t->base_oloc.pool);
+ return (t->flags & CEPH_OSD_FLAG_READ && pauserd) ||
+ (t->flags & CEPH_OSD_FLAG_WRITE && pausewr);
+}
+
+enum calc_target_result {
+ CALC_TARGET_NO_ACTION = 0,
+ CALC_TARGET_NEED_RESEND,
+ CALC_TARGET_POOL_DNE,
+};
+
+static enum calc_target_result calc_target(struct ceph_osd_client *osdc,
+ struct ceph_osd_request_target *t,
+ u32 *last_force_resend,
+ bool any_change)
+{
+ struct ceph_pg_pool_info *pi;
+ struct ceph_pg pgid, last_pgid;
+ struct ceph_osds up, acting;
+ bool force_resend = false;
+ bool need_check_tiering = false;
+ bool need_resend = false;
+ bool sort_bitwise = ceph_osdmap_flag(osdc, CEPH_OSDMAP_SORTBITWISE);
+ enum calc_target_result ct_res;
+ int ret;
+
+ pi = ceph_pg_pool_by_id(osdc->osdmap, t->base_oloc.pool);
+ if (!pi) {
+ t->osd = CEPH_HOMELESS_OSD;
+ ct_res = CALC_TARGET_POOL_DNE;
+ goto out;
}
- dout("__unregister_request %p tid %lld\n", req, req->r_tid);
- rb_erase(&req->r_node, &osdc->requests);
- RB_CLEAR_NODE(&req->r_node);
- osdc->num_requests--;
+ if (osdc->osdmap->epoch == pi->last_force_request_resend) {
+ if (last_force_resend &&
+ *last_force_resend < pi->last_force_request_resend) {
+ *last_force_resend = pi->last_force_request_resend;
+ force_resend = true;
+ } else if (!last_force_resend) {
+ force_resend = true;
+ }
+ }
+ if (ceph_oid_empty(&t->target_oid) || force_resend) {
+ ceph_oid_copy(&t->target_oid, &t->base_oid);
+ need_check_tiering = true;
+ }
+ if (ceph_oloc_empty(&t->target_oloc) || force_resend) {
+ ceph_oloc_copy(&t->target_oloc, &t->base_oloc);
+ need_check_tiering = true;
+ }
- if (req->r_osd) {
- /* make sure the original request isn't in flight. */
- ceph_msg_revoke(req->r_request);
+ if (need_check_tiering &&
+ (t->flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) {
+ if (t->flags & CEPH_OSD_FLAG_READ && pi->read_tier >= 0)
+ t->target_oloc.pool = pi->read_tier;
+ if (t->flags & CEPH_OSD_FLAG_WRITE && pi->write_tier >= 0)
+ t->target_oloc.pool = pi->write_tier;
+ }
- list_del_init(&req->r_osd_item);
- maybe_move_osd_to_lru(osdc, req->r_osd);
- if (list_empty(&req->r_linger_osd_item))
- req->r_osd = NULL;
+ ret = ceph_object_locator_to_pg(osdc->osdmap, &t->target_oid,
+ &t->target_oloc, &pgid);
+ if (ret) {
+ WARN_ON(ret != -ENOENT);
+ t->osd = CEPH_HOMELESS_OSD;
+ ct_res = CALC_TARGET_POOL_DNE;
+ goto out;
+ }
+ last_pgid.pool = pgid.pool;
+ last_pgid.seed = ceph_stable_mod(pgid.seed, t->pg_num, t->pg_num_mask);
+
+ ceph_pg_to_up_acting_osds(osdc->osdmap, &pgid, &up, &acting);
+ if (any_change &&
+ ceph_is_new_interval(&t->acting,
+ &acting,
+ &t->up,
+ &up,
+ t->size,
+ pi->size,
+ t->min_size,
+ pi->min_size,
+ t->pg_num,
+ pi->pg_num,
+ t->sort_bitwise,
+ sort_bitwise,
+ &last_pgid))
+ force_resend = true;
+
+ if (t->paused && !target_should_be_paused(osdc, t, pi)) {
+ t->paused = false;
+ need_resend = true;
}
- list_del_init(&req->r_req_lru_item);
- ceph_osdc_put_request(req);
+ if (ceph_pg_compare(&t->pgid, &pgid) ||
+ ceph_osds_changed(&t->acting, &acting, any_change) ||
+ force_resend) {
+ t->pgid = pgid; /* struct */
+ ceph_osds_copy(&t->acting, &acting);
+ ceph_osds_copy(&t->up, &up);
+ t->size = pi->size;
+ t->min_size = pi->min_size;
+ t->pg_num = pi->pg_num;
+ t->pg_num_mask = pi->pg_num_mask;
+ t->sort_bitwise = sort_bitwise;
+
+ t->osd = acting.primary;
+ need_resend = true;
+ }
+
+ ct_res = need_resend ? CALC_TARGET_NEED_RESEND : CALC_TARGET_NO_ACTION;
+out:
+ dout("%s t %p -> ct_res %d osd %d\n", __func__, t, ct_res, t->osd);
+ return ct_res;
+}
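/*
 * Illustrative sketch, not part of this patch: calc_target() folds the
 * raw pg seed into the pool's pg_num range with ceph_stable_mod() when
 * building last_pgid. Assuming the usual libceph definition, that helper
 * behaves like:
 */
static int example_stable_mod(int x, int b, int bmask)
{
	if ((x & bmask) < b)
		return x & bmask;		/* already within [0, b) */
	else
		return x & (bmask >> 1);	/* fold into the lower half */
}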
+
+static void setup_request_data(struct ceph_osd_request *req,
+ struct ceph_msg *msg)
+{
+ u32 data_len = 0;
+ int i;
+
+ if (!list_empty(&msg->data))
+ return;
+
+ WARN_ON(msg->data_length);
+ for (i = 0; i < req->r_num_ops; i++) {
+ struct ceph_osd_req_op *op = &req->r_ops[i];
+
+ switch (op->op) {
+ /* request */
+ case CEPH_OSD_OP_WRITE:
+ case CEPH_OSD_OP_WRITEFULL:
+ WARN_ON(op->indata_len != op->extent.length);
+ ceph_osdc_msg_data_add(msg, &op->extent.osd_data);
+ break;
+ case CEPH_OSD_OP_SETXATTR:
+ case CEPH_OSD_OP_CMPXATTR:
+ WARN_ON(op->indata_len != op->xattr.name_len +
+ op->xattr.value_len);
+ ceph_osdc_msg_data_add(msg, &op->xattr.osd_data);
+ break;
+ case CEPH_OSD_OP_NOTIFY_ACK:
+ ceph_osdc_msg_data_add(msg,
+ &op->notify_ack.request_data);
+ break;
+
+ /* reply */
+ case CEPH_OSD_OP_STAT:
+ ceph_osdc_msg_data_add(req->r_reply,
+ &op->raw_data_in);
+ break;
+ case CEPH_OSD_OP_READ:
+ ceph_osdc_msg_data_add(req->r_reply,
+ &op->extent.osd_data);
+ break;
+
+ /* both */
+ case CEPH_OSD_OP_CALL:
+ WARN_ON(op->indata_len != op->cls.class_len +
+ op->cls.method_len +
+ op->cls.indata_len);
+ ceph_osdc_msg_data_add(msg, &op->cls.request_info);
+ /* optional, can be NONE */
+ ceph_osdc_msg_data_add(msg, &op->cls.request_data);
+ /* optional, can be NONE */
+ ceph_osdc_msg_data_add(req->r_reply,
+ &op->cls.response_data);
+ break;
+ case CEPH_OSD_OP_NOTIFY:
+ ceph_osdc_msg_data_add(msg,
+ &op->notify.request_data);
+ ceph_osdc_msg_data_add(req->r_reply,
+ &op->notify.response_data);
+ break;
+ }
+
+ data_len += op->indata_len;
+ }
+
+ WARN_ON(data_len != msg->data_length);
+}
+
+static void encode_request(struct ceph_osd_request *req, struct ceph_msg *msg)
+{
+ void *p = msg->front.iov_base;
+ void *const end = p + msg->front_alloc_len;
+ u32 data_len = 0;
+ int i;
+
+ if (req->r_flags & CEPH_OSD_FLAG_WRITE) {
+ /* snapshots aren't writeable */
+ WARN_ON(req->r_snapid != CEPH_NOSNAP);
+ } else {
+ WARN_ON(req->r_mtime.tv_sec || req->r_mtime.tv_nsec ||
+ req->r_data_offset || req->r_snapc);
+ }
+
+ setup_request_data(req, msg);
+
+ ceph_encode_32(&p, 1); /* client_inc, always 1 */
+ ceph_encode_32(&p, req->r_osdc->osdmap->epoch);
+ ceph_encode_32(&p, req->r_flags);
+ ceph_encode_timespec(p, &req->r_mtime);
+ p += sizeof(struct ceph_timespec);
+ /* aka reassert_version */
+ memcpy(p, &req->r_replay_version, sizeof(req->r_replay_version));
+ p += sizeof(req->r_replay_version);
+
+ /* oloc */
+ ceph_encode_8(&p, 4);
+ ceph_encode_8(&p, 4);
+ ceph_encode_32(&p, 8 + 4 + 4);
+ ceph_encode_64(&p, req->r_t.target_oloc.pool);
+ ceph_encode_32(&p, -1); /* preferred */
+ ceph_encode_32(&p, 0); /* key len */
+
+ /* pgid */
+ ceph_encode_8(&p, 1);
+ ceph_encode_64(&p, req->r_t.pgid.pool);
+ ceph_encode_32(&p, req->r_t.pgid.seed);
+ ceph_encode_32(&p, -1); /* preferred */
+
+ /* oid */
+ ceph_encode_32(&p, req->r_t.target_oid.name_len);
+ memcpy(p, req->r_t.target_oid.name, req->r_t.target_oid.name_len);
+ p += req->r_t.target_oid.name_len;
- if (osdc->num_requests == 0) {
- dout(" no requests, canceling timeout\n");
- __cancel_osd_timeout(osdc);
+ /* ops, can imply data */
+ ceph_encode_16(&p, req->r_num_ops);
+ for (i = 0; i < req->r_num_ops; i++) {
+ data_len += osd_req_encode_op(p, &req->r_ops[i]);
+ p += sizeof(struct ceph_osd_op);
}
+
+ ceph_encode_64(&p, req->r_snapid); /* snapid */
+ if (req->r_snapc) {
+ ceph_encode_64(&p, req->r_snapc->seq);
+ ceph_encode_32(&p, req->r_snapc->num_snaps);
+ for (i = 0; i < req->r_snapc->num_snaps; i++)
+ ceph_encode_64(&p, req->r_snapc->snaps[i]);
+ } else {
+ ceph_encode_64(&p, 0); /* snap_seq */
+ ceph_encode_32(&p, 0); /* snaps len */
+ }
+
+ ceph_encode_32(&p, req->r_attempts); /* retry_attempt */
+
+ BUG_ON(p > end);
+ msg->front.iov_len = p - msg->front.iov_base;
+ msg->hdr.version = cpu_to_le16(4); /* MOSDOp v4 */
+ msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
+ msg->hdr.data_len = cpu_to_le32(data_len);
+ /*
+ * The header "data_off" is a hint to the receiver allowing it
+ * to align received data into its buffers such that there's no
+ * need to re-copy it before writing it to disk (direct I/O).
+ */
+ msg->hdr.data_off = cpu_to_le16(req->r_data_offset);
+
+ dout("%s req %p oid %s oid_len %d front %zu data %u\n", __func__,
+ req, req->r_t.target_oid.name, req->r_t.target_oid.name_len,
+ msg->front.iov_len, data_len);
}
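/*
 * Illustrative note, derived from the encoding above: the MOSDOp v4
 * front is laid out as client_inc, osdmap_epoch, flags, mtime,
 * reassert_version, oloc (pool, preferred, key), pgid, oid, the op
 * array, snapid, snap_seq, snaps[], retry_attempt, matching the
 * msg_size arithmetic in ceph_osdc_alloc_messages().
 */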
/*
- * Cancel a previously queued request message
+ * @req has to be assigned a tid and registered.
*/
-static void __cancel_request(struct ceph_osd_request *req)
+static void send_request(struct ceph_osd_request *req)
{
- if (req->r_sent && req->r_osd) {
+ struct ceph_osd *osd = req->r_osd;
+
+ verify_osd_locked(osd);
+ WARN_ON(osd->o_osd != req->r_t.osd);
+
+ /*
+ * We may have a previously queued request message hanging
+ * around. Cancel it to avoid corrupting the msgr.
+ */
+ if (req->r_sent)
ceph_msg_revoke(req->r_request);
- req->r_sent = 0;
+
+ req->r_flags |= CEPH_OSD_FLAG_KNOWN_REDIR;
+ if (req->r_attempts)
+ req->r_flags |= CEPH_OSD_FLAG_RETRY;
+ else
+ WARN_ON(req->r_flags & CEPH_OSD_FLAG_RETRY);
+
+ encode_request(req, req->r_request);
+
+ dout("%s req %p tid %llu to pg %llu.%x osd%d flags 0x%x attempt %d\n",
+ __func__, req, req->r_tid, req->r_t.pgid.pool, req->r_t.pgid.seed,
+ req->r_t.osd, req->r_flags, req->r_attempts);
+
+ req->r_t.paused = false;
+ req->r_stamp = jiffies;
+ req->r_attempts++;
+
+ req->r_sent = osd->o_incarnation;
+ req->r_request->hdr.tid = cpu_to_le64(req->r_tid);
+ ceph_con_send(&osd->o_con, ceph_msg_get(req->r_request));
+}
+
+static void maybe_request_map(struct ceph_osd_client *osdc)
+{
+ bool continuous = false;
+
+ verify_osdc_locked(osdc);
+ WARN_ON(!osdc->osdmap->epoch);
+
+ if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) ||
+ ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD) ||
+ ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR)) {
+ dout("%s osdc %p continuous\n", __func__, osdc);
+ continuous = true;
+ } else {
+ dout("%s osdc %p onetime\n", __func__, osdc);
}
+
+ if (ceph_monc_want_map(&osdc->client->monc, CEPH_SUB_OSDMAP,
+ osdc->osdmap->epoch + 1, continuous))
+ ceph_monc_renew_subs(&osdc->client->monc);
}
-static void __register_linger_request(struct ceph_osd_client *osdc,
- struct ceph_osd_request *req)
+static void send_map_check(struct ceph_osd_request *req);
+
+static void __submit_request(struct ceph_osd_request *req, bool wrlocked)
{
- dout("%s %p tid %llu\n", __func__, req, req->r_tid);
- WARN_ON(!req->r_linger);
+ struct ceph_osd_client *osdc = req->r_osdc;
+ struct ceph_osd *osd;
+ enum calc_target_result ct_res;
+ bool need_send = false;
+ bool promoted = false;
+
+ WARN_ON(req->r_tid || req->r_got_reply);
+ dout("%s req %p wrlocked %d\n", __func__, req, wrlocked);
+
+again:
+ ct_res = calc_target(osdc, &req->r_t, &req->r_last_force_resend, false);
+ if (ct_res == CALC_TARGET_POOL_DNE && !wrlocked)
+ goto promote;
+
+ osd = lookup_create_osd(osdc, req->r_t.osd, wrlocked);
+ if (IS_ERR(osd)) {
+ WARN_ON(PTR_ERR(osd) != -EAGAIN || wrlocked);
+ goto promote;
+ }
+
+ if ((req->r_flags & CEPH_OSD_FLAG_WRITE) &&
+ ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR)) {
+ dout("req %p pausewr\n", req);
+ req->r_t.paused = true;
+ maybe_request_map(osdc);
+ } else if ((req->r_flags & CEPH_OSD_FLAG_READ) &&
+ ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD)) {
+ dout("req %p pauserd\n", req);
+ req->r_t.paused = true;
+ maybe_request_map(osdc);
+ } else if ((req->r_flags & CEPH_OSD_FLAG_WRITE) &&
+ !(req->r_flags & (CEPH_OSD_FLAG_FULL_TRY |
+ CEPH_OSD_FLAG_FULL_FORCE)) &&
+ (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) ||
+ pool_full(osdc, req->r_t.base_oloc.pool))) {
+ dout("req %p full/pool_full\n", req);
+ pr_warn_ratelimited("FULL or reached pool quota\n");
+ req->r_t.paused = true;
+ maybe_request_map(osdc);
+ } else if (!osd_homeless(osd)) {
+ need_send = true;
+ } else {
+ maybe_request_map(osdc);
+ }
+
+ mutex_lock(&osd->lock);
+ /*
+ * Assign the tid atomically with send_request() to protect
+ * multiple writes to the same object from racing with each
+ * other, resulting in out of order ops on the OSDs.
+ */
+ req->r_tid = atomic64_inc_return(&osdc->last_tid);
+ link_request(osd, req);
+ if (need_send)
+ send_request(req);
+ mutex_unlock(&osd->lock);
+ if (ct_res == CALC_TARGET_POOL_DNE)
+ send_map_check(req);
+
+ if (promoted)
+ downgrade_write(&osdc->lock);
+ return;
+
+promote:
+ up_read(&osdc->lock);
+ down_write(&osdc->lock);
+ wrlocked = true;
+ promoted = true;
+ goto again;
+}
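/*
 * Illustrative sketch, not part of this patch: the promote: label above
 * is the usual "drop the read lock, take the write lock, re-check"
 * pattern, since an rwsem cannot be upgraded in place.
 * need_write_access() below is a hypothetical stand-in for the
 * CALC_TARGET_POOL_DNE / missing-osd checks:
 */
static void example_promote(struct rw_semaphore *sem)
{
	bool wrlocked = false;

	down_read(sem);
again:
	if (need_write_access() && !wrlocked) {
		up_read(sem);
		down_write(sem);	/* state may change in the gap */
		wrlocked = true;
		goto again;		/* redo the checks under the new lock */
	}

	/* ... do the work ... */

	if (wrlocked)
		up_write(sem);	/* __submit_request() downgrade_write()s instead */
	else
		up_read(sem);
}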
+
+static void account_request(struct ceph_osd_request *req)
+{
+ unsigned int mask = CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK;
+
+ if (req->r_flags & CEPH_OSD_FLAG_READ) {
+ WARN_ON(req->r_flags & mask);
+ req->r_flags |= CEPH_OSD_FLAG_ACK;
+ } else if (req->r_flags & CEPH_OSD_FLAG_WRITE)
+ WARN_ON(!(req->r_flags & mask));
+ else
+ WARN_ON(1);
+
+ WARN_ON(req->r_unsafe_callback && (req->r_flags & mask) != mask);
+ atomic_inc(&req->r_osdc->num_requests);
+}
+
+static void submit_request(struct ceph_osd_request *req, bool wrlocked)
+{
ceph_osdc_get_request(req);
- list_add_tail(&req->r_linger_item, &osdc->req_linger);
- if (req->r_osd)
- list_add_tail(&req->r_linger_osd_item,
- &req->r_osd->o_linger_requests);
+ account_request(req);
+ __submit_request(req, wrlocked);
}
-static void __unregister_linger_request(struct ceph_osd_client *osdc,
- struct ceph_osd_request *req)
+static void __finish_request(struct ceph_osd_request *req)
{
- WARN_ON(!req->r_linger);
+ struct ceph_osd_client *osdc = req->r_osdc;
+ struct ceph_osd *osd = req->r_osd;
- if (list_empty(&req->r_linger_item)) {
- dout("%s %p tid %llu not registered\n", __func__, req,
- req->r_tid);
+ verify_osd_locked(osd);
+ dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
+
+ WARN_ON(lookup_request_mc(&osdc->map_checks, req->r_tid));
+ unlink_request(osd, req);
+ atomic_dec(&osdc->num_requests);
+
+ /*
+ * If an OSD has failed or returned and a request has been sent
+ * twice, it's possible to get a reply and end up here while the
+ * request message is queued for delivery. We will ignore the
+ * reply, so not a big deal, but better to try and catch it.
+ */
+ ceph_msg_revoke(req->r_request);
+ ceph_msg_revoke_incoming(req->r_reply);
+}
+
+static void finish_request(struct ceph_osd_request *req)
+{
+ __finish_request(req);
+ ceph_osdc_put_request(req);
+}
+
+static void __complete_request(struct ceph_osd_request *req)
+{
+ if (req->r_callback)
+ req->r_callback(req);
+ else
+ complete_all(&req->r_completion);
+}
+
+/*
+ * Note that this is open-coded in handle_reply(), which has to deal
+ * with ack vs commit, dup acks, etc.
+ */
+static void complete_request(struct ceph_osd_request *req, int err)
+{
+ dout("%s req %p tid %llu err %d\n", __func__, req, req->r_tid, err);
+
+ req->r_result = err;
+ __finish_request(req);
+ __complete_request(req);
+ complete_all(&req->r_safe_completion);
+ ceph_osdc_put_request(req);
+}
+
+static void cancel_map_check(struct ceph_osd_request *req)
+{
+ struct ceph_osd_client *osdc = req->r_osdc;
+ struct ceph_osd_request *lookup_req;
+
+ verify_osdc_wrlocked(osdc);
+
+ lookup_req = lookup_request_mc(&osdc->map_checks, req->r_tid);
+ if (!lookup_req)
return;
+
+ WARN_ON(lookup_req != req);
+ erase_request_mc(&osdc->map_checks, req);
+ ceph_osdc_put_request(req);
+}
+
+static void cancel_request(struct ceph_osd_request *req)
+{
+ dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
+
+ cancel_map_check(req);
+ finish_request(req);
+}
+
+static void check_pool_dne(struct ceph_osd_request *req)
+{
+ struct ceph_osd_client *osdc = req->r_osdc;
+ struct ceph_osdmap *map = osdc->osdmap;
+
+ verify_osdc_wrlocked(osdc);
+ WARN_ON(!map->epoch);
+
+ if (req->r_attempts) {
+ /*
+ * We sent a request earlier, which means that
+ * previously the pool existed, and now it does not
+ * (i.e., it was deleted).
+ */
+ req->r_map_dne_bound = map->epoch;
+ dout("%s req %p tid %llu pool disappeared\n", __func__, req,
+ req->r_tid);
+ } else {
+ dout("%s req %p tid %llu map_dne_bound %u have %u\n", __func__,
+ req, req->r_tid, req->r_map_dne_bound, map->epoch);
+ }
+
+ if (req->r_map_dne_bound) {
+ if (map->epoch >= req->r_map_dne_bound) {
+ /* we had a new enough map */
+ pr_info_ratelimited("tid %llu pool does not exist\n",
+ req->r_tid);
+ complete_request(req, -ENOENT);
+ }
+ } else {
+ send_map_check(req);
}
+}
+
+static void map_check_cb(struct ceph_mon_generic_request *greq)
+{
+ struct ceph_osd_client *osdc = &greq->monc->client->osdc;
+ struct ceph_osd_request *req;
+ u64 tid = greq->private_data;
- dout("%s %p tid %llu\n", __func__, req, req->r_tid);
- list_del_init(&req->r_linger_item);
+ WARN_ON(greq->result || !greq->u.newest);
- if (req->r_osd) {
- list_del_init(&req->r_linger_osd_item);
- maybe_move_osd_to_lru(osdc, req->r_osd);
- if (list_empty(&req->r_osd_item))
- req->r_osd = NULL;
+ down_write(&osdc->lock);
+ req = lookup_request_mc(&osdc->map_checks, tid);
+ if (!req) {
+ dout("%s tid %llu dne\n", __func__, tid);
+ goto out_unlock;
}
+
+ dout("%s req %p tid %llu map_dne_bound %u newest %llu\n", __func__,
+ req, req->r_tid, req->r_map_dne_bound, greq->u.newest);
+ if (!req->r_map_dne_bound)
+ req->r_map_dne_bound = greq->u.newest;
+ erase_request_mc(&osdc->map_checks, req);
+ check_pool_dne(req);
+
ceph_osdc_put_request(req);
+out_unlock:
+ up_write(&osdc->lock);
}
-void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc,
- struct ceph_osd_request *req)
+static void send_map_check(struct ceph_osd_request *req)
{
- if (!req->r_linger) {
- dout("set_request_linger %p\n", req);
- req->r_linger = 1;
+ struct ceph_osd_client *osdc = req->r_osdc;
+ struct ceph_osd_request *lookup_req;
+ int ret;
+
+ verify_osdc_wrlocked(osdc);
+
+ lookup_req = lookup_request_mc(&osdc->map_checks, req->r_tid);
+ if (lookup_req) {
+ WARN_ON(lookup_req != req);
+ return;
}
+
+ ceph_osdc_get_request(req);
+ insert_request_mc(&osdc->map_checks, req);
+ ret = ceph_monc_get_version_async(&osdc->client->monc, "osdmap",
+ map_check_cb, req->r_tid);
+ WARN_ON(ret);
}
-EXPORT_SYMBOL(ceph_osdc_set_request_linger);
/*
- * Returns whether a request should be blocked from being sent
- * based on the current osdmap and osd_client settings.
- *
- * Caller should hold map_sem for read.
+ * lingering requests, watch/notify v2 infrastructure
*/
-static bool __req_should_be_paused(struct ceph_osd_client *osdc,
- struct ceph_osd_request *req)
+static void linger_release(struct kref *kref)
+{
+ struct ceph_osd_linger_request *lreq =
+ container_of(kref, struct ceph_osd_linger_request, kref);
+
+ dout("%s lreq %p reg_req %p ping_req %p\n", __func__, lreq,
+ lreq->reg_req, lreq->ping_req);
+ WARN_ON(!RB_EMPTY_NODE(&lreq->node));
+ WARN_ON(!RB_EMPTY_NODE(&lreq->osdc_node));
+ WARN_ON(!RB_EMPTY_NODE(&lreq->mc_node));
+ WARN_ON(!list_empty(&lreq->scan_item));
+ WARN_ON(!list_empty(&lreq->pending_lworks));
+ WARN_ON(lreq->osd);
+
+ if (lreq->reg_req)
+ ceph_osdc_put_request(lreq->reg_req);
+ if (lreq->ping_req)
+ ceph_osdc_put_request(lreq->ping_req);
+ target_destroy(&lreq->t);
+ kfree(lreq);
+}
+
+static void linger_put(struct ceph_osd_linger_request *lreq)
{
- bool pauserd = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD);
- bool pausewr = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR) ||
- ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL);
- return (req->r_flags & CEPH_OSD_FLAG_READ && pauserd) ||
- (req->r_flags & CEPH_OSD_FLAG_WRITE && pausewr);
+ if (lreq)
+ kref_put(&lreq->kref, linger_release);
+}
+
+static struct ceph_osd_linger_request *
+linger_get(struct ceph_osd_linger_request *lreq)
+{
+ kref_get(&lreq->kref);
+ return lreq;
+}
+
+static struct ceph_osd_linger_request *
+linger_alloc(struct ceph_osd_client *osdc)
+{
+ struct ceph_osd_linger_request *lreq;
+
+ lreq = kzalloc(sizeof(*lreq), GFP_NOIO);
+ if (!lreq)
+ return NULL;
+
+ kref_init(&lreq->kref);
+ mutex_init(&lreq->lock);
+ RB_CLEAR_NODE(&lreq->node);
+ RB_CLEAR_NODE(&lreq->osdc_node);
+ RB_CLEAR_NODE(&lreq->mc_node);
+ INIT_LIST_HEAD(&lreq->scan_item);
+ INIT_LIST_HEAD(&lreq->pending_lworks);
+ init_completion(&lreq->reg_commit_wait);
+ init_completion(&lreq->notify_finish_wait);
+
+ lreq->osdc = osdc;
+ target_init(&lreq->t);
+
+ dout("%s lreq %p\n", __func__, lreq);
+ return lreq;
}
+DEFINE_RB_INSDEL_FUNCS(linger, struct ceph_osd_linger_request, linger_id, node)
+DEFINE_RB_FUNCS(linger_osdc, struct ceph_osd_linger_request, linger_id, osdc_node)
+DEFINE_RB_FUNCS(linger_mc, struct ceph_osd_linger_request, linger_id, mc_node)
+
/*
- * Calculate mapping of a request to a PG. Takes tiering into account.
+ * Create linger request <-> OSD session relation.
+ *
+ * @lreq has to be registered, @osd may be homeless.
*/
-static int __calc_request_pg(struct ceph_osdmap *osdmap,
- struct ceph_osd_request *req,
- struct ceph_pg *pg_out)
+static void link_linger(struct ceph_osd *osd,
+ struct ceph_osd_linger_request *lreq)
{
- bool need_check_tiering;
+ verify_osd_locked(osd);
+ WARN_ON(!lreq->linger_id || lreq->osd);
+ dout("%s osd %p osd%d lreq %p linger_id %llu\n", __func__, osd,
+ osd->o_osd, lreq, lreq->linger_id);
- need_check_tiering = false;
- if (req->r_target_oloc.pool == -1) {
- req->r_target_oloc = req->r_base_oloc; /* struct */
- need_check_tiering = true;
+ if (!osd_homeless(osd))
+ __remove_osd_from_lru(osd);
+ else
+ atomic_inc(&osd->o_osdc->num_homeless);
+
+ get_osd(osd);
+ insert_linger(&osd->o_linger_requests, lreq);
+ lreq->osd = osd;
+}
+
+static void unlink_linger(struct ceph_osd *osd,
+ struct ceph_osd_linger_request *lreq)
+{
+ verify_osd_locked(osd);
+ WARN_ON(lreq->osd != osd);
+ dout("%s osd %p osd%d lreq %p linger_id %llu\n", __func__, osd,
+ osd->o_osd, lreq, lreq->linger_id);
+
+ lreq->osd = NULL;
+ erase_linger(&osd->o_linger_requests, lreq);
+ put_osd(osd);
+
+ if (!osd_homeless(osd))
+ maybe_move_osd_to_lru(osd);
+ else
+ atomic_dec(&osd->o_osdc->num_homeless);
+}
+
+static bool __linger_registered(struct ceph_osd_linger_request *lreq)
+{
+ verify_osdc_locked(lreq->osdc);
+
+ return !RB_EMPTY_NODE(&lreq->osdc_node);
+}
+
+static bool linger_registered(struct ceph_osd_linger_request *lreq)
+{
+ struct ceph_osd_client *osdc = lreq->osdc;
+ bool registered;
+
+ down_read(&osdc->lock);
+ registered = __linger_registered(lreq);
+ up_read(&osdc->lock);
+
+ return registered;
+}
+
+static void linger_register(struct ceph_osd_linger_request *lreq)
+{
+ struct ceph_osd_client *osdc = lreq->osdc;
+
+ verify_osdc_wrlocked(osdc);
+ WARN_ON(lreq->linger_id);
+
+ linger_get(lreq);
+ lreq->linger_id = ++osdc->last_linger_id;
+ insert_linger_osdc(&osdc->linger_requests, lreq);
+}
+
+static void linger_unregister(struct ceph_osd_linger_request *lreq)
+{
+ struct ceph_osd_client *osdc = lreq->osdc;
+
+ verify_osdc_wrlocked(osdc);
+
+ erase_linger_osdc(&osdc->linger_requests, lreq);
+ linger_put(lreq);
+}
+
+static void cancel_linger_request(struct ceph_osd_request *req)
+{
+ struct ceph_osd_linger_request *lreq = req->r_priv;
+
+ WARN_ON(!req->r_linger);
+ cancel_request(req);
+ linger_put(lreq);
+}
+
+struct linger_work {
+ struct work_struct work;
+ struct ceph_osd_linger_request *lreq;
+ struct list_head pending_item;
+ unsigned long queued_stamp;
+
+ union {
+ struct {
+ u64 notify_id;
+ u64 notifier_id;
+ void *payload; /* points into @msg front */
+ size_t payload_len;
+
+ struct ceph_msg *msg; /* for ceph_msg_put() */
+ } notify;
+ struct {
+ int err;
+ } error;
+ };
+};
+
+static struct linger_work *lwork_alloc(struct ceph_osd_linger_request *lreq,
+ work_func_t workfn)
+{
+ struct linger_work *lwork;
+
+ lwork = kzalloc(sizeof(*lwork), GFP_NOIO);
+ if (!lwork)
+ return NULL;
+
+ INIT_WORK(&lwork->work, workfn);
+ INIT_LIST_HEAD(&lwork->pending_item);
+ lwork->lreq = linger_get(lreq);
+
+ return lwork;
+}
+
+static void lwork_free(struct linger_work *lwork)
+{
+ struct ceph_osd_linger_request *lreq = lwork->lreq;
+
+ mutex_lock(&lreq->lock);
+ list_del(&lwork->pending_item);
+ mutex_unlock(&lreq->lock);
+
+ linger_put(lreq);
+ kfree(lwork);
+}
+
+static void lwork_queue(struct linger_work *lwork)
+{
+ struct ceph_osd_linger_request *lreq = lwork->lreq;
+ struct ceph_osd_client *osdc = lreq->osdc;
+
+ verify_lreq_locked(lreq);
+ WARN_ON(!list_empty(&lwork->pending_item));
+
+ lwork->queued_stamp = jiffies;
+ list_add_tail(&lwork->pending_item, &lreq->pending_lworks);
+ queue_work(osdc->notify_wq, &lwork->work);
+}
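/*
 * Illustrative note (an interpretation, not stated in the patch):
 * linger_work defers watch notify/error callbacks to osdc->notify_wq,
 * presumably so user callbacks never run in the message-dispatch path
 * or under osd_client locks; each lwork pins its lreq via linger_get()
 * until lwork_free().
 */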
+
+static void do_watch_notify(struct work_struct *w)
+{
+ struct linger_work *lwork = container_of(w, struct linger_work, work);
+ struct ceph_osd_linger_request *lreq = lwork->lreq;
+
+ if (!linger_registered(lreq)) {
+ dout("%s lreq %p not registered\n", __func__, lreq);
+ goto out;
}
- if (req->r_target_oid.name_len == 0) {
- ceph_oid_copy(&req->r_target_oid, &req->r_base_oid);
- need_check_tiering = true;
+
+ WARN_ON(!lreq->is_watch);
+ dout("%s lreq %p notify_id %llu notifier_id %llu payload_len %zu\n",
+ __func__, lreq, lwork->notify.notify_id, lwork->notify.notifier_id,
+ lwork->notify.payload_len);
+ lreq->wcb(lreq->data, lwork->notify.notify_id, lreq->linger_id,
+ lwork->notify.notifier_id, lwork->notify.payload,
+ lwork->notify.payload_len);
+
+out:
+ ceph_msg_put(lwork->notify.msg);
+ lwork_free(lwork);
+}
+
+static void do_watch_error(struct work_struct *w)
+{
+ struct linger_work *lwork = container_of(w, struct linger_work, work);
+ struct ceph_osd_linger_request *lreq = lwork->lreq;
+
+ if (!linger_registered(lreq)) {
+ dout("%s lreq %p not registered\n", __func__, lreq);
+ goto out;
}
- if (need_check_tiering &&
- (req->r_flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) {
- struct ceph_pg_pool_info *pi;
-
- pi = ceph_pg_pool_by_id(osdmap, req->r_target_oloc.pool);
- if (pi) {
- if ((req->r_flags & CEPH_OSD_FLAG_READ) &&
- pi->read_tier >= 0)
- req->r_target_oloc.pool = pi->read_tier;
- if ((req->r_flags & CEPH_OSD_FLAG_WRITE) &&
- pi->write_tier >= 0)
- req->r_target_oloc.pool = pi->write_tier;
+ dout("%s lreq %p err %d\n", __func__, lreq, lwork->error.err);
+ lreq->errcb(lreq->data, lreq->linger_id, lwork->error.err);
+
+out:
+ lwork_free(lwork);
+}
+
+static void queue_watch_error(struct ceph_osd_linger_request *lreq)
+{
+ struct linger_work *lwork;
+
+ lwork = lwork_alloc(lreq, do_watch_error);
+ if (!lwork) {
+ pr_err("failed to allocate error-lwork\n");
+ return;
+ }
+
+ lwork->error.err = lreq->last_error;
+ lwork_queue(lwork);
+}
+
+static void linger_reg_commit_complete(struct ceph_osd_linger_request *lreq,
+ int result)
+{
+ if (!completion_done(&lreq->reg_commit_wait)) {
+ lreq->reg_commit_error = (result <= 0 ? result : 0);
+ complete_all(&lreq->reg_commit_wait);
+ }
+}
+
+static void linger_commit_cb(struct ceph_osd_request *req)
+{
+ struct ceph_osd_linger_request *lreq = req->r_priv;
+
+ mutex_lock(&lreq->lock);
+ dout("%s lreq %p linger_id %llu result %d\n", __func__, lreq,
+ lreq->linger_id, req->r_result);
+ WARN_ON(!__linger_registered(lreq));
+ linger_reg_commit_complete(lreq, req->r_result);
+ lreq->committed = true;
+
+ if (!lreq->is_watch) {
+ struct ceph_osd_data *osd_data =
+ osd_req_op_data(req, 0, notify, response_data);
+ void *p = page_address(osd_data->pages[0]);
+
+ WARN_ON(req->r_ops[0].op != CEPH_OSD_OP_NOTIFY ||
+ osd_data->type != CEPH_OSD_DATA_TYPE_PAGES);
+
+ /* make note of the notify_id */
+ if (req->r_ops[0].outdata_len >= sizeof(u64)) {
+ lreq->notify_id = ceph_decode_64(&p);
+ dout("lreq %p notify_id %llu\n", lreq,
+ lreq->notify_id);
+ } else {
+ dout("lreq %p no notify_id\n", lreq);
}
- /* !pi is caught in ceph_oloc_oid_to_pg() */
}
- return ceph_oloc_oid_to_pg(osdmap, &req->r_target_oloc,
- &req->r_target_oid, pg_out);
+ mutex_unlock(&lreq->lock);
+ linger_put(lreq);
}
-static void __enqueue_request(struct ceph_osd_request *req)
+static int normalize_watch_error(int err)
{
- struct ceph_osd_client *osdc = req->r_osdc;
+ /*
+ * Translate ENOENT -> ENOTCONN so that a delete->disconnection
+ * notification and a failure to reconnect because we raced with
+ * the delete appear the same to the user.
+ */
+ if (err == -ENOENT)
+ err = -ENOTCONN;
+
+ return err;
+}
+
+static void linger_reconnect_cb(struct ceph_osd_request *req)
+{
+ struct ceph_osd_linger_request *lreq = req->r_priv;
+
+ mutex_lock(&lreq->lock);
+ dout("%s lreq %p linger_id %llu result %d last_error %d\n", __func__,
+ lreq, lreq->linger_id, req->r_result, lreq->last_error);
+ if (req->r_result < 0) {
+ if (!lreq->last_error) {
+ lreq->last_error = normalize_watch_error(req->r_result);
+ queue_watch_error(lreq);
+ }
+ }
+
+ mutex_unlock(&lreq->lock);
+ linger_put(lreq);
+}
+
+static void send_linger(struct ceph_osd_linger_request *lreq)
+{
+ struct ceph_osd_request *req = lreq->reg_req;
+ struct ceph_osd_req_op *op = &req->r_ops[0];
- dout("%s %p tid %llu to osd%d\n", __func__, req, req->r_tid,
- req->r_osd ? req->r_osd->o_osd : -1);
+ verify_osdc_wrlocked(req->r_osdc);
+ dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id);
- if (req->r_osd) {
- __remove_osd_from_lru(req->r_osd);
- list_add_tail(&req->r_osd_item, &req->r_osd->o_requests);
- list_move_tail(&req->r_req_lru_item, &osdc->req_unsent);
+ if (req->r_osd)
+ cancel_linger_request(req);
+
+ request_reinit(req);
+ ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid);
+ ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc);
+ req->r_flags = lreq->t.flags;
+ req->r_mtime = lreq->mtime;
+
+ mutex_lock(&lreq->lock);
+ if (lreq->is_watch && lreq->committed) {
+ WARN_ON(op->op != CEPH_OSD_OP_WATCH ||
+ op->watch.cookie != lreq->linger_id);
+ op->watch.op = CEPH_OSD_WATCH_OP_RECONNECT;
+ op->watch.gen = ++lreq->register_gen;
+ dout("lreq %p reconnect register_gen %u\n", lreq,
+ op->watch.gen);
+ req->r_callback = linger_reconnect_cb;
} else {
- list_move_tail(&req->r_req_lru_item, &osdc->req_notarget);
+ if (!lreq->is_watch)
+ lreq->notify_id = 0;
+ else
+ WARN_ON(op->watch.op != CEPH_OSD_WATCH_OP_WATCH);
+ dout("lreq %p register\n", lreq);
+ req->r_callback = linger_commit_cb;
}
+ mutex_unlock(&lreq->lock);
+
+ req->r_priv = linger_get(lreq);
+ req->r_linger = true;
+
+ submit_request(req, true);
}
-/*
- * Pick an osd (the first 'up' osd in the pg), allocate the osd struct
- * (as needed), and set the request r_osd appropriately. If there is
- * no up osd, set r_osd to NULL. Move the request to the appropriate list
- * (unsent, homeless) or leave on in-flight lru.
- *
- * Return 0 if unchanged, 1 if changed, or negative on error.
- *
- * Caller should hold map_sem for read and request_mutex.
- */
-static int __map_request(struct ceph_osd_client *osdc,
- struct ceph_osd_request *req, int force_resend)
+static void linger_ping_cb(struct ceph_osd_request *req)
{
- struct ceph_pg pgid;
- int acting[CEPH_PG_MAX_SIZE];
- int num, o;
- int err;
- bool was_paused;
-
- dout("map_request %p tid %lld\n", req, req->r_tid);
-
- err = __calc_request_pg(osdc->osdmap, req, &pgid);
- if (err) {
- list_move(&req->r_req_lru_item, &osdc->req_notarget);
- return err;
- }
- req->r_pgid = pgid;
-
- num = ceph_calc_pg_acting(osdc->osdmap, pgid, acting, &o);
- if (num < 0)
- num = 0;
-
- was_paused = req->r_paused;
- req->r_paused = __req_should_be_paused(osdc, req);
- if (was_paused && !req->r_paused)
- force_resend = 1;
-
- if ((!force_resend &&
- req->r_osd && req->r_osd->o_osd == o &&
- req->r_sent >= req->r_osd->o_incarnation &&
- req->r_num_pg_osds == num &&
- memcmp(req->r_pg_osds, acting, sizeof(acting[0])*num) == 0) ||
- (req->r_osd == NULL && o == -1) ||
- req->r_paused)
- return 0; /* no change */
-
- dout("map_request tid %llu pgid %lld.%x osd%d (was osd%d)\n",
- req->r_tid, pgid.pool, pgid.seed, o,
- req->r_osd ? req->r_osd->o_osd : -1);
-
- /* record full pg acting set */
- memcpy(req->r_pg_osds, acting, sizeof(acting[0]) * num);
- req->r_num_pg_osds = num;
-
- if (req->r_osd) {
- __cancel_request(req);
- list_del_init(&req->r_osd_item);
- list_del_init(&req->r_linger_osd_item);
- req->r_osd = NULL;
- }
-
- req->r_osd = __lookup_osd(osdc, o);
- if (!req->r_osd && o >= 0) {
- err = -ENOMEM;
- req->r_osd = create_osd(osdc, o);
- if (!req->r_osd) {
- list_move(&req->r_req_lru_item, &osdc->req_notarget);
- goto out;
+ struct ceph_osd_linger_request *lreq = req->r_priv;
+
+ mutex_lock(&lreq->lock);
+ dout("%s lreq %p linger_id %llu result %d ping_sent %lu last_error %d\n",
+ __func__, lreq, lreq->linger_id, req->r_result, lreq->ping_sent,
+ lreq->last_error);
+ if (lreq->register_gen == req->r_ops[0].watch.gen) {
+ if (!req->r_result) {
+ lreq->watch_valid_thru = lreq->ping_sent;
+ } else if (!lreq->last_error) {
+ lreq->last_error = normalize_watch_error(req->r_result);
+ queue_watch_error(lreq);
}
+ } else {
+ dout("lreq %p register_gen %u ignoring old pong %u\n", lreq,
+ lreq->register_gen, req->r_ops[0].watch.gen);
+ }
+
+ mutex_unlock(&lreq->lock);
+ linger_put(lreq);
+}
- dout("map_request osd %p is osd%d\n", req->r_osd, o);
- __insert_osd(osdc, req->r_osd);
+static void send_linger_ping(struct ceph_osd_linger_request *lreq)
+{
+ struct ceph_osd_client *osdc = lreq->osdc;
+ struct ceph_osd_request *req = lreq->ping_req;
+ struct ceph_osd_req_op *op = &req->r_ops[0];
- ceph_con_open(&req->r_osd->o_con,
- CEPH_ENTITY_TYPE_OSD, o,
- &osdc->osdmap->osd_addr[o]);
+ if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD)) {
+ dout("%s PAUSERD\n", __func__);
+ return;
}
- __enqueue_request(req);
- err = 1; /* osd or pg changed */
+ lreq->ping_sent = jiffies;
+ dout("%s lreq %p linger_id %llu ping_sent %lu register_gen %u\n",
+ __func__, lreq, lreq->linger_id, lreq->ping_sent,
+ lreq->register_gen);
-out:
- return err;
+ if (req->r_osd)
+ cancel_linger_request(req);
+
+ request_reinit(req);
+ target_copy(&req->r_t, &lreq->t);
+
+ WARN_ON(op->op != CEPH_OSD_OP_WATCH ||
+ op->watch.cookie != lreq->linger_id ||
+ op->watch.op != CEPH_OSD_WATCH_OP_PING);
+ op->watch.gen = lreq->register_gen;
+ req->r_callback = linger_ping_cb;
+ req->r_priv = linger_get(lreq);
+ req->r_linger = true;
+
+ ceph_osdc_get_request(req);
+ account_request(req);
+ req->r_tid = atomic64_inc_return(&osdc->last_tid);
+ link_request(lreq->osd, req);
+ send_request(req);
}
-/*
- * caller should hold map_sem (for read) and request_mutex
- */
-static void __send_request(struct ceph_osd_client *osdc,
- struct ceph_osd_request *req)
+static void linger_submit(struct ceph_osd_linger_request *lreq)
{
- void *p;
+ struct ceph_osd_client *osdc = lreq->osdc;
+ struct ceph_osd *osd;
- dout("send_request %p tid %llu to osd%d flags %d pg %lld.%x\n",
- req, req->r_tid, req->r_osd->o_osd, req->r_flags,
- (unsigned long long)req->r_pgid.pool, req->r_pgid.seed);
+ calc_target(osdc, &lreq->t, &lreq->last_force_resend, false);
+ osd = lookup_create_osd(osdc, lreq->t.osd, true);
+ link_linger(osd, lreq);
- /* fill in message content that changes each time we send it */
- put_unaligned_le32(osdc->osdmap->epoch, req->r_request_osdmap_epoch);
- put_unaligned_le32(req->r_flags, req->r_request_flags);
- put_unaligned_le64(req->r_target_oloc.pool, req->r_request_pool);
- p = req->r_request_pgid;
- ceph_encode_64(&p, req->r_pgid.pool);
- ceph_encode_32(&p, req->r_pgid.seed);
- put_unaligned_le64(1, req->r_request_attempts); /* FIXME */
- memcpy(req->r_request_reassert_version, &req->r_reassert_version,
- sizeof(req->r_reassert_version));
+ send_linger(lreq);
+}
- req->r_stamp = jiffies;
- list_move_tail(&req->r_req_lru_item, &osdc->req_lru);
+static void cancel_linger_map_check(struct ceph_osd_linger_request *lreq)
+{
+ struct ceph_osd_client *osdc = lreq->osdc;
+ struct ceph_osd_linger_request *lookup_lreq;
- ceph_msg_get(req->r_request); /* send consumes a ref */
+ verify_osdc_wrlocked(osdc);
- req->r_sent = req->r_osd->o_incarnation;
+ lookup_lreq = lookup_linger_mc(&osdc->linger_map_checks,
+ lreq->linger_id);
+ if (!lookup_lreq)
+ return;
- ceph_con_send(&req->r_osd->o_con, req->r_request);
+ WARN_ON(lookup_lreq != lreq);
+ erase_linger_mc(&osdc->linger_map_checks, lreq);
+ linger_put(lreq);
}
/*
- * Send any requests in the queue (req_unsent).
+ * @lreq has to be both registered and linked.
*/
-static void __send_queued(struct ceph_osd_client *osdc)
+static void __linger_cancel(struct ceph_osd_linger_request *lreq)
{
- struct ceph_osd_request *req, *tmp;
+ if (lreq->is_watch && lreq->ping_req->r_osd)
+ cancel_linger_request(lreq->ping_req);
+ if (lreq->reg_req->r_osd)
+ cancel_linger_request(lreq->reg_req);
+ cancel_linger_map_check(lreq);
+ unlink_linger(lreq->osd, lreq);
+ linger_unregister(lreq);
+}
+
+static void linger_cancel(struct ceph_osd_linger_request *lreq)
+{
+ struct ceph_osd_client *osdc = lreq->osdc;
- dout("__send_queued\n");
- list_for_each_entry_safe(req, tmp, &osdc->req_unsent, r_req_lru_item)
- __send_request(osdc, req);
+ down_write(&osdc->lock);
+ if (__linger_registered(lreq))
+ __linger_cancel(lreq);
+ up_write(&osdc->lock);
}
-/*
- * Caller should hold map_sem for read and request_mutex.
- */
-static int __ceph_osdc_start_request(struct ceph_osd_client *osdc,
- struct ceph_osd_request *req,
- bool nofail)
-{
- int rc;
-
- __register_request(osdc, req);
- req->r_sent = 0;
- req->r_got_reply = 0;
- rc = __map_request(osdc, req, 0);
- if (rc < 0) {
- if (nofail) {
- dout("osdc_start_request failed map, "
- " will retry %lld\n", req->r_tid);
- rc = 0;
- } else {
- __unregister_request(osdc, req);
- }
- return rc;
+static void send_linger_map_check(struct ceph_osd_linger_request *lreq);
+
+static void check_linger_pool_dne(struct ceph_osd_linger_request *lreq)
+{
+ struct ceph_osd_client *osdc = lreq->osdc;
+ struct ceph_osdmap *map = osdc->osdmap;
+
+ verify_osdc_wrlocked(osdc);
+ WARN_ON(!map->epoch);
+
+ if (lreq->register_gen) {
+ lreq->map_dne_bound = map->epoch;
+ dout("%s lreq %p linger_id %llu pool disappeared\n", __func__,
+ lreq, lreq->linger_id);
+ } else {
+ dout("%s lreq %p linger_id %llu map_dne_bound %u have %u\n",
+ __func__, lreq, lreq->linger_id, lreq->map_dne_bound,
+ map->epoch);
}
- if (req->r_osd == NULL) {
- dout("send_request %p no up osds in pg\n", req);
- ceph_monc_request_next_osdmap(&osdc->client->monc);
+ if (lreq->map_dne_bound) {
+ if (map->epoch >= lreq->map_dne_bound) {
+ /* we had a new enough map */
+ pr_info("linger_id %llu pool does not exist\n",
+ lreq->linger_id);
+ linger_reg_commit_complete(lreq, -ENOENT);
+ __linger_cancel(lreq);
+ }
} else {
- __send_queued(osdc);
+ send_linger_map_check(lreq);
}
+}
- return 0;
+static void linger_map_check_cb(struct ceph_mon_generic_request *greq)
+{
+ struct ceph_osd_client *osdc = &greq->monc->client->osdc;
+ struct ceph_osd_linger_request *lreq;
+ u64 linger_id = greq->private_data;
+
+ WARN_ON(greq->result || !greq->u.newest);
+
+ down_write(&osdc->lock);
+ lreq = lookup_linger_mc(&osdc->linger_map_checks, linger_id);
+ if (!lreq) {
+ dout("%s linger_id %llu dne\n", __func__, linger_id);
+ goto out_unlock;
+ }
+
+ dout("%s lreq %p linger_id %llu map_dne_bound %u newest %llu\n",
+ __func__, lreq, lreq->linger_id, lreq->map_dne_bound,
+ greq->u.newest);
+ if (!lreq->map_dne_bound)
+ lreq->map_dne_bound = greq->u.newest;
+ erase_linger_mc(&osdc->linger_map_checks, lreq);
+ check_linger_pool_dne(lreq);
+
+ linger_put(lreq);
+out_unlock:
+ up_write(&osdc->lock);
+}
+
+static void send_linger_map_check(struct ceph_osd_linger_request *lreq)
+{
+ struct ceph_osd_client *osdc = lreq->osdc;
+ struct ceph_osd_linger_request *lookup_lreq;
+ int ret;
+
+ verify_osdc_wrlocked(osdc);
+
+ lookup_lreq = lookup_linger_mc(&osdc->linger_map_checks,
+ lreq->linger_id);
+ if (lookup_lreq) {
+ WARN_ON(lookup_lreq != lreq);
+ return;
+ }
+
+ linger_get(lreq);
+ insert_linger_mc(&osdc->linger_map_checks, lreq);
+ ret = ceph_monc_get_version_async(&osdc->client->monc, "osdmap",
+ linger_map_check_cb, lreq->linger_id);
+ WARN_ON(ret);
+}
+
+static int linger_reg_commit_wait(struct ceph_osd_linger_request *lreq)
+{
+ int ret;
+
+ dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id);
+ ret = wait_for_completion_interruptible(&lreq->reg_commit_wait);
+ return ret ?: lreq->reg_commit_error;
+}
+
+static int linger_notify_finish_wait(struct ceph_osd_linger_request *lreq)
+{
+ int ret;
+
+ dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id);
+ ret = wait_for_completion_interruptible(&lreq->notify_finish_wait);
+ return ret ?: lreq->notify_finish_error;
}
/*
- * Timeout callback, called every N seconds when 1 or more osd
- * requests has been active for more than N seconds. When this
- * happens, we ping all OSDs with requests who have timed out to
- * ensure any communications channel reset is detected. Reset the
- * request timeouts another N seconds in the future as we go.
- * Reschedule the timeout event another N seconds in future (unless
- * there are no open requests).
+ * Timeout callback, called every N seconds.  When one or more OSD
+ * requests have been active for more than N seconds, we send a keepalive
+ * (tag + timestamp) to their OSDs to ensure any communications channel
+ * reset is detected.
*/
static void handle_timeout(struct work_struct *work)
{
struct ceph_osd_client *osdc =
container_of(work, struct ceph_osd_client, timeout_work.work);
struct ceph_options *opts = osdc->client->options;
- struct ceph_osd_request *req;
- struct ceph_osd *osd;
- struct list_head slow_osds;
- dout("timeout\n");
- down_read(&osdc->map_sem);
-
- ceph_monc_request_next_osdmap(&osdc->client->monc);
+ unsigned long cutoff = jiffies - opts->osd_keepalive_timeout;
+ LIST_HEAD(slow_osds);
+ struct rb_node *n, *p;
- mutex_lock(&osdc->request_mutex);
+ dout("%s osdc %p\n", __func__, osdc);
+ down_write(&osdc->lock);
/*
* ping osds that are a bit slow. this ensures that if there
* is a break in the TCP connection we will notice, and reopen
* a connection with that osd (from the fault callback).
*/
- INIT_LIST_HEAD(&slow_osds);
- list_for_each_entry(req, &osdc->req_lru, r_req_lru_item) {
- if (time_before(jiffies,
- req->r_stamp + opts->osd_keepalive_timeout))
- break;
+ for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
+ struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
+ bool found = false;
+
+ for (p = rb_first(&osd->o_requests); p; p = rb_next(p)) {
+ struct ceph_osd_request *req =
+ rb_entry(p, struct ceph_osd_request, r_node);
+
+ if (time_before(req->r_stamp, cutoff)) {
+ dout(" req %p tid %llu on osd%d is laggy\n",
+ req, req->r_tid, osd->o_osd);
+ found = true;
+ }
+ }
+ for (p = rb_first(&osd->o_linger_requests); p; p = rb_next(p)) {
+ struct ceph_osd_linger_request *lreq =
+ rb_entry(p, struct ceph_osd_linger_request, node);
+
+ dout(" lreq %p linger_id %llu is served by osd%d\n",
+ lreq, lreq->linger_id, osd->o_osd);
+ found = true;
+
+ mutex_lock(&lreq->lock);
+ if (lreq->is_watch && lreq->committed && !lreq->last_error)
+ send_linger_ping(lreq);
+ mutex_unlock(&lreq->lock);
+ }
- osd = req->r_osd;
- BUG_ON(!osd);
- dout(" tid %llu is slow, will send keepalive on osd%d\n",
- req->r_tid, osd->o_osd);
- list_move_tail(&osd->o_keepalive_item, &slow_osds);
+ if (found)
+ list_move_tail(&osd->o_keepalive_item, &slow_osds);
}
+
+ if (atomic_read(&osdc->num_homeless) || !list_empty(&slow_osds))
+ maybe_request_map(osdc);
+
while (!list_empty(&slow_osds)) {
- osd = list_entry(slow_osds.next, struct ceph_osd,
- o_keepalive_item);
+ struct ceph_osd *osd = list_first_entry(&slow_osds,
+ struct ceph_osd,
+ o_keepalive_item);
list_del_init(&osd->o_keepalive_item);
ceph_con_keepalive(&osd->o_con);
}
- __schedule_osd_timeout(osdc);
- __send_queued(osdc);
- mutex_unlock(&osdc->request_mutex);
- up_read(&osdc->map_sem);
+ up_write(&osdc->lock);
+ schedule_delayed_work(&osdc->timeout_work,
+ osdc->client->options->osd_keepalive_timeout);
}
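/*
 * Illustrative sketch, not part of the patch: the laggy test in
 * handle_timeout() above reduces to the wrap-safe jiffies comparison
 * below (time_before() from <linux/jiffies.h>).  The helper name is
 * made up for illustration.
 */
static bool example_req_is_laggy(unsigned long r_stamp,
				 unsigned long keepalive_timeout)
{
	/* anything stamped before (now - keepalive_timeout) is laggy */
	unsigned long cutoff = jiffies - keepalive_timeout;

	return time_before(r_stamp, cutoff);
}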
static void handle_osds_timeout(struct work_struct *work)
@@ -1663,12 +2541,20 @@ static void handle_osds_timeout(struct work_struct *work)
container_of(work, struct ceph_osd_client,
osds_timeout_work.work);
unsigned long delay = osdc->client->options->osd_idle_ttl / 4;
+ struct ceph_osd *osd, *nosd;
- dout("osds timeout\n");
- down_read(&osdc->map_sem);
- remove_old_osds(osdc);
- up_read(&osdc->map_sem);
+ dout("%s osdc %p\n", __func__, osdc);
+ down_write(&osdc->lock);
+ list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) {
+ if (time_before(jiffies, osd->lru_ttl))
+ break;
+ WARN_ON(!RB_EMPTY_ROOT(&osd->o_requests));
+ WARN_ON(!RB_EMPTY_ROOT(&osd->o_linger_requests));
+ close_osd(osd);
+ }
+
+ up_write(&osdc->lock);
schedule_delayed_work(&osdc->osds_timeout_work,
round_jiffies_relative(delay));
}
@@ -1776,107 +2662,76 @@ e_inval:
goto out;
}
-static void complete_request(struct ceph_osd_request *req)
-{
- complete_all(&req->r_safe_completion); /* fsync waiter */
-}
+struct MOSDOpReply {
+ struct ceph_pg pgid;
+ u64 flags;
+ int result;
+ u32 epoch;
+ int num_ops;
+ u32 outdata_len[CEPH_OSD_MAX_OPS];
+ s32 rval[CEPH_OSD_MAX_OPS];
+ int retry_attempt;
+ struct ceph_eversion replay_version;
+ u64 user_version;
+ struct ceph_request_redirect redirect;
+};
-/*
- * handle osd op reply. either call the callback if it is specified,
- * or do the completion to wake up the waiting thread.
- */
-static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg)
+static int decode_MOSDOpReply(const struct ceph_msg *msg, struct MOSDOpReply *m)
{
- void *p, *end;
- struct ceph_osd_request *req;
- struct ceph_request_redirect redir;
- u64 tid;
- int object_len;
- unsigned int numops;
- int payload_len, flags;
- s32 result;
- s32 retry_attempt;
- struct ceph_pg pg;
- int err;
- u32 reassert_epoch;
- u64 reassert_version;
- u32 osdmap_epoch;
- int already_completed;
- u32 bytes;
+ void *p = msg->front.iov_base;
+ void *const end = p + msg->front.iov_len;
+ u16 version = le16_to_cpu(msg->hdr.version);
+ struct ceph_eversion bad_replay_version;
u8 decode_redir;
- unsigned int i;
-
- tid = le64_to_cpu(msg->hdr.tid);
- dout("handle_reply %p tid %llu\n", msg, tid);
+ u32 len;
+ int ret;
+ int i;
- p = msg->front.iov_base;
- end = p + msg->front.iov_len;
+ ceph_decode_32_safe(&p, end, len, e_inval);
+ ceph_decode_need(&p, end, len, e_inval);
+ p += len; /* skip oid */
- ceph_decode_need(&p, end, 4, bad);
- object_len = ceph_decode_32(&p);
- ceph_decode_need(&p, end, object_len, bad);
- p += object_len;
+ ret = ceph_decode_pgid(&p, end, &m->pgid);
+ if (ret)
+ return ret;
- err = ceph_decode_pgid(&p, end, &pg);
- if (err)
- goto bad;
+ ceph_decode_64_safe(&p, end, m->flags, e_inval);
+ ceph_decode_32_safe(&p, end, m->result, e_inval);
+ ceph_decode_need(&p, end, sizeof(bad_replay_version), e_inval);
+ memcpy(&bad_replay_version, p, sizeof(bad_replay_version));
+ p += sizeof(bad_replay_version);
+ ceph_decode_32_safe(&p, end, m->epoch, e_inval);
- ceph_decode_need(&p, end, 8 + 4 + 4 + 8 + 4, bad);
- flags = ceph_decode_64(&p);
- result = ceph_decode_32(&p);
- reassert_epoch = ceph_decode_32(&p);
- reassert_version = ceph_decode_64(&p);
- osdmap_epoch = ceph_decode_32(&p);
-
- /* lookup */
- down_read(&osdc->map_sem);
- mutex_lock(&osdc->request_mutex);
- req = __lookup_request(osdc, tid);
- if (req == NULL) {
- dout("handle_reply tid %llu dne\n", tid);
- goto bad_mutex;
- }
- ceph_osdc_get_request(req);
+ ceph_decode_32_safe(&p, end, m->num_ops, e_inval);
+ if (m->num_ops > ARRAY_SIZE(m->outdata_len))
+ goto e_inval;
- dout("handle_reply %p tid %llu req %p result %d\n", msg, tid,
- req, result);
-
- ceph_decode_need(&p, end, 4, bad_put);
- numops = ceph_decode_32(&p);
- if (numops > CEPH_OSD_MAX_OPS)
- goto bad_put;
- if (numops != req->r_num_ops)
- goto bad_put;
- payload_len = 0;
- ceph_decode_need(&p, end, numops * sizeof(struct ceph_osd_op), bad_put);
- for (i = 0; i < numops; i++) {
+ ceph_decode_need(&p, end, m->num_ops * sizeof(struct ceph_osd_op),
+ e_inval);
+ for (i = 0; i < m->num_ops; i++) {
struct ceph_osd_op *op = p;
- int len;
- len = le32_to_cpu(op->payload_len);
- req->r_ops[i].outdata_len = len;
- dout(" op %d has %d bytes\n", i, len);
- payload_len += len;
+ m->outdata_len[i] = le32_to_cpu(op->payload_len);
p += sizeof(*op);
}
- bytes = le32_to_cpu(msg->hdr.data_len);
- if (payload_len != bytes) {
- pr_warn("sum of op payload lens %d != data_len %d\n",
- payload_len, bytes);
- goto bad_put;
- }
- ceph_decode_need(&p, end, 4 + numops * 4, bad_put);
- retry_attempt = ceph_decode_32(&p);
- for (i = 0; i < numops; i++)
- req->r_ops[i].rval = ceph_decode_32(&p);
+ ceph_decode_32_safe(&p, end, m->retry_attempt, e_inval);
+ for (i = 0; i < m->num_ops; i++)
+ ceph_decode_32_safe(&p, end, m->rval[i], e_inval);
- if (le16_to_cpu(msg->hdr.version) >= 6) {
- p += 8 + 4; /* skip replay_version */
- p += 8; /* skip user_version */
+ if (version >= 5) {
+ ceph_decode_need(&p, end, sizeof(m->replay_version), e_inval);
+ memcpy(&m->replay_version, p, sizeof(m->replay_version));
+ p += sizeof(m->replay_version);
+ ceph_decode_64_safe(&p, end, m->user_version, e_inval);
+ } else {
+ m->replay_version = bad_replay_version; /* struct */
+ m->user_version = le64_to_cpu(m->replay_version.version);
+ }
- if (le16_to_cpu(msg->hdr.version) >= 7)
- ceph_decode_8_safe(&p, end, decode_redir, bad_put);
+ if (version >= 6) {
+ if (version >= 7)
+ ceph_decode_8_safe(&p, end, decode_redir, e_inval);
else
decode_redir = 1;
} else {
@@ -1884,228 +2739,410 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg)
}
if (decode_redir) {
- err = ceph_redirect_decode(&p, end, &redir);
- if (err)
- goto bad_put;
+ ret = ceph_redirect_decode(&p, end, &m->redirect);
+ if (ret)
+ return ret;
} else {
- redir.oloc.pool = -1;
+ ceph_oloc_init(&m->redirect.oloc);
}
- if (redir.oloc.pool != -1) {
- dout("redirect pool %lld\n", redir.oloc.pool);
+ return 0;
- __unregister_request(osdc, req);
+e_inval:
+ return -EINVAL;
+}
- req->r_target_oloc = redir.oloc; /* struct */
+/*
+ * We are done with @req if
+ * - @m carries an error result, or
+ * - @m is a safe reply, or
+ * - @m is an unsafe reply and we didn't want a safe one
+ */
+static bool done_request(const struct ceph_osd_request *req,
+ const struct MOSDOpReply *m)
+{
+ return (m->result < 0 ||
+ (m->flags & CEPH_OSD_FLAG_ONDISK) ||
+ !(req->r_flags & CEPH_OSD_FLAG_ONDISK));
+}
- /*
- * Start redirect requests with nofail=true. If
- * mapping fails, request will end up on the notarget
- * list, waiting for the new osdmap (which can take
- * a while), even though the original request mapped
- * successfully. In the future we might want to follow
- * original request's nofail setting here.
- */
- err = __ceph_osdc_start_request(osdc, req, true);
- BUG_ON(err);
+/*
+ * handle osd op reply. either call the callback if it is specified,
+ * or do the completion to wake up the waiting thread.
+ *
+ * ->r_unsafe_callback is set?   yes                    no
+ *
+ * first reply is OK (needed     r_cb/r_completion,     r_cb/r_completion,
+ * any or needed/got safe)       r_safe_completion      r_safe_completion
+ *
+ * first reply is unsafe         r_unsafe_cb(true)      (nothing)
+ *
+ * when we get the safe reply    r_unsafe_cb(false),    r_cb/r_completion,
+ *                               r_safe_completion      r_safe_completion
+ */
+static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg)
+{
+ struct ceph_osd_client *osdc = osd->o_osdc;
+ struct ceph_osd_request *req;
+ struct MOSDOpReply m;
+ u64 tid = le64_to_cpu(msg->hdr.tid);
+ u32 data_len = 0;
+ bool already_acked;
+ int ret;
+ int i;
- goto out_unlock;
- }
+ dout("%s msg %p tid %llu\n", __func__, msg, tid);
- already_completed = req->r_got_reply;
- if (!req->r_got_reply) {
- req->r_result = result;
- dout("handle_reply result %d bytes %d\n", req->r_result,
- bytes);
- if (req->r_result == 0)
- req->r_result = bytes;
+ down_read(&osdc->lock);
+ if (!osd_registered(osd)) {
+ dout("%s osd%d unknown\n", __func__, osd->o_osd);
+ goto out_unlock_osdc;
+ }
+ WARN_ON(osd->o_osd != le64_to_cpu(msg->hdr.src.num));
- /* in case this is a write and we need to replay, */
- req->r_reassert_version.epoch = cpu_to_le32(reassert_epoch);
- req->r_reassert_version.version = cpu_to_le64(reassert_version);
+ mutex_lock(&osd->lock);
+ req = lookup_request(&osd->o_requests, tid);
+ if (!req) {
+ dout("%s osd%d tid %llu unknown\n", __func__, osd->o_osd, tid);
+ goto out_unlock_session;
+ }
- req->r_got_reply = 1;
- } else if ((flags & CEPH_OSD_FLAG_ONDISK) == 0) {
- dout("handle_reply tid %llu dup ack\n", tid);
- goto out_unlock;
+ ret = decode_MOSDOpReply(msg, &m);
+ if (ret) {
+ pr_err("failed to decode MOSDOpReply for tid %llu: %d\n",
+ req->r_tid, ret);
+ ceph_msg_dump(msg);
+ goto fail_request;
+ }
+ dout("%s req %p tid %llu flags 0x%llx pgid %llu.%x epoch %u attempt %d v %u'%llu uv %llu\n",
+ __func__, req, req->r_tid, m.flags, m.pgid.pool, m.pgid.seed,
+ m.epoch, m.retry_attempt, le32_to_cpu(m.replay_version.epoch),
+ le64_to_cpu(m.replay_version.version), m.user_version);
+
+ if (m.retry_attempt >= 0) {
+ if (m.retry_attempt != req->r_attempts - 1) {
+ dout("req %p tid %llu retry_attempt %d != %d, ignoring\n",
+ req, req->r_tid, m.retry_attempt,
+ req->r_attempts - 1);
+ goto out_unlock_session;
+ }
+ } else {
+ WARN_ON(1); /* MOSDOpReply v4 is assumed */
}
- dout("handle_reply tid %llu flags %d\n", tid, flags);
+ if (!ceph_oloc_empty(&m.redirect.oloc)) {
+ dout("req %p tid %llu redirect pool %lld\n", req, req->r_tid,
+ m.redirect.oloc.pool);
+ unlink_request(osd, req);
+ mutex_unlock(&osd->lock);
+
+ ceph_oloc_copy(&req->r_t.target_oloc, &m.redirect.oloc);
+ req->r_flags |= CEPH_OSD_FLAG_REDIRECTED;
+ req->r_tid = 0;
+ __submit_request(req, false);
+ goto out_unlock_osdc;
+ }
- if (req->r_linger && (flags & CEPH_OSD_FLAG_ONDISK))
- __register_linger_request(osdc, req);
+ if (m.num_ops != req->r_num_ops) {
+ pr_err("num_ops %d != %d for tid %llu\n", m.num_ops,
+ req->r_num_ops, req->r_tid);
+ goto fail_request;
+ }
+ for (i = 0; i < req->r_num_ops; i++) {
+ dout(" req %p tid %llu op %d rval %d len %u\n", req,
+ req->r_tid, i, m.rval[i], m.outdata_len[i]);
+ req->r_ops[i].rval = m.rval[i];
+ req->r_ops[i].outdata_len = m.outdata_len[i];
+ data_len += m.outdata_len[i];
+ }
+ if (data_len != le32_to_cpu(msg->hdr.data_len)) {
+ pr_err("sum of lens %u != %u for tid %llu\n", data_len,
+ le32_to_cpu(msg->hdr.data_len), req->r_tid);
+ goto fail_request;
+ }
+ dout("%s req %p tid %llu acked %d result %d data_len %u\n", __func__,
+ req, req->r_tid, req->r_got_reply, m.result, data_len);
+
+ already_acked = req->r_got_reply;
+ if (!already_acked) {
+ req->r_result = m.result ?: data_len;
+ req->r_replay_version = m.replay_version; /* struct */
+ req->r_got_reply = true;
+ } else if (!(m.flags & CEPH_OSD_FLAG_ONDISK)) {
+ dout("req %p tid %llu dup ack\n", req, req->r_tid);
+ goto out_unlock_session;
+ }
- /* either this is a read, or we got the safe response */
- if (result < 0 ||
- (flags & CEPH_OSD_FLAG_ONDISK) ||
- ((flags & CEPH_OSD_FLAG_WRITE) == 0))
- __unregister_request(osdc, req);
+ if (done_request(req, &m)) {
+ __finish_request(req);
+ if (req->r_linger) {
+ WARN_ON(req->r_unsafe_callback);
+ dout("req %p tid %llu cb (locked)\n", req, req->r_tid);
+ __complete_request(req);
+ }
+ }
- mutex_unlock(&osdc->request_mutex);
- up_read(&osdc->map_sem);
+ mutex_unlock(&osd->lock);
+ up_read(&osdc->lock);
- if (!already_completed) {
- if (req->r_unsafe_callback &&
- result >= 0 && !(flags & CEPH_OSD_FLAG_ONDISK))
+ if (done_request(req, &m)) {
+ if (already_acked && req->r_unsafe_callback) {
+ dout("req %p tid %llu safe-cb\n", req, req->r_tid);
+ req->r_unsafe_callback(req, false);
+ } else if (!req->r_linger) {
+ dout("req %p tid %llu cb\n", req, req->r_tid);
+ __complete_request(req);
+ }
+ if (m.flags & CEPH_OSD_FLAG_ONDISK)
+ complete_all(&req->r_safe_completion);
+ ceph_osdc_put_request(req);
+ } else {
+ if (req->r_unsafe_callback) {
+ dout("req %p tid %llu unsafe-cb\n", req, req->r_tid);
req->r_unsafe_callback(req, true);
- if (req->r_callback)
- req->r_callback(req, msg);
- else
- complete_all(&req->r_completion);
+ } else {
+ WARN_ON(1);
+ }
}
- if (flags & CEPH_OSD_FLAG_ONDISK) {
- if (req->r_unsafe_callback && already_completed)
- req->r_unsafe_callback(req, false);
- complete_request(req);
+ return;
+
+fail_request:
+ complete_request(req, -EIO);
+out_unlock_session:
+ mutex_unlock(&osd->lock);
+out_unlock_osdc:
+ up_read(&osdc->lock);
+}
+
+static void set_pool_was_full(struct ceph_osd_client *osdc)
+{
+ struct rb_node *n;
+
+ for (n = rb_first(&osdc->osdmap->pg_pools); n; n = rb_next(n)) {
+ struct ceph_pg_pool_info *pi =
+ rb_entry(n, struct ceph_pg_pool_info, node);
+
+ pi->was_full = __pool_full(pi);
}
+}
-out:
- dout("req=%p req->r_linger=%d\n", req, req->r_linger);
- ceph_osdc_put_request(req);
- return;
-out_unlock:
- mutex_unlock(&osdc->request_mutex);
- up_read(&osdc->map_sem);
- goto out;
+static bool pool_cleared_full(struct ceph_osd_client *osdc, s64 pool_id)
+{
+ struct ceph_pg_pool_info *pi;
-bad_put:
- req->r_result = -EIO;
- __unregister_request(osdc, req);
- if (req->r_callback)
- req->r_callback(req, msg);
- else
- complete_all(&req->r_completion);
- complete_request(req);
- ceph_osdc_put_request(req);
-bad_mutex:
- mutex_unlock(&osdc->request_mutex);
- up_read(&osdc->map_sem);
-bad:
- pr_err("corrupt osd_op_reply got %d %d\n",
- (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len));
- ceph_msg_dump(msg);
+ pi = ceph_pg_pool_by_id(osdc->osdmap, pool_id);
+ if (!pi)
+ return false;
+
+ return pi->was_full && !__pool_full(pi);
}
-static void reset_changed_osds(struct ceph_osd_client *osdc)
+static enum calc_target_result
+recalc_linger_target(struct ceph_osd_linger_request *lreq)
{
- struct rb_node *p, *n;
+ struct ceph_osd_client *osdc = lreq->osdc;
+ enum calc_target_result ct_res;
- dout("%s %p\n", __func__, osdc);
- for (p = rb_first(&osdc->osds); p; p = n) {
- struct ceph_osd *osd = rb_entry(p, struct ceph_osd, o_node);
+ ct_res = calc_target(osdc, &lreq->t, &lreq->last_force_resend, true);
+ if (ct_res == CALC_TARGET_NEED_RESEND) {
+ struct ceph_osd *osd;
- n = rb_next(p);
- if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) ||
- memcmp(&osd->o_con.peer_addr,
- ceph_osd_addr(osdc->osdmap,
- osd->o_osd),
- sizeof(struct ceph_entity_addr)) != 0)
- __reset_osd(osdc, osd);
+ osd = lookup_create_osd(osdc, lreq->t.osd, true);
+ if (osd != lreq->osd) {
+ unlink_linger(lreq->osd, lreq);
+ link_linger(osd, lreq);
+ }
}
+
+ return ct_res;
}
/*
- * Requeue requests whose mapping to an OSD has changed. If requests map to
- * no osd, request a new map.
- *
- * Caller should hold map_sem for read.
+ * Requeue requests whose mapping to an OSD has changed.
*/
-static void kick_requests(struct ceph_osd_client *osdc, bool force_resend,
- bool force_resend_writes)
+static void scan_requests(struct ceph_osd *osd,
+ bool force_resend,
+ bool cleared_full,
+ bool check_pool_cleared_full,
+ struct rb_root *need_resend,
+ struct list_head *need_resend_linger)
{
- struct ceph_osd_request *req, *nreq;
- struct rb_node *p;
- int needmap = 0;
- int err;
- bool force_resend_req;
+ struct ceph_osd_client *osdc = osd->o_osdc;
+ struct rb_node *n;
+ bool force_resend_writes;
+
+ for (n = rb_first(&osd->o_linger_requests); n; ) {
+ struct ceph_osd_linger_request *lreq =
+ rb_entry(n, struct ceph_osd_linger_request, node);
+ enum calc_target_result ct_res;
+
+ n = rb_next(n); /* recalc_linger_target() */
+
+ dout("%s lreq %p linger_id %llu\n", __func__, lreq,
+ lreq->linger_id);
+ ct_res = recalc_linger_target(lreq);
+ switch (ct_res) {
+ case CALC_TARGET_NO_ACTION:
+ force_resend_writes = cleared_full ||
+ (check_pool_cleared_full &&
+ pool_cleared_full(osdc, lreq->t.base_oloc.pool));
+ if (!force_resend && !force_resend_writes)
+ break;
+
+ /* fall through */
+ case CALC_TARGET_NEED_RESEND:
+ cancel_linger_map_check(lreq);
+ /*
+ * scan_requests() for the previous epoch(s)
+ * may have already added it to the list, since
+ * it's not unlinked here.
+ */
+ if (list_empty(&lreq->scan_item))
+ list_add_tail(&lreq->scan_item, need_resend_linger);
+ break;
+ case CALC_TARGET_POOL_DNE:
+ check_linger_pool_dne(lreq);
+ break;
+ }
+ }
- dout("kick_requests %s %s\n", force_resend ? " (force resend)" : "",
- force_resend_writes ? " (force resend writes)" : "");
- mutex_lock(&osdc->request_mutex);
- for (p = rb_first(&osdc->requests); p; ) {
- req = rb_entry(p, struct ceph_osd_request, r_node);
- p = rb_next(p);
+ for (n = rb_first(&osd->o_requests); n; ) {
+ struct ceph_osd_request *req =
+ rb_entry(n, struct ceph_osd_request, r_node);
+ enum calc_target_result ct_res;
+
+ n = rb_next(n); /* unlink_request(), check_pool_dne() */
+
+ dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
+ ct_res = calc_target(osdc, &req->r_t,
+ &req->r_last_force_resend, false);
+ switch (ct_res) {
+ case CALC_TARGET_NO_ACTION:
+ force_resend_writes = cleared_full ||
+ (check_pool_cleared_full &&
+ pool_cleared_full(osdc, req->r_t.base_oloc.pool));
+ if (!force_resend &&
+ (!(req->r_flags & CEPH_OSD_FLAG_WRITE) ||
+ !force_resend_writes))
+ break;
+
+ /* fall through */
+ case CALC_TARGET_NEED_RESEND:
+ cancel_map_check(req);
+ unlink_request(osd, req);
+ insert_request(need_resend, req);
+ break;
+ case CALC_TARGET_POOL_DNE:
+ check_pool_dne(req);
+ break;
+ }
+ }
+}
+static int handle_one_map(struct ceph_osd_client *osdc,
+ void *p, void *end, bool incremental,
+ struct rb_root *need_resend,
+ struct list_head *need_resend_linger)
+{
+ struct ceph_osdmap *newmap;
+ struct rb_node *n;
+ bool skipped_map = false;
+ bool was_full;
+
+ was_full = ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL);
+ set_pool_was_full(osdc);
+
+ if (incremental)
+ newmap = osdmap_apply_incremental(&p, end, osdc->osdmap);
+ else
+ newmap = ceph_osdmap_decode(&p, end);
+ if (IS_ERR(newmap))
+ return PTR_ERR(newmap);
+
+ if (newmap != osdc->osdmap) {
/*
- * For linger requests that have not yet been
- * registered, move them to the linger list; they'll
- * be sent to the osd in the loop below. Unregister
- * the request before re-registering it as a linger
- * request to ensure the __map_request() below
- * will decide it needs to be sent.
+ * Preserve ->was_full before destroying the old map.
+ * For pools that weren't in the old map, ->was_full
+ * should be false.
*/
- if (req->r_linger && list_empty(&req->r_linger_item)) {
- dout("%p tid %llu restart on osd%d\n",
- req, req->r_tid,
- req->r_osd ? req->r_osd->o_osd : -1);
- ceph_osdc_get_request(req);
- __unregister_request(osdc, req);
- __register_linger_request(osdc, req);
- ceph_osdc_put_request(req);
- continue;
+ for (n = rb_first(&newmap->pg_pools); n; n = rb_next(n)) {
+ struct ceph_pg_pool_info *pi =
+ rb_entry(n, struct ceph_pg_pool_info, node);
+ struct ceph_pg_pool_info *old_pi;
+
+ old_pi = ceph_pg_pool_by_id(osdc->osdmap, pi->id);
+ if (old_pi)
+ pi->was_full = old_pi->was_full;
+ else
+ WARN_ON(pi->was_full);
}
- force_resend_req = force_resend ||
- (force_resend_writes &&
- req->r_flags & CEPH_OSD_FLAG_WRITE);
- err = __map_request(osdc, req, force_resend_req);
- if (err < 0)
- continue; /* error */
- if (req->r_osd == NULL) {
- dout("%p tid %llu maps to no osd\n", req, req->r_tid);
- needmap++; /* request a newer map */
- } else if (err > 0) {
- if (!req->r_linger) {
- dout("%p tid %llu requeued on osd%d\n", req,
- req->r_tid,
- req->r_osd ? req->r_osd->o_osd : -1);
- req->r_flags |= CEPH_OSD_FLAG_RETRY;
- }
+ if (osdc->osdmap->epoch &&
+ osdc->osdmap->epoch + 1 < newmap->epoch) {
+ WARN_ON(incremental);
+ skipped_map = true;
}
+
+ ceph_osdmap_destroy(osdc->osdmap);
+ osdc->osdmap = newmap;
}
- list_for_each_entry_safe(req, nreq, &osdc->req_linger,
- r_linger_item) {
- dout("linger req=%p req->r_osd=%p\n", req, req->r_osd);
-
- err = __map_request(osdc, req,
- force_resend || force_resend_writes);
- dout("__map_request returned %d\n", err);
- if (err < 0)
- continue; /* hrm! */
- if (req->r_osd == NULL || err > 0) {
- if (req->r_osd == NULL) {
- dout("lingering %p tid %llu maps to no osd\n",
- req, req->r_tid);
- /*
- * A homeless lingering request makes
- * no sense, as it's job is to keep
- * a particular OSD connection open.
- * Request a newer map and kick the
- * request, knowing that it won't be
- * resent until we actually get a map
- * that can tell us where to send it.
- */
- needmap++;
- }
+ was_full &= !ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL);
+ scan_requests(&osdc->homeless_osd, skipped_map, was_full, true,
+ need_resend, need_resend_linger);
+
+ for (n = rb_first(&osdc->osds); n; ) {
+ struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
+
+ n = rb_next(n); /* close_osd() */
+
+ scan_requests(osd, skipped_map, was_full, true, need_resend,
+ need_resend_linger);
+ if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) ||
+ memcmp(&osd->o_con.peer_addr,
+ ceph_osd_addr(osdc->osdmap, osd->o_osd),
+ sizeof(struct ceph_entity_addr)))
+ close_osd(osd);
+ }
- dout("kicking lingering %p tid %llu osd%d\n", req,
- req->r_tid, req->r_osd ? req->r_osd->o_osd : -1);
- __register_request(osdc, req);
- __unregister_linger_request(osdc, req);
+ return 0;
+}
+
+static void kick_requests(struct ceph_osd_client *osdc,
+ struct rb_root *need_resend,
+ struct list_head *need_resend_linger)
+{
+ struct ceph_osd_linger_request *lreq, *nlreq;
+ struct rb_node *n;
+
+ for (n = rb_first(need_resend); n; ) {
+ struct ceph_osd_request *req =
+ rb_entry(n, struct ceph_osd_request, r_node);
+ struct ceph_osd *osd;
+
+ n = rb_next(n);
+ erase_request(need_resend, req); /* before link_request() */
+
+ WARN_ON(req->r_osd);
+ calc_target(osdc, &req->r_t, NULL, false);
+ osd = lookup_create_osd(osdc, req->r_t.osd, true);
+ link_request(osd, req);
+ if (!req->r_linger) {
+ if (!osd_homeless(osd) && !req->r_t.paused)
+ send_request(req);
+ } else {
+ cancel_linger_request(req);
}
}
- reset_changed_osds(osdc);
- mutex_unlock(&osdc->request_mutex);
- if (needmap) {
- dout("%d requests for down osds, need new map\n", needmap);
- ceph_monc_request_next_osdmap(&osdc->client->monc);
+ list_for_each_entry_safe(lreq, nlreq, need_resend_linger, scan_item) {
+ if (!osd_homeless(lreq->osd))
+ send_linger(lreq);
+
+ list_del_init(&lreq->scan_item);
}
}
-
/*
* Process updated osd map.
*
@@ -2115,27 +3152,31 @@ static void kick_requests(struct ceph_osd_client *osdc, bool force_resend,
*/
void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
{
- void *p, *end, *next;
+ void *p = msg->front.iov_base;
+ void *const end = p + msg->front.iov_len;
u32 nr_maps, maplen;
u32 epoch;
- struct ceph_osdmap *newmap = NULL, *oldmap;
- int err;
struct ceph_fsid fsid;
- bool was_full;
+ struct rb_root need_resend = RB_ROOT;
+ LIST_HEAD(need_resend_linger);
+ bool handled_incremental = false;
+ bool was_pauserd, was_pausewr;
+ bool pauserd, pausewr;
+ int err;
- dout("handle_map have %u\n", osdc->osdmap ? osdc->osdmap->epoch : 0);
- p = msg->front.iov_base;
- end = p + msg->front.iov_len;
+ dout("%s have %u\n", __func__, osdc->osdmap->epoch);
+ down_write(&osdc->lock);
/* verify fsid */
ceph_decode_need(&p, end, sizeof(fsid), bad);
ceph_decode_copy(&p, &fsid, sizeof(fsid));
if (ceph_check_fsid(osdc->client, &fsid) < 0)
- return;
-
- down_write(&osdc->map_sem);
+ goto bad;
- was_full = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL);
+ was_pauserd = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD);
+ was_pausewr = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR) ||
+ ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) ||
+ have_pool_full(osdc);
/* incremental maps */
ceph_decode_32_safe(&p, end, nr_maps, bad);
@@ -2145,34 +3186,23 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
epoch = ceph_decode_32(&p);
maplen = ceph_decode_32(&p);
ceph_decode_need(&p, end, maplen, bad);
- next = p + maplen;
- if (osdc->osdmap && osdc->osdmap->epoch+1 == epoch) {
+ if (osdc->osdmap->epoch &&
+ osdc->osdmap->epoch + 1 == epoch) {
dout("applying incremental map %u len %d\n",
epoch, maplen);
- newmap = osdmap_apply_incremental(&p, next,
- osdc->osdmap,
- &osdc->client->msgr);
- if (IS_ERR(newmap)) {
- err = PTR_ERR(newmap);
+ err = handle_one_map(osdc, p, p + maplen, true,
+ &need_resend, &need_resend_linger);
+ if (err)
goto bad;
- }
- BUG_ON(!newmap);
- if (newmap != osdc->osdmap) {
- ceph_osdmap_destroy(osdc->osdmap);
- osdc->osdmap = newmap;
- }
- was_full = was_full ||
- ceph_osdmap_flag(osdc->osdmap,
- CEPH_OSDMAP_FULL);
- kick_requests(osdc, 0, was_full);
+ handled_incremental = true;
} else {
dout("ignoring incremental map %u len %d\n",
epoch, maplen);
}
- p = next;
+ p += maplen;
nr_maps--;
}
- if (newmap)
+ if (handled_incremental)
goto done;
/* full maps */
@@ -2186,455 +3216,647 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
if (nr_maps > 1) {
dout("skipping non-latest full map %u len %d\n",
epoch, maplen);
- } else if (osdc->osdmap && osdc->osdmap->epoch >= epoch) {
+ } else if (osdc->osdmap->epoch >= epoch) {
dout("skipping full map %u len %d, "
"older than our %u\n", epoch, maplen,
osdc->osdmap->epoch);
} else {
- int skipped_map = 0;
-
dout("taking full map %u len %d\n", epoch, maplen);
- newmap = ceph_osdmap_decode(&p, p+maplen);
- if (IS_ERR(newmap)) {
- err = PTR_ERR(newmap);
+ err = handle_one_map(osdc, p, p + maplen, false,
+ &need_resend, &need_resend_linger);
+ if (err)
goto bad;
- }
- BUG_ON(!newmap);
- oldmap = osdc->osdmap;
- osdc->osdmap = newmap;
- if (oldmap) {
- if (oldmap->epoch + 1 < newmap->epoch)
- skipped_map = 1;
- ceph_osdmap_destroy(oldmap);
- }
- was_full = was_full ||
- ceph_osdmap_flag(osdc->osdmap,
- CEPH_OSDMAP_FULL);
- kick_requests(osdc, skipped_map, was_full);
}
p += maplen;
nr_maps--;
}
- if (!osdc->osdmap)
- goto bad;
done:
- downgrade_write(&osdc->map_sem);
- ceph_monc_got_map(&osdc->client->monc, CEPH_SUB_OSDMAP,
- osdc->osdmap->epoch);
-
/*
* subscribe to subsequent osdmap updates if full to ensure
* we find out when we are no longer full and stop returning
* ENOSPC.
*/
- if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) ||
- ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD) ||
- ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR))
- ceph_monc_request_next_osdmap(&osdc->client->monc);
-
- mutex_lock(&osdc->request_mutex);
- __send_queued(osdc);
- mutex_unlock(&osdc->request_mutex);
- up_read(&osdc->map_sem);
+ pauserd = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD);
+ pausewr = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR) ||
+ ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) ||
+ have_pool_full(osdc);
+ if (was_pauserd || was_pausewr || pauserd || pausewr)
+ maybe_request_map(osdc);
+
+ kick_requests(osdc, &need_resend, &need_resend_linger);
+
+ ceph_monc_got_map(&osdc->client->monc, CEPH_SUB_OSDMAP,
+ osdc->osdmap->epoch);
+ up_write(&osdc->lock);
wake_up_all(&osdc->client->auth_wq);
return;
bad:
pr_err("osdc handle_map corrupt msg\n");
ceph_msg_dump(msg);
- up_write(&osdc->map_sem);
+ up_write(&osdc->lock);
}
/*
- * watch/notify callback event infrastructure
- *
- * These callbacks are used both for watch and notify operations.
+ * Resubmit requests pending on the given osd.
*/
-static void __release_event(struct kref *kref)
+static void kick_osd_requests(struct ceph_osd *osd)
{
- struct ceph_osd_event *event =
- container_of(kref, struct ceph_osd_event, kref);
+ struct rb_node *n;
- dout("__release_event %p\n", event);
- kfree(event);
-}
+ for (n = rb_first(&osd->o_requests); n; ) {
+ struct ceph_osd_request *req =
+ rb_entry(n, struct ceph_osd_request, r_node);
-static void get_event(struct ceph_osd_event *event)
-{
- kref_get(&event->kref);
-}
+ n = rb_next(n); /* cancel_linger_request() */
-void ceph_osdc_put_event(struct ceph_osd_event *event)
-{
- kref_put(&event->kref, __release_event);
+ if (!req->r_linger) {
+ if (!req->r_t.paused)
+ send_request(req);
+ } else {
+ cancel_linger_request(req);
+ }
+ }
+ for (n = rb_first(&osd->o_linger_requests); n; n = rb_next(n)) {
+ struct ceph_osd_linger_request *lreq =
+ rb_entry(n, struct ceph_osd_linger_request, node);
+
+ send_linger(lreq);
+ }
}
-EXPORT_SYMBOL(ceph_osdc_put_event);
-static void __insert_event(struct ceph_osd_client *osdc,
- struct ceph_osd_event *new)
+/*
+ * If the osd connection drops, we need to resubmit all requests.
+ */
+static void osd_fault(struct ceph_connection *con)
{
- struct rb_node **p = &osdc->event_tree.rb_node;
- struct rb_node *parent = NULL;
- struct ceph_osd_event *event = NULL;
+ struct ceph_osd *osd = con->private;
+ struct ceph_osd_client *osdc = osd->o_osdc;
- while (*p) {
- parent = *p;
- event = rb_entry(parent, struct ceph_osd_event, node);
- if (new->cookie < event->cookie)
- p = &(*p)->rb_left;
- else if (new->cookie > event->cookie)
- p = &(*p)->rb_right;
- else
- BUG();
+ dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
+
+ down_write(&osdc->lock);
+ if (!osd_registered(osd)) {
+ dout("%s osd%d unknown\n", __func__, osd->o_osd);
+ goto out_unlock;
}
- rb_link_node(&new->node, parent, p);
- rb_insert_color(&new->node, &osdc->event_tree);
+ if (!reopen_osd(osd))
+ kick_osd_requests(osd);
+ maybe_request_map(osdc);
+
+out_unlock:
+ up_write(&osdc->lock);
}
-static struct ceph_osd_event *__find_event(struct ceph_osd_client *osdc,
- u64 cookie)
+/*
+ * Process osd watch notifications
+ */
+static void handle_watch_notify(struct ceph_osd_client *osdc,
+ struct ceph_msg *msg)
{
- struct rb_node **p = &osdc->event_tree.rb_node;
- struct rb_node *parent = NULL;
- struct ceph_osd_event *event = NULL;
+ void *p = msg->front.iov_base;
+ void *const end = p + msg->front.iov_len;
+ struct ceph_osd_linger_request *lreq;
+ struct linger_work *lwork;
+ u8 proto_ver, opcode;
+ u64 cookie, notify_id;
+ u64 notifier_id = 0;
+ s32 return_code = 0;
+ void *payload = NULL;
+ u32 payload_len = 0;
- while (*p) {
- parent = *p;
- event = rb_entry(parent, struct ceph_osd_event, node);
- if (cookie < event->cookie)
- p = &(*p)->rb_left;
- else if (cookie > event->cookie)
- p = &(*p)->rb_right;
- else
- return event;
+ ceph_decode_8_safe(&p, end, proto_ver, bad);
+ ceph_decode_8_safe(&p, end, opcode, bad);
+ ceph_decode_64_safe(&p, end, cookie, bad);
+ p += 8; /* skip ver */
+ ceph_decode_64_safe(&p, end, notify_id, bad);
+
+ if (proto_ver >= 1) {
+ ceph_decode_32_safe(&p, end, payload_len, bad);
+ ceph_decode_need(&p, end, payload_len, bad);
+ payload = p;
+ p += payload_len;
}
- return NULL;
-}
-static void __remove_event(struct ceph_osd_event *event)
-{
- struct ceph_osd_client *osdc = event->osdc;
+ if (le16_to_cpu(msg->hdr.version) >= 2)
+ ceph_decode_32_safe(&p, end, return_code, bad);
+
+ if (le16_to_cpu(msg->hdr.version) >= 3)
+ ceph_decode_64_safe(&p, end, notifier_id, bad);
- if (!RB_EMPTY_NODE(&event->node)) {
- dout("__remove_event removed %p\n", event);
- rb_erase(&event->node, &osdc->event_tree);
- ceph_osdc_put_event(event);
+ down_read(&osdc->lock);
+ lreq = lookup_linger_osdc(&osdc->linger_requests, cookie);
+ if (!lreq) {
+ dout("%s opcode %d cookie %llu dne\n", __func__, opcode,
+ cookie);
+ goto out_unlock_osdc;
+ }
+
+ mutex_lock(&lreq->lock);
+ dout("%s opcode %d cookie %llu lreq %p is_watch %d\n", __func__,
+ opcode, cookie, lreq, lreq->is_watch);
+ if (opcode == CEPH_WATCH_EVENT_DISCONNECT) {
+ if (!lreq->last_error) {
+ lreq->last_error = -ENOTCONN;
+ queue_watch_error(lreq);
+ }
+ } else if (!lreq->is_watch) {
+ /* CEPH_WATCH_EVENT_NOTIFY_COMPLETE */
+ if (lreq->notify_id && lreq->notify_id != notify_id) {
+ dout("lreq %p notify_id %llu != %llu, ignoring\n", lreq,
+ lreq->notify_id, notify_id);
+ } else if (!completion_done(&lreq->notify_finish_wait)) {
+ struct ceph_msg_data *data =
+ list_first_entry_or_null(&msg->data,
+ struct ceph_msg_data,
+ links);
+
+ if (data) {
+ if (lreq->preply_pages) {
+ WARN_ON(data->type !=
+ CEPH_MSG_DATA_PAGES);
+ *lreq->preply_pages = data->pages;
+ *lreq->preply_len = data->length;
+ } else {
+ ceph_release_page_vector(data->pages,
+ calc_pages_for(0, data->length));
+ }
+ }
+ lreq->notify_finish_error = return_code;
+ complete_all(&lreq->notify_finish_wait);
+ }
} else {
- dout("__remove_event didn't remove %p\n", event);
+ /* CEPH_WATCH_EVENT_NOTIFY */
+ lwork = lwork_alloc(lreq, do_watch_notify);
+ if (!lwork) {
+ pr_err("failed to allocate notify-lwork\n");
+ goto out_unlock_lreq;
+ }
+
+ lwork->notify.notify_id = notify_id;
+ lwork->notify.notifier_id = notifier_id;
+ lwork->notify.payload = payload;
+ lwork->notify.payload_len = payload_len;
+ lwork->notify.msg = ceph_msg_get(msg);
+ lwork_queue(lwork);
}
+
+out_unlock_lreq:
+ mutex_unlock(&lreq->lock);
+out_unlock_osdc:
+ up_read(&osdc->lock);
+ return;
+
+bad:
+ pr_err("osdc handle_watch_notify corrupt msg\n");
}
-int ceph_osdc_create_event(struct ceph_osd_client *osdc,
- void (*event_cb)(u64, u64, u8, void *),
- void *data, struct ceph_osd_event **pevent)
+/*
+ * Register request, send initial attempt.
+ */
+int ceph_osdc_start_request(struct ceph_osd_client *osdc,
+ struct ceph_osd_request *req,
+ bool nofail)
{
- struct ceph_osd_event *event;
+ down_read(&osdc->lock);
+ submit_request(req, false);
+ up_read(&osdc->lock);
- event = kmalloc(sizeof(*event), GFP_NOIO);
- if (!event)
- return -ENOMEM;
-
- dout("create_event %p\n", event);
- event->cb = event_cb;
- event->one_shot = 0;
- event->data = data;
- event->osdc = osdc;
- INIT_LIST_HEAD(&event->osd_node);
- RB_CLEAR_NODE(&event->node);
- kref_init(&event->kref); /* one ref for us */
- kref_get(&event->kref); /* one ref for the caller */
-
- spin_lock(&osdc->event_lock);
- event->cookie = ++osdc->event_count;
- __insert_event(osdc, event);
- spin_unlock(&osdc->event_lock);
-
- *pevent = event;
return 0;
}
-EXPORT_SYMBOL(ceph_osdc_create_event);
+EXPORT_SYMBOL(ceph_osdc_start_request);
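/*
 * Illustrative lifecycle sketch, not part of the patch: allocate a
 * request, submit it with ceph_osdc_start_request() and block in
 * ceph_osdc_wait_request(), mirroring ceph_osdc_notify_ack() later in
 * this file.  Op and data setup for r_ops[0] is deliberately elided;
 * the helper name is made up for illustration.
 */
static int example_submit_and_wait(struct ceph_osd_client *osdc,
				   struct ceph_object_id *oid,
				   struct ceph_object_locator *oloc)
{
	struct ceph_osd_request *req;
	int ret;

	req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO);
	if (!req)
		return -ENOMEM;

	ceph_oid_copy(&req->r_base_oid, oid);
	ceph_oloc_copy(&req->r_base_oloc, oloc);
	req->r_flags = CEPH_OSD_FLAG_READ;

	/* ... osd_req_op_*() setup for req->r_ops[0] goes here ... */

	ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
	if (ret)
		goto out_put_req;

	ceph_osdc_start_request(osdc, req, false);
	ret = ceph_osdc_wait_request(osdc, req);

out_put_req:
	ceph_osdc_put_request(req);
	return ret;
}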
-void ceph_osdc_cancel_event(struct ceph_osd_event *event)
+/*
+ * Unregister a registered request. The request is not completed (i.e.
+ * no callbacks or wakeups) - higher layers are supposed to know what
+ * they are canceling.
+ */
+void ceph_osdc_cancel_request(struct ceph_osd_request *req)
{
- struct ceph_osd_client *osdc = event->osdc;
+ struct ceph_osd_client *osdc = req->r_osdc;
- dout("cancel_event %p\n", event);
- spin_lock(&osdc->event_lock);
- __remove_event(event);
- spin_unlock(&osdc->event_lock);
- ceph_osdc_put_event(event); /* caller's */
+ down_write(&osdc->lock);
+ if (req->r_osd)
+ cancel_request(req);
+ up_write(&osdc->lock);
}
-EXPORT_SYMBOL(ceph_osdc_cancel_event);
-
+EXPORT_SYMBOL(ceph_osdc_cancel_request);
-static void do_event_work(struct work_struct *work)
+/*
+ * @timeout: in jiffies, 0 means "wait forever"
+ */
+static int wait_request_timeout(struct ceph_osd_request *req,
+ unsigned long timeout)
{
- struct ceph_osd_event_work *event_work =
- container_of(work, struct ceph_osd_event_work, work);
- struct ceph_osd_event *event = event_work->event;
- u64 ver = event_work->ver;
- u64 notify_id = event_work->notify_id;
- u8 opcode = event_work->opcode;
+ long left;
+
+ dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
+ left = wait_for_completion_killable_timeout(&req->r_completion,
+ ceph_timeout_jiffies(timeout));
+ if (left <= 0) {
+ left = left ?: -ETIMEDOUT;
+ ceph_osdc_cancel_request(req);
- dout("do_event_work completing %p\n", event);
- event->cb(ver, notify_id, opcode, event->data);
- dout("do_event_work completed %p\n", event);
- ceph_osdc_put_event(event);
- kfree(event_work);
+ /* kludge - need to wake ceph_osdc_sync() */
+ complete_all(&req->r_safe_completion);
+ } else {
+ left = req->r_result; /* completed */
+ }
+
+ return left;
}
+/*
+ * wait for a request to complete
+ */
+int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
+ struct ceph_osd_request *req)
+{
+ return wait_request_timeout(req, 0);
+}
+EXPORT_SYMBOL(ceph_osdc_wait_request);
/*
- * Process osd watch notifications
+ * sync - wait for all in-flight requests to flush. avoid starvation.
*/
-static void handle_watch_notify(struct ceph_osd_client *osdc,
- struct ceph_msg *msg)
+void ceph_osdc_sync(struct ceph_osd_client *osdc)
{
- void *p, *end;
- u8 proto_ver;
- u64 cookie, ver, notify_id;
- u8 opcode;
- struct ceph_osd_event *event;
- struct ceph_osd_event_work *event_work;
+ struct rb_node *n, *p;
+ u64 last_tid = atomic64_read(&osdc->last_tid);
- p = msg->front.iov_base;
- end = p + msg->front.iov_len;
+again:
+ down_read(&osdc->lock);
+ for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
+ struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
- ceph_decode_8_safe(&p, end, proto_ver, bad);
- ceph_decode_8_safe(&p, end, opcode, bad);
- ceph_decode_64_safe(&p, end, cookie, bad);
- ceph_decode_64_safe(&p, end, ver, bad);
- ceph_decode_64_safe(&p, end, notify_id, bad);
+ mutex_lock(&osd->lock);
+ for (p = rb_first(&osd->o_requests); p; p = rb_next(p)) {
+ struct ceph_osd_request *req =
+ rb_entry(p, struct ceph_osd_request, r_node);
+
+ if (req->r_tid > last_tid)
+ break;
- spin_lock(&osdc->event_lock);
- event = __find_event(osdc, cookie);
- if (event) {
- BUG_ON(event->one_shot);
- get_event(event);
- }
- spin_unlock(&osdc->event_lock);
- dout("handle_watch_notify cookie %lld ver %lld event %p\n",
- cookie, ver, event);
- if (event) {
- event_work = kmalloc(sizeof(*event_work), GFP_NOIO);
- if (!event_work) {
- pr_err("couldn't allocate event_work\n");
- ceph_osdc_put_event(event);
- return;
+ if (!(req->r_flags & CEPH_OSD_FLAG_WRITE))
+ continue;
+
+ ceph_osdc_get_request(req);
+ mutex_unlock(&osd->lock);
+ up_read(&osdc->lock);
+ dout("%s waiting on req %p tid %llu last_tid %llu\n",
+ __func__, req, req->r_tid, last_tid);
+ wait_for_completion(&req->r_safe_completion);
+ ceph_osdc_put_request(req);
+ goto again;
}
- INIT_WORK(&event_work->work, do_event_work);
- event_work->event = event;
- event_work->ver = ver;
- event_work->notify_id = notify_id;
- event_work->opcode = opcode;
- queue_work(osdc->notify_wq, &event_work->work);
+ mutex_unlock(&osd->lock);
}
- return;
+ up_read(&osdc->lock);
+ dout("%s done last_tid %llu\n", __func__, last_tid);
+}
+EXPORT_SYMBOL(ceph_osdc_sync);
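/*
 * Illustrative sketch, not part of the patch: ceph_osdc_sync() acts as a
 * write barrier - it returns only once every write that was in flight at
 * the time of the call has completed its r_safe_completion (is on disk).
 */
static void example_flush_writes(struct ceph_osd_client *osdc)
{
	/* wait for all currently in-flight writes to become safe */
	ceph_osdc_sync(osdc);
}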
-bad:
- pr_err("osdc handle_watch_notify corrupt msg\n");
+static struct ceph_osd_request *
+alloc_linger_request(struct ceph_osd_linger_request *lreq)
+{
+ struct ceph_osd_request *req;
+
+ req = ceph_osdc_alloc_request(lreq->osdc, NULL, 1, false, GFP_NOIO);
+ if (!req)
+ return NULL;
+
+ ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid);
+ ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc);
+
+ if (ceph_osdc_alloc_messages(req, GFP_NOIO)) {
+ ceph_osdc_put_request(req);
+ return NULL;
+ }
+
+ return req;
}
/*
- * build new request AND message
- *
+ * Returns a handle, caller owns a ref.
*/
-void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off,
- struct ceph_snap_context *snapc, u64 snap_id,
- struct timespec *mtime)
-{
- struct ceph_msg *msg = req->r_request;
- void *p;
- size_t msg_size;
- int flags = req->r_flags;
- u64 data_len;
- unsigned int i;
-
- req->r_snapid = snap_id;
- req->r_snapc = ceph_get_snap_context(snapc);
-
- /* encode request */
- msg->hdr.version = cpu_to_le16(4);
-
- p = msg->front.iov_base;
- ceph_encode_32(&p, 1); /* client_inc is always 1 */
- req->r_request_osdmap_epoch = p;
- p += 4;
- req->r_request_flags = p;
- p += 4;
- if (req->r_flags & CEPH_OSD_FLAG_WRITE)
- ceph_encode_timespec(p, mtime);
- p += sizeof(struct ceph_timespec);
- req->r_request_reassert_version = p;
- p += sizeof(struct ceph_eversion); /* will get filled in */
-
- /* oloc */
- ceph_encode_8(&p, 4);
- ceph_encode_8(&p, 4);
- ceph_encode_32(&p, 8 + 4 + 4);
- req->r_request_pool = p;
- p += 8;
- ceph_encode_32(&p, -1); /* preferred */
- ceph_encode_32(&p, 0); /* key len */
+struct ceph_osd_linger_request *
+ceph_osdc_watch(struct ceph_osd_client *osdc,
+ struct ceph_object_id *oid,
+ struct ceph_object_locator *oloc,
+ rados_watchcb2_t wcb,
+ rados_watcherrcb_t errcb,
+ void *data)
+{
+ struct ceph_osd_linger_request *lreq;
+ int ret;
- ceph_encode_8(&p, 1);
- req->r_request_pgid = p;
- p += 8 + 4;
- ceph_encode_32(&p, -1); /* preferred */
+ lreq = linger_alloc(osdc);
+ if (!lreq)
+ return ERR_PTR(-ENOMEM);
- /* oid */
- ceph_encode_32(&p, req->r_base_oid.name_len);
- memcpy(p, req->r_base_oid.name, req->r_base_oid.name_len);
- dout("oid '%.*s' len %d\n", req->r_base_oid.name_len,
- req->r_base_oid.name, req->r_base_oid.name_len);
- p += req->r_base_oid.name_len;
-
- /* ops--can imply data */
- ceph_encode_16(&p, (u16)req->r_num_ops);
- data_len = 0;
- for (i = 0; i < req->r_num_ops; i++) {
- data_len += osd_req_encode_op(req, p, i);
- p += sizeof(struct ceph_osd_op);
+ lreq->is_watch = true;
+ lreq->wcb = wcb;
+ lreq->errcb = errcb;
+ lreq->data = data;
+ lreq->watch_valid_thru = jiffies;
+
+ ceph_oid_copy(&lreq->t.base_oid, oid);
+ ceph_oloc_copy(&lreq->t.base_oloc, oloc);
+ lreq->t.flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
+ lreq->mtime = CURRENT_TIME;
+
+ lreq->reg_req = alloc_linger_request(lreq);
+ if (!lreq->reg_req) {
+ ret = -ENOMEM;
+ goto err_put_lreq;
}
- /* snaps */
- ceph_encode_64(&p, req->r_snapid);
- ceph_encode_64(&p, req->r_snapc ? req->r_snapc->seq : 0);
- ceph_encode_32(&p, req->r_snapc ? req->r_snapc->num_snaps : 0);
- if (req->r_snapc) {
- for (i = 0; i < snapc->num_snaps; i++) {
- ceph_encode_64(&p, req->r_snapc->snaps[i]);
- }
+ lreq->ping_req = alloc_linger_request(lreq);
+ if (!lreq->ping_req) {
+ ret = -ENOMEM;
+ goto err_put_lreq;
}
- req->r_request_attempts = p;
- p += 4;
-
- /* data */
- if (flags & CEPH_OSD_FLAG_WRITE) {
- u16 data_off;
-
- /*
- * The header "data_off" is a hint to the receiver
- * allowing it to align received data into its
- * buffers such that there's no need to re-copy
- * it before writing it to disk (direct I/O).
- */
- data_off = (u16) (off & 0xffff);
- req->r_request->hdr.data_off = cpu_to_le16(data_off);
+ down_write(&osdc->lock);
+ linger_register(lreq); /* before osd_req_op_* */
+ osd_req_op_watch_init(lreq->reg_req, 0, lreq->linger_id,
+ CEPH_OSD_WATCH_OP_WATCH);
+ osd_req_op_watch_init(lreq->ping_req, 0, lreq->linger_id,
+ CEPH_OSD_WATCH_OP_PING);
+ linger_submit(lreq);
+ up_write(&osdc->lock);
+
+ ret = linger_reg_commit_wait(lreq);
+ if (ret) {
+ linger_cancel(lreq);
+ goto err_put_lreq;
}
- req->r_request->hdr.data_len = cpu_to_le32(data_len);
- BUG_ON(p > msg->front.iov_base + msg->front.iov_len);
- msg_size = p - msg->front.iov_base;
- msg->front.iov_len = msg_size;
- msg->hdr.front_len = cpu_to_le32(msg_size);
+ return lreq;
- dout("build_request msg_size was %d\n", (int)msg_size);
+err_put_lreq:
+ linger_put(lreq);
+ return ERR_PTR(ret);
}
-EXPORT_SYMBOL(ceph_osdc_build_request);
+EXPORT_SYMBOL(ceph_osdc_watch);
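/*
 * Illustrative watch sketch, not part of the patch.  The callback
 * prototypes are written to match the rados_watchcb2_t/rados_watcherrcb_t
 * arguments dispatched from handle_watch_notify() above; treat the exact
 * signatures, like the helper names, as assumptions of this sketch.
 */
static void example_watch_cb(void *arg, u64 notify_id, u64 cookie,
			     u64 notifier_id, void *data, size_t data_len)
{
	/* a notify arrived for our watch - typically ack it here */
}

static void example_watch_errcb(void *arg, u64 cookie, int err)
{
	/* the watch broke (e.g. -ENOTCONN); re-establish it if needed */
}

static int example_watch_lifecycle(struct ceph_osd_client *osdc,
				   struct ceph_object_id *oid,
				   struct ceph_object_locator *oloc)
{
	struct ceph_osd_linger_request *handle;

	handle = ceph_osdc_watch(osdc, oid, oloc, example_watch_cb,
				 example_watch_errcb, NULL);
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	/* ... notifies are now delivered to example_watch_cb() ... */

	/* teardown: drops the ref returned by ceph_osdc_watch() */
	return ceph_osdc_unwatch(osdc, handle);
}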
/*
- * Register request, send initial attempt.
+ * Releases a ref.
+ *
+ * Times out after mount_timeout to preserve rbd unmap behaviour
+ * introduced in 2894e1d76974 ("rbd: timeout watch teardown on unmap
+ * with mount_timeout").
*/
-int ceph_osdc_start_request(struct ceph_osd_client *osdc,
- struct ceph_osd_request *req,
- bool nofail)
+int ceph_osdc_unwatch(struct ceph_osd_client *osdc,
+ struct ceph_osd_linger_request *lreq)
{
- int rc;
+ struct ceph_options *opts = osdc->client->options;
+ struct ceph_osd_request *req;
+ int ret;
- down_read(&osdc->map_sem);
- mutex_lock(&osdc->request_mutex);
+ req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO);
+ if (!req)
+ return -ENOMEM;
- rc = __ceph_osdc_start_request(osdc, req, nofail);
+ ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid);
+ ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc);
+ req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
+ req->r_mtime = CURRENT_TIME;
+ osd_req_op_watch_init(req, 0, lreq->linger_id,
+ CEPH_OSD_WATCH_OP_UNWATCH);
- mutex_unlock(&osdc->request_mutex);
- up_read(&osdc->map_sem);
+ ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
+ if (ret)
+ goto out_put_req;
- return rc;
+ ceph_osdc_start_request(osdc, req, false);
+ linger_cancel(lreq);
+ linger_put(lreq);
+ ret = wait_request_timeout(req, opts->mount_timeout);
+
+out_put_req:
+ ceph_osdc_put_request(req);
+ return ret;
}
-EXPORT_SYMBOL(ceph_osdc_start_request);
+EXPORT_SYMBOL(ceph_osdc_unwatch);
-/*
- * Unregister a registered request. The request is not completed (i.e.
- * no callbacks or wakeups) - higher layers are supposed to know what
- * they are canceling.
- */
-void ceph_osdc_cancel_request(struct ceph_osd_request *req)
+static int osd_req_op_notify_ack_init(struct ceph_osd_request *req, int which,
+ u64 notify_id, u64 cookie, void *payload,
+ size_t payload_len)
{
- struct ceph_osd_client *osdc = req->r_osdc;
+ struct ceph_osd_req_op *op;
+ struct ceph_pagelist *pl;
+ int ret;
+
+ op = _osd_req_op_init(req, which, CEPH_OSD_OP_NOTIFY_ACK, 0);
+
+ pl = kmalloc(sizeof(*pl), GFP_NOIO);
+ if (!pl)
+ return -ENOMEM;
- mutex_lock(&osdc->request_mutex);
- if (req->r_linger)
- __unregister_linger_request(osdc, req);
- __unregister_request(osdc, req);
- mutex_unlock(&osdc->request_mutex);
+ ceph_pagelist_init(pl);
+ ret = ceph_pagelist_encode_64(pl, notify_id);
+ ret |= ceph_pagelist_encode_64(pl, cookie);
+ if (payload) {
+ ret |= ceph_pagelist_encode_32(pl, payload_len);
+ ret |= ceph_pagelist_append(pl, payload, payload_len);
+ } else {
+ ret |= ceph_pagelist_encode_32(pl, 0);
+ }
+ if (ret) {
+ ceph_pagelist_release(pl);
+ return -ENOMEM;
+ }
- dout("%s %p tid %llu canceled\n", __func__, req, req->r_tid);
+ ceph_osd_data_pagelist_init(&op->notify_ack.request_data, pl);
+ op->indata_len = pl->length;
+ return 0;
}
-EXPORT_SYMBOL(ceph_osdc_cancel_request);
-/*
- * wait for a request to complete
- */
-int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
- struct ceph_osd_request *req)
+int ceph_osdc_notify_ack(struct ceph_osd_client *osdc,
+ struct ceph_object_id *oid,
+ struct ceph_object_locator *oloc,
+ u64 notify_id,
+ u64 cookie,
+ void *payload,
+ size_t payload_len)
+{
+ struct ceph_osd_request *req;
+ int ret;
+
+ req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO);
+ if (!req)
+ return -ENOMEM;
+
+ ceph_oid_copy(&req->r_base_oid, oid);
+ ceph_oloc_copy(&req->r_base_oloc, oloc);
+ req->r_flags = CEPH_OSD_FLAG_READ;
+
+ ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
+ if (ret)
+ goto out_put_req;
+
+ ret = osd_req_op_notify_ack_init(req, 0, notify_id, cookie, payload,
+ payload_len);
+ if (ret)
+ goto out_put_req;
+
+ ceph_osdc_start_request(osdc, req, false);
+ ret = ceph_osdc_wait_request(osdc, req);
+
+out_put_req:
+ ceph_osdc_put_request(req);
+ return ret;
+}
+EXPORT_SYMBOL(ceph_osdc_notify_ack);
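/*
 * Illustrative sketch, not part of the patch: a watcher normally
 * acknowledges each notify so that the notifier's ceph_osdc_notify()
 * can complete.  oid/oloc stand for the watched object and, like the
 * helper name, are assumptions of this sketch.
 */
static void example_ack_notify(struct ceph_osd_client *osdc,
			       struct ceph_object_id *oid,
			       struct ceph_object_locator *oloc,
			       u64 notify_id, u64 cookie)
{
	int ret;

	/* empty payload - just tell the OSD this watcher saw the notify */
	ret = ceph_osdc_notify_ack(osdc, oid, oloc, notify_id, cookie,
				   NULL, 0);
	if (ret)
		pr_err("notify_ack failed: %d\n", ret);
}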
+
+static int osd_req_op_notify_init(struct ceph_osd_request *req, int which,
+ u64 cookie, u32 prot_ver, u32 timeout,
+ void *payload, size_t payload_len)
{
- int rc;
+ struct ceph_osd_req_op *op;
+ struct ceph_pagelist *pl;
+ int ret;
- dout("%s %p tid %llu\n", __func__, req, req->r_tid);
+ op = _osd_req_op_init(req, which, CEPH_OSD_OP_NOTIFY, 0);
+ op->notify.cookie = cookie;
- rc = wait_for_completion_interruptible(&req->r_completion);
- if (rc < 0) {
- dout("%s %p tid %llu interrupted\n", __func__, req, req->r_tid);
- ceph_osdc_cancel_request(req);
- complete_request(req);
- return rc;
+ pl = kmalloc(sizeof(*pl), GFP_NOIO);
+ if (!pl)
+ return -ENOMEM;
+
+ ceph_pagelist_init(pl);
+ ret = ceph_pagelist_encode_32(pl, 1); /* prot_ver */
+ ret |= ceph_pagelist_encode_32(pl, timeout);
+ ret |= ceph_pagelist_encode_32(pl, payload_len);
+ ret |= ceph_pagelist_append(pl, payload, payload_len);
+ if (ret) {
+ ceph_pagelist_release(pl);
+ return -ENOMEM;
}
- dout("%s %p tid %llu result %d\n", __func__, req, req->r_tid,
- req->r_result);
- return req->r_result;
+ ceph_osd_data_pagelist_init(&op->notify.request_data, pl);
+ op->indata_len = pl->length;
+ return 0;
}
-EXPORT_SYMBOL(ceph_osdc_wait_request);
/*
- * sync - wait for all in-flight requests to flush. avoid starvation.
+ * @timeout: in seconds
+ *
+ * @preply_{pages,len} are initialized both on success and error.
+ * The caller is responsible for:
+ *
+ * ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len))
*/
-void ceph_osdc_sync(struct ceph_osd_client *osdc)
+int ceph_osdc_notify(struct ceph_osd_client *osdc,
+ struct ceph_object_id *oid,
+ struct ceph_object_locator *oloc,
+ void *payload,
+ size_t payload_len,
+ u32 timeout,
+ struct page ***preply_pages,
+ size_t *preply_len)
{
- struct ceph_osd_request *req;
- u64 last_tid, next_tid = 0;
+ struct ceph_osd_linger_request *lreq;
+ struct page **pages;
+ int ret;
- mutex_lock(&osdc->request_mutex);
- last_tid = osdc->last_tid;
- while (1) {
- req = __lookup_request_ge(osdc, next_tid);
- if (!req)
- break;
- if (req->r_tid > last_tid)
- break;
+ WARN_ON(!timeout);
+ if (preply_pages) {
+ *preply_pages = NULL;
+ *preply_len = 0;
+ }
- next_tid = req->r_tid + 1;
- if ((req->r_flags & CEPH_OSD_FLAG_WRITE) == 0)
- continue;
+ lreq = linger_alloc(osdc);
+ if (!lreq)
+ return -ENOMEM;
- ceph_osdc_get_request(req);
- mutex_unlock(&osdc->request_mutex);
- dout("sync waiting on tid %llu (last is %llu)\n",
- req->r_tid, last_tid);
- wait_for_completion(&req->r_safe_completion);
- mutex_lock(&osdc->request_mutex);
- ceph_osdc_put_request(req);
+ lreq->preply_pages = preply_pages;
+ lreq->preply_len = preply_len;
+
+ ceph_oid_copy(&lreq->t.base_oid, oid);
+ ceph_oloc_copy(&lreq->t.base_oloc, oloc);
+ lreq->t.flags = CEPH_OSD_FLAG_READ;
+
+ lreq->reg_req = alloc_linger_request(lreq);
+ if (!lreq->reg_req) {
+ ret = -ENOMEM;
+ goto out_put_lreq;
}
- mutex_unlock(&osdc->request_mutex);
- dout("sync done (thru tid %llu)\n", last_tid);
+
+ /* for notify_id */
+ pages = ceph_alloc_page_vector(1, GFP_NOIO);
+ if (IS_ERR(pages)) {
+ ret = PTR_ERR(pages);
+ goto out_put_lreq;
+ }
+
+ down_write(&osdc->lock);
+ linger_register(lreq); /* before osd_req_op_* */
+ ret = osd_req_op_notify_init(lreq->reg_req, 0, lreq->linger_id, 1,
+ timeout, payload, payload_len);
+ if (ret) {
+ linger_unregister(lreq);
+ up_write(&osdc->lock);
+ ceph_release_page_vector(pages, 1);
+ goto out_put_lreq;
+ }
+ ceph_osd_data_pages_init(osd_req_op_data(lreq->reg_req, 0, notify,
+ response_data),
+ pages, PAGE_SIZE, 0, false, true);
+ linger_submit(lreq);
+ up_write(&osdc->lock);
+
+ ret = linger_reg_commit_wait(lreq);
+ if (!ret)
+ ret = linger_notify_finish_wait(lreq);
+ else
+ dout("lreq %p failed to initiate notify %d\n", lreq, ret);
+
+ linger_cancel(lreq);
+out_put_lreq:
+ linger_put(lreq);
+ return ret;
+}
+EXPORT_SYMBOL(ceph_osdc_notify);
+
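A minimal usage sketch for the function above (not from the patch), mainly to show the release contract spelled out in its comment: the reply page vector must be freed by the caller whether or not the notify succeeded. The five-second timeout and NULL payload are arbitrary choices.

    static int my_notify(struct ceph_osd_client *osdc,
                         struct ceph_object_id *oid,
                         struct ceph_object_locator *oloc)
    {
            struct page **reply_pages;
            size_t reply_len;
            int ret;

            ret = ceph_osdc_notify(osdc, oid, oloc, NULL, 0, 5,
                                   &reply_pages, &reply_len);

            /* ... on success, decode the notify replies in reply_pages ... */

            /* required on both success and error, per the comment above */
            ceph_release_page_vector(reply_pages,
                                     calc_pages_for(0, reply_len));
            return ret;
    }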
+/*
+ * Return the number of milliseconds since the watch was last
+ * confirmed, or an error. If there is an error, the watch is no
+ * longer valid, and should be destroyed with ceph_osdc_unwatch().
+ */
+int ceph_osdc_watch_check(struct ceph_osd_client *osdc,
+ struct ceph_osd_linger_request *lreq)
+{
+ unsigned long stamp, age;
+ int ret;
+
+ down_read(&osdc->lock);
+ mutex_lock(&lreq->lock);
+ stamp = lreq->watch_valid_thru;
+ if (!list_empty(&lreq->pending_lworks)) {
+ struct linger_work *lwork =
+ list_first_entry(&lreq->pending_lworks,
+ struct linger_work,
+ pending_item);
+
+ if (time_before(lwork->queued_stamp, stamp))
+ stamp = lwork->queued_stamp;
+ }
+ age = jiffies - stamp;
+ dout("%s lreq %p linger_id %llu age %lu last_error %d\n", __func__,
+ lreq, lreq->linger_id, age, lreq->last_error);
+ /* we are truncating to msecs, so return a safe upper bound */
+ ret = lreq->last_error ?: 1 + jiffies_to_msecs(age);
+
+ mutex_unlock(&lreq->lock);
+ up_read(&osdc->lock);
+ return ret;
}
-EXPORT_SYMBOL(ceph_osdc_sync);
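A hedged sketch of how a caller might consume the return value described above; ceph_osdc_unwatch() is assumed to take the same (osdc, lreq) pair.

    static void my_check_watch(struct ceph_osd_client *osdc,
                               struct ceph_osd_linger_request *watch)
    {
            int ret = ceph_osdc_watch_check(osdc, watch);

            if (ret < 0) {
                    /* watch is no longer valid - tear it down and re-watch */
                    pr_warn("watch lost: %d\n", ret);
                    ceph_osdc_unwatch(osdc, watch);
                    /* ... ceph_osdc_watch() again ... */
            } else {
                    pr_debug("watch confirmed %d ms ago\n", ret);
            }
    }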
/*
* Call all pending notify callbacks - for use after a watch is
@@ -2646,6 +3868,13 @@ void ceph_osdc_flush_notifies(struct ceph_osd_client *osdc)
}
EXPORT_SYMBOL(ceph_osdc_flush_notifies);
+void ceph_osdc_maybe_request_map(struct ceph_osd_client *osdc)
+{
+ down_read(&osdc->lock);
+ maybe_request_map(osdc);
+ up_read(&osdc->lock);
+}
+EXPORT_SYMBOL(ceph_osdc_maybe_request_map);
/*
* init, shutdown
@@ -2656,43 +3885,35 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
dout("init\n");
osdc->client = client;
- osdc->osdmap = NULL;
- init_rwsem(&osdc->map_sem);
- init_completion(&osdc->map_waiters);
- osdc->last_requested_map = 0;
- mutex_init(&osdc->request_mutex);
- osdc->last_tid = 0;
+ init_rwsem(&osdc->lock);
osdc->osds = RB_ROOT;
INIT_LIST_HEAD(&osdc->osd_lru);
- osdc->requests = RB_ROOT;
- INIT_LIST_HEAD(&osdc->req_lru);
- INIT_LIST_HEAD(&osdc->req_unsent);
- INIT_LIST_HEAD(&osdc->req_notarget);
- INIT_LIST_HEAD(&osdc->req_linger);
- osdc->num_requests = 0;
+ spin_lock_init(&osdc->osd_lru_lock);
+ osd_init(&osdc->homeless_osd);
+ osdc->homeless_osd.o_osdc = osdc;
+ osdc->homeless_osd.o_osd = CEPH_HOMELESS_OSD;
+ osdc->linger_requests = RB_ROOT;
+ osdc->map_checks = RB_ROOT;
+ osdc->linger_map_checks = RB_ROOT;
INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout);
INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout);
- spin_lock_init(&osdc->event_lock);
- osdc->event_tree = RB_ROOT;
- osdc->event_count = 0;
-
- schedule_delayed_work(&osdc->osds_timeout_work,
- round_jiffies_relative(osdc->client->options->osd_idle_ttl));
err = -ENOMEM;
+ osdc->osdmap = ceph_osdmap_alloc();
+ if (!osdc->osdmap)
+ goto out;
+
osdc->req_mempool = mempool_create_slab_pool(10,
ceph_osd_request_cache);
if (!osdc->req_mempool)
- goto out;
+ goto out_map;
err = ceph_msgpool_init(&osdc->msgpool_op, CEPH_MSG_OSD_OP,
- OSD_OP_FRONT_LEN, 10, true,
- "osd_op");
+ PAGE_SIZE, 10, true, "osd_op");
if (err < 0)
goto out_mempool;
err = ceph_msgpool_init(&osdc->msgpool_op_reply, CEPH_MSG_OSD_OPREPLY,
- OSD_OPREPLY_FRONT_LEN, 10, true,
- "osd_op_reply");
+ PAGE_SIZE, 10, true, "osd_op_reply");
if (err < 0)
goto out_msgpool;
@@ -2701,6 +3922,11 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
if (!osdc->notify_wq)
goto out_msgpool_reply;
+ schedule_delayed_work(&osdc->timeout_work,
+ osdc->client->options->osd_keepalive_timeout);
+ schedule_delayed_work(&osdc->osds_timeout_work,
+ round_jiffies_relative(osdc->client->options->osd_idle_ttl));
+
return 0;
out_msgpool_reply:
@@ -2709,6 +3935,8 @@ out_msgpool:
ceph_msgpool_destroy(&osdc->msgpool_op);
out_mempool:
mempool_destroy(osdc->req_mempool);
+out_map:
+ ceph_osdmap_destroy(osdc->osdmap);
out:
return err;
}
@@ -2719,11 +3947,25 @@ void ceph_osdc_stop(struct ceph_osd_client *osdc)
destroy_workqueue(osdc->notify_wq);
cancel_delayed_work_sync(&osdc->timeout_work);
cancel_delayed_work_sync(&osdc->osds_timeout_work);
- if (osdc->osdmap) {
- ceph_osdmap_destroy(osdc->osdmap);
- osdc->osdmap = NULL;
+
+ down_write(&osdc->lock);
+ while (!RB_EMPTY_ROOT(&osdc->osds)) {
+ struct ceph_osd *osd = rb_entry(rb_first(&osdc->osds),
+ struct ceph_osd, o_node);
+ close_osd(osd);
}
- remove_all_osds(osdc);
+ up_write(&osdc->lock);
+ WARN_ON(atomic_read(&osdc->homeless_osd.o_ref) != 1);
+ osd_cleanup(&osdc->homeless_osd);
+
+ WARN_ON(!list_empty(&osdc->osd_lru));
+ WARN_ON(!RB_EMPTY_ROOT(&osdc->linger_requests));
+ WARN_ON(!RB_EMPTY_ROOT(&osdc->map_checks));
+ WARN_ON(!RB_EMPTY_ROOT(&osdc->linger_map_checks));
+ WARN_ON(atomic_read(&osdc->num_requests));
+ WARN_ON(atomic_read(&osdc->num_homeless));
+
+ ceph_osdmap_destroy(osdc->osdmap);
mempool_destroy(osdc->req_mempool);
ceph_msgpool_destroy(&osdc->msgpool_op);
ceph_msgpool_destroy(&osdc->msgpool_op_reply);
@@ -2752,15 +3994,12 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc,
return PTR_ERR(req);
/* it may be a short read due to an object boundary */
-
osd_req_op_extent_osd_data_pages(req, 0,
pages, *plen, page_align, false, false);
dout("readpages final extent is %llu~%llu (%llu bytes align %d)\n",
off, *plen, *plen, page_align);
- ceph_osdc_build_request(req, off, NULL, vino.snap, NULL);
-
rc = ceph_osdc_start_request(osdc, req, false);
if (!rc)
rc = ceph_osdc_wait_request(osdc, req);
@@ -2786,7 +4025,6 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
int rc = 0;
int page_align = off & ~PAGE_MASK;
- BUG_ON(vino.snap != CEPH_NOSNAP); /* snapshots aren't writeable */
req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 0, 1,
CEPH_OSD_OP_WRITE,
CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE,
@@ -2800,8 +4038,7 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
false, false);
dout("writepages %llu~%llu (%llu bytes)\n", off, len, len);
- ceph_osdc_build_request(req, off, snapc, CEPH_NOSNAP, mtime);
-
+ req->r_mtime = *mtime;
rc = ceph_osdc_start_request(osdc, req, true);
if (!rc)
rc = ceph_osdc_wait_request(osdc, req);
@@ -2841,19 +4078,15 @@ EXPORT_SYMBOL(ceph_osdc_cleanup);
static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
{
struct ceph_osd *osd = con->private;
- struct ceph_osd_client *osdc;
+ struct ceph_osd_client *osdc = osd->o_osdc;
int type = le16_to_cpu(msg->hdr.type);
- if (!osd)
- goto out;
- osdc = osd->o_osdc;
-
switch (type) {
case CEPH_MSG_OSD_MAP:
ceph_osdc_handle_map(osdc, msg);
break;
case CEPH_MSG_OSD_OPREPLY:
- handle_reply(osdc, msg);
+ handle_reply(osd, msg);
break;
case CEPH_MSG_WATCH_NOTIFY:
handle_watch_notify(osdc, msg);
@@ -2863,7 +4096,7 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
pr_err("received unknown message type %d %s\n", type,
ceph_msg_type_name(type));
}
-out:
+
ceph_msg_put(msg);
}
@@ -2878,21 +4111,27 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
{
struct ceph_osd *osd = con->private;
struct ceph_osd_client *osdc = osd->o_osdc;
- struct ceph_msg *m;
+ struct ceph_msg *m = NULL;
struct ceph_osd_request *req;
int front_len = le32_to_cpu(hdr->front_len);
int data_len = le32_to_cpu(hdr->data_len);
- u64 tid;
+ u64 tid = le64_to_cpu(hdr->tid);
- tid = le64_to_cpu(hdr->tid);
- mutex_lock(&osdc->request_mutex);
- req = __lookup_request(osdc, tid);
+ down_read(&osdc->lock);
+ if (!osd_registered(osd)) {
+ dout("%s osd%d unknown, skipping\n", __func__, osd->o_osd);
+ *skip = 1;
+ goto out_unlock_osdc;
+ }
+ WARN_ON(osd->o_osd != le64_to_cpu(hdr->src.num));
+
+ mutex_lock(&osd->lock);
+ req = lookup_request(&osd->o_requests, tid);
if (!req) {
dout("%s osd%d tid %llu unknown, skipping\n", __func__,
osd->o_osd, tid);
- m = NULL;
*skip = 1;
- goto out;
+ goto out_unlock_session;
}
ceph_msg_revoke_incoming(req->r_reply);
@@ -2904,7 +4143,7 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front_len, GFP_NOFS,
false);
if (!m)
- goto out;
+ goto out_unlock_session;
ceph_msg_put(req->r_reply);
req->r_reply = m;
}
@@ -2915,14 +4154,49 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
req->r_reply->data_length);
m = NULL;
*skip = 1;
- goto out;
+ goto out_unlock_session;
}
m = ceph_msg_get(req->r_reply);
dout("get_reply tid %lld %p\n", tid, m);
-out:
- mutex_unlock(&osdc->request_mutex);
+out_unlock_session:
+ mutex_unlock(&osd->lock);
+out_unlock_osdc:
+ up_read(&osdc->lock);
+ return m;
+}
+
+/*
+ * TODO: switch to a msg-owned pagelist
+ */
+static struct ceph_msg *alloc_msg_with_page_vector(struct ceph_msg_header *hdr)
+{
+ struct ceph_msg *m;
+ int type = le16_to_cpu(hdr->type);
+ u32 front_len = le32_to_cpu(hdr->front_len);
+ u32 data_len = le32_to_cpu(hdr->data_len);
+
+ m = ceph_msg_new(type, front_len, GFP_NOIO, false);
+ if (!m)
+ return NULL;
+
+ if (data_len) {
+ struct page **pages;
+ struct ceph_osd_data osd_data;
+
+ pages = ceph_alloc_page_vector(calc_pages_for(0, data_len),
+ GFP_NOIO);
+ if (!pages) {
+ ceph_msg_put(m);
+ return NULL;
+ }
+
+ ceph_osd_data_pages_init(&osd_data, pages, data_len, 0, false,
+ false);
+ ceph_osdc_msg_data_add(m, &osd_data);
+ }
+
return m;
}
@@ -2932,18 +4206,17 @@ static struct ceph_msg *alloc_msg(struct ceph_connection *con,
{
struct ceph_osd *osd = con->private;
int type = le16_to_cpu(hdr->type);
- int front = le32_to_cpu(hdr->front_len);
*skip = 0;
switch (type) {
case CEPH_MSG_OSD_MAP:
case CEPH_MSG_WATCH_NOTIFY:
- return ceph_msg_new(type, front, GFP_NOFS, false);
+ return alloc_msg_with_page_vector(hdr);
case CEPH_MSG_OSD_OPREPLY:
return get_reply(con, hdr, skip);
default:
- pr_info("alloc_msg unexpected msg type %d from osd%d\n", type,
- osd->o_osd);
+ pr_warn("%s osd%d unknown msg type %d, skipping\n", __func__,
+ osd->o_osd, type);
*skip = 1;
return NULL;
}
@@ -3047,5 +4320,5 @@ static const struct ceph_connection_operations osd_con_ops = {
.alloc_msg = alloc_msg,
.sign_message = osd_sign_message,
.check_message_signature = osd_check_message_signature,
- .fault = osd_reset,
+ .fault = osd_fault,
};
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 243574c8cf33..03062bb763b3 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -380,23 +380,24 @@ bad:
return ERR_PTR(err);
}
-/*
- * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
- * to a set of osds) and primary_temp (explicit primary setting)
- */
-static int pgid_cmp(struct ceph_pg l, struct ceph_pg r)
+int ceph_pg_compare(const struct ceph_pg *lhs, const struct ceph_pg *rhs)
{
- if (l.pool < r.pool)
+ if (lhs->pool < rhs->pool)
return -1;
- if (l.pool > r.pool)
+ if (lhs->pool > rhs->pool)
return 1;
- if (l.seed < r.seed)
+ if (lhs->seed < rhs->seed)
return -1;
- if (l.seed > r.seed)
+ if (lhs->seed > rhs->seed)
return 1;
+
return 0;
}
+/*
+ * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
+ * to a set of osds) and primary_temp (explicit primary setting)
+ */
static int __insert_pg_mapping(struct ceph_pg_mapping *new,
struct rb_root *root)
{
@@ -409,7 +410,7 @@ static int __insert_pg_mapping(struct ceph_pg_mapping *new,
while (*p) {
parent = *p;
pg = rb_entry(parent, struct ceph_pg_mapping, node);
- c = pgid_cmp(new->pgid, pg->pgid);
+ c = ceph_pg_compare(&new->pgid, &pg->pgid);
if (c < 0)
p = &(*p)->rb_left;
else if (c > 0)
@@ -432,7 +433,7 @@ static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root,
while (n) {
pg = rb_entry(n, struct ceph_pg_mapping, node);
- c = pgid_cmp(pgid, pg->pgid);
+ c = ceph_pg_compare(&pgid, &pg->pgid);
if (c < 0) {
n = n->rb_left;
} else if (c > 0) {
@@ -596,7 +597,9 @@ static int decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi)
*p += 4; /* skip crash_replay_interval */
if (ev >= 7)
- *p += 1; /* skip min_size */
+ pi->min_size = ceph_decode_8(p);
+ else
+ pi->min_size = pi->size - pi->size / 2;
if (ev >= 8)
*p += 8 + 8; /* skip quota_max_* */
@@ -616,6 +619,50 @@ static int decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi)
pi->write_tier = -1;
}
+ if (ev >= 10) {
+ /* skip properties */
+ num = ceph_decode_32(p);
+ while (num--) {
+ len = ceph_decode_32(p);
+ *p += len; /* key */
+ len = ceph_decode_32(p);
+ *p += len; /* val */
+ }
+ }
+
+ if (ev >= 11) {
+ /* skip hit_set_params */
+ *p += 1 + 1; /* versions */
+ len = ceph_decode_32(p);
+ *p += len;
+
+ *p += 4; /* skip hit_set_period */
+ *p += 4; /* skip hit_set_count */
+ }
+
+ if (ev >= 12)
+ *p += 4; /* skip stripe_width */
+
+ if (ev >= 13) {
+ *p += 8; /* skip target_max_bytes */
+ *p += 8; /* skip target_max_objects */
+ *p += 4; /* skip cache_target_dirty_ratio_micro */
+ *p += 4; /* skip cache_target_full_ratio_micro */
+ *p += 4; /* skip cache_min_flush_age */
+ *p += 4; /* skip cache_min_evict_age */
+ }
+
+ if (ev >= 14) {
+ /* skip erasure_code_profile */
+ len = ceph_decode_32(p);
+ *p += len;
+ }
+
+ if (ev >= 15)
+ pi->last_force_request_resend = ceph_decode_32(p);
+ else
+ pi->last_force_request_resend = 0;
+
/* ignore the rest */
*p = pool_end;
@@ -660,6 +707,23 @@ bad:
/*
* osd map
*/
+struct ceph_osdmap *ceph_osdmap_alloc(void)
+{
+ struct ceph_osdmap *map;
+
+ map = kzalloc(sizeof(*map), GFP_NOIO);
+ if (!map)
+ return NULL;
+
+ map->pg_pools = RB_ROOT;
+ map->pool_max = -1;
+ map->pg_temp = RB_ROOT;
+ map->primary_temp = RB_ROOT;
+ mutex_init(&map->crush_scratch_mutex);
+
+ return map;
+}
+
void ceph_osdmap_destroy(struct ceph_osdmap *map)
{
dout("osdmap_destroy %p\n", map);
@@ -1183,14 +1247,10 @@ struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end)
struct ceph_osdmap *map;
int ret;
- map = kzalloc(sizeof(*map), GFP_NOFS);
+ map = ceph_osdmap_alloc();
if (!map)
return ERR_PTR(-ENOMEM);
- map->pg_temp = RB_ROOT;
- map->primary_temp = RB_ROOT;
- mutex_init(&map->crush_scratch_mutex);
-
ret = osdmap_decode(p, end, map);
if (ret) {
ceph_osdmap_destroy(map);
@@ -1204,8 +1264,7 @@ struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end)
* decode and apply an incremental map update.
*/
struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
- struct ceph_osdmap *map,
- struct ceph_messenger *msgr)
+ struct ceph_osdmap *map)
{
struct crush_map *newcrush = NULL;
struct ceph_fsid fsid;
@@ -1381,8 +1440,252 @@ bad:
return ERR_PTR(err);
}
+void ceph_oid_copy(struct ceph_object_id *dest,
+ const struct ceph_object_id *src)
+{
+ WARN_ON(!ceph_oid_empty(dest));
+
+ if (src->name != src->inline_name) {
+ /* very rare, see ceph_object_id definition */
+ dest->name = kmalloc(src->name_len + 1,
+ GFP_NOIO | __GFP_NOFAIL);
+ }
+
+ memcpy(dest->name, src->name, src->name_len + 1);
+ dest->name_len = src->name_len;
+}
+EXPORT_SYMBOL(ceph_oid_copy);
+
+static __printf(2, 0)
+int oid_printf_vargs(struct ceph_object_id *oid, const char *fmt, va_list ap)
+{
+ int len;
+
+ WARN_ON(!ceph_oid_empty(oid));
+
+ len = vsnprintf(oid->inline_name, sizeof(oid->inline_name), fmt, ap);
+ if (len >= sizeof(oid->inline_name))
+ return len;
+
+ oid->name_len = len;
+ return 0;
+}
+
+/*
+ * If oid doesn't fit into inline buffer, BUG.
+ */
+void ceph_oid_printf(struct ceph_object_id *oid, const char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ BUG_ON(oid_printf_vargs(oid, fmt, ap));
+ va_end(ap);
+}
+EXPORT_SYMBOL(ceph_oid_printf);
+
+static __printf(3, 0)
+int oid_aprintf_vargs(struct ceph_object_id *oid, gfp_t gfp,
+ const char *fmt, va_list ap)
+{
+ va_list aq;
+ int len;
+
+ va_copy(aq, ap);
+ len = oid_printf_vargs(oid, fmt, aq);
+ va_end(aq);
+
+ if (len) {
+ char *external_name;
+
+ external_name = kmalloc(len + 1, gfp);
+ if (!external_name)
+ return -ENOMEM;
+
+ oid->name = external_name;
+ WARN_ON(vsnprintf(oid->name, len + 1, fmt, ap) != len);
+ oid->name_len = len;
+ }
+
+ return 0;
+}
+
+/*
+ * If oid doesn't fit into inline buffer, allocate.
+ */
+int ceph_oid_aprintf(struct ceph_object_id *oid, gfp_t gfp,
+ const char *fmt, ...)
+{
+ va_list ap;
+ int ret;
+
+ va_start(ap, fmt);
+ ret = oid_aprintf_vargs(oid, gfp, fmt, ap);
+ va_end(ap);
+
+ return ret;
+}
+EXPORT_SYMBOL(ceph_oid_aprintf);
+
+void ceph_oid_destroy(struct ceph_object_id *oid)
+{
+ if (oid->name != oid->inline_name)
+ kfree(oid->name);
+}
+EXPORT_SYMBOL(ceph_oid_destroy);
+
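A short sketch tying the ceph_object_id helpers together (illustration only). It assumes a ceph_oid_init() helper that leaves the id empty with ->name pointing at the inline buffer, which is what the WARN_ON(!ceph_oid_empty(...)) checks above expect.

    static int my_build_oid(struct ceph_object_id *oid, u64 image_id)
    {
            struct ceph_object_id tmp;
            int ret;

            ceph_oid_init(&tmp);                    /* assumed helper */
            ret = ceph_oid_aprintf(&tmp, GFP_NOIO, "obj.%016llx", image_id);
            if (ret)
                    return ret;     /* -ENOMEM: external name allocation failed */

            ceph_oid_copy(oid, &tmp);               /* oid must still be empty */
            ceph_oid_destroy(&tmp);                 /* frees external name, if any */
            return 0;
    }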
+/*
+ * osds only
+ */
+static bool __osds_equal(const struct ceph_osds *lhs,
+ const struct ceph_osds *rhs)
+{
+ if (lhs->size == rhs->size &&
+ !memcmp(lhs->osds, rhs->osds, rhs->size * sizeof(rhs->osds[0])))
+ return true;
+
+ return false;
+}
+
+/*
+ * osds + primary
+ */
+static bool osds_equal(const struct ceph_osds *lhs,
+ const struct ceph_osds *rhs)
+{
+ if (__osds_equal(lhs, rhs) &&
+ lhs->primary == rhs->primary)
+ return true;
+
+ return false;
+}
+
+static bool osds_valid(const struct ceph_osds *set)
+{
+ /* non-empty set */
+ if (set->size > 0 && set->primary >= 0)
+ return true;
+
+ /* empty can_shift_osds set */
+ if (!set->size && set->primary == -1)
+ return true;
+
+ /* empty !can_shift_osds set - all NONE */
+ if (set->size > 0 && set->primary == -1) {
+ int i;
+
+ for (i = 0; i < set->size; i++) {
+ if (set->osds[i] != CRUSH_ITEM_NONE)
+ break;
+ }
+ if (i == set->size)
+ return true;
+ }
+
+ return false;
+}
+
+void ceph_osds_copy(struct ceph_osds *dest, const struct ceph_osds *src)
+{
+ memcpy(dest->osds, src->osds, src->size * sizeof(src->osds[0]));
+ dest->size = src->size;
+ dest->primary = src->primary;
+}
+
+static bool is_split(const struct ceph_pg *pgid,
+ u32 old_pg_num,
+ u32 new_pg_num)
+{
+ int old_bits = calc_bits_of(old_pg_num);
+ int old_mask = (1 << old_bits) - 1;
+ int n;
+
+ WARN_ON(pgid->seed >= old_pg_num);
+ if (new_pg_num <= old_pg_num)
+ return false;
+
+ for (n = 1; ; n++) {
+ int next_bit = n << (old_bits - 1);
+ u32 s = next_bit | pgid->seed;
+
+ if (s < old_pg_num || s == pgid->seed)
+ continue;
+ if (s >= new_pg_num)
+ break;
+
+ s = ceph_stable_mod(s, old_pg_num, old_mask);
+ if (s == pgid->seed)
+ return true;
+ }
+
+ return false;
+}
+
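A concrete case for is_split() above (illustration only): growing a pool from pg_num 4 to pg_num 8 and asking about seed 1 gives old_bits = 3 and old_mask = 7; the first candidate child is s = (1 << 2) | 1 = 5, which is >= 4 and < 8, and ceph_stable_mod(5, 4, 7) folds it back to 1 == seed, so the PG is reported as split.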
+bool ceph_is_new_interval(const struct ceph_osds *old_acting,
+ const struct ceph_osds *new_acting,
+ const struct ceph_osds *old_up,
+ const struct ceph_osds *new_up,
+ int old_size,
+ int new_size,
+ int old_min_size,
+ int new_min_size,
+ u32 old_pg_num,
+ u32 new_pg_num,
+ bool old_sort_bitwise,
+ bool new_sort_bitwise,
+ const struct ceph_pg *pgid)
+{
+ return !osds_equal(old_acting, new_acting) ||
+ !osds_equal(old_up, new_up) ||
+ old_size != new_size ||
+ old_min_size != new_min_size ||
+ is_split(pgid, old_pg_num, new_pg_num) ||
+ old_sort_bitwise != new_sort_bitwise;
+}
+
+static int calc_pg_rank(int osd, const struct ceph_osds *acting)
+{
+ int i;
+
+ for (i = 0; i < acting->size; i++) {
+ if (acting->osds[i] == osd)
+ return i;
+ }
+
+ return -1;
+}
+
+static bool primary_changed(const struct ceph_osds *old_acting,
+ const struct ceph_osds *new_acting)
+{
+ if (!old_acting->size && !new_acting->size)
+ return false; /* both still empty */
+ if (!old_acting->size ^ !new_acting->size)
+ return true; /* was empty, now not, or vice versa */
+ if (old_acting->primary != new_acting->primary)
+ return true; /* primary changed */
+
+ if (calc_pg_rank(old_acting->primary, old_acting) !=
+ calc_pg_rank(new_acting->primary, new_acting))
+ return true;
+
+ return false; /* same primary (tho replicas may have changed) */
+}
+
+bool ceph_osds_changed(const struct ceph_osds *old_acting,
+ const struct ceph_osds *new_acting,
+ bool any_change)
+{
+ if (primary_changed(old_acting, new_acting))
+ return true;
+
+ if (any_change && !__osds_equal(old_acting, new_acting))
+ return true;
+
+ return false;
+}
/*
* calculate file layout from given offset, length.
@@ -1455,30 +1758,71 @@ invalid:
EXPORT_SYMBOL(ceph_calc_file_object_mapping);
/*
- * Calculate mapping of a (oloc, oid) pair to a PG. Should only be
- * called with target's (oloc, oid), since tiering isn't taken into
- * account.
+ * Map an object into a PG.
+ *
+ * Should only be called with target_oid and target_oloc (as opposed to
+ * base_oid and base_oloc), since tiering isn't taken into account.
*/
-int ceph_oloc_oid_to_pg(struct ceph_osdmap *osdmap,
- struct ceph_object_locator *oloc,
- struct ceph_object_id *oid,
- struct ceph_pg *pg_out)
+int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap,
+ struct ceph_object_id *oid,
+ struct ceph_object_locator *oloc,
+ struct ceph_pg *raw_pgid)
{
struct ceph_pg_pool_info *pi;
- pi = __lookup_pg_pool(&osdmap->pg_pools, oloc->pool);
+ pi = ceph_pg_pool_by_id(osdmap, oloc->pool);
if (!pi)
- return -EIO;
+ return -ENOENT;
- pg_out->pool = oloc->pool;
- pg_out->seed = ceph_str_hash(pi->object_hash, oid->name,
- oid->name_len);
+ raw_pgid->pool = oloc->pool;
+ raw_pgid->seed = ceph_str_hash(pi->object_hash, oid->name,
+ oid->name_len);
- dout("%s '%.*s' pgid %llu.%x\n", __func__, oid->name_len, oid->name,
- pg_out->pool, pg_out->seed);
+ dout("%s %s -> raw_pgid %llu.%x\n", __func__, oid->name,
+ raw_pgid->pool, raw_pgid->seed);
return 0;
}
-EXPORT_SYMBOL(ceph_oloc_oid_to_pg);
+EXPORT_SYMBOL(ceph_object_locator_to_pg);
+
+/*
+ * Map a raw PG (full precision ps) into an actual PG.
+ */
+static void raw_pg_to_pg(struct ceph_pg_pool_info *pi,
+ const struct ceph_pg *raw_pgid,
+ struct ceph_pg *pgid)
+{
+ pgid->pool = raw_pgid->pool;
+ pgid->seed = ceph_stable_mod(raw_pgid->seed, pi->pg_num,
+ pi->pg_num_mask);
+}
+
+/*
+ * Map a raw PG (full precision ps) into a placement ps (placement
+ * seed). Include pool id in that value so that different pools don't
+ * use the same seeds.
+ */
+static u32 raw_pg_to_pps(struct ceph_pg_pool_info *pi,
+ const struct ceph_pg *raw_pgid)
+{
+ if (pi->flags & CEPH_POOL_FLAG_HASHPSPOOL) {
+ /* hash pool id and seed so that pool PGs do not overlap */
+ return crush_hash32_2(CRUSH_HASH_RJENKINS1,
+ ceph_stable_mod(raw_pgid->seed,
+ pi->pgp_num,
+ pi->pgp_num_mask),
+ raw_pgid->pool);
+ } else {
+ /*
+ * legacy behavior: add ps and pool together. this is
+ * not a great approach because the PGs from each pool
+ * will overlap on top of each other: 0.5 == 1.4 ==
+ * 2.3 == ...
+ */
+ return ceph_stable_mod(raw_pgid->seed, pi->pgp_num,
+ pi->pgp_num_mask) +
+ (unsigned)raw_pgid->pool;
+ }
+}
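To make the overlap in the legacy branch concrete (illustration only): with pgp_num = 8 and pgp_num_mask = 7, pool 0 seed 5, pool 1 seed 4 and pool 2 seed 3 all produce pps = 5 (5 + 0, 4 + 1, 3 + 2) - exactly the 0.5 == 1.4 == 2.3 collision the comment warns about, and the reason the HASHPSPOOL branch mixes the pool id into the hash instead.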
static int do_crush(struct ceph_osdmap *map, int ruleno, int x,
int *result, int result_max,
@@ -1497,84 +1841,92 @@ static int do_crush(struct ceph_osdmap *map, int ruleno, int x,
}
/*
- * Calculate raw (crush) set for given pgid.
+ * Calculate raw set (CRUSH output) for given PG. The result may
+ * contain nonexistent OSDs. ->primary is undefined for a raw set.
*
- * Return raw set length, or error.
+ * Placement seed (CRUSH input) is returned through @ppps.
*/
-static int pg_to_raw_osds(struct ceph_osdmap *osdmap,
- struct ceph_pg_pool_info *pool,
- struct ceph_pg pgid, u32 pps, int *osds)
+static void pg_to_raw_osds(struct ceph_osdmap *osdmap,
+ struct ceph_pg_pool_info *pi,
+ const struct ceph_pg *raw_pgid,
+ struct ceph_osds *raw,
+ u32 *ppps)
{
+ u32 pps = raw_pg_to_pps(pi, raw_pgid);
int ruleno;
int len;
- /* crush */
- ruleno = crush_find_rule(osdmap->crush, pool->crush_ruleset,
- pool->type, pool->size);
+ ceph_osds_init(raw);
+ if (ppps)
+ *ppps = pps;
+
+ ruleno = crush_find_rule(osdmap->crush, pi->crush_ruleset, pi->type,
+ pi->size);
if (ruleno < 0) {
pr_err("no crush rule: pool %lld ruleset %d type %d size %d\n",
- pgid.pool, pool->crush_ruleset, pool->type,
- pool->size);
- return -ENOENT;
+ pi->id, pi->crush_ruleset, pi->type, pi->size);
+ return;
}
- len = do_crush(osdmap, ruleno, pps, osds,
- min_t(int, pool->size, CEPH_PG_MAX_SIZE),
+ len = do_crush(osdmap, ruleno, pps, raw->osds,
+ min_t(int, pi->size, ARRAY_SIZE(raw->osds)),
osdmap->osd_weight, osdmap->max_osd);
if (len < 0) {
pr_err("error %d from crush rule %d: pool %lld ruleset %d type %d size %d\n",
- len, ruleno, pgid.pool, pool->crush_ruleset,
- pool->type, pool->size);
- return len;
+ len, ruleno, pi->id, pi->crush_ruleset, pi->type,
+ pi->size);
+ return;
}
- return len;
+ raw->size = len;
}
/*
- * Given raw set, calculate up set and up primary.
+ * Given raw set, calculate up set and up primary. By definition of an
+ * up set, the result won't contain nonexistent or down OSDs.
*
- * Return up set length. *primary is set to up primary osd id, or -1
- * if up set is empty.
+ * This is done in-place - on return @set is the up set. If it's
+ * empty, ->primary will remain undefined.
*/
-static int raw_to_up_osds(struct ceph_osdmap *osdmap,
- struct ceph_pg_pool_info *pool,
- int *osds, int len, int *primary)
+static void raw_to_up_osds(struct ceph_osdmap *osdmap,
+ struct ceph_pg_pool_info *pi,
+ struct ceph_osds *set)
{
- int up_primary = -1;
int i;
- if (ceph_can_shift_osds(pool)) {
+ /* ->primary is undefined for a raw set */
+ BUG_ON(set->primary != -1);
+
+ if (ceph_can_shift_osds(pi)) {
int removed = 0;
- for (i = 0; i < len; i++) {
- if (ceph_osd_is_down(osdmap, osds[i])) {
+ /* shift left */
+ for (i = 0; i < set->size; i++) {
+ if (ceph_osd_is_down(osdmap, set->osds[i])) {
removed++;
continue;
}
if (removed)
- osds[i - removed] = osds[i];
+ set->osds[i - removed] = set->osds[i];
}
-
- len -= removed;
- if (len > 0)
- up_primary = osds[0];
+ set->size -= removed;
+ if (set->size > 0)
+ set->primary = set->osds[0];
} else {
- for (i = len - 1; i >= 0; i--) {
- if (ceph_osd_is_down(osdmap, osds[i]))
- osds[i] = CRUSH_ITEM_NONE;
+ /* set down/dne devices to NONE */
+ for (i = set->size - 1; i >= 0; i--) {
+ if (ceph_osd_is_down(osdmap, set->osds[i]))
+ set->osds[i] = CRUSH_ITEM_NONE;
else
- up_primary = osds[i];
+ set->primary = set->osds[i];
}
}
-
- *primary = up_primary;
- return len;
}
-static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps,
- struct ceph_pg_pool_info *pool,
- int *osds, int len, int *primary)
+static void apply_primary_affinity(struct ceph_osdmap *osdmap,
+ struct ceph_pg_pool_info *pi,
+ u32 pps,
+ struct ceph_osds *up)
{
int i;
int pos = -1;
@@ -1586,8 +1938,8 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps,
if (!osdmap->osd_primary_affinity)
return;
- for (i = 0; i < len; i++) {
- int osd = osds[i];
+ for (i = 0; i < up->size; i++) {
+ int osd = up->osds[i];
if (osd != CRUSH_ITEM_NONE &&
osdmap->osd_primary_affinity[osd] !=
@@ -1595,7 +1947,7 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps,
break;
}
}
- if (i == len)
+ if (i == up->size)
return;
/*
@@ -1603,8 +1955,8 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps,
* osd into the hash/rng so that a proportional fraction of an
* osd's pgs get rejected as primary.
*/
- for (i = 0; i < len; i++) {
- int osd = osds[i];
+ for (i = 0; i < up->size; i++) {
+ int osd = up->osds[i];
u32 aff;
if (osd == CRUSH_ITEM_NONE)
@@ -1629,135 +1981,110 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps,
if (pos < 0)
return;
- *primary = osds[pos];
+ up->primary = up->osds[pos];
- if (ceph_can_shift_osds(pool) && pos > 0) {
+ if (ceph_can_shift_osds(pi) && pos > 0) {
/* move the new primary to the front */
for (i = pos; i > 0; i--)
- osds[i] = osds[i - 1];
- osds[0] = *primary;
+ up->osds[i] = up->osds[i - 1];
+ up->osds[0] = up->primary;
}
}
/*
- * Given up set, apply pg_temp and primary_temp mappings.
+ * Get pg_temp and primary_temp mappings for given PG.
*
- * Return acting set length. *primary is set to acting primary osd id,
- * or -1 if acting set is empty.
+ * Note that a PG may have none, only pg_temp, only primary_temp or
+ * both pg_temp and primary_temp mappings. This means @temp isn't
+ * always a valid OSD set on return: in the "only primary_temp" case,
+ * @temp will have its ->primary >= 0 but ->size == 0.
*/
-static int apply_temps(struct ceph_osdmap *osdmap,
- struct ceph_pg_pool_info *pool, struct ceph_pg pgid,
- int *osds, int len, int *primary)
+static void get_temp_osds(struct ceph_osdmap *osdmap,
+ struct ceph_pg_pool_info *pi,
+ const struct ceph_pg *raw_pgid,
+ struct ceph_osds *temp)
{
+ struct ceph_pg pgid;
struct ceph_pg_mapping *pg;
- int temp_len;
- int temp_primary;
int i;
- /* raw_pg -> pg */
- pgid.seed = ceph_stable_mod(pgid.seed, pool->pg_num,
- pool->pg_num_mask);
+ raw_pg_to_pg(pi, raw_pgid, &pgid);
+ ceph_osds_init(temp);
/* pg_temp? */
pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid);
if (pg) {
- temp_len = 0;
- temp_primary = -1;
-
for (i = 0; i < pg->pg_temp.len; i++) {
if (ceph_osd_is_down(osdmap, pg->pg_temp.osds[i])) {
- if (ceph_can_shift_osds(pool))
+ if (ceph_can_shift_osds(pi))
continue;
- else
- osds[temp_len++] = CRUSH_ITEM_NONE;
+
+ temp->osds[temp->size++] = CRUSH_ITEM_NONE;
} else {
- osds[temp_len++] = pg->pg_temp.osds[i];
+ temp->osds[temp->size++] = pg->pg_temp.osds[i];
}
}
/* apply pg_temp's primary */
- for (i = 0; i < temp_len; i++) {
- if (osds[i] != CRUSH_ITEM_NONE) {
- temp_primary = osds[i];
+ for (i = 0; i < temp->size; i++) {
+ if (temp->osds[i] != CRUSH_ITEM_NONE) {
+ temp->primary = temp->osds[i];
break;
}
}
- } else {
- temp_len = len;
- temp_primary = *primary;
}
/* primary_temp? */
pg = __lookup_pg_mapping(&osdmap->primary_temp, pgid);
if (pg)
- temp_primary = pg->primary_temp.osd;
-
- *primary = temp_primary;
- return temp_len;
+ temp->primary = pg->primary_temp.osd;
}
/*
- * Calculate acting set for given pgid.
+ * Map a PG to its acting set as well as its up set.
*
- * Return acting set length, or error. *primary is set to acting
- * primary osd id, or -1 if acting set is empty or on error.
+ * Acting set is used for data mapping purposes, while up set can be
+ * recorded for detecting interval changes and deciding whether to
+ * resend a request.
*/
-int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
- int *osds, int *primary)
+void ceph_pg_to_up_acting_osds(struct ceph_osdmap *osdmap,
+ const struct ceph_pg *raw_pgid,
+ struct ceph_osds *up,
+ struct ceph_osds *acting)
{
- struct ceph_pg_pool_info *pool;
+ struct ceph_pg_pool_info *pi;
u32 pps;
- int len;
- pool = __lookup_pg_pool(&osdmap->pg_pools, pgid.pool);
- if (!pool) {
- *primary = -1;
- return -ENOENT;
+ pi = ceph_pg_pool_by_id(osdmap, raw_pgid->pool);
+ if (!pi) {
+ ceph_osds_init(up);
+ ceph_osds_init(acting);
+ goto out;
}
- if (pool->flags & CEPH_POOL_FLAG_HASHPSPOOL) {
- /* hash pool id and seed so that pool PGs do not overlap */
- pps = crush_hash32_2(CRUSH_HASH_RJENKINS1,
- ceph_stable_mod(pgid.seed, pool->pgp_num,
- pool->pgp_num_mask),
- pgid.pool);
- } else {
- /*
- * legacy behavior: add ps and pool together. this is
- * not a great approach because the PGs from each pool
- * will overlap on top of each other: 0.5 == 1.4 ==
- * 2.3 == ...
- */
- pps = ceph_stable_mod(pgid.seed, pool->pgp_num,
- pool->pgp_num_mask) +
- (unsigned)pgid.pool;
- }
-
- len = pg_to_raw_osds(osdmap, pool, pgid, pps, osds);
- if (len < 0) {
- *primary = -1;
- return len;
+ pg_to_raw_osds(osdmap, pi, raw_pgid, up, &pps);
+ raw_to_up_osds(osdmap, pi, up);
+ apply_primary_affinity(osdmap, pi, pps, up);
+ get_temp_osds(osdmap, pi, raw_pgid, acting);
+ if (!acting->size) {
+ memcpy(acting->osds, up->osds, up->size * sizeof(up->osds[0]));
+ acting->size = up->size;
+ if (acting->primary == -1)
+ acting->primary = up->primary;
}
-
- len = raw_to_up_osds(osdmap, pool, osds, len, primary);
-
- apply_primary_affinity(osdmap, pps, pool, osds, len, primary);
-
- len = apply_temps(osdmap, pool, pgid, osds, len, primary);
-
- return len;
+out:
+ WARN_ON(!osds_valid(up) || !osds_valid(acting));
}
/*
- * Return primary osd for given pgid, or -1 if none.
+ * Return acting primary for given PG, or -1 if none.
*/
-int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid)
+int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap,
+ const struct ceph_pg *raw_pgid)
{
- int osds[CEPH_PG_MAX_SIZE];
- int primary;
-
- ceph_calc_pg_acting(osdmap, pgid, osds, &primary);
+ struct ceph_osds up, acting;
- return primary;
+ ceph_pg_to_up_acting_osds(osdmap, raw_pgid, &up, &acting);
+ return acting.primary;
}
-EXPORT_SYMBOL(ceph_calc_pg_primary);
+EXPORT_SYMBOL(ceph_pg_to_acting_primary);
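Pulling the new mapping API together (a sketch only, not taken from the patch): object to raw PG, raw PG to up/acting sets, then the acting primary. The pr_info() and the assumption that the caller already holds whatever locking the osdmap requires are placeholders.

    static int my_lookup_primary(struct ceph_osdmap *osdmap,
                                 struct ceph_object_id *oid,
                                 struct ceph_object_locator *oloc)
    {
            struct ceph_pg raw_pgid;
            struct ceph_osds up, acting;
            int ret;

            ret = ceph_object_locator_to_pg(osdmap, oid, oloc, &raw_pgid);
            if (ret)
                    return ret;     /* -ENOENT: pool no longer exists */

            ceph_pg_to_up_acting_osds(osdmap, &raw_pgid, &up, &acting);
            pr_info("pg %llu.%x up.size=%d acting primary=%d\n",
                    raw_pgid.pool, raw_pgid.seed, up.size, acting.primary);

            return ceph_pg_to_acting_primary(osdmap, &raw_pgid);
    }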
diff --git a/net/compat.c b/net/compat.c
index 5cfd26a0006f..1cd2ec046164 100644
--- a/net/compat.c
+++ b/net/compat.c
@@ -309,8 +309,8 @@ void scm_detach_fds_compat(struct msghdr *kmsg, struct scm_cookie *scm)
__scm_destroy(scm);
}
-static int do_set_attach_filter(struct socket *sock, int level, int optname,
- char __user *optval, unsigned int optlen)
+/* allocate a 64-bit sock_fprog on the user stack for the duration of the syscall. */
+struct sock_fprog __user *get_compat_bpf_fprog(char __user *optval)
{
struct compat_sock_fprog __user *fprog32 = (struct compat_sock_fprog __user *)optval;
struct sock_fprog __user *kfprog = compat_alloc_user_space(sizeof(struct sock_fprog));
@@ -323,6 +323,19 @@ static int do_set_attach_filter(struct socket *sock, int level, int optname,
__get_user(ptr, &fprog32->filter) ||
__put_user(len, &kfprog->len) ||
__put_user(compat_ptr(ptr), &kfprog->filter))
+ return NULL;
+
+ return kfprog;
+}
+EXPORT_SYMBOL_GPL(get_compat_bpf_fprog);
+
+static int do_set_attach_filter(struct socket *sock, int level, int optname,
+ char __user *optval, unsigned int optlen)
+{
+ struct sock_fprog __user *kfprog;
+
+ kfprog = get_compat_bpf_fprog(optval);
+ if (!kfprog)
return -EFAULT;
return sock_setsockopt(sock, level, optname, (char __user *)kfprog,
@@ -354,7 +367,8 @@ static int do_set_sock_timeout(struct socket *sock, int level,
static int compat_sock_setsockopt(struct socket *sock, int level, int optname,
char __user *optval, unsigned int optlen)
{
- if (optname == SO_ATTACH_FILTER)
+ if (optname == SO_ATTACH_FILTER ||
+ optname == SO_ATTACH_REUSEPORT_CBPF)
return do_set_attach_filter(sock, level, optname,
optval, optlen);
if (optname == SO_RCVTIMEO || optname == SO_SNDTIMEO)
diff --git a/net/core/filter.c b/net/core/filter.c
index 68adb5f52110..c4b330c85c02 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2085,7 +2085,8 @@ static bool __is_valid_access(int off, int size, enum bpf_access_type type)
}
static bool sk_filter_is_valid_access(int off, int size,
- enum bpf_access_type type)
+ enum bpf_access_type type,
+ enum bpf_reg_type *reg_type)
{
switch (off) {
case offsetof(struct __sk_buff, tc_classid):
@@ -2108,7 +2109,8 @@ static bool sk_filter_is_valid_access(int off, int size,
}
static bool tc_cls_act_is_valid_access(int off, int size,
- enum bpf_access_type type)
+ enum bpf_access_type type,
+ enum bpf_reg_type *reg_type)
{
if (type == BPF_WRITE) {
switch (off) {
@@ -2123,6 +2125,16 @@ static bool tc_cls_act_is_valid_access(int off, int size,
return false;
}
}
+
+ switch (off) {
+ case offsetof(struct __sk_buff, data):
+ *reg_type = PTR_TO_PACKET;
+ break;
+ case offsetof(struct __sk_buff, data_end):
+ *reg_type = PTR_TO_PACKET_END;
+ break;
+ }
+
return __is_valid_access(off, size, type);
}
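For context (a sketch in the usual restricted-C style for tc BPF programs, not part of this patch): marking data/data_end as PTR_TO_PACKET/PTR_TO_PACKET_END is what lets the verifier accept direct packet access that is guarded by an explicit bounds check, as in:

    #include <linux/bpf.h>
    #include <linux/if_ether.h>
    #include <linux/pkt_cls.h>

    __attribute__((section("classifier"), used))
    int cls_example(struct __sk_buff *skb)
    {
            void *data = (void *)(long)skb->data;
            void *data_end = (void *)(long)skb->data_end;
            struct ethhdr *eth = data;

            /* without this check the verifier rejects the load below */
            if (data + sizeof(*eth) > data_end)
                    return TC_ACT_OK;

            /* direct packet access, now provably in bounds */
            return eth->h_proto ? TC_ACT_OK : TC_ACT_SHOT;
    }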
diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c
index f96ee8b9478d..be873e4e3125 100644
--- a/net/core/gen_stats.c
+++ b/net/core/gen_stats.c
@@ -47,6 +47,7 @@ nla_put_failure:
* @xstats_type: TLV type for backward compatibility xstats TLV
* @lock: statistics lock
* @d: dumping handle
+ * @padattr: padding attribute
*
* Initializes the dumping handle, grabs the statistics lock and appends
* an empty TLV header to the socket buffer for use as a container for all
@@ -87,6 +88,7 @@ EXPORT_SYMBOL(gnet_stats_start_copy_compat);
* @type: TLV type for top level statistic TLV
* @lock: statistics lock
* @d: dumping handle
+ * @padattr: padding attribute
*
* Initializes the dumping handle, grabs the statistics lock and appends
* an empty TLV header to the socket buffer for use as a container for all
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 29dd8cc22bbf..510cd62fcb99 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -2469,13 +2469,17 @@ int neigh_xmit(int index, struct net_device *dev,
tbl = neigh_tables[index];
if (!tbl)
goto out;
+ rcu_read_lock_bh();
neigh = __neigh_lookup_noref(tbl, addr, dev);
if (!neigh)
neigh = __neigh_create(tbl, addr, dev, false);
err = PTR_ERR(neigh);
- if (IS_ERR(neigh))
+ if (IS_ERR(neigh)) {
+ rcu_read_unlock_bh();
goto out_kfree_skb;
+ }
err = neigh->output(neigh, skb);
+ rcu_read_unlock_bh();
}
else if (index == NEIGH_LINK_TABLE) {
err = dev_hard_header(skb, dev, ntohs(skb->protocol),
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 2b3f76fe65f4..7a0b616557ab 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -24,6 +24,7 @@
#include <linux/jiffies.h>
#include <linux/pm_runtime.h>
#include <linux/of.h>
+#include <linux/of_net.h>
#include "net-sysfs.h"
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 8604ae245960..8b02df0d354d 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -2245,10 +2245,8 @@ static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until)
hrtimer_set_expires(&t.timer, spin_until);
remaining = ktime_to_ns(hrtimer_expires_remaining(&t.timer));
- if (remaining <= 0) {
- pkt_dev->next_tx = ktime_add_ns(spin_until, pkt_dev->delay);
- return;
- }
+ if (remaining <= 0)
+ goto out;
start_time = ktime_get();
if (remaining < 100000) {
@@ -2273,7 +2271,9 @@ static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until)
}
pkt_dev->idle_acc += ktime_to_ns(ktime_sub(end_time, start_time));
+out:
pkt_dev->next_tx = ktime_add_ns(spin_until, pkt_dev->delay);
+ destroy_hrtimer_on_stack(&t.timer);
}
static inline void set_pkt_overhead(struct pktgen_dev *pkt_dev)
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 477937465a20..d95631d09248 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -23,6 +23,11 @@ struct esp_skb_cb {
void *tmp;
};
+struct esp_output_extra {
+ __be32 seqhi;
+ u32 esphoff;
+};
+
#define ESP_SKB_CB(__skb) ((struct esp_skb_cb *)&((__skb)->cb[0]))
static u32 esp4_get_mtu(struct xfrm_state *x, int mtu);
@@ -35,11 +40,11 @@ static u32 esp4_get_mtu(struct xfrm_state *x, int mtu);
*
* TODO: Use spare space in skb for this where possible.
*/
-static void *esp_alloc_tmp(struct crypto_aead *aead, int nfrags, int seqhilen)
+static void *esp_alloc_tmp(struct crypto_aead *aead, int nfrags, int extralen)
{
unsigned int len;
- len = seqhilen;
+ len = extralen;
len += crypto_aead_ivsize(aead);
@@ -57,15 +62,16 @@ static void *esp_alloc_tmp(struct crypto_aead *aead, int nfrags, int seqhilen)
return kmalloc(len, GFP_ATOMIC);
}
-static inline __be32 *esp_tmp_seqhi(void *tmp)
+static inline void *esp_tmp_extra(void *tmp)
{
- return PTR_ALIGN((__be32 *)tmp, __alignof__(__be32));
+ return PTR_ALIGN(tmp, __alignof__(struct esp_output_extra));
}
-static inline u8 *esp_tmp_iv(struct crypto_aead *aead, void *tmp, int seqhilen)
+
+static inline u8 *esp_tmp_iv(struct crypto_aead *aead, void *tmp, int extralen)
{
return crypto_aead_ivsize(aead) ?
- PTR_ALIGN((u8 *)tmp + seqhilen,
- crypto_aead_alignmask(aead) + 1) : tmp + seqhilen;
+ PTR_ALIGN((u8 *)tmp + extralen,
+ crypto_aead_alignmask(aead) + 1) : tmp + extralen;
}
static inline struct aead_request *esp_tmp_req(struct crypto_aead *aead, u8 *iv)
@@ -99,7 +105,7 @@ static void esp_restore_header(struct sk_buff *skb, unsigned int offset)
{
struct ip_esp_hdr *esph = (void *)(skb->data + offset);
void *tmp = ESP_SKB_CB(skb)->tmp;
- __be32 *seqhi = esp_tmp_seqhi(tmp);
+ __be32 *seqhi = esp_tmp_extra(tmp);
esph->seq_no = esph->spi;
esph->spi = *seqhi;
@@ -107,7 +113,11 @@ static void esp_restore_header(struct sk_buff *skb, unsigned int offset)
static void esp_output_restore_header(struct sk_buff *skb)
{
- esp_restore_header(skb, skb_transport_offset(skb) - sizeof(__be32));
+ void *tmp = ESP_SKB_CB(skb)->tmp;
+ struct esp_output_extra *extra = esp_tmp_extra(tmp);
+
+ esp_restore_header(skb, skb_transport_offset(skb) + extra->esphoff -
+ sizeof(__be32));
}
static void esp_output_done_esn(struct crypto_async_request *base, int err)
@@ -121,6 +131,7 @@ static void esp_output_done_esn(struct crypto_async_request *base, int err)
static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
{
int err;
+ struct esp_output_extra *extra;
struct ip_esp_hdr *esph;
struct crypto_aead *aead;
struct aead_request *req;
@@ -137,8 +148,7 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
int tfclen;
int nfrags;
int assoclen;
- int seqhilen;
- __be32 *seqhi;
+ int extralen;
__be64 seqno;
/* skb is pure payload to encrypt */
@@ -166,21 +176,21 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
nfrags = err;
assoclen = sizeof(*esph);
- seqhilen = 0;
+ extralen = 0;
if (x->props.flags & XFRM_STATE_ESN) {
- seqhilen += sizeof(__be32);
- assoclen += seqhilen;
+ extralen += sizeof(*extra);
+ assoclen += sizeof(__be32);
}
- tmp = esp_alloc_tmp(aead, nfrags, seqhilen);
+ tmp = esp_alloc_tmp(aead, nfrags, extralen);
if (!tmp) {
err = -ENOMEM;
goto error;
}
- seqhi = esp_tmp_seqhi(tmp);
- iv = esp_tmp_iv(aead, tmp, seqhilen);
+ extra = esp_tmp_extra(tmp);
+ iv = esp_tmp_iv(aead, tmp, extralen);
req = esp_tmp_req(aead, iv);
sg = esp_req_sg(aead, req);
@@ -247,8 +257,10 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
* encryption.
*/
if ((x->props.flags & XFRM_STATE_ESN)) {
- esph = (void *)(skb_transport_header(skb) - sizeof(__be32));
- *seqhi = esph->spi;
+ extra->esphoff = (unsigned char *)esph -
+ skb_transport_header(skb);
+ esph = (struct ip_esp_hdr *)((unsigned char *)esph - 4);
+ extra->seqhi = esph->spi;
esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.hi);
aead_request_set_callback(req, 0, esp_output_done_esn, skb);
}
@@ -445,7 +457,7 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
goto out;
ESP_SKB_CB(skb)->tmp = tmp;
- seqhi = esp_tmp_seqhi(tmp);
+ seqhi = esp_tmp_extra(tmp);
iv = esp_tmp_iv(aead, tmp, seqhilen);
req = esp_tmp_req(aead, iv);
sg = esp_req_sg(aead, req);
diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c
index 4c39f4fd332a..de1d119a4497 100644
--- a/net/ipv4/gre_demux.c
+++ b/net/ipv4/gre_demux.c
@@ -62,26 +62,26 @@ EXPORT_SYMBOL_GPL(gre_del_protocol);
/* Fills in tpi and returns header length to be pulled. */
int gre_parse_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
- bool *csum_err, __be16 proto)
+ bool *csum_err, __be16 proto, int nhs)
{
const struct gre_base_hdr *greh;
__be32 *options;
int hdr_len;
- if (unlikely(!pskb_may_pull(skb, sizeof(struct gre_base_hdr))))
+ if (unlikely(!pskb_may_pull(skb, nhs + sizeof(struct gre_base_hdr))))
return -EINVAL;
- greh = (struct gre_base_hdr *)skb_transport_header(skb);
+ greh = (struct gre_base_hdr *)(skb->data + nhs);
if (unlikely(greh->flags & (GRE_VERSION | GRE_ROUTING)))
return -EINVAL;
tpi->flags = gre_flags_to_tnl_flags(greh->flags);
hdr_len = gre_calc_hlen(tpi->flags);
- if (!pskb_may_pull(skb, hdr_len))
+ if (!pskb_may_pull(skb, nhs + hdr_len))
return -EINVAL;
- greh = (struct gre_base_hdr *)skb_transport_header(skb);
+ greh = (struct gre_base_hdr *)(skb->data + nhs);
tpi->proto = greh->protocol;
options = (__be32 *)(greh + 1);
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 4d2025f7ec57..1d000af7f561 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -49,12 +49,6 @@
#include <net/gre.h>
#include <net/dst_metadata.h>
-#if IS_ENABLED(CONFIG_IPV6)
-#include <net/ipv6.h>
-#include <net/ip6_fib.h>
-#include <net/ip6_route.h>
-#endif
-
/*
Problems & solutions
--------------------
@@ -217,12 +211,14 @@ static void gre_err(struct sk_buff *skb, u32 info)
* by themselves???
*/
+ const struct iphdr *iph = (struct iphdr *)skb->data;
const int type = icmp_hdr(skb)->type;
const int code = icmp_hdr(skb)->code;
struct tnl_ptk_info tpi;
bool csum_err = false;
- if (gre_parse_header(skb, &tpi, &csum_err, htons(ETH_P_IP)) < 0) {
+ if (gre_parse_header(skb, &tpi, &csum_err, htons(ETH_P_IP),
+ iph->ihl * 4) < 0) {
if (!csum_err) /* ignore csum errors. */
return;
}
@@ -338,7 +334,7 @@ static int gre_rcv(struct sk_buff *skb)
}
#endif
- hdr_len = gre_parse_header(skb, &tpi, &csum_err, htons(ETH_P_IP));
+ hdr_len = gre_parse_header(skb, &tpi, &csum_err, htons(ETH_P_IP), 0);
if (hdr_len < 0)
goto drop;
@@ -1121,6 +1117,7 @@ struct net_device *gretap_fb_dev_create(struct net *net, const char *name,
{
struct nlattr *tb[IFLA_MAX + 1];
struct net_device *dev;
+ LIST_HEAD(list_kill);
struct ip_tunnel *t;
int err;
@@ -1136,8 +1133,10 @@ struct net_device *gretap_fb_dev_create(struct net *net, const char *name,
t->collect_md = true;
err = ipgre_newlink(net, dev, tb, NULL);
- if (err < 0)
- goto out;
+ if (err < 0) {
+ free_netdev(dev);
+ return ERR_PTR(err);
+ }
/* openvswitch users expect packet sizes to be unrestricted,
* so set the largest MTU we can.
@@ -1146,9 +1145,14 @@ struct net_device *gretap_fb_dev_create(struct net *net, const char *name,
if (err)
goto out;
+ err = rtnl_configure_link(dev, NULL);
+ if (err < 0)
+ goto out;
+
return dev;
out:
- free_netdev(dev);
+ ip_tunnel_dellink(dev, &list_kill);
+ unregister_netdevice_many(&list_kill);
return ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(gretap_fb_dev_create);
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index 2ed9dd2b5f2f..1d71c40eaaf3 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -127,7 +127,9 @@ __be32 ic_myaddr = NONE; /* My IP address */
static __be32 ic_netmask = NONE; /* Netmask for local subnet */
__be32 ic_gateway = NONE; /* Gateway IP address */
-__be32 ic_addrservaddr = NONE; /* IP Address of the IP addresses' server */
+#ifdef IPCONFIG_DYNAMIC
+static __be32 ic_addrservaddr = NONE; /* IP Address of the IP addresses' server */
+#endif
__be32 ic_servaddr = NONE; /* Boot server IP address */
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 21a38e296fe2..5ad48ec77710 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -891,8 +891,10 @@ static struct mfc_cache *ipmr_cache_alloc(void)
{
struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL);
- if (c)
+ if (c) {
+ c->mfc_un.res.last_assert = jiffies - MFC_ASSERT_THRESH - 1;
c->mfc_un.res.minvif = MAXVIFS;
+ }
return c;
}
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 8bd9911fdd16..e00e972c4e6a 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2751,7 +2751,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
struct sk_buff *hole = NULL;
- u32 last_lost;
+ u32 max_segs, last_lost;
int mib_idx;
int fwd_rexmitting = 0;
@@ -2771,6 +2771,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
last_lost = tp->snd_una;
}
+ max_segs = tcp_tso_autosize(sk, tcp_current_mss(sk));
tcp_for_write_queue_from(skb, sk) {
__u8 sacked = TCP_SKB_CB(skb)->sacked;
int segs;
@@ -2784,6 +2785,10 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
segs = tp->snd_cwnd - tcp_packets_in_flight(tp);
if (segs <= 0)
return;
+ /* In case tcp_shift_skb_data() has aggregated large skbs,
+ * we need to make sure we do not send too big TSO packets
+ */
+ segs = min_t(int, segs, max_segs);
if (fwd_rexmitting) {
begin_fwd:
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index d56c0559b477..ca5e8ea29538 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -391,9 +391,9 @@ int udp_v4_get_port(struct sock *sk, unsigned short snum)
return udp_lib_get_port(sk, snum, ipv4_rcv_saddr_equal, hash2_nulladdr);
}
-static inline int compute_score(struct sock *sk, struct net *net,
- __be32 saddr, unsigned short hnum, __be16 sport,
- __be32 daddr, __be16 dport, int dif)
+static int compute_score(struct sock *sk, struct net *net,
+ __be32 saddr, __be16 sport,
+ __be32 daddr, unsigned short hnum, int dif)
{
int score;
struct inet_sock *inet;
@@ -434,52 +434,6 @@ static inline int compute_score(struct sock *sk, struct net *net,
return score;
}
-/*
- * In this second variant, we check (daddr, dport) matches (inet_rcv_sadd, inet_num)
- */
-static inline int compute_score2(struct sock *sk, struct net *net,
- __be32 saddr, __be16 sport,
- __be32 daddr, unsigned int hnum, int dif)
-{
- int score;
- struct inet_sock *inet;
-
- if (!net_eq(sock_net(sk), net) ||
- ipv6_only_sock(sk))
- return -1;
-
- inet = inet_sk(sk);
-
- if (inet->inet_rcv_saddr != daddr ||
- inet->inet_num != hnum)
- return -1;
-
- score = (sk->sk_family == PF_INET) ? 2 : 1;
-
- if (inet->inet_daddr) {
- if (inet->inet_daddr != saddr)
- return -1;
- score += 4;
- }
-
- if (inet->inet_dport) {
- if (inet->inet_dport != sport)
- return -1;
- score += 4;
- }
-
- if (sk->sk_bound_dev_if) {
- if (sk->sk_bound_dev_if != dif)
- return -1;
- score += 4;
- }
-
- if (sk->sk_incoming_cpu == raw_smp_processor_id())
- score++;
-
- return score;
-}
-
static u32 udp_ehashfn(const struct net *net, const __be32 laddr,
const __u16 lport, const __be32 faddr,
const __be16 fport)
@@ -492,11 +446,11 @@ static u32 udp_ehashfn(const struct net *net, const __be32 laddr,
udp_ehash_secret + net_hash_mix(net));
}
-/* called with read_rcu_lock() */
+/* called with rcu_read_lock() */
static struct sock *udp4_lib_lookup2(struct net *net,
__be32 saddr, __be16 sport,
__be32 daddr, unsigned int hnum, int dif,
- struct udp_hslot *hslot2, unsigned int slot2,
+ struct udp_hslot *hslot2,
struct sk_buff *skb)
{
struct sock *sk, *result;
@@ -506,7 +460,7 @@ static struct sock *udp4_lib_lookup2(struct net *net,
result = NULL;
badness = 0;
udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
- score = compute_score2(sk, net, saddr, sport,
+ score = compute_score(sk, net, saddr, sport,
daddr, hnum, dif);
if (score > badness) {
reuseport = sk->sk_reuseport;
@@ -554,17 +508,22 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
result = udp4_lib_lookup2(net, saddr, sport,
daddr, hnum, dif,
- hslot2, slot2, skb);
+ hslot2, skb);
if (!result) {
+ unsigned int old_slot2 = slot2;
hash2 = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
slot2 = hash2 & udptable->mask;
+ /* avoid searching the same slot again. */
+ if (unlikely(slot2 == old_slot2))
+ return result;
+
hslot2 = &udptable->hash2[slot2];
if (hslot->count < hslot2->count)
goto begin;
result = udp4_lib_lookup2(net, saddr, sport,
- htonl(INADDR_ANY), hnum, dif,
- hslot2, slot2, skb);
+ daddr, hnum, dif,
+ hslot2, skb);
}
return result;
}
@@ -572,8 +531,8 @@ begin:
result = NULL;
badness = 0;
sk_for_each_rcu(sk, &hslot->head) {
- score = compute_score(sk, net, saddr, hnum, sport,
- daddr, dport, dif);
+ score = compute_score(sk, net, saddr, sport,
+ daddr, hnum, dif);
if (score > badness) {
reuseport = sk->sk_reuseport;
if (reuseport) {
@@ -1618,12 +1577,12 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
}
}
- if (rcu_access_pointer(sk->sk_filter)) {
- if (udp_lib_checksum_complete(skb))
+ if (rcu_access_pointer(sk->sk_filter) &&
+ udp_lib_checksum_complete(skb))
goto csum_error;
- if (sk_filter(sk, skb))
- goto drop;
- }
+
+ if (sk_filter(sk, skb))
+ goto drop;
udp_csum_pull_header(skb);
if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
@@ -1755,8 +1714,11 @@ static inline int udp4_csum_init(struct sk_buff *skb, struct udphdr *uh,
return err;
}
- return skb_checksum_init_zero_check(skb, proto, uh->check,
- inet_compute_pseudo);
+ /* Note, we are only interested in != 0 or == 0, thus the
+ * force to int.
+ */
+ return (__force int)skb_checksum_init_zero_check(skb, proto, uh->check,
+ inet_compute_pseudo);
}
/*
diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig
index 994608263260..2343e4f2e0bf 100644
--- a/net/ipv6/Kconfig
+++ b/net/ipv6/Kconfig
@@ -239,7 +239,7 @@ config IPV6_FOU
config IPV6_FOU_TUNNEL
tristate
default NET_FOU_IP_TUNNELS && IPV6_FOU
- select INET6_TUNNEL
+ select IPV6_TUNNEL
config IPV6_MULTIPLE_TABLES
bool "IPv6: Multiple Routing Tables"
diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile
index d42ca3d1197f..6d8ea099213e 100644
--- a/net/ipv6/Makefile
+++ b/net/ipv6/Makefile
@@ -42,7 +42,7 @@ obj-$(CONFIG_IPV6_VTI) += ip6_vti.o
obj-$(CONFIG_IPV6_SIT) += sit.o
obj-$(CONFIG_IPV6_TUNNEL) += ip6_tunnel.o
obj-$(CONFIG_IPV6_GRE) += ip6_gre.o
-obj-$(CONFIG_NET_FOU_IPV6_TUNNELS) += fou6.o
+obj-$(CONFIG_IPV6_FOU) += fou6.o
obj-y += addrconf_core.o exthdrs_core.o ip6_checksum.o ip6_icmp.o
obj-$(CONFIG_INET) += output_core.o protocol.o $(ipv6-offload)
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 4527285fcaa2..a4fa84076969 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -98,7 +98,7 @@ static void icmpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
if (!(type & ICMPV6_INFOMSG_MASK))
if (icmp6->icmp6_type == ICMPV6_ECHO_REQUEST)
- ping_err(skb, offset, info);
+ ping_err(skb, offset, ntohl(info));
}
static int icmpv6_rcv(struct sk_buff *skb);
diff --git a/net/ipv6/ip6_checksum.c b/net/ipv6/ip6_checksum.c
index b2025bf3da4a..c0cbcb259f5a 100644
--- a/net/ipv6/ip6_checksum.c
+++ b/net/ipv6/ip6_checksum.c
@@ -78,9 +78,12 @@ int udp6_csum_init(struct sk_buff *skb, struct udphdr *uh, int proto)
* we accept a checksum of zero here. When we find the socket
* for the UDP packet we'll check if that socket allows zero checksum
* for IPv6 (set by socket option).
+ *
+ * Note, we are only interested in != 0 or == 0, thus the
+ * force to int.
*/
- return skb_checksum_init_zero_check(skb, proto, uh->check,
- ip6_compute_pseudo);
+ return (__force int)skb_checksum_init_zero_check(skb, proto, uh->check,
+ ip6_compute_pseudo);
}
EXPORT_SYMBOL(udp6_csum_init);
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index f4ac2842d4d9..776d145113e1 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -468,7 +468,7 @@ static int gre_rcv(struct sk_buff *skb)
bool csum_err = false;
int hdr_len;
- hdr_len = gre_parse_header(skb, &tpi, &csum_err, htons(ETH_P_IPV6));
+ hdr_len = gre_parse_header(skb, &tpi, &csum_err, htons(ETH_P_IPV6), 0);
if (hdr_len < 0)
goto drop;
@@ -1256,6 +1256,8 @@ static int ip6gre_tap_init(struct net_device *dev)
if (ret)
return ret;
+ dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
+
tunnel = netdev_priv(dev);
ip6gre_tnl_link_config(tunnel, 1);
@@ -1289,6 +1291,7 @@ static void ip6gre_tap_setup(struct net_device *dev)
dev->features |= NETIF_F_NETNS_LOCAL;
dev->priv_flags &= ~IFF_TX_SKB_SHARING;
+ dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
}
static bool ip6gre_netlink_encap_parms(struct nlattr *data[],
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index cbf127ae7c67..635b8d340cdb 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1071,17 +1071,12 @@ struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
const struct in6_addr *final_dst)
{
struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
- int err;
dst = ip6_sk_dst_check(sk, dst, fl6);
+ if (!dst)
+ dst = ip6_dst_lookup_flow(sk, fl6, final_dst);
- err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
- if (err)
- return ERR_PTR(err);
- if (final_dst)
- fl6->daddr = *final_dst;
-
- return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
+ return dst;
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index f2e2013f8346..487ef3bc7bbc 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -1074,6 +1074,7 @@ static struct mfc6_cache *ip6mr_cache_alloc(void)
struct mfc6_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL);
if (!c)
return NULL;
+ c->mfc_un.res.last_assert = jiffies - MFC_ASSERT_THRESH - 1;
c->mfc_un.res.minvif = MAXMIFS;
return c;
}
diff --git a/net/ipv6/netfilter/nf_dup_ipv6.c b/net/ipv6/netfilter/nf_dup_ipv6.c
index 6989c70ae29f..4a84b5ad9ecb 100644
--- a/net/ipv6/netfilter/nf_dup_ipv6.c
+++ b/net/ipv6/netfilter/nf_dup_ipv6.c
@@ -33,6 +33,7 @@ static bool nf_dup_ipv6_route(struct net *net, struct sk_buff *skb,
fl6.daddr = *gw;
fl6.flowlabel = (__force __be32)(((iph->flow_lbl[0] & 0xF) << 16) |
(iph->flow_lbl[1] << 8) | iph->flow_lbl[2]);
+ fl6.flowi6_flags = FLOWI_FLAG_KNOWN_NH;
dst = ip6_route_output(net, NULL, &fl6);
if (dst->error) {
dst_release(dst);
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 969913da494f..520b7884d0c2 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1782,7 +1782,7 @@ static struct rt6_info *ip6_nh_lookup_table(struct net *net,
};
struct fib6_table *table;
struct rt6_info *rt;
- int flags = 0;
+ int flags = RT6_LOOKUP_F_IFACE;
table = fib6_get_table(net, cfg->fc_table);
if (!table)
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index 0a5a255277e5..0619ac70836d 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -560,13 +560,13 @@ static int ipip6_err(struct sk_buff *skb, u32 info)
if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
ipv4_update_pmtu(skb, dev_net(skb->dev), info,
- t->parms.link, 0, IPPROTO_IPV6, 0);
+ t->parms.link, 0, iph->protocol, 0);
err = 0;
goto out;
}
if (type == ICMP_REDIRECT) {
ipv4_redirect(skb, dev_net(skb->dev), t->parms.link, 0,
- IPPROTO_IPV6, 0);
+ iph->protocol, 0);
err = 0;
goto out;
}
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 79e33e02f11a..2255d2bf5f6b 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -738,7 +738,7 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = {
static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 seq,
u32 ack, u32 win, u32 tsval, u32 tsecr,
int oif, struct tcp_md5sig_key *key, int rst,
- u8 tclass, u32 label)
+ u8 tclass, __be32 label)
{
const struct tcphdr *th = tcp_hdr(skb);
struct tcphdr *t1;
@@ -911,7 +911,7 @@ out:
static void tcp_v6_send_ack(const struct sock *sk, struct sk_buff *skb, u32 seq,
u32 ack, u32 win, u32 tsval, u32 tsecr, int oif,
struct tcp_md5sig_key *key, u8 tclass,
- u32 label)
+ __be32 label)
{
tcp_v6_send_response(sk, skb, seq, ack, win, tsval, tsecr, oif, key, 0,
tclass, label);
@@ -1721,7 +1721,9 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i)
destp = ntohs(inet->inet_dport);
srcp = ntohs(inet->inet_sport);
- if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
+ if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
+ icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
+ icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
timer_active = 1;
timer_expires = icsk->icsk_timeout;
} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 2da1896af934..005dc82c2138 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -115,11 +115,10 @@ static void udp_v6_rehash(struct sock *sk)
udp_lib_rehash(sk, new_hash);
}
-static inline int compute_score(struct sock *sk, struct net *net,
- unsigned short hnum,
- const struct in6_addr *saddr, __be16 sport,
- const struct in6_addr *daddr, __be16 dport,
- int dif)
+static int compute_score(struct sock *sk, struct net *net,
+ const struct in6_addr *saddr, __be16 sport,
+ const struct in6_addr *daddr, unsigned short hnum,
+ int dif)
{
int score;
struct inet_sock *inet;
@@ -162,54 +161,11 @@ static inline int compute_score(struct sock *sk, struct net *net,
return score;
}
-static inline int compute_score2(struct sock *sk, struct net *net,
- const struct in6_addr *saddr, __be16 sport,
- const struct in6_addr *daddr,
- unsigned short hnum, int dif)
-{
- int score;
- struct inet_sock *inet;
-
- if (!net_eq(sock_net(sk), net) ||
- udp_sk(sk)->udp_port_hash != hnum ||
- sk->sk_family != PF_INET6)
- return -1;
-
- if (!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, daddr))
- return -1;
-
- score = 0;
- inet = inet_sk(sk);
-
- if (inet->inet_dport) {
- if (inet->inet_dport != sport)
- return -1;
- score++;
- }
-
- if (!ipv6_addr_any(&sk->sk_v6_daddr)) {
- if (!ipv6_addr_equal(&sk->sk_v6_daddr, saddr))
- return -1;
- score++;
- }
-
- if (sk->sk_bound_dev_if) {
- if (sk->sk_bound_dev_if != dif)
- return -1;
- score++;
- }
-
- if (sk->sk_incoming_cpu == raw_smp_processor_id())
- score++;
-
- return score;
-}
-
-/* called with read_rcu_lock() */
+/* called with rcu_read_lock() */
static struct sock *udp6_lib_lookup2(struct net *net,
const struct in6_addr *saddr, __be16 sport,
const struct in6_addr *daddr, unsigned int hnum, int dif,
- struct udp_hslot *hslot2, unsigned int slot2,
+ struct udp_hslot *hslot2,
struct sk_buff *skb)
{
struct sock *sk, *result;
@@ -219,7 +175,7 @@ static struct sock *udp6_lib_lookup2(struct net *net,
result = NULL;
badness = -1;
udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
- score = compute_score2(sk, net, saddr, sport,
+ score = compute_score(sk, net, saddr, sport,
daddr, hnum, dif);
if (score > badness) {
reuseport = sk->sk_reuseport;
@@ -268,17 +224,22 @@ struct sock *__udp6_lib_lookup(struct net *net,
result = udp6_lib_lookup2(net, saddr, sport,
daddr, hnum, dif,
- hslot2, slot2, skb);
+ hslot2, skb);
if (!result) {
+ unsigned int old_slot2 = slot2;
hash2 = udp6_portaddr_hash(net, &in6addr_any, hnum);
slot2 = hash2 & udptable->mask;
+ /* avoid searching the same slot again. */
+ if (unlikely(slot2 == old_slot2))
+ return result;
+
hslot2 = &udptable->hash2[slot2];
if (hslot->count < hslot2->count)
goto begin;
result = udp6_lib_lookup2(net, saddr, sport,
- &in6addr_any, hnum, dif,
- hslot2, slot2, skb);
+ daddr, hnum, dif,
+ hslot2, skb);
}
return result;
}
@@ -286,7 +247,7 @@ begin:
result = NULL;
badness = -1;
sk_for_each_rcu(sk, &hslot->head) {
- score = compute_score(sk, net, hnum, saddr, sport, daddr, dport, dif);
+ score = compute_score(sk, net, saddr, sport, daddr, hnum, dif);
if (score > badness) {
reuseport = sk->sk_reuseport;
if (reuseport) {
@@ -653,12 +614,12 @@ int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
}
}
- if (rcu_access_pointer(sk->sk_filter)) {
- if (udp_lib_checksum_complete(skb))
- goto csum_error;
- if (sk_filter(sk, skb))
- goto drop;
- }
+ if (rcu_access_pointer(sk->sk_filter) &&
+ udp_lib_checksum_complete(skb))
+ goto csum_error;
+
+ if (sk_filter(sk, skb))
+ goto drop;
udp_csum_pull_header(skb);
if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
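
The udp.c hunks above fold compute_score2() into a single compute_score() and let __udp6_lib_lookup() reuse it for both the connected and the wildcard hash slots. The underlying pattern is simple: disqualify a socket (score -1) as soon as a bound attribute mismatches, add a point for every exact match, and keep the best-scoring candidate. Below is a minimal userspace sketch of that score-and-pick pattern; the socket fields, addresses and weights are all made up for illustration and are not the kernel's.

#include <stdio.h>

/* Illustrative stand-in for a bound UDP socket; 0 means "wildcard". */
struct fake_sock {
	unsigned short port;    /* local port, must match exactly */
	unsigned int   laddr;   /* local address, 0 = any */
	unsigned int   raddr;   /* connected remote address, 0 = unconnected */
	int            ifindex; /* bound device, 0 = any */
};

static int compute_score(const struct fake_sock *sk, unsigned short hnum,
			 unsigned int daddr, unsigned int saddr, int dif)
{
	int score = 0;

	if (sk->port != hnum)
		return -1;
	if (sk->laddr) {
		if (sk->laddr != daddr)
			return -1;
		score++;
	}
	if (sk->raddr) {
		if (sk->raddr != saddr)
			return -1;
		score++;
	}
	if (sk->ifindex) {
		if (sk->ifindex != dif)
			return -1;
		score++;
	}
	return score;
}

int main(void)
{
	struct fake_sock socks[] = {
		{ 53, 0,          0, 0 },	/* wildcard listener */
		{ 53, 0x0a000001, 0, 2 },	/* bound to addr + device */
	};
	const struct fake_sock *best = NULL;
	int badness = -1;

	for (unsigned int i = 0; i < sizeof(socks) / sizeof(socks[0]); i++) {
		int score = compute_score(&socks[i], 53, 0x0a000001,
					  0xc0a80001, 2);

		if (score > badness) {
			badness = score;
			best = &socks[i];
		}
	}
	printf("best match: socket %ld (score %d)\n",
	       best ? (long)(best - socks) : -1L, badness);
	return 0;
}
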
diff --git a/net/kcm/kcmproc.c b/net/kcm/kcmproc.c
index 738008726cc6..fda7f4715c58 100644
--- a/net/kcm/kcmproc.c
+++ b/net/kcm/kcmproc.c
@@ -241,6 +241,7 @@ static const struct file_operations kcm_seq_fops = {
.open = kcm_seq_open,
.read = seq_read,
.llseek = seq_lseek,
+ .release = seq_release_net,
};
static struct kcm_seq_muxinfo kcm_seq_muxinfo = {
diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c
index 6edfa9980314..1e40dacaa137 100644
--- a/net/l2tp/l2tp_core.c
+++ b/net/l2tp/l2tp_core.c
@@ -1581,7 +1581,7 @@ int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id, u32
/* Mark socket as an encapsulation socket. See net/ipv4/udp.c */
tunnel->encap = encap;
if (encap == L2TP_ENCAPTYPE_UDP) {
- struct udp_tunnel_sock_cfg udp_cfg;
+ struct udp_tunnel_sock_cfg udp_cfg = { };
udp_cfg.sk_user_data = tunnel;
udp_cfg.encap_type = UDP_ENCAP_L2TPINUDP;
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index 2cb3c626cd43..096a45103f14 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -762,7 +762,7 @@ static int expire_quiescent_template(struct netns_ipvs *ipvs,
* If available, return 1, otherwise invalidate this connection
* template and return 0.
*/
-int ip_vs_check_template(struct ip_vs_conn *ct)
+int ip_vs_check_template(struct ip_vs_conn *ct, struct ip_vs_dest *cdest)
{
struct ip_vs_dest *dest = ct->dest;
struct netns_ipvs *ipvs = ct->ipvs;
@@ -772,7 +772,8 @@ int ip_vs_check_template(struct ip_vs_conn *ct)
*/
if ((dest == NULL) ||
!(dest->flags & IP_VS_DEST_F_AVAILABLE) ||
- expire_quiescent_template(ipvs, dest)) {
+ expire_quiescent_template(ipvs, dest) ||
+ (cdest && (dest != cdest))) {
IP_VS_DBG_BUF(9, "check_template: dest not available for "
"protocol %s s:%s:%d v:%s:%d "
"-> d:%s:%d\n",
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 1207f20d24e4..2c1b498a7a27 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -321,7 +321,7 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
/* Check if a template already exists */
ct = ip_vs_ct_in_get(&param);
- if (!ct || !ip_vs_check_template(ct)) {
+ if (!ct || !ip_vs_check_template(ct, NULL)) {
struct ip_vs_scheduler *sched;
/*
@@ -1154,7 +1154,8 @@ struct ip_vs_conn *ip_vs_new_conn_out(struct ip_vs_service *svc,
vport, &param) < 0)
return NULL;
ct = ip_vs_ct_in_get(&param);
- if (!ct) {
+ /* check if template exists and points to the same dest */
+ if (!ct || !ip_vs_check_template(ct, dest)) {
ct = ip_vs_conn_new(&param, dest->af, daddr, dport,
IP_VS_CONN_F_TEMPLATE, dest, 0);
if (!ct) {
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index db2312eeb2a4..f204274a9b6b 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -1544,6 +1544,8 @@ void nf_conntrack_cleanup_end(void)
nf_conntrack_tstamp_fini();
nf_conntrack_acct_fini();
nf_conntrack_expect_fini();
+
+ kmem_cache_destroy(nf_conntrack_cachep);
}
/*
diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c
index 883c691ec8d0..19efeba02abb 100644
--- a/net/netfilter/nf_conntrack_ftp.c
+++ b/net/netfilter/nf_conntrack_ftp.c
@@ -632,6 +632,7 @@ static int __init nf_conntrack_ftp_init(void)
if (ret) {
pr_err("failed to register helper for pf: %d port: %d\n",
ftp[i][j].tuple.src.l3num, ports[i]);
+ ports_c = i;
nf_conntrack_ftp_fini();
return ret;
}
diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c
index f703adb7e5f7..196cb39649e1 100644
--- a/net/netfilter/nf_conntrack_helper.c
+++ b/net/netfilter/nf_conntrack_helper.c
@@ -361,9 +361,10 @@ EXPORT_SYMBOL_GPL(nf_ct_helper_log);
int nf_conntrack_helper_register(struct nf_conntrack_helper *me)
{
- int ret = 0;
- struct nf_conntrack_helper *cur;
+ struct nf_conntrack_tuple_mask mask = { .src.u.all = htons(0xFFFF) };
unsigned int h = helper_hash(&me->tuple);
+ struct nf_conntrack_helper *cur;
+ int ret = 0;
BUG_ON(me->expect_policy == NULL);
BUG_ON(me->expect_class_max >= NF_CT_MAX_EXPECT_CLASSES);
@@ -371,9 +372,7 @@ int nf_conntrack_helper_register(struct nf_conntrack_helper *me)
mutex_lock(&nf_ct_helper_mutex);
hlist_for_each_entry(cur, &nf_ct_helper_hash[h], hnode) {
- if (strncmp(cur->name, me->name, NF_CT_HELPER_NAME_LEN) == 0 &&
- cur->tuple.src.l3num == me->tuple.src.l3num &&
- cur->tuple.dst.protonum == me->tuple.dst.protonum) {
+ if (nf_ct_tuple_src_mask_cmp(&cur->tuple, &me->tuple, &mask)) {
ret = -EEXIST;
goto out;
}
diff --git a/net/netfilter/nf_conntrack_irc.c b/net/netfilter/nf_conntrack_irc.c
index 8b6da2719600..f97ac61d2536 100644
--- a/net/netfilter/nf_conntrack_irc.c
+++ b/net/netfilter/nf_conntrack_irc.c
@@ -271,6 +271,7 @@ static int __init nf_conntrack_irc_init(void)
if (ret) {
pr_err("failed to register helper for pf: %u port: %u\n",
irc[i].tuple.src.l3num, ports[i]);
+ ports_c = i;
nf_conntrack_irc_fini();
return ret;
}
diff --git a/net/netfilter/nf_conntrack_sane.c b/net/netfilter/nf_conntrack_sane.c
index 7523a575f6d1..3fcbaab83b3d 100644
--- a/net/netfilter/nf_conntrack_sane.c
+++ b/net/netfilter/nf_conntrack_sane.c
@@ -223,6 +223,7 @@ static int __init nf_conntrack_sane_init(void)
if (ret) {
pr_err("failed to register helper for pf: %d port: %d\n",
sane[i][j].tuple.src.l3num, ports[i]);
+ ports_c = i;
nf_conntrack_sane_fini();
return ret;
}
diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c
index 3e06402739e0..f72ba5587588 100644
--- a/net/netfilter/nf_conntrack_sip.c
+++ b/net/netfilter/nf_conntrack_sip.c
@@ -1669,6 +1669,7 @@ static int __init nf_conntrack_sip_init(void)
if (ret) {
pr_err("failed to register helper for pf: %u port: %u\n",
sip[i][j].tuple.src.l3num, ports[i]);
+ ports_c = i;
nf_conntrack_sip_fini();
return ret;
}
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index f87e84ebcec3..c026c472ea80 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -487,8 +487,6 @@ static struct ctl_table nf_ct_sysctl_table[] = {
{ }
};
-#define NET_NF_CONNTRACK_MAX 2089
-
static struct ctl_table nf_ct_netfilter_table[] = {
{
.procname = "nf_conntrack_max",
diff --git a/net/netfilter/nf_conntrack_tftp.c b/net/netfilter/nf_conntrack_tftp.c
index 36f964066461..2e65b5430fba 100644
--- a/net/netfilter/nf_conntrack_tftp.c
+++ b/net/netfilter/nf_conntrack_tftp.c
@@ -142,6 +142,7 @@ static int __init nf_conntrack_tftp_init(void)
if (ret) {
pr_err("failed to register helper for pf: %u port: %u\n",
tftp[i][j].tuple.src.l3num, ports[i]);
+ ports_c = i;
nf_conntrack_tftp_fini();
return ret;
}
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c
index 5baa8e24e6ac..b19ad20a705c 100644
--- a/net/netfilter/nf_queue.c
+++ b/net/netfilter/nf_queue.c
@@ -26,23 +26,21 @@
* Once the queue is registered it must reinject all packets it
* receives, no matter what.
*/
-static const struct nf_queue_handler __rcu *queue_handler __read_mostly;
/* return EBUSY when somebody else is registered, return EEXIST if the
* same handler is registered, return 0 in case of success. */
-void nf_register_queue_handler(const struct nf_queue_handler *qh)
+void nf_register_queue_handler(struct net *net, const struct nf_queue_handler *qh)
{
/* should never happen, we only have one queueing backend in kernel */
- WARN_ON(rcu_access_pointer(queue_handler));
- rcu_assign_pointer(queue_handler, qh);
+ WARN_ON(rcu_access_pointer(net->nf.queue_handler));
+ rcu_assign_pointer(net->nf.queue_handler, qh);
}
EXPORT_SYMBOL(nf_register_queue_handler);
/* The caller must flush their queue before this */
-void nf_unregister_queue_handler(void)
+void nf_unregister_queue_handler(struct net *net)
{
- RCU_INIT_POINTER(queue_handler, NULL);
- synchronize_rcu();
+ RCU_INIT_POINTER(net->nf.queue_handler, NULL);
}
EXPORT_SYMBOL(nf_unregister_queue_handler);
@@ -103,7 +101,7 @@ void nf_queue_nf_hook_drop(struct net *net, struct nf_hook_ops *ops)
const struct nf_queue_handler *qh;
rcu_read_lock();
- qh = rcu_dereference(queue_handler);
+ qh = rcu_dereference(net->nf.queue_handler);
if (qh)
qh->nf_hook_drop(net, ops);
rcu_read_unlock();
@@ -122,9 +120,10 @@ int nf_queue(struct sk_buff *skb,
struct nf_queue_entry *entry = NULL;
const struct nf_afinfo *afinfo;
const struct nf_queue_handler *qh;
+ struct net *net = state->net;
/* QUEUE == DROP if no one is waiting, to be safe. */
- qh = rcu_dereference(queue_handler);
+ qh = rcu_dereference(net->nf.queue_handler);
if (!qh) {
status = -ESRCH;
goto err;
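
The nf_queue.c change above moves the single global queue_handler pointer into per-netns state, published and read with the RCU pointer primitives. A rough userspace sketch of that publish/read shape, using C11 atomics in place of rcu_assign_pointer()/rcu_dereference(); the struct names and fields are invented for illustration only.

#include <stdatomic.h>
#include <stddef.h>
#include <stdio.h>

/* Invented stand-ins for the per-netns state and the handler. */
struct queue_handler {
	const char *name;
};

struct fake_netns {
	_Atomic(struct queue_handler *) queue_handler;
};

/* Publish: plays the role of rcu_assign_pointer(net->nf.queue_handler, qh). */
static void register_handler(struct fake_netns *net, struct queue_handler *qh)
{
	atomic_store_explicit(&net->queue_handler, qh, memory_order_release);
}

/* Read: plays the role of rcu_dereference(net->nf.queue_handler). */
static struct queue_handler *get_handler(struct fake_netns *net)
{
	return atomic_load_explicit(&net->queue_handler, memory_order_acquire);
}

int main(void)
{
	struct fake_netns a = { NULL }, b = { NULL };
	struct queue_handler nfqh = { "nfnetlink_queue" };

	register_handler(&a, &nfqh);	/* registration is now per netns */

	printf("netns a: %s\n", get_handler(&a) ? get_handler(&a)->name : "none");
	printf("netns b: %s\n", get_handler(&b) ? get_handler(&b)->name : "none");
	return 0;
}
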
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 4d292b933b5c..2c881871db38 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -2647,6 +2647,8 @@ static int nf_tables_getset(struct net *net, struct sock *nlsk,
/* Only accept unspec with dump */
if (nfmsg->nfgen_family == NFPROTO_UNSPEC)
return -EAFNOSUPPORT;
+ if (!nla[NFTA_SET_TABLE])
+ return -EINVAL;
set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_NAME]);
if (IS_ERR(set))
@@ -2944,24 +2946,20 @@ int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set,
* jumps are already validated for that chain.
*/
list_for_each_entry(i, &set->bindings, list) {
- if (binding->flags & NFT_SET_MAP &&
+ if (i->flags & NFT_SET_MAP &&
i->chain == binding->chain)
goto bind;
}
+ iter.genmask = nft_genmask_next(ctx->net);
iter.skip = 0;
iter.count = 0;
iter.err = 0;
iter.fn = nf_tables_bind_check_setelem;
set->ops->walk(ctx, set, &iter);
- if (iter.err < 0) {
- /* Destroy anonymous sets if binding fails */
- if (set->flags & NFT_SET_ANONYMOUS)
- nf_tables_set_destroy(ctx, set);
-
+ if (iter.err < 0)
return iter.err;
- }
}
bind:
binding->chain = ctx->chain;
@@ -3190,12 +3188,13 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb)
if (nest == NULL)
goto nla_put_failure;
- args.cb = cb;
- args.skb = skb;
- args.iter.skip = cb->args[0];
- args.iter.count = 0;
- args.iter.err = 0;
- args.iter.fn = nf_tables_dump_setelem;
+ args.cb = cb;
+ args.skb = skb;
+ args.iter.genmask = nft_genmask_cur(ctx.net);
+ args.iter.skip = cb->args[0];
+ args.iter.count = 0;
+ args.iter.err = 0;
+ args.iter.fn = nf_tables_dump_setelem;
set->ops->walk(&ctx, set, &args.iter);
nla_nest_end(skb, nest);
@@ -4282,6 +4281,7 @@ static int nf_tables_check_loops(const struct nft_ctx *ctx,
binding->chain != chain)
continue;
+ iter.genmask = nft_genmask_next(ctx->net);
iter.skip = 0;
iter.count = 0;
iter.err = 0;
diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c
index e9f8dffcc244..fb8b5892b5ff 100644
--- a/net/netfilter/nf_tables_core.c
+++ b/net/netfilter/nf_tables_core.c
@@ -143,7 +143,7 @@ next_rule:
list_for_each_entry_continue_rcu(rule, &chain->rules, list) {
/* This rule is not active, skip. */
- if (unlikely(rule->genmask & (1 << gencursor)))
+ if (unlikely(rule->genmask & gencursor))
continue;
rulenum++;
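
The genmask changes in nf_tables_api.c and nf_tables_core.c above pass a precomputed generation bit around instead of recomputing it for every element. Roughly in the spirit of that scheme (not the exact nf_tables encoding), here is a sketch of two-bit generation masks where a set bit means "inactive in that generation" and a commit flips the cursor.

#include <stdint.h>
#include <stdio.h>

/*
 * Two-bit generation masks, loosely after the nf_tables scheme: bit i set
 * in an object's genmask means "inactive in generation i".  A commit flips
 * the cursor (and, in the real code, also clears the stale bits).
 */
static unsigned int gencursor;		/* 0 or 1 */

#define GEN_BIT_CUR()	(1U << gencursor)

struct toy_rule {
	const char *name;
	uint8_t genmask;
};

static int rule_is_active(const struct toy_rule *r)
{
	return !(r->genmask & GEN_BIT_CUR());
}

int main(void)
{
	struct toy_rule old_rule = { "old", 0 };		/* active in both gens */
	struct toy_rule new_rule = { "new", GEN_BIT_CUR() };	/* next gen only */

	printf("%s active: %d\n", old_rule.name, rule_is_active(&old_rule));
	printf("%s active: %d\n", new_rule.name, rule_is_active(&new_rule));

	gencursor ^= 1;			/* "commit": switch generations */
	printf("after commit, %s active: %d\n",
	       new_rule.name, rule_is_active(&new_rule));
	return 0;
}
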
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index aa93877ab6e2..5d36a0926b4a 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -557,7 +557,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
if (entskb->tstamp.tv64) {
struct nfqnl_msg_packet_timestamp ts;
- struct timespec64 kts = ktime_to_timespec64(skb->tstamp);
+ struct timespec64 kts = ktime_to_timespec64(entskb->tstamp);
ts.sec = cpu_to_be64(kts.tv_sec);
ts.usec = cpu_to_be64(kts.tv_nsec / NSEC_PER_USEC);
@@ -1482,21 +1482,29 @@ static int __net_init nfnl_queue_net_init(struct net *net)
net->nf.proc_netfilter, &nfqnl_file_ops))
return -ENOMEM;
#endif
+ nf_register_queue_handler(net, &nfqh);
return 0;
}
static void __net_exit nfnl_queue_net_exit(struct net *net)
{
+ nf_unregister_queue_handler(net);
#ifdef CONFIG_PROC_FS
remove_proc_entry("nfnetlink_queue", net->nf.proc_netfilter);
#endif
}
+static void nfnl_queue_net_exit_batch(struct list_head *net_exit_list)
+{
+ synchronize_rcu();
+}
+
static struct pernet_operations nfnl_queue_net_ops = {
- .init = nfnl_queue_net_init,
- .exit = nfnl_queue_net_exit,
- .id = &nfnl_queue_net_id,
- .size = sizeof(struct nfnl_queue_net),
+ .init = nfnl_queue_net_init,
+ .exit = nfnl_queue_net_exit,
+ .exit_batch = nfnl_queue_net_exit_batch,
+ .id = &nfnl_queue_net_id,
+ .size = sizeof(struct nfnl_queue_net),
};
static int __init nfnetlink_queue_init(void)
@@ -1517,7 +1525,6 @@ static int __init nfnetlink_queue_init(void)
}
register_netdevice_notifier(&nfqnl_dev_notifier);
- nf_register_queue_handler(&nfqh);
return status;
cleanup_netlink_notifier:
@@ -1529,7 +1536,6 @@ out:
static void __exit nfnetlink_queue_fini(void)
{
- nf_unregister_queue_handler();
unregister_netdevice_notifier(&nfqnl_dev_notifier);
nfnetlink_subsys_unregister(&nfqnl_subsys);
netlink_unregister_notifier(&nfqnl_rtnl_notifier);
diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c
index 6fa016564f90..f39c53a159eb 100644
--- a/net/netfilter/nft_hash.c
+++ b/net/netfilter/nft_hash.c
@@ -189,7 +189,6 @@ static void nft_hash_walk(const struct nft_ctx *ctx, const struct nft_set *set,
struct nft_hash_elem *he;
struct rhashtable_iter hti;
struct nft_set_elem elem;
- u8 genmask = nft_genmask_cur(read_pnet(&set->pnet));
int err;
err = rhashtable_walk_init(&priv->ht, &hti, GFP_KERNEL);
@@ -218,7 +217,7 @@ static void nft_hash_walk(const struct nft_ctx *ctx, const struct nft_set *set,
goto cont;
if (nft_set_elem_expired(&he->ext))
goto cont;
- if (!nft_set_elem_active(&he->ext, genmask))
+ if (!nft_set_elem_active(&he->ext, iter->genmask))
goto cont;
elem.priv = he;
diff --git a/net/netfilter/nft_rbtree.c b/net/netfilter/nft_rbtree.c
index f762094af7c1..7201d57b5a93 100644
--- a/net/netfilter/nft_rbtree.c
+++ b/net/netfilter/nft_rbtree.c
@@ -211,7 +211,6 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx,
struct nft_rbtree_elem *rbe;
struct nft_set_elem elem;
struct rb_node *node;
- u8 genmask = nft_genmask_cur(read_pnet(&set->pnet));
spin_lock_bh(&nft_rbtree_lock);
for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) {
@@ -219,7 +218,7 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx,
if (iter->count < iter->skip)
goto cont;
- if (!nft_set_elem_active(&rbe->ext, genmask))
+ if (!nft_set_elem_active(&rbe->ext, iter->genmask))
goto cont;
elem.priv = rbe;
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index c69c892231d7..2675d580c490 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -612,7 +612,7 @@ int xt_compat_check_entry_offsets(const void *base, const char *elems,
return -EINVAL;
if (strcmp(t->u.user.name, XT_STANDARD_TARGET) == 0 &&
- target_offset + sizeof(struct compat_xt_standard_target) != next_offset)
+ COMPAT_XT_ALIGN(target_offset + sizeof(struct compat_xt_standard_target)) != next_offset)
return -EINVAL;
/* compat_xt_entry match has less strict aligment requirements,
@@ -694,7 +694,7 @@ int xt_check_entry_offsets(const void *base,
return -EINVAL;
if (strcmp(t->u.user.name, XT_STANDARD_TARGET) == 0 &&
- target_offset + sizeof(struct xt_standard_target) != next_offset)
+ XT_ALIGN(target_offset + sizeof(struct xt_standard_target)) != next_offset)
return -EINVAL;
return xt_check_entry_match(elems, base + target_offset,
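
The x_tables.c fix above compares the expected end offset of a standard target only after rounding it up with XT_ALIGN()/COMPAT_XT_ALIGN(), because entries are padded to their natural alignment. A small standalone sketch of that round-up arithmetic, with made-up sizes; the macro below mirrors the arithmetic, not the kernel macros themselves.

#include <stddef.h>
#include <stdio.h>

/* Round x up to a multiple of a (a must be a power of two). */
#define ALIGN_UP(x, a)	(((x) + (a) - 1) & ~((size_t)(a) - 1))

int main(void)
{
	/* Made-up sizes: a target at offset 40 with a 28-byte body,
	 * padded out to an 8-byte boundary. */
	size_t target_offset = 40, target_size = 28;

	printf("unaligned end: %zu\n", target_offset + target_size);
	printf("aligned end:   %zu\n",
	       ALIGN_UP(target_offset + target_size, 8));
	return 0;
}
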
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 879185fe183f..9a3eb7a0ebf4 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -137,11 +137,23 @@ static bool is_flow_key_valid(const struct sw_flow_key *key)
return !!key->eth.type;
}
+static void update_ethertype(struct sk_buff *skb, struct ethhdr *hdr,
+ __be16 ethertype)
+{
+ if (skb->ip_summed == CHECKSUM_COMPLETE) {
+ __be16 diff[] = { ~(hdr->h_proto), ethertype };
+
+ skb->csum = ~csum_partial((char *)diff, sizeof(diff),
+ ~skb->csum);
+ }
+
+ hdr->h_proto = ethertype;
+}
+
static int push_mpls(struct sk_buff *skb, struct sw_flow_key *key,
const struct ovs_action_push_mpls *mpls)
{
__be32 *new_mpls_lse;
- struct ethhdr *hdr;
/* Networking stack do not allow simultaneous Tunnel and MPLS GSO. */
if (skb->encapsulation)
@@ -160,9 +172,7 @@ static int push_mpls(struct sk_buff *skb, struct sw_flow_key *key,
skb_postpush_rcsum(skb, new_mpls_lse, MPLS_HLEN);
- hdr = eth_hdr(skb);
- hdr->h_proto = mpls->mpls_ethertype;
-
+ update_ethertype(skb, eth_hdr(skb), mpls->mpls_ethertype);
if (!skb->inner_protocol)
skb_set_inner_protocol(skb, skb->protocol);
skb->protocol = mpls->mpls_ethertype;
@@ -193,7 +203,7 @@ static int pop_mpls(struct sk_buff *skb, struct sw_flow_key *key,
* field correctly in the presence of VLAN tags.
*/
hdr = (struct ethhdr *)(skb_mpls_header(skb) - ETH_HLEN);
- hdr->h_proto = ethertype;
+ update_ethertype(skb, hdr, ethertype);
if (eth_p_mpls(skb->protocol))
skb->protocol = ethertype;
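
update_ethertype() above keeps a CHECKSUM_COMPLETE skb consistent by feeding the old and new 16-bit h_proto values back into the one's-complement sum. The same arithmetic, written as a standalone RFC 1624-style incremental update; csum_update16() and csum_fold32() are hypothetical helpers, not the kernel's checksum API, and the values are made up.

#include <stdint.h>
#include <stdio.h>

/* Fold a 32-bit accumulator down to a 16-bit one's-complement sum. */
static uint16_t csum_fold32(uint32_t sum)
{
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)sum;
}

/* Incremental update per RFC 1624 eq. 3: HC' = ~(~HC + ~m + m'). */
static uint16_t csum_update16(uint16_t csum, uint16_t old16, uint16_t new16)
{
	uint32_t sum = (uint16_t)~csum;

	sum += (uint16_t)~old16;
	sum += new16;
	return (uint16_t)~csum_fold32(sum);
}

int main(void)
{
	uint16_t csum = 0x1c46;		/* made-up starting checksum */
	uint16_t updated;

	/* e.g. h_proto going from ETH_P_MPLS_UC (0x8847) to ETH_P_IP (0x0800) */
	updated = csum_update16(csum, 0x8847, 0x0800);
	printf("updated csum: 0x%04x\n", updated);
	return 0;
}
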
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index 3d5feede962d..d84312584ee4 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -818,8 +818,18 @@ static int ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
*/
state = OVS_CS_F_TRACKED | OVS_CS_F_NEW | OVS_CS_F_RELATED;
__ovs_ct_update_key(key, state, &info->zone, exp->master);
- } else
- return __ovs_ct_lookup(net, key, info, skb);
+ } else {
+ struct nf_conn *ct;
+ int err;
+
+ err = __ovs_ct_lookup(net, key, info, skb);
+ if (err)
+ return err;
+
+ ct = (struct nf_conn *)skb->nfct;
+ if (ct)
+ nf_ct_deliver_cached_events(ct);
+ }
return 0;
}
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 4040eb92d9c9..9bff6ef16fa7 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -93,6 +93,7 @@
#include <net/inet_common.h>
#endif
#include <linux/bpf.h>
+#include <net/compat.h>
#include "internal.h"
@@ -3940,6 +3941,27 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
}
+#ifdef CONFIG_COMPAT
+static int compat_packet_setsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, unsigned int optlen)
+{
+ struct packet_sock *po = pkt_sk(sock->sk);
+
+ if (level != SOL_PACKET)
+ return -ENOPROTOOPT;
+
+ if (optname == PACKET_FANOUT_DATA &&
+ po->fanout && po->fanout->type == PACKET_FANOUT_CBPF) {
+ optval = (char __user *)get_compat_bpf_fprog(optval);
+ if (!optval)
+ return -EFAULT;
+ optlen = sizeof(struct sock_fprog);
+ }
+
+ return packet_setsockopt(sock, level, optname, optval, optlen);
+}
+#endif
+
static int packet_notifier(struct notifier_block *this,
unsigned long msg, void *ptr)
{
@@ -4416,6 +4438,9 @@ static const struct proto_ops packet_ops = {
.shutdown = sock_no_shutdown,
.setsockopt = packet_setsockopt,
.getsockopt = packet_getsockopt,
+#ifdef CONFIG_COMPAT
+ .compat_setsockopt = compat_packet_setsockopt,
+#endif
.sendmsg = packet_sendmsg,
.recvmsg = packet_recvmsg,
.mmap = packet_mmap,
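
The af_packet.c addition above exists because a 32-bit task's struct sock_fprog does not have the layout a 64-bit kernel expects: the filter pointer is 4 bytes instead of 8, so the structure is smaller and the field sits at a different offset, and get_compat_bpf_fprog() has to translate it. The userspace sketch below only demonstrates that layout difference; the compat struct is an illustrative stand-in, not the kernel's compat definition.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Mirrors the uapi classic-BPF instruction layout. */
struct sock_filter {
	uint16_t code;
	uint8_t  jt;
	uint8_t  jf;
	uint32_t k;
};

/* Native 64-bit layout: 2-byte len, padding, 8-byte pointer. */
struct sock_fprog_native {
	unsigned short len;
	struct sock_filter *filter;
};

/* What 32-bit userspace hands in: the pointer field is a 4-byte value,
 * so both the size and the offset of 'filter' differ. */
struct sock_fprog_compat {
	unsigned short len;
	uint32_t filter;	/* 32-bit user pointer */
};

int main(void)
{
	printf("native: size %zu, filter offset %zu\n",
	       sizeof(struct sock_fprog_native),
	       offsetof(struct sock_fprog_native, filter));
	printf("compat: size %zu, filter offset %zu\n",
	       sizeof(struct sock_fprog_compat),
	       offsetof(struct sock_fprog_compat, filter));
	return 0;
}
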
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index 310cabce2311..7c2a65a6af5c 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -111,7 +111,7 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
}
}
- if (conn->c_version < RDS_PROTOCOL(3,1)) {
+ if (conn->c_version < RDS_PROTOCOL(3, 1)) {
printk(KERN_NOTICE "RDS/IB: Connection to %pI4 version %u.%u failed,"
" no longer supported\n",
&conn->c_faddr,
diff --git a/net/rds/loop.c b/net/rds/loop.c
index 6b12b68541ae..814173b466d9 100644
--- a/net/rds/loop.c
+++ b/net/rds/loop.c
@@ -95,8 +95,9 @@ out:
*/
static void rds_loop_inc_free(struct rds_incoming *inc)
{
- struct rds_message *rm = container_of(inc, struct rds_message, m_inc);
- rds_message_put(rm);
+ struct rds_message *rm = container_of(inc, struct rds_message, m_inc);
+
+ rds_message_put(rm);
}
/* we need to at least give the thread something to succeed */
diff --git a/net/rds/rds.h b/net/rds/rds.h
index 80256b08eac0..387df5f32e49 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -74,6 +74,7 @@ enum {
RDS_CONN_CONNECTING,
RDS_CONN_DISCONNECTING,
RDS_CONN_UP,
+ RDS_CONN_RESETTING,
RDS_CONN_ERROR,
};
@@ -813,6 +814,7 @@ void rds_connect_worker(struct work_struct *);
void rds_shutdown_worker(struct work_struct *);
void rds_send_worker(struct work_struct *);
void rds_recv_worker(struct work_struct *);
+void rds_connect_path_complete(struct rds_connection *conn, int curr);
void rds_connect_complete(struct rds_connection *conn);
/* transport.c */
diff --git a/net/rds/recv.c b/net/rds/recv.c
index c0be1ecd11c9..8413f6c99e13 100644
--- a/net/rds/recv.c
+++ b/net/rds/recv.c
@@ -561,5 +561,7 @@ void rds_inc_info_copy(struct rds_incoming *inc,
minfo.fport = inc->i_hdr.h_dport;
}
+ minfo.flags = 0;
+
rds_info_copy(iter, &minfo, sizeof(minfo));
}
diff --git a/net/rds/send.c b/net/rds/send.c
index c9cdb358ea88..b1962f8e30f7 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -99,6 +99,7 @@ void rds_send_reset(struct rds_connection *conn)
list_splice_init(&conn->c_retrans, &conn->c_send_queue);
spin_unlock_irqrestore(&conn->c_lock, flags);
}
+EXPORT_SYMBOL_GPL(rds_send_reset);
static int acquire_in_xmit(struct rds_connection *conn)
{
diff --git a/net/rds/sysctl.c b/net/rds/sysctl.c
index c173f69e1479..e381bbcd9cc1 100644
--- a/net/rds/sysctl.c
+++ b/net/rds/sysctl.c
@@ -102,7 +102,8 @@ int rds_sysctl_init(void)
rds_sysctl_reconnect_min = msecs_to_jiffies(1);
rds_sysctl_reconnect_min_jiffies = rds_sysctl_reconnect_min;
- rds_sysctl_reg_table = register_net_sysctl(&init_net,"net/rds", rds_sysctl_rds_table);
+ rds_sysctl_reg_table =
+ register_net_sysctl(&init_net, "net/rds", rds_sysctl_rds_table);
if (!rds_sysctl_reg_table)
return -ENOMEM;
return 0;
diff --git a/net/rds/tcp.c b/net/rds/tcp.c
index 86187dad1440..74ee126a6fe6 100644
--- a/net/rds/tcp.c
+++ b/net/rds/tcp.c
@@ -126,9 +126,81 @@ void rds_tcp_restore_callbacks(struct socket *sock,
}
/*
- * This is the only path that sets tc->t_sock. Send and receive trust that
- * it is set. The RDS_CONN_UP bit protects those paths from being
- * called while it isn't set.
+ * rds_tcp_reset_callbacks() switches the connection over to the new
+ * sock, releasing the existing tc->t_sock (if any).
+ *
+ * The only functions that set tc->t_sock are rds_tcp_set_callbacks
+ * and rds_tcp_reset_callbacks. Send and receive trust that
+ * it is set. The absence of the RDS_CONN_UP bit protects those paths
+ * from being called while it isn't set.
+ */
+void rds_tcp_reset_callbacks(struct socket *sock,
+ struct rds_connection *conn)
+{
+ struct rds_tcp_connection *tc = conn->c_transport_data;
+ struct socket *osock = tc->t_sock;
+
+ if (!osock)
+ goto newsock;
+
+ /* Need to resolve a duelling SYN between peers.
+ * We have an outstanding SYN to this peer, which may
+ * potentially have transitioned to the RDS_CONN_UP state,
+ * so we must quiesce any send threads before resetting
+ * c_transport_data. We quiesce these threads by setting
+ * c_state to something other than RDS_CONN_UP, and then
+ * waiting for any existing threads in rds_send_xmit to
+ * complete release_in_xmit(). (Subsequent threads entering
+	 * rds_send_xmit() will bail on !rds_conn_up().)
+ *
+ * However an incoming syn-ack at this point would end up
+ * marking the conn as RDS_CONN_UP, and would again permit
+	 * rds_send_xmit() threads through, so ideally we would
+ * synchronize on RDS_CONN_UP after lock_sock(), but cannot
+ * do that: waiting on !RDS_IN_XMIT after lock_sock() may
+ * end up deadlocking with tcp_sendmsg(), and the RDS_IN_XMIT
+ * would not get set. As a result, we set c_state to
+	 * RDS_CONN_RESETTING, to ensure that rds_tcp_state_change
+	 * cannot mark the conn as up via rds_conn_path_up() in the
+	 * window before lock_sock() is taken.
+ */
+ atomic_set(&conn->c_state, RDS_CONN_RESETTING);
+ wait_event(conn->c_waitq, !test_bit(RDS_IN_XMIT, &conn->c_flags));
+ lock_sock(osock->sk);
+ /* reset receive side state for rds_tcp_data_recv() for osock */
+ if (tc->t_tinc) {
+ rds_inc_put(&tc->t_tinc->ti_inc);
+ tc->t_tinc = NULL;
+ }
+ tc->t_tinc_hdr_rem = sizeof(struct rds_header);
+ tc->t_tinc_data_rem = 0;
+ tc->t_sock = NULL;
+
+ write_lock_bh(&osock->sk->sk_callback_lock);
+
+ osock->sk->sk_user_data = NULL;
+ osock->sk->sk_data_ready = tc->t_orig_data_ready;
+ osock->sk->sk_write_space = tc->t_orig_write_space;
+ osock->sk->sk_state_change = tc->t_orig_state_change;
+ write_unlock_bh(&osock->sk->sk_callback_lock);
+ release_sock(osock->sk);
+ sock_release(osock);
+newsock:
+ rds_send_reset(conn);
+ lock_sock(sock->sk);
+ write_lock_bh(&sock->sk->sk_callback_lock);
+ tc->t_sock = sock;
+ sock->sk->sk_user_data = conn;
+ sock->sk->sk_data_ready = rds_tcp_data_ready;
+ sock->sk->sk_write_space = rds_tcp_write_space;
+ sock->sk->sk_state_change = rds_tcp_state_change;
+
+ write_unlock_bh(&sock->sk->sk_callback_lock);
+ release_sock(sock->sk);
+}
+
+/* Add tc to rds_tcp_tc_list and set tc->t_sock. See comments
+ * above rds_tcp_reset_callbacks for notes about synchronization
+ * with data path
*/
void rds_tcp_set_callbacks(struct socket *sock, struct rds_connection *conn)
{
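
The long comment in rds_tcp_reset_callbacks() above describes a quiesce sequence: take the connection out of RDS_CONN_UP so new senders bail, then wait for RDS_IN_XMIT to clear before touching the socket. Below is a toy pthread/C11-atomics sketch of that ordering with invented names; it mimics the sequencing only, not the real RDS locking or socket handling.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

enum { CONN_UP, CONN_RESETTING };

static atomic_int state = CONN_UP;
static atomic_bool in_xmit;

static void *send_worker(void *arg)
{
	(void)arg;
	for (int i = 0; i < 100; i++) {
		atomic_store(&in_xmit, true);		/* like RDS_IN_XMIT */
		if (atomic_load(&state) != CONN_UP) {	/* like !rds_conn_up() */
			atomic_store(&in_xmit, false);
			break;
		}
		usleep(1000);				/* pretend to transmit */
		atomic_store(&in_xmit, false);
	}
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, send_worker, NULL);
	usleep(5000);

	atomic_store(&state, CONN_RESETTING);	/* stop new transmits... */
	while (atomic_load(&in_xmit))		/* ...and wait out old ones */
		usleep(100);

	puts("quiesced; safe to reset callbacks and swap the socket");
	pthread_join(t, NULL);
	return 0;
}
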
diff --git a/net/rds/tcp.h b/net/rds/tcp.h
index 41c228300525..7940babf6c71 100644
--- a/net/rds/tcp.h
+++ b/net/rds/tcp.h
@@ -50,6 +50,7 @@ struct rds_tcp_statistics {
void rds_tcp_tune(struct socket *sock);
void rds_tcp_nonagle(struct socket *sock);
void rds_tcp_set_callbacks(struct socket *sock, struct rds_connection *conn);
+void rds_tcp_reset_callbacks(struct socket *sock, struct rds_connection *conn);
void rds_tcp_restore_callbacks(struct socket *sock,
struct rds_tcp_connection *tc);
u32 rds_tcp_snd_nxt(struct rds_tcp_connection *tc);
@@ -82,7 +83,7 @@ int rds_tcp_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to);
void rds_tcp_xmit_prepare(struct rds_connection *conn);
void rds_tcp_xmit_complete(struct rds_connection *conn);
int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm,
- unsigned int hdr_off, unsigned int sg, unsigned int off);
+ unsigned int hdr_off, unsigned int sg, unsigned int off);
void rds_tcp_write_space(struct sock *sk);
/* tcp_stats.c */
diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c
index fb82e0a0bf89..f6e95d60db54 100644
--- a/net/rds/tcp_connect.c
+++ b/net/rds/tcp_connect.c
@@ -54,19 +54,19 @@ void rds_tcp_state_change(struct sock *sk)
rdsdebug("sock %p state_change to %d\n", tc->t_sock, sk->sk_state);
- switch(sk->sk_state) {
- /* ignore connecting sockets as they make progress */
- case TCP_SYN_SENT:
- case TCP_SYN_RECV:
- break;
- case TCP_ESTABLISHED:
- rds_connect_complete(conn);
- break;
- case TCP_CLOSE_WAIT:
- case TCP_CLOSE:
- rds_conn_drop(conn);
- default:
- break;
+ switch (sk->sk_state) {
+ /* ignore connecting sockets as they make progress */
+ case TCP_SYN_SENT:
+ case TCP_SYN_RECV:
+ break;
+ case TCP_ESTABLISHED:
+ rds_connect_path_complete(conn, RDS_CONN_CONNECTING);
+ break;
+ case TCP_CLOSE_WAIT:
+ case TCP_CLOSE:
+ rds_conn_drop(conn);
+ default:
+ break;
}
out:
read_unlock_bh(&sk->sk_callback_lock);
diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c
index 4bf4befe5066..245542ca4718 100644
--- a/net/rds/tcp_listen.c
+++ b/net/rds/tcp_listen.c
@@ -78,7 +78,6 @@ int rds_tcp_accept_one(struct socket *sock)
struct inet_sock *inet;
struct rds_tcp_connection *rs_tcp = NULL;
int conn_state;
- struct sock *nsk;
if (!sock) /* module unload or netns delete in progress */
return -ENETUNREACH;
@@ -136,26 +135,21 @@ int rds_tcp_accept_one(struct socket *sock)
!conn->c_outgoing) {
goto rst_nsk;
} else {
- atomic_set(&conn->c_state, RDS_CONN_CONNECTING);
- wait_event(conn->c_waitq,
- !test_bit(RDS_IN_XMIT, &conn->c_flags));
- rds_tcp_restore_callbacks(rs_tcp->t_sock, rs_tcp);
+ rds_tcp_reset_callbacks(new_sock, conn);
conn->c_outgoing = 0;
+ /* rds_connect_path_complete() marks RDS_CONN_UP */
+ rds_connect_path_complete(conn, RDS_CONN_RESETTING);
}
+ } else {
+ rds_tcp_set_callbacks(new_sock, conn);
+ rds_connect_path_complete(conn, RDS_CONN_CONNECTING);
}
- rds_tcp_set_callbacks(new_sock, conn);
- rds_connect_complete(conn); /* marks RDS_CONN_UP */
new_sock = NULL;
ret = 0;
goto out;
rst_nsk:
/* reset the newly returned accept sock and bail */
- nsk = new_sock->sk;
- rds_tcp_stats_inc(s_tcp_listen_closed_stale);
- nsk->sk_user_data = NULL;
- nsk->sk_prot->disconnect(nsk, 0);
- tcp_done(nsk);
- new_sock = NULL;
+ kernel_sock_shutdown(new_sock, SHUT_RDWR);
ret = 0;
out:
if (rs_tcp)
diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c
index c3196f9d070a..6e6a7111a034 100644
--- a/net/rds/tcp_recv.c
+++ b/net/rds/tcp_recv.c
@@ -171,7 +171,7 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb,
while (left) {
if (!tinc) {
tinc = kmem_cache_alloc(rds_tcp_incoming_slab,
- arg->gfp);
+ arg->gfp);
if (!tinc) {
desc->error = -ENOMEM;
goto out;
diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c
index 22d0f2020a79..618be69c9c3b 100644
--- a/net/rds/tcp_send.c
+++ b/net/rds/tcp_send.c
@@ -66,19 +66,19 @@ void rds_tcp_xmit_complete(struct rds_connection *conn)
static int rds_tcp_sendmsg(struct socket *sock, void *data, unsigned int len)
{
struct kvec vec = {
- .iov_base = data,
- .iov_len = len,
+ .iov_base = data,
+ .iov_len = len,
+ };
+ struct msghdr msg = {
+ .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL,
};
- struct msghdr msg = {
- .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL,
- };
return kernel_sendmsg(sock, &msg, &vec, 1, vec.iov_len);
}
/* the core send_sem serializes this with other xmit and shutdown */
int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm,
- unsigned int hdr_off, unsigned int sg, unsigned int off)
+ unsigned int hdr_off, unsigned int sg, unsigned int off)
{
struct rds_tcp_connection *tc = conn->c_transport_data;
int done = 0;
@@ -196,7 +196,7 @@ void rds_tcp_write_space(struct sock *sk)
tc->t_last_seen_una = rds_tcp_snd_una(tc);
rds_send_drop_acked(conn, rds_tcp_snd_una(tc), rds_tcp_is_acked);
- if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf)
+ if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf)
queue_delayed_work(rds_wq, &conn->c_send_w, 0);
out:
diff --git a/net/rds/threads.c b/net/rds/threads.c
index 454aa6d23327..4a323045719b 100644
--- a/net/rds/threads.c
+++ b/net/rds/threads.c
@@ -71,9 +71,9 @@
struct workqueue_struct *rds_wq;
EXPORT_SYMBOL_GPL(rds_wq);
-void rds_connect_complete(struct rds_connection *conn)
+void rds_connect_path_complete(struct rds_connection *conn, int curr)
{
- if (!rds_conn_transition(conn, RDS_CONN_CONNECTING, RDS_CONN_UP)) {
+ if (!rds_conn_transition(conn, curr, RDS_CONN_UP)) {
printk(KERN_WARNING "%s: Cannot transition to state UP, "
"current state is %d\n",
__func__,
@@ -90,6 +90,12 @@ void rds_connect_complete(struct rds_connection *conn)
queue_delayed_work(rds_wq, &conn->c_send_w, 0);
queue_delayed_work(rds_wq, &conn->c_recv_w, 0);
}
+EXPORT_SYMBOL_GPL(rds_connect_path_complete);
+
+void rds_connect_complete(struct rds_connection *conn)
+{
+ rds_connect_path_complete(conn, RDS_CONN_CONNECTING);
+}
EXPORT_SYMBOL_GPL(rds_connect_complete);
/*
diff --git a/net/rds/transport.c b/net/rds/transport.c
index f3afd1d60d3c..2ffd3e30c643 100644
--- a/net/rds/transport.c
+++ b/net/rds/transport.c
@@ -140,8 +140,7 @@ unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter,
rds_info_iter_unmap(iter);
down_read(&rds_trans_sem);
- for (i = 0; i < RDS_TRANS_COUNT; i++)
- {
+ for (i = 0; i < RDS_TRANS_COUNT; i++) {
trans = transports[i];
if (!trans || !trans->stats_info_copy)
continue;
diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c
index 6b726a046a7d..bab56ed649ba 100644
--- a/net/rxrpc/rxkad.c
+++ b/net/rxrpc/rxkad.c
@@ -1162,9 +1162,7 @@ static int rxkad_init(void)
/* pin the cipher we need so that the crypto layer doesn't invoke
* keventd to go get it */
rxkad_ci = crypto_alloc_skcipher("pcbc(fcrypt)", 0, CRYPTO_ALG_ASYNC);
- if (IS_ERR(rxkad_ci))
- return PTR_ERR(rxkad_ci);
- return 0;
+ return PTR_ERR_OR_ZERO(rxkad_ci);
}
/*
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 336774a535c3..c7a0b0d481c0 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -1118,7 +1118,7 @@ tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
nla_nest_end(skb, nest);
ret = skb->len;
} else
- nla_nest_cancel(skb, nest);
+ nlmsg_trim(skb, b);
nlh->nlmsg_len = skb_tail_pointer(skb) - b;
if (NETLINK_CB(cb->skb).portid && ret)
diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c
index 658046dfe02d..ea4a2fef1b71 100644
--- a/net/sched/act_ife.c
+++ b/net/sched/act_ife.c
@@ -106,9 +106,9 @@ int ife_get_meta_u16(struct sk_buff *skb, struct tcf_meta_info *mi)
}
EXPORT_SYMBOL_GPL(ife_get_meta_u16);
-int ife_alloc_meta_u32(struct tcf_meta_info *mi, void *metaval)
+int ife_alloc_meta_u32(struct tcf_meta_info *mi, void *metaval, gfp_t gfp)
{
- mi->metaval = kmemdup(metaval, sizeof(u32), GFP_KERNEL);
+ mi->metaval = kmemdup(metaval, sizeof(u32), gfp);
if (!mi->metaval)
return -ENOMEM;
@@ -116,9 +116,9 @@ int ife_alloc_meta_u32(struct tcf_meta_info *mi, void *metaval)
}
EXPORT_SYMBOL_GPL(ife_alloc_meta_u32);
-int ife_alloc_meta_u16(struct tcf_meta_info *mi, void *metaval)
+int ife_alloc_meta_u16(struct tcf_meta_info *mi, void *metaval, gfp_t gfp)
{
- mi->metaval = kmemdup(metaval, sizeof(u16), GFP_KERNEL);
+ mi->metaval = kmemdup(metaval, sizeof(u16), gfp);
if (!mi->metaval)
return -ENOMEM;
@@ -240,10 +240,10 @@ static int ife_validate_metatype(struct tcf_meta_ops *ops, void *val, int len)
}
/* called when adding new meta information
- * under ife->tcf_lock
+ * under ife->tcf_lock for existing action
*/
static int load_metaops_and_vet(struct tcf_ife_info *ife, u32 metaid,
- void *val, int len)
+ void *val, int len, bool exists)
{
struct tcf_meta_ops *ops = find_ife_oplist(metaid);
int ret = 0;
@@ -251,11 +251,13 @@ static int load_metaops_and_vet(struct tcf_ife_info *ife, u32 metaid,
if (!ops) {
ret = -ENOENT;
#ifdef CONFIG_MODULES
- spin_unlock_bh(&ife->tcf_lock);
+ if (exists)
+ spin_unlock_bh(&ife->tcf_lock);
rtnl_unlock();
request_module("ifemeta%u", metaid);
rtnl_lock();
- spin_lock_bh(&ife->tcf_lock);
+ if (exists)
+ spin_lock_bh(&ife->tcf_lock);
ops = find_ife_oplist(metaid);
#endif
}
@@ -272,10 +274,10 @@ static int load_metaops_and_vet(struct tcf_ife_info *ife, u32 metaid,
}
/* called when adding new meta information
- * under ife->tcf_lock
+ * under ife->tcf_lock for existing action
*/
static int add_metainfo(struct tcf_ife_info *ife, u32 metaid, void *metaval,
- int len)
+ int len, bool atomic)
{
struct tcf_meta_info *mi = NULL;
struct tcf_meta_ops *ops = find_ife_oplist(metaid);
@@ -284,7 +286,7 @@ static int add_metainfo(struct tcf_ife_info *ife, u32 metaid, void *metaval,
if (!ops)
return -ENOENT;
- mi = kzalloc(sizeof(*mi), GFP_KERNEL);
+ mi = kzalloc(sizeof(*mi), atomic ? GFP_ATOMIC : GFP_KERNEL);
if (!mi) {
/*put back what find_ife_oplist took */
module_put(ops->owner);
@@ -294,7 +296,7 @@ static int add_metainfo(struct tcf_ife_info *ife, u32 metaid, void *metaval,
mi->metaid = metaid;
mi->ops = ops;
if (len > 0) {
- ret = ops->alloc(mi, metaval);
+ ret = ops->alloc(mi, metaval, atomic ? GFP_ATOMIC : GFP_KERNEL);
if (ret != 0) {
kfree(mi);
module_put(ops->owner);
@@ -313,11 +315,13 @@ static int use_all_metadata(struct tcf_ife_info *ife)
int rc = 0;
int installed = 0;
+ read_lock(&ife_mod_lock);
list_for_each_entry(o, &ifeoplist, list) {
- rc = add_metainfo(ife, o->metaid, NULL, 0);
+ rc = add_metainfo(ife, o->metaid, NULL, 0, true);
if (rc == 0)
installed += 1;
}
+ read_unlock(&ife_mod_lock);
if (installed)
return 0;
@@ -385,8 +389,9 @@ static void tcf_ife_cleanup(struct tc_action *a, int bind)
spin_unlock_bh(&ife->tcf_lock);
}
-/* under ife->tcf_lock */
-static int populate_metalist(struct tcf_ife_info *ife, struct nlattr **tb)
+/* under ife->tcf_lock for existing action */
+static int populate_metalist(struct tcf_ife_info *ife, struct nlattr **tb,
+ bool exists)
{
int len = 0;
int rc = 0;
@@ -398,11 +403,11 @@ static int populate_metalist(struct tcf_ife_info *ife, struct nlattr **tb)
val = nla_data(tb[i]);
len = nla_len(tb[i]);
- rc = load_metaops_and_vet(ife, i, val, len);
+ rc = load_metaops_and_vet(ife, i, val, len, exists);
if (rc != 0)
return rc;
- rc = add_metainfo(ife, i, val, len);
+ rc = add_metainfo(ife, i, val, len, exists);
if (rc)
return rc;
}
@@ -474,7 +479,8 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla,
saddr = nla_data(tb[TCA_IFE_SMAC]);
}
- spin_lock_bh(&ife->tcf_lock);
+ if (exists)
+ spin_lock_bh(&ife->tcf_lock);
ife->tcf_action = parm->action;
if (parm->flags & IFE_ENCODE) {
@@ -504,11 +510,12 @@ metadata_parse_err:
if (ret == ACT_P_CREATED)
_tcf_ife_cleanup(a, bind);
- spin_unlock_bh(&ife->tcf_lock);
+ if (exists)
+ spin_unlock_bh(&ife->tcf_lock);
return err;
}
- err = populate_metalist(ife, tb2);
+ err = populate_metalist(ife, tb2, exists);
if (err)
goto metadata_parse_err;
@@ -523,12 +530,14 @@ metadata_parse_err:
if (ret == ACT_P_CREATED)
_tcf_ife_cleanup(a, bind);
- spin_unlock_bh(&ife->tcf_lock);
+ if (exists)
+ spin_unlock_bh(&ife->tcf_lock);
return err;
}
}
- spin_unlock_bh(&ife->tcf_lock);
+ if (exists)
+ spin_unlock_bh(&ife->tcf_lock);
if (ret == ACT_P_CREATED)
tcf_hash_insert(tn, a);
diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c
index 9f002ada7074..d4bd19ee5822 100644
--- a/net/sched/act_ipt.c
+++ b/net/sched/act_ipt.c
@@ -121,10 +121,13 @@ static int __tcf_ipt_init(struct tc_action_net *tn, struct nlattr *nla,
}
td = (struct xt_entry_target *)nla_data(tb[TCA_IPT_TARG]);
- if (nla_len(tb[TCA_IPT_TARG]) < td->u.target_size)
+ if (nla_len(tb[TCA_IPT_TARG]) < td->u.target_size) {
+ if (exists)
+ tcf_hash_release(a, bind);
return -EINVAL;
+ }
- if (!tcf_hash_check(tn, index, a, bind)) {
+ if (!exists) {
ret = tcf_hash_create(tn, index, est, a, sizeof(*ipt), bind,
false);
if (ret)
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index b884dae692a1..c557789765dc 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -38,7 +38,7 @@ struct tcf_police {
bool peak_present;
};
#define to_police(pc) \
- container_of(pc, struct tcf_police, common)
+ container_of(pc->priv, struct tcf_police, common)
#define POL_TAB_MASK 15
@@ -119,14 +119,12 @@ static int tcf_act_police_locate(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action *a,
int ovr, int bind)
{
- unsigned int h;
int ret = 0, err;
struct nlattr *tb[TCA_POLICE_MAX + 1];
struct tc_police *parm;
struct tcf_police *police;
struct qdisc_rate_table *R_tab = NULL, *P_tab = NULL;
struct tc_action_net *tn = net_generic(net, police_net_id);
- struct tcf_hashinfo *hinfo = tn->hinfo;
int size;
if (nla == NULL)
@@ -145,7 +143,7 @@ static int tcf_act_police_locate(struct net *net, struct nlattr *nla,
if (parm->index) {
if (tcf_hash_search(tn, a, parm->index)) {
- police = to_police(a->priv);
+ police = to_police(a);
if (bind) {
police->tcf_bindcnt += 1;
police->tcf_refcnt += 1;
@@ -156,16 +154,15 @@ static int tcf_act_police_locate(struct net *net, struct nlattr *nla,
/* not replacing */
return -EEXIST;
}
+ } else {
+ ret = tcf_hash_create(tn, parm->index, NULL, a,
+ sizeof(*police), bind, false);
+ if (ret)
+ return ret;
+ ret = ACT_P_CREATED;
}
- police = kzalloc(sizeof(*police), GFP_KERNEL);
- if (police == NULL)
- return -ENOMEM;
- ret = ACT_P_CREATED;
- police->tcf_refcnt = 1;
- spin_lock_init(&police->tcf_lock);
- if (bind)
- police->tcf_bindcnt = 1;
+ police = to_police(a);
override:
if (parm->rate.rate) {
err = -ENOMEM;
@@ -237,16 +234,8 @@ override:
return ret;
police->tcfp_t_c = ktime_get_ns();
- police->tcf_index = parm->index ? parm->index :
- tcf_hash_new_index(tn);
- police->tcf_tm.install = jiffies;
- police->tcf_tm.lastuse = jiffies;
- h = tcf_hash(police->tcf_index, POL_TAB_MASK);
- spin_lock_bh(&hinfo->lock);
- hlist_add_head(&police->tcf_head, &hinfo->htab[h]);
- spin_unlock_bh(&hinfo->lock);
+ tcf_hash_insert(tn, a);
- a->priv = police;
return ret;
failure_unlock:
@@ -255,7 +244,7 @@ failure:
qdisc_put_rtab(P_tab);
qdisc_put_rtab(R_tab);
if (ret == ACT_P_CREATED)
- kfree(police);
+ tcf_hash_cleanup(a, est);
return err;
}
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 730aacafc22d..b3b7978f4182 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -171,7 +171,7 @@ static void fl_hw_destroy_filter(struct tcf_proto *tp, unsigned long cookie)
struct tc_cls_flower_offload offload = {0};
struct tc_to_netdev tc;
- if (!tc_should_offload(dev, 0))
+ if (!tc_should_offload(dev, tp, 0))
return;
offload.command = TC_CLSFLOWER_DESTROY;
@@ -194,7 +194,7 @@ static void fl_hw_replace_filter(struct tcf_proto *tp,
struct tc_cls_flower_offload offload = {0};
struct tc_to_netdev tc;
- if (!tc_should_offload(dev, flags))
+ if (!tc_should_offload(dev, tp, flags))
return;
offload.command = TC_CLSFLOWER_REPLACE;
@@ -216,7 +216,7 @@ static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f)
struct tc_cls_flower_offload offload = {0};
struct tc_to_netdev tc;
- if (!tc_should_offload(dev, 0))
+ if (!tc_should_offload(dev, tp, 0))
return;
offload.command = TC_CLSFLOWER_STATS;
diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index 079b43b3c5d2..ffe593efe930 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -440,7 +440,7 @@ static void u32_remove_hw_knode(struct tcf_proto *tp, u32 handle)
offload.type = TC_SETUP_CLSU32;
offload.cls_u32 = &u32_offload;
- if (tc_should_offload(dev, 0)) {
+ if (tc_should_offload(dev, tp, 0)) {
offload.cls_u32->command = TC_CLSU32_DELETE_KNODE;
offload.cls_u32->knode.handle = handle;
dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle,
@@ -457,20 +457,21 @@ static int u32_replace_hw_hnode(struct tcf_proto *tp,
struct tc_to_netdev offload;
int err;
+ if (!tc_should_offload(dev, tp, flags))
+ return tc_skip_sw(flags) ? -EINVAL : 0;
+
offload.type = TC_SETUP_CLSU32;
offload.cls_u32 = &u32_offload;
- if (tc_should_offload(dev, flags)) {
- offload.cls_u32->command = TC_CLSU32_NEW_HNODE;
- offload.cls_u32->hnode.divisor = h->divisor;
- offload.cls_u32->hnode.handle = h->handle;
- offload.cls_u32->hnode.prio = h->prio;
+ offload.cls_u32->command = TC_CLSU32_NEW_HNODE;
+ offload.cls_u32->hnode.divisor = h->divisor;
+ offload.cls_u32->hnode.handle = h->handle;
+ offload.cls_u32->hnode.prio = h->prio;
- err = dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle,
- tp->protocol, &offload);
- if (tc_skip_sw(flags))
- return err;
- }
+ err = dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle,
+ tp->protocol, &offload);
+ if (tc_skip_sw(flags))
+ return err;
return 0;
}
@@ -484,7 +485,7 @@ static void u32_clear_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h)
offload.type = TC_SETUP_CLSU32;
offload.cls_u32 = &u32_offload;
- if (tc_should_offload(dev, 0)) {
+ if (tc_should_offload(dev, tp, 0)) {
offload.cls_u32->command = TC_CLSU32_DELETE_HNODE;
offload.cls_u32->hnode.divisor = h->divisor;
offload.cls_u32->hnode.handle = h->handle;
@@ -507,27 +508,28 @@ static int u32_replace_hw_knode(struct tcf_proto *tp,
offload.type = TC_SETUP_CLSU32;
offload.cls_u32 = &u32_offload;
- if (tc_should_offload(dev, flags)) {
- offload.cls_u32->command = TC_CLSU32_REPLACE_KNODE;
- offload.cls_u32->knode.handle = n->handle;
- offload.cls_u32->knode.fshift = n->fshift;
+ if (!tc_should_offload(dev, tp, flags))
+ return tc_skip_sw(flags) ? -EINVAL : 0;
+
+ offload.cls_u32->command = TC_CLSU32_REPLACE_KNODE;
+ offload.cls_u32->knode.handle = n->handle;
+ offload.cls_u32->knode.fshift = n->fshift;
#ifdef CONFIG_CLS_U32_MARK
- offload.cls_u32->knode.val = n->val;
- offload.cls_u32->knode.mask = n->mask;
+ offload.cls_u32->knode.val = n->val;
+ offload.cls_u32->knode.mask = n->mask;
#else
- offload.cls_u32->knode.val = 0;
- offload.cls_u32->knode.mask = 0;
+ offload.cls_u32->knode.val = 0;
+ offload.cls_u32->knode.mask = 0;
#endif
- offload.cls_u32->knode.sel = &n->sel;
- offload.cls_u32->knode.exts = &n->exts;
- if (n->ht_down)
- offload.cls_u32->knode.link_handle = n->ht_down->handle;
-
- err = dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle,
- tp->protocol, &offload);
- if (tc_skip_sw(flags))
- return err;
- }
+ offload.cls_u32->knode.sel = &n->sel;
+ offload.cls_u32->knode.exts = &n->exts;
+ if (n->ht_down)
+ offload.cls_u32->knode.link_handle = n->ht_down->handle;
+
+ err = dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle,
+ tp->protocol, &offload);
+ if (tc_skip_sw(flags))
+ return err;
return 0;
}
@@ -863,7 +865,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
if (tb[TCA_U32_FLAGS]) {
flags = nla_get_u32(tb[TCA_U32_FLAGS]);
if (!tc_flags_valid(flags))
- return err;
+ return -EINVAL;
}
n = (struct tc_u_knode *)*arg;
@@ -921,11 +923,17 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
ht->divisor = divisor;
ht->handle = handle;
ht->prio = tp->prio;
+
+ err = u32_replace_hw_hnode(tp, ht, flags);
+ if (err) {
+ kfree(ht);
+ return err;
+ }
+
RCU_INIT_POINTER(ht->next, tp_c->hlist);
rcu_assign_pointer(tp_c->hlist, ht);
*arg = (unsigned long)ht;
- u32_replace_hw_hnode(tp, ht, flags);
return 0;
}
diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c
index a63e879e8975..bf8af2c43c2c 100644
--- a/net/sched/sch_drr.c
+++ b/net/sched/sch_drr.c
@@ -375,6 +375,7 @@ static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch)
cl->deficit = cl->quantum;
}
+ qdisc_qstats_backlog_inc(sch, skb);
sch->q.qlen++;
return err;
}
@@ -407,6 +408,7 @@ static struct sk_buff *drr_dequeue(struct Qdisc *sch)
bstats_update(&cl->bstats, skb);
qdisc_bstats_update(sch, skb);
+ qdisc_qstats_backlog_dec(sch, skb);
sch->q.qlen--;
return skb;
}
@@ -428,6 +430,7 @@ static unsigned int drr_drop(struct Qdisc *sch)
if (cl->qdisc->ops->drop) {
len = cl->qdisc->ops->drop(cl->qdisc);
if (len > 0) {
+ sch->qstats.backlog -= len;
sch->q.qlen--;
if (cl->qdisc->q.qlen == 0)
list_del(&cl->alist);
@@ -463,6 +466,7 @@ static void drr_reset_qdisc(struct Qdisc *sch)
qdisc_reset(cl->qdisc);
}
}
+ sch->qstats.backlog = 0;
sch->q.qlen = 0;
}
diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c
index 2177eac0a61e..2e4bd2c0a50c 100644
--- a/net/sched/sch_fifo.c
+++ b/net/sched/sch_fifo.c
@@ -37,14 +37,18 @@ static int pfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch)
static int pfifo_tail_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
+ unsigned int prev_backlog;
+
if (likely(skb_queue_len(&sch->q) < sch->limit))
return qdisc_enqueue_tail(skb, sch);
+ prev_backlog = sch->qstats.backlog;
/* queue full, remove one skb to fulfill the limit */
__qdisc_queue_drop_head(sch, &sch->q);
qdisc_qstats_drop(sch);
qdisc_enqueue_tail(skb, sch);
+ qdisc_tree_reduce_backlog(sch, 0, prev_backlog - sch->qstats.backlog);
return NET_XMIT_CN;
}
diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c
index 6883a8971562..da250b2e06ae 100644
--- a/net/sched/sch_fq_codel.c
+++ b/net/sched/sch_fq_codel.c
@@ -199,6 +199,7 @@ static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch)
unsigned int idx, prev_backlog, prev_qlen;
struct fq_codel_flow *flow;
int uninitialized_var(ret);
+ unsigned int pkt_len;
bool memory_limited;
idx = fq_codel_classify(skb, sch, &ret);
@@ -230,6 +231,8 @@ static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch)
prev_backlog = sch->qstats.backlog;
prev_qlen = sch->q.qlen;
+ /* save this packet length as it might be dropped by fq_codel_drop() */
+ pkt_len = qdisc_pkt_len(skb);
/* fq_codel_drop() is quite expensive, as it performs a linear search
* in q->backlogs[] to find a fat flow.
* So instead of dropping a single packet, drop half of its backlog
@@ -237,14 +240,23 @@ static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch)
*/
ret = fq_codel_drop(sch, q->drop_batch_size);
- q->drop_overlimit += prev_qlen - sch->q.qlen;
+ prev_qlen -= sch->q.qlen;
+ prev_backlog -= sch->qstats.backlog;
+ q->drop_overlimit += prev_qlen;
if (memory_limited)
- q->drop_overmemory += prev_qlen - sch->q.qlen;
- /* As we dropped packet(s), better let upper stack know this */
- qdisc_tree_reduce_backlog(sch, prev_qlen - sch->q.qlen,
- prev_backlog - sch->qstats.backlog);
+ q->drop_overmemory += prev_qlen;
- return ret == idx ? NET_XMIT_CN : NET_XMIT_SUCCESS;
+ /* As we dropped packet(s), better let upper stack know this.
+ * If we dropped a packet for this flow, return NET_XMIT_CN,
+	 * but in this case, our parents won't increase their backlogs.
+ */
+ if (ret == idx) {
+ qdisc_tree_reduce_backlog(sch, prev_qlen - 1,
+ prev_backlog - pkt_len);
+ return NET_XMIT_CN;
+ }
+ qdisc_tree_reduce_backlog(sch, prev_qlen, prev_backlog);
+ return NET_XMIT_SUCCESS;
}
/* This is the specific function called from codel_dequeue()
@@ -649,7 +661,7 @@ static int fq_codel_dump_class_stats(struct Qdisc *sch, unsigned long cl,
qs.backlog = q->backlogs[idx];
qs.drops = flow->dropped;
}
- if (gnet_stats_copy_queue(d, NULL, &qs, 0) < 0)
+ if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
return -1;
if (idx < q->flows_cnt)
return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
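
In the fq_codel_enqueue() hunk above, prev_qlen and prev_backlog end up holding the deltas caused by fq_codel_drop(), and the NET_XMIT_CN path reports one packet and pkt_len bytes less to the parents, since they never accounted the packet being enqueued. A toy calculation with made-up numbers showing that bookkeeping:

#include <stdio.h>

int main(void)
{
	unsigned int prev_qlen = 1024, prev_backlog = 1500000; /* before drop */
	unsigned int qlen = 1000, backlog = 1464000;           /* after drop  */
	unsigned int pkt_len = 1000;	/* the packet we just enqueued */
	int dropped_own_flow = 1;	/* corresponds to ret == idx above */

	unsigned int dropped_pkts  = prev_qlen - qlen;
	unsigned int dropped_bytes = prev_backlog - backlog;

	if (dropped_own_flow) {
		/* NET_XMIT_CN: the parent never counted this packet,
		 * so report one packet and pkt_len bytes less. */
		printf("reduce parents by %u pkts / %u bytes, return NET_XMIT_CN\n",
		       dropped_pkts - 1, dropped_bytes - pkt_len);
	} else {
		printf("reduce parents by %u pkts / %u bytes, return SUCCESS\n",
		       dropped_pkts, dropped_bytes);
	}
	return 0;
}
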
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 269dd71b3828..f9e0e9c03d0a 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -49,6 +49,7 @@ static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
{
q->gso_skb = skb;
q->qstats.requeues++;
+ qdisc_qstats_backlog_inc(q, skb);
q->q.qlen++; /* it's still part of the queue */
__netif_schedule(q);
@@ -92,6 +93,7 @@ static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate,
txq = skb_get_tx_queue(txq->dev, skb);
if (!netif_xmit_frozen_or_stopped(txq)) {
q->gso_skb = NULL;
+ qdisc_qstats_backlog_dec(q, skb);
q->q.qlen--;
} else
skb = NULL;
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index d783d7cc3348..1ac9f9f03fe3 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -1529,6 +1529,7 @@ hfsc_reset_qdisc(struct Qdisc *sch)
q->eligible = RB_ROOT;
INIT_LIST_HEAD(&q->droplist);
qdisc_watchdog_cancel(&q->watchdog);
+ sch->qstats.backlog = 0;
sch->q.qlen = 0;
}
@@ -1559,14 +1560,6 @@ hfsc_dump_qdisc(struct Qdisc *sch, struct sk_buff *skb)
struct hfsc_sched *q = qdisc_priv(sch);
unsigned char *b = skb_tail_pointer(skb);
struct tc_hfsc_qopt qopt;
- struct hfsc_class *cl;
- unsigned int i;
-
- sch->qstats.backlog = 0;
- for (i = 0; i < q->clhash.hashsize; i++) {
- hlist_for_each_entry(cl, &q->clhash.hash[i], cl_common.hnode)
- sch->qstats.backlog += cl->qdisc->qstats.backlog;
- }
qopt.defcls = q->defcls;
if (nla_put(skb, TCA_OPTIONS, sizeof(qopt), &qopt))
@@ -1604,6 +1597,7 @@ hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch)
if (cl->qdisc->q.qlen == 1)
set_active(cl, qdisc_pkt_len(skb));
+ qdisc_qstats_backlog_inc(sch, skb);
sch->q.qlen++;
return NET_XMIT_SUCCESS;
@@ -1672,6 +1666,7 @@ hfsc_dequeue(struct Qdisc *sch)
qdisc_unthrottled(sch);
qdisc_bstats_update(sch, skb);
+ qdisc_qstats_backlog_dec(sch, skb);
sch->q.qlen--;
return skb;
@@ -1695,6 +1690,7 @@ hfsc_drop(struct Qdisc *sch)
}
cl->qstats.drops++;
qdisc_qstats_drop(sch);
+ sch->qstats.backlog -= len;
sch->q.qlen--;
return len;
}
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index d4b4218af6b1..62f9d8100c6e 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -1007,7 +1007,9 @@ static void htb_work_func(struct work_struct *work)
struct htb_sched *q = container_of(work, struct htb_sched, work);
struct Qdisc *sch = q->watchdog.qdisc;
+ rcu_read_lock();
__netif_schedule(qdisc_root(sch));
+ rcu_read_unlock();
}
static int htb_init(struct Qdisc *sch, struct nlattr *opt)
diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c
index 10adbc617905..8fe6999b642a 100644
--- a/net/sched/sch_ingress.c
+++ b/net/sched/sch_ingress.c
@@ -27,6 +27,11 @@ static unsigned long ingress_get(struct Qdisc *sch, u32 classid)
return TC_H_MIN(classid) + 1;
}
+static bool ingress_cl_offload(u32 classid)
+{
+ return true;
+}
+
static unsigned long ingress_bind_filter(struct Qdisc *sch,
unsigned long parent, u32 classid)
{
@@ -86,6 +91,7 @@ static const struct Qdisc_class_ops ingress_class_ops = {
.put = ingress_put,
.walk = ingress_walk,
.tcf_chain = ingress_find_tcf,
+ .tcf_cl_offload = ingress_cl_offload,
.bind_tcf = ingress_bind_filter,
.unbind_tcf = ingress_put,
};
@@ -110,6 +116,11 @@ static unsigned long clsact_get(struct Qdisc *sch, u32 classid)
}
}
+static bool clsact_cl_offload(u32 classid)
+{
+ return TC_H_MIN(classid) == TC_H_MIN(TC_H_MIN_INGRESS);
+}
+
static unsigned long clsact_bind_filter(struct Qdisc *sch,
unsigned long parent, u32 classid)
{
@@ -158,6 +169,7 @@ static const struct Qdisc_class_ops clsact_class_ops = {
.put = ingress_put,
.walk = ingress_walk,
.tcf_chain = clsact_find_tcf,
+ .tcf_cl_offload = clsact_cl_offload,
.bind_tcf = clsact_bind_filter,
.unbind_tcf = ingress_put,
};
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 205bed00dd34..178f1630a036 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -650,14 +650,14 @@ deliver:
#endif
if (q->qdisc) {
+ unsigned int pkt_len = qdisc_pkt_len(skb);
int err = qdisc_enqueue(skb, q->qdisc);
- if (unlikely(err != NET_XMIT_SUCCESS)) {
- if (net_xmit_drop_count(err)) {
- qdisc_qstats_drop(sch);
- qdisc_tree_reduce_backlog(sch, 1,
- qdisc_pkt_len(skb));
- }
+ if (err != NET_XMIT_SUCCESS &&
+ net_xmit_drop_count(err)) {
+ qdisc_qstats_drop(sch);
+ qdisc_tree_reduce_backlog(sch, 1,
+ pkt_len);
}
goto tfifo_dequeue;
}
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
index fee1b15506b2..a356450b747b 100644
--- a/net/sched/sch_prio.c
+++ b/net/sched/sch_prio.c
@@ -85,6 +85,7 @@ prio_enqueue(struct sk_buff *skb, struct Qdisc *sch)
ret = qdisc_enqueue(skb, qdisc);
if (ret == NET_XMIT_SUCCESS) {
+ qdisc_qstats_backlog_inc(sch, skb);
sch->q.qlen++;
return NET_XMIT_SUCCESS;
}
@@ -117,6 +118,7 @@ static struct sk_buff *prio_dequeue(struct Qdisc *sch)
struct sk_buff *skb = qdisc_dequeue_peeked(qdisc);
if (skb) {
qdisc_bstats_update(sch, skb);
+ qdisc_qstats_backlog_dec(sch, skb);
sch->q.qlen--;
return skb;
}
@@ -135,6 +137,7 @@ static unsigned int prio_drop(struct Qdisc *sch)
for (prio = q->bands-1; prio >= 0; prio--) {
qdisc = q->queues[prio];
if (qdisc->ops->drop && (len = qdisc->ops->drop(qdisc)) != 0) {
+ sch->qstats.backlog -= len;
sch->q.qlen--;
return len;
}
@@ -151,6 +154,7 @@ prio_reset(struct Qdisc *sch)
for (prio = 0; prio < q->bands; prio++)
qdisc_reset(q->queues[prio]);
+ sch->qstats.backlog = 0;
sch->q.qlen = 0;
}
@@ -168,8 +172,9 @@ prio_destroy(struct Qdisc *sch)
static int prio_tune(struct Qdisc *sch, struct nlattr *opt)
{
struct prio_sched_data *q = qdisc_priv(sch);
+ struct Qdisc *queues[TCQ_PRIO_BANDS];
+ int oldbands = q->bands, i;
struct tc_prio_qopt *qopt;
- int i;
if (nla_len(opt) < sizeof(*qopt))
return -EINVAL;
@@ -183,62 +188,42 @@ static int prio_tune(struct Qdisc *sch, struct nlattr *opt)
return -EINVAL;
}
+ /* Before commit, make sure we can allocate all new qdiscs */
+ for (i = oldbands; i < qopt->bands; i++) {
+ queues[i] = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
+ TC_H_MAKE(sch->handle, i + 1));
+ if (!queues[i]) {
+ while (i > oldbands)
+ qdisc_destroy(queues[--i]);
+ return -ENOMEM;
+ }
+ }
+
sch_tree_lock(sch);
q->bands = qopt->bands;
memcpy(q->prio2band, qopt->priomap, TC_PRIO_MAX+1);
- for (i = q->bands; i < TCQ_PRIO_BANDS; i++) {
+ for (i = q->bands; i < oldbands; i++) {
struct Qdisc *child = q->queues[i];
- q->queues[i] = &noop_qdisc;
- if (child != &noop_qdisc) {
- qdisc_tree_reduce_backlog(child, child->q.qlen, child->qstats.backlog);
- qdisc_destroy(child);
- }
- }
- sch_tree_unlock(sch);
- for (i = 0; i < q->bands; i++) {
- if (q->queues[i] == &noop_qdisc) {
- struct Qdisc *child, *old;
-
- child = qdisc_create_dflt(sch->dev_queue,
- &pfifo_qdisc_ops,
- TC_H_MAKE(sch->handle, i + 1));
- if (child) {
- sch_tree_lock(sch);
- old = q->queues[i];
- q->queues[i] = child;
-
- if (old != &noop_qdisc) {
- qdisc_tree_reduce_backlog(old,
- old->q.qlen,
- old->qstats.backlog);
- qdisc_destroy(old);
- }
- sch_tree_unlock(sch);
- }
- }
+ qdisc_tree_reduce_backlog(child, child->q.qlen,
+ child->qstats.backlog);
+ qdisc_destroy(child);
}
+
+ for (i = oldbands; i < q->bands; i++)
+ q->queues[i] = queues[i];
+
+ sch_tree_unlock(sch);
return 0;
}
static int prio_init(struct Qdisc *sch, struct nlattr *opt)
{
- struct prio_sched_data *q = qdisc_priv(sch);
- int i;
-
- for (i = 0; i < TCQ_PRIO_BANDS; i++)
- q->queues[i] = &noop_qdisc;
-
- if (opt == NULL) {
+ if (!opt)
return -EINVAL;
- } else {
- int err;
- if ((err = prio_tune(sch, opt)) != 0)
- return err;
- }
- return 0;
+ return prio_tune(sch, opt);
}
static int prio_dump(struct Qdisc *sch, struct sk_buff *skb)
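
The prio_tune() rewrite above moves all fallible work (qdisc_create_dflt() for each new band) before sch_tree_lock(), so the commit phase under the lock cannot fail and the old configuration survives an allocation error intact. A hedged userspace sketch of that allocate-then-commit shape, with hypothetical names and a fixed-size band table standing in for the qdisc machinery:

    #include <pthread.h>
    #include <stdlib.h>

    #define MAX_BANDS 16

    struct band { int id; };

    static struct band *bands[MAX_BANDS];
    static int nbands;
    static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER;

    /* Phase 1: allocate every new object with no lock held; on failure,
     * roll back and leave the existing configuration untouched.
     * Phase 2: commit under the lock, doing only work that cannot fail. */
    static int resize_bands(int newbands)
    {
        struct band *staged[MAX_BANDS];
        int old = nbands, i;

        if (newbands < 1 || newbands > MAX_BANDS)
            return -1;

        for (i = old; i < newbands; i++) {
            staged[i] = calloc(1, sizeof(*staged[i]));
            if (!staged[i]) {
                while (i > old)
                    free(staged[--i]);
                return -1;              /* nothing was committed */
            }
            staged[i]->id = i;
        }

        pthread_mutex_lock(&tree_lock);
        for (i = newbands; i < old; i++)
            free(bands[i]);             /* drop bands that no longer exist */
        for (i = old; i < newbands; i++)
            bands[i] = staged[i];
        nbands = newbands;
        pthread_mutex_unlock(&tree_lock);
        return 0;
    }

    int main(void)
    {
        return resize_bands(3) || resize_bands(2);
    }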
diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c
index 8d2d8d953432..f18857febdad 100644
--- a/net/sched/sch_qfq.c
+++ b/net/sched/sch_qfq.c
@@ -1235,8 +1235,10 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
cl->agg->lmax, qdisc_pkt_len(skb), cl->common.classid);
err = qfq_change_agg(sch, cl, cl->agg->class_weight,
qdisc_pkt_len(skb));
- if (err)
- return err;
+ if (err) {
+ cl->qstats.drops++;
+ return qdisc_drop(skb, sch);
+ }
}
err = qdisc_enqueue(skb, cl->qdisc);
diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index 8c0508c0e287..91578bdd378c 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -97,6 +97,7 @@ static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch)
ret = qdisc_enqueue(skb, child);
if (likely(ret == NET_XMIT_SUCCESS)) {
+ qdisc_qstats_backlog_inc(sch, skb);
sch->q.qlen++;
} else if (net_xmit_drop_count(ret)) {
q->stats.pdrop++;
@@ -118,6 +119,7 @@ static struct sk_buff *red_dequeue(struct Qdisc *sch)
skb = child->dequeue(child);
if (skb) {
qdisc_bstats_update(sch, skb);
+ qdisc_qstats_backlog_dec(sch, skb);
sch->q.qlen--;
} else {
if (!red_is_idling(&q->vars))
@@ -143,6 +145,7 @@ static unsigned int red_drop(struct Qdisc *sch)
if (child->ops->drop && (len = child->ops->drop(child)) > 0) {
q->stats.other++;
qdisc_qstats_drop(sch);
+ sch->qstats.backlog -= len;
sch->q.qlen--;
return len;
}
@@ -158,6 +161,7 @@ static void red_reset(struct Qdisc *sch)
struct red_sched_data *q = qdisc_priv(sch);
qdisc_reset(q->qdisc);
+ sch->qstats.backlog = 0;
sch->q.qlen = 0;
red_restart(&q->vars);
}
diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c
index 83b90b584fae..3161e491990b 100644
--- a/net/sched/sch_tbf.c
+++ b/net/sched/sch_tbf.c
@@ -207,6 +207,7 @@ static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch)
return ret;
}
+ qdisc_qstats_backlog_inc(sch, skb);
sch->q.qlen++;
return NET_XMIT_SUCCESS;
}
@@ -217,6 +218,7 @@ static unsigned int tbf_drop(struct Qdisc *sch)
unsigned int len = 0;
if (q->qdisc->ops->drop && (len = q->qdisc->ops->drop(q->qdisc)) != 0) {
+ sch->qstats.backlog -= len;
sch->q.qlen--;
qdisc_qstats_drop(sch);
}
@@ -263,6 +265,7 @@ static struct sk_buff *tbf_dequeue(struct Qdisc *sch)
q->t_c = now;
q->tokens = toks;
q->ptokens = ptoks;
+ qdisc_qstats_backlog_dec(sch, skb);
sch->q.qlen--;
qdisc_unthrottled(sch);
qdisc_bstats_update(sch, skb);
@@ -294,6 +297,7 @@ static void tbf_reset(struct Qdisc *sch)
struct tbf_sched_data *q = qdisc_priv(sch);
qdisc_reset(q->qdisc);
+ sch->qstats.backlog = 0;
sch->q.qlen = 0;
q->t_c = ktime_get_ns();
q->tokens = q->buffer;
diff --git a/net/sctp/sctp_diag.c b/net/sctp/sctp_diag.c
index 1ce724b87618..f69edcf219e5 100644
--- a/net/sctp/sctp_diag.c
+++ b/net/sctp/sctp_diag.c
@@ -3,12 +3,6 @@
#include <linux/sock_diag.h>
#include <net/sctp/sctp.h>
-extern void inet_diag_msg_common_fill(struct inet_diag_msg *r,
- struct sock *sk);
-extern int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb,
- struct inet_diag_msg *r, int ext,
- struct user_namespace *user_ns);
-
static void sctp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
void *info);
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 777d0324594a..67154b848aa9 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -4220,6 +4220,7 @@ int sctp_get_sctp_info(struct sock *sk, struct sctp_association *asoc,
info->sctpi_s_disable_fragments = sp->disable_fragments;
info->sctpi_s_v4mapped = sp->v4mapped;
info->sctpi_s_frag_interleave = sp->frag_interleave;
+ info->sctpi_s_type = sp->type;
return 0;
}
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index 02f53674dc39..040ff627c18a 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -543,7 +543,7 @@ rpcauth_cache_enforce_limit(void)
*/
struct rpc_cred *
rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred,
- int flags)
+ int flags, gfp_t gfp)
{
LIST_HEAD(free);
struct rpc_cred_cache *cache = auth->au_credcache;
@@ -580,7 +580,7 @@ rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred,
if (flags & RPCAUTH_LOOKUP_RCU)
return ERR_PTR(-ECHILD);
- new = auth->au_ops->crcreate(auth, acred, flags);
+ new = auth->au_ops->crcreate(auth, acred, flags, gfp);
if (IS_ERR(new)) {
cred = new;
goto out;
@@ -703,8 +703,7 @@ rpcauth_bindcred(struct rpc_task *task, struct rpc_cred *cred, int flags)
new = rpcauth_bind_new_cred(task, lookupflags);
if (IS_ERR(new))
return PTR_ERR(new);
- if (req->rq_cred != NULL)
- put_rpccred(req->rq_cred);
+ put_rpccred(req->rq_cred);
req->rq_cred = new;
return 0;
}
@@ -712,6 +711,8 @@ rpcauth_bindcred(struct rpc_task *task, struct rpc_cred *cred, int flags)
void
put_rpccred(struct rpc_cred *cred)
{
+ if (cred == NULL)
+ return;
/* Fast path for unhashed credentials */
if (test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags) == 0) {
if (atomic_dec_and_test(&cred->cr_count))
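
Making put_rpccred() tolerate a NULL argument is what lets rpcauth_bindcred() drop its "if (req->rq_cred != NULL)" guard, following the same convention as kfree(). A minimal sketch of the idiom with illustrative types (not the sunrpc ones):

    #include <stdlib.h>
    #include <stdio.h>

    struct ref { int count; };

    /* Accept NULL, like kfree(): callers can drop their own guards. */
    static void ref_put(struct ref *r)
    {
        if (r == NULL)
            return;
        if (--r->count == 0)
            free(r);
    }

    int main(void)
    {
        struct ref *r = calloc(1, sizeof(*r));

        if (!r)
            return 1;
        r->count = 1;
        ref_put(NULL);          /* harmless no-op */
        ref_put(r);             /* drops the last reference */
        puts("done");
        return 0;
    }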
diff --git a/net/sunrpc/auth_generic.c b/net/sunrpc/auth_generic.c
index 41248b1820c7..54dd3fdead54 100644
--- a/net/sunrpc/auth_generic.c
+++ b/net/sunrpc/auth_generic.c
@@ -38,6 +38,13 @@ struct rpc_cred *rpc_lookup_cred(void)
}
EXPORT_SYMBOL_GPL(rpc_lookup_cred);
+struct rpc_cred *
+rpc_lookup_generic_cred(struct auth_cred *acred, int flags, gfp_t gfp)
+{
+ return rpcauth_lookup_credcache(&generic_auth, acred, flags, gfp);
+}
+EXPORT_SYMBOL_GPL(rpc_lookup_generic_cred);
+
struct rpc_cred *rpc_lookup_cred_nonblock(void)
{
return rpcauth_lookupcred(&generic_auth, RPCAUTH_LOOKUP_RCU);
@@ -77,15 +84,15 @@ static struct rpc_cred *generic_bind_cred(struct rpc_task *task,
static struct rpc_cred *
generic_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
{
- return rpcauth_lookup_credcache(&generic_auth, acred, flags);
+ return rpcauth_lookup_credcache(&generic_auth, acred, flags, GFP_KERNEL);
}
static struct rpc_cred *
-generic_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
+generic_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags, gfp_t gfp)
{
struct generic_cred *gcred;
- gcred = kmalloc(sizeof(*gcred), GFP_KERNEL);
+ gcred = kmalloc(sizeof(*gcred), gfp);
if (gcred == NULL)
return ERR_PTR(-ENOMEM);
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index 15612ffa8d57..e64ae93d5b4f 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -1299,11 +1299,11 @@ gss_destroy_cred(struct rpc_cred *cred)
static struct rpc_cred *
gss_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
{
- return rpcauth_lookup_credcache(auth, acred, flags);
+ return rpcauth_lookup_credcache(auth, acred, flags, GFP_NOFS);
}
static struct rpc_cred *
-gss_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
+gss_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags, gfp_t gfp)
{
struct gss_auth *gss_auth = container_of(auth, struct gss_auth, rpc_auth);
struct gss_cred *cred = NULL;
@@ -1313,7 +1313,7 @@ gss_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
__func__, from_kuid(&init_user_ns, acred->uid),
auth->au_flavor);
- if (!(cred = kzalloc(sizeof(*cred), GFP_NOFS)))
+ if (!(cred = kzalloc(sizeof(*cred), gfp)))
goto out_err;
rpcauth_init_cred(&cred->gc_base, acred, auth, &gss_credops);
diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c
index 1095be9c80ab..e085f5ae1548 100644
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -569,10 +569,9 @@ gss_svc_searchbyctx(struct cache_detail *cd, struct xdr_netobj *handle)
struct rsc *found;
memset(&rsci, 0, sizeof(rsci));
- if (dup_to_netobj(&rsci.handle, handle->data, handle->len))
- return NULL;
+ rsci.handle.data = handle->data;
+ rsci.handle.len = handle->len;
found = rsc_lookup(cd, &rsci);
- rsc_free(&rsci);
if (!found)
return NULL;
if (cache_check(cd, &found->h, NULL))
@@ -857,8 +856,8 @@ unwrap_integ_data(struct svc_rqst *rqstp, struct xdr_buf *buf, u32 seq, struct g
goto out;
if (svc_getnl(&buf->head[0]) != seq)
goto out;
- /* trim off the mic at the end before returning */
- xdr_buf_trim(buf, mic.len + 4);
+ /* trim off the mic and padding at the end before returning */
+ xdr_buf_trim(buf, round_up_to_quad(mic.len) + 4);
stat = 0;
out:
kfree(mic.data);
diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c
index 0d3dd364c22f..9f65452b7cbc 100644
--- a/net/sunrpc/auth_unix.c
+++ b/net/sunrpc/auth_unix.c
@@ -52,11 +52,11 @@ unx_destroy(struct rpc_auth *auth)
static struct rpc_cred *
unx_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
{
- return rpcauth_lookup_credcache(auth, acred, flags);
+ return rpcauth_lookup_credcache(auth, acred, flags, GFP_NOFS);
}
static struct rpc_cred *
-unx_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
+unx_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags, gfp_t gfp)
{
struct unx_cred *cred;
unsigned int groups = 0;
@@ -66,7 +66,7 @@ unx_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
from_kuid(&init_user_ns, acred->uid),
from_kgid(&init_user_ns, acred->gid));
- if (!(cred = kmalloc(sizeof(*cred), GFP_NOFS)))
+ if (!(cred = kmalloc(sizeof(*cred), gfp)))
return ERR_PTR(-ENOMEM);
rpcauth_init_cred(&cred->uc_base, acred, auth, &unix_credops);
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 7e0c9bf22df8..06b4df9faaa1 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1414,6 +1414,23 @@ size_t rpc_max_payload(struct rpc_clnt *clnt)
EXPORT_SYMBOL_GPL(rpc_max_payload);
/**
+ * rpc_max_bc_payload - Get maximum backchannel payload size, in bytes
+ * @clnt: RPC client to query
+ */
+size_t rpc_max_bc_payload(struct rpc_clnt *clnt)
+{
+ struct rpc_xprt *xprt;
+ size_t ret;
+
+ rcu_read_lock();
+ xprt = rcu_dereference(clnt->cl_xprt);
+ ret = xprt->ops->bc_maxpayload(xprt);
+ rcu_read_unlock();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(rpc_max_bc_payload);
+
+/**
* rpc_get_timeout - Get timeout for transport in units of HZ
* @clnt: RPC client to query
*/
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index 7422f28818b2..f5572e31d518 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -244,13 +244,12 @@ void svc_add_new_perm_xprt(struct svc_serv *serv, struct svc_xprt *new)
svc_xprt_received(new);
}
-int svc_create_xprt(struct svc_serv *serv, const char *xprt_name,
+int _svc_create_xprt(struct svc_serv *serv, const char *xprt_name,
struct net *net, const int family,
const unsigned short port, int flags)
{
struct svc_xprt_class *xcl;
- dprintk("svc: creating transport %s[%d]\n", xprt_name, port);
spin_lock(&svc_xprt_class_lock);
list_for_each_entry(xcl, &svc_xprt_class_list, xcl_list) {
struct svc_xprt *newxprt;
@@ -274,12 +273,28 @@ int svc_create_xprt(struct svc_serv *serv, const char *xprt_name,
}
err:
spin_unlock(&svc_xprt_class_lock);
- dprintk("svc: transport %s not found\n", xprt_name);
-
/* This errno is exposed to user space. Provide a reasonable
* perror msg for a bad transport. */
return -EPROTONOSUPPORT;
}
+
+int svc_create_xprt(struct svc_serv *serv, const char *xprt_name,
+ struct net *net, const int family,
+ const unsigned short port, int flags)
+{
+ int err;
+
+ dprintk("svc: creating transport %s[%d]\n", xprt_name, port);
+ err = _svc_create_xprt(serv, xprt_name, net, family, port, flags);
+ if (err == -EPROTONOSUPPORT) {
+ request_module("svc%s", xprt_name);
+ err = _svc_create_xprt(serv, xprt_name, net, family, port, flags);
+ }
+ if (err)
+ dprintk("svc: transport %s not found, err %d\n",
+ xprt_name, err);
+ return err;
+}
EXPORT_SYMBOL_GPL(svc_create_xprt);
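
svc_create_xprt() now retries through a helper: if the first lookup of the transport class fails with -EPROTONOSUPPORT, it loads the module on demand and tries exactly once more. A hedged sketch of that try/load/retry shape; the registry and loader below are stand-ins, not the kernel's svc_xprt_class machinery:

    #include <stdio.h>
    #include <string.h>

    static const char *registered[8];
    static int nregistered;

    static int lookup_class(const char *name)
    {
        for (int i = 0; i < nregistered; i++)
            if (strcmp(registered[i], name) == 0)
                return 0;
        return -1;                      /* stands in for -EPROTONOSUPPORT */
    }

    /* Stand-in for request_module("svc%s", name): pretend the loaded
     * module registered its transport class. */
    static void load_module(const char *name)
    {
        if (nregistered < 8)
            registered[nregistered++] = name;
    }

    static int create_xprt(const char *name)
    {
        int err = lookup_class(name);

        if (err) {                      /* not found: load on demand, retry once */
            load_module(name);
            err = lookup_class(name);
        }
        return err;
    }

    int main(void)
    {
        printf("create rdma: %d\n", create_xprt("rdma"));
        return 0;
    }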
/*
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index 6bdb3865212d..c4f3cc0c0775 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -797,6 +797,8 @@ void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p)
xdr_set_iov(xdr, buf->head, buf->len);
else if (buf->page_len != 0)
xdr_set_page_base(xdr, 0, buf->len);
+ else
+ xdr_set_iov(xdr, buf->head, buf->len);
if (p != NULL && p > xdr->p && xdr->end >= p) {
xdr->nwords -= p - xdr->p;
xdr->p = p;
diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c
index 2dcd7640eeb5..87762d976b63 100644
--- a/net/sunrpc/xprtrdma/backchannel.c
+++ b/net/sunrpc/xprtrdma/backchannel.c
@@ -192,6 +192,22 @@ int xprt_rdma_bc_up(struct svc_serv *serv, struct net *net)
}
/**
+ * xprt_rdma_bc_maxpayload - Return maximum backchannel message size
+ * @xprt: transport
+ *
+ * Returns maximum size, in bytes, of a backchannel message
+ */
+size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *xprt)
+{
+ struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+ struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
+ size_t maxmsg;
+
+ maxmsg = min_t(unsigned int, cdata->inline_rsize, cdata->inline_wsize);
+ return maxmsg - RPCRDMA_HDRLEN_MIN;
+}
+
+/**
* rpcrdma_bc_marshal_reply - Send backwards direction reply
* @rqst: buffer containing RPC reply data
*
diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c
index b289e106540b..6326ebe8b595 100644
--- a/net/sunrpc/xprtrdma/fmr_ops.c
+++ b/net/sunrpc/xprtrdma/fmr_ops.c
@@ -35,10 +35,71 @@
/* Maximum scatter/gather per FMR */
#define RPCRDMA_MAX_FMR_SGES (64)
+static struct workqueue_struct *fmr_recovery_wq;
+
+#define FMR_RECOVERY_WQ_FLAGS (WQ_UNBOUND)
+
+int
+fmr_alloc_recovery_wq(void)
+{
+ fmr_recovery_wq = alloc_workqueue("fmr_recovery", WQ_UNBOUND, 0);
+ return !fmr_recovery_wq ? -ENOMEM : 0;
+}
+
+void
+fmr_destroy_recovery_wq(void)
+{
+ struct workqueue_struct *wq;
+
+ if (!fmr_recovery_wq)
+ return;
+
+ wq = fmr_recovery_wq;
+ fmr_recovery_wq = NULL;
+ destroy_workqueue(wq);
+}
+
+static int
+__fmr_unmap(struct rpcrdma_mw *mw)
+{
+ LIST_HEAD(l);
+
+ list_add(&mw->fmr.fmr->list, &l);
+ return ib_unmap_fmr(&l);
+}
+
+/* Deferred reset of a single FMR. Generate a fresh rkey by
+ * replacing the MR. There's no recovery if this fails.
+ */
+static void
+__fmr_recovery_worker(struct work_struct *work)
+{
+ struct rpcrdma_mw *mw = container_of(work, struct rpcrdma_mw,
+ mw_work);
+ struct rpcrdma_xprt *r_xprt = mw->mw_xprt;
+
+ __fmr_unmap(mw);
+ rpcrdma_put_mw(r_xprt, mw);
+ return;
+}
+
+/* A broken MR was discovered in a context that can't sleep.
+ * Defer recovery to the recovery worker.
+ */
+static void
+__fmr_queue_recovery(struct rpcrdma_mw *mw)
+{
+ INIT_WORK(&mw->mw_work, __fmr_recovery_worker);
+ queue_work(fmr_recovery_wq, &mw->mw_work);
+}
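
__fmr_recovery_worker() recovers its rpcrdma_mw from the embedded mw_work member via container_of(), which is what lets __fmr_queue_recovery() hand only the work item to the workqueue. A small userspace sketch of that embed-and-recover pattern (the workqueue itself is elided; names are illustrative):

    #include <stddef.h>
    #include <stdio.h>

    #define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

    struct work_item {
        void (*fn)(struct work_item *);
    };

    struct mw {
        int rkey;
        struct work_item mw_work;       /* embedded, like rpcrdma_mw::mw_work */
    };

    /* The handler gets only the work item and recovers the enclosing
     * object from it, exactly the container_of() step in the worker above. */
    static void recovery_worker(struct work_item *w)
    {
        struct mw *mw = container_of(w, struct mw, mw_work);

        printf("recovering MW, rkey=%d\n", mw->rkey);
    }

    int main(void)
    {
        struct mw m = { .rkey = 42, .mw_work = { .fn = recovery_worker } };

        /* A real workqueue would run this later, from process context. */
        m.mw_work.fn(&m.mw_work);
        return 0;
    }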
+
static int
fmr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
struct rpcrdma_create_data_internal *cdata)
{
+ rpcrdma_set_max_header_sizes(ia, cdata, max_t(unsigned int, 1,
+ RPCRDMA_MAX_DATA_SEGS /
+ RPCRDMA_MAX_FMR_SGES));
return 0;
}
@@ -48,7 +109,7 @@ static size_t
fmr_op_maxpages(struct rpcrdma_xprt *r_xprt)
{
return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
- rpcrdma_max_segments(r_xprt) * RPCRDMA_MAX_FMR_SGES);
+ RPCRDMA_MAX_HDR_SEGS * RPCRDMA_MAX_FMR_SGES);
}
static int
@@ -89,6 +150,7 @@ fmr_op_init(struct rpcrdma_xprt *r_xprt)
if (IS_ERR(r->fmr.fmr))
goto out_fmr_err;
+ r->mw_xprt = r_xprt;
list_add(&r->mw_list, &buf->rb_mws);
list_add(&r->mw_all, &buf->rb_all);
}
@@ -104,15 +166,6 @@ out:
return rc;
}
-static int
-__fmr_unmap(struct rpcrdma_mw *r)
-{
- LIST_HEAD(l);
-
- list_add(&r->fmr.fmr->list, &l);
- return ib_unmap_fmr(&l);
-}
-
/* Use the ib_map_phys_fmr() verb to register a memory region
* for remote access via RDMA READ or RDMA WRITE.
*/
@@ -183,15 +236,10 @@ static void
__fmr_dma_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
{
struct ib_device *device = r_xprt->rx_ia.ri_device;
- struct rpcrdma_mw *mw = seg->rl_mw;
int nsegs = seg->mr_nsegs;
- seg->rl_mw = NULL;
-
while (nsegs--)
rpcrdma_unmap_one(device, seg++);
-
- rpcrdma_put_mw(r_xprt, mw);
}
/* Invalidate all memory regions that were registered for "req".
@@ -234,42 +282,50 @@ fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
seg = &req->rl_segments[i];
__fmr_dma_unmap(r_xprt, seg);
+ rpcrdma_put_mw(r_xprt, seg->rl_mw);
i += seg->mr_nsegs;
seg->mr_nsegs = 0;
+ seg->rl_mw = NULL;
}
req->rl_nchunks = 0;
}
-/* Use the ib_unmap_fmr() verb to prevent further remote
- * access via RDMA READ or RDMA WRITE.
+/* Use a slow, safe mechanism to invalidate all memory regions
+ * that were registered for "req".
+ *
+ * In the asynchronous case, DMA unmapping occurs first here
+ * because the rpcrdma_mr_seg is released immediately after this
+ * call. Its contents won't be available in __fmr_dma_unmap later.
+ * FIXME.
*/
-static int
-fmr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
+static void
+fmr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
+ bool sync)
{
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
- struct rpcrdma_mr_seg *seg1 = seg;
- struct rpcrdma_mw *mw = seg1->rl_mw;
- int rc, nsegs = seg->mr_nsegs;
+ struct rpcrdma_mr_seg *seg;
+ struct rpcrdma_mw *mw;
+ unsigned int i;
- dprintk("RPC: %s: FMR %p\n", __func__, mw);
+ for (i = 0; req->rl_nchunks; req->rl_nchunks--) {
+ seg = &req->rl_segments[i];
+ mw = seg->rl_mw;
- seg1->rl_mw = NULL;
- while (seg1->mr_nsegs--)
- rpcrdma_unmap_one(ia->ri_device, seg++);
- rc = __fmr_unmap(mw);
- if (rc)
- goto out_err;
- rpcrdma_put_mw(r_xprt, mw);
- return nsegs;
+ if (sync) {
+ /* ORDER */
+ __fmr_unmap(mw);
+ __fmr_dma_unmap(r_xprt, seg);
+ rpcrdma_put_mw(r_xprt, mw);
+ } else {
+ __fmr_dma_unmap(r_xprt, seg);
+ __fmr_queue_recovery(mw);
+ }
-out_err:
- /* The FMR is abandoned, but remains in rb_all. fmr_op_destroy
- * will attempt to release it when the transport is destroyed.
- */
- dprintk("RPC: %s: ib_unmap_fmr status %i\n", __func__, rc);
- return nsegs;
+ i += seg->mr_nsegs;
+ seg->mr_nsegs = 0;
+ seg->rl_mw = NULL;
+ }
}
static void
@@ -295,7 +351,7 @@ fmr_op_destroy(struct rpcrdma_buffer *buf)
const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = {
.ro_map = fmr_op_map,
.ro_unmap_sync = fmr_op_unmap_sync,
- .ro_unmap = fmr_op_unmap,
+ .ro_unmap_safe = fmr_op_unmap_safe,
.ro_open = fmr_op_open,
.ro_maxpages = fmr_op_maxpages,
.ro_init = fmr_op_init,
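
The net effect of the fmr_ops changes shows up in the ops table: the per-segment ->ro_unmap method is replaced by ->ro_unmap_safe, which invalidates everything registered for one request and takes a flag saying whether the caller may sleep. A hedged sketch of that strategy-table shape with demo types (not the rpcrdma structures):

    #include <stdbool.h>
    #include <stdio.h>

    struct req  { int nchunks; };
    struct xprt { const char *name; };

    struct memreg_ops {
        /* invalidate everything registered for one request; "sync" says
         * whether the caller may sleep or must defer to a worker */
        void (*ro_unmap_safe)(struct xprt *, struct req *, bool sync);
    };

    static void demo_unmap_safe(struct xprt *x, struct req *r, bool sync)
    {
        printf("%s: unmapping %d chunks (%s)\n",
               x->name, r->nchunks, sync ? "sync" : "deferred");
        r->nchunks = 0;
    }

    static const struct memreg_ops demo_ops = {
        .ro_unmap_safe = demo_unmap_safe,
    };

    int main(void)
    {
        struct xprt x = { "demo" };
        struct req  r = { 3 };

        /* Callers that used to loop over segments now make one call. */
        demo_ops.ro_unmap_safe(&x, &r, false);
        return 0;
    }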
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index 94c3fa910b85..c0947544babe 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -98,6 +98,47 @@ frwr_destroy_recovery_wq(void)
destroy_workqueue(wq);
}
+static int
+__frwr_reset_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *r)
+{
+ struct rpcrdma_frmr *f = &r->frmr;
+ int rc;
+
+ rc = ib_dereg_mr(f->fr_mr);
+ if (rc) {
+ pr_warn("rpcrdma: ib_dereg_mr status %d, frwr %p orphaned\n",
+ rc, r);
+ return rc;
+ }
+
+ f->fr_mr = ib_alloc_mr(ia->ri_pd, IB_MR_TYPE_MEM_REG,
+ ia->ri_max_frmr_depth);
+ if (IS_ERR(f->fr_mr)) {
+ pr_warn("rpcrdma: ib_alloc_mr status %ld, frwr %p orphaned\n",
+ PTR_ERR(f->fr_mr), r);
+ return PTR_ERR(f->fr_mr);
+ }
+
+ dprintk("RPC: %s: recovered FRMR %p\n", __func__, r);
+ f->fr_state = FRMR_IS_INVALID;
+ return 0;
+}
+
+static void
+__frwr_reset_and_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mw *mw)
+{
+ struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+ struct rpcrdma_frmr *f = &mw->frmr;
+ int rc;
+
+ rc = __frwr_reset_mr(ia, mw);
+ ib_dma_unmap_sg(ia->ri_device, f->fr_sg, f->fr_nents, f->fr_dir);
+ if (rc)
+ return;
+
+ rpcrdma_put_mw(r_xprt, mw);
+}
+
/* Deferred reset of a single FRMR. Generate a fresh rkey by
* replacing the MR.
*
@@ -109,26 +150,10 @@ static void
__frwr_recovery_worker(struct work_struct *work)
{
struct rpcrdma_mw *r = container_of(work, struct rpcrdma_mw,
- frmr.fr_work);
- struct rpcrdma_xprt *r_xprt = r->frmr.fr_xprt;
- unsigned int depth = r_xprt->rx_ia.ri_max_frmr_depth;
- struct ib_pd *pd = r_xprt->rx_ia.ri_pd;
-
- if (ib_dereg_mr(r->frmr.fr_mr))
- goto out_fail;
+ mw_work);
- r->frmr.fr_mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, depth);
- if (IS_ERR(r->frmr.fr_mr))
- goto out_fail;
-
- dprintk("RPC: %s: recovered FRMR %p\n", __func__, r);
- r->frmr.fr_state = FRMR_IS_INVALID;
- rpcrdma_put_mw(r_xprt, r);
+ __frwr_reset_and_unmap(r->mw_xprt, r);
return;
-
-out_fail:
- pr_warn("RPC: %s: FRMR %p unrecovered\n",
- __func__, r);
}
/* A broken MR was discovered in a context that can't sleep.
@@ -137,8 +162,8 @@ out_fail:
static void
__frwr_queue_recovery(struct rpcrdma_mw *r)
{
- INIT_WORK(&r->frmr.fr_work, __frwr_recovery_worker);
- queue_work(frwr_recovery_wq, &r->frmr.fr_work);
+ INIT_WORK(&r->mw_work, __frwr_recovery_worker);
+ queue_work(frwr_recovery_wq, &r->mw_work);
}
static int
@@ -152,11 +177,11 @@ __frwr_init(struct rpcrdma_mw *r, struct ib_pd *pd, struct ib_device *device,
if (IS_ERR(f->fr_mr))
goto out_mr_err;
- f->sg = kcalloc(depth, sizeof(*f->sg), GFP_KERNEL);
- if (!f->sg)
+ f->fr_sg = kcalloc(depth, sizeof(*f->fr_sg), GFP_KERNEL);
+ if (!f->fr_sg)
goto out_list_err;
- sg_init_table(f->sg, depth);
+ sg_init_table(f->fr_sg, depth);
init_completion(&f->fr_linv_done);
@@ -185,7 +210,7 @@ __frwr_release(struct rpcrdma_mw *r)
if (rc)
dprintk("RPC: %s: ib_dereg_mr status %i\n",
__func__, rc);
- kfree(r->frmr.sg);
+ kfree(r->frmr.fr_sg);
}
static int
@@ -231,6 +256,9 @@ frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
depth;
}
+ rpcrdma_set_max_header_sizes(ia, cdata, max_t(unsigned int, 1,
+ RPCRDMA_MAX_DATA_SEGS /
+ ia->ri_max_frmr_depth));
return 0;
}
@@ -243,7 +271,7 @@ frwr_op_maxpages(struct rpcrdma_xprt *r_xprt)
struct rpcrdma_ia *ia = &r_xprt->rx_ia;
return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
- rpcrdma_max_segments(r_xprt) * ia->ri_max_frmr_depth);
+ RPCRDMA_MAX_HDR_SEGS * ia->ri_max_frmr_depth);
}
static void
@@ -350,9 +378,9 @@ frwr_op_init(struct rpcrdma_xprt *r_xprt)
return rc;
}
+ r->mw_xprt = r_xprt;
list_add(&r->mw_list, &buf->rb_mws);
list_add(&r->mw_all, &buf->rb_all);
- r->frmr.fr_xprt = r_xprt;
}
return 0;
@@ -396,12 +424,12 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
for (i = 0; i < nsegs;) {
if (seg->mr_page)
- sg_set_page(&frmr->sg[i],
+ sg_set_page(&frmr->fr_sg[i],
seg->mr_page,
seg->mr_len,
offset_in_page(seg->mr_offset));
else
- sg_set_buf(&frmr->sg[i], seg->mr_offset,
+ sg_set_buf(&frmr->fr_sg[i], seg->mr_offset,
seg->mr_len);
++seg;
@@ -412,25 +440,26 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
break;
}
- frmr->sg_nents = i;
+ frmr->fr_nents = i;
+ frmr->fr_dir = direction;
- dma_nents = ib_dma_map_sg(device, frmr->sg, frmr->sg_nents, direction);
+ dma_nents = ib_dma_map_sg(device, frmr->fr_sg, frmr->fr_nents, direction);
if (!dma_nents) {
pr_err("RPC: %s: failed to dma map sg %p sg_nents %u\n",
- __func__, frmr->sg, frmr->sg_nents);
+ __func__, frmr->fr_sg, frmr->fr_nents);
return -ENOMEM;
}
- n = ib_map_mr_sg(mr, frmr->sg, frmr->sg_nents, NULL, PAGE_SIZE);
- if (unlikely(n != frmr->sg_nents)) {
+ n = ib_map_mr_sg(mr, frmr->fr_sg, frmr->fr_nents, NULL, PAGE_SIZE);
+ if (unlikely(n != frmr->fr_nents)) {
pr_err("RPC: %s: failed to map mr %p (%u/%u)\n",
- __func__, frmr->fr_mr, n, frmr->sg_nents);
+ __func__, frmr->fr_mr, n, frmr->fr_nents);
rc = n < 0 ? n : -EINVAL;
goto out_senderr;
}
dprintk("RPC: %s: Using frmr %p to map %u segments (%u bytes)\n",
- __func__, mw, frmr->sg_nents, mr->length);
+ __func__, mw, frmr->fr_nents, mr->length);
key = (u8)(mr->rkey & 0x000000FF);
ib_update_fast_reg_key(mr, ++key);
@@ -452,18 +481,16 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
if (rc)
goto out_senderr;
- seg1->mr_dir = direction;
seg1->rl_mw = mw;
seg1->mr_rkey = mr->rkey;
seg1->mr_base = mr->iova;
- seg1->mr_nsegs = frmr->sg_nents;
+ seg1->mr_nsegs = frmr->fr_nents;
seg1->mr_len = mr->length;
- return frmr->sg_nents;
+ return frmr->fr_nents;
out_senderr:
dprintk("RPC: %s: ib_post_send status %i\n", __func__, rc);
- ib_dma_unmap_sg(device, frmr->sg, dma_nents, direction);
__frwr_queue_recovery(mw);
return rc;
}
@@ -487,24 +514,6 @@ __frwr_prepare_linv_wr(struct rpcrdma_mr_seg *seg)
return invalidate_wr;
}
-static void
-__frwr_dma_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
- int rc)
-{
- struct ib_device *device = r_xprt->rx_ia.ri_device;
- struct rpcrdma_mw *mw = seg->rl_mw;
- struct rpcrdma_frmr *f = &mw->frmr;
-
- seg->rl_mw = NULL;
-
- ib_dma_unmap_sg(device, f->sg, f->sg_nents, seg->mr_dir);
-
- if (!rc)
- rpcrdma_put_mw(r_xprt, mw);
- else
- __frwr_queue_recovery(mw);
-}
-
/* Invalidate all memory regions that were registered for "req".
*
* Sleeps until it is safe for the host CPU to access the
@@ -518,6 +527,7 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
struct rpcrdma_mr_seg *seg;
unsigned int i, nchunks;
struct rpcrdma_frmr *f;
+ struct rpcrdma_mw *mw;
int rc;
dprintk("RPC: %s: req %p\n", __func__, req);
@@ -558,11 +568,8 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
* unless ri_id->qp is a valid pointer.
*/
rc = ib_post_send(ia->ri_id->qp, invalidate_wrs, &bad_wr);
- if (rc) {
- pr_warn("%s: ib_post_send failed %i\n", __func__, rc);
- rdma_disconnect(ia->ri_id);
- goto unmap;
- }
+ if (rc)
+ goto reset_mrs;
wait_for_completion(&f->fr_linv_done);
@@ -572,56 +579,65 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
unmap:
for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) {
seg = &req->rl_segments[i];
+ mw = seg->rl_mw;
+ seg->rl_mw = NULL;
- __frwr_dma_unmap(r_xprt, seg, rc);
+ ib_dma_unmap_sg(ia->ri_device, f->fr_sg, f->fr_nents,
+ f->fr_dir);
+ rpcrdma_put_mw(r_xprt, mw);
i += seg->mr_nsegs;
seg->mr_nsegs = 0;
}
req->rl_nchunks = 0;
-}
+ return;
-/* Post a LOCAL_INV Work Request to prevent further remote access
- * via RDMA READ or RDMA WRITE.
- */
-static int
-frwr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
-{
- struct rpcrdma_mr_seg *seg1 = seg;
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
- struct rpcrdma_mw *mw = seg1->rl_mw;
- struct rpcrdma_frmr *frmr = &mw->frmr;
- struct ib_send_wr *invalidate_wr, *bad_wr;
- int rc, nsegs = seg->mr_nsegs;
+reset_mrs:
+ pr_warn("%s: ib_post_send failed %i\n", __func__, rc);
- dprintk("RPC: %s: FRMR %p\n", __func__, mw);
+ /* Find and reset the MRs in the LOCAL_INV WRs that did not
+ * get posted. This is synchronous, and slow.
+ */
+ for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) {
+ seg = &req->rl_segments[i];
+ mw = seg->rl_mw;
+ f = &mw->frmr;
- seg1->rl_mw = NULL;
- frmr->fr_state = FRMR_IS_INVALID;
- invalidate_wr = &mw->frmr.fr_invwr;
+ if (mw->frmr.fr_mr->rkey == bad_wr->ex.invalidate_rkey) {
+ __frwr_reset_mr(ia, mw);
+ bad_wr = bad_wr->next;
+ }
- memset(invalidate_wr, 0, sizeof(*invalidate_wr));
- frmr->fr_cqe.done = frwr_wc_localinv;
- invalidate_wr->wr_cqe = &frmr->fr_cqe;
- invalidate_wr->opcode = IB_WR_LOCAL_INV;
- invalidate_wr->ex.invalidate_rkey = frmr->fr_mr->rkey;
- DECR_CQCOUNT(&r_xprt->rx_ep);
+ i += seg->mr_nsegs;
+ }
+ goto unmap;
+}
- ib_dma_unmap_sg(ia->ri_device, frmr->sg, frmr->sg_nents, seg1->mr_dir);
- read_lock(&ia->ri_qplock);
- rc = ib_post_send(ia->ri_id->qp, invalidate_wr, &bad_wr);
- read_unlock(&ia->ri_qplock);
- if (rc)
- goto out_err;
+/* Use a slow, safe mechanism to invalidate all memory regions
+ * that were registered for "req".
+ */
+static void
+frwr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
+ bool sync)
+{
+ struct rpcrdma_mr_seg *seg;
+ struct rpcrdma_mw *mw;
+ unsigned int i;
- rpcrdma_put_mw(r_xprt, mw);
- return nsegs;
+ for (i = 0; req->rl_nchunks; req->rl_nchunks--) {
+ seg = &req->rl_segments[i];
+ mw = seg->rl_mw;
-out_err:
- dprintk("RPC: %s: ib_post_send status %i\n", __func__, rc);
- __frwr_queue_recovery(mw);
- return nsegs;
+ if (sync)
+ __frwr_reset_and_unmap(r_xprt, mw);
+ else
+ __frwr_queue_recovery(mw);
+
+ i += seg->mr_nsegs;
+ seg->mr_nsegs = 0;
+ seg->rl_mw = NULL;
+ }
}
static void
@@ -643,7 +659,7 @@ frwr_op_destroy(struct rpcrdma_buffer *buf)
const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = {
.ro_map = frwr_op_map,
.ro_unmap_sync = frwr_op_unmap_sync,
- .ro_unmap = frwr_op_unmap,
+ .ro_unmap_safe = frwr_op_unmap_safe,
.ro_open = frwr_op_open,
.ro_maxpages = frwr_op_maxpages,
.ro_init = frwr_op_init,
diff --git a/net/sunrpc/xprtrdma/physical_ops.c b/net/sunrpc/xprtrdma/physical_ops.c
index 481b9b6f4a15..3750596cc432 100644
--- a/net/sunrpc/xprtrdma/physical_ops.c
+++ b/net/sunrpc/xprtrdma/physical_ops.c
@@ -36,8 +36,11 @@ physical_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
__func__, PTR_ERR(mr));
return -ENOMEM;
}
-
ia->ri_dma_mr = mr;
+
+ rpcrdma_set_max_header_sizes(ia, cdata, min_t(unsigned int,
+ RPCRDMA_MAX_DATA_SEGS,
+ RPCRDMA_MAX_HDR_SEGS));
return 0;
}
@@ -47,7 +50,7 @@ static size_t
physical_op_maxpages(struct rpcrdma_xprt *r_xprt)
{
return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
- rpcrdma_max_segments(r_xprt));
+ RPCRDMA_MAX_HDR_SEGS);
}
static int
@@ -71,17 +74,6 @@ physical_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
return 1;
}
-/* Unmap a memory region, but leave it registered.
- */
-static int
-physical_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
-{
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
-
- rpcrdma_unmap_one(ia->ri_device, seg);
- return 1;
-}
-
/* DMA unmap all memory regions that were mapped for "req".
*/
static void
@@ -94,6 +86,25 @@ physical_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
rpcrdma_unmap_one(device, &req->rl_segments[i++]);
}
+/* Use a slow, safe mechanism to invalidate all memory regions
+ * that were registered for "req".
+ *
+ * For physical memory registration, there is no good way to
+ * fence a single MR that has been advertised to the server. The
+ * client has already handed the server an R_key that cannot be
+ * invalidated and is shared by all MRs on this connection.
+ * Tearing down the PD might be the only safe choice, but it's
+ * not clear that a freshly acquired DMA R_key would be different
+ * than the one used by the PD that was just destroyed.
+ * FIXME.
+ */
+static void
+physical_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
+ bool sync)
+{
+ physical_op_unmap_sync(r_xprt, req);
+}
+
static void
physical_op_destroy(struct rpcrdma_buffer *buf)
{
@@ -102,7 +113,7 @@ physical_op_destroy(struct rpcrdma_buffer *buf)
const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops = {
.ro_map = physical_op_map,
.ro_unmap_sync = physical_op_unmap_sync,
- .ro_unmap = physical_op_unmap,
+ .ro_unmap_safe = physical_op_unmap_safe,
.ro_open = physical_op_open,
.ro_maxpages = physical_op_maxpages,
.ro_init = physical_op_init,
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 888823bb6dae..35a81096e83d 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -61,26 +61,84 @@ enum rpcrdma_chunktype {
rpcrdma_replych
};
-#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
static const char transfertypes[][12] = {
- "pure inline", /* no chunks */
- " read chunk", /* some argument via rdma read */
- "*read chunk", /* entire request via rdma read */
- "write chunk", /* some result via rdma write */
+ "inline", /* no chunks */
+ "read list", /* some argument via rdma read */
+ "*read list", /* entire request via rdma read */
+ "write list", /* some result via rdma write */
"reply chunk" /* entire reply via rdma write */
};
-#endif
+
+/* Returns size of largest RPC-over-RDMA header in a Call message
+ *
+ * The largest Call header contains a full-size Read list and a
+ * minimal Reply chunk.
+ */
+static unsigned int rpcrdma_max_call_header_size(unsigned int maxsegs)
+{
+ unsigned int size;
+
+ /* Fixed header fields and list discriminators */
+ size = RPCRDMA_HDRLEN_MIN;
+
+ /* Maximum Read list size */
+ maxsegs += 2; /* segment for head and tail buffers */
+ size = maxsegs * sizeof(struct rpcrdma_read_chunk);
+
+ /* Minimal Read chunk size */
+ size += sizeof(__be32); /* segment count */
+ size += sizeof(struct rpcrdma_segment);
+ size += sizeof(__be32); /* list discriminator */
+
+ dprintk("RPC: %s: max call header size = %u\n",
+ __func__, size);
+ return size;
+}
+
+/* Returns size of largest RPC-over-RDMA header in a Reply message
+ *
+ * There is only one Write list or one Reply chunk per Reply
+ * message. The larger list is the Write list.
+ */
+static unsigned int rpcrdma_max_reply_header_size(unsigned int maxsegs)
+{
+ unsigned int size;
+
+ /* Fixed header fields and list discriminators */
+ size = RPCRDMA_HDRLEN_MIN;
+
+ /* Maximum Write list size */
+ maxsegs += 2; /* segment for head and tail buffers */
+ size = sizeof(__be32); /* segment count */
+ size += maxsegs * sizeof(struct rpcrdma_segment);
+ size += sizeof(__be32); /* list discriminator */
+
+ dprintk("RPC: %s: max reply header size = %u\n",
+ __func__, size);
+ return size;
+}
+
+void rpcrdma_set_max_header_sizes(struct rpcrdma_ia *ia,
+ struct rpcrdma_create_data_internal *cdata,
+ unsigned int maxsegs)
+{
+ ia->ri_max_inline_write = cdata->inline_wsize -
+ rpcrdma_max_call_header_size(maxsegs);
+ ia->ri_max_inline_read = cdata->inline_rsize -
+ rpcrdma_max_reply_header_size(maxsegs);
+}
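
rpcrdma_set_max_header_sizes() subtracts the worst-case Call and Reply header sizes from the negotiated inline limits, so the new rpcrdma_args_inline()/rpcrdma_results_inline() checks below can compare payload bytes directly against ri_max_inline_write/read. A simplified, hedged sketch of that arithmetic using on-the-wire XDR sizes; the constants are illustrative and the formula is slightly simplified relative to the kernel helper:

    #include <stdio.h>

    #define HDRLEN_MIN     28u                      /* fixed words of the RPC-over-RDMA header */
    #define SEGMENT_SZ     (4u + 4u + 8u)           /* HLOO: handle, length, 64-bit offset */
    #define READ_CHUNK_SZ  (4u + 4u + SEGMENT_SZ)   /* discriminator + position + HLOO */

    /* Largest Call header: a full Read list plus a minimal Reply chunk. */
    static unsigned int max_call_header(unsigned int maxsegs)
    {
        maxsegs += 2;                               /* head and tail buffers */
        return HDRLEN_MIN
               + maxsegs * READ_CHUNK_SZ            /* full Read list */
               + 4u + SEGMENT_SZ + 4u;              /* minimal Reply chunk */
    }

    int main(void)
    {
        unsigned int inline_wsize = 4096;           /* example negotiated inline limit */
        unsigned int maxsegs = 8;                   /* e.g. MAX_DATA_SEGS / FMR SGEs */

        printf("max inline write payload: %u bytes\n",
               inline_wsize - max_call_header(maxsegs));
        return 0;
    }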
/* The client can send a request inline as long as the RPCRDMA header
* plus the RPC call fit under the transport's inline limit. If the
* combined call message size exceeds that limit, the client must use
* the read chunk list for this operation.
*/
-static bool rpcrdma_args_inline(struct rpc_rqst *rqst)
+static bool rpcrdma_args_inline(struct rpcrdma_xprt *r_xprt,
+ struct rpc_rqst *rqst)
{
- unsigned int callsize = RPCRDMA_HDRLEN_MIN + rqst->rq_snd_buf.len;
+ struct rpcrdma_ia *ia = &r_xprt->rx_ia;
- return callsize <= RPCRDMA_INLINE_WRITE_THRESHOLD(rqst);
+ return rqst->rq_snd_buf.len <= ia->ri_max_inline_write;
}
/* The client can't know how large the actual reply will be. Thus it
@@ -89,11 +147,12 @@ static bool rpcrdma_args_inline(struct rpc_rqst *rqst)
* limit, the client must provide a write list or a reply chunk for
* this request.
*/
-static bool rpcrdma_results_inline(struct rpc_rqst *rqst)
+static bool rpcrdma_results_inline(struct rpcrdma_xprt *r_xprt,
+ struct rpc_rqst *rqst)
{
- unsigned int repsize = RPCRDMA_HDRLEN_MIN + rqst->rq_rcv_buf.buflen;
+ struct rpcrdma_ia *ia = &r_xprt->rx_ia;
- return repsize <= RPCRDMA_INLINE_READ_THRESHOLD(rqst);
+ return rqst->rq_rcv_buf.buflen <= ia->ri_max_inline_read;
}
static int
@@ -226,23 +285,16 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
return n;
}
-/*
- * Create read/write chunk lists, and reply chunks, for RDMA
- *
- * Assume check against THRESHOLD has been done, and chunks are required.
- * Assume only encoding one list entry for read|write chunks. The NFSv3
- * protocol is simple enough to allow this as it only has a single "bulk
- * result" in each procedure - complicated NFSv4 COMPOUNDs are not. (The
- * RDMA/Sessions NFSv4 proposal addresses this for future v4 revs.)
- *
- * When used for a single reply chunk (which is a special write
- * chunk used for the entire reply, rather than just the data), it
- * is used primarily for READDIR and READLINK which would otherwise
- * be severely size-limited by a small rdma inline read max. The server
- * response will come back as an RDMA Write, followed by a message
- * of type RDMA_NOMSG carrying the xid and length. As a result, reply
- * chunks do not provide data alignment, however they do not require
- * "fixup" (moving the response to the upper layer buffer) either.
+static inline __be32 *
+xdr_encode_rdma_segment(__be32 *iptr, struct rpcrdma_mr_seg *seg)
+{
+ *iptr++ = cpu_to_be32(seg->mr_rkey);
+ *iptr++ = cpu_to_be32(seg->mr_len);
+ return xdr_encode_hyper(iptr, seg->mr_base);
+}
+
+/* XDR-encode the Read list. Supports encoding a list of read
+ * segments that belong to a single read chunk.
*
* Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64):
*
@@ -250,131 +302,190 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
* N elements, position P (same P for all chunks of same arg!):
* 1 - PHLOO - 1 - PHLOO - ... - 1 - PHLOO - 0
*
+ * Returns a pointer to the XDR word in the RDMA header following
+ * the end of the Read list, or an error pointer.
+ */
+static __be32 *
+rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt,
+ struct rpcrdma_req *req, struct rpc_rqst *rqst,
+ __be32 *iptr, enum rpcrdma_chunktype rtype)
+{
+ struct rpcrdma_mr_seg *seg = req->rl_nextseg;
+ unsigned int pos;
+ int n, nsegs;
+
+ if (rtype == rpcrdma_noch) {
+ *iptr++ = xdr_zero; /* item not present */
+ return iptr;
+ }
+
+ pos = rqst->rq_snd_buf.head[0].iov_len;
+ if (rtype == rpcrdma_areadch)
+ pos = 0;
+ nsegs = rpcrdma_convert_iovs(&rqst->rq_snd_buf, pos, rtype, seg,
+ RPCRDMA_MAX_SEGS - req->rl_nchunks);
+ if (nsegs < 0)
+ return ERR_PTR(nsegs);
+
+ do {
+ n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, false);
+ if (n <= 0)
+ return ERR_PTR(n);
+
+ *iptr++ = xdr_one; /* item present */
+
+ /* All read segments in this chunk
+ * have the same "position".
+ */
+ *iptr++ = cpu_to_be32(pos);
+ iptr = xdr_encode_rdma_segment(iptr, seg);
+
+ dprintk("RPC: %5u %s: read segment pos %u "
+ "%d@0x%016llx:0x%08x (%s)\n",
+ rqst->rq_task->tk_pid, __func__, pos,
+ seg->mr_len, (unsigned long long)seg->mr_base,
+ seg->mr_rkey, n < nsegs ? "more" : "last");
+
+ r_xprt->rx_stats.read_chunk_count++;
+ req->rl_nchunks++;
+ seg += n;
+ nsegs -= n;
+ } while (nsegs);
+ req->rl_nextseg = seg;
+
+ /* Finish Read list */
+ *iptr++ = xdr_zero; /* Next item not present */
+ return iptr;
+}
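
rpcrdma_encode_read_list() emits each read segment as "1 - position - HLOO" and closes the list with a zero discriminator, as the encoding key in the comment describes. A hedged userspace sketch of that XDR layout; htonl() and manual 64-bit splitting stand in for cpu_to_be32() and xdr_encode_hyper():

    #include <arpa/inet.h>
    #include <stdint.h>
    #include <stdio.h>

    static uint32_t *encode_u32(uint32_t *p, uint32_t v)
    {
        *p++ = htonl(v);
        return p;
    }

    static uint32_t *encode_u64(uint32_t *p, uint64_t v)
    {
        p = encode_u32(p, (uint32_t)(v >> 32));
        return encode_u32(p, (uint32_t)v);
    }

    /* One read segment: 1 (present) - position - handle - length - offset. */
    static uint32_t *encode_read_segment(uint32_t *p, uint32_t pos,
                                         uint32_t rkey, uint32_t len,
                                         uint64_t off)
    {
        p = encode_u32(p, 1);
        p = encode_u32(p, pos);         /* same position for the whole chunk */
        p = encode_u32(p, rkey);        /* H */
        p = encode_u32(p, len);         /* L */
        return encode_u64(p, off);      /* OO */
    }

    int main(void)
    {
        uint32_t hdr[16], *p = hdr;

        p = encode_read_segment(p, 148, 0xdeadbeef, 4096, 0x1000);
        p = encode_u32(p, 0);           /* list terminator */
        printf("read list is %zu bytes\n",
               (size_t)((char *)p - (char *)hdr));
        return 0;
    }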
+
+/* XDR-encode the Write list. Supports encoding a list containing
+ * one array of plain segments that belong to a single write chunk.
+ *
+ * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64):
+ *
* Write chunklist (a list of (one) counted array):
* N elements:
* 1 - N - HLOO - HLOO - ... - HLOO - 0
*
+ * Returns a pointer to the XDR word in the RDMA header following
+ * the end of the Write list, or an error pointer.
+ */
+static __be32 *
+rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
+ struct rpc_rqst *rqst, __be32 *iptr,
+ enum rpcrdma_chunktype wtype)
+{
+ struct rpcrdma_mr_seg *seg = req->rl_nextseg;
+ int n, nsegs, nchunks;
+ __be32 *segcount;
+
+ if (wtype != rpcrdma_writech) {
+ *iptr++ = xdr_zero; /* no Write list present */
+ return iptr;
+ }
+
+ nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf,
+ rqst->rq_rcv_buf.head[0].iov_len,
+ wtype, seg,
+ RPCRDMA_MAX_SEGS - req->rl_nchunks);
+ if (nsegs < 0)
+ return ERR_PTR(nsegs);
+
+ *iptr++ = xdr_one; /* Write list present */
+ segcount = iptr++; /* save location of segment count */
+
+ nchunks = 0;
+ do {
+ n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, true);
+ if (n <= 0)
+ return ERR_PTR(n);
+
+ iptr = xdr_encode_rdma_segment(iptr, seg);
+
+ dprintk("RPC: %5u %s: write segment "
+ "%d@0x%016llx:0x%08x (%s)\n",
+ rqst->rq_task->tk_pid, __func__,
+ seg->mr_len, (unsigned long long)seg->mr_base,
+ seg->mr_rkey, n < nsegs ? "more" : "last");
+
+ r_xprt->rx_stats.write_chunk_count++;
+ r_xprt->rx_stats.total_rdma_request += seg->mr_len;
+ req->rl_nchunks++;
+ nchunks++;
+ seg += n;
+ nsegs -= n;
+ } while (nsegs);
+ req->rl_nextseg = seg;
+
+ /* Update count of segments in this Write chunk */
+ *segcount = cpu_to_be32(nchunks);
+
+ /* Finish Write list */
+ *iptr++ = xdr_zero; /* Next item not present */
+ return iptr;
+}
+
+/* XDR-encode the Reply chunk. Supports encoding an array of plain
+ * segments that belong to a single write (reply) chunk.
+ *
+ * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64):
+ *
* Reply chunk (a counted array):
* N elements:
* 1 - N - HLOO - HLOO - ... - HLOO
*
- * Returns positive RPC/RDMA header size, or negative errno.
+ * Returns a pointer to the XDR word in the RDMA header following
+ * the end of the Reply chunk, or an error pointer.
*/
-
-static ssize_t
-rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
- struct rpcrdma_msg *headerp, enum rpcrdma_chunktype type)
+static __be32 *
+rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt,
+ struct rpcrdma_req *req, struct rpc_rqst *rqst,
+ __be32 *iptr, enum rpcrdma_chunktype wtype)
{
- struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
- struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt);
- int n, nsegs, nchunks = 0;
- unsigned int pos;
- struct rpcrdma_mr_seg *seg = req->rl_segments;
- struct rpcrdma_read_chunk *cur_rchunk = NULL;
- struct rpcrdma_write_array *warray = NULL;
- struct rpcrdma_write_chunk *cur_wchunk = NULL;
- __be32 *iptr = headerp->rm_body.rm_chunks;
- int (*map)(struct rpcrdma_xprt *, struct rpcrdma_mr_seg *, int, bool);
-
- if (type == rpcrdma_readch || type == rpcrdma_areadch) {
- /* a read chunk - server will RDMA Read our memory */
- cur_rchunk = (struct rpcrdma_read_chunk *) iptr;
- } else {
- /* a write or reply chunk - server will RDMA Write our memory */
- *iptr++ = xdr_zero; /* encode a NULL read chunk list */
- if (type == rpcrdma_replych)
- *iptr++ = xdr_zero; /* a NULL write chunk list */
- warray = (struct rpcrdma_write_array *) iptr;
- cur_wchunk = (struct rpcrdma_write_chunk *) (warray + 1);
- }
+ struct rpcrdma_mr_seg *seg = req->rl_nextseg;
+ int n, nsegs, nchunks;
+ __be32 *segcount;
- if (type == rpcrdma_replych || type == rpcrdma_areadch)
- pos = 0;
- else
- pos = target->head[0].iov_len;
+ if (wtype != rpcrdma_replych) {
+ *iptr++ = xdr_zero; /* no Reply chunk present */
+ return iptr;
+ }
- nsegs = rpcrdma_convert_iovs(target, pos, type, seg, RPCRDMA_MAX_SEGS);
+ nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf, 0, wtype, seg,
+ RPCRDMA_MAX_SEGS - req->rl_nchunks);
if (nsegs < 0)
- return nsegs;
+ return ERR_PTR(nsegs);
- map = r_xprt->rx_ia.ri_ops->ro_map;
+ *iptr++ = xdr_one; /* Reply chunk present */
+ segcount = iptr++; /* save location of segment count */
+
+ nchunks = 0;
do {
- n = map(r_xprt, seg, nsegs, cur_wchunk != NULL);
+ n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, true);
if (n <= 0)
- goto out;
- if (cur_rchunk) { /* read */
- cur_rchunk->rc_discrim = xdr_one;
- /* all read chunks have the same "position" */
- cur_rchunk->rc_position = cpu_to_be32(pos);
- cur_rchunk->rc_target.rs_handle =
- cpu_to_be32(seg->mr_rkey);
- cur_rchunk->rc_target.rs_length =
- cpu_to_be32(seg->mr_len);
- xdr_encode_hyper(
- (__be32 *)&cur_rchunk->rc_target.rs_offset,
- seg->mr_base);
- dprintk("RPC: %s: read chunk "
- "elem %d@0x%llx:0x%x pos %u (%s)\n", __func__,
- seg->mr_len, (unsigned long long)seg->mr_base,
- seg->mr_rkey, pos, n < nsegs ? "more" : "last");
- cur_rchunk++;
- r_xprt->rx_stats.read_chunk_count++;
- } else { /* write/reply */
- cur_wchunk->wc_target.rs_handle =
- cpu_to_be32(seg->mr_rkey);
- cur_wchunk->wc_target.rs_length =
- cpu_to_be32(seg->mr_len);
- xdr_encode_hyper(
- (__be32 *)&cur_wchunk->wc_target.rs_offset,
- seg->mr_base);
- dprintk("RPC: %s: %s chunk "
- "elem %d@0x%llx:0x%x (%s)\n", __func__,
- (type == rpcrdma_replych) ? "reply" : "write",
- seg->mr_len, (unsigned long long)seg->mr_base,
- seg->mr_rkey, n < nsegs ? "more" : "last");
- cur_wchunk++;
- if (type == rpcrdma_replych)
- r_xprt->rx_stats.reply_chunk_count++;
- else
- r_xprt->rx_stats.write_chunk_count++;
- r_xprt->rx_stats.total_rdma_request += seg->mr_len;
- }
+ return ERR_PTR(n);
+
+ iptr = xdr_encode_rdma_segment(iptr, seg);
+
+ dprintk("RPC: %5u %s: reply segment "
+ "%d@0x%016llx:0x%08x (%s)\n",
+ rqst->rq_task->tk_pid, __func__,
+ seg->mr_len, (unsigned long long)seg->mr_base,
+ seg->mr_rkey, n < nsegs ? "more" : "last");
+
+ r_xprt->rx_stats.reply_chunk_count++;
+ r_xprt->rx_stats.total_rdma_request += seg->mr_len;
+ req->rl_nchunks++;
nchunks++;
seg += n;
nsegs -= n;
} while (nsegs);
+ req->rl_nextseg = seg;
- /* success. all failures return above */
- req->rl_nchunks = nchunks;
-
- /*
- * finish off header. If write, marshal discrim and nchunks.
- */
- if (cur_rchunk) {
- iptr = (__be32 *) cur_rchunk;
- *iptr++ = xdr_zero; /* finish the read chunk list */
- *iptr++ = xdr_zero; /* encode a NULL write chunk list */
- *iptr++ = xdr_zero; /* encode a NULL reply chunk */
- } else {
- warray->wc_discrim = xdr_one;
- warray->wc_nchunks = cpu_to_be32(nchunks);
- iptr = (__be32 *) cur_wchunk;
- if (type == rpcrdma_writech) {
- *iptr++ = xdr_zero; /* finish the write chunk list */
- *iptr++ = xdr_zero; /* encode a NULL reply chunk */
- }
- }
-
- /*
- * Return header size.
- */
- return (unsigned char *)iptr - (unsigned char *)headerp;
+ /* Update count of segments in the Reply chunk */
+ *segcount = cpu_to_be32(nchunks);
-out:
- for (pos = 0; nchunks--;)
- pos += r_xprt->rx_ia.ri_ops->ro_unmap(r_xprt,
- &req->rl_segments[pos]);
- return n;
+ return iptr;
}
/*
@@ -440,13 +551,10 @@ static void rpcrdma_inline_pullup(struct rpc_rqst *rqst)
* Marshal a request: the primary job of this routine is to choose
* the transfer modes. See comments below.
*
- * Uses multiple RDMA IOVs for a request:
- * [0] -- RPC RDMA header, which uses memory from the *start* of the
- * preregistered buffer that already holds the RPC data in
- * its middle.
- * [1] -- the RPC header/data, marshaled by RPC and the NFS protocol.
- * [2] -- optional padding.
- * [3] -- if padded, header only in [1] and data here.
+ * Prepares up to two IOVs per Call message:
+ *
+ * [0] -- RPC RDMA header
+ * [1] -- the RPC header/data
*
* Returns zero on success, otherwise a negative errno.
*/
@@ -457,24 +565,17 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
struct rpc_xprt *xprt = rqst->rq_xprt;
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
- char *base;
- size_t rpclen;
- ssize_t hdrlen;
enum rpcrdma_chunktype rtype, wtype;
struct rpcrdma_msg *headerp;
+ ssize_t hdrlen;
+ size_t rpclen;
+ __be32 *iptr;
#if defined(CONFIG_SUNRPC_BACKCHANNEL)
if (test_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state))
return rpcrdma_bc_marshal_reply(rqst);
#endif
- /*
- * rpclen gets amount of data in first buffer, which is the
- * pre-registered buffer.
- */
- base = rqst->rq_svec[0].iov_base;
- rpclen = rqst->rq_svec[0].iov_len;
-
headerp = rdmab_to_msg(req->rl_rdmabuf);
/* don't byte-swap XID, it's already done in request */
headerp->rm_xid = rqst->rq_xid;
@@ -485,15 +586,16 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
/*
* Chunks needed for results?
*
- * o Read ops return data as write chunk(s), header as inline.
* o If the expected result is under the inline threshold, all ops
* return as inline.
+ * o Large read ops return data as write chunk(s), header as
+ * inline.
* o Large non-read ops return as a single reply chunk.
*/
- if (rqst->rq_rcv_buf.flags & XDRBUF_READ)
- wtype = rpcrdma_writech;
- else if (rpcrdma_results_inline(rqst))
+ if (rpcrdma_results_inline(r_xprt, rqst))
wtype = rpcrdma_noch;
+ else if (rqst->rq_rcv_buf.flags & XDRBUF_READ)
+ wtype = rpcrdma_writech;
else
wtype = rpcrdma_replych;
@@ -511,10 +613,14 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
* that both has a data payload, and whose non-data arguments
* by themselves are larger than the inline threshold.
*/
- if (rpcrdma_args_inline(rqst)) {
+ if (rpcrdma_args_inline(r_xprt, rqst)) {
rtype = rpcrdma_noch;
+ rpcrdma_inline_pullup(rqst);
+ rpclen = rqst->rq_svec[0].iov_len;
} else if (rqst->rq_snd_buf.flags & XDRBUF_WRITE) {
rtype = rpcrdma_readch;
+ rpclen = rqst->rq_svec[0].iov_len;
+ rpclen += rpcrdma_tail_pullup(&rqst->rq_snd_buf);
} else {
r_xprt->rx_stats.nomsg_call_count++;
headerp->rm_type = htonl(RDMA_NOMSG);
@@ -522,57 +628,50 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
rpclen = 0;
}
- /* The following simplification is not true forever */
- if (rtype != rpcrdma_noch && wtype == rpcrdma_replych)
- wtype = rpcrdma_noch;
- if (rtype != rpcrdma_noch && wtype != rpcrdma_noch) {
- dprintk("RPC: %s: cannot marshal multiple chunk lists\n",
- __func__);
- return -EIO;
- }
-
- hdrlen = RPCRDMA_HDRLEN_MIN;
-
- /*
- * Pull up any extra send data into the preregistered buffer.
- * When padding is in use and applies to the transfer, insert
- * it and change the message type.
+ /* This implementation supports the following combinations
+ * of chunk lists in one RPC-over-RDMA Call message:
+ *
+ * - Read list
+ * - Write list
+ * - Reply chunk
+ * - Read list + Reply chunk
+ *
+ * It might not yet support the following combinations:
+ *
+ * - Read list + Write list
+ *
+ * It does not support the following combinations:
+ *
+ * - Write list + Reply chunk
+ * - Read list + Write list + Reply chunk
+ *
+ * This implementation supports only a single chunk in each
+ * Read or Write list. Thus for example the client cannot
+ * send a Call message with a Position Zero Read chunk and a
+ * regular Read chunk at the same time.
*/
- if (rtype == rpcrdma_noch) {
-
- rpcrdma_inline_pullup(rqst);
-
- headerp->rm_body.rm_nochunks.rm_empty[0] = xdr_zero;
- headerp->rm_body.rm_nochunks.rm_empty[1] = xdr_zero;
- headerp->rm_body.rm_nochunks.rm_empty[2] = xdr_zero;
- /* new length after pullup */
- rpclen = rqst->rq_svec[0].iov_len;
- } else if (rtype == rpcrdma_readch)
- rpclen += rpcrdma_tail_pullup(&rqst->rq_snd_buf);
- if (rtype != rpcrdma_noch) {
- hdrlen = rpcrdma_create_chunks(rqst, &rqst->rq_snd_buf,
- headerp, rtype);
- wtype = rtype; /* simplify dprintk */
-
- } else if (wtype != rpcrdma_noch) {
- hdrlen = rpcrdma_create_chunks(rqst, &rqst->rq_rcv_buf,
- headerp, wtype);
- }
- if (hdrlen < 0)
- return hdrlen;
+ req->rl_nchunks = 0;
+ req->rl_nextseg = req->rl_segments;
+ iptr = headerp->rm_body.rm_chunks;
+ iptr = rpcrdma_encode_read_list(r_xprt, req, rqst, iptr, rtype);
+ if (IS_ERR(iptr))
+ goto out_unmap;
+ iptr = rpcrdma_encode_write_list(r_xprt, req, rqst, iptr, wtype);
+ if (IS_ERR(iptr))
+ goto out_unmap;
+ iptr = rpcrdma_encode_reply_chunk(r_xprt, req, rqst, iptr, wtype);
+ if (IS_ERR(iptr))
+ goto out_unmap;
+ hdrlen = (unsigned char *)iptr - (unsigned char *)headerp;
+
+ if (hdrlen + rpclen > RPCRDMA_INLINE_WRITE_THRESHOLD(rqst))
+ goto out_overflow;
+
+ dprintk("RPC: %5u %s: %s/%s: hdrlen %zd rpclen %zd\n",
+ rqst->rq_task->tk_pid, __func__,
+ transfertypes[rtype], transfertypes[wtype],
+ hdrlen, rpclen);
- dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd"
- " headerp 0x%p base 0x%p lkey 0x%x\n",
- __func__, transfertypes[wtype], hdrlen, rpclen,
- headerp, base, rdmab_lkey(req->rl_rdmabuf));
-
- /*
- * initialize send_iov's - normally only two: rdma chunk header and
- * single preregistered RPC header buffer, but if padding is present,
- * then use a preregistered (and zeroed) pad buffer between the RPC
- * header and any write data. In all non-rdma cases, any following
- * data has been copied into the RPC header buffer.
- */
req->rl_send_iov[0].addr = rdmab_addr(req->rl_rdmabuf);
req->rl_send_iov[0].length = hdrlen;
req->rl_send_iov[0].lkey = rdmab_lkey(req->rl_rdmabuf);
@@ -587,6 +686,18 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
req->rl_niovs = 2;
return 0;
+
+out_overflow:
+ pr_err("rpcrdma: send overflow: hdrlen %zd rpclen %zu %s/%s\n",
+ hdrlen, rpclen, transfertypes[rtype], transfertypes[wtype]);
+ /* Terminate this RPC. Chunks registered above will be
+ * released by xprt_release -> xprt_rdma_free.
+ */
+ return -EIO;
+
+out_unmap:
+ r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false);
+ return PTR_ERR(iptr);
}
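
The three encoders called from rpcrdma_marshal_req() share one convention: return the next header position on success, or an errno folded into the pointer on failure, so the caller can chain them and unwind with a single ro_unmap_safe at out_unmap. A userspace re-implementation sketch of that ERR_PTR()/IS_ERR()/PTR_ERR() convention (simplified; the real macros live in linux/err.h):

    #include <errno.h>
    #include <stdint.h>
    #include <stdio.h>

    #define MAX_ERRNO 4095

    static inline void *ERR_PTR(long err)      { return (void *)err; }
    static inline long  PTR_ERR(const void *p) { return (long)p; }
    static inline int   IS_ERR(const void *p)
    {
        return (unsigned long)p >= (unsigned long)-MAX_ERRNO;
    }

    /* A toy "encoder": writes one XDR word, or fails with an error pointer. */
    static uint32_t *encode_list(uint32_t *p, int fail)
    {
        if (fail)
            return ERR_PTR(-EMSGSIZE);
        *p++ = 0;                       /* "list not present" */
        return p;
    }

    int main(void)
    {
        uint32_t hdr[8], *p = hdr;

        p = encode_list(p, 0);
        if (!IS_ERR(p))
            p = encode_list(p, 1);      /* second encoder fails */
        if (IS_ERR(p))
            printf("marshal failed: %ld\n", PTR_ERR(p));    /* unwind once here */
        return 0;
    }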
/*
diff --git a/net/sunrpc/xprtrdma/svc_rdma_marshal.c b/net/sunrpc/xprtrdma/svc_rdma_marshal.c
index 765bca47c74d..0ba9887f3e22 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_marshal.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_marshal.c
@@ -145,19 +145,32 @@ static __be32 *decode_reply_array(__be32 *va, __be32 *vaend)
return (__be32 *)&ary->wc_array[nchunks];
}
-int svc_rdma_xdr_decode_req(struct rpcrdma_msg *rmsgp, struct svc_rqst *rqstp)
+/**
+ * svc_rdma_xdr_decode_req - Parse incoming RPC-over-RDMA header
+ * @rq_arg: Receive buffer
+ *
+ * On entry, xdr->head[0].iov_base points to first byte in the
+ * RPC-over-RDMA header.
+ *
+ * On successful exit, head[0] points to first byte past the
+ * RPC-over-RDMA header. For RDMA_MSG, this is the RPC message.
+ * The length of the RPC-over-RDMA header is returned.
+ */
+int svc_rdma_xdr_decode_req(struct xdr_buf *rq_arg)
{
+ struct rpcrdma_msg *rmsgp;
__be32 *va, *vaend;
unsigned int len;
u32 hdr_len;
/* Verify that there's enough bytes for header + something */
- if (rqstp->rq_arg.len <= RPCRDMA_HDRLEN_ERR) {
+ if (rq_arg->len <= RPCRDMA_HDRLEN_ERR) {
dprintk("svcrdma: header too short = %d\n",
- rqstp->rq_arg.len);
+ rq_arg->len);
return -EINVAL;
}
+ rmsgp = (struct rpcrdma_msg *)rq_arg->head[0].iov_base;
if (rmsgp->rm_vers != rpcrdma_version) {
dprintk("%s: bad version %u\n", __func__,
be32_to_cpu(rmsgp->rm_vers));
@@ -189,10 +202,10 @@ int svc_rdma_xdr_decode_req(struct rpcrdma_msg *rmsgp, struct svc_rqst *rqstp)
be32_to_cpu(rmsgp->rm_body.rm_padded.rm_thresh);
va = &rmsgp->rm_body.rm_padded.rm_pempty[4];
- rqstp->rq_arg.head[0].iov_base = va;
+ rq_arg->head[0].iov_base = va;
len = (u32)((unsigned long)va - (unsigned long)rmsgp);
- rqstp->rq_arg.head[0].iov_len -= len;
- if (len > rqstp->rq_arg.len)
+ rq_arg->head[0].iov_len -= len;
+ if (len > rq_arg->len)
return -EINVAL;
return len;
default:
@@ -205,7 +218,7 @@ int svc_rdma_xdr_decode_req(struct rpcrdma_msg *rmsgp, struct svc_rqst *rqstp)
* chunk list and a reply chunk list.
*/
va = &rmsgp->rm_body.rm_chunks[0];
- vaend = (__be32 *)((unsigned long)rmsgp + rqstp->rq_arg.len);
+ vaend = (__be32 *)((unsigned long)rmsgp + rq_arg->len);
va = decode_read_list(va, vaend);
if (!va) {
dprintk("svcrdma: failed to decode read list\n");
@@ -222,10 +235,9 @@ int svc_rdma_xdr_decode_req(struct rpcrdma_msg *rmsgp, struct svc_rqst *rqstp)
return -EINVAL;
}
- rqstp->rq_arg.head[0].iov_base = va;
+ rq_arg->head[0].iov_base = va;
hdr_len = (unsigned long)va - (unsigned long)rmsgp;
- rqstp->rq_arg.head[0].iov_len -= hdr_len;
-
+ rq_arg->head[0].iov_len -= hdr_len;
return hdr_len;
}
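After this change, svc_rdma_xdr_decode_req() works purely on the xdr_buf: it parses the transport header at the front of the receive buffer, advances head[0].iov_base past it, shrinks head[0].iov_len, and returns the number of header bytes consumed. A userspace sketch of that in-place consumption pattern follows; struct recv_buf and the fixed 24-byte header are illustrative assumptions, not the kernel's types.

#include <errno.h>
#include <stddef.h>
#include <stdio.h>

struct recv_buf {
	void	*base;		/* like rq_arg->head[0].iov_base */
	size_t	len;		/* like rq_arg->head[0].iov_len */
};

#define XPORT_HDR_LEN 24	/* pretend fixed-size transport header */

static int decode_header(struct recv_buf *buf)
{
	if (buf->len < XPORT_HDR_LEN)
		return -EINVAL;

	/* Point past the header; the payload now starts at base. */
	buf->base = (char *)buf->base + XPORT_HDR_LEN;
	buf->len -= XPORT_HDR_LEN;
	return XPORT_HDR_LEN;
}

int main(void)
{
	char wire[128] = { 0 };
	struct recv_buf buf = { .base = wire, .len = sizeof(wire) };
	int hdr_len = decode_header(&buf);

	printf("consumed %d header bytes, %zu payload bytes remain\n",
	       hdr_len, buf.len);
	return hdr_len < 0 ? 1 : 0;
}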
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index fbe7444e7de6..2c25606f2561 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -447,10 +447,8 @@ static int rdma_read_chunks(struct svcxprt_rdma *xprt,
head->arg.len = rqstp->rq_arg.len;
head->arg.buflen = rqstp->rq_arg.buflen;
- ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0];
- position = be32_to_cpu(ch->rc_position);
-
/* RDMA_NOMSG: RDMA READ data should land just after RDMA RECV data */
+ position = be32_to_cpu(ch->rc_position);
if (position == 0) {
head->arg.pages = &head->pages[0];
page_offset = head->byte_len;
@@ -488,7 +486,7 @@ static int rdma_read_chunks(struct svcxprt_rdma *xprt,
if (page_offset & 3) {
u32 pad = 4 - (page_offset & 3);
- head->arg.page_len += pad;
+ head->arg.tail[0].iov_len += pad;
head->arg.len += pad;
head->arg.buflen += pad;
page_offset += pad;
@@ -510,11 +508,10 @@ static int rdma_read_chunks(struct svcxprt_rdma *xprt,
return ret;
}
-static int rdma_read_complete(struct svc_rqst *rqstp,
- struct svc_rdma_op_ctxt *head)
+static void rdma_read_complete(struct svc_rqst *rqstp,
+ struct svc_rdma_op_ctxt *head)
{
int page_no;
- int ret;
/* Copy RPC pages */
for (page_no = 0; page_no < head->count; page_no++) {
@@ -550,23 +547,6 @@ static int rdma_read_complete(struct svc_rqst *rqstp,
rqstp->rq_arg.tail[0] = head->arg.tail[0];
rqstp->rq_arg.len = head->arg.len;
rqstp->rq_arg.buflen = head->arg.buflen;
-
- /* Free the context */
- svc_rdma_put_context(head, 0);
-
- /* XXX: What should this be? */
- rqstp->rq_prot = IPPROTO_MAX;
- svc_xprt_copy_addrs(rqstp, rqstp->rq_xprt);
-
- ret = rqstp->rq_arg.head[0].iov_len
- + rqstp->rq_arg.page_len
- + rqstp->rq_arg.tail[0].iov_len;
- dprintk("svcrdma: deferred read ret=%d, rq_arg.len=%u, "
- "rq_arg.head[0].iov_base=%p, rq_arg.head[0].iov_len=%zu\n",
- ret, rqstp->rq_arg.len, rqstp->rq_arg.head[0].iov_base,
- rqstp->rq_arg.head[0].iov_len);
-
- return ret;
}
/* By convention, backchannel calls arrive via rdma_msg type
@@ -624,7 +604,8 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
dto_q);
list_del_init(&ctxt->dto_q);
spin_unlock_bh(&rdma_xprt->sc_rq_dto_lock);
- return rdma_read_complete(rqstp, ctxt);
+ rdma_read_complete(rqstp, ctxt);
+ goto complete;
} else if (!list_empty(&rdma_xprt->sc_rq_dto_q)) {
ctxt = list_entry(rdma_xprt->sc_rq_dto_q.next,
struct svc_rdma_op_ctxt,
@@ -655,7 +636,7 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
/* Decode the RDMA header. */
rmsgp = (struct rpcrdma_msg *)rqstp->rq_arg.head[0].iov_base;
- ret = svc_rdma_xdr_decode_req(rmsgp, rqstp);
+ ret = svc_rdma_xdr_decode_req(&rqstp->rq_arg);
if (ret < 0)
goto out_err;
if (ret == 0)
@@ -682,6 +663,7 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
return 0;
}
+complete:
ret = rqstp->rq_arg.head[0].iov_len
+ rqstp->rq_arg.page_len
+ rqstp->rq_arg.tail[0].iov_len;
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index 4f1b1c4f45f9..54d533300620 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -463,25 +463,21 @@ static int send_reply(struct svcxprt_rdma *rdma,
struct svc_rqst *rqstp,
struct page *page,
struct rpcrdma_msg *rdma_resp,
- struct svc_rdma_op_ctxt *ctxt,
struct svc_rdma_req_map *vec,
int byte_count)
{
+ struct svc_rdma_op_ctxt *ctxt;
struct ib_send_wr send_wr;
u32 xdr_off;
int sge_no;
int sge_bytes;
int page_no;
int pages;
- int ret;
-
- ret = svc_rdma_repost_recv(rdma, GFP_KERNEL);
- if (ret) {
- svc_rdma_put_context(ctxt, 0);
- return -ENOTCONN;
- }
+ int ret = -EIO;
/* Prepare the context */
+ ctxt = svc_rdma_get_context(rdma);
+ ctxt->direction = DMA_TO_DEVICE;
ctxt->pages[0] = page;
ctxt->count = 1;
@@ -565,8 +561,7 @@ static int send_reply(struct svcxprt_rdma *rdma,
err:
svc_rdma_unmap_dma(ctxt);
svc_rdma_put_context(ctxt, 1);
- pr_err("svcrdma: failed to send reply, rc=%d\n", ret);
- return -EIO;
+ return ret;
}
void svc_rdma_prep_reply_hdr(struct svc_rqst *rqstp)
@@ -585,7 +580,6 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
int ret;
int inline_bytes;
struct page *res_page;
- struct svc_rdma_op_ctxt *ctxt;
struct svc_rdma_req_map *vec;
dprintk("svcrdma: sending response for rqstp=%p\n", rqstp);
@@ -598,8 +592,6 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
rp_ary = svc_rdma_get_reply_array(rdma_argp, wr_ary);
/* Build an req vec for the XDR */
- ctxt = svc_rdma_get_context(rdma);
- ctxt->direction = DMA_TO_DEVICE;
vec = svc_rdma_get_req_map(rdma);
ret = svc_rdma_map_xdr(rdma, &rqstp->rq_res, vec, wr_ary != NULL);
if (ret)
@@ -635,7 +627,12 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
inline_bytes -= ret;
}
- ret = send_reply(rdma, rqstp, res_page, rdma_resp, ctxt, vec,
+ /* Post a fresh Receive buffer _before_ sending the reply */
+ ret = svc_rdma_post_recv(rdma, GFP_KERNEL);
+ if (ret)
+ goto err1;
+
+ ret = send_reply(rdma, rqstp, res_page, rdma_resp, vec,
inline_bytes);
if (ret < 0)
goto err1;
@@ -648,7 +645,8 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
put_page(res_page);
err0:
svc_rdma_put_req_map(rdma, vec);
- svc_rdma_put_context(ctxt, 0);
+ pr_err("svcrdma: Could not send reply, err=%d. Closing transport.\n",
+ ret);
set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags);
return -ENOTCONN;
}
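In the sendto path above, send_reply() is now called only after svc_rdma_post_recv() succeeds, so a fresh Receive is already on the queue before the reply goes out and invites the client's next request. The toy program below shows only that ordering idea; it is not RDMA verbs code, and post_recv(), post_send(), and handle_request() are hypothetical placeholders.

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical single-slot transport: one posted receive at a time. */
static bool recv_posted;

static int post_recv(void)
{
	recv_posted = true;		/* re-arm the receive queue */
	return 0;
}

static int post_send(const char *reply)
{
	/* If the peer answers immediately, its message needs a posted
	 * receive waiting; otherwise it would be dropped (or RNR-NAKed
	 * on a real RDMA queue pair).
	 */
	if (!recv_posted) {
		fprintf(stderr, "reply \"%s\" sent with no receive posted!\n",
			reply);
		return -1;
	}
	printf("reply \"%s\" sent; next request can be received\n", reply);
	return 0;
}

static int handle_request(const char *reply)
{
	recv_posted = false;		/* the request consumed our receive */

	/* Replenish the receive queue _before_ sending the reply. */
	if (post_recv())
		return -1;
	return post_send(reply);
}

int main(void)
{
	recv_posted = true;		/* initial receive posted at setup */
	return handle_request("pong") ? 1 : 0;
}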
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 90668969d559..dd9440137834 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -789,7 +789,7 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
int ret;
dprintk("svcrdma: Creating RDMA socket\n");
- if (sa->sa_family != AF_INET) {
+ if ((sa->sa_family != AF_INET) && (sa->sa_family != AF_INET6)) {
dprintk("svcrdma: Address family %d is not supported.\n", sa->sa_family);
return ERR_PTR(-EAFNOSUPPORT);
}
@@ -805,6 +805,16 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
goto err0;
}
+ /* Allow both IPv4 and IPv6 sockets to bind a single port
+ * at the same time.
+ */
+#if IS_ENABLED(CONFIG_IPV6)
+ ret = rdma_set_afonly(listen_id, 1);
+ if (ret) {
+ dprintk("svcrdma: rdma_set_afonly failed = %d\n", ret);
+ goto err1;
+ }
+#endif
ret = rdma_bind_addr(listen_id, sa);
if (ret) {
dprintk("svcrdma: rdma_bind_addr failed = %d\n", ret);
@@ -1073,7 +1083,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_READ_W_INV;
/* Post receive buffers */
- for (i = 0; i < newxprt->sc_rq_depth; i++) {
+ for (i = 0; i < newxprt->sc_max_requests; i++) {
ret = svc_rdma_post_recv(newxprt, GFP_KERNEL);
if (ret) {
dprintk("svcrdma: failure posting receive buffers\n");
@@ -1170,6 +1180,9 @@ static void __svc_rdma_free(struct work_struct *work)
dprintk("svcrdma: %s(%p)\n", __func__, rdma);
+ if (rdma->sc_qp && !IS_ERR(rdma->sc_qp))
+ ib_drain_qp(rdma->sc_qp);
+
/* We should only be called from kref_put */
if (atomic_read(&xprt->xpt_ref.refcount) != 0)
pr_err("svcrdma: sc_xprt still in use? (%d)\n",
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index b1b009f10ea3..99d2e5b72726 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -73,6 +73,8 @@ static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRMR;
static unsigned int min_slot_table_size = RPCRDMA_MIN_SLOT_TABLE;
static unsigned int max_slot_table_size = RPCRDMA_MAX_SLOT_TABLE;
+static unsigned int min_inline_size = RPCRDMA_MIN_INLINE;
+static unsigned int max_inline_size = RPCRDMA_MAX_INLINE;
static unsigned int zero;
static unsigned int max_padding = PAGE_SIZE;
static unsigned int min_memreg = RPCRDMA_BOUNCEBUFFERS;
@@ -96,6 +98,8 @@ static struct ctl_table xr_tunables_table[] = {
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec,
+ .extra1 = &min_inline_size,
+ .extra2 = &max_inline_size,
},
{
.procname = "rdma_max_inline_write",
@@ -103,6 +107,8 @@ static struct ctl_table xr_tunables_table[] = {
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec,
+ .extra1 = &min_inline_size,
+ .extra2 = &max_inline_size,
},
{
.procname = "rdma_inline_write_padding",
@@ -508,6 +514,7 @@ xprt_rdma_allocate(struct rpc_task *task, size_t size)
out:
dprintk("RPC: %s: size %zd, request 0x%p\n", __func__, size, req);
req->rl_connect_cookie = 0; /* our reserved value */
+ req->rl_task = task;
return req->rl_sendbuf->rg_base;
out_rdmabuf:
@@ -564,7 +571,6 @@ xprt_rdma_free(void *buffer)
struct rpcrdma_req *req;
struct rpcrdma_xprt *r_xprt;
struct rpcrdma_regbuf *rb;
- int i;
if (buffer == NULL)
return;
@@ -578,11 +584,8 @@ xprt_rdma_free(void *buffer)
dprintk("RPC: %s: called on 0x%p\n", __func__, req->rl_reply);
- for (i = 0; req->rl_nchunks;) {
- --req->rl_nchunks;
- i += r_xprt->rx_ia.ri_ops->ro_unmap(r_xprt,
- &req->rl_segments[i]);
- }
+ r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req,
+ !RPC_IS_ASYNC(req->rl_task));
rpcrdma_buffer_put(req);
}
@@ -707,6 +710,7 @@ static struct rpc_xprt_ops xprt_rdma_procs = {
#if defined(CONFIG_SUNRPC_BACKCHANNEL)
.bc_setup = xprt_rdma_bc_setup,
.bc_up = xprt_rdma_bc_up,
+ .bc_maxpayload = xprt_rdma_bc_maxpayload,
.bc_free_rqst = xprt_rdma_bc_free_rqst,
.bc_destroy = xprt_rdma_bc_destroy,
#endif
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index f5ed9f982cd7..b044d98a1370 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -203,15 +203,6 @@ out_fail:
goto out_schedule;
}
-static void
-rpcrdma_flush_cqs(struct rpcrdma_ep *ep)
-{
- struct ib_wc wc;
-
- while (ib_poll_cq(ep->rep_attr.recv_cq, 1, &wc) > 0)
- rpcrdma_receive_wc(NULL, &wc);
-}
-
static int
rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
{
@@ -374,23 +365,6 @@ out:
}
/*
- * Drain any cq, prior to teardown.
- */
-static void
-rpcrdma_clean_cq(struct ib_cq *cq)
-{
- struct ib_wc wc;
- int count = 0;
-
- while (1 == ib_poll_cq(cq, 1, &wc))
- ++count;
-
- if (count)
- dprintk("RPC: %s: flushed %d events (last 0x%x)\n",
- __func__, count, wc.opcode);
-}
-
-/*
* Exported functions.
*/
@@ -459,7 +433,6 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
dprintk("RPC: %s: memory registration strategy is '%s'\n",
__func__, ia->ri_ops->ro_displayname);
- rwlock_init(&ia->ri_qplock);
return 0;
out3:
@@ -515,7 +488,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
__func__);
return -ENOMEM;
}
- max_qp_wr = ia->ri_device->attrs.max_qp_wr - RPCRDMA_BACKWARD_WRS;
+ max_qp_wr = ia->ri_device->attrs.max_qp_wr - RPCRDMA_BACKWARD_WRS - 1;
/* check provider's send/recv wr limits */
if (cdata->max_requests > max_qp_wr)
@@ -526,11 +499,13 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
ep->rep_attr.srq = NULL;
ep->rep_attr.cap.max_send_wr = cdata->max_requests;
ep->rep_attr.cap.max_send_wr += RPCRDMA_BACKWARD_WRS;
+ ep->rep_attr.cap.max_send_wr += 1; /* drain cqe */
rc = ia->ri_ops->ro_open(ia, ep, cdata);
if (rc)
return rc;
ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
ep->rep_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS;
+ ep->rep_attr.cap.max_recv_wr += 1; /* drain cqe */
ep->rep_attr.cap.max_send_sge = RPCRDMA_MAX_IOVS;
ep->rep_attr.cap.max_recv_sge = 1;
ep->rep_attr.cap.max_inline_data = 0;
@@ -578,6 +553,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
ep->rep_attr.recv_cq = recvcq;
/* Initialize cma parameters */
+ memset(&ep->rep_remote_cma, 0, sizeof(ep->rep_remote_cma));
/* RPC/RDMA does not use private data */
ep->rep_remote_cma.private_data = NULL;
@@ -591,7 +567,16 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
ep->rep_remote_cma.responder_resources =
ia->ri_device->attrs.max_qp_rd_atom;
- ep->rep_remote_cma.retry_count = 7;
+ /* Limit transport retries so the client can detect server
+ * GID changes quickly. The RPC layer handles re-establishing
+ * the transport connection and retransmission.
+ */
+ ep->rep_remote_cma.retry_count = 6;
+
+ /* RPC-over-RDMA handles its own flow control. In addition,
+ * make all RNR NAKs visible so we know that RPC-over-RDMA
+ * flow control is working correctly (no NAKs should be seen).
+ */
ep->rep_remote_cma.flow_control = 0;
ep->rep_remote_cma.rnr_retry_count = 0;
@@ -622,13 +607,8 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
cancel_delayed_work_sync(&ep->rep_connect_worker);
- if (ia->ri_id->qp)
- rpcrdma_ep_disconnect(ep, ia);
-
- rpcrdma_clean_cq(ep->rep_attr.recv_cq);
- rpcrdma_clean_cq(ep->rep_attr.send_cq);
-
if (ia->ri_id->qp) {
+ rpcrdma_ep_disconnect(ep, ia);
rdma_destroy_qp(ia->ri_id);
ia->ri_id->qp = NULL;
}
@@ -659,7 +639,6 @@ retry:
dprintk("RPC: %s: reconnecting...\n", __func__);
rpcrdma_ep_disconnect(ep, ia);
- rpcrdma_flush_cqs(ep);
xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
id = rpcrdma_create_id(xprt, ia,
@@ -692,10 +671,8 @@ retry:
goto out;
}
- write_lock(&ia->ri_qplock);
old = ia->ri_id;
ia->ri_id = id;
- write_unlock(&ia->ri_qplock);
rdma_destroy_qp(old);
rpcrdma_destroy_id(old);
@@ -785,7 +762,6 @@ rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
int rc;
- rpcrdma_flush_cqs(ep);
rc = rdma_disconnect(ia->ri_id);
if (!rc) {
/* returns without wait if not connected */
@@ -797,6 +773,8 @@ rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
dprintk("RPC: %s: rdma_disconnect %i\n", __func__, rc);
ep->rep_connected = rc;
}
+
+ ib_drain_qp(ia->ri_id->qp);
}
struct rpcrdma_req *
@@ -1271,25 +1249,3 @@ out_rc:
rpcrdma_recv_buffer_put(rep);
return rc;
}
-
-/* How many chunk list items fit within our inline buffers?
- */
-unsigned int
-rpcrdma_max_segments(struct rpcrdma_xprt *r_xprt)
-{
- struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
- int bytes, segments;
-
- bytes = min_t(unsigned int, cdata->inline_wsize, cdata->inline_rsize);
- bytes -= RPCRDMA_HDRLEN_MIN;
- if (bytes < sizeof(struct rpcrdma_segment) * 2) {
- pr_warn("RPC: %s: inline threshold too small\n",
- __func__);
- return 0;
- }
-
- segments = 1 << (fls(bytes / sizeof(struct rpcrdma_segment)) - 1);
- dprintk("RPC: %s: max chunk list size = %d segments\n",
- __func__, segments);
- return segments;
-}
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 2ebc743cb96f..95cdc66225ee 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -65,7 +65,6 @@
*/
struct rpcrdma_ia {
const struct rpcrdma_memreg_ops *ri_ops;
- rwlock_t ri_qplock;
struct ib_device *ri_device;
struct rdma_cm_id *ri_id;
struct ib_pd *ri_pd;
@@ -73,6 +72,8 @@ struct rpcrdma_ia {
struct completion ri_done;
int ri_async_rc;
unsigned int ri_max_frmr_depth;
+ unsigned int ri_max_inline_write;
+ unsigned int ri_max_inline_read;
struct ib_qp_attr ri_qp_attr;
struct ib_qp_init_attr ri_qp_init_attr;
};
@@ -144,6 +145,26 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb)
#define RPCRDMA_DEF_GFP (GFP_NOIO | __GFP_NOWARN)
+/* To ensure a transport can always make forward progress,
+ * the number of RDMA segments allowed in header chunk lists
+ * is capped at 8. This prevents less-capable devices and
+ * memory registrations from overrunning the Send buffer
+ * while building chunk lists.
+ *
+ * Elements of the Read list take up more room than the
+ * Write list or Reply chunk. 8 read segments means the Read
+ * list (or Write list or Reply chunk) cannot consume more
+ * than
+ *
+ * ((8 + 2) * read segment size) + 1 XDR words, or 244 bytes.
+ *
+ * And the fixed part of the header is another 24 bytes.
+ *
+ * The smallest inline threshold is 1024 bytes, ensuring that
+ * at least 750 bytes are available for RPC messages.
+ */
+#define RPCRDMA_MAX_HDR_SEGS (8)
+
/*
* struct rpcrdma_rep -- this structure encapsulates state required to recv
* and complete a reply, asynchronously. It needs several pieces of
@@ -162,7 +183,9 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb)
*/
#define RPCRDMA_MAX_DATA_SEGS ((1 * 1024 * 1024) / PAGE_SIZE)
-#define RPCRDMA_MAX_SEGS (RPCRDMA_MAX_DATA_SEGS + 2) /* head+tail = 2 */
+
+/* data segments + head/tail for Call + head/tail for Reply */
+#define RPCRDMA_MAX_SEGS (RPCRDMA_MAX_DATA_SEGS + 4)
struct rpcrdma_buffer;
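A quick numeric check of the RPCRDMA_MAX_HDR_SEGS comment above. The 6-XDR-word (24-byte) read segment size used here is an assumption chosen so the figures reproduce the 244-byte total the comment quotes; it is not taken from a header file.

#include <stdio.h>

int main(void)
{
	unsigned int seg_words = 6;	/* assumed size of one read segment, in XDR words */
	unsigned int max_segs = 8;	/* RPCRDMA_MAX_HDR_SEGS */
	unsigned int list_bytes = (max_segs + 2) * seg_words * 4 + 4;
	unsigned int fixed_hdr = 24;	/* fixed part of the header, per the comment */
	unsigned int min_inline = 1024;	/* smallest inline threshold */

	printf("chunk lists: %u bytes\n", list_bytes);		/* 244 */
	printf("left for RPC: %u bytes\n",
	       min_inline - list_bytes - fixed_hdr);		/* 756 >= 750 */
	return 0;
}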
@@ -198,14 +221,13 @@ enum rpcrdma_frmr_state {
};
struct rpcrdma_frmr {
- struct scatterlist *sg;
- int sg_nents;
+ struct scatterlist *fr_sg;
+ int fr_nents;
+ enum dma_data_direction fr_dir;
struct ib_mr *fr_mr;
struct ib_cqe fr_cqe;
enum rpcrdma_frmr_state fr_state;
struct completion fr_linv_done;
- struct work_struct fr_work;
- struct rpcrdma_xprt *fr_xprt;
union {
struct ib_reg_wr fr_regwr;
struct ib_send_wr fr_invwr;
@@ -222,6 +244,8 @@ struct rpcrdma_mw {
struct rpcrdma_fmr fmr;
struct rpcrdma_frmr frmr;
};
+ struct work_struct mw_work;
+ struct rpcrdma_xprt *mw_xprt;
struct list_head mw_list;
struct list_head mw_all;
};
@@ -270,12 +294,14 @@ struct rpcrdma_req {
unsigned int rl_niovs;
unsigned int rl_nchunks;
unsigned int rl_connect_cookie;
+ struct rpc_task *rl_task;
struct rpcrdma_buffer *rl_buffer;
struct rpcrdma_rep *rl_reply;/* holder for reply buffer */
struct ib_sge rl_send_iov[RPCRDMA_MAX_IOVS];
struct rpcrdma_regbuf *rl_rdmabuf;
struct rpcrdma_regbuf *rl_sendbuf;
struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS];
+ struct rpcrdma_mr_seg *rl_nextseg;
struct ib_cqe rl_cqe;
struct list_head rl_all;
@@ -372,8 +398,8 @@ struct rpcrdma_memreg_ops {
struct rpcrdma_mr_seg *, int, bool);
void (*ro_unmap_sync)(struct rpcrdma_xprt *,
struct rpcrdma_req *);
- int (*ro_unmap)(struct rpcrdma_xprt *,
- struct rpcrdma_mr_seg *);
+ void (*ro_unmap_safe)(struct rpcrdma_xprt *,
+ struct rpcrdma_req *, bool);
int (*ro_open)(struct rpcrdma_ia *,
struct rpcrdma_ep *,
struct rpcrdma_create_data_internal *);
@@ -456,7 +482,6 @@ struct rpcrdma_regbuf *rpcrdma_alloc_regbuf(struct rpcrdma_ia *,
void rpcrdma_free_regbuf(struct rpcrdma_ia *,
struct rpcrdma_regbuf *);
-unsigned int rpcrdma_max_segments(struct rpcrdma_xprt *);
int rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *, unsigned int);
int frwr_alloc_recovery_wq(void);
@@ -519,6 +544,9 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *);
* RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c
*/
int rpcrdma_marshal_req(struct rpc_rqst *);
+void rpcrdma_set_max_header_sizes(struct rpcrdma_ia *,
+ struct rpcrdma_create_data_internal *,
+ unsigned int);
/* RPC/RDMA module init - xprtrdma/transport.c
*/
@@ -534,6 +562,7 @@ void xprt_rdma_cleanup(void);
#if defined(CONFIG_SUNRPC_BACKCHANNEL)
int xprt_rdma_bc_setup(struct rpc_xprt *, unsigned int);
int xprt_rdma_bc_up(struct svc_serv *, struct net *);
+size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *);
int rpcrdma_bc_post_recv(struct rpcrdma_xprt *, unsigned int);
void rpcrdma_bc_receive_call(struct rpcrdma_xprt *, struct rpcrdma_rep *);
int rpcrdma_bc_marshal_reply(struct rpc_rqst *);
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index b90c5397b5e1..2d3e0c42361e 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -1364,6 +1364,11 @@ static int xs_tcp_bc_up(struct svc_serv *serv, struct net *net)
return ret;
return 0;
}
+
+static size_t xs_tcp_bc_maxpayload(struct rpc_xprt *xprt)
+{
+ return PAGE_SIZE;
+}
#else
static inline int _xs_tcp_read_data(struct rpc_xprt *xprt,
struct xdr_skb_reader *desc)
@@ -2661,6 +2666,7 @@ static struct rpc_xprt_ops xs_tcp_ops = {
#ifdef CONFIG_SUNRPC_BACKCHANNEL
.bc_setup = xprt_setup_bc,
.bc_up = xs_tcp_bc_up,
+ .bc_maxpayload = xs_tcp_bc_maxpayload,
.bc_free_rqst = xprt_free_bc_rqst,
.bc_destroy = xprt_destroy_bc,
#endif
diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c
index 6f11c62bc8f9..bf8f05c3eb82 100644
--- a/net/tipc/bearer.c
+++ b/net/tipc/bearer.c
@@ -405,7 +405,7 @@ int tipc_l2_send_msg(struct net *net, struct sk_buff *skb,
return 0;
/* Send RESET message even if bearer is detached from device */
- tipc_ptr = rtnl_dereference(dev->tipc_ptr);
+ tipc_ptr = rcu_dereference_rtnl(dev->tipc_ptr);
if (unlikely(!tipc_ptr && !msg_is_reset(buf_msg(skb))))
goto drop;
diff --git a/net/tipc/link.c b/net/tipc/link.c
index 7059c94f33c5..67b6ab9f4c8d 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -704,7 +704,8 @@ static void link_profile_stats(struct tipc_link *l)
*/
int tipc_link_timeout(struct tipc_link *l, struct sk_buff_head *xmitq)
{
- int mtyp, rc = 0;
+ int mtyp = 0;
+ int rc = 0;
bool state = false;
bool probe = false;
bool setup = false;
diff --git a/net/tipc/msg.c b/net/tipc/msg.c
index 8740930f0787..17201aa8423d 100644
--- a/net/tipc/msg.c
+++ b/net/tipc/msg.c
@@ -41,6 +41,8 @@
#include "name_table.h"
#define MAX_FORWARD_SIZE 1024
+#define BUF_HEADROOM (LL_MAX_HEADER + 48)
+#define BUF_TAILROOM 16
static unsigned int align(unsigned int i)
{
@@ -505,6 +507,10 @@ bool tipc_msg_reverse(u32 own_node, struct sk_buff **skb, int err)
msg_set_hdr_sz(hdr, BASIC_H_SIZE);
}
+ if (skb_cloned(_skb) &&
+ pskb_expand_head(_skb, BUF_HEADROOM, BUF_TAILROOM, GFP_KERNEL))
+ goto exit;
+
/* Now reverse the concerned fields */
msg_set_errcode(hdr, err);
msg_set_origport(hdr, msg_destport(&ohdr));
diff --git a/net/tipc/msg.h b/net/tipc/msg.h
index 024da8af91f0..7cf52fb39bee 100644
--- a/net/tipc/msg.h
+++ b/net/tipc/msg.h
@@ -94,17 +94,6 @@ struct plist;
#define TIPC_MEDIA_INFO_OFFSET 5
-/**
- * TIPC message buffer code
- *
- * TIPC message buffer headroom reserves space for the worst-case
- * link-level device header (in case the message is sent off-node).
- *
- * Note: Headroom should be a multiple of 4 to ensure the TIPC header fields
- * are word aligned for quicker access
- */
-#define BUF_HEADROOM (LL_MAX_HEADER + 48)
-
struct tipc_skb_cb {
void *handle;
struct sk_buff *tail;
diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c
index f795b1dd0ccd..3ad9fab1985f 100644
--- a/net/tipc/netlink_compat.c
+++ b/net/tipc/netlink_compat.c
@@ -604,7 +604,8 @@ static int tipc_nl_compat_link_dump(struct tipc_nl_compat_msg *msg,
link_info.dest = nla_get_flag(link[TIPC_NLA_LINK_DEST]);
link_info.up = htonl(nla_get_flag(link[TIPC_NLA_LINK_UP]));
- strcpy(link_info.str, nla_data(link[TIPC_NLA_LINK_NAME]));
+ nla_strlcpy(link_info.str, nla_data(link[TIPC_NLA_LINK_NAME]),
+ TIPC_MAX_LINK_NAME);
return tipc_add_tlv(msg->rep, TIPC_TLV_LINK_INFO,
&link_info, sizeof(link_info));
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 88bfcd707064..c49b8df438cb 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -796,9 +796,11 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq,
* @tsk: receiving socket
* @skb: pointer to message buffer.
* @xmitq: queue for messages to be transmitted after the receive lock is released
*/
-static void tipc_sk_proto_rcv(struct tipc_sock *tsk, struct sk_buff *skb)
+static void tipc_sk_proto_rcv(struct tipc_sock *tsk, struct sk_buff *skb,
+ struct sk_buff_head *xmitq)
{
struct sock *sk = &tsk->sk;
+ u32 onode = tsk_own_node(tsk);
struct tipc_msg *hdr = buf_msg(skb);
int mtyp = msg_type(hdr);
bool conn_cong;
@@ -811,7 +813,8 @@ static void tipc_sk_proto_rcv(struct tipc_sock *tsk, struct sk_buff *skb)
if (mtyp == CONN_PROBE) {
msg_set_type(hdr, CONN_PROBE_REPLY);
- tipc_sk_respond(sk, skb, TIPC_OK);
+ if (tipc_msg_reverse(onode, &skb, TIPC_OK))
+ __skb_queue_tail(xmitq, skb);
return;
} else if (mtyp == CONN_ACK) {
conn_cong = tsk_conn_cong(tsk);
@@ -1686,7 +1689,8 @@ static unsigned int rcvbuf_limit(struct sock *sk, struct sk_buff *skb)
*
* Returns true if message was added to socket receive queue, otherwise false
*/
-static bool filter_rcv(struct sock *sk, struct sk_buff *skb)
+static bool filter_rcv(struct sock *sk, struct sk_buff *skb,
+ struct sk_buff_head *xmitq)
{
struct socket *sock = sk->sk_socket;
struct tipc_sock *tsk = tipc_sk(sk);
@@ -1696,7 +1700,7 @@ static bool filter_rcv(struct sock *sk, struct sk_buff *skb)
int usr = msg_user(hdr);
if (unlikely(msg_user(hdr) == CONN_MANAGER)) {
- tipc_sk_proto_rcv(tsk, skb);
+ tipc_sk_proto_rcv(tsk, skb, xmitq);
return false;
}
@@ -1739,7 +1743,8 @@ static bool filter_rcv(struct sock *sk, struct sk_buff *skb)
return true;
reject:
- tipc_sk_respond(sk, skb, err);
+ if (tipc_msg_reverse(tsk_own_node(tsk), &skb, err))
+ __skb_queue_tail(xmitq, skb);
return false;
}
@@ -1755,9 +1760,24 @@ reject:
static int tipc_backlog_rcv(struct sock *sk, struct sk_buff *skb)
{
unsigned int truesize = skb->truesize;
+ struct sk_buff_head xmitq;
+ u32 dnode, selector;
- if (likely(filter_rcv(sk, skb)))
+ __skb_queue_head_init(&xmitq);
+
+ if (likely(filter_rcv(sk, skb, &xmitq))) {
atomic_add(truesize, &tipc_sk(sk)->dupl_rcvcnt);
+ return 0;
+ }
+
+ if (skb_queue_empty(&xmitq))
+ return 0;
+
+ /* Send response/rejected message */
+ skb = __skb_dequeue(&xmitq);
+ dnode = msg_destnode(buf_msg(skb));
+ selector = msg_origport(buf_msg(skb));
+ tipc_node_xmit_skb(sock_net(sk), skb, dnode, selector);
return 0;
}
@@ -1771,12 +1791,13 @@ static int tipc_backlog_rcv(struct sock *sk, struct sk_buff *skb)
* Caller must hold socket lock
*/
static void tipc_sk_enqueue(struct sk_buff_head *inputq, struct sock *sk,
- u32 dport)
+ u32 dport, struct sk_buff_head *xmitq)
{
+ unsigned long time_limit = jiffies + 2;
+ struct sk_buff *skb;
unsigned int lim;
atomic_t *dcnt;
- struct sk_buff *skb;
- unsigned long time_limit = jiffies + 2;
+ u32 onode;
while (skb_queue_len(inputq)) {
if (unlikely(time_after_eq(jiffies, time_limit)))
@@ -1788,7 +1809,7 @@ static void tipc_sk_enqueue(struct sk_buff_head *inputq, struct sock *sk,
/* Add message directly to receive queue if possible */
if (!sock_owned_by_user(sk)) {
- filter_rcv(sk, skb);
+ filter_rcv(sk, skb, xmitq);
continue;
}
@@ -1801,7 +1822,9 @@ static void tipc_sk_enqueue(struct sk_buff_head *inputq, struct sock *sk,
continue;
/* Overload => reject message back to sender */
- tipc_sk_respond(sk, skb, TIPC_ERR_OVERLOAD);
+ onode = tipc_own_addr(sock_net(sk));
+ if (tipc_msg_reverse(onode, &skb, TIPC_ERR_OVERLOAD))
+ __skb_queue_tail(xmitq, skb);
break;
}
}
@@ -1814,12 +1837,14 @@ static void tipc_sk_enqueue(struct sk_buff_head *inputq, struct sock *sk,
*/
void tipc_sk_rcv(struct net *net, struct sk_buff_head *inputq)
{
+ struct sk_buff_head xmitq;
u32 dnode, dport = 0;
int err;
struct tipc_sock *tsk;
struct sock *sk;
struct sk_buff *skb;
+ __skb_queue_head_init(&xmitq);
while (skb_queue_len(inputq)) {
dport = tipc_skb_peek_port(inputq, dport);
tsk = tipc_sk_lookup(net, dport);
@@ -1827,9 +1852,14 @@ void tipc_sk_rcv(struct net *net, struct sk_buff_head *inputq)
if (likely(tsk)) {
sk = &tsk->sk;
if (likely(spin_trylock_bh(&sk->sk_lock.slock))) {
- tipc_sk_enqueue(inputq, sk, dport);
+ tipc_sk_enqueue(inputq, sk, dport, &xmitq);
spin_unlock_bh(&sk->sk_lock.slock);
}
+ /* Send pending response/rejected messages, if any */
+ while ((skb = __skb_dequeue(&xmitq))) {
+ dnode = msg_destnode(buf_msg(skb));
+ tipc_node_xmit_skb(net, skb, dnode, dport);
+ }
sock_put(sk);
continue;
}
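The TIPC socket changes above stop transmitting responses from inside the locked receive path: filter_rcv() and tipc_sk_proto_rcv() now queue any reverse-direction messages on a caller-provided xmitq, and the callers drain that queue only after the per-socket receive processing (and, in tipc_sk_rcv(), the spinlock) is done. Here is a small pthread-based sketch of that defer-until-unlock pattern; the reply type and process_one() are hypothetical placeholders.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t sk_lock = PTHREAD_MUTEX_INITIALIZER;

/* Hypothetical reply record produced while the lock is held. */
struct reply {
	int code;
};

/* Process one input under the lock; queue any reply instead of sending. */
static int process_one(int input, struct reply *xmitq, int *nq)
{
	if (input < 0) {
		/* The old code would have transmitted a rejection here. */
		xmitq[(*nq)++] = (struct reply){ .code = -input };
		return 0;
	}
	return input * 2;	/* stand-in for normal receive handling */
}

static void transmit(const struct reply *r)
{
	printf("sending deferred reply, code %d\n", r->code);
}

int main(void)
{
	int inputs[] = { 3, -7, 5, -1 };
	struct reply xmitq[4];
	int nq = 0;

	pthread_mutex_lock(&sk_lock);
	for (unsigned int i = 0; i < sizeof(inputs) / sizeof(inputs[0]); i++)
		process_one(inputs[i], xmitq, &nq);
	pthread_mutex_unlock(&sk_lock);

	/* Only now, with the lock released, send what was queued. */
	for (int i = 0; i < nq; i++)
		transmit(&xmitq[i]);
	return 0;
}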
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index b5f1221f48d4..b96ac918e0ba 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -61,6 +61,14 @@
* function will also clean up rejected sockets, those that reach the connected
* state but leave it before they have been accepted.
*
+ * - Lock ordering for pending or accept queue sockets is:
+ *
+ * lock_sock(listener);
+ * lock_sock_nested(pending, SINGLE_DEPTH_NESTING);
+ *
+ * Using explicit nested locking keeps lockdep happy since normally only one
+ * lock of a given class may be taken at a time.
+ *
* - Sockets created by user action will be cleaned up when the user process
* calls close(2), causing our release implementation to be called. Our release
* implementation will perform some cleanup then drop the last reference so our
@@ -443,7 +451,7 @@ void vsock_pending_work(struct work_struct *work)
cleanup = true;
lock_sock(listener);
- lock_sock(sk);
+ lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
if (vsock_is_pending(sk)) {
vsock_remove_pending(listener, sk);
@@ -1292,7 +1300,7 @@ static int vsock_accept(struct socket *sock, struct socket *newsock, int flags)
if (connected) {
listener->sk_ack_backlog--;
- lock_sock(connected);
+ lock_sock_nested(connected, SINGLE_DEPTH_NESTING);
vconnected = vsock_sk(connected);
/* If the listener socket has received an error, then we should