From 146a99aefe4a45f68ed371709103cd1dbba6128c Mon Sep 17 00:00:00 2001 From: Tanzir Hasan Date: Tue, 2 Jan 2024 22:22:18 +0000 Subject: xprtrdma: removed asm-generic headers from verbs.c asm-generic/barrier.h and asm/bitops.h are already brought into the file through the header linux/sunrpc/svc_rdma.h and the file can still be built with their removal. They have been replaced with the preferred linux/bitops.h and asm/barrier.h to remove the need for the asm-generic header. Suggested-by: Al Viro Signed-off-by: Tanzir Hasan Acked-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/verbs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 432557a553e7..e6e660647137 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -49,14 +49,14 @@ * o buffer memory */ +#include #include #include #include #include #include -#include -#include +#include #include -- cgit v1.2.3 From acd9f2dd23c632568156217aac7a05f5a0313152 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 4 Jun 2024 15:45:23 -0400 Subject: xprtrdma: Fix rpcrdma_reqs_reset() Avoid FastReg operations getting MW_BIND_ERR after a reconnect. rpcrdma_reqs_reset() is called on transport tear-down to get each rpcrdma_req back into a clean state. MRs on req->rl_registered are waiting for a FastReg, are already registered, or are waiting for invalidation. If the transport is being torn down when reqs_reset() is called, the matching LocalInv might never be posted. That leaves these MR registered /and/ on req->rl_free_mrs, where they can be re-used for the next connection. Since xprtrdma does not keep specific track of the MR state, it's not possible to know what state these MRs are in, so the only safe thing to do is release them immediately. Fixes: 5de55ce951a1 ("xprtrdma: Release in-flight MRs on disconnect") Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/frwr_ops.c | 3 ++- net/sunrpc/xprtrdma/verbs.c | 16 +++++++++++++++- 2 files changed, 17 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index ffbf99894970..47f33bb7bff8 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -92,7 +92,8 @@ static void frwr_mr_put(struct rpcrdma_mr *mr) rpcrdma_mr_push(mr, &mr->mr_req->rl_free_mrs); } -/* frwr_reset - Place MRs back on the free list +/** + * frwr_reset - Place MRs back on @req's free list * @req: request to reset * * Used after a failed marshal. For FRWR, this means the MRs diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index e6e660647137..39319539d8af 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -897,6 +897,8 @@ static int rpcrdma_reqs_setup(struct rpcrdma_xprt *r_xprt) static void rpcrdma_req_reset(struct rpcrdma_req *req) { + struct rpcrdma_mr *mr; + /* Credits are valid for only one connection */ req->rl_slot.rq_cong = 0; @@ -906,7 +908,19 @@ static void rpcrdma_req_reset(struct rpcrdma_req *req) rpcrdma_regbuf_dma_unmap(req->rl_sendbuf); rpcrdma_regbuf_dma_unmap(req->rl_recvbuf); - frwr_reset(req); + /* The verbs consumer can't know the state of an MR on the + * req->rl_registered list unless a successful completion + * has occurred, so they cannot be re-used. 
+ */ + while ((mr = rpcrdma_mr_pop(&req->rl_registered))) { + struct rpcrdma_buffer *buf = &mr->mr_xprt->rx_buf; + + spin_lock(&buf->rb_lock); + list_del(&mr->mr_all); + spin_unlock(&buf->rb_lock); + + frwr_mr_release(mr); + } } /* ASSUMPTION: the rb_allreqs list is stable for the duration, -- cgit v1.2.3 From 7e86845a0346efc95fddaa97ce5cd6a8bda8c71c Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 4 Jun 2024 15:45:24 -0400 Subject: rpcrdma: Implement generic device removal Commit e87a911fed07 ("nvme-rdma: use ib_client API to detect device removal") explains the benefits of handling device removal outside of the CM event handler. Sketch in an IB device removal notification mechanism that can be used by both the client and server side RPC-over-RDMA transport implementations. Suggested-by: Sagi Grimberg Signed-off-by: Chuck Lever Reviewed-by: Sagi Grimberg Signed-off-by: Anna Schumaker --- include/linux/sunrpc/rdma_rn.h | 27 ++++++ include/trace/events/rpcrdma.h | 34 ++++++++ net/sunrpc/xprtrdma/Makefile | 2 +- net/sunrpc/xprtrdma/ib_client.c | 181 ++++++++++++++++++++++++++++++++++++++++ net/sunrpc/xprtrdma/module.c | 18 +++- 5 files changed, 258 insertions(+), 4 deletions(-) create mode 100644 include/linux/sunrpc/rdma_rn.h create mode 100644 net/sunrpc/xprtrdma/ib_client.c (limited to 'net') diff --git a/include/linux/sunrpc/rdma_rn.h b/include/linux/sunrpc/rdma_rn.h new file mode 100644 index 000000000000..7d032ca057af --- /dev/null +++ b/include/linux/sunrpc/rdma_rn.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * * Copyright (c) 2024, Oracle and/or its affiliates. + */ + +#ifndef _LINUX_SUNRPC_RDMA_RN_H +#define _LINUX_SUNRPC_RDMA_RN_H + +#include + +/** + * rpcrdma_notification - request removal notification + */ +struct rpcrdma_notification { + void (*rn_done)(struct rpcrdma_notification *rn); + u32 rn_index; +}; + +int rpcrdma_rn_register(struct ib_device *device, + struct rpcrdma_notification *rn, + void (*done)(struct rpcrdma_notification *rn)); +void rpcrdma_rn_unregister(struct ib_device *device, + struct rpcrdma_notification *rn); +int rpcrdma_ib_client_register(void); +void rpcrdma_ib_client_unregister(void); + +#endif /* _LINUX_SUNRPC_RDMA_RN_H */ diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index 14392652273a..ecdaf088219d 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -2220,6 +2220,40 @@ TRACE_EVENT(svcrdma_sq_post_err, ) ); +DECLARE_EVENT_CLASS(rpcrdma_client_device_class, + TP_PROTO( + const struct ib_device *device + ), + + TP_ARGS(device), + + TP_STRUCT__entry( + __string(name, device->name) + ), + + TP_fast_assign( + __assign_str(name); + ), + + TP_printk("device=%s", + __get_str(name) + ) +); + +#define DEFINE_CLIENT_DEVICE_EVENT(name) \ + DEFINE_EVENT(rpcrdma_client_device_class, name, \ + TP_PROTO( \ + const struct ib_device *device \ + ), \ + TP_ARGS(device) \ + ) + +DEFINE_CLIENT_DEVICE_EVENT(rpcrdma_client_completion); +DEFINE_CLIENT_DEVICE_EVENT(rpcrdma_client_add_one); +DEFINE_CLIENT_DEVICE_EVENT(rpcrdma_client_remove_one); +DEFINE_CLIENT_DEVICE_EVENT(rpcrdma_client_wait_on); +DEFINE_CLIENT_DEVICE_EVENT(rpcrdma_client_remove_one_done); + #endif /* _TRACE_RPCRDMA_H */ #include diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile index 55b21bae866d..3232aa23cdb4 100644 --- a/net/sunrpc/xprtrdma/Makefile +++ b/net/sunrpc/xprtrdma/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_SUNRPC_XPRT_RDMA) += rpcrdma.o -rpcrdma-y := 
transport.o rpc_rdma.o verbs.o frwr_ops.o \ +rpcrdma-y := transport.o rpc_rdma.o verbs.o frwr_ops.o ib_client.o \ svc_rdma.o svc_rdma_backchannel.o svc_rdma_transport.o \ svc_rdma_sendto.o svc_rdma_recvfrom.o svc_rdma_rw.o \ svc_rdma_pcl.o module.o diff --git a/net/sunrpc/xprtrdma/ib_client.c b/net/sunrpc/xprtrdma/ib_client.c new file mode 100644 index 000000000000..a938c19c3490 --- /dev/null +++ b/net/sunrpc/xprtrdma/ib_client.c @@ -0,0 +1,181 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause +/* + * Copyright (c) 2024 Oracle. All rights reserved. + */ + +/* #include +#include */ +#include +#include +#include +#include + +#include +#include + +#include "xprt_rdma.h" +#include + +/* Per-ib_device private data for rpcrdma */ +struct rpcrdma_device { + struct kref rd_kref; + unsigned long rd_flags; + struct ib_device *rd_device; + struct xarray rd_xa; + struct completion rd_done; +}; + +#define RPCRDMA_RD_F_REMOVING (0) + +static struct ib_client rpcrdma_ib_client; + +/* + * Listeners have no associated device, so we never register them. + * Note that ib_get_client_data() does not check if @device is + * NULL for us. + */ +static struct rpcrdma_device *rpcrdma_get_client_data(struct ib_device *device) +{ + if (!device) + return NULL; + return ib_get_client_data(device, &rpcrdma_ib_client); +} + +/** + * rpcrdma_rn_register - register to get device removal notifications + * @device: device to monitor + * @rn: notification object that wishes to be notified + * @done: callback to notify caller of device removal + * + * Returns zero on success. The callback in rn_done is guaranteed + * to be invoked when the device is removed, unless this notification + * is unregistered first. + * + * On failure, a negative errno is returned. + */ +int rpcrdma_rn_register(struct ib_device *device, + struct rpcrdma_notification *rn, + void (*done)(struct rpcrdma_notification *rn)) +{ + struct rpcrdma_device *rd = rpcrdma_get_client_data(device); + + if (!rd || test_bit(RPCRDMA_RD_F_REMOVING, &rd->rd_flags)) + return -ENETUNREACH; + + kref_get(&rd->rd_kref); + if (xa_alloc(&rd->rd_xa, &rn->rn_index, rn, xa_limit_32b, GFP_KERNEL) < 0) + return -ENOMEM; + rn->rn_done = done; + return 0; +} + +static void rpcrdma_rn_release(struct kref *kref) +{ + struct rpcrdma_device *rd = container_of(kref, struct rpcrdma_device, + rd_kref); + + trace_rpcrdma_client_completion(rd->rd_device); + complete(&rd->rd_done); +} + +/** + * rpcrdma_rn_unregister - stop device removal notifications + * @device: monitored device + * @rn: notification object that no longer wishes to be notified + */ +void rpcrdma_rn_unregister(struct ib_device *device, + struct rpcrdma_notification *rn) +{ + struct rpcrdma_device *rd = rpcrdma_get_client_data(device); + + if (!rd) + return; + + xa_erase(&rd->rd_xa, rn->rn_index); + kref_put(&rd->rd_kref, rpcrdma_rn_release); +} + +/** + * rpcrdma_add_one - ib_client device insertion callback + * @device: device about to be inserted + * + * Returns zero on success. xprtrdma private data has been allocated + * for this device. On failure, a negative errno is returned. 
+ */ +static int rpcrdma_add_one(struct ib_device *device) +{ + struct rpcrdma_device *rd; + + rd = kzalloc(sizeof(*rd), GFP_KERNEL); + if (!rd) + return -ENOMEM; + + kref_init(&rd->rd_kref); + xa_init_flags(&rd->rd_xa, XA_FLAGS_ALLOC1); + rd->rd_device = device; + init_completion(&rd->rd_done); + ib_set_client_data(device, &rpcrdma_ib_client, rd); + + trace_rpcrdma_client_add_one(device); + return 0; +} + +/** + * rpcrdma_remove_one - ib_client device removal callback + * @device: device about to be removed + * @client_data: this module's private per-device data + * + * Upon return, all transports associated with @device have divested + * themselves from IB hardware resources. + */ +static void rpcrdma_remove_one(struct ib_device *device, + void *client_data) +{ + struct rpcrdma_device *rd = client_data; + struct rpcrdma_notification *rn; + unsigned long index; + + trace_rpcrdma_client_remove_one(device); + + set_bit(RPCRDMA_RD_F_REMOVING, &rd->rd_flags); + xa_for_each(&rd->rd_xa, index, rn) + rn->rn_done(rn); + + /* + * Wait only if there are still outstanding notification + * registrants for this device. + */ + if (!refcount_dec_and_test(&rd->rd_kref.refcount)) { + trace_rpcrdma_client_wait_on(device); + wait_for_completion(&rd->rd_done); + } + + trace_rpcrdma_client_remove_one_done(device); + kfree(rd); +} + +static struct ib_client rpcrdma_ib_client = { + .name = "rpcrdma", + .add = rpcrdma_add_one, + .remove = rpcrdma_remove_one, +}; + +/** + * rpcrdma_ib_client_unregister - unregister ib_client for xprtrdma + * + * cel: watch for orphaned rpcrdma_device objects on module unload + */ +void rpcrdma_ib_client_unregister(void) +{ + ib_unregister_client(&rpcrdma_ib_client); +} + +/** + * rpcrdma_ib_client_register - register ib_client for rpcrdma + * + * Returns zero on success, or a negative errno. + */ +int rpcrdma_ib_client_register(void) +{ + return ib_register_client(&rpcrdma_ib_client); +} diff --git a/net/sunrpc/xprtrdma/module.c b/net/sunrpc/xprtrdma/module.c index 45c5b41ac8dc..697f571d4c01 100644 --- a/net/sunrpc/xprtrdma/module.c +++ b/net/sunrpc/xprtrdma/module.c @@ -11,6 +11,7 @@ #include #include #include +#include #include @@ -30,21 +31,32 @@ static void __exit rpc_rdma_cleanup(void) { xprt_rdma_cleanup(); svc_rdma_cleanup(); + rpcrdma_ib_client_unregister(); } static int __init rpc_rdma_init(void) { int rc; + rc = rpcrdma_ib_client_register(); + if (rc) + goto out_rc; + rc = svc_rdma_init(); if (rc) - goto out; + goto out_ib_client; rc = xprt_rdma_init(); if (rc) - svc_rdma_cleanup(); + goto out_svc_rdma; -out: + return 0; + +out_svc_rdma: + svc_rdma_cleanup(); +out_ib_client: + rpcrdma_ib_client_unregister(); +out_rc: return rc; } -- cgit v1.2.3 From 3f4eb9ff923413cdb4c7e171c06d3564f6286712 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 4 Jun 2024 15:45:25 -0400 Subject: xprtrdma: Handle device removal outside of the CM event handler Wait for all disconnects to complete to ensure the transport has divested all of its hardware resources before the underlying RDMA device can be removed. 
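As a rough illustration (not part of this patch), a consumer of the rdma_rn.h
interface added by the previous patch is wired up along these lines; every
name below other than struct rpcrdma_notification, rpcrdma_rn_register() and
rpcrdma_rn_unregister() is made up for the sketch:

	#include <linux/kernel.h>
	#include <linux/bitops.h>
	#include <rdma/rdma_cm.h>
	#include <linux/sunrpc/rdma_rn.h>

	struct my_conn {
		struct rdma_cm_id		*mc_id;
		struct rpcrdma_notification	 mc_rn;
		unsigned long			 mc_flags;
	};
	#define MY_CONN_CLOSING	0

	/* Invoked by rpcrdma_remove_one() when the ib_device goes away. */
	static void my_conn_removal_done(struct rpcrdma_notification *rn)
	{
		struct my_conn *conn = container_of(rn, struct my_conn, mc_rn);

		/* Mark the connection for teardown; the disconnect path
		 * releases HW resources and unregisters the notification. */
		set_bit(MY_CONN_CLOSING, &conn->mc_flags);
	}

	static int my_conn_setup(struct my_conn *conn)
	{
		return rpcrdma_rn_register(conn->mc_id->device, &conn->mc_rn,
					   my_conn_removal_done);
	}

	static void my_conn_teardown(struct my_conn *conn)
	{
		rpcrdma_rn_unregister(conn->mc_id->device, &conn->mc_rn);
	}

In this patch the real consumer is the rpcrdma_ep: registration happens in
rpcrdma_create_id(), and the done callback simply forces a transport
disconnect, as shown in the hunks below.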
Signed-off-by: Chuck Lever Reviewed-by: Sagi Grimberg Signed-off-by: Anna Schumaker --- include/trace/events/rpcrdma.h | 23 +++++++++++++++++++++++ net/sunrpc/xprtrdma/verbs.c | 23 ++++++++++++++--------- net/sunrpc/xprtrdma/xprt_rdma.h | 2 ++ 3 files changed, 39 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index ecdaf088219d..ba2d6a0e41cc 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -669,6 +669,29 @@ TRACE_EVENT(xprtrdma_inline_thresh, DEFINE_CONN_EVENT(connect); DEFINE_CONN_EVENT(disconnect); +TRACE_EVENT(xprtrdma_device_removal, + TP_PROTO( + const struct rdma_cm_id *id + ), + + TP_ARGS(id), + + TP_STRUCT__entry( + __string(name, id->device->name) + __array(unsigned char, addr, sizeof(struct sockaddr_in6)) + ), + + TP_fast_assign( + __assign_str(name); + memcpy(__entry->addr, &id->route.addr.dst_addr, + sizeof(struct sockaddr_in6)); + ), + + TP_printk("device %s to be removed, disconnecting %pISpc\n", + __get_str(name), __entry->addr + ) +); + DEFINE_RXPRT_EVENT(xprtrdma_op_inject_dsc); TRACE_EVENT(xprtrdma_op_connect, diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 39319539d8af..2904454adb06 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -222,7 +222,6 @@ static void rpcrdma_update_cm_private(struct rpcrdma_ep *ep, static int rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) { - struct sockaddr *sap = (struct sockaddr *)&id->route.addr.dst_addr; struct rpcrdma_ep *ep = id->context; might_sleep(); @@ -241,14 +240,6 @@ rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) ep->re_async_rc = -ENETUNREACH; complete(&ep->re_done); return 0; - case RDMA_CM_EVENT_DEVICE_REMOVAL: - pr_info("rpcrdma: removing device %s for %pISpc\n", - ep->re_id->device->name, sap); - switch (xchg(&ep->re_connect_status, -ENODEV)) { - case 0: goto wake_connect_worker; - case 1: goto disconnected; - } - return 0; case RDMA_CM_EVENT_ADDR_CHANGE: ep->re_connect_status = -ENODEV; goto disconnected; @@ -284,6 +275,14 @@ disconnected: return 0; } +static void rpcrdma_ep_removal_done(struct rpcrdma_notification *rn) +{ + struct rpcrdma_ep *ep = container_of(rn, struct rpcrdma_ep, re_rn); + + trace_xprtrdma_device_removal(ep->re_id); + xprt_force_disconnect(ep->re_xprt); +} + static struct rdma_cm_id *rpcrdma_create_id(struct rpcrdma_xprt *r_xprt, struct rpcrdma_ep *ep) { @@ -323,6 +322,10 @@ static struct rdma_cm_id *rpcrdma_create_id(struct rpcrdma_xprt *r_xprt, if (rc) goto out; + rc = rpcrdma_rn_register(id->device, &ep->re_rn, rpcrdma_ep_removal_done); + if (rc) + goto out; + return id; out: @@ -350,6 +353,8 @@ static void rpcrdma_ep_destroy(struct kref *kref) ib_dealloc_pd(ep->re_pd); ep->re_pd = NULL; + rpcrdma_rn_unregister(ep->re_id->device, &ep->re_rn); + kfree(ep); module_put(THIS_MODULE); } diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index da409450dfc0..341725c66ec8 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -56,6 +56,7 @@ #include /* completion IDs */ #include /* RPC/RDMA protocol */ #include /* xprt parameters */ +#include /* removal notifications */ #define RDMA_RESOLVE_TIMEOUT (5000) /* 5 seconds */ #define RDMA_CONNECT_RETRY_MAX (2) /* retries if no listener backlog */ @@ -92,6 +93,7 @@ struct rpcrdma_ep { struct rpcrdma_connect_private re_cm_private; struct rdma_conn_param re_remote_cma; + struct 
rpcrdma_notification re_rn; int re_receive_count; unsigned int re_max_requests; /* depends on device */ unsigned int re_inline_send; /* negotiated */ -- cgit v1.2.3 From 9d53378c2c145991fb5bc986c82eb9030212c2ea Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 4 Jun 2024 15:45:26 -0400 Subject: xprtrdma: Clean up synopsis of frwr_mr_unmap() Commit 7a03aeb66c41 ("xprtrdma: Micro-optimize MR DMA-unmapping") removed the last use of the @r_xprt parameter in this function, but neglected to remove the parameter itself. Signed-off-by: Chuck Lever Reviewed-by: Sagi Grimberg Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/frwr_ops.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 47f33bb7bff8..31434aeb8e29 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -54,7 +54,7 @@ static void frwr_cid_init(struct rpcrdma_ep *ep, cid->ci_completion_id = mr->mr_ibmr->res.id; } -static void frwr_mr_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr) +static void frwr_mr_unmap(struct rpcrdma_mr *mr) { if (mr->mr_device) { trace_xprtrdma_mr_unmap(mr); @@ -73,7 +73,7 @@ void frwr_mr_release(struct rpcrdma_mr *mr) { int rc; - frwr_mr_unmap(mr->mr_xprt, mr); + frwr_mr_unmap(mr); rc = ib_dereg_mr(mr->mr_ibmr); if (rc) @@ -84,7 +84,7 @@ void frwr_mr_release(struct rpcrdma_mr *mr) static void frwr_mr_put(struct rpcrdma_mr *mr) { - frwr_mr_unmap(mr->mr_xprt, mr); + frwr_mr_unmap(mr); /* The MR is returned to the req's MR free list instead * of to the xprt's MR free list. No spinlock is needed. -- cgit v1.2.3 From 0e13dd9ea8be46f980a46c3ffd8cb786f3e2fb5b Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 4 Jun 2024 15:45:27 -0400 Subject: xprtrdma: Remove temp allocation of rpcrdma_rep objects The original code was designed so that most calls to rpcrdma_rep_create() would occur on the NUMA node that the device preferred. There are a few cases where that's not possible, so those reps are marked as temporary. However, we have the device (and its preferred node) already in rpcrdma_rep_create(), so let's use that to guarantee the memory is allocated from the correct node. 
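As a rough sketch of the idea, assuming the stock kmalloc_node() and
ibdev_to_node() helpers (the actual hunk below threads a node argument through
rpcrdma_regbuf_alloc_node() instead):

	#include <linux/slab.h>
	#include <rdma/ib_verbs.h>

	/* Allocate receive-buffer memory on the NUMA node closest to the
	 * RDMA device, rather than marking off-node buffers as temporary
	 * and destroying them after a single use. */
	static void *rep_buf_alloc(struct ib_device *device, size_t size,
				   gfp_t flags)
	{
		return kmalloc_node(size, flags, ibdev_to_node(device));
	}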
Signed-off-by: Chuck Lever Reviewed-by: Sagi Grimberg Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/rpc_rdma.c | 3 +-- net/sunrpc/xprtrdma/verbs.c | 57 +++++++++++++++++------------------------ net/sunrpc/xprtrdma/xprt_rdma.h | 3 +-- 3 files changed, 26 insertions(+), 37 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 190a4de239c8..1478c41c7e9d 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -1471,8 +1471,7 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep) credits = 1; /* don't deadlock */ else if (credits > r_xprt->rx_ep->re_max_requests) credits = r_xprt->rx_ep->re_max_requests; - rpcrdma_post_recvs(r_xprt, credits + (buf->rb_bc_srv_max_requests << 1), - false); + rpcrdma_post_recvs(r_xprt, credits + (buf->rb_bc_srv_max_requests << 1)); if (buf->rb_credits != credits) rpcrdma_update_cwnd(r_xprt, credits); diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 2904454adb06..63262ef0c2e3 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -69,13 +69,15 @@ static void rpcrdma_sendctx_put_locked(struct rpcrdma_xprt *r_xprt, struct rpcrdma_sendctx *sc); static int rpcrdma_reqs_setup(struct rpcrdma_xprt *r_xprt); static void rpcrdma_reqs_reset(struct rpcrdma_xprt *r_xprt); -static void rpcrdma_rep_destroy(struct rpcrdma_rep *rep); static void rpcrdma_reps_unmap(struct rpcrdma_xprt *r_xprt); static void rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt); static void rpcrdma_mrs_destroy(struct rpcrdma_xprt *r_xprt); static void rpcrdma_ep_get(struct rpcrdma_ep *ep); static int rpcrdma_ep_put(struct rpcrdma_ep *ep); static struct rpcrdma_regbuf * +rpcrdma_regbuf_alloc_node(size_t size, enum dma_data_direction direction, + int node); +static struct rpcrdma_regbuf * rpcrdma_regbuf_alloc(size_t size, enum dma_data_direction direction); static void rpcrdma_regbuf_dma_unmap(struct rpcrdma_regbuf *rb); static void rpcrdma_regbuf_free(struct rpcrdma_regbuf *rb); @@ -510,7 +512,7 @@ int rpcrdma_xprt_connect(struct rpcrdma_xprt *r_xprt) * outstanding Receives. 
*/ rpcrdma_ep_get(ep); - rpcrdma_post_recvs(r_xprt, 1, true); + rpcrdma_post_recvs(r_xprt, 1); rc = rdma_connect(ep->re_id, &ep->re_remote_cma); if (rc) @@ -943,18 +945,20 @@ static void rpcrdma_reqs_reset(struct rpcrdma_xprt *r_xprt) } static noinline -struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt, - bool temp) +struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt) { struct rpcrdma_buffer *buf = &r_xprt->rx_buf; + struct rpcrdma_ep *ep = r_xprt->rx_ep; + struct ib_device *device = ep->re_id->device; struct rpcrdma_rep *rep; rep = kzalloc(sizeof(*rep), XPRTRDMA_GFP_FLAGS); if (rep == NULL) goto out; - rep->rr_rdmabuf = rpcrdma_regbuf_alloc(r_xprt->rx_ep->re_inline_recv, - DMA_FROM_DEVICE); + rep->rr_rdmabuf = rpcrdma_regbuf_alloc_node(ep->re_inline_recv, + DMA_FROM_DEVICE, + ibdev_to_node(device)); if (!rep->rr_rdmabuf) goto out_free; @@ -969,7 +973,6 @@ struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt, rep->rr_recv_wr.wr_cqe = &rep->rr_cqe; rep->rr_recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov; rep->rr_recv_wr.num_sge = 1; - rep->rr_temp = temp; spin_lock(&buf->rb_lock); list_add(&rep->rr_all, &buf->rb_all_reps); @@ -988,17 +991,6 @@ static void rpcrdma_rep_free(struct rpcrdma_rep *rep) kfree(rep); } -static void rpcrdma_rep_destroy(struct rpcrdma_rep *rep) -{ - struct rpcrdma_buffer *buf = &rep->rr_rxprt->rx_buf; - - spin_lock(&buf->rb_lock); - list_del(&rep->rr_all); - spin_unlock(&buf->rb_lock); - - rpcrdma_rep_free(rep); -} - static struct rpcrdma_rep *rpcrdma_rep_get_locked(struct rpcrdma_buffer *buf) { struct llist_node *node; @@ -1030,10 +1022,8 @@ static void rpcrdma_reps_unmap(struct rpcrdma_xprt *r_xprt) struct rpcrdma_buffer *buf = &r_xprt->rx_buf; struct rpcrdma_rep *rep; - list_for_each_entry(rep, &buf->rb_all_reps, rr_all) { + list_for_each_entry(rep, &buf->rb_all_reps, rr_all) rpcrdma_regbuf_dma_unmap(rep->rr_rdmabuf); - rep->rr_temp = true; /* Mark this rep for destruction */ - } } static void rpcrdma_reps_destroy(struct rpcrdma_buffer *buf) @@ -1250,14 +1240,15 @@ void rpcrdma_buffer_put(struct rpcrdma_buffer *buffers, struct rpcrdma_req *req) * or Replies they may be registered externally via frwr_map. 
*/ static struct rpcrdma_regbuf * -rpcrdma_regbuf_alloc(size_t size, enum dma_data_direction direction) +rpcrdma_regbuf_alloc_node(size_t size, enum dma_data_direction direction, + int node) { struct rpcrdma_regbuf *rb; - rb = kmalloc(sizeof(*rb), XPRTRDMA_GFP_FLAGS); + rb = kmalloc_node(sizeof(*rb), XPRTRDMA_GFP_FLAGS, node); if (!rb) return NULL; - rb->rg_data = kmalloc(size, XPRTRDMA_GFP_FLAGS); + rb->rg_data = kmalloc_node(size, XPRTRDMA_GFP_FLAGS, node); if (!rb->rg_data) { kfree(rb); return NULL; @@ -1269,6 +1260,12 @@ rpcrdma_regbuf_alloc(size_t size, enum dma_data_direction direction) return rb; } +static struct rpcrdma_regbuf * +rpcrdma_regbuf_alloc(size_t size, enum dma_data_direction direction) +{ + return rpcrdma_regbuf_alloc_node(size, direction, NUMA_NO_NODE); +} + /** * rpcrdma_regbuf_realloc - re-allocate a SEND/RECV buffer * @rb: regbuf to reallocate @@ -1346,10 +1343,9 @@ static void rpcrdma_regbuf_free(struct rpcrdma_regbuf *rb) * rpcrdma_post_recvs - Refill the Receive Queue * @r_xprt: controlling transport instance * @needed: current credit grant - * @temp: mark Receive buffers to be deleted after one use * */ -void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed, bool temp) +void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed) { struct rpcrdma_buffer *buf = &r_xprt->rx_buf; struct rpcrdma_ep *ep = r_xprt->rx_ep; @@ -1363,8 +1359,7 @@ void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed, bool temp) if (likely(ep->re_receive_count > needed)) goto out; needed -= ep->re_receive_count; - if (!temp) - needed += RPCRDMA_MAX_RECV_BATCH; + needed += RPCRDMA_MAX_RECV_BATCH; if (atomic_inc_return(&ep->re_receiving) > 1) goto out; @@ -1373,12 +1368,8 @@ void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed, bool temp) wr = NULL; while (needed) { rep = rpcrdma_rep_get_locked(buf); - if (rep && rep->rr_temp) { - rpcrdma_rep_destroy(rep); - continue; - } if (!rep) - rep = rpcrdma_rep_create(r_xprt, temp); + rep = rpcrdma_rep_create(r_xprt); if (!rep) break; if (!rpcrdma_regbuf_dma_map(r_xprt, rep->rr_rdmabuf)) { diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 341725c66ec8..8147d2b41494 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -200,7 +200,6 @@ struct rpcrdma_rep { __be32 rr_proc; int rr_wc_flags; u32 rr_inv_rkey; - bool rr_temp; struct rpcrdma_regbuf *rr_rdmabuf; struct rpcrdma_xprt *rr_rxprt; struct rpc_rqst *rr_rqst; @@ -468,7 +467,7 @@ void rpcrdma_flush_disconnect(struct rpcrdma_xprt *r_xprt, struct ib_wc *wc); int rpcrdma_xprt_connect(struct rpcrdma_xprt *r_xprt); void rpcrdma_xprt_disconnect(struct rpcrdma_xprt *r_xprt); -void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed, bool temp); +void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed); /* * Buffer calls - xprtrdma/verbs.c -- cgit v1.2.3 From 6258cf25d5e3155c3219ab5a79b970eef7996356 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 19 Jun 2024 11:05:13 +1000 Subject: SUNRPC: avoid soft lockup when transmitting UDP to reachable server. Prior to the commit identified below, call_transmit_status() would handle -EPERM and other errors related to an unreachable server by falling through to call_status() which added a 3-second delay and handled the failure as a timeout. Since that commit, call_transmit_status() falls through to handle_bind(). For UDP this moves straight on to handle_connect() and handle_transmit() so we immediately retransmit - and likely get the same error. 
This results in an indefinite loop in __rpc_execute() which triggers a soft-lockup warning. For the errors that indicate an unreachable server, call_transmit_status() should fall back to call_status() as it did before. This cannot cause the thundering herd that the previous patch was avoiding, as the call_status() will insert a delay. Fixes: ed7dc973bd91 ("SUNRPC: Prevent thundering herd when the socket is not connected") Signed-off-by: NeilBrown Signed-off-by: Anna Schumaker --- net/sunrpc/clnt.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index cfd1b1bf7e35..09f29a95f2bc 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -2326,12 +2326,13 @@ call_transmit_status(struct rpc_task *task) task->tk_action = call_transmit; task->tk_status = 0; break; - case -ECONNREFUSED: case -EHOSTDOWN: case -ENETDOWN: case -EHOSTUNREACH: case -ENETUNREACH: case -EPERM: + break; + case -ECONNREFUSED: if (RPC_IS_SOFTCONN(task)) { if (!task->tk_msg.rpc_proc->p_proc) trace_xprt_ping(task->tk_xprt, -- cgit v1.2.3 From ed0172af5d6fc07d1b40ca82f5ca3979300369f7 Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Wed, 17 Jul 2024 10:49:33 -0400 Subject: SUNRPC: Fix a race to wake a sync task We've observed NFS clients with sync tasks sleeping in __rpc_execute waiting on RPC_TASK_QUEUED that have not responded to a wake-up from rpc_make_runnable(). I suspect this problem usually goes unnoticed, because on a busy client the task will eventually be re-awoken by another task completion or xprt event. However, if the state manager is draining the slot table, a sync task missing a wake-up can result in a hung client. We've been able to prove that the waker in rpc_make_runnable() successfully calls wake_up_bit() (ie- there's no race to tk_runstate), but the wake_up_bit() call fails to wake the waiter. I suspect the waker is missing the load of the bit's wait_queue_head, so waitqueue_active() is false. There are some very helpful comments about this problem above wake_up_bit(), prepare_to_wait(), and waitqueue_active(). Fix this by inserting smp_mb__after_atomic() before the wake_up_bit(), which pairs with prepare_to_wait() calling set_current_state(). Signed-off-by: Benjamin Coddington Reviewed-by: Jeff Layton Signed-off-by: Anna Schumaker --- net/sunrpc/sched.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 6debf4fd42d4..cef623ea1506 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -369,8 +369,10 @@ static void rpc_make_runnable(struct workqueue_struct *wq, if (RPC_IS_ASYNC(task)) { INIT_WORK(&task->u.tk_work, rpc_async_schedule); queue_work(wq, &task->u.tk_work); - } else + } else { + smp_mb__after_atomic(); wake_up_bit(&task->tk_runstate, RPC_TASK_QUEUED); + } } /* -- cgit v1.2.3
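For reference, the barrier pairing that the patch above restores, shown as a
generic waker/waiter sketch (illustrative only, with made-up names; the real
code paths are rpc_make_runnable() and the RPC_TASK_QUEUED wait in
__rpc_execute()):

	#include <linux/atomic.h>
	#include <linux/bitops.h>
	#include <linux/sched.h>
	#include <linux/wait_bit.h>

	#define MY_QUEUED	0
	static unsigned long my_state = BIT(MY_QUEUED);

	/* Waker: the barrier orders the clear_bit() against the load of the
	 * bit's wait_queue_head inside wake_up_bit(), pairing with the
	 * waiter's prepare_to_wait()/set_current_state(). Without it, a
	 * racing waker can see waitqueue_active() as false and skip the
	 * wake-up. */
	static void my_make_runnable(void)
	{
		clear_bit(MY_QUEUED, &my_state);
		smp_mb__after_atomic();
		wake_up_bit(&my_state, MY_QUEUED);
	}

	/* Waiter: sleeps until MY_QUEUED is cleared. */
	static int my_wait_for_wakeup(void)
	{
		return wait_on_bit(&my_state, MY_QUEUED, TASK_KILLABLE);
	}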