From c410bf01933e5e09d142c66c3df9ad470a7eec13 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 11 May 2020 14:54:34 +0100 Subject: rxrpc: Fix the excessive initial retransmission timeout rxrpc currently uses a fixed 4s retransmission timeout until the RTT is sufficiently sampled. This can cause problems with some fileservers with calls to the cache manager in the afs filesystem being dropped from the fileserver because a packet goes missing and the retransmission timeout is greater than the call expiry timeout. Fix this by: (1) Copying the RTT/RTO calculation code from Linux's TCP implementation and altering it to fit rxrpc. (2) Altering the various users of the RTT to make use of the new SRTT value. (3) Replacing the use of rxrpc_resend_timeout to use the calculated RTO value instead (which is needed in jiffies), along with a backoff. Notes: (1) rxrpc provides RTT samples by matching the serial numbers on outgoing DATA packets that have the RXRPC_REQUEST_ACK set and PING ACK packets against the reference serial number in incoming REQUESTED ACK and PING-RESPONSE ACK packets. (2) Each packet that is transmitted on an rxrpc connection gets a new per-connection serial number, even for retransmissions, so an ACK can be cross-referenced to a specific trigger packet. This allows RTT information to be drawn from retransmitted DATA packets also. (3) rxrpc maintains the RTT/RTO state on the rxrpc_peer record rather than on an rxrpc_call because many RPC calls won't live long enough to generate more than one sample. (4) The calculated SRTT value is in units of 8ths of a microsecond rather than nanoseconds. The (S)RTT and RTO values are displayed in /proc/net/rxrpc/peers. Fixes: 17926a79320a ([AF_RXRPC]: Provide secure RxRPC sockets for use by userspace and kernel both"") Signed-off-by: David Howells --- net/rxrpc/input.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net/rxrpc/input.c') diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 69e09d69c896..e438bfd3fdf5 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -91,11 +91,11 @@ static void rxrpc_congestion_management(struct rxrpc_call *call, /* We analyse the number of packets that get ACK'd per RTT * period and increase the window if we managed to fill it. */ - if (call->peer->rtt_usage == 0) + if (call->peer->rtt_count == 0) goto out; if (ktime_before(skb->tstamp, - ktime_add_ns(call->cong_tstamp, - call->peer->rtt))) + ktime_add_us(call->cong_tstamp, + call->peer->srtt_us >> 3))) goto out_no_clear_ca; change = rxrpc_cong_rtt_window_end; call->cong_tstamp = skb->tstamp; -- cgit v1.2.3 From d1f129470e6cb79b8b97fecd12689f6eb49e27fe Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 28 Apr 2020 22:06:54 +0100 Subject: rxrpc: Trace discarded ACKs Add a tracepoint to track received ACKs that are discarded due to being outside of the Tx window. Signed-off-by: David Howells --- include/trace/events/rxrpc.h | 35 +++++++++++++++++++++++++++++++++++ net/rxrpc/input.c | 12 ++++++++++-- 2 files changed, 45 insertions(+), 2 deletions(-) (limited to 'net/rxrpc/input.c') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index ab75f261f04a..ba9efdc848f9 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -1541,6 +1541,41 @@ TRACE_EVENT(rxrpc_notify_socket, __entry->serial) ); +TRACE_EVENT(rxrpc_rx_discard_ack, + TP_PROTO(unsigned int debug_id, rxrpc_serial_t serial, + rxrpc_seq_t first_soft_ack, rxrpc_seq_t call_ackr_first, + rxrpc_seq_t prev_pkt, rxrpc_seq_t call_ackr_prev), + + TP_ARGS(debug_id, serial, first_soft_ack, call_ackr_first, + prev_pkt, call_ackr_prev), + + TP_STRUCT__entry( + __field(unsigned int, debug_id ) + __field(rxrpc_serial_t, serial ) + __field(rxrpc_seq_t, first_soft_ack) + __field(rxrpc_seq_t, call_ackr_first) + __field(rxrpc_seq_t, prev_pkt) + __field(rxrpc_seq_t, call_ackr_prev) + ), + + TP_fast_assign( + __entry->debug_id = debug_id; + __entry->serial = serial; + __entry->first_soft_ack = first_soft_ack; + __entry->call_ackr_first = call_ackr_first; + __entry->prev_pkt = prev_pkt; + __entry->call_ackr_prev = call_ackr_prev; + ), + + TP_printk("c=%08x r=%08x %08x<%08x %08x<%08x", + __entry->debug_id, + __entry->serial, + __entry->first_soft_ack, + __entry->call_ackr_first, + __entry->prev_pkt, + __entry->call_ackr_prev) + ); + #endif /* _TRACE_RXRPC_H */ /* This part must be outside protection */ diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index e438bfd3fdf5..2f22f082a66c 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -866,8 +866,12 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) /* Discard any out-of-order or duplicate ACKs (outside lock). */ if (before(first_soft_ack, call->ackr_first_seq) || - before(prev_pkt, call->ackr_prev_seq)) + before(prev_pkt, call->ackr_prev_seq)) { + trace_rxrpc_rx_discard_ack(call->debug_id, sp->hdr.serial, + first_soft_ack, call->ackr_first_seq, + prev_pkt, call->ackr_prev_seq); return; + } buf.info.rxMTU = 0; ioffset = offset + nr_acks + 3; @@ -879,8 +883,12 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) /* Discard any out-of-order or duplicate ACKs (inside lock). */ if (before(first_soft_ack, call->ackr_first_seq) || - before(prev_pkt, call->ackr_prev_seq)) + before(prev_pkt, call->ackr_prev_seq)) { + trace_rxrpc_rx_discard_ack(call->debug_id, sp->hdr.serial, + first_soft_ack, call->ackr_first_seq, + prev_pkt, call->ackr_prev_seq); goto out; + } call->acks_latest_ts = skb->tstamp; call->ackr_first_seq = first_soft_ack; -- cgit v1.2.3 From 441fdee1eaf050ef0040bde0d7af075c1c6a6d8b Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 29 Apr 2020 23:48:43 +0100 Subject: rxrpc: Fix ack discard The Rx protocol has a "previousPacket" field in it that is not handled in the same way by all protocol implementations. Sometimes it contains the serial number of the last DATA packet received, sometimes the sequence number of the last DATA packet received and sometimes the highest sequence number so far received. AF_RXRPC is using this to weed out ACKs that are out of date (it's possible for ACK packets to get reordered on the wire), but this does not work with OpenAFS which will just stick the sequence number of the last packet seen into previousPacket. The issue being seen is that big AFS FS.StoreData RPC (eg. of ~256MiB) are timing out when partly sent. A trace was captured, with an additional tracepoint to show ACKs being discarded in rxrpc_input_ack(). Here's an excerpt showing the problem. 52873.203230: rxrpc_tx_data: c=000004ae DATA ed1a3584:00000002 0002449c q=00024499 fl=09 A DATA packet with sequence number 00024499 has been transmitted (the "q=" field). ... 52873.243296: rxrpc_rx_ack: c=000004ae 00012a2b DLY r=00024499 f=00024497 p=00024496 n=0 52873.243376: rxrpc_rx_ack: c=000004ae 00012a2c IDL r=0002449b f=00024499 p=00024498 n=0 52873.243383: rxrpc_rx_ack: c=000004ae 00012a2d OOS r=0002449d f=00024499 p=0002449a n=2 The Out-Of-Sequence ACK indicates that the server didn't see DATA sequence number 00024499, but did see seq 0002449a (previousPacket, shown as "p=", skipped the number, but firstPacket, "f=", which shows the bottom of the window is set at that point). 52873.252663: rxrpc_retransmit: c=000004ae q=24499 a=02 xp=14581537 52873.252664: rxrpc_tx_data: c=000004ae DATA ed1a3584:00000002 000244bc q=00024499 fl=0b *RETRANS* The packet has been retransmitted. Retransmission recurs until the peer says it got the packet. 52873.271013: rxrpc_rx_ack: c=000004ae 00012a31 OOS r=000244a1 f=00024499 p=0002449e n=6 More OOS ACKs indicate that the other packets that are already in the transmission pipeline are being received. The specific-ACK list is up to 6 ACKs and NAKs. ... 52873.284792: rxrpc_rx_ack: c=000004ae 00012a49 OOS r=000244b9 f=00024499 p=000244b6 n=30 52873.284802: rxrpc_retransmit: c=000004ae q=24499 a=0a xp=63505500 52873.284804: rxrpc_tx_data: c=000004ae DATA ed1a3584:00000002 000244c2 q=00024499 fl=0b *RETRANS* 52873.287468: rxrpc_rx_ack: c=000004ae 00012a4a OOS r=000244ba f=00024499 p=000244b7 n=31 52873.287478: rxrpc_rx_ack: c=000004ae 00012a4b OOS r=000244bb f=00024499 p=000244b8 n=32 At this point, the server's receive window is full (n=32) with presumably 1 NAK'd packet and 31 ACK'd packets. We can't transmit any more packets. 52873.287488: rxrpc_retransmit: c=000004ae q=24499 a=0a xp=61327980 52873.287489: rxrpc_tx_data: c=000004ae DATA ed1a3584:00000002 000244c3 q=00024499 fl=0b *RETRANS* 52873.293850: rxrpc_rx_ack: c=000004ae 00012a4c DLY r=000244bc f=000244a0 p=00024499 n=25 And now we've received an ACK indicating that a DATA retransmission was received. 7 packets have been processed (the occupied part of the window moved, as indicated by f= and n=). 52873.293853: rxrpc_rx_discard_ack: c=000004ae r=00012a4c 000244a0<00024499 00024499<000244b8 However, the DLY ACK gets discarded because its previousPacket has gone backwards (from p=000244b8, in the ACK at 52873.287478 to p=00024499 in the ACK at 52873.293850). We then end up in a continuous cycle of retransmit/discard. kafs fails to update its window because it's discarding the ACKs and can't transmit an extra packet that would clear the issue because the window is full. OpenAFS doesn't change the previousPacket value in the ACKs because no new DATA packets are received with a different previousPacket number. Fix this by altering the discard check to only discard an ACK based on previousPacket if there was no advance in the firstPacket. This allows us to transmit a new packet which will cause previousPacket to advance in the next ACK. The check, however, needs to allow for the possibility that previousPacket may actually have had the serial number placed in it instead - in which case it will go outside the window and we should ignore it. Fixes: 1a2391c30c0b ("rxrpc: Fix detection of out of order acks") Reported-by: Dave Botsch Signed-off-by: David Howells --- net/rxrpc/input.c | 30 ++++++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) (limited to 'net/rxrpc/input.c') diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 2f22f082a66c..3be4177baf70 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -802,6 +802,30 @@ static void rxrpc_input_soft_acks(struct rxrpc_call *call, u8 *acks, } } +/* + * Return true if the ACK is valid - ie. it doesn't appear to have regressed + * with respect to the ack state conveyed by preceding ACKs. + */ +static bool rxrpc_is_ack_valid(struct rxrpc_call *call, + rxrpc_seq_t first_pkt, rxrpc_seq_t prev_pkt) +{ + rxrpc_seq_t base = READ_ONCE(call->ackr_first_seq); + + if (after(first_pkt, base)) + return true; /* The window advanced */ + + if (before(first_pkt, base)) + return false; /* firstPacket regressed */ + + if (after_eq(prev_pkt, call->ackr_prev_seq)) + return true; /* previousPacket hasn't regressed. */ + + /* Some rx implementations put a serial number in previousPacket. */ + if (after_eq(prev_pkt, base + call->tx_winsize)) + return false; + return true; +} + /* * Process an ACK packet. * @@ -865,8 +889,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) } /* Discard any out-of-order or duplicate ACKs (outside lock). */ - if (before(first_soft_ack, call->ackr_first_seq) || - before(prev_pkt, call->ackr_prev_seq)) { + if (!rxrpc_is_ack_valid(call, first_soft_ack, prev_pkt)) { trace_rxrpc_rx_discard_ack(call->debug_id, sp->hdr.serial, first_soft_ack, call->ackr_first_seq, prev_pkt, call->ackr_prev_seq); @@ -882,8 +905,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) spin_lock(&call->input_lock); /* Discard any out-of-order or duplicate ACKs (inside lock). */ - if (before(first_soft_ack, call->ackr_first_seq) || - before(prev_pkt, call->ackr_prev_seq)) { + if (!rxrpc_is_ack_valid(call, first_soft_ack, prev_pkt)) { trace_rxrpc_rx_discard_ack(call->debug_id, sp->hdr.serial, first_soft_ack, call->ackr_first_seq, prev_pkt, call->ackr_prev_seq); -- cgit v1.2.3