OpenCloudOS-Kernel/include/trace/events/rxrpc.h

1981 lines
60 KiB
C
Raw Normal View History

/* SPDX-License-Identifier: GPL-2.0-or-later */
/* AF_RXRPC tracepoints
*
* Copyright (C) 2016 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
*/
#undef TRACE_SYSTEM
#define TRACE_SYSTEM rxrpc
#if !defined(_TRACE_RXRPC_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_RXRPC_H
#include <linux/tracepoint.h>
#include <linux/errqueue.h>
/*
* Declare tracing information enums and their string mappings for display.
*/
#define rxrpc_abort_reasons \
/* AFS errors */ \
EM(afs_abort_general_error, "afs-error") \
EM(afs_abort_interrupted, "afs-intr") \
EM(afs_abort_oom, "afs-oom") \
EM(afs_abort_op_not_supported, "afs-op-notsupp") \
EM(afs_abort_probeuuid_negative, "afs-probeuuid-neg") \
EM(afs_abort_send_data_error, "afs-send-data") \
EM(afs_abort_unmarshal_error, "afs-unmarshal") \
/* rxperf errors */ \
EM(rxperf_abort_general_error, "rxperf-error") \
EM(rxperf_abort_oom, "rxperf-oom") \
EM(rxperf_abort_op_not_supported, "rxperf-op-notsupp") \
EM(rxperf_abort_unmarshal_error, "rxperf-unmarshal") \
/* RxKAD security errors */ \
EM(rxkad_abort_1_short_check, "rxkad1-short-check") \
EM(rxkad_abort_1_short_data, "rxkad1-short-data") \
EM(rxkad_abort_1_short_encdata, "rxkad1-short-encdata") \
EM(rxkad_abort_1_short_header, "rxkad1-short-hdr") \
EM(rxkad_abort_2_short_check, "rxkad2-short-check") \
EM(rxkad_abort_2_short_data, "rxkad2-short-data") \
EM(rxkad_abort_2_short_header, "rxkad2-short-hdr") \
EM(rxkad_abort_2_short_len, "rxkad2-short-len") \
EM(rxkad_abort_bad_checksum, "rxkad2-bad-cksum") \
EM(rxkad_abort_chall_key_expired, "rxkad-chall-key-exp") \
EM(rxkad_abort_chall_level, "rxkad-chall-level") \
EM(rxkad_abort_chall_no_key, "rxkad-chall-nokey") \
EM(rxkad_abort_chall_short, "rxkad-chall-short") \
EM(rxkad_abort_chall_version, "rxkad-chall-version") \
EM(rxkad_abort_resp_bad_callid, "rxkad-resp-bad-callid") \
EM(rxkad_abort_resp_bad_checksum, "rxkad-resp-bad-cksum") \
EM(rxkad_abort_resp_bad_param, "rxkad-resp-bad-param") \
EM(rxkad_abort_resp_call_ctr, "rxkad-resp-call-ctr") \
EM(rxkad_abort_resp_call_state, "rxkad-resp-call-state") \
EM(rxkad_abort_resp_key_expired, "rxkad-resp-key-exp") \
EM(rxkad_abort_resp_key_rejected, "rxkad-resp-key-rej") \
EM(rxkad_abort_resp_level, "rxkad-resp-level") \
EM(rxkad_abort_resp_nokey, "rxkad-resp-nokey") \
EM(rxkad_abort_resp_ooseq, "rxkad-resp-ooseq") \
EM(rxkad_abort_resp_short, "rxkad-resp-short") \
EM(rxkad_abort_resp_short_tkt, "rxkad-resp-short-tkt") \
EM(rxkad_abort_resp_tkt_aname, "rxkad-resp-tk-aname") \
EM(rxkad_abort_resp_tkt_expired, "rxkad-resp-tk-exp") \
EM(rxkad_abort_resp_tkt_future, "rxkad-resp-tk-future") \
EM(rxkad_abort_resp_tkt_inst, "rxkad-resp-tk-inst") \
EM(rxkad_abort_resp_tkt_len, "rxkad-resp-tk-len") \
EM(rxkad_abort_resp_tkt_realm, "rxkad-resp-tk-realm") \
EM(rxkad_abort_resp_tkt_short, "rxkad-resp-tk-short") \
EM(rxkad_abort_resp_tkt_sinst, "rxkad-resp-tk-sinst") \
EM(rxkad_abort_resp_tkt_sname, "rxkad-resp-tk-sname") \
EM(rxkad_abort_resp_unknown_tkt, "rxkad-resp-unknown-tkt") \
EM(rxkad_abort_resp_version, "rxkad-resp-version") \
/* rxrpc errors */ \
EM(rxrpc_abort_call_improper_term, "call-improper-term") \
EM(rxrpc_abort_call_reset, "call-reset") \
EM(rxrpc_abort_call_sendmsg, "call-sendmsg") \
EM(rxrpc_abort_call_sock_release, "call-sock-rel") \
EM(rxrpc_abort_call_sock_release_tba, "call-sock-rel-tba") \
EM(rxrpc_abort_call_timeout, "call-timeout") \
EM(rxrpc_abort_no_service_key, "no-serv-key") \
EM(rxrpc_abort_nomem, "nomem") \
EM(rxrpc_abort_service_not_offered, "serv-not-offered") \
EM(rxrpc_abort_shut_down, "shut-down") \
EM(rxrpc_abort_unsupported_security, "unsup-sec") \
EM(rxrpc_badmsg_bad_abort, "bad-abort") \
EM(rxrpc_badmsg_bad_jumbo, "bad-jumbo") \
EM(rxrpc_badmsg_short_ack, "short-ack") \
EM(rxrpc_badmsg_short_ack_trailer, "short-ack-trailer") \
EM(rxrpc_badmsg_short_hdr, "short-hdr") \
EM(rxrpc_badmsg_unsupported_packet, "unsup-pkt") \
EM(rxrpc_badmsg_zero_call, "zero-call") \
EM(rxrpc_badmsg_zero_seq, "zero-seq") \
EM(rxrpc_badmsg_zero_service, "zero-service") \
EM(rxrpc_eproto_ackr_outside_window, "ackr-out-win") \
EM(rxrpc_eproto_ackr_sack_overflow, "ackr-sack-over") \
EM(rxrpc_eproto_ackr_short_sack, "ackr-short-sack") \
EM(rxrpc_eproto_ackr_zero, "ackr-zero") \
EM(rxrpc_eproto_bad_upgrade, "bad-upgrade") \
EM(rxrpc_eproto_data_after_last, "data-after-last") \
EM(rxrpc_eproto_different_last, "diff-last") \
EM(rxrpc_eproto_early_reply, "early-reply") \
EM(rxrpc_eproto_improper_term, "improper-term") \
EM(rxrpc_eproto_no_client_call, "no-cl-call") \
EM(rxrpc_eproto_no_client_conn, "no-cl-conn") \
EM(rxrpc_eproto_no_service_call, "no-sv-call") \
EM(rxrpc_eproto_reupgrade, "re-upgrade") \
EM(rxrpc_eproto_rxnull_challenge, "rxnull-chall") \
EM(rxrpc_eproto_rxnull_response, "rxnull-resp") \
EM(rxrpc_eproto_tx_rot_last, "tx-rot-last") \
EM(rxrpc_eproto_unexpected_ack, "unex-ack") \
EM(rxrpc_eproto_unexpected_ackall, "unex-ackall") \
EM(rxrpc_eproto_unexpected_implicit_end, "unex-impl-end") \
EM(rxrpc_eproto_unexpected_reply, "unex-reply") \
EM(rxrpc_eproto_wrong_security, "wrong-sec") \
EM(rxrpc_recvmsg_excess_data, "recvmsg-excess") \
EM(rxrpc_recvmsg_short_data, "recvmsg-short") \
E_(rxrpc_sendmsg_late_send, "sendmsg-late")
#define rxrpc_call_poke_traces \
EM(rxrpc_call_poke_abort, "Abort") \
EM(rxrpc_call_poke_complete, "Compl") \
EM(rxrpc_call_poke_error, "Error") \
EM(rxrpc_call_poke_idle, "Idle") \
EM(rxrpc_call_poke_start, "Start") \
EM(rxrpc_call_poke_timer, "Timer") \
E_(rxrpc_call_poke_timer_now, "Timer-now")
#define rxrpc_skb_traces \
EM(rxrpc_skb_eaten_by_unshare, "ETN unshare ") \
EM(rxrpc_skb_eaten_by_unshare_nomem, "ETN unshar-nm") \
EM(rxrpc_skb_get_conn_secured, "GET conn-secd") \
EM(rxrpc_skb_get_conn_work, "GET conn-work") \
rxrpc: Fix counting of new acks and nacks [ Upstream commit 41b7fa157ea1c8c3a575ca7f5f32034de9bee3ae ] Fix the counting of new acks and nacks when parsing a packet - something that is used in congestion control. As the code stands, it merely notes if there are any nacks whereas what we really should do is compare the previous SACK table to the new one, assuming we get two successive ACK packets with nacks in them. However, we really don't want to do that if we can avoid it as the tables might not correspond directly as one may be shifted from the other - something that will only get harder to deal with once extended ACK tables come into full use (with a capacity of up to 8192). Instead, count the number of nacks shifted out of the old SACK, the number of nacks retained in the portion still active and the number of new acks and nacks in the new table then calculate what we need. Note this ends up a bit of an estimate as the Rx protocol allows acks to be withdrawn by the receiver and packets requested to be retransmitted. Fixes: d57a3a151660 ("rxrpc: Save last ACK's SACK table rather than marking txbufs") Signed-off-by: David Howells <dhowells@redhat.com> cc: Marc Dionne <marc.dionne@auristor.com> cc: "David S. Miller" <davem@davemloft.net> cc: Eric Dumazet <edumazet@google.com> cc: Jakub Kicinski <kuba@kernel.org> cc: Paolo Abeni <pabeni@redhat.com> cc: linux-afs@lists.infradead.org cc: netdev@vger.kernel.org Signed-off-by: David S. Miller <davem@davemloft.net> Signed-off-by: Sasha Levin <sashal@kernel.org>
2024-02-02 23:19:16 +08:00
EM(rxrpc_skb_get_last_nack, "GET last-nack") \
EM(rxrpc_skb_get_local_work, "GET locl-work") \
EM(rxrpc_skb_get_reject_work, "GET rej-work ") \
EM(rxrpc_skb_get_to_recvmsg, "GET to-recv ") \
EM(rxrpc_skb_get_to_recvmsg_oos, "GET to-recv-o") \
EM(rxrpc_skb_new_encap_rcv, "NEW encap-rcv") \
EM(rxrpc_skb_new_error_report, "NEW error-rpt") \
EM(rxrpc_skb_new_jumbo_subpacket, "NEW jumbo-sub") \
EM(rxrpc_skb_new_unshared, "NEW unshared ") \
EM(rxrpc_skb_put_conn_secured, "PUT conn-secd") \
EM(rxrpc_skb_put_conn_work, "PUT conn-work") \
EM(rxrpc_skb_put_error_report, "PUT error-rep") \
EM(rxrpc_skb_put_input, "PUT input ") \
EM(rxrpc_skb_put_jumbo_subpacket, "PUT jumbo-sub") \
rxrpc: Fix counting of new acks and nacks [ Upstream commit 41b7fa157ea1c8c3a575ca7f5f32034de9bee3ae ] Fix the counting of new acks and nacks when parsing a packet - something that is used in congestion control. As the code stands, it merely notes if there are any nacks whereas what we really should do is compare the previous SACK table to the new one, assuming we get two successive ACK packets with nacks in them. However, we really don't want to do that if we can avoid it as the tables might not correspond directly as one may be shifted from the other - something that will only get harder to deal with once extended ACK tables come into full use (with a capacity of up to 8192). Instead, count the number of nacks shifted out of the old SACK, the number of nacks retained in the portion still active and the number of new acks and nacks in the new table then calculate what we need. Note this ends up a bit of an estimate as the Rx protocol allows acks to be withdrawn by the receiver and packets requested to be retransmitted. Fixes: d57a3a151660 ("rxrpc: Save last ACK's SACK table rather than marking txbufs") Signed-off-by: David Howells <dhowells@redhat.com> cc: Marc Dionne <marc.dionne@auristor.com> cc: "David S. Miller" <davem@davemloft.net> cc: Eric Dumazet <edumazet@google.com> cc: Jakub Kicinski <kuba@kernel.org> cc: Paolo Abeni <pabeni@redhat.com> cc: linux-afs@lists.infradead.org cc: netdev@vger.kernel.org Signed-off-by: David S. Miller <davem@davemloft.net> Signed-off-by: Sasha Levin <sashal@kernel.org>
2024-02-02 23:19:16 +08:00
EM(rxrpc_skb_put_last_nack, "PUT last-nack") \
EM(rxrpc_skb_put_purge, "PUT purge ") \
EM(rxrpc_skb_put_rotate, "PUT rotate ") \
EM(rxrpc_skb_put_unknown, "PUT unknown ") \
EM(rxrpc_skb_see_conn_work, "SEE conn-work") \
EM(rxrpc_skb_see_recvmsg, "SEE recvmsg ") \
EM(rxrpc_skb_see_reject, "SEE reject ") \
EM(rxrpc_skb_see_rotate, "SEE rotate ") \
E_(rxrpc_skb_see_version, "SEE version ")
#define rxrpc_local_traces \
EM(rxrpc_local_free, "FREE ") \
EM(rxrpc_local_get_call, "GET call ") \
EM(rxrpc_local_get_client_conn, "GET conn-cln") \
EM(rxrpc_local_get_for_use, "GET for-use ") \
EM(rxrpc_local_get_peer, "GET peer ") \
EM(rxrpc_local_get_prealloc_conn, "GET conn-pre") \
EM(rxrpc_local_new, "NEW ") \
EM(rxrpc_local_put_bind, "PUT bind ") \
EM(rxrpc_local_put_call, "PUT call ") \
EM(rxrpc_local_put_for_use, "PUT for-use ") \
EM(rxrpc_local_put_kill_conn, "PUT conn-kil") \
EM(rxrpc_local_put_peer, "PUT peer ") \
EM(rxrpc_local_put_prealloc_peer, "PUT peer-pre") \
EM(rxrpc_local_put_release_sock, "PUT rel-sock") \
EM(rxrpc_local_stop, "STOP ") \
EM(rxrpc_local_stopped, "STOPPED ") \
EM(rxrpc_local_unuse_bind, "UNU bind ") \
EM(rxrpc_local_unuse_conn_work, "UNU conn-wrk") \
EM(rxrpc_local_unuse_peer_keepalive, "UNU peer-kpa") \
EM(rxrpc_local_unuse_release_sock, "UNU rel-sock") \
EM(rxrpc_local_use_conn_work, "USE conn-wrk") \
EM(rxrpc_local_use_lookup, "USE lookup ") \
E_(rxrpc_local_use_peer_keepalive, "USE peer-kpa")
#define rxrpc_peer_traces \
EM(rxrpc_peer_free, "FREE ") \
EM(rxrpc_peer_get_accept, "GET accept ") \
EM(rxrpc_peer_get_bundle, "GET bundle ") \
EM(rxrpc_peer_get_client_conn, "GET cln-conn") \
EM(rxrpc_peer_get_input, "GET input ") \
EM(rxrpc_peer_get_input_error, "GET inpt-err") \
EM(rxrpc_peer_get_keepalive, "GET keepaliv") \
EM(rxrpc_peer_get_lookup_client, "GET look-cln") \
EM(rxrpc_peer_get_service_conn, "GET srv-conn") \
EM(rxrpc_peer_new_client, "NEW client ") \
EM(rxrpc_peer_new_prealloc, "NEW prealloc") \
EM(rxrpc_peer_put_bundle, "PUT bundle ") \
EM(rxrpc_peer_put_call, "PUT call ") \
EM(rxrpc_peer_put_conn, "PUT conn ") \
EM(rxrpc_peer_put_input, "PUT input ") \
EM(rxrpc_peer_put_input_error, "PUT inpt-err") \
E_(rxrpc_peer_put_keepalive, "PUT keepaliv")
#define rxrpc_bundle_traces \
EM(rxrpc_bundle_free, "FREE ") \
EM(rxrpc_bundle_get_client_call, "GET clt-call") \
EM(rxrpc_bundle_get_client_conn, "GET clt-conn") \
EM(rxrpc_bundle_get_service_conn, "GET svc-conn") \
EM(rxrpc_bundle_put_call, "PUT call ") \
EM(rxrpc_bundle_put_conn, "PUT conn ") \
EM(rxrpc_bundle_put_discard, "PUT discard ") \
E_(rxrpc_bundle_new, "NEW ")
#define rxrpc_conn_traces \
EM(rxrpc_conn_free, "FREE ") \
EM(rxrpc_conn_get_activate_call, "GET act-call") \
EM(rxrpc_conn_get_call_input, "GET inp-call") \
EM(rxrpc_conn_get_conn_input, "GET inp-conn") \
EM(rxrpc_conn_get_idle, "GET idle ") \
EM(rxrpc_conn_get_poke_abort, "GET pk-abort") \
EM(rxrpc_conn_get_poke_timer, "GET poke ") \
EM(rxrpc_conn_get_service_conn, "GET svc-conn") \
EM(rxrpc_conn_new_client, "NEW client ") \
EM(rxrpc_conn_new_service, "NEW service ") \
EM(rxrpc_conn_put_call, "PUT call ") \
EM(rxrpc_conn_put_call_input, "PUT inp-call") \
EM(rxrpc_conn_put_conn_input, "PUT inp-conn") \
EM(rxrpc_conn_put_discard_idle, "PUT disc-idl") \
EM(rxrpc_conn_put_local_dead, "PUT loc-dead") \
EM(rxrpc_conn_put_noreuse, "PUT noreuse ") \
EM(rxrpc_conn_put_poke, "PUT poke ") \
rxrpc: Don't hold a ref for connection workqueue Currently, rxrpc gives the connection's work item a ref on the connection when it queues it - and this is called from the timer expiration function. The problem comes when queue_work() fails (ie. the work item is already queued): the timer routine must put the ref - but this may cause the cleanup code to run. This has the unfortunate effect that the cleanup code may then be run in softirq context - which means that any spinlocks it might need to touch have to be guarded to disable softirqs (ie. they need a "_bh" suffix). (1) Don't give a ref to the work item. (2) Simplify handling of service connections by adding a separate active count so that the refcount isn't also used for this. (3) Connection destruction for both client and service connections can then be cleaned up by putting rxrpc_put_connection() out of line and making a tidy progression through the destruction code (offloaded to a workqueue if put from softirq or processor function context). The RCU part of the cleanup then only deals with the freeing at the end. (4) Make rxrpc_queue_conn() return immediately if it sees the active count is -1 rather then queuing the connection. (5) Make sure that the cleanup routine waits for the work item to complete. (6) Stash the rxrpc_net pointer in the conn struct so that the rcu free routine can use it, even if the local endpoint has been freed. Unfortunately, neither the timer nor the work item can simply get around the problem by just using refcount_inc_not_zero() as the waits would still have to be done, and there would still be the possibility of having to put the ref in the expiration function. Note the connection work item is mostly going to go away with the main event work being transferred to the I/O thread, so the wait in (6) will become obsolete. Signed-off-by: David Howells <dhowells@redhat.com> cc: Marc Dionne <marc.dionne@auristor.com> cc: linux-afs@lists.infradead.org
2022-11-25 20:43:50 +08:00
EM(rxrpc_conn_put_service_reaped, "PUT svc-reap") \
EM(rxrpc_conn_put_unbundle, "PUT unbundle") \
EM(rxrpc_conn_put_unidle, "PUT unidle ") \
EM(rxrpc_conn_put_work, "PUT work ") \
rxrpc: Don't hold a ref for connection workqueue Currently, rxrpc gives the connection's work item a ref on the connection when it queues it - and this is called from the timer expiration function. The problem comes when queue_work() fails (ie. the work item is already queued): the timer routine must put the ref - but this may cause the cleanup code to run. This has the unfortunate effect that the cleanup code may then be run in softirq context - which means that any spinlocks it might need to touch have to be guarded to disable softirqs (ie. they need a "_bh" suffix). (1) Don't give a ref to the work item. (2) Simplify handling of service connections by adding a separate active count so that the refcount isn't also used for this. (3) Connection destruction for both client and service connections can then be cleaned up by putting rxrpc_put_connection() out of line and making a tidy progression through the destruction code (offloaded to a workqueue if put from softirq or processor function context). The RCU part of the cleanup then only deals with the freeing at the end. (4) Make rxrpc_queue_conn() return immediately if it sees the active count is -1 rather then queuing the connection. (5) Make sure that the cleanup routine waits for the work item to complete. (6) Stash the rxrpc_net pointer in the conn struct so that the rcu free routine can use it, even if the local endpoint has been freed. Unfortunately, neither the timer nor the work item can simply get around the problem by just using refcount_inc_not_zero() as the waits would still have to be done, and there would still be the possibility of having to put the ref in the expiration function. Note the connection work item is mostly going to go away with the main event work being transferred to the I/O thread, so the wait in (6) will become obsolete. Signed-off-by: David Howells <dhowells@redhat.com> cc: Marc Dionne <marc.dionne@auristor.com> cc: linux-afs@lists.infradead.org
2022-11-25 20:43:50 +08:00
EM(rxrpc_conn_queue_challenge, "QUE chall ") \
EM(rxrpc_conn_queue_retry_work, "QUE retry-wk") \
rxrpc: Don't hold a ref for connection workqueue Currently, rxrpc gives the connection's work item a ref on the connection when it queues it - and this is called from the timer expiration function. The problem comes when queue_work() fails (ie. the work item is already queued): the timer routine must put the ref - but this may cause the cleanup code to run. This has the unfortunate effect that the cleanup code may then be run in softirq context - which means that any spinlocks it might need to touch have to be guarded to disable softirqs (ie. they need a "_bh" suffix). (1) Don't give a ref to the work item. (2) Simplify handling of service connections by adding a separate active count so that the refcount isn't also used for this. (3) Connection destruction for both client and service connections can then be cleaned up by putting rxrpc_put_connection() out of line and making a tidy progression through the destruction code (offloaded to a workqueue if put from softirq or processor function context). The RCU part of the cleanup then only deals with the freeing at the end. (4) Make rxrpc_queue_conn() return immediately if it sees the active count is -1 rather then queuing the connection. (5) Make sure that the cleanup routine waits for the work item to complete. (6) Stash the rxrpc_net pointer in the conn struct so that the rcu free routine can use it, even if the local endpoint has been freed. Unfortunately, neither the timer nor the work item can simply get around the problem by just using refcount_inc_not_zero() as the waits would still have to be done, and there would still be the possibility of having to put the ref in the expiration function. Note the connection work item is mostly going to go away with the main event work being transferred to the I/O thread, so the wait in (6) will become obsolete. Signed-off-by: David Howells <dhowells@redhat.com> cc: Marc Dionne <marc.dionne@auristor.com> cc: linux-afs@lists.infradead.org
2022-11-25 20:43:50 +08:00
EM(rxrpc_conn_queue_rx_work, "QUE rx-work ") \
EM(rxrpc_conn_see_new_service_conn, "SEE new-svc ") \
EM(rxrpc_conn_see_reap_service, "SEE reap-svc") \
E_(rxrpc_conn_see_work, "SEE work ")
#define rxrpc_client_traces \
EM(rxrpc_client_activate_chans, "Activa") \
EM(rxrpc_client_alloc, "Alloc ") \
EM(rxrpc_client_chan_activate, "ChActv") \
EM(rxrpc_client_chan_disconnect, "ChDisc") \
EM(rxrpc_client_chan_pass, "ChPass") \
EM(rxrpc_client_cleanup, "Clean ") \
EM(rxrpc_client_discard, "Discar") \
EM(rxrpc_client_exposed, "Expose") \
EM(rxrpc_client_replace, "Replac") \
EM(rxrpc_client_queue_new_call, "Q-Call") \
EM(rxrpc_client_to_active, "->Actv") \
rxrpc: Rewrite the client connection manager Rewrite the rxrpc client connection manager so that it can support multiple connections for a given security key to a peer. The following changes are made: (1) For each open socket, the code currently maintains an rbtree with the connections placed into it, keyed by communications parameters. This is tricky to maintain as connections can be culled from the tree or replaced within it. Connections can require replacement for a number of reasons, e.g. their IDs span too great a range for the IDR data type to represent efficiently, the call ID numbers on that conn would overflow or the conn got aborted. This is changed so that there's now a connection bundle object placed in the tree, keyed on the same parameters. The bundle, however, does not need to be replaced. (2) An rxrpc_bundle object can now manage the available channels for a set of parallel connections. The lock that manages this is moved there from the rxrpc_connection struct (channel_lock). (3) There'a a dummy bundle for all incoming connections to share so that they have a channel_lock too. It might be better to give each incoming connection its own bundle. This bundle is not needed to manage which channels incoming calls are made on because that's the solely at whim of the client. (4) The restrictions on how many client connections are around are removed. Instead, a previous patch limits the number of client calls that can be allocated. Ordinarily, client connections are reaped after 2 minutes on the idle queue, but when more than a certain number of connections are in existence, the reaper starts reaping them after 2s of idleness instead to get the numbers back down. It could also be made such that new call allocations are forced to wait until the number of outstanding connections subsides. Signed-off-by: David Howells <dhowells@redhat.com>
2020-07-01 18:15:32 +08:00
E_(rxrpc_client_to_idle, "->Idle")
#define rxrpc_call_traces \
EM(rxrpc_call_get_io_thread, "GET iothread") \
EM(rxrpc_call_get_input, "GET input ") \
EM(rxrpc_call_get_kernel_service, "GET krnl-srv") \
EM(rxrpc_call_get_notify_socket, "GET notify ") \
EM(rxrpc_call_get_poke, "GET poke ") \
EM(rxrpc_call_get_recvmsg, "GET recvmsg ") \
EM(rxrpc_call_get_release_sock, "GET rel-sock") \
EM(rxrpc_call_get_sendmsg, "GET sendmsg ") \
EM(rxrpc_call_get_userid, "GET user-id ") \
EM(rxrpc_call_new_client, "NEW client ") \
EM(rxrpc_call_new_prealloc_service, "NEW prealloc") \
EM(rxrpc_call_put_discard_prealloc, "PUT disc-pre") \
EM(rxrpc_call_put_discard_error, "PUT disc-err") \
EM(rxrpc_call_put_io_thread, "PUT iothread") \
EM(rxrpc_call_put_input, "PUT input ") \
EM(rxrpc_call_put_kernel, "PUT kernel ") \
EM(rxrpc_call_put_poke, "PUT poke ") \
EM(rxrpc_call_put_recvmsg, "PUT recvmsg ") \
EM(rxrpc_call_put_release_sock, "PUT rls-sock") \
EM(rxrpc_call_put_release_sock_tba, "PUT rls-sk-a") \
EM(rxrpc_call_put_sendmsg, "PUT sendmsg ") \
EM(rxrpc_call_put_unnotify, "PUT unnotify") \
EM(rxrpc_call_put_userid_exists, "PUT u-exists") \
EM(rxrpc_call_put_userid, "PUT user-id ") \
EM(rxrpc_call_see_accept, "SEE accept ") \
EM(rxrpc_call_see_activate_client, "SEE act-clnt") \
EM(rxrpc_call_see_connect_failed, "SEE con-fail") \
EM(rxrpc_call_see_connected, "SEE connect ") \
EM(rxrpc_call_see_disconnected, "SEE disconn ") \
EM(rxrpc_call_see_distribute_error, "SEE dist-err") \
EM(rxrpc_call_see_input, "SEE input ") \
EM(rxrpc_call_see_release, "SEE release ") \
EM(rxrpc_call_see_userid_exists, "SEE u-exists") \
E_(rxrpc_call_see_zap, "SEE zap ")
rxrpc: Don't use a ring buffer for call Tx queue Change the way the Tx queueing works to make the following ends easier to achieve: (1) The filling of packets, the encryption of packets and the transmission of packets can be handled in parallel by separate threads, rather than rxrpc_sendmsg() allocating, filling, encrypting and transmitting each packet before moving onto the next one. (2) Get rid of the fixed-size ring which sets a hard limit on the number of packets that can be retained in the ring. This allows the number of packets to increase without having to allocate a very large ring or having variable-sized rings. [Note: the downside of this is that it's then less efficient to locate a packet for retransmission as we then have to step through a list and examine each buffer in the list.] (3) Allow the filler/encrypter to run ahead of the transmission window. (4) Make it easier to do zero copy UDP from the packet buffers. (5) Make it easier to do zero copy from userspace to the packet buffers - and thence to UDP (only if for unauthenticated connections). To that end, the following changes are made: (1) Use the new rxrpc_txbuf struct instead of sk_buff for keeping packets to be transmitted in. This allows them to be placed on multiple queues simultaneously. An sk_buff isn't really necessary as it's never passed on to lower-level networking code. (2) Keep the transmissable packets in a linked list on the call struct rather than in a ring. As a consequence, the annotation buffer isn't used either; rather a flag is set on the packet to indicate ackedness. (3) Use the RXRPC_CALL_TX_LAST flag to indicate that the last packet to be transmitted has been queued. Add RXRPC_CALL_TX_ALL_ACKED to indicate that all packets up to and including the last got hard acked. (4) Wire headers are now stored in the txbuf rather than being concocted on the stack and they're stored immediately before the data, thereby allowing zerocopy of a single span. (5) Don't bother with instant-resend on transmission failure; rather, leave it for a timer or an ACK packet to trigger. Signed-off-by: David Howells <dhowells@redhat.com> cc: Marc Dionne <marc.dionne@auristor.com> cc: linux-afs@lists.infradead.org
2022-04-01 06:55:08 +08:00
#define rxrpc_txqueue_traces \
EM(rxrpc_txqueue_await_reply, "AWR") \
EM(rxrpc_txqueue_dequeue, "DEQ") \
EM(rxrpc_txqueue_end, "END") \
EM(rxrpc_txqueue_queue, "QUE") \
EM(rxrpc_txqueue_queue_last, "QLS") \
EM(rxrpc_txqueue_rotate, "ROT") \
EM(rxrpc_txqueue_rotate_last, "RLS") \
E_(rxrpc_txqueue_wait, "WAI")
#define rxrpc_receive_traces \
EM(rxrpc_receive_end, "END") \
EM(rxrpc_receive_front, "FRN") \
EM(rxrpc_receive_incoming, "INC") \
EM(rxrpc_receive_queue, "QUE") \
EM(rxrpc_receive_queue_last, "QLS") \
EM(rxrpc_receive_queue_oos, "QUO") \
EM(rxrpc_receive_queue_oos_last, "QOL") \
EM(rxrpc_receive_oos, "OOS") \
EM(rxrpc_receive_oos_last, "OSL") \
EM(rxrpc_receive_rotate, "ROT") \
E_(rxrpc_receive_rotate_last, "RLS")
#define rxrpc_recvmsg_traces \
EM(rxrpc_recvmsg_cont, "CONT") \
EM(rxrpc_recvmsg_data_return, "DATA") \
EM(rxrpc_recvmsg_dequeue, "DEQU") \
EM(rxrpc_recvmsg_enter, "ENTR") \
EM(rxrpc_recvmsg_full, "FULL") \
EM(rxrpc_recvmsg_hole, "HOLE") \
EM(rxrpc_recvmsg_next, "NEXT") \
rxrpc: Fix deadlock between call creation and sendmsg/recvmsg All the routines by which rxrpc is accessed from the outside are serialised by means of the socket lock (sendmsg, recvmsg, bind, rxrpc_kernel_begin_call(), ...) and this presents a problem: (1) If a number of calls on the same socket are in the process of connection to the same peer, a maximum of four concurrent live calls are permitted before further calls need to wait for a slot. (2) If a call is waiting for a slot, it is deep inside sendmsg() or rxrpc_kernel_begin_call() and the entry function is holding the socket lock. (3) sendmsg() and recvmsg() or the in-kernel equivalents are prevented from servicing the other calls as they need to take the socket lock to do so. (4) The socket is stuck until a call is aborted and makes its slot available to the waiter. Fix this by: (1) Provide each call with a mutex ('user_mutex') that arbitrates access by the users of rxrpc separately for each specific call. (2) Make rxrpc_sendmsg() and rxrpc_recvmsg() unlock the socket as soon as they've got a call and taken its mutex. Note that I'm returning EWOULDBLOCK from recvmsg() if MSG_DONTWAIT is set but someone else has the lock. Should I instead only return EWOULDBLOCK if there's nothing currently to be done on a socket, and sleep in this particular instance because there is something to be done, but we appear to be blocked by the interrupt handler doing its ping? (3) Make rxrpc_new_client_call() unlock the socket after allocating a new call, locking its user mutex and adding it to the socket's call tree. The call is returned locked so that sendmsg() can add data to it immediately. From the moment the call is in the socket tree, it is subject to access by sendmsg() and recvmsg() - even if it isn't connected yet. (4) Lock new service calls in the UDP data_ready handler (in rxrpc_new_incoming_call()) because they may already be in the socket's tree and the data_ready handler makes them live immediately if a user ID has already been preassigned. Note that the new call is locked before any notifications are sent that it is live, so doing mutex_trylock() *ought* to always succeed. Userspace is prevented from doing sendmsg() on calls that are in a too-early state in rxrpc_do_sendmsg(). (5) Make rxrpc_new_incoming_call() return the call with the user mutex held so that a ping can be scheduled immediately under it. Note that it might be worth moving the ping call into rxrpc_new_incoming_call() and then we can drop the mutex there. (6) Make rxrpc_accept_call() take the lock on the call it is accepting and release the socket after adding the call to the socket's tree. This is slightly tricky as we've dequeued the call by that point and have to requeue it. Note that requeuing emits a trace event. (7) Make rxrpc_kernel_send_data() and rxrpc_kernel_recv_data() take the new mutex immediately and don't bother with the socket mutex at all. This patch has the nice bonus that calls on the same socket are now to some extent parallelisable. Note that we might want to move rxrpc_service_prealloc() calls out from the socket lock and give it its own lock, so that we don't hang progress in other calls because we're waiting for the allocator. We probably also want to avoid calling rxrpc_notify_socket() from within the socket lock (rxrpc_accept_call()). Signed-off-by: David Howells <dhowells@redhat.com> Tested-by: Marc Dionne <marc.c.dionne@auristor.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-02-27 23:43:06 +08:00
EM(rxrpc_recvmsg_requeue, "REQU") \
EM(rxrpc_recvmsg_return, "RETN") \
EM(rxrpc_recvmsg_terminal, "TERM") \
EM(rxrpc_recvmsg_to_be_accepted, "TBAC") \
EM(rxrpc_recvmsg_unqueue, "UNQU") \
E_(rxrpc_recvmsg_wait, "WAIT")
#define rxrpc_rtt_tx_traces \
rxrpc: Fix loss of RTT samples due to interposed ACK The Rx protocol has a mechanism to help generate RTT samples that works by a client transmitting a REQUESTED-type ACK when it receives a DATA packet that has the REQUEST_ACK flag set. The peer, however, may interpose other ACKs before transmitting the REQUESTED-ACK, as can be seen in the following trace excerpt: rxrpc_tx_data: c=00000044 DATA d0b5ece8:00000001 00000001 q=00000001 fl=07 rxrpc_rx_ack: c=00000044 00000001 PNG r=00000000 f=00000002 p=00000000 n=0 rxrpc_rx_ack: c=00000044 00000002 REQ r=00000001 f=00000002 p=00000001 n=0 ... DATA packet 1 (q=xx) has REQUEST_ACK set (bit 1 of fl=xx). The incoming ping (labelled PNG) hard-acks the request DATA packet (f=xx exceeds the sequence number of the DATA packet), causing it to be discarded from the Tx ring. The ACK that was requested (labelled REQ, r=xx references the serial of the DATA packet) comes after the ping, but the sk_buff holding the timestamp has gone and the RTT sample is lost. This is particularly noticeable on RPC calls used to probe the service offered by the peer. A lot of peers end up with an unknown RTT because we only ever sent a single RPC. This confuses the server rotation algorithm. Fix this by caching the information about the outgoing packet in RTT calculations in the rxrpc_call struct rather than looking in the Tx ring. A four-deep buffer is maintained and both REQUEST_ACK-flagged DATA and PING-ACK transmissions are recorded in there. When the appropriate response ACK is received, the buffer is checked for a match and, if found, an RTT sample is recorded. If a received ACK refers to a packet with a later serial number than an entry in the cache, that entry is presumed lost and the entry is made available to record a new transmission. ACKs types other than REQUESTED-type and PING-type cause any matching sample to be cancelled as they don't necessarily represent a useful measurement. If there's no space in the buffer on ping/data transmission, the sample base is discarded. Fixes: 50235c4b5a2f ("rxrpc: Obtain RTT data by requesting ACKs on DATA packets") Signed-off-by: David Howells <dhowells@redhat.com>
2020-08-20 06:29:16 +08:00
EM(rxrpc_rtt_tx_cancel, "CNCE") \
EM(rxrpc_rtt_tx_data, "DATA") \
rxrpc: Fix loss of RTT samples due to interposed ACK The Rx protocol has a mechanism to help generate RTT samples that works by a client transmitting a REQUESTED-type ACK when it receives a DATA packet that has the REQUEST_ACK flag set. The peer, however, may interpose other ACKs before transmitting the REQUESTED-ACK, as can be seen in the following trace excerpt: rxrpc_tx_data: c=00000044 DATA d0b5ece8:00000001 00000001 q=00000001 fl=07 rxrpc_rx_ack: c=00000044 00000001 PNG r=00000000 f=00000002 p=00000000 n=0 rxrpc_rx_ack: c=00000044 00000002 REQ r=00000001 f=00000002 p=00000001 n=0 ... DATA packet 1 (q=xx) has REQUEST_ACK set (bit 1 of fl=xx). The incoming ping (labelled PNG) hard-acks the request DATA packet (f=xx exceeds the sequence number of the DATA packet), causing it to be discarded from the Tx ring. The ACK that was requested (labelled REQ, r=xx references the serial of the DATA packet) comes after the ping, but the sk_buff holding the timestamp has gone and the RTT sample is lost. This is particularly noticeable on RPC calls used to probe the service offered by the peer. A lot of peers end up with an unknown RTT because we only ever sent a single RPC. This confuses the server rotation algorithm. Fix this by caching the information about the outgoing packet in RTT calculations in the rxrpc_call struct rather than looking in the Tx ring. A four-deep buffer is maintained and both REQUEST_ACK-flagged DATA and PING-ACK transmissions are recorded in there. When the appropriate response ACK is received, the buffer is checked for a match and, if found, an RTT sample is recorded. If a received ACK refers to a packet with a later serial number than an entry in the cache, that entry is presumed lost and the entry is made available to record a new transmission. ACKs types other than REQUESTED-type and PING-type cause any matching sample to be cancelled as they don't necessarily represent a useful measurement. If there's no space in the buffer on ping/data transmission, the sample base is discarded. Fixes: 50235c4b5a2f ("rxrpc: Obtain RTT data by requesting ACKs on DATA packets") Signed-off-by: David Howells <dhowells@redhat.com>
2020-08-20 06:29:16 +08:00
EM(rxrpc_rtt_tx_no_slot, "FULL") \
E_(rxrpc_rtt_tx_ping, "PING")
#define rxrpc_rtt_rx_traces \
EM(rxrpc_rtt_rx_other_ack, "OACK") \
rxrpc: Fix loss of RTT samples due to interposed ACK The Rx protocol has a mechanism to help generate RTT samples that works by a client transmitting a REQUESTED-type ACK when it receives a DATA packet that has the REQUEST_ACK flag set. The peer, however, may interpose other ACKs before transmitting the REQUESTED-ACK, as can be seen in the following trace excerpt: rxrpc_tx_data: c=00000044 DATA d0b5ece8:00000001 00000001 q=00000001 fl=07 rxrpc_rx_ack: c=00000044 00000001 PNG r=00000000 f=00000002 p=00000000 n=0 rxrpc_rx_ack: c=00000044 00000002 REQ r=00000001 f=00000002 p=00000001 n=0 ... DATA packet 1 (q=xx) has REQUEST_ACK set (bit 1 of fl=xx). The incoming ping (labelled PNG) hard-acks the request DATA packet (f=xx exceeds the sequence number of the DATA packet), causing it to be discarded from the Tx ring. The ACK that was requested (labelled REQ, r=xx references the serial of the DATA packet) comes after the ping, but the sk_buff holding the timestamp has gone and the RTT sample is lost. This is particularly noticeable on RPC calls used to probe the service offered by the peer. A lot of peers end up with an unknown RTT because we only ever sent a single RPC. This confuses the server rotation algorithm. Fix this by caching the information about the outgoing packet in RTT calculations in the rxrpc_call struct rather than looking in the Tx ring. A four-deep buffer is maintained and both REQUEST_ACK-flagged DATA and PING-ACK transmissions are recorded in there. When the appropriate response ACK is received, the buffer is checked for a match and, if found, an RTT sample is recorded. If a received ACK refers to a packet with a later serial number than an entry in the cache, that entry is presumed lost and the entry is made available to record a new transmission. ACKs types other than REQUESTED-type and PING-type cause any matching sample to be cancelled as they don't necessarily represent a useful measurement. If there's no space in the buffer on ping/data transmission, the sample base is discarded. Fixes: 50235c4b5a2f ("rxrpc: Obtain RTT data by requesting ACKs on DATA packets") Signed-off-by: David Howells <dhowells@redhat.com>
2020-08-20 06:29:16 +08:00
EM(rxrpc_rtt_rx_obsolete, "OBSL") \
EM(rxrpc_rtt_rx_lost, "LOST") \
EM(rxrpc_rtt_rx_ping_response, "PONG") \
E_(rxrpc_rtt_rx_requested_ack, "RACK")
#define rxrpc_timer_traces \
EM(rxrpc_timer_begin, "Begin ") \
rxrpc: Fix call timeouts Fix the rxrpc call expiration timeouts and make them settable from userspace. By analogy with other rx implementations, there should be three timeouts: (1) "Normal timeout" This is set for all calls and is triggered if we haven't received any packets from the peer in a while. It is measured from the last time we received any packet on that call. This is not reset by any connection packets (such as CHALLENGE/RESPONSE packets). If a service operation takes a long time, the server should generate PING ACKs at a duration that's substantially less than the normal timeout so is to keep both sides alive. This is set at 1/6 of normal timeout. (2) "Idle timeout" This is set only for a service call and is triggered if we stop receiving the DATA packets that comprise the request data. It is measured from the last time we received a DATA packet. (3) "Hard timeout" This can be set for a call and specified the maximum lifetime of that call. It should not be specified by default. Some operations (such as volume transfer) take a long time. Allow userspace to set/change the timeouts on a call with sendmsg, using a control message: RXRPC_SET_CALL_TIMEOUTS The data to the message is a number of 32-bit words, not all of which need be given: u32 hard_timeout; /* sec from first packet */ u32 idle_timeout; /* msec from packet Rx */ u32 normal_timeout; /* msec from data Rx */ This can be set in combination with any other sendmsg() that affects a call. Signed-off-by: David Howells <dhowells@redhat.com>
2017-11-24 18:18:41 +08:00
EM(rxrpc_timer_exp_ack, "ExpAck") \
EM(rxrpc_timer_exp_hard, "ExpHrd") \
EM(rxrpc_timer_exp_idle, "ExpIdl") \
EM(rxrpc_timer_exp_keepalive, "ExpKA ") \
EM(rxrpc_timer_exp_lost_ack, "ExpLoA") \
rxrpc: Fix call timeouts Fix the rxrpc call expiration timeouts and make them settable from userspace. By analogy with other rx implementations, there should be three timeouts: (1) "Normal timeout" This is set for all calls and is triggered if we haven't received any packets from the peer in a while. It is measured from the last time we received any packet on that call. This is not reset by any connection packets (such as CHALLENGE/RESPONSE packets). If a service operation takes a long time, the server should generate PING ACKs at a duration that's substantially less than the normal timeout so is to keep both sides alive. This is set at 1/6 of normal timeout. (2) "Idle timeout" This is set only for a service call and is triggered if we stop receiving the DATA packets that comprise the request data. It is measured from the last time we received a DATA packet. (3) "Hard timeout" This can be set for a call and specified the maximum lifetime of that call. It should not be specified by default. Some operations (such as volume transfer) take a long time. Allow userspace to set/change the timeouts on a call with sendmsg, using a control message: RXRPC_SET_CALL_TIMEOUTS The data to the message is a number of 32-bit words, not all of which need be given: u32 hard_timeout; /* sec from first packet */ u32 idle_timeout; /* msec from packet Rx */ u32 normal_timeout; /* msec from data Rx */ This can be set in combination with any other sendmsg() that affects a call. Signed-off-by: David Howells <dhowells@redhat.com>
2017-11-24 18:18:41 +08:00
EM(rxrpc_timer_exp_normal, "ExpNml") \
EM(rxrpc_timer_exp_ping, "ExpPng") \
EM(rxrpc_timer_exp_resend, "ExpRsn") \
EM(rxrpc_timer_init_for_reply, "IniRpl") \
EM(rxrpc_timer_init_for_send_reply, "SndRpl") \
rxrpc: Fix call timeouts Fix the rxrpc call expiration timeouts and make them settable from userspace. By analogy with other rx implementations, there should be three timeouts: (1) "Normal timeout" This is set for all calls and is triggered if we haven't received any packets from the peer in a while. It is measured from the last time we received any packet on that call. This is not reset by any connection packets (such as CHALLENGE/RESPONSE packets). If a service operation takes a long time, the server should generate PING ACKs at a duration that's substantially less than the normal timeout so is to keep both sides alive. This is set at 1/6 of normal timeout. (2) "Idle timeout" This is set only for a service call and is triggered if we stop receiving the DATA packets that comprise the request data. It is measured from the last time we received a DATA packet. (3) "Hard timeout" This can be set for a call and specified the maximum lifetime of that call. It should not be specified by default. Some operations (such as volume transfer) take a long time. Allow userspace to set/change the timeouts on a call with sendmsg, using a control message: RXRPC_SET_CALL_TIMEOUTS The data to the message is a number of 32-bit words, not all of which need be given: u32 hard_timeout; /* sec from first packet */ u32 idle_timeout; /* msec from packet Rx */ u32 normal_timeout; /* msec from data Rx */ This can be set in combination with any other sendmsg() that affects a call. Signed-off-by: David Howells <dhowells@redhat.com>
2017-11-24 18:18:41 +08:00
EM(rxrpc_timer_restart, "Restrt") \
EM(rxrpc_timer_set_for_ack, "SetAck") \
rxrpc: Fix call timeouts Fix the rxrpc call expiration timeouts and make them settable from userspace. By analogy with other rx implementations, there should be three timeouts: (1) "Normal timeout" This is set for all calls and is triggered if we haven't received any packets from the peer in a while. It is measured from the last time we received any packet on that call. This is not reset by any connection packets (such as CHALLENGE/RESPONSE packets). If a service operation takes a long time, the server should generate PING ACKs at a duration that's substantially less than the normal timeout so is to keep both sides alive. This is set at 1/6 of normal timeout. (2) "Idle timeout" This is set only for a service call and is triggered if we stop receiving the DATA packets that comprise the request data. It is measured from the last time we received a DATA packet. (3) "Hard timeout" This can be set for a call and specified the maximum lifetime of that call. It should not be specified by default. Some operations (such as volume transfer) take a long time. Allow userspace to set/change the timeouts on a call with sendmsg, using a control message: RXRPC_SET_CALL_TIMEOUTS The data to the message is a number of 32-bit words, not all of which need be given: u32 hard_timeout; /* sec from first packet */ u32 idle_timeout; /* msec from packet Rx */ u32 normal_timeout; /* msec from data Rx */ This can be set in combination with any other sendmsg() that affects a call. Signed-off-by: David Howells <dhowells@redhat.com>
2017-11-24 18:18:41 +08:00
EM(rxrpc_timer_set_for_hard, "SetHrd") \
EM(rxrpc_timer_set_for_idle, "SetIdl") \
EM(rxrpc_timer_set_for_keepalive, "KeepAl") \
EM(rxrpc_timer_set_for_lost_ack, "SetLoA") \
rxrpc: Fix call timeouts Fix the rxrpc call expiration timeouts and make them settable from userspace. By analogy with other rx implementations, there should be three timeouts: (1) "Normal timeout" This is set for all calls and is triggered if we haven't received any packets from the peer in a while. It is measured from the last time we received any packet on that call. This is not reset by any connection packets (such as CHALLENGE/RESPONSE packets). If a service operation takes a long time, the server should generate PING ACKs at a duration that's substantially less than the normal timeout so is to keep both sides alive. This is set at 1/6 of normal timeout. (2) "Idle timeout" This is set only for a service call and is triggered if we stop receiving the DATA packets that comprise the request data. It is measured from the last time we received a DATA packet. (3) "Hard timeout" This can be set for a call and specified the maximum lifetime of that call. It should not be specified by default. Some operations (such as volume transfer) take a long time. Allow userspace to set/change the timeouts on a call with sendmsg, using a control message: RXRPC_SET_CALL_TIMEOUTS The data to the message is a number of 32-bit words, not all of which need be given: u32 hard_timeout; /* sec from first packet */ u32 idle_timeout; /* msec from packet Rx */ u32 normal_timeout; /* msec from data Rx */ This can be set in combination with any other sendmsg() that affects a call. Signed-off-by: David Howells <dhowells@redhat.com>
2017-11-24 18:18:41 +08:00
EM(rxrpc_timer_set_for_normal, "SetNml") \
EM(rxrpc_timer_set_for_ping, "SetPng") \
EM(rxrpc_timer_set_for_resend, "SetRTx") \
rxrpc: Fix call timeouts Fix the rxrpc call expiration timeouts and make them settable from userspace. By analogy with other rx implementations, there should be three timeouts: (1) "Normal timeout" This is set for all calls and is triggered if we haven't received any packets from the peer in a while. It is measured from the last time we received any packet on that call. This is not reset by any connection packets (such as CHALLENGE/RESPONSE packets). If a service operation takes a long time, the server should generate PING ACKs at a duration that's substantially less than the normal timeout so is to keep both sides alive. This is set at 1/6 of normal timeout. (2) "Idle timeout" This is set only for a service call and is triggered if we stop receiving the DATA packets that comprise the request data. It is measured from the last time we received a DATA packet. (3) "Hard timeout" This can be set for a call and specified the maximum lifetime of that call. It should not be specified by default. Some operations (such as volume transfer) take a long time. Allow userspace to set/change the timeouts on a call with sendmsg, using a control message: RXRPC_SET_CALL_TIMEOUTS The data to the message is a number of 32-bit words, not all of which need be given: u32 hard_timeout; /* sec from first packet */ u32 idle_timeout; /* msec from packet Rx */ u32 normal_timeout; /* msec from data Rx */ This can be set in combination with any other sendmsg() that affects a call. Signed-off-by: David Howells <dhowells@redhat.com>
2017-11-24 18:18:41 +08:00
E_(rxrpc_timer_set_for_send, "SetSnd")
#define rxrpc_propose_ack_traces \
EM(rxrpc_propose_ack_client_tx_end, "ClTxEnd") \
EM(rxrpc_propose_ack_input_data, "DataIn ") \
EM(rxrpc_propose_ack_input_data_hole, "DataInH") \
EM(rxrpc_propose_ack_ping_for_keepalive, "KeepAlv") \
EM(rxrpc_propose_ack_ping_for_lost_ack, "LostAck") \
EM(rxrpc_propose_ack_ping_for_lost_reply, "LostRpl") \
EM(rxrpc_propose_ack_ping_for_old_rtt, "OldRtt ") \
EM(rxrpc_propose_ack_ping_for_params, "Params ") \
EM(rxrpc_propose_ack_ping_for_rtt, "Rtt ") \
EM(rxrpc_propose_ack_processing_op, "ProcOp ") \
EM(rxrpc_propose_ack_respond_to_ack, "Rsp2Ack") \
EM(rxrpc_propose_ack_respond_to_ping, "Rsp2Png") \
EM(rxrpc_propose_ack_retry_tx, "RetryTx") \
EM(rxrpc_propose_ack_rotate_rx, "RxAck ") \
EM(rxrpc_propose_ack_rx_idle, "RxIdle ") \
E_(rxrpc_propose_ack_terminal_ack, "ClTerm ")
#define rxrpc_congest_modes \
EM(RXRPC_CALL_CONGEST_AVOIDANCE, "CongAvoid") \
EM(RXRPC_CALL_FAST_RETRANSMIT, "FastReTx ") \
EM(RXRPC_CALL_PACKET_LOSS, "PktLoss ") \
E_(RXRPC_CALL_SLOW_START, "SlowStart")
#define rxrpc_congest_changes \
EM(rxrpc_cong_begin_retransmission, " Retrans") \
EM(rxrpc_cong_cleared_nacks, " Cleared") \
EM(rxrpc_cong_new_low_nack, " NewLowN") \
EM(rxrpc_cong_no_change, " -") \
EM(rxrpc_cong_progress, " Progres") \
EM(rxrpc_cong_idle_reset, " IdleRes") \
EM(rxrpc_cong_retransmit_again, " ReTxAgn") \
EM(rxrpc_cong_rtt_window_end, " RttWinE") \
E_(rxrpc_cong_saw_nack, " SawNack")
#define rxrpc_pkts \
EM(0, "?00") \
EM(RXRPC_PACKET_TYPE_DATA, "DATA") \
EM(RXRPC_PACKET_TYPE_ACK, "ACK") \
EM(RXRPC_PACKET_TYPE_BUSY, "BUSY") \
EM(RXRPC_PACKET_TYPE_ABORT, "ABORT") \
EM(RXRPC_PACKET_TYPE_ACKALL, "ACKALL") \
EM(RXRPC_PACKET_TYPE_CHALLENGE, "CHALL") \
EM(RXRPC_PACKET_TYPE_RESPONSE, "RESP") \
EM(RXRPC_PACKET_TYPE_DEBUG, "DEBUG") \
EM(9, "?09") \
EM(10, "?10") \
EM(11, "?11") \
EM(12, "?12") \
EM(RXRPC_PACKET_TYPE_VERSION, "VERSION") \
EM(14, "?14") \
E_(15, "?15")
#define rxrpc_ack_names \
EM(0, "-0-") \
EM(RXRPC_ACK_REQUESTED, "REQ") \
EM(RXRPC_ACK_DUPLICATE, "DUP") \
EM(RXRPC_ACK_OUT_OF_SEQUENCE, "OOS") \
EM(RXRPC_ACK_EXCEEDS_WINDOW, "WIN") \
EM(RXRPC_ACK_NOSPACE, "MEM") \
EM(RXRPC_ACK_PING, "PNG") \
EM(RXRPC_ACK_PING_RESPONSE, "PNR") \
EM(RXRPC_ACK_DELAY, "DLY") \
EM(RXRPC_ACK_IDLE, "IDL") \
E_(RXRPC_ACK__INVALID, "-?-")
#define rxrpc_sack_traces \
EM(rxrpc_sack_advance, "ADV") \
EM(rxrpc_sack_fill, "FIL") \
EM(rxrpc_sack_nack, "NAK") \
EM(rxrpc_sack_none, "---") \
E_(rxrpc_sack_oos, "OOS")
#define rxrpc_completions \
EM(RXRPC_CALL_SUCCEEDED, "Succeeded") \
EM(RXRPC_CALL_REMOTELY_ABORTED, "RemoteAbort") \
EM(RXRPC_CALL_LOCALLY_ABORTED, "LocalAbort") \
EM(RXRPC_CALL_LOCAL_ERROR, "LocalError") \
E_(RXRPC_CALL_NETWORK_ERROR, "NetError")
#define rxrpc_tx_points \
EM(rxrpc_tx_point_call_abort, "CallAbort") \
EM(rxrpc_tx_point_call_ack, "CallAck") \
EM(rxrpc_tx_point_call_data_frag, "CallDataFrag") \
EM(rxrpc_tx_point_call_data_nofrag, "CallDataNofrag") \
EM(rxrpc_tx_point_call_final_resend, "CallFinalResend") \
EM(rxrpc_tx_point_conn_abort, "ConnAbort") \
EM(rxrpc_tx_point_reject, "Reject") \
EM(rxrpc_tx_point_rxkad_challenge, "RxkadChall") \
EM(rxrpc_tx_point_rxkad_response, "RxkadResp") \
EM(rxrpc_tx_point_version_keepalive, "VerKeepalive") \
E_(rxrpc_tx_point_version_reply, "VerReply")
#define rxrpc_req_ack_traces \
EM(rxrpc_reqack_ack_lost, "ACK-LOST ") \
EM(rxrpc_reqack_already_on, "ALREADY-ON") \
EM(rxrpc_reqack_more_rtt, "MORE-RTT ") \
EM(rxrpc_reqack_no_srv_last, "NO-SRVLAST") \
EM(rxrpc_reqack_old_rtt, "OLD-RTT ") \
EM(rxrpc_reqack_retrans, "RETRANS ") \
EM(rxrpc_reqack_slow_start, "SLOW-START") \
E_(rxrpc_reqack_small_txwin, "SMALL-TXWN")
/* ---- Must update size of stat_why_req_ack[] if more are added! */
#define rxrpc_txbuf_traces \
EM(rxrpc_txbuf_alloc_ack, "ALLOC ACK ") \
EM(rxrpc_txbuf_alloc_data, "ALLOC DATA ") \
EM(rxrpc_txbuf_free, "FREE ") \
rxrpc: Don't use a ring buffer for call Tx queue Change the way the Tx queueing works to make the following ends easier to achieve: (1) The filling of packets, the encryption of packets and the transmission of packets can be handled in parallel by separate threads, rather than rxrpc_sendmsg() allocating, filling, encrypting and transmitting each packet before moving onto the next one. (2) Get rid of the fixed-size ring which sets a hard limit on the number of packets that can be retained in the ring. This allows the number of packets to increase without having to allocate a very large ring or having variable-sized rings. [Note: the downside of this is that it's then less efficient to locate a packet for retransmission as we then have to step through a list and examine each buffer in the list.] (3) Allow the filler/encrypter to run ahead of the transmission window. (4) Make it easier to do zero copy UDP from the packet buffers. (5) Make it easier to do zero copy from userspace to the packet buffers - and thence to UDP (only if for unauthenticated connections). To that end, the following changes are made: (1) Use the new rxrpc_txbuf struct instead of sk_buff for keeping packets to be transmitted in. This allows them to be placed on multiple queues simultaneously. An sk_buff isn't really necessary as it's never passed on to lower-level networking code. (2) Keep the transmissable packets in a linked list on the call struct rather than in a ring. As a consequence, the annotation buffer isn't used either; rather a flag is set on the packet to indicate ackedness. (3) Use the RXRPC_CALL_TX_LAST flag to indicate that the last packet to be transmitted has been queued. Add RXRPC_CALL_TX_ALL_ACKED to indicate that all packets up to and including the last got hard acked. (4) Wire headers are now stored in the txbuf rather than being concocted on the stack and they're stored immediately before the data, thereby allowing zerocopy of a single span. (5) Don't bother with instant-resend on transmission failure; rather, leave it for a timer or an ACK packet to trigger. Signed-off-by: David Howells <dhowells@redhat.com> cc: Marc Dionne <marc.dionne@auristor.com> cc: linux-afs@lists.infradead.org
2022-04-01 06:55:08 +08:00
EM(rxrpc_txbuf_get_buffer, "GET BUFFER ") \
EM(rxrpc_txbuf_get_trans, "GET TRANS ") \
EM(rxrpc_txbuf_get_retrans, "GET RETRANS") \
EM(rxrpc_txbuf_put_ack_tx, "PUT ACK TX ") \
EM(rxrpc_txbuf_put_cleaned, "PUT CLEANED") \
EM(rxrpc_txbuf_put_nomem, "PUT NOMEM ") \
EM(rxrpc_txbuf_put_rotated, "PUT ROTATED") \
EM(rxrpc_txbuf_put_send_aborted, "PUT SEND-X ") \
rxrpc: Don't use a ring buffer for call Tx queue Change the way the Tx queueing works to make the following ends easier to achieve: (1) The filling of packets, the encryption of packets and the transmission of packets can be handled in parallel by separate threads, rather than rxrpc_sendmsg() allocating, filling, encrypting and transmitting each packet before moving onto the next one. (2) Get rid of the fixed-size ring which sets a hard limit on the number of packets that can be retained in the ring. This allows the number of packets to increase without having to allocate a very large ring or having variable-sized rings. [Note: the downside of this is that it's then less efficient to locate a packet for retransmission as we then have to step through a list and examine each buffer in the list.] (3) Allow the filler/encrypter to run ahead of the transmission window. (4) Make it easier to do zero copy UDP from the packet buffers. (5) Make it easier to do zero copy from userspace to the packet buffers - and thence to UDP (only if for unauthenticated connections). To that end, the following changes are made: (1) Use the new rxrpc_txbuf struct instead of sk_buff for keeping packets to be transmitted in. This allows them to be placed on multiple queues simultaneously. An sk_buff isn't really necessary as it's never passed on to lower-level networking code. (2) Keep the transmissable packets in a linked list on the call struct rather than in a ring. As a consequence, the annotation buffer isn't used either; rather a flag is set on the packet to indicate ackedness. (3) Use the RXRPC_CALL_TX_LAST flag to indicate that the last packet to be transmitted has been queued. Add RXRPC_CALL_TX_ALL_ACKED to indicate that all packets up to and including the last got hard acked. (4) Wire headers are now stored in the txbuf rather than being concocted on the stack and they're stored immediately before the data, thereby allowing zerocopy of a single span. (5) Don't bother with instant-resend on transmission failure; rather, leave it for a timer or an ACK packet to trigger. Signed-off-by: David Howells <dhowells@redhat.com> cc: Marc Dionne <marc.dionne@auristor.com> cc: linux-afs@lists.infradead.org
2022-04-01 06:55:08 +08:00
EM(rxrpc_txbuf_put_trans, "PUT TRANS ") \
rxrpc: Don't hold a ref for call timer or workqueue Currently, rxrpc gives the call timer a ref on the call when it starts it and this is passed along to the workqueue by the timer expiration function. The problem comes when queue_work() fails (ie. the work item is already queued): the timer routine must put the ref - but this may cause the cleanup code to run. This has the unfortunate effect that the cleanup code may then be run in softirq context - which means that any spinlocks it might need to touch have to be guarded to disable softirqs (ie. they need a "_bh" suffix). Fix this by: (1) Don't give a ref to the timer. (2) Making the expiration function not do anything if the refcount is 0. Note that this is more of an optimisation. (3) Make sure that the cleanup routine waits for timer to complete. However, this has a consequence that timer cannot give a ref to the work item. Therefore the following fixes are also necessary: (4) Don't give a ref to the work item. (5) Make the work item return asap if it sees the ref count is 0. (6) Make sure that the cleanup routine waits for the work item to complete. Unfortunately, neither the timer nor the work item can simply get around the problem by just using refcount_inc_not_zero() as the waits would still have to be done, and there would still be the possibility of having to put the ref in the expiration function. Note the call work item is going to go away with the work being transferred to the I/O thread, so the wait in (6) will become obsolete. Signed-off-by: David Howells <dhowells@redhat.com> cc: Marc Dionne <marc.dionne@auristor.com> cc: linux-afs@lists.infradead.org
2022-11-25 17:00:55 +08:00
EM(rxrpc_txbuf_see_out_of_step, "OUT-OF-STEP") \
EM(rxrpc_txbuf_see_send_more, "SEE SEND+ ") \
E_(rxrpc_txbuf_see_unacked, "SEE UNACKED")
/*
* Generate enums for tracing information.
*/
#ifndef __NETFS_DECLARE_TRACE_ENUMS_ONCE_ONLY
#define __NETFS_DECLARE_TRACE_ENUMS_ONCE_ONLY
#undef EM
#undef E_
#define EM(a, b) a,
#define E_(a, b) a
enum rxrpc_abort_reason { rxrpc_abort_reasons } __mode(byte);
enum rxrpc_bundle_trace { rxrpc_bundle_traces } __mode(byte);
enum rxrpc_call_poke_trace { rxrpc_call_poke_traces } __mode(byte);
enum rxrpc_call_trace { rxrpc_call_traces } __mode(byte);
enum rxrpc_client_trace { rxrpc_client_traces } __mode(byte);
enum rxrpc_congest_change { rxrpc_congest_changes } __mode(byte);
enum rxrpc_conn_trace { rxrpc_conn_traces } __mode(byte);
enum rxrpc_local_trace { rxrpc_local_traces } __mode(byte);
enum rxrpc_peer_trace { rxrpc_peer_traces } __mode(byte);
enum rxrpc_propose_ack_outcome { rxrpc_propose_ack_outcomes } __mode(byte);
enum rxrpc_propose_ack_trace { rxrpc_propose_ack_traces } __mode(byte);
enum rxrpc_receive_trace { rxrpc_receive_traces } __mode(byte);
enum rxrpc_recvmsg_trace { rxrpc_recvmsg_traces } __mode(byte);
enum rxrpc_req_ack_trace { rxrpc_req_ack_traces } __mode(byte);
enum rxrpc_rtt_rx_trace { rxrpc_rtt_rx_traces } __mode(byte);
enum rxrpc_rtt_tx_trace { rxrpc_rtt_tx_traces } __mode(byte);
enum rxrpc_sack_trace { rxrpc_sack_traces } __mode(byte);
enum rxrpc_skb_trace { rxrpc_skb_traces } __mode(byte);
enum rxrpc_timer_trace { rxrpc_timer_traces } __mode(byte);
enum rxrpc_tx_point { rxrpc_tx_points } __mode(byte);
enum rxrpc_txbuf_trace { rxrpc_txbuf_traces } __mode(byte);
rxrpc: Don't use a ring buffer for call Tx queue Change the way the Tx queueing works to make the following ends easier to achieve: (1) The filling of packets, the encryption of packets and the transmission of packets can be handled in parallel by separate threads, rather than rxrpc_sendmsg() allocating, filling, encrypting and transmitting each packet before moving onto the next one. (2) Get rid of the fixed-size ring which sets a hard limit on the number of packets that can be retained in the ring. This allows the number of packets to increase without having to allocate a very large ring or having variable-sized rings. [Note: the downside of this is that it's then less efficient to locate a packet for retransmission as we then have to step through a list and examine each buffer in the list.] (3) Allow the filler/encrypter to run ahead of the transmission window. (4) Make it easier to do zero copy UDP from the packet buffers. (5) Make it easier to do zero copy from userspace to the packet buffers - and thence to UDP (only if for unauthenticated connections). To that end, the following changes are made: (1) Use the new rxrpc_txbuf struct instead of sk_buff for keeping packets to be transmitted in. This allows them to be placed on multiple queues simultaneously. An sk_buff isn't really necessary as it's never passed on to lower-level networking code. (2) Keep the transmissable packets in a linked list on the call struct rather than in a ring. As a consequence, the annotation buffer isn't used either; rather a flag is set on the packet to indicate ackedness. (3) Use the RXRPC_CALL_TX_LAST flag to indicate that the last packet to be transmitted has been queued. Add RXRPC_CALL_TX_ALL_ACKED to indicate that all packets up to and including the last got hard acked. (4) Wire headers are now stored in the txbuf rather than being concocted on the stack and they're stored immediately before the data, thereby allowing zerocopy of a single span. (5) Don't bother with instant-resend on transmission failure; rather, leave it for a timer or an ACK packet to trigger. Signed-off-by: David Howells <dhowells@redhat.com> cc: Marc Dionne <marc.dionne@auristor.com> cc: linux-afs@lists.infradead.org
2022-04-01 06:55:08 +08:00
enum rxrpc_txqueue_trace { rxrpc_txqueue_traces } __mode(byte);
#endif /* end __RXRPC_DECLARE_TRACE_ENUMS_ONCE_ONLY */
/*
* Export enum symbols via userspace.
*/
#undef EM
#undef E_
#ifndef RXRPC_TRACE_ONLY_DEFINE_ENUMS
#define EM(a, b) TRACE_DEFINE_ENUM(a);
#define E_(a, b) TRACE_DEFINE_ENUM(a);
rxrpc_abort_reasons;
rxrpc_bundle_traces;
rxrpc_call_poke_traces;
rxrpc_call_traces;
rxrpc_client_traces;
rxrpc_congest_changes;
rxrpc_congest_modes;
rxrpc_conn_traces;
rxrpc_local_traces;
rxrpc_propose_ack_traces;
rxrpc_receive_traces;
rxrpc_recvmsg_traces;
rxrpc_req_ack_traces;
rxrpc_rtt_rx_traces;
rxrpc_rtt_tx_traces;
rxrpc_sack_traces;
rxrpc_skb_traces;
rxrpc_timer_traces;
rxrpc_tx_points;
rxrpc_txbuf_traces;
rxrpc: Don't use a ring buffer for call Tx queue Change the way the Tx queueing works to make the following ends easier to achieve: (1) The filling of packets, the encryption of packets and the transmission of packets can be handled in parallel by separate threads, rather than rxrpc_sendmsg() allocating, filling, encrypting and transmitting each packet before moving onto the next one. (2) Get rid of the fixed-size ring which sets a hard limit on the number of packets that can be retained in the ring. This allows the number of packets to increase without having to allocate a very large ring or having variable-sized rings. [Note: the downside of this is that it's then less efficient to locate a packet for retransmission as we then have to step through a list and examine each buffer in the list.] (3) Allow the filler/encrypter to run ahead of the transmission window. (4) Make it easier to do zero copy UDP from the packet buffers. (5) Make it easier to do zero copy from userspace to the packet buffers - and thence to UDP (only if for unauthenticated connections). To that end, the following changes are made: (1) Use the new rxrpc_txbuf struct instead of sk_buff for keeping packets to be transmitted in. This allows them to be placed on multiple queues simultaneously. An sk_buff isn't really necessary as it's never passed on to lower-level networking code. (2) Keep the transmissable packets in a linked list on the call struct rather than in a ring. As a consequence, the annotation buffer isn't used either; rather a flag is set on the packet to indicate ackedness. (3) Use the RXRPC_CALL_TX_LAST flag to indicate that the last packet to be transmitted has been queued. Add RXRPC_CALL_TX_ALL_ACKED to indicate that all packets up to and including the last got hard acked. (4) Wire headers are now stored in the txbuf rather than being concocted on the stack and they're stored immediately before the data, thereby allowing zerocopy of a single span. (5) Don't bother with instant-resend on transmission failure; rather, leave it for a timer or an ACK packet to trigger. Signed-off-by: David Howells <dhowells@redhat.com> cc: Marc Dionne <marc.dionne@auristor.com> cc: linux-afs@lists.infradead.org
2022-04-01 06:55:08 +08:00
rxrpc_txqueue_traces;
/*
* Now redefine the EM() and E_() macros to map the enums to the strings that
* will be printed in the output.
*/
#undef EM
#undef E_
#define EM(a, b) { a, b },
#define E_(a, b) { a, b }
TRACE_EVENT(rxrpc_local,
TP_PROTO(unsigned int local_debug_id, enum rxrpc_local_trace op,
int ref, int usage),
TP_ARGS(local_debug_id, op, ref, usage),
TP_STRUCT__entry(
__field(unsigned int, local)
__field(int, op)
__field(int, ref)
__field(int, usage)
),
TP_fast_assign(
__entry->local = local_debug_id;
__entry->op = op;
__entry->ref = ref;
__entry->usage = usage;
),
TP_printk("L=%08x %s r=%d u=%d",
__entry->local,
__print_symbolic(__entry->op, rxrpc_local_traces),
__entry->ref,
__entry->usage)
);
TRACE_EVENT(rxrpc_peer,
TP_PROTO(unsigned int peer_debug_id, int ref, enum rxrpc_peer_trace why),
TP_ARGS(peer_debug_id, ref, why),
TP_STRUCT__entry(
__field(unsigned int, peer)
__field(int, ref)
__field(enum rxrpc_peer_trace, why)
),
TP_fast_assign(
__entry->peer = peer_debug_id;
__entry->ref = ref;
__entry->why = why;
),
TP_printk("P=%08x %s r=%d",
__entry->peer,
__print_symbolic(__entry->why, rxrpc_peer_traces),
__entry->ref)
);
TRACE_EVENT(rxrpc_bundle,
TP_PROTO(unsigned int bundle_debug_id, int ref, enum rxrpc_bundle_trace why),
TP_ARGS(bundle_debug_id, ref, why),
TP_STRUCT__entry(
__field(unsigned int, bundle)
__field(int, ref)
__field(int, why)
),
TP_fast_assign(
__entry->bundle = bundle_debug_id;
__entry->ref = ref;
__entry->why = why;
),
TP_printk("CB=%08x %s r=%d",
__entry->bundle,
__print_symbolic(__entry->why, rxrpc_bundle_traces),
__entry->ref)
);
TRACE_EVENT(rxrpc_conn,
TP_PROTO(unsigned int conn_debug_id, int ref, enum rxrpc_conn_trace why),
TP_ARGS(conn_debug_id, ref, why),
TP_STRUCT__entry(
__field(unsigned int, conn)
__field(int, ref)
__field(int, why)
),
TP_fast_assign(
__entry->conn = conn_debug_id;
__entry->ref = ref;
__entry->why = why;
),
TP_printk("C=%08x %s r=%d",
__entry->conn,
__print_symbolic(__entry->why, rxrpc_conn_traces),
__entry->ref)
);
TRACE_EVENT(rxrpc_client,
TP_PROTO(struct rxrpc_connection *conn, int channel,
enum rxrpc_client_trace op),
TP_ARGS(conn, channel, op),
TP_STRUCT__entry(
__field(unsigned int, conn)
__field(u32, cid)
__field(int, channel)
__field(int, usage)
__field(enum rxrpc_client_trace, op)
),
TP_fast_assign(
rxrpc: Rewrite the client connection manager Rewrite the rxrpc client connection manager so that it can support multiple connections for a given security key to a peer. The following changes are made: (1) For each open socket, the code currently maintains an rbtree with the connections placed into it, keyed by communications parameters. This is tricky to maintain as connections can be culled from the tree or replaced within it. Connections can require replacement for a number of reasons, e.g. their IDs span too great a range for the IDR data type to represent efficiently, the call ID numbers on that conn would overflow or the conn got aborted. This is changed so that there's now a connection bundle object placed in the tree, keyed on the same parameters. The bundle, however, does not need to be replaced. (2) An rxrpc_bundle object can now manage the available channels for a set of parallel connections. The lock that manages this is moved there from the rxrpc_connection struct (channel_lock). (3) There'a a dummy bundle for all incoming connections to share so that they have a channel_lock too. It might be better to give each incoming connection its own bundle. This bundle is not needed to manage which channels incoming calls are made on because that's the solely at whim of the client. (4) The restrictions on how many client connections are around are removed. Instead, a previous patch limits the number of client calls that can be allocated. Ordinarily, client connections are reaped after 2 minutes on the idle queue, but when more than a certain number of connections are in existence, the reaper starts reaping them after 2s of idleness instead to get the numbers back down. It could also be made such that new call allocations are forced to wait until the number of outstanding connections subsides. Signed-off-by: David Howells <dhowells@redhat.com>
2020-07-01 18:15:32 +08:00
__entry->conn = conn ? conn->debug_id : 0;
__entry->channel = channel;
__entry->usage = conn ? refcount_read(&conn->ref) : -2;
__entry->op = op;
__entry->cid = conn ? conn->proto.cid : 0;
),
rxrpc: Rewrite the client connection manager Rewrite the rxrpc client connection manager so that it can support multiple connections for a given security key to a peer. The following changes are made: (1) For each open socket, the code currently maintains an rbtree with the connections placed into it, keyed by communications parameters. This is tricky to maintain as connections can be culled from the tree or replaced within it. Connections can require replacement for a number of reasons, e.g. their IDs span too great a range for the IDR data type to represent efficiently, the call ID numbers on that conn would overflow or the conn got aborted. This is changed so that there's now a connection bundle object placed in the tree, keyed on the same parameters. The bundle, however, does not need to be replaced. (2) An rxrpc_bundle object can now manage the available channels for a set of parallel connections. The lock that manages this is moved there from the rxrpc_connection struct (channel_lock). (3) There'a a dummy bundle for all incoming connections to share so that they have a channel_lock too. It might be better to give each incoming connection its own bundle. This bundle is not needed to manage which channels incoming calls are made on because that's the solely at whim of the client. (4) The restrictions on how many client connections are around are removed. Instead, a previous patch limits the number of client calls that can be allocated. Ordinarily, client connections are reaped after 2 minutes on the idle queue, but when more than a certain number of connections are in existence, the reaper starts reaping them after 2s of idleness instead to get the numbers back down. It could also be made such that new call allocations are forced to wait until the number of outstanding connections subsides. Signed-off-by: David Howells <dhowells@redhat.com>
2020-07-01 18:15:32 +08:00
TP_printk("C=%08x h=%2d %s i=%08x u=%d",
__entry->conn,
__entry->channel,
__print_symbolic(__entry->op, rxrpc_client_traces),
__entry->cid,
__entry->usage)
);
TRACE_EVENT(rxrpc_call,
TP_PROTO(unsigned int call_debug_id, int ref, unsigned long aux,
enum rxrpc_call_trace why),
TP_ARGS(call_debug_id, ref, aux, why),
TP_STRUCT__entry(
__field(unsigned int, call)
__field(int, ref)
__field(int, why)
__field(unsigned long, aux)
),
TP_fast_assign(
__entry->call = call_debug_id;
__entry->ref = ref;
__entry->why = why;
__entry->aux = aux;
),
TP_printk("c=%08x %s r=%d a=%lx",
__entry->call,
__print_symbolic(__entry->why, rxrpc_call_traces),
__entry->ref,
__entry->aux)
);
TRACE_EVENT(rxrpc_skb,
TP_PROTO(struct sk_buff *skb, int usage, int mod_count,
enum rxrpc_skb_trace why),
TP_ARGS(skb, usage, mod_count, why),
TP_STRUCT__entry(
__field(struct sk_buff *, skb)
__field(int, usage)
__field(int, mod_count)
__field(enum rxrpc_skb_trace, why)
),
TP_fast_assign(
__entry->skb = skb;
__entry->usage = usage;
__entry->mod_count = mod_count;
__entry->why = why;
),
TP_printk("s=%p Rx %s u=%d m=%d",
__entry->skb,
__print_symbolic(__entry->why, rxrpc_skb_traces),
__entry->usage,
__entry->mod_count)
);
TRACE_EVENT(rxrpc_rx_packet,
TP_PROTO(struct rxrpc_skb_priv *sp),
TP_ARGS(sp),
TP_STRUCT__entry(
__field_struct(struct rxrpc_host_header, hdr)
),
TP_fast_assign(
memcpy(&__entry->hdr, &sp->hdr, sizeof(__entry->hdr));
),
TP_printk("%08x:%08x:%08x:%04x %08x %08x %02x %02x %s",
__entry->hdr.epoch, __entry->hdr.cid,
__entry->hdr.callNumber, __entry->hdr.serviceId,
__entry->hdr.serial, __entry->hdr.seq,
__entry->hdr.securityIndex, __entry->hdr.flags,
__print_symbolic(__entry->hdr.type, rxrpc_pkts))
);
TRACE_EVENT(rxrpc_rx_done,
TP_PROTO(int result, int abort_code),
TP_ARGS(result, abort_code),
TP_STRUCT__entry(
__field(int, result)
__field(int, abort_code)
),
TP_fast_assign(
__entry->result = result;
__entry->abort_code = abort_code;
),
TP_printk("r=%d a=%d", __entry->result, __entry->abort_code)
);
TRACE_EVENT(rxrpc_abort,
TP_PROTO(unsigned int call_nr, enum rxrpc_abort_reason why,
u32 cid, u32 call_id, rxrpc_seq_t seq, int abort_code, int error),
TP_ARGS(call_nr, why, cid, call_id, seq, abort_code, error),
TP_STRUCT__entry(
__field(unsigned int, call_nr)
__field(enum rxrpc_abort_reason, why)
__field(u32, cid)
__field(u32, call_id)
__field(rxrpc_seq_t, seq)
__field(int, abort_code)
__field(int, error)
),
TP_fast_assign(
__entry->call_nr = call_nr;
__entry->why = why;
__entry->cid = cid;
__entry->call_id = call_id;
__entry->abort_code = abort_code;
__entry->error = error;
__entry->seq = seq;
),
TP_printk("c=%08x %08x:%08x s=%u a=%d e=%d %s",
__entry->call_nr,
__entry->cid, __entry->call_id, __entry->seq,
__entry->abort_code, __entry->error,
__print_symbolic(__entry->why, rxrpc_abort_reasons))
);
TRACE_EVENT(rxrpc_call_complete,
TP_PROTO(struct rxrpc_call *call),
TP_ARGS(call),
TP_STRUCT__entry(
__field(unsigned int, call)
__field(enum rxrpc_call_completion, compl)
__field(int, error)
__field(u32, abort_code)
),
TP_fast_assign(
__entry->call = call->debug_id;
__entry->compl = call->completion;
__entry->error = call->error;
__entry->abort_code = call->abort_code;
),
TP_printk("c=%08x %s r=%d ac=%d",
__entry->call,
__print_symbolic(__entry->compl, rxrpc_completions),
__entry->error,
__entry->abort_code)
);
rxrpc: Don't use a ring buffer for call Tx queue Change the way the Tx queueing works to make the following ends easier to achieve: (1) The filling of packets, the encryption of packets and the transmission of packets can be handled in parallel by separate threads, rather than rxrpc_sendmsg() allocating, filling, encrypting and transmitting each packet before moving onto the next one. (2) Get rid of the fixed-size ring which sets a hard limit on the number of packets that can be retained in the ring. This allows the number of packets to increase without having to allocate a very large ring or having variable-sized rings. [Note: the downside of this is that it's then less efficient to locate a packet for retransmission as we then have to step through a list and examine each buffer in the list.] (3) Allow the filler/encrypter to run ahead of the transmission window. (4) Make it easier to do zero copy UDP from the packet buffers. (5) Make it easier to do zero copy from userspace to the packet buffers - and thence to UDP (only if for unauthenticated connections). To that end, the following changes are made: (1) Use the new rxrpc_txbuf struct instead of sk_buff for keeping packets to be transmitted in. This allows them to be placed on multiple queues simultaneously. An sk_buff isn't really necessary as it's never passed on to lower-level networking code. (2) Keep the transmissable packets in a linked list on the call struct rather than in a ring. As a consequence, the annotation buffer isn't used either; rather a flag is set on the packet to indicate ackedness. (3) Use the RXRPC_CALL_TX_LAST flag to indicate that the last packet to be transmitted has been queued. Add RXRPC_CALL_TX_ALL_ACKED to indicate that all packets up to and including the last got hard acked. (4) Wire headers are now stored in the txbuf rather than being concocted on the stack and they're stored immediately before the data, thereby allowing zerocopy of a single span. (5) Don't bother with instant-resend on transmission failure; rather, leave it for a timer or an ACK packet to trigger. Signed-off-by: David Howells <dhowells@redhat.com> cc: Marc Dionne <marc.dionne@auristor.com> cc: linux-afs@lists.infradead.org
2022-04-01 06:55:08 +08:00
TRACE_EVENT(rxrpc_txqueue,
TP_PROTO(struct rxrpc_call *call, enum rxrpc_txqueue_trace why),
TP_ARGS(call, why),
TP_STRUCT__entry(
__field(unsigned int, call)
__field(enum rxrpc_txqueue_trace, why)
__field(rxrpc_seq_t, acks_hard_ack)
__field(rxrpc_seq_t, tx_bottom)
__field(rxrpc_seq_t, tx_top)
__field(rxrpc_seq_t, tx_prepared)
__field(int, tx_winsize)
),
TP_fast_assign(
__entry->call = call->debug_id;
__entry->why = why;
rxrpc: Don't use a ring buffer for call Tx queue Change the way the Tx queueing works to make the following ends easier to achieve: (1) The filling of packets, the encryption of packets and the transmission of packets can be handled in parallel by separate threads, rather than rxrpc_sendmsg() allocating, filling, encrypting and transmitting each packet before moving onto the next one. (2) Get rid of the fixed-size ring which sets a hard limit on the number of packets that can be retained in the ring. This allows the number of packets to increase without having to allocate a very large ring or having variable-sized rings. [Note: the downside of this is that it's then less efficient to locate a packet for retransmission as we then have to step through a list and examine each buffer in the list.] (3) Allow the filler/encrypter to run ahead of the transmission window. (4) Make it easier to do zero copy UDP from the packet buffers. (5) Make it easier to do zero copy from userspace to the packet buffers - and thence to UDP (only if for unauthenticated connections). To that end, the following changes are made: (1) Use the new rxrpc_txbuf struct instead of sk_buff for keeping packets to be transmitted in. This allows them to be placed on multiple queues simultaneously. An sk_buff isn't really necessary as it's never passed on to lower-level networking code. (2) Keep the transmissable packets in a linked list on the call struct rather than in a ring. As a consequence, the annotation buffer isn't used either; rather a flag is set on the packet to indicate ackedness. (3) Use the RXRPC_CALL_TX_LAST flag to indicate that the last packet to be transmitted has been queued. Add RXRPC_CALL_TX_ALL_ACKED to indicate that all packets up to and including the last got hard acked. (4) Wire headers are now stored in the txbuf rather than being concocted on the stack and they're stored immediately before the data, thereby allowing zerocopy of a single span. (5) Don't bother with instant-resend on transmission failure; rather, leave it for a timer or an ACK packet to trigger. Signed-off-by: David Howells <dhowells@redhat.com> cc: Marc Dionne <marc.dionne@auristor.com> cc: linux-afs@lists.infradead.org
2022-04-01 06:55:08 +08:00
__entry->acks_hard_ack = call->acks_hard_ack;
__entry->tx_bottom = call->tx_bottom;
__entry->tx_top = call->tx_top;
__entry->tx_prepared = call->tx_prepared;
__entry->tx_winsize = call->tx_winsize;
),
TP_printk("c=%08x %s f=%08x h=%08x n=%u/%u/%u/%u",
__entry->call,
rxrpc: Don't use a ring buffer for call Tx queue Change the way the Tx queueing works to make the following ends easier to achieve: (1) The filling of packets, the encryption of packets and the transmission of packets can be handled in parallel by separate threads, rather than rxrpc_sendmsg() allocating, filling, encrypting and transmitting each packet before moving onto the next one. (2) Get rid of the fixed-size ring which sets a hard limit on the number of packets that can be retained in the ring. This allows the number of packets to increase without having to allocate a very large ring or having variable-sized rings. [Note: the downside of this is that it's then less efficient to locate a packet for retransmission as we then have to step through a list and examine each buffer in the list.] (3) Allow the filler/encrypter to run ahead of the transmission window. (4) Make it easier to do zero copy UDP from the packet buffers. (5) Make it easier to do zero copy from userspace to the packet buffers - and thence to UDP (only if for unauthenticated connections). To that end, the following changes are made: (1) Use the new rxrpc_txbuf struct instead of sk_buff for keeping packets to be transmitted in. This allows them to be placed on multiple queues simultaneously. An sk_buff isn't really necessary as it's never passed on to lower-level networking code. (2) Keep the transmissable packets in a linked list on the call struct rather than in a ring. As a consequence, the annotation buffer isn't used either; rather a flag is set on the packet to indicate ackedness. (3) Use the RXRPC_CALL_TX_LAST flag to indicate that the last packet to be transmitted has been queued. Add RXRPC_CALL_TX_ALL_ACKED to indicate that all packets up to and including the last got hard acked. (4) Wire headers are now stored in the txbuf rather than being concocted on the stack and they're stored immediately before the data, thereby allowing zerocopy of a single span. (5) Don't bother with instant-resend on transmission failure; rather, leave it for a timer or an ACK packet to trigger. Signed-off-by: David Howells <dhowells@redhat.com> cc: Marc Dionne <marc.dionne@auristor.com> cc: linux-afs@lists.infradead.org
2022-04-01 06:55:08 +08:00
__print_symbolic(__entry->why, rxrpc_txqueue_traces),
__entry->tx_bottom,
__entry->acks_hard_ack,
__entry->tx_top - __entry->tx_bottom,
__entry->tx_top - __entry->acks_hard_ack,
__entry->tx_prepared - __entry->tx_bottom,
__entry->tx_winsize)
);
TRACE_EVENT(rxrpc_rx_data,
TP_PROTO(unsigned int call, rxrpc_seq_t seq,
rxrpc_serial_t serial, u8 flags),
TP_ARGS(call, seq, serial, flags),
TP_STRUCT__entry(
__field(unsigned int, call)
__field(rxrpc_seq_t, seq)
__field(rxrpc_serial_t, serial)
__field(u8, flags)
),
TP_fast_assign(
__entry->call = call;
__entry->seq = seq;
__entry->serial = serial;
__entry->flags = flags;
),
TP_printk("c=%08x DATA %08x q=%08x fl=%02x",
__entry->call,
__entry->serial,
__entry->seq,
__entry->flags)
);
TRACE_EVENT(rxrpc_rx_ack,
TP_PROTO(struct rxrpc_call *call,
rxrpc_serial_t serial, rxrpc_serial_t ack_serial,
rxrpc_seq_t first, rxrpc_seq_t prev, u8 reason, u8 n_acks),
TP_ARGS(call, serial, ack_serial, first, prev, reason, n_acks),
TP_STRUCT__entry(
__field(unsigned int, call)
__field(rxrpc_serial_t, serial)
__field(rxrpc_serial_t, ack_serial)
__field(rxrpc_seq_t, first)
__field(rxrpc_seq_t, prev)
__field(u8, reason)
__field(u8, n_acks)
),
TP_fast_assign(
__entry->call = call->debug_id;
__entry->serial = serial;
__entry->ack_serial = ack_serial;
__entry->first = first;
__entry->prev = prev;
__entry->reason = reason;
__entry->n_acks = n_acks;
),
TP_printk("c=%08x %08x %s r=%08x f=%08x p=%08x n=%u",
__entry->call,
__entry->serial,
__print_symbolic(__entry->reason, rxrpc_ack_names),
__entry->ack_serial,
__entry->first,
__entry->prev,
__entry->n_acks)
);
TRACE_EVENT(rxrpc_rx_abort,
TP_PROTO(struct rxrpc_call *call, rxrpc_serial_t serial,
u32 abort_code),
TP_ARGS(call, serial, abort_code),
TP_STRUCT__entry(
__field(unsigned int, call)
__field(rxrpc_serial_t, serial)
__field(u32, abort_code)
),
TP_fast_assign(
__entry->call = call->debug_id;
__entry->serial = serial;
__entry->abort_code = abort_code;
),
TP_printk("c=%08x ABORT %08x ac=%d",
__entry->call,
__entry->serial,
__entry->abort_code)
);
TRACE_EVENT(rxrpc_rx_challenge,
TP_PROTO(struct rxrpc_connection *conn, rxrpc_serial_t serial,
u32 version, u32 nonce, u32 min_level),
TP_ARGS(conn, serial, version, nonce, min_level),
TP_STRUCT__entry(
__field(unsigned int, conn)
__field(rxrpc_serial_t, serial)
__field(u32, version)
__field(u32, nonce)
__field(u32, min_level)
),
TP_fast_assign(
__entry->conn = conn->debug_id;
__entry->serial = serial;
__entry->version = version;
__entry->nonce = nonce;
__entry->min_level = min_level;
),
TP_printk("C=%08x CHALLENGE %08x v=%x n=%x ml=%x",
__entry->conn,
__entry->serial,
__entry->version,
__entry->nonce,
__entry->min_level)
);
TRACE_EVENT(rxrpc_rx_response,
TP_PROTO(struct rxrpc_connection *conn, rxrpc_serial_t serial,
u32 version, u32 kvno, u32 ticket_len),
TP_ARGS(conn, serial, version, kvno, ticket_len),
TP_STRUCT__entry(
__field(unsigned int, conn)
__field(rxrpc_serial_t, serial)
__field(u32, version)
__field(u32, kvno)
__field(u32, ticket_len)
),
TP_fast_assign(
__entry->conn = conn->debug_id;
__entry->serial = serial;
__entry->version = version;
__entry->kvno = kvno;
__entry->ticket_len = ticket_len;
),
TP_printk("C=%08x RESPONSE %08x v=%x kvno=%x tl=%x",
__entry->conn,
__entry->serial,
__entry->version,
__entry->kvno,
__entry->ticket_len)
);
TRACE_EVENT(rxrpc_rx_rwind_change,
TP_PROTO(struct rxrpc_call *call, rxrpc_serial_t serial,
u32 rwind, bool wake),
TP_ARGS(call, serial, rwind, wake),
TP_STRUCT__entry(
__field(unsigned int, call)
__field(rxrpc_serial_t, serial)
__field(u32, rwind)
__field(bool, wake)
),
TP_fast_assign(
__entry->call = call->debug_id;
__entry->serial = serial;
__entry->rwind = rwind;
__entry->wake = wake;
),
TP_printk("c=%08x %08x rw=%u%s",
__entry->call,
__entry->serial,
__entry->rwind,
__entry->wake ? " wake" : "")
);
TRACE_EVENT(rxrpc_tx_packet,
TP_PROTO(unsigned int call_id, struct rxrpc_wire_header *whdr,
enum rxrpc_tx_point where),
TP_ARGS(call_id, whdr, where),
TP_STRUCT__entry(
__field(unsigned int, call)
__field(enum rxrpc_tx_point, where)
__field_struct(struct rxrpc_wire_header, whdr)
),
TP_fast_assign(
__entry->call = call_id;
memcpy(&__entry->whdr, whdr, sizeof(__entry->whdr));
__entry->where = where;
),
TP_printk("c=%08x %08x:%08x:%08x:%04x %08x %08x %02x %02x %s %s",
__entry->call,
ntohl(__entry->whdr.epoch),
ntohl(__entry->whdr.cid),
ntohl(__entry->whdr.callNumber),
ntohs(__entry->whdr.serviceId),
ntohl(__entry->whdr.serial),
ntohl(__entry->whdr.seq),
__entry->whdr.type, __entry->whdr.flags,
__entry->whdr.type <= 15 ?
__print_symbolic(__entry->whdr.type, rxrpc_pkts) : "?UNK",
__print_symbolic(__entry->where, rxrpc_tx_points))
);
TRACE_EVENT(rxrpc_tx_data,
TP_PROTO(struct rxrpc_call *call, rxrpc_seq_t seq,
rxrpc_serial_t serial, u8 flags, bool retrans, bool lose),
TP_ARGS(call, seq, serial, flags, retrans, lose),
TP_STRUCT__entry(
__field(unsigned int, call)
__field(rxrpc_seq_t, seq)
__field(rxrpc_serial_t, serial)
__field(u32, cid)
__field(u32, call_id)
__field(u8, flags)
__field(bool, retrans)
__field(bool, lose)
),
TP_fast_assign(
__entry->call = call->debug_id;
__entry->cid = call->cid;
__entry->call_id = call->call_id;
__entry->seq = seq;
__entry->serial = serial;
__entry->flags = flags;
__entry->retrans = retrans;
__entry->lose = lose;
),
TP_printk("c=%08x DATA %08x:%08x %08x q=%08x fl=%02x%s%s",
__entry->call,
__entry->cid,
__entry->call_id,
__entry->serial,
__entry->seq,
__entry->flags,
__entry->retrans ? " *RETRANS*" : "",
__entry->lose ? " *LOSE*" : "")
);
TRACE_EVENT(rxrpc_tx_ack,
TP_PROTO(unsigned int call, rxrpc_serial_t serial,
rxrpc_seq_t ack_first, rxrpc_serial_t ack_serial,
u8 reason, u8 n_acks, u16 rwind),
TP_ARGS(call, serial, ack_first, ack_serial, reason, n_acks, rwind),
TP_STRUCT__entry(
__field(unsigned int, call)
__field(rxrpc_serial_t, serial)
__field(rxrpc_seq_t, ack_first)
__field(rxrpc_serial_t, ack_serial)
__field(u8, reason)
__field(u8, n_acks)
__field(u16, rwind)
),
TP_fast_assign(
__entry->call = call;
__entry->serial = serial;
__entry->ack_first = ack_first;
__entry->ack_serial = ack_serial;
__entry->reason = reason;
__entry->n_acks = n_acks;
__entry->rwind = rwind;
),
TP_printk(" c=%08x ACK %08x %s f=%08x r=%08x n=%u rw=%u",
__entry->call,
__entry->serial,
__print_symbolic(__entry->reason, rxrpc_ack_names),
__entry->ack_first,
__entry->ack_serial,
__entry->n_acks,
__entry->rwind)
);
TRACE_EVENT(rxrpc_receive,
TP_PROTO(struct rxrpc_call *call, enum rxrpc_receive_trace why,
rxrpc_serial_t serial, rxrpc_seq_t seq),
TP_ARGS(call, why, serial, seq),
TP_STRUCT__entry(
__field(unsigned int, call)
__field(enum rxrpc_receive_trace, why)
__field(rxrpc_serial_t, serial)
__field(rxrpc_seq_t, seq)
__field(rxrpc_seq_t, window)
__field(rxrpc_seq_t, wtop)
),
TP_fast_assign(
__entry->call = call->debug_id;
__entry->why = why;
__entry->serial = serial;
__entry->seq = seq;
__entry->window = call->ackr_window;
__entry->wtop = call->ackr_wtop;
),
TP_printk("c=%08x %s r=%08x q=%08x w=%08x-%08x",
__entry->call,
__print_symbolic(__entry->why, rxrpc_receive_traces),
__entry->serial,
__entry->seq,
__entry->window,
__entry->wtop)
);
TRACE_EVENT(rxrpc_recvmsg,
TP_PROTO(unsigned int call_debug_id, enum rxrpc_recvmsg_trace why,
int ret),
TP_ARGS(call_debug_id, why, ret),
TP_STRUCT__entry(
__field(unsigned int, call)
__field(enum rxrpc_recvmsg_trace, why)
__field(int, ret)
),
TP_fast_assign(
__entry->call = call_debug_id;
__entry->why = why;
__entry->ret = ret;
),
TP_printk("c=%08x %s ret=%d",
__entry->call,
__print_symbolic(__entry->why, rxrpc_recvmsg_traces),
__entry->ret)
);
TRACE_EVENT(rxrpc_recvdata,
TP_PROTO(struct rxrpc_call *call, enum rxrpc_recvmsg_trace why,
rxrpc_seq_t seq, unsigned int offset, unsigned int len,
int ret),
TP_ARGS(call, why, seq, offset, len, ret),
TP_STRUCT__entry(
__field(unsigned int, call)
__field(enum rxrpc_recvmsg_trace, why)
__field(rxrpc_seq_t, seq)
__field(unsigned int, offset)
__field(unsigned int, len)
__field(int, ret)
),
TP_fast_assign(
__entry->call = call ? call->debug_id : 0;
__entry->why = why;
__entry->seq = seq;
__entry->offset = offset;
__entry->len = len;
__entry->ret = ret;
),
TP_printk("c=%08x %s q=%08x o=%u l=%u ret=%d",
__entry->call,
__print_symbolic(__entry->why, rxrpc_recvmsg_traces),
__entry->seq,
__entry->offset,
__entry->len,
__entry->ret)
);
TRACE_EVENT(rxrpc_rtt_tx,
TP_PROTO(struct rxrpc_call *call, enum rxrpc_rtt_tx_trace why,
rxrpc: Fix loss of RTT samples due to interposed ACK The Rx protocol has a mechanism to help generate RTT samples that works by a client transmitting a REQUESTED-type ACK when it receives a DATA packet that has the REQUEST_ACK flag set. The peer, however, may interpose other ACKs before transmitting the REQUESTED-ACK, as can be seen in the following trace excerpt: rxrpc_tx_data: c=00000044 DATA d0b5ece8:00000001 00000001 q=00000001 fl=07 rxrpc_rx_ack: c=00000044 00000001 PNG r=00000000 f=00000002 p=00000000 n=0 rxrpc_rx_ack: c=00000044 00000002 REQ r=00000001 f=00000002 p=00000001 n=0 ... DATA packet 1 (q=xx) has REQUEST_ACK set (bit 1 of fl=xx). The incoming ping (labelled PNG) hard-acks the request DATA packet (f=xx exceeds the sequence number of the DATA packet), causing it to be discarded from the Tx ring. The ACK that was requested (labelled REQ, r=xx references the serial of the DATA packet) comes after the ping, but the sk_buff holding the timestamp has gone and the RTT sample is lost. This is particularly noticeable on RPC calls used to probe the service offered by the peer. A lot of peers end up with an unknown RTT because we only ever sent a single RPC. This confuses the server rotation algorithm. Fix this by caching the information about the outgoing packet in RTT calculations in the rxrpc_call struct rather than looking in the Tx ring. A four-deep buffer is maintained and both REQUEST_ACK-flagged DATA and PING-ACK transmissions are recorded in there. When the appropriate response ACK is received, the buffer is checked for a match and, if found, an RTT sample is recorded. If a received ACK refers to a packet with a later serial number than an entry in the cache, that entry is presumed lost and the entry is made available to record a new transmission. ACKs types other than REQUESTED-type and PING-type cause any matching sample to be cancelled as they don't necessarily represent a useful measurement. If there's no space in the buffer on ping/data transmission, the sample base is discarded. Fixes: 50235c4b5a2f ("rxrpc: Obtain RTT data by requesting ACKs on DATA packets") Signed-off-by: David Howells <dhowells@redhat.com>
2020-08-20 06:29:16 +08:00
int slot, rxrpc_serial_t send_serial),
rxrpc: Fix loss of RTT samples due to interposed ACK The Rx protocol has a mechanism to help generate RTT samples that works by a client transmitting a REQUESTED-type ACK when it receives a DATA packet that has the REQUEST_ACK flag set. The peer, however, may interpose other ACKs before transmitting the REQUESTED-ACK, as can be seen in the following trace excerpt: rxrpc_tx_data: c=00000044 DATA d0b5ece8:00000001 00000001 q=00000001 fl=07 rxrpc_rx_ack: c=00000044 00000001 PNG r=00000000 f=00000002 p=00000000 n=0 rxrpc_rx_ack: c=00000044 00000002 REQ r=00000001 f=00000002 p=00000001 n=0 ... DATA packet 1 (q=xx) has REQUEST_ACK set (bit 1 of fl=xx). The incoming ping (labelled PNG) hard-acks the request DATA packet (f=xx exceeds the sequence number of the DATA packet), causing it to be discarded from the Tx ring. The ACK that was requested (labelled REQ, r=xx references the serial of the DATA packet) comes after the ping, but the sk_buff holding the timestamp has gone and the RTT sample is lost. This is particularly noticeable on RPC calls used to probe the service offered by the peer. A lot of peers end up with an unknown RTT because we only ever sent a single RPC. This confuses the server rotation algorithm. Fix this by caching the information about the outgoing packet in RTT calculations in the rxrpc_call struct rather than looking in the Tx ring. A four-deep buffer is maintained and both REQUEST_ACK-flagged DATA and PING-ACK transmissions are recorded in there. When the appropriate response ACK is received, the buffer is checked for a match and, if found, an RTT sample is recorded. If a received ACK refers to a packet with a later serial number than an entry in the cache, that entry is presumed lost and the entry is made available to record a new transmission. ACKs types other than REQUESTED-type and PING-type cause any matching sample to be cancelled as they don't necessarily represent a useful measurement. If there's no space in the buffer on ping/data transmission, the sample base is discarded. Fixes: 50235c4b5a2f ("rxrpc: Obtain RTT data by requesting ACKs on DATA packets") Signed-off-by: David Howells <dhowells@redhat.com>
2020-08-20 06:29:16 +08:00
TP_ARGS(call, why, slot, send_serial),
TP_STRUCT__entry(
__field(unsigned int, call)
__field(enum rxrpc_rtt_tx_trace, why)
__field(int, slot)
__field(rxrpc_serial_t, send_serial)
),
TP_fast_assign(
__entry->call = call->debug_id;
__entry->why = why;
rxrpc: Fix loss of RTT samples due to interposed ACK The Rx protocol has a mechanism to help generate RTT samples that works by a client transmitting a REQUESTED-type ACK when it receives a DATA packet that has the REQUEST_ACK flag set. The peer, however, may interpose other ACKs before transmitting the REQUESTED-ACK, as can be seen in the following trace excerpt: rxrpc_tx_data: c=00000044 DATA d0b5ece8:00000001 00000001 q=00000001 fl=07 rxrpc_rx_ack: c=00000044 00000001 PNG r=00000000 f=00000002 p=00000000 n=0 rxrpc_rx_ack: c=00000044 00000002 REQ r=00000001 f=00000002 p=00000001 n=0 ... DATA packet 1 (q=xx) has REQUEST_ACK set (bit 1 of fl=xx). The incoming ping (labelled PNG) hard-acks the request DATA packet (f=xx exceeds the sequence number of the DATA packet), causing it to be discarded from the Tx ring. The ACK that was requested (labelled REQ, r=xx references the serial of the DATA packet) comes after the ping, but the sk_buff holding the timestamp has gone and the RTT sample is lost. This is particularly noticeable on RPC calls used to probe the service offered by the peer. A lot of peers end up with an unknown RTT because we only ever sent a single RPC. This confuses the server rotation algorithm. Fix this by caching the information about the outgoing packet in RTT calculations in the rxrpc_call struct rather than looking in the Tx ring. A four-deep buffer is maintained and both REQUEST_ACK-flagged DATA and PING-ACK transmissions are recorded in there. When the appropriate response ACK is received, the buffer is checked for a match and, if found, an RTT sample is recorded. If a received ACK refers to a packet with a later serial number than an entry in the cache, that entry is presumed lost and the entry is made available to record a new transmission. ACKs types other than REQUESTED-type and PING-type cause any matching sample to be cancelled as they don't necessarily represent a useful measurement. If there's no space in the buffer on ping/data transmission, the sample base is discarded. Fixes: 50235c4b5a2f ("rxrpc: Obtain RTT data by requesting ACKs on DATA packets") Signed-off-by: David Howells <dhowells@redhat.com>
2020-08-20 06:29:16 +08:00
__entry->slot = slot;
__entry->send_serial = send_serial;
),
rxrpc: Fix loss of RTT samples due to interposed ACK The Rx protocol has a mechanism to help generate RTT samples that works by a client transmitting a REQUESTED-type ACK when it receives a DATA packet that has the REQUEST_ACK flag set. The peer, however, may interpose other ACKs before transmitting the REQUESTED-ACK, as can be seen in the following trace excerpt: rxrpc_tx_data: c=00000044 DATA d0b5ece8:00000001 00000001 q=00000001 fl=07 rxrpc_rx_ack: c=00000044 00000001 PNG r=00000000 f=00000002 p=00000000 n=0 rxrpc_rx_ack: c=00000044 00000002 REQ r=00000001 f=00000002 p=00000001 n=0 ... DATA packet 1 (q=xx) has REQUEST_ACK set (bit 1 of fl=xx). The incoming ping (labelled PNG) hard-acks the request DATA packet (f=xx exceeds the sequence number of the DATA packet), causing it to be discarded from the Tx ring. The ACK that was requested (labelled REQ, r=xx references the serial of the DATA packet) comes after the ping, but the sk_buff holding the timestamp has gone and the RTT sample is lost. This is particularly noticeable on RPC calls used to probe the service offered by the peer. A lot of peers end up with an unknown RTT because we only ever sent a single RPC. This confuses the server rotation algorithm. Fix this by caching the information about the outgoing packet in RTT calculations in the rxrpc_call struct rather than looking in the Tx ring. A four-deep buffer is maintained and both REQUEST_ACK-flagged DATA and PING-ACK transmissions are recorded in there. When the appropriate response ACK is received, the buffer is checked for a match and, if found, an RTT sample is recorded. If a received ACK refers to a packet with a later serial number than an entry in the cache, that entry is presumed lost and the entry is made available to record a new transmission. ACKs types other than REQUESTED-type and PING-type cause any matching sample to be cancelled as they don't necessarily represent a useful measurement. If there's no space in the buffer on ping/data transmission, the sample base is discarded. Fixes: 50235c4b5a2f ("rxrpc: Obtain RTT data by requesting ACKs on DATA packets") Signed-off-by: David Howells <dhowells@redhat.com>
2020-08-20 06:29:16 +08:00
TP_printk("c=%08x [%d] %s sr=%08x",
__entry->call,
rxrpc: Fix loss of RTT samples due to interposed ACK The Rx protocol has a mechanism to help generate RTT samples that works by a client transmitting a REQUESTED-type ACK when it receives a DATA packet that has the REQUEST_ACK flag set. The peer, however, may interpose other ACKs before transmitting the REQUESTED-ACK, as can be seen in the following trace excerpt: rxrpc_tx_data: c=00000044 DATA d0b5ece8:00000001 00000001 q=00000001 fl=07 rxrpc_rx_ack: c=00000044 00000001 PNG r=00000000 f=00000002 p=00000000 n=0 rxrpc_rx_ack: c=00000044 00000002 REQ r=00000001 f=00000002 p=00000001 n=0 ... DATA packet 1 (q=xx) has REQUEST_ACK set (bit 1 of fl=xx). The incoming ping (labelled PNG) hard-acks the request DATA packet (f=xx exceeds the sequence number of the DATA packet), causing it to be discarded from the Tx ring. The ACK that was requested (labelled REQ, r=xx references the serial of the DATA packet) comes after the ping, but the sk_buff holding the timestamp has gone and the RTT sample is lost. This is particularly noticeable on RPC calls used to probe the service offered by the peer. A lot of peers end up with an unknown RTT because we only ever sent a single RPC. This confuses the server rotation algorithm. Fix this by caching the information about the outgoing packet in RTT calculations in the rxrpc_call struct rather than looking in the Tx ring. A four-deep buffer is maintained and both REQUEST_ACK-flagged DATA and PING-ACK transmissions are recorded in there. When the appropriate response ACK is received, the buffer is checked for a match and, if found, an RTT sample is recorded. If a received ACK refers to a packet with a later serial number than an entry in the cache, that entry is presumed lost and the entry is made available to record a new transmission. ACKs types other than REQUESTED-type and PING-type cause any matching sample to be cancelled as they don't necessarily represent a useful measurement. If there's no space in the buffer on ping/data transmission, the sample base is discarded. Fixes: 50235c4b5a2f ("rxrpc: Obtain RTT data by requesting ACKs on DATA packets") Signed-off-by: David Howells <dhowells@redhat.com>
2020-08-20 06:29:16 +08:00
__entry->slot,
__print_symbolic(__entry->why, rxrpc_rtt_tx_traces),
__entry->send_serial)
);
TRACE_EVENT(rxrpc_rtt_rx,
TP_PROTO(struct rxrpc_call *call, enum rxrpc_rtt_rx_trace why,
rxrpc: Fix loss of RTT samples due to interposed ACK The Rx protocol has a mechanism to help generate RTT samples that works by a client transmitting a REQUESTED-type ACK when it receives a DATA packet that has the REQUEST_ACK flag set. The peer, however, may interpose other ACKs before transmitting the REQUESTED-ACK, as can be seen in the following trace excerpt: rxrpc_tx_data: c=00000044 DATA d0b5ece8:00000001 00000001 q=00000001 fl=07 rxrpc_rx_ack: c=00000044 00000001 PNG r=00000000 f=00000002 p=00000000 n=0 rxrpc_rx_ack: c=00000044 00000002 REQ r=00000001 f=00000002 p=00000001 n=0 ... DATA packet 1 (q=xx) has REQUEST_ACK set (bit 1 of fl=xx). The incoming ping (labelled PNG) hard-acks the request DATA packet (f=xx exceeds the sequence number of the DATA packet), causing it to be discarded from the Tx ring. The ACK that was requested (labelled REQ, r=xx references the serial of the DATA packet) comes after the ping, but the sk_buff holding the timestamp has gone and the RTT sample is lost. This is particularly noticeable on RPC calls used to probe the service offered by the peer. A lot of peers end up with an unknown RTT because we only ever sent a single RPC. This confuses the server rotation algorithm. Fix this by caching the information about the outgoing packet in RTT calculations in the rxrpc_call struct rather than looking in the Tx ring. A four-deep buffer is maintained and both REQUEST_ACK-flagged DATA and PING-ACK transmissions are recorded in there. When the appropriate response ACK is received, the buffer is checked for a match and, if found, an RTT sample is recorded. If a received ACK refers to a packet with a later serial number than an entry in the cache, that entry is presumed lost and the entry is made available to record a new transmission. ACKs types other than REQUESTED-type and PING-type cause any matching sample to be cancelled as they don't necessarily represent a useful measurement. If there's no space in the buffer on ping/data transmission, the sample base is discarded. Fixes: 50235c4b5a2f ("rxrpc: Obtain RTT data by requesting ACKs on DATA packets") Signed-off-by: David Howells <dhowells@redhat.com>
2020-08-20 06:29:16 +08:00
int slot,
rxrpc_serial_t send_serial, rxrpc_serial_t resp_serial,
rxrpc: Fix the excessive initial retransmission timeout rxrpc currently uses a fixed 4s retransmission timeout until the RTT is sufficiently sampled. This can cause problems with some fileservers with calls to the cache manager in the afs filesystem being dropped from the fileserver because a packet goes missing and the retransmission timeout is greater than the call expiry timeout. Fix this by: (1) Copying the RTT/RTO calculation code from Linux's TCP implementation and altering it to fit rxrpc. (2) Altering the various users of the RTT to make use of the new SRTT value. (3) Replacing the use of rxrpc_resend_timeout to use the calculated RTO value instead (which is needed in jiffies), along with a backoff. Notes: (1) rxrpc provides RTT samples by matching the serial numbers on outgoing DATA packets that have the RXRPC_REQUEST_ACK set and PING ACK packets against the reference serial number in incoming REQUESTED ACK and PING-RESPONSE ACK packets. (2) Each packet that is transmitted on an rxrpc connection gets a new per-connection serial number, even for retransmissions, so an ACK can be cross-referenced to a specific trigger packet. This allows RTT information to be drawn from retransmitted DATA packets also. (3) rxrpc maintains the RTT/RTO state on the rxrpc_peer record rather than on an rxrpc_call because many RPC calls won't live long enough to generate more than one sample. (4) The calculated SRTT value is in units of 8ths of a microsecond rather than nanoseconds. The (S)RTT and RTO values are displayed in /proc/net/rxrpc/peers. Fixes: 17926a79320a ([AF_RXRPC]: Provide secure RxRPC sockets for use by userspace and kernel both"") Signed-off-by: David Howells <dhowells@redhat.com>
2020-05-11 21:54:34 +08:00
u32 rtt, u32 rto),
rxrpc: Fix loss of RTT samples due to interposed ACK The Rx protocol has a mechanism to help generate RTT samples that works by a client transmitting a REQUESTED-type ACK when it receives a DATA packet that has the REQUEST_ACK flag set. The peer, however, may interpose other ACKs before transmitting the REQUESTED-ACK, as can be seen in the following trace excerpt: rxrpc_tx_data: c=00000044 DATA d0b5ece8:00000001 00000001 q=00000001 fl=07 rxrpc_rx_ack: c=00000044 00000001 PNG r=00000000 f=00000002 p=00000000 n=0 rxrpc_rx_ack: c=00000044 00000002 REQ r=00000001 f=00000002 p=00000001 n=0 ... DATA packet 1 (q=xx) has REQUEST_ACK set (bit 1 of fl=xx). The incoming ping (labelled PNG) hard-acks the request DATA packet (f=xx exceeds the sequence number of the DATA packet), causing it to be discarded from the Tx ring. The ACK that was requested (labelled REQ, r=xx references the serial of the DATA packet) comes after the ping, but the sk_buff holding the timestamp has gone and the RTT sample is lost. This is particularly noticeable on RPC calls used to probe the service offered by the peer. A lot of peers end up with an unknown RTT because we only ever sent a single RPC. This confuses the server rotation algorithm. Fix this by caching the information about the outgoing packet in RTT calculations in the rxrpc_call struct rather than looking in the Tx ring. A four-deep buffer is maintained and both REQUEST_ACK-flagged DATA and PING-ACK transmissions are recorded in there. When the appropriate response ACK is received, the buffer is checked for a match and, if found, an RTT sample is recorded. If a received ACK refers to a packet with a later serial number than an entry in the cache, that entry is presumed lost and the entry is made available to record a new transmission. ACKs types other than REQUESTED-type and PING-type cause any matching sample to be cancelled as they don't necessarily represent a useful measurement. If there's no space in the buffer on ping/data transmission, the sample base is discarded. Fixes: 50235c4b5a2f ("rxrpc: Obtain RTT data by requesting ACKs on DATA packets") Signed-off-by: David Howells <dhowells@redhat.com>
2020-08-20 06:29:16 +08:00
TP_ARGS(call, why, slot, send_serial, resp_serial, rtt, rto),
TP_STRUCT__entry(
__field(unsigned int, call)
__field(enum rxrpc_rtt_rx_trace, why)
__field(int, slot)
__field(rxrpc_serial_t, send_serial)
__field(rxrpc_serial_t, resp_serial)
__field(u32, rtt)
__field(u32, rto)
),
TP_fast_assign(
__entry->call = call->debug_id;
__entry->why = why;
rxrpc: Fix loss of RTT samples due to interposed ACK The Rx protocol has a mechanism to help generate RTT samples that works by a client transmitting a REQUESTED-type ACK when it receives a DATA packet that has the REQUEST_ACK flag set. The peer, however, may interpose other ACKs before transmitting the REQUESTED-ACK, as can be seen in the following trace excerpt: rxrpc_tx_data: c=00000044 DATA d0b5ece8:00000001 00000001 q=00000001 fl=07 rxrpc_rx_ack: c=00000044 00000001 PNG r=00000000 f=00000002 p=00000000 n=0 rxrpc_rx_ack: c=00000044 00000002 REQ r=00000001 f=00000002 p=00000001 n=0 ... DATA packet 1 (q=xx) has REQUEST_ACK set (bit 1 of fl=xx). The incoming ping (labelled PNG) hard-acks the request DATA packet (f=xx exceeds the sequence number of the DATA packet), causing it to be discarded from the Tx ring. The ACK that was requested (labelled REQ, r=xx references the serial of the DATA packet) comes after the ping, but the sk_buff holding the timestamp has gone and the RTT sample is lost. This is particularly noticeable on RPC calls used to probe the service offered by the peer. A lot of peers end up with an unknown RTT because we only ever sent a single RPC. This confuses the server rotation algorithm. Fix this by caching the information about the outgoing packet in RTT calculations in the rxrpc_call struct rather than looking in the Tx ring. A four-deep buffer is maintained and both REQUEST_ACK-flagged DATA and PING-ACK transmissions are recorded in there. When the appropriate response ACK is received, the buffer is checked for a match and, if found, an RTT sample is recorded. If a received ACK refers to a packet with a later serial number than an entry in the cache, that entry is presumed lost and the entry is made available to record a new transmission. ACKs types other than REQUESTED-type and PING-type cause any matching sample to be cancelled as they don't necessarily represent a useful measurement. If there's no space in the buffer on ping/data transmission, the sample base is discarded. Fixes: 50235c4b5a2f ("rxrpc: Obtain RTT data by requesting ACKs on DATA packets") Signed-off-by: David Howells <dhowells@redhat.com>
2020-08-20 06:29:16 +08:00
__entry->slot = slot;
__entry->send_serial = send_serial;
__entry->resp_serial = resp_serial;
__entry->rtt = rtt;
rxrpc: Fix the excessive initial retransmission timeout rxrpc currently uses a fixed 4s retransmission timeout until the RTT is sufficiently sampled. This can cause problems with some fileservers with calls to the cache manager in the afs filesystem being dropped from the fileserver because a packet goes missing and the retransmission timeout is greater than the call expiry timeout. Fix this by: (1) Copying the RTT/RTO calculation code from Linux's TCP implementation and altering it to fit rxrpc. (2) Altering the various users of the RTT to make use of the new SRTT value. (3) Replacing the use of rxrpc_resend_timeout to use the calculated RTO value instead (which is needed in jiffies), along with a backoff. Notes: (1) rxrpc provides RTT samples by matching the serial numbers on outgoing DATA packets that have the RXRPC_REQUEST_ACK set and PING ACK packets against the reference serial number in incoming REQUESTED ACK and PING-RESPONSE ACK packets. (2) Each packet that is transmitted on an rxrpc connection gets a new per-connection serial number, even for retransmissions, so an ACK can be cross-referenced to a specific trigger packet. This allows RTT information to be drawn from retransmitted DATA packets also. (3) rxrpc maintains the RTT/RTO state on the rxrpc_peer record rather than on an rxrpc_call because many RPC calls won't live long enough to generate more than one sample. (4) The calculated SRTT value is in units of 8ths of a microsecond rather than nanoseconds. The (S)RTT and RTO values are displayed in /proc/net/rxrpc/peers. Fixes: 17926a79320a ([AF_RXRPC]: Provide secure RxRPC sockets for use by userspace and kernel both"") Signed-off-by: David Howells <dhowells@redhat.com>
2020-05-11 21:54:34 +08:00
__entry->rto = rto;
),
rxrpc: Fix loss of RTT samples due to interposed ACK The Rx protocol has a mechanism to help generate RTT samples that works by a client transmitting a REQUESTED-type ACK when it receives a DATA packet that has the REQUEST_ACK flag set. The peer, however, may interpose other ACKs before transmitting the REQUESTED-ACK, as can be seen in the following trace excerpt: rxrpc_tx_data: c=00000044 DATA d0b5ece8:00000001 00000001 q=00000001 fl=07 rxrpc_rx_ack: c=00000044 00000001 PNG r=00000000 f=00000002 p=00000000 n=0 rxrpc_rx_ack: c=00000044 00000002 REQ r=00000001 f=00000002 p=00000001 n=0 ... DATA packet 1 (q=xx) has REQUEST_ACK set (bit 1 of fl=xx). The incoming ping (labelled PNG) hard-acks the request DATA packet (f=xx exceeds the sequence number of the DATA packet), causing it to be discarded from the Tx ring. The ACK that was requested (labelled REQ, r=xx references the serial of the DATA packet) comes after the ping, but the sk_buff holding the timestamp has gone and the RTT sample is lost. This is particularly noticeable on RPC calls used to probe the service offered by the peer. A lot of peers end up with an unknown RTT because we only ever sent a single RPC. This confuses the server rotation algorithm. Fix this by caching the information about the outgoing packet in RTT calculations in the rxrpc_call struct rather than looking in the Tx ring. A four-deep buffer is maintained and both REQUEST_ACK-flagged DATA and PING-ACK transmissions are recorded in there. When the appropriate response ACK is received, the buffer is checked for a match and, if found, an RTT sample is recorded. If a received ACK refers to a packet with a later serial number than an entry in the cache, that entry is presumed lost and the entry is made available to record a new transmission. ACKs types other than REQUESTED-type and PING-type cause any matching sample to be cancelled as they don't necessarily represent a useful measurement. If there's no space in the buffer on ping/data transmission, the sample base is discarded. Fixes: 50235c4b5a2f ("rxrpc: Obtain RTT data by requesting ACKs on DATA packets") Signed-off-by: David Howells <dhowells@redhat.com>
2020-08-20 06:29:16 +08:00
TP_printk("c=%08x [%d] %s sr=%08x rr=%08x rtt=%u rto=%u",
__entry->call,
rxrpc: Fix loss of RTT samples due to interposed ACK The Rx protocol has a mechanism to help generate RTT samples that works by a client transmitting a REQUESTED-type ACK when it receives a DATA packet that has the REQUEST_ACK flag set. The peer, however, may interpose other ACKs before transmitting the REQUESTED-ACK, as can be seen in the following trace excerpt: rxrpc_tx_data: c=00000044 DATA d0b5ece8:00000001 00000001 q=00000001 fl=07 rxrpc_rx_ack: c=00000044 00000001 PNG r=00000000 f=00000002 p=00000000 n=0 rxrpc_rx_ack: c=00000044 00000002 REQ r=00000001 f=00000002 p=00000001 n=0 ... DATA packet 1 (q=xx) has REQUEST_ACK set (bit 1 of fl=xx). The incoming ping (labelled PNG) hard-acks the request DATA packet (f=xx exceeds the sequence number of the DATA packet), causing it to be discarded from the Tx ring. The ACK that was requested (labelled REQ, r=xx references the serial of the DATA packet) comes after the ping, but the sk_buff holding the timestamp has gone and the RTT sample is lost. This is particularly noticeable on RPC calls used to probe the service offered by the peer. A lot of peers end up with an unknown RTT because we only ever sent a single RPC. This confuses the server rotation algorithm. Fix this by caching the information about the outgoing packet in RTT calculations in the rxrpc_call struct rather than looking in the Tx ring. A four-deep buffer is maintained and both REQUEST_ACK-flagged DATA and PING-ACK transmissions are recorded in there. When the appropriate response ACK is received, the buffer is checked for a match and, if found, an RTT sample is recorded. If a received ACK refers to a packet with a later serial number than an entry in the cache, that entry is presumed lost and the entry is made available to record a new transmission. ACKs types other than REQUESTED-type and PING-type cause any matching sample to be cancelled as they don't necessarily represent a useful measurement. If there's no space in the buffer on ping/data transmission, the sample base is discarded. Fixes: 50235c4b5a2f ("rxrpc: Obtain RTT data by requesting ACKs on DATA packets") Signed-off-by: David Howells <dhowells@redhat.com>
2020-08-20 06:29:16 +08:00
__entry->slot,
__print_symbolic(__entry->why, rxrpc_rtt_rx_traces),
__entry->send_serial,
__entry->resp_serial,
__entry->rtt,
rxrpc: Fix the excessive initial retransmission timeout rxrpc currently uses a fixed 4s retransmission timeout until the RTT is sufficiently sampled. This can cause problems with some fileservers with calls to the cache manager in the afs filesystem being dropped from the fileserver because a packet goes missing and the retransmission timeout is greater than the call expiry timeout. Fix this by: (1) Copying the RTT/RTO calculation code from Linux's TCP implementation and altering it to fit rxrpc. (2) Altering the various users of the RTT to make use of the new SRTT value. (3) Replacing the use of rxrpc_resend_timeout to use the calculated RTO value instead (which is needed in jiffies), along with a backoff. Notes: (1) rxrpc provides RTT samples by matching the serial numbers on outgoing DATA packets that have the RXRPC_REQUEST_ACK set and PING ACK packets against the reference serial number in incoming REQUESTED ACK and PING-RESPONSE ACK packets. (2) Each packet that is transmitted on an rxrpc connection gets a new per-connection serial number, even for retransmissions, so an ACK can be cross-referenced to a specific trigger packet. This allows RTT information to be drawn from retransmitted DATA packets also. (3) rxrpc maintains the RTT/RTO state on the rxrpc_peer record rather than on an rxrpc_call because many RPC calls won't live long enough to generate more than one sample. (4) The calculated SRTT value is in units of 8ths of a microsecond rather than nanoseconds. The (S)RTT and RTO values are displayed in /proc/net/rxrpc/peers. Fixes: 17926a79320a ([AF_RXRPC]: Provide secure RxRPC sockets for use by userspace and kernel both"") Signed-off-by: David Howells <dhowells@redhat.com>
2020-05-11 21:54:34 +08:00
__entry->rto)
);
TRACE_EVENT(rxrpc_timer,
TP_PROTO(struct rxrpc_call *call, enum rxrpc_timer_trace why,
rxrpc: Fix call timeouts Fix the rxrpc call expiration timeouts and make them settable from userspace. By analogy with other rx implementations, there should be three timeouts: (1) "Normal timeout" This is set for all calls and is triggered if we haven't received any packets from the peer in a while. It is measured from the last time we received any packet on that call. This is not reset by any connection packets (such as CHALLENGE/RESPONSE packets). If a service operation takes a long time, the server should generate PING ACKs at a duration that's substantially less than the normal timeout so is to keep both sides alive. This is set at 1/6 of normal timeout. (2) "Idle timeout" This is set only for a service call and is triggered if we stop receiving the DATA packets that comprise the request data. It is measured from the last time we received a DATA packet. (3) "Hard timeout" This can be set for a call and specified the maximum lifetime of that call. It should not be specified by default. Some operations (such as volume transfer) take a long time. Allow userspace to set/change the timeouts on a call with sendmsg, using a control message: RXRPC_SET_CALL_TIMEOUTS The data to the message is a number of 32-bit words, not all of which need be given: u32 hard_timeout; /* sec from first packet */ u32 idle_timeout; /* msec from packet Rx */ u32 normal_timeout; /* msec from data Rx */ This can be set in combination with any other sendmsg() that affects a call. Signed-off-by: David Howells <dhowells@redhat.com>
2017-11-24 18:18:41 +08:00
unsigned long now),
rxrpc: Fix call timeouts Fix the rxrpc call expiration timeouts and make them settable from userspace. By analogy with other rx implementations, there should be three timeouts: (1) "Normal timeout" This is set for all calls and is triggered if we haven't received any packets from the peer in a while. It is measured from the last time we received any packet on that call. This is not reset by any connection packets (such as CHALLENGE/RESPONSE packets). If a service operation takes a long time, the server should generate PING ACKs at a duration that's substantially less than the normal timeout so is to keep both sides alive. This is set at 1/6 of normal timeout. (2) "Idle timeout" This is set only for a service call and is triggered if we stop receiving the DATA packets that comprise the request data. It is measured from the last time we received a DATA packet. (3) "Hard timeout" This can be set for a call and specified the maximum lifetime of that call. It should not be specified by default. Some operations (such as volume transfer) take a long time. Allow userspace to set/change the timeouts on a call with sendmsg, using a control message: RXRPC_SET_CALL_TIMEOUTS The data to the message is a number of 32-bit words, not all of which need be given: u32 hard_timeout; /* sec from first packet */ u32 idle_timeout; /* msec from packet Rx */ u32 normal_timeout; /* msec from data Rx */ This can be set in combination with any other sendmsg() that affects a call. Signed-off-by: David Howells <dhowells@redhat.com>
2017-11-24 18:18:41 +08:00
TP_ARGS(call, why, now),
TP_STRUCT__entry(
__field(unsigned int, call)
__field(enum rxrpc_timer_trace, why)
__field(long, now)
__field(long, ack_at)
__field(long, ack_lost_at)
__field(long, resend_at)
__field(long, ping_at)
__field(long, expect_rx_by)
__field(long, expect_req_by)
__field(long, expect_term_by)
__field(long, timer)
),
TP_fast_assign(
__entry->call = call->debug_id;
rxrpc: Fix call timeouts Fix the rxrpc call expiration timeouts and make them settable from userspace. By analogy with other rx implementations, there should be three timeouts: (1) "Normal timeout" This is set for all calls and is triggered if we haven't received any packets from the peer in a while. It is measured from the last time we received any packet on that call. This is not reset by any connection packets (such as CHALLENGE/RESPONSE packets). If a service operation takes a long time, the server should generate PING ACKs at a duration that's substantially less than the normal timeout so is to keep both sides alive. This is set at 1/6 of normal timeout. (2) "Idle timeout" This is set only for a service call and is triggered if we stop receiving the DATA packets that comprise the request data. It is measured from the last time we received a DATA packet. (3) "Hard timeout" This can be set for a call and specified the maximum lifetime of that call. It should not be specified by default. Some operations (such as volume transfer) take a long time. Allow userspace to set/change the timeouts on a call with sendmsg, using a control message: RXRPC_SET_CALL_TIMEOUTS The data to the message is a number of 32-bit words, not all of which need be given: u32 hard_timeout; /* sec from first packet */ u32 idle_timeout; /* msec from packet Rx */ u32 normal_timeout; /* msec from data Rx */ This can be set in combination with any other sendmsg() that affects a call. Signed-off-by: David Howells <dhowells@redhat.com>
2017-11-24 18:18:41 +08:00
__entry->why = why;
__entry->now = now;
__entry->ack_at = call->delay_ack_at;
__entry->ack_lost_at = call->ack_lost_at;
rxrpc: Fix call timeouts Fix the rxrpc call expiration timeouts and make them settable from userspace. By analogy with other rx implementations, there should be three timeouts: (1) "Normal timeout" This is set for all calls and is triggered if we haven't received any packets from the peer in a while. It is measured from the last time we received any packet on that call. This is not reset by any connection packets (such as CHALLENGE/RESPONSE packets). If a service operation takes a long time, the server should generate PING ACKs at a duration that's substantially less than the normal timeout so is to keep both sides alive. This is set at 1/6 of normal timeout. (2) "Idle timeout" This is set only for a service call and is triggered if we stop receiving the DATA packets that comprise the request data. It is measured from the last time we received a DATA packet. (3) "Hard timeout" This can be set for a call and specified the maximum lifetime of that call. It should not be specified by default. Some operations (such as volume transfer) take a long time. Allow userspace to set/change the timeouts on a call with sendmsg, using a control message: RXRPC_SET_CALL_TIMEOUTS The data to the message is a number of 32-bit words, not all of which need be given: u32 hard_timeout; /* sec from first packet */ u32 idle_timeout; /* msec from packet Rx */ u32 normal_timeout; /* msec from data Rx */ This can be set in combination with any other sendmsg() that affects a call. Signed-off-by: David Howells <dhowells@redhat.com>
2017-11-24 18:18:41 +08:00
__entry->resend_at = call->resend_at;
__entry->expect_rx_by = call->expect_rx_by;
__entry->expect_req_by = call->expect_req_by;
__entry->expect_term_by = call->expect_term_by;
__entry->timer = call->timer.expires;
),
TP_printk("c=%08x %s a=%ld la=%ld r=%ld xr=%ld xq=%ld xt=%ld t=%ld",
__entry->call,
__print_symbolic(__entry->why, rxrpc_timer_traces),
rxrpc: Fix call timeouts Fix the rxrpc call expiration timeouts and make them settable from userspace. By analogy with other rx implementations, there should be three timeouts: (1) "Normal timeout" This is set for all calls and is triggered if we haven't received any packets from the peer in a while. It is measured from the last time we received any packet on that call. This is not reset by any connection packets (such as CHALLENGE/RESPONSE packets). If a service operation takes a long time, the server should generate PING ACKs at a duration that's substantially less than the normal timeout so is to keep both sides alive. This is set at 1/6 of normal timeout. (2) "Idle timeout" This is set only for a service call and is triggered if we stop receiving the DATA packets that comprise the request data. It is measured from the last time we received a DATA packet. (3) "Hard timeout" This can be set for a call and specified the maximum lifetime of that call. It should not be specified by default. Some operations (such as volume transfer) take a long time. Allow userspace to set/change the timeouts on a call with sendmsg, using a control message: RXRPC_SET_CALL_TIMEOUTS The data to the message is a number of 32-bit words, not all of which need be given: u32 hard_timeout; /* sec from first packet */ u32 idle_timeout; /* msec from packet Rx */ u32 normal_timeout; /* msec from data Rx */ This can be set in combination with any other sendmsg() that affects a call. Signed-off-by: David Howells <dhowells@redhat.com>
2017-11-24 18:18:41 +08:00
__entry->ack_at - __entry->now,
__entry->ack_lost_at - __entry->now,
rxrpc: Fix call timeouts Fix the rxrpc call expiration timeouts and make them settable from userspace. By analogy with other rx implementations, there should be three timeouts: (1) "Normal timeout" This is set for all calls and is triggered if we haven't received any packets from the peer in a while. It is measured from the last time we received any packet on that call. This is not reset by any connection packets (such as CHALLENGE/RESPONSE packets). If a service operation takes a long time, the server should generate PING ACKs at a duration that's substantially less than the normal timeout so is to keep both sides alive. This is set at 1/6 of normal timeout. (2) "Idle timeout" This is set only for a service call and is triggered if we stop receiving the DATA packets that comprise the request data. It is measured from the last time we received a DATA packet. (3) "Hard timeout" This can be set for a call and specified the maximum lifetime of that call. It should not be specified by default. Some operations (such as volume transfer) take a long time. Allow userspace to set/change the timeouts on a call with sendmsg, using a control message: RXRPC_SET_CALL_TIMEOUTS The data to the message is a number of 32-bit words, not all of which need be given: u32 hard_timeout; /* sec from first packet */ u32 idle_timeout; /* msec from packet Rx */ u32 normal_timeout; /* msec from data Rx */ This can be set in combination with any other sendmsg() that affects a call. Signed-off-by: David Howells <dhowells@redhat.com>
2017-11-24 18:18:41 +08:00
__entry->resend_at - __entry->now,
__entry->expect_rx_by - __entry->now,
__entry->expect_req_by - __entry->now,
__entry->expect_term_by - __entry->now,
__entry->timer - __entry->now)
);
TRACE_EVENT(rxrpc_timer_expired,
TP_PROTO(struct rxrpc_call *call, unsigned long now),
TP_ARGS(call, now),
TP_STRUCT__entry(
__field(unsigned int, call)
__field(long, now)
__field(long, ack_at)
__field(long, ack_lost_at)
__field(long, resend_at)
__field(long, ping_at)
__field(long, expect_rx_by)
__field(long, expect_req_by)
__field(long, expect_term_by)
__field(long, timer)
),
TP_fast_assign(
__entry->call = call->debug_id;
__entry->now = now;
__entry->ack_at = call->delay_ack_at;
__entry->ack_lost_at = call->ack_lost_at;
__entry->resend_at = call->resend_at;
__entry->expect_rx_by = call->expect_rx_by;
__entry->expect_req_by = call->expect_req_by;
__entry->expect_term_by = call->expect_term_by;
__entry->timer = call->timer.expires;
),
TP_printk("c=%08x EXPIRED a=%ld la=%ld r=%ld xr=%ld xq=%ld xt=%ld t=%ld",
__entry->call,
__entry->ack_at - __entry->now,
__entry->ack_lost_at - __entry->now,
__entry->resend_at - __entry->now,
__entry->expect_rx_by - __entry->now,
__entry->expect_req_by - __entry->now,
__entry->expect_term_by - __entry->now,
__entry->timer - __entry->now)
);
TRACE_EVENT(rxrpc_rx_lose,
TP_PROTO(struct rxrpc_skb_priv *sp),
TP_ARGS(sp),
TP_STRUCT__entry(
__field_struct(struct rxrpc_host_header, hdr)
),
TP_fast_assign(
memcpy(&__entry->hdr, &sp->hdr, sizeof(__entry->hdr));
),
TP_printk("%08x:%08x:%08x:%04x %08x %08x %02x %02x %s *LOSE*",
__entry->hdr.epoch, __entry->hdr.cid,
__entry->hdr.callNumber, __entry->hdr.serviceId,
__entry->hdr.serial, __entry->hdr.seq,
__entry->hdr.type, __entry->hdr.flags,
__entry->hdr.type <= 15 ?
__print_symbolic(__entry->hdr.type, rxrpc_pkts) : "?UNK")
);
TRACE_EVENT(rxrpc_propose_ack,
TP_PROTO(struct rxrpc_call *call, enum rxrpc_propose_ack_trace why,
u8 ack_reason, rxrpc_serial_t serial),
TP_ARGS(call, why, ack_reason, serial),
TP_STRUCT__entry(
__field(unsigned int, call)
__field(enum rxrpc_propose_ack_trace, why)
__field(rxrpc_serial_t, serial)
__field(u8, ack_reason)
),
TP_fast_assign(
__entry->call = call->debug_id;
__entry->why = why;
__entry->serial = serial;
__entry->ack_reason = ack_reason;
),
TP_printk("c=%08x %s %s r=%08x",
__entry->call,
__print_symbolic(__entry->why, rxrpc_propose_ack_traces),
__print_symbolic(__entry->ack_reason, rxrpc_ack_names),
__entry->serial)
);
TRACE_EVENT(rxrpc_send_ack,
TP_PROTO(struct rxrpc_call *call, enum rxrpc_propose_ack_trace why,
u8 ack_reason, rxrpc_serial_t serial),
TP_ARGS(call, why, ack_reason, serial),
TP_STRUCT__entry(
__field(unsigned int, call)
__field(enum rxrpc_propose_ack_trace, why)
__field(rxrpc_serial_t, serial)
__field(u8, ack_reason)
),
TP_fast_assign(
__entry->call = call->debug_id;
__entry->why = why;
__entry->serial = serial;
__entry->ack_reason = ack_reason;
),
TP_printk("c=%08x %s %s r=%08x",
__entry->call,
__print_symbolic(__entry->why, rxrpc_propose_ack_traces),
__print_symbolic(__entry->ack_reason, rxrpc_ack_names),
__entry->serial)
);
TRACE_EVENT(rxrpc_drop_ack,
TP_PROTO(struct rxrpc_call *call, enum rxrpc_propose_ack_trace why,
u8 ack_reason, rxrpc_serial_t serial, bool nobuf),
TP_ARGS(call, why, ack_reason, serial, nobuf),
TP_STRUCT__entry(
__field(unsigned int, call)
__field(enum rxrpc_propose_ack_trace, why)
__field(rxrpc_serial_t, serial)
__field(u8, ack_reason)
__field(bool, nobuf)
),
TP_fast_assign(
__entry->call = call->debug_id;
__entry->why = why;
__entry->serial = serial;
__entry->ack_reason = ack_reason;
__entry->nobuf = nobuf;
),
TP_printk("c=%08x %s %s r=%08x nbf=%u",
__entry->call,
__print_symbolic(__entry->why, rxrpc_propose_ack_traces),
__print_symbolic(__entry->ack_reason, rxrpc_ack_names),
__entry->serial, __entry->nobuf)
);
TRACE_EVENT(rxrpc_retransmit,
rxrpc: Don't use a ring buffer for call Tx queue Change the way the Tx queueing works to make the following ends easier to achieve: (1) The filling of packets, the encryption of packets and the transmission of packets can be handled in parallel by separate threads, rather than rxrpc_sendmsg() allocating, filling, encrypting and transmitting each packet before moving onto the next one. (2) Get rid of the fixed-size ring which sets a hard limit on the number of packets that can be retained in the ring. This allows the number of packets to increase without having to allocate a very large ring or having variable-sized rings. [Note: the downside of this is that it's then less efficient to locate a packet for retransmission as we then have to step through a list and examine each buffer in the list.] (3) Allow the filler/encrypter to run ahead of the transmission window. (4) Make it easier to do zero copy UDP from the packet buffers. (5) Make it easier to do zero copy from userspace to the packet buffers - and thence to UDP (only if for unauthenticated connections). To that end, the following changes are made: (1) Use the new rxrpc_txbuf struct instead of sk_buff for keeping packets to be transmitted in. This allows them to be placed on multiple queues simultaneously. An sk_buff isn't really necessary as it's never passed on to lower-level networking code. (2) Keep the transmissable packets in a linked list on the call struct rather than in a ring. As a consequence, the annotation buffer isn't used either; rather a flag is set on the packet to indicate ackedness. (3) Use the RXRPC_CALL_TX_LAST flag to indicate that the last packet to be transmitted has been queued. Add RXRPC_CALL_TX_ALL_ACKED to indicate that all packets up to and including the last got hard acked. (4) Wire headers are now stored in the txbuf rather than being concocted on the stack and they're stored immediately before the data, thereby allowing zerocopy of a single span. (5) Don't bother with instant-resend on transmission failure; rather, leave it for a timer or an ACK packet to trigger. Signed-off-by: David Howells <dhowells@redhat.com> cc: Marc Dionne <marc.dionne@auristor.com> cc: linux-afs@lists.infradead.org
2022-04-01 06:55:08 +08:00
TP_PROTO(struct rxrpc_call *call, rxrpc_seq_t seq, s64 expiry),
rxrpc: Don't use a ring buffer for call Tx queue Change the way the Tx queueing works to make the following ends easier to achieve: (1) The filling of packets, the encryption of packets and the transmission of packets can be handled in parallel by separate threads, rather than rxrpc_sendmsg() allocating, filling, encrypting and transmitting each packet before moving onto the next one. (2) Get rid of the fixed-size ring which sets a hard limit on the number of packets that can be retained in the ring. This allows the number of packets to increase without having to allocate a very large ring or having variable-sized rings. [Note: the downside of this is that it's then less efficient to locate a packet for retransmission as we then have to step through a list and examine each buffer in the list.] (3) Allow the filler/encrypter to run ahead of the transmission window. (4) Make it easier to do zero copy UDP from the packet buffers. (5) Make it easier to do zero copy from userspace to the packet buffers - and thence to UDP (only if for unauthenticated connections). To that end, the following changes are made: (1) Use the new rxrpc_txbuf struct instead of sk_buff for keeping packets to be transmitted in. This allows them to be placed on multiple queues simultaneously. An sk_buff isn't really necessary as it's never passed on to lower-level networking code. (2) Keep the transmissable packets in a linked list on the call struct rather than in a ring. As a consequence, the annotation buffer isn't used either; rather a flag is set on the packet to indicate ackedness. (3) Use the RXRPC_CALL_TX_LAST flag to indicate that the last packet to be transmitted has been queued. Add RXRPC_CALL_TX_ALL_ACKED to indicate that all packets up to and including the last got hard acked. (4) Wire headers are now stored in the txbuf rather than being concocted on the stack and they're stored immediately before the data, thereby allowing zerocopy of a single span. (5) Don't bother with instant-resend on transmission failure; rather, leave it for a timer or an ACK packet to trigger. Signed-off-by: David Howells <dhowells@redhat.com> cc: Marc Dionne <marc.dionne@auristor.com> cc: linux-afs@lists.infradead.org
2022-04-01 06:55:08 +08:00
TP_ARGS(call, seq, expiry),
TP_STRUCT__entry(
__field(unsigned int, call)
__field(rxrpc_seq_t, seq)
__field(s64, expiry)
),
TP_fast_assign(
__entry->call = call->debug_id;
__entry->seq = seq;
__entry->expiry = expiry;
),
rxrpc: Don't use a ring buffer for call Tx queue Change the way the Tx queueing works to make the following ends easier to achieve: (1) The filling of packets, the encryption of packets and the transmission of packets can be handled in parallel by separate threads, rather than rxrpc_sendmsg() allocating, filling, encrypting and transmitting each packet before moving onto the next one. (2) Get rid of the fixed-size ring which sets a hard limit on the number of packets that can be retained in the ring. This allows the number of packets to increase without having to allocate a very large ring or having variable-sized rings. [Note: the downside of this is that it's then less efficient to locate a packet for retransmission as we then have to step through a list and examine each buffer in the list.] (3) Allow the filler/encrypter to run ahead of the transmission window. (4) Make it easier to do zero copy UDP from the packet buffers. (5) Make it easier to do zero copy from userspace to the packet buffers - and thence to UDP (only if for unauthenticated connections). To that end, the following changes are made: (1) Use the new rxrpc_txbuf struct instead of sk_buff for keeping packets to be transmitted in. This allows them to be placed on multiple queues simultaneously. An sk_buff isn't really necessary as it's never passed on to lower-level networking code. (2) Keep the transmissable packets in a linked list on the call struct rather than in a ring. As a consequence, the annotation buffer isn't used either; rather a flag is set on the packet to indicate ackedness. (3) Use the RXRPC_CALL_TX_LAST flag to indicate that the last packet to be transmitted has been queued. Add RXRPC_CALL_TX_ALL_ACKED to indicate that all packets up to and including the last got hard acked. (4) Wire headers are now stored in the txbuf rather than being concocted on the stack and they're stored immediately before the data, thereby allowing zerocopy of a single span. (5) Don't bother with instant-resend on transmission failure; rather, leave it for a timer or an ACK packet to trigger. Signed-off-by: David Howells <dhowells@redhat.com> cc: Marc Dionne <marc.dionne@auristor.com> cc: linux-afs@lists.infradead.org
2022-04-01 06:55:08 +08:00
TP_printk("c=%08x q=%x xp=%lld",
__entry->call,
__entry->seq,
__entry->expiry)
);
TRACE_EVENT(rxrpc_congest,
TP_PROTO(struct rxrpc_call *call, struct rxrpc_ack_summary *summary,
rxrpc_serial_t ack_serial, enum rxrpc_congest_change change),
TP_ARGS(call, summary, ack_serial, change),
TP_STRUCT__entry(
__field(unsigned int, call)
__field(enum rxrpc_congest_change, change)
__field(rxrpc_seq_t, hard_ack)
__field(rxrpc_seq_t, top)
__field(rxrpc_seq_t, lowest_nak)
__field(rxrpc_serial_t, ack_serial)
__field_struct(struct rxrpc_ack_summary, sum)
),
TP_fast_assign(
__entry->call = call->debug_id;
__entry->change = change;
rxrpc: Don't use a ring buffer for call Tx queue Change the way the Tx queueing works to make the following ends easier to achieve: (1) The filling of packets, the encryption of packets and the transmission of packets can be handled in parallel by separate threads, rather than rxrpc_sendmsg() allocating, filling, encrypting and transmitting each packet before moving onto the next one. (2) Get rid of the fixed-size ring which sets a hard limit on the number of packets that can be retained in the ring. This allows the number of packets to increase without having to allocate a very large ring or having variable-sized rings. [Note: the downside of this is that it's then less efficient to locate a packet for retransmission as we then have to step through a list and examine each buffer in the list.] (3) Allow the filler/encrypter to run ahead of the transmission window. (4) Make it easier to do zero copy UDP from the packet buffers. (5) Make it easier to do zero copy from userspace to the packet buffers - and thence to UDP (only if for unauthenticated connections). To that end, the following changes are made: (1) Use the new rxrpc_txbuf struct instead of sk_buff for keeping packets to be transmitted in. This allows them to be placed on multiple queues simultaneously. An sk_buff isn't really necessary as it's never passed on to lower-level networking code. (2) Keep the transmissable packets in a linked list on the call struct rather than in a ring. As a consequence, the annotation buffer isn't used either; rather a flag is set on the packet to indicate ackedness. (3) Use the RXRPC_CALL_TX_LAST flag to indicate that the last packet to be transmitted has been queued. Add RXRPC_CALL_TX_ALL_ACKED to indicate that all packets up to and including the last got hard acked. (4) Wire headers are now stored in the txbuf rather than being concocted on the stack and they're stored immediately before the data, thereby allowing zerocopy of a single span. (5) Don't bother with instant-resend on transmission failure; rather, leave it for a timer or an ACK packet to trigger. Signed-off-by: David Howells <dhowells@redhat.com> cc: Marc Dionne <marc.dionne@auristor.com> cc: linux-afs@lists.infradead.org
2022-04-01 06:55:08 +08:00
__entry->hard_ack = call->acks_hard_ack;
__entry->top = call->tx_top;
__entry->lowest_nak = call->acks_lowest_nak;
__entry->ack_serial = ack_serial;
memcpy(&__entry->sum, summary, sizeof(__entry->sum));
),
rxrpc: Fix counting of new acks and nacks [ Upstream commit 41b7fa157ea1c8c3a575ca7f5f32034de9bee3ae ] Fix the counting of new acks and nacks when parsing a packet - something that is used in congestion control. As the code stands, it merely notes if there are any nacks whereas what we really should do is compare the previous SACK table to the new one, assuming we get two successive ACK packets with nacks in them. However, we really don't want to do that if we can avoid it as the tables might not correspond directly as one may be shifted from the other - something that will only get harder to deal with once extended ACK tables come into full use (with a capacity of up to 8192). Instead, count the number of nacks shifted out of the old SACK, the number of nacks retained in the portion still active and the number of new acks and nacks in the new table then calculate what we need. Note this ends up a bit of an estimate as the Rx protocol allows acks to be withdrawn by the receiver and packets requested to be retransmitted. Fixes: d57a3a151660 ("rxrpc: Save last ACK's SACK table rather than marking txbufs") Signed-off-by: David Howells <dhowells@redhat.com> cc: Marc Dionne <marc.dionne@auristor.com> cc: "David S. Miller" <davem@davemloft.net> cc: Eric Dumazet <edumazet@google.com> cc: Jakub Kicinski <kuba@kernel.org> cc: Paolo Abeni <pabeni@redhat.com> cc: linux-afs@lists.infradead.org cc: netdev@vger.kernel.org Signed-off-by: David S. Miller <davem@davemloft.net> Signed-off-by: Sasha Levin <sashal@kernel.org>
2024-02-02 23:19:16 +08:00
TP_printk("c=%08x r=%08x %s q=%08x %s cw=%u ss=%u nA=%u,%u+%u,%u b=%u u=%u d=%u l=%x%s%s%s",
__entry->call,
__entry->ack_serial,
__print_symbolic(__entry->sum.ack_reason, rxrpc_ack_names),
__entry->hard_ack,
__print_symbolic(__entry->sum.mode, rxrpc_congest_modes),
__entry->sum.cwnd,
__entry->sum.ssthresh,
rxrpc: Fix counting of new acks and nacks [ Upstream commit 41b7fa157ea1c8c3a575ca7f5f32034de9bee3ae ] Fix the counting of new acks and nacks when parsing a packet - something that is used in congestion control. As the code stands, it merely notes if there are any nacks whereas what we really should do is compare the previous SACK table to the new one, assuming we get two successive ACK packets with nacks in them. However, we really don't want to do that if we can avoid it as the tables might not correspond directly as one may be shifted from the other - something that will only get harder to deal with once extended ACK tables come into full use (with a capacity of up to 8192). Instead, count the number of nacks shifted out of the old SACK, the number of nacks retained in the portion still active and the number of new acks and nacks in the new table then calculate what we need. Note this ends up a bit of an estimate as the Rx protocol allows acks to be withdrawn by the receiver and packets requested to be retransmitted. Fixes: d57a3a151660 ("rxrpc: Save last ACK's SACK table rather than marking txbufs") Signed-off-by: David Howells <dhowells@redhat.com> cc: Marc Dionne <marc.dionne@auristor.com> cc: "David S. Miller" <davem@davemloft.net> cc: Eric Dumazet <edumazet@google.com> cc: Jakub Kicinski <kuba@kernel.org> cc: Paolo Abeni <pabeni@redhat.com> cc: linux-afs@lists.infradead.org cc: netdev@vger.kernel.org Signed-off-by: David S. Miller <davem@davemloft.net> Signed-off-by: Sasha Levin <sashal@kernel.org>
2024-02-02 23:19:16 +08:00
__entry->sum.nr_acks, __entry->sum.nr_retained_nacks,
__entry->sum.nr_new_acks,
rxrpc: Fix counting of new acks and nacks [ Upstream commit 41b7fa157ea1c8c3a575ca7f5f32034de9bee3ae ] Fix the counting of new acks and nacks when parsing a packet - something that is used in congestion control. As the code stands, it merely notes if there are any nacks whereas what we really should do is compare the previous SACK table to the new one, assuming we get two successive ACK packets with nacks in them. However, we really don't want to do that if we can avoid it as the tables might not correspond directly as one may be shifted from the other - something that will only get harder to deal with once extended ACK tables come into full use (with a capacity of up to 8192). Instead, count the number of nacks shifted out of the old SACK, the number of nacks retained in the portion still active and the number of new acks and nacks in the new table then calculate what we need. Note this ends up a bit of an estimate as the Rx protocol allows acks to be withdrawn by the receiver and packets requested to be retransmitted. Fixes: d57a3a151660 ("rxrpc: Save last ACK's SACK table rather than marking txbufs") Signed-off-by: David Howells <dhowells@redhat.com> cc: Marc Dionne <marc.dionne@auristor.com> cc: "David S. Miller" <davem@davemloft.net> cc: Eric Dumazet <edumazet@google.com> cc: Jakub Kicinski <kuba@kernel.org> cc: Paolo Abeni <pabeni@redhat.com> cc: linux-afs@lists.infradead.org cc: netdev@vger.kernel.org Signed-off-by: David S. Miller <davem@davemloft.net> Signed-off-by: Sasha Levin <sashal@kernel.org>
2024-02-02 23:19:16 +08:00
__entry->sum.nr_new_nacks,
__entry->top - __entry->hard_ack,
__entry->sum.cumulative_acks,
__entry->sum.dup_acks,
__entry->lowest_nak, __entry->sum.new_low_nack ? "!" : "",
__print_symbolic(__entry->change, rxrpc_congest_changes),
__entry->sum.retrans_timeo ? " rTxTo" : "")
);
TRACE_EVENT(rxrpc_reset_cwnd,
TP_PROTO(struct rxrpc_call *call, ktime_t now),
TP_ARGS(call, now),
TP_STRUCT__entry(
__field(unsigned int, call)
__field(enum rxrpc_congest_mode, mode)
__field(unsigned short, cwnd)
__field(unsigned short, extra)
__field(rxrpc_seq_t, hard_ack)
__field(rxrpc_seq_t, prepared)
__field(ktime_t, since_last_tx)
__field(bool, has_data)
),
TP_fast_assign(
__entry->call = call->debug_id;
__entry->mode = call->cong_mode;
__entry->cwnd = call->cong_cwnd;
__entry->extra = call->cong_extra;
__entry->hard_ack = call->acks_hard_ack;
__entry->prepared = call->tx_prepared - call->tx_bottom;
__entry->since_last_tx = ktime_sub(now, call->tx_last_sent);
__entry->has_data = !list_empty(&call->tx_sendmsg);
),
TP_printk("c=%08x q=%08x %s cw=%u+%u pr=%u tm=%llu d=%u",
__entry->call,
__entry->hard_ack,
__print_symbolic(__entry->mode, rxrpc_congest_modes),
__entry->cwnd,
__entry->extra,
__entry->prepared,
ktime_to_ns(__entry->since_last_tx),
__entry->has_data)
);
TRACE_EVENT(rxrpc_disconnect_call,
TP_PROTO(struct rxrpc_call *call),
TP_ARGS(call),
TP_STRUCT__entry(
__field(unsigned int, call)
__field(u32, abort_code)
),
TP_fast_assign(
__entry->call = call->debug_id;
__entry->abort_code = call->abort_code;
),
TP_printk("c=%08x ab=%08x",
__entry->call,
__entry->abort_code)
);
TRACE_EVENT(rxrpc_improper_term,
TP_PROTO(struct rxrpc_call *call),
TP_ARGS(call),
TP_STRUCT__entry(
__field(unsigned int, call)
__field(u32, abort_code)
),
TP_fast_assign(
__entry->call = call->debug_id;
__entry->abort_code = call->abort_code;
),
TP_printk("c=%08x ab=%08x",
__entry->call,
__entry->abort_code)
);
TRACE_EVENT(rxrpc_connect_call,
TP_PROTO(struct rxrpc_call *call),
TP_ARGS(call),
TP_STRUCT__entry(
__field(unsigned int, call)
__field(unsigned long, user_call_ID)
__field(u32, cid)
__field(u32, call_id)
__field_struct(struct sockaddr_rxrpc, srx)
),
TP_fast_assign(
__entry->call = call->debug_id;
__entry->user_call_ID = call->user_call_ID;
__entry->cid = call->cid;
__entry->call_id = call->call_id;
__entry->srx = call->dest_srx;
),
TP_printk("c=%08x u=%p %08x:%08x dst=%pISp",
__entry->call,
(void *)__entry->user_call_ID,
__entry->cid,
__entry->call_id,
&__entry->srx.transport)
);
TRACE_EVENT(rxrpc_resend,
TP_PROTO(struct rxrpc_call *call, struct sk_buff *ack),
TP_ARGS(call, ack),
TP_STRUCT__entry(
__field(unsigned int, call)
__field(rxrpc_seq_t, seq)
__field(rxrpc_seq_t, transmitted)
__field(rxrpc_serial_t, ack_serial)
),
TP_fast_assign(
struct rxrpc_skb_priv *sp = ack ? rxrpc_skb(ack) : NULL;
__entry->call = call->debug_id;
rxrpc: Don't use a ring buffer for call Tx queue Change the way the Tx queueing works to make the following ends easier to achieve: (1) The filling of packets, the encryption of packets and the transmission of packets can be handled in parallel by separate threads, rather than rxrpc_sendmsg() allocating, filling, encrypting and transmitting each packet before moving onto the next one. (2) Get rid of the fixed-size ring which sets a hard limit on the number of packets that can be retained in the ring. This allows the number of packets to increase without having to allocate a very large ring or having variable-sized rings. [Note: the downside of this is that it's then less efficient to locate a packet for retransmission as we then have to step through a list and examine each buffer in the list.] (3) Allow the filler/encrypter to run ahead of the transmission window. (4) Make it easier to do zero copy UDP from the packet buffers. (5) Make it easier to do zero copy from userspace to the packet buffers - and thence to UDP (only if for unauthenticated connections). To that end, the following changes are made: (1) Use the new rxrpc_txbuf struct instead of sk_buff for keeping packets to be transmitted in. This allows them to be placed on multiple queues simultaneously. An sk_buff isn't really necessary as it's never passed on to lower-level networking code. (2) Keep the transmissable packets in a linked list on the call struct rather than in a ring. As a consequence, the annotation buffer isn't used either; rather a flag is set on the packet to indicate ackedness. (3) Use the RXRPC_CALL_TX_LAST flag to indicate that the last packet to be transmitted has been queued. Add RXRPC_CALL_TX_ALL_ACKED to indicate that all packets up to and including the last got hard acked. (4) Wire headers are now stored in the txbuf rather than being concocted on the stack and they're stored immediately before the data, thereby allowing zerocopy of a single span. (5) Don't bother with instant-resend on transmission failure; rather, leave it for a timer or an ACK packet to trigger. Signed-off-by: David Howells <dhowells@redhat.com> cc: Marc Dionne <marc.dionne@auristor.com> cc: linux-afs@lists.infradead.org
2022-04-01 06:55:08 +08:00
__entry->seq = call->acks_hard_ack;
__entry->transmitted = call->tx_transmitted;
__entry->ack_serial = sp ? sp->hdr.serial : 0;
),
TP_printk("c=%08x r=%x q=%x tq=%x",
__entry->call,
__entry->ack_serial,
__entry->seq,
__entry->transmitted)
);
TRACE_EVENT(rxrpc_rx_icmp,
TP_PROTO(struct rxrpc_peer *peer, struct sock_extended_err *ee,
struct sockaddr_rxrpc *srx),
TP_ARGS(peer, ee, srx),
TP_STRUCT__entry(
__field(unsigned int, peer)
__field_struct(struct sock_extended_err, ee)
__field_struct(struct sockaddr_rxrpc, srx)
),
TP_fast_assign(
__entry->peer = peer->debug_id;
memcpy(&__entry->ee, ee, sizeof(__entry->ee));
memcpy(&__entry->srx, srx, sizeof(__entry->srx));
),
TP_printk("P=%08x o=%u t=%u c=%u i=%u d=%u e=%d %pISp",
__entry->peer,
__entry->ee.ee_origin,
__entry->ee.ee_type,
__entry->ee.ee_code,
__entry->ee.ee_info,
__entry->ee.ee_data,
__entry->ee.ee_errno,
&__entry->srx.transport)
);
TRACE_EVENT(rxrpc_tx_fail,
TP_PROTO(unsigned int debug_id, rxrpc_serial_t serial, int ret,
enum rxrpc_tx_point where),
TP_ARGS(debug_id, serial, ret, where),
TP_STRUCT__entry(
__field(unsigned int, debug_id)
__field(rxrpc_serial_t, serial)
__field(int, ret)
__field(enum rxrpc_tx_point, where)
),
TP_fast_assign(
__entry->debug_id = debug_id;
__entry->serial = serial;
__entry->ret = ret;
__entry->where = where;
),
TP_printk("c=%08x r=%x ret=%d %s",
__entry->debug_id,
__entry->serial,
__entry->ret,
__print_symbolic(__entry->where, rxrpc_tx_points))
);
rxrpc: Fix handling of call quietly cancelled out on server Sometimes an in-progress call will stop responding on the fileserver when the fileserver quietly cancels the call with an internally marked abort (RX_CALL_DEAD), without sending an ABORT to the client. This causes the client's call to eventually expire from lack of incoming packets directed its way, which currently leads to it being cancelled locally with ETIME. Note that it's not currently clear as to why this happens as it's really hard to reproduce. The rotation policy implement by kAFS, however, doesn't differentiate between ETIME meaning we didn't get any response from the server and ETIME meaning the call got cancelled mid-flow. The latter leads to an oops when fetching data as the rotation partially resets the afs_read descriptor, which can result in a cleared page pointer being dereferenced because that page has already been filled. Handle this by the following means: (1) Set a flag on a call when we receive a packet for it. (2) Store the highest packet serial number so far received for a call (bearing in mind this may wrap). (3) If, when the "not received anything recently" timeout expires on a call, we've received at least one packet for a call and the connection as a whole has received packets more recently than that call, then cancel the call locally with ECONNRESET rather than ETIME. This indicates that the call was definitely in progress on the server. (4) In kAFS, if the rotation algorithm sees ECONNRESET rather than ETIME, don't try the next server, but rather abort the call. This avoids the oops as we don't try to reuse the afs_read struct. Rather, as-yet ungotten pages will be reread at a later data. Also: (5) Add an rxrpc tracepoint to log detection of the call being reset. Without this, I occasionally see an oops like the following: general protection fault: 0000 [#1] SMP PTI ... RIP: 0010:_copy_to_iter+0x204/0x310 RSP: 0018:ffff8800cae0f828 EFLAGS: 00010206 RAX: 0000000000000560 RBX: 0000000000000560 RCX: 0000000000000560 RDX: ffff8800cae0f968 RSI: ffff8800d58b3312 RDI: 0005080000000000 RBP: ffff8800cae0f968 R08: 0000000000000560 R09: ffff8800ca00f400 R10: ffff8800c36f28d4 R11: 00000000000008c4 R12: ffff8800cae0f958 R13: 0000000000000560 R14: ffff8800d58b3312 R15: 0000000000000560 FS: 00007fdaef108080(0000) GS:ffff8800ca680000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fb28a8fa000 CR3: 00000000d2a76002 CR4: 00000000001606e0 Call Trace: skb_copy_datagram_iter+0x14e/0x289 rxrpc_recvmsg_data.isra.0+0x6f3/0xf68 ? trace_buffer_unlock_commit_regs+0x4f/0x89 rxrpc_kernel_recv_data+0x149/0x421 afs_extract_data+0x1e0/0x798 ? afs_wait_for_call_to_complete+0xc9/0x52e afs_deliver_fs_fetch_data+0x33a/0x5ab afs_deliver_to_call+0x1ee/0x5e0 ? afs_wait_for_call_to_complete+0xc9/0x52e afs_wait_for_call_to_complete+0x12b/0x52e ? wake_up_q+0x54/0x54 afs_make_call+0x287/0x462 ? afs_fs_fetch_data+0x3e6/0x3ed ? rcu_read_lock_sched_held+0x5d/0x63 afs_fs_fetch_data+0x3e6/0x3ed afs_fetch_data+0xbb/0x14a afs_readpages+0x317/0x40d __do_page_cache_readahead+0x203/0x2ba ? ondemand_readahead+0x3a7/0x3c1 ondemand_readahead+0x3a7/0x3c1 generic_file_buffered_read+0x18b/0x62f __vfs_read+0xdb/0xfe vfs_read+0xb2/0x137 ksys_read+0x50/0x8c do_syscall_64+0x7d/0x1a0 entry_SYSCALL_64_after_hwframe+0x49/0xbe Note the weird value in RDI which is a result of trying to kmap() a NULL page pointer. Signed-off-by: David Howells <dhowells@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2018-06-03 09:17:39 +08:00
TRACE_EVENT(rxrpc_call_reset,
TP_PROTO(struct rxrpc_call *call),
TP_ARGS(call),
TP_STRUCT__entry(
__field(unsigned int, debug_id)
__field(u32, cid)
__field(u32, call_id)
__field(rxrpc_serial_t, call_serial)
__field(rxrpc_serial_t, conn_serial)
__field(rxrpc_seq_t, tx_seq)
__field(rxrpc_seq_t, rx_seq)
rxrpc: Fix handling of call quietly cancelled out on server Sometimes an in-progress call will stop responding on the fileserver when the fileserver quietly cancels the call with an internally marked abort (RX_CALL_DEAD), without sending an ABORT to the client. This causes the client's call to eventually expire from lack of incoming packets directed its way, which currently leads to it being cancelled locally with ETIME. Note that it's not currently clear as to why this happens as it's really hard to reproduce. The rotation policy implement by kAFS, however, doesn't differentiate between ETIME meaning we didn't get any response from the server and ETIME meaning the call got cancelled mid-flow. The latter leads to an oops when fetching data as the rotation partially resets the afs_read descriptor, which can result in a cleared page pointer being dereferenced because that page has already been filled. Handle this by the following means: (1) Set a flag on a call when we receive a packet for it. (2) Store the highest packet serial number so far received for a call (bearing in mind this may wrap). (3) If, when the "not received anything recently" timeout expires on a call, we've received at least one packet for a call and the connection as a whole has received packets more recently than that call, then cancel the call locally with ECONNRESET rather than ETIME. This indicates that the call was definitely in progress on the server. (4) In kAFS, if the rotation algorithm sees ECONNRESET rather than ETIME, don't try the next server, but rather abort the call. This avoids the oops as we don't try to reuse the afs_read struct. Rather, as-yet ungotten pages will be reread at a later data. Also: (5) Add an rxrpc tracepoint to log detection of the call being reset. Without this, I occasionally see an oops like the following: general protection fault: 0000 [#1] SMP PTI ... RIP: 0010:_copy_to_iter+0x204/0x310 RSP: 0018:ffff8800cae0f828 EFLAGS: 00010206 RAX: 0000000000000560 RBX: 0000000000000560 RCX: 0000000000000560 RDX: ffff8800cae0f968 RSI: ffff8800d58b3312 RDI: 0005080000000000 RBP: ffff8800cae0f968 R08: 0000000000000560 R09: ffff8800ca00f400 R10: ffff8800c36f28d4 R11: 00000000000008c4 R12: ffff8800cae0f958 R13: 0000000000000560 R14: ffff8800d58b3312 R15: 0000000000000560 FS: 00007fdaef108080(0000) GS:ffff8800ca680000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fb28a8fa000 CR3: 00000000d2a76002 CR4: 00000000001606e0 Call Trace: skb_copy_datagram_iter+0x14e/0x289 rxrpc_recvmsg_data.isra.0+0x6f3/0xf68 ? trace_buffer_unlock_commit_regs+0x4f/0x89 rxrpc_kernel_recv_data+0x149/0x421 afs_extract_data+0x1e0/0x798 ? afs_wait_for_call_to_complete+0xc9/0x52e afs_deliver_fs_fetch_data+0x33a/0x5ab afs_deliver_to_call+0x1ee/0x5e0 ? afs_wait_for_call_to_complete+0xc9/0x52e afs_wait_for_call_to_complete+0x12b/0x52e ? wake_up_q+0x54/0x54 afs_make_call+0x287/0x462 ? afs_fs_fetch_data+0x3e6/0x3ed ? rcu_read_lock_sched_held+0x5d/0x63 afs_fs_fetch_data+0x3e6/0x3ed afs_fetch_data+0xbb/0x14a afs_readpages+0x317/0x40d __do_page_cache_readahead+0x203/0x2ba ? ondemand_readahead+0x3a7/0x3c1 ondemand_readahead+0x3a7/0x3c1 generic_file_buffered_read+0x18b/0x62f __vfs_read+0xdb/0xfe vfs_read+0xb2/0x137 ksys_read+0x50/0x8c do_syscall_64+0x7d/0x1a0 entry_SYSCALL_64_after_hwframe+0x49/0xbe Note the weird value in RDI which is a result of trying to kmap() a NULL page pointer. Signed-off-by: David Howells <dhowells@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2018-06-03 09:17:39 +08:00
),
TP_fast_assign(
__entry->debug_id = call->debug_id;
__entry->cid = call->cid;
__entry->call_id = call->call_id;
__entry->call_serial = call->rx_serial;
__entry->conn_serial = call->conn->hi_serial;
rxrpc: Don't use a ring buffer for call Tx queue Change the way the Tx queueing works to make the following ends easier to achieve: (1) The filling of packets, the encryption of packets and the transmission of packets can be handled in parallel by separate threads, rather than rxrpc_sendmsg() allocating, filling, encrypting and transmitting each packet before moving onto the next one. (2) Get rid of the fixed-size ring which sets a hard limit on the number of packets that can be retained in the ring. This allows the number of packets to increase without having to allocate a very large ring or having variable-sized rings. [Note: the downside of this is that it's then less efficient to locate a packet for retransmission as we then have to step through a list and examine each buffer in the list.] (3) Allow the filler/encrypter to run ahead of the transmission window. (4) Make it easier to do zero copy UDP from the packet buffers. (5) Make it easier to do zero copy from userspace to the packet buffers - and thence to UDP (only if for unauthenticated connections). To that end, the following changes are made: (1) Use the new rxrpc_txbuf struct instead of sk_buff for keeping packets to be transmitted in. This allows them to be placed on multiple queues simultaneously. An sk_buff isn't really necessary as it's never passed on to lower-level networking code. (2) Keep the transmissable packets in a linked list on the call struct rather than in a ring. As a consequence, the annotation buffer isn't used either; rather a flag is set on the packet to indicate ackedness. (3) Use the RXRPC_CALL_TX_LAST flag to indicate that the last packet to be transmitted has been queued. Add RXRPC_CALL_TX_ALL_ACKED to indicate that all packets up to and including the last got hard acked. (4) Wire headers are now stored in the txbuf rather than being concocted on the stack and they're stored immediately before the data, thereby allowing zerocopy of a single span. (5) Don't bother with instant-resend on transmission failure; rather, leave it for a timer or an ACK packet to trigger. Signed-off-by: David Howells <dhowells@redhat.com> cc: Marc Dionne <marc.dionne@auristor.com> cc: linux-afs@lists.infradead.org
2022-04-01 06:55:08 +08:00
__entry->tx_seq = call->acks_hard_ack;
__entry->rx_seq = call->rx_highest_seq;
rxrpc: Fix handling of call quietly cancelled out on server Sometimes an in-progress call will stop responding on the fileserver when the fileserver quietly cancels the call with an internally marked abort (RX_CALL_DEAD), without sending an ABORT to the client. This causes the client's call to eventually expire from lack of incoming packets directed its way, which currently leads to it being cancelled locally with ETIME. Note that it's not currently clear as to why this happens as it's really hard to reproduce. The rotation policy implement by kAFS, however, doesn't differentiate between ETIME meaning we didn't get any response from the server and ETIME meaning the call got cancelled mid-flow. The latter leads to an oops when fetching data as the rotation partially resets the afs_read descriptor, which can result in a cleared page pointer being dereferenced because that page has already been filled. Handle this by the following means: (1) Set a flag on a call when we receive a packet for it. (2) Store the highest packet serial number so far received for a call (bearing in mind this may wrap). (3) If, when the "not received anything recently" timeout expires on a call, we've received at least one packet for a call and the connection as a whole has received packets more recently than that call, then cancel the call locally with ECONNRESET rather than ETIME. This indicates that the call was definitely in progress on the server. (4) In kAFS, if the rotation algorithm sees ECONNRESET rather than ETIME, don't try the next server, but rather abort the call. This avoids the oops as we don't try to reuse the afs_read struct. Rather, as-yet ungotten pages will be reread at a later data. Also: (5) Add an rxrpc tracepoint to log detection of the call being reset. Without this, I occasionally see an oops like the following: general protection fault: 0000 [#1] SMP PTI ... RIP: 0010:_copy_to_iter+0x204/0x310 RSP: 0018:ffff8800cae0f828 EFLAGS: 00010206 RAX: 0000000000000560 RBX: 0000000000000560 RCX: 0000000000000560 RDX: ffff8800cae0f968 RSI: ffff8800d58b3312 RDI: 0005080000000000 RBP: ffff8800cae0f968 R08: 0000000000000560 R09: ffff8800ca00f400 R10: ffff8800c36f28d4 R11: 00000000000008c4 R12: ffff8800cae0f958 R13: 0000000000000560 R14: ffff8800d58b3312 R15: 0000000000000560 FS: 00007fdaef108080(0000) GS:ffff8800ca680000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fb28a8fa000 CR3: 00000000d2a76002 CR4: 00000000001606e0 Call Trace: skb_copy_datagram_iter+0x14e/0x289 rxrpc_recvmsg_data.isra.0+0x6f3/0xf68 ? trace_buffer_unlock_commit_regs+0x4f/0x89 rxrpc_kernel_recv_data+0x149/0x421 afs_extract_data+0x1e0/0x798 ? afs_wait_for_call_to_complete+0xc9/0x52e afs_deliver_fs_fetch_data+0x33a/0x5ab afs_deliver_to_call+0x1ee/0x5e0 ? afs_wait_for_call_to_complete+0xc9/0x52e afs_wait_for_call_to_complete+0x12b/0x52e ? wake_up_q+0x54/0x54 afs_make_call+0x287/0x462 ? afs_fs_fetch_data+0x3e6/0x3ed ? rcu_read_lock_sched_held+0x5d/0x63 afs_fs_fetch_data+0x3e6/0x3ed afs_fetch_data+0xbb/0x14a afs_readpages+0x317/0x40d __do_page_cache_readahead+0x203/0x2ba ? ondemand_readahead+0x3a7/0x3c1 ondemand_readahead+0x3a7/0x3c1 generic_file_buffered_read+0x18b/0x62f __vfs_read+0xdb/0xfe vfs_read+0xb2/0x137 ksys_read+0x50/0x8c do_syscall_64+0x7d/0x1a0 entry_SYSCALL_64_after_hwframe+0x49/0xbe Note the weird value in RDI which is a result of trying to kmap() a NULL page pointer. Signed-off-by: David Howells <dhowells@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2018-06-03 09:17:39 +08:00
),
TP_printk("c=%08x %08x:%08x r=%08x/%08x tx=%08x rx=%08x",
__entry->debug_id,
__entry->cid, __entry->call_id,
__entry->call_serial, __entry->conn_serial,
__entry->tx_seq, __entry->rx_seq)
);
TRACE_EVENT(rxrpc_notify_socket,
TP_PROTO(unsigned int debug_id, rxrpc_serial_t serial),
TP_ARGS(debug_id, serial),
TP_STRUCT__entry(
__field(unsigned int, debug_id)
__field(rxrpc_serial_t, serial)
),
TP_fast_assign(
__entry->debug_id = debug_id;
__entry->serial = serial;
),
TP_printk("c=%08x r=%08x",
__entry->debug_id,
__entry->serial)
);
TRACE_EVENT(rxrpc_rx_discard_ack,
TP_PROTO(unsigned int debug_id, rxrpc_serial_t serial,
rxrpc_seq_t first_soft_ack, rxrpc_seq_t call_ackr_first,
rxrpc_seq_t prev_pkt, rxrpc_seq_t call_ackr_prev),
TP_ARGS(debug_id, serial, first_soft_ack, call_ackr_first,
prev_pkt, call_ackr_prev),
TP_STRUCT__entry(
__field(unsigned int, debug_id)
__field(rxrpc_serial_t, serial)
__field(rxrpc_seq_t, first_soft_ack)
__field(rxrpc_seq_t, call_ackr_first)
__field(rxrpc_seq_t, prev_pkt)
__field(rxrpc_seq_t, call_ackr_prev)
),
TP_fast_assign(
__entry->debug_id = debug_id;
__entry->serial = serial;
__entry->first_soft_ack = first_soft_ack;
__entry->call_ackr_first = call_ackr_first;
__entry->prev_pkt = prev_pkt;
__entry->call_ackr_prev = call_ackr_prev;
),
TP_printk("c=%08x r=%08x %08x<%08x %08x<%08x",
__entry->debug_id,
__entry->serial,
__entry->first_soft_ack,
__entry->call_ackr_first,
__entry->prev_pkt,
__entry->call_ackr_prev)
);
TRACE_EVENT(rxrpc_req_ack,
TP_PROTO(unsigned int call_debug_id, rxrpc_seq_t seq,
enum rxrpc_req_ack_trace why),
TP_ARGS(call_debug_id, seq, why),
TP_STRUCT__entry(
__field(unsigned int, call_debug_id)
__field(rxrpc_seq_t, seq)
__field(enum rxrpc_req_ack_trace, why)
),
TP_fast_assign(
__entry->call_debug_id = call_debug_id;
__entry->seq = seq;
__entry->why = why;
),
TP_printk("c=%08x q=%08x REQ-%s",
__entry->call_debug_id,
__entry->seq,
__print_symbolic(__entry->why, rxrpc_req_ack_traces))
);
TRACE_EVENT(rxrpc_txbuf,
TP_PROTO(unsigned int debug_id,
unsigned int call_debug_id, rxrpc_seq_t seq,
int ref, enum rxrpc_txbuf_trace what),
TP_ARGS(debug_id, call_debug_id, seq, ref, what),
TP_STRUCT__entry(
__field(unsigned int, debug_id)
__field(unsigned int, call_debug_id)
__field(rxrpc_seq_t, seq)
__field(int, ref)
__field(enum rxrpc_txbuf_trace, what)
),
TP_fast_assign(
__entry->debug_id = debug_id;
__entry->call_debug_id = call_debug_id;
__entry->seq = seq;
__entry->ref = ref;
__entry->what = what;
),
TP_printk("B=%08x c=%08x q=%08x %s r=%d",
__entry->debug_id,
__entry->call_debug_id,
__entry->seq,
__print_symbolic(__entry->what, rxrpc_txbuf_traces),
__entry->ref)
);
TRACE_EVENT(rxrpc_poke_call,
TP_PROTO(struct rxrpc_call *call, bool busy,
enum rxrpc_call_poke_trace what),
TP_ARGS(call, busy, what),
TP_STRUCT__entry(
__field(unsigned int, call_debug_id)
__field(bool, busy)
__field(enum rxrpc_call_poke_trace, what)
),
TP_fast_assign(
__entry->call_debug_id = call->debug_id;
__entry->busy = busy;
__entry->what = what;
),
TP_printk("c=%08x %s%s",
__entry->call_debug_id,
__print_symbolic(__entry->what, rxrpc_call_poke_traces),
__entry->busy ? "!" : "")
);
TRACE_EVENT(rxrpc_call_poked,
TP_PROTO(struct rxrpc_call *call),
TP_ARGS(call),
TP_STRUCT__entry(
__field(unsigned int, call_debug_id)
),
TP_fast_assign(
__entry->call_debug_id = call->debug_id;
),
TP_printk("c=%08x",
__entry->call_debug_id)
);
TRACE_EVENT(rxrpc_sack,
TP_PROTO(struct rxrpc_call *call, rxrpc_seq_t seq,
unsigned int sack, enum rxrpc_sack_trace what),
TP_ARGS(call, seq, sack, what),
TP_STRUCT__entry(
__field(unsigned int, call_debug_id)
__field(rxrpc_seq_t, seq)
__field(unsigned int, sack)
__field(enum rxrpc_sack_trace, what)
),
TP_fast_assign(
__entry->call_debug_id = call->debug_id;
__entry->seq = seq;
__entry->sack = sack;
__entry->what = what;
),
TP_printk("c=%08x q=%08x %s k=%x",
__entry->call_debug_id,
__entry->seq,
__print_symbolic(__entry->what, rxrpc_sack_traces),
__entry->sack)
);
#undef EM
#undef E_
#endif /* RXRPC_TRACE_ONLY_DEFINE_ENUMS */
#endif /* _TRACE_RXRPC_H */
/* This part must be outside protection */
#include <trace/define_trace.h>