2019-05-21 01:08:01 +08:00
|
|
|
// SPDX-License-Identifier: GPL-2.0-or-later
|
2018-10-20 07:57:59 +08:00
|
|
|
/* AFS fileserver probing
|
|
|
|
*
|
afs: Actively poll fileservers to maintain NAT or firewall openings
When an AFS client accesses a file, it receives a limited-duration callback
promise that the server will notify it if another client changes a file.
This callback duration can be a few hours in length.
If a client mounts a volume and then an application prevents it from being
unmounted, say by chdir'ing into it, but then does nothing for some time,
the rxrpc_peer record will expire and rxrpc-level keepalive will cease.
If there is NAT or a firewall between the client and the server, the route
back for the server may close after a comparatively short duration, meaning
that attempts by the server to notify the client may then bounce.
The client, however, may (so far as it knows) still have a valid unexpired
promise and will then rely on its cached data and will not see changes made
on the server by a third party until it incidentally rechecks the status or
the promise needs renewal.
To deal with this, the client needs to regularly probe the server. This
has two effects: firstly, it keeps a route open back for the server, and
secondly, it causes the server to disgorge any notifications that got
queued up because they couldn't be sent.
Fix this by adding a mechanism to emit regular probes.
Two levels of probing are made available: Under normal circumstances the
'slow' queue will be used for a fileserver - this just probes the preferred
address once every 5 mins or so; however, if server fails to respond to any
probes, the server will shift to the 'fast' queue from which all its
interfaces will be probed every 30s. When it finally responds, the record
will switch back to the slow queue.
Further notes:
(1) Probing is now no longer driven from the fileserver rotation
algorithm.
(2) Probes are dispatched to all interfaces on a fileserver when that an
afs_server object is set up to record it.
(3) The afs_server object is removed from the probe queues when we start
to probe it. afs_is_probing_server() returns true if it's not listed
- ie. it's undergoing probing.
(4) The afs_server object is added back on to the probe queue when the
final outstanding probe completes, but the probed_at time is set when
we're about to launch a probe so that it's not dependent on the probe
duration.
(5) The timer and the work item added for this must be handed a count on
net->servers_outstanding, which they hand on or release. This makes
sure that network namespace cleanup waits for them.
Fixes: d2ddc776a458 ("afs: Overhaul volume and server record caching and fileserver rotation")
Reported-by: Dave Botsch <botsch@cnf.cornell.edu>
Signed-off-by: David Howells <dhowells@redhat.com>
2020-04-24 22:10:00 +08:00
|
|
|
* Copyright (C) 2018, 2020 Red Hat, Inc. All Rights Reserved.
|
2018-10-20 07:57:59 +08:00
|
|
|
* Written by David Howells (dhowells@redhat.com)
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include <linux/sched.h>
|
|
|
|
#include <linux/slab.h>
|
|
|
|
#include "afs_fs.h"
|
|
|
|
#include "internal.h"
|
afs: Fix corruption in reads at fpos 2G-4G from an OpenAFS server
AFS-3 has two data fetch RPC variants, FS.FetchData and FS.FetchData64, and
Linux's afs client switches between them when talking to a non-YFS server
if the read size, the file position or the sum of the two have the upper 32
bits set of the 64-bit value.
This is a problem, however, since the file position and length fields of
FS.FetchData are *signed* 32-bit values.
Fix this by capturing the capability bits obtained from the fileserver when
it's sent an FS.GetCapabilities RPC, rather than just discarding them, and
then picking out the VICED_CAPABILITY_64BITFILES flag. This can then be
used to decide whether to use FS.FetchData or FS.FetchData64 - and also
FS.StoreData or FS.StoreData64 - rather than using upper_32_bits() to
switch on the parameter values.
This capabilities flag could also be used to limit the maximum size of the
file, but all servers must be checked for that.
Note that the issue does not exist with FS.StoreData - that uses *unsigned*
32-bit values. It's also not a problem with Auristor servers as its
YFS.FetchData64 op uses unsigned 64-bit values.
This can be tested by cloning a git repo through an OpenAFS client to an
OpenAFS server and then doing "git status" on it from a Linux afs
client[1]. Provided the clone has a pack file that's in the 2G-4G range,
the git status will show errors like:
error: packfile .git/objects/pack/pack-5e813c51d12b6847bbc0fcd97c2bca66da50079c.pack does not match index
error: packfile .git/objects/pack/pack-5e813c51d12b6847bbc0fcd97c2bca66da50079c.pack does not match index
This can be observed in the server's FileLog with something like the
following appearing:
Sun Aug 29 19:31:39 2021 SRXAFS_FetchData, Fid = 2303380852.491776.3263114, Host 192.168.11.201:7001, Id 1001
Sun Aug 29 19:31:39 2021 CheckRights: len=0, for host=192.168.11.201:7001
Sun Aug 29 19:31:39 2021 FetchData_RXStyle: Pos 18446744071815340032, Len 3154
Sun Aug 29 19:31:39 2021 FetchData_RXStyle: file size 2400758866
...
Sun Aug 29 19:31:40 2021 SRXAFS_FetchData returns 5
Note the file position of 18446744071815340032. This is the requested file
position sign-extended.
Fixes: b9b1f8d5930a ("AFS: write support fixes")
Reported-by: Markus Suvanto <markus.suvanto@gmail.com>
Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Marc Dionne <marc.dionne@auristor.com>
Tested-by: Markus Suvanto <markus.suvanto@gmail.com>
cc: linux-afs@lists.infradead.org
cc: openafs-devel@openafs.org
Link: https://bugzilla.kernel.org/show_bug.cgi?id=214217#c9 [1]
Link: https://lore.kernel.org/r/951332.1631308745@warthog.procyon.org.uk/
2021-09-10 07:01:52 +08:00
|
|
|
#include "protocol_afs.h"
|
2018-10-20 07:57:59 +08:00
|
|
|
#include "protocol_yfs.h"
|
|
|
|
|
afs: Actively poll fileservers to maintain NAT or firewall openings
When an AFS client accesses a file, it receives a limited-duration callback
promise that the server will notify it if another client changes a file.
This callback duration can be a few hours in length.
If a client mounts a volume and then an application prevents it from being
unmounted, say by chdir'ing into it, but then does nothing for some time,
the rxrpc_peer record will expire and rxrpc-level keepalive will cease.
If there is NAT or a firewall between the client and the server, the route
back for the server may close after a comparatively short duration, meaning
that attempts by the server to notify the client may then bounce.
The client, however, may (so far as it knows) still have a valid unexpired
promise and will then rely on its cached data and will not see changes made
on the server by a third party until it incidentally rechecks the status or
the promise needs renewal.
To deal with this, the client needs to regularly probe the server. This
has two effects: firstly, it keeps a route open back for the server, and
secondly, it causes the server to disgorge any notifications that got
queued up because they couldn't be sent.
Fix this by adding a mechanism to emit regular probes.
Two levels of probing are made available: Under normal circumstances the
'slow' queue will be used for a fileserver - this just probes the preferred
address once every 5 mins or so; however, if server fails to respond to any
probes, the server will shift to the 'fast' queue from which all its
interfaces will be probed every 30s. When it finally responds, the record
will switch back to the slow queue.
Further notes:
(1) Probing is now no longer driven from the fileserver rotation
algorithm.
(2) Probes are dispatched to all interfaces on a fileserver when that an
afs_server object is set up to record it.
(3) The afs_server object is removed from the probe queues when we start
to probe it. afs_is_probing_server() returns true if it's not listed
- ie. it's undergoing probing.
(4) The afs_server object is added back on to the probe queue when the
final outstanding probe completes, but the probed_at time is set when
we're about to launch a probe so that it's not dependent on the probe
duration.
(5) The timer and the work item added for this must be handed a count on
net->servers_outstanding, which they hand on or release. This makes
sure that network namespace cleanup waits for them.
Fixes: d2ddc776a458 ("afs: Overhaul volume and server record caching and fileserver rotation")
Reported-by: Dave Botsch <botsch@cnf.cornell.edu>
Signed-off-by: David Howells <dhowells@redhat.com>
2020-04-24 22:10:00 +08:00
|
|
|
static unsigned int afs_fs_probe_fast_poll_interval = 30 * HZ;
|
|
|
|
static unsigned int afs_fs_probe_slow_poll_interval = 5 * 60 * HZ;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Start the probe polling timer. We have to supply it with an inc on the
|
|
|
|
* outstanding server count.
|
|
|
|
*/
|
|
|
|
static void afs_schedule_fs_probe(struct afs_net *net,
|
|
|
|
struct afs_server *server, bool fast)
|
|
|
|
{
|
|
|
|
unsigned long atj;
|
|
|
|
|
|
|
|
if (!net->live)
|
|
|
|
return;
|
|
|
|
|
|
|
|
atj = server->probed_at;
|
|
|
|
atj += fast ? afs_fs_probe_fast_poll_interval : afs_fs_probe_slow_poll_interval;
|
|
|
|
|
|
|
|
afs_inc_servers_outstanding(net);
|
|
|
|
if (timer_reduce(&net->fs_probe_timer, atj))
|
|
|
|
afs_dec_servers_outstanding(net);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Handle the completion of a set of probes.
|
|
|
|
*/
|
|
|
|
static void afs_finished_fs_probe(struct afs_net *net, struct afs_server *server)
|
|
|
|
{
|
|
|
|
bool responded = server->probe.responded;
|
|
|
|
|
|
|
|
write_seqlock(&net->fs_lock);
|
2020-05-02 20:39:57 +08:00
|
|
|
if (responded) {
|
afs: Actively poll fileservers to maintain NAT or firewall openings
When an AFS client accesses a file, it receives a limited-duration callback
promise that the server will notify it if another client changes a file.
This callback duration can be a few hours in length.
If a client mounts a volume and then an application prevents it from being
unmounted, say by chdir'ing into it, but then does nothing for some time,
the rxrpc_peer record will expire and rxrpc-level keepalive will cease.
If there is NAT or a firewall between the client and the server, the route
back for the server may close after a comparatively short duration, meaning
that attempts by the server to notify the client may then bounce.
The client, however, may (so far as it knows) still have a valid unexpired
promise and will then rely on its cached data and will not see changes made
on the server by a third party until it incidentally rechecks the status or
the promise needs renewal.
To deal with this, the client needs to regularly probe the server. This
has two effects: firstly, it keeps a route open back for the server, and
secondly, it causes the server to disgorge any notifications that got
queued up because they couldn't be sent.
Fix this by adding a mechanism to emit regular probes.
Two levels of probing are made available: Under normal circumstances the
'slow' queue will be used for a fileserver - this just probes the preferred
address once every 5 mins or so; however, if server fails to respond to any
probes, the server will shift to the 'fast' queue from which all its
interfaces will be probed every 30s. When it finally responds, the record
will switch back to the slow queue.
Further notes:
(1) Probing is now no longer driven from the fileserver rotation
algorithm.
(2) Probes are dispatched to all interfaces on a fileserver when that an
afs_server object is set up to record it.
(3) The afs_server object is removed from the probe queues when we start
to probe it. afs_is_probing_server() returns true if it's not listed
- ie. it's undergoing probing.
(4) The afs_server object is added back on to the probe queue when the
final outstanding probe completes, but the probed_at time is set when
we're about to launch a probe so that it's not dependent on the probe
duration.
(5) The timer and the work item added for this must be handed a count on
net->servers_outstanding, which they hand on or release. This makes
sure that network namespace cleanup waits for them.
Fixes: d2ddc776a458 ("afs: Overhaul volume and server record caching and fileserver rotation")
Reported-by: Dave Botsch <botsch@cnf.cornell.edu>
Signed-off-by: David Howells <dhowells@redhat.com>
2020-04-24 22:10:00 +08:00
|
|
|
list_add_tail(&server->probe_link, &net->fs_probe_slow);
|
2020-05-02 20:39:57 +08:00
|
|
|
} else {
|
|
|
|
server->rtt = UINT_MAX;
|
|
|
|
clear_bit(AFS_SERVER_FL_RESPONDING, &server->flags);
|
afs: Actively poll fileservers to maintain NAT or firewall openings
When an AFS client accesses a file, it receives a limited-duration callback
promise that the server will notify it if another client changes a file.
This callback duration can be a few hours in length.
If a client mounts a volume and then an application prevents it from being
unmounted, say by chdir'ing into it, but then does nothing for some time,
the rxrpc_peer record will expire and rxrpc-level keepalive will cease.
If there is NAT or a firewall between the client and the server, the route
back for the server may close after a comparatively short duration, meaning
that attempts by the server to notify the client may then bounce.
The client, however, may (so far as it knows) still have a valid unexpired
promise and will then rely on its cached data and will not see changes made
on the server by a third party until it incidentally rechecks the status or
the promise needs renewal.
To deal with this, the client needs to regularly probe the server. This
has two effects: firstly, it keeps a route open back for the server, and
secondly, it causes the server to disgorge any notifications that got
queued up because they couldn't be sent.
Fix this by adding a mechanism to emit regular probes.
Two levels of probing are made available: Under normal circumstances the
'slow' queue will be used for a fileserver - this just probes the preferred
address once every 5 mins or so; however, if server fails to respond to any
probes, the server will shift to the 'fast' queue from which all its
interfaces will be probed every 30s. When it finally responds, the record
will switch back to the slow queue.
Further notes:
(1) Probing is now no longer driven from the fileserver rotation
algorithm.
(2) Probes are dispatched to all interfaces on a fileserver when that an
afs_server object is set up to record it.
(3) The afs_server object is removed from the probe queues when we start
to probe it. afs_is_probing_server() returns true if it's not listed
- ie. it's undergoing probing.
(4) The afs_server object is added back on to the probe queue when the
final outstanding probe completes, but the probed_at time is set when
we're about to launch a probe so that it's not dependent on the probe
duration.
(5) The timer and the work item added for this must be handed a count on
net->servers_outstanding, which they hand on or release. This makes
sure that network namespace cleanup waits for them.
Fixes: d2ddc776a458 ("afs: Overhaul volume and server record caching and fileserver rotation")
Reported-by: Dave Botsch <botsch@cnf.cornell.edu>
Signed-off-by: David Howells <dhowells@redhat.com>
2020-04-24 22:10:00 +08:00
|
|
|
list_add_tail(&server->probe_link, &net->fs_probe_fast);
|
2020-05-02 20:39:57 +08:00
|
|
|
}
|
afs: Actively poll fileservers to maintain NAT or firewall openings
When an AFS client accesses a file, it receives a limited-duration callback
promise that the server will notify it if another client changes a file.
This callback duration can be a few hours in length.
If a client mounts a volume and then an application prevents it from being
unmounted, say by chdir'ing into it, but then does nothing for some time,
the rxrpc_peer record will expire and rxrpc-level keepalive will cease.
If there is NAT or a firewall between the client and the server, the route
back for the server may close after a comparatively short duration, meaning
that attempts by the server to notify the client may then bounce.
The client, however, may (so far as it knows) still have a valid unexpired
promise and will then rely on its cached data and will not see changes made
on the server by a third party until it incidentally rechecks the status or
the promise needs renewal.
To deal with this, the client needs to regularly probe the server. This
has two effects: firstly, it keeps a route open back for the server, and
secondly, it causes the server to disgorge any notifications that got
queued up because they couldn't be sent.
Fix this by adding a mechanism to emit regular probes.
Two levels of probing are made available: Under normal circumstances the
'slow' queue will be used for a fileserver - this just probes the preferred
address once every 5 mins or so; however, if server fails to respond to any
probes, the server will shift to the 'fast' queue from which all its
interfaces will be probed every 30s. When it finally responds, the record
will switch back to the slow queue.
Further notes:
(1) Probing is now no longer driven from the fileserver rotation
algorithm.
(2) Probes are dispatched to all interfaces on a fileserver when that an
afs_server object is set up to record it.
(3) The afs_server object is removed from the probe queues when we start
to probe it. afs_is_probing_server() returns true if it's not listed
- ie. it's undergoing probing.
(4) The afs_server object is added back on to the probe queue when the
final outstanding probe completes, but the probed_at time is set when
we're about to launch a probe so that it's not dependent on the probe
duration.
(5) The timer and the work item added for this must be handed a count on
net->servers_outstanding, which they hand on or release. This makes
sure that network namespace cleanup waits for them.
Fixes: d2ddc776a458 ("afs: Overhaul volume and server record caching and fileserver rotation")
Reported-by: Dave Botsch <botsch@cnf.cornell.edu>
Signed-off-by: David Howells <dhowells@redhat.com>
2020-04-24 22:10:00 +08:00
|
|
|
write_sequnlock(&net->fs_lock);
|
|
|
|
|
|
|
|
afs_schedule_fs_probe(net, server, !responded);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Handle the completion of a probe.
|
|
|
|
*/
|
|
|
|
static void afs_done_one_fs_probe(struct afs_net *net, struct afs_server *server)
|
|
|
|
{
|
|
|
|
_enter("");
|
|
|
|
|
|
|
|
if (atomic_dec_and_test(&server->probe_outstanding))
|
|
|
|
afs_finished_fs_probe(net, server);
|
|
|
|
|
|
|
|
wake_up_all(&server->probe_wq);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Handle inability to send a probe due to ENOMEM when trying to allocate a
|
|
|
|
* call struct.
|
|
|
|
*/
|
|
|
|
static void afs_fs_probe_not_done(struct afs_net *net,
|
|
|
|
struct afs_server *server,
|
|
|
|
struct afs_addr_cursor *ac)
|
2018-10-20 07:57:59 +08:00
|
|
|
{
|
afs: Actively poll fileservers to maintain NAT or firewall openings
When an AFS client accesses a file, it receives a limited-duration callback
promise that the server will notify it if another client changes a file.
This callback duration can be a few hours in length.
If a client mounts a volume and then an application prevents it from being
unmounted, say by chdir'ing into it, but then does nothing for some time,
the rxrpc_peer record will expire and rxrpc-level keepalive will cease.
If there is NAT or a firewall between the client and the server, the route
back for the server may close after a comparatively short duration, meaning
that attempts by the server to notify the client may then bounce.
The client, however, may (so far as it knows) still have a valid unexpired
promise and will then rely on its cached data and will not see changes made
on the server by a third party until it incidentally rechecks the status or
the promise needs renewal.
To deal with this, the client needs to regularly probe the server. This
has two effects: firstly, it keeps a route open back for the server, and
secondly, it causes the server to disgorge any notifications that got
queued up because they couldn't be sent.
Fix this by adding a mechanism to emit regular probes.
Two levels of probing are made available: Under normal circumstances the
'slow' queue will be used for a fileserver - this just probes the preferred
address once every 5 mins or so; however, if server fails to respond to any
probes, the server will shift to the 'fast' queue from which all its
interfaces will be probed every 30s. When it finally responds, the record
will switch back to the slow queue.
Further notes:
(1) Probing is now no longer driven from the fileserver rotation
algorithm.
(2) Probes are dispatched to all interfaces on a fileserver when that an
afs_server object is set up to record it.
(3) The afs_server object is removed from the probe queues when we start
to probe it. afs_is_probing_server() returns true if it's not listed
- ie. it's undergoing probing.
(4) The afs_server object is added back on to the probe queue when the
final outstanding probe completes, but the probed_at time is set when
we're about to launch a probe so that it's not dependent on the probe
duration.
(5) The timer and the work item added for this must be handed a count on
net->servers_outstanding, which they hand on or release. This makes
sure that network namespace cleanup waits for them.
Fixes: d2ddc776a458 ("afs: Overhaul volume and server record caching and fileserver rotation")
Reported-by: Dave Botsch <botsch@cnf.cornell.edu>
Signed-off-by: David Howells <dhowells@redhat.com>
2020-04-24 22:10:00 +08:00
|
|
|
struct afs_addr_list *alist = ac->alist;
|
|
|
|
unsigned int index = ac->index;
|
|
|
|
|
|
|
|
_enter("");
|
2018-10-20 07:57:59 +08:00
|
|
|
|
afs: Actively poll fileservers to maintain NAT or firewall openings
When an AFS client accesses a file, it receives a limited-duration callback
promise that the server will notify it if another client changes a file.
This callback duration can be a few hours in length.
If a client mounts a volume and then an application prevents it from being
unmounted, say by chdir'ing into it, but then does nothing for some time,
the rxrpc_peer record will expire and rxrpc-level keepalive will cease.
If there is NAT or a firewall between the client and the server, the route
back for the server may close after a comparatively short duration, meaning
that attempts by the server to notify the client may then bounce.
The client, however, may (so far as it knows) still have a valid unexpired
promise and will then rely on its cached data and will not see changes made
on the server by a third party until it incidentally rechecks the status or
the promise needs renewal.
To deal with this, the client needs to regularly probe the server. This
has two effects: firstly, it keeps a route open back for the server, and
secondly, it causes the server to disgorge any notifications that got
queued up because they couldn't be sent.
Fix this by adding a mechanism to emit regular probes.
Two levels of probing are made available: Under normal circumstances the
'slow' queue will be used for a fileserver - this just probes the preferred
address once every 5 mins or so; however, if server fails to respond to any
probes, the server will shift to the 'fast' queue from which all its
interfaces will be probed every 30s. When it finally responds, the record
will switch back to the slow queue.
Further notes:
(1) Probing is now no longer driven from the fileserver rotation
algorithm.
(2) Probes are dispatched to all interfaces on a fileserver when that an
afs_server object is set up to record it.
(3) The afs_server object is removed from the probe queues when we start
to probe it. afs_is_probing_server() returns true if it's not listed
- ie. it's undergoing probing.
(4) The afs_server object is added back on to the probe queue when the
final outstanding probe completes, but the probed_at time is set when
we're about to launch a probe so that it's not dependent on the probe
duration.
(5) The timer and the work item added for this must be handed a count on
net->servers_outstanding, which they hand on or release. This makes
sure that network namespace cleanup waits for them.
Fixes: d2ddc776a458 ("afs: Overhaul volume and server record caching and fileserver rotation")
Reported-by: Dave Botsch <botsch@cnf.cornell.edu>
Signed-off-by: David Howells <dhowells@redhat.com>
2020-04-24 22:10:00 +08:00
|
|
|
trace_afs_io_error(0, -ENOMEM, afs_io_error_fs_probe_fail);
|
|
|
|
spin_lock(&server->probe_lock);
|
|
|
|
|
|
|
|
server->probe.local_failure = true;
|
|
|
|
if (server->probe.error == 0)
|
|
|
|
server->probe.error = -ENOMEM;
|
|
|
|
|
|
|
|
set_bit(index, &alist->failed);
|
|
|
|
|
|
|
|
spin_unlock(&server->probe_lock);
|
|
|
|
return afs_done_one_fs_probe(net, server);
|
2018-10-20 07:57:59 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Process the result of probing a fileserver. This is called after successful
|
|
|
|
* or failed delivery of an FS.GetCapabilities operation.
|
|
|
|
*/
|
|
|
|
void afs_fileserver_probe_result(struct afs_call *call)
|
|
|
|
{
|
|
|
|
struct afs_addr_list *alist = call->alist;
|
2019-05-10 05:22:50 +08:00
|
|
|
struct afs_server *server = call->server;
|
2018-10-20 07:57:59 +08:00
|
|
|
unsigned int index = call->addr_ix;
|
afs: Fix corruption in reads at fpos 2G-4G from an OpenAFS server
AFS-3 has two data fetch RPC variants, FS.FetchData and FS.FetchData64, and
Linux's afs client switches between them when talking to a non-YFS server
if the read size, the file position or the sum of the two have the upper 32
bits set of the 64-bit value.
This is a problem, however, since the file position and length fields of
FS.FetchData are *signed* 32-bit values.
Fix this by capturing the capability bits obtained from the fileserver when
it's sent an FS.GetCapabilities RPC, rather than just discarding them, and
then picking out the VICED_CAPABILITY_64BITFILES flag. This can then be
used to decide whether to use FS.FetchData or FS.FetchData64 - and also
FS.StoreData or FS.StoreData64 - rather than using upper_32_bits() to
switch on the parameter values.
This capabilities flag could also be used to limit the maximum size of the
file, but all servers must be checked for that.
Note that the issue does not exist with FS.StoreData - that uses *unsigned*
32-bit values. It's also not a problem with Auristor servers as its
YFS.FetchData64 op uses unsigned 64-bit values.
This can be tested by cloning a git repo through an OpenAFS client to an
OpenAFS server and then doing "git status" on it from a Linux afs
client[1]. Provided the clone has a pack file that's in the 2G-4G range,
the git status will show errors like:
error: packfile .git/objects/pack/pack-5e813c51d12b6847bbc0fcd97c2bca66da50079c.pack does not match index
error: packfile .git/objects/pack/pack-5e813c51d12b6847bbc0fcd97c2bca66da50079c.pack does not match index
This can be observed in the server's FileLog with something like the
following appearing:
Sun Aug 29 19:31:39 2021 SRXAFS_FetchData, Fid = 2303380852.491776.3263114, Host 192.168.11.201:7001, Id 1001
Sun Aug 29 19:31:39 2021 CheckRights: len=0, for host=192.168.11.201:7001
Sun Aug 29 19:31:39 2021 FetchData_RXStyle: Pos 18446744071815340032, Len 3154
Sun Aug 29 19:31:39 2021 FetchData_RXStyle: file size 2400758866
...
Sun Aug 29 19:31:40 2021 SRXAFS_FetchData returns 5
Note the file position of 18446744071815340032. This is the requested file
position sign-extended.
Fixes: b9b1f8d5930a ("AFS: write support fixes")
Reported-by: Markus Suvanto <markus.suvanto@gmail.com>
Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Marc Dionne <marc.dionne@auristor.com>
Tested-by: Markus Suvanto <markus.suvanto@gmail.com>
cc: linux-afs@lists.infradead.org
cc: openafs-devel@openafs.org
Link: https://bugzilla.kernel.org/show_bug.cgi?id=214217#c9 [1]
Link: https://lore.kernel.org/r/951332.1631308745@warthog.procyon.org.uk/
2021-09-10 07:01:52 +08:00
|
|
|
unsigned int rtt_us = 0, cap0;
|
2018-10-20 07:57:59 +08:00
|
|
|
int ret = call->error;
|
|
|
|
|
|
|
|
_enter("%pU,%u", &server->uuid, index);
|
|
|
|
|
|
|
|
spin_lock(&server->probe_lock);
|
|
|
|
|
|
|
|
switch (ret) {
|
|
|
|
case 0:
|
|
|
|
server->probe.error = 0;
|
|
|
|
goto responded;
|
|
|
|
case -ECONNABORTED:
|
|
|
|
if (!server->probe.responded) {
|
|
|
|
server->probe.abort_code = call->abort_code;
|
|
|
|
server->probe.error = ret;
|
|
|
|
}
|
|
|
|
goto responded;
|
|
|
|
case -ENOMEM:
|
|
|
|
case -ENONET:
|
afs: Actively poll fileservers to maintain NAT or firewall openings
When an AFS client accesses a file, it receives a limited-duration callback
promise that the server will notify it if another client changes a file.
This callback duration can be a few hours in length.
If a client mounts a volume and then an application prevents it from being
unmounted, say by chdir'ing into it, but then does nothing for some time,
the rxrpc_peer record will expire and rxrpc-level keepalive will cease.
If there is NAT or a firewall between the client and the server, the route
back for the server may close after a comparatively short duration, meaning
that attempts by the server to notify the client may then bounce.
The client, however, may (so far as it knows) still have a valid unexpired
promise and will then rely on its cached data and will not see changes made
on the server by a third party until it incidentally rechecks the status or
the promise needs renewal.
To deal with this, the client needs to regularly probe the server. This
has two effects: firstly, it keeps a route open back for the server, and
secondly, it causes the server to disgorge any notifications that got
queued up because they couldn't be sent.
Fix this by adding a mechanism to emit regular probes.
Two levels of probing are made available: Under normal circumstances the
'slow' queue will be used for a fileserver - this just probes the preferred
address once every 5 mins or so; however, if server fails to respond to any
probes, the server will shift to the 'fast' queue from which all its
interfaces will be probed every 30s. When it finally responds, the record
will switch back to the slow queue.
Further notes:
(1) Probing is now no longer driven from the fileserver rotation
algorithm.
(2) Probes are dispatched to all interfaces on a fileserver when that an
afs_server object is set up to record it.
(3) The afs_server object is removed from the probe queues when we start
to probe it. afs_is_probing_server() returns true if it's not listed
- ie. it's undergoing probing.
(4) The afs_server object is added back on to the probe queue when the
final outstanding probe completes, but the probed_at time is set when
we're about to launch a probe so that it's not dependent on the probe
duration.
(5) The timer and the work item added for this must be handed a count on
net->servers_outstanding, which they hand on or release. This makes
sure that network namespace cleanup waits for them.
Fixes: d2ddc776a458 ("afs: Overhaul volume and server record caching and fileserver rotation")
Reported-by: Dave Botsch <botsch@cnf.cornell.edu>
Signed-off-by: David Howells <dhowells@redhat.com>
2020-04-24 22:10:00 +08:00
|
|
|
clear_bit(index, &alist->responded);
|
2018-10-20 07:57:59 +08:00
|
|
|
server->probe.local_failure = true;
|
afs: Actively poll fileservers to maintain NAT or firewall openings
When an AFS client accesses a file, it receives a limited-duration callback
promise that the server will notify it if another client changes a file.
This callback duration can be a few hours in length.
If a client mounts a volume and then an application prevents it from being
unmounted, say by chdir'ing into it, but then does nothing for some time,
the rxrpc_peer record will expire and rxrpc-level keepalive will cease.
If there is NAT or a firewall between the client and the server, the route
back for the server may close after a comparatively short duration, meaning
that attempts by the server to notify the client may then bounce.
The client, however, may (so far as it knows) still have a valid unexpired
promise and will then rely on its cached data and will not see changes made
on the server by a third party until it incidentally rechecks the status or
the promise needs renewal.
To deal with this, the client needs to regularly probe the server. This
has two effects: firstly, it keeps a route open back for the server, and
secondly, it causes the server to disgorge any notifications that got
queued up because they couldn't be sent.
Fix this by adding a mechanism to emit regular probes.
Two levels of probing are made available: Under normal circumstances the
'slow' queue will be used for a fileserver - this just probes the preferred
address once every 5 mins or so; however, if server fails to respond to any
probes, the server will shift to the 'fast' queue from which all its
interfaces will be probed every 30s. When it finally responds, the record
will switch back to the slow queue.
Further notes:
(1) Probing is now no longer driven from the fileserver rotation
algorithm.
(2) Probes are dispatched to all interfaces on a fileserver when that an
afs_server object is set up to record it.
(3) The afs_server object is removed from the probe queues when we start
to probe it. afs_is_probing_server() returns true if it's not listed
- ie. it's undergoing probing.
(4) The afs_server object is added back on to the probe queue when the
final outstanding probe completes, but the probed_at time is set when
we're about to launch a probe so that it's not dependent on the probe
duration.
(5) The timer and the work item added for this must be handed a count on
net->servers_outstanding, which they hand on or release. This makes
sure that network namespace cleanup waits for them.
Fixes: d2ddc776a458 ("afs: Overhaul volume and server record caching and fileserver rotation")
Reported-by: Dave Botsch <botsch@cnf.cornell.edu>
Signed-off-by: David Howells <dhowells@redhat.com>
2020-04-24 22:10:00 +08:00
|
|
|
trace_afs_io_error(call->debug_id, ret, afs_io_error_fs_probe_fail);
|
2018-10-20 07:57:59 +08:00
|
|
|
goto out;
|
|
|
|
case -ECONNRESET: /* Responded, but call expired. */
|
2018-11-14 07:20:28 +08:00
|
|
|
case -ERFKILL:
|
|
|
|
case -EADDRNOTAVAIL:
|
2018-10-20 07:57:59 +08:00
|
|
|
case -ENETUNREACH:
|
|
|
|
case -EHOSTUNREACH:
|
2018-11-14 07:20:28 +08:00
|
|
|
case -EHOSTDOWN:
|
2018-10-20 07:57:59 +08:00
|
|
|
case -ECONNREFUSED:
|
|
|
|
case -ETIMEDOUT:
|
|
|
|
case -ETIME:
|
|
|
|
default:
|
|
|
|
clear_bit(index, &alist->responded);
|
|
|
|
set_bit(index, &alist->failed);
|
|
|
|
if (!server->probe.responded &&
|
|
|
|
(server->probe.error == 0 ||
|
|
|
|
server->probe.error == -ETIMEDOUT ||
|
|
|
|
server->probe.error == -ETIME))
|
|
|
|
server->probe.error = ret;
|
afs: Actively poll fileservers to maintain NAT or firewall openings
When an AFS client accesses a file, it receives a limited-duration callback
promise that the server will notify it if another client changes a file.
This callback duration can be a few hours in length.
If a client mounts a volume and then an application prevents it from being
unmounted, say by chdir'ing into it, but then does nothing for some time,
the rxrpc_peer record will expire and rxrpc-level keepalive will cease.
If there is NAT or a firewall between the client and the server, the route
back for the server may close after a comparatively short duration, meaning
that attempts by the server to notify the client may then bounce.
The client, however, may (so far as it knows) still have a valid unexpired
promise and will then rely on its cached data and will not see changes made
on the server by a third party until it incidentally rechecks the status or
the promise needs renewal.
To deal with this, the client needs to regularly probe the server. This
has two effects: firstly, it keeps a route open back for the server, and
secondly, it causes the server to disgorge any notifications that got
queued up because they couldn't be sent.
Fix this by adding a mechanism to emit regular probes.
Two levels of probing are made available: Under normal circumstances the
'slow' queue will be used for a fileserver - this just probes the preferred
address once every 5 mins or so; however, if server fails to respond to any
probes, the server will shift to the 'fast' queue from which all its
interfaces will be probed every 30s. When it finally responds, the record
will switch back to the slow queue.
Further notes:
(1) Probing is now no longer driven from the fileserver rotation
algorithm.
(2) Probes are dispatched to all interfaces on a fileserver when that an
afs_server object is set up to record it.
(3) The afs_server object is removed from the probe queues when we start
to probe it. afs_is_probing_server() returns true if it's not listed
- ie. it's undergoing probing.
(4) The afs_server object is added back on to the probe queue when the
final outstanding probe completes, but the probed_at time is set when
we're about to launch a probe so that it's not dependent on the probe
duration.
(5) The timer and the work item added for this must be handed a count on
net->servers_outstanding, which they hand on or release. This makes
sure that network namespace cleanup waits for them.
Fixes: d2ddc776a458 ("afs: Overhaul volume and server record caching and fileserver rotation")
Reported-by: Dave Botsch <botsch@cnf.cornell.edu>
Signed-off-by: David Howells <dhowells@redhat.com>
2020-04-24 22:10:00 +08:00
|
|
|
trace_afs_io_error(call->debug_id, ret, afs_io_error_fs_probe_fail);
|
2018-10-20 07:57:59 +08:00
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
responded:
|
|
|
|
clear_bit(index, &alist->failed);
|
|
|
|
|
|
|
|
if (call->service_id == YFS_FS_SERVICE) {
|
|
|
|
server->probe.is_yfs = true;
|
|
|
|
set_bit(AFS_SERVER_FL_IS_YFS, &server->flags);
|
|
|
|
alist->addrs[index].srx_service = call->service_id;
|
|
|
|
} else {
|
|
|
|
server->probe.not_yfs = true;
|
|
|
|
if (!server->probe.is_yfs) {
|
|
|
|
clear_bit(AFS_SERVER_FL_IS_YFS, &server->flags);
|
|
|
|
alist->addrs[index].srx_service = call->service_id;
|
|
|
|
}
|
afs: Fix corruption in reads at fpos 2G-4G from an OpenAFS server
AFS-3 has two data fetch RPC variants, FS.FetchData and FS.FetchData64, and
Linux's afs client switches between them when talking to a non-YFS server
if the read size, the file position or the sum of the two have the upper 32
bits set of the 64-bit value.
This is a problem, however, since the file position and length fields of
FS.FetchData are *signed* 32-bit values.
Fix this by capturing the capability bits obtained from the fileserver when
it's sent an FS.GetCapabilities RPC, rather than just discarding them, and
then picking out the VICED_CAPABILITY_64BITFILES flag. This can then be
used to decide whether to use FS.FetchData or FS.FetchData64 - and also
FS.StoreData or FS.StoreData64 - rather than using upper_32_bits() to
switch on the parameter values.
This capabilities flag could also be used to limit the maximum size of the
file, but all servers must be checked for that.
Note that the issue does not exist with FS.StoreData - that uses *unsigned*
32-bit values. It's also not a problem with Auristor servers as its
YFS.FetchData64 op uses unsigned 64-bit values.
This can be tested by cloning a git repo through an OpenAFS client to an
OpenAFS server and then doing "git status" on it from a Linux afs
client[1]. Provided the clone has a pack file that's in the 2G-4G range,
the git status will show errors like:
error: packfile .git/objects/pack/pack-5e813c51d12b6847bbc0fcd97c2bca66da50079c.pack does not match index
error: packfile .git/objects/pack/pack-5e813c51d12b6847bbc0fcd97c2bca66da50079c.pack does not match index
This can be observed in the server's FileLog with something like the
following appearing:
Sun Aug 29 19:31:39 2021 SRXAFS_FetchData, Fid = 2303380852.491776.3263114, Host 192.168.11.201:7001, Id 1001
Sun Aug 29 19:31:39 2021 CheckRights: len=0, for host=192.168.11.201:7001
Sun Aug 29 19:31:39 2021 FetchData_RXStyle: Pos 18446744071815340032, Len 3154
Sun Aug 29 19:31:39 2021 FetchData_RXStyle: file size 2400758866
...
Sun Aug 29 19:31:40 2021 SRXAFS_FetchData returns 5
Note the file position of 18446744071815340032. This is the requested file
position sign-extended.
Fixes: b9b1f8d5930a ("AFS: write support fixes")
Reported-by: Markus Suvanto <markus.suvanto@gmail.com>
Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Marc Dionne <marc.dionne@auristor.com>
Tested-by: Markus Suvanto <markus.suvanto@gmail.com>
cc: linux-afs@lists.infradead.org
cc: openafs-devel@openafs.org
Link: https://bugzilla.kernel.org/show_bug.cgi?id=214217#c9 [1]
Link: https://lore.kernel.org/r/951332.1631308745@warthog.procyon.org.uk/
2021-09-10 07:01:52 +08:00
|
|
|
cap0 = ntohl(call->tmp);
|
|
|
|
if (cap0 & AFS3_VICED_CAPABILITY_64BITFILES)
|
|
|
|
set_bit(AFS_SERVER_FL_HAS_FS64, &server->flags);
|
|
|
|
else
|
|
|
|
clear_bit(AFS_SERVER_FL_HAS_FS64, &server->flags);
|
2018-10-20 07:57:59 +08:00
|
|
|
}
|
|
|
|
|
2020-08-20 22:13:00 +08:00
|
|
|
if (rxrpc_kernel_get_srtt(call->net->socket, call->rxcall, &rtt_us) &&
|
|
|
|
rtt_us < server->probe.rtt) {
|
2020-05-11 21:54:34 +08:00
|
|
|
server->probe.rtt = rtt_us;
|
2020-05-02 20:39:57 +08:00
|
|
|
server->rtt = rtt_us;
|
2018-10-20 07:57:59 +08:00
|
|
|
alist->preferred = index;
|
|
|
|
}
|
|
|
|
|
|
|
|
smp_wmb(); /* Set rtt before responded. */
|
|
|
|
server->probe.responded = true;
|
afs: Actively poll fileservers to maintain NAT or firewall openings
When an AFS client accesses a file, it receives a limited-duration callback
promise that the server will notify it if another client changes a file.
This callback duration can be a few hours in length.
If a client mounts a volume and then an application prevents it from being
unmounted, say by chdir'ing into it, but then does nothing for some time,
the rxrpc_peer record will expire and rxrpc-level keepalive will cease.
If there is NAT or a firewall between the client and the server, the route
back for the server may close after a comparatively short duration, meaning
that attempts by the server to notify the client may then bounce.
The client, however, may (so far as it knows) still have a valid unexpired
promise and will then rely on its cached data and will not see changes made
on the server by a third party until it incidentally rechecks the status or
the promise needs renewal.
To deal with this, the client needs to regularly probe the server. This
has two effects: firstly, it keeps a route open back for the server, and
secondly, it causes the server to disgorge any notifications that got
queued up because they couldn't be sent.
Fix this by adding a mechanism to emit regular probes.
Two levels of probing are made available: Under normal circumstances the
'slow' queue will be used for a fileserver - this just probes the preferred
address once every 5 mins or so; however, if server fails to respond to any
probes, the server will shift to the 'fast' queue from which all its
interfaces will be probed every 30s. When it finally responds, the record
will switch back to the slow queue.
Further notes:
(1) Probing is now no longer driven from the fileserver rotation
algorithm.
(2) Probes are dispatched to all interfaces on a fileserver when that an
afs_server object is set up to record it.
(3) The afs_server object is removed from the probe queues when we start
to probe it. afs_is_probing_server() returns true if it's not listed
- ie. it's undergoing probing.
(4) The afs_server object is added back on to the probe queue when the
final outstanding probe completes, but the probed_at time is set when
we're about to launch a probe so that it's not dependent on the probe
duration.
(5) The timer and the work item added for this must be handed a count on
net->servers_outstanding, which they hand on or release. This makes
sure that network namespace cleanup waits for them.
Fixes: d2ddc776a458 ("afs: Overhaul volume and server record caching and fileserver rotation")
Reported-by: Dave Botsch <botsch@cnf.cornell.edu>
Signed-off-by: David Howells <dhowells@redhat.com>
2020-04-24 22:10:00 +08:00
|
|
|
set_bit(index, &alist->responded);
|
2020-05-02 20:39:57 +08:00
|
|
|
set_bit(AFS_SERVER_FL_RESPONDING, &server->flags);
|
2018-10-20 07:57:59 +08:00
|
|
|
out:
|
|
|
|
spin_unlock(&server->probe_lock);
|
|
|
|
|
afs: Actively poll fileservers to maintain NAT or firewall openings
When an AFS client accesses a file, it receives a limited-duration callback
promise that the server will notify it if another client changes a file.
This callback duration can be a few hours in length.
If a client mounts a volume and then an application prevents it from being
unmounted, say by chdir'ing into it, but then does nothing for some time,
the rxrpc_peer record will expire and rxrpc-level keepalive will cease.
If there is NAT or a firewall between the client and the server, the route
back for the server may close after a comparatively short duration, meaning
that attempts by the server to notify the client may then bounce.
The client, however, may (so far as it knows) still have a valid unexpired
promise and will then rely on its cached data and will not see changes made
on the server by a third party until it incidentally rechecks the status or
the promise needs renewal.
To deal with this, the client needs to regularly probe the server. This
has two effects: firstly, it keeps a route open back for the server, and
secondly, it causes the server to disgorge any notifications that got
queued up because they couldn't be sent.
Fix this by adding a mechanism to emit regular probes.
Two levels of probing are made available: Under normal circumstances the
'slow' queue will be used for a fileserver - this just probes the preferred
address once every 5 mins or so; however, if server fails to respond to any
probes, the server will shift to the 'fast' queue from which all its
interfaces will be probed every 30s. When it finally responds, the record
will switch back to the slow queue.
Further notes:
(1) Probing is now no longer driven from the fileserver rotation
algorithm.
(2) Probes are dispatched to all interfaces on a fileserver when that an
afs_server object is set up to record it.
(3) The afs_server object is removed from the probe queues when we start
to probe it. afs_is_probing_server() returns true if it's not listed
- ie. it's undergoing probing.
(4) The afs_server object is added back on to the probe queue when the
final outstanding probe completes, but the probed_at time is set when
we're about to launch a probe so that it's not dependent on the probe
duration.
(5) The timer and the work item added for this must be handed a count on
net->servers_outstanding, which they hand on or release. This makes
sure that network namespace cleanup waits for them.
Fixes: d2ddc776a458 ("afs: Overhaul volume and server record caching and fileserver rotation")
Reported-by: Dave Botsch <botsch@cnf.cornell.edu>
Signed-off-by: David Howells <dhowells@redhat.com>
2020-04-24 22:10:00 +08:00
|
|
|
_debug("probe %pU [%u] %pISpc rtt=%u ret=%d",
|
|
|
|
&server->uuid, index, &alist->addrs[index].transport,
|
|
|
|
rtt_us, ret);
|
2018-10-20 07:57:59 +08:00
|
|
|
|
afs: Actively poll fileservers to maintain NAT or firewall openings
When an AFS client accesses a file, it receives a limited-duration callback
promise that the server will notify it if another client changes a file.
This callback duration can be a few hours in length.
If a client mounts a volume and then an application prevents it from being
unmounted, say by chdir'ing into it, but then does nothing for some time,
the rxrpc_peer record will expire and rxrpc-level keepalive will cease.
If there is NAT or a firewall between the client and the server, the route
back for the server may close after a comparatively short duration, meaning
that attempts by the server to notify the client may then bounce.
The client, however, may (so far as it knows) still have a valid unexpired
promise and will then rely on its cached data and will not see changes made
on the server by a third party until it incidentally rechecks the status or
the promise needs renewal.
To deal with this, the client needs to regularly probe the server. This
has two effects: firstly, it keeps a route open back for the server, and
secondly, it causes the server to disgorge any notifications that got
queued up because they couldn't be sent.
Fix this by adding a mechanism to emit regular probes.
Two levels of probing are made available: Under normal circumstances the
'slow' queue will be used for a fileserver - this just probes the preferred
address once every 5 mins or so; however, if server fails to respond to any
probes, the server will shift to the 'fast' queue from which all its
interfaces will be probed every 30s. When it finally responds, the record
will switch back to the slow queue.
Further notes:
(1) Probing is now no longer driven from the fileserver rotation
algorithm.
(2) Probes are dispatched to all interfaces on a fileserver when that an
afs_server object is set up to record it.
(3) The afs_server object is removed from the probe queues when we start
to probe it. afs_is_probing_server() returns true if it's not listed
- ie. it's undergoing probing.
(4) The afs_server object is added back on to the probe queue when the
final outstanding probe completes, but the probed_at time is set when
we're about to launch a probe so that it's not dependent on the probe
duration.
(5) The timer and the work item added for this must be handed a count on
net->servers_outstanding, which they hand on or release. This makes
sure that network namespace cleanup waits for them.
Fixes: d2ddc776a458 ("afs: Overhaul volume and server record caching and fileserver rotation")
Reported-by: Dave Botsch <botsch@cnf.cornell.edu>
Signed-off-by: David Howells <dhowells@redhat.com>
2020-04-24 22:10:00 +08:00
|
|
|
return afs_done_one_fs_probe(call->net, server);
|
2018-10-20 07:57:59 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
afs: Actively poll fileservers to maintain NAT or firewall openings
When an AFS client accesses a file, it receives a limited-duration callback
promise that the server will notify it if another client changes a file.
This callback duration can be a few hours in length.
If a client mounts a volume and then an application prevents it from being
unmounted, say by chdir'ing into it, but then does nothing for some time,
the rxrpc_peer record will expire and rxrpc-level keepalive will cease.
If there is NAT or a firewall between the client and the server, the route
back for the server may close after a comparatively short duration, meaning
that attempts by the server to notify the client may then bounce.
The client, however, may (so far as it knows) still have a valid unexpired
promise and will then rely on its cached data and will not see changes made
on the server by a third party until it incidentally rechecks the status or
the promise needs renewal.
To deal with this, the client needs to regularly probe the server. This
has two effects: firstly, it keeps a route open back for the server, and
secondly, it causes the server to disgorge any notifications that got
queued up because they couldn't be sent.
Fix this by adding a mechanism to emit regular probes.
Two levels of probing are made available: Under normal circumstances the
'slow' queue will be used for a fileserver - this just probes the preferred
address once every 5 mins or so; however, if server fails to respond to any
probes, the server will shift to the 'fast' queue from which all its
interfaces will be probed every 30s. When it finally responds, the record
will switch back to the slow queue.
Further notes:
(1) Probing is now no longer driven from the fileserver rotation
algorithm.
(2) Probes are dispatched to all interfaces on a fileserver when that an
afs_server object is set up to record it.
(3) The afs_server object is removed from the probe queues when we start
to probe it. afs_is_probing_server() returns true if it's not listed
- ie. it's undergoing probing.
(4) The afs_server object is added back on to the probe queue when the
final outstanding probe completes, but the probed_at time is set when
we're about to launch a probe so that it's not dependent on the probe
duration.
(5) The timer and the work item added for this must be handed a count on
net->servers_outstanding, which they hand on or release. This makes
sure that network namespace cleanup waits for them.
Fixes: d2ddc776a458 ("afs: Overhaul volume and server record caching and fileserver rotation")
Reported-by: Dave Botsch <botsch@cnf.cornell.edu>
Signed-off-by: David Howells <dhowells@redhat.com>
2020-04-24 22:10:00 +08:00
|
|
|
* Probe one or all of a fileserver's addresses to find out the best route and
|
|
|
|
* to query its capabilities.
|
2018-10-20 07:57:59 +08:00
|
|
|
*/
|
afs: Actively poll fileservers to maintain NAT or firewall openings
When an AFS client accesses a file, it receives a limited-duration callback
promise that the server will notify it if another client changes a file.
This callback duration can be a few hours in length.
If a client mounts a volume and then an application prevents it from being
unmounted, say by chdir'ing into it, but then does nothing for some time,
the rxrpc_peer record will expire and rxrpc-level keepalive will cease.
If there is NAT or a firewall between the client and the server, the route
back for the server may close after a comparatively short duration, meaning
that attempts by the server to notify the client may then bounce.
The client, however, may (so far as it knows) still have a valid unexpired
promise and will then rely on its cached data and will not see changes made
on the server by a third party until it incidentally rechecks the status or
the promise needs renewal.
To deal with this, the client needs to regularly probe the server. This
has two effects: firstly, it keeps a route open back for the server, and
secondly, it causes the server to disgorge any notifications that got
queued up because they couldn't be sent.
Fix this by adding a mechanism to emit regular probes.
Two levels of probing are made available: Under normal circumstances the
'slow' queue will be used for a fileserver - this just probes the preferred
address once every 5 mins or so; however, if server fails to respond to any
probes, the server will shift to the 'fast' queue from which all its
interfaces will be probed every 30s. When it finally responds, the record
will switch back to the slow queue.
Further notes:
(1) Probing is now no longer driven from the fileserver rotation
algorithm.
(2) Probes are dispatched to all interfaces on a fileserver when that an
afs_server object is set up to record it.
(3) The afs_server object is removed from the probe queues when we start
to probe it. afs_is_probing_server() returns true if it's not listed
- ie. it's undergoing probing.
(4) The afs_server object is added back on to the probe queue when the
final outstanding probe completes, but the probed_at time is set when
we're about to launch a probe so that it's not dependent on the probe
duration.
(5) The timer and the work item added for this must be handed a count on
net->servers_outstanding, which they hand on or release. This makes
sure that network namespace cleanup waits for them.
Fixes: d2ddc776a458 ("afs: Overhaul volume and server record caching and fileserver rotation")
Reported-by: Dave Botsch <botsch@cnf.cornell.edu>
Signed-off-by: David Howells <dhowells@redhat.com>
2020-04-24 22:10:00 +08:00
|
|
|
void afs_fs_probe_fileserver(struct afs_net *net, struct afs_server *server,
|
|
|
|
struct key *key, bool all)
|
2018-10-20 07:57:59 +08:00
|
|
|
{
|
|
|
|
struct afs_addr_cursor ac = {
|
|
|
|
.index = 0,
|
|
|
|
};
|
|
|
|
|
|
|
|
_enter("%pU", &server->uuid);
|
|
|
|
|
|
|
|
read_lock(&server->fs_lock);
|
|
|
|
ac.alist = rcu_dereference_protected(server->addresses,
|
|
|
|
lockdep_is_held(&server->fs_lock));
|
2020-03-26 23:24:07 +08:00
|
|
|
afs_get_addrlist(ac.alist);
|
2018-10-20 07:57:59 +08:00
|
|
|
read_unlock(&server->fs_lock);
|
|
|
|
|
afs: Actively poll fileservers to maintain NAT or firewall openings
When an AFS client accesses a file, it receives a limited-duration callback
promise that the server will notify it if another client changes a file.
This callback duration can be a few hours in length.
If a client mounts a volume and then an application prevents it from being
unmounted, say by chdir'ing into it, but then does nothing for some time,
the rxrpc_peer record will expire and rxrpc-level keepalive will cease.
If there is NAT or a firewall between the client and the server, the route
back for the server may close after a comparatively short duration, meaning
that attempts by the server to notify the client may then bounce.
The client, however, may (so far as it knows) still have a valid unexpired
promise and will then rely on its cached data and will not see changes made
on the server by a third party until it incidentally rechecks the status or
the promise needs renewal.
To deal with this, the client needs to regularly probe the server. This
has two effects: firstly, it keeps a route open back for the server, and
secondly, it causes the server to disgorge any notifications that got
queued up because they couldn't be sent.
Fix this by adding a mechanism to emit regular probes.
Two levels of probing are made available: Under normal circumstances the
'slow' queue will be used for a fileserver - this just probes the preferred
address once every 5 mins or so; however, if server fails to respond to any
probes, the server will shift to the 'fast' queue from which all its
interfaces will be probed every 30s. When it finally responds, the record
will switch back to the slow queue.
Further notes:
(1) Probing is now no longer driven from the fileserver rotation
algorithm.
(2) Probes are dispatched to all interfaces on a fileserver when that an
afs_server object is set up to record it.
(3) The afs_server object is removed from the probe queues when we start
to probe it. afs_is_probing_server() returns true if it's not listed
- ie. it's undergoing probing.
(4) The afs_server object is added back on to the probe queue when the
final outstanding probe completes, but the probed_at time is set when
we're about to launch a probe so that it's not dependent on the probe
duration.
(5) The timer and the work item added for this must be handed a count on
net->servers_outstanding, which they hand on or release. This makes
sure that network namespace cleanup waits for them.
Fixes: d2ddc776a458 ("afs: Overhaul volume and server record caching and fileserver rotation")
Reported-by: Dave Botsch <botsch@cnf.cornell.edu>
Signed-off-by: David Howells <dhowells@redhat.com>
2020-04-24 22:10:00 +08:00
|
|
|
server->probed_at = jiffies;
|
|
|
|
atomic_set(&server->probe_outstanding, all ? ac.alist->nr_addrs : 1);
|
2018-10-20 07:57:59 +08:00
|
|
|
memset(&server->probe, 0, sizeof(server->probe));
|
|
|
|
server->probe.rtt = UINT_MAX;
|
|
|
|
|
afs: Actively poll fileservers to maintain NAT or firewall openings
When an AFS client accesses a file, it receives a limited-duration callback
promise that the server will notify it if another client changes a file.
This callback duration can be a few hours in length.
If a client mounts a volume and then an application prevents it from being
unmounted, say by chdir'ing into it, but then does nothing for some time,
the rxrpc_peer record will expire and rxrpc-level keepalive will cease.
If there is NAT or a firewall between the client and the server, the route
back for the server may close after a comparatively short duration, meaning
that attempts by the server to notify the client may then bounce.
The client, however, may (so far as it knows) still have a valid unexpired
promise and will then rely on its cached data and will not see changes made
on the server by a third party until it incidentally rechecks the status or
the promise needs renewal.
To deal with this, the client needs to regularly probe the server. This
has two effects: firstly, it keeps a route open back for the server, and
secondly, it causes the server to disgorge any notifications that got
queued up because they couldn't be sent.
Fix this by adding a mechanism to emit regular probes.
Two levels of probing are made available: Under normal circumstances the
'slow' queue will be used for a fileserver - this just probes the preferred
address once every 5 mins or so; however, if server fails to respond to any
probes, the server will shift to the 'fast' queue from which all its
interfaces will be probed every 30s. When it finally responds, the record
will switch back to the slow queue.
Further notes:
(1) Probing is now no longer driven from the fileserver rotation
algorithm.
(2) Probes are dispatched to all interfaces on a fileserver when that an
afs_server object is set up to record it.
(3) The afs_server object is removed from the probe queues when we start
to probe it. afs_is_probing_server() returns true if it's not listed
- ie. it's undergoing probing.
(4) The afs_server object is added back on to the probe queue when the
final outstanding probe completes, but the probed_at time is set when
we're about to launch a probe so that it's not dependent on the probe
duration.
(5) The timer and the work item added for this must be handed a count on
net->servers_outstanding, which they hand on or release. This makes
sure that network namespace cleanup waits for them.
Fixes: d2ddc776a458 ("afs: Overhaul volume and server record caching and fileserver rotation")
Reported-by: Dave Botsch <botsch@cnf.cornell.edu>
Signed-off-by: David Howells <dhowells@redhat.com>
2020-04-24 22:10:00 +08:00
|
|
|
ac.index = ac.alist->preferred;
|
|
|
|
if (ac.index < 0 || ac.index >= ac.alist->nr_addrs)
|
|
|
|
all = true;
|
2018-10-20 07:57:59 +08:00
|
|
|
|
afs: Actively poll fileservers to maintain NAT or firewall openings
When an AFS client accesses a file, it receives a limited-duration callback
promise that the server will notify it if another client changes a file.
This callback duration can be a few hours in length.
If a client mounts a volume and then an application prevents it from being
unmounted, say by chdir'ing into it, but then does nothing for some time,
the rxrpc_peer record will expire and rxrpc-level keepalive will cease.
If there is NAT or a firewall between the client and the server, the route
back for the server may close after a comparatively short duration, meaning
that attempts by the server to notify the client may then bounce.
The client, however, may (so far as it knows) still have a valid unexpired
promise and will then rely on its cached data and will not see changes made
on the server by a third party until it incidentally rechecks the status or
the promise needs renewal.
To deal with this, the client needs to regularly probe the server. This
has two effects: firstly, it keeps a route open back for the server, and
secondly, it causes the server to disgorge any notifications that got
queued up because they couldn't be sent.
Fix this by adding a mechanism to emit regular probes.
Two levels of probing are made available: Under normal circumstances the
'slow' queue will be used for a fileserver - this just probes the preferred
address once every 5 mins or so; however, if server fails to respond to any
probes, the server will shift to the 'fast' queue from which all its
interfaces will be probed every 30s. When it finally responds, the record
will switch back to the slow queue.
Further notes:
(1) Probing is now no longer driven from the fileserver rotation
algorithm.
(2) Probes are dispatched to all interfaces on a fileserver when that an
afs_server object is set up to record it.
(3) The afs_server object is removed from the probe queues when we start
to probe it. afs_is_probing_server() returns true if it's not listed
- ie. it's undergoing probing.
(4) The afs_server object is added back on to the probe queue when the
final outstanding probe completes, but the probed_at time is set when
we're about to launch a probe so that it's not dependent on the probe
duration.
(5) The timer and the work item added for this must be handed a count on
net->servers_outstanding, which they hand on or release. This makes
sure that network namespace cleanup waits for them.
Fixes: d2ddc776a458 ("afs: Overhaul volume and server record caching and fileserver rotation")
Reported-by: Dave Botsch <botsch@cnf.cornell.edu>
Signed-off-by: David Howells <dhowells@redhat.com>
2020-04-24 22:10:00 +08:00
|
|
|
if (all) {
|
|
|
|
for (ac.index = 0; ac.index < ac.alist->nr_addrs; ac.index++)
|
|
|
|
if (!afs_fs_get_capabilities(net, server, &ac, key))
|
|
|
|
afs_fs_probe_not_done(net, server, &ac);
|
|
|
|
} else {
|
|
|
|
if (!afs_fs_get_capabilities(net, server, &ac, key))
|
|
|
|
afs_fs_probe_not_done(net, server, &ac);
|
2018-10-20 07:57:59 +08:00
|
|
|
}
|
|
|
|
|
afs: Actively poll fileservers to maintain NAT or firewall openings
When an AFS client accesses a file, it receives a limited-duration callback
promise that the server will notify it if another client changes a file.
This callback duration can be a few hours in length.
If a client mounts a volume and then an application prevents it from being
unmounted, say by chdir'ing into it, but then does nothing for some time,
the rxrpc_peer record will expire and rxrpc-level keepalive will cease.
If there is NAT or a firewall between the client and the server, the route
back for the server may close after a comparatively short duration, meaning
that attempts by the server to notify the client may then bounce.
The client, however, may (so far as it knows) still have a valid unexpired
promise and will then rely on its cached data and will not see changes made
on the server by a third party until it incidentally rechecks the status or
the promise needs renewal.
To deal with this, the client needs to regularly probe the server. This
has two effects: firstly, it keeps a route open back for the server, and
secondly, it causes the server to disgorge any notifications that got
queued up because they couldn't be sent.
Fix this by adding a mechanism to emit regular probes.
Two levels of probing are made available: Under normal circumstances the
'slow' queue will be used for a fileserver - this just probes the preferred
address once every 5 mins or so; however, if server fails to respond to any
probes, the server will shift to the 'fast' queue from which all its
interfaces will be probed every 30s. When it finally responds, the record
will switch back to the slow queue.
Further notes:
(1) Probing is now no longer driven from the fileserver rotation
algorithm.
(2) Probes are dispatched to all interfaces on a fileserver when that an
afs_server object is set up to record it.
(3) The afs_server object is removed from the probe queues when we start
to probe it. afs_is_probing_server() returns true if it's not listed
- ie. it's undergoing probing.
(4) The afs_server object is added back on to the probe queue when the
final outstanding probe completes, but the probed_at time is set when
we're about to launch a probe so that it's not dependent on the probe
duration.
(5) The timer and the work item added for this must be handed a count on
net->servers_outstanding, which they hand on or release. This makes
sure that network namespace cleanup waits for them.
Fixes: d2ddc776a458 ("afs: Overhaul volume and server record caching and fileserver rotation")
Reported-by: Dave Botsch <botsch@cnf.cornell.edu>
Signed-off-by: David Howells <dhowells@redhat.com>
2020-04-24 22:10:00 +08:00
|
|
|
afs_put_addrlist(ac.alist);
|
2018-10-20 07:57:59 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Wait for the first as-yet untried fileserver to respond.
|
|
|
|
*/
|
|
|
|
int afs_wait_for_fs_probes(struct afs_server_list *slist, unsigned long untried)
|
|
|
|
{
|
|
|
|
struct wait_queue_entry *waits;
|
|
|
|
struct afs_server *server;
|
2020-05-02 20:39:57 +08:00
|
|
|
unsigned int rtt = UINT_MAX, rtt_s;
|
2018-10-20 07:57:59 +08:00
|
|
|
bool have_responders = false;
|
|
|
|
int pref = -1, i;
|
|
|
|
|
|
|
|
_enter("%u,%lx", slist->nr_servers, untried);
|
|
|
|
|
|
|
|
/* Only wait for servers that have a probe outstanding. */
|
|
|
|
for (i = 0; i < slist->nr_servers; i++) {
|
|
|
|
if (test_bit(i, &untried)) {
|
|
|
|
server = slist->servers[i].server;
|
afs: Actively poll fileservers to maintain NAT or firewall openings
When an AFS client accesses a file, it receives a limited-duration callback
promise that the server will notify it if another client changes a file.
This callback duration can be a few hours in length.
If a client mounts a volume and then an application prevents it from being
unmounted, say by chdir'ing into it, but then does nothing for some time,
the rxrpc_peer record will expire and rxrpc-level keepalive will cease.
If there is NAT or a firewall between the client and the server, the route
back for the server may close after a comparatively short duration, meaning
that attempts by the server to notify the client may then bounce.
The client, however, may (so far as it knows) still have a valid unexpired
promise and will then rely on its cached data and will not see changes made
on the server by a third party until it incidentally rechecks the status or
the promise needs renewal.
To deal with this, the client needs to regularly probe the server. This
has two effects: firstly, it keeps a route open back for the server, and
secondly, it causes the server to disgorge any notifications that got
queued up because they couldn't be sent.
Fix this by adding a mechanism to emit regular probes.
Two levels of probing are made available: Under normal circumstances the
'slow' queue will be used for a fileserver - this just probes the preferred
address once every 5 mins or so; however, if server fails to respond to any
probes, the server will shift to the 'fast' queue from which all its
interfaces will be probed every 30s. When it finally responds, the record
will switch back to the slow queue.
Further notes:
(1) Probing is now no longer driven from the fileserver rotation
algorithm.
(2) Probes are dispatched to all interfaces on a fileserver when that an
afs_server object is set up to record it.
(3) The afs_server object is removed from the probe queues when we start
to probe it. afs_is_probing_server() returns true if it's not listed
- ie. it's undergoing probing.
(4) The afs_server object is added back on to the probe queue when the
final outstanding probe completes, but the probed_at time is set when
we're about to launch a probe so that it's not dependent on the probe
duration.
(5) The timer and the work item added for this must be handed a count on
net->servers_outstanding, which they hand on or release. This makes
sure that network namespace cleanup waits for them.
Fixes: d2ddc776a458 ("afs: Overhaul volume and server record caching and fileserver rotation")
Reported-by: Dave Botsch <botsch@cnf.cornell.edu>
Signed-off-by: David Howells <dhowells@redhat.com>
2020-04-24 22:10:00 +08:00
|
|
|
if (!atomic_read(&server->probe_outstanding))
|
2018-10-20 07:57:59 +08:00
|
|
|
__clear_bit(i, &untried);
|
|
|
|
if (server->probe.responded)
|
|
|
|
have_responders = true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (have_responders || !untried)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
waits = kmalloc(array_size(slist->nr_servers, sizeof(*waits)), GFP_KERNEL);
|
|
|
|
if (!waits)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
for (i = 0; i < slist->nr_servers; i++) {
|
|
|
|
if (test_bit(i, &untried)) {
|
|
|
|
server = slist->servers[i].server;
|
|
|
|
init_waitqueue_entry(&waits[i], current);
|
|
|
|
add_wait_queue(&server->probe_wq, &waits[i]);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
for (;;) {
|
|
|
|
bool still_probing = false;
|
|
|
|
|
|
|
|
set_current_state(TASK_INTERRUPTIBLE);
|
|
|
|
for (i = 0; i < slist->nr_servers; i++) {
|
|
|
|
if (test_bit(i, &untried)) {
|
|
|
|
server = slist->servers[i].server;
|
|
|
|
if (server->probe.responded)
|
|
|
|
goto stop;
|
afs: Actively poll fileservers to maintain NAT or firewall openings
When an AFS client accesses a file, it receives a limited-duration callback
promise that the server will notify it if another client changes a file.
This callback duration can be a few hours in length.
If a client mounts a volume and then an application prevents it from being
unmounted, say by chdir'ing into it, but then does nothing for some time,
the rxrpc_peer record will expire and rxrpc-level keepalive will cease.
If there is NAT or a firewall between the client and the server, the route
back for the server may close after a comparatively short duration, meaning
that attempts by the server to notify the client may then bounce.
The client, however, may (so far as it knows) still have a valid unexpired
promise and will then rely on its cached data and will not see changes made
on the server by a third party until it incidentally rechecks the status or
the promise needs renewal.
To deal with this, the client needs to regularly probe the server. This
has two effects: firstly, it keeps a route open back for the server, and
secondly, it causes the server to disgorge any notifications that got
queued up because they couldn't be sent.
Fix this by adding a mechanism to emit regular probes.
Two levels of probing are made available: Under normal circumstances the
'slow' queue will be used for a fileserver - this just probes the preferred
address once every 5 mins or so; however, if server fails to respond to any
probes, the server will shift to the 'fast' queue from which all its
interfaces will be probed every 30s. When it finally responds, the record
will switch back to the slow queue.
Further notes:
(1) Probing is now no longer driven from the fileserver rotation
algorithm.
(2) Probes are dispatched to all interfaces on a fileserver when that an
afs_server object is set up to record it.
(3) The afs_server object is removed from the probe queues when we start
to probe it. afs_is_probing_server() returns true if it's not listed
- ie. it's undergoing probing.
(4) The afs_server object is added back on to the probe queue when the
final outstanding probe completes, but the probed_at time is set when
we're about to launch a probe so that it's not dependent on the probe
duration.
(5) The timer and the work item added for this must be handed a count on
net->servers_outstanding, which they hand on or release. This makes
sure that network namespace cleanup waits for them.
Fixes: d2ddc776a458 ("afs: Overhaul volume and server record caching and fileserver rotation")
Reported-by: Dave Botsch <botsch@cnf.cornell.edu>
Signed-off-by: David Howells <dhowells@redhat.com>
2020-04-24 22:10:00 +08:00
|
|
|
if (atomic_read(&server->probe_outstanding))
|
2018-10-20 07:57:59 +08:00
|
|
|
still_probing = true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-01-04 07:28:58 +08:00
|
|
|
if (!still_probing || signal_pending(current))
|
2018-10-20 07:57:59 +08:00
|
|
|
goto stop;
|
|
|
|
schedule();
|
|
|
|
}
|
|
|
|
|
|
|
|
stop:
|
|
|
|
set_current_state(TASK_RUNNING);
|
|
|
|
|
|
|
|
for (i = 0; i < slist->nr_servers; i++) {
|
|
|
|
if (test_bit(i, &untried)) {
|
|
|
|
server = slist->servers[i].server;
|
2020-05-02 20:39:57 +08:00
|
|
|
rtt_s = READ_ONCE(server->rtt);
|
|
|
|
if (test_bit(AFS_SERVER_FL_RESPONDING, &server->flags) &&
|
|
|
|
rtt_s < rtt) {
|
2018-10-20 07:57:59 +08:00
|
|
|
pref = i;
|
2020-05-02 20:39:57 +08:00
|
|
|
rtt = rtt_s;
|
2018-10-20 07:57:59 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
remove_wait_queue(&server->probe_wq, &waits[i]);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
kfree(waits);
|
|
|
|
|
|
|
|
if (pref == -1 && signal_pending(current))
|
|
|
|
return -ERESTARTSYS;
|
|
|
|
|
|
|
|
if (pref >= 0)
|
|
|
|
slist->preferred = pref;
|
|
|
|
return 0;
|
|
|
|
}
|
afs: Actively poll fileservers to maintain NAT or firewall openings
When an AFS client accesses a file, it receives a limited-duration callback
promise that the server will notify it if another client changes a file.
This callback duration can be a few hours in length.
If a client mounts a volume and then an application prevents it from being
unmounted, say by chdir'ing into it, but then does nothing for some time,
the rxrpc_peer record will expire and rxrpc-level keepalive will cease.
If there is NAT or a firewall between the client and the server, the route
back for the server may close after a comparatively short duration, meaning
that attempts by the server to notify the client may then bounce.
The client, however, may (so far as it knows) still have a valid unexpired
promise and will then rely on its cached data and will not see changes made
on the server by a third party until it incidentally rechecks the status or
the promise needs renewal.
To deal with this, the client needs to regularly probe the server. This
has two effects: firstly, it keeps a route open back for the server, and
secondly, it causes the server to disgorge any notifications that got
queued up because they couldn't be sent.
Fix this by adding a mechanism to emit regular probes.
Two levels of probing are made available: Under normal circumstances the
'slow' queue will be used for a fileserver - this just probes the preferred
address once every 5 mins or so; however, if server fails to respond to any
probes, the server will shift to the 'fast' queue from which all its
interfaces will be probed every 30s. When it finally responds, the record
will switch back to the slow queue.
Further notes:
(1) Probing is now no longer driven from the fileserver rotation
algorithm.
(2) Probes are dispatched to all interfaces on a fileserver when that an
afs_server object is set up to record it.
(3) The afs_server object is removed from the probe queues when we start
to probe it. afs_is_probing_server() returns true if it's not listed
- ie. it's undergoing probing.
(4) The afs_server object is added back on to the probe queue when the
final outstanding probe completes, but the probed_at time is set when
we're about to launch a probe so that it's not dependent on the probe
duration.
(5) The timer and the work item added for this must be handed a count on
net->servers_outstanding, which they hand on or release. This makes
sure that network namespace cleanup waits for them.
Fixes: d2ddc776a458 ("afs: Overhaul volume and server record caching and fileserver rotation")
Reported-by: Dave Botsch <botsch@cnf.cornell.edu>
Signed-off-by: David Howells <dhowells@redhat.com>
2020-04-24 22:10:00 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Probe timer. We have an increment on fs_outstanding that we need to pass
|
|
|
|
* along to the work item.
|
|
|
|
*/
|
|
|
|
void afs_fs_probe_timer(struct timer_list *timer)
|
|
|
|
{
|
|
|
|
struct afs_net *net = container_of(timer, struct afs_net, fs_probe_timer);
|
|
|
|
|
2020-06-20 06:39:36 +08:00
|
|
|
if (!net->live || !queue_work(afs_wq, &net->fs_prober))
|
afs: Actively poll fileservers to maintain NAT or firewall openings
When an AFS client accesses a file, it receives a limited-duration callback
promise that the server will notify it if another client changes a file.
This callback duration can be a few hours in length.
If a client mounts a volume and then an application prevents it from being
unmounted, say by chdir'ing into it, but then does nothing for some time,
the rxrpc_peer record will expire and rxrpc-level keepalive will cease.
If there is NAT or a firewall between the client and the server, the route
back for the server may close after a comparatively short duration, meaning
that attempts by the server to notify the client may then bounce.
The client, however, may (so far as it knows) still have a valid unexpired
promise and will then rely on its cached data and will not see changes made
on the server by a third party until it incidentally rechecks the status or
the promise needs renewal.
To deal with this, the client needs to regularly probe the server. This
has two effects: firstly, it keeps a route open back for the server, and
secondly, it causes the server to disgorge any notifications that got
queued up because they couldn't be sent.
Fix this by adding a mechanism to emit regular probes.
Two levels of probing are made available: Under normal circumstances the
'slow' queue will be used for a fileserver - this just probes the preferred
address once every 5 mins or so; however, if server fails to respond to any
probes, the server will shift to the 'fast' queue from which all its
interfaces will be probed every 30s. When it finally responds, the record
will switch back to the slow queue.
Further notes:
(1) Probing is now no longer driven from the fileserver rotation
algorithm.
(2) Probes are dispatched to all interfaces on a fileserver when that an
afs_server object is set up to record it.
(3) The afs_server object is removed from the probe queues when we start
to probe it. afs_is_probing_server() returns true if it's not listed
- ie. it's undergoing probing.
(4) The afs_server object is added back on to the probe queue when the
final outstanding probe completes, but the probed_at time is set when
we're about to launch a probe so that it's not dependent on the probe
duration.
(5) The timer and the work item added for this must be handed a count on
net->servers_outstanding, which they hand on or release. This makes
sure that network namespace cleanup waits for them.
Fixes: d2ddc776a458 ("afs: Overhaul volume and server record caching and fileserver rotation")
Reported-by: Dave Botsch <botsch@cnf.cornell.edu>
Signed-off-by: David Howells <dhowells@redhat.com>
2020-04-24 22:10:00 +08:00
|
|
|
afs_dec_servers_outstanding(net);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Dispatch a probe to a server.
|
|
|
|
*/
|
|
|
|
static void afs_dispatch_fs_probe(struct afs_net *net, struct afs_server *server, bool all)
|
|
|
|
__releases(&net->fs_lock)
|
|
|
|
{
|
|
|
|
struct key *key = NULL;
|
|
|
|
|
|
|
|
/* We remove it from the queues here - it will be added back to
|
|
|
|
* one of the queues on the completion of the probe.
|
|
|
|
*/
|
|
|
|
list_del_init(&server->probe_link);
|
|
|
|
|
|
|
|
afs_get_server(server, afs_server_trace_get_probe);
|
|
|
|
write_sequnlock(&net->fs_lock);
|
|
|
|
|
|
|
|
afs_fs_probe_fileserver(net, server, key, all);
|
|
|
|
afs_put_server(net, server, afs_server_trace_put_probe);
|
|
|
|
}
|
|
|
|
|
2020-04-22 07:02:46 +08:00
|
|
|
/*
|
|
|
|
* Probe a server immediately without waiting for its due time to come
|
|
|
|
* round. This is used when all of the addresses have been tried.
|
|
|
|
*/
|
|
|
|
void afs_probe_fileserver(struct afs_net *net, struct afs_server *server)
|
|
|
|
{
|
|
|
|
write_seqlock(&net->fs_lock);
|
|
|
|
if (!list_empty(&server->probe_link))
|
|
|
|
return afs_dispatch_fs_probe(net, server, true);
|
|
|
|
write_sequnlock(&net->fs_lock);
|
|
|
|
}
|
|
|
|
|
afs: Actively poll fileservers to maintain NAT or firewall openings
When an AFS client accesses a file, it receives a limited-duration callback
promise that the server will notify it if another client changes a file.
This callback duration can be a few hours in length.
If a client mounts a volume and then an application prevents it from being
unmounted, say by chdir'ing into it, but then does nothing for some time,
the rxrpc_peer record will expire and rxrpc-level keepalive will cease.
If there is NAT or a firewall between the client and the server, the route
back for the server may close after a comparatively short duration, meaning
that attempts by the server to notify the client may then bounce.
The client, however, may (so far as it knows) still have a valid unexpired
promise and will then rely on its cached data and will not see changes made
on the server by a third party until it incidentally rechecks the status or
the promise needs renewal.
To deal with this, the client needs to regularly probe the server. This
has two effects: firstly, it keeps a route open back for the server, and
secondly, it causes the server to disgorge any notifications that got
queued up because they couldn't be sent.
Fix this by adding a mechanism to emit regular probes.
Two levels of probing are made available: Under normal circumstances the
'slow' queue will be used for a fileserver - this just probes the preferred
address once every 5 mins or so; however, if server fails to respond to any
probes, the server will shift to the 'fast' queue from which all its
interfaces will be probed every 30s. When it finally responds, the record
will switch back to the slow queue.
Further notes:
(1) Probing is now no longer driven from the fileserver rotation
algorithm.
(2) Probes are dispatched to all interfaces on a fileserver when that an
afs_server object is set up to record it.
(3) The afs_server object is removed from the probe queues when we start
to probe it. afs_is_probing_server() returns true if it's not listed
- ie. it's undergoing probing.
(4) The afs_server object is added back on to the probe queue when the
final outstanding probe completes, but the probed_at time is set when
we're about to launch a probe so that it's not dependent on the probe
duration.
(5) The timer and the work item added for this must be handed a count on
net->servers_outstanding, which they hand on or release. This makes
sure that network namespace cleanup waits for them.
Fixes: d2ddc776a458 ("afs: Overhaul volume and server record caching and fileserver rotation")
Reported-by: Dave Botsch <botsch@cnf.cornell.edu>
Signed-off-by: David Howells <dhowells@redhat.com>
2020-04-24 22:10:00 +08:00
|
|
|
/*
|
|
|
|
* Probe dispatcher to regularly dispatch probes to keep NAT alive.
|
|
|
|
*/
|
|
|
|
void afs_fs_probe_dispatcher(struct work_struct *work)
|
|
|
|
{
|
|
|
|
struct afs_net *net = container_of(work, struct afs_net, fs_prober);
|
|
|
|
struct afs_server *fast, *slow, *server;
|
|
|
|
unsigned long nowj, timer_at, poll_at;
|
|
|
|
bool first_pass = true, set_timer = false;
|
|
|
|
|
|
|
|
if (!net->live)
|
|
|
|
return;
|
|
|
|
|
|
|
|
_enter("");
|
|
|
|
|
|
|
|
if (list_empty(&net->fs_probe_fast) && list_empty(&net->fs_probe_slow)) {
|
|
|
|
_leave(" [none]");
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
again:
|
|
|
|
write_seqlock(&net->fs_lock);
|
|
|
|
|
|
|
|
fast = slow = server = NULL;
|
|
|
|
nowj = jiffies;
|
|
|
|
timer_at = nowj + MAX_JIFFY_OFFSET;
|
|
|
|
|
|
|
|
if (!list_empty(&net->fs_probe_fast)) {
|
|
|
|
fast = list_first_entry(&net->fs_probe_fast, struct afs_server, probe_link);
|
|
|
|
poll_at = fast->probed_at + afs_fs_probe_fast_poll_interval;
|
|
|
|
if (time_before(nowj, poll_at)) {
|
|
|
|
timer_at = poll_at;
|
|
|
|
set_timer = true;
|
|
|
|
fast = NULL;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!list_empty(&net->fs_probe_slow)) {
|
|
|
|
slow = list_first_entry(&net->fs_probe_slow, struct afs_server, probe_link);
|
|
|
|
poll_at = slow->probed_at + afs_fs_probe_slow_poll_interval;
|
|
|
|
if (time_before(nowj, poll_at)) {
|
|
|
|
if (time_before(poll_at, timer_at))
|
|
|
|
timer_at = poll_at;
|
|
|
|
set_timer = true;
|
|
|
|
slow = NULL;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
server = fast ?: slow;
|
|
|
|
if (server)
|
|
|
|
_debug("probe %pU", &server->uuid);
|
|
|
|
|
|
|
|
if (server && (first_pass || !need_resched())) {
|
|
|
|
afs_dispatch_fs_probe(net, server, server == fast);
|
|
|
|
first_pass = false;
|
|
|
|
goto again;
|
|
|
|
}
|
|
|
|
|
|
|
|
write_sequnlock(&net->fs_lock);
|
|
|
|
|
|
|
|
if (server) {
|
|
|
|
if (!queue_work(afs_wq, &net->fs_prober))
|
|
|
|
afs_dec_servers_outstanding(net);
|
|
|
|
_leave(" [requeue]");
|
|
|
|
} else if (set_timer) {
|
|
|
|
if (timer_reduce(&net->fs_probe_timer, timer_at))
|
|
|
|
afs_dec_servers_outstanding(net);
|
|
|
|
_leave(" [timer]");
|
|
|
|
} else {
|
|
|
|
afs_dec_servers_outstanding(net);
|
|
|
|
_leave(" [quiesce]");
|
|
|
|
}
|
|
|
|
}
|
2020-04-22 07:02:46 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Wait for a probe on a particular fileserver to complete for 2s.
|
|
|
|
*/
|
|
|
|
int afs_wait_for_one_fs_probe(struct afs_server *server, bool is_intr)
|
|
|
|
{
|
|
|
|
struct wait_queue_entry wait;
|
|
|
|
unsigned long timo = 2 * HZ;
|
|
|
|
|
|
|
|
if (atomic_read(&server->probe_outstanding) == 0)
|
|
|
|
goto dont_wait;
|
|
|
|
|
|
|
|
init_wait_entry(&wait, 0);
|
|
|
|
for (;;) {
|
|
|
|
prepare_to_wait_event(&server->probe_wq, &wait,
|
|
|
|
is_intr ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE);
|
|
|
|
if (timo == 0 ||
|
|
|
|
server->probe.responded ||
|
|
|
|
atomic_read(&server->probe_outstanding) == 0 ||
|
|
|
|
(is_intr && signal_pending(current)))
|
|
|
|
break;
|
|
|
|
timo = schedule_timeout(timo);
|
|
|
|
}
|
|
|
|
|
|
|
|
finish_wait(&server->probe_wq, &wait);
|
|
|
|
|
|
|
|
dont_wait:
|
|
|
|
if (server->probe.responded)
|
|
|
|
return 0;
|
|
|
|
if (is_intr && signal_pending(current))
|
|
|
|
return -ERESTARTSYS;
|
|
|
|
if (timo == 0)
|
|
|
|
return -ETIME;
|
|
|
|
return -EDESTADDRREQ;
|
|
|
|
}
|
2020-06-20 06:39:36 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Clean up the probing when the namespace is killed off.
|
|
|
|
*/
|
|
|
|
void afs_fs_probe_cleanup(struct afs_net *net)
|
|
|
|
{
|
|
|
|
if (del_timer_sync(&net->fs_probe_timer))
|
|
|
|
afs_dec_servers_outstanding(net);
|
|
|
|
}
|