Commit 495f2ae9 authored by David Howells

afs: Fix fileserver rotation

Fix the fileserver rotation so that it doesn't use RTT as the basis for
deciding which server and address to use as this doesn't necessarily give a
good indication of the best path.  Instead, use the configurable preference
list in conjunction with whatever probes have succeeded at the time of
looking.

To this end, make the following changes:

 (1) Keep an array of "server states" to track what addresses we've tried
     on each server and move the waitqueue entries there that we'll need
     for probing.

 (2) Each afs_server_state struct is made to pin the corresponding server's
     endpoint state rather than the afs_operation struct carrying a pin on
     the server we're currently looking at.

 (3) Drop the server list preference; we now always rescan the server list.

 (4) afs_wait_for_fs_probes() now uses the server state list to guide it in
     what it waits for (and to provide the waitqueue entries) and returns
     an indication of whether we got a response, ran out of responsive
     addresses, or the endpoint state was superseded and we need to restart
     the iteration.

 (5) Call afs_get_address_preferences*() occasionally to refresh the
     preference values.

 (6) When picking a server, scan the addresses of the servers for which we
     have as-yet untested communications, looking for the highest priority
     one and use that instead of trying all the addresses for a particular
     server in ascending-RTT order.

 (7) When a Busy or Offline state is seen across all available servers, do
     a short sleep.

 (8) If we detect that we accessed a future RO volume version whilst it is
     undergoing replication, reissue the op against the older version until
     at least half of the servers are replicated.

 (9) Whilst RO replication is ongoing, increase the frequency of Volume
     Location server checks for that volume to every ten minutes instead of
     hourly.

Also add a tracepoint to track progress through the rotation algorithm.
Signed-off-by: David Howells <dhowells@redhat.com>
cc: Marc Dionne <marc.dionne@auristor.com>
cc: linux-afs@lists.infradead.org
parent 453924de
...@@ -229,7 +229,6 @@ void afs_wait_for_operation(struct afs_operation *op) ...@@ -229,7 +229,6 @@ void afs_wait_for_operation(struct afs_operation *op)
*/ */
int afs_put_operation(struct afs_operation *op) int afs_put_operation(struct afs_operation *op)
{ {
struct afs_endpoint_state *estate = op->estate;
struct afs_addr_list *alist; struct afs_addr_list *alist;
int i, ret = afs_op_error(op); int i, ret = afs_op_error(op);
...@@ -253,18 +252,17 @@ int afs_put_operation(struct afs_operation *op) ...@@ -253,18 +252,17 @@ int afs_put_operation(struct afs_operation *op)
kfree(op->more_files); kfree(op->more_files);
} }
if (estate) { if (op->estate) {
alist = estate->addresses; alist = op->estate->addresses;
if (alist) { if (alist) {
if (op->call_responded && if (op->call_responded &&
op->addr_index != alist->preferred && op->addr_index != alist->preferred &&
test_bit(alist->preferred, &op->addr_tried)) test_bit(alist->preferred, &op->addr_tried))
WRITE_ONCE(alist->preferred, op->addr_index); WRITE_ONCE(alist->preferred, op->addr_index);
} }
afs_put_endpoint_state(estate, afs_estate_trace_put_operation);
op->estate = NULL;
} }
afs_clear_server_states(op);
afs_put_serverlist(op->net, op->server_list); afs_put_serverlist(op->net, op->server_list);
afs_put_volume(op->volume, afs_volume_trace_put_put_op); afs_put_volume(op->volume, afs_volume_trace_put_put_op);
key_put(op->key); key_put(op->key);
......
...@@ -296,58 +296,48 @@ void afs_fs_probe_fileserver(struct afs_net *net, struct afs_server *server, ...@@ -296,58 +296,48 @@ void afs_fs_probe_fileserver(struct afs_net *net, struct afs_server *server,
} }
/* /*
* Wait for the first as-yet untried fileserver to respond. * Wait for the first as-yet untried fileserver to respond, for the probe state
* to be superseded or for all probes to finish.
*/ */
int afs_wait_for_fs_probes(struct afs_server_list *slist, unsigned long untried) int afs_wait_for_fs_probes(struct afs_operation *op, struct afs_server_state *states, bool intr)
{ {
struct afs_endpoint_state *estate; struct afs_endpoint_state *estate;
struct wait_queue_entry *waits; struct afs_server_list *slist = op->server_list;
struct afs_server *server; bool still_probing = true;
unsigned int rtt = UINT_MAX, rtt_s; int ret = 0, i;
bool have_responders = false;
int pref = -1, i;
_enter("%u,%lx", slist->nr_servers, untried); _enter("%u", slist->nr_servers);
/* Only wait for servers that have a probe outstanding. */
rcu_read_lock();
for (i = 0; i < slist->nr_servers; i++) { for (i = 0; i < slist->nr_servers; i++) {
if (test_bit(i, &untried)) { estate = states[i].endpoint_state;
server = slist->servers[i].server; if (test_bit(AFS_ESTATE_SUPERSEDED, &estate->flags))
estate = rcu_dereference(server->endpoint_state); return 2;
if (!atomic_read(&estate->nr_probing)) if (atomic_read(&estate->nr_probing))
__clear_bit(i, &untried); still_probing = true;
if (test_bit(AFS_ESTATE_RESPONDED, &estate->flags)) if (estate->responsive_set & states[i].untried_addrs)
have_responders = true; return 1;
}
} }
rcu_read_unlock(); if (!still_probing)
if (have_responders || !untried)
return 0; return 0;
waits = kmalloc(array_size(slist->nr_servers, sizeof(*waits)), GFP_KERNEL); for (i = 0; i < slist->nr_servers; i++)
if (!waits) add_wait_queue(&slist->servers[i].server->probe_wq, &states[i].probe_waiter);
return -ENOMEM;
for (i = 0; i < slist->nr_servers; i++) {
if (test_bit(i, &untried)) {
server = slist->servers[i].server;
init_waitqueue_entry(&waits[i], current);
add_wait_queue(&server->probe_wq, &waits[i]);
}
}
for (;;) { for (;;) {
bool still_probing = false; still_probing = false;
set_current_state(TASK_INTERRUPTIBLE); set_current_state(intr ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE);
for (i = 0; i < slist->nr_servers; i++) { for (i = 0; i < slist->nr_servers; i++) {
if (test_bit(i, &untried)) { estate = states[i].endpoint_state;
server = slist->servers[i].server; if (test_bit(AFS_ESTATE_SUPERSEDED, &estate->flags)) {
if (test_bit(AFS_ESTATE_RESPONDED, &estate->flags)) ret = 2;
goto stop; goto stop;
if (atomic_read(&estate->nr_probing)) }
still_probing = true; if (atomic_read(&estate->nr_probing))
still_probing = true;
if (estate->responsive_set & states[i].untried_addrs) {
ret = 1;
goto stop;
} }
} }
...@@ -359,28 +349,12 @@ int afs_wait_for_fs_probes(struct afs_server_list *slist, unsigned long untried) ...@@ -359,28 +349,12 @@ int afs_wait_for_fs_probes(struct afs_server_list *slist, unsigned long untried)
stop: stop:
set_current_state(TASK_RUNNING); set_current_state(TASK_RUNNING);
for (i = 0; i < slist->nr_servers; i++) { for (i = 0; i < slist->nr_servers; i++)
if (test_bit(i, &untried)) { remove_wait_queue(&slist->servers[i].server->probe_wq, &states[i].probe_waiter);
server = slist->servers[i].server;
rtt_s = READ_ONCE(server->rtt);
if (test_bit(AFS_SERVER_FL_RESPONDING, &server->flags) &&
rtt_s < rtt) {
pref = i;
rtt = rtt_s;
}
remove_wait_queue(&server->probe_wq, &waits[i]);
}
}
kfree(waits);
if (pref == -1 && signal_pending(current))
return -ERESTARTSYS;
if (pref >= 0) if (!ret && signal_pending(current))
slist->preferred = pref; ret = -ERESTARTSYS;
return 0; return ret;
} }
/* /*
...@@ -508,7 +482,7 @@ void afs_fs_probe_dispatcher(struct work_struct *work) ...@@ -508,7 +482,7 @@ void afs_fs_probe_dispatcher(struct work_struct *work)
* Wait for a probe on a particular fileserver to complete for 2s. * Wait for a probe on a particular fileserver to complete for 2s.
*/ */
int afs_wait_for_one_fs_probe(struct afs_server *server, struct afs_endpoint_state *estate, int afs_wait_for_one_fs_probe(struct afs_server *server, struct afs_endpoint_state *estate,
bool is_intr) unsigned long exclude, bool is_intr)
{ {
struct wait_queue_entry wait; struct wait_queue_entry wait;
unsigned long timo = 2 * HZ; unsigned long timo = 2 * HZ;
...@@ -521,7 +495,8 @@ int afs_wait_for_one_fs_probe(struct afs_server *server, struct afs_endpoint_sta ...@@ -521,7 +495,8 @@ int afs_wait_for_one_fs_probe(struct afs_server *server, struct afs_endpoint_sta
prepare_to_wait_event(&server->probe_wq, &wait, prepare_to_wait_event(&server->probe_wq, &wait,
is_intr ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE); is_intr ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE);
if (timo == 0 || if (timo == 0 ||
test_bit(AFS_ESTATE_RESPONDED, &estate->flags) || test_bit(AFS_ESTATE_SUPERSEDED, &estate->flags) ||
(estate->responsive_set & ~exclude) ||
atomic_read(&estate->nr_probing) == 0 || atomic_read(&estate->nr_probing) == 0 ||
(is_intr && signal_pending(current))) (is_intr && signal_pending(current)))
break; break;
...@@ -531,7 +506,9 @@ int afs_wait_for_one_fs_probe(struct afs_server *server, struct afs_endpoint_sta ...@@ -531,7 +506,9 @@ int afs_wait_for_one_fs_probe(struct afs_server *server, struct afs_endpoint_sta
finish_wait(&server->probe_wq, &wait); finish_wait(&server->probe_wq, &wait);
dont_wait: dont_wait:
if (test_bit(AFS_ESTATE_RESPONDED, &estate->flags)) if (estate->responsive_set & ~exclude)
return 1;
if (test_bit(AFS_ESTATE_SUPERSEDED, &estate->flags))
return 0; return 0;
if (is_intr && signal_pending(current)) if (is_intr && signal_pending(current))
return -ERESTARTSYS; return -ERESTARTSYS;
......
...@@ -620,7 +620,6 @@ struct afs_server_list { ...@@ -620,7 +620,6 @@ struct afs_server_list {
bool attached; /* T if attached to servers */ bool attached; /* T if attached to servers */
enum afs_ro_replicating ro_replicating; /* RW->RO update (probably) in progress */ enum afs_ro_replicating ro_replicating; /* RW->RO update (probably) in progress */
unsigned char nr_servers; unsigned char nr_servers;
unsigned char preferred; /* Preferred server */
unsigned short vnovol_mask; /* Servers to be skipped due to VNOVOL */ unsigned short vnovol_mask; /* Servers to be skipped due to VNOVOL */
unsigned int seq; /* Set to ->servers_seq when installed */ unsigned int seq; /* Set to ->servers_seq when installed */
rwlock_t lock; rwlock_t lock;
...@@ -821,6 +820,20 @@ struct afs_vl_cursor { ...@@ -821,6 +820,20 @@ struct afs_vl_cursor {
bool call_responded; /* T if the current address responded */ bool call_responded; /* T if the current address responded */
}; };
/*
* Fileserver state tracking for an operation. An array of these is kept,
* indexed by server index.
*/
struct afs_server_state {
/* Tracking of fileserver probe state. Other operations may interfere
* by probing a fileserver when accessing other volumes.
*/
/* NOTE(review): probe_seq is not referenced in the visible hunks; presumably
* a snapshot of a probe sequence counter - confirm against rotate.c.
*/
unsigned int probe_seq;
/* Bitmask that afs_wait_for_fs_probes() intersects with the endpoint
* state's responsive_set to detect a responsive, as-yet untried address.
*/
unsigned long untried_addrs; /* Addresses we haven't tried yet */
/* Wait queue entry that afs_wait_for_fs_probes() adds to the server's
* probe_wq while waiting and removes again on its way out.
*/
struct wait_queue_entry probe_waiter;
/* afs_wait_for_fs_probes() checks this for AFS_ESTATE_SUPERSEDED and for
* outstanding probes (nr_probing).
*/
struct afs_endpoint_state *endpoint_state; /* Endpoint state being monitored */
};
/* /*
* Fileserver operation methods. * Fileserver operation methods.
*/ */
...@@ -921,7 +934,8 @@ struct afs_operation { ...@@ -921,7 +934,8 @@ struct afs_operation {
/* Fileserver iteration state */ /* Fileserver iteration state */
struct afs_server_list *server_list; /* Current server list (pins ref) */ struct afs_server_list *server_list; /* Current server list (pins ref) */
struct afs_server *server; /* Server we're using (ref pinned by server_list) */ struct afs_server *server; /* Server we're using (ref pinned by server_list) */
struct afs_endpoint_state *estate; /* Current endpoint state (pins ref) */ struct afs_endpoint_state *estate; /* Current endpoint state (doesn't pin ref) */
struct afs_server_state *server_states; /* States of the servers involved */
struct afs_call *call; struct afs_call *call;
unsigned long untried_servers; /* Bitmask of untried servers */ unsigned long untried_servers; /* Bitmask of untried servers */
unsigned long addr_tried; /* Tried addresses */ unsigned long addr_tried; /* Tried addresses */
...@@ -1235,11 +1249,11 @@ void afs_put_endpoint_state(struct afs_endpoint_state *estate, enum afs_estate_t ...@@ -1235,11 +1249,11 @@ void afs_put_endpoint_state(struct afs_endpoint_state *estate, enum afs_estate_t
extern void afs_fileserver_probe_result(struct afs_call *); extern void afs_fileserver_probe_result(struct afs_call *);
void afs_fs_probe_fileserver(struct afs_net *net, struct afs_server *server, void afs_fs_probe_fileserver(struct afs_net *net, struct afs_server *server,
struct afs_addr_list *new_addrs, struct key *key); struct afs_addr_list *new_addrs, struct key *key);
extern int afs_wait_for_fs_probes(struct afs_server_list *, unsigned long); int afs_wait_for_fs_probes(struct afs_operation *op, struct afs_server_state *states, bool intr);
extern void afs_probe_fileserver(struct afs_net *, struct afs_server *); extern void afs_probe_fileserver(struct afs_net *, struct afs_server *);
extern void afs_fs_probe_dispatcher(struct work_struct *); extern void afs_fs_probe_dispatcher(struct work_struct *);
int afs_wait_for_one_fs_probe(struct afs_server *server, struct afs_endpoint_state *estate, int afs_wait_for_one_fs_probe(struct afs_server *server, struct afs_endpoint_state *estate,
bool is_intr); unsigned long exclude, bool is_intr);
extern void afs_fs_probe_cleanup(struct afs_net *); extern void afs_fs_probe_cleanup(struct afs_net *);
/* /*
...@@ -1363,6 +1377,7 @@ static inline void afs_put_sysnames(struct afs_sysnames *sysnames) {} ...@@ -1363,6 +1377,7 @@ static inline void afs_put_sysnames(struct afs_sysnames *sysnames) {}
/* /*
* rotate.c * rotate.c
*/ */
void afs_clear_server_states(struct afs_operation *op);
extern bool afs_select_fileserver(struct afs_operation *); extern bool afs_select_fileserver(struct afs_operation *);
extern void afs_dump_edestaddrreq(const struct afs_operation *); extern void afs_dump_edestaddrreq(const struct afs_operation *);
......
This diff is collapsed.
...@@ -134,8 +134,7 @@ bool afs_annotate_server_list(struct afs_server_list *new, ...@@ -134,8 +134,7 @@ bool afs_annotate_server_list(struct afs_server_list *new,
struct afs_server_list *old) struct afs_server_list *old)
{ {
unsigned long mask = 1UL << AFS_SE_EXCLUDED; unsigned long mask = 1UL << AFS_SE_EXCLUDED;
struct afs_server *cur; int i;
int i, j;
if (old->nr_servers != new->nr_servers || if (old->nr_servers != new->nr_servers ||
old->ro_replicating != new->ro_replicating) old->ro_replicating != new->ro_replicating)
...@@ -148,18 +147,7 @@ bool afs_annotate_server_list(struct afs_server_list *new, ...@@ -148,18 +147,7 @@ bool afs_annotate_server_list(struct afs_server_list *new,
goto changed; goto changed;
} }
return false; return false;
changed: changed:
/* Maintain the same preferred server as before if possible. */
cur = old->servers[old->preferred].server;
for (j = 0; j < new->nr_servers; j++) {
if (new->servers[j].server == cur) {
if (!test_bit(AFS_SE_EXCLUDED, &new->servers[j].flags))
new->preferred = j;
break;
}
}
return true; return true;
} }
......
...@@ -397,7 +397,11 @@ static int afs_update_volume_status(struct afs_volume *volume, struct key *key) ...@@ -397,7 +397,11 @@ static int afs_update_volume_status(struct afs_volume *volume, struct key *key)
discard = old; discard = old;
} }
volume->update_at = ktime_get_real_seconds() + afs_volume_record_life; /* Check more often if replication is ongoing. */
if (new->ro_replicating)
volume->update_at = ktime_get_real_seconds() + 10 * 60;
else
volume->update_at = ktime_get_real_seconds() + afs_volume_record_life;
write_unlock(&volume->servers_lock); write_unlock(&volume->servers_lock);
if (discard == old) if (discard == old)
......
...@@ -230,15 +230,12 @@ enum yfs_cm_operation { ...@@ -230,15 +230,12 @@ enum yfs_cm_operation {
#define afs_estate_traces \ #define afs_estate_traces \
EM(afs_estate_trace_alloc_probe, "ALLOC prob") \ EM(afs_estate_trace_alloc_probe, "ALLOC prob") \
EM(afs_estate_trace_alloc_server, "ALLOC srvr") \ EM(afs_estate_trace_alloc_server, "ALLOC srvr") \
EM(afs_estate_trace_get_fsrotate_set, "GET fs-rot") \ EM(afs_estate_trace_get_server_state, "GET srv-st") \
EM(afs_estate_trace_get_getcaps, "GET getcap") \ EM(afs_estate_trace_get_getcaps, "GET getcap") \
EM(afs_estate_trace_put_getcaps, "PUT getcap") \ EM(afs_estate_trace_put_getcaps, "PUT getcap") \
EM(afs_estate_trace_put_next_server, "PUT nx-srv") \
EM(afs_estate_trace_put_op_failed, "PUT op-fai") \
EM(afs_estate_trace_put_operation, "PUT op ") \
EM(afs_estate_trace_put_probe, "PUT probe ") \ EM(afs_estate_trace_put_probe, "PUT probe ") \
EM(afs_estate_trace_put_restart_rotate, "PUT rstrot") \
EM(afs_estate_trace_put_server, "PUT server") \ EM(afs_estate_trace_put_server, "PUT server") \
EM(afs_estate_trace_put_server_state, "PUT srv-st") \
E_(afs_estate_trace_free, "FREE ") E_(afs_estate_trace_free, "FREE ")
#define afs_fs_operations \ #define afs_fs_operations \
...@@ -448,6 +445,29 @@ enum yfs_cm_operation { ...@@ -448,6 +445,29 @@ enum yfs_cm_operation {
EM(afs_cb_break_for_vos_release, "break-vos-release") \ EM(afs_cb_break_for_vos_release, "break-vos-release") \
E_(afs_cb_break_volume_excluded, "vol-excluded") E_(afs_cb_break_volume_excluded, "vol-excluded")
/*
 * Reason codes for the afs_rotate tracepoint, each paired with the label
 * that the tracepoint prints for it via __print_symbolic().  The same list
 * is expanded to generate enum afs_rotate_trace.
 */
#define afs_rotate_traces \
EM(afs_rotate_trace_aborted, "Abortd") \
EM(afs_rotate_trace_busy_sleep, "BsySlp") \
EM(afs_rotate_trace_check_vol_status, "VolStt") \
EM(afs_rotate_trace_failed, "Failed") \
EM(afs_rotate_trace_iter, "Iter ") \
EM(afs_rotate_trace_iterate_addr, "ItAddr") \
EM(afs_rotate_trace_next_server, "NextSv") \
EM(afs_rotate_trace_no_more_servers, "NoMore") \
EM(afs_rotate_trace_nomem, "Nomem ") \
EM(afs_rotate_trace_probe_error, "PrbErr") \
EM(afs_rotate_trace_probe_fileserver, "PrbFsv") \
EM(afs_rotate_trace_probe_none, "PrbNon") \
EM(afs_rotate_trace_probe_response, "PrbRsp") \
EM(afs_rotate_trace_probe_superseded, "PrbSup") \
EM(afs_rotate_trace_restart, "Rstart") \
EM(afs_rotate_trace_retry_server, "RtrySv") \
EM(afs_rotate_trace_selected_server, "SlctSv") \
EM(afs_rotate_trace_stale_lock, "StlLck") \
EM(afs_rotate_trace_start, "Start ") \
EM(afs_rotate_trace_stop, "Stop ") \
E_(afs_rotate_trace_stopped, "Stoppd")
/* /*
* Generate enums for tracing information. * Generate enums for tracing information.
*/ */
...@@ -471,6 +491,7 @@ enum afs_file_error { afs_file_errors } __mode(byte); ...@@ -471,6 +491,7 @@ enum afs_file_error { afs_file_errors } __mode(byte);
enum afs_flock_event { afs_flock_events } __mode(byte); enum afs_flock_event { afs_flock_events } __mode(byte);
enum afs_flock_operation { afs_flock_operations } __mode(byte); enum afs_flock_operation { afs_flock_operations } __mode(byte);
enum afs_io_error { afs_io_errors } __mode(byte); enum afs_io_error { afs_io_errors } __mode(byte);
enum afs_rotate_trace { afs_rotate_traces } __mode(byte);
enum afs_server_trace { afs_server_traces } __mode(byte); enum afs_server_trace { afs_server_traces } __mode(byte);
enum afs_volume_trace { afs_volume_traces } __mode(byte); enum afs_volume_trace { afs_volume_traces } __mode(byte);
...@@ -486,21 +507,22 @@ enum afs_volume_trace { afs_volume_traces } __mode(byte); ...@@ -486,21 +507,22 @@ enum afs_volume_trace { afs_volume_traces } __mode(byte);
afs_alist_traces; afs_alist_traces;
afs_call_traces; afs_call_traces;
afs_server_traces; afs_cb_break_reasons;
afs_cell_traces; afs_cell_traces;
afs_fs_operations;
afs_vl_operations;
afs_cm_operations; afs_cm_operations;
yfs_cm_operations;
afs_edit_dir_ops; afs_edit_dir_ops;
afs_edit_dir_reasons; afs_edit_dir_reasons;
afs_eproto_causes; afs_eproto_causes;
afs_estate_traces; afs_estate_traces;
afs_io_errors;
afs_file_errors; afs_file_errors;
afs_flock_types;
afs_flock_operations; afs_flock_operations;
afs_cb_break_reasons; afs_flock_types;
afs_fs_operations;
afs_io_errors;
afs_rotate_traces;
afs_server_traces;
afs_vl_operations;
yfs_cm_operations;
/* /*
* Now redefine the EM() and E_() macros to map the enums to the strings that * Now redefine the EM() and E_() macros to map the enums to the strings that
...@@ -1519,6 +1541,41 @@ TRACE_EVENT(afs_vl_probe, ...@@ -1519,6 +1541,41 @@ TRACE_EVENT(afs_vl_probe,
&__entry->srx.transport) &__entry->srx.transport)
); );
/*
 * Tracepoint recording one step of the fileserver rotation for an operation.
 * It captures the operation's debug ID, flags, iteration count and current
 * server/address indices, together with the reason code and a caller-supplied
 * extra value.
 *
 * NOTE(review): "extra" is declared unsigned int but printed with %d; confirm
 * whether callers ever pass values that should be shown as unsigned.
 */
TRACE_EVENT(afs_rotate,
TP_PROTO(struct afs_operation *op, enum afs_rotate_trace reason, unsigned int extra),
TP_ARGS(op, reason, extra),
TP_STRUCT__entry(
__field(unsigned int, op)
__field(unsigned int, flags)
__field(unsigned int, extra)
__field(unsigned short, iteration)
__field(short, server_index)
__field(short, addr_index)
__field(enum afs_rotate_trace, reason)
),
TP_fast_assign(
__entry->op = op->debug_id;
__entry->flags = op->flags;
__entry->iteration = op->nr_iterations;
__entry->server_index = op->server_index;
__entry->addr_index = op->addr_index;
__entry->reason = reason;
__entry->extra = extra;
),
TP_printk("OP=%08x it=%02x %s fl=%x sx=%d ax=%d ext=%d",
__entry->op,
__entry->iteration,
__print_symbolic(__entry->reason, afs_rotate_traces),
__entry->flags,
__entry->server_index,
__entry->addr_index,
__entry->extra)
);
#endif /* _TRACE_AFS_H */ #endif /* _TRACE_AFS_H */
/* This part must be outside protection */ /* This part must be outside protection */
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment