1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /* Handle fileserver selection and rotation.
4 * Copyright (C) 2017 Red Hat, Inc. All Rights Reserved.
5 * Written by David Howells (dhowells@redhat.com)
8 #include <linux/kernel.h>
9 #include <linux/slab.h>
11 #include <linux/sched.h>
12 #include <linux/delay.h>
13 #include <linux/sched/signal.h>
16 #include "protocol_uae.h"
19 * Begin iteration through a server list, starting with the vnode's last used
20 * server if possible, or the last recorded good server if not.
22 static bool afs_start_fs_iteration(struct afs_operation *op,
23 struct afs_vnode *vnode)
/* NOTE(review): this listing has dropped lines (braces, some local
 * declarations and the return statements are not visible), so the notes
 * added here describe only the statements that can actually be seen.
 */
25 struct afs_server *server;
/* Fetch the volume's current server list pointer while servers_lock is
 * read-held and hand it to afs_get_serverlist(); the result becomes the
 * list this operation iterates over.
 */
29 read_lock(&op->volume->servers_lock);
30 op->server_list = afs_get_serverlist(
31 rcu_dereference_protected(op->volume->servers,
32 lockdep_is_held(&op->volume->servers_lock)));
33 read_unlock(&op->volume->servers_lock);
/* Set one "untried" bit per server in the list and start from the index the
 * list records as preferred.
 */
35 op->untried = (1UL << op->server_list->nr_servers) - 1;
36 op->index = READ_ONCE(op->server_list->preferred);
/* Snapshot the vnode's callback server; each list entry is compared against
 * this snapshot in the loop below.
 */
38 cb_server = vnode->cb_server;
40 /* See if the vnode's preferred record is still available */
41 for (i = 0; i < op->server_list->nr_servers; i++) {
42 server = op->server_list->servers[i].server;
43 if (server == cb_server) {
49 /* If we have a lock outstanding on a server that's no longer
50 * serving this vnode, then we can't switch to another server
51 * and have to return an error.
53 if (op->flags & AFS_OPERATION_CUR_ONLY) {
54 afs_op_set_error(op, -ESTALE);
58 /* Note that the callback promise is effectively broken */
/* The snapshot is re-checked against the vnode under cb_lock before the
 * vnode's callback server pointer is cleared.
 */
59 write_seqlock(&vnode->cb_lock);
60 ASSERTCMP(cb_server, ==, vnode->cb_server);
61 vnode->cb_server = NULL;
62 if (test_and_clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags))
64 write_sequnlock(&vnode->cb_lock);
72 * Post volume busy note.
74 static void afs_busy(struct afs_volume *volume, u32 abort_code)
/* Choose the state word for the log line below.  Three abort codes have
 * their own wording; the default arm reports anything else as "busy".
 * NOTE(review): the switch header and the declaration of 'm' are not
 * visible in this listing - presumably the switch is on abort_code; confirm.
 */
79 case VOFFLINE: m = "offline"; break;
80 case VRESTARTING: m = "restarting"; break;
81 case VSALVAGING: m = "being salvaged"; break;
82 default: m = "busy"; break;
/* Log the volume ID, the volume name and the chosen state word. */
85 pr_notice("kAFS: Volume %llu '%s' is %s\n", volume->vid, volume->name, m);
89 * Sleep and retry the operation to the same fileserver.
91 static bool afs_sleep_and_retry(struct afs_operation *op)
/* When the operation is not flagged AFS_OPERATION_UNINTR, wait with
 * msleep_interruptible(1000) and, if a signal is pending afterwards, record
 * -ERESTARTSYS on the operation.
 * NOTE(review): the branch taken for uninterruptible operations and the
 * return statements are not visible in this listing.
 */
93 if (!(op->flags & AFS_OPERATION_UNINTR)) {
94 msleep_interruptible(1000);
95 if (signal_pending(current)) {
96 afs_op_set_error(op, -ERESTARTSYS);
107 * Select the fileserver to use. May be called multiple times to rotate
108 * through the fileservers.
110 bool afs_select_fileserver(struct afs_operation *op)
/* NOTE(review): this listing has dropped lines (braces, most case labels,
 * goto/return statements and many comment terminators are not visible).
 * The code lines below are reproduced untouched; only comment text has been
 * added or corrected.
 */
112 struct afs_addr_list *alist;
113 struct afs_server *server;
114 struct afs_vnode *vnode = op->file[0].vnode;
/* Start from the abort code and error that are currently recorded on the
 * operation.
 */
116 s32 abort_code = op->call_abort_code;
117 int error = op->call_error, i;
121 _enter("OP=%x+%x,%llx,%lx[%d],%lx[%d],%d,%d",
122 op->debug_id, op->nr_iterations, op->volume->vid,
123 op->untried, op->index,
124 op->ac.tried, op->ac.index,
/* An operation that already carries AFS_OPERATION_STOP is traced as a
 * "stopped" false result.
 */
127 if (op->flags & AFS_OPERATION_STOP) {
128 _leave(" = f [stopped]");
132 if (op->nr_iterations == 0)
135 /* Evaluate the result of the previous operation, if there was one. */
136 switch (op->call_error) {
138 op->cumul_error.responded = true;
141 /* Success or local failure. Stop. */
142 afs_op_set_error(op, error);
143 op->flags |= AFS_OPERATION_STOP;
144 _leave(" = f [okay/local %d]", error);
148 /* The far side rejected the operation on some grounds. This
149 * might involve the server being busy or the volume having been moved.
151 * Note that various V* errors should not be sent to a cache manager
152 * by a fileserver as they should be translated to more modern UAE*
153 * errors instead. IBM AFS and OpenAFS fileservers, however, do leak
156 op->cumul_error.responded = true;
157 switch (abort_code) {
159 /* This fileserver doesn't know about the volume.
160 * - May indicate that the VL is wrong - retry once and compare
162 * - May indicate that the fileserver couldn't attach to the vol.
163 * - The volume might have been temporarily removed so that it can
164 * be replaced by a volume restore. "vos" might have ended one
165 * transaction and has yet to create the next.
166 * - The volume might not be blessed or might not be in-service
167 * (administrative action).
169 if (op->flags & AFS_OPERATION_VNOVOL) {
170 afs_op_accumulate_error(op, -EREMOTEIO, abort_code);
174 write_lock(&op->volume->servers_lock);
175 op->server_list->vnovol_mask |= 1 << op->index;
176 write_unlock(&op->volume->servers_lock);
178 set_bit(AFS_VOLUME_NEEDS_UPDATE, &op->volume->flags);
179 error = afs_check_volume_status(op->volume, op);
181 afs_op_set_error(op, error);
185 if (test_bit(AFS_VOLUME_DELETED, &op->volume->flags)) {
186 afs_op_set_error(op, -ENOMEDIUM);
190 /* If the server list didn't change, then assume that
191 * it's the fileserver having trouble.
193 if (rcu_access_pointer(op->volume->servers) == op->server_list) {
194 afs_op_accumulate_error(op, -EREMOTEIO, abort_code);
199 op->flags |= AFS_OPERATION_VNOVOL;
200 _leave(" = t [vnovol]");
205 /* These should not be returned from the fileserver. */
206 pr_warn("Fileserver returned unexpected abort %d\n",
208 afs_op_accumulate_error(op, -EREMOTEIO, abort_code);
212 /* Prior to AFS 3.2 VNOSERVICE was returned from the fileserver
213 * if the volume was neither in-service nor administratively
214 * blessed. All usage was replaced by VNOVOL because AFS 3.1 and
215 * earlier cache managers did not handle VNOSERVICE and assumed
216 * it was the client OS's errno 105.
218 * Starting with OpenAFS 1.4.8 VNOSERVICE was repurposed as the
219 * fileserver idle dead time error which was sent in place of
220 * RX_CALL_TIMEOUT (-3). The error was intended to be sent if the
221 * fileserver took too long to send a reply to the client.
222 * RX_CALL_TIMEOUT would have caused the cache manager to mark the
223 * server down whereas VNOSERVICE since AFS 3.2 would cause cache
224 * manager to temporarily (up to 15 minutes) mark the volume
225 * instance as unusable.
227 * The idle dead logic resulted in cache inconsistency since a
228 * state changing call that the cache manager assumed was dead
229 * could still be processed to completion by the fileserver. This
230 * logic was removed in OpenAFS 1.8.0 and VNOSERVICE is no longer
231 * returned. However, many 1.4.8 through 1.6.24 fileservers are
232 * still in existence.
234 * AuriStorFS fileservers have never returned VNOSERVICE.
236 * VNOSERVICE should be treated as an alias for RX_CALL_TIMEOUT.
238 case RX_CALL_TIMEOUT:
239 afs_op_accumulate_error(op, -ETIMEDOUT, abort_code);
242 case VSALVAGING: /* This error should not be leaked to cache managers
243 * but is from OpenAFS demand attach fileservers.
244 * It should be treated as an alias for VOFFLINE.
246 case VSALVAGE: /* VSALVAGE should be treated as a synonym of VOFFLINE */
248 /* The volume is in use by the volserver or another volume utility
249 * for an operation that might alter the contents. The volume is
250 * expected to come back but it might take a long time (could be
253 if (!test_and_set_bit(AFS_VOLUME_OFFLINE, &op->volume->flags)) {
254 afs_busy(op->volume, abort_code);
255 clear_bit(AFS_VOLUME_BUSY, &op->volume->flags);
257 if (op->flags & AFS_OPERATION_NO_VSLEEP) {
258 afs_op_set_error(op, -EADV);
261 if (op->flags & AFS_OPERATION_CUR_ONLY) {
262 afs_op_set_error(op, -ESTALE);
267 case VRESTARTING: /* The fileserver is either shutting down or starting up. */
269 /* The volume is in use by the volserver or another volume
270 * utility for an operation that is not expected to alter the
271 * contents of the volume. VBUSY does not need to be returned
272 * for a ROVOL or BACKVOL bound to an ITBusy volserver
273 * transaction. The fileserver is permitted to continue serving
274 * content from ROVOLs and BACKVOLs during an ITBusy transaction
275 * because the content will not change. However, many fileserver
276 * releases do return VBUSY for ROVOL and BACKVOL instances under
277 * many circumstances.
279 * Retry after going round all the servers unless we have a file
280 * lock we need to maintain.
282 if (op->flags & AFS_OPERATION_NO_VSLEEP) {
283 afs_op_set_error(op, -EBUSY);
286 if (!test_and_set_bit(AFS_VOLUME_BUSY, &op->volume->flags)) {
287 afs_busy(op->volume, abort_code);
288 clear_bit(AFS_VOLUME_OFFLINE, &op->volume->flags);
291 if (op->flags & AFS_OPERATION_CUR_ONLY) {
292 if (!afs_sleep_and_retry(op))
295 /* Retry with same server & address */
296 _leave(" = t [vbusy]");
300 op->flags |= AFS_OPERATION_VBUSY;
304 /* The volume migrated to another server. We consider
305 * all locks and callbacks broken and request
306 * an update from the VLDB.
308 * We also limit the number of VMOVED hops we will
309 * honour, just in case someone sets up a loop.
311 if (op->flags & AFS_OPERATION_VMOVED) {
312 afs_op_set_error(op, -EREMOTEIO);
315 op->flags |= AFS_OPERATION_VMOVED;
317 set_bit(AFS_VOLUME_WAIT, &op->volume->flags);
318 set_bit(AFS_VOLUME_NEEDS_UPDATE, &op->volume->flags);
319 error = afs_check_volume_status(op->volume, op);
321 afs_op_set_error(op, error);
325 /* If the server list didn't change, then the VLDB is
326 * out of sync with the fileservers. This is hopefully
327 * a temporary condition, however, so we don't want to
328 * permanently block access to the file.
330 * TODO: Try other fileservers if we can.
332 * TODO: Retry a few times with sleeps.
334 if (rcu_access_pointer(op->volume->servers) == op->server_list) {
335 afs_op_accumulate_error(op, -ENOMEDIUM, abort_code);
339 goto restart_from_beginning;
343 afs_op_accumulate_error(op, -EREMOTEIO, abort_code);
344 if (op->volume->type != AFSVL_RWVOL)
350 /* The partition is full. Only applies to RWVOLs.
351 * Translate locally and return ENOSPC.
352 * No replicas to failover to.
354 afs_op_set_error(op, -ENOSPC);
355 goto failed_but_online;
359 /* Volume is full. Only applies to RWVOLs.
360 * Translate locally and return EDQUOT.
361 * No replicas to failover to.
363 afs_op_set_error(op, -EDQUOT);
364 goto failed_but_online;
367 afs_op_accumulate_error(op, error, abort_code);
369 clear_bit(AFS_VOLUME_OFFLINE, &op->volume->flags);
370 clear_bit(AFS_VOLUME_BUSY, &op->volume->flags);
376 if (afs_op_error(op) != -EDESTADDRREQ)
377 goto iterate_address;
386 afs_op_accumulate_error(op, error, 0);
387 goto iterate_address;
390 pr_warn("kAFS: Peer reset %s (op=%x)\n",
391 op->type ? op->type->name : "???", op->debug_id);
394 _debug("call reset");
395 afs_op_set_error(op, error);
399 restart_from_beginning:
401 afs_end_cursor(&op->ac);
403 afs_put_serverlist(op->net, op->server_list);
404 op->server_list = NULL;
407 /* See if we need to do an update of the volume record. Note that the
408 * volume may have moved or even have been deleted.
410 error = afs_check_volume_status(op->volume, op);
412 afs_op_set_error(op, error);
416 if (!afs_start_fs_iteration(op, vnode))
419 _debug("__ VOL %llx __", op->volume->vid);
422 _debug("pick [%lx]", op->untried);
424 error = afs_wait_for_fs_probes(op->server_list, op->untried);
426 afs_op_set_error(op, error);
430 /* Pick the untried server with the lowest RTT. If we have outstanding
431 * callbacks, we stick with the server we're already using if we can.
434 _debug("server %u", op->index);
435 if (test_bit(op->index, &op->untried))
436 goto selected_server;
443 for (i = 0; i < op->server_list->nr_servers; i++) {
444 struct afs_server *s = op->server_list->servers[i].server;
446 if (!test_bit(i, &op->untried) ||
447 !test_bit(AFS_SERVER_FL_RESPONDING, &s->flags))
449 if (s->probe.rtt < rtt) {
456 goto no_more_servers;
459 _debug("use %d", op->index);
460 __clear_bit(op->index, &op->untried);
462 /* We're starting on a different fileserver from the list. We need to
463 * check it, create a callback intercept, find its address list and
464 * probe its capabilities before we use it.
466 ASSERTCMP(op->ac.alist, ==, NULL);
467 server = op->server_list->servers[op->index].server;
469 if (!afs_check_server_record(op, server))
472 _debug("USING SERVER: %pU", &server->uuid);
474 op->flags |= AFS_OPERATION_RETRY_SERVER;
476 if (vnode->cb_server != server) {
477 vnode->cb_server = server;
478 vnode->cb_s_break = server->cb_s_break;
479 vnode->cb_fs_s_break = atomic_read(&server->cell->fs_s_break);
480 vnode->cb_v_break = vnode->volume->cb_v_break;
481 clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
484 read_lock(&server->fs_lock);
485 alist = rcu_dereference_protected(server->addresses,
486 lockdep_is_held(&server->fs_lock));
487 afs_get_addrlist(alist);
488 read_unlock(&server->fs_lock);
491 memset(&op->ac, 0, sizeof(op->ac));
494 op->ac.alist = alist;
496 afs_put_addrlist(alist);
501 ASSERT(op->ac.alist);
502 /* Iterate over the current server's address list to try and find an
503 * address on which it will respond to us.
505 if (!afs_iterate_addresses(&op->ac))
506 goto out_of_addresses;
508 _debug("address [%u] %u/%u %pISp",
509 op->index, op->ac.index, op->ac.alist->nr_addrs,
510 rxrpc_kernel_remote_addr(op->ac.alist->addrs[op->ac.index].peer));
512 op->call_responded = false;
517 /* We've now had a failure to respond on all of a server's addresses -
518 * immediately probe them again and consider retrying the server.
520 afs_probe_fileserver(op->net, op->server);
521 if (op->flags & AFS_OPERATION_RETRY_SERVER) {
522 alist = op->ac.alist;
523 error = afs_wait_for_one_fs_probe(
524 op->server, !(op->flags & AFS_OPERATION_UNINTR));
527 op->flags &= ~AFS_OPERATION_RETRY_SERVER;
530 afs_op_set_error(op, error);
540 afs_end_cursor(&op->ac);
544 /* That's all the servers poked to no good effect. Try again if some
547 if (op->flags & AFS_OPERATION_VBUSY)
548 goto restart_from_beginning;
550 for (i = 0; i < op->server_list->nr_servers; i++) {
551 struct afs_server *s = op->server_list->servers[i].server;
553 error = READ_ONCE(s->probe.error);
555 afs_op_accumulate_error(op, error, s->probe.abort_code);
559 op->flags |= AFS_OPERATION_STOP;
560 afs_end_cursor(&op->ac);
561 _leave(" = f [failed %d]", afs_op_error(op));
566 * Dump cursor state in the case of the error being EDESTADDRREQ.
568 void afs_dump_edestaddrreq(const struct afs_operation *op)
/* NOTE(review): this listing has dropped lines (braces, the declarations of
 * 'count' and 'i', and whatever the test below guards are not visible).  The
 * test itself is true when CONFIG_AFS_DEBUG_CURSOR is disabled or 'count'
 * exceeds 3.
 */
573 if (!IS_ENABLED(CONFIG_AFS_DEBUG_CURSOR) || count > 3)
/* Operation-level state: the callback break counters of file[0] and file[1],
 * the flags, the accumulated error, the untried mask, the current index, the
 * iteration count and the result of the last call.
 */
579 pr_notice("EDESTADDR occurred\n");
580 pr_notice("OP: cbb=%x cbb2=%x fl=%x err=%hd\n",
581 op->file[0].cb_break_before,
582 op->file[1].cb_break_before, op->flags, op->cumul_error.error);
583 pr_notice("OP: ut=%lx ix=%d ni=%u\n",
584 op->untried, op->index, op->nr_iterations);
585 pr_notice("OP: call er=%d ac=%d r=%u\n",
586 op->call_error, op->call_abort_code, op->call_responded);
/* If a server list is attached, print a summary line for it followed by one
 * line per server.
 */
588 if (op->server_list) {
589 const struct afs_server_list *sl = op->server_list;
590 pr_notice("FC: SL nr=%u pr=%u vnov=%hx\n",
591 sl->nr_servers, sl->preferred, sl->vnovol_mask);
592 for (i = 0; i < sl->nr_servers; i++) {
593 const struct afs_server *s = sl->servers[i].server;
594 pr_notice("FC: server fl=%lx av=%u %pU\n",
595 s->flags, s->addr_version, &s->uuid);
/* Per-server address list details; the list that is also the address
 * cursor's current list gets an extra "current" line.
 * NOTE(review): rcu_dereference() is used here, but no RCU read-side
 * lock/unlock is visible in this listing - confirm against the full file.
 */
597 const struct afs_addr_list *a =
598 rcu_dereference(s->addresses);
599 pr_notice("FC: - av=%u nr=%u/%u/%u pr=%u\n",
601 a->nr_ipv4, a->nr_addrs, a->max_addrs,
603 pr_notice("FC: - R=%lx F=%lx\n",
604 a->responded, a->failed);
605 if (a == op->ac.alist)
606 pr_notice("FC: - current\n");
/* Finally, the address cursor's own state. */
611 pr_notice("AC: t=%lx ax=%u ni=%u\n",
612 op->ac.tried, op->ac.index, op->ac.nr_iterations);