GNU Linux-libre 6.7.9-gnu
[releases.git] / fs / afs / rotate.c
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /* Handle fileserver selection and rotation.
3  *
4  * Copyright (C) 2017 Red Hat, Inc. All Rights Reserved.
5  * Written by David Howells (dhowells@redhat.com)
6  */
7
8 #include <linux/kernel.h>
9 #include <linux/slab.h>
10 #include <linux/fs.h>
11 #include <linux/sched.h>
12 #include <linux/delay.h>
13 #include <linux/sched/signal.h>
14 #include "internal.h"
15 #include "afs_fs.h"
16 #include "protocol_uae.h"
17
18 /*
19  * Begin iteration through a server list, starting with the vnode's last used
20  * server if possible, or the last recorded good server if not.
21  */
22 static bool afs_start_fs_iteration(struct afs_operation *op,
23                                    struct afs_vnode *vnode)
24 {
25         struct afs_server *server;
26         void *cb_server;
27         int i;
28
29         read_lock(&op->volume->servers_lock);
30         op->server_list = afs_get_serverlist(
31                 rcu_dereference_protected(op->volume->servers,
32                                           lockdep_is_held(&op->volume->servers_lock)));
33         read_unlock(&op->volume->servers_lock);
34
35         op->untried = (1UL << op->server_list->nr_servers) - 1;
36         op->index = READ_ONCE(op->server_list->preferred);
37
38         cb_server = vnode->cb_server;
39         if (cb_server) {
40                 /* See if the vnode's preferred record is still available */
41                 for (i = 0; i < op->server_list->nr_servers; i++) {
42                         server = op->server_list->servers[i].server;
43                         if (server == cb_server) {
44                                 op->index = i;
45                                 goto found_interest;
46                         }
47                 }
48
49                 /* If we have a lock outstanding on a server that's no longer
50                  * serving this vnode, then we can't switch to another server
51                  * and have to return an error.
52                  */
53                 if (op->flags & AFS_OPERATION_CUR_ONLY) {
54                         afs_op_set_error(op, -ESTALE);
55                         return false;
56                 }
57
58                 /* Note that the callback promise is effectively broken */
59                 write_seqlock(&vnode->cb_lock);
60                 ASSERTCMP(cb_server, ==, vnode->cb_server);
61                 vnode->cb_server = NULL;
62                 if (test_and_clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags))
63                         vnode->cb_break++;
64                 write_sequnlock(&vnode->cb_lock);
65         }
66
67 found_interest:
68         return true;
69 }
70
71 /*
72  * Post volume busy note.
73  */
74 static void afs_busy(struct afs_volume *volume, u32 abort_code)
75 {
76         const char *m;
77
78         switch (abort_code) {
79         case VOFFLINE:          m = "offline";          break;
80         case VRESTARTING:       m = "restarting";       break;
81         case VSALVAGING:        m = "being salvaged";   break;
82         default:                m = "busy";             break;
83         }
84
85         pr_notice("kAFS: Volume %llu '%s' is %s\n", volume->vid, volume->name, m);
86 }
87
88 /*
89  * Sleep and retry the operation to the same fileserver.
90  */
91 static bool afs_sleep_and_retry(struct afs_operation *op)
92 {
93         if (!(op->flags & AFS_OPERATION_UNINTR)) {
94                 msleep_interruptible(1000);
95                 if (signal_pending(current)) {
96                         afs_op_set_error(op, -ERESTARTSYS);
97                         return false;
98                 }
99         } else {
100                 msleep(1000);
101         }
102
103         return true;
104 }
105
106 /*
107  * Select the fileserver to use.  May be called multiple times to rotate
108  * through the fileservers.
     *
     * Unless this is the first pass (op->nr_iterations is zero after the
     * increment below), each call starts by evaluating op->call_error and
     * op->call_abort_code as left by the previous attempt.  Depending on the
     * result it then retries the same server and address, advances to the next
     * address of the current server, rotates to another server in
     * op->server_list, restarts with a freshly fetched server list, or gives up.
     *
     * Returns true when a server and address have been chosen (or the previous
     * ones are to be retried); the server is in op->server and the address
     * cursor in op->ac.  Returns false once the operation has completed or
     * failed; AFS_OPERATION_STOP is then set in op->flags, so any later call
     * returns false immediately.
109  */
110 bool afs_select_fileserver(struct afs_operation *op)
111 {
112         struct afs_addr_list *alist;
113         struct afs_server *server;
114         struct afs_vnode *vnode = op->file[0].vnode;
115         unsigned int rtt;
116         s32 abort_code = op->call_abort_code;
117         int error = op->call_error, i;
118
            /* NOTE(review): the "== 0" test further down can only be true on
             * the first call if the counter starts out at -1; presumably the
             * code that sets up the operation does that - confirm.
             */
119         op->nr_iterations++;
120
121         _enter("OP=%x+%x,%llx,%lx[%d],%lx[%d],%d,%d",
122                op->debug_id, op->nr_iterations, op->volume->vid,
123                op->untried, op->index,
124                op->ac.tried, op->ac.index,
125                error, abort_code);
126
127         if (op->flags & AFS_OPERATION_STOP) {
128                 _leave(" = f [stopped]");
129                 return false;
130         }
131
            /* First pass: there is no previous call result to evaluate. */
132         if (op->nr_iterations == 0)
133                 goto start;
134
135         /* Evaluate the result of the previous operation, if there was one. */
136         switch (op->call_error) {
137         case 0:
138                 op->cumul_error.responded = true;
139                 fallthrough;
140         default:
141                 /* Success or local failure.  Stop. */
142                 afs_op_set_error(op, error);
143                 op->flags |= AFS_OPERATION_STOP;
144                 _leave(" = f [okay/local %d]", error);
145                 return false;
146
147         case -ECONNABORTED:
148                 /* The far side rejected the operation on some grounds.  This
149                  * might involve the server being busy or the volume having been moved.
150                  *
151                  * Note that various V* errors should not be sent to a cache manager
152                  * by a fileserver as they should be translated to more modern UAE*
153                  * errors instead.  IBM AFS and OpenAFS fileservers, however, do leak
154                  * these abort codes.
155                  */
156                 op->cumul_error.responded = true;
157                 switch (abort_code) {
158                 case VNOVOL:
159                         /* This fileserver doesn't know about the volume.
160                          * - May indicate that the VL is wrong - retry once and compare
161                          *   the results.
162                          * - May indicate that the fileserver couldn't attach to the vol.
163                          * - The volume might have been temporarily removed so that it can
164                          *   be replaced by a volume restore.  "vos" might have ended one
165                          *   transaction and has yet to create the next.
166                          * - The volume might not be blessed or might not be in-service
167                          *   (administrative action).
168                          */
                            /* AFS_OPERATION_VNOVOL is set below once we have
                             * already retried because of VNOVOL, so a second
                             * occurrence just moves on to the next server.
                             */
169                         if (op->flags & AFS_OPERATION_VNOVOL) {
170                                 afs_op_accumulate_error(op, -EREMOTEIO, abort_code);
171                                 goto next_server;
172                         }
173
174                         write_lock(&op->volume->servers_lock);
175                         op->server_list->vnovol_mask |= 1 << op->index;
176                         write_unlock(&op->volume->servers_lock);
177
178                         set_bit(AFS_VOLUME_NEEDS_UPDATE, &op->volume->flags);
179                         error = afs_check_volume_status(op->volume, op);
180                         if (error < 0) {
181                                 afs_op_set_error(op, error);
182                                 goto failed;
183                         }
184
185                         if (test_bit(AFS_VOLUME_DELETED, &op->volume->flags)) {
186                                 afs_op_set_error(op, -ENOMEDIUM);
187                                 goto failed;
188                         }
189
190                         /* If the server list didn't change, then assume that
191                          * it's the fileserver having trouble.
192                          */
193                         if (rcu_access_pointer(op->volume->servers) == op->server_list) {
194                                 afs_op_accumulate_error(op, -EREMOTEIO, abort_code);
195                                 goto next_server;
196                         }
197
198                         /* Try again */
199                         op->flags |= AFS_OPERATION_VNOVOL;
200                         _leave(" = t [vnovol]");
201                         return true;
202
203                 case VVOLEXISTS:
204                 case VONLINE:
205                         /* These should not be returned from the fileserver. */
206                         pr_warn("Fileserver returned unexpected abort %d\n",
207                                 abort_code);
208                         afs_op_accumulate_error(op, -EREMOTEIO, abort_code);
209                         goto next_server;
210
211                 case VNOSERVICE:
212                         /* Prior to AFS 3.2 VNOSERVICE was returned from the fileserver
213                          * if the volume was neither in-service nor administratively
214                          * blessed.  All usage was replaced by VNOVOL because AFS 3.1 and
215                          * earlier cache managers did not handle VNOSERVICE and assumed
216                          * it was the client OSes errno 105.
217                          *
218                          * Starting with OpenAFS 1.4.8 VNOSERVICE was repurposed as the
219                          * fileserver idle dead time error which was sent in place of
220                          * RX_CALL_TIMEOUT (-3).  The error was intended to be sent if the
221                          * fileserver took too long to send a reply to the client.
222                          * RX_CALL_TIMEOUT would have caused the cache manager to mark the
223                          * server down whereas VNOSERVICE since AFS 3.2 would cause cache
224                          * manager to temporarily (up to 15 minutes) mark the volume
225                          * instance as unusable.
226                          *
227                          * The idle dead logic resulted in cache inconsistency since a
228                          * state changing call that the cache manager assumed was dead
229                          * could still be processed to completion by the fileserver.  This
230                          * logic was removed in OpenAFS 1.8.0 and VNOSERVICE is no longer
231                          * returned.  However, many 1.4.8 through 1.6.24 fileservers are
232                          * still in existence.
233                          *
234                          * AuriStorFS fileservers have never returned VNOSERVICE.
235                          *
236                          * VNOSERVICE should be treated as an alias for RX_CALL_TIMEOUT.
237                          */
238                 case RX_CALL_TIMEOUT:
239                         afs_op_accumulate_error(op, -ETIMEDOUT, abort_code);
240                         goto next_server;
241
242                 case VSALVAGING: /* This error should not be leaked to cache managers
243                                   * but is from OpenAFS demand attach fileservers.
244                                   * It should be treated as an alias for VOFFLINE.
245                                   */
246                 case VSALVAGE: /* VSALVAGE should be treated as a synonym of VOFFLINE */
247                 case VOFFLINE:
248                         /* The volume is in use by the volserver or another volume utility
249                          * for an operation that might alter the contents.  The volume is
250                          * expected to come back but it might take a long time (could be
251                          * days).
252                          */
                            /* Only log on the transition into the offline state. */
253                         if (!test_and_set_bit(AFS_VOLUME_OFFLINE, &op->volume->flags)) {
254                                 afs_busy(op->volume, abort_code);
255                                 clear_bit(AFS_VOLUME_BUSY, &op->volume->flags);
256                         }
257                         if (op->flags & AFS_OPERATION_NO_VSLEEP) {
258                                 afs_op_set_error(op, -EADV);
259                                 goto failed;
260                         }
261                         if (op->flags & AFS_OPERATION_CUR_ONLY) {
262                                 afs_op_set_error(op, -ESTALE);
263                                 goto failed;
264                         }
265                         goto busy;
266
267                 case VRESTARTING: /* The fileserver is either shutting down or starting up. */
268                 case VBUSY:
269                         /* The volume is in use by the volserver or another volume
270                          * utility for an operation that is not expected to alter the
271                          * contents of the volume.  VBUSY does not need to be returned
272                          * for a ROVOL or BACKVOL bound to an ITBusy volserver
273                          * transaction.  The fileserver is permitted to continue serving
274                          * content from ROVOLs and BACKVOLs during an ITBusy transaction
275                          * because the content will not change.  However, many fileserver
276                          * releases do return VBUSY for ROVOL and BACKVOL instances under
277                          * many circumstances.
278                          *
279                          * Retry after going round all the servers unless we have a file
280                          * lock we need to maintain.
281                          */
282                         if (op->flags & AFS_OPERATION_NO_VSLEEP) {
283                                 afs_op_set_error(op, -EBUSY);
284                                 goto failed;
285                         }
                            /* Only log on the transition into the busy state. */
286                         if (!test_and_set_bit(AFS_VOLUME_BUSY, &op->volume->flags)) {
287                                 afs_busy(op->volume, abort_code);
288                                 clear_bit(AFS_VOLUME_OFFLINE, &op->volume->flags);
289                         }
290                 busy:
291                         if (op->flags & AFS_OPERATION_CUR_ONLY) {
292                                 if (!afs_sleep_and_retry(op))
293                                         goto failed;
294
295                                 /* Retry with same server & address */
296                                 _leave(" = t [vbusy]");
297                                 return true;
298                         }
299
                            /* Remember that a server was busy so that
                             * no_more_servers restarts rather than fails.
                             */
300                         op->flags |= AFS_OPERATION_VBUSY;
301                         goto next_server;
302
303                 case VMOVED:
304                         /* The volume migrated to another server.  We
305                          * consider all locks and callbacks broken and request
306                          * an update from the VLDB.
307                          *
308                          * We also limit the number of VMOVED hops we will
309                          * honour, just in case someone sets up a loop.
310                          */
311                         if (op->flags & AFS_OPERATION_VMOVED) {
312                                 afs_op_set_error(op, -EREMOTEIO);
313                                 goto failed;
314                         }
315                         op->flags |= AFS_OPERATION_VMOVED;
316
317                         set_bit(AFS_VOLUME_WAIT, &op->volume->flags);
318                         set_bit(AFS_VOLUME_NEEDS_UPDATE, &op->volume->flags);
319                         error = afs_check_volume_status(op->volume, op);
320                         if (error < 0) {
321                                 afs_op_set_error(op, error);
322                                 goto failed;
323                         }
324
325                         /* If the server list didn't change, then the VLDB is
326                          * out of sync with the fileservers.  This is hopefully
327                          * a temporary condition, however, so we don't want to
328                          * permanently block access to the file.
329                          *
330                          * TODO: Try other fileservers if we can.
331                          *
332                          * TODO: Retry a few times with sleeps.
333                          */
334                         if (rcu_access_pointer(op->volume->servers) == op->server_list) {
335                                 afs_op_accumulate_error(op, -ENOMEDIUM, abort_code);
336                                 goto failed;
337                         }
338
339                         goto restart_from_beginning;
340
341                 case UAEIO:
342                 case VIO:
                            /* Only a volume that isn't of type AFSVL_RWVOL is
                             * retried on another server.
                             */
343                         afs_op_accumulate_error(op, -EREMOTEIO, abort_code);
344                         if (op->volume->type != AFSVL_RWVOL)
345                                 goto next_server;
346                         goto failed;
347
348                 case VDISKFULL:
349                 case UAENOSPC:
350                         /* The partition is full.  Only applies to RWVOLs.
351                          * Translate locally and return ENOSPC.
352                          * No replicas to failover to.
353                          */
354                         afs_op_set_error(op, -ENOSPC);
355                         goto failed_but_online;
356
357                 case VOVERQUOTA:
358                 case UAEDQUOT:
359                         /* Volume is full.  Only applies to RWVOLs.
360                          * Translate locally and return EDQUOT.
361                          * No replicas to failover to.
362                          */
363                         afs_op_set_error(op, -EDQUOT);
364                         goto failed_but_online;
365
366                 default:
367                         afs_op_accumulate_error(op, error, abort_code);
368                 failed_but_online:
                            /* The server answered for the volume, so drop any
                             * stale offline/busy marking before failing.
                             */
369                         clear_bit(AFS_VOLUME_OFFLINE, &op->volume->flags);
370                         clear_bit(AFS_VOLUME_BUSY, &op->volume->flags);
371                         goto failed;
372                 }
                    /* Every arm of the abort-code switch above leaves via goto
                     * or return, so control never falls through to the cases
                     * below.
                     */
373
374         case -ETIMEDOUT:
375         case -ETIME:
                    /* Only record the timeout if the accumulated error is
                     * still -EDESTADDRREQ; otherwise just try the next address.
                     */
376                 if (afs_op_error(op) != -EDESTADDRREQ)
377                         goto iterate_address;
378                 fallthrough;
379         case -ERFKILL:
380         case -EADDRNOTAVAIL:
381         case -ENETUNREACH:
382         case -EHOSTUNREACH:
383         case -EHOSTDOWN:
384         case -ECONNREFUSED:
385                 _debug("no conn");
386                 afs_op_accumulate_error(op, error, 0);
387                 goto iterate_address;
388
389         case -ENETRESET:
390                 pr_warn("kAFS: Peer reset %s (op=%x)\n",
391                         op->type ? op->type->name : "???", op->debug_id);
392                 fallthrough;
393         case -ECONNRESET:
394                 _debug("call reset");
395                 afs_op_set_error(op, error);
396                 goto failed;
397         }
398
    /* Drop the address cursor and our ref on the server list so that the
     * start path below picks up the volume's current list.
     */
399 restart_from_beginning:
400         _debug("restart");
401         afs_end_cursor(&op->ac);
402         op->server = NULL;
403         afs_put_serverlist(op->net, op->server_list);
404         op->server_list = NULL;
405 start:
406         _debug("start");
407         /* See if we need to do an update of the volume record.  Note that the
408          * volume may have moved or even have been deleted.
409          */
410         error = afs_check_volume_status(op->volume, op);
411         if (error < 0) {
412                 afs_op_set_error(op, error);
413                 goto failed;
414         }
415
416         if (!afs_start_fs_iteration(op, vnode))
417                 goto failed;
418
419         _debug("__ VOL %llx __", op->volume->vid);
420
421 pick_server:
422         _debug("pick [%lx]", op->untried);
423
424         error = afs_wait_for_fs_probes(op->server_list, op->untried);
425         if (error < 0) {
426                 afs_op_set_error(op, error);
427                 goto failed;
428         }
429
430         /* Pick the untried server with the lowest RTT.  If we have outstanding
431          * callbacks, we stick with the server we're already using if we can.
432          */
433         if (op->server) {
434                 _debug("server %u", op->index);
435                 if (test_bit(op->index, &op->untried))
436                         goto selected_server;
437                 op->server = NULL;
438                 _debug("no server");
439         }
440
            /* Scan the untried servers flagged as responding for the lowest
             * probe RTT; op->index stays at -1 if none qualifies.
             */
441         op->index = -1;
442         rtt = UINT_MAX;
443         for (i = 0; i < op->server_list->nr_servers; i++) {
444                 struct afs_server *s = op->server_list->servers[i].server;
445
446                 if (!test_bit(i, &op->untried) ||
447                     !test_bit(AFS_SERVER_FL_RESPONDING, &s->flags))
448                         continue;
449                 if (s->probe.rtt < rtt) {
450                         op->index = i;
451                         rtt = s->probe.rtt;
452                 }
453         }
454
455         if (op->index == -1)
456                 goto no_more_servers;
457
458 selected_server:
459         _debug("use %d", op->index);
460         __clear_bit(op->index, &op->untried);
461
462         /* We're starting on a different fileserver from the list.  We need to
463          * check it, create a callback intercept, find its address list and
464          * probe its capabilities before we use it.
465          */
466         ASSERTCMP(op->ac.alist, ==, NULL);
467         server = op->server_list->servers[op->index].server;
468
469         if (!afs_check_server_record(op, server))
470                 goto failed;
471
472         _debug("USING SERVER: %pU", &server->uuid);
473
            /* Allow one re-probe-and-retry of this server if all of its
             * addresses fail (see out_of_addresses).
             */
474         op->flags |= AFS_OPERATION_RETRY_SERVER;
475         op->server = server;
            /* Switching callback server: snapshot the break counters and
             * forget any callback promise held from the previous server.
             */
476         if (vnode->cb_server != server) {
477                 vnode->cb_server = server;
478                 vnode->cb_s_break = server->cb_s_break;
479                 vnode->cb_fs_s_break = atomic_read(&server->cell->fs_s_break);
480                 vnode->cb_v_break = vnode->volume->cb_v_break;
481                 clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
482         }
483
484         read_lock(&server->fs_lock);
485         alist = rcu_dereference_protected(server->addresses,
486                                           lockdep_is_held(&server->fs_lock));
487         afs_get_addrlist(alist);
488         read_unlock(&server->fs_lock);
489
    /* Entered either by falling through with a freshly referenced alist or
     * from out_of_addresses with alist saved from the old cursor.
     */
490 retry_server:
491         memset(&op->ac, 0, sizeof(op->ac));
492
            /* NOTE(review): op->ac was zeroed just above, so op->ac.alist is
             * always NULL here and the else branch cannot be taken.
             */
493         if (!op->ac.alist)
494                 op->ac.alist = alist;
495         else
496                 afs_put_addrlist(alist);
497
498         op->ac.index = -1;
499
500 iterate_address:
501         ASSERT(op->ac.alist);
502         /* Iterate over the current server's address list to try and find an
503          * address on which it will respond to us.
504          */
505         if (!afs_iterate_addresses(&op->ac))
506                 goto out_of_addresses;
507
508         _debug("address [%u] %u/%u %pISp",
509                op->index, op->ac.index, op->ac.alist->nr_addrs,
510                rxrpc_kernel_remote_addr(op->ac.alist->addrs[op->ac.index].peer));
511
512         op->call_responded = false;
513         _leave(" = t");
514         return true;
515
516 out_of_addresses:
517         /* We've now had a failure to respond on all of a server's addresses -
518          * immediately probe them again and consider retrying the server.
519          */
520         afs_probe_fileserver(op->net, op->server);
521         if (op->flags & AFS_OPERATION_RETRY_SERVER) {
                    /* Save the list pointer: retry_server zeroes op->ac and
                     * then reinstalls alist.
                     */
522                 alist = op->ac.alist;
523                 error = afs_wait_for_one_fs_probe(
524                         op->server, !(op->flags & AFS_OPERATION_UNINTR));
                    /* There is no default case: any other result drops out of
                     * the switch and continues at next_server.
                     */
525                 switch (error) {
526                 case 0:
527                         op->flags &= ~AFS_OPERATION_RETRY_SERVER;
528                         goto retry_server;
529                 case -ERESTARTSYS:
530                         afs_op_set_error(op, error);
531                         goto failed;
532                 case -ETIME:
533                 case -EDESTADDRREQ:
534                         goto next_server;
535                 }
536         }
537
538 next_server:
539         _debug("next");
540         afs_end_cursor(&op->ac);
541         goto pick_server;
542
543 no_more_servers:
544         /* That's all the servers poked to no good effect.  Try again if some
545          * of them were busy.
546          */
            /* NOTE(review): AFS_OPERATION_VBUSY is never cleared in this
             * function and there is no sleep on this path, so once set every
             * exhaustion of the server list restarts straight away - confirm
             * that this is intended.
             */
547         if (op->flags & AFS_OPERATION_VBUSY)
548                 goto restart_from_beginning;
549
            /* Fold each server's recorded probe failure into the result. */
550         for (i = 0; i < op->server_list->nr_servers; i++) {
551                 struct afs_server *s = op->server_list->servers[i].server;
552
553                 error = READ_ONCE(s->probe.error);
554                 if (error < 0)
555                         afs_op_accumulate_error(op, error, s->probe.abort_code);
556         }
557
558 failed:
559         op->flags |= AFS_OPERATION_STOP;
560         afs_end_cursor(&op->ac);
561         _leave(" = f [failed %d]", afs_op_error(op));
562         return false;
563 }
564
565 /*
566  * Dump cursor state in the case of the error being EDESTADDRREQ.
567  */
568 void afs_dump_edestaddrreq(const struct afs_operation *op)
569 {
570         static int count;
571         int i;
572
573         if (!IS_ENABLED(CONFIG_AFS_DEBUG_CURSOR) || count > 3)
574                 return;
575         count++;
576
577         rcu_read_lock();
578
579         pr_notice("EDESTADDR occurred\n");
580         pr_notice("OP: cbb=%x cbb2=%x fl=%x err=%hd\n",
581                   op->file[0].cb_break_before,
582                   op->file[1].cb_break_before, op->flags, op->cumul_error.error);
583         pr_notice("OP: ut=%lx ix=%d ni=%u\n",
584                   op->untried, op->index, op->nr_iterations);
585         pr_notice("OP: call  er=%d ac=%d r=%u\n",
586                   op->call_error, op->call_abort_code, op->call_responded);
587
588         if (op->server_list) {
589                 const struct afs_server_list *sl = op->server_list;
590                 pr_notice("FC: SL nr=%u pr=%u vnov=%hx\n",
591                           sl->nr_servers, sl->preferred, sl->vnovol_mask);
592                 for (i = 0; i < sl->nr_servers; i++) {
593                         const struct afs_server *s = sl->servers[i].server;
594                         pr_notice("FC: server fl=%lx av=%u %pU\n",
595                                   s->flags, s->addr_version, &s->uuid);
596                         if (s->addresses) {
597                                 const struct afs_addr_list *a =
598                                         rcu_dereference(s->addresses);
599                                 pr_notice("FC:  - av=%u nr=%u/%u/%u pr=%u\n",
600                                           a->version,
601                                           a->nr_ipv4, a->nr_addrs, a->max_addrs,
602                                           a->preferred);
603                                 pr_notice("FC:  - R=%lx F=%lx\n",
604                                           a->responded, a->failed);
605                                 if (a == op->ac.alist)
606                                         pr_notice("FC:  - current\n");
607                         }
608                 }
609         }
610
611         pr_notice("AC: t=%lx ax=%u ni=%u\n",
612                   op->ac.tried, op->ac.index, op->ac.nr_iterations);
613         rcu_read_unlock();
614 }