fs/afs/rotate.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /* Handle fileserver selection and rotation.
   3  *
   4  * Copyright (C) 2017 Red Hat, Inc. All Rights Reserved.
   5  * Written by David Howells (dhowells@redhat.com)
   6  */
   7
   8 #include <linux/kernel.h>
   9 #include <linux/slab.h>
  10 #include <linux/fs.h>
  11 #include <linux/sched.h>
  12 #include <linux/delay.h>
  13 #include <linux/sched/signal.h>
  14 #include "internal.h"
  15 #include "afs_fs.h"
  16 #include "protocol_uae.h"
  17
  18 /*
  19  * Begin iteration through a server list, starting with the vnode's last used
  20  * server if possible, or the last recorded good server if not.
  21  */
  22 static bool afs_start_fs_iteration(struct afs_operation *op,
  23                                    struct afs_vnode *vnode)
  24 {
  25         struct afs_server *server;
  26         void *cb_server;
  27         int i;
  28
  29         read_lock(&op->volume->servers_lock);
  30         op->server_list = afs_get_serverlist(
  31                 rcu_dereference_protected(op->volume->servers,
  32                                           lockdep_is_held(&op->volume->servers_lock)));
  33         read_unlock(&op->volume->servers_lock);
  34
  35         op->untried = (1UL << op->server_list->nr_servers) - 1;
  36         op->index = READ_ONCE(op->server_list->preferred);
  37
  38         cb_server = vnode->cb_server;
  39         if (cb_server) {
  40                 /* See if the vnode's preferred record is still available */
  41                 for (i = 0; i < op->server_list->nr_servers; i++) {
  42                         server = op->server_list->servers[i].server;
  43                         if (server == cb_server) {
  44                                 op->index = i;
  45                                 goto found_interest;
  46                         }
  47                 }
  48
  49                 /* If we have a lock outstanding on a server that's no longer
  50                  * serving this vnode, then we can't switch to another server
  51                  * and have to return an error.
  52                  */
  53                 if (op->flags & AFS_OPERATION_CUR_ONLY) {
  54                         afs_op_set_error(op, -ESTALE);
  55                         return false;
  56                 }
  57
  58                 /* Note that the callback promise is effectively broken */
  59                 write_seqlock(&vnode->cb_lock);
  60                 ASSERTCMP(cb_server, ==, vnode->cb_server);
  61                 vnode->cb_server = NULL;
  62                 if (test_and_clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags))
  63                         vnode->cb_break++;
  64                 write_sequnlock(&vnode->cb_lock);
  65         }
  66
  67 found_interest:
  68         return true;
  69 }
  70
  71 /*
  72  * Post volume busy note.
  73  */
  74 static void afs_busy(struct afs_volume *volume, u32 abort_code)
  75 {
  76         const char *m;
  77
  78         switch (abort_code) {
  79         case VOFFLINE:          m = "offline";          break;
  80         case VRESTARTING:       m = "restarting";       break;
  81         case VSALVAGING:        m = "being salvaged";   break;
  82         default:                m = "busy";             break;
  83         }
  84
  85         pr_notice("kAFS: Volume %llu '%s' is %s\n", volume->vid, volume->name, m);
  86 }
  87
  88 /*
  89  * Sleep and retry the operation to the same fileserver.
  90  */
  91 static bool afs_sleep_and_retry(struct afs_operation *op)
  92 {
  93         if (!(op->flags & AFS_OPERATION_UNINTR)) {
  94                 msleep_interruptible(1000);
  95                 if (signal_pending(current)) {
  96                         afs_op_set_error(op, -ERESTARTSYS);
  97                         return false;
  98                 }
  99         } else {
 100                 msleep(1000);
 101         }
 102
 103         return true;
 104 }
 105
 106 /*
 107  * Select the fileserver to use.  May be called multiple times to rotate
 108  * through the fileservers.
 109  */
 110 bool afs_select_fileserver(struct afs_operation *op)
 111 {
 112         struct afs_addr_list *alist;
 113         struct afs_server *server;
 114         struct afs_vnode *vnode = op->file[0].vnode;
 115         unsigned int rtt;
 116         s32 abort_code = op->call_abort_code;
 117         int error = op->call_error, i;
 118
 119         op->nr_iterations++;
 120
 121         _enter("OP=%x+%x,%llx,%lx[%d],%lx[%d],%d,%d",
 122                op->debug_id, op->nr_iterations, op->volume->vid,
 123                op->untried, op->index,
 124                op->ac.tried, op->ac.index,
 125                error, abort_code);
 126
 127         if (op->flags & AFS_OPERATION_STOP) {
 128                 _leave(" = f [stopped]");
 129                 return false;
 130         }
 131
 132         if (op->nr_iterations == 0)
 133                 goto start;
 134
 135         /* Evaluate the result of the previous operation, if there was one. */
 136         switch (op->call_error) {
 137         case 0:
 138                 op->cumul_error.responded = true;
 139                 fallthrough;
 140         default:
 141                 /* Success or local failure.  Stop. */
 142                 afs_op_set_error(op, error);
 143                 op->flags |= AFS_OPERATION_STOP;
 144                 _leave(" = f [okay/local %d]", error);
 145                 return false;
 146
 147         case -ECONNABORTED:
 148                 /* The far side rejected the operation on some grounds.  This
 149                  * might involve the server being busy or the volume having been moved.
 150                  *
 151                  * Note that various V* errors should not be sent to a cache manager
 152                  * by a fileserver as they should be translated to more modern UAE*
 153                  * errors instead.  IBM AFS and OpenAFS fileservers, however, do leak
 154                  * these abort codes.
 155                  */
 156                 op->cumul_error.responded = true;
 157                 switch (abort_code) {
 158                 case VNOVOL:
 159                         /* This fileserver doesn't know about the volume.
 160                          * - May indicate that the VL is wrong - retry once and compare
 161                          *   the results.
 162                          * - May indicate that the fileserver couldn't attach to the vol.
 163                          * - The volume might have been temporarily removed so that it can
 164                          *   be replaced by a volume restore.  "vos" might have ended one
 165                          *   transaction and has yet to create the next.
 166                          * - The volume might not be blessed or might not be in-service
 167                          *   (administrative action).
 168                          */
 169                         if (op->flags & AFS_OPERATION_VNOVOL) {
 170                                 afs_op_accumulate_error(op, -EREMOTEIO, abort_code);
 171                                 goto next_server;
 172                         }
 173
 174                         write_lock(&op->volume->servers_lock);
 175                         op->server_list->vnovol_mask |= 1 << op->index;
 176                         write_unlock(&op->volume->servers_lock);
 177
 178                         set_bit(AFS_VOLUME_NEEDS_UPDATE, &op->volume->flags);
 179                         error = afs_check_volume_status(op->volume, op);
 180                         if (error < 0) {
 181                                 afs_op_set_error(op, error);
 182                                 goto failed;
 183                         }
 184
 185                         if (test_bit(AFS_VOLUME_DELETED, &op->volume->flags)) {
 186                                 afs_op_set_error(op, -ENOMEDIUM);
 187                                 goto failed;
 188                         }
 189
 190                         /* If the server list didn't change, then assume that
 191                          * it's the fileserver having trouble.
 192                          */
 193                         if (rcu_access_pointer(op->volume->servers) == op->server_list) {
 194                                 afs_op_accumulate_error(op, -EREMOTEIO, abort_code);
 195                                 goto next_server;
 196                         }
 197
 198                         /* Try again */
 199                         op->flags |= AFS_OPERATION_VNOVOL;
 200                         _leave(" = t [vnovol]");
 201                         return true;
 202
 203                 case VVOLEXISTS:
 204                 case VONLINE:
 205                         /* These should not be returned from the fileserver. */
 206                         pr_warn("Fileserver returned unexpected abort %d\n",
 207                                 abort_code);
 208                         afs_op_accumulate_error(op, -EREMOTEIO, abort_code);
 209                         goto next_server;
 210
 211                 case VNOSERVICE:
 212                         /* Prior to AFS 3.2 VNOSERVICE was returned from the fileserver
 213                          * if the volume was neither in-service nor administratively
 214                          * blessed.  All usage was replaced by VNOVOL because AFS 3.1 and
 215                          * earlier cache managers did not handle VNOSERVICE and assumed
 216                          * it was the client OSes errno 105.
 217                          *
 218                          * Starting with OpenAFS 1.4.8 VNOSERVICE was repurposed as the
 219                          * fileserver idle dead time error which was sent in place of
 220                          * RX_CALL_TIMEOUT (-3).  The error was intended to be sent if the
 221                          * fileserver took too long to send a reply to the client.
 222                          * RX_CALL_TIMEOUT would have caused the cache manager to mark the
 223                          * server down whereas VNOSERVICE since AFS 3.2 would cause cache
 224                          * manager to temporarily (up to 15 minutes) mark the volume
 225                          * instance as unusable.
 226                          *
 227                          * The idle dead logic resulted in cache inconsistency since a
 228                          * state changing call that the cache manager assumed was dead
 229                          * could still be processed to completion by the fileserver.  This
 230                          * logic was removed in OpenAFS 1.8.0 and VNOSERVICE is no longer
 231                          * returned.  However, many 1.4.8 through 1.6.24 fileservers are
 232                          * still in existence.
 233                          *
 234                          * AuriStorFS fileservers have never returned VNOSERVICE.
 235                          *
 236                          * VNOSERVICE should be treated as an alias for RX_CALL_TIMEOUT.
 237                          */
 238                 case RX_CALL_TIMEOUT:
 239                         afs_op_accumulate_error(op, -ETIMEDOUT, abort_code);
 240                         goto next_server;
 241
 242                 case VSALVAGING: /* This error should not be leaked to cache managers
 243                                   * but is from OpenAFS demand attach fileservers.
 244                                   * It should be treated as an alias for VOFFLINE.
 245                                   */
 246                 case VSALVAGE: /* VSALVAGE should be treated as a synonym of VOFFLINE */
 247                 case VOFFLINE:
 248                         /* The volume is in use by the volserver or another volume utility
 249                          * for an operation that might alter the contents.  The volume is
 250                          * expected to come back but it might take a long time (could be
 251                          * days).
 252                          */
 253                         if (!test_and_set_bit(AFS_VOLUME_OFFLINE, &op->volume->flags)) {
 254                                 afs_busy(op->volume, abort_code);
 255                                 clear_bit(AFS_VOLUME_BUSY, &op->volume->flags);
 256                         }
 257                         if (op->flags & AFS_OPERATION_NO_VSLEEP) {
 258                                 afs_op_set_error(op, -EADV);
 259                                 goto failed;
 260                         }
 261                         if (op->flags & AFS_OPERATION_CUR_ONLY) {
 262                                 afs_op_set_error(op, -ESTALE);
 263                                 goto failed;
 264                         }
 265                         goto busy;
 266
 267                 case VRESTARTING: /* The fileserver is either shutting down or starting up. */
 268                 case VBUSY:
 269                         /* The volume is in use by the volserver or another volume
 270                          * utility for an operation that is not expected to alter the
 271                          * contents of the volume.  VBUSY does not need to be returned
 272                          * for a ROVOL or BACKVOL bound to an ITBusy volserver
 273                          * transaction.  The fileserver is permitted to continue serving
 274                          * content from ROVOLs and BACKVOLs during an ITBusy transaction
 275                          * because the content will not change.  However, many fileserver
 276                          * releases do return VBUSY for ROVOL and BACKVOL instances under
 277                          * many circumstances.
 278                          *
 279                          * Retry after going round all the servers unless we have a file
 280                          * lock we need to maintain.
 281                          */
 282                         if (op->flags & AFS_OPERATION_NO_VSLEEP) {
 283                                 afs_op_set_error(op, -EBUSY);
 284                                 goto failed;
 285                         }
 286                         if (!test_and_set_bit(AFS_VOLUME_BUSY, &op->volume->flags)) {
 287                                 afs_busy(op->volume, abort_code);
 288                                 clear_bit(AFS_VOLUME_OFFLINE, &op->volume->flags);
 289                         }
 290                 busy:
 291                         if (op->flags & AFS_OPERATION_CUR_ONLY) {
 292                                 if (!afs_sleep_and_retry(op))
 293                                         goto failed;
 294
 295                                 /* Retry with same server & address */
 296                                 _leave(" = t [vbusy]");
 297                                 return true;
 298                         }
 299
 300                         op->flags |= AFS_OPERATION_VBUSY;
 301                         goto next_server;
 302
 303                 case VMOVED:
 304                         /* The volume migrated to another server.  We consider
 305                          * consider all locks and callbacks broken and request
 306                          * an update from the VLDB.
 307                          *
 308                          * We also limit the number of VMOVED hops we will
 309                          * honour, just in case someone sets up a loop.
 310                          */
 311                         if (op->flags & AFS_OPERATION_VMOVED) {
 312                                 afs_op_set_error(op, -EREMOTEIO);
 313                                 goto failed;
 314                         }
 315                         op->flags |= AFS_OPERATION_VMOVED;
 316
 317                         set_bit(AFS_VOLUME_WAIT, &op->volume->flags);
 318                         set_bit(AFS_VOLUME_NEEDS_UPDATE, &op->volume->flags);
 319                         error = afs_check_volume_status(op->volume, op);
 320                         if (error < 0) {
 321                                 afs_op_set_error(op, error);
 322                                 goto failed;
 323                         }
 324
 325                         /* If the server list didn't change, then the VLDB is
 326                          * out of sync with the fileservers.  This is hopefully
 327                          * a temporary condition, however, so we don't want to
 328                          * permanently block access to the file.
 329                          *
 330                          * TODO: Try other fileservers if we can.
 331                          *
 332                          * TODO: Retry a few times with sleeps.
 333                          */
 334                         if (rcu_access_pointer(op->volume->servers) == op->server_list) {
 335                                 afs_op_accumulate_error(op, -ENOMEDIUM, abort_code);
 336                                 goto failed;
 337                         }
 338
 339                         goto restart_from_beginning;
 340
 341                 case UAEIO:
 342                 case VIO:
 343                         afs_op_accumulate_error(op, -EREMOTEIO, abort_code);
 344                         if (op->volume->type != AFSVL_RWVOL)
 345                                 goto next_server;
 346                         goto failed;
 347
 348                 case VDISKFULL:
 349                 case UAENOSPC:
 350                         /* The partition is full.  Only applies to RWVOLs.
 351                          * Translate locally and return ENOSPC.
 352                          * No replicas to failover to.
 353                          */
 354                         afs_op_set_error(op, -ENOSPC);
 355                         goto failed_but_online;
 356
 357                 case VOVERQUOTA:
 358                 case UAEDQUOT:
 359                         /* Volume is full.  Only applies to RWVOLs.
 360                          * Translate locally and return EDQUOT.
 361                          * No replicas to failover to.
 362                          */
 363                         afs_op_set_error(op, -EDQUOT);
 364                         goto failed_but_online;
 365
 366                 default:
 367                         afs_op_accumulate_error(op, error, abort_code);
 368                 failed_but_online:
 369                         clear_bit(AFS_VOLUME_OFFLINE, &op->volume->flags);
 370                         clear_bit(AFS_VOLUME_BUSY, &op->volume->flags);
 371                         goto failed;
 372                 }
 373
 374         case -ETIMEDOUT:
 375         case -ETIME:
 376                 if (afs_op_error(op) != -EDESTADDRREQ)
 377                         goto iterate_address;
 378                 fallthrough;
 379         case -ERFKILL:
 380         case -EADDRNOTAVAIL:
 381         case -ENETUNREACH:
 382         case -EHOSTUNREACH:
 383         case -EHOSTDOWN:
 384         case -ECONNREFUSED:
 385                 _debug("no conn");
 386                 afs_op_accumulate_error(op, error, 0);
 387                 goto iterate_address;
 388
 389         case -ENETRESET:
 390                 pr_warn("kAFS: Peer reset %s (op=%x)\n",
 391                         op->type ? op->type->name : "???", op->debug_id);
 392                 fallthrough;
 393         case -ECONNRESET:
 394                 _debug("call reset");
 395                 afs_op_set_error(op, error);
 396                 goto failed;
 397         }
 398
 399 restart_from_beginning:
 400         _debug("restart");
 401         afs_end_cursor(&op->ac);
 402         op->server = NULL;
 403         afs_put_serverlist(op->net, op->server_list);
 404         op->server_list = NULL;
 405 start:
 406         _debug("start");
 407         /* See if we need to do an update of the volume record.  Note that the
 408          * volume may have moved or even have been deleted.
 409          */
 410         error = afs_check_volume_status(op->volume, op);
 411         if (error < 0) {
 412                 afs_op_set_error(op, error);
 413                 goto failed;
 414         }
 415
 416         if (!afs_start_fs_iteration(op, vnode))
 417                 goto failed;
 418
 419         _debug("__ VOL %llx __", op->volume->vid);
 420
 421 pick_server:
 422         _debug("pick [%lx]", op->untried);
 423
 424         error = afs_wait_for_fs_probes(op->server_list, op->untried);
 425         if (error < 0) {
 426                 afs_op_set_error(op, error);
 427                 goto failed;
 428         }
 429
 430         /* Pick the untried server with the lowest RTT.  If we have outstanding
 431          * callbacks, we stick with the server we're already using if we can.
 432          */
 433         if (op->server) {
 434                 _debug("server %u", op->index);
 435                 if (test_bit(op->index, &op->untried))
 436                         goto selected_server;
 437                 op->server = NULL;
 438                 _debug("no server");
 439         }
 440
 441         op->index = -1;
 442         rtt = UINT_MAX;
 443         for (i = 0; i < op->server_list->nr_servers; i++) {
 444                 struct afs_server *s = op->server_list->servers[i].server;
 445
 446                 if (!test_bit(i, &op->untried) ||
 447                     !test_bit(AFS_SERVER_FL_RESPONDING, &s->flags))
 448                         continue;
 449                 if (s->probe.rtt < rtt) {
 450                         op->index = i;
 451                         rtt = s->probe.rtt;
 452                 }
 453         }
 454
 455         if (op->index == -1)
 456                 goto no_more_servers;
 457
 458 selected_server:
 459         _debug("use %d", op->index);
 460         __clear_bit(op->index, &op->untried);
 461
 462         /* We're starting on a different fileserver from the list.  We need to
 463          * check it, create a callback intercept, find its address list and
 464          * probe its capabilities before we use it.
 465          */
 466         ASSERTCMP(op->ac.alist, ==, NULL);
 467         server = op->server_list->servers[op->index].server;
 468
 469         if (!afs_check_server_record(op, server))
 470                 goto failed;
 471
 472         _debug("USING SERVER: %pU", &server->uuid);
 473
 474         op->flags |= AFS_OPERATION_RETRY_SERVER;
 475         op->server = server;
 476         if (vnode->cb_server != server) {
 477                 vnode->cb_server = server;
 478                 vnode->cb_s_break = server->cb_s_break;
 479                 vnode->cb_fs_s_break = atomic_read(&server->cell->fs_s_break);
 480                 vnode->cb_v_break = vnode->volume->cb_v_break;
 481                 clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
 482         }
 483
 484         read_lock(&server->fs_lock);
 485         alist = rcu_dereference_protected(server->addresses,
 486                                           lockdep_is_held(&server->fs_lock));
 487         afs_get_addrlist(alist);
 488         read_unlock(&server->fs_lock);
 489
 490 retry_server:
 491         memset(&op->ac, 0, sizeof(op->ac));
 492
 493         if (!op->ac.alist)
 494                 op->ac.alist = alist;
 495         else
 496                 afs_put_addrlist(alist);
 497
 498         op->ac.index = -1;
 499
 500 iterate_address:
 501         ASSERT(op->ac.alist);
 502         /* Iterate over the current server's address list to try and find an
 503          * address on which it will respond to us.
 504          */
 505         if (!afs_iterate_addresses(&op->ac))
 506                 goto out_of_addresses;
 507
 508         _debug("address [%u] %u/%u %pISp",
 509                op->index, op->ac.index, op->ac.alist->nr_addrs,
 510                rxrpc_kernel_remote_addr(op->ac.alist->addrs[op->ac.index].peer));
 511
 512         op->call_responded = false;
 513         _leave(" = t");
 514         return true;
 515
 516 out_of_addresses:
 517         /* We've now had a failure to respond on all of a server's addresses -
 518          * immediately probe them again and consider retrying the server.
 519          */
 520         afs_probe_fileserver(op->net, op->server);
 521         if (op->flags & AFS_OPERATION_RETRY_SERVER) {
 522                 alist = op->ac.alist;
 523                 error = afs_wait_for_one_fs_probe(
 524                         op->server, !(op->flags & AFS_OPERATION_UNINTR));
 525                 switch (error) {
 526                 case 0:
 527                         op->flags &= ~AFS_OPERATION_RETRY_SERVER;
 528                         goto retry_server;
 529                 case -ERESTARTSYS:
 530                         afs_op_set_error(op, error);
 531                         goto failed;
 532                 case -ETIME:
 533                 case -EDESTADDRREQ:
 534                         goto next_server;
 535                 }
 536         }
 537
 538 next_server:
 539         _debug("next");
 540         afs_end_cursor(&op->ac);
 541         goto pick_server;
 542
 543 no_more_servers:
 544         /* That's all the servers poked to no good effect.  Try again if some
 545          * of them were busy.
 546          */
 547         if (op->flags & AFS_OPERATION_VBUSY)
 548                 goto restart_from_beginning;
 549
 550         for (i = 0; i < op->server_list->nr_servers; i++) {
 551                 struct afs_server *s = op->server_list->servers[i].server;
 552
 553                 error = READ_ONCE(s->probe.error);
 554                 if (error < 0)
 555                         afs_op_accumulate_error(op, error, s->probe.abort_code);
 556         }
 557
 558 failed:
 559         op->flags |= AFS_OPERATION_STOP;
 560         afs_end_cursor(&op->ac);
 561         _leave(" = f [failed %d]", afs_op_error(op));
 562         return false;
 563 }
 564
 565 /*
 566  * Dump cursor state in the case of the error being EDESTADDRREQ.
 567  */
 568 void afs_dump_edestaddrreq(const struct afs_operation *op)
 569 {
 570         static int count;
 571         int i;
 572
 573         if (!IS_ENABLED(CONFIG_AFS_DEBUG_CURSOR) || count > 3)
 574                 return;
 575         count++;
 576
 577         rcu_read_lock();
 578
 579         pr_notice("EDESTADDR occurred\n");
 580         pr_notice("OP: cbb=%x cbb2=%x fl=%x err=%hd\n",
 581                   op->file[0].cb_break_before,
 582                   op->file[1].cb_break_before, op->flags, op->cumul_error.error);
 583         pr_notice("OP: ut=%lx ix=%d ni=%u\n",
 584                   op->untried, op->index, op->nr_iterations);
 585         pr_notice("OP: call  er=%d ac=%d r=%u\n",
 586                   op->call_error, op->call_abort_code, op->call_responded);
 587
 588         if (op->server_list) {
 589                 const struct afs_server_list *sl = op->server_list;
 590                 pr_notice("FC: SL nr=%u pr=%u vnov=%hx\n",
 591                           sl->nr_servers, sl->preferred, sl->vnovol_mask);
 592                 for (i = 0; i < sl->nr_servers; i++) {
 593                         const struct afs_server *s = sl->servers[i].server;
 594                         pr_notice("FC: server fl=%lx av=%u %pU\n",
 595                                   s->flags, s->addr_version, &s->uuid);
 596                         if (s->addresses) {
 597                                 const struct afs_addr_list *a =
 598                                         rcu_dereference(s->addresses);
 599                                 pr_notice("FC:  - av=%u nr=%u/%u/%u pr=%u\n",
 600                                           a->version,
 601                                           a->nr_ipv4, a->nr_addrs, a->max_addrs,
 602                                           a->preferred);
 603                                 pr_notice("FC:  - R=%lx F=%lx\n",
 604                                           a->responded, a->failed);
 605                                 if (a == op->ac.alist)
 606                                         pr_notice("FC:  - current\n");
 607                         }
 608                 }
 609         }
 610
 611         pr_notice("AC: t=%lx ax=%u ni=%u\n",
 612                   op->ac.tried, op->ac.index, op->ac.nr_iterations);
 613         rcu_read_unlock();
 614 }