drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c

   1 /*
   2  * GPL HEADER START
   3  *
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This program is free software; you can redistribute it and/or modify
   7  * it under the terms of the GNU General Public License version 2 only,
   8  * as published by the Free Software Foundation.
   9  *
  10  * This program is distributed in the hope that it will be useful, but
  11  * WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  * General Public License version 2 for more details (a copy is included
  14  * in the LICENSE file that accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License
  17  * version 2 along with this program; If not, see
  18  * http://www.gnu.org/licenses/gpl-2.0.html
  19  *
  20  * GPL HEADER END
  21  */
  22 /*
  23  * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Use is subject to license terms.
  25  *
  26  * Copyright (c) 2012, 2015, Intel Corporation.
  27  */
  28 /*
  29  * This file is part of Lustre, http://www.lustre.org/
  30  * Lustre is a trademark of Sun Microsystems, Inc.
  31  *
  32  * lnet/klnds/o2iblnd/o2iblnd_cb.c
  33  *
  34  * Author: Eric Barton <eric@bartonsoftware.com>
  35  */
  36
  37 #include "o2iblnd.h"
  38
  39 #define MAX_CONN_RACES_BEFORE_ABORT 20
  40
  41 static void kiblnd_peer_alive(struct kib_peer *peer);
  42 static void kiblnd_peer_connect_failed(struct kib_peer *peer, int active, int error);
  43 static void kiblnd_init_tx_msg(struct lnet_ni *ni, struct kib_tx *tx,
  44                                int type, int body_nob);
  45 static int kiblnd_init_rdma(struct kib_conn *conn, struct kib_tx *tx, int type,
  46                             int resid, struct kib_rdma_desc *dstrd,
  47                             __u64 dstcookie);
  48 static void kiblnd_queue_tx_locked(struct kib_tx *tx, struct kib_conn *conn);
  49 static void kiblnd_queue_tx(struct kib_tx *tx, struct kib_conn *conn);
  50 static void kiblnd_unmap_tx(struct kib_tx *tx);
  51 static void kiblnd_check_sends_locked(struct kib_conn *conn);
  52
  53 static void
  54 kiblnd_tx_done(struct lnet_ni *ni, struct kib_tx *tx)
  55 {
  56         struct lnet_msg *lntmsg[2];
  57         struct kib_net *net = ni->ni_data;
  58         int rc;
  59         int i;
  60
  61         LASSERT(net);
  62         LASSERT(!in_interrupt());
  63         LASSERT(!tx->tx_queued);               /* mustn't be queued for sending */
  64         LASSERT(!tx->tx_sending);         /* mustn't be awaiting sent callback */
  65         LASSERT(!tx->tx_waiting);             /* mustn't be awaiting peer response */
  66         LASSERT(tx->tx_pool);
  67
  68         kiblnd_unmap_tx(tx);
  69
  70         /* tx may have up to 2 lnet msgs to finalise */
  71         lntmsg[0] = tx->tx_lntmsg[0]; tx->tx_lntmsg[0] = NULL;
  72         lntmsg[1] = tx->tx_lntmsg[1]; tx->tx_lntmsg[1] = NULL;
  73         rc = tx->tx_status;
  74
  75         if (tx->tx_conn) {
  76                 LASSERT(ni == tx->tx_conn->ibc_peer->ibp_ni);
  77
  78                 kiblnd_conn_decref(tx->tx_conn);
  79                 tx->tx_conn = NULL;
  80         }
  81
  82         tx->tx_nwrq = 0;
  83         tx->tx_status = 0;
  84
  85         kiblnd_pool_free_node(&tx->tx_pool->tpo_pool, &tx->tx_list);
  86
  87         /* delay finalize until my descs have been freed */
  88         for (i = 0; i < 2; i++) {
  89                 if (!lntmsg[i])
  90                         continue;
  91
  92                 lnet_finalize(ni, lntmsg[i], rc);
  93         }
  94 }
  95
  96 void
  97 kiblnd_txlist_done(struct lnet_ni *ni, struct list_head *txlist, int status)
  98 {
  99         struct kib_tx *tx;
 100
 101         while (!list_empty(txlist)) {
 102                 tx = list_entry(txlist->next, struct kib_tx, tx_list);
 103
 104                 list_del(&tx->tx_list);
 105                 /* complete now */
 106                 tx->tx_waiting = 0;
 107                 tx->tx_status = status;
 108                 kiblnd_tx_done(ni, tx);
 109         }
 110 }
 111
 112 static struct kib_tx *
 113 kiblnd_get_idle_tx(struct lnet_ni *ni, lnet_nid_t target)
 114 {
 115         struct kib_net *net = (struct kib_net *)ni->ni_data;
 116         struct list_head *node;
 117         struct kib_tx *tx;
 118         struct kib_tx_poolset *tps;
 119
 120         tps = net->ibn_tx_ps[lnet_cpt_of_nid(target)];
 121         node = kiblnd_pool_alloc_node(&tps->tps_poolset);
 122         if (!node)
 123                 return NULL;
 124         tx = list_entry(node, struct kib_tx, tx_list);
 125
 126         LASSERT(!tx->tx_nwrq);
 127         LASSERT(!tx->tx_queued);
 128         LASSERT(!tx->tx_sending);
 129         LASSERT(!tx->tx_waiting);
 130         LASSERT(!tx->tx_status);
 131         LASSERT(!tx->tx_conn);
 132         LASSERT(!tx->tx_lntmsg[0]);
 133         LASSERT(!tx->tx_lntmsg[1]);
 134         LASSERT(!tx->tx_nfrags);
 135
 136         return tx;
 137 }
 138
 139 static void
 140 kiblnd_drop_rx(struct kib_rx *rx)
 141 {
 142         struct kib_conn *conn = rx->rx_conn;
 143         struct kib_sched_info *sched = conn->ibc_sched;
 144         unsigned long flags;
 145
 146         spin_lock_irqsave(&sched->ibs_lock, flags);
 147         LASSERT(conn->ibc_nrx > 0);
 148         conn->ibc_nrx--;
 149         spin_unlock_irqrestore(&sched->ibs_lock, flags);
 150
 151         kiblnd_conn_decref(conn);
 152 }
 153
 154 int
 155 kiblnd_post_rx(struct kib_rx *rx, int credit)
 156 {
 157         struct kib_conn *conn = rx->rx_conn;
 158         struct kib_net *net = conn->ibc_peer->ibp_ni->ni_data;
 159         struct ib_recv_wr *bad_wrq = NULL;
 160         int rc;
 161
 162         LASSERT(net);
 163         LASSERT(!in_interrupt());
 164         LASSERT(credit == IBLND_POSTRX_NO_CREDIT ||
 165                 credit == IBLND_POSTRX_PEER_CREDIT ||
 166                 credit == IBLND_POSTRX_RSRVD_CREDIT);
 167
 168         rx->rx_sge.lkey   = conn->ibc_hdev->ibh_pd->local_dma_lkey;
 169         rx->rx_sge.addr   = rx->rx_msgaddr;
 170         rx->rx_sge.length = IBLND_MSG_SIZE;
 171
 172         rx->rx_wrq.next    = NULL;
 173         rx->rx_wrq.sg_list = &rx->rx_sge;
 174         rx->rx_wrq.num_sge = 1;
 175         rx->rx_wrq.wr_id   = kiblnd_ptr2wreqid(rx, IBLND_WID_RX);
 176
 177         LASSERT(conn->ibc_state >= IBLND_CONN_INIT);
 178         LASSERT(rx->rx_nob >= 0);             /* not posted */
 179
 180         if (conn->ibc_state > IBLND_CONN_ESTABLISHED) {
 181                 kiblnd_drop_rx(rx);          /* No more posts for this rx */
 182                 return 0;
 183         }
 184
 185         rx->rx_nob = -1;                        /* flag posted */
 186
 187         /* NB: need an extra reference after ib_post_recv because we don't
 188          * own this rx (and rx::rx_conn) anymore, LU-5678.
 189          */
 190         kiblnd_conn_addref(conn);
 191         rc = ib_post_recv(conn->ibc_cmid->qp, &rx->rx_wrq, &bad_wrq);
 192         if (unlikely(rc)) {
 193                 CERROR("Can't post rx for %s: %d, bad_wrq: %p\n",
 194                        libcfs_nid2str(conn->ibc_peer->ibp_nid), rc, bad_wrq);
 195                 rx->rx_nob = 0;
 196         }
 197
 198         if (conn->ibc_state < IBLND_CONN_ESTABLISHED) /* Initial post */
 199                 goto out;
 200
 201         if (unlikely(rc)) {
 202                 kiblnd_close_conn(conn, rc);
 203                 kiblnd_drop_rx(rx);          /* No more posts for this rx */
 204                 goto out;
 205         }
 206
 207         if (credit == IBLND_POSTRX_NO_CREDIT)
 208                 goto out;
 209
 210         spin_lock(&conn->ibc_lock);
 211         if (credit == IBLND_POSTRX_PEER_CREDIT)
 212                 conn->ibc_outstanding_credits++;
 213         else
 214                 conn->ibc_reserved_credits++;
 215         kiblnd_check_sends_locked(conn);
 216         spin_unlock(&conn->ibc_lock);
 217
 218 out:
 219         kiblnd_conn_decref(conn);
 220         return rc;
 221 }
 222
 223 static struct kib_tx *
 224 kiblnd_find_waiting_tx_locked(struct kib_conn *conn, int txtype, __u64 cookie)
 225 {
 226         struct list_head *tmp;
 227
 228         list_for_each(tmp, &conn->ibc_active_txs) {
 229                 struct kib_tx *tx = list_entry(tmp, struct kib_tx, tx_list);
 230
 231                 LASSERT(!tx->tx_queued);
 232                 LASSERT(tx->tx_sending || tx->tx_waiting);
 233
 234                 if (tx->tx_cookie != cookie)
 235                         continue;
 236
 237                 if (tx->tx_waiting &&
 238                     tx->tx_msg->ibm_type == txtype)
 239                         return tx;
 240
 241                 CWARN("Bad completion: %swaiting, type %x (wanted %x)\n",
 242                       tx->tx_waiting ? "" : "NOT ",
 243                       tx->tx_msg->ibm_type, txtype);
 244         }
 245         return NULL;
 246 }
 247
 248 static void
 249 kiblnd_handle_completion(struct kib_conn *conn, int txtype, int status, __u64 cookie)
 250 {
 251         struct kib_tx *tx;
 252         struct lnet_ni *ni = conn->ibc_peer->ibp_ni;
 253         int idle;
 254
 255         spin_lock(&conn->ibc_lock);
 256
 257         tx = kiblnd_find_waiting_tx_locked(conn, txtype, cookie);
 258         if (!tx) {
 259                 spin_unlock(&conn->ibc_lock);
 260
 261                 CWARN("Unmatched completion type %x cookie %#llx from %s\n",
 262                       txtype, cookie, libcfs_nid2str(conn->ibc_peer->ibp_nid));
 263                 kiblnd_close_conn(conn, -EPROTO);
 264                 return;
 265         }
 266
 267         if (!tx->tx_status) {          /* success so far */
 268                 if (status < 0) /* failed? */
 269                         tx->tx_status = status;
 270                 else if (txtype == IBLND_MSG_GET_REQ)
 271                         lnet_set_reply_msg_len(ni, tx->tx_lntmsg[1], status);
 272         }
 273
 274         tx->tx_waiting = 0;
 275
 276         idle = !tx->tx_queued && !tx->tx_sending;
 277         if (idle)
 278                 list_del(&tx->tx_list);
 279
 280         spin_unlock(&conn->ibc_lock);
 281
 282         if (idle)
 283                 kiblnd_tx_done(ni, tx);
 284 }
 285
 286 static void
 287 kiblnd_send_completion(struct kib_conn *conn, int type, int status, __u64 cookie)
 288 {
 289         struct lnet_ni *ni = conn->ibc_peer->ibp_ni;
 290         struct kib_tx *tx = kiblnd_get_idle_tx(ni, conn->ibc_peer->ibp_nid);
 291
 292         if (!tx) {
 293                 CERROR("Can't get tx for completion %x for %s\n",
 294                        type, libcfs_nid2str(conn->ibc_peer->ibp_nid));
 295                 return;
 296         }
 297
 298         tx->tx_msg->ibm_u.completion.ibcm_status = status;
 299         tx->tx_msg->ibm_u.completion.ibcm_cookie = cookie;
 300         kiblnd_init_tx_msg(ni, tx, type, sizeof(struct kib_completion_msg));
 301
 302         kiblnd_queue_tx(tx, conn);
 303 }
 304
 305 static void
 306 kiblnd_handle_rx(struct kib_rx *rx)
 307 {
 308         struct kib_msg *msg = rx->rx_msg;
 309         struct kib_conn *conn = rx->rx_conn;
 310         struct lnet_ni *ni = conn->ibc_peer->ibp_ni;
 311         int credits = msg->ibm_credits;
 312         struct kib_tx *tx;
 313         int rc = 0;
 314         int rc2;
 315         int post_credit;
 316
 317         LASSERT(conn->ibc_state >= IBLND_CONN_ESTABLISHED);
 318
 319         CDEBUG(D_NET, "Received %x[%d] from %s\n",
 320                msg->ibm_type, credits,
 321                libcfs_nid2str(conn->ibc_peer->ibp_nid));
 322
 323         if (credits) {
 324                 /* Have I received credits that will let me send? */
 325                 spin_lock(&conn->ibc_lock);
 326
 327                 if (conn->ibc_credits + credits >
 328                     conn->ibc_queue_depth) {
 329                         rc2 = conn->ibc_credits;
 330                         spin_unlock(&conn->ibc_lock);
 331
 332                         CERROR("Bad credits from %s: %d + %d > %d\n",
 333                                libcfs_nid2str(conn->ibc_peer->ibp_nid),
 334                                rc2, credits, conn->ibc_queue_depth);
 335
 336                         kiblnd_close_conn(conn, -EPROTO);
 337                         kiblnd_post_rx(rx, IBLND_POSTRX_NO_CREDIT);
 338                         return;
 339                 }
 340
 341                 conn->ibc_credits += credits;
 342
 343                 /* This ensures the credit taken by NOOP can be returned */
 344                 if (msg->ibm_type == IBLND_MSG_NOOP &&
 345                     !IBLND_OOB_CAPABLE(conn->ibc_version)) /* v1 only */
 346                         conn->ibc_outstanding_credits++;
 347
 348                 kiblnd_check_sends_locked(conn);
 349                 spin_unlock(&conn->ibc_lock);
 350         }
 351
 352         switch (msg->ibm_type) {
 353         default:
 354                 CERROR("Bad IBLND message type %x from %s\n",
 355                        msg->ibm_type, libcfs_nid2str(conn->ibc_peer->ibp_nid));
 356                 post_credit = IBLND_POSTRX_NO_CREDIT;
 357                 rc = -EPROTO;
 358                 break;
 359
 360         case IBLND_MSG_NOOP:
 361                 if (IBLND_OOB_CAPABLE(conn->ibc_version)) {
 362                         post_credit = IBLND_POSTRX_NO_CREDIT;
 363                         break;
 364                 }
 365
 366                 if (credits) /* credit already posted */
 367                         post_credit = IBLND_POSTRX_NO_CREDIT;
 368                 else          /* a keepalive NOOP */
 369                         post_credit = IBLND_POSTRX_PEER_CREDIT;
 370                 break;
 371
 372         case IBLND_MSG_IMMEDIATE:
 373                 post_credit = IBLND_POSTRX_DONT_POST;
 374                 rc = lnet_parse(ni, &msg->ibm_u.immediate.ibim_hdr,
 375                                 msg->ibm_srcnid, rx, 0);
 376                 if (rc < 0)                  /* repost on error */
 377                         post_credit = IBLND_POSTRX_PEER_CREDIT;
 378                 break;
 379
 380         case IBLND_MSG_PUT_REQ:
 381                 post_credit = IBLND_POSTRX_DONT_POST;
 382                 rc = lnet_parse(ni, &msg->ibm_u.putreq.ibprm_hdr,
 383                                 msg->ibm_srcnid, rx, 1);
 384                 if (rc < 0)                  /* repost on error */
 385                         post_credit = IBLND_POSTRX_PEER_CREDIT;
 386                 break;
 387
 388         case IBLND_MSG_PUT_NAK:
 389                 CWARN("PUT_NACK from %s\n",
 390                       libcfs_nid2str(conn->ibc_peer->ibp_nid));
 391                 post_credit = IBLND_POSTRX_RSRVD_CREDIT;
 392                 kiblnd_handle_completion(conn, IBLND_MSG_PUT_REQ,
 393                                          msg->ibm_u.completion.ibcm_status,
 394                                          msg->ibm_u.completion.ibcm_cookie);
 395                 break;
 396
 397         case IBLND_MSG_PUT_ACK:
 398                 post_credit = IBLND_POSTRX_RSRVD_CREDIT;
 399
 400                 spin_lock(&conn->ibc_lock);
 401                 tx = kiblnd_find_waiting_tx_locked(conn, IBLND_MSG_PUT_REQ,
 402                                                    msg->ibm_u.putack.ibpam_src_cookie);
 403                 if (tx)
 404                         list_del(&tx->tx_list);
 405                 spin_unlock(&conn->ibc_lock);
 406
 407                 if (!tx) {
 408                         CERROR("Unmatched PUT_ACK from %s\n",
 409                                libcfs_nid2str(conn->ibc_peer->ibp_nid));
 410                         rc = -EPROTO;
 411                         break;
 412                 }
 413
 414                 LASSERT(tx->tx_waiting);
 415                 /*
 416                  * CAVEAT EMPTOR: I could be racing with tx_complete, but...
 417                  * (a) I can overwrite tx_msg since my peer has received it!
 418                  * (b) tx_waiting set tells tx_complete() it's not done.
 419                  */
 420                 tx->tx_nwrq = 0;                /* overwrite PUT_REQ */
 421
 422                 rc2 = kiblnd_init_rdma(conn, tx, IBLND_MSG_PUT_DONE,
 423                                        kiblnd_rd_size(&msg->ibm_u.putack.ibpam_rd),
 424                                        &msg->ibm_u.putack.ibpam_rd,
 425                                        msg->ibm_u.putack.ibpam_dst_cookie);
 426                 if (rc2 < 0)
 427                         CERROR("Can't setup rdma for PUT to %s: %d\n",
 428                                libcfs_nid2str(conn->ibc_peer->ibp_nid), rc2);
 429
 430                 spin_lock(&conn->ibc_lock);
 431                 tx->tx_waiting = 0;     /* clear waiting and queue atomically */
 432                 kiblnd_queue_tx_locked(tx, conn);
 433                 spin_unlock(&conn->ibc_lock);
 434                 break;
 435
 436         case IBLND_MSG_PUT_DONE:
 437                 post_credit = IBLND_POSTRX_PEER_CREDIT;
 438                 kiblnd_handle_completion(conn, IBLND_MSG_PUT_ACK,
 439                                          msg->ibm_u.completion.ibcm_status,
 440                                          msg->ibm_u.completion.ibcm_cookie);
 441                 break;
 442
 443         case IBLND_MSG_GET_REQ:
 444                 post_credit = IBLND_POSTRX_DONT_POST;
 445                 rc = lnet_parse(ni, &msg->ibm_u.get.ibgm_hdr,
 446                                 msg->ibm_srcnid, rx, 1);
 447                 if (rc < 0)                  /* repost on error */
 448                         post_credit = IBLND_POSTRX_PEER_CREDIT;
 449                 break;
 450
 451         case IBLND_MSG_GET_DONE:
 452                 post_credit = IBLND_POSTRX_RSRVD_CREDIT;
 453                 kiblnd_handle_completion(conn, IBLND_MSG_GET_REQ,
 454                                          msg->ibm_u.completion.ibcm_status,
 455                                          msg->ibm_u.completion.ibcm_cookie);
 456                 break;
 457         }
 458
 459         if (rc < 0)                          /* protocol error */
 460                 kiblnd_close_conn(conn, rc);
 461
 462         if (post_credit != IBLND_POSTRX_DONT_POST)
 463                 kiblnd_post_rx(rx, post_credit);
 464 }
 465
 466 static void
 467 kiblnd_rx_complete(struct kib_rx *rx, int status, int nob)
 468 {
 469         struct kib_msg *msg = rx->rx_msg;
 470         struct kib_conn *conn = rx->rx_conn;
 471         struct lnet_ni *ni = conn->ibc_peer->ibp_ni;
 472         struct kib_net *net = ni->ni_data;
 473         int rc;
 474         int err = -EIO;
 475
 476         LASSERT(net);
 477         LASSERT(rx->rx_nob < 0);               /* was posted */
 478         rx->rx_nob = 0;                  /* isn't now */
 479
 480         if (conn->ibc_state > IBLND_CONN_ESTABLISHED)
 481                 goto ignore;
 482
 483         if (status != IB_WC_SUCCESS) {
 484                 CNETERR("Rx from %s failed: %d\n",
 485                         libcfs_nid2str(conn->ibc_peer->ibp_nid), status);
 486                 goto failed;
 487         }
 488
 489         LASSERT(nob >= 0);
 490         rx->rx_nob = nob;
 491
 492         rc = kiblnd_unpack_msg(msg, rx->rx_nob);
 493         if (rc) {
 494                 CERROR("Error %d unpacking rx from %s\n",
 495                        rc, libcfs_nid2str(conn->ibc_peer->ibp_nid));
 496                 goto failed;
 497         }
 498
 499         if (msg->ibm_srcnid != conn->ibc_peer->ibp_nid ||
 500             msg->ibm_dstnid != ni->ni_nid ||
 501             msg->ibm_srcstamp != conn->ibc_incarnation ||
 502             msg->ibm_dststamp != net->ibn_incarnation) {
 503                 CERROR("Stale rx from %s\n",
 504                        libcfs_nid2str(conn->ibc_peer->ibp_nid));
 505                 err = -ESTALE;
 506                 goto failed;
 507         }
 508
 509         /* set time last known alive */
 510         kiblnd_peer_alive(conn->ibc_peer);
 511
 512         /* racing with connection establishment/teardown! */
 513
 514         if (conn->ibc_state < IBLND_CONN_ESTABLISHED) {
 515                 rwlock_t *g_lock = &kiblnd_data.kib_global_lock;
 516                 unsigned long flags;
 517
 518                 write_lock_irqsave(g_lock, flags);
 519                 /* must check holding global lock to eliminate race */
 520                 if (conn->ibc_state < IBLND_CONN_ESTABLISHED) {
 521                         list_add_tail(&rx->rx_list, &conn->ibc_early_rxs);
 522                         write_unlock_irqrestore(g_lock, flags);
 523                         return;
 524                 }
 525                 write_unlock_irqrestore(g_lock, flags);
 526         }
 527         kiblnd_handle_rx(rx);
 528         return;
 529
 530  failed:
 531         CDEBUG(D_NET, "rx %p conn %p\n", rx, conn);
 532         kiblnd_close_conn(conn, err);
 533  ignore:
 534         kiblnd_drop_rx(rx);                  /* Don't re-post rx. */
 535 }
 536
 537 static struct page *
 538 kiblnd_kvaddr_to_page(unsigned long vaddr)
 539 {
 540         struct page *page;
 541
 542         if (is_vmalloc_addr((void *)vaddr)) {
 543                 page = vmalloc_to_page((void *)vaddr);
 544                 LASSERT(page);
 545                 return page;
 546         }
 547 #ifdef CONFIG_HIGHMEM
 548         if (vaddr >= PKMAP_BASE &&
 549             vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE)) {
 550                 /* No highmem pages only used for bulk (kiov) I/O */
 551                 CERROR("find page for address in highmem\n");
 552                 LBUG();
 553         }
 554 #endif
 555         page = virt_to_page(vaddr);
 556         LASSERT(page);
 557         return page;
 558 }
 559
 560 static int
 561 kiblnd_fmr_map_tx(struct kib_net *net, struct kib_tx *tx, struct kib_rdma_desc *rd, __u32 nob)
 562 {
 563         struct kib_hca_dev *hdev;
 564         struct kib_fmr_poolset *fps;
 565         int cpt;
 566         int rc;
 567
 568         LASSERT(tx->tx_pool);
 569         LASSERT(tx->tx_pool->tpo_pool.po_owner);
 570
 571         hdev = tx->tx_pool->tpo_hdev;
 572         cpt = tx->tx_pool->tpo_pool.po_owner->ps_cpt;
 573
 574         fps = net->ibn_fmr_ps[cpt];
 575         rc = kiblnd_fmr_pool_map(fps, tx, rd, nob, 0, &tx->fmr);
 576         if (rc) {
 577                 CERROR("Can't map %u bytes: %d\n", nob, rc);
 578                 return rc;
 579         }
 580
 581         /*
 582          * If rd is not tx_rd, it's going to get sent to a peer, who will need
 583          * the rkey
 584          */
 585         rd->rd_key = tx->fmr.fmr_key;
 586         rd->rd_frags[0].rf_addr &= ~hdev->ibh_page_mask;
 587         rd->rd_frags[0].rf_nob = nob;
 588         rd->rd_nfrags = 1;
 589
 590         return 0;
 591 }
 592
 593 static void kiblnd_unmap_tx(struct kib_tx *tx)
 594 {
 595         if (tx->fmr.fmr_pfmr || tx->fmr.fmr_frd)
 596                 kiblnd_fmr_pool_unmap(&tx->fmr, tx->tx_status);
 597
 598         if (tx->tx_nfrags) {
 599                 kiblnd_dma_unmap_sg(tx->tx_pool->tpo_hdev->ibh_ibdev,
 600                                     tx->tx_frags, tx->tx_nfrags, tx->tx_dmadir);
 601                 tx->tx_nfrags = 0;
 602         }
 603 }
 604
 605 static int kiblnd_map_tx(struct lnet_ni *ni, struct kib_tx *tx,
 606                          struct kib_rdma_desc *rd, int nfrags)
 607 {
 608         struct kib_net *net = ni->ni_data;
 609         struct kib_hca_dev *hdev = net->ibn_dev->ibd_hdev;
 610         __u32 nob;
 611         int i;
 612
 613         /*
 614          * If rd is not tx_rd, it's going to get sent to a peer and I'm the
 615          * RDMA sink
 616          */
 617         tx->tx_dmadir = (rd != tx->tx_rd) ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
 618         tx->tx_nfrags = nfrags;
 619
 620         rd->rd_nfrags = kiblnd_dma_map_sg(hdev->ibh_ibdev, tx->tx_frags,
 621                                           tx->tx_nfrags, tx->tx_dmadir);
 622
 623         for (i = 0, nob = 0; i < rd->rd_nfrags; i++) {
 624                 rd->rd_frags[i].rf_nob  = kiblnd_sg_dma_len(
 625                         hdev->ibh_ibdev, &tx->tx_frags[i]);
 626                 rd->rd_frags[i].rf_addr = kiblnd_sg_dma_address(
 627                         hdev->ibh_ibdev, &tx->tx_frags[i]);
 628                 nob += rd->rd_frags[i].rf_nob;
 629         }
 630
 631         if (net->ibn_fmr_ps)
 632                 return kiblnd_fmr_map_tx(net, tx, rd, nob);
 633
 634         return -EINVAL;
 635 }
 636
 637 static int
 638 kiblnd_setup_rd_iov(struct lnet_ni *ni, struct kib_tx *tx,
 639                     struct kib_rdma_desc *rd, unsigned int niov,
 640                     const struct kvec *iov, int offset, int nob)
 641 {
 642         struct kib_net *net = ni->ni_data;
 643         struct page *page;
 644         struct scatterlist *sg;
 645         unsigned long vaddr;
 646         int fragnob;
 647         int page_offset;
 648
 649         LASSERT(nob > 0);
 650         LASSERT(niov > 0);
 651         LASSERT(net);
 652
 653         while (offset >= iov->iov_len) {
 654                 offset -= iov->iov_len;
 655                 niov--;
 656                 iov++;
 657                 LASSERT(niov > 0);
 658         }
 659
 660         sg = tx->tx_frags;
 661         do {
 662                 LASSERT(niov > 0);
 663
 664                 vaddr = ((unsigned long)iov->iov_base) + offset;
 665                 page_offset = vaddr & (PAGE_SIZE - 1);
 666                 page = kiblnd_kvaddr_to_page(vaddr);
 667                 if (!page) {
 668                         CERROR("Can't find page\n");
 669                         return -EFAULT;
 670                 }
 671
 672                 fragnob = min((int)(iov->iov_len - offset), nob);
 673                 fragnob = min(fragnob, (int)PAGE_SIZE - page_offset);
 674
 675                 sg_set_page(sg, page, fragnob, page_offset);
 676                 sg = sg_next(sg);
 677                 if (!sg) {
 678                         CERROR("lacking enough sg entries to map tx\n");
 679                         return -EFAULT;
 680                 }
 681
 682                 if (offset + fragnob < iov->iov_len) {
 683                         offset += fragnob;
 684                 } else {
 685                         offset = 0;
 686                         iov++;
 687                         niov--;
 688                 }
 689                 nob -= fragnob;
 690         } while (nob > 0);
 691
 692         return kiblnd_map_tx(ni, tx, rd, sg - tx->tx_frags);
 693 }
 694
 695 static int
 696 kiblnd_setup_rd_kiov(struct lnet_ni *ni, struct kib_tx *tx,
 697                      struct kib_rdma_desc *rd, int nkiov,
 698                      const struct bio_vec *kiov, int offset, int nob)
 699 {
 700         struct kib_net *net = ni->ni_data;
 701         struct scatterlist *sg;
 702         int fragnob;
 703
 704         CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob);
 705
 706         LASSERT(nob > 0);
 707         LASSERT(nkiov > 0);
 708         LASSERT(net);
 709
 710         while (offset >= kiov->bv_len) {
 711                 offset -= kiov->bv_len;
 712                 nkiov--;
 713                 kiov++;
 714                 LASSERT(nkiov > 0);
 715         }
 716
 717         sg = tx->tx_frags;
 718         do {
 719                 LASSERT(nkiov > 0);
 720
 721                 fragnob = min((int)(kiov->bv_len - offset), nob);
 722
 723                 sg_set_page(sg, kiov->bv_page, fragnob,
 724                             kiov->bv_offset + offset);
 725                 sg = sg_next(sg);
 726                 if (!sg) {
 727                         CERROR("lacking enough sg entries to map tx\n");
 728                         return -EFAULT;
 729                 }
 730
 731                 offset = 0;
 732                 kiov++;
 733                 nkiov--;
 734                 nob -= fragnob;
 735         } while (nob > 0);
 736
 737         return kiblnd_map_tx(ni, tx, rd, sg - tx->tx_frags);
 738 }
 739
 740 static int
 741 kiblnd_post_tx_locked(struct kib_conn *conn, struct kib_tx *tx, int credit)
 742         __must_hold(&conn->ibc_lock)
 743 {
 744         struct kib_msg *msg = tx->tx_msg;
 745         struct kib_peer *peer = conn->ibc_peer;
 746         struct lnet_ni *ni = peer->ibp_ni;
 747         int ver = conn->ibc_version;
 748         int rc;
 749         int done;
 750
 751         LASSERT(tx->tx_queued);
 752         /* We rely on this for QP sizing */
 753         LASSERT(tx->tx_nwrq > 0);
 754
 755         LASSERT(!credit || credit == 1);
 756         LASSERT(conn->ibc_outstanding_credits >= 0);
 757         LASSERT(conn->ibc_outstanding_credits <= conn->ibc_queue_depth);
 758         LASSERT(conn->ibc_credits >= 0);
 759         LASSERT(conn->ibc_credits <= conn->ibc_queue_depth);
 760
 761         if (conn->ibc_nsends_posted == kiblnd_concurrent_sends(ver, ni)) {
 762                 /* tx completions outstanding... */
 763                 CDEBUG(D_NET, "%s: posted enough\n",
 764                        libcfs_nid2str(peer->ibp_nid));
 765                 return -EAGAIN;
 766         }
 767
 768         if (credit && !conn->ibc_credits) {   /* no credits */
 769                 CDEBUG(D_NET, "%s: no credits\n",
 770                        libcfs_nid2str(peer->ibp_nid));
 771                 return -EAGAIN;
 772         }
 773
 774         if (credit && !IBLND_OOB_CAPABLE(ver) &&
 775             conn->ibc_credits == 1 &&   /* last credit reserved */
 776             msg->ibm_type != IBLND_MSG_NOOP) {      /* for NOOP */
 777                 CDEBUG(D_NET, "%s: not using last credit\n",
 778                        libcfs_nid2str(peer->ibp_nid));
 779                 return -EAGAIN;
 780         }
 781
 782         /* NB don't drop ibc_lock before bumping tx_sending */
 783         list_del(&tx->tx_list);
 784         tx->tx_queued = 0;
 785
 786         if (msg->ibm_type == IBLND_MSG_NOOP &&
 787             (!kiblnd_need_noop(conn) ||     /* redundant NOOP */
 788              (IBLND_OOB_CAPABLE(ver) && /* posted enough NOOP */
 789               conn->ibc_noops_posted == IBLND_OOB_MSGS(ver)))) {
 790                 /*
 791                  * OK to drop when posted enough NOOPs, since
 792                  * kiblnd_check_sends_locked will queue NOOP again when
 793                  * posted NOOPs complete
 794                  */
 795                 spin_unlock(&conn->ibc_lock);
 796                 kiblnd_tx_done(peer->ibp_ni, tx);
 797                 spin_lock(&conn->ibc_lock);
 798                 CDEBUG(D_NET, "%s(%d): redundant or enough NOOP\n",
 799                        libcfs_nid2str(peer->ibp_nid),
 800                        conn->ibc_noops_posted);
 801                 return 0;
 802         }
 803
 804         kiblnd_pack_msg(peer->ibp_ni, msg, ver, conn->ibc_outstanding_credits,
 805                         peer->ibp_nid, conn->ibc_incarnation);
 806
 807         conn->ibc_credits -= credit;
 808         conn->ibc_outstanding_credits = 0;
 809         conn->ibc_nsends_posted++;
 810         if (msg->ibm_type == IBLND_MSG_NOOP)
 811                 conn->ibc_noops_posted++;
 812
 813         /*
 814          * CAVEAT EMPTOR!  This tx could be the PUT_DONE of an RDMA
 815          * PUT.  If so, it was first queued here as a PUT_REQ, sent and
 816          * stashed on ibc_active_txs, matched by an incoming PUT_ACK,
 817          * and then re-queued here.  It's (just) possible that
 818          * tx_sending is non-zero if we've not done the tx_complete()
 819          * from the first send; hence the ++ rather than = below.
 820          */
 821         tx->tx_sending++;
 822         list_add(&tx->tx_list, &conn->ibc_active_txs);
 823
 824         /* I'm still holding ibc_lock! */
 825         if (conn->ibc_state != IBLND_CONN_ESTABLISHED) {
 826                 rc = -ECONNABORTED;
 827         } else if (tx->tx_pool->tpo_pool.po_failed ||
 828                  conn->ibc_hdev != tx->tx_pool->tpo_hdev) {
 829                 /* close_conn will launch failover */
 830                 rc = -ENETDOWN;
 831         } else {
 832                 struct kib_fast_reg_descriptor *frd = tx->fmr.fmr_frd;
 833                 struct ib_send_wr *bad = &tx->tx_wrq[tx->tx_nwrq - 1].wr;
 834                 struct ib_send_wr *wrq = &tx->tx_wrq[0].wr;
 835
 836                 if (frd) {
 837                         if (!frd->frd_valid) {
 838                                 wrq = &frd->frd_inv_wr;
 839                                 wrq->next = &frd->frd_fastreg_wr.wr;
 840                         } else {
 841                                 wrq = &frd->frd_fastreg_wr.wr;
 842                         }
 843                         frd->frd_fastreg_wr.wr.next = &tx->tx_wrq[0].wr;
 844                 }
 845
 846                 LASSERTF(bad->wr_id == kiblnd_ptr2wreqid(tx, IBLND_WID_TX),
 847                          "bad wr_id %llx, opc %d, flags %d, peer: %s\n",
 848                          bad->wr_id, bad->opcode, bad->send_flags,
 849                          libcfs_nid2str(conn->ibc_peer->ibp_nid));
 850                 bad = NULL;
 851                 rc = ib_post_send(conn->ibc_cmid->qp, wrq, &bad);
 852         }
 853
 854         conn->ibc_last_send = jiffies;
 855
 856         if (!rc)
 857                 return 0;
 858
 859         /*
 860          * NB credits are transferred in the actual
 861          * message, which can only be the last work item
 862          */
 863         conn->ibc_credits += credit;
 864         conn->ibc_outstanding_credits += msg->ibm_credits;
 865         conn->ibc_nsends_posted--;
 866         if (msg->ibm_type == IBLND_MSG_NOOP)
 867                 conn->ibc_noops_posted--;
 868
 869         tx->tx_status = rc;
 870         tx->tx_waiting = 0;
 871         tx->tx_sending--;
 872
 873         done = !tx->tx_sending;
 874         if (done)
 875                 list_del(&tx->tx_list);
 876
 877         spin_unlock(&conn->ibc_lock);
 878
 879         if (conn->ibc_state == IBLND_CONN_ESTABLISHED)
 880                 CERROR("Error %d posting transmit to %s\n",
 881                        rc, libcfs_nid2str(peer->ibp_nid));
 882         else
 883                 CDEBUG(D_NET, "Error %d posting transmit to %s\n",
 884                        rc, libcfs_nid2str(peer->ibp_nid));
 885
 886         kiblnd_close_conn(conn, rc);
 887
 888         if (done)
 889                 kiblnd_tx_done(peer->ibp_ni, tx);
 890
 891         spin_lock(&conn->ibc_lock);
 892
 893         return -EIO;
 894 }
 895
 896 static void
 897 kiblnd_check_sends_locked(struct kib_conn *conn)
 898 {
 899         int ver = conn->ibc_version;
 900         struct lnet_ni *ni = conn->ibc_peer->ibp_ni;
 901         struct kib_tx *tx;
 902
 903         /* Don't send anything until after the connection is established */
 904         if (conn->ibc_state < IBLND_CONN_ESTABLISHED) {
 905                 CDEBUG(D_NET, "%s too soon\n",
 906                        libcfs_nid2str(conn->ibc_peer->ibp_nid));
 907                 return;
 908         }
 909
 910         LASSERT(conn->ibc_nsends_posted <= kiblnd_concurrent_sends(ver, ni));
 911         LASSERT(!IBLND_OOB_CAPABLE(ver) ||
 912                 conn->ibc_noops_posted <= IBLND_OOB_MSGS(ver));
 913         LASSERT(conn->ibc_reserved_credits >= 0);
 914
 915         while (conn->ibc_reserved_credits > 0 &&
 916                !list_empty(&conn->ibc_tx_queue_rsrvd)) {
 917                 tx = list_entry(conn->ibc_tx_queue_rsrvd.next,
 918                                 struct kib_tx, tx_list);
 919                 list_del(&tx->tx_list);
 920                 list_add_tail(&tx->tx_list, &conn->ibc_tx_queue);
 921                 conn->ibc_reserved_credits--;
 922         }
 923
 924         if (kiblnd_need_noop(conn)) {
 925                 spin_unlock(&conn->ibc_lock);
 926
 927                 tx = kiblnd_get_idle_tx(ni, conn->ibc_peer->ibp_nid);
 928                 if (tx)
 929                         kiblnd_init_tx_msg(ni, tx, IBLND_MSG_NOOP, 0);
 930
 931                 spin_lock(&conn->ibc_lock);
 932                 if (tx)
 933                         kiblnd_queue_tx_locked(tx, conn);
 934         }
 935
 936         for (;;) {
 937                 int credit;
 938
 939                 if (!list_empty(&conn->ibc_tx_queue_nocred)) {
 940                         credit = 0;
 941                         tx = list_entry(conn->ibc_tx_queue_nocred.next,
 942                                         struct kib_tx, tx_list);
 943                 } else if (!list_empty(&conn->ibc_tx_noops)) {
 944                         LASSERT(!IBLND_OOB_CAPABLE(ver));
 945                         credit = 1;
 946                         tx = list_entry(conn->ibc_tx_noops.next,
 947                                         struct kib_tx, tx_list);
 948                 } else if (!list_empty(&conn->ibc_tx_queue)) {
 949                         credit = 1;
 950                         tx = list_entry(conn->ibc_tx_queue.next,
 951                                         struct kib_tx, tx_list);
 952                 } else {
 953                         break;
 954                 }
 955
 956                 if (kiblnd_post_tx_locked(conn, tx, credit))
 957                         break;
 958         }
 959 }
 960
 961 static void
 962 kiblnd_tx_complete(struct kib_tx *tx, int status)
 963 {
 964         int failed = (status != IB_WC_SUCCESS);
 965         struct kib_conn *conn = tx->tx_conn;
 966         int idle;
 967
 968         LASSERT(tx->tx_sending > 0);
 969
 970         if (failed) {
 971                 if (conn->ibc_state == IBLND_CONN_ESTABLISHED)
 972                         CNETERR("Tx -> %s cookie %#llx sending %d waiting %d: failed %d\n",
 973                                 libcfs_nid2str(conn->ibc_peer->ibp_nid),
 974                                 tx->tx_cookie, tx->tx_sending, tx->tx_waiting,
 975                                 status);
 976
 977                 kiblnd_close_conn(conn, -EIO);
 978         } else {
 979                 kiblnd_peer_alive(conn->ibc_peer);
 980         }
 981
 982         spin_lock(&conn->ibc_lock);
 983
 984         /*
 985          * I could be racing with rdma completion.  Whoever makes 'tx' idle
 986          * gets to free it, which also drops its ref on 'conn'.
 987          */
 988         tx->tx_sending--;
 989         conn->ibc_nsends_posted--;
 990         if (tx->tx_msg->ibm_type == IBLND_MSG_NOOP)
 991                 conn->ibc_noops_posted--;
 992
 993         if (failed) {
 994                 tx->tx_waiting = 0;          /* don't wait for peer */
 995                 tx->tx_status = -EIO;
 996         }
 997
 998         idle = !tx->tx_sending &&        /* This is the final callback */
 999                !tx->tx_waiting &&              /* Not waiting for peer */
1000                !tx->tx_queued;            /* Not re-queued (PUT_DONE) */
1001         if (idle)
1002                 list_del(&tx->tx_list);
1003
1004         kiblnd_check_sends_locked(conn);
1005         spin_unlock(&conn->ibc_lock);
1006
1007         if (idle)
1008                 kiblnd_tx_done(conn->ibc_peer->ibp_ni, tx);
1009 }
1010
1011 static void
1012 kiblnd_init_tx_msg(struct lnet_ni *ni, struct kib_tx *tx, int type,
1013                    int body_nob)
1014 {
1015         struct kib_hca_dev *hdev = tx->tx_pool->tpo_hdev;
1016         struct ib_sge *sge = &tx->tx_sge[tx->tx_nwrq];
1017         struct ib_rdma_wr *wrq = &tx->tx_wrq[tx->tx_nwrq];
1018         int nob = offsetof(struct kib_msg, ibm_u) + body_nob;
1019
1020         LASSERT(tx->tx_nwrq >= 0);
1021         LASSERT(tx->tx_nwrq < IBLND_MAX_RDMA_FRAGS + 1);
1022         LASSERT(nob <= IBLND_MSG_SIZE);
1023
1024         kiblnd_init_msg(tx->tx_msg, type, body_nob);
1025
1026         sge->lkey   = hdev->ibh_pd->local_dma_lkey;
1027         sge->addr   = tx->tx_msgaddr;
1028         sge->length = nob;
1029
1030         memset(wrq, 0, sizeof(*wrq));
1031
1032         wrq->wr.next       = NULL;
1033         wrq->wr.wr_id      = kiblnd_ptr2wreqid(tx, IBLND_WID_TX);
1034         wrq->wr.sg_list    = sge;
1035         wrq->wr.num_sge    = 1;
1036         wrq->wr.opcode     = IB_WR_SEND;
1037         wrq->wr.send_flags = IB_SEND_SIGNALED;
1038
1039         tx->tx_nwrq++;
1040 }
1041
1042 static int
1043 kiblnd_init_rdma(struct kib_conn *conn, struct kib_tx *tx, int type,
1044                  int resid, struct kib_rdma_desc *dstrd, __u64 dstcookie)
1045 {
1046         struct kib_msg *ibmsg = tx->tx_msg;
1047         struct kib_rdma_desc *srcrd = tx->tx_rd;
1048         struct ib_sge *sge = &tx->tx_sge[0];
1049         struct ib_rdma_wr *wrq, *next;
1050         int rc  = resid;
1051         int srcidx = 0;
1052         int dstidx = 0;
1053         int wrknob;
1054
1055         LASSERT(!in_interrupt());
1056         LASSERT(!tx->tx_nwrq);
1057         LASSERT(type == IBLND_MSG_GET_DONE ||
1058                 type == IBLND_MSG_PUT_DONE);
1059
1060         if (kiblnd_rd_size(srcrd) > conn->ibc_max_frags << PAGE_SHIFT) {
1061                 CERROR("RDMA is too large for peer %s (%d), src size: %d dst size: %d\n",
1062                        libcfs_nid2str(conn->ibc_peer->ibp_nid),
1063                        conn->ibc_max_frags << PAGE_SHIFT,
1064                        kiblnd_rd_size(srcrd), kiblnd_rd_size(dstrd));
1065                 rc = -EMSGSIZE;
1066                 goto too_big;
1067         }
1068
1069         while (resid > 0) {
1070                 if (srcidx >= srcrd->rd_nfrags) {
1071                         CERROR("Src buffer exhausted: %d frags\n", srcidx);
1072                         rc = -EPROTO;
1073                         break;
1074                 }
1075
1076                 if (dstidx == dstrd->rd_nfrags) {
1077                         CERROR("Dst buffer exhausted: %d frags\n", dstidx);
1078                         rc = -EPROTO;
1079                         break;
1080                 }
1081
1082                 if (tx->tx_nwrq >= IBLND_MAX_RDMA_FRAGS) {
1083                         CERROR("RDMA has too many fragments for peer %s (%d), src idx/frags: %d/%d dst idx/frags: %d/%d\n",
1084                                libcfs_nid2str(conn->ibc_peer->ibp_nid),
1085                                IBLND_MAX_RDMA_FRAGS,
1086                                srcidx, srcrd->rd_nfrags,
1087                                dstidx, dstrd->rd_nfrags);
1088                         rc = -EMSGSIZE;
1089                         break;
1090                 }
1091
1092                 wrknob = min3(kiblnd_rd_frag_size(srcrd, srcidx),
1093                               kiblnd_rd_frag_size(dstrd, dstidx),
1094                               (__u32)resid);
1095
1096                 sge = &tx->tx_sge[tx->tx_nwrq];
1097                 sge->addr   = kiblnd_rd_frag_addr(srcrd, srcidx);
1098                 sge->lkey   = kiblnd_rd_frag_key(srcrd, srcidx);
1099                 sge->length = wrknob;
1100
1101                 wrq = &tx->tx_wrq[tx->tx_nwrq];
1102                 next = wrq + 1;
1103
1104                 wrq->wr.next       = &next->wr;
1105                 wrq->wr.wr_id      = kiblnd_ptr2wreqid(tx, IBLND_WID_RDMA);
1106                 wrq->wr.sg_list    = sge;
1107                 wrq->wr.num_sge    = 1;
1108                 wrq->wr.opcode     = IB_WR_RDMA_WRITE;
1109                 wrq->wr.send_flags = 0;
1110
1111                 wrq->remote_addr = kiblnd_rd_frag_addr(dstrd, dstidx);
1112                 wrq->rkey        = kiblnd_rd_frag_key(dstrd, dstidx);
1113
1114                 srcidx = kiblnd_rd_consume_frag(srcrd, srcidx, wrknob);
1115                 dstidx = kiblnd_rd_consume_frag(dstrd, dstidx, wrknob);
1116
1117                 resid -= wrknob;
1118
1119                 tx->tx_nwrq++;
1120                 wrq++;
1121                 sge++;
1122         }
1123 too_big:
1124         if (rc < 0)                          /* no RDMA if completing with failure */
1125                 tx->tx_nwrq = 0;
1126
1127         ibmsg->ibm_u.completion.ibcm_status = rc;
1128         ibmsg->ibm_u.completion.ibcm_cookie = dstcookie;
1129         kiblnd_init_tx_msg(conn->ibc_peer->ibp_ni, tx,
1130                            type, sizeof(struct kib_completion_msg));
1131
1132         return rc;
1133 }
1134
1135 static void
1136 kiblnd_queue_tx_locked(struct kib_tx *tx, struct kib_conn *conn)
1137 {
1138         struct list_head *q;
1139
1140         LASSERT(tx->tx_nwrq > 0);             /* work items set up */
1141         LASSERT(!tx->tx_queued);               /* not queued for sending already */
1142         LASSERT(conn->ibc_state >= IBLND_CONN_ESTABLISHED);
1143
1144         tx->tx_queued = 1;
1145         tx->tx_deadline = jiffies +
1146                           msecs_to_jiffies(*kiblnd_tunables.kib_timeout *
1147                                            MSEC_PER_SEC);
1148
1149         if (!tx->tx_conn) {
1150                 kiblnd_conn_addref(conn);
1151                 tx->tx_conn = conn;
1152                 LASSERT(tx->tx_msg->ibm_type != IBLND_MSG_PUT_DONE);
1153         } else {
1154                 /* PUT_DONE first attached to conn as a PUT_REQ */
1155                 LASSERT(tx->tx_conn == conn);
1156                 LASSERT(tx->tx_msg->ibm_type == IBLND_MSG_PUT_DONE);
1157         }
1158
1159         switch (tx->tx_msg->ibm_type) {
1160         default:
1161                 LBUG();
1162
1163         case IBLND_MSG_PUT_REQ:
1164         case IBLND_MSG_GET_REQ:
1165                 q = &conn->ibc_tx_queue_rsrvd;
1166                 break;
1167
1168         case IBLND_MSG_PUT_NAK:
1169         case IBLND_MSG_PUT_ACK:
1170         case IBLND_MSG_PUT_DONE:
1171         case IBLND_MSG_GET_DONE:
1172                 q = &conn->ibc_tx_queue_nocred;
1173                 break;
1174
1175         case IBLND_MSG_NOOP:
1176                 if (IBLND_OOB_CAPABLE(conn->ibc_version))
1177                         q = &conn->ibc_tx_queue_nocred;
1178                 else
1179                         q = &conn->ibc_tx_noops;
1180                 break;
1181
1182         case IBLND_MSG_IMMEDIATE:
1183                 q = &conn->ibc_tx_queue;
1184                 break;
1185         }
1186
1187         list_add_tail(&tx->tx_list, q);
1188 }
1189
1190 static void
1191 kiblnd_queue_tx(struct kib_tx *tx, struct kib_conn *conn)
1192 {
1193         spin_lock(&conn->ibc_lock);
1194         kiblnd_queue_tx_locked(tx, conn);
1195         kiblnd_check_sends_locked(conn);
1196         spin_unlock(&conn->ibc_lock);
1197 }
1198
1199 static int kiblnd_resolve_addr(struct rdma_cm_id *cmid,
1200                                struct sockaddr_in *srcaddr,
1201                                struct sockaddr_in *dstaddr,
1202                                int timeout_ms)
1203 {
1204         unsigned short port;
1205         int rc;
1206
1207         /* allow the port to be reused */
1208         rc = rdma_set_reuseaddr(cmid, 1);
1209         if (rc) {
1210                 CERROR("Unable to set reuse on cmid: %d\n", rc);
1211                 return rc;
1212         }
1213
1214         /* look for a free privileged port */
1215         for (port = PROT_SOCK - 1; port > 0; port--) {
1216                 srcaddr->sin_port = htons(port);
1217                 rc = rdma_resolve_addr(cmid,
1218                                        (struct sockaddr *)srcaddr,
1219                                        (struct sockaddr *)dstaddr,
1220                                        timeout_ms);
1221                 if (!rc) {
1222                         CDEBUG(D_NET, "bound to port %hu\n", port);
1223                         return 0;
1224                 } else if (rc == -EADDRINUSE || rc == -EADDRNOTAVAIL) {
1225                         CDEBUG(D_NET, "bind to port %hu failed: %d\n",
1226                                port, rc);
1227                 } else {
1228                         return rc;
1229                 }
1230         }
1231
1232         CERROR("Failed to bind to a free privileged port\n");
1233         return rc;
1234 }
1235
1236 static void
1237 kiblnd_connect_peer(struct kib_peer *peer)
1238 {
1239         struct rdma_cm_id *cmid;
1240         struct kib_dev *dev;
1241         struct kib_net *net = peer->ibp_ni->ni_data;
1242         struct sockaddr_in srcaddr;
1243         struct sockaddr_in dstaddr;
1244         int rc;
1245
1246         LASSERT(net);
1247         LASSERT(peer->ibp_connecting > 0);
1248         LASSERT(!peer->ibp_reconnecting);
1249
1250         cmid = kiblnd_rdma_create_id(kiblnd_cm_callback, peer, RDMA_PS_TCP,
1251                                      IB_QPT_RC);
1252
1253         if (IS_ERR(cmid)) {
1254                 CERROR("Can't create CMID for %s: %ld\n",
1255                        libcfs_nid2str(peer->ibp_nid), PTR_ERR(cmid));
1256                 rc = PTR_ERR(cmid);
1257                 goto failed;
1258         }
1259
1260         dev = net->ibn_dev;
1261         memset(&srcaddr, 0, sizeof(srcaddr));
1262         srcaddr.sin_family = AF_INET;
1263         srcaddr.sin_addr.s_addr = htonl(dev->ibd_ifip);
1264
1265         memset(&dstaddr, 0, sizeof(dstaddr));
1266         dstaddr.sin_family = AF_INET;
1267         dstaddr.sin_port = htons(*kiblnd_tunables.kib_service);
1268         dstaddr.sin_addr.s_addr = htonl(LNET_NIDADDR(peer->ibp_nid));
1269
1270         kiblnd_peer_addref(peer);              /* cmid's ref */
1271
1272         if (*kiblnd_tunables.kib_use_priv_port) {
1273                 rc = kiblnd_resolve_addr(cmid, &srcaddr, &dstaddr,
1274                                          *kiblnd_tunables.kib_timeout * 1000);
1275         } else {
1276                 rc = rdma_resolve_addr(cmid,
1277                                        (struct sockaddr *)&srcaddr,
1278                                        (struct sockaddr *)&dstaddr,
1279                                        *kiblnd_tunables.kib_timeout * 1000);
1280         }
1281         if (rc) {
1282                 /* Can't initiate address resolution:  */
1283                 CERROR("Can't resolve addr for %s: %d\n",
1284                        libcfs_nid2str(peer->ibp_nid), rc);
1285                 goto failed2;
1286         }
1287
1288         return;
1289
1290  failed2:
1291         kiblnd_peer_connect_failed(peer, 1, rc);
1292         kiblnd_peer_decref(peer);              /* cmid's ref */
1293         rdma_destroy_id(cmid);
1294         return;
1295  failed:
1296         kiblnd_peer_connect_failed(peer, 1, rc);
1297 }
1298
1299 bool
1300 kiblnd_reconnect_peer(struct kib_peer *peer)
1301 {
1302         rwlock_t *glock = &kiblnd_data.kib_global_lock;
1303         char *reason = NULL;
1304         struct list_head txs;
1305         unsigned long flags;
1306
1307         INIT_LIST_HEAD(&txs);
1308
1309         write_lock_irqsave(glock, flags);
1310         if (!peer->ibp_reconnecting) {
1311                 if (peer->ibp_accepting)
1312                         reason = "accepting";
1313                 else if (peer->ibp_connecting)
1314                         reason = "connecting";
1315                 else if (!list_empty(&peer->ibp_conns))
1316                         reason = "connected";
1317                 else /* connected then closed */
1318                         reason = "closed";
1319
1320                 goto no_reconnect;
1321         }
1322
1323         LASSERT(!peer->ibp_accepting && !peer->ibp_connecting &&
1324                 list_empty(&peer->ibp_conns));
1325         peer->ibp_reconnecting = 0;
1326
1327         if (!kiblnd_peer_active(peer)) {
1328                 list_splice_init(&peer->ibp_tx_queue, &txs);
1329                 reason = "unlinked";
1330                 goto no_reconnect;
1331         }
1332
1333         peer->ibp_connecting++;
1334         peer->ibp_reconnected++;
1335         write_unlock_irqrestore(glock, flags);
1336
1337         kiblnd_connect_peer(peer);
1338         return true;
1339
1340 no_reconnect:
1341         write_unlock_irqrestore(glock, flags);
1342
1343         CWARN("Abort reconnection of %s: %s\n",
1344               libcfs_nid2str(peer->ibp_nid), reason);
1345         kiblnd_txlist_done(peer->ibp_ni, &txs, -ECONNABORTED);
1346         return false;
1347 }
1348
1349 void
1350 kiblnd_launch_tx(struct lnet_ni *ni, struct kib_tx *tx, lnet_nid_t nid)
1351 {
1352         struct kib_peer *peer;
1353         struct kib_peer *peer2;
1354         struct kib_conn *conn;
1355         rwlock_t *g_lock = &kiblnd_data.kib_global_lock;
1356         unsigned long flags;
1357         int rc;
1358
1359         /*
1360          * If I get here, I've committed to send, so I complete the tx with
1361          * failure on any problems
1362          */
1363         LASSERT(!tx || !tx->tx_conn); /* only set when assigned a conn */
1364         LASSERT(!tx || tx->tx_nwrq > 0);     /* work items have been set up */
1365
1366         /*
1367          * First time, just use a read lock since I expect to find my peer
1368          * connected
1369          */
1370         read_lock_irqsave(g_lock, flags);
1371
1372         peer = kiblnd_find_peer_locked(nid);
1373         if (peer && !list_empty(&peer->ibp_conns)) {
1374                 /* Found a peer with an established connection */
1375                 conn = kiblnd_get_conn_locked(peer);
1376                 kiblnd_conn_addref(conn); /* 1 ref for me... */
1377
1378                 read_unlock_irqrestore(g_lock, flags);
1379
1380                 if (tx)
1381                         kiblnd_queue_tx(tx, conn);
1382                 kiblnd_conn_decref(conn); /* ...to here */
1383                 return;
1384         }
1385
1386         read_unlock(g_lock);
1387         /* Re-try with a write lock */
1388         write_lock(g_lock);
1389
1390         peer = kiblnd_find_peer_locked(nid);
1391         if (peer) {
1392                 if (list_empty(&peer->ibp_conns)) {
1393                         /* found a peer, but it's still connecting... */
1394                         LASSERT(kiblnd_peer_connecting(peer));
1395                         if (tx)
1396                                 list_add_tail(&tx->tx_list,
1397                                               &peer->ibp_tx_queue);
1398                         write_unlock_irqrestore(g_lock, flags);
1399                 } else {
1400                         conn = kiblnd_get_conn_locked(peer);
1401                         kiblnd_conn_addref(conn); /* 1 ref for me... */
1402
1403                         write_unlock_irqrestore(g_lock, flags);
1404
1405                         if (tx)
1406                                 kiblnd_queue_tx(tx, conn);
1407                         kiblnd_conn_decref(conn); /* ...to here */
1408                 }
1409                 return;
1410         }
1411
1412         write_unlock_irqrestore(g_lock, flags);
1413
1414         /* Allocate a peer ready to add to the peer table and retry */
1415         rc = kiblnd_create_peer(ni, &peer, nid);
1416         if (rc) {
1417                 CERROR("Can't create peer %s\n", libcfs_nid2str(nid));
1418                 if (tx) {
1419                         tx->tx_status = -EHOSTUNREACH;
1420                         tx->tx_waiting = 0;
1421                         kiblnd_tx_done(ni, tx);
1422                 }
1423                 return;
1424         }
1425
1426         write_lock_irqsave(g_lock, flags);
1427
1428         peer2 = kiblnd_find_peer_locked(nid);
1429         if (peer2) {
1430                 if (list_empty(&peer2->ibp_conns)) {
1431                         /* found a peer, but it's still connecting... */
1432                         LASSERT(kiblnd_peer_connecting(peer2));
1433                         if (tx)
1434                                 list_add_tail(&tx->tx_list,
1435                                               &peer2->ibp_tx_queue);
1436                         write_unlock_irqrestore(g_lock, flags);
1437                 } else {
1438                         conn = kiblnd_get_conn_locked(peer2);
1439                         kiblnd_conn_addref(conn); /* 1 ref for me... */
1440
1441                         write_unlock_irqrestore(g_lock, flags);
1442
1443                         if (tx)
1444                                 kiblnd_queue_tx(tx, conn);
1445                         kiblnd_conn_decref(conn); /* ...to here */
1446                 }
1447
1448                 kiblnd_peer_decref(peer);
1449                 return;
1450         }
1451
1452         /* Brand new peer */
1453         LASSERT(!peer->ibp_connecting);
1454         peer->ibp_connecting = 1;
1455
1456         /* always called with a ref on ni, which prevents ni being shutdown */
1457         LASSERT(!((struct kib_net *)ni->ni_data)->ibn_shutdown);
1458
1459         if (tx)
1460                 list_add_tail(&tx->tx_list, &peer->ibp_tx_queue);
1461
1462         kiblnd_peer_addref(peer);
1463         list_add_tail(&peer->ibp_list, kiblnd_nid2peerlist(nid));
1464
1465         write_unlock_irqrestore(g_lock, flags);
1466
1467         kiblnd_connect_peer(peer);
1468         kiblnd_peer_decref(peer);
1469 }
1470
1471 int
1472 kiblnd_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg)
1473 {
1474         struct lnet_hdr *hdr = &lntmsg->msg_hdr;
1475         int type = lntmsg->msg_type;
1476         struct lnet_process_id target = lntmsg->msg_target;
1477         int target_is_router = lntmsg->msg_target_is_router;
1478         int routing = lntmsg->msg_routing;
1479         unsigned int payload_niov = lntmsg->msg_niov;
1480         struct kvec *payload_iov = lntmsg->msg_iov;
1481         struct bio_vec *payload_kiov = lntmsg->msg_kiov;
1482         unsigned int payload_offset = lntmsg->msg_offset;
1483         unsigned int payload_nob = lntmsg->msg_len;
1484         struct iov_iter from;
1485         struct kib_msg *ibmsg;
1486         struct kib_rdma_desc  *rd;
1487         struct kib_tx *tx;
1488         int nob;
1489         int rc;
1490
1491         /* NB 'private' is different depending on what we're sending.... */
1492
1493         CDEBUG(D_NET, "sending %d bytes in %d frags to %s\n",
1494                payload_nob, payload_niov, libcfs_id2str(target));
1495
1496         LASSERT(!payload_nob || payload_niov > 0);
1497         LASSERT(payload_niov <= LNET_MAX_IOV);
1498
1499         /* Thread context */
1500         LASSERT(!in_interrupt());
1501         /* payload is either all vaddrs or all pages */
1502         LASSERT(!(payload_kiov && payload_iov));
1503
1504         if (payload_kiov)
1505                 iov_iter_bvec(&from, ITER_BVEC | WRITE,
1506                               payload_kiov, payload_niov,
1507                               payload_nob + payload_offset);
1508         else
1509                 iov_iter_kvec(&from, ITER_KVEC | WRITE,
1510                               payload_iov, payload_niov,
1511                               payload_nob + payload_offset);
1512
1513         iov_iter_advance(&from, payload_offset);
1514
1515         switch (type) {
1516         default:
1517                 LBUG();
1518                 return -EIO;
1519
1520         case LNET_MSG_ACK:
1521                 LASSERT(!payload_nob);
1522                 break;
1523
1524         case LNET_MSG_GET:
1525                 if (routing || target_is_router)
1526                         break;            /* send IMMEDIATE */
1527
1528                 /* is the REPLY message too small for RDMA? */
1529                 nob = offsetof(struct kib_msg, ibm_u.immediate.ibim_payload[lntmsg->msg_md->md_length]);
1530                 if (nob <= IBLND_MSG_SIZE)
1531                         break;            /* send IMMEDIATE */
1532
1533                 tx = kiblnd_get_idle_tx(ni, target.nid);
1534                 if (!tx) {
1535                         CERROR("Can't allocate txd for GET to %s\n",
1536                                libcfs_nid2str(target.nid));
1537                         return -ENOMEM;
1538                 }
1539
1540                 ibmsg = tx->tx_msg;
1541                 rd = &ibmsg->ibm_u.get.ibgm_rd;
1542                 if (!(lntmsg->msg_md->md_options & LNET_MD_KIOV))
1543                         rc = kiblnd_setup_rd_iov(ni, tx, rd,
1544                                                  lntmsg->msg_md->md_niov,
1545                                                  lntmsg->msg_md->md_iov.iov,
1546                                                  0, lntmsg->msg_md->md_length);
1547                 else
1548                         rc = kiblnd_setup_rd_kiov(ni, tx, rd,
1549                                                   lntmsg->msg_md->md_niov,
1550                                                   lntmsg->msg_md->md_iov.kiov,
1551                                                   0, lntmsg->msg_md->md_length);
1552                 if (rc) {
1553                         CERROR("Can't setup GET sink for %s: %d\n",
1554                                libcfs_nid2str(target.nid), rc);
1555                         kiblnd_tx_done(ni, tx);
1556                         return -EIO;
1557                 }
1558
1559                 nob = offsetof(struct kib_get_msg, ibgm_rd.rd_frags[rd->rd_nfrags]);
1560                 ibmsg->ibm_u.get.ibgm_cookie = tx->tx_cookie;
1561                 ibmsg->ibm_u.get.ibgm_hdr = *hdr;
1562
1563                 kiblnd_init_tx_msg(ni, tx, IBLND_MSG_GET_REQ, nob);
1564
1565                 tx->tx_lntmsg[1] = lnet_create_reply_msg(ni, lntmsg);
1566                 if (!tx->tx_lntmsg[1]) {
1567                         CERROR("Can't create reply for GET -> %s\n",
1568                                libcfs_nid2str(target.nid));
1569                         kiblnd_tx_done(ni, tx);
1570                         return -EIO;
1571                 }
1572
1573                 tx->tx_lntmsg[0] = lntmsg;      /* finalise lntmsg[0,1] on completion */
1574                 tx->tx_waiting = 1;          /* waiting for GET_DONE */
1575                 kiblnd_launch_tx(ni, tx, target.nid);
1576                 return 0;
1577
1578         case LNET_MSG_REPLY:
1579         case LNET_MSG_PUT:
1580                 /* Is the payload small enough not to need RDMA? */
1581                 nob = offsetof(struct kib_msg, ibm_u.immediate.ibim_payload[payload_nob]);
1582                 if (nob <= IBLND_MSG_SIZE)
1583                         break;            /* send IMMEDIATE */
1584
1585                 tx = kiblnd_get_idle_tx(ni, target.nid);
1586                 if (!tx) {
1587                         CERROR("Can't allocate %s txd for %s\n",
1588                                type == LNET_MSG_PUT ? "PUT" : "REPLY",
1589                                libcfs_nid2str(target.nid));
1590                         return -ENOMEM;
1591                 }
1592
1593                 if (!payload_kiov)
1594                         rc = kiblnd_setup_rd_iov(ni, tx, tx->tx_rd,
1595                                                  payload_niov, payload_iov,
1596                                                  payload_offset, payload_nob);
1597                 else
1598                         rc = kiblnd_setup_rd_kiov(ni, tx, tx->tx_rd,
1599                                                   payload_niov, payload_kiov,
1600                                                   payload_offset, payload_nob);
1601                 if (rc) {
1602                         CERROR("Can't setup PUT src for %s: %d\n",
1603                                libcfs_nid2str(target.nid), rc);
1604                         kiblnd_tx_done(ni, tx);
1605                         return -EIO;
1606                 }
1607
1608                 ibmsg = tx->tx_msg;
1609                 ibmsg->ibm_u.putreq.ibprm_hdr = *hdr;
1610                 ibmsg->ibm_u.putreq.ibprm_cookie = tx->tx_cookie;
1611                 kiblnd_init_tx_msg(ni, tx, IBLND_MSG_PUT_REQ, sizeof(struct kib_putreq_msg));
1612
1613                 tx->tx_lntmsg[0] = lntmsg;      /* finalise lntmsg on completion */
1614                 tx->tx_waiting = 1;          /* waiting for PUT_{ACK,NAK} */
1615                 kiblnd_launch_tx(ni, tx, target.nid);
1616                 return 0;
1617         }
1618
1619         /* send IMMEDIATE */
1620
1621         LASSERT(offsetof(struct kib_msg, ibm_u.immediate.ibim_payload[payload_nob])
1622                  <= IBLND_MSG_SIZE);
1623
1624         tx = kiblnd_get_idle_tx(ni, target.nid);
1625         if (!tx) {
1626                 CERROR("Can't send %d to %s: tx descs exhausted\n",
1627                        type, libcfs_nid2str(target.nid));
1628                 return -ENOMEM;
1629         }
1630
1631         ibmsg = tx->tx_msg;
1632         ibmsg->ibm_u.immediate.ibim_hdr = *hdr;
1633
1634         rc = copy_from_iter(&ibmsg->ibm_u.immediate.ibim_payload, payload_nob,
1635                             &from);
1636         if (rc != payload_nob) {
1637                 kiblnd_pool_free_node(&tx->tx_pool->tpo_pool, &tx->tx_list);
1638                 return -EFAULT;
1639         }
1640
1641         nob = offsetof(struct kib_immediate_msg, ibim_payload[payload_nob]);
1642         kiblnd_init_tx_msg(ni, tx, IBLND_MSG_IMMEDIATE, nob);
1643
1644         tx->tx_lntmsg[0] = lntmsg;            /* finalise lntmsg on completion */
1645         kiblnd_launch_tx(ni, tx, target.nid);
1646         return 0;
1647 }
1648
1649 static void
1650 kiblnd_reply(struct lnet_ni *ni, struct kib_rx *rx, struct lnet_msg *lntmsg)
1651 {
1652         struct lnet_process_id target = lntmsg->msg_target;
1653         unsigned int niov = lntmsg->msg_niov;
1654         struct kvec *iov = lntmsg->msg_iov;
1655         struct bio_vec *kiov = lntmsg->msg_kiov;
1656         unsigned int offset = lntmsg->msg_offset;
1657         unsigned int nob = lntmsg->msg_len;
1658         struct kib_tx *tx;
1659         int rc;
1660
1661         tx = kiblnd_get_idle_tx(ni, rx->rx_conn->ibc_peer->ibp_nid);
1662         if (!tx) {
1663                 CERROR("Can't get tx for REPLY to %s\n",
1664                        libcfs_nid2str(target.nid));
1665                 goto failed_0;
1666         }
1667
1668         if (!nob)
1669                 rc = 0;
1670         else if (!kiov)
1671                 rc = kiblnd_setup_rd_iov(ni, tx, tx->tx_rd,
1672                                          niov, iov, offset, nob);
1673         else
1674                 rc = kiblnd_setup_rd_kiov(ni, tx, tx->tx_rd,
1675                                           niov, kiov, offset, nob);
1676
1677         if (rc) {
1678                 CERROR("Can't setup GET src for %s: %d\n",
1679                        libcfs_nid2str(target.nid), rc);
1680                 goto failed_1;
1681         }
1682
1683         rc = kiblnd_init_rdma(rx->rx_conn, tx,
1684                               IBLND_MSG_GET_DONE, nob,
1685                               &rx->rx_msg->ibm_u.get.ibgm_rd,
1686                               rx->rx_msg->ibm_u.get.ibgm_cookie);
1687         if (rc < 0) {
1688                 CERROR("Can't setup rdma for GET from %s: %d\n",
1689                        libcfs_nid2str(target.nid), rc);
1690                 goto failed_1;
1691         }
1692
1693         if (!nob) {
1694                 /* No RDMA: local completion may happen now! */
1695                 lnet_finalize(ni, lntmsg, 0);
1696         } else {
1697                 /* RDMA: lnet_finalize(lntmsg) when it completes */
1698                 tx->tx_lntmsg[0] = lntmsg;
1699         }
1700
1701         kiblnd_queue_tx(tx, rx->rx_conn);
1702         return;
1703
1704  failed_1:
1705         kiblnd_tx_done(ni, tx);
1706  failed_0:
1707         lnet_finalize(ni, lntmsg, -EIO);
1708 }
1709
1710 int
1711 kiblnd_recv(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg,
1712             int delayed, struct iov_iter *to, unsigned int rlen)
1713 {
1714         struct kib_rx *rx = private;
1715         struct kib_msg *rxmsg = rx->rx_msg;
1716         struct kib_conn *conn = rx->rx_conn;
1717         struct kib_tx *tx;
1718         int nob;
1719         int post_credit = IBLND_POSTRX_PEER_CREDIT;
1720         int rc = 0;
1721
1722         LASSERT(iov_iter_count(to) <= rlen);
1723         LASSERT(!in_interrupt());
1724         /* Either all pages or all vaddrs */
1725
1726         switch (rxmsg->ibm_type) {
1727         default:
1728                 LBUG();
1729
1730         case IBLND_MSG_IMMEDIATE:
1731                 nob = offsetof(struct kib_msg, ibm_u.immediate.ibim_payload[rlen]);
1732                 if (nob > rx->rx_nob) {
1733                         CERROR("Immediate message from %s too big: %d(%d)\n",
1734                                libcfs_nid2str(rxmsg->ibm_u.immediate.ibim_hdr.src_nid),
1735                                nob, rx->rx_nob);
1736                         rc = -EPROTO;
1737                         break;
1738                 }
1739
1740                 rc = copy_to_iter(&rxmsg->ibm_u.immediate.ibim_payload, rlen,
1741                                   to);
1742                 if (rc != rlen) {
1743                         rc = -EFAULT;
1744                         break;
1745                 }
1746
1747                 rc = 0;
1748                 lnet_finalize(ni, lntmsg, 0);
1749                 break;
1750
1751         case IBLND_MSG_PUT_REQ: {
1752                 struct kib_msg  *txmsg;
1753                 struct kib_rdma_desc *rd;
1754
1755                 if (!iov_iter_count(to)) {
1756                         lnet_finalize(ni, lntmsg, 0);
1757                         kiblnd_send_completion(rx->rx_conn, IBLND_MSG_PUT_NAK, 0,
1758                                                rxmsg->ibm_u.putreq.ibprm_cookie);
1759                         break;
1760                 }
1761
1762                 tx = kiblnd_get_idle_tx(ni, conn->ibc_peer->ibp_nid);
1763                 if (!tx) {
1764                         CERROR("Can't allocate tx for %s\n",
1765                                libcfs_nid2str(conn->ibc_peer->ibp_nid));
1766                         /* Not replying will break the connection */
1767                         rc = -ENOMEM;
1768                         break;
1769                 }
1770
1771                 txmsg = tx->tx_msg;
1772                 rd = &txmsg->ibm_u.putack.ibpam_rd;
1773                 if (!(to->type & ITER_BVEC))
1774                         rc = kiblnd_setup_rd_iov(ni, tx, rd,
1775                                                  to->nr_segs, to->kvec,
1776                                                  to->iov_offset,
1777                                                  iov_iter_count(to));
1778                 else
1779                         rc = kiblnd_setup_rd_kiov(ni, tx, rd,
1780                                                   to->nr_segs, to->bvec,
1781                                                   to->iov_offset,
1782                                                   iov_iter_count(to));
1783                 if (rc) {
1784                         CERROR("Can't setup PUT sink for %s: %d\n",
1785                                libcfs_nid2str(conn->ibc_peer->ibp_nid), rc);
1786                         kiblnd_tx_done(ni, tx);
1787                         /* tell peer it's over */
1788                         kiblnd_send_completion(rx->rx_conn, IBLND_MSG_PUT_NAK, rc,
1789                                                rxmsg->ibm_u.putreq.ibprm_cookie);
1790                         break;
1791                 }
1792
1793                 nob = offsetof(struct kib_putack_msg, ibpam_rd.rd_frags[rd->rd_nfrags]);
1794                 txmsg->ibm_u.putack.ibpam_src_cookie = rxmsg->ibm_u.putreq.ibprm_cookie;
1795                 txmsg->ibm_u.putack.ibpam_dst_cookie = tx->tx_cookie;
1796
1797                 kiblnd_init_tx_msg(ni, tx, IBLND_MSG_PUT_ACK, nob);
1798
1799                 tx->tx_lntmsg[0] = lntmsg;      /* finalise lntmsg on completion */
1800                 tx->tx_waiting = 1;          /* waiting for PUT_DONE */
1801                 kiblnd_queue_tx(tx, conn);
1802
1803                 /* reposted buffer reserved for PUT_DONE */
1804                 post_credit = IBLND_POSTRX_NO_CREDIT;
1805                 break;
1806                 }
1807
1808         case IBLND_MSG_GET_REQ:
1809                 if (lntmsg) {
1810                         /* Optimized GET; RDMA lntmsg's payload */
1811                         kiblnd_reply(ni, rx, lntmsg);
1812                 } else {
1813                         /* GET didn't match anything */
1814                         kiblnd_send_completion(rx->rx_conn, IBLND_MSG_GET_DONE,
1815                                                -ENODATA,
1816                                                rxmsg->ibm_u.get.ibgm_cookie);
1817                 }
1818                 break;
1819         }
1820
1821         kiblnd_post_rx(rx, post_credit);
1822         return rc;
1823 }
1824
1825 int
1826 kiblnd_thread_start(int (*fn)(void *arg), void *arg, char *name)
1827 {
1828         struct task_struct *task = kthread_run(fn, arg, "%s", name);
1829
1830         if (IS_ERR(task))
1831                 return PTR_ERR(task);
1832
1833         atomic_inc(&kiblnd_data.kib_nthreads);
1834         return 0;
1835 }
1836
1837 static void
1838 kiblnd_thread_fini(void)
1839 {
1840         atomic_dec(&kiblnd_data.kib_nthreads);
1841 }
1842
1843 static void
1844 kiblnd_peer_alive(struct kib_peer *peer)
1845 {
1846         /* This is racy, but everyone's only writing cfs_time_current() */
1847         peer->ibp_last_alive = cfs_time_current();
1848         mb();
1849 }
1850
1851 static void
1852 kiblnd_peer_notify(struct kib_peer *peer)
1853 {
1854         int error = 0;
1855         unsigned long last_alive = 0;
1856         unsigned long flags;
1857
1858         read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
1859
1860         if (kiblnd_peer_idle(peer) && peer->ibp_error) {
1861                 error = peer->ibp_error;
1862                 peer->ibp_error = 0;
1863
1864                 last_alive = peer->ibp_last_alive;
1865         }
1866
1867         read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
1868
1869         if (error)
1870                 lnet_notify(peer->ibp_ni,
1871                             peer->ibp_nid, 0, last_alive);
1872 }
1873
1874 void
1875 kiblnd_close_conn_locked(struct kib_conn *conn, int error)
1876 {
1877         /*
1878          * This just does the immediate housekeeping. 'error' is zero for a
1879          * normal shutdown which can happen only after the connection has been
1880          * established.  If the connection is established, schedule the
1881          * connection to be finished off by the connd. Otherwise the connd is
1882          * already dealing with it (either to set it up or tear it down).
1883          * Caller holds kib_global_lock exclusively in irq context
1884          */
1885         struct kib_peer *peer = conn->ibc_peer;
1886         struct kib_dev *dev;
1887         unsigned long flags;
1888
1889         LASSERT(error || conn->ibc_state >= IBLND_CONN_ESTABLISHED);
1890
1891         if (error && !conn->ibc_comms_error)
1892                 conn->ibc_comms_error = error;
1893
1894         if (conn->ibc_state != IBLND_CONN_ESTABLISHED)
1895                 return; /* already being handled  */
1896
1897         if (!error &&
1898             list_empty(&conn->ibc_tx_noops) &&
1899             list_empty(&conn->ibc_tx_queue) &&
1900             list_empty(&conn->ibc_tx_queue_rsrvd) &&
1901             list_empty(&conn->ibc_tx_queue_nocred) &&
1902             list_empty(&conn->ibc_active_txs)) {
1903                 CDEBUG(D_NET, "closing conn to %s\n",
1904                        libcfs_nid2str(peer->ibp_nid));
1905         } else {
1906                 CNETERR("Closing conn to %s: error %d%s%s%s%s%s\n",
1907                         libcfs_nid2str(peer->ibp_nid), error,
1908                         list_empty(&conn->ibc_tx_queue) ? "" : "(sending)",
1909                         list_empty(&conn->ibc_tx_noops) ? "" : "(sending_noops)",
1910                         list_empty(&conn->ibc_tx_queue_rsrvd) ? "" : "(sending_rsrvd)",
1911                         list_empty(&conn->ibc_tx_queue_nocred) ? "" : "(sending_nocred)",
1912                         list_empty(&conn->ibc_active_txs) ? "" : "(waiting)");
1913         }
1914
1915         dev = ((struct kib_net *)peer->ibp_ni->ni_data)->ibn_dev;
1916         list_del(&conn->ibc_list);
1917         /* connd (see below) takes over ibc_list's ref */
1918
1919         if (list_empty(&peer->ibp_conns) &&    /* no more conns */
1920             kiblnd_peer_active(peer)) {  /* still in peer table */
1921                 kiblnd_unlink_peer_locked(peer);
1922
1923                 /* set/clear error on last conn */
1924                 peer->ibp_error = conn->ibc_comms_error;
1925         }
1926
1927         kiblnd_set_conn_state(conn, IBLND_CONN_CLOSING);
1928
1929         if (error &&
1930             kiblnd_dev_can_failover(dev)) {
1931                 list_add_tail(&dev->ibd_fail_list,
1932                               &kiblnd_data.kib_failed_devs);
1933                 wake_up(&kiblnd_data.kib_failover_waitq);
1934         }
1935
1936         spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags);
1937
1938         list_add_tail(&conn->ibc_list, &kiblnd_data.kib_connd_conns);
1939         wake_up(&kiblnd_data.kib_connd_waitq);
1940
1941         spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags);
1942 }
1943
1944 void
1945 kiblnd_close_conn(struct kib_conn *conn, int error)
1946 {
1947         unsigned long flags;
1948
1949         write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
1950
1951         kiblnd_close_conn_locked(conn, error);
1952
1953         write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
1954 }
1955
1956 static void
1957 kiblnd_handle_early_rxs(struct kib_conn *conn)
1958 {
1959         unsigned long flags;
1960         struct kib_rx *rx;
1961         struct kib_rx *tmp;
1962
1963         LASSERT(!in_interrupt());
1964         LASSERT(conn->ibc_state >= IBLND_CONN_ESTABLISHED);
1965
1966         write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
1967         list_for_each_entry_safe(rx, tmp, &conn->ibc_early_rxs, rx_list) {
1968                 list_del(&rx->rx_list);
1969                 write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
1970
1971                 kiblnd_handle_rx(rx);
1972
1973                 write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
1974         }
1975         write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
1976 }
1977
1978 static void
1979 kiblnd_abort_txs(struct kib_conn *conn, struct list_head *txs)
1980 {
1981         LIST_HEAD(zombies);
1982         struct list_head *tmp;
1983         struct list_head *nxt;
1984         struct kib_tx *tx;
1985
1986         spin_lock(&conn->ibc_lock);
1987
1988         list_for_each_safe(tmp, nxt, txs) {
1989                 tx = list_entry(tmp, struct kib_tx, tx_list);
1990
1991                 if (txs == &conn->ibc_active_txs) {
1992                         LASSERT(!tx->tx_queued);
1993                         LASSERT(tx->tx_waiting || tx->tx_sending);
1994                 } else {
1995                         LASSERT(tx->tx_queued);
1996                 }
1997
1998                 tx->tx_status = -ECONNABORTED;
1999                 tx->tx_waiting = 0;
2000
2001                 if (!tx->tx_sending) {
2002                         tx->tx_queued = 0;
2003                         list_del(&tx->tx_list);
2004                         list_add(&tx->tx_list, &zombies);
2005                 }
2006         }
2007
2008         spin_unlock(&conn->ibc_lock);
2009
2010         kiblnd_txlist_done(conn->ibc_peer->ibp_ni, &zombies, -ECONNABORTED);
2011 }
2012
2013 static void
2014 kiblnd_finalise_conn(struct kib_conn *conn)
2015 {
2016         LASSERT(!in_interrupt());
2017         LASSERT(conn->ibc_state > IBLND_CONN_INIT);
2018
2019         kiblnd_set_conn_state(conn, IBLND_CONN_DISCONNECTED);
2020
2021         /*
2022          * abort_receives moves QP state to IB_QPS_ERR.  This is only required
2023          * for connections that didn't get as far as being connected, because
2024          * rdma_disconnect() does this for free.
2025          */
2026         kiblnd_abort_receives(conn);
2027
2028         /*
2029          * Complete all tx descs not waiting for sends to complete.
2030          * NB we should be safe from RDMA now that the QP has changed state
2031          */
2032         kiblnd_abort_txs(conn, &conn->ibc_tx_noops);
2033         kiblnd_abort_txs(conn, &conn->ibc_tx_queue);
2034         kiblnd_abort_txs(conn, &conn->ibc_tx_queue_rsrvd);
2035         kiblnd_abort_txs(conn, &conn->ibc_tx_queue_nocred);
2036         kiblnd_abort_txs(conn, &conn->ibc_active_txs);
2037
2038         kiblnd_handle_early_rxs(conn);
2039 }
2040
2041 static void
2042 kiblnd_peer_connect_failed(struct kib_peer *peer, int active, int error)
2043 {
2044         LIST_HEAD(zombies);
2045         unsigned long flags;
2046
2047         LASSERT(error);
2048         LASSERT(!in_interrupt());
2049
2050         write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
2051
2052         if (active) {
2053                 LASSERT(peer->ibp_connecting > 0);
2054                 peer->ibp_connecting--;
2055         } else {
2056                 LASSERT(peer->ibp_accepting > 0);
2057                 peer->ibp_accepting--;
2058         }
2059
2060         if (kiblnd_peer_connecting(peer)) {
2061                 /* another connection attempt under way... */
2062                 write_unlock_irqrestore(&kiblnd_data.kib_global_lock,
2063                                         flags);
2064                 return;
2065         }
2066
2067         peer->ibp_reconnected = 0;
2068         if (list_empty(&peer->ibp_conns)) {
2069                 /* Take peer's blocked transmits to complete with error */
2070                 list_add(&zombies, &peer->ibp_tx_queue);
2071                 list_del_init(&peer->ibp_tx_queue);
2072
2073                 if (kiblnd_peer_active(peer))
2074                         kiblnd_unlink_peer_locked(peer);
2075
2076                 peer->ibp_error = error;
2077         } else {
2078                 /* Can't have blocked transmits if there are connections */
2079                 LASSERT(list_empty(&peer->ibp_tx_queue));
2080         }
2081
2082         write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
2083
2084         kiblnd_peer_notify(peer);
2085
2086         if (list_empty(&zombies))
2087                 return;
2088
2089         CNETERR("Deleting messages for %s: connection failed\n",
2090                 libcfs_nid2str(peer->ibp_nid));
2091
2092         kiblnd_txlist_done(peer->ibp_ni, &zombies, -EHOSTUNREACH);
2093 }
2094
2095 static void
2096 kiblnd_connreq_done(struct kib_conn *conn, int status)
2097 {
2098         struct kib_peer *peer = conn->ibc_peer;
2099         struct kib_tx *tx;
2100         struct kib_tx *tmp;
2101         struct list_head txs;
2102         unsigned long flags;
2103         int active;
2104
2105         active = (conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT);
2106
2107         CDEBUG(D_NET, "%s: active(%d), version(%x), status(%d)\n",
2108                libcfs_nid2str(peer->ibp_nid), active,
2109                conn->ibc_version, status);
2110
2111         LASSERT(!in_interrupt());
2112         LASSERT((conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT &&
2113                  peer->ibp_connecting > 0) ||
2114                  (conn->ibc_state == IBLND_CONN_PASSIVE_WAIT &&
2115                  peer->ibp_accepting > 0));
2116
2117         LIBCFS_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars));
2118         conn->ibc_connvars = NULL;
2119
2120         if (status) {
2121                 /* failed to establish connection */
2122                 kiblnd_peer_connect_failed(peer, active, status);
2123                 kiblnd_finalise_conn(conn);
2124                 return;
2125         }
2126
2127         /* connection established */
2128         write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
2129
2130         conn->ibc_last_send = jiffies;
2131         kiblnd_set_conn_state(conn, IBLND_CONN_ESTABLISHED);
2132         kiblnd_peer_alive(peer);
2133
2134         /*
2135          * Add conn to peer's list and nuke any dangling conns from a different
2136          * peer instance...
2137          */
2138         kiblnd_conn_addref(conn);              /* +1 ref for ibc_list */
2139         list_add(&conn->ibc_list, &peer->ibp_conns);
2140         peer->ibp_reconnected = 0;
2141         if (active)
2142                 peer->ibp_connecting--;
2143         else
2144                 peer->ibp_accepting--;
2145
2146         if (!peer->ibp_version) {
2147                 peer->ibp_version     = conn->ibc_version;
2148                 peer->ibp_incarnation = conn->ibc_incarnation;
2149         }
2150
2151         if (peer->ibp_version     != conn->ibc_version ||
2152             peer->ibp_incarnation != conn->ibc_incarnation) {
2153                 kiblnd_close_stale_conns_locked(peer, conn->ibc_version,
2154                                                 conn->ibc_incarnation);
2155                 peer->ibp_version     = conn->ibc_version;
2156                 peer->ibp_incarnation = conn->ibc_incarnation;
2157         }
2158
2159         /* grab pending txs while I have the lock */
2160         list_add(&txs, &peer->ibp_tx_queue);
2161         list_del_init(&peer->ibp_tx_queue);
2162
2163         if (!kiblnd_peer_active(peer) ||        /* peer has been deleted */
2164             conn->ibc_comms_error) {       /* error has happened already */
2165                 struct lnet_ni *ni = peer->ibp_ni;
2166
2167                 /* start to shut down connection */
2168                 kiblnd_close_conn_locked(conn, -ECONNABORTED);
2169                 write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
2170
2171                 kiblnd_txlist_done(ni, &txs, -ECONNABORTED);
2172
2173                 return;
2174         }
2175
2176         /*
2177          * +1 ref for myself, this connection is visible to other threads
2178          * now, refcount of peer:ibp_conns can be released by connection
2179          * close from either a different thread, or the calling of
2180          * kiblnd_check_sends_locked() below. See bz21911 for details.
2181          */
2182         kiblnd_conn_addref(conn);
2183         write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
2184
2185         /* Schedule blocked txs */
2186         spin_lock(&conn->ibc_lock);
2187         list_for_each_entry_safe(tx, tmp, &txs, tx_list) {
2188                 list_del(&tx->tx_list);
2189
2190                 kiblnd_queue_tx_locked(tx, conn);
2191         }
2192         kiblnd_check_sends_locked(conn);
2193         spin_unlock(&conn->ibc_lock);
2194
2195         /* schedule blocked rxs */
2196         kiblnd_handle_early_rxs(conn);
2197
2198         kiblnd_conn_decref(conn);
2199 }
2200
2201 static void
2202 kiblnd_reject(struct rdma_cm_id *cmid, struct kib_rej *rej)
2203 {
2204         int rc;
2205
2206         rc = rdma_reject(cmid, rej, sizeof(*rej));
2207
2208         if (rc)
2209                 CWARN("Error %d sending reject\n", rc);
2210 }
2211
2212 static int
2213 kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob)
2214 {
2215         rwlock_t *g_lock = &kiblnd_data.kib_global_lock;
2216         struct kib_msg *reqmsg = priv;
2217         struct kib_msg *ackmsg;
2218         struct kib_dev *ibdev;
2219         struct kib_peer *peer;
2220         struct kib_peer *peer2;
2221         struct kib_conn *conn;
2222         struct lnet_ni *ni  = NULL;
2223         struct kib_net *net = NULL;
2224         lnet_nid_t nid;
2225         struct rdma_conn_param cp;
2226         struct kib_rej rej;
2227         int version = IBLND_MSG_VERSION;
2228         unsigned long flags;
2229         int max_frags;
2230         int rc;
2231         struct sockaddr_in *peer_addr;
2232
2233         LASSERT(!in_interrupt());
2234
2235         /* cmid inherits 'context' from the corresponding listener id */
2236         ibdev = (struct kib_dev *)cmid->context;
2237         LASSERT(ibdev);
2238
2239         memset(&rej, 0, sizeof(rej));
2240         rej.ibr_magic = IBLND_MSG_MAGIC;
2241         rej.ibr_why = IBLND_REJECT_FATAL;
2242         rej.ibr_cp.ibcp_max_msg_size = IBLND_MSG_SIZE;
2243
2244         peer_addr = (struct sockaddr_in *)&cmid->route.addr.dst_addr;
2245         if (*kiblnd_tunables.kib_require_priv_port &&
2246             ntohs(peer_addr->sin_port) >= PROT_SOCK) {
2247                 __u32 ip = ntohl(peer_addr->sin_addr.s_addr);
2248
2249                 CERROR("Peer's port (%pI4h:%hu) is not privileged\n",
2250                        &ip, ntohs(peer_addr->sin_port));
2251                 goto failed;
2252         }
2253
2254         if (priv_nob < offsetof(struct kib_msg, ibm_type)) {
2255                 CERROR("Short connection request\n");
2256                 goto failed;
2257         }
2258
2259         /*
2260          * Future protocol version compatibility support!  If the
2261          * o2iblnd-specific protocol changes, or when LNET unifies
2262          * protocols over all LNDs, the initial connection will
2263          * negotiate a protocol version.  I trap this here to avoid
2264          * console errors; the reject tells the peer which protocol I
2265          * speak.
2266          */
2267         if (reqmsg->ibm_magic == LNET_PROTO_MAGIC ||
2268             reqmsg->ibm_magic == __swab32(LNET_PROTO_MAGIC))
2269                 goto failed;
2270         if (reqmsg->ibm_magic == IBLND_MSG_MAGIC &&
2271             reqmsg->ibm_version != IBLND_MSG_VERSION &&
2272             reqmsg->ibm_version != IBLND_MSG_VERSION_1)
2273                 goto failed;
2274         if (reqmsg->ibm_magic == __swab32(IBLND_MSG_MAGIC) &&
2275             reqmsg->ibm_version != __swab16(IBLND_MSG_VERSION) &&
2276             reqmsg->ibm_version != __swab16(IBLND_MSG_VERSION_1))
2277                 goto failed;
2278
2279         rc = kiblnd_unpack_msg(reqmsg, priv_nob);
2280         if (rc) {
2281                 CERROR("Can't parse connection request: %d\n", rc);
2282                 goto failed;
2283         }
2284
2285         nid = reqmsg->ibm_srcnid;
2286         ni = lnet_net2ni(LNET_NIDNET(reqmsg->ibm_dstnid));
2287
2288         if (ni) {
2289                 net = (struct kib_net *)ni->ni_data;
2290                 rej.ibr_incarnation = net->ibn_incarnation;
2291         }
2292
2293         if (!ni ||                       /* no matching net */
2294             ni->ni_nid != reqmsg->ibm_dstnid ||   /* right NET, wrong NID! */
2295             net->ibn_dev != ibdev) {          /* wrong device */
2296                 CERROR("Can't accept conn from %s on %s (%s:%d:%pI4h): bad dst nid %s\n",
2297                        libcfs_nid2str(nid),
2298                        !ni ? "NA" : libcfs_nid2str(ni->ni_nid),
2299                        ibdev->ibd_ifname, ibdev->ibd_nnets,
2300                        &ibdev->ibd_ifip,
2301                        libcfs_nid2str(reqmsg->ibm_dstnid));
2302
2303                 goto failed;
2304         }
2305
2306        /* check time stamp as soon as possible */
2307         if (reqmsg->ibm_dststamp &&
2308             reqmsg->ibm_dststamp != net->ibn_incarnation) {
2309                 CWARN("Stale connection request\n");
2310                 rej.ibr_why = IBLND_REJECT_CONN_STALE;
2311                 goto failed;
2312         }
2313
2314         /* I can accept peer's version */
2315         version = reqmsg->ibm_version;
2316
2317         if (reqmsg->ibm_type != IBLND_MSG_CONNREQ) {
2318                 CERROR("Unexpected connreq msg type: %x from %s\n",
2319                        reqmsg->ibm_type, libcfs_nid2str(nid));
2320                 goto failed;
2321         }
2322
2323         if (reqmsg->ibm_u.connparams.ibcp_queue_depth >
2324             kiblnd_msg_queue_size(version, ni)) {
2325                 CERROR("Can't accept conn from %s, queue depth too large: %d (<=%d wanted)\n",
2326                        libcfs_nid2str(nid),
2327                        reqmsg->ibm_u.connparams.ibcp_queue_depth,
2328                        kiblnd_msg_queue_size(version, ni));
2329
2330                 if (version == IBLND_MSG_VERSION)
2331                         rej.ibr_why = IBLND_REJECT_MSG_QUEUE_SIZE;
2332
2333                 goto failed;
2334         }
2335
2336         max_frags = reqmsg->ibm_u.connparams.ibcp_max_frags >> IBLND_FRAG_SHIFT;
2337         if (max_frags > kiblnd_rdma_frags(version, ni)) {
2338                 CWARN("Can't accept conn from %s (version %x): max message size %d is too large (%d wanted)\n",
2339                       libcfs_nid2str(nid), version, max_frags,
2340                       kiblnd_rdma_frags(version, ni));
2341
2342                 if (version >= IBLND_MSG_VERSION)
2343                         rej.ibr_why = IBLND_REJECT_RDMA_FRAGS;
2344
2345                 goto failed;
2346         } else if (max_frags < kiblnd_rdma_frags(version, ni) &&
2347                    !net->ibn_fmr_ps) {
2348                 CWARN("Can't accept conn from %s (version %x): max message size %d incompatible without FMR pool (%d wanted)\n",
2349                       libcfs_nid2str(nid), version, max_frags,
2350                       kiblnd_rdma_frags(version, ni));
2351
2352                 if (version == IBLND_MSG_VERSION)
2353                         rej.ibr_why = IBLND_REJECT_RDMA_FRAGS;
2354
2355                 goto failed;
2356         }
2357
2358         if (reqmsg->ibm_u.connparams.ibcp_max_msg_size > IBLND_MSG_SIZE) {
2359                 CERROR("Can't accept %s: message size %d too big (%d max)\n",
2360                        libcfs_nid2str(nid),
2361                        reqmsg->ibm_u.connparams.ibcp_max_msg_size,
2362                        IBLND_MSG_SIZE);
2363                 goto failed;
2364         }
2365
2366         /* assume 'nid' is a new peer; create  */
2367         rc = kiblnd_create_peer(ni, &peer, nid);
2368         if (rc) {
2369                 CERROR("Can't create peer for %s\n", libcfs_nid2str(nid));
2370                 rej.ibr_why = IBLND_REJECT_NO_RESOURCES;
2371                 goto failed;
2372         }
2373
2374         /* We have validated the peer's parameters so use those */
2375         peer->ibp_max_frags = max_frags;
2376         peer->ibp_queue_depth = reqmsg->ibm_u.connparams.ibcp_queue_depth;
2377
2378         write_lock_irqsave(g_lock, flags);
2379
2380         peer2 = kiblnd_find_peer_locked(nid);
2381         if (peer2) {
2382                 if (!peer2->ibp_version) {
2383                         peer2->ibp_version     = version;
2384                         peer2->ibp_incarnation = reqmsg->ibm_srcstamp;
2385                 }
2386
2387                 /* not the guy I've talked with */
2388                 if (peer2->ibp_incarnation != reqmsg->ibm_srcstamp ||
2389                     peer2->ibp_version     != version) {
2390                         kiblnd_close_peer_conns_locked(peer2, -ESTALE);
2391
2392                         if (kiblnd_peer_active(peer2)) {
2393                                 peer2->ibp_incarnation = reqmsg->ibm_srcstamp;
2394                                 peer2->ibp_version = version;
2395                         }
2396                         write_unlock_irqrestore(g_lock, flags);
2397
2398                         CWARN("Conn stale %s version %x/%x incarnation %llu/%llu\n",
2399                               libcfs_nid2str(nid), peer2->ibp_version, version,
2400                               peer2->ibp_incarnation, reqmsg->ibm_srcstamp);
2401
2402                         kiblnd_peer_decref(peer);
2403                         rej.ibr_why = IBLND_REJECT_CONN_STALE;
2404                         goto failed;
2405                 }
2406
2407                 /*
2408                  * Tie-break connection race in favour of the higher NID.
2409                  * If we keep running into a race condition multiple times,
2410                  * we have to assume that the connection attempt with the
2411                  * higher NID is stuck in a connecting state and will never
2412                  * recover.  As such, we pass through this if-block and let
2413                  * the lower NID connection win so we can move forward.
2414                  */
2415                 if (peer2->ibp_connecting &&
2416                     nid < ni->ni_nid && peer2->ibp_races <
2417                     MAX_CONN_RACES_BEFORE_ABORT) {
2418                         peer2->ibp_races++;
2419                         write_unlock_irqrestore(g_lock, flags);
2420
2421                         CDEBUG(D_NET, "Conn race %s\n",
2422                                libcfs_nid2str(peer2->ibp_nid));
2423
2424                         kiblnd_peer_decref(peer);
2425                         rej.ibr_why = IBLND_REJECT_CONN_RACE;
2426                         goto failed;
2427                 }
2428                 if (peer2->ibp_races >= MAX_CONN_RACES_BEFORE_ABORT)
2429                         CNETERR("Conn race %s: unresolved after %d attempts, letting lower NID win\n",
2430                                 libcfs_nid2str(peer2->ibp_nid),
2431                                 MAX_CONN_RACES_BEFORE_ABORT);
2432                 /**
2433                  * passive connection is allowed even this peer is waiting for
2434                  * reconnection.
2435                  */
2436                 peer2->ibp_reconnecting = 0;
2437                 peer2->ibp_races = 0;
2438                 peer2->ibp_accepting++;
2439                 kiblnd_peer_addref(peer2);
2440
2441                 /**
2442                  * Race with kiblnd_launch_tx (active connect) to create peer
2443                  * so copy validated parameters since we now know what the
2444                  * peer's limits are
2445                  */
2446                 peer2->ibp_max_frags = peer->ibp_max_frags;
2447                 peer2->ibp_queue_depth = peer->ibp_queue_depth;
2448
2449                 write_unlock_irqrestore(g_lock, flags);
2450                 kiblnd_peer_decref(peer);
2451                 peer = peer2;
2452         } else {
2453                 /* Brand new peer */
2454                 LASSERT(!peer->ibp_accepting);
2455                 LASSERT(!peer->ibp_version &&
2456                         !peer->ibp_incarnation);
2457
2458                 peer->ibp_accepting   = 1;
2459                 peer->ibp_version     = version;
2460                 peer->ibp_incarnation = reqmsg->ibm_srcstamp;
2461
2462                 /* I have a ref on ni that prevents it being shutdown */
2463                 LASSERT(!net->ibn_shutdown);
2464
2465                 kiblnd_peer_addref(peer);
2466                 list_add_tail(&peer->ibp_list, kiblnd_nid2peerlist(nid));
2467
2468                 write_unlock_irqrestore(g_lock, flags);
2469         }
2470
2471         conn = kiblnd_create_conn(peer, cmid, IBLND_CONN_PASSIVE_WAIT,
2472                                   version);
2473         if (!conn) {
2474                 kiblnd_peer_connect_failed(peer, 0, -ENOMEM);
2475                 kiblnd_peer_decref(peer);
2476                 rej.ibr_why = IBLND_REJECT_NO_RESOURCES;
2477                 goto failed;
2478         }
2479
2480         /*
2481          * conn now "owns" cmid, so I return success from here on to ensure the
2482          * CM callback doesn't destroy cmid.
2483          */
2484         conn->ibc_incarnation      = reqmsg->ibm_srcstamp;
2485         conn->ibc_credits          = conn->ibc_queue_depth;
2486         conn->ibc_reserved_credits = conn->ibc_queue_depth;
2487         LASSERT(conn->ibc_credits + conn->ibc_reserved_credits +
2488                 IBLND_OOB_MSGS(version) <= IBLND_RX_MSGS(conn));
2489
2490         ackmsg = &conn->ibc_connvars->cv_msg;
2491         memset(ackmsg, 0, sizeof(*ackmsg));
2492
2493         kiblnd_init_msg(ackmsg, IBLND_MSG_CONNACK,
2494                         sizeof(ackmsg->ibm_u.connparams));
2495         ackmsg->ibm_u.connparams.ibcp_queue_depth = conn->ibc_queue_depth;
2496         ackmsg->ibm_u.connparams.ibcp_max_frags = conn->ibc_max_frags << IBLND_FRAG_SHIFT;
2497         ackmsg->ibm_u.connparams.ibcp_max_msg_size = IBLND_MSG_SIZE;
2498
2499         kiblnd_pack_msg(ni, ackmsg, version, 0, nid, reqmsg->ibm_srcstamp);
2500
2501         memset(&cp, 0, sizeof(cp));
2502         cp.private_data = ackmsg;
2503         cp.private_data_len = ackmsg->ibm_nob;
2504         cp.responder_resources = 0;          /* No atomic ops or RDMA reads */
2505         cp.initiator_depth = 0;
2506         cp.flow_control = 1;
2507         cp.retry_count = *kiblnd_tunables.kib_retry_count;
2508         cp.rnr_retry_count = *kiblnd_tunables.kib_rnr_retry_count;
2509
2510         CDEBUG(D_NET, "Accept %s\n", libcfs_nid2str(nid));
2511
2512         rc = rdma_accept(cmid, &cp);
2513         if (rc) {
2514                 CERROR("Can't accept %s: %d\n", libcfs_nid2str(nid), rc);
2515                 rej.ibr_version = version;
2516                 rej.ibr_why     = IBLND_REJECT_FATAL;
2517
2518                 kiblnd_reject(cmid, &rej);
2519                 kiblnd_connreq_done(conn, rc);
2520                 kiblnd_conn_decref(conn);
2521         }
2522
2523         lnet_ni_decref(ni);
2524         return 0;
2525
2526  failed:
2527         if (ni) {
2528                 rej.ibr_cp.ibcp_queue_depth = kiblnd_msg_queue_size(version, ni);
2529                 rej.ibr_cp.ibcp_max_frags = kiblnd_rdma_frags(version, ni);
2530                 lnet_ni_decref(ni);
2531         }
2532
2533         rej.ibr_version             = version;
2534         kiblnd_reject(cmid, &rej);
2535
2536         return -ECONNREFUSED;
2537 }
2538
2539 static void
2540 kiblnd_check_reconnect(struct kib_conn *conn, int version,
2541                        __u64 incarnation, int why, struct kib_connparams *cp)
2542 {
2543         rwlock_t *glock = &kiblnd_data.kib_global_lock;
2544         struct kib_peer *peer = conn->ibc_peer;
2545         char *reason;
2546         int msg_size = IBLND_MSG_SIZE;
2547         int frag_num = -1;
2548         int queue_dep = -1;
2549         bool reconnect;
2550         unsigned long flags;
2551
2552         LASSERT(conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT);
2553         LASSERT(peer->ibp_connecting > 0);     /* 'conn' at least */
2554         LASSERT(!peer->ibp_reconnecting);
2555
2556         if (cp) {
2557                 msg_size = cp->ibcp_max_msg_size;
2558                 frag_num        = cp->ibcp_max_frags << IBLND_FRAG_SHIFT;
2559                 queue_dep = cp->ibcp_queue_depth;
2560         }
2561
2562         write_lock_irqsave(glock, flags);
2563         /**
2564          * retry connection if it's still needed and no other connection
2565          * attempts (active or passive) are in progress
2566          * NB: reconnect is still needed even when ibp_tx_queue is
2567          * empty if ibp_version != version because reconnect may be
2568          * initiated by kiblnd_query()
2569          */
2570         reconnect = (!list_empty(&peer->ibp_tx_queue) ||
2571                      peer->ibp_version != version) &&
2572                     peer->ibp_connecting == 1 &&
2573                     !peer->ibp_accepting;
2574         if (!reconnect) {
2575                 reason = "no need";
2576                 goto out;
2577         }
2578
2579         switch (why) {
2580         default:
2581                 reason = "Unknown";
2582                 break;
2583
2584         case IBLND_REJECT_RDMA_FRAGS: {
2585                 struct lnet_ioctl_config_lnd_tunables *tunables;
2586
2587                 if (!cp) {
2588                         reason = "can't negotiate max frags";
2589                         goto out;
2590                 }
2591                 tunables = peer->ibp_ni->ni_lnd_tunables;
2592                 if (!tunables->lt_tun_u.lt_o2ib.lnd_map_on_demand) {
2593                         reason = "map_on_demand must be enabled";
2594                         goto out;
2595                 }
2596                 if (conn->ibc_max_frags <= frag_num) {
2597                         reason = "unsupported max frags";
2598                         goto out;
2599                 }
2600
2601                 peer->ibp_max_frags = frag_num;
2602                 reason = "rdma fragments";
2603                 break;
2604         }
2605         case IBLND_REJECT_MSG_QUEUE_SIZE:
2606                 if (!cp) {
2607                         reason = "can't negotiate queue depth";
2608                         goto out;
2609                 }
2610                 if (conn->ibc_queue_depth <= queue_dep) {
2611                         reason = "unsupported queue depth";
2612                         goto out;
2613                 }
2614
2615                 peer->ibp_queue_depth = queue_dep;
2616                 reason = "queue depth";
2617                 break;
2618
2619         case IBLND_REJECT_CONN_STALE:
2620                 reason = "stale";
2621                 break;
2622
2623         case IBLND_REJECT_CONN_RACE:
2624                 reason = "conn race";
2625                 break;
2626
2627         case IBLND_REJECT_CONN_UNCOMPAT:
2628                 reason = "version negotiation";
2629                 break;
2630         }
2631
2632         conn->ibc_reconnect = 1;
2633         peer->ibp_reconnecting = 1;
2634         peer->ibp_version = version;
2635         if (incarnation)
2636                 peer->ibp_incarnation = incarnation;
2637 out:
2638         write_unlock_irqrestore(glock, flags);
2639
2640         CNETERR("%s: %s (%s), %x, %x, msg_size: %d, queue_depth: %d/%d, max_frags: %d/%d\n",
2641                 libcfs_nid2str(peer->ibp_nid),
2642                 reconnect ? "reconnect" : "don't reconnect",
2643                 reason, IBLND_MSG_VERSION, version, msg_size,
2644                 conn->ibc_queue_depth, queue_dep,
2645                 conn->ibc_max_frags, frag_num);
2646         /**
2647          * if conn::ibc_reconnect is TRUE, connd will reconnect to the peer
2648          * while destroying the zombie
2649          */
2650 }
2651
2652 static void
2653 kiblnd_rejected(struct kib_conn *conn, int reason, void *priv, int priv_nob)
2654 {
2655         struct kib_peer *peer = conn->ibc_peer;
2656
2657         LASSERT(!in_interrupt());
2658         LASSERT(conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT);
2659
2660         switch (reason) {
2661         case IB_CM_REJ_STALE_CONN:
2662                 kiblnd_check_reconnect(conn, IBLND_MSG_VERSION, 0,
2663                                        IBLND_REJECT_CONN_STALE, NULL);
2664                 break;
2665
2666         case IB_CM_REJ_INVALID_SERVICE_ID:
2667                 CNETERR("%s rejected: no listener at %d\n",
2668                         libcfs_nid2str(peer->ibp_nid),
2669                         *kiblnd_tunables.kib_service);
2670                 break;
2671
2672         case IB_CM_REJ_CONSUMER_DEFINED:
2673                 if (priv_nob >= offsetof(struct kib_rej, ibr_padding)) {
2674                         struct kib_rej *rej = priv;
2675                         struct kib_connparams *cp = NULL;
2676                         int flip = 0;
2677                         __u64 incarnation = -1;
2678
2679                         /* NB. default incarnation is -1 because:
2680                          * a) V1 will ignore dst incarnation in connreq.
2681                          * b) V2 will provide incarnation while rejecting me,
2682                          *    -1 will be overwrote.
2683                          *
2684                          * if I try to connect to a V1 peer with V2 protocol,
2685                          * it rejected me then upgrade to V2, I have no idea
2686                          * about the upgrading and try to reconnect with V1,
2687                          * in this case upgraded V2 can find out I'm trying to
2688                          * talk to the old guy and reject me(incarnation is -1).
2689                          */
2690
2691                         if (rej->ibr_magic == __swab32(IBLND_MSG_MAGIC) ||
2692                             rej->ibr_magic == __swab32(LNET_PROTO_MAGIC)) {
2693                                 __swab32s(&rej->ibr_magic);
2694                                 __swab16s(&rej->ibr_version);
2695                                 flip = 1;
2696                         }
2697
2698                         if (priv_nob >= sizeof(struct kib_rej) &&
2699                             rej->ibr_version > IBLND_MSG_VERSION_1) {
2700                                 /*
2701                                  * priv_nob is always 148 in current version
2702                                  * of OFED, so we still need to check version.
2703                                  * (define of IB_CM_REJ_PRIVATE_DATA_SIZE)
2704                                  */
2705                                 cp = &rej->ibr_cp;
2706
2707                                 if (flip) {
2708                                         __swab64s(&rej->ibr_incarnation);
2709                                         __swab16s(&cp->ibcp_queue_depth);
2710                                         __swab16s(&cp->ibcp_max_frags);
2711                                         __swab32s(&cp->ibcp_max_msg_size);
2712                                 }
2713
2714                                 incarnation = rej->ibr_incarnation;
2715                         }
2716
2717                         if (rej->ibr_magic != IBLND_MSG_MAGIC &&
2718                             rej->ibr_magic != LNET_PROTO_MAGIC) {
2719                                 CERROR("%s rejected: consumer defined fatal error\n",
2720                                        libcfs_nid2str(peer->ibp_nid));
2721                                 break;
2722                         }
2723
2724                         if (rej->ibr_version != IBLND_MSG_VERSION &&
2725                             rej->ibr_version != IBLND_MSG_VERSION_1) {
2726                                 CERROR("%s rejected: o2iblnd version %x error\n",
2727                                        libcfs_nid2str(peer->ibp_nid),
2728                                        rej->ibr_version);
2729                                 break;
2730                         }
2731
2732                         if (rej->ibr_why     == IBLND_REJECT_FATAL &&
2733                             rej->ibr_version == IBLND_MSG_VERSION_1) {
2734                                 CDEBUG(D_NET, "rejected by old version peer %s: %x\n",
2735                                        libcfs_nid2str(peer->ibp_nid), rej->ibr_version);
2736
2737                                 if (conn->ibc_version != IBLND_MSG_VERSION_1)
2738                                         rej->ibr_why = IBLND_REJECT_CONN_UNCOMPAT;
2739                         }
2740
2741                         switch (rej->ibr_why) {
2742                         case IBLND_REJECT_CONN_RACE:
2743                         case IBLND_REJECT_CONN_STALE:
2744                         case IBLND_REJECT_CONN_UNCOMPAT:
2745                         case IBLND_REJECT_MSG_QUEUE_SIZE:
2746                         case IBLND_REJECT_RDMA_FRAGS:
2747                                 kiblnd_check_reconnect(conn, rej->ibr_version,
2748                                                        incarnation,
2749                                                        rej->ibr_why, cp);
2750                                 break;
2751
2752                         case IBLND_REJECT_NO_RESOURCES:
2753                                 CERROR("%s rejected: o2iblnd no resources\n",
2754                                        libcfs_nid2str(peer->ibp_nid));
2755                                 break;
2756
2757                         case IBLND_REJECT_FATAL:
2758                                 CERROR("%s rejected: o2iblnd fatal error\n",
2759                                        libcfs_nid2str(peer->ibp_nid));
2760                                 break;
2761
2762                         default:
2763                                 CERROR("%s rejected: o2iblnd reason %d\n",
2764                                        libcfs_nid2str(peer->ibp_nid),
2765                                        rej->ibr_why);
2766                                 break;
2767                         }
2768                         break;
2769                 }
2770                 /* fall through */
2771         default:
2772                 CNETERR("%s rejected: reason %d, size %d\n",
2773                         libcfs_nid2str(peer->ibp_nid), reason, priv_nob);
2774                 break;
2775         }
2776
2777         kiblnd_connreq_done(conn, -ECONNREFUSED);
2778 }
2779
2780 static void
2781 kiblnd_check_connreply(struct kib_conn *conn, void *priv, int priv_nob)
2782 {
2783         struct kib_peer *peer = conn->ibc_peer;
2784         struct lnet_ni *ni = peer->ibp_ni;
2785         struct kib_net *net = ni->ni_data;
2786         struct kib_msg *msg = priv;
2787         int ver = conn->ibc_version;
2788         int rc = kiblnd_unpack_msg(msg, priv_nob);
2789         unsigned long flags;
2790
2791         LASSERT(net);
2792
2793         if (rc) {
2794                 CERROR("Can't unpack connack from %s: %d\n",
2795                        libcfs_nid2str(peer->ibp_nid), rc);
2796                 goto failed;
2797         }
2798
2799         if (msg->ibm_type != IBLND_MSG_CONNACK) {
2800                 CERROR("Unexpected message %d from %s\n",
2801                        msg->ibm_type, libcfs_nid2str(peer->ibp_nid));
2802                 rc = -EPROTO;
2803                 goto failed;
2804         }
2805
2806         if (ver != msg->ibm_version) {
2807                 CERROR("%s replied version %x is different with requested version %x\n",
2808                        libcfs_nid2str(peer->ibp_nid), msg->ibm_version, ver);
2809                 rc = -EPROTO;
2810                 goto failed;
2811         }
2812
2813         if (msg->ibm_u.connparams.ibcp_queue_depth >
2814             conn->ibc_queue_depth) {
2815                 CERROR("%s has incompatible queue depth %d (<=%d wanted)\n",
2816                        libcfs_nid2str(peer->ibp_nid),
2817                        msg->ibm_u.connparams.ibcp_queue_depth,
2818                        conn->ibc_queue_depth);
2819                 rc = -EPROTO;
2820                 goto failed;
2821         }
2822
2823         if ((msg->ibm_u.connparams.ibcp_max_frags >> IBLND_FRAG_SHIFT) >
2824             conn->ibc_max_frags) {
2825                 CERROR("%s has incompatible max_frags %d (<=%d wanted)\n",
2826                        libcfs_nid2str(peer->ibp_nid),
2827                        msg->ibm_u.connparams.ibcp_max_frags >> IBLND_FRAG_SHIFT,
2828                        conn->ibc_max_frags);
2829                 rc = -EPROTO;
2830                 goto failed;
2831         }
2832
2833         if (msg->ibm_u.connparams.ibcp_max_msg_size > IBLND_MSG_SIZE) {
2834                 CERROR("%s max message size %d too big (%d max)\n",
2835                        libcfs_nid2str(peer->ibp_nid),
2836                        msg->ibm_u.connparams.ibcp_max_msg_size,
2837                        IBLND_MSG_SIZE);
2838                 rc = -EPROTO;
2839                 goto failed;
2840         }
2841
2842         read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
2843         if (msg->ibm_dstnid == ni->ni_nid &&
2844             msg->ibm_dststamp == net->ibn_incarnation)
2845                 rc = 0;
2846         else
2847                 rc = -ESTALE;
2848         read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
2849
2850         if (rc) {
2851                 CERROR("Bad connection reply from %s, rc = %d, version: %x max_frags: %d\n",
2852                        libcfs_nid2str(peer->ibp_nid), rc,
2853                        msg->ibm_version, msg->ibm_u.connparams.ibcp_max_frags);
2854                 goto failed;
2855         }
2856
2857         conn->ibc_incarnation = msg->ibm_srcstamp;
2858         conn->ibc_credits = msg->ibm_u.connparams.ibcp_queue_depth;
2859         conn->ibc_reserved_credits = msg->ibm_u.connparams.ibcp_queue_depth;
2860         conn->ibc_queue_depth = msg->ibm_u.connparams.ibcp_queue_depth;
2861         conn->ibc_max_frags = msg->ibm_u.connparams.ibcp_max_frags >> IBLND_FRAG_SHIFT;
2862         LASSERT(conn->ibc_credits + conn->ibc_reserved_credits +
2863                 IBLND_OOB_MSGS(ver) <= IBLND_RX_MSGS(conn));
2864
2865         kiblnd_connreq_done(conn, 0);
2866         return;
2867
2868  failed:
2869         /*
2870          * NB My QP has already established itself, so I handle anything going
2871          * wrong here by setting ibc_comms_error.
2872          * kiblnd_connreq_done(0) moves the conn state to ESTABLISHED, but then
2873          * immediately tears it down.
2874          */
2875         LASSERT(rc);
2876         conn->ibc_comms_error = rc;
2877         kiblnd_connreq_done(conn, 0);
2878 }
2879
2880 static int
2881 kiblnd_active_connect(struct rdma_cm_id *cmid)
2882 {
2883         struct kib_peer *peer = (struct kib_peer *)cmid->context;
2884         struct kib_conn *conn;
2885         struct kib_msg *msg;
2886         struct rdma_conn_param cp;
2887         int version;
2888         __u64 incarnation;
2889         unsigned long flags;
2890         int rc;
2891
2892         read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
2893
2894         incarnation = peer->ibp_incarnation;
2895         version = !peer->ibp_version ? IBLND_MSG_VERSION :
2896                                        peer->ibp_version;
2897
2898         read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
2899
2900         conn = kiblnd_create_conn(peer, cmid, IBLND_CONN_ACTIVE_CONNECT,
2901                                   version);
2902         if (!conn) {
2903                 kiblnd_peer_connect_failed(peer, 1, -ENOMEM);
2904                 kiblnd_peer_decref(peer); /* lose cmid's ref */
2905                 return -ENOMEM;
2906         }
2907
2908         /*
2909          * conn "owns" cmid now, so I return success from here on to ensure the
2910          * CM callback doesn't destroy cmid. conn also takes over cmid's ref
2911          * on peer
2912          */
2913         msg = &conn->ibc_connvars->cv_msg;
2914
2915         memset(msg, 0, sizeof(*msg));
2916         kiblnd_init_msg(msg, IBLND_MSG_CONNREQ, sizeof(msg->ibm_u.connparams));
2917         msg->ibm_u.connparams.ibcp_queue_depth = conn->ibc_queue_depth;
2918         msg->ibm_u.connparams.ibcp_max_frags = conn->ibc_max_frags << IBLND_FRAG_SHIFT;
2919         msg->ibm_u.connparams.ibcp_max_msg_size = IBLND_MSG_SIZE;
2920
2921         kiblnd_pack_msg(peer->ibp_ni, msg, version,
2922                         0, peer->ibp_nid, incarnation);
2923
2924         memset(&cp, 0, sizeof(cp));
2925         cp.private_data = msg;
2926         cp.private_data_len    = msg->ibm_nob;
2927         cp.responder_resources = 0;          /* No atomic ops or RDMA reads */
2928         cp.initiator_depth     = 0;
2929         cp.flow_control        = 1;
2930         cp.retry_count         = *kiblnd_tunables.kib_retry_count;
2931         cp.rnr_retry_count     = *kiblnd_tunables.kib_rnr_retry_count;
2932
2933         LASSERT(cmid->context == (void *)conn);
2934         LASSERT(conn->ibc_cmid == cmid);
2935
2936         rc = rdma_connect(cmid, &cp);
2937         if (rc) {
2938                 CERROR("Can't connect to %s: %d\n",
2939                        libcfs_nid2str(peer->ibp_nid), rc);
2940                 kiblnd_connreq_done(conn, rc);
2941                 kiblnd_conn_decref(conn);
2942         }
2943
2944         return 0;
2945 }
2946
2947 int
2948 kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event)
2949 {
2950         struct kib_peer *peer;
2951         struct kib_conn *conn;
2952         int rc;
2953
2954         switch (event->event) {
2955         default:
2956                 CERROR("Unexpected event: %d, status: %d\n",
2957                        event->event, event->status);
2958                 LBUG();
2959
2960         case RDMA_CM_EVENT_CONNECT_REQUEST:
2961                 /* destroy cmid on failure */
2962                 rc = kiblnd_passive_connect(cmid,
2963                                             (void *)KIBLND_CONN_PARAM(event),
2964                                             KIBLND_CONN_PARAM_LEN(event));
2965                 CDEBUG(D_NET, "connreq: %d\n", rc);
2966                 return rc;
2967
2968         case RDMA_CM_EVENT_ADDR_ERROR:
2969                 peer = (struct kib_peer *)cmid->context;
2970                 CNETERR("%s: ADDR ERROR %d\n",
2971                         libcfs_nid2str(peer->ibp_nid), event->status);
2972                 kiblnd_peer_connect_failed(peer, 1, -EHOSTUNREACH);
2973                 kiblnd_peer_decref(peer);
2974                 return -EHOSTUNREACH;      /* rc destroys cmid */
2975
2976         case RDMA_CM_EVENT_ADDR_RESOLVED:
2977                 peer = (struct kib_peer *)cmid->context;
2978
2979                 CDEBUG(D_NET, "%s Addr resolved: %d\n",
2980                        libcfs_nid2str(peer->ibp_nid), event->status);
2981
2982                 if (event->status) {
2983                         CNETERR("Can't resolve address for %s: %d\n",
2984                                 libcfs_nid2str(peer->ibp_nid), event->status);
2985                         rc = event->status;
2986                 } else {
2987                         rc = rdma_resolve_route(
2988                                 cmid, *kiblnd_tunables.kib_timeout * 1000);
2989                         if (!rc) {
2990                                 struct kib_net *net = peer->ibp_ni->ni_data;
2991                                 struct kib_dev *dev = net->ibn_dev;
2992
2993                                 CDEBUG(D_NET, "%s: connection bound to "\
2994                                        "%s:%pI4h:%s\n",
2995                                        libcfs_nid2str(peer->ibp_nid),
2996                                        dev->ibd_ifname,
2997                                        &dev->ibd_ifip, cmid->device->name);
2998
2999                                 return 0;
3000                         }
3001
3002                         /* Can't initiate route resolution */
3003                         CERROR("Can't resolve route for %s: %d\n",
3004                                libcfs_nid2str(peer->ibp_nid), rc);
3005                 }
3006                 kiblnd_peer_connect_failed(peer, 1, rc);
3007                 kiblnd_peer_decref(peer);
3008                 return rc;                    /* rc destroys cmid */
3009
3010         case RDMA_CM_EVENT_ROUTE_ERROR:
3011                 peer = (struct kib_peer *)cmid->context;
3012                 CNETERR("%s: ROUTE ERROR %d\n",
3013                         libcfs_nid2str(peer->ibp_nid), event->status);
3014                 kiblnd_peer_connect_failed(peer, 1, -EHOSTUNREACH);
3015                 kiblnd_peer_decref(peer);
3016                 return -EHOSTUNREACH;      /* rc destroys cmid */
3017
3018         case RDMA_CM_EVENT_ROUTE_RESOLVED:
3019                 peer = (struct kib_peer *)cmid->context;
3020                 CDEBUG(D_NET, "%s Route resolved: %d\n",
3021                        libcfs_nid2str(peer->ibp_nid), event->status);
3022
3023                 if (!event->status)
3024                         return kiblnd_active_connect(cmid);
3025
3026                 CNETERR("Can't resolve route for %s: %d\n",
3027                         libcfs_nid2str(peer->ibp_nid), event->status);
3028                 kiblnd_peer_connect_failed(peer, 1, event->status);
3029                 kiblnd_peer_decref(peer);
3030                 return event->status;      /* rc destroys cmid */
3031
3032         case RDMA_CM_EVENT_UNREACHABLE:
3033                 conn = (struct kib_conn *)cmid->context;
3034                 LASSERT(conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT ||
3035                         conn->ibc_state == IBLND_CONN_PASSIVE_WAIT);
3036                 CNETERR("%s: UNREACHABLE %d\n",
3037                         libcfs_nid2str(conn->ibc_peer->ibp_nid), event->status);
3038                 kiblnd_connreq_done(conn, -ENETDOWN);
3039                 kiblnd_conn_decref(conn);
3040                 return 0;
3041
3042         case RDMA_CM_EVENT_CONNECT_ERROR:
3043                 conn = (struct kib_conn *)cmid->context;
3044                 LASSERT(conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT ||
3045                         conn->ibc_state == IBLND_CONN_PASSIVE_WAIT);
3046                 CNETERR("%s: CONNECT ERROR %d\n",
3047                         libcfs_nid2str(conn->ibc_peer->ibp_nid), event->status);
3048                 kiblnd_connreq_done(conn, -ENOTCONN);
3049                 kiblnd_conn_decref(conn);
3050                 return 0;
3051
3052         case RDMA_CM_EVENT_REJECTED:
3053                 conn = (struct kib_conn *)cmid->context;
3054                 switch (conn->ibc_state) {
3055                 default:
3056                         LBUG();
3057
3058                 case IBLND_CONN_PASSIVE_WAIT:
3059                         CERROR("%s: REJECTED %d\n",
3060                                libcfs_nid2str(conn->ibc_peer->ibp_nid),
3061                                event->status);
3062                         kiblnd_connreq_done(conn, -ECONNRESET);
3063                         break;
3064
3065                 case IBLND_CONN_ACTIVE_CONNECT:
3066                         kiblnd_rejected(conn, event->status,
3067                                         (void *)KIBLND_CONN_PARAM(event),
3068                                         KIBLND_CONN_PARAM_LEN(event));
3069                         break;
3070                 }
3071                 kiblnd_conn_decref(conn);
3072                 return 0;
3073
3074         case RDMA_CM_EVENT_ESTABLISHED:
3075                 conn = (struct kib_conn *)cmid->context;
3076                 switch (conn->ibc_state) {
3077                 default:
3078                         LBUG();
3079
3080                 case IBLND_CONN_PASSIVE_WAIT:
3081                         CDEBUG(D_NET, "ESTABLISHED (passive): %s\n",
3082                                libcfs_nid2str(conn->ibc_peer->ibp_nid));
3083                         kiblnd_connreq_done(conn, 0);
3084                         break;
3085
3086                 case IBLND_CONN_ACTIVE_CONNECT:
3087                         CDEBUG(D_NET, "ESTABLISHED(active): %s\n",
3088                                libcfs_nid2str(conn->ibc_peer->ibp_nid));
3089                         kiblnd_check_connreply(conn,
3090                                                (void *)KIBLND_CONN_PARAM(event),
3091                                                KIBLND_CONN_PARAM_LEN(event));
3092                         break;
3093                 }
3094                 /* net keeps its ref on conn! */
3095                 return 0;
3096
3097         case RDMA_CM_EVENT_TIMEWAIT_EXIT:
3098                 CDEBUG(D_NET, "Ignore TIMEWAIT_EXIT event\n");
3099                 return 0;
3100         case RDMA_CM_EVENT_DISCONNECTED:
3101                 conn = (struct kib_conn *)cmid->context;
3102                 if (conn->ibc_state < IBLND_CONN_ESTABLISHED) {
3103                         CERROR("%s DISCONNECTED\n",
3104                                libcfs_nid2str(conn->ibc_peer->ibp_nid));
3105                         kiblnd_connreq_done(conn, -ECONNRESET);
3106                 } else {
3107                         kiblnd_close_conn(conn, 0);
3108                 }
3109                 kiblnd_conn_decref(conn);
3110                 cmid->context = NULL;
3111                 return 0;
3112
3113         case RDMA_CM_EVENT_DEVICE_REMOVAL:
3114                 LCONSOLE_ERROR_MSG(0x131,
3115                                    "Received notification of device removal\n"
3116                                    "Please shutdown LNET to allow this to proceed\n");
3117                 /*
3118                  * Can't remove network from underneath LNET for now, so I have
3119                  * to ignore this
3120                  */
3121                 return 0;
3122
3123         case RDMA_CM_EVENT_ADDR_CHANGE:
3124                 LCONSOLE_INFO("Physical link changed (eg hca/port)\n");
3125                 return 0;
3126         }
3127 }
3128
3129 static int
3130 kiblnd_check_txs_locked(struct kib_conn *conn, struct list_head *txs)
3131 {
3132         struct kib_tx *tx;
3133         struct list_head *ttmp;
3134
3135         list_for_each(ttmp, txs) {
3136                 tx = list_entry(ttmp, struct kib_tx, tx_list);
3137
3138                 if (txs != &conn->ibc_active_txs) {
3139                         LASSERT(tx->tx_queued);
3140                 } else {
3141                         LASSERT(!tx->tx_queued);
3142                         LASSERT(tx->tx_waiting || tx->tx_sending);
3143                 }
3144
3145                 if (cfs_time_aftereq(jiffies, tx->tx_deadline)) {
3146                         CERROR("Timed out tx: %s, %lu seconds\n",
3147                                kiblnd_queue2str(conn, txs),
3148                                cfs_duration_sec(jiffies - tx->tx_deadline));
3149                         return 1;
3150                 }
3151         }
3152
3153         return 0;
3154 }
3155
3156 static int
3157 kiblnd_conn_timed_out_locked(struct kib_conn *conn)
3158 {
3159         return  kiblnd_check_txs_locked(conn, &conn->ibc_tx_queue) ||
3160                 kiblnd_check_txs_locked(conn, &conn->ibc_tx_noops) ||
3161                 kiblnd_check_txs_locked(conn, &conn->ibc_tx_queue_rsrvd) ||
3162                 kiblnd_check_txs_locked(conn, &conn->ibc_tx_queue_nocred) ||
3163                 kiblnd_check_txs_locked(conn, &conn->ibc_active_txs);
3164 }
3165
3166 static void
3167 kiblnd_check_conns(int idx)
3168 {
3169         LIST_HEAD(closes);
3170         LIST_HEAD(checksends);
3171         struct list_head *peers = &kiblnd_data.kib_peers[idx];
3172         struct list_head *ptmp;
3173         struct kib_peer *peer;
3174         struct kib_conn *conn;
3175         struct kib_conn *temp;
3176         struct kib_conn *tmp;
3177         struct list_head *ctmp;
3178         unsigned long flags;
3179
3180         /*
3181          * NB. We expect to have a look at all the peers and not find any
3182          * RDMAs to time out, so we just use a shared lock while we
3183          * take a look...
3184          */
3185         read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
3186
3187         list_for_each(ptmp, peers) {
3188                 peer = list_entry(ptmp, struct kib_peer, ibp_list);
3189
3190                 list_for_each(ctmp, &peer->ibp_conns) {
3191                         int timedout;
3192                         int sendnoop;
3193
3194                         conn = list_entry(ctmp, struct kib_conn, ibc_list);
3195
3196                         LASSERT(conn->ibc_state == IBLND_CONN_ESTABLISHED);
3197
3198                         spin_lock(&conn->ibc_lock);
3199
3200                         sendnoop = kiblnd_need_noop(conn);
3201                         timedout = kiblnd_conn_timed_out_locked(conn);
3202                         if (!sendnoop && !timedout) {
3203                                 spin_unlock(&conn->ibc_lock);
3204                                 continue;
3205                         }
3206
3207                         if (timedout) {
3208                                 CERROR("Timed out RDMA with %s (%lu): c: %u, oc: %u, rc: %u\n",
3209                                        libcfs_nid2str(peer->ibp_nid),
3210                                        cfs_duration_sec(cfs_time_current() -
3211                                                         peer->ibp_last_alive),
3212                                        conn->ibc_credits,
3213                                        conn->ibc_outstanding_credits,
3214                                        conn->ibc_reserved_credits);
3215                                 list_add(&conn->ibc_connd_list, &closes);
3216                         } else {
3217                                 list_add(&conn->ibc_connd_list, &checksends);
3218                         }
3219                         /* +ref for 'closes' or 'checksends' */
3220                         kiblnd_conn_addref(conn);
3221
3222                         spin_unlock(&conn->ibc_lock);
3223                 }
3224         }
3225
3226         read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
3227
3228         /*
3229          * Handle timeout by closing the whole
3230          * connection. We can only be sure RDMA activity
3231          * has ceased once the QP has been modified.
3232          */
3233         list_for_each_entry_safe(conn, tmp, &closes, ibc_connd_list) {
3234                 list_del(&conn->ibc_connd_list);
3235                 kiblnd_close_conn(conn, -ETIMEDOUT);
3236                 kiblnd_conn_decref(conn);
3237         }
3238
3239         /*
3240          * In case we have enough credits to return via a
3241          * NOOP, but there were no non-blocking tx descs
3242          * free to do it last time...
3243          */
3244         list_for_each_entry_safe(conn, temp, &checksends, ibc_connd_list) {
3245                 list_del(&conn->ibc_connd_list);
3246
3247                 spin_lock(&conn->ibc_lock);
3248                 kiblnd_check_sends_locked(conn);
3249                 spin_unlock(&conn->ibc_lock);
3250
3251                 kiblnd_conn_decref(conn);
3252         }
3253 }
3254
3255 static void
3256 kiblnd_disconnect_conn(struct kib_conn *conn)
3257 {
3258         LASSERT(!in_interrupt());
3259         LASSERT(current == kiblnd_data.kib_connd);
3260         LASSERT(conn->ibc_state == IBLND_CONN_CLOSING);
3261
3262         rdma_disconnect(conn->ibc_cmid);
3263         kiblnd_finalise_conn(conn);
3264
3265         kiblnd_peer_notify(conn->ibc_peer);
3266 }
3267
3268 /**
3269  * High-water for reconnection to the same peer, reconnection attempt should
3270  * be delayed after trying more than KIB_RECONN_HIGH_RACE.
3271  */
3272 #define KIB_RECONN_HIGH_RACE    10
3273 /**
3274  * Allow connd to take a break and handle other things after consecutive
3275  * reconnection attempts.
3276  */
3277 #define KIB_RECONN_BREAK        100
3278
3279 int
3280 kiblnd_connd(void *arg)
3281 {
3282         spinlock_t *lock = &kiblnd_data.kib_connd_lock;
3283         wait_queue_entry_t wait;
3284         unsigned long flags;
3285         struct kib_conn *conn;
3286         int timeout;
3287         int i;
3288         int dropped_lock;
3289         int peer_index = 0;
3290         unsigned long deadline = jiffies;
3291
3292         cfs_block_allsigs();
3293
3294         init_waitqueue_entry(&wait, current);
3295         kiblnd_data.kib_connd = current;
3296
3297         spin_lock_irqsave(lock, flags);
3298
3299         while (!kiblnd_data.kib_shutdown) {
3300                 int reconn = 0;
3301
3302                 dropped_lock = 0;
3303
3304                 if (!list_empty(&kiblnd_data.kib_connd_zombies)) {
3305                         struct kib_peer *peer = NULL;
3306
3307                         conn = list_entry(kiblnd_data.kib_connd_zombies.next,
3308                                           struct kib_conn, ibc_list);
3309                         list_del(&conn->ibc_list);
3310                         if (conn->ibc_reconnect) {
3311                                 peer = conn->ibc_peer;
3312                                 kiblnd_peer_addref(peer);
3313                         }
3314
3315                         spin_unlock_irqrestore(lock, flags);
3316                         dropped_lock = 1;
3317
3318                         kiblnd_destroy_conn(conn);
3319
3320                         spin_lock_irqsave(lock, flags);
3321                         if (!peer) {
3322                                 kfree(conn);
3323                                 continue;
3324                         }
3325
3326                         conn->ibc_peer = peer;
3327                         if (peer->ibp_reconnected < KIB_RECONN_HIGH_RACE)
3328                                 list_add_tail(&conn->ibc_list,
3329                                               &kiblnd_data.kib_reconn_list);
3330                         else
3331                                 list_add_tail(&conn->ibc_list,
3332                                               &kiblnd_data.kib_reconn_wait);
3333                 }
3334
3335                 if (!list_empty(&kiblnd_data.kib_connd_conns)) {
3336                         conn = list_entry(kiblnd_data.kib_connd_conns.next,
3337                                           struct kib_conn, ibc_list);
3338                         list_del(&conn->ibc_list);
3339
3340                         spin_unlock_irqrestore(lock, flags);
3341                         dropped_lock = 1;
3342
3343                         kiblnd_disconnect_conn(conn);
3344                         kiblnd_conn_decref(conn);
3345
3346                         spin_lock_irqsave(lock, flags);
3347                 }
3348
3349                 while (reconn < KIB_RECONN_BREAK) {
3350                         if (kiblnd_data.kib_reconn_sec !=
3351                             ktime_get_real_seconds()) {
3352                                 kiblnd_data.kib_reconn_sec = ktime_get_real_seconds();
3353                                 list_splice_init(&kiblnd_data.kib_reconn_wait,
3354                                                  &kiblnd_data.kib_reconn_list);
3355                         }
3356
3357                         if (list_empty(&kiblnd_data.kib_reconn_list))
3358                                 break;
3359
3360                         conn = list_entry(kiblnd_data.kib_reconn_list.next,
3361                                           struct kib_conn, ibc_list);
3362                         list_del(&conn->ibc_list);
3363
3364                         spin_unlock_irqrestore(lock, flags);
3365                         dropped_lock = 1;
3366
3367                         reconn += kiblnd_reconnect_peer(conn->ibc_peer);
3368                         kiblnd_peer_decref(conn->ibc_peer);
3369                         LIBCFS_FREE(conn, sizeof(*conn));
3370
3371                         spin_lock_irqsave(lock, flags);
3372                 }
3373
3374                 /* careful with the jiffy wrap... */
3375                 timeout = (int)(deadline - jiffies);
3376                 if (timeout <= 0) {
3377                         const int n = 4;
3378                         const int p = 1;
3379                         int chunk = kiblnd_data.kib_peer_hash_size;
3380
3381                         spin_unlock_irqrestore(lock, flags);
3382                         dropped_lock = 1;
3383
3384                         /*
3385                          * Time to check for RDMA timeouts on a few more
3386                          * peers: I do checks every 'p' seconds on a
3387                          * proportion of the peer table and I need to check
3388                          * every connection 'n' times within a timeout
3389                          * interval, to ensure I detect a timeout on any
3390                          * connection within (n+1)/n times the timeout
3391                          * interval.
3392                          */
3393                         if (*kiblnd_tunables.kib_timeout > n * p)
3394                                 chunk = (chunk * n * p) /
3395                                         *kiblnd_tunables.kib_timeout;
3396                         if (!chunk)
3397                                 chunk = 1;
3398
3399                         for (i = 0; i < chunk; i++) {
3400                                 kiblnd_check_conns(peer_index);
3401                                 peer_index = (peer_index + 1) %
3402                                              kiblnd_data.kib_peer_hash_size;
3403                         }
3404
3405                         deadline += msecs_to_jiffies(p * MSEC_PER_SEC);
3406                         spin_lock_irqsave(lock, flags);
3407                 }
3408
3409                 if (dropped_lock)
3410                         continue;
3411
3412                 /* Nothing to do for 'timeout'  */
3413                 set_current_state(TASK_INTERRUPTIBLE);
3414                 add_wait_queue(&kiblnd_data.kib_connd_waitq, &wait);
3415                 spin_unlock_irqrestore(lock, flags);
3416
3417                 schedule_timeout(timeout);
3418
3419                 remove_wait_queue(&kiblnd_data.kib_connd_waitq, &wait);
3420                 spin_lock_irqsave(lock, flags);
3421         }
3422
3423         spin_unlock_irqrestore(lock, flags);
3424
3425         kiblnd_thread_fini();
3426         return 0;
3427 }
3428
3429 void
3430 kiblnd_qp_event(struct ib_event *event, void *arg)
3431 {
3432         struct kib_conn *conn = arg;
3433
3434         switch (event->event) {
3435         case IB_EVENT_COMM_EST:
3436                 CDEBUG(D_NET, "%s established\n",
3437                        libcfs_nid2str(conn->ibc_peer->ibp_nid));
3438                 /*
3439                  * We received a packet but connection isn't established
3440                  * probably handshake packet was lost, so free to
3441                  * force make connection established
3442                  */
3443                 rdma_notify(conn->ibc_cmid, IB_EVENT_COMM_EST);
3444                 return;
3445
3446         default:
3447                 CERROR("%s: Async QP event type %d\n",
3448                        libcfs_nid2str(conn->ibc_peer->ibp_nid), event->event);
3449                 return;
3450         }
3451 }
3452
3453 static void
3454 kiblnd_complete(struct ib_wc *wc)
3455 {
3456         switch (kiblnd_wreqid2type(wc->wr_id)) {
3457         default:
3458                 LBUG();
3459
3460         case IBLND_WID_MR:
3461                 if (wc->status != IB_WC_SUCCESS &&
3462                     wc->status != IB_WC_WR_FLUSH_ERR)
3463                         CNETERR("FastReg failed: %d\n", wc->status);
3464                 break;
3465
3466         case IBLND_WID_RDMA:
3467                 /*
3468                  * We only get RDMA completion notification if it fails.  All
3469                  * subsequent work items, including the final SEND will fail
3470                  * too.  However we can't print out any more info about the
3471                  * failing RDMA because 'tx' might be back on the idle list or
3472                  * even reused already if we didn't manage to post all our work
3473                  * items
3474                  */
3475                 CNETERR("RDMA (tx: %p) failed: %d\n",
3476                         kiblnd_wreqid2ptr(wc->wr_id), wc->status);
3477                 return;
3478
3479         case IBLND_WID_TX:
3480                 kiblnd_tx_complete(kiblnd_wreqid2ptr(wc->wr_id), wc->status);
3481                 return;
3482
3483         case IBLND_WID_RX:
3484                 kiblnd_rx_complete(kiblnd_wreqid2ptr(wc->wr_id), wc->status,
3485                                    wc->byte_len);
3486                 return;
3487         }
3488 }
3489
3490 void
3491 kiblnd_cq_completion(struct ib_cq *cq, void *arg)
3492 {
3493         /*
3494          * NB I'm not allowed to schedule this conn once its refcount has
3495          * reached 0.  Since fundamentally I'm racing with scheduler threads
3496          * consuming my CQ I could be called after all completions have
3497          * occurred.  But in this case, !ibc_nrx && !ibc_nsends_posted
3498          * and this CQ is about to be destroyed so I NOOP.
3499          */
3500         struct kib_conn *conn = arg;
3501         struct kib_sched_info *sched = conn->ibc_sched;
3502         unsigned long flags;
3503
3504         LASSERT(cq == conn->ibc_cq);
3505
3506         spin_lock_irqsave(&sched->ibs_lock, flags);
3507
3508         conn->ibc_ready = 1;
3509
3510         if (!conn->ibc_scheduled &&
3511             (conn->ibc_nrx > 0 ||
3512              conn->ibc_nsends_posted > 0)) {
3513                 kiblnd_conn_addref(conn); /* +1 ref for sched_conns */
3514                 conn->ibc_scheduled = 1;
3515                 list_add_tail(&conn->ibc_sched_list, &sched->ibs_conns);
3516
3517                 if (waitqueue_active(&sched->ibs_waitq))
3518                         wake_up(&sched->ibs_waitq);
3519         }
3520
3521         spin_unlock_irqrestore(&sched->ibs_lock, flags);
3522 }
3523
3524 void
3525 kiblnd_cq_event(struct ib_event *event, void *arg)
3526 {
3527         struct kib_conn *conn = arg;
3528
3529         CERROR("%s: async CQ event type %d\n",
3530                libcfs_nid2str(conn->ibc_peer->ibp_nid), event->event);
3531 }
3532
3533 int
3534 kiblnd_scheduler(void *arg)
3535 {
3536         long id = (long)arg;
3537         struct kib_sched_info *sched;
3538         struct kib_conn *conn;
3539         wait_queue_entry_t wait;
3540         unsigned long flags;
3541         struct ib_wc wc;
3542         int did_something;
3543         int busy_loops = 0;
3544         int rc;
3545
3546         cfs_block_allsigs();
3547
3548         init_waitqueue_entry(&wait, current);
3549
3550         sched = kiblnd_data.kib_scheds[KIB_THREAD_CPT(id)];
3551
3552         rc = cfs_cpt_bind(lnet_cpt_table(), sched->ibs_cpt);
3553         if (rc) {
3554                 CWARN("Unable to bind on CPU partition %d, please verify whether all CPUs are healthy and reload modules if necessary, otherwise your system might under risk of low performance\n",
3555                       sched->ibs_cpt);
3556         }
3557
3558         spin_lock_irqsave(&sched->ibs_lock, flags);
3559
3560         while (!kiblnd_data.kib_shutdown) {
3561                 if (busy_loops++ >= IBLND_RESCHED) {
3562                         spin_unlock_irqrestore(&sched->ibs_lock, flags);
3563
3564                         cond_resched();
3565                         busy_loops = 0;
3566
3567                         spin_lock_irqsave(&sched->ibs_lock, flags);
3568                 }
3569
3570                 did_something = 0;
3571
3572                 if (!list_empty(&sched->ibs_conns)) {
3573                         conn = list_entry(sched->ibs_conns.next, struct kib_conn,
3574                                           ibc_sched_list);
3575                         /* take over kib_sched_conns' ref on conn... */
3576                         LASSERT(conn->ibc_scheduled);
3577                         list_del(&conn->ibc_sched_list);
3578                         conn->ibc_ready = 0;
3579
3580                         spin_unlock_irqrestore(&sched->ibs_lock, flags);
3581
3582                         wc.wr_id = IBLND_WID_INVAL;
3583
3584                         rc = ib_poll_cq(conn->ibc_cq, 1, &wc);
3585                         if (!rc) {
3586                                 rc = ib_req_notify_cq(conn->ibc_cq,
3587                                                       IB_CQ_NEXT_COMP);
3588                                 if (rc < 0) {
3589                                         CWARN("%s: ib_req_notify_cq failed: %d, closing connection\n",
3590                                               libcfs_nid2str(conn->ibc_peer->ibp_nid), rc);
3591                                         kiblnd_close_conn(conn, -EIO);
3592                                         kiblnd_conn_decref(conn);
3593                                         spin_lock_irqsave(&sched->ibs_lock,
3594                                                           flags);
3595                                         continue;
3596                                 }
3597
3598                                 rc = ib_poll_cq(conn->ibc_cq, 1, &wc);
3599                         }
3600
3601                         if (unlikely(rc > 0 && wc.wr_id == IBLND_WID_INVAL)) {
3602                                 LCONSOLE_ERROR("ib_poll_cq (rc: %d) returned invalid wr_id, opcode %d, status: %d, vendor_err: %d, conn: %s status: %d\nplease upgrade firmware and OFED or contact vendor.\n",
3603                                                rc, wc.opcode, wc.status,
3604                                                wc.vendor_err,
3605                                                libcfs_nid2str(conn->ibc_peer->ibp_nid),
3606                                                conn->ibc_state);
3607                                 rc = -EINVAL;
3608                         }
3609
3610                         if (rc < 0) {
3611                                 CWARN("%s: ib_poll_cq failed: %d, closing connection\n",
3612                                       libcfs_nid2str(conn->ibc_peer->ibp_nid),
3613                                       rc);
3614                                 kiblnd_close_conn(conn, -EIO);
3615                                 kiblnd_conn_decref(conn);
3616                                 spin_lock_irqsave(&sched->ibs_lock, flags);
3617                                 continue;
3618                         }
3619
3620                         spin_lock_irqsave(&sched->ibs_lock, flags);
3621
3622                         if (rc || conn->ibc_ready) {
3623                                 /*
3624                                  * There may be another completion waiting; get
3625                                  * another scheduler to check while I handle
3626                                  * this one...
3627                                  */
3628                                 /* +1 ref for sched_conns */
3629                                 kiblnd_conn_addref(conn);
3630                                 list_add_tail(&conn->ibc_sched_list,
3631                                               &sched->ibs_conns);
3632                                 if (waitqueue_active(&sched->ibs_waitq))
3633                                         wake_up(&sched->ibs_waitq);
3634                         } else {
3635                                 conn->ibc_scheduled = 0;
3636                         }
3637
3638                         if (rc) {
3639                                 spin_unlock_irqrestore(&sched->ibs_lock, flags);
3640                                 kiblnd_complete(&wc);
3641
3642                                 spin_lock_irqsave(&sched->ibs_lock, flags);
3643                         }
3644
3645                         kiblnd_conn_decref(conn); /* ...drop my ref from above */
3646                         did_something = 1;
3647                 }
3648
3649                 if (did_something)
3650                         continue;
3651
3652                 set_current_state(TASK_INTERRUPTIBLE);
3653                 add_wait_queue_exclusive(&sched->ibs_waitq, &wait);
3654                 spin_unlock_irqrestore(&sched->ibs_lock, flags);
3655
3656                 schedule();
3657                 busy_loops = 0;
3658
3659                 remove_wait_queue(&sched->ibs_waitq, &wait);
3660                 spin_lock_irqsave(&sched->ibs_lock, flags);
3661         }
3662
3663         spin_unlock_irqrestore(&sched->ibs_lock, flags);
3664
3665         kiblnd_thread_fini();
3666         return 0;
3667 }
3668
3669 int
3670 kiblnd_failover_thread(void *arg)
3671 {
3672         rwlock_t *glock = &kiblnd_data.kib_global_lock;
3673         struct kib_dev *dev;
3674         wait_queue_entry_t wait;
3675         unsigned long flags;
3676         int rc;
3677
3678         LASSERT(*kiblnd_tunables.kib_dev_failover);
3679
3680         cfs_block_allsigs();
3681
3682         init_waitqueue_entry(&wait, current);
3683         write_lock_irqsave(glock, flags);
3684
3685         while (!kiblnd_data.kib_shutdown) {
3686                 int do_failover = 0;
3687                 int long_sleep;
3688
3689                 list_for_each_entry(dev, &kiblnd_data.kib_failed_devs,
3690                                     ibd_fail_list) {
3691                         if (time_before(cfs_time_current(),
3692                                         dev->ibd_next_failover))
3693                                 continue;
3694                         do_failover = 1;
3695                         break;
3696                 }
3697
3698                 if (do_failover) {
3699                         list_del_init(&dev->ibd_fail_list);
3700                         dev->ibd_failover = 1;
3701                         write_unlock_irqrestore(glock, flags);
3702
3703                         rc = kiblnd_dev_failover(dev);
3704
3705                         write_lock_irqsave(glock, flags);
3706
3707                         LASSERT(dev->ibd_failover);
3708                         dev->ibd_failover = 0;
3709                         if (rc >= 0) { /* Device is OK or failover succeed */
3710                                 dev->ibd_next_failover = cfs_time_shift(3);
3711                                 continue;
3712                         }
3713
3714                         /* failed to failover, retry later */
3715                         dev->ibd_next_failover =
3716                                 cfs_time_shift(min(dev->ibd_failed_failover, 10));
3717                         if (kiblnd_dev_can_failover(dev)) {
3718                                 list_add_tail(&dev->ibd_fail_list,
3719                                               &kiblnd_data.kib_failed_devs);
3720                         }
3721
3722                         continue;
3723                 }
3724
3725                 /* long sleep if no more pending failover */
3726                 long_sleep = list_empty(&kiblnd_data.kib_failed_devs);
3727
3728                 set_current_state(TASK_INTERRUPTIBLE);
3729                 add_wait_queue(&kiblnd_data.kib_failover_waitq, &wait);
3730                 write_unlock_irqrestore(glock, flags);
3731
3732                 rc = schedule_timeout(long_sleep ? cfs_time_seconds(10) :
3733                                                    cfs_time_seconds(1));
3734                 remove_wait_queue(&kiblnd_data.kib_failover_waitq, &wait);
3735                 write_lock_irqsave(glock, flags);
3736
3737                 if (!long_sleep || rc)
3738                         continue;
3739
3740                 /*
3741                  * have a long sleep, routine check all active devices,
3742                  * we need checking like this because if there is not active
3743                  * connection on the dev and no SEND from local, we may listen
3744                  * on wrong HCA for ever while there is a bonding failover
3745                  */
3746                 list_for_each_entry(dev, &kiblnd_data.kib_devs, ibd_list) {
3747                         if (kiblnd_dev_can_failover(dev)) {
3748                                 list_add_tail(&dev->ibd_fail_list,
3749                                               &kiblnd_data.kib_failed_devs);
3750                         }
3751                 }
3752         }
3753
3754         write_unlock_irqrestore(glock, flags);
3755
3756         kiblnd_thread_fini();
3757         return 0;
3758 }