GNU Linux-libre 5.10.215-gnu1
drivers/infiniband/sw/siw/siw_qp.c
1 // SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
2
3 /* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
4 /* Copyright (c) 2008-2019, IBM Corporation */
5
6 #include <linux/errno.h>
7 #include <linux/types.h>
8 #include <linux/net.h>
9 #include <linux/scatterlist.h>
10 #include <linux/llist.h>
11 #include <asm/barrier.h>
12 #include <net/tcp.h>
13
14 #include "siw.h"
15 #include "siw_verbs.h"
16 #include "siw_mem.h"
17
18 static char siw_qp_state_to_string[SIW_QP_STATE_COUNT][sizeof "TERMINATE"] = {
19         [SIW_QP_STATE_IDLE] = "IDLE",
20         [SIW_QP_STATE_RTR] = "RTR",
21         [SIW_QP_STATE_RTS] = "RTS",
22         [SIW_QP_STATE_CLOSING] = "CLOSING",
23         [SIW_QP_STATE_TERMINATE] = "TERMINATE",
24         [SIW_QP_STATE_ERROR] = "ERROR"
25 };
26
27 /*
28  * iWARP (RDMAP, DDP and MPA) parameters as well as Softiwarp settings on a
29  * per-RDMAP message basis. Please keep the order of the initializers. Each
30  * MPA length is initialized to the minimum packet size.
31  */
32 struct iwarp_msg_info iwarp_pktinfo[RDMAP_TERMINATE + 1] = {
33         { /* RDMAP_RDMA_WRITE */
34           .hdr_len = sizeof(struct iwarp_rdma_write),
35           .ctrl.mpa_len = htons(sizeof(struct iwarp_rdma_write) - 2),
36           .ctrl.ddp_rdmap_ctrl = DDP_FLAG_TAGGED | DDP_FLAG_LAST |
37                                  cpu_to_be16(DDP_VERSION << 8) |
38                                  cpu_to_be16(RDMAP_VERSION << 6) |
39                                  cpu_to_be16(RDMAP_RDMA_WRITE),
40           .rx_data = siw_proc_write },
41         { /* RDMAP_RDMA_READ_REQ */
42           .hdr_len = sizeof(struct iwarp_rdma_rreq),
43           .ctrl.mpa_len = htons(sizeof(struct iwarp_rdma_rreq) - 2),
44           .ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST | cpu_to_be16(DDP_VERSION << 8) |
45                                  cpu_to_be16(RDMAP_VERSION << 6) |
46                                  cpu_to_be16(RDMAP_RDMA_READ_REQ),
47           .rx_data = siw_proc_rreq },
48         { /* RDMAP_RDMA_READ_RESP */
49           .hdr_len = sizeof(struct iwarp_rdma_rresp),
50           .ctrl.mpa_len = htons(sizeof(struct iwarp_rdma_rresp) - 2),
51           .ctrl.ddp_rdmap_ctrl = DDP_FLAG_TAGGED | DDP_FLAG_LAST |
52                                  cpu_to_be16(DDP_VERSION << 8) |
53                                  cpu_to_be16(RDMAP_VERSION << 6) |
54                                  cpu_to_be16(RDMAP_RDMA_READ_RESP),
55           .rx_data = siw_proc_rresp },
56         { /* RDMAP_SEND */
57           .hdr_len = sizeof(struct iwarp_send),
58           .ctrl.mpa_len = htons(sizeof(struct iwarp_send) - 2),
59           .ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST | cpu_to_be16(DDP_VERSION << 8) |
60                                  cpu_to_be16(RDMAP_VERSION << 6) |
61                                  cpu_to_be16(RDMAP_SEND),
62           .rx_data = siw_proc_send },
63         { /* RDMAP_SEND_INVAL */
64           .hdr_len = sizeof(struct iwarp_send_inv),
65           .ctrl.mpa_len = htons(sizeof(struct iwarp_send_inv) - 2),
66           .ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST | cpu_to_be16(DDP_VERSION << 8) |
67                                  cpu_to_be16(RDMAP_VERSION << 6) |
68                                  cpu_to_be16(RDMAP_SEND_INVAL),
69           .rx_data = siw_proc_send },
70         { /* RDMAP_SEND_SE */
71           .hdr_len = sizeof(struct iwarp_send),
72           .ctrl.mpa_len = htons(sizeof(struct iwarp_send) - 2),
73           .ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST | cpu_to_be16(DDP_VERSION << 8) |
74                                  cpu_to_be16(RDMAP_VERSION << 6) |
75                                  cpu_to_be16(RDMAP_SEND_SE),
76           .rx_data = siw_proc_send },
77         { /* RDMAP_SEND_SE_INVAL */
78           .hdr_len = sizeof(struct iwarp_send_inv),
79           .ctrl.mpa_len = htons(sizeof(struct iwarp_send_inv) - 2),
80           .ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST | cpu_to_be16(DDP_VERSION << 8) |
81                                  cpu_to_be16(RDMAP_VERSION << 6) |
82                                  cpu_to_be16(RDMAP_SEND_SE_INVAL),
83           .rx_data = siw_proc_send },
84         { /* RDMAP_TERMINATE */
85           .hdr_len = sizeof(struct iwarp_terminate),
86           .ctrl.mpa_len = htons(sizeof(struct iwarp_terminate) - 2),
87           .ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST | cpu_to_be16(DDP_VERSION << 8) |
88                                  cpu_to_be16(RDMAP_VERSION << 6) |
89                                  cpu_to_be16(RDMAP_TERMINATE),
90           .rx_data = siw_proc_terminate }
91 };
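/*
 * Usage note: the per-opcode .ctrl template above is copied verbatim
 * into outbound headers (see e.g. the RDMAP_TERMINATE handling in
 * siw_send_terminate() below), while the receive path dispatches an
 * inbound frame on its parsed RDMAP opcode, roughly as
 *
 *	rv = iwarp_pktinfo[__rdmap_get_opcode(&srx->hdr.ctrl)].rx_data(qp);
 *
 * with the actual dispatch living in siw_qp_rx.c.
 */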
92
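/*
 * Data ready callback, installed on the QP's TCP socket. If receive
 * processing is not suspended, the QP state lock can be taken and the
 * QP is in RTS, inbound data is consumed directly from within the
 * socket callback via tcp_read_sock()/siw_tcp_rx_data().
 */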
93 void siw_qp_llp_data_ready(struct sock *sk)
94 {
95         struct siw_qp *qp;
96
97         read_lock(&sk->sk_callback_lock);
98
99         if (unlikely(!sk->sk_user_data || !sk_to_qp(sk)))
100                 goto done;
101
102         qp = sk_to_qp(sk);
103
104         if (likely(!qp->rx_stream.rx_suspend &&
105                    down_read_trylock(&qp->state_lock))) {
106                 read_descriptor_t rd_desc = { .arg.data = qp, .count = 1 };
107
108                 if (likely(qp->attrs.state == SIW_QP_STATE_RTS))
109                         /*
110                          * Implements data receive operation during
111                          * socket callback. TCP gracefully catches
112                          * the case where there is nothing to receive
113                          * (not calling siw_tcp_rx_data() then).
114                          */
115                         tcp_read_sock(sk, &rd_desc, siw_tcp_rx_data);
116
117                 up_read(&qp->state_lock);
118         } else {
119                 siw_dbg_qp(qp, "unable to process RX, suspend: %d\n",
120                            qp->rx_stream.rx_suspend);
121         }
122 done:
123         read_unlock(&sk->sk_callback_lock);
124 }
125
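/*
 * Called when the LLP (TCP) connection goes down. Suspends RX and TX
 * processing, detaches the socket, moves the QP out of its current
 * state (to ERROR in most cases; for CLOSING the result depends on
 * whether a transmit WQE is still active), flushes SQ and RQ, and
 * drops the reference to the connection endpoint.
 */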
126 void siw_qp_llp_close(struct siw_qp *qp)
127 {
128         siw_dbg_qp(qp, "enter llp close, state = %s\n",
129                    siw_qp_state_to_string[qp->attrs.state]);
130
131         down_write(&qp->state_lock);
132
133         qp->rx_stream.rx_suspend = 1;
134         qp->tx_ctx.tx_suspend = 1;
135         qp->attrs.sk = NULL;
136
137         switch (qp->attrs.state) {
138         case SIW_QP_STATE_RTS:
139         case SIW_QP_STATE_RTR:
140         case SIW_QP_STATE_IDLE:
141         case SIW_QP_STATE_TERMINATE:
142                 qp->attrs.state = SIW_QP_STATE_ERROR;
143                 break;
144         /*
145          * SIW_QP_STATE_CLOSING:
146          *
147          * This is a forced close. Shall the QP be moved to
148          * ERROR or IDLE?
149          */
150         case SIW_QP_STATE_CLOSING:
151                 if (tx_wqe(qp)->wr_status == SIW_WR_IDLE)
152                         qp->attrs.state = SIW_QP_STATE_ERROR;
153                 else
154                         qp->attrs.state = SIW_QP_STATE_IDLE;
155                 break;
156
157         default:
158                 siw_dbg_qp(qp, "llp close: no state transition needed: %s\n",
159                            siw_qp_state_to_string[qp->attrs.state]);
160                 break;
161         }
162         siw_sq_flush(qp);
163         siw_rq_flush(qp);
164
165         /*
166          * Dereference closing CEP
167          */
168         if (qp->cep) {
169                 siw_cep_put(qp->cep);
170                 qp->cep = NULL;
171         }
172
173         up_write(&qp->state_lock);
174
175         siw_dbg_qp(qp, "llp close exit: state %s\n",
176                    siw_qp_state_to_string[qp->attrs.state]);
177 }
178
179 /*
180  * Socket callback routine informing about newly available send space.
181  * It schedules SQ work to process pending SQ items.
182  */
183 void siw_qp_llp_write_space(struct sock *sk)
184 {
185         struct siw_cep *cep;
186
187         read_lock(&sk->sk_callback_lock);
188
189         cep = sk_to_cep(sk);
190         if (cep) {
191                 cep->sk_write_space(sk);
192
193                 if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
194                         (void)siw_sq_start(cep->qp);
195         }
196
197         read_unlock(&sk->sk_callback_lock);
198 }
199
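/*
 * Allocate the inbound (IRQ) and outbound (ORQ) READ request queues.
 * Requested sizes are rounded up to the next power of two. If the ORQ
 * allocation fails, the already allocated IRQ is freed again.
 */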
200 static int siw_qp_readq_init(struct siw_qp *qp, int irq_size, int orq_size)
201 {
202         if (irq_size) {
203                 irq_size = roundup_pow_of_two(irq_size);
204                 qp->irq = vzalloc(irq_size * sizeof(struct siw_sqe));
205                 if (!qp->irq) {
206                         qp->attrs.irq_size = 0;
207                         return -ENOMEM;
208                 }
209         }
210         if (orq_size) {
211                 orq_size = roundup_pow_of_two(orq_size);
212                 qp->orq = vzalloc(orq_size * sizeof(struct siw_sqe));
213                 if (!qp->orq) {
214                         qp->attrs.orq_size = 0;
215                         qp->attrs.irq_size = 0;
216                         vfree(qp->irq);
217                         return -ENOMEM;
218                 }
219         }
220         qp->attrs.irq_size = irq_size;
221         qp->attrs.orq_size = orq_size;
222         siw_dbg_qp(qp, "ORD %d, IRD %d\n", orq_size, irq_size);
223         return 0;
224 }
225
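/*
 * Enable MPA CRC computation for this QP: allocate one shash descriptor
 * each for the TX and RX path, both backed by the global
 * siw_crypto_shash transform. Fails with -ENOENT if no CRC transform
 * is available.
 */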
226 static int siw_qp_enable_crc(struct siw_qp *qp)
227 {
228         struct siw_rx_stream *c_rx = &qp->rx_stream;
229         struct siw_iwarp_tx *c_tx = &qp->tx_ctx;
230         int size;
231
232         if (siw_crypto_shash == NULL)
233                 return -ENOENT;
234
235         size = crypto_shash_descsize(siw_crypto_shash) +
236                 sizeof(struct shash_desc);
237
238         c_tx->mpa_crc_hd = kzalloc(size, GFP_KERNEL);
239         c_rx->mpa_crc_hd = kzalloc(size, GFP_KERNEL);
240         if (!c_tx->mpa_crc_hd || !c_rx->mpa_crc_hd) {
241                 kfree(c_tx->mpa_crc_hd);
242                 kfree(c_rx->mpa_crc_hd);
243                 c_tx->mpa_crc_hd = NULL;
244                 c_rx->mpa_crc_hd = NULL;
245                 return -ENOMEM;
246         }
247         c_tx->mpa_crc_hd->tfm = siw_crypto_shash;
248         c_rx->mpa_crc_hd->tfm = siw_crypto_shash;
249
250         return 0;
251 }
252
253 /*
254  * Send a non-signalled READ or WRITE to the peer side as negotiated
255  * with the MPAv2 P2P setup protocol. The work request is only created
256  * as the current active WR and does not consume Send Queue space.
257  *
258  * Caller must hold QP state lock.
259  */
260 int siw_qp_mpa_rts(struct siw_qp *qp, enum mpa_v2_ctrl ctrl)
261 {
262         struct siw_wqe *wqe = tx_wqe(qp);
263         unsigned long flags;
264         int rv = 0;
265
266         spin_lock_irqsave(&qp->sq_lock, flags);
267
268         if (unlikely(wqe->wr_status != SIW_WR_IDLE)) {
269                 spin_unlock_irqrestore(&qp->sq_lock, flags);
270                 return -EIO;
271         }
272         memset(wqe->mem, 0, sizeof(*wqe->mem) * SIW_MAX_SGE);
273
274         wqe->wr_status = SIW_WR_QUEUED;
275         wqe->sqe.flags = 0;
276         wqe->sqe.num_sge = 1;
277         wqe->sqe.sge[0].length = 0;
278         wqe->sqe.sge[0].laddr = 0;
279         wqe->sqe.sge[0].lkey = 0;
280         /*
281          * While the STag must not be checked for an inbound zero
282          * length READ/WRITE, some HW may treat STag 0 specially.
283          */
284         wqe->sqe.rkey = 1;
285         wqe->sqe.raddr = 0;
286         wqe->processed = 0;
287
288         if (ctrl & MPA_V2_RDMA_WRITE_RTR)
289                 wqe->sqe.opcode = SIW_OP_WRITE;
290         else if (ctrl & MPA_V2_RDMA_READ_RTR) {
291                 struct siw_sqe *rreq = NULL;
292
293                 wqe->sqe.opcode = SIW_OP_READ;
294
295                 spin_lock(&qp->orq_lock);
296
297                 if (qp->attrs.orq_size)
298                         rreq = orq_get_free(qp);
299                 if (rreq) {
300                         siw_read_to_orq(rreq, &wqe->sqe);
301                         qp->orq_put++;
302                 } else
303                         rv = -EIO;
304
305                 spin_unlock(&qp->orq_lock);
306         } else
307                 rv = -EINVAL;
308
309         if (rv)
310                 wqe->wr_status = SIW_WR_IDLE;
311
312         spin_unlock_irqrestore(&qp->sq_lock, flags);
313
314         if (!rv)
315                 rv = siw_sq_start(qp);
316
317         return rv;
318 }
319
320 /*
321  * Map memory access error to DDP tagged error
322  */
323 enum ddp_ecode siw_tagged_error(enum siw_access_state state)
324 {
325         switch (state) {
326         case E_STAG_INVALID:
327                 return DDP_ECODE_T_INVALID_STAG;
328         case E_BASE_BOUNDS:
329                 return DDP_ECODE_T_BASE_BOUNDS;
330         case E_PD_MISMATCH:
331                 return DDP_ECODE_T_STAG_NOT_ASSOC;
332         case E_ACCESS_PERM:
333                 /*
334                  * RFC 5041 (DDP) lacks an ecode for insufficient access
335                  * permissions. 'Invalid STag' seems to be the closest
336                  * match though.
337                  */
338                 return DDP_ECODE_T_INVALID_STAG;
339         default:
340                 WARN_ON(1);
341                 return DDP_ECODE_T_INVALID_STAG;
342         }
343 }
344
345 /*
346  * Map memory access error to RDMAP protection error
347  */
348 enum rdmap_ecode siw_rdmap_error(enum siw_access_state state)
349 {
350         switch (state) {
351         case E_STAG_INVALID:
352                 return RDMAP_ECODE_INVALID_STAG;
353         case E_BASE_BOUNDS:
354                 return RDMAP_ECODE_BASE_BOUNDS;
355         case E_PD_MISMATCH:
356                 return RDMAP_ECODE_STAG_NOT_ASSOC;
357         case E_ACCESS_PERM:
358                 return RDMAP_ECODE_ACCESS_RIGHTS;
359         default:
360                 return RDMAP_ECODE_UNSPECIFIED;
361         }
362 }
363
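/*
 * Record TERMINATE error information (layer, error type, error code)
 * for later transmission by siw_send_terminate(). Only the first
 * recorded error is kept while qp->term_info is still marked valid.
 */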
364 void siw_init_terminate(struct siw_qp *qp, enum term_elayer layer, u8 etype,
365                         u8 ecode, int in_tx)
366 {
367         if (!qp->term_info.valid) {
368                 memset(&qp->term_info, 0, sizeof(qp->term_info));
369                 qp->term_info.layer = layer;
370                 qp->term_info.etype = etype;
371                 qp->term_info.ecode = ecode;
372                 qp->term_info.in_tx = in_tx;
373                 qp->term_info.valid = 1;
374         }
375         siw_dbg_qp(qp, "init TERM: layer %d, type %d, code %d, in tx %s\n",
376                    layer, etype, ecode, in_tx ? "yes" : "no");
377 }
378
379 /*
380  * Send a TERMINATE message, as defined in RFCs 5040/5041/5044/6581.
381  * Sending TERMINATE messages is best effort - such messages
382  * can only be sent if the QP is still connected and it does
383  * not have another outbound message in progress, i.e. the
384  * TERMINATE message must not interfere with an incomplete current
385  * transmit operation.
386  */
387 void siw_send_terminate(struct siw_qp *qp)
388 {
389         struct kvec iov[3];
390         struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_EOR };
391         struct iwarp_terminate *term = NULL;
392         union iwarp_hdr *err_hdr = NULL;
393         struct socket *s = qp->attrs.sk;
394         struct siw_rx_stream *srx = &qp->rx_stream;
395         union iwarp_hdr *rx_hdr = &srx->hdr;
396         u32 crc = 0;
397         int num_frags, len_terminate, rv;
398
399         if (!qp->term_info.valid)
400                 return;
401
402         qp->term_info.valid = 0;
403
404         if (tx_wqe(qp)->wr_status == SIW_WR_INPROGRESS) {
405                 siw_dbg_qp(qp, "cannot send TERMINATE: op %d in progress\n",
406                            tx_type(tx_wqe(qp)));
407                 return;
408         }
409         if (!s && qp->cep)
410                 /* QP not yet in RTS. Take socket from connection endpoint */
411                 s = qp->cep->sock;
412
413         if (!s) {
414                 siw_dbg_qp(qp, "cannot send TERMINATE: not connected\n");
415                 return;
416         }
417
418         term = kzalloc(sizeof(*term), GFP_KERNEL);
419         if (!term)
420                 return;
421
422         term->ddp_qn = cpu_to_be32(RDMAP_UNTAGGED_QN_TERMINATE);
423         term->ddp_mo = 0;
424         term->ddp_msn = cpu_to_be32(1);
425
426         iov[0].iov_base = term;
427         iov[0].iov_len = sizeof(*term);
428
429         if ((qp->term_info.layer == TERM_ERROR_LAYER_DDP) ||
430             ((qp->term_info.layer == TERM_ERROR_LAYER_RDMAP) &&
431              (qp->term_info.etype != RDMAP_ETYPE_CATASTROPHIC))) {
432                 err_hdr = kzalloc(sizeof(*err_hdr), GFP_KERNEL);
433                 if (!err_hdr) {
434                         kfree(term);
435                         return;
436                 }
437         }
438         memcpy(&term->ctrl, &iwarp_pktinfo[RDMAP_TERMINATE].ctrl,
439                sizeof(struct iwarp_ctrl));
440
441         __rdmap_term_set_layer(term, qp->term_info.layer);
442         __rdmap_term_set_etype(term, qp->term_info.etype);
443         __rdmap_term_set_ecode(term, qp->term_info.ecode);
444
445         switch (qp->term_info.layer) {
446         case TERM_ERROR_LAYER_RDMAP:
447                 if (qp->term_info.etype == RDMAP_ETYPE_CATASTROPHIC)
448                         /* No additional DDP/RDMAP header to be included */
449                         break;
450
451                 if (qp->term_info.etype == RDMAP_ETYPE_REMOTE_PROTECTION) {
452                         /*
453                          * Complete RDMAP frame will get attached, and
454                          * DDP segment length is valid
455                          */
456                         term->flag_m = 1;
457                         term->flag_d = 1;
458                         term->flag_r = 1;
459
460                         if (qp->term_info.in_tx) {
461                                 struct iwarp_rdma_rreq *rreq;
462                                 struct siw_wqe *wqe = tx_wqe(qp);
463
464                                 /* Inbound RREQ error, detected during
465                                  * RRESP creation. Take state from
466                                  * current TX work queue element to
467                                  * reconstruct the peer's RREQ.
468                                  */
469                                 rreq = (struct iwarp_rdma_rreq *)err_hdr;
470
471                                 memcpy(&rreq->ctrl,
472                                        &iwarp_pktinfo[RDMAP_RDMA_READ_REQ].ctrl,
473                                        sizeof(struct iwarp_ctrl));
474
475                                 rreq->rsvd = 0;
476                                 rreq->ddp_qn =
477                                         htonl(RDMAP_UNTAGGED_QN_RDMA_READ);
478
479                                 /* Provide RREQ's MSN as kept aside */
480                                 rreq->ddp_msn = htonl(wqe->sqe.sge[0].length);
481
482                                 rreq->ddp_mo = htonl(wqe->processed);
483                                 rreq->sink_stag = htonl(wqe->sqe.rkey);
484                                 rreq->sink_to = cpu_to_be64(wqe->sqe.raddr);
485                                 rreq->read_size = htonl(wqe->sqe.sge[0].length);
486                                 rreq->source_stag = htonl(wqe->sqe.sge[0].lkey);
487                                 rreq->source_to =
488                                         cpu_to_be64(wqe->sqe.sge[0].laddr);
489
490                                 iov[1].iov_base = rreq;
491                                 iov[1].iov_len = sizeof(*rreq);
492
493                                 rx_hdr = (union iwarp_hdr *)rreq;
494                         } else {
495                                 /* Take RDMAP/DDP information from
496                                  * current (failed) inbound frame.
497                                  */
498                                 iov[1].iov_base = rx_hdr;
499
500                                 if (__rdmap_get_opcode(&rx_hdr->ctrl) ==
501                                     RDMAP_RDMA_READ_REQ)
502                                         iov[1].iov_len =
503                                                 sizeof(struct iwarp_rdma_rreq);
504                                 else /* SEND type */
505                                         iov[1].iov_len =
506                                                 sizeof(struct iwarp_send);
507                         }
508                 } else {
509                         /* Do not report DDP hdr information if packet
510                          * layout is unknown
511                          */
512                         if ((qp->term_info.ecode == RDMAP_ECODE_VERSION) ||
513                             (qp->term_info.ecode == RDMAP_ECODE_OPCODE))
514                                 break;
515
516                         iov[1].iov_base = rx_hdr;
517
518                         /* Only DDP frame will get attached */
519                         if (rx_hdr->ctrl.ddp_rdmap_ctrl & DDP_FLAG_TAGGED)
520                                 iov[1].iov_len =
521                                         sizeof(struct iwarp_rdma_write);
522                         else
523                                 iov[1].iov_len = sizeof(struct iwarp_send);
524
525                         term->flag_m = 1;
526                         term->flag_d = 1;
527                 }
528                 term->ctrl.mpa_len = cpu_to_be16(iov[1].iov_len);
529                 break;
530
531         case TERM_ERROR_LAYER_DDP:
532                 /* Report error encountered during DDP processing.
533                  * This can only happen as a result of inbound
534                  * DDP processing
535                  */
536
537                 /* Do not report DDP hdr information if packet
538                  * layout is unknown
539                  */
540                 if (((qp->term_info.etype == DDP_ETYPE_TAGGED_BUF) &&
541                      (qp->term_info.ecode == DDP_ECODE_T_VERSION)) ||
542                     ((qp->term_info.etype == DDP_ETYPE_UNTAGGED_BUF) &&
543                      (qp->term_info.ecode == DDP_ECODE_UT_VERSION)))
544                         break;
545
546                 iov[1].iov_base = rx_hdr;
547
548                 if (rx_hdr->ctrl.ddp_rdmap_ctrl & DDP_FLAG_TAGGED)
549                         iov[1].iov_len = sizeof(struct iwarp_ctrl_tagged);
550                 else
551                         iov[1].iov_len = sizeof(struct iwarp_ctrl_untagged);
552
553                 term->flag_m = 1;
554                 term->flag_d = 1;
555                 break;
556
557         default:
558                 break;
559         }
560         if (term->flag_m || term->flag_d || term->flag_r) {
561                 iov[2].iov_base = &crc;
562                 iov[2].iov_len = sizeof(crc);
563                 len_terminate = sizeof(*term) + iov[1].iov_len + MPA_CRC_SIZE;
564                 num_frags = 3;
565         } else {
566                 iov[1].iov_base = &crc;
567                 iov[1].iov_len = sizeof(crc);
568                 len_terminate = sizeof(*term) + MPA_CRC_SIZE;
569                 num_frags = 2;
570         }
571
572         /* Adjust DDP Segment Length parameter, if valid */
573         if (term->flag_m) {
574                 u32 real_ddp_len = be16_to_cpu(rx_hdr->ctrl.mpa_len);
575                 enum rdma_opcode op = __rdmap_get_opcode(&rx_hdr->ctrl);
576
577                 real_ddp_len -= iwarp_pktinfo[op].hdr_len - MPA_HDR_SIZE;
578                 rx_hdr->ctrl.mpa_len = cpu_to_be16(real_ddp_len);
579         }
580
581         term->ctrl.mpa_len =
582                 cpu_to_be16(len_terminate - (MPA_HDR_SIZE + MPA_CRC_SIZE));
583         if (qp->tx_ctx.mpa_crc_hd) {
584                 crypto_shash_init(qp->tx_ctx.mpa_crc_hd);
585                 if (crypto_shash_update(qp->tx_ctx.mpa_crc_hd,
586                                         (u8 *)iov[0].iov_base,
587                                         iov[0].iov_len))
588                         goto out;
589
590                 if (num_frags == 3) {
591                         if (crypto_shash_update(qp->tx_ctx.mpa_crc_hd,
592                                                 (u8 *)iov[1].iov_base,
593                                                 iov[1].iov_len))
594                                 goto out;
595                 }
596                 crypto_shash_final(qp->tx_ctx.mpa_crc_hd, (u8 *)&crc);
597         }
598
599         rv = kernel_sendmsg(s, &msg, iov, num_frags, len_terminate);
600         siw_dbg_qp(qp, "sent TERM: %s, layer %d, type %d, code %d (%d bytes)\n",
601                    rv == len_terminate ? "success" : "failure",
602                    __rdmap_term_layer(term), __rdmap_term_etype(term),
603                    __rdmap_term_ecode(term), rv);
604 out:
605         kfree(term);
606         kfree(err_hdr);
607 }
608
609 /*
610  * Handle all attrs other than state
611  */
612 static void siw_qp_modify_nonstate(struct siw_qp *qp,
613                                    struct siw_qp_attrs *attrs,
614                                    enum siw_qp_attr_mask mask)
615 {
616         if (mask & SIW_QP_ATTR_ACCESS_FLAGS) {
617                 if (attrs->flags & SIW_RDMA_BIND_ENABLED)
618                         qp->attrs.flags |= SIW_RDMA_BIND_ENABLED;
619                 else
620                         qp->attrs.flags &= ~SIW_RDMA_BIND_ENABLED;
621
622                 if (attrs->flags & SIW_RDMA_WRITE_ENABLED)
623                         qp->attrs.flags |= SIW_RDMA_WRITE_ENABLED;
624                 else
625                         qp->attrs.flags &= ~SIW_RDMA_WRITE_ENABLED;
626
627                 if (attrs->flags & SIW_RDMA_READ_ENABLED)
628                         qp->attrs.flags |= SIW_RDMA_READ_ENABLED;
629                 else
630                         qp->attrs.flags &= ~SIW_RDMA_READ_ENABLED;
631         }
632 }
633
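/*
 * QP state transition out of IDLE or RTR. Moving to RTS requires the
 * LLP socket and MPA attributes to be present, optionally enables MPA
 * CRC, resets the DDP MSN counters for TX and RX and sets up the
 * IRQ/ORQ. Moving to ERROR flushes the RQ and drops the CEP reference.
 */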
634 static int siw_qp_nextstate_from_idle(struct siw_qp *qp,
635                                       struct siw_qp_attrs *attrs,
636                                       enum siw_qp_attr_mask mask)
637 {
638         int rv = 0;
639
640         switch (attrs->state) {
641         case SIW_QP_STATE_RTS:
642                 if (attrs->flags & SIW_MPA_CRC) {
643                         rv = siw_qp_enable_crc(qp);
644                         if (rv)
645                                 break;
646                 }
647                 if (!(mask & SIW_QP_ATTR_LLP_HANDLE)) {
648                         siw_dbg_qp(qp, "no socket\n");
649                         rv = -EINVAL;
650                         break;
651                 }
652                 if (!(mask & SIW_QP_ATTR_MPA)) {
653                         siw_dbg_qp(qp, "no MPA\n");
654                         rv = -EINVAL;
655                         break;
656                 }
657                 /*
658                  * Initialize iWARP TX state
659                  */
660                 qp->tx_ctx.ddp_msn[RDMAP_UNTAGGED_QN_SEND] = 0;
661                 qp->tx_ctx.ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ] = 0;
662                 qp->tx_ctx.ddp_msn[RDMAP_UNTAGGED_QN_TERMINATE] = 0;
663
664                 /*
665                  * Initialize iWARP RX state
666                  */
667                 qp->rx_stream.ddp_msn[RDMAP_UNTAGGED_QN_SEND] = 1;
668                 qp->rx_stream.ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ] = 1;
669                 qp->rx_stream.ddp_msn[RDMAP_UNTAGGED_QN_TERMINATE] = 1;
670
671                 /*
672                  * Initialize the IRQ and ORQ; the caller has
673                  * already checked the limits.
674                  */
675                 rv = siw_qp_readq_init(qp, attrs->irq_size,
676                                        attrs->orq_size);
677                 if (rv)
678                         break;
679
680                 qp->attrs.sk = attrs->sk;
681                 qp->attrs.state = SIW_QP_STATE_RTS;
682
683                 siw_dbg_qp(qp, "enter RTS: crc=%s, ord=%u, ird=%u\n",
684                            attrs->flags & SIW_MPA_CRC ? "y" : "n",
685                            qp->attrs.orq_size, qp->attrs.irq_size);
686                 break;
687
688         case SIW_QP_STATE_ERROR:
689                 siw_rq_flush(qp);
690                 qp->attrs.state = SIW_QP_STATE_ERROR;
691                 if (qp->cep) {
692                         siw_cep_put(qp->cep);
693                         qp->cep = NULL;
694                 }
695                 break;
696
697         default:
698                 break;
699         }
700         return rv;
701 }
702
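/*
 * QP state transition out of RTS. Returns non-zero if the caller
 * (siw_qp_modify()) shall drop the LLP connection.
 */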
703 static int siw_qp_nextstate_from_rts(struct siw_qp *qp,
704                                      struct siw_qp_attrs *attrs)
705 {
706         int drop_conn = 0;
707
708         switch (attrs->state) {
709         case SIW_QP_STATE_CLOSING:
710                 /*
711                  * Verbs: move to IDLE if SQ and ORQ are empty.
712                  * Move to ERROR otherwise. But first of all we must
713                  * close the connection. So we keep CLOSING or ERROR
714                  * as a transient state, schedule connection drop work
715                  * and wait for the socket state change upcall to
716                  * come back closed.
717                  */
718                 if (tx_wqe(qp)->wr_status == SIW_WR_IDLE) {
719                         qp->attrs.state = SIW_QP_STATE_CLOSING;
720                 } else {
721                         qp->attrs.state = SIW_QP_STATE_ERROR;
722                         siw_sq_flush(qp);
723                 }
724                 siw_rq_flush(qp);
725
726                 drop_conn = 1;
727                 break;
728
729         case SIW_QP_STATE_TERMINATE:
730                 qp->attrs.state = SIW_QP_STATE_TERMINATE;
731
732                 siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP,
733                                    RDMAP_ETYPE_CATASTROPHIC,
734                                    RDMAP_ECODE_UNSPECIFIED, 1);
735                 drop_conn = 1;
736                 break;
737
738         case SIW_QP_STATE_ERROR:
739                 /*
740                  * This is an emergency close.
741                  *
742                  * Any in-progress transmit operation will get
743                  * cancelled.
744                  * This will likely result in a protocol failure
745                  * if a TX operation is in transit. The caller
746                  * could unconditionally wait to give the current
747                  * operation a chance to complete.
748                  * Especially, how should the non-empty IRQ case be
749                  * handled? The peer was asking for data transfer
750                  * at a valid point in time.
751                  */
752                 siw_sq_flush(qp);
753                 siw_rq_flush(qp);
754                 qp->attrs.state = SIW_QP_STATE_ERROR;
755                 drop_conn = 1;
756                 break;
757
758         default:
759                 break;
760         }
761         return drop_conn;
762 }
763
764 static void siw_qp_nextstate_from_term(struct siw_qp *qp,
765                                        struct siw_qp_attrs *attrs)
766 {
767         switch (attrs->state) {
768         case SIW_QP_STATE_ERROR:
769                 siw_rq_flush(qp);
770                 qp->attrs.state = SIW_QP_STATE_ERROR;
771
772                 if (tx_wqe(qp)->wr_status != SIW_WR_IDLE)
773                         siw_sq_flush(qp);
774                 break;
775
776         default:
777                 break;
778         }
779 }
780
781 static int siw_qp_nextstate_from_close(struct siw_qp *qp,
782                                        struct siw_qp_attrs *attrs)
783 {
784         int rv = 0;
785
786         switch (attrs->state) {
787         case SIW_QP_STATE_IDLE:
788                 WARN_ON(tx_wqe(qp)->wr_status != SIW_WR_IDLE);
789                 qp->attrs.state = SIW_QP_STATE_IDLE;
790                 break;
791
792         case SIW_QP_STATE_CLOSING:
793                 /*
794                  * The LLP may have already moved the QP to CLOSING
795                  * due to a graceful peer close initiation.
796                  */
797                 break;
798
799         case SIW_QP_STATE_ERROR:
800                 /*
801                  * The QP was moved to CLOSING by an LLP event
802                  * not yet seen by the user.
803                  */
804                 qp->attrs.state = SIW_QP_STATE_ERROR;
805
806                 if (tx_wqe(qp)->wr_status != SIW_WR_IDLE)
807                         siw_sq_flush(qp);
808
809                 siw_rq_flush(qp);
810                 break;
811
812         default:
813                 siw_dbg_qp(qp, "state transition undefined: %s => %s\n",
814                            siw_qp_state_to_string[qp->attrs.state],
815                            siw_qp_state_to_string[attrs->state]);
816
817                 rv = -ECONNABORTED;
818         }
819         return rv;
820 }
821
822 /*
823  * Caller must hold qp->state_lock
824  */
825 int siw_qp_modify(struct siw_qp *qp, struct siw_qp_attrs *attrs,
826                   enum siw_qp_attr_mask mask)
827 {
828         int drop_conn = 0, rv = 0;
829
830         if (!mask)
831                 return 0;
832
833         siw_dbg_qp(qp, "state: %s => %s\n",
834                    siw_qp_state_to_string[qp->attrs.state],
835                    siw_qp_state_to_string[attrs->state]);
836
837         if (mask != SIW_QP_ATTR_STATE)
838                 siw_qp_modify_nonstate(qp, attrs, mask);
839
840         if (!(mask & SIW_QP_ATTR_STATE))
841                 return 0;
842
843         switch (qp->attrs.state) {
844         case SIW_QP_STATE_IDLE:
845         case SIW_QP_STATE_RTR:
846                 rv = siw_qp_nextstate_from_idle(qp, attrs, mask);
847                 break;
848
849         case SIW_QP_STATE_RTS:
850                 drop_conn = siw_qp_nextstate_from_rts(qp, attrs);
851                 break;
852
853         case SIW_QP_STATE_TERMINATE:
854                 siw_qp_nextstate_from_term(qp, attrs);
855                 break;
856
857         case SIW_QP_STATE_CLOSING:
858                 siw_qp_nextstate_from_close(qp, attrs);
859                 break;
860         default:
861                 break;
862         }
863         if (drop_conn)
864                 siw_qp_cm_drop(qp, 0);
865
866         return rv;
867 }
868
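/*
 * Copy the relevant parts of a READ request SQE into a free ORQ entry
 * and mark it valid, so the inbound READ response can later be matched
 * against the outstanding request.
 */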
869 void siw_read_to_orq(struct siw_sqe *rreq, struct siw_sqe *sqe)
870 {
871         rreq->id = sqe->id;
872         rreq->opcode = sqe->opcode;
873         rreq->sge[0].laddr = sqe->sge[0].laddr;
874         rreq->sge[0].length = sqe->sge[0].length;
875         rreq->sge[0].lkey = sqe->sge[0].lkey;
876         rreq->sge[1].lkey = sqe->sge[1].lkey;
877         rreq->flags = sqe->flags | SIW_WQE_VALID;
878         rreq->num_sge = 1;
879 }
880
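/*
 * Fetch the next valid SQE from the send queue into the QP's private
 * transmit WQE. Handles INLINE data placement, READ fence semantics
 * and ORQ space reservation for READ requests. Returns 1 if a WQE was
 * activated, 0 if the SQ is empty or the WQE got fenced, and a
 * negative errno on error.
 */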
881 static int siw_activate_tx_from_sq(struct siw_qp *qp)
882 {
883         struct siw_sqe *sqe;
884         struct siw_wqe *wqe = tx_wqe(qp);
885         int rv = 1;
886
887         sqe = sq_get_next(qp);
888         if (!sqe)
889                 return 0;
890
891         memset(wqe->mem, 0, sizeof(*wqe->mem) * SIW_MAX_SGE);
892         wqe->wr_status = SIW_WR_QUEUED;
893
894         /* First copy SQE to kernel private memory */
895         memcpy(&wqe->sqe, sqe, sizeof(*sqe));
896
897         if (wqe->sqe.opcode >= SIW_NUM_OPCODES) {
898                 rv = -EINVAL;
899                 goto out;
900         }
901         if (wqe->sqe.flags & SIW_WQE_INLINE) {
902                 if (wqe->sqe.opcode != SIW_OP_SEND &&
903                     wqe->sqe.opcode != SIW_OP_WRITE) {
904                         rv = -EINVAL;
905                         goto out;
906                 }
907                 if (wqe->sqe.sge[0].length > SIW_MAX_INLINE) {
908                         rv = -EINVAL;
909                         goto out;
910                 }
911                 wqe->sqe.sge[0].laddr = (uintptr_t)&wqe->sqe.sge[1];
912                 wqe->sqe.sge[0].lkey = 0;
913                 wqe->sqe.num_sge = 1;
914         }
915         if (wqe->sqe.flags & SIW_WQE_READ_FENCE) {
916                 /* A READ cannot be fenced */
917                 if (unlikely(wqe->sqe.opcode == SIW_OP_READ ||
918                              wqe->sqe.opcode ==
919                                      SIW_OP_READ_LOCAL_INV)) {
920                         siw_dbg_qp(qp, "cannot fence read\n");
921                         rv = -EINVAL;
922                         goto out;
923                 }
924                 spin_lock(&qp->orq_lock);
925
926                 if (qp->attrs.orq_size && !siw_orq_empty(qp)) {
927                         qp->tx_ctx.orq_fence = 1;
928                         rv = 0;
929                 }
930                 spin_unlock(&qp->orq_lock);
931
932         } else if (wqe->sqe.opcode == SIW_OP_READ ||
933                    wqe->sqe.opcode == SIW_OP_READ_LOCAL_INV) {
934                 struct siw_sqe *rreq;
935
936                 if (unlikely(!qp->attrs.orq_size)) {
937                         /* We negotiated not to send READ requests */
938                         rv = -EINVAL;
939                         goto out;
940                 }
941                 wqe->sqe.num_sge = 1;
942
943                 spin_lock(&qp->orq_lock);
944
945                 rreq = orq_get_free(qp);
946                 if (rreq) {
947                         /*
948                          * Make an immediate copy in ORQ to be ready
949                          * to process loopback READ reply
950                          */
951                         siw_read_to_orq(rreq, &wqe->sqe);
952                         qp->orq_put++;
953                 } else {
954                         qp->tx_ctx.orq_fence = 1;
955                         rv = 0;
956                 }
957                 spin_unlock(&qp->orq_lock);
958         }
959
960         /* Clear SQE, can be re-used by application */
961         smp_store_mb(sqe->flags, 0);
962         qp->sq_get++;
963 out:
964         if (unlikely(rv < 0)) {
965                 siw_dbg_qp(qp, "error %d\n", rv);
966                 wqe->wr_status = SIW_WR_IDLE;
967         }
968         return rv;
969 }
970
971 /*
972  * Must be called with SQ locked.
973  * To avoid complete SQ starvation by a constant stream of inbound
974  * READ requests, the active IRQ will not be served once qp->irq_burst
975  * reaches SIW_IRQ_MAXBURST_SQ_ACTIVE while the SQ has pending work.
976  */
977 int siw_activate_tx(struct siw_qp *qp)
978 {
979         struct siw_sqe *irqe;
980         struct siw_wqe *wqe = tx_wqe(qp);
981
982         if (!qp->attrs.irq_size)
983                 return siw_activate_tx_from_sq(qp);
984
985         irqe = &qp->irq[qp->irq_get % qp->attrs.irq_size];
986
987         if (!(irqe->flags & SIW_WQE_VALID))
988                 return siw_activate_tx_from_sq(qp);
989
990         /*
991          * Avoid local WQE processing starvation in case
992          * of constant inbound READ request stream
993          */
994         if (sq_get_next(qp) && ++qp->irq_burst >= SIW_IRQ_MAXBURST_SQ_ACTIVE) {
995                 qp->irq_burst = 0;
996                 return siw_activate_tx_from_sq(qp);
997         }
998         memset(wqe->mem, 0, sizeof(*wqe->mem) * SIW_MAX_SGE);
999         wqe->wr_status = SIW_WR_QUEUED;
1000
1001         /* start READ RESPONSE */
1002         wqe->sqe.opcode = SIW_OP_READ_RESPONSE;
1003         wqe->sqe.flags = 0;
1004         if (irqe->num_sge) {
1005                 wqe->sqe.num_sge = 1;
1006                 wqe->sqe.sge[0].length = irqe->sge[0].length;
1007                 wqe->sqe.sge[0].laddr = irqe->sge[0].laddr;
1008                 wqe->sqe.sge[0].lkey = irqe->sge[0].lkey;
1009         } else {
1010                 wqe->sqe.num_sge = 0;
1011         }
1012
1013         /* Retain original RREQ's message sequence number for
1014          * potential error reporting cases.
1015          */
1016         wqe->sqe.sge[1].length = irqe->sge[1].length;
1017
1018         wqe->sqe.rkey = irqe->rkey;
1019         wqe->sqe.raddr = irqe->raddr;
1020
1021         wqe->processed = 0;
1022         qp->irq_get++;
1023
1024         /* mark current IRQ entry free */
1025         smp_store_mb(irqe->flags, 0);
1026
1027         return 1;
1028 }
1029
1030 /*
1031  * Check if current CQ state qualifies for calling CQ completion
1032  * handler. Must be called with CQ lock held.
1033  */
1034 static bool siw_cq_notify_now(struct siw_cq *cq, u32 flags)
1035 {
1036         u32 cq_notify;
1037
1038         if (!cq->base_cq.comp_handler)
1039                 return false;
1040
1041         /* Read application shared notification state */
1042         cq_notify = READ_ONCE(cq->notify->flags);
1043
1044         if ((cq_notify & SIW_NOTIFY_NEXT_COMPLETION) ||
1045             ((cq_notify & SIW_NOTIFY_SOLICITED) &&
1046              (flags & SIW_WQE_SOLICITED))) {
1047                 /*
1048                  * CQ notification is one-shot: Since the
1049                  * current CQE causes user notification,
1050                  * the CQ gets disarmed and must be re-armed
1051                  * by the user for a new notification.
1052                  */
1053                 WRITE_ONCE(cq->notify->flags, SIW_NOTIFY_NOT);
1054
1055                 return true;
1056         }
1057         return false;
1058 }
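/*
 * Illustrative usage sketch (consumer side; process_wc() is a
 * hypothetical helper): a kernel ULP typically re-arms the one-shot
 * notification and then drains the CQ to catch completions that raced
 * with arming, e.g.
 *
 *	ib_req_notify_cq(ibcq, IB_CQ_NEXT_COMP);
 *	while (ib_poll_cq(ibcq, 1, &wc) > 0)
 *		process_wc(&wc);
 *
 * The requested notification mode is reflected in cq->notify->flags,
 * which siw_cq_notify_now() reads above.
 */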
1059
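/*
 * Post a work completion for the given SQE to the send CQ, if one is
 * attached, and recycle the SQE for reuse by the application. The
 * completion handler is invoked if CQ notification is currently armed;
 * a full CQ is reported as an IB_EVENT_CQ_ERR event.
 */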
1060 int siw_sqe_complete(struct siw_qp *qp, struct siw_sqe *sqe, u32 bytes,
1061                      enum siw_wc_status status)
1062 {
1063         struct siw_cq *cq = qp->scq;
1064         int rv = 0;
1065
1066         if (cq) {
1067                 u32 sqe_flags = sqe->flags;
1068                 struct siw_cqe *cqe;
1069                 u32 idx;
1070                 unsigned long flags;
1071
1072                 spin_lock_irqsave(&cq->lock, flags);
1073
1074                 idx = cq->cq_put % cq->num_cqe;
1075                 cqe = &cq->queue[idx];
1076
1077                 if (!READ_ONCE(cqe->flags)) {
1078                         bool notify;
1079
1080                         cqe->id = sqe->id;
1081                         cqe->opcode = sqe->opcode;
1082                         cqe->status = status;
1083                         cqe->imm_data = 0;
1084                         cqe->bytes = bytes;
1085
1086                         if (rdma_is_kernel_res(&cq->base_cq.res))
1087                                 cqe->base_qp = &qp->base_qp;
1088                         else
1089                                 cqe->qp_id = qp_id(qp);
1090
1091                         /* mark CQE valid for application */
1092                         WRITE_ONCE(cqe->flags, SIW_WQE_VALID);
1093                         /* recycle SQE */
1094                         smp_store_mb(sqe->flags, 0);
1095
1096                         cq->cq_put++;
1097                         notify = siw_cq_notify_now(cq, sqe_flags);
1098
1099                         spin_unlock_irqrestore(&cq->lock, flags);
1100
1101                         if (notify) {
1102                                 siw_dbg_cq(cq, "Call completion handler\n");
1103                                 cq->base_cq.comp_handler(&cq->base_cq,
1104                                                 cq->base_cq.cq_context);
1105                         }
1106                 } else {
1107                         spin_unlock_irqrestore(&cq->lock, flags);
1108                         rv = -ENOMEM;
1109                         siw_cq_event(cq, IB_EVENT_CQ_ERR);
1110                 }
1111         } else {
1112                 /* recycle SQE */
1113                 smp_store_mb(sqe->flags, 0);
1114         }
1115         return rv;
1116 }
1117
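/*
 * Receive side counterpart of siw_sqe_complete(): post a work
 * completion for the given RQE to the receive CQ and recycle the RQE.
 * For kernel consumers, remote STag invalidation is reported via
 * SIW_WQE_REM_INVAL and cqe->inval_stag.
 */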
1118 int siw_rqe_complete(struct siw_qp *qp, struct siw_rqe *rqe, u32 bytes,
1119                      u32 inval_stag, enum siw_wc_status status)
1120 {
1121         struct siw_cq *cq = qp->rcq;
1122         int rv = 0;
1123
1124         if (cq) {
1125                 struct siw_cqe *cqe;
1126                 u32 idx;
1127                 unsigned long flags;
1128
1129                 spin_lock_irqsave(&cq->lock, flags);
1130
1131                 idx = cq->cq_put % cq->num_cqe;
1132                 cqe = &cq->queue[idx];
1133
1134                 if (!READ_ONCE(cqe->flags)) {
1135                         bool notify;
1136                         u8 cqe_flags = SIW_WQE_VALID;
1137
1138                         cqe->id = rqe->id;
1139                         cqe->opcode = SIW_OP_RECEIVE;
1140                         cqe->status = status;
1141                         cqe->imm_data = 0;
1142                         cqe->bytes = bytes;
1143
1144                         if (rdma_is_kernel_res(&cq->base_cq.res)) {
1145                                 cqe->base_qp = &qp->base_qp;
1146                                 if (inval_stag) {
1147                                         cqe_flags |= SIW_WQE_REM_INVAL;
1148                                         cqe->inval_stag = inval_stag;
1149                                 }
1150                         } else {
1151                                 cqe->qp_id = qp_id(qp);
1152                         }
1153                         /* mark CQE valid for application */
1154                         WRITE_ONCE(cqe->flags, cqe_flags);
1155                         /* recycle RQE */
1156                         smp_store_mb(rqe->flags, 0);
1157
1158                         cq->cq_put++;
1159                         notify = siw_cq_notify_now(cq, SIW_WQE_SIGNALLED);
1160
1161                         spin_unlock_irqrestore(&cq->lock, flags);
1162
1163                         if (notify) {
1164                                 siw_dbg_cq(cq, "Call completion handler\n");
1165                                 cq->base_cq.comp_handler(&cq->base_cq,
1166                                                 cq->base_cq.cq_context);
1167                         }
1168                 } else {
1169                         spin_unlock_irqrestore(&cq->lock, flags);
1170                         rv = -ENOMEM;
1171                         siw_cq_event(cq, IB_EVENT_CQ_ERR);
1172                 }
1173         } else {
1174                 /* recycle RQE */
1175                 smp_store_mb(rqe->flags, 0);
1176         }
1177         return rv;
1178 }
1179
1180 /*
1181  * siw_sq_flush()
1182  *
1183  * Flush SQ and ORQ entries to CQ.
1184  *
1185  * Must be called with QP state write lock held.
1186  * Therefore, SQ and ORQ lock must not be taken.
1187  */
1188 void siw_sq_flush(struct siw_qp *qp)
1189 {
1190         struct siw_sqe *sqe;
1191         struct siw_wqe *wqe = tx_wqe(qp);
1192         int async_event = 0;
1193
1194         /*
1195          * Start with completing any work currently on the ORQ
1196          */
1197         while (qp->attrs.orq_size) {
1198                 sqe = &qp->orq[qp->orq_get % qp->attrs.orq_size];
1199                 if (!READ_ONCE(sqe->flags))
1200                         break;
1201
1202                 if (siw_sqe_complete(qp, sqe, 0, SIW_WC_WR_FLUSH_ERR) != 0)
1203                         break;
1204
1205                 WRITE_ONCE(sqe->flags, 0);
1206                 qp->orq_get++;
1207         }
1208         /*
1209          * Flush an in-progress WQE if present
1210          */
1211         if (wqe->wr_status != SIW_WR_IDLE) {
1212                 siw_dbg_qp(qp, "flush current SQE, type %d, status %d\n",
1213                            tx_type(wqe), wqe->wr_status);
1214
1215                 siw_wqe_put_mem(wqe, tx_type(wqe));
1216
1217                 if (tx_type(wqe) != SIW_OP_READ_RESPONSE &&
1218                     ((tx_type(wqe) != SIW_OP_READ &&
1219                       tx_type(wqe) != SIW_OP_READ_LOCAL_INV) ||
1220                      wqe->wr_status == SIW_WR_QUEUED))
1221                         /*
1222                          * An in-progress Read Request is already in
1223                          * the ORQ
1224                          */
1225                         siw_sqe_complete(qp, &wqe->sqe, wqe->bytes,
1226                                          SIW_WC_WR_FLUSH_ERR);
1227
1228                 wqe->wr_status = SIW_WR_IDLE;
1229         }
1230         /*
1231          * Flush the Send Queue
1232          */
1233         while (qp->attrs.sq_size) {
1234                 sqe = &qp->sendq[qp->sq_get % qp->attrs.sq_size];
1235                 if (!READ_ONCE(sqe->flags))
1236                         break;
1237
1238                 async_event = 1;
1239                 if (siw_sqe_complete(qp, sqe, 0, SIW_WC_WR_FLUSH_ERR) != 0)
1240                         /*
1241                          * Shall IB_EVENT_SQ_DRAINED be suppressed if work
1242                          * completion fails?
1243                          */
1244                         break;
1245
1246                 WRITE_ONCE(sqe->flags, 0);
1247                 qp->sq_get++;
1248         }
1249         if (async_event)
1250                 siw_qp_event(qp, IB_EVENT_SQ_DRAINED);
1251 }
1252
1253 /*
1254  * siw_rq_flush()
1255  *
1256  * Flush receive queue entries to the CQ. Also takes
1257  * care of pending active tagged and untagged inbound
1258  * transfers, which hold references to target
1259  * memory.
1260  *
1261  * Must be called with QP state write lock held.
1262  * Therefore, RQ lock must not be taken.
1263  */
1264 void siw_rq_flush(struct siw_qp *qp)
1265 {
1266         struct siw_wqe *wqe = &qp->rx_untagged.wqe_active;
1267
1268         /*
1269          * Flush an in-progress untagged operation if present
1270          */
1271         if (wqe->wr_status != SIW_WR_IDLE) {
1272                 siw_dbg_qp(qp, "flush current rqe, type %d, status %d\n",
1273                            rx_type(wqe), wqe->wr_status);
1274
1275                 siw_wqe_put_mem(wqe, rx_type(wqe));
1276
1277                 if (rx_type(wqe) == SIW_OP_RECEIVE) {
1278                         siw_rqe_complete(qp, &wqe->rqe, wqe->bytes,
1279                                          0, SIW_WC_WR_FLUSH_ERR);
1280                 } else if (rx_type(wqe) != SIW_OP_READ &&
1281                            rx_type(wqe) != SIW_OP_READ_RESPONSE &&
1282                            rx_type(wqe) != SIW_OP_WRITE) {
1283                         siw_sqe_complete(qp, &wqe->sqe, 0, SIW_WC_WR_FLUSH_ERR);
1284                 }
1285                 wqe->wr_status = SIW_WR_IDLE;
1286         }
1287         wqe = &qp->rx_tagged.wqe_active;
1288
1289         if (wqe->wr_status != SIW_WR_IDLE) {
1290                 siw_wqe_put_mem(wqe, rx_type(wqe));
1291                 wqe->wr_status = SIW_WR_IDLE;
1292         }
1293         /*
1294          * Flush the Receive Queue
1295          */
1296         while (qp->attrs.rq_size) {
1297                 struct siw_rqe *rqe =
1298                         &qp->recvq[qp->rq_get % qp->attrs.rq_size];
1299
1300                 if (!READ_ONCE(rqe->flags))
1301                         break;
1302
1303                 if (siw_rqe_complete(qp, rqe, 0, 0, SIW_WC_WR_FLUSH_ERR) != 0)
1304                         break;
1305
1306                 WRITE_ONCE(rqe->flags, 0);
1307                 qp->rq_get++;
1308         }
1309 }
1310
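/*
 * Register a new QP with the device: allocate a QP number from the
 * device's XArray and initialize the QP reference count.
 */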
1311 int siw_qp_add(struct siw_device *sdev, struct siw_qp *qp)
1312 {
1313         int rv = xa_alloc(&sdev->qp_xa, &qp->base_qp.qp_num, qp, xa_limit_32b,
1314                           GFP_KERNEL);
1315
1316         if (!rv) {
1317                 kref_init(&qp->ref);
1318                 qp->sdev = sdev;
1319                 siw_dbg_qp(qp, "new QP\n");
1320         }
1321         return rv;
1322 }
1323
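/*
 * Final kref release function for a QP: drops a still held CEP
 * reference, removes the QP from the device XArray and device list,
 * frees all work queues, releases the TX CPU and frees the QP
 * structure after an RCU grace period.
 */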
1324 void siw_free_qp(struct kref *ref)
1325 {
1326         struct siw_qp *found, *qp = container_of(ref, struct siw_qp, ref);
1327         struct siw_device *sdev = qp->sdev;
1328         unsigned long flags;
1329
1330         if (qp->cep)
1331                 siw_cep_put(qp->cep);
1332
1333         found = xa_erase(&sdev->qp_xa, qp_id(qp));
1334         WARN_ON(found != qp);
1335         spin_lock_irqsave(&sdev->lock, flags);
1336         list_del(&qp->devq);
1337         spin_unlock_irqrestore(&sdev->lock, flags);
1338
1339         vfree(qp->sendq);
1340         vfree(qp->recvq);
1341         vfree(qp->irq);
1342         vfree(qp->orq);
1343
1344         siw_put_tx_cpu(qp->tx_cpu);
1345
1346         atomic_dec(&sdev->num_qp);
1347         siw_dbg_qp(qp, "free QP\n");
1348         kfree_rcu(qp, rcu);
1349 }