// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause

/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
/* Copyright (c) 2008-2019, IBM Corporation */

#include <linux/errno.h>
#include <linux/types.h>
#include <linux/net.h>
#include <linux/scatterlist.h>
#include <linux/highmem.h>
#include <net/tcp.h>

#include <rdma/iw_cm.h>
#include <rdma/ib_verbs.h>

#include "siw.h"
#include "siw_verbs.h"
#include "siw_mem.h"

/*
 * siw_rx_umem()
 *
 * Receive data of @len into target referenced by @dest_addr.
 *
 * @srx:	Receive Context
 * @umem:	siw representation of target memory
 * @dest_addr:	user virtual address
 * @len:	number of bytes to place
 */
static int siw_rx_umem(struct siw_rx_stream *srx, struct siw_umem *umem,
		       u64 dest_addr, int len)
{
	int copied = 0;

	while (len) {
		struct page *p;
		int pg_off, bytes, rv;
		void *dest;

		p = siw_get_upage(umem, dest_addr);
		if (unlikely(!p)) {
			pr_warn("siw: %s: [QP %u]: bogus addr: %pK, %pK\n",
				__func__, qp_id(rx_qp(srx)),
				(void *)(uintptr_t)dest_addr,
				(void *)(uintptr_t)umem->fp_addr);
			/* siw internal error */
			srx->skb_copied += copied;
			srx->skb_new -= copied;

			return -EFAULT;
		}
		pg_off = dest_addr & ~PAGE_MASK;
		bytes = min(len, (int)PAGE_SIZE - pg_off);

		siw_dbg_qp(rx_qp(srx), "page %pK, bytes=%u\n", p, bytes);

		dest = kmap_atomic(p);
		rv = skb_copy_bits(srx->skb, srx->skb_offset, dest + pg_off,
				   bytes);
		if (unlikely(rv)) {
			kunmap_atomic(dest);
			srx->skb_copied += copied;
			srx->skb_new -= copied;

			pr_warn("siw: [QP %u]: %s, len %d, page %p, rv %d\n",
				qp_id(rx_qp(srx)), __func__, len, p, rv);

			return -EFAULT;
		}
		if (srx->mpa_crc_hd) {
			if (rdma_is_kernel_res(&rx_qp(srx)->base_qp.res)) {
				crypto_shash_update(srx->mpa_crc_hd,
					(u8 *)(dest + pg_off), bytes);
				kunmap_atomic(dest);
			} else {
				kunmap_atomic(dest);
				/*
				 * Do CRC on original, not target buffer.
				 * Some user land applications may
				 * concurrently write the target buffer,
				 * which would yield a broken CRC.
				 * Walking the skb twice is very inefficient.
				 * Folding the CRC into skb_copy_bits()
				 * would be much better, but is currently
				 * not supported.
				 */
				siw_crc_skb(srx, bytes);
			}
		} else {
			kunmap_atomic(dest);
		}
		srx->skb_offset += bytes;
		copied += bytes;
		len -= bytes;
		dest_addr += bytes;
	}
	srx->skb_copied += copied;
	srx->skb_new -= copied;

	return copied;
}
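
/*
 * Illustrative sketch (not part of the driver): the chunking rule
 * siw_rx_umem() applies above. A single copy never crosses a page
 * boundary, since
 *
 *	pg_off = dest_addr & ~PAGE_MASK;
 *	bytes  = min(len, (int)PAGE_SIZE - pg_off);
 *
 * E.g., with PAGE_SIZE 4096, dest_addr 0x1ff0 and len 64, the first
 * iteration places 16 bytes up to the page end and the second places
 * the remaining 48 bytes at offset 0 of the following page.
 */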

static int siw_rx_kva(struct siw_rx_stream *srx, void *kva, int len)
{
	int rv;

	siw_dbg_qp(rx_qp(srx), "kva: 0x%pK, len: %u\n", kva, len);

	rv = skb_copy_bits(srx->skb, srx->skb_offset, kva, len);
	if (unlikely(rv)) {
		pr_warn("siw: [QP %u]: %s, len %d, kva 0x%pK, rv %d\n",
			qp_id(rx_qp(srx)), __func__, len, kva, rv);

		return rv;
	}
	if (srx->mpa_crc_hd)
		crypto_shash_update(srx->mpa_crc_hd, (u8 *)kva, len);

	srx->skb_offset += len;
	srx->skb_copied += len;
	srx->skb_new -= len;

	return len;
}

static int siw_rx_pbl(struct siw_rx_stream *srx, int *pbl_idx,
		      struct siw_mem *mem, u64 addr, int len)
{
	struct siw_pbl *pbl = mem->pbl;
	u64 offset = addr - mem->va;
	int copied = 0;

	while (len) {
		int bytes;
		dma_addr_t buf_addr =
			siw_pbl_get_buffer(pbl, offset, &bytes, pbl_idx);
		if (!buf_addr)
			break;

		bytes = min(bytes, len);
		if (siw_rx_kva(srx, (void *)(uintptr_t)buf_addr, bytes) ==
		    bytes) {
			copied += bytes;
			offset += bytes;
			len -= bytes;
		} else {
			break;
		}
	}
	return copied;
}
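
/*
 * Note on PBL placement (an assumption based on the usage above):
 * siw_pbl_get_buffer() resolves the byte offset into the MR to a DMA
 * address plus the number of bytes contiguous at that address, with
 * *pbl_idx caching the search position so a resumed placement does not
 * re-walk the buffer list. The loop thus issues one siw_rx_kva() per
 * physically contiguous chunk.
 */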

/*
 * siw_rresp_check_ntoh()
 *
 * Check incoming RRESP fragment header against expected
 * header values and update expected values for potential next
 * fragment.
 *
 * NOTE: This function must be called only if an RRESP DDP segment
 *       starts but not for fragmented consecutive pieces of an
 *       already started DDP segment.
 */
static int siw_rresp_check_ntoh(struct siw_rx_stream *srx,
				struct siw_rx_fpdu *frx)
{
	struct iwarp_rdma_rresp *rresp = &srx->hdr.rresp;
	struct siw_wqe *wqe = &frx->wqe_active;
	enum ddp_ecode ecode;

	u32 sink_stag = be32_to_cpu(rresp->sink_stag);
	u64 sink_to = be64_to_cpu(rresp->sink_to);

	if (frx->first_ddp_seg) {
		srx->ddp_stag = wqe->sqe.sge[0].lkey;
		srx->ddp_to = wqe->sqe.sge[0].laddr;
		frx->pbl_idx = 0;
	}
	/* Below checks extend beyond the semantics of DDP, and
	 * into RDMAP:
	 * We check if the read response matches exactly the
	 * read request which was sent to the remote peer to
	 * trigger this read response. RFC5040/5041 do not
	 * always have a proper error code for the detected
	 * error cases. We choose 'base or bounds error' for
	 * cases where the inbound STag is valid, but offset
	 * or length do not match our response receive state.
	 */
	if (unlikely(srx->ddp_stag != sink_stag)) {
		pr_warn("siw: [QP %u]: rresp stag: %08x != %08x\n",
			qp_id(rx_qp(srx)), sink_stag, srx->ddp_stag);
		ecode = DDP_ECODE_T_INVALID_STAG;
		goto error;
	}
	if (unlikely(srx->ddp_to != sink_to)) {
		pr_warn("siw: [QP %u]: rresp off: %016llx != %016llx\n",
			qp_id(rx_qp(srx)), (unsigned long long)sink_to,
			(unsigned long long)srx->ddp_to);
		ecode = DDP_ECODE_T_BASE_BOUNDS;
		goto error;
	}
	if (unlikely(!frx->more_ddp_segs &&
		     (wqe->processed + srx->fpdu_part_rem != wqe->bytes))) {
		pr_warn("siw: [QP %u]: rresp len: %d != %d\n",
			qp_id(rx_qp(srx)),
			wqe->processed + srx->fpdu_part_rem, wqe->bytes);
		ecode = DDP_ECODE_T_BASE_BOUNDS;
		goto error;
	}
	return 0;
error:
	siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP,
			   DDP_ETYPE_TAGGED_BUF, ecode, 0);
	return -EINVAL;
}
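
/*
 * Example for the above: if the local SQ posted a READ with
 * sge[0] = { laddr, lkey, length 8192 }, the first response segment
 * must carry sink_stag == lkey and sink_to == laddr, any further
 * segment must continue at the advanced ddp_to, and the final (L-bit)
 * segment must complete exactly 8192 bytes. Any mismatch terminates
 * the connection with a tagged buffer error.
 */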

/*
 * siw_write_check_ntoh()
 *
 * Check incoming WRITE fragment header against expected
 * header values and update expected values for potential next
 * fragment.
 *
 * NOTE: This function must be called only if a WRITE DDP segment
 *       starts but not for fragmented consecutive pieces of an
 *       already started DDP segment.
 */
static int siw_write_check_ntoh(struct siw_rx_stream *srx,
				struct siw_rx_fpdu *frx)
{
	struct iwarp_rdma_write *write = &srx->hdr.rwrite;
	enum ddp_ecode ecode;

	u32 sink_stag = be32_to_cpu(write->sink_stag);
	u64 sink_to = be64_to_cpu(write->sink_to);

	if (frx->first_ddp_seg) {
		srx->ddp_stag = sink_stag;
		srx->ddp_to = sink_to;
		frx->pbl_idx = 0;
	} else {
		if (unlikely(srx->ddp_stag != sink_stag)) {
			pr_warn("siw: [QP %u]: write stag: %08x != %08x\n",
				qp_id(rx_qp(srx)), sink_stag,
				srx->ddp_stag);
			ecode = DDP_ECODE_T_INVALID_STAG;
			goto error;
		}
		if (unlikely(srx->ddp_to != sink_to)) {
			pr_warn("siw: [QP %u]: write off: %016llx != %016llx\n",
				qp_id(rx_qp(srx)),
				(unsigned long long)sink_to,
				(unsigned long long)srx->ddp_to);
			ecode = DDP_ECODE_T_BASE_BOUNDS;
			goto error;
		}
	}
	return 0;
error:
	siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP,
			   DDP_ETYPE_TAGGED_BUF, ecode, 0);
	return -EINVAL;
}

/*
 * siw_send_check_ntoh()
 *
 * Check incoming SEND fragment header against expected
 * header values and update expected MSN if no next
 * fragment expected.
 *
 * NOTE: This function must be called only if a SEND DDP segment
 *       starts but not for fragmented consecutive pieces of an
 *       already started DDP segment.
 */
static int siw_send_check_ntoh(struct siw_rx_stream *srx,
			       struct siw_rx_fpdu *frx)
{
	struct iwarp_send_inv *send = &srx->hdr.send_inv;
	struct siw_wqe *wqe = &frx->wqe_active;
	enum ddp_ecode ecode;

	u32 ddp_msn = be32_to_cpu(send->ddp_msn);
	u32 ddp_mo = be32_to_cpu(send->ddp_mo);
	u32 ddp_qn = be32_to_cpu(send->ddp_qn);

	if (unlikely(ddp_qn != RDMAP_UNTAGGED_QN_SEND)) {
		pr_warn("siw: [QP %u]: invalid ddp qn %d for send\n",
			qp_id(rx_qp(srx)), ddp_qn);
		ecode = DDP_ECODE_UT_INVALID_QN;
		goto error;
	}
	if (unlikely(ddp_msn != srx->ddp_msn[RDMAP_UNTAGGED_QN_SEND])) {
		pr_warn("siw: [QP %u]: send msn: %u != %u\n",
			qp_id(rx_qp(srx)), ddp_msn,
			srx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]);
		ecode = DDP_ECODE_UT_INVALID_MSN_RANGE;
		goto error;
	}
	if (unlikely(ddp_mo != wqe->processed)) {
		pr_warn("siw: [QP %u], send mo: %u != %u\n",
			qp_id(rx_qp(srx)), ddp_mo, wqe->processed);
		ecode = DDP_ECODE_UT_INVALID_MO;
		goto error;
	}
	if (frx->first_ddp_seg) {
		/* initialize user memory write position */
		frx->sge_idx = 0;
		frx->sge_off = 0;
		frx->pbl_idx = 0;

		/* only valid for SEND_INV and SEND_SE_INV operations */
		srx->inval_stag = be32_to_cpu(send->inval_stag);
	}
	if (unlikely(wqe->bytes < wqe->processed + srx->fpdu_part_rem)) {
		siw_dbg_qp(rx_qp(srx), "receive space short: %d - %d < %d\n",
			   wqe->bytes, wqe->processed, srx->fpdu_part_rem);
		wqe->wc_status = SIW_WC_LOC_LEN_ERR;
		ecode = DDP_ECODE_UT_INVALID_MSN_NOBUF;
		goto error;
	}
	return 0;
error:
	siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP,
			   DDP_ETYPE_UNTAGGED_BUF, ecode, 0);
	return -EINVAL;
}
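
/*
 * Untagged ordering, by example: a 3000 byte SEND split into DDP
 * segments of 2048 and 952 bytes arrives with one and the same MSN
 * (the next expected slot on the SEND queue number) and with MO 0 for
 * the first and MO 2048 for the second segment, which is exactly what
 * the wqe->processed comparison above enforces.
 */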

static struct siw_wqe *siw_rqe_get(struct siw_qp *qp)
{
	struct siw_rqe *rqe;
	struct siw_srq *srq;
	struct siw_wqe *wqe = NULL;
	bool srq_event = false;
	unsigned long flags;

	srq = qp->srq;
	if (srq) {
		spin_lock_irqsave(&srq->lock, flags);
		if (unlikely(!srq->num_rqe))
			goto out;

		rqe = &srq->recvq[srq->rq_get % srq->num_rqe];
	} else {
		if (unlikely(!qp->recvq))
			goto out;

		rqe = &qp->recvq[qp->rq_get % qp->attrs.rq_size];
	}
	if (likely(rqe->flags == SIW_WQE_VALID)) {
		int num_sge = rqe->num_sge;

		if (likely(num_sge <= SIW_MAX_SGE)) {
			int i = 0;

			wqe = rx_wqe(&qp->rx_untagged);
			rx_type(wqe) = SIW_OP_RECEIVE;
			wqe->wr_status = SIW_WR_INPROGRESS;
			wqe->bytes = 0;
			wqe->processed = 0;

			wqe->rqe.id = rqe->id;
			wqe->rqe.num_sge = num_sge;

			while (i < num_sge) {
				wqe->rqe.sge[i].laddr = rqe->sge[i].laddr;
				wqe->rqe.sge[i].lkey = rqe->sge[i].lkey;
				wqe->rqe.sge[i].length = rqe->sge[i].length;
				wqe->bytes += wqe->rqe.sge[i].length;
				wqe->mem[i] = NULL;
				i++;
			}
			/* can be re-used by appl */
			smp_store_mb(rqe->flags, 0);
		} else {
			siw_dbg_qp(qp, "too many sge's: %d\n", rqe->num_sge);
			if (srq)
				spin_unlock_irqrestore(&srq->lock, flags);
			return NULL;
		}
		if (!srq) {
			qp->rq_get++;
		} else {
			if (srq->armed) {
				/* Test SRQ limit */
				u32 off = (srq->rq_get + srq->limit) %
					  srq->num_rqe;
				struct siw_rqe *rqe2 = &srq->recvq[off];

				if (!(rqe2->flags & SIW_WQE_VALID)) {
					srq->armed = false;
					srq_event = true;
				}
			}
			srq->rq_get++;
		}
	}
out:
	if (srq) {
		spin_unlock_irqrestore(&srq->lock, flags);
		if (srq_event)
			siw_srq_event(srq, IB_EVENT_SRQ_LIMIT_REACHED);
	}
	return wqe;
}
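
/*
 * SRQ limit handling above: with the SRQ armed, the RQE sitting
 * 'limit' slots ahead of the consumer is probed. If it is invalid,
 * fewer than 'limit' receives remain posted, so the SRQ is disarmed
 * and IB_EVENT_SRQ_LIMIT_REACHED is delivered once, after dropping
 * the SRQ lock.
 */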

/*
 * siw_proc_send:
 *
 * Process one incoming SEND and place data into memory referenced by
 * receive wqe.
 *
 * Function supports partially received sends (suspending/resuming
 * current receive wqe processing)
 *
 * return value:
 *	0:       reached the end of a DDP segment
 *	-EAGAIN: to be called again to finish the DDP segment
 */
int siw_proc_send(struct siw_qp *qp)
{
	struct siw_rx_stream *srx = &qp->rx_stream;
	struct siw_rx_fpdu *frx = &qp->rx_untagged;
	struct siw_wqe *wqe;
	u32 data_bytes; /* all data bytes available */
	u32 rcvd_bytes; /* sum of data bytes rcvd */
	int rv = 0;

	if (frx->first_ddp_seg) {
		wqe = siw_rqe_get(qp);
		if (unlikely(!wqe)) {
			siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
					   DDP_ETYPE_UNTAGGED_BUF,
					   DDP_ECODE_UT_INVALID_MSN_NOBUF, 0);
			return -ENOENT;
		}
	} else {
		wqe = rx_wqe(frx);
	}
	if (srx->state == SIW_GET_DATA_START) {
		rv = siw_send_check_ntoh(srx, frx);
		if (unlikely(rv)) {
			siw_qp_event(qp, IB_EVENT_QP_FATAL);
			return rv;
		}
		if (!srx->fpdu_part_rem) /* zero length SEND */
			return 0;
	}
	data_bytes = min(srx->fpdu_part_rem, srx->skb_new);
	rcvd_bytes = 0;

	/* A zero length SEND will skip below loop */
	while (data_bytes) {
		struct ib_pd *pd;
		struct siw_mem **mem, *mem_p;
		struct siw_sge *sge;
		u32 sge_bytes; /* data bytes avail for SGE */

		sge = &wqe->rqe.sge[frx->sge_idx];

		if (!sge->length) {
			/* just skip empty sges */
			frx->sge_idx++;
			frx->sge_off = 0;
			frx->pbl_idx = 0;
			continue;
		}
		sge_bytes = min(data_bytes, sge->length - frx->sge_off);
		mem = &wqe->mem[frx->sge_idx];

		/*
		 * check with QP's PD if no SRQ present, SRQ's PD otherwise
		 */
		pd = qp->srq == NULL ? qp->pd : qp->srq->base_srq.pd;

		rv = siw_check_sge(pd, sge, mem, IB_ACCESS_LOCAL_WRITE,
				   frx->sge_off, sge_bytes);
		if (unlikely(rv)) {
			siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
					   DDP_ETYPE_CATASTROPHIC,
					   DDP_ECODE_CATASTROPHIC, 0);

			siw_qp_event(qp, IB_EVENT_QP_ACCESS_ERR);
			break;
		}
		mem_p = *mem;
		if (mem_p->mem_obj == NULL)
			rv = siw_rx_kva(srx,
				(void *)(uintptr_t)(sge->laddr + frx->sge_off),
				sge_bytes);
		else if (!mem_p->is_pbl)
			rv = siw_rx_umem(srx, mem_p->umem,
					 sge->laddr + frx->sge_off, sge_bytes);
		else
			rv = siw_rx_pbl(srx, &frx->pbl_idx, mem_p,
					sge->laddr + frx->sge_off, sge_bytes);

		if (unlikely(rv != sge_bytes)) {
			wqe->processed += rcvd_bytes;

			siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
					   DDP_ETYPE_CATASTROPHIC,
					   DDP_ECODE_CATASTROPHIC, 0);
			return -EINVAL;
		}
		frx->sge_off += rv;

		if (frx->sge_off == sge->length) {
			frx->sge_idx++;
			frx->sge_off = 0;
			frx->pbl_idx = 0;
		}
		data_bytes -= rv;
		rcvd_bytes += rv;

		srx->fpdu_part_rem -= rv;
		srx->fpdu_part_rcvd += rv;
	}
	wqe->processed += rcvd_bytes;

	if (!srx->fpdu_part_rem)
		return 0;

	return (rv < 0) ? rv : -EAGAIN;
}
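
/*
 * Resume semantics, sketched: siw_proc_send() consumes at most
 * min(fpdu_part_rem, skb_new) bytes. If the skb drains first, -EAGAIN
 * is returned and the rx state machine re-enters here in state
 * SIW_GET_DATA_MORE once more TCP payload arrived; sge_idx, sge_off
 * and pbl_idx in the untagged rx context preserve the exact placement
 * position in between.
 */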

/*
 * siw_proc_write:
 *
 * Place incoming WRITE after referencing and checking target buffer
 *
 * Function supports partially received WRITEs (suspending/resuming
 * current receive processing)
 *
 * return value:
 *	0:       reached the end of a DDP segment
 *	-EAGAIN: to be called again to finish the DDP segment
 */
int siw_proc_write(struct siw_qp *qp)
{
	struct siw_rx_stream *srx = &qp->rx_stream;
	struct siw_rx_fpdu *frx = &qp->rx_tagged;
	struct siw_mem *mem;
	int bytes, rv;

	if (srx->state == SIW_GET_DATA_START) {
		if (!srx->fpdu_part_rem) /* zero length WRITE */
			return 0;

		rv = siw_write_check_ntoh(srx, frx);
		if (unlikely(rv)) {
			siw_qp_event(qp, IB_EVENT_QP_FATAL);
			return rv;
		}
	}
	bytes = min(srx->fpdu_part_rem, srx->skb_new);

	if (frx->first_ddp_seg) {
		struct siw_wqe *wqe = rx_wqe(frx);

		rx_mem(frx) = siw_mem_id2obj(qp->sdev, srx->ddp_stag >> 8);
		if (unlikely(!rx_mem(frx))) {
			siw_dbg_qp(qp,
				   "sink stag not found/invalid, stag 0x%08x\n",
				   srx->ddp_stag);

			siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
					   DDP_ETYPE_TAGGED_BUF,
					   DDP_ECODE_T_INVALID_STAG, 0);
			return -EINVAL;
		}
		wqe->rqe.num_sge = 1;
		rx_type(wqe) = SIW_OP_WRITE;
		wqe->wr_status = SIW_WR_INPROGRESS;
	}
	mem = rx_mem(frx);

	/*
	 * Check if application re-registered memory with different
	 * key field of STag.
	 */
	if (unlikely(mem->stag != srx->ddp_stag)) {
		siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
				   DDP_ETYPE_TAGGED_BUF,
				   DDP_ECODE_T_INVALID_STAG, 0);
		return -EINVAL;
	}
	rv = siw_check_mem(qp->pd, mem, srx->ddp_to + srx->fpdu_part_rcvd,
			   IB_ACCESS_REMOTE_WRITE, bytes);
	if (unlikely(rv)) {
		siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
				   DDP_ETYPE_TAGGED_BUF, siw_tagged_error(-rv),
				   0);

		siw_qp_event(qp, IB_EVENT_QP_ACCESS_ERR);

		return -EINVAL;
	}

	if (mem->mem_obj == NULL)
		rv = siw_rx_kva(srx,
			(void *)(uintptr_t)(srx->ddp_to + srx->fpdu_part_rcvd),
			bytes);
	else if (!mem->is_pbl)
		rv = siw_rx_umem(srx, mem->umem,
				 srx->ddp_to + srx->fpdu_part_rcvd, bytes);
	else
		rv = siw_rx_pbl(srx, &frx->pbl_idx, mem,
				srx->ddp_to + srx->fpdu_part_rcvd, bytes);

	if (unlikely(rv != bytes)) {
		siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
				   DDP_ETYPE_CATASTROPHIC,
				   DDP_ECODE_CATASTROPHIC, 0);
		return -EINVAL;
	}
	srx->fpdu_part_rem -= rv;
	srx->fpdu_part_rcvd += rv;

	if (!srx->fpdu_part_rem) {
		srx->ddp_to += srx->fpdu_part_rcvd;
		return 0;
	}
	return -EAGAIN;
}
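
/*
 * Unlike SEND processing, inbound WRITE placement consumes no receive
 * WQE: the target is resolved from the wire (sink_stag/sink_to) only.
 * Within a DDP segment the placement address is always
 * ddp_to + fpdu_part_rcvd, and ddp_to advances by the segment length
 * once the segment completes, so a resumed WRITE continues seamlessly.
 */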

/*
 * Inbound RREQs cannot carry user data.
 */
int siw_proc_rreq(struct siw_qp *qp)
{
	struct siw_rx_stream *srx = &qp->rx_stream;

	if (!srx->fpdu_part_rem)
		return 0;

	pr_warn("siw: [QP %u]: rreq with mpa len %d\n", qp_id(qp),
		be16_to_cpu(srx->hdr.ctrl.mpa_len));

	return -EPROTO;
}

/*
 * siw_init_rresp:
 *
 * Process inbound RDMA READ REQ. Produce a pseudo READ RESPONSE WQE.
 * Put it at the tail of the IRQ, if there is another WQE currently in
 * transmit processing. If not, make it the current WQE to be processed
 * and schedule transmit processing.
 *
 * Can be called from softirq context and from process
 * context (RREAD socket loopback case!)
 *
 * return value:
 *	0:      success,
 *		failure code otherwise
 */
static int siw_init_rresp(struct siw_qp *qp, struct siw_rx_stream *srx)
{
	struct siw_wqe *tx_work = tx_wqe(qp);
	struct siw_sqe *resp;

	uint64_t raddr = be64_to_cpu(srx->hdr.rreq.sink_to),
		 laddr = be64_to_cpu(srx->hdr.rreq.source_to);
	uint32_t length = be32_to_cpu(srx->hdr.rreq.read_size),
		 lkey = be32_to_cpu(srx->hdr.rreq.source_stag),
		 rkey = be32_to_cpu(srx->hdr.rreq.sink_stag),
		 msn = be32_to_cpu(srx->hdr.rreq.ddp_msn);

	int run_sq = 1, rv = 0;
	unsigned long flags;

	if (unlikely(msn != srx->ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ])) {
		siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
				   DDP_ETYPE_UNTAGGED_BUF,
				   DDP_ECODE_UT_INVALID_MSN_RANGE, 0);
		return -EPROTO;
	}
	spin_lock_irqsave(&qp->sq_lock, flags);

	if (unlikely(!qp->attrs.irq_size)) {
		run_sq = 0;
		goto error_irq;
	}
	if (tx_work->wr_status == SIW_WR_IDLE) {
		/*
		 * immediately schedule READ response w/o
		 * consuming IRQ entry: IRQ must be empty.
		 */
		tx_work->processed = 0;
		tx_work->mem[0] = NULL;
		tx_work->wr_status = SIW_WR_QUEUED;
		resp = &tx_work->sqe;
	} else {
		resp = irq_alloc_free(qp);
		run_sq = 0;
	}
	if (likely(resp)) {
		resp->opcode = SIW_OP_READ_RESPONSE;

		resp->sge[0].length = length;
		resp->sge[0].laddr = laddr;
		resp->sge[0].lkey = lkey;

		/* Keep aside message sequence number for potential
		 * error reporting during Read Response generation.
		 */
		resp->sge[1].length = msn;

		resp->raddr = raddr;
		resp->rkey = rkey;
		resp->num_sge = length ? 1 : 0;

		/* RRESP now valid as current TX wqe or placed into IRQ */
		smp_store_mb(resp->flags, SIW_WQE_VALID);
	} else {
error_irq:
		pr_warn("siw: [QP %u]: IRQ exceeded or null, size %d\n",
			qp_id(qp), qp->attrs.irq_size);

		siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP,
				   RDMAP_ETYPE_REMOTE_OPERATION,
				   RDMAP_ECODE_CATASTROPHIC_STREAM, 0);
		rv = -EPROTO;
	}
	spin_unlock_irqrestore(&qp->sq_lock, flags);

	if (run_sq)
		rv = siw_sq_start(qp);

	return rv;
}
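
/*
 * Note: the READ RESPONSE produced above is a 'pseudo' SQ entry. It
 * was never posted by the local application, but gets injected either
 * as the current TX WQE (idle SQ) or as an IRQ entry, competing with
 * regular SQ work under the same sq_lock. Stashing the inbound MSN in
 * sge[1].length lets a later transmit failure be reported against the
 * correct inbound READ request.
 */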

/*
 * Only called at start of Read.Response processing.
 * Transfer pending Read from tip of ORQ into current rx wqe,
 * but keep ORQ entry valid until Read.Response processing done.
 * No Queue locking needed.
 */
static int siw_orqe_start_rx(struct siw_qp *qp)
{
	struct siw_sqe *orqe;
	struct siw_wqe *wqe = NULL;

	if (unlikely(!qp->attrs.orq_size))
		return -EPROTO;

	/* make sure ORQ indices are current */
	smp_mb();

	orqe = orq_get_current(qp);
	if (READ_ONCE(orqe->flags) & SIW_WQE_VALID) {
		/* RRESP is a TAGGED RDMAP operation */
		wqe = rx_wqe(&qp->rx_tagged);
		wqe->sqe.id = orqe->id;
		wqe->sqe.opcode = orqe->opcode;
		wqe->sqe.sge[0].laddr = orqe->sge[0].laddr;
		wqe->sqe.sge[0].lkey = orqe->sge[0].lkey;
		wqe->sqe.sge[0].length = orqe->sge[0].length;
		wqe->sqe.flags = orqe->flags;
		wqe->sqe.num_sge = 1;
		wqe->bytes = orqe->sge[0].length;
		wqe->processed = 0;
		wqe->mem[0] = NULL;
		/* make sure WQE is completely written before valid */
		smp_wmb();
		wqe->wr_status = SIW_WR_INPROGRESS;

		return 0;
	}
	return -EPROTO;
}

/*
 * siw_proc_rresp:
 *
 * Place incoming RRESP data into memory referenced by RREQ WQE
 * which is at the tip of the ORQ
 *
 * Function supports partially received RRESPs (suspending/resuming
 * current receive processing)
 */
int siw_proc_rresp(struct siw_qp *qp)
{
	struct siw_rx_stream *srx = &qp->rx_stream;
	struct siw_rx_fpdu *frx = &qp->rx_tagged;
	struct siw_wqe *wqe = rx_wqe(frx);
	struct siw_mem **mem, *mem_p;
	struct siw_sge *sge;
	int bytes, rv;

	if (frx->first_ddp_seg) {
		if (unlikely(wqe->wr_status != SIW_WR_IDLE)) {
			pr_warn("siw: [QP %u]: proc RRESP: status %d, op %d\n",
				qp_id(qp), wqe->wr_status, wqe->sqe.opcode);
			rv = -EPROTO;
			goto error_term;
		}
		/*
		 * fetch pending RREQ from orq
		 */
		rv = siw_orqe_start_rx(qp);
		if (rv) {
			pr_warn("siw: [QP %u]: ORQ empty, size %d\n",
				qp_id(qp), qp->attrs.orq_size);
			goto error_term;
		}
		rv = siw_rresp_check_ntoh(srx, frx);
		if (unlikely(rv)) {
			siw_qp_event(qp, IB_EVENT_QP_FATAL);
			return rv;
		}
	} else {
		if (unlikely(wqe->wr_status != SIW_WR_INPROGRESS)) {
			pr_warn("siw: [QP %u]: resume RRESP: status %d\n",
				qp_id(qp), wqe->wr_status);
			rv = -EPROTO;
			goto error_term;
		}
	}
	if (!srx->fpdu_part_rem) /* zero length RRESPONSE */
		return 0;

	sge = wqe->sqe.sge; /* there is only one */
	mem = &wqe->mem[0];

	if (!(*mem)) {
		/*
		 * check target memory which resolves memory on first fragment
		 */
		rv = siw_check_sge(qp->pd, sge, mem, IB_ACCESS_LOCAL_WRITE, 0,
				   wqe->bytes);
		if (unlikely(rv)) {
			siw_dbg_qp(qp, "target mem check: %d\n", rv);
			wqe->wc_status = SIW_WC_LOC_PROT_ERR;

			siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
					   DDP_ETYPE_TAGGED_BUF,
					   siw_tagged_error(-rv), 0);

			siw_qp_event(qp, IB_EVENT_QP_ACCESS_ERR);

			return -EINVAL;
		}
	}
	mem_p = *mem;

	bytes = min(srx->fpdu_part_rem, srx->skb_new);

	if (mem_p->mem_obj == NULL)
		rv = siw_rx_kva(srx,
			(void *)(uintptr_t)(sge->laddr + wqe->processed),
			bytes);
	else if (!mem_p->is_pbl)
		rv = siw_rx_umem(srx, mem_p->umem, sge->laddr + wqe->processed,
				 bytes);
	else
		rv = siw_rx_pbl(srx, &frx->pbl_idx, mem_p,
				sge->laddr + wqe->processed, bytes);
	if (rv != bytes) {
		wqe->wc_status = SIW_WC_GENERAL_ERR;
		rv = -EINVAL;
		goto error_term;
	}
	srx->fpdu_part_rem -= rv;
	srx->fpdu_part_rcvd += rv;
	wqe->processed += rv;

	if (!srx->fpdu_part_rem) {
		srx->ddp_to += srx->fpdu_part_rcvd;
		return 0;
	}
	return -EAGAIN;

error_term:
	siw_init_terminate(qp, TERM_ERROR_LAYER_DDP, DDP_ETYPE_CATASTROPHIC,
			   DDP_ECODE_CATASTROPHIC, 0);
	return rv;
}

int siw_proc_terminate(struct siw_qp *qp)
{
	struct siw_rx_stream *srx = &qp->rx_stream;
	struct sk_buff *skb = srx->skb;
	struct iwarp_terminate *term = &srx->hdr.terminate;
	union iwarp_hdr term_info;
	u8 *infop = (u8 *)&term_info;
	enum rdma_opcode op;
	u16 to_copy = sizeof(struct iwarp_ctrl);

	pr_warn("siw: got TERMINATE. layer %d, type %d, code %d\n",
		__rdmap_term_layer(term), __rdmap_term_etype(term),
		__rdmap_term_ecode(term));

	if (be32_to_cpu(term->ddp_qn) != RDMAP_UNTAGGED_QN_TERMINATE ||
	    be32_to_cpu(term->ddp_msn) !=
		    qp->rx_stream.ddp_msn[RDMAP_UNTAGGED_QN_TERMINATE] ||
	    be32_to_cpu(term->ddp_mo) != 0) {
		pr_warn("siw: rx bogus TERM [QN x%08x, MSN x%08x, MO x%08x]\n",
			be32_to_cpu(term->ddp_qn), be32_to_cpu(term->ddp_msn),
			be32_to_cpu(term->ddp_mo));
		return -ECONNRESET;
	}
	/*
	 * Receive remaining pieces of TERM if indicated
	 */
	if (!term->flag_m)
		return -ECONNRESET;

	/* Do not take the effort to reassemble a network fragmented
	 * TERM message
	 */
	if (srx->skb_new < sizeof(struct iwarp_ctrl_tagged))
		return -ECONNRESET;

	memset(infop, 0, sizeof(term_info));

	skb_copy_bits(skb, srx->skb_offset, infop, to_copy);

	op = __rdmap_get_opcode(&term_info.ctrl);
	if (op >= RDMAP_TERMINATE)
		goto out;

	infop += to_copy;
	srx->skb_offset += to_copy;
	srx->skb_new -= to_copy;
	srx->skb_copied += to_copy;
	srx->fpdu_part_rcvd += to_copy;
	srx->fpdu_part_rem -= to_copy;

	to_copy = iwarp_pktinfo[op].hdr_len - to_copy;

	/* Again, no network fragmented TERMs */
	if (to_copy + MPA_CRC_SIZE > srx->skb_new)
		return -ECONNRESET;

	skb_copy_bits(skb, srx->skb_offset, infop, to_copy);

	if (term->flag_r) {
		siw_dbg_qp(qp, "TERM reports RDMAP hdr type %u, len %u (%s)\n",
			   op, be16_to_cpu(term_info.ctrl.mpa_len),
			   term->flag_m ? "valid" : "invalid");
	} else if (term->flag_d) {
		siw_dbg_qp(qp, "TERM reports DDP hdr type %u, len %u (%s)\n",
			   op, be16_to_cpu(term_info.ctrl.mpa_len),
			   term->flag_m ? "valid" : "invalid");
	}
out:
	srx->skb_new -= to_copy;
	srx->skb_offset += to_copy;
	srx->skb_copied += to_copy;
	srx->fpdu_part_rcvd += to_copy;
	srx->fpdu_part_rem -= to_copy;

	return -ECONNRESET;
}

static int siw_get_trailer(struct siw_qp *qp, struct siw_rx_stream *srx)
{
	struct sk_buff *skb = srx->skb;
	int avail = min(srx->skb_new, srx->fpdu_part_rem);
	u8 *tbuf = (u8 *)&srx->trailer.crc - srx->pad;
	__wsum crc_in, crc_own = 0;

	siw_dbg_qp(qp, "expected %d, available %d, pad %u\n",
		   srx->fpdu_part_rem, srx->skb_new, srx->pad);

	skb_copy_bits(skb, srx->skb_offset, tbuf, avail);

	srx->skb_new -= avail;
	srx->skb_offset += avail;
	srx->skb_copied += avail;
	srx->fpdu_part_rem -= avail;

	if (srx->fpdu_part_rem)
		return -EAGAIN;

	if (!srx->mpa_crc_hd)
		return 0;

	if (srx->pad)
		crypto_shash_update(srx->mpa_crc_hd, tbuf, srx->pad);
	/*
	 * CRC32 is computed, transmitted and received directly in NBO,
	 * so there's never a reason to convert byte order.
	 */
	crypto_shash_final(srx->mpa_crc_hd, (u8 *)&crc_own);
	crc_in = (__force __wsum)srx->trailer.crc;

	if (unlikely(crc_in != crc_own)) {
		pr_warn("siw: crc error. in: %08x, own %08x, op %u\n",
			crc_in, crc_own, qp->rx_stream.rdmap_op);

		siw_init_terminate(qp, TERM_ERROR_LAYER_LLP,
				   LLP_ETYPE_MPA,
				   LLP_ECODE_RECEIVED_CRC, 0);
		return -EINVAL;
	}
	return 0;
}
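
/*
 * MPA trailer math, by example (RFC 5044 framing, as consumed above):
 * the FPDU payload is padded to a 4-byte boundary and followed by the
 * 4-byte CRC32c. For an FPDU whose mpa_len covers 14 payload bytes,
 * pad = -14 & 0x3 = 2, so the trailer collected here spans
 * pad + MPA_CRC_SIZE = 6 bytes. The pad bytes still enter the CRC
 * computation; the CRC itself is compared in network byte order.
 */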

#define MIN_DDP_HDR sizeof(struct iwarp_ctrl_tagged)

static int siw_get_hdr(struct siw_rx_stream *srx)
{
	struct sk_buff *skb = srx->skb;
	struct siw_qp *qp = rx_qp(srx);
	struct iwarp_ctrl *c_hdr = &srx->hdr.ctrl;
	struct siw_rx_fpdu *frx;
	u8 opcode;
	int bytes;

	if (srx->fpdu_part_rcvd < MIN_DDP_HDR) {
		/*
		 * copy a minimum sized (tagged) DDP frame control part
		 */
		bytes = min_t(int, srx->skb_new,
			      MIN_DDP_HDR - srx->fpdu_part_rcvd);

		skb_copy_bits(skb, srx->skb_offset,
			      (char *)c_hdr + srx->fpdu_part_rcvd, bytes);

		srx->fpdu_part_rcvd += bytes;

		srx->skb_new -= bytes;
		srx->skb_offset += bytes;
		srx->skb_copied += bytes;

		if (srx->fpdu_part_rcvd < MIN_DDP_HDR)
			return -EAGAIN;

		if (unlikely(__ddp_get_version(c_hdr) != DDP_VERSION)) {
			enum ddp_etype etype;
			enum ddp_ecode ecode;

			pr_warn("siw: received ddp version unsupported %d\n",
				__ddp_get_version(c_hdr));

			if (c_hdr->ddp_rdmap_ctrl & DDP_FLAG_TAGGED) {
				etype = DDP_ETYPE_TAGGED_BUF;
				ecode = DDP_ECODE_T_VERSION;
			} else {
				etype = DDP_ETYPE_UNTAGGED_BUF;
				ecode = DDP_ECODE_UT_VERSION;
			}
			siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP,
					   etype, ecode, 0);
			return -EINVAL;
		}
		if (unlikely(__rdmap_get_version(c_hdr) != RDMAP_VERSION)) {
			pr_warn("siw: received rdmap version unsupported %d\n",
				__rdmap_get_version(c_hdr));

			siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_RDMAP,
					   RDMAP_ETYPE_REMOTE_OPERATION,
					   RDMAP_ECODE_VERSION, 0);
			return -EINVAL;
		}
		opcode = __rdmap_get_opcode(c_hdr);

		if (opcode > RDMAP_TERMINATE) {
			pr_warn("siw: received unknown packet type %u\n",
				opcode);

			siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_RDMAP,
					   RDMAP_ETYPE_REMOTE_OPERATION,
					   RDMAP_ECODE_OPCODE, 0);
			return -EINVAL;
		}
		siw_dbg_qp(rx_qp(srx), "new header, opcode %u\n", opcode);
	} else {
		opcode = __rdmap_get_opcode(c_hdr);
	}
	set_rx_fpdu_context(qp, opcode);
	frx = qp->rx_fpdu;

	/*
	 * Figure out len of current hdr: variable length of
	 * iwarp hdr may force us to copy hdr information in
	 * two steps. Only tagged DDP messages are already
	 * completely received.
	 */
	if (iwarp_pktinfo[opcode].hdr_len > sizeof(struct iwarp_ctrl_tagged)) {
		int hdrlen = iwarp_pktinfo[opcode].hdr_len;

		bytes = min_t(int, hdrlen - MIN_DDP_HDR, srx->skb_new);

		skb_copy_bits(skb, srx->skb_offset,
			      (char *)c_hdr + srx->fpdu_part_rcvd, bytes);

		srx->fpdu_part_rcvd += bytes;

		srx->skb_new -= bytes;
		srx->skb_offset += bytes;
		srx->skb_copied += bytes;

		if (srx->fpdu_part_rcvd < hdrlen)
			return -EAGAIN;
	}

	/*
	 * DDP/RDMAP header receive completed. Check if the current
	 * DDP segment starts a new RDMAP message or continues a previously
	 * started RDMAP message.
	 *
	 * Alternating reception of DDP segments (or FPDUs) from incomplete
	 * tagged and untagged RDMAP messages is supported, as long as
	 * the current tagged or untagged message gets eventually completed
	 * w/o intersection from another message of the same type
	 * (tagged/untagged). E.g., a WRITE can get intersected by a SEND,
	 * but not by a READ RESPONSE etc.
	 */
	if (srx->mpa_crc_hd) {
		/*
		 * Restart CRC computation
		 */
		crypto_shash_init(srx->mpa_crc_hd);
		crypto_shash_update(srx->mpa_crc_hd, (u8 *)c_hdr,
				    srx->fpdu_part_rcvd);
	}
	if (frx->more_ddp_segs) {
		frx->first_ddp_seg = 0;
		if (frx->prev_rdmap_op != opcode) {
			pr_warn("siw: packet intersection: %u : %u\n",
				frx->prev_rdmap_op, opcode);
			/*
			 * The last inbound RDMA operation of same type
			 * (tagged or untagged) is left unfinished.
			 * To complete it in error, make it the current
			 * operation again, even with the header already
			 * overwritten. For error handling, only the opcode
			 * and current rx context are relevant.
			 */
			set_rx_fpdu_context(qp, frx->prev_rdmap_op);
			__rdmap_set_opcode(c_hdr, frx->prev_rdmap_op);

			return -EPROTO;
		}
	} else {
		frx->prev_rdmap_op = opcode;
		frx->first_ddp_seg = 1;
	}
	frx->more_ddp_segs = c_hdr->ddp_rdmap_ctrl & DDP_FLAG_LAST ? 0 : 1;

	return 0;
}
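
/*
 * Interleaving, by example: the segment sequence
 *
 *	WRITE(1/2), SEND(1/1), WRITE(2/2)
 *
 * is accepted, since tagged and untagged messages use distinct rx
 * contexts. WRITE-a(1/2) followed by WRITE-b(1/1) takes the 'packet
 * intersection' path above and terminates the connection: a second
 * tagged message must not start before the first one completed.
 */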

static int siw_check_tx_fence(struct siw_qp *qp)
{
	struct siw_wqe *tx_waiting = tx_wqe(qp);
	struct siw_sqe *rreq;
	int resume_tx = 0, rv = 0;
	unsigned long flags;

	spin_lock_irqsave(&qp->orq_lock, flags);

	/* free current orq entry */
	rreq = orq_get_current(qp);
	WRITE_ONCE(rreq->flags, 0);

	qp->orq_get++;

	if (qp->tx_ctx.orq_fence) {
		if (unlikely(tx_waiting->wr_status != SIW_WR_QUEUED)) {
			pr_warn("siw: [QP %u]: fence resume: bad status %d\n",
				qp_id(qp), tx_waiting->wr_status);
			rv = -EPROTO;
			goto out;
		}
		/* resume SQ processing, if possible */
		if (tx_waiting->sqe.opcode == SIW_OP_READ ||
		    tx_waiting->sqe.opcode == SIW_OP_READ_LOCAL_INV) {

			/* SQ processing was stopped because of a full ORQ */
			rreq = orq_get_free(qp);
			if (unlikely(!rreq)) {
				pr_warn("siw: [QP %u]: no ORQE\n", qp_id(qp));
				rv = -EPROTO;
				goto out;
			}
			siw_read_to_orq(rreq, &tx_waiting->sqe);

			qp->orq_put++;
			qp->tx_ctx.orq_fence = 0;
			resume_tx = 1;

		} else if (siw_orq_empty(qp)) {
			/*
			 * SQ processing was stopped by fenced work request.
			 * Resume since all previous Reads are now completed.
			 */
			qp->tx_ctx.orq_fence = 0;
			resume_tx = 1;
		}
	}
out:
	spin_unlock_irqrestore(&qp->orq_lock, flags);

	if (resume_tx)
		rv = siw_sq_start(qp);

	return rv;
}
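
/*
 * Why READ completion may restart the SQ: transmission stalls either
 * when a READ (or READ_LOCAL_INV) finds the ORQ full, or when a fenced
 * WQE must wait for all outstanding READs. Retiring the ORQ tip here
 * is the natural point to re-check both conditions and to kick
 * siw_sq_start() outside the ORQ lock.
 */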

/*
 * siw_rdmap_complete()
 *
 * Complete processing of an RDMA message after receiving all
 * DDP segments, or abort processing after encountering an error case.
 *
 * o SENDs + RRESPs need completion,
 * o RREQs need READ RESPONSE initialization,
 * o WRITEs need memory dereferencing.
 *
 * TODO: Failed WRITEs need local error to be surfaced.
 */
static int siw_rdmap_complete(struct siw_qp *qp, int error)
{
	struct siw_rx_stream *srx = &qp->rx_stream;
	struct siw_wqe *wqe = rx_wqe(qp->rx_fpdu);
	enum siw_wc_status wc_status = wqe->wc_status;
	u8 opcode = __rdmap_get_opcode(&srx->hdr.ctrl);
	int rv = 0;

	switch (opcode) {
	case RDMAP_SEND_SE:
	case RDMAP_SEND_SE_INVAL:
		wqe->rqe.flags |= SIW_WQE_SOLICITED;
		fallthrough;

	case RDMAP_SEND:
	case RDMAP_SEND_INVAL:
		if (wqe->wr_status == SIW_WR_IDLE)
			break;

		srx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]++;

		if (error != 0 && wc_status == SIW_WC_SUCCESS)
			wc_status = SIW_WC_GENERAL_ERR;
		/*
		 * Handle STag invalidation request
		 */
		if (wc_status == SIW_WC_SUCCESS &&
		    (opcode == RDMAP_SEND_INVAL ||
		     opcode == RDMAP_SEND_SE_INVAL)) {
			rv = siw_invalidate_stag(qp->pd, srx->inval_stag);
			if (rv) {
				siw_init_terminate(
					qp, TERM_ERROR_LAYER_RDMAP,
					rv == -EACCES ?
						RDMAP_ETYPE_REMOTE_PROTECTION :
						RDMAP_ETYPE_REMOTE_OPERATION,
					RDMAP_ECODE_CANNOT_INVALIDATE, 0);

				wc_status = SIW_WC_REM_INV_REQ_ERR;
			}
			rv = siw_rqe_complete(qp, &wqe->rqe, wqe->processed,
					      rv ? 0 : srx->inval_stag,
					      wc_status);
		} else {
			rv = siw_rqe_complete(qp, &wqe->rqe, wqe->processed,
					      0, wc_status);
		}
		siw_wqe_put_mem(wqe, SIW_OP_RECEIVE);
		break;

	case RDMAP_RDMA_READ_RESP:
		if (wqe->wr_status == SIW_WR_IDLE)
			break;

		if (error != 0) {
			if ((srx->state == SIW_GET_HDR &&
			     qp->rx_fpdu->first_ddp_seg) || error == -ENODATA)
				/* possible RREQ in ORQ left untouched */
				break;

			if (wc_status == SIW_WC_SUCCESS)
				wc_status = SIW_WC_GENERAL_ERR;
		} else if (rdma_is_kernel_res(&qp->base_qp.res) &&
			   rx_type(wqe) == SIW_OP_READ_LOCAL_INV) {
			/*
			 * Handle any STag invalidation request
			 */
			rv = siw_invalidate_stag(qp->pd, wqe->sqe.sge[0].lkey);
			if (rv) {
				siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP,
						   RDMAP_ETYPE_CATASTROPHIC,
						   RDMAP_ECODE_UNSPECIFIED, 0);

				if (wc_status == SIW_WC_SUCCESS) {
					wc_status = SIW_WC_GENERAL_ERR;
					error = rv;
				}
			}
		}
		/*
		 * All errors turn the wqe into signalled.
		 */
		if ((wqe->sqe.flags & SIW_WQE_SIGNALLED) || error != 0)
			rv = siw_sqe_complete(qp, &wqe->sqe, wqe->processed,
					      wc_status);
		siw_wqe_put_mem(wqe, SIW_OP_READ);

		if (!error) {
			rv = siw_check_tx_fence(qp);
		} else {
			/* Disable current ORQ element */
			if (qp->attrs.orq_size)
				WRITE_ONCE(orq_get_current(qp)->flags, 0);
		}
		break;

	case RDMAP_RDMA_READ_REQ:
		if (!error) {
			rv = siw_init_rresp(qp, srx);
			srx->ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ]++;
		}
		break;

	case RDMAP_RDMA_WRITE:
		if (wqe->wr_status == SIW_WR_IDLE)
			break;

		/*
		 * Free References from memory object if
		 * attached to receive context (inbound WRITE).
		 * While a zero-length WRITE is allowed,
		 * no memory reference got created.
		 */
		if (rx_mem(&qp->rx_tagged)) {
			siw_mem_put(rx_mem(&qp->rx_tagged));
			rx_mem(&qp->rx_tagged) = NULL;
		}
		break;

	default:
		break;
	}
	wqe->wr_status = SIW_WR_IDLE;

	return rv;
}

/*
 * siw_tcp_rx_data()
 *
 * Main routine to consume inbound TCP payload
 *
 * @rd_desc:	read descriptor
 * @skb:	socket buffer
 * @off:	offset in skb
 * @len:	payload in skb (skb->len - off)
 */
int siw_tcp_rx_data(read_descriptor_t *rd_desc, struct sk_buff *skb,
		    unsigned int off, size_t len)
{
	struct siw_qp *qp = rd_desc->arg.data;
	struct siw_rx_stream *srx = &qp->rx_stream;
	int rv;

	srx->skb = skb;
	srx->skb_new = skb->len - off;
	srx->skb_offset = off;
	srx->skb_copied = 0;

	siw_dbg_qp(qp, "new data, len %d\n", srx->skb_new);

	while (srx->skb_new) {
		int run_completion = 1;

		if (unlikely(srx->rx_suspend)) {
			/* Do not process any more data */
			srx->skb_copied += srx->skb_new;
			break;
		}
		switch (srx->state) {
		case SIW_GET_HDR:
			rv = siw_get_hdr(srx);
			if (!rv) {
				srx->fpdu_part_rem =
					be16_to_cpu(srx->hdr.ctrl.mpa_len) -
					srx->fpdu_part_rcvd + MPA_HDR_SIZE;

				if (srx->fpdu_part_rem)
					srx->pad = -srx->fpdu_part_rem & 0x3;
				else
					srx->pad = 0;

				srx->state = SIW_GET_DATA_START;
				srx->fpdu_part_rcvd = 0;
			}
			break;

		case SIW_GET_DATA_MORE:
			/*
			 * Another data fragment of the same DDP segment.
			 * Setting first_ddp_seg = 0 avoids repeating
			 * initializations that shall occur only once per
			 * DDP segment.
			 */
			qp->rx_fpdu->first_ddp_seg = 0;
			fallthrough;

		case SIW_GET_DATA_START:
			/*
			 * Headers will be checked by the opcode-specific
			 * data receive function below.
			 */
			rv = iwarp_pktinfo[qp->rx_stream.rdmap_op].rx_data(qp);
			if (!rv) {
				int mpa_len =
					be16_to_cpu(srx->hdr.ctrl.mpa_len)
					+ MPA_HDR_SIZE;

				srx->fpdu_part_rem = (-mpa_len & 0x3)
						      + MPA_CRC_SIZE;
				srx->fpdu_part_rcvd = 0;
				srx->state = SIW_GET_TRAILER;
			} else {
				if (unlikely(rv == -ECONNRESET))
					run_completion = 0;
				else
					srx->state = SIW_GET_DATA_MORE;
			}
			break;

		case SIW_GET_TRAILER:
			/*
			 * read CRC + any padding
			 */
			rv = siw_get_trailer(qp, srx);
			if (likely(!rv)) {
				/*
				 * FPDU completed.
				 * complete RDMAP message if last fragment
				 */
				srx->state = SIW_GET_HDR;
				srx->fpdu_part_rcvd = 0;

				if (!(srx->hdr.ctrl.ddp_rdmap_ctrl &
				      DDP_FLAG_LAST))
					/* more frags */
					break;

				rv = siw_rdmap_complete(qp, 0);
				run_completion = 0;
			}
			break;

		default:
			pr_warn("QP[%u]: RX out of state\n", qp_id(qp));
			rv = -EPROTO;
			run_completion = 0;
		}
		if (unlikely(rv != 0 && rv != -EAGAIN)) {
			if ((srx->state > SIW_GET_HDR ||
			     qp->rx_fpdu->more_ddp_segs) && run_completion)
				siw_rdmap_complete(qp, rv);

			siw_dbg_qp(qp, "rx error %d, rx state %d\n", rv,
				   srx->state);

			siw_qp_cm_drop(qp, 1);

			break;
		}
		if (rv) {
			siw_dbg_qp(qp, "fpdu fragment, state %d, missing %d\n",
				   srx->state, srx->fpdu_part_rem);
			break;
		}
	}
	return srx->skb_copied;
}
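
/*
 * Usage sketch (illustrative only; function and variable names below
 * are assumptions, and the driver's actual callback lives in another
 * file): siw_tcp_rx_data() has the sk_read_actor_t signature and is
 * meant to be driven from the socket's data-ready callback via
 * tcp_read_sock(). Assuming sk_user_data points at the QP:
 *
 *	static void siw_llp_data_ready(struct sock *sk)
 *	{
 *		read_descriptor_t rd_desc = {
 *			.arg.data = sk->sk_user_data,
 *			.count = 1,
 *		};
 *
 *		tcp_read_sock(sk, &rd_desc, siw_tcp_rx_data);
 *	}
 *
 * tcp_read_sock() invokes the actor on in-order payload and treats the
 * return value (srx->skb_copied here) as the number of bytes consumed.
 */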