GNU Linux-libre 5.10.153-gnu1
drivers/nvme/target/tcp.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * NVMe over Fabrics TCP target.
4  * Copyright (c) 2018 Lightbits Labs. All rights reserved.
5  */
6 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
7 #include <linux/module.h>
8 #include <linux/init.h>
9 #include <linux/slab.h>
10 #include <linux/err.h>
11 #include <linux/nvme-tcp.h>
12 #include <net/sock.h>
13 #include <net/tcp.h>
14 #include <linux/inet.h>
15 #include <linux/llist.h>
16 #include <crypto/hash.h>
17
18 #include "nvmet.h"
19
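/*
 * Default amount of in-capsule (inline) data advertised for a TCP port.
 * nvmet_tcp_add_port() falls back to this value when the port has not been
 * configured with an explicit inline data size.
 */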
20 #define NVMET_TCP_DEF_INLINE_DATA_SIZE  (4 * PAGE_SIZE)
21
22 /* Define the socket priority to use for connections where it is desirable
23  * that the NIC consider performing optimized packet processing or filtering.
24  * A non-zero value is sufficient to indicate general consideration of any
25  * possible optimization.  Making it a module param allows for alternative
26  * values that may be unique to some NIC implementations.
27  */
28 static int so_priority;
29 module_param(so_priority, int, 0644);
30 MODULE_PARM_DESC(so_priority, "nvmet tcp socket optimize priority");
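/*
 * Illustrative usage only; the module/sysfs names below are assumed from the
 * usual nvmet-tcp build, and the value is just an example:
 *
 *   modprobe nvmet-tcp so_priority=6
 *   echo 6 > /sys/module/nvmet_tcp/parameters/so_priority
 *
 * The 0644 permission makes the parameter writable at runtime; it is applied
 * to newly accepted sockets in nvmet_tcp_set_queue_sock().
 */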
31
32 #define NVMET_TCP_RECV_BUDGET           8
33 #define NVMET_TCP_SEND_BUDGET           8
34 #define NVMET_TCP_IO_WORK_BUDGET        64
35
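/*
 * Per-command send-side states, driven by nvmet_tcp_try_send_one(): commands
 * returning data walk DATA_PDU -> DATA -> (DDGST) -> RESPONSE, writes that
 * still need host data send a single R2T, and everything else sends just the
 * RESPONSE capsule.  When nvme_sq.sqhd_disabled is set, C2H data carries
 * NVME_TCP_F_DATA_SUCCESS and the separate response is skipped.
 */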
36 enum nvmet_tcp_send_state {
37         NVMET_TCP_SEND_DATA_PDU,
38         NVMET_TCP_SEND_DATA,
39         NVMET_TCP_SEND_R2T,
40         NVMET_TCP_SEND_DDGST,
41         NVMET_TCP_SEND_RESPONSE
42 };
43
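/*
 * Queue-wide receive-side states, driven by nvmet_tcp_try_recv_one(): a PDU
 * header is read first, then any command data, then the data digest if one
 * was negotiated.  RECV_ERR parks the queue after a fatal error.
 */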
44 enum nvmet_tcp_recv_state {
45         NVMET_TCP_RECV_PDU,
46         NVMET_TCP_RECV_DATA,
47         NVMET_TCP_RECV_DDGST,
48         NVMET_TCP_RECV_ERR,
49 };
50
51 enum {
52         NVMET_TCP_F_INIT_FAILED = (1 << 0),
53 };
54
55 struct nvmet_tcp_cmd {
56         struct nvmet_tcp_queue          *queue;
57         struct nvmet_req                req;
58
59         struct nvme_tcp_cmd_pdu         *cmd_pdu;
60         struct nvme_tcp_rsp_pdu         *rsp_pdu;
61         struct nvme_tcp_data_pdu        *data_pdu;
62         struct nvme_tcp_r2t_pdu         *r2t_pdu;
63
64         u32                             rbytes_done;
65         u32                             wbytes_done;
66
67         u32                             pdu_len;
68         u32                             pdu_recv;
69         int                             sg_idx;
70         int                             nr_mapped;
71         struct msghdr                   recv_msg;
72         struct kvec                     *iov;
73         u32                             flags;
74
75         struct list_head                entry;
76         struct llist_node               lentry;
77
78         /* send state */
79         u32                             offset;
80         struct scatterlist              *cur_sg;
81         enum nvmet_tcp_send_state       state;
82
83         __le32                          exp_ddgst;
84         __le32                          recv_ddgst;
85 };
86
87 enum nvmet_tcp_queue_state {
88         NVMET_TCP_Q_CONNECTING,
89         NVMET_TCP_Q_LIVE,
90         NVMET_TCP_Q_DISCONNECTING,
91 };
92
93 struct nvmet_tcp_queue {
94         struct socket           *sock;
95         struct nvmet_tcp_port   *port;
96         struct work_struct      io_work;
97         struct nvmet_cq         nvme_cq;
98         struct nvmet_sq         nvme_sq;
99
100         /* send state */
101         struct nvmet_tcp_cmd    *cmds;
102         unsigned int            nr_cmds;
103         struct list_head        free_list;
104         struct llist_head       resp_list;
105         struct list_head        resp_send_list;
106         int                     send_list_len;
107         struct nvmet_tcp_cmd    *snd_cmd;
108
109         /* recv state */
110         int                     offset;
111         int                     left;
112         enum nvmet_tcp_recv_state rcv_state;
113         struct nvmet_tcp_cmd    *cmd;
114         union nvme_tcp_pdu      pdu;
115
116         /* digest state */
117         bool                    hdr_digest;
118         bool                    data_digest;
119         struct ahash_request    *snd_hash;
120         struct ahash_request    *rcv_hash;
121
122         spinlock_t              state_lock;
123         enum nvmet_tcp_queue_state state;
124
125         struct sockaddr_storage sockaddr;
126         struct sockaddr_storage sockaddr_peer;
127         struct work_struct      release_work;
128
129         int                     idx;
130         struct list_head        queue_list;
131
132         struct nvmet_tcp_cmd    connect;
133
134         struct page_frag_cache  pf_cache;
135
136         void (*data_ready)(struct sock *);
137         void (*state_change)(struct sock *);
138         void (*write_space)(struct sock *);
139 };
140
141 struct nvmet_tcp_port {
142         struct socket           *sock;
143         struct work_struct      accept_work;
144         struct nvmet_port       *nport;
145         struct sockaddr_storage addr;
146         void (*data_ready)(struct sock *);
147 };
148
149 static DEFINE_IDA(nvmet_tcp_queue_ida);
150 static LIST_HEAD(nvmet_tcp_queue_list);
151 static DEFINE_MUTEX(nvmet_tcp_queue_mutex);
152
153 static struct workqueue_struct *nvmet_tcp_wq;
154 static const struct nvmet_fabrics_ops nvmet_tcp_ops;
155 static void nvmet_tcp_free_cmd(struct nvmet_tcp_cmd *c);
156 static void nvmet_tcp_finish_cmd(struct nvmet_tcp_cmd *cmd);
157
158 static inline u16 nvmet_tcp_cmd_tag(struct nvmet_tcp_queue *queue,
159                 struct nvmet_tcp_cmd *cmd)
160 {
161         if (unlikely(!queue->nr_cmds)) {
162                 /* We didn't allocate cmds yet, send 0xffff */
163                 return USHRT_MAX;
164         }
165
166         return cmd - queue->cmds;
167 }
168
169 static inline bool nvmet_tcp_has_data_in(struct nvmet_tcp_cmd *cmd)
170 {
171         return nvme_is_write(cmd->req.cmd) &&
172                 cmd->rbytes_done < cmd->req.transfer_len;
173 }
174
175 static inline bool nvmet_tcp_need_data_in(struct nvmet_tcp_cmd *cmd)
176 {
177         return nvmet_tcp_has_data_in(cmd) && !cmd->req.cqe->status;
178 }
179
180 static inline bool nvmet_tcp_need_data_out(struct nvmet_tcp_cmd *cmd)
181 {
182         return !nvme_is_write(cmd->req.cmd) &&
183                 cmd->req.transfer_len > 0 &&
184                 !cmd->req.cqe->status;
185 }
186
187 static inline bool nvmet_tcp_has_inline_data(struct nvmet_tcp_cmd *cmd)
188 {
189         return nvme_is_write(cmd->req.cmd) && cmd->pdu_len &&
190                 !cmd->rbytes_done;
191 }
192
193 static inline struct nvmet_tcp_cmd *
194 nvmet_tcp_get_cmd(struct nvmet_tcp_queue *queue)
195 {
196         struct nvmet_tcp_cmd *cmd;
197
198         cmd = list_first_entry_or_null(&queue->free_list,
199                                 struct nvmet_tcp_cmd, entry);
200         if (!cmd)
201                 return NULL;
202         list_del_init(&cmd->entry);
203
204         cmd->rbytes_done = cmd->wbytes_done = 0;
205         cmd->pdu_len = 0;
206         cmd->pdu_recv = 0;
207         cmd->iov = NULL;
208         cmd->flags = 0;
209         return cmd;
210 }
211
212 static inline void nvmet_tcp_put_cmd(struct nvmet_tcp_cmd *cmd)
213 {
214         if (unlikely(cmd == &cmd->queue->connect))
215                 return;
216
217         list_add_tail(&cmd->entry, &cmd->queue->free_list);
218 }
219
220 static inline int queue_cpu(struct nvmet_tcp_queue *queue)
221 {
222         return queue->sock->sk->sk_incoming_cpu;
223 }
224
225 static inline u8 nvmet_tcp_hdgst_len(struct nvmet_tcp_queue *queue)
226 {
227         return queue->hdr_digest ? NVME_TCP_DIGEST_LENGTH : 0;
228 }
229
230 static inline u8 nvmet_tcp_ddgst_len(struct nvmet_tcp_queue *queue)
231 {
232         return queue->data_digest ? NVME_TCP_DIGEST_LENGTH : 0;
233 }
234
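/*
 * Compute the header digest (crc32c, see nvmet_tcp_alloc_crypto()) over the
 * @len bytes of @pdu and store the 4-byte result directly after the header,
 * which is where NVMe/TCP carries HDGST on the wire.
 */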
235 static inline void nvmet_tcp_hdgst(struct ahash_request *hash,
236                 void *pdu, size_t len)
237 {
238         struct scatterlist sg;
239
240         sg_init_one(&sg, pdu, len);
241         ahash_request_set_crypt(hash, &sg, pdu + len, len);
242         crypto_ahash_digest(hash);
243 }
244
245 static int nvmet_tcp_verify_hdgst(struct nvmet_tcp_queue *queue,
246         void *pdu, size_t len)
247 {
248         struct nvme_tcp_hdr *hdr = pdu;
249         __le32 recv_digest;
250         __le32 exp_digest;
251
252         if (unlikely(!(hdr->flags & NVME_TCP_F_HDGST))) {
253                 pr_err("queue %d: header digest enabled but no header digest\n",
254                         queue->idx);
255                 return -EPROTO;
256         }
257
258         recv_digest = *(__le32 *)(pdu + hdr->hlen);
259         nvmet_tcp_hdgst(queue->rcv_hash, pdu, len);
260         exp_digest = *(__le32 *)(pdu + hdr->hlen);
261         if (recv_digest != exp_digest) {
262                 pr_err("queue %d: header digest error: recv %#x expected %#x\n",
263                         queue->idx, le32_to_cpu(recv_digest),
264                         le32_to_cpu(exp_digest));
265                 return -EPROTO;
266         }
267
268         return 0;
269 }
270
271 static int nvmet_tcp_check_ddgst(struct nvmet_tcp_queue *queue, void *pdu)
272 {
273         struct nvme_tcp_hdr *hdr = pdu;
274         u8 digest_len = nvmet_tcp_hdgst_len(queue);
275         u32 len;
276
277         len = le32_to_cpu(hdr->plen) - hdr->hlen -
278                 (hdr->flags & NVME_TCP_F_HDGST ? digest_len : 0);
279
280         if (unlikely(len && !(hdr->flags & NVME_TCP_F_DDGST))) {
281                 pr_err("queue %d: data digest flag is cleared\n", queue->idx);
282                 return -EPROTO;
283         }
284
285         return 0;
286 }
287
288 static void nvmet_tcp_unmap_pdu_iovec(struct nvmet_tcp_cmd *cmd)
289 {
290         struct scatterlist *sg;
291         int i;
292
293         sg = &cmd->req.sg[cmd->sg_idx];
294
295         for (i = 0; i < cmd->nr_mapped; i++)
296                 kunmap(sg_page(&sg[i]));
297 }
298
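/*
 * Build a kvec array over the scatterlist pages that back the next
 * cmd->pdu_len bytes (starting at cmd->rbytes_done) and point cmd->recv_msg
 * at it, so the payload can be pulled in with sock_recvmsg().
 */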
299 static void nvmet_tcp_map_pdu_iovec(struct nvmet_tcp_cmd *cmd)
300 {
301         struct kvec *iov = cmd->iov;
302         struct scatterlist *sg;
303         u32 length, offset, sg_offset;
304
305         length = cmd->pdu_len;
306         cmd->nr_mapped = DIV_ROUND_UP(length, PAGE_SIZE);
307         offset = cmd->rbytes_done;
308         cmd->sg_idx = offset / PAGE_SIZE;
309         sg_offset = offset % PAGE_SIZE;
310         sg = &cmd->req.sg[cmd->sg_idx];
311
312         while (length) {
313                 u32 iov_len = min_t(u32, length, sg->length - sg_offset);
314
315                 iov->iov_base = kmap(sg_page(sg)) + sg->offset + sg_offset;
316                 iov->iov_len = iov_len;
317
318                 length -= iov_len;
319                 sg = sg_next(sg);
320                 iov++;
321                 sg_offset = 0;
322         }
323
324         iov_iter_kvec(&cmd->recv_msg.msg_iter, READ, cmd->iov,
325                 cmd->nr_mapped, cmd->pdu_len);
326 }
327
328 static void nvmet_tcp_fatal_error(struct nvmet_tcp_queue *queue)
329 {
330         queue->rcv_state = NVMET_TCP_RECV_ERR;
331         if (queue->nvme_sq.ctrl)
332                 nvmet_ctrl_fatal_error(queue->nvme_sq.ctrl);
333         else
334                 kernel_sock_shutdown(queue->sock, SHUT_RDWR);
335 }
336
337 static void nvmet_tcp_socket_error(struct nvmet_tcp_queue *queue, int status)
338 {
339         if (status == -EPIPE || status == -ECONNRESET)
340                 kernel_sock_shutdown(queue->sock, SHUT_RDWR);
341         else
342                 nvmet_tcp_fatal_error(queue);
343 }
344
345 static int nvmet_tcp_map_data(struct nvmet_tcp_cmd *cmd)
346 {
347         struct nvme_sgl_desc *sgl = &cmd->req.cmd->common.dptr.sgl;
348         u32 len = le32_to_cpu(sgl->length);
349
350         if (!len)
351                 return 0;
352
353         if (sgl->type == ((NVME_SGL_FMT_DATA_DESC << 4) |
354                           NVME_SGL_FMT_OFFSET)) {
355                 if (!nvme_is_write(cmd->req.cmd))
356                         return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
357
358                 if (len > cmd->req.port->inline_data_size)
359                         return NVME_SC_SGL_INVALID_OFFSET | NVME_SC_DNR;
360                 cmd->pdu_len = len;
361         }
362         cmd->req.transfer_len += len;
363
364         cmd->req.sg = sgl_alloc(len, GFP_KERNEL, &cmd->req.sg_cnt);
365         if (!cmd->req.sg)
366                 return NVME_SC_INTERNAL;
367         cmd->cur_sg = cmd->req.sg;
368
369         if (nvmet_tcp_has_data_in(cmd)) {
370                 cmd->iov = kmalloc_array(cmd->req.sg_cnt,
371                                 sizeof(*cmd->iov), GFP_KERNEL);
372                 if (!cmd->iov)
373                         goto err;
374         }
375
376         return 0;
377 err:
378         sgl_free(cmd->req.sg);
379         return NVME_SC_INTERNAL;
380 }
381
382 static void nvmet_tcp_send_ddgst(struct ahash_request *hash,
383                 struct nvmet_tcp_cmd *cmd)
384 {
385         ahash_request_set_crypt(hash, cmd->req.sg,
386                 (void *)&cmd->exp_ddgst, cmd->req.transfer_len);
387         crypto_ahash_digest(hash);
388 }
389
390 static void nvmet_tcp_recv_ddgst(struct ahash_request *hash,
391                 struct nvmet_tcp_cmd *cmd)
392 {
393         struct scatterlist sg;
394         struct kvec *iov;
395         int i;
396
397         crypto_ahash_init(hash);
398         for (i = 0, iov = cmd->iov; i < cmd->nr_mapped; i++, iov++) {
399                 sg_init_one(&sg, iov->iov_base, iov->iov_len);
400                 ahash_request_set_crypt(hash, &sg, NULL, iov->iov_len);
401                 crypto_ahash_update(hash);
402         }
403         ahash_request_set_crypt(hash, NULL, (void *)&cmd->exp_ddgst, 0);
404         crypto_ahash_final(hash);
405 }
406
407 static void nvmet_setup_c2h_data_pdu(struct nvmet_tcp_cmd *cmd)
408 {
409         struct nvme_tcp_data_pdu *pdu = cmd->data_pdu;
410         struct nvmet_tcp_queue *queue = cmd->queue;
411         u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
412         u8 ddgst = nvmet_tcp_ddgst_len(cmd->queue);
413
414         cmd->offset = 0;
415         cmd->state = NVMET_TCP_SEND_DATA_PDU;
416
417         pdu->hdr.type = nvme_tcp_c2h_data;
418         pdu->hdr.flags = NVME_TCP_F_DATA_LAST | (queue->nvme_sq.sqhd_disabled ?
419                                                 NVME_TCP_F_DATA_SUCCESS : 0);
420         pdu->hdr.hlen = sizeof(*pdu);
421         pdu->hdr.pdo = pdu->hdr.hlen + hdgst;
422         pdu->hdr.plen =
423                 cpu_to_le32(pdu->hdr.hlen + hdgst +
424                                 cmd->req.transfer_len + ddgst);
425         pdu->command_id = cmd->req.cqe->command_id;
426         pdu->data_length = cpu_to_le32(cmd->req.transfer_len);
427         pdu->data_offset = cpu_to_le32(cmd->wbytes_done);
428
429         if (queue->data_digest) {
430                 pdu->hdr.flags |= NVME_TCP_F_DDGST;
431                 nvmet_tcp_send_ddgst(queue->snd_hash, cmd);
432         }
433
434         if (cmd->queue->hdr_digest) {
435                 pdu->hdr.flags |= NVME_TCP_F_HDGST;
436                 nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
437         }
438 }
439
440 static void nvmet_setup_r2t_pdu(struct nvmet_tcp_cmd *cmd)
441 {
442         struct nvme_tcp_r2t_pdu *pdu = cmd->r2t_pdu;
443         struct nvmet_tcp_queue *queue = cmd->queue;
444         u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
445
446         cmd->offset = 0;
447         cmd->state = NVMET_TCP_SEND_R2T;
448
449         pdu->hdr.type = nvme_tcp_r2t;
450         pdu->hdr.flags = 0;
451         pdu->hdr.hlen = sizeof(*pdu);
452         pdu->hdr.pdo = 0;
453         pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst);
454
455         pdu->command_id = cmd->req.cmd->common.command_id;
456         pdu->ttag = nvmet_tcp_cmd_tag(cmd->queue, cmd);
457         pdu->r2t_length = cpu_to_le32(cmd->req.transfer_len - cmd->rbytes_done);
458         pdu->r2t_offset = cpu_to_le32(cmd->rbytes_done);
459         if (cmd->queue->hdr_digest) {
460                 pdu->hdr.flags |= NVME_TCP_F_HDGST;
461                 nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
462         }
463 }
464
465 static void nvmet_setup_response_pdu(struct nvmet_tcp_cmd *cmd)
466 {
467         struct nvme_tcp_rsp_pdu *pdu = cmd->rsp_pdu;
468         struct nvmet_tcp_queue *queue = cmd->queue;
469         u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
470
471         cmd->offset = 0;
472         cmd->state = NVMET_TCP_SEND_RESPONSE;
473
474         pdu->hdr.type = nvme_tcp_rsp;
475         pdu->hdr.flags = 0;
476         pdu->hdr.hlen = sizeof(*pdu);
477         pdu->hdr.pdo = 0;
478         pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst);
479         if (cmd->queue->hdr_digest) {
480                 pdu->hdr.flags |= NVME_TCP_F_HDGST;
481                 nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
482         }
483 }
484
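/*
 * Splice the lock-free resp_list (filled by nvmet_tcp_queue_response(),
 * possibly from another context) onto resp_send_list, which only the
 * io_work send path consumes.
 */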
485 static void nvmet_tcp_process_resp_list(struct nvmet_tcp_queue *queue)
486 {
487         struct llist_node *node;
488         struct nvmet_tcp_cmd *cmd;
489
490         for (node = llist_del_all(&queue->resp_list); node; node = node->next) {
491                 cmd = llist_entry(node, struct nvmet_tcp_cmd, lentry);
492                 list_add(&cmd->entry, &queue->resp_send_list);
493                 queue->send_list_len++;
494         }
495 }
496
497 static struct nvmet_tcp_cmd *nvmet_tcp_fetch_cmd(struct nvmet_tcp_queue *queue)
498 {
499         queue->snd_cmd = list_first_entry_or_null(&queue->resp_send_list,
500                                 struct nvmet_tcp_cmd, entry);
501         if (!queue->snd_cmd) {
502                 nvmet_tcp_process_resp_list(queue);
503                 queue->snd_cmd =
504                         list_first_entry_or_null(&queue->resp_send_list,
505                                         struct nvmet_tcp_cmd, entry);
506                 if (unlikely(!queue->snd_cmd))
507                         return NULL;
508         }
509
510         list_del_init(&queue->snd_cmd->entry);
511         queue->send_list_len--;
512
513         if (nvmet_tcp_need_data_out(queue->snd_cmd))
514                 nvmet_setup_c2h_data_pdu(queue->snd_cmd);
515         else if (nvmet_tcp_need_data_in(queue->snd_cmd))
516                 nvmet_setup_r2t_pdu(queue->snd_cmd);
517         else
518                 nvmet_setup_response_pdu(queue->snd_cmd);
519
520         return queue->snd_cmd;
521 }
522
523 static void nvmet_tcp_queue_response(struct nvmet_req *req)
524 {
525         struct nvmet_tcp_cmd *cmd =
526                 container_of(req, struct nvmet_tcp_cmd, req);
527         struct nvmet_tcp_queue  *queue = cmd->queue;
528         struct nvme_sgl_desc *sgl;
529         u32 len;
530
531         if (unlikely(cmd == queue->cmd)) {
532                 sgl = &cmd->req.cmd->common.dptr.sgl;
533                 len = le32_to_cpu(sgl->length);
534
535                 /*
536                  * Wait for inline data before processing the response.
537                  * Avoid using helpers; this might happen before
538                  * nvmet_req_init() has completed.
539                  */
540                 if (queue->rcv_state == NVMET_TCP_RECV_PDU &&
541                     len && len <= cmd->req.port->inline_data_size &&
542                     nvme_is_write(cmd->req.cmd))
543                         return;
544         }
545
546         llist_add(&cmd->lentry, &queue->resp_list);
547         queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &cmd->queue->io_work);
548 }
549
550 static void nvmet_tcp_execute_request(struct nvmet_tcp_cmd *cmd)
551 {
552         if (unlikely(cmd->flags & NVMET_TCP_F_INIT_FAILED))
553                 nvmet_tcp_queue_response(&cmd->req);
554         else
555                 cmd->req.execute(&cmd->req);
556 }
557
558 static int nvmet_try_send_data_pdu(struct nvmet_tcp_cmd *cmd)
559 {
560         u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
561         int left = sizeof(*cmd->data_pdu) - cmd->offset + hdgst;
562         int ret;
563
564         ret = kernel_sendpage(cmd->queue->sock, virt_to_page(cmd->data_pdu),
565                         offset_in_page(cmd->data_pdu) + cmd->offset,
566                         left, MSG_DONTWAIT | MSG_MORE | MSG_SENDPAGE_NOTLAST);
567         if (ret <= 0)
568                 return ret;
569
570         cmd->offset += ret;
571         left -= ret;
572
573         if (left)
574                 return -EAGAIN;
575
576         cmd->state = NVMET_TCP_SEND_DATA;
577         cmd->offset  = 0;
578         return 1;
579 }
580
581 static int nvmet_try_send_data(struct nvmet_tcp_cmd *cmd, bool last_in_batch)
582 {
583         struct nvmet_tcp_queue *queue = cmd->queue;
584         int ret;
585
586         while (cmd->cur_sg) {
587                 struct page *page = sg_page(cmd->cur_sg);
588                 u32 left = cmd->cur_sg->length - cmd->offset;
589                 int flags = MSG_DONTWAIT;
590
591                 if ((!last_in_batch && cmd->queue->send_list_len) ||
592                     cmd->wbytes_done + left < cmd->req.transfer_len ||
593                     queue->data_digest || !queue->nvme_sq.sqhd_disabled)
594                         flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
595
596                 ret = kernel_sendpage(cmd->queue->sock, page, cmd->offset,
597                                         left, flags);
598                 if (ret <= 0)
599                         return ret;
600
601                 cmd->offset += ret;
602                 cmd->wbytes_done += ret;
603
604                 /* Done with sg? */
605                 if (cmd->offset == cmd->cur_sg->length) {
606                         cmd->cur_sg = sg_next(cmd->cur_sg);
607                         cmd->offset = 0;
608                 }
609         }
610
611         if (queue->data_digest) {
612                 cmd->state = NVMET_TCP_SEND_DDGST;
613                 cmd->offset = 0;
614         } else {
615                 if (queue->nvme_sq.sqhd_disabled) {
616                         cmd->queue->snd_cmd = NULL;
617                         nvmet_tcp_put_cmd(cmd);
618                 } else {
619                         nvmet_setup_response_pdu(cmd);
620                 }
621         }
622
623         if (queue->nvme_sq.sqhd_disabled) {
624                 kfree(cmd->iov);
625                 sgl_free(cmd->req.sg);
626         }
627
628         return 1;
629
630 }
631
632 static int nvmet_try_send_response(struct nvmet_tcp_cmd *cmd,
633                 bool last_in_batch)
634 {
635         u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
636         int left = sizeof(*cmd->rsp_pdu) - cmd->offset + hdgst;
637         int flags = MSG_DONTWAIT;
638         int ret;
639
640         if (!last_in_batch && cmd->queue->send_list_len)
641                 flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
642         else
643                 flags |= MSG_EOR;
644
645         ret = kernel_sendpage(cmd->queue->sock, virt_to_page(cmd->rsp_pdu),
646                 offset_in_page(cmd->rsp_pdu) + cmd->offset, left, flags);
647         if (ret <= 0)
648                 return ret;
649         cmd->offset += ret;
650         left -= ret;
651
652         if (left)
653                 return -EAGAIN;
654
655         kfree(cmd->iov);
656         sgl_free(cmd->req.sg);
657         cmd->queue->snd_cmd = NULL;
658         nvmet_tcp_put_cmd(cmd);
659         return 1;
660 }
661
662 static int nvmet_try_send_r2t(struct nvmet_tcp_cmd *cmd, bool last_in_batch)
663 {
664         u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
665         int left = sizeof(*cmd->r2t_pdu) - cmd->offset + hdgst;
666         int flags = MSG_DONTWAIT;
667         int ret;
668
669         if (!last_in_batch && cmd->queue->send_list_len)
670                 flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
671         else
672                 flags |= MSG_EOR;
673
674         ret = kernel_sendpage(cmd->queue->sock, virt_to_page(cmd->r2t_pdu),
675                 offset_in_page(cmd->r2t_pdu) + cmd->offset, left, flags);
676         if (ret <= 0)
677                 return ret;
678         cmd->offset += ret;
679         left -= ret;
680
681         if (left)
682                 return -EAGAIN;
683
684         cmd->queue->snd_cmd = NULL;
685         return 1;
686 }
687
688 static int nvmet_try_send_ddgst(struct nvmet_tcp_cmd *cmd, bool last_in_batch)
689 {
690         struct nvmet_tcp_queue *queue = cmd->queue;
691         int left = NVME_TCP_DIGEST_LENGTH - cmd->offset;
692         struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
693         struct kvec iov = {
694                 .iov_base = (u8 *)&cmd->exp_ddgst + cmd->offset,
695                 .iov_len = left
696         };
697         int ret;
698
699         if (!last_in_batch && cmd->queue->send_list_len)
700                 msg.msg_flags |= MSG_MORE;
701         else
702                 msg.msg_flags |= MSG_EOR;
703
704         ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
705         if (unlikely(ret <= 0))
706                 return ret;
707
708         cmd->offset += ret;
709         left -= ret;
710
711         if (left)
712                 return -EAGAIN;
713
714         if (queue->nvme_sq.sqhd_disabled) {
715                 cmd->queue->snd_cmd = NULL;
716                 nvmet_tcp_put_cmd(cmd);
717         } else {
718                 nvmet_setup_response_pdu(cmd);
719         }
720         return 1;
721 }
722
723 static int nvmet_tcp_try_send_one(struct nvmet_tcp_queue *queue,
724                 bool last_in_batch)
725 {
726         struct nvmet_tcp_cmd *cmd = queue->snd_cmd;
727         int ret = 0;
728
729         if (!cmd || queue->state == NVMET_TCP_Q_DISCONNECTING) {
730                 cmd = nvmet_tcp_fetch_cmd(queue);
731                 if (unlikely(!cmd))
732                         return 0;
733         }
734
735         if (cmd->state == NVMET_TCP_SEND_DATA_PDU) {
736                 ret = nvmet_try_send_data_pdu(cmd);
737                 if (ret <= 0)
738                         goto done_send;
739         }
740
741         if (cmd->state == NVMET_TCP_SEND_DATA) {
742                 ret = nvmet_try_send_data(cmd, last_in_batch);
743                 if (ret <= 0)
744                         goto done_send;
745         }
746
747         if (cmd->state == NVMET_TCP_SEND_DDGST) {
748                 ret = nvmet_try_send_ddgst(cmd, last_in_batch);
749                 if (ret <= 0)
750                         goto done_send;
751         }
752
753         if (cmd->state == NVMET_TCP_SEND_R2T) {
754                 ret = nvmet_try_send_r2t(cmd, last_in_batch);
755                 if (ret <= 0)
756                         goto done_send;
757         }
758
759         if (cmd->state == NVMET_TCP_SEND_RESPONSE)
760                 ret = nvmet_try_send_response(cmd, last_in_batch);
761
762 done_send:
763         if (ret < 0) {
764                 if (ret == -EAGAIN)
765                         return 0;
766                 return ret;
767         }
768
769         return 1;
770 }
771
772 static int nvmet_tcp_try_send(struct nvmet_tcp_queue *queue,
773                 int budget, int *sends)
774 {
775         int i, ret = 0;
776
777         for (i = 0; i < budget; i++) {
778                 ret = nvmet_tcp_try_send_one(queue, i == budget - 1);
779                 if (unlikely(ret < 0)) {
780                         nvmet_tcp_socket_error(queue, ret);
781                         goto done;
782                 } else if (ret == 0) {
783                         break;
784                 }
785                 (*sends)++;
786         }
787 done:
788         return ret;
789 }
790
791 static void nvmet_prepare_receive_pdu(struct nvmet_tcp_queue *queue)
792 {
793         queue->offset = 0;
794         queue->left = sizeof(struct nvme_tcp_hdr);
795         queue->cmd = NULL;
796         queue->rcv_state = NVMET_TCP_RECV_PDU;
797 }
798
799 static void nvmet_tcp_free_crypto(struct nvmet_tcp_queue *queue)
800 {
801         struct crypto_ahash *tfm = crypto_ahash_reqtfm(queue->rcv_hash);
802
803         ahash_request_free(queue->rcv_hash);
804         ahash_request_free(queue->snd_hash);
805         crypto_free_ahash(tfm);
806 }
807
808 static int nvmet_tcp_alloc_crypto(struct nvmet_tcp_queue *queue)
809 {
810         struct crypto_ahash *tfm;
811
812         tfm = crypto_alloc_ahash("crc32c", 0, CRYPTO_ALG_ASYNC);
813         if (IS_ERR(tfm))
814                 return PTR_ERR(tfm);
815
816         queue->snd_hash = ahash_request_alloc(tfm, GFP_KERNEL);
817         if (!queue->snd_hash)
818                 goto free_tfm;
819         ahash_request_set_callback(queue->snd_hash, 0, NULL, NULL);
820
821         queue->rcv_hash = ahash_request_alloc(tfm, GFP_KERNEL);
822         if (!queue->rcv_hash)
823                 goto free_snd_hash;
824         ahash_request_set_callback(queue->rcv_hash, 0, NULL, NULL);
825
826         return 0;
827 free_snd_hash:
828         ahash_request_free(queue->snd_hash);
829 free_tfm:
830         crypto_free_ahash(tfm);
831         return -ENOMEM;
832 }
833
834
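/*
 * Connection establishment: validate the host's ICReq (length, PFV, no PDU
 * data alignment), negotiate header/data digests, and answer with an ICResp
 * before switching the queue to NVMET_TCP_Q_LIVE.
 */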
835 static int nvmet_tcp_handle_icreq(struct nvmet_tcp_queue *queue)
836 {
837         struct nvme_tcp_icreq_pdu *icreq = &queue->pdu.icreq;
838         struct nvme_tcp_icresp_pdu *icresp = &queue->pdu.icresp;
839         struct msghdr msg = {};
840         struct kvec iov;
841         int ret;
842
843         if (le32_to_cpu(icreq->hdr.plen) != sizeof(struct nvme_tcp_icreq_pdu)) {
844                 pr_err("bad nvme-tcp pdu length (%d)\n",
845                         le32_to_cpu(icreq->hdr.plen));
846                 nvmet_tcp_fatal_error(queue);
                    return -EPROTO; /* don't keep parsing a connection we just declared fatal */
847         }
848
849         if (icreq->pfv != NVME_TCP_PFV_1_0) {
850                 pr_err("queue %d: bad pfv %d\n", queue->idx, icreq->pfv);
851                 return -EPROTO;
852         }
853
854         if (icreq->hpda != 0) {
855                 pr_err("queue %d: unsupported hpda %d\n", queue->idx,
856                         icreq->hpda);
857                 return -EPROTO;
858         }
859
860         queue->hdr_digest = !!(icreq->digest & NVME_TCP_HDR_DIGEST_ENABLE);
861         queue->data_digest = !!(icreq->digest & NVME_TCP_DATA_DIGEST_ENABLE);
862         if (queue->hdr_digest || queue->data_digest) {
863                 ret = nvmet_tcp_alloc_crypto(queue);
864                 if (ret)
865                         return ret;
866         }
867
868         memset(icresp, 0, sizeof(*icresp));
869         icresp->hdr.type = nvme_tcp_icresp;
870         icresp->hdr.hlen = sizeof(*icresp);
871         icresp->hdr.pdo = 0;
872         icresp->hdr.plen = cpu_to_le32(icresp->hdr.hlen);
873         icresp->pfv = cpu_to_le16(NVME_TCP_PFV_1_0);
874         icresp->maxdata = cpu_to_le32(0x400000); /* 4M arbitrary limit */
875         icresp->cpda = 0;
876         if (queue->hdr_digest)
877                 icresp->digest |= NVME_TCP_HDR_DIGEST_ENABLE;
878         if (queue->data_digest)
879                 icresp->digest |= NVME_TCP_DATA_DIGEST_ENABLE;
880
881         iov.iov_base = icresp;
882         iov.iov_len = sizeof(*icresp);
883         ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
884         if (ret < 0)
885                 goto free_crypto;
886
887         queue->state = NVMET_TCP_Q_LIVE;
888         nvmet_prepare_receive_pdu(queue);
889         return 0;
890 free_crypto:
891         if (queue->hdr_digest || queue->data_digest)
892                 nvmet_tcp_free_crypto(queue);
893         return ret;
894 }
895
896 static void nvmet_tcp_handle_req_failure(struct nvmet_tcp_queue *queue,
897                 struct nvmet_tcp_cmd *cmd, struct nvmet_req *req)
898 {
899         size_t data_len = le32_to_cpu(req->cmd->common.dptr.sgl.length);
900         int ret;
901
902         if (!nvme_is_write(cmd->req.cmd) ||
903             data_len > cmd->req.port->inline_data_size) {
904                 nvmet_prepare_receive_pdu(queue);
905                 return;
906         }
907
908         ret = nvmet_tcp_map_data(cmd);
909         if (unlikely(ret)) {
910                 pr_err("queue %d: failed to map data\n", queue->idx);
911                 nvmet_tcp_fatal_error(queue);
912                 return;
913         }
914
915         queue->rcv_state = NVMET_TCP_RECV_DATA;
916         nvmet_tcp_map_pdu_iovec(cmd);
917         cmd->flags |= NVMET_TCP_F_INIT_FAILED;
918 }
919
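/*
 * An H2CData PDU carries host data for an earlier R2T (or for the connect
 * command before the command array exists).  Validate the ttag and the data
 * offset against the command's receive progress, then switch the queue to
 * NVMET_TCP_RECV_DATA to pull the payload in.
 */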
920 static int nvmet_tcp_handle_h2c_data_pdu(struct nvmet_tcp_queue *queue)
921 {
922         struct nvme_tcp_data_pdu *data = &queue->pdu.data;
923         struct nvmet_tcp_cmd *cmd;
924
925         if (likely(queue->nr_cmds)) {
926                 if (unlikely(data->ttag >= queue->nr_cmds)) {
927                         pr_err("queue %d: received out of bound ttag %u, nr_cmds %u\n",
928                                 queue->idx, data->ttag, queue->nr_cmds);
929                         nvmet_tcp_fatal_error(queue);
930                         return -EPROTO;
931                 }
932                 cmd = &queue->cmds[data->ttag];
933         } else {
934                 cmd = &queue->connect;
935         }
936
937         if (le32_to_cpu(data->data_offset) != cmd->rbytes_done) {
938                 pr_err("ttag %u unexpected data offset %u (expected %u)\n",
939                         data->ttag, le32_to_cpu(data->data_offset),
940                         cmd->rbytes_done);
941                 /* FIXME: use path and transport errors */
942                 nvmet_req_complete(&cmd->req,
943                         NVME_SC_INVALID_FIELD | NVME_SC_DNR);
944                 return -EPROTO;
945         }
946
947         cmd->pdu_len = le32_to_cpu(data->data_length);
948         cmd->pdu_recv = 0;
949         nvmet_tcp_map_pdu_iovec(cmd);
950         queue->cmd = cmd;
951         queue->rcv_state = NVMET_TCP_RECV_DATA;
952
953         return 0;
954 }
955
956 static int nvmet_tcp_done_recv_pdu(struct nvmet_tcp_queue *queue)
957 {
958         struct nvme_tcp_hdr *hdr = &queue->pdu.cmd.hdr;
959         struct nvme_command *nvme_cmd = &queue->pdu.cmd.cmd;
960         struct nvmet_req *req;
961         int ret;
962
963         if (unlikely(queue->state == NVMET_TCP_Q_CONNECTING)) {
964                 if (hdr->type != nvme_tcp_icreq) {
965                         pr_err("unexpected pdu type (%d) before icreq\n",
966                                 hdr->type);
967                         nvmet_tcp_fatal_error(queue);
968                         return -EPROTO;
969                 }
970                 return nvmet_tcp_handle_icreq(queue);
971         }
972
973         if (hdr->type == nvme_tcp_h2c_data) {
974                 ret = nvmet_tcp_handle_h2c_data_pdu(queue);
975                 if (unlikely(ret))
976                         return ret;
977                 return 0;
978         }
979
980         queue->cmd = nvmet_tcp_get_cmd(queue);
981         if (unlikely(!queue->cmd)) {
982                 /* This should never happen */
983                 pr_err("queue %d: out of commands (%d) send_list_len: %d, opcode: %d\n",
984                         queue->idx, queue->nr_cmds, queue->send_list_len,
985                         nvme_cmd->common.opcode);
986                 nvmet_tcp_fatal_error(queue);
987                 return -ENOMEM;
988         }
989
990         req = &queue->cmd->req;
991         memcpy(req->cmd, nvme_cmd, sizeof(*nvme_cmd));
992
993         if (unlikely(!nvmet_req_init(req, &queue->nvme_cq,
994                         &queue->nvme_sq, &nvmet_tcp_ops))) {
995                 pr_err("failed cmd %p id %d opcode %d, data_len: %d\n",
996                         req->cmd, req->cmd->common.command_id,
997                         req->cmd->common.opcode,
998                         le32_to_cpu(req->cmd->common.dptr.sgl.length));
999
1000                 nvmet_tcp_handle_req_failure(queue, queue->cmd, req);
1001                 return 0;
1002         }
1003
1004         ret = nvmet_tcp_map_data(queue->cmd);
1005         if (unlikely(ret)) {
1006                 pr_err("queue %d: failed to map data\n", queue->idx);
1007                 if (nvmet_tcp_has_inline_data(queue->cmd))
1008                         nvmet_tcp_fatal_error(queue);
1009                 else
1010                         nvmet_req_complete(req, ret);
1011                 ret = -EAGAIN;
1012                 goto out;
1013         }
1014
1015         if (nvmet_tcp_need_data_in(queue->cmd)) {
1016                 if (nvmet_tcp_has_inline_data(queue->cmd)) {
1017                         queue->rcv_state = NVMET_TCP_RECV_DATA;
1018                         nvmet_tcp_map_pdu_iovec(queue->cmd);
1019                         return 0;
1020                 }
1021                 /* send back R2T */
1022                 nvmet_tcp_queue_response(&queue->cmd->req);
1023                 goto out;
1024         }
1025
1026         queue->cmd->req.execute(&queue->cmd->req);
1027 out:
1028         nvmet_prepare_receive_pdu(queue);
1029         return ret;
1030 }
1031
1032 static const u8 nvme_tcp_pdu_sizes[] = {
1033         [nvme_tcp_icreq]        = sizeof(struct nvme_tcp_icreq_pdu),
1034         [nvme_tcp_cmd]          = sizeof(struct nvme_tcp_cmd_pdu),
1035         [nvme_tcp_h2c_data]     = sizeof(struct nvme_tcp_data_pdu),
1036 };
1037
1038 static inline u8 nvmet_tcp_pdu_size(u8 type)
1039 {
1040         size_t idx = type;
1041
1042         return (idx < ARRAY_SIZE(nvme_tcp_pdu_sizes) &&
1043                 nvme_tcp_pdu_sizes[idx]) ?
1044                         nvme_tcp_pdu_sizes[idx] : 0;
1045 }
1046
1047 static inline bool nvmet_tcp_pdu_valid(u8 type)
1048 {
1049         switch (type) {
1050         case nvme_tcp_icreq:
1051         case nvme_tcp_cmd:
1052         case nvme_tcp_h2c_data:
1053                 /* fallthru */
1054                 return true;
1055         }
1056
1057         return false;
1058 }
1059
1060 static int nvmet_tcp_try_recv_pdu(struct nvmet_tcp_queue *queue)
1061 {
1062         struct nvme_tcp_hdr *hdr = &queue->pdu.cmd.hdr;
1063         int len;
1064         struct kvec iov;
1065         struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
1066
1067 recv:
1068         iov.iov_base = (void *)&queue->pdu + queue->offset;
1069         iov.iov_len = queue->left;
1070         len = kernel_recvmsg(queue->sock, &msg, &iov, 1,
1071                         iov.iov_len, msg.msg_flags);
1072         if (unlikely(len < 0))
1073                 return len;
1074
1075         queue->offset += len;
1076         queue->left -= len;
1077         if (queue->left)
1078                 return -EAGAIN;
1079
1080         if (queue->offset == sizeof(struct nvme_tcp_hdr)) {
1081                 u8 hdgst = nvmet_tcp_hdgst_len(queue);
1082
1083                 if (unlikely(!nvmet_tcp_pdu_valid(hdr->type))) {
1084                         pr_err("unexpected pdu type %d\n", hdr->type);
1085                         nvmet_tcp_fatal_error(queue);
1086                         return -EIO;
1087                 }
1088
1089                 if (unlikely(hdr->hlen != nvmet_tcp_pdu_size(hdr->type))) {
1090                         pr_err("pdu %d bad hlen %d\n", hdr->type, hdr->hlen);
1091                         return -EIO;
1092                 }
1093
1094                 queue->left = hdr->hlen - queue->offset + hdgst;
1095                 goto recv;
1096         }
1097
1098         if (queue->hdr_digest &&
1099             nvmet_tcp_verify_hdgst(queue, &queue->pdu, hdr->hlen)) {
1100                 nvmet_tcp_fatal_error(queue); /* fatal */
1101                 return -EPROTO;
1102         }
1103
1104         if (queue->data_digest &&
1105             nvmet_tcp_check_ddgst(queue, &queue->pdu)) {
1106                 nvmet_tcp_fatal_error(queue); /* fatal */
1107                 return -EPROTO;
1108         }
1109
1110         return nvmet_tcp_done_recv_pdu(queue);
1111 }
1112
1113 static void nvmet_tcp_prep_recv_ddgst(struct nvmet_tcp_cmd *cmd)
1114 {
1115         struct nvmet_tcp_queue *queue = cmd->queue;
1116
1117         nvmet_tcp_recv_ddgst(queue->rcv_hash, cmd);
1118         queue->offset = 0;
1119         queue->left = NVME_TCP_DIGEST_LENGTH;
1120         queue->rcv_state = NVMET_TCP_RECV_DDGST;
1121 }
1122
1123 static int nvmet_tcp_try_recv_data(struct nvmet_tcp_queue *queue)
1124 {
1125         struct nvmet_tcp_cmd  *cmd = queue->cmd;
1126         int ret;
1127
1128         while (msg_data_left(&cmd->recv_msg)) {
1129                 ret = sock_recvmsg(cmd->queue->sock, &cmd->recv_msg,
1130                         cmd->recv_msg.msg_flags);
1131                 if (ret <= 0)
1132                         return ret;
1133
1134                 cmd->pdu_recv += ret;
1135                 cmd->rbytes_done += ret;
1136         }
1137
1138         nvmet_tcp_unmap_pdu_iovec(cmd);
1139         if (queue->data_digest) {
1140                 nvmet_tcp_prep_recv_ddgst(cmd);
1141                 return 0;
1142         }
1143
1144         if (cmd->rbytes_done == cmd->req.transfer_len)
1145                 nvmet_tcp_execute_request(cmd);
1146
1147         nvmet_prepare_receive_pdu(queue);
1148         return 0;
1149 }
1150
1151 static int nvmet_tcp_try_recv_ddgst(struct nvmet_tcp_queue *queue)
1152 {
1153         struct nvmet_tcp_cmd *cmd = queue->cmd;
1154         int ret;
1155         struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
1156         struct kvec iov = {
1157                 .iov_base = (void *)&cmd->recv_ddgst + queue->offset,
1158                 .iov_len = queue->left
1159         };
1160
1161         ret = kernel_recvmsg(queue->sock, &msg, &iov, 1,
1162                         iov.iov_len, msg.msg_flags);
1163         if (unlikely(ret < 0))
1164                 return ret;
1165
1166         queue->offset += ret;
1167         queue->left -= ret;
1168         if (queue->left)
1169                 return -EAGAIN;
1170
1171         if (queue->data_digest && cmd->exp_ddgst != cmd->recv_ddgst) {
1172                 pr_err("queue %d: cmd %d pdu (%d) data digest error: recv %#x expected %#x\n",
1173                         queue->idx, cmd->req.cmd->common.command_id,
1174                         queue->pdu.cmd.hdr.type, le32_to_cpu(cmd->recv_ddgst),
1175                         le32_to_cpu(cmd->exp_ddgst));
1176                 nvmet_tcp_finish_cmd(cmd);
1177                 nvmet_tcp_fatal_error(queue);
1178                 ret = -EPROTO;
1179                 goto out;
1180         }
1181
1182         if (cmd->rbytes_done == cmd->req.transfer_len)
1183                 nvmet_tcp_execute_request(cmd);
1184
1185         ret = 0;
1186 out:
1187         nvmet_prepare_receive_pdu(queue);
1188         return ret;
1189 }
1190
1191 static int nvmet_tcp_try_recv_one(struct nvmet_tcp_queue *queue)
1192 {
1193         int result = 0;
1194
1195         if (unlikely(queue->rcv_state == NVMET_TCP_RECV_ERR))
1196                 return 0;
1197
1198         if (queue->rcv_state == NVMET_TCP_RECV_PDU) {
1199                 result = nvmet_tcp_try_recv_pdu(queue);
1200                 if (result != 0)
1201                         goto done_recv;
1202         }
1203
1204         if (queue->rcv_state == NVMET_TCP_RECV_DATA) {
1205                 result = nvmet_tcp_try_recv_data(queue);
1206                 if (result != 0)
1207                         goto done_recv;
1208         }
1209
1210         if (queue->rcv_state == NVMET_TCP_RECV_DDGST) {
1211                 result = nvmet_tcp_try_recv_ddgst(queue);
1212                 if (result != 0)
1213                         goto done_recv;
1214         }
1215
1216 done_recv:
1217         if (result < 0) {
1218                 if (result == -EAGAIN)
1219                         return 0;
1220                 return result;
1221         }
1222         return 1;
1223 }
1224
1225 static int nvmet_tcp_try_recv(struct nvmet_tcp_queue *queue,
1226                 int budget, int *recvs)
1227 {
1228         int i, ret = 0;
1229
1230         for (i = 0; i < budget; i++) {
1231                 ret = nvmet_tcp_try_recv_one(queue);
1232                 if (unlikely(ret < 0)) {
1233                         nvmet_tcp_socket_error(queue, ret);
1234                         goto done;
1235                 } else if (ret == 0) {
1236                         break;
1237                 }
1238                 (*recvs)++;
1239         }
1240 done:
1241         return ret;
1242 }
1243
1244 static void nvmet_tcp_schedule_release_queue(struct nvmet_tcp_queue *queue)
1245 {
1246         spin_lock(&queue->state_lock);
1247         if (queue->state != NVMET_TCP_Q_DISCONNECTING) {
1248                 queue->state = NVMET_TCP_Q_DISCONNECTING;
1249                 schedule_work(&queue->release_work);
1250         }
1251         spin_unlock(&queue->state_lock);
1252 }
1253
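/*
 * Main per-queue worker: alternate bounded receive and send passes and keep
 * looping while either side makes progress, up to NVMET_TCP_IO_WORK_BUDGET
 * operations.  If the budget runs out with work still pending, requeue
 * ourselves on the same CPU instead of monopolizing the workqueue.
 */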
1254 static void nvmet_tcp_io_work(struct work_struct *w)
1255 {
1256         struct nvmet_tcp_queue *queue =
1257                 container_of(w, struct nvmet_tcp_queue, io_work);
1258         bool pending;
1259         int ret, ops = 0;
1260
1261         do {
1262                 pending = false;
1263
1264                 ret = nvmet_tcp_try_recv(queue, NVMET_TCP_RECV_BUDGET, &ops);
1265                 if (ret > 0)
1266                         pending = true;
1267                 else if (ret < 0)
1268                         return;
1269
1270                 ret = nvmet_tcp_try_send(queue, NVMET_TCP_SEND_BUDGET, &ops);
1271                 if (ret > 0)
1272                         pending = true;
1273                 else if (ret < 0)
1274                         return;
1275
1276         } while (pending && ops < NVMET_TCP_IO_WORK_BUDGET);
1277
1278         /*
1279          * We exhausted our budget; requeue ourselves.
1280          */
1281         if (pending)
1282                 queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &queue->io_work);
1283 }
1284
1285 static int nvmet_tcp_alloc_cmd(struct nvmet_tcp_queue *queue,
1286                 struct nvmet_tcp_cmd *c)
1287 {
1288         u8 hdgst = nvmet_tcp_hdgst_len(queue);
1289
1290         c->queue = queue;
1291         c->req.port = queue->port->nport;
1292
1293         c->cmd_pdu = page_frag_alloc(&queue->pf_cache,
1294                         sizeof(*c->cmd_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
1295         if (!c->cmd_pdu)
1296                 return -ENOMEM;
1297         c->req.cmd = &c->cmd_pdu->cmd;
1298
1299         c->rsp_pdu = page_frag_alloc(&queue->pf_cache,
1300                         sizeof(*c->rsp_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
1301         if (!c->rsp_pdu)
1302                 goto out_free_cmd;
1303         c->req.cqe = &c->rsp_pdu->cqe;
1304
1305         c->data_pdu = page_frag_alloc(&queue->pf_cache,
1306                         sizeof(*c->data_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
1307         if (!c->data_pdu)
1308                 goto out_free_rsp;
1309
1310         c->r2t_pdu = page_frag_alloc(&queue->pf_cache,
1311                         sizeof(*c->r2t_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
1312         if (!c->r2t_pdu)
1313                 goto out_free_data;
1314
1315         c->recv_msg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;
1316
1317         list_add_tail(&c->entry, &queue->free_list);
1318
1319         return 0;
1320 out_free_data:
1321         page_frag_free(c->data_pdu);
1322 out_free_rsp:
1323         page_frag_free(c->rsp_pdu);
1324 out_free_cmd:
1325         page_frag_free(c->cmd_pdu);
1326         return -ENOMEM;
1327 }
1328
1329 static void nvmet_tcp_free_cmd(struct nvmet_tcp_cmd *c)
1330 {
1331         page_frag_free(c->r2t_pdu);
1332         page_frag_free(c->data_pdu);
1333         page_frag_free(c->rsp_pdu);
1334         page_frag_free(c->cmd_pdu);
1335 }
1336
1337 static int nvmet_tcp_alloc_cmds(struct nvmet_tcp_queue *queue)
1338 {
1339         struct nvmet_tcp_cmd *cmds;
1340         int i, ret = -EINVAL, nr_cmds = queue->nr_cmds;
1341
1342         cmds = kcalloc(nr_cmds, sizeof(struct nvmet_tcp_cmd), GFP_KERNEL);
1343         if (!cmds)
1344                 goto out;
1345
1346         for (i = 0; i < nr_cmds; i++) {
1347                 ret = nvmet_tcp_alloc_cmd(queue, cmds + i);
1348                 if (ret)
1349                         goto out_free;
1350         }
1351
1352         queue->cmds = cmds;
1353
1354         return 0;
1355 out_free:
1356         while (--i >= 0)
1357                 nvmet_tcp_free_cmd(cmds + i);
1358         kfree(cmds);
1359 out:
1360         return ret;
1361 }
1362
1363 static void nvmet_tcp_free_cmds(struct nvmet_tcp_queue *queue)
1364 {
1365         struct nvmet_tcp_cmd *cmds = queue->cmds;
1366         int i;
1367
1368         for (i = 0; i < queue->nr_cmds; i++)
1369                 nvmet_tcp_free_cmd(cmds + i);
1370
1371         nvmet_tcp_free_cmd(&queue->connect);
1372         kfree(cmds);
1373 }
1374
1375 static void nvmet_tcp_restore_socket_callbacks(struct nvmet_tcp_queue *queue)
1376 {
1377         struct socket *sock = queue->sock;
1378
1379         write_lock_bh(&sock->sk->sk_callback_lock);
1380         sock->sk->sk_data_ready =  queue->data_ready;
1381         sock->sk->sk_state_change = queue->state_change;
1382         sock->sk->sk_write_space = queue->write_space;
1383         sock->sk->sk_user_data = NULL;
1384         write_unlock_bh(&sock->sk->sk_callback_lock);
1385 }
1386
1387 static void nvmet_tcp_finish_cmd(struct nvmet_tcp_cmd *cmd)
1388 {
1389         nvmet_req_uninit(&cmd->req);
1390         nvmet_tcp_unmap_pdu_iovec(cmd);
1391         kfree(cmd->iov);
1392         sgl_free(cmd->req.sg);
1393 }
1394
1395 static void nvmet_tcp_uninit_data_in_cmds(struct nvmet_tcp_queue *queue)
1396 {
1397         struct nvmet_tcp_cmd *cmd = queue->cmds;
1398         int i;
1399
1400         for (i = 0; i < queue->nr_cmds; i++, cmd++) {
1401                 if (nvmet_tcp_need_data_in(cmd))
1402                         nvmet_tcp_finish_cmd(cmd);
1403         }
1404
1405         if (!queue->nr_cmds && nvmet_tcp_need_data_in(&queue->connect)) {
1406                 /* failed in connect */
1407                 nvmet_tcp_finish_cmd(&queue->connect);
1408         }
1409 }
1410
1411 static void nvmet_tcp_release_queue_work(struct work_struct *w)
1412 {
1413         struct page *page;
1414         struct nvmet_tcp_queue *queue =
1415                 container_of(w, struct nvmet_tcp_queue, release_work);
1416
1417         mutex_lock(&nvmet_tcp_queue_mutex);
1418         list_del_init(&queue->queue_list);
1419         mutex_unlock(&nvmet_tcp_queue_mutex);
1420
1421         nvmet_tcp_restore_socket_callbacks(queue);
1422         flush_work(&queue->io_work);
1423
1424         nvmet_tcp_uninit_data_in_cmds(queue);
1425         nvmet_sq_destroy(&queue->nvme_sq);
1426         cancel_work_sync(&queue->io_work);
1427         sock_release(queue->sock);
1428         nvmet_tcp_free_cmds(queue);
1429         if (queue->hdr_digest || queue->data_digest)
1430                 nvmet_tcp_free_crypto(queue);
1431         ida_simple_remove(&nvmet_tcp_queue_ida, queue->idx);
1432
1433         page = virt_to_head_page(queue->pf_cache.va);
1434         __page_frag_cache_drain(page, queue->pf_cache.pagecnt_bias);
1435         kfree(queue);
1436 }
1437
1438 static void nvmet_tcp_data_ready(struct sock *sk)
1439 {
1440         struct nvmet_tcp_queue *queue;
1441
1442         read_lock_bh(&sk->sk_callback_lock);
1443         queue = sk->sk_user_data;
1444         if (likely(queue))
1445                 queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &queue->io_work);
1446         read_unlock_bh(&sk->sk_callback_lock);
1447 }
1448
1449 static void nvmet_tcp_write_space(struct sock *sk)
1450 {
1451         struct nvmet_tcp_queue *queue;
1452
1453         read_lock_bh(&sk->sk_callback_lock);
1454         queue = sk->sk_user_data;
1455         if (unlikely(!queue))
1456                 goto out;
1457
1458         if (unlikely(queue->state == NVMET_TCP_Q_CONNECTING)) {
1459                 queue->write_space(sk);
1460                 goto out;
1461         }
1462
1463         if (sk_stream_is_writeable(sk)) {
1464                 clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1465                 queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &queue->io_work);
1466         }
1467 out:
1468         read_unlock_bh(&sk->sk_callback_lock);
1469 }
1470
1471 static void nvmet_tcp_state_change(struct sock *sk)
1472 {
1473         struct nvmet_tcp_queue *queue;
1474
1475         read_lock_bh(&sk->sk_callback_lock);
1476         queue = sk->sk_user_data;
1477         if (!queue)
1478                 goto done;
1479
1480         switch (sk->sk_state) {
1481         case TCP_FIN_WAIT2:
1482         case TCP_LAST_ACK:
1483                 break;
1484         case TCP_FIN_WAIT1:
1485         case TCP_CLOSE_WAIT:
1486         case TCP_CLOSE:
1487                 /* FALLTHRU */
1488                 nvmet_tcp_schedule_release_queue(queue);
1489                 break;
1490         default:
1491                 pr_warn("queue %d unhandled state %d\n",
1492                         queue->idx, sk->sk_state);
1493         }
1494 done:
1495         read_unlock_bh(&sk->sk_callback_lock);
1496 }
1497
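/*
 * Final socket setup for an accepted queue: record the local and peer
 * addresses, disable lingering, apply so_priority and the connection's TOS,
 * and install our sk callbacks under sk_callback_lock, bailing out with
 * -ENOTCONN if the socket has already left ESTABLISHED.
 */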
1498 static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue)
1499 {
1500         struct socket *sock = queue->sock;
1501         struct inet_sock *inet = inet_sk(sock->sk);
1502         int ret;
1503
1504         ret = kernel_getsockname(sock,
1505                 (struct sockaddr *)&queue->sockaddr);
1506         if (ret < 0)
1507                 return ret;
1508
1509         ret = kernel_getpeername(sock,
1510                 (struct sockaddr *)&queue->sockaddr_peer);
1511         if (ret < 0)
1512                 return ret;
1513
1514         /*
1515          * Cleanup whatever is sitting in the TCP transmit queue on socket
1516          * close. This is done to prevent stale data from being sent should
1517          * the network connection be restored before TCP times out.
1518          */
1519         sock_no_linger(sock->sk);
1520
1521         if (so_priority > 0)
1522                 sock_set_priority(sock->sk, so_priority);
1523
1524         /* Set socket type of service */
1525         if (inet->rcv_tos > 0)
1526                 ip_sock_set_tos(sock->sk, inet->rcv_tos);
1527
1528         ret = 0;
1529         write_lock_bh(&sock->sk->sk_callback_lock);
1530         if (sock->sk->sk_state != TCP_ESTABLISHED) {
1531                 /*
1532                  * If the socket is already closing, don't even start
1533                  * consuming it
1534                  */
1535                 ret = -ENOTCONN;
1536         } else {
1537                 sock->sk->sk_user_data = queue;
1538                 queue->data_ready = sock->sk->sk_data_ready;
1539                 sock->sk->sk_data_ready = nvmet_tcp_data_ready;
1540                 queue->state_change = sock->sk->sk_state_change;
1541                 sock->sk->sk_state_change = nvmet_tcp_state_change;
1542                 queue->write_space = sock->sk->sk_write_space;
1543                 sock->sk->sk_write_space = nvmet_tcp_write_space;
1544                 queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &queue->io_work);
1545         }
1546         write_unlock_bh(&sock->sk->sk_callback_lock);
1547
1548         return ret;
1549 }
1550
1551 static int nvmet_tcp_alloc_queue(struct nvmet_tcp_port *port,
1552                 struct socket *newsock)
1553 {
1554         struct nvmet_tcp_queue *queue;
1555         int ret;
1556
1557         queue = kzalloc(sizeof(*queue), GFP_KERNEL);
1558         if (!queue)
1559                 return -ENOMEM;
1560
1561         INIT_WORK(&queue->release_work, nvmet_tcp_release_queue_work);
1562         INIT_WORK(&queue->io_work, nvmet_tcp_io_work);
1563         queue->sock = newsock;
1564         queue->port = port;
1565         queue->nr_cmds = 0;
1566         spin_lock_init(&queue->state_lock);
1567         queue->state = NVMET_TCP_Q_CONNECTING;
1568         INIT_LIST_HEAD(&queue->free_list);
1569         init_llist_head(&queue->resp_list);
1570         INIT_LIST_HEAD(&queue->resp_send_list);
1571
1572         queue->idx = ida_simple_get(&nvmet_tcp_queue_ida, 0, 0, GFP_KERNEL);
1573         if (queue->idx < 0) {
1574                 ret = queue->idx;
1575                 goto out_free_queue;
1576         }
1577
1578         ret = nvmet_tcp_alloc_cmd(queue, &queue->connect);
1579         if (ret)
1580                 goto out_ida_remove;
1581
1582         ret = nvmet_sq_init(&queue->nvme_sq);
1583         if (ret)
1584                 goto out_free_connect;
1585
1586         nvmet_prepare_receive_pdu(queue);
1587
1588         mutex_lock(&nvmet_tcp_queue_mutex);
1589         list_add_tail(&queue->queue_list, &nvmet_tcp_queue_list);
1590         mutex_unlock(&nvmet_tcp_queue_mutex);
1591
1592         ret = nvmet_tcp_set_queue_sock(queue);
1593         if (ret)
1594                 goto out_destroy_sq;
1595
1596         return 0;
1597 out_destroy_sq:
1598         mutex_lock(&nvmet_tcp_queue_mutex);
1599         list_del_init(&queue->queue_list);
1600         mutex_unlock(&nvmet_tcp_queue_mutex);
1601         nvmet_sq_destroy(&queue->nvme_sq);
1602 out_free_connect:
1603         nvmet_tcp_free_cmd(&queue->connect);
1604 out_ida_remove:
1605         ida_simple_remove(&nvmet_tcp_queue_ida, queue->idx);
1606 out_free_queue:
1607         kfree(queue);
1608         return ret;
1609 }
1610
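/*
 * Drain the accept backlog of the listening socket.  Scheduled from the
 * listen socket's data_ready callback; loops until kernel_accept()
 * returns -EAGAIN and allocates a queue for every new connection.
 */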
1611 static void nvmet_tcp_accept_work(struct work_struct *w)
1612 {
1613         struct nvmet_tcp_port *port =
1614                 container_of(w, struct nvmet_tcp_port, accept_work);
1615         struct socket *newsock;
1616         int ret;
1617
1618         while (true) {
1619                 ret = kernel_accept(port->sock, &newsock, O_NONBLOCK);
1620                 if (ret < 0) {
1621                         if (ret != -EAGAIN)
1622                                 pr_warn("failed to accept err=%d\n", ret);
1623                         return;
1624                 }
1625                 ret = nvmet_tcp_alloc_queue(port, newsock);
1626                 if (ret) {
1627                         pr_err("failed to allocate queue\n");
1628                         sock_release(newsock);
1629                 }
1630         }
1631 }
1632
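/*
 * data_ready callback of the listening socket: schedule the accept work
 * as long as the socket is still in the LISTEN state.
 */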
1633 static void nvmet_tcp_listen_data_ready(struct sock *sk)
1634 {
1635         struct nvmet_tcp_port *port;
1636
1637         read_lock_bh(&sk->sk_callback_lock);
1638         port = sk->sk_user_data;
1639         if (!port)
1640                 goto out;
1641
1642         if (sk->sk_state == TCP_LISTEN)
1643                 schedule_work(&port->accept_work);
1644 out:
1645         read_unlock_bh(&sk->sk_callback_lock);
1646 }
1647
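/*
 * Bring up a listening socket for an nvmet port: translate the discovery
 * address into a sockaddr, create the TCP socket, install the listen
 * data_ready callback, then bind and listen with a backlog of 128.
 */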
1648 static int nvmet_tcp_add_port(struct nvmet_port *nport)
1649 {
1650         struct nvmet_tcp_port *port;
1651         __kernel_sa_family_t af;
1652         int ret;
1653
1654         port = kzalloc(sizeof(*port), GFP_KERNEL);
1655         if (!port)
1656                 return -ENOMEM;
1657
1658         switch (nport->disc_addr.adrfam) {
1659         case NVMF_ADDR_FAMILY_IP4:
1660                 af = AF_INET;
1661                 break;
1662         case NVMF_ADDR_FAMILY_IP6:
1663                 af = AF_INET6;
1664                 break;
1665         default:
1666                 pr_err("address family %d not supported\n",
1667                                 nport->disc_addr.adrfam);
1668                 ret = -EINVAL;
1669                 goto err_port;
1670         }
1671
1672         ret = inet_pton_with_scope(&init_net, af, nport->disc_addr.traddr,
1673                         nport->disc_addr.trsvcid, &port->addr);
1674         if (ret) {
1675                 pr_err("malformed ip/port passed: %s:%s\n",
1676                         nport->disc_addr.traddr, nport->disc_addr.trsvcid);
1677                 goto err_port;
1678         }
1679
1680         port->nport = nport;
1681         INIT_WORK(&port->accept_work, nvmet_tcp_accept_work);
1682         if (port->nport->inline_data_size < 0)
1683                 port->nport->inline_data_size = NVMET_TCP_DEF_INLINE_DATA_SIZE;
1684
1685         ret = sock_create(port->addr.ss_family, SOCK_STREAM,
1686                                 IPPROTO_TCP, &port->sock);
1687         if (ret) {
1688                 pr_err("failed to create a socket\n");
1689                 goto err_port;
1690         }
1691
1692         port->sock->sk->sk_user_data = port;
1693         port->data_ready = port->sock->sk->sk_data_ready;
1694         port->sock->sk->sk_data_ready = nvmet_tcp_listen_data_ready;
1695         sock_set_reuseaddr(port->sock->sk);
1696         tcp_sock_set_nodelay(port->sock->sk);
1697         if (so_priority > 0)
1698                 sock_set_priority(port->sock->sk, so_priority);
1699
1700         ret = kernel_bind(port->sock, (struct sockaddr *)&port->addr,
1701                         sizeof(port->addr));
1702         if (ret) {
1703                 pr_err("failed to bind port socket, err=%d\n", ret);
1704                 goto err_sock;
1705         }
1706
1707         ret = kernel_listen(port->sock, 128);
1708         if (ret) {
1709                 pr_err("failed to listen on port socket, err=%d\n", ret);
1710                 goto err_sock;
1711         }
1712
1713         nport->priv = port;
1714         pr_info("enabling port %d (%pISpc)\n",
1715                 le16_to_cpu(nport->disc_addr.portid), &port->addr);
1716
1717         return 0;
1718
1719 err_sock:
1720         sock_release(port->sock);
1721 err_port:
1722         kfree(port);
1723         return ret;
1724 }
1725
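/*
 * Shut down every queue that is still attached to this port so that its
 * teardown path runs.
 */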
1726 static void nvmet_tcp_destroy_port_queues(struct nvmet_tcp_port *port)
1727 {
1728         struct nvmet_tcp_queue *queue;
1729
1730         mutex_lock(&nvmet_tcp_queue_mutex);
1731         list_for_each_entry(queue, &nvmet_tcp_queue_list, queue_list)
1732                 if (queue->port == port)
1733                         kernel_sock_shutdown(queue->sock, SHUT_RDWR);
1734         mutex_unlock(&nvmet_tcp_queue_mutex);
1735 }
1736
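/*
 * Tear down a listening port: restore the listen socket's data_ready
 * callback, cancel pending accept work, then destroy the remaining
 * queues and release the socket.
 */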
1737 static void nvmet_tcp_remove_port(struct nvmet_port *nport)
1738 {
1739         struct nvmet_tcp_port *port = nport->priv;
1740
1741         write_lock_bh(&port->sock->sk->sk_callback_lock);
1742         port->sock->sk->sk_data_ready = port->data_ready;
1743         port->sock->sk->sk_user_data = NULL;
1744         write_unlock_bh(&port->sock->sk->sk_callback_lock);
1745         cancel_work_sync(&port->accept_work);
1746         /*
1747          * Destroy the remaining queues, which do not belong to any
1748          * controller yet.
1749          */
1750         nvmet_tcp_destroy_port_queues(port);
1751
1752         sock_release(port->sock);
1753         kfree(port);
1754 }
1755
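/*
 * Shut down the sockets of all queues owned by the controller being
 * deleted so that each queue's teardown runs.
 */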
1756 static void nvmet_tcp_delete_ctrl(struct nvmet_ctrl *ctrl)
1757 {
1758         struct nvmet_tcp_queue *queue;
1759
1760         mutex_lock(&nvmet_tcp_queue_mutex);
1761         list_for_each_entry(queue, &nvmet_tcp_queue_list, queue_list)
1762                 if (queue->nvme_sq.ctrl == ctrl)
1763                         kernel_sock_shutdown(queue->sock, SHUT_RDWR);
1764         mutex_unlock(&nvmet_tcp_queue_mutex);
1765 }
1766
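/*
 * Called when a queue is bound to a controller.  For the admin queue,
 * wait for in-flight controller teardown to finish first, then size the
 * command array to twice the submission queue depth and allocate it.
 */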
1767 static u16 nvmet_tcp_install_queue(struct nvmet_sq *sq)
1768 {
1769         struct nvmet_tcp_queue *queue =
1770                 container_of(sq, struct nvmet_tcp_queue, nvme_sq);
1771
1772         if (sq->qid == 0) {
1773                 /* Let inflight controller teardown complete */
1774                 flush_scheduled_work();
1775         }
1776
1777         queue->nr_cmds = sq->size * 2;
1778         if (nvmet_tcp_alloc_cmds(queue))
1779                 return NVME_SC_INTERNAL;
1780         return 0;
1781 }
1782
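/*
 * Report the transport address for the discovery log page.  If the port
 * listens on a wildcard address, report the local address of the
 * connection the request arrived on instead of the configured traddr.
 */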
1783 static void nvmet_tcp_disc_port_addr(struct nvmet_req *req,
1784                 struct nvmet_port *nport, char *traddr)
1785 {
1786         struct nvmet_tcp_port *port = nport->priv;
1787
1788         if (inet_addr_is_any((struct sockaddr *)&port->addr)) {
1789                 struct nvmet_tcp_cmd *cmd =
1790                         container_of(req, struct nvmet_tcp_cmd, req);
1791                 struct nvmet_tcp_queue *queue = cmd->queue;
1792
1793                 sprintf(traddr, "%pISc", (struct sockaddr *)&queue->sockaddr);
1794         } else {
1795                 memcpy(traddr, nport->disc_addr.traddr, NVMF_TRADDR_SIZE);
1796         }
1797 }
1798
1799 static const struct nvmet_fabrics_ops nvmet_tcp_ops = {
1800         .owner                  = THIS_MODULE,
1801         .type                   = NVMF_TRTYPE_TCP,
1802         .msdbd                  = 1,
1803         .add_port               = nvmet_tcp_add_port,
1804         .remove_port            = nvmet_tcp_remove_port,
1805         .queue_response         = nvmet_tcp_queue_response,
1806         .delete_ctrl            = nvmet_tcp_delete_ctrl,
1807         .install_queue          = nvmet_tcp_install_queue,
1808         .disc_traddr            = nvmet_tcp_disc_port_addr,
1809 };
1810
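/*
 * Module init: create the high priority workqueue that runs per-queue
 * io_work and register the TCP transport with the nvmet core.
 */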
1811 static int __init nvmet_tcp_init(void)
1812 {
1813         int ret;
1814
1815         nvmet_tcp_wq = alloc_workqueue("nvmet_tcp_wq",
1816                                 WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
1817         if (!nvmet_tcp_wq)
1818                 return -ENOMEM;
1819
1820         ret = nvmet_register_transport(&nvmet_tcp_ops);
1821         if (ret)
1822                 goto err;
1823
1824         return 0;
1825 err:
1826         destroy_workqueue(nvmet_tcp_wq);
1827         return ret;
1828 }
1829
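/*
 * Module exit: unregister the transport, let scheduled teardown work
 * finish, shut down any queues that are still alive so their release
 * work runs, flush again, and finally destroy the io_work workqueue.
 */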
1830 static void __exit nvmet_tcp_exit(void)
1831 {
1832         struct nvmet_tcp_queue *queue;
1833
1834         nvmet_unregister_transport(&nvmet_tcp_ops);
1835
1836         flush_scheduled_work();
1837         mutex_lock(&nvmet_tcp_queue_mutex);
1838         list_for_each_entry(queue, &nvmet_tcp_queue_list, queue_list)
1839                 kernel_sock_shutdown(queue->sock, SHUT_RDWR);
1840         mutex_unlock(&nvmet_tcp_queue_mutex);
1841         flush_scheduled_work();
1842
1843         destroy_workqueue(nvmet_tcp_wq);
1844 }
1845
1846 module_init(nvmet_tcp_init);
1847 module_exit(nvmet_tcp_exit);
1848
1849 MODULE_LICENSE("GPL v2");
1850 MODULE_ALIAS("nvmet-transport-3"); /* 3 == NVMF_TRTYPE_TCP */