GNU Linux-libre 4.14.313-gnu1
[releases.git] / net / smc / af_smc.c
1 /*
2  *  Shared Memory Communications over RDMA (SMC-R) and RoCE
3  *
4  *  AF_SMC protocol family socket handler keeping the AF_INET sock address type
5  *  applies to SOCK_STREAM sockets only
6  *  offers an alternative communication option for TCP-protocol sockets
7  *  applicable with RoCE-cards only
8  *
9  *  Initial restrictions:
10  *    - non-blocking connect postponed
11  *    - IPv6 support postponed
12  *    - support for alternate links postponed
13  *    - partial support for non-blocking sockets only
14  *    - support for urgent data postponed
15  *
16  *  Copyright IBM Corp. 2016
17  *
18  *  Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
19  *              based on prototype from Frank Blaschka
20  */
21
22 #define KMSG_COMPONENT "smc"
23 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
24
25 #include <linux/module.h>
26 #include <linux/socket.h>
27 #include <linux/inetdevice.h>
28 #include <linux/workqueue.h>
29 #include <linux/in.h>
30 #include <linux/sched/signal.h>
31
32 #include <net/sock.h>
33 #include <net/tcp.h>
34 #include <net/smc.h>
35
36 #include "smc.h"
37 #include "smc_clc.h"
38 #include "smc_llc.h"
39 #include "smc_cdc.h"
40 #include "smc_core.h"
41 #include "smc_ib.h"
42 #include "smc_pnet.h"
43 #include "smc_tx.h"
44 #include "smc_rx.h"
45 #include "smc_close.h"
46
47 static DEFINE_MUTEX(smc_create_lgr_pending);    /* serialize link group
48                                                  * creation
49                                                  */
50
51 struct smc_lgr_list smc_lgr_list = {            /* established link groups */
52         .lock = __SPIN_LOCK_UNLOCKED(smc_lgr_list.lock),
53         .list = LIST_HEAD_INIT(smc_lgr_list.list),
54 };
55
56 static void smc_tcp_listen_work(struct work_struct *);
57
58 static void smc_set_keepalive(struct sock *sk, int val)
59 {
60         struct smc_sock *smc = smc_sk(sk);
61
62         smc->clcsock->sk->sk_prot->keepalive(smc->clcsock->sk, val);
63 }
64
65 static struct smc_hashinfo smc_v4_hashinfo = {
66         .lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock),
67 };
68
69 int smc_hash_sk(struct sock *sk)
70 {
71         struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
72         struct hlist_head *head;
73
74         head = &h->ht;
75
76         write_lock_bh(&h->lock);
77         sk_add_node(sk, head);
78         sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
79         write_unlock_bh(&h->lock);
80
81         return 0;
82 }
83 EXPORT_SYMBOL_GPL(smc_hash_sk);
84
85 void smc_unhash_sk(struct sock *sk)
86 {
87         struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
88
89         write_lock_bh(&h->lock);
90         if (sk_del_node_init(sk))
91                 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
92         write_unlock_bh(&h->lock);
93 }
94 EXPORT_SYMBOL_GPL(smc_unhash_sk);
95
96 struct proto smc_proto = {
97         .name           = "SMC",
98         .owner          = THIS_MODULE,
99         .keepalive      = smc_set_keepalive,
100         .hash           = smc_hash_sk,
101         .unhash         = smc_unhash_sk,
102         .obj_size       = sizeof(struct smc_sock),
103         .h.smc_hash     = &smc_v4_hashinfo,
104         .slab_flags     = SLAB_TYPESAFE_BY_RCU,
105 };
106 EXPORT_SYMBOL_GPL(smc_proto);
107
108 static int smc_release(struct socket *sock)
109 {
110         struct sock *sk = sock->sk;
111         struct smc_sock *smc;
112         int rc = 0;
113
114         if (!sk)
115                 goto out;
116
117         smc = smc_sk(sk);
118         sock_hold(sk);
119         if (sk->sk_state == SMC_LISTEN)
120                 /* smc_close_non_accepted() is called and acquires
121                  * sock lock for child sockets again
122                  */
123                 lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
124         else
125                 lock_sock(sk);
126
127         if (smc->use_fallback) {
128                 sk->sk_state = SMC_CLOSED;
129                 sk->sk_state_change(sk);
130         } else {
131                 rc = smc_close_active(smc);
132                 sock_set_flag(sk, SOCK_DEAD);
133                 sk->sk_shutdown |= SHUTDOWN_MASK;
134         }
135         if (smc->clcsock) {
136                 if (smc->use_fallback && sk->sk_state == SMC_LISTEN) {
137                         /* wake up clcsock accept */
138                         rc = kernel_sock_shutdown(smc->clcsock, SHUT_RDWR);
139                 }
140                 mutex_lock(&smc->clcsock_release_lock);
141                 sock_release(smc->clcsock);
142                 smc->clcsock = NULL;
143                 mutex_unlock(&smc->clcsock_release_lock);
144         }
145
146         /* detach socket */
147         sock_orphan(sk);
148         sock->sk = NULL;
149         if (smc->use_fallback) {
150                 schedule_delayed_work(&smc->sock_put_work, TCP_TIMEWAIT_LEN);
151         } else if (sk->sk_state == SMC_CLOSED) {
152                 smc_conn_free(&smc->conn);
153                 schedule_delayed_work(&smc->sock_put_work,
154                                       SMC_CLOSE_SOCK_PUT_DELAY);
155         }
156         release_sock(sk);
157
158         sock_put(sk);
159 out:
160         return rc;
161 }
162
163 static void smc_destruct(struct sock *sk)
164 {
165         if (sk->sk_state != SMC_CLOSED)
166                 return;
167         if (!sock_flag(sk, SOCK_DEAD))
168                 return;
169
170         sk_refcnt_debug_dec(sk);
171 }
172
173 static struct sock *smc_sock_alloc(struct net *net, struct socket *sock)
174 {
175         struct smc_sock *smc;
176         struct sock *sk;
177
178         sk = sk_alloc(net, PF_SMC, GFP_KERNEL, &smc_proto, 0);
179         if (!sk)
180                 return NULL;
181
182         sock_init_data(sock, sk); /* sets sk_refcnt to 1 */
183         sk->sk_state = SMC_INIT;
184         sk->sk_destruct = smc_destruct;
185         sk->sk_protocol = SMCPROTO_SMC;
186         smc = smc_sk(sk);
187         INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
188         INIT_LIST_HEAD(&smc->accept_q);
189         spin_lock_init(&smc->accept_q_lock);
190         INIT_DELAYED_WORK(&smc->sock_put_work, smc_close_sock_put_work);
191         sk->sk_prot->hash(sk);
192         sk_refcnt_debug_inc(sk);
193         mutex_init(&smc->clcsock_release_lock);
194
195         return sk;
196 }
197
198 static int smc_bind(struct socket *sock, struct sockaddr *uaddr,
199                     int addr_len)
200 {
201         struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
202         struct sock *sk = sock->sk;
203         struct smc_sock *smc;
204         int rc;
205
206         smc = smc_sk(sk);
207
208         /* replicate tests from inet_bind(), to be safe wrt. future changes */
209         rc = -EINVAL;
210         if (addr_len < sizeof(struct sockaddr_in))
211                 goto out;
212
213         rc = -EAFNOSUPPORT;
214         /* accept AF_UNSPEC (mapped to AF_INET) only if s_addr is INADDR_ANY */
215         if ((addr->sin_family != AF_INET) &&
216             ((addr->sin_family != AF_UNSPEC) ||
217              (addr->sin_addr.s_addr != htonl(INADDR_ANY))))
218                 goto out;
219
220         lock_sock(sk);
221
222         /* Check if socket is already active */
223         rc = -EINVAL;
224         if (sk->sk_state != SMC_INIT)
225                 goto out_rel;
226
227         smc->clcsock->sk->sk_reuse = sk->sk_reuse;
228         rc = kernel_bind(smc->clcsock, uaddr, addr_len);
229
230 out_rel:
231         release_sock(sk);
232 out:
233         return rc;
234 }
235
236 static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk,
237                                    unsigned long mask)
238 {
239         /* options we don't get control via setsockopt for */
240         nsk->sk_type = osk->sk_type;
241         nsk->sk_sndbuf = osk->sk_sndbuf;
242         nsk->sk_rcvbuf = osk->sk_rcvbuf;
243         nsk->sk_sndtimeo = osk->sk_sndtimeo;
244         nsk->sk_rcvtimeo = osk->sk_rcvtimeo;
245         nsk->sk_mark = osk->sk_mark;
246         nsk->sk_priority = osk->sk_priority;
247         nsk->sk_rcvlowat = osk->sk_rcvlowat;
248         nsk->sk_bound_dev_if = osk->sk_bound_dev_if;
249         nsk->sk_err = osk->sk_err;
250
251         nsk->sk_flags &= ~mask;
252         nsk->sk_flags |= osk->sk_flags & mask;
253 }
254
255 #define SK_FLAGS_SMC_TO_CLC ((1UL << SOCK_URGINLINE) | \
256                              (1UL << SOCK_KEEPOPEN) | \
257                              (1UL << SOCK_LINGER) | \
258                              (1UL << SOCK_BROADCAST) | \
259                              (1UL << SOCK_TIMESTAMP) | \
260                              (1UL << SOCK_DBG) | \
261                              (1UL << SOCK_RCVTSTAMP) | \
262                              (1UL << SOCK_RCVTSTAMPNS) | \
263                              (1UL << SOCK_LOCALROUTE) | \
264                              (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \
265                              (1UL << SOCK_RXQ_OVFL) | \
266                              (1UL << SOCK_WIFI_STATUS) | \
267                              (1UL << SOCK_NOFCS) | \
268                              (1UL << SOCK_FILTER_LOCKED))
269 /* copy only relevant settings and flags of SOL_SOCKET level from smc to
270  * clc socket (since smc is not called for these options from net/core)
271  */
272 static void smc_copy_sock_settings_to_clc(struct smc_sock *smc)
273 {
274         smc_copy_sock_settings(smc->clcsock->sk, &smc->sk, SK_FLAGS_SMC_TO_CLC);
275 }
276
277 #define SK_FLAGS_CLC_TO_SMC ((1UL << SOCK_URGINLINE) | \
278                              (1UL << SOCK_KEEPOPEN) | \
279                              (1UL << SOCK_LINGER) | \
280                              (1UL << SOCK_DBG))
281 /* copy only settings and flags relevant for smc from clc to smc socket */
282 static void smc_copy_sock_settings_to_smc(struct smc_sock *smc)
283 {
284         smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC);
285 }
286
287 /* determine subnet and mask of internal TCP socket */
288 int smc_netinfo_by_tcpsk(struct socket *clcsock,
289                          __be32 *subnet, u8 *prefix_len)
290 {
291         struct dst_entry *dst = sk_dst_get(clcsock->sk);
292         struct in_device *in_dev;
293         struct sockaddr_in addr;
294         int rc = -ENOENT;
295         int len;
296
297         if (!dst) {
298                 rc = -ENOTCONN;
299                 goto out;
300         }
301         if (!dst->dev) {
302                 rc = -ENODEV;
303                 goto out_rel;
304         }
305
306         /* get address to which the internal TCP socket is bound */
307         kernel_getsockname(clcsock, (struct sockaddr *)&addr, &len);
308         /* analyze IPv4 specific data of net_device belonging to TCP socket */
309         rcu_read_lock();
310         in_dev = __in_dev_get_rcu(dst->dev);
311         for_ifa(in_dev) {
312                 if (!inet_ifa_match(addr.sin_addr.s_addr, ifa))
313                         continue;
314                 *prefix_len = inet_mask_len(ifa->ifa_mask);
315                 *subnet = ifa->ifa_address & ifa->ifa_mask;
316                 rc = 0;
317                 break;
318         } endfor_ifa(in_dev);
319         rcu_read_unlock();
320
321 out_rel:
322         dst_release(dst);
323 out:
324         return rc;
325 }
326
327 static int smc_clnt_conf_first_link(struct smc_sock *smc, union ib_gid *gid)
328 {
329         struct smc_link_group *lgr = smc->conn.lgr;
330         struct smc_link *link;
331         int rest;
332         int rc;
333
334         link = &lgr->lnk[SMC_SINGLE_LINK];
335         /* receive CONFIRM LINK request from server over RoCE fabric */
336         rest = wait_for_completion_interruptible_timeout(
337                 &link->llc_confirm,
338                 SMC_LLC_WAIT_FIRST_TIME);
339         if (rest <= 0) {
340                 struct smc_clc_msg_decline dclc;
341
342                 rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
343                                       SMC_CLC_DECLINE);
344                 return rc;
345         }
346
347         rc = smc_ib_modify_qp_rts(link);
348         if (rc)
349                 return SMC_CLC_DECL_INTERR;
350
351         smc_wr_remember_qp_attr(link);
352
353         rc = smc_wr_reg_send(link,
354                              smc->conn.rmb_desc->mr_rx[SMC_SINGLE_LINK]);
355         if (rc)
356                 return SMC_CLC_DECL_INTERR;
357
358         /* send CONFIRM LINK response over RoCE fabric */
359         rc = smc_llc_send_confirm_link(link,
360                                        link->smcibdev->mac[link->ibport - 1],
361                                        gid, SMC_LLC_RESP);
362         if (rc < 0)
363                 return SMC_CLC_DECL_TCL;
364
365         return rc;
366 }
367
368 static void smc_conn_save_peer_info(struct smc_sock *smc,
369                                     struct smc_clc_msg_accept_confirm *clc)
370 {
371         smc->conn.peer_conn_idx = clc->conn_idx;
372         smc->conn.local_tx_ctrl.token = ntohl(clc->rmbe_alert_token);
373         smc->conn.peer_rmbe_size = smc_uncompress_bufsize(clc->rmbe_size);
374         atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
375 }
376
377 static void smc_link_save_peer_info(struct smc_link *link,
378                                     struct smc_clc_msg_accept_confirm *clc)
379 {
380         link->peer_qpn = ntoh24(clc->qpn);
381         memcpy(link->peer_gid, clc->lcl.gid, SMC_GID_SIZE);
382         memcpy(link->peer_mac, clc->lcl.mac, sizeof(link->peer_mac));
383         link->peer_psn = ntoh24(clc->psn);
384         link->peer_mtu = clc->qp_mtu;
385 }
386
387 /* setup for RDMA connection of client */
388 static int smc_connect_rdma(struct smc_sock *smc)
389 {
390         struct sockaddr_in *inaddr = (struct sockaddr_in *)smc->addr;
391         struct smc_clc_msg_accept_confirm aclc;
392         int local_contact = SMC_FIRST_CONTACT;
393         struct smc_ib_device *smcibdev;
394         struct smc_link *link;
395         u8 srv_first_contact;
396         int reason_code = 0;
397         int rc = 0;
398         u8 ibport;
399
400         /* IPSec connections opt out of SMC-R optimizations */
401         if (using_ipsec(smc)) {
402                 reason_code = SMC_CLC_DECL_IPSEC;
403                 goto decline_rdma;
404         }
405
406         /* PNET table look up: search active ib_device and port
407          * within same PNETID that also contains the ethernet device
408          * used for the internal TCP socket
409          */
410         smc_pnet_find_roce_resource(smc->clcsock->sk, &smcibdev, &ibport);
411         if (!smcibdev) {
412                 reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
413                 goto decline_rdma;
414         }
415
416         /* do inband token exchange */
417         reason_code = smc_clc_send_proposal(smc, smcibdev, ibport);
418         if (reason_code < 0) {
419                 rc = reason_code;
420                 goto out_err;
421         }
422         if (reason_code > 0) /* configuration error */
423                 goto decline_rdma;
424         /* receive SMC Accept CLC message */
425         reason_code = smc_clc_wait_msg(smc, &aclc, sizeof(aclc),
426                                        SMC_CLC_ACCEPT);
427         if (reason_code < 0) {
428                 rc = reason_code;
429                 goto out_err;
430         }
431         if (reason_code > 0)
432                 goto decline_rdma;
433
434         srv_first_contact = aclc.hdr.flag;
435         mutex_lock(&smc_create_lgr_pending);
436         local_contact = smc_conn_create(smc, inaddr->sin_addr.s_addr, smcibdev,
437                                         ibport, &aclc.lcl, srv_first_contact);
438         if (local_contact < 0) {
439                 rc = local_contact;
440                 if (rc == -ENOMEM)
441                         reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/
442                 else if (rc == -ENOLINK)
443                         reason_code = SMC_CLC_DECL_SYNCERR; /* synchr. error */
444                 goto decline_rdma_unlock;
445         }
446         link = &smc->conn.lgr->lnk[SMC_SINGLE_LINK];
447
448         smc_conn_save_peer_info(smc, &aclc);
449
450         /* create send buffer and rmb */
451         rc = smc_buf_create(smc);
452         if (rc) {
453                 reason_code = SMC_CLC_DECL_MEM;
454                 goto decline_rdma_unlock;
455         }
456
457         if (local_contact == SMC_FIRST_CONTACT)
458                 smc_link_save_peer_info(link, &aclc);
459
460         rc = smc_rmb_rtoken_handling(&smc->conn, &aclc);
461         if (rc) {
462                 reason_code = SMC_CLC_DECL_INTERR;
463                 goto decline_rdma_unlock;
464         }
465
466         smc_close_init(smc);
467         smc_rx_init(smc);
468
469         if (local_contact == SMC_FIRST_CONTACT) {
470                 rc = smc_ib_ready_link(link);
471                 if (rc) {
472                         reason_code = SMC_CLC_DECL_INTERR;
473                         goto decline_rdma_unlock;
474                 }
475         } else {
476                 struct smc_buf_desc *buf_desc = smc->conn.rmb_desc;
477
478                 if (!buf_desc->reused) {
479                         /* register memory region for new rmb */
480                         rc = smc_wr_reg_send(link,
481                                              buf_desc->mr_rx[SMC_SINGLE_LINK]);
482                         if (rc) {
483                                 reason_code = SMC_CLC_DECL_INTERR;
484                                 goto decline_rdma_unlock;
485                         }
486                 }
487         }
488         smc_rmb_sync_sg_for_device(&smc->conn);
489
490         rc = smc_clc_send_confirm(smc);
491         if (rc)
492                 goto out_err_unlock;
493
494         if (local_contact == SMC_FIRST_CONTACT) {
495                 /* QP confirmation over RoCE fabric */
496                 reason_code = smc_clnt_conf_first_link(
497                         smc, &smcibdev->gid[ibport - 1]);
498                 if (reason_code < 0) {
499                         rc = reason_code;
500                         goto out_err_unlock;
501                 }
502                 if (reason_code > 0)
503                         goto decline_rdma_unlock;
504         }
505
506         mutex_unlock(&smc_create_lgr_pending);
507         smc_tx_init(smc);
508
509 out_connected:
510         smc_copy_sock_settings_to_clc(smc);
511         if (smc->sk.sk_state == SMC_INIT)
512                 smc->sk.sk_state = SMC_ACTIVE;
513
514         return rc ? rc : local_contact;
515
516 decline_rdma_unlock:
517         mutex_unlock(&smc_create_lgr_pending);
518         smc_conn_free(&smc->conn);
519 decline_rdma:
520         /* RDMA setup failed, switch back to TCP */
521         smc->use_fallback = true;
522         if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) {
523                 rc = smc_clc_send_decline(smc, reason_code);
524                 if (rc < sizeof(struct smc_clc_msg_decline))
525                         goto out_err;
526         }
527         goto out_connected;
528
529 out_err_unlock:
530         mutex_unlock(&smc_create_lgr_pending);
531         smc_conn_free(&smc->conn);
532 out_err:
533         return rc;
534 }
535
536 static int smc_connect(struct socket *sock, struct sockaddr *addr,
537                        int alen, int flags)
538 {
539         struct sock *sk = sock->sk;
540         struct smc_sock *smc;
541         int rc = -EINVAL;
542
543         smc = smc_sk(sk);
544
545         /* separate smc parameter checking to be safe */
546         if (alen < sizeof(addr->sa_family))
547                 goto out_err;
548         if (addr->sa_family != AF_INET)
549                 goto out_err;
550         smc->addr = addr;       /* needed for nonblocking connect */
551
552         lock_sock(sk);
553         switch (sk->sk_state) {
554         default:
555                 goto out;
556         case SMC_ACTIVE:
557                 rc = -EISCONN;
558                 goto out;
559         case SMC_INIT:
560                 rc = 0;
561                 break;
562         }
563
564         smc_copy_sock_settings_to_clc(smc);
565         rc = kernel_connect(smc->clcsock, addr, alen, flags);
566         if (rc)
567                 goto out;
568
569         /* setup RDMA connection */
570         rc = smc_connect_rdma(smc);
571         if (rc < 0)
572                 goto out;
573         else
574                 rc = 0; /* success cases including fallback */
575
576 out:
577         release_sock(sk);
578 out_err:
579         return rc;
580 }
581
582 static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
583 {
584         struct sock *sk = &lsmc->sk;
585         struct socket *new_clcsock;
586         struct sock *new_sk;
587         int rc = -EINVAL;
588
589         release_sock(&lsmc->sk);
590         new_sk = smc_sock_alloc(sock_net(sk), NULL);
591         if (!new_sk) {
592                 rc = -ENOMEM;
593                 lsmc->sk.sk_err = ENOMEM;
594                 *new_smc = NULL;
595                 lock_sock(&lsmc->sk);
596                 goto out;
597         }
598         *new_smc = smc_sk(new_sk);
599
600         mutex_lock(&lsmc->clcsock_release_lock);
601         if (lsmc->clcsock)
602                 rc = kernel_accept(lsmc->clcsock, &new_clcsock, 0);
603         mutex_unlock(&lsmc->clcsock_release_lock);
604         lock_sock(&lsmc->sk);
605         if  (rc < 0) {
606                 lsmc->sk.sk_err = -rc;
607                 new_sk->sk_state = SMC_CLOSED;
608                 sock_set_flag(new_sk, SOCK_DEAD);
609                 sk->sk_prot->unhash(new_sk);
610                 sock_put(new_sk);
611                 *new_smc = NULL;
612                 goto out;
613         }
614         if (lsmc->sk.sk_state == SMC_CLOSED) {
615                 if (new_clcsock)
616                         sock_release(new_clcsock);
617                 new_sk->sk_state = SMC_CLOSED;
618                 sock_set_flag(new_sk, SOCK_DEAD);
619                 sk->sk_prot->unhash(new_sk);
620                 sock_put(new_sk);
621                 *new_smc = NULL;
622                 goto out;
623         }
624
625         (*new_smc)->clcsock = new_clcsock;
626 out:
627         return rc;
628 }
629
630 /* add a just created sock to the accept queue of the listen sock as
631  * candidate for a following socket accept call from user space
632  */
633 static void smc_accept_enqueue(struct sock *parent, struct sock *sk)
634 {
635         struct smc_sock *par = smc_sk(parent);
636
637         sock_hold(sk);
638         spin_lock(&par->accept_q_lock);
639         list_add_tail(&smc_sk(sk)->accept_q, &par->accept_q);
640         spin_unlock(&par->accept_q_lock);
641         sk_acceptq_added(parent);
642 }
643
644 /* remove a socket from the accept queue of its parental listening socket */
645 static void smc_accept_unlink(struct sock *sk)
646 {
647         struct smc_sock *par = smc_sk(sk)->listen_smc;
648
649         spin_lock(&par->accept_q_lock);
650         list_del_init(&smc_sk(sk)->accept_q);
651         spin_unlock(&par->accept_q_lock);
652         sk_acceptq_removed(&smc_sk(sk)->listen_smc->sk);
653         sock_put(sk);
654 }
655
656 /* remove a sock from the accept queue to bind it to a new socket created
657  * for a socket accept call from user space
658  */
659 struct sock *smc_accept_dequeue(struct sock *parent,
660                                 struct socket *new_sock)
661 {
662         struct smc_sock *isk, *n;
663         struct sock *new_sk;
664
665         list_for_each_entry_safe(isk, n, &smc_sk(parent)->accept_q, accept_q) {
666                 new_sk = (struct sock *)isk;
667
668                 smc_accept_unlink(new_sk);
669                 if (new_sk->sk_state == SMC_CLOSED) {
670                         new_sk->sk_prot->unhash(new_sk);
671                         sock_put(new_sk);
672                         continue;
673                 }
674                 if (new_sock)
675                         sock_graft(new_sk, new_sock);
676                 return new_sk;
677         }
678         return NULL;
679 }
680
681 /* clean up for a created but never accepted sock */
682 void smc_close_non_accepted(struct sock *sk)
683 {
684         struct smc_sock *smc = smc_sk(sk);
685
686         sock_hold(sk);
687         lock_sock(sk);
688         if (!sk->sk_lingertime)
689                 /* wait for peer closing */
690                 sk->sk_lingertime = SMC_MAX_STREAM_WAIT_TIMEOUT;
691         if (smc->use_fallback) {
692                 sk->sk_state = SMC_CLOSED;
693         } else {
694                 smc_close_active(smc);
695                 sock_set_flag(sk, SOCK_DEAD);
696                 sk->sk_shutdown |= SHUTDOWN_MASK;
697         }
698         if (smc->clcsock) {
699                 struct socket *tcp;
700
701                 tcp = smc->clcsock;
702                 smc->clcsock = NULL;
703                 sock_release(tcp);
704         }
705         if (smc->use_fallback) {
706                 schedule_delayed_work(&smc->sock_put_work, TCP_TIMEWAIT_LEN);
707         } else if (sk->sk_state == SMC_CLOSED) {
708                 smc_conn_free(&smc->conn);
709                 schedule_delayed_work(&smc->sock_put_work,
710                                       SMC_CLOSE_SOCK_PUT_DELAY);
711         }
712         release_sock(sk);
713         sock_put(sk);
714 }
715
716 static int smc_serv_conf_first_link(struct smc_sock *smc)
717 {
718         struct smc_link_group *lgr = smc->conn.lgr;
719         struct smc_link *link;
720         int rest;
721         int rc;
722
723         link = &lgr->lnk[SMC_SINGLE_LINK];
724
725         rc = smc_wr_reg_send(link,
726                              smc->conn.rmb_desc->mr_rx[SMC_SINGLE_LINK]);
727         if (rc)
728                 return SMC_CLC_DECL_INTERR;
729
730         /* send CONFIRM LINK request to client over the RoCE fabric */
731         rc = smc_llc_send_confirm_link(link,
732                                        link->smcibdev->mac[link->ibport - 1],
733                                        &link->smcibdev->gid[link->ibport - 1],
734                                        SMC_LLC_REQ);
735         if (rc < 0)
736                 return SMC_CLC_DECL_TCL;
737
738         /* receive CONFIRM LINK response from client over the RoCE fabric */
739         rest = wait_for_completion_interruptible_timeout(
740                 &link->llc_confirm_resp,
741                 SMC_LLC_WAIT_FIRST_TIME);
742         if (rest <= 0) {
743                 struct smc_clc_msg_decline dclc;
744
745                 rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
746                                       SMC_CLC_DECLINE);
747         }
748
749         return rc;
750 }
751
752 /* setup for RDMA connection of server */
753 static void smc_listen_work(struct work_struct *work)
754 {
755         struct smc_sock *new_smc = container_of(work, struct smc_sock,
756                                                 smc_listen_work);
757         struct socket *newclcsock = new_smc->clcsock;
758         struct smc_sock *lsmc = new_smc->listen_smc;
759         struct smc_clc_msg_accept_confirm cclc;
760         int local_contact = SMC_REUSE_CONTACT;
761         struct sock *newsmcsk = &new_smc->sk;
762         struct smc_clc_msg_proposal pclc;
763         struct smc_ib_device *smcibdev;
764         struct sockaddr_in peeraddr;
765         struct smc_link *link;
766         int reason_code = 0;
767         int rc = 0, len;
768         __be32 subnet;
769         u8 prefix_len;
770         u8 ibport;
771
772         /* do inband token exchange -
773          *wait for and receive SMC Proposal CLC message
774          */
775         reason_code = smc_clc_wait_msg(new_smc, &pclc, sizeof(pclc),
776                                        SMC_CLC_PROPOSAL);
777         if (reason_code < 0)
778                 goto out_err;
779         if (reason_code > 0)
780                 goto decline_rdma;
781
782         /* IPSec connections opt out of SMC-R optimizations */
783         if (using_ipsec(new_smc)) {
784                 reason_code = SMC_CLC_DECL_IPSEC;
785                 goto decline_rdma;
786         }
787
788         /* PNET table look up: search active ib_device and port
789          * within same PNETID that also contains the ethernet device
790          * used for the internal TCP socket
791          */
792         smc_pnet_find_roce_resource(newclcsock->sk, &smcibdev, &ibport);
793         if (!smcibdev) {
794                 reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
795                 goto decline_rdma;
796         }
797
798         /* determine subnet and mask from internal TCP socket */
799         rc = smc_netinfo_by_tcpsk(newclcsock, &subnet, &prefix_len);
800         if (rc) {
801                 reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
802                 goto decline_rdma;
803         }
804         if ((pclc.outgoing_subnet != subnet) ||
805             (pclc.prefix_len != prefix_len)) {
806                 reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
807                 goto decline_rdma;
808         }
809
810         /* get address of the peer connected to the internal TCP socket */
811         kernel_getpeername(newclcsock, (struct sockaddr *)&peeraddr, &len);
812
813         /* allocate connection / link group */
814         mutex_lock(&smc_create_lgr_pending);
815         local_contact = smc_conn_create(new_smc, peeraddr.sin_addr.s_addr,
816                                         smcibdev, ibport, &pclc.lcl, 0);
817         if (local_contact < 0) {
818                 rc = local_contact;
819                 if (rc == -ENOMEM)
820                         reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/
821                 goto decline_rdma;
822         }
823         link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];
824
825         /* create send buffer and rmb */
826         rc = smc_buf_create(new_smc);
827         if (rc) {
828                 reason_code = SMC_CLC_DECL_MEM;
829                 goto decline_rdma;
830         }
831
832         smc_close_init(new_smc);
833         smc_rx_init(new_smc);
834
835         if (local_contact != SMC_FIRST_CONTACT) {
836                 struct smc_buf_desc *buf_desc = new_smc->conn.rmb_desc;
837
838                 if (!buf_desc->reused) {
839                         /* register memory region for new rmb */
840                         rc = smc_wr_reg_send(link,
841                                              buf_desc->mr_rx[SMC_SINGLE_LINK]);
842                         if (rc) {
843                                 reason_code = SMC_CLC_DECL_INTERR;
844                                 goto decline_rdma;
845                         }
846                 }
847         }
848         smc_rmb_sync_sg_for_device(&new_smc->conn);
849
850         rc = smc_clc_send_accept(new_smc, local_contact);
851         if (rc)
852                 goto out_err;
853
854         /* receive SMC Confirm CLC message */
855         reason_code = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc),
856                                        SMC_CLC_CONFIRM);
857         if (reason_code < 0)
858                 goto out_err;
859         if (reason_code > 0)
860                 goto decline_rdma;
861         smc_conn_save_peer_info(new_smc, &cclc);
862         if (local_contact == SMC_FIRST_CONTACT)
863                 smc_link_save_peer_info(link, &cclc);
864
865         rc = smc_rmb_rtoken_handling(&new_smc->conn, &cclc);
866         if (rc) {
867                 reason_code = SMC_CLC_DECL_INTERR;
868                 goto decline_rdma;
869         }
870
871         if (local_contact == SMC_FIRST_CONTACT) {
872                 rc = smc_ib_ready_link(link);
873                 if (rc) {
874                         reason_code = SMC_CLC_DECL_INTERR;
875                         goto decline_rdma;
876                 }
877                 /* QP confirmation over RoCE fabric */
878                 reason_code = smc_serv_conf_first_link(new_smc);
879                 if (reason_code < 0) {
880                         /* peer is not aware of a problem */
881                         rc = reason_code;
882                         goto out_err;
883                 }
884                 if (reason_code > 0)
885                         goto decline_rdma;
886         }
887
888         smc_tx_init(new_smc);
889
890 out_connected:
891         sk_refcnt_debug_inc(newsmcsk);
892         if (newsmcsk->sk_state == SMC_INIT)
893                 newsmcsk->sk_state = SMC_ACTIVE;
894 enqueue:
895         mutex_unlock(&smc_create_lgr_pending);
896         lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING);
897         if (lsmc->sk.sk_state == SMC_LISTEN) {
898                 smc_accept_enqueue(&lsmc->sk, newsmcsk);
899         } else { /* no longer listening */
900                 smc_close_non_accepted(newsmcsk);
901         }
902         release_sock(&lsmc->sk);
903
904         /* Wake up accept */
905         lsmc->sk.sk_data_ready(&lsmc->sk);
906         sock_put(&lsmc->sk); /* sock_hold in smc_tcp_listen_work */
907         return;
908
909 decline_rdma:
910         /* RDMA setup failed, switch back to TCP */
911         smc_conn_free(&new_smc->conn);
912         new_smc->use_fallback = true;
913         if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) {
914                 rc = smc_clc_send_decline(new_smc, reason_code);
915                 if (rc < sizeof(struct smc_clc_msg_decline))
916                         goto out_err;
917         }
918         goto out_connected;
919
920 out_err:
921         newsmcsk->sk_state = SMC_CLOSED;
922         smc_conn_free(&new_smc->conn);
923         goto enqueue; /* queue new sock with sk_err set */
924 }
925
926 static void smc_tcp_listen_work(struct work_struct *work)
927 {
928         struct smc_sock *lsmc = container_of(work, struct smc_sock,
929                                              tcp_listen_work);
930         struct smc_sock *new_smc;
931         int rc = 0;
932
933         lock_sock(&lsmc->sk);
934         while (lsmc->sk.sk_state == SMC_LISTEN) {
935                 rc = smc_clcsock_accept(lsmc, &new_smc);
936                 if (rc)
937                         goto out;
938                 if (!new_smc)
939                         continue;
940
941                 new_smc->listen_smc = lsmc;
942                 new_smc->use_fallback = false; /* assume rdma capability first*/
943                 sock_hold(&lsmc->sk); /* sock_put in smc_listen_work */
944                 INIT_WORK(&new_smc->smc_listen_work, smc_listen_work);
945                 smc_copy_sock_settings_to_smc(new_smc);
946                 schedule_work(&new_smc->smc_listen_work);
947         }
948
949 out:
950         release_sock(&lsmc->sk);
951         lsmc->sk.sk_data_ready(&lsmc->sk); /* no more listening, wake accept */
952 }
953
954 static int smc_listen(struct socket *sock, int backlog)
955 {
956         struct sock *sk = sock->sk;
957         struct smc_sock *smc;
958         int rc;
959
960         smc = smc_sk(sk);
961         lock_sock(sk);
962
963         rc = -EINVAL;
964         if ((sk->sk_state != SMC_INIT) && (sk->sk_state != SMC_LISTEN))
965                 goto out;
966
967         rc = 0;
968         if (sk->sk_state == SMC_LISTEN) {
969                 sk->sk_max_ack_backlog = backlog;
970                 goto out;
971         }
972         /* some socket options are handled in core, so we could not apply
973          * them to the clc socket -- copy smc socket options to clc socket
974          */
975         smc_copy_sock_settings_to_clc(smc);
976
977         rc = kernel_listen(smc->clcsock, backlog);
978         if (rc)
979                 goto out;
980         sk->sk_max_ack_backlog = backlog;
981         sk->sk_ack_backlog = 0;
982         sk->sk_state = SMC_LISTEN;
983         INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
984         schedule_work(&smc->tcp_listen_work);
985
986 out:
987         release_sock(sk);
988         return rc;
989 }
990
991 static int smc_accept(struct socket *sock, struct socket *new_sock,
992                       int flags, bool kern)
993 {
994         struct sock *sk = sock->sk, *nsk;
995         DECLARE_WAITQUEUE(wait, current);
996         struct smc_sock *lsmc;
997         long timeo;
998         int rc = 0;
999
1000         lsmc = smc_sk(sk);
1001         lock_sock(sk);
1002
1003         if (lsmc->sk.sk_state != SMC_LISTEN) {
1004                 rc = -EINVAL;
1005                 goto out;
1006         }
1007
1008         /* Wait for an incoming connection */
1009         timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
1010         add_wait_queue_exclusive(sk_sleep(sk), &wait);
1011         while (!(nsk = smc_accept_dequeue(sk, new_sock))) {
1012                 set_current_state(TASK_INTERRUPTIBLE);
1013                 if (!timeo) {
1014                         rc = -EAGAIN;
1015                         break;
1016                 }
1017                 release_sock(sk);
1018                 timeo = schedule_timeout(timeo);
1019                 /* wakeup by sk_data_ready in smc_listen_work() */
1020                 sched_annotate_sleep();
1021                 lock_sock(sk);
1022                 if (signal_pending(current)) {
1023                         rc = sock_intr_errno(timeo);
1024                         break;
1025                 }
1026         }
1027         set_current_state(TASK_RUNNING);
1028         remove_wait_queue(sk_sleep(sk), &wait);
1029
1030         if (!rc)
1031                 rc = sock_error(nsk);
1032
1033 out:
1034         release_sock(sk);
1035         return rc;
1036 }
1037
1038 static int smc_getname(struct socket *sock, struct sockaddr *addr,
1039                        int *len, int peer)
1040 {
1041         struct smc_sock *smc;
1042
1043         if (peer && (sock->sk->sk_state != SMC_ACTIVE) &&
1044             (sock->sk->sk_state != SMC_APPCLOSEWAIT1))
1045                 return -ENOTCONN;
1046
1047         smc = smc_sk(sock->sk);
1048
1049         return smc->clcsock->ops->getname(smc->clcsock, addr, len, peer);
1050 }
1051
1052 static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
1053 {
1054         struct sock *sk = sock->sk;
1055         struct smc_sock *smc;
1056         int rc = -EPIPE;
1057
1058         smc = smc_sk(sk);
1059         lock_sock(sk);
1060         if ((sk->sk_state != SMC_ACTIVE) &&
1061             (sk->sk_state != SMC_APPCLOSEWAIT1) &&
1062             (sk->sk_state != SMC_INIT))
1063                 goto out;
1064         if (smc->use_fallback)
1065                 rc = smc->clcsock->ops->sendmsg(smc->clcsock, msg, len);
1066         else
1067                 rc = smc_tx_sendmsg(smc, msg, len);
1068 out:
1069         release_sock(sk);
1070         return rc;
1071 }
1072
1073 static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
1074                        int flags)
1075 {
1076         struct sock *sk = sock->sk;
1077         struct smc_sock *smc;
1078         int rc = -ENOTCONN;
1079
1080         smc = smc_sk(sk);
1081         lock_sock(sk);
1082         if ((sk->sk_state == SMC_INIT) ||
1083             (sk->sk_state == SMC_LISTEN) ||
1084             (sk->sk_state == SMC_CLOSED))
1085                 goto out;
1086
1087         if (sk->sk_state == SMC_PEERFINCLOSEWAIT) {
1088                 rc = 0;
1089                 goto out;
1090         }
1091
1092         if (smc->use_fallback)
1093                 rc = smc->clcsock->ops->recvmsg(smc->clcsock, msg, len, flags);
1094         else
1095                 rc = smc_rx_recvmsg(smc, msg, len, flags);
1096
1097 out:
1098         release_sock(sk);
1099         return rc;
1100 }
1101
1102 static unsigned int smc_accept_poll(struct sock *parent)
1103 {
1104         struct smc_sock *isk;
1105         struct sock *sk;
1106
1107         lock_sock(parent);
1108         list_for_each_entry(isk, &smc_sk(parent)->accept_q, accept_q) {
1109                 sk = (struct sock *)isk;
1110
1111                 if (sk->sk_state == SMC_ACTIVE) {
1112                         release_sock(parent);
1113                         return POLLIN | POLLRDNORM;
1114                 }
1115         }
1116         release_sock(parent);
1117
1118         return 0;
1119 }
1120
1121 static unsigned int smc_poll(struct file *file, struct socket *sock,
1122                              poll_table *wait)
1123 {
1124         struct sock *sk = sock->sk;
1125         unsigned int mask = 0;
1126         struct smc_sock *smc;
1127         int rc;
1128
1129         smc = smc_sk(sock->sk);
1130         if ((sk->sk_state == SMC_INIT) || smc->use_fallback) {
1131                 /* delegate to CLC child sock */
1132                 mask = smc->clcsock->ops->poll(file, smc->clcsock, wait);
1133                 /* if non-blocking connect finished ... */
1134                 lock_sock(sk);
1135                 if ((sk->sk_state == SMC_INIT) && (mask & POLLOUT)) {
1136                         sk->sk_err = smc->clcsock->sk->sk_err;
1137                         if (sk->sk_err) {
1138                                 mask |= POLLERR;
1139                         } else {
1140                                 rc = smc_connect_rdma(smc);
1141                                 if (rc < 0)
1142                                         mask |= POLLERR;
1143                                 else
1144                                         /* success cases including fallback */
1145                                         mask |= POLLOUT | POLLWRNORM;
1146                         }
1147                 }
1148                 release_sock(sk);
1149         } else {
1150                 sock_poll_wait(file, sk_sleep(sk), wait);
1151                 if (sk->sk_state == SMC_LISTEN)
1152                         /* woken up by sk_data_ready in smc_listen_work() */
1153                         mask |= smc_accept_poll(sk);
1154                 if (sk->sk_err)
1155                         mask |= POLLERR;
1156                 if (atomic_read(&smc->conn.sndbuf_space) ||
1157                     (sk->sk_shutdown & SEND_SHUTDOWN)) {
1158                         mask |= POLLOUT | POLLWRNORM;
1159                 } else {
1160                         sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1161                         set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1162                 }
1163                 if (atomic_read(&smc->conn.bytes_to_rcv))
1164                         mask |= POLLIN | POLLRDNORM;
1165                 if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
1166                     (sk->sk_state == SMC_CLOSED))
1167                         mask |= POLLHUP;
1168                 if (sk->sk_shutdown & RCV_SHUTDOWN)
1169                         mask |= POLLIN | POLLRDNORM | POLLRDHUP;
1170                 if (sk->sk_state == SMC_APPCLOSEWAIT1)
1171                         mask |= POLLIN;
1172
1173         }
1174
1175         return mask;
1176 }
1177
1178 static int smc_shutdown(struct socket *sock, int how)
1179 {
1180         struct sock *sk = sock->sk;
1181         bool do_shutdown = true;
1182         struct smc_sock *smc;
1183         int rc = -EINVAL;
1184         int old_state;
1185         int rc1 = 0;
1186
1187         smc = smc_sk(sk);
1188
1189         if ((how < SHUT_RD) || (how > SHUT_RDWR))
1190                 return rc;
1191
1192         lock_sock(sk);
1193
1194         rc = -ENOTCONN;
1195         if ((sk->sk_state != SMC_ACTIVE) &&
1196             (sk->sk_state != SMC_PEERCLOSEWAIT1) &&
1197             (sk->sk_state != SMC_PEERCLOSEWAIT2) &&
1198             (sk->sk_state != SMC_APPCLOSEWAIT1) &&
1199             (sk->sk_state != SMC_APPCLOSEWAIT2) &&
1200             (sk->sk_state != SMC_APPFINCLOSEWAIT))
1201                 goto out;
1202         if (smc->use_fallback) {
1203                 rc = kernel_sock_shutdown(smc->clcsock, how);
1204                 sk->sk_shutdown = smc->clcsock->sk->sk_shutdown;
1205                 if (sk->sk_shutdown == SHUTDOWN_MASK)
1206                         sk->sk_state = SMC_CLOSED;
1207                 goto out;
1208         }
1209         switch (how) {
1210         case SHUT_RDWR:         /* shutdown in both directions */
1211                 old_state = sk->sk_state;
1212                 rc = smc_close_active(smc);
1213                 if (old_state == SMC_ACTIVE &&
1214                     sk->sk_state == SMC_PEERCLOSEWAIT1)
1215                         do_shutdown = false;
1216                 break;
1217         case SHUT_WR:
1218                 rc = smc_close_shutdown_write(smc);
1219                 break;
1220         case SHUT_RD:
1221                 rc = 0;
1222                 /* nothing more to do because peer is not involved */
1223                 break;
1224         }
1225         if (do_shutdown && smc->clcsock)
1226                 rc1 = kernel_sock_shutdown(smc->clcsock, how);
1227         /* map sock_shutdown_cmd constants to sk_shutdown value range */
1228         sk->sk_shutdown |= how + 1;
1229
1230 out:
1231         release_sock(sk);
1232         return rc ? rc : rc1;
1233 }
1234
1235 static int smc_setsockopt(struct socket *sock, int level, int optname,
1236                           char __user *optval, unsigned int optlen)
1237 {
1238         struct sock *sk = sock->sk;
1239         struct smc_sock *smc;
1240
1241         smc = smc_sk(sk);
1242
1243         /* generic setsockopts reaching us here always apply to the
1244          * CLC socket
1245          */
1246         return smc->clcsock->ops->setsockopt(smc->clcsock, level, optname,
1247                                              optval, optlen);
1248 }
1249
1250 static int smc_getsockopt(struct socket *sock, int level, int optname,
1251                           char __user *optval, int __user *optlen)
1252 {
1253         struct smc_sock *smc;
1254
1255         smc = smc_sk(sock->sk);
1256         /* socket options apply to the CLC socket */
1257         return smc->clcsock->ops->getsockopt(smc->clcsock, level, optname,
1258                                              optval, optlen);
1259 }
1260
1261 static int smc_ioctl(struct socket *sock, unsigned int cmd,
1262                      unsigned long arg)
1263 {
1264         struct smc_sock *smc;
1265
1266         smc = smc_sk(sock->sk);
1267         if (smc->use_fallback)
1268                 return smc->clcsock->ops->ioctl(smc->clcsock, cmd, arg);
1269         else
1270                 return sock_no_ioctl(sock, cmd, arg);
1271 }
1272
1273 static ssize_t smc_sendpage(struct socket *sock, struct page *page,
1274                             int offset, size_t size, int flags)
1275 {
1276         struct sock *sk = sock->sk;
1277         struct smc_sock *smc;
1278         int rc = -EPIPE;
1279
1280         smc = smc_sk(sk);
1281         lock_sock(sk);
1282         if (sk->sk_state != SMC_ACTIVE) {
1283                 release_sock(sk);
1284                 goto out;
1285         }
1286         release_sock(sk);
1287         if (smc->use_fallback)
1288                 rc = kernel_sendpage(smc->clcsock, page, offset,
1289                                      size, flags);
1290         else
1291                 rc = sock_no_sendpage(sock, page, offset, size, flags);
1292
1293 out:
1294         return rc;
1295 }
1296
1297 static ssize_t smc_splice_read(struct socket *sock, loff_t *ppos,
1298                                struct pipe_inode_info *pipe, size_t len,
1299                                     unsigned int flags)
1300 {
1301         struct sock *sk = sock->sk;
1302         struct smc_sock *smc;
1303         int rc = -ENOTCONN;
1304
1305         smc = smc_sk(sk);
1306         lock_sock(sk);
1307         if ((sk->sk_state != SMC_ACTIVE) && (sk->sk_state != SMC_CLOSED))
1308                 goto out;
1309         if (smc->use_fallback) {
1310                 rc = smc->clcsock->ops->splice_read(smc->clcsock, ppos,
1311                                                     pipe, len, flags);
1312         } else {
1313                 rc = -EOPNOTSUPP;
1314         }
1315 out:
1316         release_sock(sk);
1317         return rc;
1318 }
1319
1320 /* must look like tcp */
1321 static const struct proto_ops smc_sock_ops = {
1322         .family         = PF_SMC,
1323         .owner          = THIS_MODULE,
1324         .release        = smc_release,
1325         .bind           = smc_bind,
1326         .connect        = smc_connect,
1327         .socketpair     = sock_no_socketpair,
1328         .accept         = smc_accept,
1329         .getname        = smc_getname,
1330         .poll           = smc_poll,
1331         .ioctl          = smc_ioctl,
1332         .listen         = smc_listen,
1333         .shutdown       = smc_shutdown,
1334         .setsockopt     = smc_setsockopt,
1335         .getsockopt     = smc_getsockopt,
1336         .sendmsg        = smc_sendmsg,
1337         .recvmsg        = smc_recvmsg,
1338         .mmap           = sock_no_mmap,
1339         .sendpage       = smc_sendpage,
1340         .splice_read    = smc_splice_read,
1341 };
1342
1343 static int smc_create(struct net *net, struct socket *sock, int protocol,
1344                       int kern)
1345 {
1346         struct smc_sock *smc;
1347         struct sock *sk;
1348         int rc;
1349
1350         rc = -ESOCKTNOSUPPORT;
1351         if (sock->type != SOCK_STREAM)
1352                 goto out;
1353
1354         rc = -EPROTONOSUPPORT;
1355         if ((protocol != IPPROTO_IP) && (protocol != IPPROTO_TCP))
1356                 goto out;
1357
1358         rc = -ENOBUFS;
1359         sock->ops = &smc_sock_ops;
1360         sk = smc_sock_alloc(net, sock);
1361         if (!sk)
1362                 goto out;
1363
1364         /* create internal TCP socket for CLC handshake and fallback */
1365         smc = smc_sk(sk);
1366         smc->use_fallback = false; /* assume rdma capability first */
1367         rc = sock_create_kern(net, PF_INET, SOCK_STREAM,
1368                               IPPROTO_TCP, &smc->clcsock);
1369         if (rc) {
1370                 sk_common_release(sk);
1371                 goto out;
1372         }
1373         smc->sk.sk_sndbuf = max(smc->clcsock->sk->sk_sndbuf, SMC_BUF_MIN_SIZE);
1374         smc->sk.sk_rcvbuf = max(smc->clcsock->sk->sk_rcvbuf, SMC_BUF_MIN_SIZE);
1375
1376 out:
1377         return rc;
1378 }
1379
1380 static const struct net_proto_family smc_sock_family_ops = {
1381         .family = PF_SMC,
1382         .owner  = THIS_MODULE,
1383         .create = smc_create,
1384 };
1385
1386 static int __init smc_init(void)
1387 {
1388         int rc;
1389
1390         rc = smc_pnet_init();
1391         if (rc)
1392                 return rc;
1393
1394         rc = smc_llc_init();
1395         if (rc) {
1396                 pr_err("%s: smc_llc_init fails with %d\n", __func__, rc);
1397                 goto out_pnet;
1398         }
1399
1400         rc = smc_cdc_init();
1401         if (rc) {
1402                 pr_err("%s: smc_cdc_init fails with %d\n", __func__, rc);
1403                 goto out_pnet;
1404         }
1405
1406         rc = proto_register(&smc_proto, 1);
1407         if (rc) {
1408                 pr_err("%s: proto_register fails with %d\n", __func__, rc);
1409                 goto out_pnet;
1410         }
1411
1412         rc = sock_register(&smc_sock_family_ops);
1413         if (rc) {
1414                 pr_err("%s: sock_register fails with %d\n", __func__, rc);
1415                 goto out_proto;
1416         }
1417         INIT_HLIST_HEAD(&smc_v4_hashinfo.ht);
1418
1419         rc = smc_ib_register_client();
1420         if (rc) {
1421                 pr_err("%s: ib_register fails with %d\n", __func__, rc);
1422                 goto out_sock;
1423         }
1424
1425         return 0;
1426
1427 out_sock:
1428         sock_unregister(PF_SMC);
1429 out_proto:
1430         proto_unregister(&smc_proto);
1431 out_pnet:
1432         smc_pnet_exit();
1433         return rc;
1434 }
1435
1436 static void __exit smc_exit(void)
1437 {
1438         struct smc_link_group *lgr, *lg;
1439         LIST_HEAD(lgr_freeing_list);
1440
1441         spin_lock_bh(&smc_lgr_list.lock);
1442         if (!list_empty(&smc_lgr_list.list))
1443                 list_splice_init(&smc_lgr_list.list, &lgr_freeing_list);
1444         spin_unlock_bh(&smc_lgr_list.lock);
1445         list_for_each_entry_safe(lgr, lg, &lgr_freeing_list, list) {
1446                 list_del_init(&lgr->list);
1447                 smc_lgr_free(lgr); /* free link group */
1448         }
1449         smc_ib_unregister_client();
1450         sock_unregister(PF_SMC);
1451         proto_unregister(&smc_proto);
1452         smc_pnet_exit();
1453 }
1454
1455 module_init(smc_init);
1456 module_exit(smc_exit);
1457
1458 MODULE_AUTHOR("Ursula Braun <ubraun@linux.vnet.ibm.com>");
1459 MODULE_DESCRIPTION("smc socket address family");
1460 MODULE_LICENSE("GPL");
1461 MODULE_ALIAS_NETPROTO(PF_SMC);