GNU Linux-libre 4.19.211-gnu1: net/ipv4/tcp_ipv4.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol (TCP).
7  *
8  *              IPv4 specific functions
9  *
10  *
11  *              code split from:
12  *              linux/ipv4/tcp.c
13  *              linux/ipv4/tcp_input.c
14  *              linux/ipv4/tcp_output.c
15  *
16  *              See tcp.c for author information
17  *
18  *      This program is free software; you can redistribute it and/or
19  *      modify it under the terms of the GNU General Public License
20  *      as published by the Free Software Foundation; either version
21  *      2 of the License, or (at your option) any later version.
22  */
23
24 /*
25  * Changes:
26  *              David S. Miller :       New socket lookup architecture.
27  *                                      This code is dedicated to John Dyson.
28  *              David S. Miller :       Change semantics of established hash,
29  *                                      half is devoted to TIME_WAIT sockets
30  *                                      and the rest go in the other half.
31  *              Andi Kleen :            Add support for syncookies and fixed
32  *                                      some bugs: ip options weren't passed to
33  *                                      the TCP layer, missed a check for an
34  *                                      ACK bit.
35  *              Andi Kleen :            Implemented fast path mtu discovery.
36  *                                      Fixed many serious bugs in the
37  *                                      request_sock handling and moved
38  *                                      most of it into the af independent code.
39  *                                      Added tail drop and some other bugfixes.
40  *                                      Added new listen semantics.
41  *              Mike McLagan    :       Routing by source
42  *      Juan Jose Ciarlante:            ip_dynaddr bits
43  *              Andi Kleen:             various fixes.
44  *      Vitaly E. Lavrov        :       Transparent proxy revived after a
45  *                                      year-long coma.
46  *      Andi Kleen              :       Fix new listen.
47  *      Andi Kleen              :       Fix accept error reporting.
48  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
49  *      Alexey Kuznetsov                allows both IPv4 and IPv6 sockets to bind
50  *                                      a single port at the same time.
51  */
52
53 #define pr_fmt(fmt) "TCP: " fmt
54
55 #include <linux/bottom_half.h>
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64 #include <linux/slab.h>
65
66 #include <net/net_namespace.h>
67 #include <net/icmp.h>
68 #include <net/inet_hashtables.h>
69 #include <net/tcp.h>
70 #include <net/transp_v6.h>
71 #include <net/ipv6.h>
72 #include <net/inet_common.h>
73 #include <net/timewait_sock.h>
74 #include <net/xfrm.h>
75 #include <net/secure_seq.h>
76 #include <net/busy_poll.h>
77
78 #include <linux/inet.h>
79 #include <linux/ipv6.h>
80 #include <linux/stddef.h>
81 #include <linux/proc_fs.h>
82 #include <linux/seq_file.h>
83 #include <linux/inetdevice.h>
84
85 #include <crypto/hash.h>
86 #include <linux/scatterlist.h>
87
88 #include <trace/events/tcp.h>
89
90 #ifdef CONFIG_TCP_MD5SIG
91 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
92                                __be32 daddr, __be32 saddr, const struct tcphdr *th);
93 #endif
94
95 struct inet_hashinfo tcp_hashinfo;
96 EXPORT_SYMBOL(tcp_hashinfo);
97
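/* Derive the initial sequence number (and, below, the timestamp offset) for a
 * passively opened connection from the addresses and ports of the incoming
 * packet (see secure_tcp_seq() and secure_tcp_ts_off()).
 */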
98 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
99 {
100         return secure_tcp_seq(ip_hdr(skb)->daddr,
101                               ip_hdr(skb)->saddr,
102                               tcp_hdr(skb)->dest,
103                               tcp_hdr(skb)->source);
104 }
105
106 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
107 {
108         return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
109 }
110
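/* Called during connect() when the chosen 4-tuple collides with a TIME-WAIT
 * socket: decide whether that socket may be taken over.  sysctl_tcp_tw_reuse
 * is 0 (never), 1 (reuse when safe) or 2 (reuse only for loopback traffic,
 * checked below).  Returns 1 to allow the reuse.
 */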
111 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
112 {
113         const struct inet_timewait_sock *tw = inet_twsk(sktw);
114         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
115         struct tcp_sock *tp = tcp_sk(sk);
116         int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
117
118         if (reuse == 2) {
119                 /* Still does not detect *everything* that goes through
120                  * lo, since we require a loopback src or dst address
121                  * or a direct binding to the 'lo' interface.
122                  */
123                 bool loopback = false;
124                 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
125                         loopback = true;
126 #if IS_ENABLED(CONFIG_IPV6)
127                 if (tw->tw_family == AF_INET6) {
128                         if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
129                             (ipv6_addr_v4mapped(&tw->tw_v6_daddr) &&
130                              (tw->tw_v6_daddr.s6_addr[12] == 127)) ||
131                             ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
132                             (ipv6_addr_v4mapped(&tw->tw_v6_rcv_saddr) &&
133                              (tw->tw_v6_rcv_saddr.s6_addr[12] == 127)))
134                                 loopback = true;
135                 } else
136 #endif
137                 {
138                         if (ipv4_is_loopback(tw->tw_daddr) ||
139                             ipv4_is_loopback(tw->tw_rcv_saddr))
140                                 loopback = true;
141                 }
142                 if (!loopback)
143                         reuse = 0;
144         }
145
146         /* With PAWS, reuse is safe from the viewpoint
147            of data integrity. Even without PAWS it is safe provided the sequence
148            spaces do not overlap, i.e. at data rates <= 80 Mbit/sec.
149
150            Actually, the idea is close to VJ's, except that the timestamp cache is
151            held not per host but per port pair, and the TW bucket is used as the
152            state holder.
153
154            If the TW bucket has already been destroyed we fall back to VJ's scheme
155            and use the initial timestamp retrieved from the peer table.
156          */
157         if (tcptw->tw_ts_recent_stamp &&
158             (!twp || (reuse && time_after32(ktime_get_seconds(),
159                                             tcptw->tw_ts_recent_stamp)))) {
160                 /* In case of repair and re-using TIME-WAIT sockets we still
161                  * want to be sure that it is safe as above but honor the
162                  * sequence numbers and time stamps set as part of the repair
163                  * process.
164                  *
165                  * Without this check re-using a TIME-WAIT socket with TCP
166                  * repair would accumulate a -1 on the repair assigned
167                  * sequence number. The first time it is reused the sequence
168                  * is -1, the second time -2, etc. This fixes that issue
169                  * without appearing to create any others.
170                  */
171                 if (likely(!tp->repair)) {
172                         u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
173
174                         if (!seq)
175                                 seq = 1;
176                         WRITE_ONCE(tp->write_seq, seq);
177                         tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
178                         tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
179                 }
180                 sock_hold(sktw);
181                 return 1;
182         }
183
184         return 0;
185 }
186 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
187
188 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
189                               int addr_len)
190 {
191         /* This check is replicated from tcp_v4_connect() and intended to
192          * prevent the BPF program called below from accessing bytes that are
193          * outside the bound specified by the user in addr_len.
194          */
195         if (addr_len < sizeof(struct sockaddr_in))
196                 return -EINVAL;
197
198         sock_owned_by_me(sk);
199
200         return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
201 }
202
203 /* This will initiate an outgoing connection. */
204 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
205 {
206         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
207         struct inet_sock *inet = inet_sk(sk);
208         struct tcp_sock *tp = tcp_sk(sk);
209         __be16 orig_sport, orig_dport;
210         __be32 daddr, nexthop;
211         struct flowi4 *fl4;
212         struct rtable *rt;
213         int err;
214         struct ip_options_rcu *inet_opt;
215         struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
216
217         if (addr_len < sizeof(struct sockaddr_in))
218                 return -EINVAL;
219
220         if (usin->sin_family != AF_INET)
221                 return -EAFNOSUPPORT;
222
223         nexthop = daddr = usin->sin_addr.s_addr;
224         inet_opt = rcu_dereference_protected(inet->inet_opt,
225                                              lockdep_sock_is_held(sk));
226         if (inet_opt && inet_opt->opt.srr) {
227                 if (!daddr)
228                         return -EINVAL;
229                 nexthop = inet_opt->opt.faddr;
230         }
231
232         orig_sport = inet->inet_sport;
233         orig_dport = usin->sin_port;
234         fl4 = &inet->cork.fl.u.ip4;
235         rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
236                               RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
237                               IPPROTO_TCP,
238                               orig_sport, orig_dport, sk);
239         if (IS_ERR(rt)) {
240                 err = PTR_ERR(rt);
241                 if (err == -ENETUNREACH)
242                         IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
243                 return err;
244         }
245
246         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
247                 ip_rt_put(rt);
248                 return -ENETUNREACH;
249         }
250
251         if (!inet_opt || !inet_opt->opt.srr)
252                 daddr = fl4->daddr;
253
254         if (!inet->inet_saddr)
255                 inet->inet_saddr = fl4->saddr;
256         sk_rcv_saddr_set(sk, inet->inet_saddr);
257
258         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
259                 /* Reset inherited state */
260                 tp->rx_opt.ts_recent       = 0;
261                 tp->rx_opt.ts_recent_stamp = 0;
262                 if (likely(!tp->repair))
263                         WRITE_ONCE(tp->write_seq, 0);
264         }
265
266         inet->inet_dport = usin->sin_port;
267         sk_daddr_set(sk, daddr);
268
269         inet_csk(sk)->icsk_ext_hdr_len = 0;
270         if (inet_opt)
271                 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
272
273         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
274
275         /* Socket identity is still unknown (sport may be zero).
276          * However we set the state to SYN-SENT and, without releasing the socket
277          * lock, select a source port, enter ourselves into the hash tables and
278          * complete initialization after this.
279          */
280         tcp_set_state(sk, TCP_SYN_SENT);
281         err = inet_hash_connect(tcp_death_row, sk);
282         if (err)
283                 goto failure;
284
285         sk_set_txhash(sk);
286
287         rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
288                                inet->inet_sport, inet->inet_dport, sk);
289         if (IS_ERR(rt)) {
290                 err = PTR_ERR(rt);
291                 rt = NULL;
292                 goto failure;
293         }
294         /* OK, now commit destination to socket.  */
295         sk->sk_gso_type = SKB_GSO_TCPV4;
296         sk_setup_caps(sk, &rt->dst);
297         rt = NULL;
298
299         if (likely(!tp->repair)) {
300                 if (!tp->write_seq)
301                         WRITE_ONCE(tp->write_seq,
302                                    secure_tcp_seq(inet->inet_saddr,
303                                                   inet->inet_daddr,
304                                                   inet->inet_sport,
305                                                   usin->sin_port));
306                 tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
307                                                  inet->inet_saddr,
308                                                  inet->inet_daddr);
309         }
310
311         inet->inet_id = prandom_u32();
312
313         if (tcp_fastopen_defer_connect(sk, &err))
314                 return err;
315         if (err)
316                 goto failure;
317
318         err = tcp_connect(sk);
319
320         if (err)
321                 goto failure;
322
323         return 0;
324
325 failure:
326         /*
327          * This unhashes the socket and releases the local port,
328          * if necessary.
329          */
330         tcp_set_state(sk, TCP_CLOSE);
331         ip_rt_put(rt);
332         sk->sk_route_caps = 0;
333         inet->inet_dport = 0;
334         return err;
335 }
336 EXPORT_SYMBOL(tcp_v4_connect);
337
338 /*
339  * This routine reacts to ICMP_FRAG_NEEDED MTU indications as defined in RFC 1191.
340  * It can be called through tcp_release_cb() if the socket was owned by the user
341  * at the time tcp_v4_err() was called to handle the ICMP message.
342  */
343 void tcp_v4_mtu_reduced(struct sock *sk)
344 {
345         struct inet_sock *inet = inet_sk(sk);
346         struct dst_entry *dst;
347         u32 mtu;
348
349         if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
350                 return;
351         mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
352         dst = inet_csk_update_pmtu(sk, mtu);
353         if (!dst)
354                 return;
355
356         /* Something is about to go wrong... Remember the soft error
357          * in case this connection is not able to recover.
358          */
359         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
360                 sk->sk_err_soft = EMSGSIZE;
361
362         mtu = dst_mtu(dst);
363
364         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
365             ip_sk_accept_pmtu(sk) &&
366             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
367                 tcp_sync_mss(sk, mtu);
368
369                 /* Resend the TCP packet because it's
370                  * clear that the old packet has been
371                  * dropped. This is the new "fast" path mtu
372                  * discovery.
373                  */
374                 tcp_simple_retransmit(sk);
375         } /* else let the usual retransmit timer handle it */
376 }
377 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
378
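/* Apply an ICMP redirect to the socket's cached route, if one is set. */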
379 static void do_redirect(struct sk_buff *skb, struct sock *sk)
380 {
381         struct dst_entry *dst = __sk_dst_check(sk, 0);
382
383         if (dst)
384                 dst->ops->redirect(dst, sk, skb);
385 }
386
387
388 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
389 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
390 {
391         struct request_sock *req = inet_reqsk(sk);
392         struct net *net = sock_net(sk);
393
394         /* ICMPs are not backlogged, hence we cannot get
395          * an established socket here.
396          */
397         if (seq != tcp_rsk(req)->snt_isn) {
398                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
399         } else if (abort) {
400                 /*
401                  * Still in SYN_RECV, just remove it silently.
402                  * There is no good way to pass the error to the newly
403                  * created socket, and POSIX does not want network
404                  * errors returned from accept().
405                  */
406                 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
407                 tcp_listendrop(req->rsk_listener);
408         }
409         reqsk_put(req);
410 }
411 EXPORT_SYMBOL(tcp_req_err);
412
413 /*
414  * This routine is called by the ICMP module when it gets some
415  * sort of error condition.  If err < 0 then the socket should
416  * be closed and the error returned to the user.  If err > 0
417  * it is just the ICMP type << 8 | ICMP code.  After adjustment,
418  * header points to the first 8 bytes of the TCP header.  We need
419  * to find the appropriate port.
420  *
421  * The locking strategy used here is very "optimistic". When
422  * someone else accesses the socket the ICMP is just dropped
423  * and for some paths there is no check at all.
424  * A more general error queue for queuing errors for later handling
425  * would probably be better.
426  *
427  */
428
429 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
430 {
431         const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
432         struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
433         struct inet_connection_sock *icsk;
434         struct tcp_sock *tp;
435         struct inet_sock *inet;
436         const int type = icmp_hdr(icmp_skb)->type;
437         const int code = icmp_hdr(icmp_skb)->code;
438         struct sock *sk;
439         struct sk_buff *skb;
440         struct request_sock *fastopen;
441         u32 seq, snd_una;
442         s32 remaining;
443         u32 delta_us;
444         int err;
445         struct net *net = dev_net(icmp_skb->dev);
446
447         sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
448                                        th->dest, iph->saddr, ntohs(th->source),
449                                        inet_iif(icmp_skb), 0);
450         if (!sk) {
451                 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
452                 return;
453         }
454         if (sk->sk_state == TCP_TIME_WAIT) {
455                 inet_twsk_put(inet_twsk(sk));
456                 return;
457         }
458         seq = ntohl(th->seq);
459         if (sk->sk_state == TCP_NEW_SYN_RECV)
460                 return tcp_req_err(sk, seq,
461                                   type == ICMP_PARAMETERPROB ||
462                                   type == ICMP_TIME_EXCEEDED ||
463                                   (type == ICMP_DEST_UNREACH &&
464                                    (code == ICMP_NET_UNREACH ||
465                                     code == ICMP_HOST_UNREACH)));
466
467         bh_lock_sock(sk);
468         /* If too many ICMPs get dropped on busy
469          * servers this needs to be solved differently.
470          * We do take care of the PMTU discovery (RFC 1191) special case:
471          * we can receive locally generated ICMP messages while the socket is held.
472          */
473         if (sock_owned_by_user(sk)) {
474                 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
475                         __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
476         }
477         if (sk->sk_state == TCP_CLOSE)
478                 goto out;
479
480         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
481                 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
482                 goto out;
483         }
484
485         icsk = inet_csk(sk);
486         tp = tcp_sk(sk);
487         /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
488         fastopen = tp->fastopen_rsk;
489         snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
490         if (sk->sk_state != TCP_LISTEN &&
491             !between(seq, snd_una, tp->snd_nxt)) {
492                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
493                 goto out;
494         }
495
496         switch (type) {
497         case ICMP_REDIRECT:
498                 if (!sock_owned_by_user(sk))
499                         do_redirect(icmp_skb, sk);
500                 goto out;
501         case ICMP_SOURCE_QUENCH:
502                 /* Just silently ignore these. */
503                 goto out;
504         case ICMP_PARAMETERPROB:
505                 err = EPROTO;
506                 break;
507         case ICMP_DEST_UNREACH:
508                 if (code > NR_ICMP_UNREACH)
509                         goto out;
510
511                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
512                         /* We are not interested in TCP_LISTEN and open_requests
513                          * (SYN-ACKs sent out by Linux are always < 576 bytes so
514                          * they should go through unfragmented).
515                          */
516                         if (sk->sk_state == TCP_LISTEN)
517                                 goto out;
518
519                         WRITE_ONCE(tp->mtu_info, info);
520                         if (!sock_owned_by_user(sk)) {
521                                 tcp_v4_mtu_reduced(sk);
522                         } else {
523                                 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
524                                         sock_hold(sk);
525                         }
526                         goto out;
527                 }
528
529                 err = icmp_err_convert[code].errno;
530                 /* check if icmp_skb allows revert of backoff
531                  * (see draft-zimmermann-tcp-lcd) */
532                 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
533                         break;
534                 if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
535                     !icsk->icsk_backoff || fastopen)
536                         break;
537
538                 if (sock_owned_by_user(sk))
539                         break;
540
541                 skb = tcp_rtx_queue_head(sk);
542                 if (WARN_ON_ONCE(!skb))
543                         break;
544
545                 icsk->icsk_backoff--;
546                 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
547                                                TCP_TIMEOUT_INIT;
548                 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
549
550                 tcp_mstamp_refresh(tp);
551                 delta_us = (u32)(tp->tcp_mstamp - skb->skb_mstamp);
552                 remaining = icsk->icsk_rto -
553                             usecs_to_jiffies(delta_us);
554
555                 if (remaining > 0) {
556                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
557                                                   remaining, TCP_RTO_MAX);
558                 } else {
559                         /* RTO revert clocked out retransmission.
560                          * Will retransmit now */
561                         tcp_retransmit_timer(sk);
562                 }
563
564                 break;
565         case ICMP_TIME_EXCEEDED:
566                 err = EHOSTUNREACH;
567                 break;
568         default:
569                 goto out;
570         }
571
572         switch (sk->sk_state) {
573         case TCP_SYN_SENT:
574         case TCP_SYN_RECV:
575                 /* Only in fast or simultaneous open. If a fast open socket is
576                  * already accepted it is treated as a connected one below.
577                  */
578                 if (fastopen && !fastopen->sk)
579                         break;
580
581                 if (!sock_owned_by_user(sk)) {
582                         sk->sk_err = err;
583
584                         sk->sk_error_report(sk);
585
586                         tcp_done(sk);
587                 } else {
588                         sk->sk_err_soft = err;
589                 }
590                 goto out;
591         }
592
593         /* If we've already connected we will keep trying
594          * until we time out, or the user gives up.
595          *
596          * RFC 1122 4.2.3.9 allows us to consider as hard errors
597          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
598          * but it is obsoleted by PMTU discovery).
599          *
600          * Note that in the modern internet, where routing is unreliable
601          * and broken firewalls sit in every dark corner sending random
602          * errors ordered by their masters, even these two messages finally lose
603          * their original sense (even Linux sends invalid PORT_UNREACHs).
604          *
605          * Now we are in compliance with RFCs.
606          *                                                      --ANK (980905)
607          */
608
609         inet = inet_sk(sk);
610         if (!sock_owned_by_user(sk) && inet->recverr) {
611                 sk->sk_err = err;
612                 sk->sk_error_report(sk);
613         } else  { /* Only an error on timeout */
614                 sk->sk_err_soft = err;
615         }
616
617 out:
618         bh_unlock_sock(sk);
619         sock_put(sk);
620 }
621
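/* Set up the TCP pseudo-header checksum and the csum_start/csum_offset
 * fields so that the checksum can be completed by hardware offload or the
 * software fallback.
 */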
622 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
623 {
624         struct tcphdr *th = tcp_hdr(skb);
625
626         th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
627         skb->csum_start = skb_transport_header(skb) - skb->head;
628         skb->csum_offset = offsetof(struct tcphdr, check);
629 }
630
631 /* This routine computes an IPv4 TCP checksum. */
632 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
633 {
634         const struct inet_sock *inet = inet_sk(sk);
635
636         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
637 }
638 EXPORT_SYMBOL(tcp_v4_send_check);
639
640 /*
641  *      This routine will send an RST to the other tcp.
642  *
643  *      Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
644  *                    for the reset?
645  *      Answer: if a packet caused an RST, it is not for a socket
646  *              existing in our system; if it is matched to a socket,
647  *              it is just a duplicate segment or a bug in the other side's TCP.
648  *              So we build the reply based only on the parameters that
649  *              arrived with the segment.
650  *      Exception: precedence violation. We do not implement it in any case.
651  */
652
653 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
654 {
655         const struct tcphdr *th = tcp_hdr(skb);
656         struct {
657                 struct tcphdr th;
658 #ifdef CONFIG_TCP_MD5SIG
659                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
660 #endif
661         } rep;
662         struct ip_reply_arg arg;
663 #ifdef CONFIG_TCP_MD5SIG
664         struct tcp_md5sig_key *key = NULL;
665         const __u8 *hash_location = NULL;
666         unsigned char newhash[16];
667         int genhash;
668         struct sock *sk1 = NULL;
669 #endif
670         struct net *net;
671         struct sock *ctl_sk;
672
673         /* Never send a reset in response to a reset. */
674         if (th->rst)
675                 return;
676
677         /* If sk is not NULL, it means we did a successful lookup and the incoming
678          * route had to be correct. prequeue might have dropped our dst.
679          */
680         if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
681                 return;
682
683         /* Swap the send and the receive. */
684         memset(&rep, 0, sizeof(rep));
685         rep.th.dest   = th->source;
686         rep.th.source = th->dest;
687         rep.th.doff   = sizeof(struct tcphdr) / 4;
688         rep.th.rst    = 1;
689
690         if (th->ack) {
691                 rep.th.seq = th->ack_seq;
692         } else {
693                 rep.th.ack = 1;
694                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
695                                        skb->len - (th->doff << 2));
696         }
697
698         memset(&arg, 0, sizeof(arg));
699         arg.iov[0].iov_base = (unsigned char *)&rep;
700         arg.iov[0].iov_len  = sizeof(rep.th);
701
702         net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
703 #ifdef CONFIG_TCP_MD5SIG
704         rcu_read_lock();
705         hash_location = tcp_parse_md5sig_option(th);
706         if (sk && sk_fullsock(sk)) {
707                 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
708                                         &ip_hdr(skb)->saddr, AF_INET);
709         } else if (hash_location) {
710                 /*
711                  * The active side is lost. Try to find the listening socket through
712                  * the source port, and then find the md5 key through that socket.
713                  * We do not lose security here:
714                  * the incoming packet is checked against the md5 hash of the found key,
715                  * and no RST is generated if the md5 hash doesn't match.
716                  */
717                 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
718                                              ip_hdr(skb)->saddr,
719                                              th->source, ip_hdr(skb)->daddr,
720                                              ntohs(th->source), inet_iif(skb),
721                                              tcp_v4_sdif(skb));
722                 /* don't send rst if it can't find key */
723                 if (!sk1)
724                         goto out;
725
726                 key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
727                                         &ip_hdr(skb)->saddr, AF_INET);
728                 if (!key)
729                         goto out;
730
731
732                 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
733                 if (genhash || memcmp(hash_location, newhash, 16) != 0)
734                         goto out;
735
736         }
737
738         if (key) {
739                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
740                                    (TCPOPT_NOP << 16) |
741                                    (TCPOPT_MD5SIG << 8) |
742                                    TCPOLEN_MD5SIG);
743                 /* Update length and the length the header thinks exists */
744                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
745                 rep.th.doff = arg.iov[0].iov_len / 4;
746
747                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
748                                      key, ip_hdr(skb)->saddr,
749                                      ip_hdr(skb)->daddr, &rep.th);
750         }
751 #endif
752         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
753                                       ip_hdr(skb)->saddr, /* XXX */
754                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
755         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
756         arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
757
758         /* When the socket is gone, all binding information is lost and
759          * routing might fail in this case. No choice here: if we choose to force
760          * the input interface, we will misroute in the case of an asymmetric route.
761          */
762         if (sk) {
763                 arg.bound_dev_if = sk->sk_bound_dev_if;
764                 if (sk_fullsock(sk))
765                         trace_tcp_send_reset(sk, skb);
766         }
767
768         BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
769                      offsetof(struct inet_timewait_sock, tw_bound_dev_if));
770
771         arg.tos = ip_hdr(skb)->tos;
772         arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
773         local_bh_disable();
774         ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
775         if (sk)
776                 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
777                                    inet_twsk(sk)->tw_mark : sk->sk_mark;
778         ip_send_unicast_reply(ctl_sk,
779                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
780                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
781                               &arg, arg.iov[0].iov_len);
782
783         ctl_sk->sk_mark = 0;
784         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
785         __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
786         local_bh_enable();
787
788 #ifdef CONFIG_TCP_MD5SIG
789 out:
790         rcu_read_unlock();
791 #endif
792 }
793
794 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
795    outside socket context, is certainly ugly. What can I do?
796  */
797
798 static void tcp_v4_send_ack(const struct sock *sk,
799                             struct sk_buff *skb, u32 seq, u32 ack,
800                             u32 win, u32 tsval, u32 tsecr, int oif,
801                             struct tcp_md5sig_key *key,
802                             int reply_flags, u8 tos)
803 {
804         const struct tcphdr *th = tcp_hdr(skb);
805         struct {
806                 struct tcphdr th;
807                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
808 #ifdef CONFIG_TCP_MD5SIG
809                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
810 #endif
811                         ];
812         } rep;
813         struct net *net = sock_net(sk);
814         struct ip_reply_arg arg;
815         struct sock *ctl_sk;
816
817         memset(&rep.th, 0, sizeof(struct tcphdr));
818         memset(&arg, 0, sizeof(arg));
819
820         arg.iov[0].iov_base = (unsigned char *)&rep;
821         arg.iov[0].iov_len  = sizeof(rep.th);
822         if (tsecr) {
823                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
824                                    (TCPOPT_TIMESTAMP << 8) |
825                                    TCPOLEN_TIMESTAMP);
826                 rep.opt[1] = htonl(tsval);
827                 rep.opt[2] = htonl(tsecr);
828                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
829         }
830
831         /* Swap the send and the receive. */
832         rep.th.dest    = th->source;
833         rep.th.source  = th->dest;
834         rep.th.doff    = arg.iov[0].iov_len / 4;
835         rep.th.seq     = htonl(seq);
836         rep.th.ack_seq = htonl(ack);
837         rep.th.ack     = 1;
838         rep.th.window  = htons(win);
839
840 #ifdef CONFIG_TCP_MD5SIG
841         if (key) {
842                 int offset = (tsecr) ? 3 : 0;
843
844                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
845                                           (TCPOPT_NOP << 16) |
846                                           (TCPOPT_MD5SIG << 8) |
847                                           TCPOLEN_MD5SIG);
848                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
849                 rep.th.doff = arg.iov[0].iov_len/4;
850
851                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
852                                     key, ip_hdr(skb)->saddr,
853                                     ip_hdr(skb)->daddr, &rep.th);
854         }
855 #endif
856         arg.flags = reply_flags;
857         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
858                                       ip_hdr(skb)->saddr, /* XXX */
859                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
860         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
861         if (oif)
862                 arg.bound_dev_if = oif;
863         arg.tos = tos;
864         arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
865         local_bh_disable();
866         ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
867         if (sk)
868                 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
869                                    inet_twsk(sk)->tw_mark : sk->sk_mark;
870         ip_send_unicast_reply(ctl_sk,
871                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
872                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
873                               &arg, arg.iov[0].iov_len);
874
875         ctl_sk->sk_mark = 0;
876         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
877         local_bh_enable();
878 }
879
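/* Send an ACK on behalf of a TIME-WAIT socket, e.g. in reply to a
 * retransmitted FIN or another segment that requires acknowledgement.
 */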
880 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
881 {
882         struct inet_timewait_sock *tw = inet_twsk(sk);
883         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
884
885         tcp_v4_send_ack(sk, skb,
886                         tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
887                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
888                         tcp_time_stamp_raw() + tcptw->tw_ts_offset,
889                         tcptw->tw_ts_recent,
890                         tw->tw_bound_dev_if,
891                         tcp_twsk_md5_key(tcptw),
892                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
893                         tw->tw_tos
894                         );
895
896         inet_twsk_put(tw);
897 }
898
899 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
900                                   struct request_sock *req)
901 {
902         /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
903          * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
904          */
905         u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
906                                              tcp_sk(sk)->snd_nxt;
907
908         /* RFC 7323 2.3
909          * The window field (SEG.WND) of every outgoing segment, with the
910          * exception of <SYN> segments, MUST be right-shifted by
911          * Rcv.Wind.Shift bits:
912          */
913         tcp_v4_send_ack(sk, skb, seq,
914                         tcp_rsk(req)->rcv_nxt,
915                         req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
916                         tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
917                         req->ts_recent,
918                         0,
919                         tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
920                                           AF_INET),
921                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
922                         ip_hdr(skb)->tos);
923 }
924
925 /*
926  *      Send a SYN-ACK after having received a SYN.
927  *      This still operates on a request_sock only, not on a big
928  *      socket.
929  */
930 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
931                               struct flowi *fl,
932                               struct request_sock *req,
933                               struct tcp_fastopen_cookie *foc,
934                               enum tcp_synack_type synack_type)
935 {
936         const struct inet_request_sock *ireq = inet_rsk(req);
937         struct flowi4 fl4;
938         int err = -1;
939         struct sk_buff *skb;
940
941         /* First, grab a route. */
942         if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
943                 return -1;
944
945         skb = tcp_make_synack(sk, dst, req, foc, synack_type);
946
947         if (skb) {
948                 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
949
950                 rcu_read_lock();
951                 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
952                                             ireq->ir_rmt_addr,
953                                             rcu_dereference(ireq->ireq_opt));
954                 rcu_read_unlock();
955                 err = net_xmit_eval(err);
956         }
957
958         return err;
959 }
960
961 /*
962  *      IPv4 request_sock destructor.
963  */
964 static void tcp_v4_reqsk_destructor(struct request_sock *req)
965 {
966         kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
967 }
968
969 #ifdef CONFIG_TCP_MD5SIG
970 /*
971  * RFC2385 MD5 checksumming requires a mapping of
972  * IP address->MD5 Key.
973  * We need to maintain these in the sk structure.
974  */
975
976 /* Find the Key structure for an address.  */
977 struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
978                                          const union tcp_md5_addr *addr,
979                                          int family)
980 {
981         const struct tcp_sock *tp = tcp_sk(sk);
982         struct tcp_md5sig_key *key;
983         const struct tcp_md5sig_info *md5sig;
984         __be32 mask;
985         struct tcp_md5sig_key *best_match = NULL;
986         bool match;
987
988         /* caller either holds rcu_read_lock() or socket lock */
989         md5sig = rcu_dereference_check(tp->md5sig_info,
990                                        lockdep_sock_is_held(sk));
991         if (!md5sig)
992                 return NULL;
993
994         hlist_for_each_entry_rcu(key, &md5sig->head, node) {
995                 if (key->family != family)
996                         continue;
997
998                 if (family == AF_INET) {
999                         mask = inet_make_mask(key->prefixlen);
1000                         match = (key->addr.a4.s_addr & mask) ==
1001                                 (addr->a4.s_addr & mask);
1002 #if IS_ENABLED(CONFIG_IPV6)
1003                 } else if (family == AF_INET6) {
1004                         match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1005                                                   key->prefixlen);
1006 #endif
1007                 } else {
1008                         match = false;
1009                 }
1010
1011                 if (match && (!best_match ||
1012                               key->prefixlen > best_match->prefixlen))
1013                         best_match = key;
1014         }
1015         return best_match;
1016 }
1017 EXPORT_SYMBOL(tcp_md5_do_lookup);
1018
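/* Like tcp_md5_do_lookup(), but require an exact address and prefix length
 * match; used when adding or deleting keys.
 */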
1019 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1020                                                       const union tcp_md5_addr *addr,
1021                                                       int family, u8 prefixlen)
1022 {
1023         const struct tcp_sock *tp = tcp_sk(sk);
1024         struct tcp_md5sig_key *key;
1025         unsigned int size = sizeof(struct in_addr);
1026         const struct tcp_md5sig_info *md5sig;
1027
1028         /* caller either holds rcu_read_lock() or socket lock */
1029         md5sig = rcu_dereference_check(tp->md5sig_info,
1030                                        lockdep_sock_is_held(sk));
1031         if (!md5sig)
1032                 return NULL;
1033 #if IS_ENABLED(CONFIG_IPV6)
1034         if (family == AF_INET6)
1035                 size = sizeof(struct in6_addr);
1036 #endif
1037         hlist_for_each_entry_rcu(key, &md5sig->head, node) {
1038                 if (key->family != family)
1039                         continue;
1040                 if (!memcmp(&key->addr, addr, size) &&
1041                     key->prefixlen == prefixlen)
1042                         return key;
1043         }
1044         return NULL;
1045 }
1046
1047 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1048                                          const struct sock *addr_sk)
1049 {
1050         const union tcp_md5_addr *addr;
1051
1052         addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1053         return tcp_md5_do_lookup(sk, addr, AF_INET);
1054 }
1055 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1056
1057 /* This can be called on a newly created socket, from other files */
1058 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1059                    int family, u8 prefixlen, const u8 *newkey, u8 newkeylen,
1060                    gfp_t gfp)
1061 {
1062         /* Add Key to the list */
1063         struct tcp_md5sig_key *key;
1064         struct tcp_sock *tp = tcp_sk(sk);
1065         struct tcp_md5sig_info *md5sig;
1066
1067         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1068         if (key) {
1069                 /* Pre-existing entry - just update that one.
1070                  * Note that the key might be used concurrently.
1071                  */
1072                 memcpy(key->key, newkey, newkeylen);
1073
1074                 /* Pairs with READ_ONCE() in tcp_md5_hash_key().
1075                  * Also note that a reader could catch new key->keylen value
1076                  * but old key->key[], this is the reason we use __GFP_ZERO
1077                  * at sock_kmalloc() time below these lines.
1078                  */
1079                 WRITE_ONCE(key->keylen, newkeylen);
1080
1081                 return 0;
1082         }
1083
1084         md5sig = rcu_dereference_protected(tp->md5sig_info,
1085                                            lockdep_sock_is_held(sk));
1086         if (!md5sig) {
1087                 md5sig = kmalloc(sizeof(*md5sig), gfp);
1088                 if (!md5sig)
1089                         return -ENOMEM;
1090
1091                 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1092                 INIT_HLIST_HEAD(&md5sig->head);
1093                 rcu_assign_pointer(tp->md5sig_info, md5sig);
1094         }
1095
1096         key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1097         if (!key)
1098                 return -ENOMEM;
1099         if (!tcp_alloc_md5sig_pool()) {
1100                 sock_kfree_s(sk, key, sizeof(*key));
1101                 return -ENOMEM;
1102         }
1103
1104         memcpy(key->key, newkey, newkeylen);
1105         key->keylen = newkeylen;
1106         key->family = family;
1107         key->prefixlen = prefixlen;
1108         memcpy(&key->addr, addr,
1109                (family == AF_INET6) ? sizeof(struct in6_addr) :
1110                                       sizeof(struct in_addr));
1111         hlist_add_head_rcu(&key->node, &md5sig->head);
1112         return 0;
1113 }
1114 EXPORT_SYMBOL(tcp_md5_do_add);
1115
1116 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1117                    u8 prefixlen)
1118 {
1119         struct tcp_md5sig_key *key;
1120
1121         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1122         if (!key)
1123                 return -ENOENT;
1124         hlist_del_rcu(&key->node);
1125         atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1126         kfree_rcu(key, rcu);
1127         return 0;
1128 }
1129 EXPORT_SYMBOL(tcp_md5_do_del);
1130
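/* Remove and free every MD5 key attached to the socket; called when the
 * socket is destroyed.
 */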
1131 static void tcp_clear_md5_list(struct sock *sk)
1132 {
1133         struct tcp_sock *tp = tcp_sk(sk);
1134         struct tcp_md5sig_key *key;
1135         struct hlist_node *n;
1136         struct tcp_md5sig_info *md5sig;
1137
1138         md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1139
1140         hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1141                 hlist_del_rcu(&key->node);
1142                 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1143                 kfree_rcu(key, rcu);
1144         }
1145 }
1146
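/* Handle the TCP_MD5SIG and TCP_MD5SIG_EXT setsockopt()s for IPv4 sockets.
 * A minimal userspace sketch (illustration only; peer_addr, key, keylen and
 * fd are placeholders, the fields come from struct tcp_md5sig in
 * <linux/tcp.h>):
 *
 *	struct tcp_md5sig md5 = { };
 *	struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	sin->sin_family = AF_INET;
 *	sin->sin_addr.s_addr = peer_addr;       (address the key applies to)
 *	md5.tcpm_keylen = keylen;               (at most TCP_MD5SIG_MAXKEYLEN)
 *	memcpy(md5.tcpm_key, key, keylen);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 */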
1147 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1148                                  char __user *optval, int optlen)
1149 {
1150         struct tcp_md5sig cmd;
1151         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1152         u8 prefixlen = 32;
1153
1154         if (optlen < sizeof(cmd))
1155                 return -EINVAL;
1156
1157         if (copy_from_user(&cmd, optval, sizeof(cmd)))
1158                 return -EFAULT;
1159
1160         if (sin->sin_family != AF_INET)
1161                 return -EINVAL;
1162
1163         if (optname == TCP_MD5SIG_EXT &&
1164             cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1165                 prefixlen = cmd.tcpm_prefixlen;
1166                 if (prefixlen > 32)
1167                         return -EINVAL;
1168         }
1169
1170         if (!cmd.tcpm_keylen)
1171                 return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1172                                       AF_INET, prefixlen);
1173
1174         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1175                 return -EINVAL;
1176
1177         return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1178                               AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen,
1179                               GFP_KERNEL);
1180 }
1181
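/* Feed the TCP pseudo-header plus the TCP header (with its checksum field
 * zeroed) into the MD5 transform, as required by RFC 2385.
 */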
1182 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1183                                    __be32 daddr, __be32 saddr,
1184                                    const struct tcphdr *th, int nbytes)
1185 {
1186         struct tcp4_pseudohdr *bp;
1187         struct scatterlist sg;
1188         struct tcphdr *_th;
1189
1190         bp = hp->scratch;
1191         bp->saddr = saddr;
1192         bp->daddr = daddr;
1193         bp->pad = 0;
1194         bp->protocol = IPPROTO_TCP;
1195         bp->len = cpu_to_be16(nbytes);
1196
1197         _th = (struct tcphdr *)(bp + 1);
1198         memcpy(_th, th, sizeof(*th));
1199         _th->check = 0;
1200
1201         sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1202         ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1203                                 sizeof(*bp) + sizeof(*th));
1204         return crypto_ahash_update(hp->md5_req);
1205 }
1206
1207 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1208                                __be32 daddr, __be32 saddr, const struct tcphdr *th)
1209 {
1210         struct tcp_md5sig_pool *hp;
1211         struct ahash_request *req;
1212
1213         hp = tcp_get_md5sig_pool();
1214         if (!hp)
1215                 goto clear_hash_noput;
1216         req = hp->md5_req;
1217
1218         if (crypto_ahash_init(req))
1219                 goto clear_hash;
1220         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1221                 goto clear_hash;
1222         if (tcp_md5_hash_key(hp, key))
1223                 goto clear_hash;
1224         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1225         if (crypto_ahash_final(req))
1226                 goto clear_hash;
1227
1228         tcp_put_md5sig_pool();
1229         return 0;
1230
1231 clear_hash:
1232         tcp_put_md5sig_pool();
1233 clear_hash_noput:
1234         memset(md5_hash, 0, 16);
1235         return 1;
1236 }
1237
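/* Compute the RFC 2385 MD5 signature over the pseudo-header, TCP header and
 * payload of @skb using @key.  Addresses come from @sk when it is an
 * established/request socket, otherwise from the IP header of @skb.
 */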
1238 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1239                         const struct sock *sk,
1240                         const struct sk_buff *skb)
1241 {
1242         struct tcp_md5sig_pool *hp;
1243         struct ahash_request *req;
1244         const struct tcphdr *th = tcp_hdr(skb);
1245         __be32 saddr, daddr;
1246
1247         if (sk) { /* valid for establish/request sockets */
1248                 saddr = sk->sk_rcv_saddr;
1249                 daddr = sk->sk_daddr;
1250         } else {
1251                 const struct iphdr *iph = ip_hdr(skb);
1252                 saddr = iph->saddr;
1253                 daddr = iph->daddr;
1254         }
1255
1256         hp = tcp_get_md5sig_pool();
1257         if (!hp)
1258                 goto clear_hash_noput;
1259         req = hp->md5_req;
1260
1261         if (crypto_ahash_init(req))
1262                 goto clear_hash;
1263
1264         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1265                 goto clear_hash;
1266         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1267                 goto clear_hash;
1268         if (tcp_md5_hash_key(hp, key))
1269                 goto clear_hash;
1270         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1271         if (crypto_ahash_final(req))
1272                 goto clear_hash;
1273
1274         tcp_put_md5sig_pool();
1275         return 0;
1276
1277 clear_hash:
1278         tcp_put_md5sig_pool();
1279 clear_hash_noput:
1280         memset(md5_hash, 0, 16);
1281         return 1;
1282 }
1283 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1284
1285 #endif
1286
1287 /* Called with rcu_read_lock() */
1288 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1289                                     const struct sk_buff *skb)
1290 {
1291 #ifdef CONFIG_TCP_MD5SIG
1292         /*
1293          * This gets called for each TCP segment that arrives
1294          * so we want to be efficient.
1295          * We have 3 drop cases:
1296          * o No MD5 hash and one expected.
1297          * o MD5 hash and we're not expecting one.
1298          * o MD5 hash and it's wrong.
1299          */
1300         const __u8 *hash_location = NULL;
1301         struct tcp_md5sig_key *hash_expected;
1302         const struct iphdr *iph = ip_hdr(skb);
1303         const struct tcphdr *th = tcp_hdr(skb);
1304         int genhash;
1305         unsigned char newhash[16];
1306
1307         hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1308                                           AF_INET);
1309         hash_location = tcp_parse_md5sig_option(th);
1310
1311         /* We've parsed the options - do we have a hash? */
1312         if (!hash_expected && !hash_location)
1313                 return false;
1314
1315         if (hash_expected && !hash_location) {
1316                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1317                 return true;
1318         }
1319
1320         if (!hash_expected && hash_location) {
1321                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1322                 return true;
1323         }
1324
1325         /* Okay, so this is hash_expected and hash_location -
1326          * so we need to calculate the checksum.
1327          */
1328         genhash = tcp_v4_md5_hash_skb(newhash,
1329                                       hash_expected,
1330                                       NULL, skb);
1331
1332         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1333                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1334                 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1335                                      &iph->saddr, ntohs(th->source),
1336                                      &iph->daddr, ntohs(th->dest),
1337                                      genhash ? " tcp_v4_calc_md5_hash failed"
1338                                      : "");
1339                 return true;
1340         }
1341         return false;
1342 #endif
1343         return false;
1344 }
1345
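/* Fill in the IPv4-specific fields of a new request_sock from the incoming
 * SYN: the addresses and any IP options to echo back on the SYN-ACK.
 */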
1346 static void tcp_v4_init_req(struct request_sock *req,
1347                             const struct sock *sk_listener,
1348                             struct sk_buff *skb)
1349 {
1350         struct inet_request_sock *ireq = inet_rsk(req);
1351         struct net *net = sock_net(sk_listener);
1352
1353         sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1354         sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1355         RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1356 }
1357
1358 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1359                                           struct flowi *fl,
1360                                           const struct request_sock *req)
1361 {
1362         return inet_csk_route_req(sk, &fl->u.ip4, req);
1363 }
1364
1365 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1366         .family         =       PF_INET,
1367         .obj_size       =       sizeof(struct tcp_request_sock),
1368         .rtx_syn_ack    =       tcp_rtx_synack,
1369         .send_ack       =       tcp_v4_reqsk_send_ack,
1370         .destructor     =       tcp_v4_reqsk_destructor,
1371         .send_reset     =       tcp_v4_send_reset,
1372         .syn_ack_timeout =      tcp_syn_ack_timeout,
1373 };
1374
1375 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1376         .mss_clamp      =       TCP_MSS_DEFAULT,
1377 #ifdef CONFIG_TCP_MD5SIG
1378         .req_md5_lookup =       tcp_v4_md5_lookup,
1379         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1380 #endif
1381         .init_req       =       tcp_v4_init_req,
1382 #ifdef CONFIG_SYN_COOKIES
1383         .cookie_init_seq =      cookie_v4_init_sequence,
1384 #endif
1385         .route_req      =       tcp_v4_route_req,
1386         .init_seq       =       tcp_v4_init_seq,
1387         .init_ts_off    =       tcp_v4_init_ts_off,
1388         .send_synack    =       tcp_v4_send_synack,
1389 };
1390
1391 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1392 {
1393         /* Never answer to SYNs sent to broadcast or multicast */
1394         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1395                 goto drop;
1396
1397         return tcp_conn_request(&tcp_request_sock_ops,
1398                                 &tcp_request_sock_ipv4_ops, sk, skb);
1399
1400 drop:
1401         tcp_listendrop(sk);
1402         return 0;
1403 }
1404 EXPORT_SYMBOL(tcp_v4_conn_request);
1405
1406
1407 /*
1408  * The three-way handshake has completed - we got a valid synack -
1409  * now create the new socket.
1410  */
1411 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1412                                   struct request_sock *req,
1413                                   struct dst_entry *dst,
1414                                   struct request_sock *req_unhash,
1415                                   bool *own_req)
1416 {
1417         struct inet_request_sock *ireq;
1418         struct inet_sock *newinet;
1419         struct tcp_sock *newtp;
1420         struct sock *newsk;
1421 #ifdef CONFIG_TCP_MD5SIG
1422         struct tcp_md5sig_key *key;
1423 #endif
1424         struct ip_options_rcu *inet_opt;
1425
1426         if (sk_acceptq_is_full(sk))
1427                 goto exit_overflow;
1428
1429         newsk = tcp_create_openreq_child(sk, req, skb);
1430         if (!newsk)
1431                 goto exit_nonewsk;
1432
1433         newsk->sk_gso_type = SKB_GSO_TCPV4;
1434         inet_sk_rx_dst_set(newsk, skb);
1435
1436         newtp                 = tcp_sk(newsk);
1437         newinet               = inet_sk(newsk);
1438         ireq                  = inet_rsk(req);
1439         sk_daddr_set(newsk, ireq->ir_rmt_addr);
1440         sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1441         newsk->sk_bound_dev_if = ireq->ir_iif;
1442         newinet->inet_saddr   = ireq->ir_loc_addr;
1443         inet_opt              = rcu_dereference(ireq->ireq_opt);
1444         RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1445         newinet->mc_index     = inet_iif(skb);
1446         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1447         newinet->rcv_tos      = ip_hdr(skb)->tos;
1448         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1449         if (inet_opt)
1450                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1451         newinet->inet_id = prandom_u32();
1452
1453         if (!dst) {
1454                 dst = inet_csk_route_child_sock(sk, newsk, req);
1455                 if (!dst)
1456                         goto put_and_exit;
1457         } else {
1458                 /* syncookie case: see end of cookie_v4_check() */
1459         }
1460         sk_setup_caps(newsk, dst);
1461
1462         tcp_ca_openreq_child(newsk, dst);
1463
1464         tcp_sync_mss(newsk, dst_mtu(dst));
1465         newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1466
1467         tcp_initialize_rcv_mss(newsk);
1468
1469 #ifdef CONFIG_TCP_MD5SIG
1470         /* Copy over the MD5 key from the original socket */
1471         key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1472                                 AF_INET);
1473         if (key) {
1474                 /*
1475                  * We're using one, so create a matching key
1476                  * on the newsk structure. If we fail to get
1477                  * memory, then we end up not copying the key
1478                  * across. Shucks.
1479                  */
1480                 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1481                                AF_INET, 32, key->key, key->keylen, GFP_ATOMIC);
1482                 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1483         }
1484 #endif
1485
1486         if (__inet_inherit_port(sk, newsk) < 0)
1487                 goto put_and_exit;
1488         *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1489         if (likely(*own_req)) {
1490                 tcp_move_syn(newtp, req);
1491                 ireq->ireq_opt = NULL;
1492         } else {
1493                 newinet->inet_opt = NULL;
1494         }
1495         return newsk;
1496
1497 exit_overflow:
1498         NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1499 exit_nonewsk:
1500         dst_release(dst);
1501 exit:
1502         tcp_listendrop(sk);
1503         return NULL;
1504 put_and_exit:
1505         newinet->inet_opt = NULL;
1506         inet_csk_prepare_forced_close(newsk);
1507         tcp_done(newsk);
1508         goto exit;
1509 }
1510 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1511
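/* When syncookies are enabled, a non-SYN segment reaching a listener may be
 * the ACK that completes a cookie-validated handshake.  cookie_v4_check()
 * may hand back a freshly created child socket, the unchanged listener, or
 * NULL if the segment should be dropped.
 */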
1512 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1513 {
1514 #ifdef CONFIG_SYN_COOKIES
1515         const struct tcphdr *th = tcp_hdr(skb);
1516
1517         if (!th->syn)
1518                 sk = cookie_v4_check(sk, skb);
1519 #endif
1520         return sk;
1521 }
1522
1523 /* The socket must have its spinlock held when we get
1524  * here, unless it is a TCP_LISTEN socket.
1525  *
1526  * We have a potential double-lock case here, so even when
1527  * doing backlog processing we use the BH locking scheme.
1528  * This is because we cannot sleep with the original spinlock
1529  * held.
1530  */
1531 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1532 {
1533         struct sock *rsk;
1534
1535         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1536                 struct dst_entry *dst = sk->sk_rx_dst;
1537
1538                 sock_rps_save_rxhash(sk, skb);
1539                 sk_mark_napi_id(sk, skb);
1540                 if (dst) {
1541                         if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1542                             !dst->ops->check(dst, 0)) {
1543                                 dst_release(dst);
1544                                 sk->sk_rx_dst = NULL;
1545                         }
1546                 }
1547                 tcp_rcv_established(sk, skb);
1548                 return 0;
1549         }
1550
1551         if (tcp_checksum_complete(skb))
1552                 goto csum_err;
1553
1554         if (sk->sk_state == TCP_LISTEN) {
1555                 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1556
1557                 if (!nsk)
1558                         goto discard;
1559                 if (nsk != sk) {
1560                         if (tcp_child_process(sk, nsk, skb)) {
1561                                 rsk = nsk;
1562                                 goto reset;
1563                         }
1564                         return 0;
1565                 }
1566         } else
1567                 sock_rps_save_rxhash(sk, skb);
1568
1569         if (tcp_rcv_state_process(sk, skb)) {
1570                 rsk = sk;
1571                 goto reset;
1572         }
1573         return 0;
1574
1575 reset:
1576         tcp_v4_send_reset(rsk, skb);
1577 discard:
1578         kfree_skb(skb);
1579         /* Be careful here. If this function gets more complicated and
1580          * gcc suffers from register pressure on the x86, sk (in %ebx)
1581          * might be destroyed here. This current version compiles correctly,
1582          * but you have been warned.
1583          */
1584         return 0;
1585
1586 csum_err:
1587         TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1588         TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1589         goto discard;
1590 }
1591 EXPORT_SYMBOL(tcp_v4_do_rcv);
1592
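/* Early demux: called from the IP receive path before routing.  If the
 * segment belongs to an established socket, attach that socket to the skb
 * and, when still valid for this interface, reuse its cached rx dst so the
 * routing lookup can be skipped.
 */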
1593 int tcp_v4_early_demux(struct sk_buff *skb)
1594 {
1595         const struct iphdr *iph;
1596         const struct tcphdr *th;
1597         struct sock *sk;
1598
1599         if (skb->pkt_type != PACKET_HOST)
1600                 return 0;
1601
1602         if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1603                 return 0;
1604
1605         iph = ip_hdr(skb);
1606         th = tcp_hdr(skb);
1607
1608         if (th->doff < sizeof(struct tcphdr) / 4)
1609                 return 0;
1610
1611         sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1612                                        iph->saddr, th->source,
1613                                        iph->daddr, ntohs(th->dest),
1614                                        skb->skb_iif, inet_sdif(skb));
1615         if (sk) {
1616                 skb->sk = sk;
1617                 skb->destructor = sock_edemux;
1618                 if (sk_fullsock(sk)) {
1619                         struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1620
1621                         if (dst)
1622                                 dst = dst_check(dst, 0);
1623                         if (dst &&
1624                             inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1625                                 skb_dst_set_noref(skb, dst);
1626                 }
1627         }
1628         return 0;
1629 }
1630
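/* Queue a segment on the socket backlog while the socket is owned by user
 * context.  Returns false if the skb was queued, or true if the backlog
 * limit was exceeded, in which case the socket is unlocked and the caller
 * is expected to drop the skb.
 */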
1631 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1632 {
1633         u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
1634
1635         /* Only the socket owner can try to collapse/prune rx queues
1636          * to reduce memory overhead, so add a little headroom here.
1637          * Only a few sockets' backlogs are likely to be non-empty at once.
1638          */
1639         limit += 64*1024;
1640
1641         /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1642          * we can fix skb->truesize to its real value to avoid future drops.
1643          * This is valid because skb is not yet charged to the socket.
1644          * It has been noticed that pure SACK packets were sometimes dropped
1645          * (if cooked by drivers without the copybreak feature).
1646          */
1647         skb_condense(skb);
1648
1649         if (unlikely(sk_add_backlog(sk, skb, limit))) {
1650                 bh_unlock_sock(sk);
1651                 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1652                 return true;
1653         }
1654         return false;
1655 }
1656 EXPORT_SYMBOL(tcp_add_backlog);
1657
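/* Run the socket filter (if any) on the segment, trimming it to no less
 * than the TCP header length.  A non-zero return means the segment must be
 * dropped.
 */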
1658 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1659 {
1660         struct tcphdr *th = (struct tcphdr *)skb->data;
1661
1662         return sk_filter_trim_cap(sk, skb, th->doff * 4);
1663 }
1664 EXPORT_SYMBOL(tcp_filter);
1665
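/* Undo tcp_v4_fill_cb(): move the IP control block back to its usual place
 * in skb->cb[] so the skb can go through the receive path again.
 */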
1666 static void tcp_v4_restore_cb(struct sk_buff *skb)
1667 {
1668         memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1669                 sizeof(struct inet_skb_parm));
1670 }
1671
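/* Set up TCP_SKB_CB() for the segment: the IP control block is tucked away
 * inside the TCP control block first, then the sequence numbers, flags and
 * related fields are filled in from the TCP header.
 */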
1672 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1673                            const struct tcphdr *th)
1674 {
1675         /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB();
1676          * barrier() makes sure the compiler won't play fool^Waliasing games.
1677          */
1678         memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1679                 sizeof(struct inet_skb_parm));
1680         barrier();
1681
1682         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1683         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1684                                     skb->len - th->doff * 4);
1685         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1686         TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1687         TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1688         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1689         TCP_SKB_CB(skb)->sacked  = 0;
1690         TCP_SKB_CB(skb)->has_rxtstamp =
1691                         skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1692 }
1693
1694 /*
1695  *      From tcp_input.c
1696  */
1697
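/* Main receive entry point for IPv4 TCP segments, called from the IP layer.
 * Validates the header and checksum, looks up the owning socket and either
 * processes the segment directly or defers it to the socket backlog.
 * Request socks and TIME_WAIT mini sockets are handled here as well.
 */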
1698 int tcp_v4_rcv(struct sk_buff *skb)
1699 {
1700         struct net *net = dev_net(skb->dev);
1701         int sdif = inet_sdif(skb);
1702         const struct iphdr *iph;
1703         const struct tcphdr *th;
1704         bool refcounted;
1705         struct sock *sk;
1706         int ret;
1707
1708         if (skb->pkt_type != PACKET_HOST)
1709                 goto discard_it;
1710
1711         /* Count it even if it's bad */
1712         __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1713
1714         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1715                 goto discard_it;
1716
1717         th = (const struct tcphdr *)skb->data;
1718
1719         if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1720                 goto bad_packet;
1721         if (!pskb_may_pull(skb, th->doff * 4))
1722                 goto discard_it;
1723
1724         /* An explanation is required here, I think.
1725          * Packet length and doff are validated by header prediction,
1726          * provided the case of th->doff==0 is eliminated.
1727          * So, we defer the checks. */
1728
1729         if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1730                 goto csum_error;
1731
1732         th = (const struct tcphdr *)skb->data;
1733         iph = ip_hdr(skb);
1734 lookup:
1735         sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1736                                th->dest, sdif, &refcounted);
1737         if (!sk)
1738                 goto no_tcp_socket;
1739
1740 process:
1741         if (sk->sk_state == TCP_TIME_WAIT)
1742                 goto do_time_wait;
1743
1744         if (sk->sk_state == TCP_NEW_SYN_RECV) {
1745                 struct request_sock *req = inet_reqsk(sk);
1746                 bool req_stolen = false;
1747                 struct sock *nsk;
1748
1749                 sk = req->rsk_listener;
1750                 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1751                         sk_drops_add(sk, skb);
1752                         reqsk_put(req);
1753                         goto discard_it;
1754                 }
1755                 if (tcp_checksum_complete(skb)) {
1756                         reqsk_put(req);
1757                         goto csum_error;
1758                 }
1759                 if (unlikely(sk->sk_state != TCP_LISTEN)) {
1760                         inet_csk_reqsk_queue_drop_and_put(sk, req);
1761                         goto lookup;
1762                 }
1763                 /* We own a reference on the listener, increase it again
1764                  * as we might lose it too soon.
1765                  */
1766                 sock_hold(sk);
1767                 refcounted = true;
1768                 nsk = NULL;
1769                 if (!tcp_filter(sk, skb)) {
1770                         th = (const struct tcphdr *)skb->data;
1771                         iph = ip_hdr(skb);
1772                         tcp_v4_fill_cb(skb, iph, th);
1773                         nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
1774                 }
1775                 if (!nsk) {
1776                         reqsk_put(req);
1777                         if (req_stolen) {
1778                                 /* Another cpu got exclusive access to req
1779                                  * and created a full-blown socket.
1780                                  * Try to feed this packet to this socket
1781                                  * instead of discarding it.
1782                                  */
1783                                 tcp_v4_restore_cb(skb);
1784                                 sock_put(sk);
1785                                 goto lookup;
1786                         }
1787                         goto discard_and_relse;
1788                 }
1789                 if (nsk == sk) {
1790                         reqsk_put(req);
1791                         tcp_v4_restore_cb(skb);
1792                 } else if (tcp_child_process(sk, nsk, skb)) {
1793                         tcp_v4_send_reset(nsk, skb);
1794                         goto discard_and_relse;
1795                 } else {
1796                         sock_put(sk);
1797                         return 0;
1798                 }
1799         }
1800         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1801                 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
1802                 goto discard_and_relse;
1803         }
1804
1805         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1806                 goto discard_and_relse;
1807
1808         if (tcp_v4_inbound_md5_hash(sk, skb))
1809                 goto discard_and_relse;
1810
1811         nf_reset(skb);
1812
1813         if (tcp_filter(sk, skb))
1814                 goto discard_and_relse;
1815         th = (const struct tcphdr *)skb->data;
1816         iph = ip_hdr(skb);
1817         tcp_v4_fill_cb(skb, iph, th);
1818
1819         skb->dev = NULL;
1820
1821         if (sk->sk_state == TCP_LISTEN) {
1822                 ret = tcp_v4_do_rcv(sk, skb);
1823                 goto put_and_return;
1824         }
1825
1826         sk_incoming_cpu_update(sk);
1827
1828         bh_lock_sock_nested(sk);
1829         tcp_segs_in(tcp_sk(sk), skb);
1830         ret = 0;
1831         if (!sock_owned_by_user(sk)) {
1832                 ret = tcp_v4_do_rcv(sk, skb);
1833         } else if (tcp_add_backlog(sk, skb)) {
1834                 goto discard_and_relse;
1835         }
1836         bh_unlock_sock(sk);
1837
1838 put_and_return:
1839         if (refcounted)
1840                 sock_put(sk);
1841
1842         return ret;
1843
1844 no_tcp_socket:
1845         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1846                 goto discard_it;
1847
1848         tcp_v4_fill_cb(skb, iph, th);
1849
1850         if (tcp_checksum_complete(skb)) {
1851 csum_error:
1852                 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
1853 bad_packet:
1854                 __TCP_INC_STATS(net, TCP_MIB_INERRS);
1855         } else {
1856                 tcp_v4_send_reset(NULL, skb);
1857         }
1858
1859 discard_it:
1860         /* Discard frame. */
1861         kfree_skb(skb);
1862         return 0;
1863
1864 discard_and_relse:
1865         sk_drops_add(sk, skb);
1866         if (refcounted)
1867                 sock_put(sk);
1868         goto discard_it;
1869
1870 do_time_wait:
1871         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1872                 inet_twsk_put(inet_twsk(sk));
1873                 goto discard_it;
1874         }
1875
1876         tcp_v4_fill_cb(skb, iph, th);
1877
1878         if (tcp_checksum_complete(skb)) {
1879                 inet_twsk_put(inet_twsk(sk));
1880                 goto csum_error;
1881         }
1882         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1883         case TCP_TW_SYN: {
1884                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1885                                                         &tcp_hashinfo, skb,
1886                                                         __tcp_hdrlen(th),
1887                                                         iph->saddr, th->source,
1888                                                         iph->daddr, th->dest,
1889                                                         inet_iif(skb),
1890                                                         sdif);
1891                 if (sk2) {
1892                         inet_twsk_deschedule_put(inet_twsk(sk));
1893                         sk = sk2;
1894                         tcp_v4_restore_cb(skb);
1895                         refcounted = false;
1896                         goto process;
1897                 }
1898         }
1899                 /* to ACK */
1900                 /* fall through */
1901         case TCP_TW_ACK:
1902                 tcp_v4_timewait_ack(sk, skb);
1903                 break;
1904         case TCP_TW_RST:
1905                 tcp_v4_send_reset(sk, skb);
1906                 inet_twsk_deschedule_put(inet_twsk(sk));
1907                 goto discard_it;
1908         case TCP_TW_SUCCESS:;
1909         }
1910         goto discard_it;
1911 }
1912
1913 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1914         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1915         .twsk_unique    = tcp_twsk_unique,
1916         .twsk_destructor= tcp_twsk_destructor,
1917 };
1918
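/* Cache the skb's dst (and the incoming interface) on the socket so that
 * subsequent segments on the established fast path can skip the routing
 * lookup.
 */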
1919 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1920 {
1921         struct dst_entry *dst = skb_dst(skb);
1922
1923         if (dst && dst_hold_safe(dst)) {
1924                 sk->sk_rx_dst = dst;
1925                 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1926         }
1927 }
1928 EXPORT_SYMBOL(inet_sk_rx_dst_set);
1929
1930 const struct inet_connection_sock_af_ops ipv4_specific = {
1931         .queue_xmit        = ip_queue_xmit,
1932         .send_check        = tcp_v4_send_check,
1933         .rebuild_header    = inet_sk_rebuild_header,
1934         .sk_rx_dst_set     = inet_sk_rx_dst_set,
1935         .conn_request      = tcp_v4_conn_request,
1936         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1937         .net_header_len    = sizeof(struct iphdr),
1938         .setsockopt        = ip_setsockopt,
1939         .getsockopt        = ip_getsockopt,
1940         .addr2sockaddr     = inet_csk_addr2sockaddr,
1941         .sockaddr_len      = sizeof(struct sockaddr_in),
1942 #ifdef CONFIG_COMPAT
1943         .compat_setsockopt = compat_ip_setsockopt,
1944         .compat_getsockopt = compat_ip_getsockopt,
1945 #endif
1946         .mtu_reduced       = tcp_v4_mtu_reduced,
1947 };
1948 EXPORT_SYMBOL(ipv4_specific);
1949
1950 #ifdef CONFIG_TCP_MD5SIG
1951 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1952         .md5_lookup             = tcp_v4_md5_lookup,
1953         .calc_md5_hash          = tcp_v4_md5_hash_skb,
1954         .md5_parse              = tcp_v4_parse_md5_keys,
1955 };
1956 #endif
1957
1958 /* NOTE: A lot of things are set to zero explicitly by the call to
1959  *       sk_alloc(), so they need not be done here.
1960  */
1961 static int tcp_v4_init_sock(struct sock *sk)
1962 {
1963         struct inet_connection_sock *icsk = inet_csk(sk);
1964
1965         tcp_init_sock(sk);
1966
1967         icsk->icsk_af_ops = &ipv4_specific;
1968
1969 #ifdef CONFIG_TCP_MD5SIG
1970         tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1971 #endif
1972
1973         return 0;
1974 }
1975
1976 void tcp_v4_destroy_sock(struct sock *sk)
1977 {
1978         struct tcp_sock *tp = tcp_sk(sk);
1979
1980         trace_tcp_destroy_sock(sk);
1981
1982         tcp_clear_xmit_timers(sk);
1983
1984         tcp_cleanup_congestion_control(sk);
1985
1986         tcp_cleanup_ulp(sk);
1987
1988         /* Clean up the write buffer. */
1989         tcp_write_queue_purge(sk);
1990
1991         /* Check if we want to disable active TFO */
1992         tcp_fastopen_active_disable_ofo_check(sk);
1993
1994         /* Cleans up our, hopefully empty, out_of_order_queue. */
1995         skb_rbtree_purge(&tp->out_of_order_queue);
1996
1997 #ifdef CONFIG_TCP_MD5SIG
1998         /* Clean up the MD5 key list, if any */
1999         if (tp->md5sig_info) {
2000                 tcp_clear_md5_list(sk);
2001                 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2002                 tp->md5sig_info = NULL;
2003         }
2004 #endif
2005
2006         /* Clean up a referenced TCP bind bucket. */
2007         if (inet_csk(sk)->icsk_bind_hash)
2008                 inet_put_port(sk);
2009
2010         BUG_ON(tp->fastopen_rsk);
2011
2012         /* If the socket was aborted during the connect operation */
2013         tcp_free_fastopen_req(tp);
2014         tcp_fastopen_destroy_cipher(sk);
2015         tcp_saved_syn_free(tp);
2016
2017         sk_sockets_allocated_dec(sk);
2018 }
2019 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2020
2021 #ifdef CONFIG_PROC_FS
2022 /* Proc filesystem TCP sock list dumping. */
2023
2024 /*
2025  * Get next listener socket following cur.  If cur is NULL, get first socket
2026  * starting from bucket given in st->bucket; when st->bucket is zero the
2027  * very first socket in the hash table is returned.
2028  */
2029 static void *listening_get_next(struct seq_file *seq, void *cur)
2030 {
2031         struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2032         struct tcp_iter_state *st = seq->private;
2033         struct net *net = seq_file_net(seq);
2034         struct inet_listen_hashbucket *ilb;
2035         struct hlist_nulls_node *node;
2036         struct sock *sk = cur;
2037
2038         if (!sk) {
2039 get_head:
2040                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2041                 spin_lock(&ilb->lock);
2042                 sk = sk_nulls_head(&ilb->nulls_head);
2043                 st->offset = 0;
2044                 goto get_sk;
2045         }
2046         ilb = &tcp_hashinfo.listening_hash[st->bucket];
2047         ++st->num;
2048         ++st->offset;
2049
2050         sk = sk_nulls_next(sk);
2051 get_sk:
2052         sk_nulls_for_each_from(sk, node) {
2053                 if (!net_eq(sock_net(sk), net))
2054                         continue;
2055                 if (sk->sk_family == afinfo->family)
2056                         return sk;
2057         }
2058         spin_unlock(&ilb->lock);
2059         st->offset = 0;
2060         if (++st->bucket < INET_LHTABLE_SIZE)
2061                 goto get_head;
2062         return NULL;
2063 }
2064
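/* Walk the listening hash from the first bucket and return the socket at
 * position *pos, or NULL if there are fewer listeners than that.
 */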
2065 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2066 {
2067         struct tcp_iter_state *st = seq->private;
2068         void *rc;
2069
2070         st->bucket = 0;
2071         st->offset = 0;
2072         rc = listening_get_next(seq, NULL);
2073
2074         while (rc && *pos) {
2075                 rc = listening_get_next(seq, rc);
2076                 --*pos;
2077         }
2078         return rc;
2079 }
2080
2081 static inline bool empty_bucket(const struct tcp_iter_state *st)
2082 {
2083         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2084 }
2085
2086 /*
2087  * Get first established socket starting from bucket given in st->bucket.
2088  * If st->bucket is zero, the very first socket in the hash is returned.
2089  */
2090 static void *established_get_first(struct seq_file *seq)
2091 {
2092         struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2093         struct tcp_iter_state *st = seq->private;
2094         struct net *net = seq_file_net(seq);
2095         void *rc = NULL;
2096
2097         st->offset = 0;
2098         for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2099                 struct sock *sk;
2100                 struct hlist_nulls_node *node;
2101                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2102
2103                 /* Lockless fast path for the common case of empty buckets */
2104                 if (empty_bucket(st))
2105                         continue;
2106
2107                 spin_lock_bh(lock);
2108                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2109                         if (sk->sk_family != afinfo->family ||
2110                             !net_eq(sock_net(sk), net)) {
2111                                 continue;
2112                         }
2113                         rc = sk;
2114                         goto out;
2115                 }
2116                 spin_unlock_bh(lock);
2117         }
2118 out:
2119         return rc;
2120 }
2121
2122 static void *established_get_next(struct seq_file *seq, void *cur)
2123 {
2124         struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2125         struct sock *sk = cur;
2126         struct hlist_nulls_node *node;
2127         struct tcp_iter_state *st = seq->private;
2128         struct net *net = seq_file_net(seq);
2129
2130         ++st->num;
2131         ++st->offset;
2132
2133         sk = sk_nulls_next(sk);
2134
2135         sk_nulls_for_each_from(sk, node) {
2136                 if (sk->sk_family == afinfo->family &&
2137                     net_eq(sock_net(sk), net))
2138                         return sk;
2139         }
2140
2141         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2142         ++st->bucket;
2143         return established_get_first(seq);
2144 }
2145
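/* Same as listening_get_idx(), but for the established/TIME_WAIT hash:
 * return the socket at position pos starting from the first bucket.
 */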
2146 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2147 {
2148         struct tcp_iter_state *st = seq->private;
2149         void *rc;
2150
2151         st->bucket = 0;
2152         rc = established_get_first(seq);
2153
2154         while (rc && pos) {
2155                 rc = established_get_next(seq, rc);
2156                 --pos;
2157         }
2158         return rc;
2159 }
2160
2161 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2162 {
2163         void *rc;
2164         struct tcp_iter_state *st = seq->private;
2165
2166         st->state = TCP_SEQ_STATE_LISTENING;
2167         rc        = listening_get_idx(seq, &pos);
2168
2169         if (!rc) {
2170                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2171                 rc        = established_get_idx(seq, pos);
2172         }
2173
2174         return rc;
2175 }
2176
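/* Try to resume iteration at the bucket and in-bucket offset remembered in
 * the iterator state, so that consecutive reads of the seq file do not have
 * to rescan from the beginning.
 */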
2177 static void *tcp_seek_last_pos(struct seq_file *seq)
2178 {
2179         struct tcp_iter_state *st = seq->private;
2180         int bucket = st->bucket;
2181         int offset = st->offset;
2182         int orig_num = st->num;
2183         void *rc = NULL;
2184
2185         switch (st->state) {
2186         case TCP_SEQ_STATE_LISTENING:
2187                 if (st->bucket >= INET_LHTABLE_SIZE)
2188                         break;
2189                 st->state = TCP_SEQ_STATE_LISTENING;
2190                 rc = listening_get_next(seq, NULL);
2191                 while (offset-- && rc && bucket == st->bucket)
2192                         rc = listening_get_next(seq, rc);
2193                 if (rc)
2194                         break;
2195                 st->bucket = 0;
2196                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2197                 /* Fallthrough */
2198         case TCP_SEQ_STATE_ESTABLISHED:
2199                 if (st->bucket > tcp_hashinfo.ehash_mask)
2200                         break;
2201                 rc = established_get_first(seq);
2202                 while (offset-- && rc && bucket == st->bucket)
2203                         rc = established_get_next(seq, rc);
2204         }
2205
2206         st->num = orig_num;
2207
2208         return rc;
2209 }
2210
2211 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2212 {
2213         struct tcp_iter_state *st = seq->private;
2214         void *rc;
2215
2216         if (*pos && *pos == st->last_pos) {
2217                 rc = tcp_seek_last_pos(seq);
2218                 if (rc)
2219                         goto out;
2220         }
2221
2222         st->state = TCP_SEQ_STATE_LISTENING;
2223         st->num = 0;
2224         st->bucket = 0;
2225         st->offset = 0;
2226         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2227
2228 out:
2229         st->last_pos = *pos;
2230         return rc;
2231 }
2232 EXPORT_SYMBOL(tcp_seq_start);
2233
2234 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2235 {
2236         struct tcp_iter_state *st = seq->private;
2237         void *rc = NULL;
2238
2239         if (v == SEQ_START_TOKEN) {
2240                 rc = tcp_get_idx(seq, 0);
2241                 goto out;
2242         }
2243
2244         switch (st->state) {
2245         case TCP_SEQ_STATE_LISTENING:
2246                 rc = listening_get_next(seq, v);
2247                 if (!rc) {
2248                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2249                         st->bucket = 0;
2250                         st->offset = 0;
2251                         rc        = established_get_first(seq);
2252                 }
2253                 break;
2254         case TCP_SEQ_STATE_ESTABLISHED:
2255                 rc = established_get_next(seq, v);
2256                 break;
2257         }
2258 out:
2259         ++*pos;
2260         st->last_pos = *pos;
2261         return rc;
2262 }
2263 EXPORT_SYMBOL(tcp_seq_next);
2264
2265 void tcp_seq_stop(struct seq_file *seq, void *v)
2266 {
2267         struct tcp_iter_state *st = seq->private;
2268
2269         switch (st->state) {
2270         case TCP_SEQ_STATE_LISTENING:
2271                 if (v != SEQ_START_TOKEN)
2272                         spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2273                 break;
2274         case TCP_SEQ_STATE_ESTABLISHED:
2275                 if (v)
2276                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2277                 break;
2278         }
2279 }
2280 EXPORT_SYMBOL(tcp_seq_stop);
2281
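/* Format one line of /proc/net/tcp for a request sock (SYN_RECV state). */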
2282 static void get_openreq4(const struct request_sock *req,
2283                          struct seq_file *f, int i)
2284 {
2285         const struct inet_request_sock *ireq = inet_rsk(req);
2286         long delta = req->rsk_timer.expires - jiffies;
2287
2288         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2289                 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2290                 i,
2291                 ireq->ir_loc_addr,
2292                 ireq->ir_num,
2293                 ireq->ir_rmt_addr,
2294                 ntohs(ireq->ir_rmt_port),
2295                 TCP_SYN_RECV,
2296                 0, 0, /* could print option size, but that is af dependent. */
2297                 1,    /* timers active (only the expire timer) */
2298                 jiffies_delta_to_clock_t(delta),
2299                 req->num_timeout,
2300                 from_kuid_munged(seq_user_ns(f),
2301                                  sock_i_uid(req->rsk_listener)),
2302                 0,  /* non standard timer */
2303                 0, /* open_requests have no inode */
2304                 0,
2305                 req);
2306 }
2307
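/* Format one line of /proc/net/tcp for a full socket, including timer,
 * queue and congestion state.
 */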
2308 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2309 {
2310         int timer_active;
2311         unsigned long timer_expires;
2312         const struct tcp_sock *tp = tcp_sk(sk);
2313         const struct inet_connection_sock *icsk = inet_csk(sk);
2314         const struct inet_sock *inet = inet_sk(sk);
2315         const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2316         __be32 dest = inet->inet_daddr;
2317         __be32 src = inet->inet_rcv_saddr;
2318         __u16 destp = ntohs(inet->inet_dport);
2319         __u16 srcp = ntohs(inet->inet_sport);
2320         int rx_queue;
2321         int state;
2322
2323         if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2324             icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2325             icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2326                 timer_active    = 1;
2327                 timer_expires   = icsk->icsk_timeout;
2328         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2329                 timer_active    = 4;
2330                 timer_expires   = icsk->icsk_timeout;
2331         } else if (timer_pending(&sk->sk_timer)) {
2332                 timer_active    = 2;
2333                 timer_expires   = sk->sk_timer.expires;
2334         } else {
2335                 timer_active    = 0;
2336                 timer_expires = jiffies;
2337         }
2338
2339         state = inet_sk_state_load(sk);
2340         if (state == TCP_LISTEN)
2341                 rx_queue = sk->sk_ack_backlog;
2342         else
2343                 /* Because we don't lock the socket,
2344                  * we might find a transient negative value.
2345                  */
2346                 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2347                                       READ_ONCE(tp->copied_seq), 0);
2348
2349         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2350                         "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2351                 i, src, srcp, dest, destp, state,
2352                 READ_ONCE(tp->write_seq) - tp->snd_una,
2353                 rx_queue,
2354                 timer_active,
2355                 jiffies_delta_to_clock_t(timer_expires - jiffies),
2356                 icsk->icsk_retransmits,
2357                 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2358                 icsk->icsk_probes_out,
2359                 sock_i_ino(sk),
2360                 refcount_read(&sk->sk_refcnt), sk,
2361                 jiffies_to_clock_t(icsk->icsk_rto),
2362                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2363                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2364                 tp->snd_cwnd,
2365                 state == TCP_LISTEN ?
2366                     fastopenq->max_qlen :
2367                     (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2368 }
2369
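/* Format one line of /proc/net/tcp for a TIME_WAIT mini socket. */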
2370 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2371                                struct seq_file *f, int i)
2372 {
2373         long delta = tw->tw_timer.expires - jiffies;
2374         __be32 dest, src;
2375         __u16 destp, srcp;
2376
2377         dest  = tw->tw_daddr;
2378         src   = tw->tw_rcv_saddr;
2379         destp = ntohs(tw->tw_dport);
2380         srcp  = ntohs(tw->tw_sport);
2381
2382         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2383                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2384                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2385                 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2386                 refcount_read(&tw->tw_refcnt), tw);
2387 }
2388
2389 #define TMPSZ 150
2390
2391 static int tcp4_seq_show(struct seq_file *seq, void *v)
2392 {
2393         struct tcp_iter_state *st;
2394         struct sock *sk = v;
2395
2396         seq_setwidth(seq, TMPSZ - 1);
2397         if (v == SEQ_START_TOKEN) {
2398                 seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2399                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2400                            "inode");
2401                 goto out;
2402         }
2403         st = seq->private;
2404
2405         if (sk->sk_state == TCP_TIME_WAIT)
2406                 get_timewait4_sock(v, seq, st->num);
2407         else if (sk->sk_state == TCP_NEW_SYN_RECV)
2408                 get_openreq4(v, seq, st->num);
2409         else
2410                 get_tcp4_sock(v, seq, st->num);
2411 out:
2412         seq_pad(seq, '\n');
2413         return 0;
2414 }
2415
2416 static const struct seq_operations tcp4_seq_ops = {
2417         .show           = tcp4_seq_show,
2418         .start          = tcp_seq_start,
2419         .next           = tcp_seq_next,
2420         .stop           = tcp_seq_stop,
2421 };
2422
2423 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2424         .family         = AF_INET,
2425 };
2426
2427 static int __net_init tcp4_proc_init_net(struct net *net)
2428 {
2429         if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2430                         sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2431                 return -ENOMEM;
2432         return 0;
2433 }
2434
2435 static void __net_exit tcp4_proc_exit_net(struct net *net)
2436 {
2437         remove_proc_entry("tcp", net->proc_net);
2438 }
2439
2440 static struct pernet_operations tcp4_net_ops = {
2441         .init = tcp4_proc_init_net,
2442         .exit = tcp4_proc_exit_net,
2443 };
2444
2445 int __init tcp4_proc_init(void)
2446 {
2447         return register_pernet_subsys(&tcp4_net_ops);
2448 }
2449
2450 void tcp4_proc_exit(void)
2451 {
2452         unregister_pernet_subsys(&tcp4_net_ops);
2453 }
2454 #endif /* CONFIG_PROC_FS */
2455
2456 struct proto tcp_prot = {
2457         .name                   = "TCP",
2458         .owner                  = THIS_MODULE,
2459         .close                  = tcp_close,
2460         .pre_connect            = tcp_v4_pre_connect,
2461         .connect                = tcp_v4_connect,
2462         .disconnect             = tcp_disconnect,
2463         .accept                 = inet_csk_accept,
2464         .ioctl                  = tcp_ioctl,
2465         .init                   = tcp_v4_init_sock,
2466         .destroy                = tcp_v4_destroy_sock,
2467         .shutdown               = tcp_shutdown,
2468         .setsockopt             = tcp_setsockopt,
2469         .getsockopt             = tcp_getsockopt,
2470         .keepalive              = tcp_set_keepalive,
2471         .recvmsg                = tcp_recvmsg,
2472         .sendmsg                = tcp_sendmsg,
2473         .sendpage               = tcp_sendpage,
2474         .backlog_rcv            = tcp_v4_do_rcv,
2475         .release_cb             = tcp_release_cb,
2476         .hash                   = inet_hash,
2477         .unhash                 = inet_unhash,
2478         .get_port               = inet_csk_get_port,
2479         .enter_memory_pressure  = tcp_enter_memory_pressure,
2480         .leave_memory_pressure  = tcp_leave_memory_pressure,
2481         .stream_memory_free     = tcp_stream_memory_free,
2482         .sockets_allocated      = &tcp_sockets_allocated,
2483         .orphan_count           = &tcp_orphan_count,
2484         .memory_allocated       = &tcp_memory_allocated,
2485         .memory_pressure        = &tcp_memory_pressure,
2486         .sysctl_mem             = sysctl_tcp_mem,
2487         .sysctl_wmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_wmem),
2488         .sysctl_rmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_rmem),
2489         .max_header             = MAX_TCP_HEADER,
2490         .obj_size               = sizeof(struct tcp_sock),
2491         .slab_flags             = SLAB_TYPESAFE_BY_RCU,
2492         .twsk_prot              = &tcp_timewait_sock_ops,
2493         .rsk_prot               = &tcp_request_sock_ops,
2494         .h.hashinfo             = &tcp_hashinfo,
2495         .no_autobind            = true,
2496 #ifdef CONFIG_COMPAT
2497         .compat_setsockopt      = compat_tcp_setsockopt,
2498         .compat_getsockopt      = compat_tcp_getsockopt,
2499 #endif
2500         .diag_destroy           = tcp_abort,
2501 };
2502 EXPORT_SYMBOL(tcp_prot);
2503
2504 static void __net_exit tcp_sk_exit(struct net *net)
2505 {
2506         int cpu;
2507
2508         if (net->ipv4.tcp_congestion_control)
2509                 module_put(net->ipv4.tcp_congestion_control->owner);
2510
2511         for_each_possible_cpu(cpu)
2512                 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2513         free_percpu(net->ipv4.tcp_sk);
2514 }
2515
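/* Per-netns initialisation: create the per-cpu control sockets used for
 * sending RSTs and ACKs, and set the namespace's TCP sysctl defaults.
 */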
2516 static int __net_init tcp_sk_init(struct net *net)
2517 {
2518         int res, cpu, cnt;
2519
2520         net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2521         if (!net->ipv4.tcp_sk)
2522                 return -ENOMEM;
2523
2524         for_each_possible_cpu(cpu) {
2525                 struct sock *sk;
2526
2527                 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2528                                            IPPROTO_TCP, net);
2529                 if (res)
2530                         goto fail;
2531                 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2532
2533                 /* Please enforce IP_DF and IPID==0 for RST and
2534                  * ACK sent in SYN-RECV and TIME-WAIT state.
2535                  */
2536                 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
2537
2538                 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2539         }
2540
2541         net->ipv4.sysctl_tcp_ecn = 2;
2542         net->ipv4.sysctl_tcp_ecn_fallback = 1;
2543
2544         net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2545         net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
2546         net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2547         net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2548
2549         net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2550         net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2551         net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2552
2553         net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2554         net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2555         net->ipv4.sysctl_tcp_syncookies = 1;
2556         net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2557         net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2558         net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2559         net->ipv4.sysctl_tcp_orphan_retries = 0;
2560         net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2561         net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2562         net->ipv4.sysctl_tcp_tw_reuse = 2;
2563
2564         cnt = tcp_hashinfo.ehash_mask + 1;
2565         net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2;
2566         net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2567
2568         net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
2569         net->ipv4.sysctl_tcp_sack = 1;
2570         net->ipv4.sysctl_tcp_window_scaling = 1;
2571         net->ipv4.sysctl_tcp_timestamps = 1;
2572         net->ipv4.sysctl_tcp_early_retrans = 3;
2573         net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
2574         net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
2575         net->ipv4.sysctl_tcp_retrans_collapse = 1;
2576         net->ipv4.sysctl_tcp_max_reordering = 300;
2577         net->ipv4.sysctl_tcp_dsack = 1;
2578         net->ipv4.sysctl_tcp_app_win = 31;
2579         net->ipv4.sysctl_tcp_adv_win_scale = 1;
2580         net->ipv4.sysctl_tcp_frto = 2;
2581         net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
2582         /* This limits the percentage of the congestion window which we
2583          * will allow a single TSO frame to consume.  Building TSO frames
2584          * which are too large can cause TCP streams to be bursty.
2585          */
2586         net->ipv4.sysctl_tcp_tso_win_divisor = 3;
2587         /* Default TSQ limit of four TSO segments */
2588         net->ipv4.sysctl_tcp_limit_output_bytes = 262144;
2589         /* RFC 5961 challenge ACK rate limiting */
2590         net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
2591         net->ipv4.sysctl_tcp_min_tso_segs = 2;
2592         net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
2593         net->ipv4.sysctl_tcp_autocorking = 1;
2594         net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
2595         net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
2596         net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
2597         if (net != &init_net) {
2598                 memcpy(net->ipv4.sysctl_tcp_rmem,
2599                        init_net.ipv4.sysctl_tcp_rmem,
2600                        sizeof(init_net.ipv4.sysctl_tcp_rmem));
2601                 memcpy(net->ipv4.sysctl_tcp_wmem,
2602                        init_net.ipv4.sysctl_tcp_wmem,
2603                        sizeof(init_net.ipv4.sysctl_tcp_wmem));
2604         }
2605         net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
2606         net->ipv4.sysctl_tcp_comp_sack_nr = 44;
2607         net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
2608         spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
2609         net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
2610         atomic_set(&net->ipv4.tfo_active_disable_times, 0);
2611
2612         /* Reno is always built in */
2613         if (!net_eq(net, &init_net) &&
2614             try_module_get(init_net.ipv4.tcp_congestion_control->owner))
2615                 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
2616         else
2617                 net->ipv4.tcp_congestion_control = &tcp_reno;
2618
2619         return 0;
2620 fail:
2621         tcp_sk_exit(net);
2622
2623         return res;
2624 }
2625
2626 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2627 {
2628         struct net *net;
2629
2630         inet_twsk_purge(&tcp_hashinfo, AF_INET);
2631
2632         list_for_each_entry(net, net_exit_list, exit_list)
2633                 tcp_fastopen_ctx_destroy(net);
2634 }
2635
2636 static struct pernet_operations __net_initdata tcp_sk_ops = {
2637        .init       = tcp_sk_init,
2638        .exit       = tcp_sk_exit,
2639        .exit_batch = tcp_sk_exit_batch,
2640 };
2641
2642 void __init tcp_v4_init(void)
2643 {
2644         if (register_pernet_subsys(&tcp_sk_ops))
2645                 panic("Failed to create the TCP control socket.\n");
2646 }