1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  *              IPv4 specific functions
9  *
10  *
11  *              code split from:
12  *              linux/ipv4/tcp.c
13  *              linux/ipv4/tcp_input.c
14  *              linux/ipv4/tcp_output.c
15  *
16  *              See tcp.c for author information
17  *
18  *      This program is free software; you can redistribute it and/or
19  *      modify it under the terms of the GNU General Public License
20  *      as published by the Free Software Foundation; either version
21  *      2 of the License, or (at your option) any later version.
22  */
23
24 /*
25  * Changes:
26  *              David S. Miller :       New socket lookup architecture.
27  *                                      This code is dedicated to John Dyson.
28  *              David S. Miller :       Change semantics of established hash,
29  *                                      half is devoted to TIME_WAIT sockets
30  *                                      and the rest go in the other half.
31  *              Andi Kleen :            Add support for syncookies and fixed
32  *                                      some bugs: ip options weren't passed to
33  *                                      the TCP layer, missed a check for an
34  *                                      ACK bit.
35  *              Andi Kleen :            Implemented fast path mtu discovery.
36  *                                      Fixed many serious bugs in the
37  *                                      request_sock handling and moved
38  *                                      most of it into the af independent code.
39  *                                      Added tail drop and some other bugfixes.
40  *                                      Added new listen semantics.
41  *              Mike McLagan    :       Routing by source
42  *      Juan Jose Ciarlante:            ip_dynaddr bits
43  *              Andi Kleen:             various fixes.
44  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
45  *                                      coma.
46  *      Andi Kleen              :       Fix new listen.
47  *      Andi Kleen              :       Fix accept error reporting.
48  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
49  *      Alexey Kuznetsov                allows both IPv4 and IPv6 sockets to bind
50  *                                      a single port at the same time.
51  */
52
53 #define pr_fmt(fmt) "TCP: " fmt
54
55 #include <linux/bottom_half.h>
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64 #include <linux/slab.h>
65
66 #include <net/net_namespace.h>
67 #include <net/icmp.h>
68 #include <net/inet_hashtables.h>
69 #include <net/tcp.h>
70 #include <net/transp_v6.h>
71 #include <net/ipv6.h>
72 #include <net/inet_common.h>
73 #include <net/timewait_sock.h>
74 #include <net/xfrm.h>
75 #include <net/secure_seq.h>
76 #include <net/busy_poll.h>
77
78 #include <linux/inet.h>
79 #include <linux/ipv6.h>
80 #include <linux/stddef.h>
81 #include <linux/proc_fs.h>
82 #include <linux/seq_file.h>
83 #include <linux/inetdevice.h>
84
85 #include <crypto/hash.h>
86 #include <linux/scatterlist.h>
87
88 #include <trace/events/tcp.h>
89
90 #ifdef CONFIG_TCP_MD5SIG
91 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
92                                __be32 daddr, __be32 saddr, const struct tcphdr *th);
93 #endif
94
95 struct inet_hashinfo tcp_hashinfo;
96 EXPORT_SYMBOL(tcp_hashinfo);
97
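/* Pick the initial sequence number for a passively opened connection
 * from the addresses and ports of the incoming segment.
 */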
98 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
99 {
100         return secure_tcp_seq(ip_hdr(skb)->daddr,
101                               ip_hdr(skb)->saddr,
102                               tcp_hdr(skb)->dest,
103                               tcp_hdr(skb)->source);
104 }
105
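/* Per-connection timestamp offset, derived from the address pair. */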
106 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
107 {
108         return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
109 }
110
111 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
112 {
113         const struct inet_timewait_sock *tw = inet_twsk(sktw);
114         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
115         struct tcp_sock *tp = tcp_sk(sk);
116         int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
117
118         if (reuse == 2) {
119                 /* Still does not detect *everything* that goes through
120                  * lo, since we require a loopback src or dst address
121                  * or direct binding to 'lo' interface.
122                  */
123                 bool loopback = false;
124                 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
125                         loopback = true;
126 #if IS_ENABLED(CONFIG_IPV6)
127                 if (tw->tw_family == AF_INET6) {
128                         if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
129                             (ipv6_addr_v4mapped(&tw->tw_v6_daddr) &&
130                              (tw->tw_v6_daddr.s6_addr[12] == 127)) ||
131                             ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
132                             (ipv6_addr_v4mapped(&tw->tw_v6_rcv_saddr) &&
133                              (tw->tw_v6_rcv_saddr.s6_addr[12] == 127)))
134                                 loopback = true;
135                 } else
136 #endif
137                 {
138                         if (ipv4_is_loopback(tw->tw_daddr) ||
139                             ipv4_is_loopback(tw->tw_rcv_saddr))
140                                 loopback = true;
141                 }
142                 if (!loopback)
143                         reuse = 0;
144         }
145
146         /* With PAWS, it is safe from the viewpoint
147            of data integrity. Even without PAWS it is safe provided sequence
148            spaces do not overlap i.e. at data rates <= 80Mbit/sec.
149
150            Actually, the idea is close to VJ's one, only timestamp cache is
151            held not per host, but per port pair and TW bucket is used as state
152            holder.
153
154            If TW bucket has been already destroyed we fall back to VJ's scheme
155            and use initial timestamp retrieved from peer table.
156          */
157         if (tcptw->tw_ts_recent_stamp &&
158             (!twp || (reuse && time_after32(ktime_get_seconds(),
159                                             tcptw->tw_ts_recent_stamp)))) {
160                 /* In case of repair and re-using TIME-WAIT sockets we still
161                  * want to be sure that it is safe as above but honor the
162                  * sequence numbers and time stamps set as part of the repair
163                  * process.
164                  *
165                  * Without this check re-using a TIME-WAIT socket with TCP
166                  * repair would accumulate a -1 on the repair assigned
167                  * sequence number. The first time it is reused the sequence
168                  * is -1, the second time -2, etc. This fixes that issue
169                  * without appearing to create any others.
170                  */
171                 if (likely(!tp->repair)) {
172                         u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
173
174                         if (!seq)
175                                 seq = 1;
176                         WRITE_ONCE(tp->write_seq, seq);
177                         tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
178                         tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
179                 }
180                 sock_hold(sktw);
181                 return 1;
182         }
183
184         return 0;
185 }
186 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
187
188 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
189                               int addr_len)
190 {
191         /* This check is replicated from tcp_v4_connect() and intended to
192          * prevent the BPF program called below from accessing bytes that are
193          * outside the bound specified by the user in addr_len.
194          */
195         if (addr_len < sizeof(struct sockaddr_in))
196                 return -EINVAL;
197
198         sock_owned_by_me(sk);
199
200         return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
201 }
202
203 /* This will initiate an outgoing connection. */
204 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
205 {
206         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
207         struct inet_sock *inet = inet_sk(sk);
208         struct tcp_sock *tp = tcp_sk(sk);
209         __be16 orig_sport, orig_dport;
210         __be32 daddr, nexthop;
211         struct flowi4 *fl4;
212         struct rtable *rt;
213         int err;
214         struct ip_options_rcu *inet_opt;
215         struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
216
217         if (addr_len < sizeof(struct sockaddr_in))
218                 return -EINVAL;
219
220         if (usin->sin_family != AF_INET)
221                 return -EAFNOSUPPORT;
222
223         nexthop = daddr = usin->sin_addr.s_addr;
224         inet_opt = rcu_dereference_protected(inet->inet_opt,
225                                              lockdep_sock_is_held(sk));
226         if (inet_opt && inet_opt->opt.srr) {
227                 if (!daddr)
228                         return -EINVAL;
229                 nexthop = inet_opt->opt.faddr;
230         }
231
232         orig_sport = inet->inet_sport;
233         orig_dport = usin->sin_port;
234         fl4 = &inet->cork.fl.u.ip4;
235         rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
236                               RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
237                               IPPROTO_TCP,
238                               orig_sport, orig_dport, sk);
239         if (IS_ERR(rt)) {
240                 err = PTR_ERR(rt);
241                 if (err == -ENETUNREACH)
242                         IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
243                 return err;
244         }
245
246         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
247                 ip_rt_put(rt);
248                 return -ENETUNREACH;
249         }
250
251         if (!inet_opt || !inet_opt->opt.srr)
252                 daddr = fl4->daddr;
253
254         if (!inet->inet_saddr)
255                 inet->inet_saddr = fl4->saddr;
256         sk_rcv_saddr_set(sk, inet->inet_saddr);
257
258         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
259                 /* Reset inherited state */
260                 tp->rx_opt.ts_recent       = 0;
261                 tp->rx_opt.ts_recent_stamp = 0;
262                 if (likely(!tp->repair))
263                         WRITE_ONCE(tp->write_seq, 0);
264         }
265
266         inet->inet_dport = usin->sin_port;
267         sk_daddr_set(sk, daddr);
268
269         inet_csk(sk)->icsk_ext_hdr_len = 0;
270         if (inet_opt)
271                 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
272
273         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
274
275         /* Socket identity is still unknown (sport may be zero).
276          * However we set state to SYN-SENT and, without releasing the socket
277          * lock, select a source port, enter ourselves into the hash tables and
278          * complete initialization after this.
279          */
280         tcp_set_state(sk, TCP_SYN_SENT);
281         err = inet_hash_connect(tcp_death_row, sk);
282         if (err)
283                 goto failure;
284
285         sk_set_txhash(sk);
286
287         rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
288                                inet->inet_sport, inet->inet_dport, sk);
289         if (IS_ERR(rt)) {
290                 err = PTR_ERR(rt);
291                 rt = NULL;
292                 goto failure;
293         }
294         /* OK, now commit destination to socket.  */
295         sk->sk_gso_type = SKB_GSO_TCPV4;
296         sk_setup_caps(sk, &rt->dst);
297         rt = NULL;
298
299         if (likely(!tp->repair)) {
300                 if (!tp->write_seq)
301                         WRITE_ONCE(tp->write_seq,
302                                    secure_tcp_seq(inet->inet_saddr,
303                                                   inet->inet_daddr,
304                                                   inet->inet_sport,
305                                                   usin->sin_port));
306                 tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
307                                                  inet->inet_saddr,
308                                                  inet->inet_daddr);
309         }
310
311         inet->inet_id = prandom_u32();
312
313         if (tcp_fastopen_defer_connect(sk, &err))
314                 return err;
315         if (err)
316                 goto failure;
317
318         err = tcp_connect(sk);
319
320         if (err)
321                 goto failure;
322
323         return 0;
324
325 failure:
326         /*
327          * This unhashes the socket and releases the local port,
328          * if necessary.
329          */
330         tcp_set_state(sk, TCP_CLOSE);
331         ip_rt_put(rt);
332         sk->sk_route_caps = 0;
333         inet->inet_dport = 0;
334         return err;
335 }
336 EXPORT_SYMBOL(tcp_v4_connect);
337
338 /*
339  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
340  * It can be called through tcp_release_cb() if socket was owned by user
341  * at the time tcp_v4_err() was called to handle ICMP message.
342  */
343 void tcp_v4_mtu_reduced(struct sock *sk)
344 {
345         struct inet_sock *inet = inet_sk(sk);
346         struct dst_entry *dst;
347         u32 mtu;
348
349         if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
350                 return;
351         mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
352         dst = inet_csk_update_pmtu(sk, mtu);
353         if (!dst)
354                 return;
355
356         /* Something is about to go wrong... Remember the soft error
357          * in case this connection is not able to recover.
358          */
359         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
360                 sk->sk_err_soft = EMSGSIZE;
361
362         mtu = dst_mtu(dst);
363
364         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
365             ip_sk_accept_pmtu(sk) &&
366             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
367                 tcp_sync_mss(sk, mtu);
368
369                 /* Resend the TCP packet because it's
370                  * clear that the old packet has been
371                  * dropped. This is the new "fast" path mtu
372                  * discovery.
373                  */
374                 tcp_simple_retransmit(sk);
375         } /* else let the usual retransmit timer handle it */
376 }
377 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
378
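/* Hand an ICMP redirect to the routing layer, but only while the socket
 * still holds a valid cached route.
 */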
379 static void do_redirect(struct sk_buff *skb, struct sock *sk)
380 {
381         struct dst_entry *dst = __sk_dst_check(sk, 0);
382
383         if (dst)
384                 dst->ops->redirect(dst, sk, skb);
385 }
386
387
388 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
389 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
390 {
391         struct request_sock *req = inet_reqsk(sk);
392         struct net *net = sock_net(sk);
393
394         /* ICMPs are not backlogged, hence we cannot get
395          * an established socket here.
396          */
397         if (seq != tcp_rsk(req)->snt_isn) {
398                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
399         } else if (abort) {
400                 /*
401                  * Still in SYN_RECV, just remove it silently.
402                  * There is no good way to pass the error to the newly
403                  * created socket, and POSIX does not want network
404                  * errors returned from accept().
405                  */
406                 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
407                 tcp_listendrop(req->rsk_listener);
408         }
409         reqsk_put(req);
410 }
411 EXPORT_SYMBOL(tcp_req_err);
412
413 /*
414  * This routine is called by the ICMP module when it gets some
415  * sort of error condition.  If err < 0 then the socket should
416  * be closed and the error returned to the user.  If err > 0
417  * it's just the icmp type << 8 | icmp code.  After adjustment
418  * header points to the first 8 bytes of the tcp header.  We need
419  * to find the appropriate port.
420  *
421  * The locking strategy used here is very "optimistic". When
422  * someone else accesses the socket the ICMP is just dropped
423  * and for some paths there is no check at all.
424  * A more general error queue to queue errors for later handling
425  * is probably better.
426  *
427  */
428
429 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
430 {
431         const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
432         struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
433         struct inet_connection_sock *icsk;
434         struct tcp_sock *tp;
435         struct inet_sock *inet;
436         const int type = icmp_hdr(icmp_skb)->type;
437         const int code = icmp_hdr(icmp_skb)->code;
438         struct sock *sk;
439         struct sk_buff *skb;
440         struct request_sock *fastopen;
441         u32 seq, snd_una;
442         s32 remaining;
443         u32 delta_us;
444         int err;
445         struct net *net = dev_net(icmp_skb->dev);
446
447         sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
448                                        th->dest, iph->saddr, ntohs(th->source),
449                                        inet_iif(icmp_skb), 0);
450         if (!sk) {
451                 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
452                 return;
453         }
454         if (sk->sk_state == TCP_TIME_WAIT) {
455                 inet_twsk_put(inet_twsk(sk));
456                 return;
457         }
458         seq = ntohl(th->seq);
459         if (sk->sk_state == TCP_NEW_SYN_RECV)
460                 return tcp_req_err(sk, seq,
461                                   type == ICMP_PARAMETERPROB ||
462                                   type == ICMP_TIME_EXCEEDED ||
463                                   (type == ICMP_DEST_UNREACH &&
464                                    (code == ICMP_NET_UNREACH ||
465                                     code == ICMP_HOST_UNREACH)));
466
467         bh_lock_sock(sk);
468         /* If too many ICMPs get dropped on busy
469          * servers this needs to be solved differently.
470          * We do take care of PMTU discovery (RFC1191) special case :
471          * we can receive locally generated ICMP messages while socket is held.
472          */
473         if (sock_owned_by_user(sk)) {
474                 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
475                         __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
476         }
477         if (sk->sk_state == TCP_CLOSE)
478                 goto out;
479
480         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
481                 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
482                 goto out;
483         }
484
485         icsk = inet_csk(sk);
486         tp = tcp_sk(sk);
487         /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
488         fastopen = tp->fastopen_rsk;
489         snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
490         if (sk->sk_state != TCP_LISTEN &&
491             !between(seq, snd_una, tp->snd_nxt)) {
492                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
493                 goto out;
494         }
495
496         switch (type) {
497         case ICMP_REDIRECT:
498                 if (!sock_owned_by_user(sk))
499                         do_redirect(icmp_skb, sk);
500                 goto out;
501         case ICMP_SOURCE_QUENCH:
502                 /* Just silently ignore these. */
503                 goto out;
504         case ICMP_PARAMETERPROB:
505                 err = EPROTO;
506                 break;
507         case ICMP_DEST_UNREACH:
508                 if (code > NR_ICMP_UNREACH)
509                         goto out;
510
511                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
512                         /* We are not interested in TCP_LISTEN and open_requests
513                          * (SYN-ACKs sent out by Linux are always < 576 bytes so
514                          * they should go through unfragmented).
515                          */
516                         if (sk->sk_state == TCP_LISTEN)
517                                 goto out;
518
519                         WRITE_ONCE(tp->mtu_info, info);
520                         if (!sock_owned_by_user(sk)) {
521                                 tcp_v4_mtu_reduced(sk);
522                         } else {
523                                 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
524                                         sock_hold(sk);
525                         }
526                         goto out;
527                 }
528
529                 err = icmp_err_convert[code].errno;
530                 /* check if icmp_skb allows revert of backoff
531                  * (see draft-zimmermann-tcp-lcd) */
532                 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
533                         break;
534                 if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
535                     !icsk->icsk_backoff || fastopen)
536                         break;
537
538                 if (sock_owned_by_user(sk))
539                         break;
540
541                 skb = tcp_rtx_queue_head(sk);
542                 if (WARN_ON_ONCE(!skb))
543                         break;
544
545                 icsk->icsk_backoff--;
546                 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
547                                                TCP_TIMEOUT_INIT;
548                 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
549
550                 tcp_mstamp_refresh(tp);
551                 delta_us = (u32)(tp->tcp_mstamp - skb->skb_mstamp);
552                 remaining = icsk->icsk_rto -
553                             usecs_to_jiffies(delta_us);
554
555                 if (remaining > 0) {
556                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
557                                                   remaining, TCP_RTO_MAX);
558                 } else {
559                         /* RTO revert clocked out retransmission.
560                          * Will retransmit now */
561                         tcp_retransmit_timer(sk);
562                 }
563
564                 break;
565         case ICMP_TIME_EXCEEDED:
566                 err = EHOSTUNREACH;
567                 break;
568         default:
569                 goto out;
570         }
571
572         switch (sk->sk_state) {
573         case TCP_SYN_SENT:
574         case TCP_SYN_RECV:
575                 /* Only in fast or simultaneous open. If a fast open socket
576                  * is already accepted it is treated as a connected one below.
577                  */
578                 if (fastopen && !fastopen->sk)
579                         break;
580
581                 if (!sock_owned_by_user(sk)) {
582                         sk->sk_err = err;
583
584                         sk->sk_error_report(sk);
585
586                         tcp_done(sk);
587                 } else {
588                         sk->sk_err_soft = err;
589                 }
590                 goto out;
591         }
592
593         /* If we've already connected we will keep trying
594          * until we time out, or the user gives up.
595          *
596          * rfc1122 4.2.3.9 allows us to consider as hard errors
597          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
598          * but it is obsoleted by pmtu discovery).
599          *
600          * Note that in the modern internet, where routing is unreliable
601          * and broken firewalls sit in every dark corner, sending random
602          * errors ordered by their masters, even these two messages finally lose
603          * their original sense (even Linux sends invalid PORT_UNREACHs)
604          *
605          * Now we are in compliance with RFCs.
606          *                                                      --ANK (980905)
607          */
608
609         inet = inet_sk(sk);
610         if (!sock_owned_by_user(sk) && inet->recverr) {
611                 sk->sk_err = err;
612                 sk->sk_error_report(sk);
613         } else  { /* Only an error on timeout */
614                 sk->sk_err_soft = err;
615         }
616
617 out:
618         bh_unlock_sock(sk);
619         sock_put(sk);
620 }
621
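/* Set up the TCP checksum for later completion (e.g. by hardware offload):
 * th->check is seeded from the pseudo-header, and csum_start/csum_offset
 * point at the checksum field to be filled in.
 */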
622 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
623 {
624         struct tcphdr *th = tcp_hdr(skb);
625
626         th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
627         skb->csum_start = skb_transport_header(skb) - skb->head;
628         skb->csum_offset = offsetof(struct tcphdr, check);
629 }
630
631 /* This routine computes an IPv4 TCP checksum. */
632 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
633 {
634         const struct inet_sock *inet = inet_sk(sk);
635
636         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
637 }
638 EXPORT_SYMBOL(tcp_v4_send_check);
639
640 /*
641  *      This routine will send an RST to the other tcp.
642  *
643  *      Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
644  *                    for the reset.
645  *      Answer: if a packet caused an RST, it is not for a socket
646  *              existing in our system; if it is matched to a socket,
647  *              it is just a duplicate segment or a bug in the other side's TCP.
648  *              So we build the reply based only on parameters
649  *              that arrived with the segment.
650  *      Exception: precedence violation. We do not implement it in any case.
651  */
652
653 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
654 {
655         const struct tcphdr *th = tcp_hdr(skb);
656         struct {
657                 struct tcphdr th;
658 #ifdef CONFIG_TCP_MD5SIG
659                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
660 #endif
661         } rep;
662         struct ip_reply_arg arg;
663 #ifdef CONFIG_TCP_MD5SIG
664         struct tcp_md5sig_key *key = NULL;
665         const __u8 *hash_location = NULL;
666         unsigned char newhash[16];
667         int genhash;
668         struct sock *sk1 = NULL;
669 #endif
670         struct net *net;
671         struct sock *ctl_sk;
672
673         /* Never send a reset in response to a reset. */
674         if (th->rst)
675                 return;
676
677         /* If sk is not NULL, it means we did a successful lookup and the incoming
678          * route had to be correct. prequeue might have dropped our dst.
679          */
680         if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
681                 return;
682
683         /* Swap the send and the receive. */
684         memset(&rep, 0, sizeof(rep));
685         rep.th.dest   = th->source;
686         rep.th.source = th->dest;
687         rep.th.doff   = sizeof(struct tcphdr) / 4;
688         rep.th.rst    = 1;
689
690         if (th->ack) {
691                 rep.th.seq = th->ack_seq;
692         } else {
693                 rep.th.ack = 1;
694                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
695                                        skb->len - (th->doff << 2));
696         }
697
698         memset(&arg, 0, sizeof(arg));
699         arg.iov[0].iov_base = (unsigned char *)&rep;
700         arg.iov[0].iov_len  = sizeof(rep.th);
701
702         net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
703 #ifdef CONFIG_TCP_MD5SIG
704         rcu_read_lock();
705         hash_location = tcp_parse_md5sig_option(th);
706         if (sk && sk_fullsock(sk)) {
707                 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
708                                         &ip_hdr(skb)->saddr, AF_INET);
709         } else if (hash_location) {
710                 /*
711                  * The active side is lost. Try to find the listening socket through
712                  * the source port, and then find the md5 key through that socket.
713                  * We are not losing security here:
714                  * the incoming packet is checked against the md5 hash of the key we find,
715                  * and no RST is generated if the md5 hash doesn't match.
716                  */
717                 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
718                                              ip_hdr(skb)->saddr,
719                                              th->source, ip_hdr(skb)->daddr,
720                                              ntohs(th->source), inet_iif(skb),
721                                              tcp_v4_sdif(skb));
722                 /* don't send rst if it can't find key */
723                 if (!sk1)
724                         goto out;
725
726                 key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
727                                         &ip_hdr(skb)->saddr, AF_INET);
728                 if (!key)
729                         goto out;
730
731
732                 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
733                 if (genhash || memcmp(hash_location, newhash, 16) != 0)
734                         goto out;
735
736         }
737
738         if (key) {
739                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
740                                    (TCPOPT_NOP << 16) |
741                                    (TCPOPT_MD5SIG << 8) |
742                                    TCPOLEN_MD5SIG);
743                 /* Update length and the length the header thinks exists */
744                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
745                 rep.th.doff = arg.iov[0].iov_len / 4;
746
747                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
748                                      key, ip_hdr(skb)->saddr,
749                                      ip_hdr(skb)->daddr, &rep.th);
750         }
751 #endif
752         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
753                                       ip_hdr(skb)->saddr, /* XXX */
754                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
755         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
756         arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
757
758         /* When the socket is gone, all binding information is lost.
759          * Routing might fail in this case. No choice here: if we choose to force
760          * the input interface, we will misroute in case of an asymmetric route.
761          */
762         if (sk) {
763                 arg.bound_dev_if = sk->sk_bound_dev_if;
764                 if (sk_fullsock(sk))
765                         trace_tcp_send_reset(sk, skb);
766         }
767
768         BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
769                      offsetof(struct inet_timewait_sock, tw_bound_dev_if));
770
771         arg.tos = ip_hdr(skb)->tos;
772         arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
773         local_bh_disable();
774         ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
775         if (sk)
776                 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
777                                    inet_twsk(sk)->tw_mark : sk->sk_mark;
778         ip_send_unicast_reply(ctl_sk,
779                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
780                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
781                               &arg, arg.iov[0].iov_len);
782
783         ctl_sk->sk_mark = 0;
784         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
785         __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
786         local_bh_enable();
787
788 #ifdef CONFIG_TCP_MD5SIG
789 out:
790         rcu_read_unlock();
791 #endif
792 }
793
794 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
795    outside socket context, is certainly ugly. What can I do?
796  */
797
798 static void tcp_v4_send_ack(const struct sock *sk,
799                             struct sk_buff *skb, u32 seq, u32 ack,
800                             u32 win, u32 tsval, u32 tsecr, int oif,
801                             struct tcp_md5sig_key *key,
802                             int reply_flags, u8 tos)
803 {
804         const struct tcphdr *th = tcp_hdr(skb);
805         struct {
806                 struct tcphdr th;
807                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
808 #ifdef CONFIG_TCP_MD5SIG
809                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
810 #endif
811                         ];
812         } rep;
813         struct net *net = sock_net(sk);
814         struct ip_reply_arg arg;
815         struct sock *ctl_sk;
816
817         memset(&rep.th, 0, sizeof(struct tcphdr));
818         memset(&arg, 0, sizeof(arg));
819
820         arg.iov[0].iov_base = (unsigned char *)&rep;
821         arg.iov[0].iov_len  = sizeof(rep.th);
822         if (tsecr) {
823                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
824                                    (TCPOPT_TIMESTAMP << 8) |
825                                    TCPOLEN_TIMESTAMP);
826                 rep.opt[1] = htonl(tsval);
827                 rep.opt[2] = htonl(tsecr);
828                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
829         }
830
831         /* Swap the send and the receive. */
832         rep.th.dest    = th->source;
833         rep.th.source  = th->dest;
834         rep.th.doff    = arg.iov[0].iov_len / 4;
835         rep.th.seq     = htonl(seq);
836         rep.th.ack_seq = htonl(ack);
837         rep.th.ack     = 1;
838         rep.th.window  = htons(win);
839
840 #ifdef CONFIG_TCP_MD5SIG
841         if (key) {
842                 int offset = (tsecr) ? 3 : 0;
843
844                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
845                                           (TCPOPT_NOP << 16) |
846                                           (TCPOPT_MD5SIG << 8) |
847                                           TCPOLEN_MD5SIG);
848                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
849                 rep.th.doff = arg.iov[0].iov_len/4;
850
851                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
852                                     key, ip_hdr(skb)->saddr,
853                                     ip_hdr(skb)->daddr, &rep.th);
854         }
855 #endif
856         arg.flags = reply_flags;
857         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
858                                       ip_hdr(skb)->saddr, /* XXX */
859                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
860         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
861         if (oif)
862                 arg.bound_dev_if = oif;
863         arg.tos = tos;
864         arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
865         local_bh_disable();
866         ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
867         if (sk)
868                 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
869                                    inet_twsk(sk)->tw_mark : sk->sk_mark;
870         ip_send_unicast_reply(ctl_sk,
871                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
872                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
873                               &arg, arg.iov[0].iov_len);
874
875         ctl_sk->sk_mark = 0;
876         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
877         local_bh_enable();
878 }
879
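/* Answer a segment received for a TIME-WAIT socket with an ACK built from
 * the state remembered in the timewait sock.
 */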
880 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
881 {
882         struct inet_timewait_sock *tw = inet_twsk(sk);
883         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
884
885         tcp_v4_send_ack(sk, skb,
886                         tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
887                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
888                         tcp_time_stamp_raw() + tcptw->tw_ts_offset,
889                         tcptw->tw_ts_recent,
890                         tw->tw_bound_dev_if,
891                         tcp_twsk_md5_key(tcptw),
892                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
893                         tw->tw_tos
894                         );
895
896         inet_twsk_put(tw);
897 }
898
899 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
900                                   struct request_sock *req)
901 {
902         /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
903          * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
904          */
905         u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
906                                              tcp_sk(sk)->snd_nxt;
907
908         /* RFC 7323 2.3
909          * The window field (SEG.WND) of every outgoing segment, with the
910          * exception of <SYN> segments, MUST be right-shifted by
911          * Rcv.Wind.Shift bits:
912          */
913         tcp_v4_send_ack(sk, skb, seq,
914                         tcp_rsk(req)->rcv_nxt,
915                         req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
916                         tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
917                         req->ts_recent,
918                         0,
919                         tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
920                                           AF_INET),
921                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
922                         ip_hdr(skb)->tos);
923 }
924
925 /*
926  *      Send a SYN-ACK after having received a SYN.
927  *      This still operates on a request_sock only, not on a big
928  *      socket.
929  */
930 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
931                               struct flowi *fl,
932                               struct request_sock *req,
933                               struct tcp_fastopen_cookie *foc,
934                               enum tcp_synack_type synack_type)
935 {
936         const struct inet_request_sock *ireq = inet_rsk(req);
937         struct flowi4 fl4;
938         int err = -1;
939         struct sk_buff *skb;
940
941         /* First, grab a route. */
942         if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
943                 return -1;
944
945         skb = tcp_make_synack(sk, dst, req, foc, synack_type);
946
947         if (skb) {
948                 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
949
950                 rcu_read_lock();
951                 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
952                                             ireq->ir_rmt_addr,
953                                             rcu_dereference(ireq->ireq_opt));
954                 rcu_read_unlock();
955                 err = net_xmit_eval(err);
956         }
957
958         return err;
959 }
960
961 /*
962  *      IPv4 request_sock destructor.
963  */
964 static void tcp_v4_reqsk_destructor(struct request_sock *req)
965 {
966         kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
967 }
968
969 #ifdef CONFIG_TCP_MD5SIG
970 /*
971  * RFC2385 MD5 checksumming requires a mapping of
972  * IP address->MD5 Key.
973  * We need to maintain these in the sk structure.
974  */
975
976 /* Find the Key structure for an address.  */
977 struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
978                                          const union tcp_md5_addr *addr,
979                                          int family)
980 {
981         const struct tcp_sock *tp = tcp_sk(sk);
982         struct tcp_md5sig_key *key;
983         const struct tcp_md5sig_info *md5sig;
984         __be32 mask;
985         struct tcp_md5sig_key *best_match = NULL;
986         bool match;
987
988         /* caller either holds rcu_read_lock() or socket lock */
989         md5sig = rcu_dereference_check(tp->md5sig_info,
990                                        lockdep_sock_is_held(sk));
991         if (!md5sig)
992                 return NULL;
993
994         hlist_for_each_entry_rcu(key, &md5sig->head, node) {
995                 if (key->family != family)
996                         continue;
997
998                 if (family == AF_INET) {
999                         mask = inet_make_mask(key->prefixlen);
1000                         match = (key->addr.a4.s_addr & mask) ==
1001                                 (addr->a4.s_addr & mask);
1002 #if IS_ENABLED(CONFIG_IPV6)
1003                 } else if (family == AF_INET6) {
1004                         match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1005                                                   key->prefixlen);
1006 #endif
1007                 } else {
1008                         match = false;
1009                 }
1010
1011                 if (match && (!best_match ||
1012                               key->prefixlen > best_match->prefixlen))
1013                         best_match = key;
1014         }
1015         return best_match;
1016 }
1017 EXPORT_SYMBOL(tcp_md5_do_lookup);
1018
1019 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1020                                                       const union tcp_md5_addr *addr,
1021                                                       int family, u8 prefixlen)
1022 {
1023         const struct tcp_sock *tp = tcp_sk(sk);
1024         struct tcp_md5sig_key *key;
1025         unsigned int size = sizeof(struct in_addr);
1026         const struct tcp_md5sig_info *md5sig;
1027
1028         /* caller either holds rcu_read_lock() or socket lock */
1029         md5sig = rcu_dereference_check(tp->md5sig_info,
1030                                        lockdep_sock_is_held(sk));
1031         if (!md5sig)
1032                 return NULL;
1033 #if IS_ENABLED(CONFIG_IPV6)
1034         if (family == AF_INET6)
1035                 size = sizeof(struct in6_addr);
1036 #endif
1037         hlist_for_each_entry_rcu(key, &md5sig->head, node) {
1038                 if (key->family != family)
1039                         continue;
1040                 if (!memcmp(&key->addr, addr, size) &&
1041                     key->prefixlen == prefixlen)
1042                         return key;
1043         }
1044         return NULL;
1045 }
1046
1047 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1048                                          const struct sock *addr_sk)
1049 {
1050         const union tcp_md5_addr *addr;
1051
1052         addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1053         return tcp_md5_do_lookup(sk, addr, AF_INET);
1054 }
1055 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1056
1057 /* This can be called on a newly created socket, from other files */
1058 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1059                    int family, u8 prefixlen, const u8 *newkey, u8 newkeylen,
1060                    gfp_t gfp)
1061 {
1062         /* Add Key to the list */
1063         struct tcp_md5sig_key *key;
1064         struct tcp_sock *tp = tcp_sk(sk);
1065         struct tcp_md5sig_info *md5sig;
1066
1067         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1068         if (key) {
1069                 /* Pre-existing entry - just update that one.
1070                  * Note that the key might be used concurrently.
1071                  */
1072                 memcpy(key->key, newkey, newkeylen);
1073
1074                 /* Pairs with READ_ONCE() in tcp_md5_hash_key().
1075                  * Also note that a reader could catch new key->keylen value
1076                  * but old key->key[], this is the reason we use __GFP_ZERO
1077                  * at sock_kmalloc() time below these lines.
1078                  */
1079                 WRITE_ONCE(key->keylen, newkeylen);
1080
1081                 return 0;
1082         }
1083
1084         md5sig = rcu_dereference_protected(tp->md5sig_info,
1085                                            lockdep_sock_is_held(sk));
1086         if (!md5sig) {
1087                 md5sig = kmalloc(sizeof(*md5sig), gfp);
1088                 if (!md5sig)
1089                         return -ENOMEM;
1090
1091                 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1092                 INIT_HLIST_HEAD(&md5sig->head);
1093                 rcu_assign_pointer(tp->md5sig_info, md5sig);
1094         }
1095
1096         key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1097         if (!key)
1098                 return -ENOMEM;
1099         if (!tcp_alloc_md5sig_pool()) {
1100                 sock_kfree_s(sk, key, sizeof(*key));
1101                 return -ENOMEM;
1102         }
1103
1104         memcpy(key->key, newkey, newkeylen);
1105         key->keylen = newkeylen;
1106         key->family = family;
1107         key->prefixlen = prefixlen;
1108         memcpy(&key->addr, addr,
1109                (family == AF_INET6) ? sizeof(struct in6_addr) :
1110                                       sizeof(struct in_addr));
1111         hlist_add_head_rcu(&key->node, &md5sig->head);
1112         return 0;
1113 }
1114 EXPORT_SYMBOL(tcp_md5_do_add);
1115
1116 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1117                    u8 prefixlen)
1118 {
1119         struct tcp_md5sig_key *key;
1120
1121         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1122         if (!key)
1123                 return -ENOENT;
1124         hlist_del_rcu(&key->node);
1125         atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1126         kfree_rcu(key, rcu);
1127         return 0;
1128 }
1129 EXPORT_SYMBOL(tcp_md5_do_del);
1130
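/* Drop every configured MD5 key. The rcu_dereference_protected(..., 1)
 * assumes there are no concurrent users, i.e. the socket is being torn down.
 */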
1131 static void tcp_clear_md5_list(struct sock *sk)
1132 {
1133         struct tcp_sock *tp = tcp_sk(sk);
1134         struct tcp_md5sig_key *key;
1135         struct hlist_node *n;
1136         struct tcp_md5sig_info *md5sig;
1137
1138         md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1139
1140         hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1141                 hlist_del_rcu(&key->node);
1142                 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1143                 kfree_rcu(key, rcu);
1144         }
1145 }
1146
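/* setsockopt(TCP_MD5SIG / TCP_MD5SIG_EXT) handler: copy the key description
 * from user space and add or delete the matching per-peer key.
 */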
1147 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1148                                  char __user *optval, int optlen)
1149 {
1150         struct tcp_md5sig cmd;
1151         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1152         u8 prefixlen = 32;
1153
1154         if (optlen < sizeof(cmd))
1155                 return -EINVAL;
1156
1157         if (copy_from_user(&cmd, optval, sizeof(cmd)))
1158                 return -EFAULT;
1159
1160         if (sin->sin_family != AF_INET)
1161                 return -EINVAL;
1162
1163         if (optname == TCP_MD5SIG_EXT &&
1164             cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1165                 prefixlen = cmd.tcpm_prefixlen;
1166                 if (prefixlen > 32)
1167                         return -EINVAL;
1168         }
1169
1170         if (!cmd.tcpm_keylen)
1171                 return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1172                                       AF_INET, prefixlen);
1173
1174         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1175                 return -EINVAL;
1176
1177         return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1178                               AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen,
1179                               GFP_KERNEL);
1180 }
1181
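/* Feed the IPv4 pseudo-header plus the TCP header (with its checksum field
 * zeroed) into the MD5 hash state.
 */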
1182 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1183                                    __be32 daddr, __be32 saddr,
1184                                    const struct tcphdr *th, int nbytes)
1185 {
1186         struct tcp4_pseudohdr *bp;
1187         struct scatterlist sg;
1188         struct tcphdr *_th;
1189
1190         bp = hp->scratch;
1191         bp->saddr = saddr;
1192         bp->daddr = daddr;
1193         bp->pad = 0;
1194         bp->protocol = IPPROTO_TCP;
1195         bp->len = cpu_to_be16(nbytes);
1196
1197         _th = (struct tcphdr *)(bp + 1);
1198         memcpy(_th, th, sizeof(*th));
1199         _th->check = 0;
1200
1201         sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1202         ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1203                                 sizeof(*bp) + sizeof(*th));
1204         return crypto_ahash_update(hp->md5_req);
1205 }
1206
1207 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1208                                __be32 daddr, __be32 saddr, const struct tcphdr *th)
1209 {
1210         struct tcp_md5sig_pool *hp;
1211         struct ahash_request *req;
1212
1213         hp = tcp_get_md5sig_pool();
1214         if (!hp)
1215                 goto clear_hash_noput;
1216         req = hp->md5_req;
1217
1218         if (crypto_ahash_init(req))
1219                 goto clear_hash;
1220         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1221                 goto clear_hash;
1222         if (tcp_md5_hash_key(hp, key))
1223                 goto clear_hash;
1224         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1225         if (crypto_ahash_final(req))
1226                 goto clear_hash;
1227
1228         tcp_put_md5sig_pool();
1229         return 0;
1230
1231 clear_hash:
1232         tcp_put_md5sig_pool();
1233 clear_hash_noput:
1234         memset(md5_hash, 0, 16);
1235         return 1;
1236 }
1237
1238 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1239                         const struct sock *sk,
1240                         const struct sk_buff *skb)
1241 {
1242         struct tcp_md5sig_pool *hp;
1243         struct ahash_request *req;
1244         const struct tcphdr *th = tcp_hdr(skb);
1245         __be32 saddr, daddr;
1246
1247         if (sk) { /* valid for establish/request sockets */
1248                 saddr = sk->sk_rcv_saddr;
1249                 daddr = sk->sk_daddr;
1250         } else {
1251                 const struct iphdr *iph = ip_hdr(skb);
1252                 saddr = iph->saddr;
1253                 daddr = iph->daddr;
1254         }
1255
1256         hp = tcp_get_md5sig_pool();
1257         if (!hp)
1258                 goto clear_hash_noput;
1259         req = hp->md5_req;
1260
1261         if (crypto_ahash_init(req))
1262                 goto clear_hash;
1263
1264         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1265                 goto clear_hash;
1266         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1267                 goto clear_hash;
1268         if (tcp_md5_hash_key(hp, key))
1269                 goto clear_hash;
1270         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1271         if (crypto_ahash_final(req))
1272                 goto clear_hash;
1273
1274         tcp_put_md5sig_pool();
1275         return 0;
1276
1277 clear_hash:
1278         tcp_put_md5sig_pool();
1279 clear_hash_noput:
1280         memset(md5_hash, 0, 16);
1281         return 1;
1282 }
1283 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1284
1285 #endif
1286
1287 /* Called with rcu_read_lock() */
1288 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1289                                     const struct sk_buff *skb)
1290 {
1291 #ifdef CONFIG_TCP_MD5SIG
1292         /*
1293          * This gets called for each TCP segment that arrives
1294          * so we want to be efficient.
1295          * We have 3 drop cases:
1296          * o No MD5 hash and one expected.
1297          * o MD5 hash and we're not expecting one.
1298          * o MD5 hash and it's wrong.
1299          */
1300         const __u8 *hash_location = NULL;
1301         struct tcp_md5sig_key *hash_expected;
1302         const struct iphdr *iph = ip_hdr(skb);
1303         const struct tcphdr *th = tcp_hdr(skb);
1304         int genhash;
1305         unsigned char newhash[16];
1306
1307         hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1308                                           AF_INET);
1309         hash_location = tcp_parse_md5sig_option(th);
1310
1311         /* We've parsed the options - do we have a hash? */
1312         if (!hash_expected && !hash_location)
1313                 return false;
1314
1315         if (hash_expected && !hash_location) {
1316                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1317                 return true;
1318         }
1319
1320         if (!hash_expected && hash_location) {
1321                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1322                 return true;
1323         }
1324
1325         /* Okay, so this is hash_expected and hash_location -
1326          * so we need to calculate the checksum.
1327          */
1328         genhash = tcp_v4_md5_hash_skb(newhash,
1329                                       hash_expected,
1330                                       NULL, skb);
1331
1332         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1333                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1334                 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1335                                      &iph->saddr, ntohs(th->source),
1336                                      &iph->daddr, ntohs(th->dest),
1337                                      genhash ? " tcp_v4_calc_md5_hash failed"
1338                                      : "");
1339                 return true;
1340         }
1341         return false;
1342 #endif
1343         return false;
1344 }
1345
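/* Fill in the IPv4-specific part of a freshly allocated request sock:
 * addresses are taken from the SYN (swapped) and any IP options are saved.
 */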
1346 static void tcp_v4_init_req(struct request_sock *req,
1347                             const struct sock *sk_listener,
1348                             struct sk_buff *skb)
1349 {
1350         struct inet_request_sock *ireq = inet_rsk(req);
1351         struct net *net = sock_net(sk_listener);
1352
1353         sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1354         sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1355         RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1356 }
1357
1358 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1359                                           struct flowi *fl,
1360                                           const struct request_sock *req)
1361 {
1362         return inet_csk_route_req(sk, &fl->u.ip4, req);
1363 }
1364
1365 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1366         .family         =       PF_INET,
1367         .obj_size       =       sizeof(struct tcp_request_sock),
1368         .rtx_syn_ack    =       tcp_rtx_synack,
1369         .send_ack       =       tcp_v4_reqsk_send_ack,
1370         .destructor     =       tcp_v4_reqsk_destructor,
1371         .send_reset     =       tcp_v4_send_reset,
1372         .syn_ack_timeout =      tcp_syn_ack_timeout,
1373 };
1374
1375 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1376         .mss_clamp      =       TCP_MSS_DEFAULT,
1377 #ifdef CONFIG_TCP_MD5SIG
1378         .req_md5_lookup =       tcp_v4_md5_lookup,
1379         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1380 #endif
1381         .init_req       =       tcp_v4_init_req,
1382 #ifdef CONFIG_SYN_COOKIES
1383         .cookie_init_seq =      cookie_v4_init_sequence,
1384 #endif
1385         .route_req      =       tcp_v4_route_req,
1386         .init_seq       =       tcp_v4_init_seq,
1387         .init_ts_off    =       tcp_v4_init_ts_off,
1388         .send_synack    =       tcp_v4_send_synack,
1389 };
1390
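/* Entry point for incoming SYN segments on an IPv4 listener.  SYNs sent to
 * broadcast or multicast addresses are never answered; everything else is
 * handed to the address-family independent tcp_conn_request().
 */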
1391 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1392 {
1393         /* Never answer SYNs sent to broadcast or multicast */
1394         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1395                 goto drop;
1396
1397         return tcp_conn_request(&tcp_request_sock_ops,
1398                                 &tcp_request_sock_ipv4_ops, sk, skb);
1399
1400 drop:
1401         tcp_listendrop(sk);
1402         return 0;
1403 }
1404 EXPORT_SYMBOL(tcp_v4_conn_request);
1405
1406
1407 /*
1408  * The three-way handshake has completed - we got a valid ACK -
1409  * now create the new socket.
1410  */
1411 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1412                                   struct request_sock *req,
1413                                   struct dst_entry *dst,
1414                                   struct request_sock *req_unhash,
1415                                   bool *own_req)
1416 {
1417         struct inet_request_sock *ireq;
1418         bool found_dup_sk = false;
1419         struct inet_sock *newinet;
1420         struct tcp_sock *newtp;
1421         struct sock *newsk;
1422 #ifdef CONFIG_TCP_MD5SIG
1423         struct tcp_md5sig_key *key;
1424 #endif
1425         struct ip_options_rcu *inet_opt;
1426
1427         if (sk_acceptq_is_full(sk))
1428                 goto exit_overflow;
1429
1430         newsk = tcp_create_openreq_child(sk, req, skb);
1431         if (!newsk)
1432                 goto exit_nonewsk;
1433
1434         newsk->sk_gso_type = SKB_GSO_TCPV4;
1435         inet_sk_rx_dst_set(newsk, skb);
1436
1437         newtp                 = tcp_sk(newsk);
1438         newinet               = inet_sk(newsk);
1439         ireq                  = inet_rsk(req);
1440         sk_daddr_set(newsk, ireq->ir_rmt_addr);
1441         sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1442         newsk->sk_bound_dev_if = ireq->ir_iif;
1443         newinet->inet_saddr   = ireq->ir_loc_addr;
1444         inet_opt              = rcu_dereference(ireq->ireq_opt);
1445         RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1446         newinet->mc_index     = inet_iif(skb);
1447         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1448         newinet->rcv_tos      = ip_hdr(skb)->tos;
1449         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1450         if (inet_opt)
1451                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1452         newinet->inet_id = prandom_u32();
1453
1454         if (!dst) {
1455                 dst = inet_csk_route_child_sock(sk, newsk, req);
1456                 if (!dst)
1457                         goto put_and_exit;
1458         } else {
1459                 /* syncookie case: see end of cookie_v4_check() */
1460         }
1461         sk_setup_caps(newsk, dst);
1462
1463         tcp_ca_openreq_child(newsk, dst);
1464
1465         tcp_sync_mss(newsk, dst_mtu(dst));
1466         newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1467
1468         tcp_initialize_rcv_mss(newsk);
1469
1470 #ifdef CONFIG_TCP_MD5SIG
1471         /* Copy over the MD5 key from the original socket */
1472         key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1473                                 AF_INET);
1474         if (key) {
1475                 /*
1476                  * We're using one, so create a matching key
1477                  * on the newsk structure. If we fail to get
1478                  * memory, then we end up not copying the key
1479                  * across. Shucks.
1480                  */
1481                 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1482                                AF_INET, 32, key->key, key->keylen, GFP_ATOMIC);
1483                 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1484         }
1485 #endif
1486
1487         if (__inet_inherit_port(sk, newsk) < 0)
1488                 goto put_and_exit;
1489         *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1490                                        &found_dup_sk);
1491         if (likely(*own_req)) {
1492                 tcp_move_syn(newtp, req);
1493                 ireq->ireq_opt = NULL;
1494         } else {
1495                 newinet->inet_opt = NULL;
1496
1497                 if (!req_unhash && found_dup_sk) {
1498                         /* This code path should only be executed in the
1499                          * syncookie case
1500                          */
1501                         bh_unlock_sock(newsk);
1502                         sock_put(newsk);
1503                         newsk = NULL;
1504                 }
1505         }
1506         return newsk;
1507
1508 exit_overflow:
1509         NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1510 exit_nonewsk:
1511         dst_release(dst);
1512 exit:
1513         tcp_listendrop(sk);
1514         return NULL;
1515 put_and_exit:
1516         newinet->inet_opt = NULL;
1517         inet_csk_prepare_forced_close(newsk);
1518         tcp_done(newsk);
1519         goto exit;
1520 }
1521 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1522
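/* With syncookies enabled, a bare ACK (no SYN bit) arriving at a listener may
 * complete a cookie handshake: cookie_v4_check() validates the cookie and, on
 * success, returns a newly created child socket (or NULL to drop the packet).
 * In all other cases the listener is returned unchanged.
 */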
1523 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1524 {
1525 #ifdef CONFIG_SYN_COOKIES
1526         const struct tcphdr *th = tcp_hdr(skb);
1527
1528         if (!th->syn)
1529                 sk = cookie_v4_check(sk, skb);
1530 #endif
1531         return sk;
1532 }
1533
1534 /* The socket must have its spinlock held when we get
1535  * here, unless it is a TCP_LISTEN socket.
1536  *
1537  * We have a potential double-lock case here, so even when
1538  * doing backlog processing we use the BH locking scheme.
1539  * This is because we cannot sleep with the original spinlock
1540  * held.
1541  */
1542 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1543 {
1544         struct sock *rsk;
1545
1546         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1547                 struct dst_entry *dst = sk->sk_rx_dst;
1548
1549                 sock_rps_save_rxhash(sk, skb);
1550                 sk_mark_napi_id(sk, skb);
1551                 if (dst) {
1552                         if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1553                             !dst->ops->check(dst, 0)) {
1554                                 dst_release(dst);
1555                                 sk->sk_rx_dst = NULL;
1556                         }
1557                 }
1558                 tcp_rcv_established(sk, skb);
1559                 return 0;
1560         }
1561
1562         if (tcp_checksum_complete(skb))
1563                 goto csum_err;
1564
1565         if (sk->sk_state == TCP_LISTEN) {
1566                 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1567
1568                 if (!nsk)
1569                         goto discard;
1570                 if (nsk != sk) {
1571                         if (tcp_child_process(sk, nsk, skb)) {
1572                                 rsk = nsk;
1573                                 goto reset;
1574                         }
1575                         return 0;
1576                 }
1577         } else
1578                 sock_rps_save_rxhash(sk, skb);
1579
1580         if (tcp_rcv_state_process(sk, skb)) {
1581                 rsk = sk;
1582                 goto reset;
1583         }
1584         return 0;
1585
1586 reset:
1587         tcp_v4_send_reset(rsk, skb);
1588 discard:
1589         kfree_skb(skb);
1590         /* Be careful here. If this function gets more complicated and
1591          * gcc suffers from register pressure on the x86, sk (in %ebx)
1592          * might be destroyed here. This current version compiles correctly,
1593          * but you have been warned.
1594          */
1595         return 0;
1596
1597 csum_err:
1598         TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1599         TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1600         goto discard;
1601 }
1602 EXPORT_SYMBOL(tcp_v4_do_rcv);
1603
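/* Early demux, called by the IP layer before the routing decision.  If the
 * segment belongs to an established socket, attach that socket to the skb and,
 * when the cached rx dst is still valid for the incoming interface, reuse it
 * so the routing lookup can be skipped.
 */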
1604 int tcp_v4_early_demux(struct sk_buff *skb)
1605 {
1606         const struct iphdr *iph;
1607         const struct tcphdr *th;
1608         struct sock *sk;
1609
1610         if (skb->pkt_type != PACKET_HOST)
1611                 return 0;
1612
1613         if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1614                 return 0;
1615
1616         iph = ip_hdr(skb);
1617         th = tcp_hdr(skb);
1618
1619         if (th->doff < sizeof(struct tcphdr) / 4)
1620                 return 0;
1621
1622         sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1623                                        iph->saddr, th->source,
1624                                        iph->daddr, ntohs(th->dest),
1625                                        skb->skb_iif, inet_sdif(skb));
1626         if (sk) {
1627                 skb->sk = sk;
1628                 skb->destructor = sock_edemux;
1629                 if (sk_fullsock(sk)) {
1630                         struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1631
1632                         if (dst)
1633                                 dst = dst_check(dst, 0);
1634                         if (dst &&
1635                             inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1636                                 skb_dst_set_noref(skb, dst);
1637                 }
1638         }
1639         return 0;
1640 }
1641
1642 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1643 {
1644         u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
1645
1646         /* Only socket owner can try to collapse/prune rx queues
1647          * to reduce memory overhead, so add a little headroom here.
1648          * Only a few socket backlogs are likely to be non-empty concurrently.
1649          */
1650         limit += 64*1024;
1651
1652         /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1653          * we can fix skb->truesize to its real value to avoid future drops.
1654          * This is valid because skb is not yet charged to the socket.
1655          * It has been noticed that pure SACK packets were sometimes dropped
1656          * (if cooked by drivers without copybreak feature).
1657          */
1658         skb_condense(skb);
1659
1660         if (unlikely(sk_add_backlog(sk, skb, limit))) {
1661                 bh_unlock_sock(sk);
1662                 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1663                 return true;
1664         }
1665         return false;
1666 }
1667 EXPORT_SYMBOL(tcp_add_backlog);
1668
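/* Run any attached socket filters on the segment.  sk_filter_trim_cap()
 * guarantees a filter cannot trim the skb below the TCP header
 * (th->doff * 4 bytes).
 */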
1669 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1670 {
1671         struct tcphdr *th = (struct tcphdr *)skb->data;
1672
1673         return sk_filter_trim_cap(sk, skb, th->doff * 4);
1674 }
1675 EXPORT_SYMBOL(tcp_filter);
1676
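/* tcp_v4_fill_cb() below overlays IPCB(skb) with TCP private state;
 * tcp_v4_restore_cb() copies the saved IP control block back when the skb
 * must be re-presented to another socket (e.g. after a fresh lookup).
 */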
1677 static void tcp_v4_restore_cb(struct sk_buff *skb)
1678 {
1679         memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1680                 sizeof(struct inet_skb_parm));
1681 }
1682
1683 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1684                            const struct tcphdr *th)
1685 {
1686         /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB();
1687          * barrier() makes sure the compiler won't play fool^Waliasing games.
1688          */
1689         memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1690                 sizeof(struct inet_skb_parm));
1691         barrier();
1692
1693         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1694         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1695                                     skb->len - th->doff * 4);
1696         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1697         TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1698         TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1699         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1700         TCP_SKB_CB(skb)->sacked  = 0;
1701         TCP_SKB_CB(skb)->has_rxtstamp =
1702                         skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1703 }
1704
1705 /*
1706  *      From tcp_input.c
1707  */
1708
1709 int tcp_v4_rcv(struct sk_buff *skb)
1710 {
1711         struct net *net = dev_net(skb->dev);
1712         int sdif = inet_sdif(skb);
1713         const struct iphdr *iph;
1714         const struct tcphdr *th;
1715         bool refcounted;
1716         struct sock *sk;
1717         int ret;
1718
1719         if (skb->pkt_type != PACKET_HOST)
1720                 goto discard_it;
1721
1722         /* Count it even if it's bad */
1723         __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1724
1725         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1726                 goto discard_it;
1727
1728         th = (const struct tcphdr *)skb->data;
1729
1730         if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1731                 goto bad_packet;
1732         if (!pskb_may_pull(skb, th->doff * 4))
1733                 goto discard_it;
1734
1735         /* An explanation is required here, I think.
1736          * Packet length and doff are validated by header prediction,
1737          * provided the case of th->doff == 0 is eliminated.
1738          * So, we defer the checks. */
1739
1740         if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1741                 goto csum_error;
1742
1743         th = (const struct tcphdr *)skb->data;
1744         iph = ip_hdr(skb);
1745 lookup:
1746         sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1747                                th->dest, sdif, &refcounted);
1748         if (!sk)
1749                 goto no_tcp_socket;
1750
1751 process:
1752         if (sk->sk_state == TCP_TIME_WAIT)
1753                 goto do_time_wait;
1754
1755         if (sk->sk_state == TCP_NEW_SYN_RECV) {
1756                 struct request_sock *req = inet_reqsk(sk);
1757                 bool req_stolen = false;
1758                 struct sock *nsk;
1759
1760                 sk = req->rsk_listener;
1761                 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1762                         sk_drops_add(sk, skb);
1763                         reqsk_put(req);
1764                         goto discard_it;
1765                 }
1766                 if (tcp_checksum_complete(skb)) {
1767                         reqsk_put(req);
1768                         goto csum_error;
1769                 }
1770                 if (unlikely(sk->sk_state != TCP_LISTEN)) {
1771                         inet_csk_reqsk_queue_drop_and_put(sk, req);
1772                         goto lookup;
1773                 }
1774                 /* We own a reference on the listener, increase it again
1775                  * as we might lose it too soon.
1776                  */
1777                 sock_hold(sk);
1778                 refcounted = true;
1779                 nsk = NULL;
1780                 if (!tcp_filter(sk, skb)) {
1781                         th = (const struct tcphdr *)skb->data;
1782                         iph = ip_hdr(skb);
1783                         tcp_v4_fill_cb(skb, iph, th);
1784                         nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
1785                 }
1786                 if (!nsk) {
1787                         reqsk_put(req);
1788                         if (req_stolen) {
1789                                 /* Another cpu got exclusive access to req
1790                                  * and created a full blown socket.
1791                                  * Try to feed this packet to this socket
1792                                  * instead of discarding it.
1793                                  */
1794                                 tcp_v4_restore_cb(skb);
1795                                 sock_put(sk);
1796                                 goto lookup;
1797                         }
1798                         goto discard_and_relse;
1799                 }
1800                 if (nsk == sk) {
1801                         reqsk_put(req);
1802                         tcp_v4_restore_cb(skb);
1803                 } else if (tcp_child_process(sk, nsk, skb)) {
1804                         tcp_v4_send_reset(nsk, skb);
1805                         goto discard_and_relse;
1806                 } else {
1807                         sock_put(sk);
1808                         return 0;
1809                 }
1810         }
1811         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1812                 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
1813                 goto discard_and_relse;
1814         }
1815
1816         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1817                 goto discard_and_relse;
1818
1819         if (tcp_v4_inbound_md5_hash(sk, skb))
1820                 goto discard_and_relse;
1821
1822         nf_reset(skb);
1823
1824         if (tcp_filter(sk, skb))
1825                 goto discard_and_relse;
1826         th = (const struct tcphdr *)skb->data;
1827         iph = ip_hdr(skb);
1828         tcp_v4_fill_cb(skb, iph, th);
1829
1830         skb->dev = NULL;
1831
1832         if (sk->sk_state == TCP_LISTEN) {
1833                 ret = tcp_v4_do_rcv(sk, skb);
1834                 goto put_and_return;
1835         }
1836
1837         sk_incoming_cpu_update(sk);
1838
1839         bh_lock_sock_nested(sk);
1840         tcp_segs_in(tcp_sk(sk), skb);
1841         ret = 0;
1842         if (!sock_owned_by_user(sk)) {
1843                 ret = tcp_v4_do_rcv(sk, skb);
1844         } else if (tcp_add_backlog(sk, skb)) {
1845                 goto discard_and_relse;
1846         }
1847         bh_unlock_sock(sk);
1848
1849 put_and_return:
1850         if (refcounted)
1851                 sock_put(sk);
1852
1853         return ret;
1854
1855 no_tcp_socket:
1856         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1857                 goto discard_it;
1858
1859         tcp_v4_fill_cb(skb, iph, th);
1860
1861         if (tcp_checksum_complete(skb)) {
1862 csum_error:
1863                 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
1864 bad_packet:
1865                 __TCP_INC_STATS(net, TCP_MIB_INERRS);
1866         } else {
1867                 tcp_v4_send_reset(NULL, skb);
1868         }
1869
1870 discard_it:
1871         /* Discard frame. */
1872         kfree_skb(skb);
1873         return 0;
1874
1875 discard_and_relse:
1876         sk_drops_add(sk, skb);
1877         if (refcounted)
1878                 sock_put(sk);
1879         goto discard_it;
1880
1881 do_time_wait:
1882         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1883                 inet_twsk_put(inet_twsk(sk));
1884                 goto discard_it;
1885         }
1886
1887         tcp_v4_fill_cb(skb, iph, th);
1888
1889         if (tcp_checksum_complete(skb)) {
1890                 inet_twsk_put(inet_twsk(sk));
1891                 goto csum_error;
1892         }
1893         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1894         case TCP_TW_SYN: {
1895                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1896                                                         &tcp_hashinfo, skb,
1897                                                         __tcp_hdrlen(th),
1898                                                         iph->saddr, th->source,
1899                                                         iph->daddr, th->dest,
1900                                                         inet_iif(skb),
1901                                                         sdif);
1902                 if (sk2) {
1903                         inet_twsk_deschedule_put(inet_twsk(sk));
1904                         sk = sk2;
1905                         tcp_v4_restore_cb(skb);
1906                         refcounted = false;
1907                         goto process;
1908                 }
1909         }
1910                 /* to ACK */
1911                 /* fall through */
1912         case TCP_TW_ACK:
1913                 tcp_v4_timewait_ack(sk, skb);
1914                 break;
1915         case TCP_TW_RST:
1916                 tcp_v4_send_reset(sk, skb);
1917                 inet_twsk_deschedule_put(inet_twsk(sk));
1918                 goto discard_it;
1919         case TCP_TW_SUCCESS:;
1920         }
1921         goto discard_it;
1922 }
1923
1924 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1925         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1926         .twsk_unique    = tcp_twsk_unique,
1927         .twsk_destructor= tcp_twsk_destructor,
1928 };
1929
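/* Cache the input route of this skb on the socket so the established fast
 * path in tcp_v4_do_rcv() and early demux can reuse it without a new route
 * lookup.
 */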
1930 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1931 {
1932         struct dst_entry *dst = skb_dst(skb);
1933
1934         if (dst && dst_hold_safe(dst)) {
1935                 sk->sk_rx_dst = dst;
1936                 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1937         }
1938 }
1939 EXPORT_SYMBOL(inet_sk_rx_dst_set);
1940
1941 const struct inet_connection_sock_af_ops ipv4_specific = {
1942         .queue_xmit        = ip_queue_xmit,
1943         .send_check        = tcp_v4_send_check,
1944         .rebuild_header    = inet_sk_rebuild_header,
1945         .sk_rx_dst_set     = inet_sk_rx_dst_set,
1946         .conn_request      = tcp_v4_conn_request,
1947         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1948         .net_header_len    = sizeof(struct iphdr),
1949         .setsockopt        = ip_setsockopt,
1950         .getsockopt        = ip_getsockopt,
1951         .addr2sockaddr     = inet_csk_addr2sockaddr,
1952         .sockaddr_len      = sizeof(struct sockaddr_in),
1953 #ifdef CONFIG_COMPAT
1954         .compat_setsockopt = compat_ip_setsockopt,
1955         .compat_getsockopt = compat_ip_getsockopt,
1956 #endif
1957         .mtu_reduced       = tcp_v4_mtu_reduced,
1958 };
1959 EXPORT_SYMBOL(ipv4_specific);
1960
1961 #ifdef CONFIG_TCP_MD5SIG
1962 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1963         .md5_lookup             = tcp_v4_md5_lookup,
1964         .calc_md5_hash          = tcp_v4_md5_hash_skb,
1965         .md5_parse              = tcp_v4_parse_md5_keys,
1966 };
1967 #endif
1968
1969 /* NOTE: A lot of things are set to zero explicitly by the call to
1970  *       sk_alloc(), so they need not be done here.
1971  */
1972 static int tcp_v4_init_sock(struct sock *sk)
1973 {
1974         struct inet_connection_sock *icsk = inet_csk(sk);
1975
1976         tcp_init_sock(sk);
1977
1978         icsk->icsk_af_ops = &ipv4_specific;
1979
1980 #ifdef CONFIG_TCP_MD5SIG
1981         tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1982 #endif
1983
1984         return 0;
1985 }
1986
1987 void tcp_v4_destroy_sock(struct sock *sk)
1988 {
1989         struct tcp_sock *tp = tcp_sk(sk);
1990
1991         trace_tcp_destroy_sock(sk);
1992
1993         tcp_clear_xmit_timers(sk);
1994
1995         tcp_cleanup_congestion_control(sk);
1996
1997         tcp_cleanup_ulp(sk);
1998
1999         /* Clean up the write buffer. */
2000         tcp_write_queue_purge(sk);
2001
2002         /* Check if we want to disable active TFO */
2003         tcp_fastopen_active_disable_ofo_check(sk);
2004
2005         /* Cleans up our, hopefully empty, out_of_order_queue. */
2006         skb_rbtree_purge(&tp->out_of_order_queue);
2007
2008 #ifdef CONFIG_TCP_MD5SIG
2009         /* Clean up the MD5 key list, if any */
2010         if (tp->md5sig_info) {
2011                 tcp_clear_md5_list(sk);
2012                 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2013                 tp->md5sig_info = NULL;
2014         }
2015 #endif
2016
2017         /* Clean up a referenced TCP bind bucket. */
2018         if (inet_csk(sk)->icsk_bind_hash)
2019                 inet_put_port(sk);
2020
2021         BUG_ON(tp->fastopen_rsk);
2022
2023         /* If socket is aborted during connect operation */
2024         tcp_free_fastopen_req(tp);
2025         tcp_fastopen_destroy_cipher(sk);
2026         tcp_saved_syn_free(tp);
2027
2028         sk_sockets_allocated_dec(sk);
2029 }
2030 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2031
2032 #ifdef CONFIG_PROC_FS
2033 /* Proc filesystem TCP sock list dumping. */
2034
2035 /*
2036  * Get the next listener socket following cur.  If cur is NULL, get the first
2037  * socket starting from the bucket given in st->bucket; when st->bucket is zero
2038  * the very first socket in the hash table is returned.
2039  */
2040 static void *listening_get_next(struct seq_file *seq, void *cur)
2041 {
2042         struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2043         struct tcp_iter_state *st = seq->private;
2044         struct net *net = seq_file_net(seq);
2045         struct inet_listen_hashbucket *ilb;
2046         struct hlist_nulls_node *node;
2047         struct sock *sk = cur;
2048
2049         if (!sk) {
2050 get_head:
2051                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2052                 spin_lock(&ilb->lock);
2053                 sk = sk_nulls_head(&ilb->nulls_head);
2054                 st->offset = 0;
2055                 goto get_sk;
2056         }
2057         ilb = &tcp_hashinfo.listening_hash[st->bucket];
2058         ++st->num;
2059         ++st->offset;
2060
2061         sk = sk_nulls_next(sk);
2062 get_sk:
2063         sk_nulls_for_each_from(sk, node) {
2064                 if (!net_eq(sock_net(sk), net))
2065                         continue;
2066                 if (sk->sk_family == afinfo->family)
2067                         return sk;
2068         }
2069         spin_unlock(&ilb->lock);
2070         st->offset = 0;
2071         if (++st->bucket < INET_LHTABLE_SIZE)
2072                 goto get_head;
2073         return NULL;
2074 }
2075
2076 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2077 {
2078         struct tcp_iter_state *st = seq->private;
2079         void *rc;
2080
2081         st->bucket = 0;
2082         st->offset = 0;
2083         rc = listening_get_next(seq, NULL);
2084
2085         while (rc && *pos) {
2086                 rc = listening_get_next(seq, rc);
2087                 --*pos;
2088         }
2089         return rc;
2090 }
2091
2092 static inline bool empty_bucket(const struct tcp_iter_state *st)
2093 {
2094         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2095 }
2096
2097 /*
2098  * Get the first established socket, starting from the bucket given in st->bucket.
2099  * If st->bucket is zero, the very first socket in the hash is returned.
2100  */
2101 static void *established_get_first(struct seq_file *seq)
2102 {
2103         struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2104         struct tcp_iter_state *st = seq->private;
2105         struct net *net = seq_file_net(seq);
2106         void *rc = NULL;
2107
2108         st->offset = 0;
2109         for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2110                 struct sock *sk;
2111                 struct hlist_nulls_node *node;
2112                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2113
2114                 /* Lockless fast path for the common case of empty buckets */
2115                 if (empty_bucket(st))
2116                         continue;
2117
2118                 spin_lock_bh(lock);
2119                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2120                         if (sk->sk_family != afinfo->family ||
2121                             !net_eq(sock_net(sk), net)) {
2122                                 continue;
2123                         }
2124                         rc = sk;
2125                         goto out;
2126                 }
2127                 spin_unlock_bh(lock);
2128         }
2129 out:
2130         return rc;
2131 }
2132
2133 static void *established_get_next(struct seq_file *seq, void *cur)
2134 {
2135         struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2136         struct sock *sk = cur;
2137         struct hlist_nulls_node *node;
2138         struct tcp_iter_state *st = seq->private;
2139         struct net *net = seq_file_net(seq);
2140
2141         ++st->num;
2142         ++st->offset;
2143
2144         sk = sk_nulls_next(sk);
2145
2146         sk_nulls_for_each_from(sk, node) {
2147                 if (sk->sk_family == afinfo->family &&
2148                     net_eq(sock_net(sk), net))
2149                         return sk;
2150         }
2151
2152         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2153         ++st->bucket;
2154         return established_get_first(seq);
2155 }
2156
2157 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2158 {
2159         struct tcp_iter_state *st = seq->private;
2160         void *rc;
2161
2162         st->bucket = 0;
2163         rc = established_get_first(seq);
2164
2165         while (rc && pos) {
2166                 rc = established_get_next(seq, rc);
2167                 --pos;
2168         }
2169         return rc;
2170 }
2171
2172 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2173 {
2174         void *rc;
2175         struct tcp_iter_state *st = seq->private;
2176
2177         st->state = TCP_SEQ_STATE_LISTENING;
2178         rc        = listening_get_idx(seq, &pos);
2179
2180         if (!rc) {
2181                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2182                 rc        = established_get_idx(seq, pos);
2183         }
2184
2185         return rc;
2186 }
2187
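/* Try to resume the iteration at the bucket/offset recorded by the previous
 * read instead of rescanning the hash tables from the start; st->num is
 * restored afterwards so the sequence numbering stays consistent.
 */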
2188 static void *tcp_seek_last_pos(struct seq_file *seq)
2189 {
2190         struct tcp_iter_state *st = seq->private;
2191         int bucket = st->bucket;
2192         int offset = st->offset;
2193         int orig_num = st->num;
2194         void *rc = NULL;
2195
2196         switch (st->state) {
2197         case TCP_SEQ_STATE_LISTENING:
2198                 if (st->bucket >= INET_LHTABLE_SIZE)
2199                         break;
2200                 st->state = TCP_SEQ_STATE_LISTENING;
2201                 rc = listening_get_next(seq, NULL);
2202                 while (offset-- && rc && bucket == st->bucket)
2203                         rc = listening_get_next(seq, rc);
2204                 if (rc)
2205                         break;
2206                 st->bucket = 0;
2207                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2208                 /* Fallthrough */
2209         case TCP_SEQ_STATE_ESTABLISHED:
2210                 if (st->bucket > tcp_hashinfo.ehash_mask)
2211                         break;
2212                 rc = established_get_first(seq);
2213                 while (offset-- && rc && bucket == st->bucket)
2214                         rc = established_get_next(seq, rc);
2215         }
2216
2217         st->num = orig_num;
2218
2219         return rc;
2220 }
2221
2222 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2223 {
2224         struct tcp_iter_state *st = seq->private;
2225         void *rc;
2226
2227         if (*pos && *pos == st->last_pos) {
2228                 rc = tcp_seek_last_pos(seq);
2229                 if (rc)
2230                         goto out;
2231         }
2232
2233         st->state = TCP_SEQ_STATE_LISTENING;
2234         st->num = 0;
2235         st->bucket = 0;
2236         st->offset = 0;
2237         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2238
2239 out:
2240         st->last_pos = *pos;
2241         return rc;
2242 }
2243 EXPORT_SYMBOL(tcp_seq_start);
2244
2245 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2246 {
2247         struct tcp_iter_state *st = seq->private;
2248         void *rc = NULL;
2249
2250         if (v == SEQ_START_TOKEN) {
2251                 rc = tcp_get_idx(seq, 0);
2252                 goto out;
2253         }
2254
2255         switch (st->state) {
2256         case TCP_SEQ_STATE_LISTENING:
2257                 rc = listening_get_next(seq, v);
2258                 if (!rc) {
2259                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2260                         st->bucket = 0;
2261                         st->offset = 0;
2262                         rc        = established_get_first(seq);
2263                 }
2264                 break;
2265         case TCP_SEQ_STATE_ESTABLISHED:
2266                 rc = established_get_next(seq, v);
2267                 break;
2268         }
2269 out:
2270         ++*pos;
2271         st->last_pos = *pos;
2272         return rc;
2273 }
2274 EXPORT_SYMBOL(tcp_seq_next);
2275
2276 void tcp_seq_stop(struct seq_file *seq, void *v)
2277 {
2278         struct tcp_iter_state *st = seq->private;
2279
2280         switch (st->state) {
2281         case TCP_SEQ_STATE_LISTENING:
2282                 if (v != SEQ_START_TOKEN)
2283                         spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2284                 break;
2285         case TCP_SEQ_STATE_ESTABLISHED:
2286                 if (v)
2287                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2288                 break;
2289         }
2290 }
2291 EXPORT_SYMBOL(tcp_seq_stop);
2292
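/* Format one /proc/net/tcp line for a request socket (SYN_RECV).
 * get_tcp4_sock() and get_timewait4_sock() below do the same for full
 * sockets and TIME-WAIT sockets respectively.
 */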
2293 static void get_openreq4(const struct request_sock *req,
2294                          struct seq_file *f, int i)
2295 {
2296         const struct inet_request_sock *ireq = inet_rsk(req);
2297         long delta = req->rsk_timer.expires - jiffies;
2298
2299         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2300                 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2301                 i,
2302                 ireq->ir_loc_addr,
2303                 ireq->ir_num,
2304                 ireq->ir_rmt_addr,
2305                 ntohs(ireq->ir_rmt_port),
2306                 TCP_SYN_RECV,
2307                 0, 0, /* could print option size, but that is af dependent. */
2308                 1,    /* timers active (only the expire timer) */
2309                 jiffies_delta_to_clock_t(delta),
2310                 req->num_timeout,
2311                 from_kuid_munged(seq_user_ns(f),
2312                                  sock_i_uid(req->rsk_listener)),
2313                 0,  /* non standard timer */
2314                 0, /* open_requests have no inode */
2315                 0,
2316                 req);
2317 }
2318
2319 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2320 {
2321         int timer_active;
2322         unsigned long timer_expires;
2323         const struct tcp_sock *tp = tcp_sk(sk);
2324         const struct inet_connection_sock *icsk = inet_csk(sk);
2325         const struct inet_sock *inet = inet_sk(sk);
2326         const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2327         __be32 dest = inet->inet_daddr;
2328         __be32 src = inet->inet_rcv_saddr;
2329         __u16 destp = ntohs(inet->inet_dport);
2330         __u16 srcp = ntohs(inet->inet_sport);
2331         int rx_queue;
2332         int state;
2333
2334         if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2335             icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2336             icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2337                 timer_active    = 1;
2338                 timer_expires   = icsk->icsk_timeout;
2339         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2340                 timer_active    = 4;
2341                 timer_expires   = icsk->icsk_timeout;
2342         } else if (timer_pending(&sk->sk_timer)) {
2343                 timer_active    = 2;
2344                 timer_expires   = sk->sk_timer.expires;
2345         } else {
2346                 timer_active    = 0;
2347                 timer_expires = jiffies;
2348         }
2349
2350         state = inet_sk_state_load(sk);
2351         if (state == TCP_LISTEN)
2352                 rx_queue = sk->sk_ack_backlog;
2353         else
2354                 /* Because we don't lock the socket,
2355                  * we might find a transient negative value.
2356                  */
2357                 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2358                                       READ_ONCE(tp->copied_seq), 0);
2359
2360         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2361                         "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2362                 i, src, srcp, dest, destp, state,
2363                 READ_ONCE(tp->write_seq) - tp->snd_una,
2364                 rx_queue,
2365                 timer_active,
2366                 jiffies_delta_to_clock_t(timer_expires - jiffies),
2367                 icsk->icsk_retransmits,
2368                 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2369                 icsk->icsk_probes_out,
2370                 sock_i_ino(sk),
2371                 refcount_read(&sk->sk_refcnt), sk,
2372                 jiffies_to_clock_t(icsk->icsk_rto),
2373                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2374                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2375                 tp->snd_cwnd,
2376                 state == TCP_LISTEN ?
2377                     fastopenq->max_qlen :
2378                     (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2379 }
2380
2381 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2382                                struct seq_file *f, int i)
2383 {
2384         long delta = tw->tw_timer.expires - jiffies;
2385         __be32 dest, src;
2386         __u16 destp, srcp;
2387
2388         dest  = tw->tw_daddr;
2389         src   = tw->tw_rcv_saddr;
2390         destp = ntohs(tw->tw_dport);
2391         srcp  = ntohs(tw->tw_sport);
2392
2393         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2394                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2395                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2396                 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2397                 refcount_read(&tw->tw_refcnt), tw);
2398 }
2399
2400 #define TMPSZ 150
2401
2402 static int tcp4_seq_show(struct seq_file *seq, void *v)
2403 {
2404         struct tcp_iter_state *st;
2405         struct sock *sk = v;
2406
2407         seq_setwidth(seq, TMPSZ - 1);
2408         if (v == SEQ_START_TOKEN) {
2409                 seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2410                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2411                            "inode");
2412                 goto out;
2413         }
2414         st = seq->private;
2415
2416         if (sk->sk_state == TCP_TIME_WAIT)
2417                 get_timewait4_sock(v, seq, st->num);
2418         else if (sk->sk_state == TCP_NEW_SYN_RECV)
2419                 get_openreq4(v, seq, st->num);
2420         else
2421                 get_tcp4_sock(v, seq, st->num);
2422 out:
2423         seq_pad(seq, '\n');
2424         return 0;
2425 }
2426
2427 static const struct seq_operations tcp4_seq_ops = {
2428         .show           = tcp4_seq_show,
2429         .start          = tcp_seq_start,
2430         .next           = tcp_seq_next,
2431         .stop           = tcp_seq_stop,
2432 };
2433
2434 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2435         .family         = AF_INET,
2436 };
2437
2438 static int __net_init tcp4_proc_init_net(struct net *net)
2439 {
2440         if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2441                         sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2442                 return -ENOMEM;
2443         return 0;
2444 }
2445
2446 static void __net_exit tcp4_proc_exit_net(struct net *net)
2447 {
2448         remove_proc_entry("tcp", net->proc_net);
2449 }
2450
2451 static struct pernet_operations tcp4_net_ops = {
2452         .init = tcp4_proc_init_net,
2453         .exit = tcp4_proc_exit_net,
2454 };
2455
2456 int __init tcp4_proc_init(void)
2457 {
2458         return register_pernet_subsys(&tcp4_net_ops);
2459 }
2460
2461 void tcp4_proc_exit(void)
2462 {
2463         unregister_pernet_subsys(&tcp4_net_ops);
2464 }
2465 #endif /* CONFIG_PROC_FS */
2466
2467 struct proto tcp_prot = {
2468         .name                   = "TCP",
2469         .owner                  = THIS_MODULE,
2470         .close                  = tcp_close,
2471         .pre_connect            = tcp_v4_pre_connect,
2472         .connect                = tcp_v4_connect,
2473         .disconnect             = tcp_disconnect,
2474         .accept                 = inet_csk_accept,
2475         .ioctl                  = tcp_ioctl,
2476         .init                   = tcp_v4_init_sock,
2477         .destroy                = tcp_v4_destroy_sock,
2478         .shutdown               = tcp_shutdown,
2479         .setsockopt             = tcp_setsockopt,
2480         .getsockopt             = tcp_getsockopt,
2481         .keepalive              = tcp_set_keepalive,
2482         .recvmsg                = tcp_recvmsg,
2483         .sendmsg                = tcp_sendmsg,
2484         .sendpage               = tcp_sendpage,
2485         .backlog_rcv            = tcp_v4_do_rcv,
2486         .release_cb             = tcp_release_cb,
2487         .hash                   = inet_hash,
2488         .unhash                 = inet_unhash,
2489         .get_port               = inet_csk_get_port,
2490         .enter_memory_pressure  = tcp_enter_memory_pressure,
2491         .leave_memory_pressure  = tcp_leave_memory_pressure,
2492         .stream_memory_free     = tcp_stream_memory_free,
2493         .sockets_allocated      = &tcp_sockets_allocated,
2494         .orphan_count           = &tcp_orphan_count,
2495         .memory_allocated       = &tcp_memory_allocated,
2496         .memory_pressure        = &tcp_memory_pressure,
2497         .sysctl_mem             = sysctl_tcp_mem,
2498         .sysctl_wmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_wmem),
2499         .sysctl_rmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_rmem),
2500         .max_header             = MAX_TCP_HEADER,
2501         .obj_size               = sizeof(struct tcp_sock),
2502         .slab_flags             = SLAB_TYPESAFE_BY_RCU,
2503         .twsk_prot              = &tcp_timewait_sock_ops,
2504         .rsk_prot               = &tcp_request_sock_ops,
2505         .h.hashinfo             = &tcp_hashinfo,
2506         .no_autobind            = true,
2507 #ifdef CONFIG_COMPAT
2508         .compat_setsockopt      = compat_tcp_setsockopt,
2509         .compat_getsockopt      = compat_tcp_getsockopt,
2510 #endif
2511         .diag_destroy           = tcp_abort,
2512 };
2513 EXPORT_SYMBOL(tcp_prot);
2514
2515 static void __net_exit tcp_sk_exit(struct net *net)
2516 {
2517         int cpu;
2518
2519         if (net->ipv4.tcp_congestion_control)
2520                 module_put(net->ipv4.tcp_congestion_control->owner);
2521
2522         for_each_possible_cpu(cpu)
2523                 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2524         free_percpu(net->ipv4.tcp_sk);
2525 }
2526
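/* Per-namespace initialisation: create one control socket per possible CPU
 * (used to send RSTs and ACKs on behalf of sockets we do not own) and set
 * the namespace's TCP sysctl defaults.
 */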
2527 static int __net_init tcp_sk_init(struct net *net)
2528 {
2529         int res, cpu, cnt;
2530
2531         net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2532         if (!net->ipv4.tcp_sk)
2533                 return -ENOMEM;
2534
2535         for_each_possible_cpu(cpu) {
2536                 struct sock *sk;
2537
2538                 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2539                                            IPPROTO_TCP, net);
2540                 if (res)
2541                         goto fail;
2542                 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2543
2544                 /* Please enforce IP_DF and IPID==0 for RST and
2545                  * ACK sent in SYN-RECV and TIME-WAIT state.
2546                  */
2547                 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
2548
2549                 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2550         }
2551
2552         net->ipv4.sysctl_tcp_ecn = 2;
2553         net->ipv4.sysctl_tcp_ecn_fallback = 1;
2554
2555         net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2556         net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
2557         net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2558         net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2559
2560         net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2561         net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2562         net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2563
2564         net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2565         net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2566         net->ipv4.sysctl_tcp_syncookies = 1;
2567         net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2568         net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2569         net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2570         net->ipv4.sysctl_tcp_orphan_retries = 0;
2571         net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2572         net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2573         net->ipv4.sysctl_tcp_tw_reuse = 2;
2574
2575         cnt = tcp_hashinfo.ehash_mask + 1;
2576         net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2;
2577         net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2578
2579         net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
2580         net->ipv4.sysctl_tcp_sack = 1;
2581         net->ipv4.sysctl_tcp_window_scaling = 1;
2582         net->ipv4.sysctl_tcp_timestamps = 1;
2583         net->ipv4.sysctl_tcp_early_retrans = 3;
2584         net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
2585         net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
2586         net->ipv4.sysctl_tcp_retrans_collapse = 1;
2587         net->ipv4.sysctl_tcp_max_reordering = 300;
2588         net->ipv4.sysctl_tcp_dsack = 1;
2589         net->ipv4.sysctl_tcp_app_win = 31;
2590         net->ipv4.sysctl_tcp_adv_win_scale = 1;
2591         net->ipv4.sysctl_tcp_frto = 2;
2592         net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
2593         /* This limits the percentage of the congestion window which we
2594          * will allow a single TSO frame to consume.  Building TSO frames
2595          * which are too large can cause TCP streams to be bursty.
2596          */
2597         net->ipv4.sysctl_tcp_tso_win_divisor = 3;
2598         /* Default TSQ limit of four TSO segments */
2599         net->ipv4.sysctl_tcp_limit_output_bytes = 262144;
2600         /* rfc5961 challenge ack rate limiting */
2601         net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
2602         net->ipv4.sysctl_tcp_min_tso_segs = 2;
2603         net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
2604         net->ipv4.sysctl_tcp_autocorking = 1;
2605         net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
2606         net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
2607         net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
2608         if (net != &init_net) {
2609                 memcpy(net->ipv4.sysctl_tcp_rmem,
2610                        init_net.ipv4.sysctl_tcp_rmem,
2611                        sizeof(init_net.ipv4.sysctl_tcp_rmem));
2612                 memcpy(net->ipv4.sysctl_tcp_wmem,
2613                        init_net.ipv4.sysctl_tcp_wmem,
2614                        sizeof(init_net.ipv4.sysctl_tcp_wmem));
2615         }
2616         net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
2617         net->ipv4.sysctl_tcp_comp_sack_nr = 44;
2618         net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
2619         spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
2620         net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
2621         atomic_set(&net->ipv4.tfo_active_disable_times, 0);
2622
2623         /* Reno is always built in */
2624         if (!net_eq(net, &init_net) &&
2625             try_module_get(init_net.ipv4.tcp_congestion_control->owner))
2626                 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
2627         else
2628                 net->ipv4.tcp_congestion_control = &tcp_reno;
2629
2630         return 0;
2631 fail:
2632         tcp_sk_exit(net);
2633
2634         return res;
2635 }
2636
2637 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2638 {
2639         struct net *net;
2640
2641         inet_twsk_purge(&tcp_hashinfo, AF_INET);
2642
2643         list_for_each_entry(net, net_exit_list, exit_list)
2644                 tcp_fastopen_ctx_destroy(net);
2645 }
2646
2647 static struct pernet_operations __net_initdata tcp_sk_ops = {
2648        .init       = tcp_sk_init,
2649        .exit       = tcp_sk_exit,
2650        .exit_batch = tcp_sk_exit_batch,
2651 };
2652
2653 void __init tcp_v4_init(void)
2654 {
2655         if (register_pernet_subsys(&tcp_sk_ops))
2656                 panic("Failed to create the TCP control socket.\n");
2657 }