/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 *		IPv4 specific functions
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*
 * Changes:
 *		David S. Miller	:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *		David S. Miller :	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *		Andi Kleen :		Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *		Andi Kleen :		Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					request_sock handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *		Mike McLagan	:	Routing by source
 *	Juan Jose Ciarlante:		ip_dynaddr bits
 *		Andi Kleen:		various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 */
#define pr_fmt(fmt) "TCP: " fmt

#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/secure_seq.h>
#include <net/busy_poll.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/inetdevice.h>

#include <crypto/hash.h>
#include <linux/scatterlist.h>

#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);
static u32 tcp_v4_init_seq(const struct sk_buff *skb)
{
	return secure_tcp_seq(ip_hdr(skb)->daddr,
			      ip_hdr(skb)->saddr,
			      tcp_hdr(skb)->dest,
			      tcp_hdr(skb)->source);
}

static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
{
	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
}
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.

	   Actually, the idea is close to VJ's one, only the timestamp cache is
	   held not per host, but per port pair, and the TW bucket is used as
	   state holder.

	   If the TW bucket has already been destroyed we fall back to VJ's
	   scheme and use the initial timestamp retrieved from the peer table.
	 */
	if (tcptw->tw_ts_recent_stamp &&
	    (!twp || (sock_net(sk)->ipv4.sysctl_tcp_tw_reuse &&
		      get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
		if (tp->write_seq == 0)
			tp->write_seq = 1;
		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
		sock_hold(sktw);
		return 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);
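/* Worked example for the sequence offset above (illustrative numbers,
 * not from the original source): if the TIME-WAIT socket last sent
 * tw_snd_nxt = 1000, the reused connection starts at
 * write_seq = 1000 + 65535 + 2 = 66537, safely beyond anything the old
 * incarnation could still have in flight with a <= 64KB receive window
 * (no window scaling assumed).
 */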
/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;
	struct ip_options_rcu *inet_opt;
	struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     lockdep_sock_is_held(sk));
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			      IPPROTO_TCP,
			      orig_sport, orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	if (!inet->inet_saddr)
		inet->inet_saddr = fl4->saddr;
	sk_rcv_saddr_set(sk, inet->inet_saddr);

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			tp->write_seq	   = 0;
	}

	inet->inet_dport = usin->sin_port;
	sk_daddr_set(sk, daddr);

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and, without releasing the socket
	 * lock, select a source port, enter ourselves into the hash tables and
	 * complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(tcp_death_row, sk);
	if (err)
		goto failure;

	sk_set_txhash(sk);

	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}
	/* OK, now commit destination to socket.  */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);
	rt = NULL;

	if (likely(!tp->repair)) {
		if (!tp->write_seq)
			tp->write_seq = secure_tcp_seq(inet->inet_saddr,
						       inet->inet_daddr,
						       inet->inet_sport,
						       usin->sin_port);
		tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
						 inet->inet_saddr,
						 inet->inet_daddr);
	}

	inet->inet_id = prandom_u32();

	if (tcp_fastopen_defer_connect(sk, &err))
		return err;
	if (err)
		goto failure;

	err = tcp_connect(sk);

	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
		inet_reset_saddr(sk);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_SYMBOL(tcp_v4_connect);
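#if 0
/* Minimal user-level sketch (not compiled, illustration only): a plain
 * blocking connect(2) on an AF_INET stream socket is what drives the
 * kernel through tcp_v4_connect() above. The address 192.0.2.1:80 is a
 * documentation address chosen arbitrarily; connect_example() is a
 * hypothetical helper, not part of this file.
 */
#include <arpa/inet.h>
#include <sys/socket.h>
#include <unistd.h>

static int connect_example(void)
{
	struct sockaddr_in dst = {
		.sin_family = AF_INET,
		.sin_port   = htons(80),
	};
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd < 0)
		return -1;
	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
	if (connect(fd, (struct sockaddr *)&dst, sizeof(dst)) < 0) {
		close(fd);
		return -1;
	}
	return fd;	/* caller close()s the connected socket */
}
#endif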
/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 * It can be called through tcp_release_cb() if socket was owned by user
 * at the time tcp_v4_err() was called to handle ICMP message.
 */
void tcp_v4_mtu_reduced(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct dst_entry *dst;
	u32 mtu;

	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
		return;
	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
	dst = inet_csk_update_pmtu(sk, mtu);
	if (!dst)
		return;

	/* Something is about to be wrong... Remember soft error
	 * for the case that this connection will not be able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    ip_sk_accept_pmtu(sk) &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
EXPORT_SYMBOL(tcp_v4_mtu_reduced);
static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_check(sk, 0);

	if (dst)
		dst->ops->redirect(dst, sk, skb);
}

/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
void tcp_req_err(struct sock *sk, u32 seq, bool abort)
{
	struct request_sock *req = inet_reqsk(sk);
	struct net *net = sock_net(sk);

	/* ICMPs are not backlogged, hence we cannot get
	 * an established socket here.
	 */
	if (seq != tcp_rsk(req)->snt_isn) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
	} else if (abort) {
		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
		tcp_listendrop(req->rsk_listener);
	}
	reqsk_put(req);
}
EXPORT_SYMBOL(tcp_req_err);
/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 */
void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
	struct inet_connection_sock *icsk;
	struct tcp_sock *tp;
	struct inet_sock *inet;
	const int type = icmp_hdr(icmp_skb)->type;
	const int code = icmp_hdr(icmp_skb)->code;
	struct sock *sk;
	struct sk_buff *skb;
	struct request_sock *fastopen;
	u32 seq, snd_una;
	s32 remaining;
	u32 delta_us;
	int err;
	struct net *net = dev_net(icmp_skb->dev);

	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
				       th->dest, iph->saddr, ntohs(th->source),
				       inet_iif(icmp_skb), 0);
	if (!sk) {
		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
		return;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(sk));
		return;
	}
	seq = ntohl(th->seq);
	if (sk->sk_state == TCP_NEW_SYN_RECV)
		return tcp_req_err(sk, seq,
				   type == ICMP_PARAMETERPROB ||
				   type == ICMP_TIME_EXCEEDED ||
				   (type == ICMP_DEST_UNREACH &&
				    (code == ICMP_NET_UNREACH ||
				     code == ICMP_HOST_UNREACH)));

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of PMTU discovery (RFC1191) special case :
	 * we can receive locally generated ICMP messages while socket is held.
	 */
	if (sock_owned_by_user(sk)) {
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
	}
	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
		goto out;
	}

	icsk = inet_csk(sk);
	tp = tcp_sk(sk);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
	fastopen = tp->fastopen_rsk;
	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, snd_una, tp->snd_nxt)) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_REDIRECT:
		if (!sock_owned_by_user(sk))
			do_redirect(icmp_skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs sent out by Linux are always <576bytes so
			 * they should go through unfragmented).
			 */
			if (sk->sk_state == TCP_LISTEN)
				goto out;

			WRITE_ONCE(tp->mtu_info, info);
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
			} else {
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
					sock_hold(sk);
			}
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if icmp_skb allows revert of backoff
		 * (see draft-zimmermann-tcp-lcd) */
		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
			break;
		if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
		    !icsk->icsk_backoff || fastopen)
			break;

		if (sock_owned_by_user(sk))
			break;

		skb = tcp_write_queue_head(sk);
		if (WARN_ON_ONCE(!skb))
			break;

		icsk->icsk_backoff--;
		icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
					       TCP_TIMEOUT_INIT;
		icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);

		tcp_mstamp_refresh(tp);
		delta_us = (u32)(tp->tcp_mstamp - skb->skb_mstamp);
		remaining = icsk->icsk_rto -
			    usecs_to_jiffies(delta_us);

		if (remaining > 0) {
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
						  remaining, TCP_RTO_MAX);
		} else {
			/* RTO revert clocked out retransmission.
			 * Will retransmit now */
			tcp_retransmit_timer(sk);
		}

		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		/* Only in fast or simultaneous open. If a fast open socket is
		 * already accepted it is treated as a connected one below.
		 */
		if (fastopen && !fastopen->sk)
			break;

		if (!sock_owned_by_user(sk)) {
			sk->sk_err = err;
			sk->sk_error_report(sk);
			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows considering only PROTO_UNREACH and
	 * PORT_UNREACH as hard errors (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters, even these two messages finally
	 * lose their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
	 */
	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk->sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
}
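/* Worked example for the backoff revert above (illustrative numbers):
 * with an srtt-based rto of 200ms and icsk_backoff = 3, the backed-off
 * timer was 200ms << 3 = 1600ms. One ICMP_HOST_UNREACH for the head
 * skb drops icsk_backoff to 2, recomputes
 * rto = min(200ms << 2, TCP_RTO_MAX) = 800ms, and re-arms the
 * retransmit timer with whatever part of those 800ms has not already
 * elapsed since the head skb was sent.
 */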
void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
	struct tcphdr *th = tcp_hdr(skb);

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
		skb->csum_start = skb_transport_header(skb) - skb->head;
		skb->csum_offset = offsetof(struct tcphdr, check);
	} else {
		th->check = tcp_v4_check(skb->len, saddr, daddr,
					 csum_partial(th,
						      th->doff << 2,
						      skb->csum));
	}
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
	const struct inet_sock *inet = inet_sk(sk);

	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_send_check);
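/* Illustrative note: the checksum above covers a 12-byte IPv4
 * pseudo-header prepended to the TCP segment (see struct tcp4_pseudohdr
 * used by tcp_v4_md5_hash_headers() below for the same layout):
 *
 *	__be32 saddr;     source IP
 *	__be32 daddr;     destination IP
 *	__u8   pad;       always 0
 *	__u8   protocol;  IPPROTO_TCP (6)
 *	__be16 len;       TCP header + payload length
 *
 * In the CHECKSUM_PARTIAL branch the device finishes the sum; the stack
 * only seeds th->check with the folded pseudo-header and fills in
 * csum_start/csum_offset as done above.
 */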
/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *		      for reset.
 *	Answer: if a packet caused RST, it is not for a socket
 *		existing in our system, if it is matched to a socket,
 *		it is just duplicate segment or bug in other side's TCP.
 *		So that we build reply only basing on parameters
 *		arrived with segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */
static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
	} rep;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key = NULL;
	const __u8 *hash_location = NULL;
	unsigned char newhash[16];
	int genhash;
	struct sock *sk1 = NULL;
#endif
	struct net *net;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	/* If sk not NULL, it means we did a successful lookup and incoming
	 * route had to be correct. prequeue might have dropped our dst.
	 */
	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest   = th->source;
	rep.th.source = th->dest;
	rep.th.doff   = sizeof(struct tcphdr) / 4;
	rep.th.rst    = 1;

	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
#ifdef CONFIG_TCP_MD5SIG
	rcu_read_lock();
	hash_location = tcp_parse_md5sig_option(th);
	if (sk && sk_fullsock(sk)) {
		key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
					&ip_hdr(skb)->saddr, AF_INET);
	} else if (hash_location) {
		/*
		 * active side is lost. Try to find listening socket through
		 * source port, and then find md5 key through listening socket.
		 * We do not loosen security here:
		 * the incoming packet is checked with the md5 hash of the
		 * found key; no RST is generated if the md5 hash doesn't match.
		 */
		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
					     ip_hdr(skb)->saddr,
					     th->source, ip_hdr(skb)->daddr,
					     ntohs(th->source), inet_iif(skb),
					     inet_sdif(skb));
		/* don't send rst if it can't find key */
		if (!sk1)
			goto out;

		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
					&ip_hdr(skb)->saddr, AF_INET);
		if (!key)
			goto out;

		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
		if (genhash || memcmp(hash_location, newhash, 16) != 0)
			goto out;
	}

	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				     key, ip_hdr(skb)->saddr,
				     ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;

	/* When socket is gone, all binding information is lost.
	 * routing might fail in this case. No choice here, if we choose to force
	 * input interface, we will misroute in case of asymmetric route.
	 */
	if (sk)
		arg.bound_dev_if = sk->sk_bound_dev_if;

	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));

	arg.tos = ip_hdr(skb)->tos;
	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len);

	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
	local_bh_enable();

#ifdef CONFIG_TCP_MD5SIG
out:
	rcu_read_unlock();
#endif
}
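/* Worked example for the ack_seq computation above (illustrative
 * numbers): for an unacceptable SYN with seq = 100 and a 20-byte
 * payload, the RST must acknowledge every sequence number the segment
 * consumed: 100 + 1 (SYN) + 0 (no FIN) + 20 = 121. Per RFC 793, a reset
 * answering a segment without ACK carries seq = 0 and
 * ACK = SEG.SEQ + SEG.LEN, which is exactly what the code builds.
 */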
/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
   outside socket context is ugly, certainly. What can I do?
 */
static void tcp_v4_send_ack(const struct sock *sk,
			    struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 tsval, u32 tsecr, int oif,
			    struct tcp_md5sig_key *key,
			    int reply_flags, u8 tos)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
			];
	} rep;
	struct net *net = sock_net(sk);
	struct ip_reply_arg arg;

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (tsecr) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tsval);
		rep.opt[2] = htonl(tsecr);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (key) {
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len/4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;
	arg.tos = tos;
	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len);

	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	local_bh_enable();
}
static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

	tcp_v4_send_ack(sk, skb,
			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
			tcptw->tw_ts_recent,
			tw->tw_bound_dev_if,
			tcp_twsk_md5_key(tcptw),
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
			tw->tw_tos
			);

	inet_twsk_put(tw);
}
static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
{
	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
	 */
	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
					     tcp_sk(sk)->snd_nxt;

	/* RFC 7323 2.3
	 * The window field (SEG.WND) of every outgoing segment, with the
	 * exception of <SYN> segments, MUST be right-shifted by
	 * Rcv.Wind.Shift bits:
	 */
	tcp_v4_send_ack(sk, skb, seq,
			tcp_rsk(req)->rcv_nxt,
			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
			req->ts_recent,
			0,
			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
					  AF_INET),
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
			ip_hdr(skb)->tos);
}
/*
 *	Send a SYN-ACK after having received a SYN.
 *	This still operates on a request_sock only, not on a big
 *	socket.
 */
static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
			      struct flowi *fl,
			      struct request_sock *req,
			      struct tcp_fastopen_cookie *foc,
			      enum tcp_synack_type synack_type)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct flowi4 fl4;
	int err = -1;
	struct sk_buff *skb;

	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
		return -1;

	skb = tcp_make_synack(sk, dst, req, foc, synack_type);

	if (skb) {
		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);

		rcu_read_lock();
		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
					    ireq->ir_rmt_addr,
					    rcu_dereference(ireq->ireq_opt));
		rcu_read_unlock();
		err = net_xmit_eval(err);
	}

	return err;
}

/*
 *	IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
}
#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

/* Find the Key structure for an address.  */
struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
					 const union tcp_md5_addr *addr,
					 int family)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	const struct tcp_md5sig_info *md5sig;
	__be32 mask;
	struct tcp_md5sig_key *best_match = NULL;
	bool match;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;

	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
		if (key->family != family)
			continue;

		if (family == AF_INET) {
			mask = inet_make_mask(key->prefixlen);
			match = (key->addr.a4.s_addr & mask) ==
				(addr->a4.s_addr & mask);
#if IS_ENABLED(CONFIG_IPV6)
		} else if (family == AF_INET6) {
			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
						  key->prefixlen);
#endif
		} else {
			match = false;
		}

		if (match && (!best_match ||
			      key->prefixlen > best_match->prefixlen))
			best_match = key;
	}
	return best_match;
}
EXPORT_SYMBOL(tcp_md5_do_lookup);
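/* Illustrative example (hypothetical addresses): with two configured
 * keys, one for 10.0.0.0 with prefixlen 8 and one for 10.1.2.0 with
 * prefixlen 24, a lookup for peer 10.1.2.3 matches both, and the
 * longest-prefix rule above (24 > 8) selects the /24 key. A plain
 * per-address key behaves like prefixlen 32.
 */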
static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
						      const union tcp_md5_addr *addr,
						      int family, u8 prefixlen)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	unsigned int size = sizeof(struct in_addr);
	const struct tcp_md5sig_info *md5sig;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;
#if IS_ENABLED(CONFIG_IPV6)
	if (family == AF_INET6)
		size = sizeof(struct in6_addr);
#endif
	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
		if (key->family != family)
			continue;
		if (!memcmp(&key->addr, addr, size) &&
		    key->prefixlen == prefixlen)
			return key;
	}
	return NULL;
}

struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
					 const struct sock *addr_sk)
{
	const union tcp_md5_addr *addr;

	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
	return tcp_md5_do_lookup(sk, addr, AF_INET);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);
/* This can be called on a newly created socket, from other files */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
		   int family, u8 prefixlen, const u8 *newkey, u8 newkeylen,
		   gfp_t gfp)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
	if (key) {
		/* Pre-existing entry - just update that one.
		 * Note that the key might be used concurrently.
		 */
		memcpy(key->key, newkey, newkeylen);

		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
		 * Also note that a reader could catch new key->keylen value
		 * but old key->key[], this is the reason we use __GFP_ZERO
		 * at sock_kmalloc() time below these lines.
		 */
		WRITE_ONCE(key->keylen, newkeylen);

		return 0;
	}

	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   lockdep_sock_is_held(sk));
	if (!md5sig) {
		md5sig = kmalloc(sizeof(*md5sig), gfp);
		if (!md5sig)
			return -ENOMEM;

		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
		INIT_HLIST_HEAD(&md5sig->head);
		rcu_assign_pointer(tp->md5sig_info, md5sig);
	}

	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
	if (!key)
		return -ENOMEM;
	if (!tcp_alloc_md5sig_pool()) {
		sock_kfree_s(sk, key, sizeof(*key));
		return -ENOMEM;
	}

	memcpy(key->key, newkey, newkeylen);
	key->keylen = newkeylen;
	key->family = family;
	key->prefixlen = prefixlen;
	memcpy(&key->addr, addr,
	       (family == AF_INET6) ? sizeof(struct in6_addr) :
				      sizeof(struct in_addr));
	hlist_add_head_rcu(&key->node, &md5sig->head);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_add);
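#if 0
/* Minimal user-level sketch (not compiled, illustration only): keys
 * reach tcp_md5_do_add() via setsockopt(TCP_MD5SIG), parsed by
 * tcp_v4_parse_md5_keys() below. The peer address and the "secret" key
 * are made up; set_md5_key() is a hypothetical helper.
 */
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <string.h>
#include <sys/socket.h>

static int set_md5_key(int fd, const struct sockaddr_in *peer)
{
	struct tcp_md5sig md5 = { .tcpm_keylen = 6 };

	memcpy(&md5.tcpm_addr, peer, sizeof(*peer));
	memcpy(md5.tcpm_key, "secret", 6);
	return setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
}
#endif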
int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
		   u8 prefixlen)
{
	struct tcp_md5sig_key *key;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
	if (!key)
		return -ENOENT;
	hlist_del_rcu(&key->node);
	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
	kfree_rcu(key, rcu);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_del);

static void tcp_clear_md5_list(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	struct hlist_node *n;
	struct tcp_md5sig_info *md5sig;

	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);

	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
		hlist_del_rcu(&key->node);
		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
		kfree_rcu(key, rcu);
	}
}
static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
				 char __user *optval, int optlen)
{
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
	u8 prefixlen = 32;

	if (optlen < sizeof(cmd))
		return -EINVAL;

	if (copy_from_user(&cmd, optval, sizeof(cmd)))
		return -EFAULT;

	if (sin->sin_family != AF_INET)
		return -EINVAL;

	if (optname == TCP_MD5SIG_EXT &&
	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
		prefixlen = cmd.tcpm_prefixlen;
		if (prefixlen > 32)
			return -EINVAL;
	}

	if (!cmd.tcpm_keylen)
		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
				      AF_INET, prefixlen);

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
			      AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen,
			      GFP_KERNEL);
}
static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
				   __be32 daddr, __be32 saddr,
				   const struct tcphdr *th, int nbytes)
{
	struct tcp4_pseudohdr *bp;
	struct scatterlist sg;
	struct tcphdr *_th;

	bp = hp->scratch;
	bp->saddr = saddr;
	bp->daddr = daddr;
	bp->pad = 0;
	bp->protocol = IPPROTO_TCP;
	bp->len = cpu_to_be16(nbytes);

	_th = (struct tcphdr *)(bp + 1);
	memcpy(_th, th, sizeof(*th));
	_th->check = 0;

	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
				sizeof(*bp) + sizeof(*th));
	return crypto_ahash_update(hp->md5_req);
}
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
	struct tcp_md5sig_pool *hp;
	struct ahash_request *req;

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	req = hp->md5_req;

	if (crypto_ahash_init(req))
		goto clear_hash;
	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	ahash_request_set_crypt(req, NULL, md5_hash, 0);
	if (crypto_ahash_final(req))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}
int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
			const struct sock *sk,
			const struct sk_buff *skb)
{
	struct tcp_md5sig_pool *hp;
	struct ahash_request *req;
	const struct tcphdr *th = tcp_hdr(skb);
	__be32 saddr, daddr;

	if (sk) { /* valid for establish/request sockets */
		saddr = sk->sk_rcv_saddr;
		daddr = sk->sk_daddr;
	} else {
		const struct iphdr *iph = ip_hdr(skb);
		saddr = iph->saddr;
		daddr = iph->daddr;
	}

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	req = hp->md5_req;

	if (crypto_ahash_init(req))
		goto clear_hash;

	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
		goto clear_hash;
	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	ahash_request_set_crypt(req, NULL, md5_hash, 0);
	if (crypto_ahash_final(req))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);

#endif
/* Called with rcu_read_lock() */
static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
				    const struct sk_buff *skb)
{
#ifdef CONFIG_TCP_MD5SIG
	/*
	 * This gets called for each TCP segment that arrives
	 * so we want to be efficient.
	 * We have 3 drop cases:
	 * o No MD5 hash and one expected.
	 * o MD5 hash and we're not expecting one.
	 * o MD5 hash and it's wrong.
	 */
	const __u8 *hash_location = NULL;
	struct tcp_md5sig_key *hash_expected;
	const struct iphdr *iph = ip_hdr(skb);
	const struct tcphdr *th = tcp_hdr(skb);
	int genhash;
	unsigned char newhash[16];

	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
					  AF_INET);
	hash_location = tcp_parse_md5sig_option(th);

	/* We've parsed the options - do we have a hash? */
	if (!hash_expected && !hash_location)
		return false;

	if (hash_expected && !hash_location) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
		return true;
	}

	if (!hash_expected && hash_location) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
		return true;
	}

	/* Okay, so this is hash_expected and hash_location -
	 * so we need to calculate the checksum.
	 */
	genhash = tcp_v4_md5_hash_skb(newhash,
				      hash_expected,
				      NULL, skb);

	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
				     &iph->saddr, ntohs(th->source),
				     &iph->daddr, ntohs(th->dest),
				     genhash ? " tcp_v4_calc_md5_hash failed"
				     : "");
		return true;
	}
	return false;
#endif
	return false;
}
static void tcp_v4_init_req(struct request_sock *req,
			    const struct sock *sk_listener,
			    struct sk_buff *skb)
{
	struct inet_request_sock *ireq = inet_rsk(req);
	struct net *net = sock_net(sk_listener);

	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
}

static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
					  struct flowi *fl,
					  const struct request_sock *req)
{
	return inet_csk_route_req(sk, &fl->u.ip4, req);
}
struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.rtx_syn_ack	=	tcp_rtx_synack,
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
	.syn_ack_timeout =	tcp_syn_ack_timeout,
};

static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.mss_clamp	=	TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
	.req_md5_lookup	=	tcp_v4_md5_lookup,
	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
#endif
	.init_req	=	tcp_v4_init_req,
#ifdef CONFIG_SYN_COOKIES
	.cookie_init_seq =	cookie_v4_init_sequence,
#endif
	.route_req	=	tcp_v4_route_req,
	.init_seq	=	tcp_v4_init_seq,
	.init_ts_off	=	tcp_v4_init_ts_off,
	.send_synack	=	tcp_v4_send_synack,
};
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	/* Never answer to SYNs sent to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	return tcp_conn_request(&tcp_request_sock_ops,
				&tcp_request_sock_ipv4_ops, sk, skb);

drop:
	tcp_listendrop(sk);
	return 0;
}
EXPORT_SYMBOL(tcp_v4_conn_request);
/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst,
				  struct request_sock *req_unhash,
				  bool *own_req)
{
	struct inet_request_sock *ireq;
	bool found_dup_sk = false;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
#endif
	struct ip_options_rcu *inet_opt;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	inet_sk_rx_dst_set(newsk, skb);

	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	ireq		      = inet_rsk(req);
	sk_daddr_set(newsk, ireq->ir_rmt_addr);
	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
	newsk->sk_bound_dev_if = ireq->ir_iif;
	newinet->inet_saddr   = ireq->ir_loc_addr;
	inet_opt	      = rcu_dereference(ireq->ireq_opt);
	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
	newinet->mc_index     = inet_iif(skb);
	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
	newinet->rcv_tos      = ip_hdr(skb)->tos;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	newinet->inet_id = prandom_u32();

	if (!dst) {
		dst = inet_csk_route_child_sock(sk, newsk, req);
		if (!dst)
			goto put_and_exit;
	} else {
		/* syncookie case : see end of cookie_v4_check() */
	}
	sk_setup_caps(newsk, dst);

	tcp_ca_openreq_child(newsk, dst);

	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));

	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	/* Copy over the MD5 key from the original socket */
	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
				AF_INET);
	if (key) {
		/*
		 * We're using one, so create a matching key
		 * on the newsk structure. If we fail to get
		 * memory, then we end up not copying the key
		 * across. Shucks.
		 */
		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
			       AF_INET, 32, key->key, key->keylen, GFP_ATOMIC);
		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
	}
#endif

	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
				       &found_dup_sk);
	if (likely(*own_req)) {
		tcp_move_syn(newtp, req);
		ireq->ireq_opt = NULL;
	} else {
		newinet->inet_opt = NULL;

		if (!req_unhash && found_dup_sk) {
			/* This code path should only be executed in the
			 * syncookie case
			 */
			bh_unlock_sock(newsk);
			sock_put(newsk);
			newsk = NULL;
		}
	}
	return newsk;

exit_overflow:
	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
	dst_release(dst);
exit:
	tcp_listendrop(sk);
	return NULL;
put_and_exit:
	newinet->inet_opt = NULL;
	inet_csk_prepare_forced_close(newsk);
	tcp_done(newsk);
	goto exit;
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
	const struct tcphdr *th = tcp_hdr(skb);

	if (!th->syn)
		sk = cookie_v4_check(sk, skb);
#endif
	return sk;
}
/* The socket must have its spinlock held when we get
 * here, unless it is a TCP_LISTEN socket.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	struct sock *rsk;

	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		struct dst_entry *dst;

		dst = rcu_dereference_protected(sk->sk_rx_dst,
						lockdep_sock_is_held(sk));

		sock_rps_save_rxhash(sk, skb);
		sk_mark_napi_id(sk, skb);
		if (dst) {
			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
			    !dst->ops->check(dst, 0)) {
				RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
				dst_release(dst);
			}
		}
		tcp_rcv_established(sk, skb, tcp_hdr(skb));
		return 0;
	}

	if (tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_cookie_check(sk, skb);

		if (!nsk)
			goto discard;
		if (nsk != sk) {
			if (tcp_child_process(sk, nsk, skb)) {
				rsk = nsk;
				goto reset;
			}
			return 0;
		}
	} else
		sock_rps_save_rxhash(sk, skb);

	if (tcp_rcv_state_process(sk, skb)) {
		rsk = sk;
		goto reset;
	}
	return 0;

reset:
	tcp_v4_send_reset(rsk, skb);
discard:
	kfree_skb(skb);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
	goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);
int tcp_v4_early_demux(struct sk_buff *skb)
{
	const struct iphdr *iph;
	const struct tcphdr *th;
	struct sock *sk;

	if (skb->pkt_type != PACKET_HOST)
		return 0;

	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
		return 0;

	iph = ip_hdr(skb);
	th = tcp_hdr(skb);

	if (th->doff < sizeof(struct tcphdr) / 4)
		return 0;

	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
				       iph->saddr, th->source,
				       iph->daddr, ntohs(th->dest),
				       skb->skb_iif, inet_sdif(skb));
	if (sk) {
		skb->sk = sk;
		skb->destructor = sock_edemux;
		if (sk_fullsock(sk)) {
			struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);

			if (dst)
				dst = dst_check(dst, 0);
			if (dst &&
			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
				skb_dst_set_noref(skb, dst);
		}
	}
	return 0;
}
bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
{
	u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;

	/* Only the socket owner can try to collapse/prune rx queues
	 * to reduce memory overhead, so add a little headroom here.
	 * Few socket backlogs are likely to be non-empty concurrently.
	 */
	limit += 64*1024;

	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
	 * we can fix skb->truesize to its real value to avoid future drops.
	 * This is valid because skb is not yet charged to the socket.
	 * It has been noticed pure SACK packets were sometimes dropped
	 * (if cooked by drivers without copybreak feature).
	 */
	skb_condense(skb);

	if (unlikely(sk_add_backlog(sk, skb, limit))) {
		bh_unlock_sock(sk);
		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
		return true;
	}
	return false;
}
EXPORT_SYMBOL(tcp_add_backlog);
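/* Worked example for the limit above (illustrative buffer sizes): with
 * sk_rcvbuf = 87380 and sk_sndbuf = 16384, the backlog may hold up to
 * 87380 + 16384 + 65536 = 169300 bytes of skb truesize before
 * sk_add_backlog() fails and the segment is counted as TCPBacklogDrop.
 */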
int tcp_filter(struct sock *sk, struct sk_buff *skb)
{
	struct tcphdr *th = (struct tcphdr *)skb->data;

	return sk_filter_trim_cap(sk, skb, th->doff * 4);
}
EXPORT_SYMBOL(tcp_filter);

static void tcp_v4_restore_cb(struct sk_buff *skb)
{
	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
		sizeof(struct inet_skb_parm));
}

static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
			   const struct tcphdr *th)
{
	/* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
	 * barrier() makes sure compiler won't play fool^Waliasing games.
	 */
	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
		sizeof(struct inet_skb_parm));
	barrier();

	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
	TCP_SKB_CB(skb)->sacked	 = 0;
	TCP_SKB_CB(skb)->has_rxtstamp =
			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
}
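/* Worked example for end_seq above (illustrative numbers): a segment
 * with seq = 500, no SYN/FIN and 1448 bytes of payload (skb->len = 1468
 * with a 20-byte header, doff = 5) gets
 * end_seq = 500 + 0 + 0 + 1468 - 20 = 1948, i.e. the sequence number
 * right after the payload.
 */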
/*
 *	From tcp_input.c
 */
int tcp_v4_rcv(struct sk_buff *skb)
{
	struct net *net = dev_net(skb->dev);
	int sdif = inet_sdif(skb);
	const struct iphdr *iph;
	const struct tcphdr *th;
	bool refcounted;
	struct sock *sk;
	int ret;

	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	__TCP_INC_STATS(net, TCP_MIB_INSEGS);

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = (const struct tcphdr *)skb->data;
	if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
		goto bad_packet;
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided case of th->doff==0 is eliminated.
	 * So, we defer the checks. */
	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
		goto csum_error;

	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);
lookup:
	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
			       th->dest, sdif, &refcounted);
	if (!sk)
		goto no_tcp_socket;
process:
	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		struct request_sock *req = inet_reqsk(sk);
		struct sock *nsk;

		sk = req->rsk_listener;
		if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
			sk_drops_add(sk, skb);
			reqsk_put(req);
			goto discard_it;
		}
		if (tcp_checksum_complete(skb)) {
			reqsk_put(req);
			goto csum_error;
		}
		if (unlikely(sk->sk_state != TCP_LISTEN)) {
			inet_csk_reqsk_queue_drop_and_put(sk, req);
			goto lookup;
		}
		/* We own a reference on the listener, increase it again
		 * as we might lose it too soon.
		 */
		sock_hold(sk);
		refcounted = true;
		nsk = NULL;
		if (!tcp_filter(sk, skb)) {
			th = (const struct tcphdr *)skb->data;
			iph = ip_hdr(skb);
			tcp_v4_fill_cb(skb, iph, th);
			nsk = tcp_check_req(sk, skb, req, false);
		}
		if (!nsk) {
			reqsk_put(req);
			goto discard_and_relse;
		}
		if (nsk == sk) {
			reqsk_put(req);
			tcp_v4_restore_cb(skb);
		} else if (tcp_child_process(sk, nsk, skb)) {
			tcp_v4_send_reset(nsk, skb);
			goto discard_and_relse;
		} else {
			sock_put(sk);
			return 0;
		}
	}
	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
		goto discard_and_relse;
	}

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
		goto discard_and_relse;

	if (tcp_v4_inbound_md5_hash(sk, skb))
		goto discard_and_relse;

	nf_reset(skb);

	if (tcp_filter(sk, skb))
		goto discard_and_relse;
	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);
	tcp_v4_fill_cb(skb, iph, th);

	skb->dev = NULL;

	if (sk->sk_state == TCP_LISTEN) {
		ret = tcp_v4_do_rcv(sk, skb);
		goto put_and_return;
	}

	sk_incoming_cpu_update(sk);

	bh_lock_sock_nested(sk);
	tcp_segs_in(tcp_sk(sk), skb);
	ret = 0;
	if (!sock_owned_by_user(sk)) {
		ret = tcp_v4_do_rcv(sk, skb);
	} else if (tcp_add_backlog(sk, skb)) {
		goto discard_and_relse;
	}
	bh_unlock_sock(sk);

put_and_return:
	if (refcounted)
		sock_put(sk);
	return ret;

no_tcp_socket:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	tcp_v4_fill_cb(skb, iph, th);

	if (tcp_checksum_complete(skb)) {
csum_error:
		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
bad_packet:
		__TCP_INC_STATS(net, TCP_MIB_INERRS);
	} else {
		tcp_v4_send_reset(NULL, skb);
	}

discard_it:
	/* Discard frame. */
	kfree_skb(skb);
	return 0;

discard_and_relse:
	sk_drops_add(sk, skb);
	if (refcounted)
		sock_put(sk);
	goto discard_it;

do_time_wait:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}

	tcp_v4_fill_cb(skb, iph, th);

	if (tcp_checksum_complete(skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto csum_error;
	}
	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
	case TCP_TW_SYN: {
		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
							&tcp_hashinfo, skb,
							__tcp_hdrlen(th),
							iph->saddr, th->source,
							iph->daddr, th->dest,
							inet_iif(skb),
							sdif);
		if (sk2) {
			inet_twsk_deschedule_put(inet_twsk(sk));
			sk = sk2;
			tcp_v4_restore_cb(skb);
			refcounted = false;
			goto process;
		}
	}
		/* Fall through to ACK */
	case TCP_TW_ACK:
		tcp_v4_timewait_ack(sk, skb);
		break;
	case TCP_TW_RST:
		tcp_v4_send_reset(sk, skb);
		inet_twsk_deschedule_put(inet_twsk(sk));
		goto discard_it;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}
static struct timewait_sock_ops tcp_timewait_sock_ops = {
	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
	.twsk_unique	= tcp_twsk_unique,
	.twsk_destructor= tcp_twsk_destructor,
};

void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	if (dst && dst_hold_safe(dst)) {
		rcu_assign_pointer(sk->sk_rx_dst, dst);
		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
	}
}
EXPORT_SYMBOL(inet_sk_rx_dst_set);
const struct inet_connection_sock_af_ops ipv4_specific = {
	.queue_xmit	   = ip_queue_xmit,
	.send_check	   = tcp_v4_send_check,
	.rebuild_header	   = inet_sk_rebuild_header,
	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
	.conn_request	   = tcp_v4_conn_request,
	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
	.net_header_len	   = sizeof(struct iphdr),
	.setsockopt	   = ip_setsockopt,
	.getsockopt	   = ip_getsockopt,
	.addr2sockaddr	   = inet_csk_addr2sockaddr,
	.sockaddr_len	   = sizeof(struct sockaddr_in),
#ifdef CONFIG_COMPAT
	.compat_setsockopt = compat_ip_setsockopt,
	.compat_getsockopt = compat_ip_getsockopt,
#endif
	.mtu_reduced	   = tcp_v4_mtu_reduced,
};
EXPORT_SYMBOL(ipv4_specific);

#ifdef CONFIG_TCP_MD5SIG
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
	.md5_lookup	= tcp_v4_md5_lookup,
	.calc_md5_hash	= tcp_v4_md5_hash_skb,
	.md5_parse	= tcp_v4_parse_md5_keys,
};
#endif
/* NOTE: A lot of things set to zero explicitly by call to
 *       sk_alloc() so need not be done here.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	tcp_init_sock(sk);

	icsk->icsk_af_ops = &ipv4_specific;

#ifdef CONFIG_TCP_MD5SIG
	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
#endif

	return 0;
}

void tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	tcp_clear_xmit_timers(sk);

	tcp_cleanup_congestion_control(sk);

	tcp_cleanup_ulp(sk);

	/* Clean up the write buffer. */
	tcp_write_queue_purge(sk);

	/* Check if we want to disable active TFO */
	tcp_fastopen_active_disable_ofo_check(sk);

	/* Cleans up our, hopefully empty, out_of_order_queue. */
	skb_rbtree_purge(&tp->out_of_order_queue);

#ifdef CONFIG_TCP_MD5SIG
	/* Clean up the MD5 key list, if any */
	if (tp->md5sig_info) {
		tcp_clear_md5_list(sk);
		kfree_rcu(tp->md5sig_info, rcu);
		tp->md5sig_info = NULL;
	}
#endif

	/* Clean up a referenced TCP bind bucket. */
	if (inet_csk(sk)->icsk_bind_hash)
		inet_put_port(sk);

	BUG_ON(tp->fastopen_rsk);

	/* If socket is aborted during connect operation */
	tcp_free_fastopen_req(tp);
	tcp_saved_syn_free(tp);

	sk_sockets_allocated_dec(sk);
}
EXPORT_SYMBOL(tcp_v4_destroy_sock);
#ifdef CONFIG_PROC_FS
/* Proc filesystem TCP sock list dumping. */

/*
 * Get next listener socket following cur.  If cur is NULL, get first socket
 * starting from bucket given in st->bucket; when st->bucket is zero the
 * very first socket in the hash table is returned.
 */
static void *listening_get_next(struct seq_file *seq, void *cur)
{
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);
	struct inet_listen_hashbucket *ilb;
	struct hlist_nulls_node *node;
	struct sock *sk = cur;

	if (!sk) {
get_head:
		ilb = &tcp_hashinfo.listening_hash[st->bucket];
		spin_lock(&ilb->lock);
		sk = sk_nulls_head(&ilb->nulls_head);
		st->offset = 0;
		goto get_sk;
	}
	ilb = &tcp_hashinfo.listening_hash[st->bucket];
	++st->num;
	++st->offset;

	sk = sk_nulls_next(sk);
get_sk:
	sk_nulls_for_each_from(sk, node) {
		if (!net_eq(sock_net(sk), net))
			continue;
		if (sk->sk_family == st->family)
			return sk;
	}
	spin_unlock(&ilb->lock);
	st->offset = 0;
	if (++st->bucket < INET_LHTABLE_SIZE)
		goto get_head;
	return NULL;
}

static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	st->offset = 0;
	rc = listening_get_next(seq, NULL);

	while (rc && *pos) {
		rc = listening_get_next(seq, rc);
		--*pos;
	}
	return rc;
}

static inline bool empty_bucket(const struct tcp_iter_state *st)
{
	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
}
/*
 * Get first established socket starting from bucket given in st->bucket.
 * If st->bucket is zero, the very first socket in the hash is returned.
 */
static void *established_get_first(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);
	void *rc = NULL;

	st->offset = 0;
	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
		struct sock *sk;
		struct hlist_nulls_node *node;
		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);

		/* Lockless fast path for the common case of empty buckets */
		if (empty_bucket(st))
			continue;

		spin_lock_bh(lock);
		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
			if (sk->sk_family != st->family ||
			    !net_eq(sock_net(sk), net)) {
				continue;
			}
			rc = sk;
			goto out;
		}
		spin_unlock_bh(lock);
	}
out:
	return rc;
}
static void *established_get_next(struct seq_file *seq, void *cur)
{
	struct sock *sk = cur;
	struct hlist_nulls_node *node;
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);

	++st->num;
	++st->offset;

	sk = sk_nulls_next(sk);

	sk_nulls_for_each_from(sk, node) {
		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
			return sk;
	}

	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
	++st->bucket;
	return established_get_first(seq);
}

static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	rc = established_get_first(seq);

	while (rc && pos) {
		rc = established_get_next(seq, rc);
		--pos;
	}
	return rc;
}

static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc;
	struct tcp_iter_state *st = seq->private;

	st->state = TCP_SEQ_STATE_LISTENING;
	rc	  = listening_get_idx(seq, &pos);

	if (!rc) {
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		rc	  = established_get_idx(seq, pos);
	}

	return rc;
}
static void *tcp_seek_last_pos(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	int bucket = st->bucket;
	int offset = st->offset;
	int orig_num = st->num;
	void *rc = NULL;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (st->bucket >= INET_LHTABLE_SIZE)
			break;
		st->state = TCP_SEQ_STATE_LISTENING;
		rc = listening_get_next(seq, NULL);
		while (offset-- && rc && bucket == st->bucket)
			rc = listening_get_next(seq, rc);
		if (rc)
			break;
		st->bucket = 0;
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		/* Fallthrough */
	case TCP_SEQ_STATE_ESTABLISHED:
		if (st->bucket > tcp_hashinfo.ehash_mask)
			break;
		rc = established_get_first(seq);
		while (offset-- && rc && bucket == st->bucket)
			rc = established_get_next(seq, rc);
	}

	st->num = orig_num;

	return rc;
}
static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	if (*pos && *pos == st->last_pos) {
		rc = tcp_seek_last_pos(seq);
		if (rc)
			goto out;
	}

	st->state = TCP_SEQ_STATE_LISTENING;
	st->num = 0;
	st->bucket = 0;
	st->offset = 0;
	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;

out:
	st->last_pos = *pos;
	return rc;
}

static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc = NULL;

	if (v == SEQ_START_TOKEN) {
		rc = tcp_get_idx(seq, 0);
		goto out;
	}

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
		if (!rc) {
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			st->bucket = 0;
			st->offset = 0;
			rc	  = established_get_first(seq);
		}
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		rc = established_get_next(seq, v);
		break;
	}
out:
	++*pos;
	st->last_pos = *pos;
	return rc;
}
static void tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		if (v)
			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
		break;
	}
}

int tcp_seq_open(struct inode *inode, struct file *file)
{
	struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
	struct tcp_iter_state *s;
	int err;

	err = seq_open_net(inode, file, &afinfo->seq_ops,
			  sizeof(struct tcp_iter_state));
	if (err < 0)
		return err;

	s = ((struct seq_file *)file->private_data)->private;
	s->family		= afinfo->family;
	s->last_pos		= 0;
	return 0;
}
EXPORT_SYMBOL(tcp_seq_open);
int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
{
	int rc = 0;
	struct proc_dir_entry *p;

	afinfo->seq_ops.start		= tcp_seq_start;
	afinfo->seq_ops.next		= tcp_seq_next;
	afinfo->seq_ops.stop		= tcp_seq_stop;

	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
			     afinfo->seq_fops, afinfo);
	if (!p)
		rc = -ENOMEM;
	return rc;
}
EXPORT_SYMBOL(tcp_proc_register);

void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
{
	remove_proc_entry(afinfo->name, net->proc_net);
}
EXPORT_SYMBOL(tcp_proc_unregister);
static void get_openreq4(const struct request_sock *req,
			 struct seq_file *f, int i)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	long delta = req->rsk_timer.expires - jiffies;

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
		i,
		ireq->ir_loc_addr,
		ireq->ir_num,
		ireq->ir_rmt_addr,
		ntohs(ireq->ir_rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af dependent. */
		1,    /* timers active (only the expire timer) */
		jiffies_delta_to_clock_t(delta),
		req->num_timeout,
		from_kuid_munged(seq_user_ns(f),
				 sock_i_uid(req->rsk_listener)),
		0,  /* non standard timer */
		0, /* open_requests have no inode */
		0,
		req);
}
static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
{
	int timer_active;
	unsigned long timer_expires;
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	const struct inet_sock *inet = inet_sk(sk);
	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
	__be32 dest = inet->inet_daddr;
	__be32 src = inet->inet_rcv_saddr;
	__u16 destp = ntohs(inet->inet_dport);
	__u16 srcp = ntohs(inet->inet_sport);
	int rx_queue;
	int state;

	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
		timer_active	= 1;
		timer_expires	= icsk->icsk_timeout;
	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
		timer_active	= 4;
		timer_expires	= icsk->icsk_timeout;
	} else if (timer_pending(&sk->sk_timer)) {
		timer_active	= 2;
		timer_expires	= sk->sk_timer.expires;
	} else {
		timer_active	= 0;
		timer_expires = jiffies;
	}

	state = sk_state_load(sk);
	if (state == TCP_LISTEN)
		rx_queue = sk->sk_ack_backlog;
	else
		/* Because we don't lock the socket,
		 * we might find a transient negative value.
		 */
		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
		i, src, srcp, dest, destp, state,
		tp->write_seq - tp->snd_una,
		rx_queue,
		timer_active,
		jiffies_delta_to_clock_t(timer_expires - jiffies),
		icsk->icsk_retransmits,
		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
		icsk->icsk_probes_out,
		sock_i_ino(sk),
		refcount_read(&sk->sk_refcnt), sk,
		jiffies_to_clock_t(icsk->icsk_rto),
		jiffies_to_clock_t(icsk->icsk_ack.ato),
		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
		tp->snd_cwnd,
		state == TCP_LISTEN ?
		    fastopenq->max_qlen :
		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
}
static void get_timewait4_sock(const struct inet_timewait_sock *tw,
			       struct seq_file *f, int i)
{
	long delta = tw->tw_timer.expires - jiffies;
	__be32 dest, src;
	__u16 destp, srcp;

	dest  = tw->tw_daddr;
	src   = tw->tw_rcv_saddr;
	destp = ntohs(tw->tw_dport);
	srcp  = ntohs(tw->tw_sport);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
		refcount_read(&tw->tw_refcnt), tw);
}

#define TMPSZ 150
static int tcp4_seq_show(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st;
	struct sock *sk = v;

	seq_setwidth(seq, TMPSZ - 1);
	if (v == SEQ_START_TOKEN) {
		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
			   "rx_queue tr tm->when retrnsmt   uid  timeout "
			   "inode");
		goto out;
	}
	st = seq->private;

	if (sk->sk_state == TCP_TIME_WAIT)
		get_timewait4_sock(v, seq, st->num);
	else if (sk->sk_state == TCP_NEW_SYN_RECV)
		get_openreq4(v, seq, st->num);
	else
		get_tcp4_sock(v, seq, st->num);
out:
	seq_pad(seq, '\n');
	return 0;
}
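/* Illustrative /proc/net/tcp line as emitted by the helpers above
 * (hypothetical socket, trailing fields abridged):
 *
 *   0: 0100007F:0016 00000000:0000 0A 00000000:00000000 00:00000000 ...
 *
 * i.e. slot 0, local 127.0.0.1:22 (address in hex, host byte order per
 * 4-byte group), remote 0.0.0.0:0, state 0A (TCP_LISTEN).
 */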
static const struct file_operations tcp_afinfo_seq_fops = {
	.owner   = THIS_MODULE,
	.open    = tcp_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release_net
};

static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.name		= "tcp",
	.family		= AF_INET,
	.seq_fops	= &tcp_afinfo_seq_fops,
	.seq_ops	= {
		.show		= tcp4_seq_show,
	},
};

static int __net_init tcp4_proc_init_net(struct net *net)
{
	return tcp_proc_register(net, &tcp4_seq_afinfo);
}

static void __net_exit tcp4_proc_exit_net(struct net *net)
{
	tcp_proc_unregister(net, &tcp4_seq_afinfo);
}

static struct pernet_operations tcp4_net_ops = {
	.init = tcp4_proc_init_net,
	.exit = tcp4_proc_exit_net,
};

int __init tcp4_proc_init(void)
{
	return register_pernet_subsys(&tcp4_net_ops);
}

void tcp4_proc_exit(void)
{
	unregister_pernet_subsys(&tcp4_net_ops);
}
#endif /* CONFIG_PROC_FS */
struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.keepalive		= tcp_set_keepalive,
	.recvmsg		= tcp_recvmsg,
	.sendmsg		= tcp_sendmsg,
	.sendpage		= tcp_sendpage,
	.backlog_rcv		= tcp_v4_do_rcv,
	.release_cb		= tcp_release_cb,
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.leave_memory_pressure	= tcp_leave_memory_pressure,
	.stream_memory_free	= tcp_stream_memory_free,
	.sockets_allocated	= &tcp_sockets_allocated,
	.orphan_count		= &tcp_orphan_count,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	.sysctl_wmem		= sysctl_tcp_wmem,
	.sysctl_rmem		= sysctl_tcp_rmem,
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
	.h.hashinfo		= &tcp_hashinfo,
	.no_autobind		= true,
#ifdef CONFIG_COMPAT
	.compat_setsockopt	= compat_tcp_setsockopt,
	.compat_getsockopt	= compat_tcp_getsockopt,
#endif
	.diag_destroy		= tcp_abort,
};
EXPORT_SYMBOL(tcp_prot);
static void __net_exit tcp_sk_exit(struct net *net)
{
	int cpu;

	for_each_possible_cpu(cpu)
		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
	free_percpu(net->ipv4.tcp_sk);
}
static int __net_init tcp_sk_init(struct net *net)
{
	int res, cpu, cnt;

	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
	if (!net->ipv4.tcp_sk)
		return -ENOMEM;

	for_each_possible_cpu(cpu) {
		struct sock *sk;

		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
					   IPPROTO_TCP, net);
		if (res)
			goto fail;
		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

		/* Please enforce IP_DF and IPID==0 for RST and
		 * ACK sent in SYN-RECV and TIME-WAIT state.
		 */
		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;

		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
	}

	net->ipv4.sysctl_tcp_ecn = 2;
	net->ipv4.sysctl_tcp_ecn_fallback = 1;

	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;

	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;

	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
	net->ipv4.sysctl_tcp_syncookies = 1;
	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
	net->ipv4.sysctl_tcp_orphan_retries = 0;
	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
	net->ipv4.sysctl_tcp_tw_reuse = 0;

	cnt = tcp_hashinfo.ehash_mask + 1;
	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2;
	net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;

	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
	net->ipv4.sysctl_tcp_sack = 1;
	net->ipv4.sysctl_tcp_window_scaling = 1;
	net->ipv4.sysctl_tcp_timestamps = 1;

	return 0;
fail:
	tcp_sk_exit(net);

	return res;
}

static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
{
	inet_twsk_purge(&tcp_hashinfo, AF_INET);
}

static struct pernet_operations __net_initdata tcp_sk_ops = {
       .init	   = tcp_sk_init,
       .exit	   = tcp_sk_exit,
       .exit_batch = tcp_sk_exit_batch,
};

void __init tcp_v4_init(void)
{
	if (register_pernet_subsys(&tcp_sk_ops))
		panic("Failed to create the TCP control socket.\n");
}