net/ipv4/tcp_ipv4.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   4  *              operating system.  INET is implemented using the  BSD Socket
   5  *              interface as the means of communication with the user level.
   6  *
   7  *              Implementation of the Transmission Control Protocol(TCP).
   8  *
   9  *              IPv4 specific functions
  10  *
  11  *              code split from:
  12  *              linux/ipv4/tcp.c
  13  *              linux/ipv4/tcp_input.c
  14  *              linux/ipv4/tcp_output.c
  15  *
  16  *              See tcp.c for author information
  17  */
  18
  19 /*
  20  * Changes:
  21  *              David S. Miller :       New socket lookup architecture.
  22  *                                      This code is dedicated to John Dyson.
  23  *              David S. Miller :       Change semantics of established hash,
  24  *                                      half is devoted to TIME_WAIT sockets
  25  *                                      and the rest go in the other half.
  26  *              Andi Kleen :            Add support for syncookies and fixed
  27  *                                      some bugs: ip options weren't passed to
  28  *                                      the TCP layer, missed a check for an
  29  *                                      ACK bit.
  30  *              Andi Kleen :            Implemented fast path mtu discovery.
  31  *                                      Fixed many serious bugs in the
  32  *                                      request_sock handling and moved
  33  *                                      most of it into the af independent code.
  34  *                                      Added tail drop and some other bugfixes.
  35  *                                      Added new listen semantics.
  36  *              Mike McLagan    :       Routing by source
  37  *      Juan Jose Ciarlante:            ip_dynaddr bits
  38  *              Andi Kleen:             various fixes.
  39  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
  40  *                                      coma.
  41  *      Andi Kleen              :       Fix new listen.
  42  *      Andi Kleen              :       Fix accept error reporting.
  43  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
  44  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
  45  *                                      a single port at the same time.
  46  */
  47
  48 #define pr_fmt(fmt) "TCP: " fmt
  49
  50 #include <linux/bottom_half.h>
  51 #include <linux/types.h>
  52 #include <linux/fcntl.h>
  53 #include <linux/module.h>
  54 #include <linux/random.h>
  55 #include <linux/cache.h>
  56 #include <linux/jhash.h>
  57 #include <linux/init.h>
  58 #include <linux/times.h>
  59 #include <linux/slab.h>
  60
  61 #include <net/net_namespace.h>
  62 #include <net/icmp.h>
  63 #include <net/inet_hashtables.h>
  64 #include <net/tcp.h>
  65 #include <net/transp_v6.h>
  66 #include <net/ipv6.h>
  67 #include <net/inet_common.h>
  68 #include <net/timewait_sock.h>
  69 #include <net/xfrm.h>
  70 #include <net/secure_seq.h>
  71 #include <net/busy_poll.h>
  72
  73 #include <linux/inet.h>
  74 #include <linux/ipv6.h>
  75 #include <linux/stddef.h>
  76 #include <linux/proc_fs.h>
  77 #include <linux/seq_file.h>
  78 #include <linux/inetdevice.h>
  79
  80 #include <crypto/hash.h>
  81 #include <linux/scatterlist.h>
  82
  83 #include <trace/events/tcp.h>
  84
  85 #ifdef CONFIG_TCP_MD5SIG
  86 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
  87                                __be32 daddr, __be32 saddr, const struct tcphdr *th);
  88 #endif
  89
  90 struct inet_hashinfo tcp_hashinfo;
  91 EXPORT_SYMBOL(tcp_hashinfo);
  92
  93 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
  94 {
  95         return secure_tcp_seq(ip_hdr(skb)->daddr,
  96                               ip_hdr(skb)->saddr,
  97                               tcp_hdr(skb)->dest,
  98                               tcp_hdr(skb)->source);
  99 }
 100
 101 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
 102 {
 103         return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
 104 }
 105
 106 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
 107 {
 108         int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
 109         const struct inet_timewait_sock *tw = inet_twsk(sktw);
 110         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
 111         struct tcp_sock *tp = tcp_sk(sk);
 112
 113         if (reuse == 2) {
 114                 /* Still does not detect *everything* that goes through
 115                  * lo, since we require a loopback src or dst address
 116                  * or direct binding to 'lo' interface.
 117                  */
 118                 bool loopback = false;
 119                 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
 120                         loopback = true;
 121 #if IS_ENABLED(CONFIG_IPV6)
 122                 if (tw->tw_family == AF_INET6) {
 123                         if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
 124                             (ipv6_addr_v4mapped(&tw->tw_v6_daddr) &&
 125                              (tw->tw_v6_daddr.s6_addr[12] == 127)) ||
 126                             ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
 127                             (ipv6_addr_v4mapped(&tw->tw_v6_rcv_saddr) &&
 128                              (tw->tw_v6_rcv_saddr.s6_addr[12] == 127)))
 129                                 loopback = true;
 130                 } else
 131 #endif
 132                 {
 133                         if (ipv4_is_loopback(tw->tw_daddr) ||
 134                             ipv4_is_loopback(tw->tw_rcv_saddr))
 135                                 loopback = true;
 136                 }
 137                 if (!loopback)
 138                         reuse = 0;
 139         }
 140
 141         /* With PAWS, it is safe from the viewpoint
 142            of data integrity. Even without PAWS it is safe provided sequence
 143            spaces do not overlap i.e. at data rates <= 80Mbit/sec.
 144
 145            Actually, the idea is close to VJ's one, only timestamp cache is
 146            held not per host, but per port pair and TW bucket is used as state
 147            holder.
 148
 149            If TW bucket has been already destroyed we fall back to VJ's scheme
 150            and use initial timestamp retrieved from peer table.
 151          */
 152         if (tcptw->tw_ts_recent_stamp &&
 153             (!twp || (reuse && time_after32(ktime_get_seconds(),
 154                                             tcptw->tw_ts_recent_stamp)))) {
 155                 /* In case of repair and re-using TIME-WAIT sockets we still
 156                  * want to be sure that it is safe as above but honor the
 157                  * sequence numbers and time stamps set as part of the repair
 158                  * process.
 159                  *
 160                  * Without this check re-using a TIME-WAIT socket with TCP
 161                  * repair would accumulate a -1 on the repair assigned
 162                  * sequence number. The first time it is reused the sequence
 163                  * is -1, the second time -2, etc. This fixes that issue
 164                  * without appearing to create any others.
 165                  */
 166                 if (likely(!tp->repair)) {
 167                         u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
 168
 169                         if (!seq)
 170                                 seq = 1;
 171                         WRITE_ONCE(tp->write_seq, seq);
 172                         tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
 173                         tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
 174                 }
 175                 sock_hold(sktw);
 176                 return 1;
 177         }
 178
 179         return 0;
 180 }
 181 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
 182
 183 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
 184                               int addr_len)
 185 {
 186         /* This check is replicated from tcp_v4_connect() and intended to
 187          * prevent BPF program called below from accessing bytes that are out
 188          * of the bound specified by user in addr_len.
 189          */
 190         if (addr_len < sizeof(struct sockaddr_in))
 191                 return -EINVAL;
 192
 193         sock_owned_by_me(sk);
 194
 195         return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
 196 }
 197
 198 /* This will initiate an outgoing connection. */
 199 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 200 {
 201         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
 202         struct inet_sock *inet = inet_sk(sk);
 203         struct tcp_sock *tp = tcp_sk(sk);
 204         __be16 orig_sport, orig_dport;
 205         __be32 daddr, nexthop;
 206         struct flowi4 *fl4;
 207         struct rtable *rt;
 208         int err;
 209         struct ip_options_rcu *inet_opt;
 210         struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
 211
 212         if (addr_len < sizeof(struct sockaddr_in))
 213                 return -EINVAL;
 214
 215         if (usin->sin_family != AF_INET)
 216                 return -EAFNOSUPPORT;
 217
 218         nexthop = daddr = usin->sin_addr.s_addr;
 219         inet_opt = rcu_dereference_protected(inet->inet_opt,
 220                                              lockdep_sock_is_held(sk));
 221         if (inet_opt && inet_opt->opt.srr) {
 222                 if (!daddr)
 223                         return -EINVAL;
 224                 nexthop = inet_opt->opt.faddr;
 225         }
 226
 227         orig_sport = inet->inet_sport;
 228         orig_dport = usin->sin_port;
 229         fl4 = &inet->cork.fl.u.ip4;
 230         rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
 231                               RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
 232                               IPPROTO_TCP,
 233                               orig_sport, orig_dport, sk);
 234         if (IS_ERR(rt)) {
 235                 err = PTR_ERR(rt);
 236                 if (err == -ENETUNREACH)
 237                         IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
 238                 return err;
 239         }
 240
 241         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
 242                 ip_rt_put(rt);
 243                 return -ENETUNREACH;
 244         }
 245
 246         if (!inet_opt || !inet_opt->opt.srr)
 247                 daddr = fl4->daddr;
 248
 249         if (!inet->inet_saddr)
 250                 inet->inet_saddr = fl4->saddr;
 251         sk_rcv_saddr_set(sk, inet->inet_saddr);
 252
 253         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
 254                 /* Reset inherited state */
 255                 tp->rx_opt.ts_recent       = 0;
 256                 tp->rx_opt.ts_recent_stamp = 0;
 257                 if (likely(!tp->repair))
 258                         WRITE_ONCE(tp->write_seq, 0);
 259         }
 260
 261         inet->inet_dport = usin->sin_port;
 262         sk_daddr_set(sk, daddr);
 263
 264         inet_csk(sk)->icsk_ext_hdr_len = 0;
 265         if (inet_opt)
 266                 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
 267
 268         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
 269
 270         /* Socket identity is still unknown (sport may be zero).
 271          * However we set state to SYN-SENT and not releasing socket
 272          * lock select source port, enter ourselves into the hash tables and
 273          * complete initialization after this.
 274          */
 275         tcp_set_state(sk, TCP_SYN_SENT);
 276         err = inet_hash_connect(tcp_death_row, sk);
 277         if (err)
 278                 goto failure;
 279
 280         sk_set_txhash(sk);
 281
 282         rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
 283                                inet->inet_sport, inet->inet_dport, sk);
 284         if (IS_ERR(rt)) {
 285                 err = PTR_ERR(rt);
 286                 rt = NULL;
 287                 goto failure;
 288         }
 289         /* OK, now commit destination to socket.  */
 290         sk->sk_gso_type = SKB_GSO_TCPV4;
 291         sk_setup_caps(sk, &rt->dst);
 292         rt = NULL;
 293
 294         if (likely(!tp->repair)) {
 295                 if (!tp->write_seq)
 296                         WRITE_ONCE(tp->write_seq,
 297                                    secure_tcp_seq(inet->inet_saddr,
 298                                                   inet->inet_daddr,
 299                                                   inet->inet_sport,
 300                                                   usin->sin_port));
 301                 tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
 302                                                  inet->inet_saddr,
 303                                                  inet->inet_daddr);
 304         }
 305
 306         inet->inet_id = prandom_u32();
 307
 308         if (tcp_fastopen_defer_connect(sk, &err))
 309                 return err;
 310         if (err)
 311                 goto failure;
 312
 313         err = tcp_connect(sk);
 314
 315         if (err)
 316                 goto failure;
 317
 318         return 0;
 319
 320 failure:
 321         /*
 322          * This unhashes the socket and releases the local port,
 323          * if necessary.
 324          */
 325         tcp_set_state(sk, TCP_CLOSE);
 326         if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
 327                 inet_reset_saddr(sk);
 328         ip_rt_put(rt);
 329         sk->sk_route_caps = 0;
 330         inet->inet_dport = 0;
 331         return err;
 332 }
 333 EXPORT_SYMBOL(tcp_v4_connect);
 334
 335 /*
 336  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 337  * It can be called through tcp_release_cb() if socket was owned by user
 338  * at the time tcp_v4_err() was called to handle ICMP message.
 339  */
 340 void tcp_v4_mtu_reduced(struct sock *sk)
 341 {
 342         struct inet_sock *inet = inet_sk(sk);
 343         struct dst_entry *dst;
 344         u32 mtu;
 345
 346         if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
 347                 return;
 348         mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
 349         dst = inet_csk_update_pmtu(sk, mtu);
 350         if (!dst)
 351                 return;
 352
 353         /* Something is about to be wrong... Remember soft error
 354          * for the case, if this connection will not able to recover.
 355          */
 356         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
 357                 sk->sk_err_soft = EMSGSIZE;
 358
 359         mtu = dst_mtu(dst);
 360
 361         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
 362             ip_sk_accept_pmtu(sk) &&
 363             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
 364                 tcp_sync_mss(sk, mtu);
 365
 366                 /* Resend the TCP packet because it's
 367                  * clear that the old packet has been
 368                  * dropped. This is the new "fast" path mtu
 369                  * discovery.
 370                  */
 371                 tcp_simple_retransmit(sk);
 372         } /* else let the usual retransmit timer handle it */
 373 }
 374 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
 375
 376 static void do_redirect(struct sk_buff *skb, struct sock *sk)
 377 {
 378         struct dst_entry *dst = __sk_dst_check(sk, 0);
 379
 380         if (dst)
 381                 dst->ops->redirect(dst, sk, skb);
 382 }
 383
 384
 385 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
 386 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
 387 {
 388         struct request_sock *req = inet_reqsk(sk);
 389         struct net *net = sock_net(sk);
 390
 391         /* ICMPs are not backlogged, hence we cannot get
 392          * an established socket here.
 393          */
 394         if (seq != tcp_rsk(req)->snt_isn) {
 395                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
 396         } else if (abort) {
 397                 /*
 398                  * Still in SYN_RECV, just remove it silently.
 399                  * There is no good way to pass the error to the newly
 400                  * created socket, and POSIX does not want network
 401                  * errors returned from accept().
 402                  */
 403                 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
 404                 tcp_listendrop(req->rsk_listener);
 405         }
 406         reqsk_put(req);
 407 }
 408 EXPORT_SYMBOL(tcp_req_err);
 409
 410 /*
 411  * This routine is called by the ICMP module when it gets some
 412  * sort of error condition.  If err < 0 then the socket should
 413  * be closed and the error returned to the user.  If err > 0
 414  * it's just the icmp type << 8 | icmp code.  After adjustment
 415  * header points to the first 8 bytes of the tcp header.  We need
 416  * to find the appropriate port.
 417  *
 418  * The locking strategy used here is very "optimistic". When
 419  * someone else accesses the socket the ICMP is just dropped
 420  * and for some paths there is no check at all.
 421  * A more general error queue to queue errors for later handling
 422  * is probably better.
 423  *
 424  */
 425
 426 int tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 427 {
 428         const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
 429         struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
 430         struct inet_connection_sock *icsk;
 431         struct tcp_sock *tp;
 432         struct inet_sock *inet;
 433         const int type = icmp_hdr(icmp_skb)->type;
 434         const int code = icmp_hdr(icmp_skb)->code;
 435         struct sock *sk;
 436         struct sk_buff *skb;
 437         struct request_sock *fastopen;
 438         u32 seq, snd_una;
 439         s32 remaining;
 440         u32 delta_us;
 441         int err;
 442         struct net *net = dev_net(icmp_skb->dev);
 443
 444         sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
 445                                        th->dest, iph->saddr, ntohs(th->source),
 446                                        inet_iif(icmp_skb), 0);
 447         if (!sk) {
 448                 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
 449                 return -ENOENT;
 450         }
 451         if (sk->sk_state == TCP_TIME_WAIT) {
 452                 inet_twsk_put(inet_twsk(sk));
 453                 return 0;
 454         }
 455         seq = ntohl(th->seq);
 456         if (sk->sk_state == TCP_NEW_SYN_RECV) {
 457                 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
 458                                      type == ICMP_TIME_EXCEEDED ||
 459                                      (type == ICMP_DEST_UNREACH &&
 460                                       (code == ICMP_NET_UNREACH ||
 461                                        code == ICMP_HOST_UNREACH)));
 462                 return 0;
 463         }
 464
 465         bh_lock_sock(sk);
 466         /* If too many ICMPs get dropped on busy
 467          * servers this needs to be solved differently.
 468          * We do take care of PMTU discovery (RFC1191) special case :
 469          * we can receive locally generated ICMP messages while socket is held.
 470          */
 471         if (sock_owned_by_user(sk)) {
 472                 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
 473                         __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
 474         }
 475         if (sk->sk_state == TCP_CLOSE)
 476                 goto out;
 477
 478         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
 479                 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
 480                 goto out;
 481         }
 482
 483         icsk = inet_csk(sk);
 484         tp = tcp_sk(sk);
 485         /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
 486         fastopen = rcu_dereference(tp->fastopen_rsk);
 487         snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
 488         if (sk->sk_state != TCP_LISTEN &&
 489             !between(seq, snd_una, tp->snd_nxt)) {
 490                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
 491                 goto out;
 492         }
 493
 494         switch (type) {
 495         case ICMP_REDIRECT:
 496                 if (!sock_owned_by_user(sk))
 497                         do_redirect(icmp_skb, sk);
 498                 goto out;
 499         case ICMP_SOURCE_QUENCH:
 500                 /* Just silently ignore these. */
 501                 goto out;
 502         case ICMP_PARAMETERPROB:
 503                 err = EPROTO;
 504                 break;
 505         case ICMP_DEST_UNREACH:
 506                 if (code > NR_ICMP_UNREACH)
 507                         goto out;
 508
 509                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
 510                         /* We are not interested in TCP_LISTEN and open_requests
 511                          * (SYN-ACKs send out by Linux are always <576bytes so
 512                          * they should go through unfragmented).
 513                          */
 514                         if (sk->sk_state == TCP_LISTEN)
 515                                 goto out;
 516
 517                         WRITE_ONCE(tp->mtu_info, info);
 518                         if (!sock_owned_by_user(sk)) {
 519                                 tcp_v4_mtu_reduced(sk);
 520                         } else {
 521                                 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
 522                                         sock_hold(sk);
 523                         }
 524                         goto out;
 525                 }
 526
 527                 err = icmp_err_convert[code].errno;
 528                 /* check if icmp_skb allows revert of backoff
 529                  * (see draft-zimmermann-tcp-lcd) */
 530                 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
 531                         break;
 532                 if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
 533                     !icsk->icsk_backoff || fastopen)
 534                         break;
 535
 536                 if (sock_owned_by_user(sk))
 537                         break;
 538
 539                 skb = tcp_rtx_queue_head(sk);
 540                 if (WARN_ON_ONCE(!skb))
 541                         break;
 542
 543                 icsk->icsk_backoff--;
 544                 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
 545                                                TCP_TIMEOUT_INIT;
 546                 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
 547
 548
 549                 tcp_mstamp_refresh(tp);
 550                 delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
 551                 remaining = icsk->icsk_rto -
 552                             usecs_to_jiffies(delta_us);
 553
 554                 if (remaining > 0) {
 555                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
 556                                                   remaining, TCP_RTO_MAX);
 557                 } else {
 558                         /* RTO revert clocked out retransmission.
 559                          * Will retransmit now */
 560                         tcp_retransmit_timer(sk);
 561                 }
 562
 563                 break;
 564         case ICMP_TIME_EXCEEDED:
 565                 err = EHOSTUNREACH;
 566                 break;
 567         default:
 568                 goto out;
 569         }
 570
 571         switch (sk->sk_state) {
 572         case TCP_SYN_SENT:
 573         case TCP_SYN_RECV:
 574                 /* Only in fast or simultaneous open. If a fast open socket is
 575                  * is already accepted it is treated as a connected one below.
 576                  */
 577                 if (fastopen && !fastopen->sk)
 578                         break;
 579
 580                 if (!sock_owned_by_user(sk)) {
 581                         sk->sk_err = err;
 582
 583                         sk->sk_error_report(sk);
 584
 585                         tcp_done(sk);
 586                 } else {
 587                         sk->sk_err_soft = err;
 588                 }
 589                 goto out;
 590         }
 591
 592         /* If we've already connected we will keep trying
 593          * until we time out, or the user gives up.
 594          *
 595          * rfc1122 4.2.3.9 allows to consider as hard errors
 596          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
 597          * but it is obsoleted by pmtu discovery).
 598          *
 599          * Note, that in modern internet, where routing is unreliable
 600          * and in each dark corner broken firewalls sit, sending random
 601          * errors ordered by their masters even this two messages finally lose
 602          * their original sense (even Linux sends invalid PORT_UNREACHs)
 603          *
 604          * Now we are in compliance with RFCs.
 605          *                                                      --ANK (980905)
 606          */
 607
 608         inet = inet_sk(sk);
 609         if (!sock_owned_by_user(sk) && inet->recverr) {
 610                 sk->sk_err = err;
 611                 sk->sk_error_report(sk);
 612         } else  { /* Only an error on timeout */
 613                 sk->sk_err_soft = err;
 614         }
 615
 616 out:
 617         bh_unlock_sock(sk);
 618         sock_put(sk);
 619         return 0;
 620 }
 621
 622 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
 623 {
 624         struct tcphdr *th = tcp_hdr(skb);
 625
 626         th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
 627         skb->csum_start = skb_transport_header(skb) - skb->head;
 628         skb->csum_offset = offsetof(struct tcphdr, check);
 629 }
 630
 631 /* This routine computes an IPv4 TCP checksum. */
 632 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
 633 {
 634         const struct inet_sock *inet = inet_sk(sk);
 635
 636         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
 637 }
 638 EXPORT_SYMBOL(tcp_v4_send_check);
 639
 640 /*
 641  *      This routine will send an RST to the other tcp.
 642  *
 643  *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 644  *                    for reset.
 645  *      Answer: if a packet caused RST, it is not for a socket
 646  *              existing in our system, if it is matched to a socket,
 647  *              it is just duplicate segment or bug in other side's TCP.
 648  *              So that we build reply only basing on parameters
 649  *              arrived with segment.
 650  *      Exception: precedence violation. We do not implement it in any case.
 651  */
 652
 653 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
 654 {
 655         const struct tcphdr *th = tcp_hdr(skb);
 656         struct {
 657                 struct tcphdr th;
 658 #ifdef CONFIG_TCP_MD5SIG
 659                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
 660 #endif
 661         } rep;
 662         struct ip_reply_arg arg;
 663 #ifdef CONFIG_TCP_MD5SIG
 664         struct tcp_md5sig_key *key = NULL;
 665         const __u8 *hash_location = NULL;
 666         unsigned char newhash[16];
 667         int genhash;
 668         struct sock *sk1 = NULL;
 669 #endif
 670         u64 transmit_time = 0;
 671         struct sock *ctl_sk;
 672         struct net *net;
 673
 674         /* Never send a reset in response to a reset. */
 675         if (th->rst)
 676                 return;
 677
 678         /* If sk not NULL, it means we did a successful lookup and incoming
 679          * route had to be correct. prequeue might have dropped our dst.
 680          */
 681         if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
 682                 return;
 683
 684         /* Swap the send and the receive. */
 685         memset(&rep, 0, sizeof(rep));
 686         rep.th.dest   = th->source;
 687         rep.th.source = th->dest;
 688         rep.th.doff   = sizeof(struct tcphdr) / 4;
 689         rep.th.rst    = 1;
 690
 691         if (th->ack) {
 692                 rep.th.seq = th->ack_seq;
 693         } else {
 694                 rep.th.ack = 1;
 695                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
 696                                        skb->len - (th->doff << 2));
 697         }
 698
 699         memset(&arg, 0, sizeof(arg));
 700         arg.iov[0].iov_base = (unsigned char *)&rep;
 701         arg.iov[0].iov_len  = sizeof(rep.th);
 702
 703         net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
 704 #ifdef CONFIG_TCP_MD5SIG
 705         rcu_read_lock();
 706         hash_location = tcp_parse_md5sig_option(th);
 707         if (sk && sk_fullsock(sk)) {
 708                 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
 709                                         &ip_hdr(skb)->saddr, AF_INET);
 710         } else if (hash_location) {
 711                 /*
 712                  * active side is lost. Try to find listening socket through
 713                  * source port, and then find md5 key through listening socket.
 714                  * we are not loose security here:
 715                  * Incoming packet is checked with md5 hash with finding key,
 716                  * no RST generated if md5 hash doesn't match.
 717                  */
 718                 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
 719                                              ip_hdr(skb)->saddr,
 720                                              th->source, ip_hdr(skb)->daddr,
 721                                              ntohs(th->source), inet_iif(skb),
 722                                              tcp_v4_sdif(skb));
 723                 /* don't send rst if it can't find key */
 724                 if (!sk1)
 725                         goto out;
 726
 727                 key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
 728                                         &ip_hdr(skb)->saddr, AF_INET);
 729                 if (!key)
 730                         goto out;
 731
 732
 733                 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
 734                 if (genhash || memcmp(hash_location, newhash, 16) != 0)
 735                         goto out;
 736
 737         }
 738
 739         if (key) {
 740                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
 741                                    (TCPOPT_NOP << 16) |
 742                                    (TCPOPT_MD5SIG << 8) |
 743                                    TCPOLEN_MD5SIG);
 744                 /* Update length and the length the header thinks exists */
 745                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 746                 rep.th.doff = arg.iov[0].iov_len / 4;
 747
 748                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
 749                                      key, ip_hdr(skb)->saddr,
 750                                      ip_hdr(skb)->daddr, &rep.th);
 751         }
 752 #endif
 753         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 754                                       ip_hdr(skb)->saddr, /* XXX */
 755                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
 756         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 757         arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
 758
 759         /* When socket is gone, all binding information is lost.
 760          * routing might fail in this case. No choice here, if we choose to force
 761          * input interface, we will misroute in case of asymmetric route.
 762          */
 763         if (sk) {
 764                 arg.bound_dev_if = sk->sk_bound_dev_if;
 765                 if (sk_fullsock(sk))
 766                         trace_tcp_send_reset(sk, skb);
 767         }
 768
 769         BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
 770                      offsetof(struct inet_timewait_sock, tw_bound_dev_if));
 771
 772         arg.tos = ip_hdr(skb)->tos;
 773         arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
 774         local_bh_disable();
 775         ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
 776         if (sk) {
 777                 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
 778                                    inet_twsk(sk)->tw_mark : sk->sk_mark;
 779                 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
 780                                    inet_twsk(sk)->tw_priority : sk->sk_priority;
 781                 transmit_time = tcp_transmit_time(sk);
 782         }
 783         ip_send_unicast_reply(ctl_sk,
 784                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
 785                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
 786                               &arg, arg.iov[0].iov_len,
 787                               transmit_time);
 788
 789         ctl_sk->sk_mark = 0;
 790         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
 791         __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
 792         local_bh_enable();
 793
 794 #ifdef CONFIG_TCP_MD5SIG
 795 out:
 796         rcu_read_unlock();
 797 #endif
 798 }
 799
 800 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
 801    outside socket context is ugly, certainly. What can I do?
 802  */
 803
 804 static void tcp_v4_send_ack(const struct sock *sk,
 805                             struct sk_buff *skb, u32 seq, u32 ack,
 806                             u32 win, u32 tsval, u32 tsecr, int oif,
 807                             struct tcp_md5sig_key *key,
 808                             int reply_flags, u8 tos)
 809 {
 810         const struct tcphdr *th = tcp_hdr(skb);
 811         struct {
 812                 struct tcphdr th;
 813                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
 814 #ifdef CONFIG_TCP_MD5SIG
 815                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
 816 #endif
 817                         ];
 818         } rep;
 819         struct net *net = sock_net(sk);
 820         struct ip_reply_arg arg;
 821         struct sock *ctl_sk;
 822         u64 transmit_time;
 823
 824         memset(&rep.th, 0, sizeof(struct tcphdr));
 825         memset(&arg, 0, sizeof(arg));
 826
 827         arg.iov[0].iov_base = (unsigned char *)&rep;
 828         arg.iov[0].iov_len  = sizeof(rep.th);
 829         if (tsecr) {
 830                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
 831                                    (TCPOPT_TIMESTAMP << 8) |
 832                                    TCPOLEN_TIMESTAMP);
 833                 rep.opt[1] = htonl(tsval);
 834                 rep.opt[2] = htonl(tsecr);
 835                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
 836         }
 837
 838         /* Swap the send and the receive. */
 839         rep.th.dest    = th->source;
 840         rep.th.source  = th->dest;
 841         rep.th.doff    = arg.iov[0].iov_len / 4;
 842         rep.th.seq     = htonl(seq);
 843         rep.th.ack_seq = htonl(ack);
 844         rep.th.ack     = 1;
 845         rep.th.window  = htons(win);
 846
 847 #ifdef CONFIG_TCP_MD5SIG
 848         if (key) {
 849                 int offset = (tsecr) ? 3 : 0;
 850
 851                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
 852                                           (TCPOPT_NOP << 16) |
 853                                           (TCPOPT_MD5SIG << 8) |
 854                                           TCPOLEN_MD5SIG);
 855                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 856                 rep.th.doff = arg.iov[0].iov_len/4;
 857
 858                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
 859                                     key, ip_hdr(skb)->saddr,
 860                                     ip_hdr(skb)->daddr, &rep.th);
 861         }
 862 #endif
 863         arg.flags = reply_flags;
 864         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 865                                       ip_hdr(skb)->saddr, /* XXX */
 866                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
 867         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 868         if (oif)
 869                 arg.bound_dev_if = oif;
 870         arg.tos = tos;
 871         arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
 872         local_bh_disable();
 873         ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
 874         ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
 875                            inet_twsk(sk)->tw_mark : sk->sk_mark;
 876         ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
 877                            inet_twsk(sk)->tw_priority : sk->sk_priority;
 878         transmit_time = tcp_transmit_time(sk);
 879         ip_send_unicast_reply(ctl_sk,
 880                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
 881                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
 882                               &arg, arg.iov[0].iov_len,
 883                               transmit_time);
 884
 885         ctl_sk->sk_mark = 0;
 886         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
 887         local_bh_enable();
 888 }
 889
 890 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
 891 {
 892         struct inet_timewait_sock *tw = inet_twsk(sk);
 893         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
 894
 895         tcp_v4_send_ack(sk, skb,
 896                         tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
 897                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
 898                         tcp_time_stamp_raw() + tcptw->tw_ts_offset,
 899                         tcptw->tw_ts_recent,
 900                         tw->tw_bound_dev_if,
 901                         tcp_twsk_md5_key(tcptw),
 902                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
 903                         tw->tw_tos
 904                         );
 905
 906         inet_twsk_put(tw);
 907 }
 908
 909 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
 910                                   struct request_sock *req)
 911 {
 912         /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
 913          * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
 914          */
 915         u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
 916                                              tcp_sk(sk)->snd_nxt;
 917
 918         /* RFC 7323 2.3
 919          * The window field (SEG.WND) of every outgoing segment, with the
 920          * exception of <SYN> segments, MUST be right-shifted by
 921          * Rcv.Wind.Shift bits:
 922          */
 923         tcp_v4_send_ack(sk, skb, seq,
 924                         tcp_rsk(req)->rcv_nxt,
 925                         req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
 926                         tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
 927                         req->ts_recent,
 928                         0,
 929                         tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
 930                                           AF_INET),
 931                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
 932                         ip_hdr(skb)->tos);
 933 }
 934
 935 /*
 936  *      Send a SYN-ACK after having received a SYN.
 937  *      This still operates on a request_sock only, not on a big
 938  *      socket.
 939  */
 940 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
 941                               struct flowi *fl,
 942                               struct request_sock *req,
 943                               struct tcp_fastopen_cookie *foc,
 944                               enum tcp_synack_type synack_type)
 945 {
 946         const struct inet_request_sock *ireq = inet_rsk(req);
 947         struct flowi4 fl4;
 948         int err = -1;
 949         struct sk_buff *skb;
 950
 951         /* First, grab a route. */
 952         if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
 953                 return -1;
 954
 955         skb = tcp_make_synack(sk, dst, req, foc, synack_type);
 956
 957         if (skb) {
 958                 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
 959
 960                 rcu_read_lock();
 961                 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
 962                                             ireq->ir_rmt_addr,
 963                                             rcu_dereference(ireq->ireq_opt));
 964                 rcu_read_unlock();
 965                 err = net_xmit_eval(err);
 966         }
 967
 968         return err;
 969 }
 970
 971 /*
 972  *      IPv4 request_sock destructor.
 973  */
 974 static void tcp_v4_reqsk_destructor(struct request_sock *req)
 975 {
 976         kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
 977 }
 978
 979 #ifdef CONFIG_TCP_MD5SIG
 980 /*
 981  * RFC2385 MD5 checksumming requires a mapping of
 982  * IP address->MD5 Key.
 983  * We need to maintain these in the sk structure.
 984  */
 985
 986 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
 987 EXPORT_SYMBOL(tcp_md5_needed);
 988
 989 /* Find the Key structure for an address.  */
 990 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk,
 991                                            const union tcp_md5_addr *addr,
 992                                            int family)
 993 {
 994         const struct tcp_sock *tp = tcp_sk(sk);
 995         struct tcp_md5sig_key *key;
 996         const struct tcp_md5sig_info *md5sig;
 997         __be32 mask;
 998         struct tcp_md5sig_key *best_match = NULL;
 999         bool match;
1000
1001         /* caller either holds rcu_read_lock() or socket lock */
1002         md5sig = rcu_dereference_check(tp->md5sig_info,
1003                                        lockdep_sock_is_held(sk));
1004         if (!md5sig)
1005                 return NULL;
1006
1007         hlist_for_each_entry_rcu(key, &md5sig->head, node) {
1008                 if (key->family != family)
1009                         continue;
1010
1011                 if (family == AF_INET) {
1012                         mask = inet_make_mask(key->prefixlen);
1013                         match = (key->addr.a4.s_addr & mask) ==
1014                                 (addr->a4.s_addr & mask);
1015 #if IS_ENABLED(CONFIG_IPV6)
1016                 } else if (family == AF_INET6) {
1017                         match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1018                                                   key->prefixlen);
1019 #endif
1020                 } else {
1021                         match = false;
1022                 }
1023
1024                 if (match && (!best_match ||
1025                               key->prefixlen > best_match->prefixlen))
1026                         best_match = key;
1027         }
1028         return best_match;
1029 }
1030 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1031
1032 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1033                                                       const union tcp_md5_addr *addr,
1034                                                       int family, u8 prefixlen)
1035 {
1036         const struct tcp_sock *tp = tcp_sk(sk);
1037         struct tcp_md5sig_key *key;
1038         unsigned int size = sizeof(struct in_addr);
1039         const struct tcp_md5sig_info *md5sig;
1040
1041         /* caller either holds rcu_read_lock() or socket lock */
1042         md5sig = rcu_dereference_check(tp->md5sig_info,
1043                                        lockdep_sock_is_held(sk));
1044         if (!md5sig)
1045                 return NULL;
1046 #if IS_ENABLED(CONFIG_IPV6)
1047         if (family == AF_INET6)
1048                 size = sizeof(struct in6_addr);
1049 #endif
1050         hlist_for_each_entry_rcu(key, &md5sig->head, node) {
1051                 if (key->family != family)
1052                         continue;
1053                 if (!memcmp(&key->addr, addr, size) &&
1054                     key->prefixlen == prefixlen)
1055                         return key;
1056         }
1057         return NULL;
1058 }
1059
1060 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1061                                          const struct sock *addr_sk)
1062 {
1063         const union tcp_md5_addr *addr;
1064
1065         addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1066         return tcp_md5_do_lookup(sk, addr, AF_INET);
1067 }
1068 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1069
1070 /* This can be called on a newly created socket, from other files */
1071 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1072                    int family, u8 prefixlen, const u8 *newkey, u8 newkeylen,
1073                    gfp_t gfp)
1074 {
1075         /* Add Key to the list */
1076         struct tcp_md5sig_key *key;
1077         struct tcp_sock *tp = tcp_sk(sk);
1078         struct tcp_md5sig_info *md5sig;
1079
1080         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1081         if (key) {
1082                 /* Pre-existing entry - just update that one.
1083                  * Note that the key might be used concurrently.
1084                  */
1085                 memcpy(key->key, newkey, newkeylen);
1086
1087                 /* Pairs with READ_ONCE() in tcp_md5_hash_key().
1088                  * Also note that a reader could catch new key->keylen value
1089                  * but old key->key[], this is the reason we use __GFP_ZERO
1090                  * at sock_kmalloc() time below these lines.
1091                  */
1092                 WRITE_ONCE(key->keylen, newkeylen);
1093
1094                 return 0;
1095         }
1096
1097         md5sig = rcu_dereference_protected(tp->md5sig_info,
1098                                            lockdep_sock_is_held(sk));
1099         if (!md5sig) {
1100                 md5sig = kmalloc(sizeof(*md5sig), gfp);
1101                 if (!md5sig)
1102                         return -ENOMEM;
1103
1104                 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1105                 INIT_HLIST_HEAD(&md5sig->head);
1106                 rcu_assign_pointer(tp->md5sig_info, md5sig);
1107         }
1108
1109         key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1110         if (!key)
1111                 return -ENOMEM;
1112         if (!tcp_alloc_md5sig_pool()) {
1113                 sock_kfree_s(sk, key, sizeof(*key));
1114                 return -ENOMEM;
1115         }
1116
1117         memcpy(key->key, newkey, newkeylen);
1118         key->keylen = newkeylen;
1119         key->family = family;
1120         key->prefixlen = prefixlen;
1121         memcpy(&key->addr, addr,
1122                (family == AF_INET6) ? sizeof(struct in6_addr) :
1123                                       sizeof(struct in_addr));
1124         hlist_add_head_rcu(&key->node, &md5sig->head);
1125         return 0;
1126 }
1127 EXPORT_SYMBOL(tcp_md5_do_add);
1128
1129 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1130                    u8 prefixlen)
1131 {
1132         struct tcp_md5sig_key *key;
1133
1134         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1135         if (!key)
1136                 return -ENOENT;
1137         hlist_del_rcu(&key->node);
1138         atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1139         kfree_rcu(key, rcu);
1140         return 0;
1141 }
1142 EXPORT_SYMBOL(tcp_md5_do_del);
1143
1144 static void tcp_clear_md5_list(struct sock *sk)
1145 {
1146         struct tcp_sock *tp = tcp_sk(sk);
1147         struct tcp_md5sig_key *key;
1148         struct hlist_node *n;
1149         struct tcp_md5sig_info *md5sig;
1150
1151         md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1152
1153         hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1154                 hlist_del_rcu(&key->node);
1155                 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1156                 kfree_rcu(key, rcu);
1157         }
1158 }
1159
1160 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1161                                  char __user *optval, int optlen)
1162 {
1163         struct tcp_md5sig cmd;
1164         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1165         u8 prefixlen = 32;
1166
1167         if (optlen < sizeof(cmd))
1168                 return -EINVAL;
1169
1170         if (copy_from_user(&cmd, optval, sizeof(cmd)))
1171                 return -EFAULT;
1172
1173         if (sin->sin_family != AF_INET)
1174                 return -EINVAL;
1175
1176         if (optname == TCP_MD5SIG_EXT &&
1177             cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1178                 prefixlen = cmd.tcpm_prefixlen;
1179                 if (prefixlen > 32)
1180                         return -EINVAL;
1181         }
1182
1183         if (!cmd.tcpm_keylen)
1184                 return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1185                                       AF_INET, prefixlen);
1186
1187         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1188                 return -EINVAL;
1189
1190         return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1191                               AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen,
1192                               GFP_KERNEL);
1193 }
1194
1195 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1196                                    __be32 daddr, __be32 saddr,
1197                                    const struct tcphdr *th, int nbytes)
1198 {
1199         struct tcp4_pseudohdr *bp;
1200         struct scatterlist sg;
1201         struct tcphdr *_th;
1202
1203         bp = hp->scratch;
1204         bp->saddr = saddr;
1205         bp->daddr = daddr;
1206         bp->pad = 0;
1207         bp->protocol = IPPROTO_TCP;
1208         bp->len = cpu_to_be16(nbytes);
1209
1210         _th = (struct tcphdr *)(bp + 1);
1211         memcpy(_th, th, sizeof(*th));
1212         _th->check = 0;
1213
1214         sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1215         ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1216                                 sizeof(*bp) + sizeof(*th));
1217         return crypto_ahash_update(hp->md5_req);
1218 }
1219
1220 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1221                                __be32 daddr, __be32 saddr, const struct tcphdr *th)
1222 {
1223         struct tcp_md5sig_pool *hp;
1224         struct ahash_request *req;
1225
1226         hp = tcp_get_md5sig_pool();
1227         if (!hp)
1228                 goto clear_hash_noput;
1229         req = hp->md5_req;
1230
1231         if (crypto_ahash_init(req))
1232                 goto clear_hash;
1233         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1234                 goto clear_hash;
1235         if (tcp_md5_hash_key(hp, key))
1236                 goto clear_hash;
1237         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1238         if (crypto_ahash_final(req))
1239                 goto clear_hash;
1240
1241         tcp_put_md5sig_pool();
1242         return 0;
1243
1244 clear_hash:
1245         tcp_put_md5sig_pool();
1246 clear_hash_noput:
1247         memset(md5_hash, 0, 16);
1248         return 1;
1249 }
1250
1251 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1252                         const struct sock *sk,
1253                         const struct sk_buff *skb)
1254 {
1255         struct tcp_md5sig_pool *hp;
1256         struct ahash_request *req;
1257         const struct tcphdr *th = tcp_hdr(skb);
1258         __be32 saddr, daddr;
1259
1260         if (sk) { /* valid for establish/request sockets */
1261                 saddr = sk->sk_rcv_saddr;
1262                 daddr = sk->sk_daddr;
1263         } else {
1264                 const struct iphdr *iph = ip_hdr(skb);
1265                 saddr = iph->saddr;
1266                 daddr = iph->daddr;
1267         }
1268
1269         hp = tcp_get_md5sig_pool();
1270         if (!hp)
1271                 goto clear_hash_noput;
1272         req = hp->md5_req;
1273
1274         if (crypto_ahash_init(req))
1275                 goto clear_hash;
1276
1277         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1278                 goto clear_hash;
1279         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1280                 goto clear_hash;
1281         if (tcp_md5_hash_key(hp, key))
1282                 goto clear_hash;
1283         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1284         if (crypto_ahash_final(req))
1285                 goto clear_hash;
1286
1287         tcp_put_md5sig_pool();
1288         return 0;
1289
1290 clear_hash:
1291         tcp_put_md5sig_pool();
1292 clear_hash_noput:
1293         memset(md5_hash, 0, 16);
1294         return 1;
1295 }
1296 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1297
1298 #endif
1299
1300 /* Called with rcu_read_lock() */
1301 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1302                                     const struct sk_buff *skb)
1303 {
1304 #ifdef CONFIG_TCP_MD5SIG
1305         /*
1306          * This gets called for each TCP segment that arrives
1307          * so we want to be efficient.
1308          * We have 3 drop cases:
1309          * o No MD5 hash and one expected.
1310          * o MD5 hash and we're not expecting one.
1311          * o MD5 hash and its wrong.
1312          */
1313         const __u8 *hash_location = NULL;
1314         struct tcp_md5sig_key *hash_expected;
1315         const struct iphdr *iph = ip_hdr(skb);
1316         const struct tcphdr *th = tcp_hdr(skb);
1317         int genhash;
1318         unsigned char newhash[16];
1319
1320         hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1321                                           AF_INET);
1322         hash_location = tcp_parse_md5sig_option(th);
1323
1324         /* We've parsed the options - do we have a hash? */
1325         if (!hash_expected && !hash_location)
1326                 return false;
1327
1328         if (hash_expected && !hash_location) {
1329                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1330                 return true;
1331         }
1332
1333         if (!hash_expected && hash_location) {
1334                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1335                 return true;
1336         }
1337
1338         /* Okay, so this is hash_expected and hash_location -
1339          * so we need to calculate the checksum.
1340          */
1341         genhash = tcp_v4_md5_hash_skb(newhash,
1342                                       hash_expected,
1343                                       NULL, skb);
1344
1345         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1346                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1347                 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1348                                      &iph->saddr, ntohs(th->source),
1349                                      &iph->daddr, ntohs(th->dest),
1350                                      genhash ? " tcp_v4_calc_md5_hash failed"
1351                                      : "");
1352                 return true;
1353         }
1354         return false;
1355 #endif
1356         return false;
1357 }
1358
1359 static void tcp_v4_init_req(struct request_sock *req,
1360                             const struct sock *sk_listener,
1361                             struct sk_buff *skb)
1362 {
1363         struct inet_request_sock *ireq = inet_rsk(req);
1364         struct net *net = sock_net(sk_listener);
1365
1366         sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1367         sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1368         RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1369 }
1370
1371 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1372                                           struct flowi *fl,
1373                                           const struct request_sock *req)
1374 {
1375         return inet_csk_route_req(sk, &fl->u.ip4, req);
1376 }
1377
1378 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1379         .family         =       PF_INET,
1380         .obj_size       =       sizeof(struct tcp_request_sock),
1381         .rtx_syn_ack    =       tcp_rtx_synack,
1382         .send_ack       =       tcp_v4_reqsk_send_ack,
1383         .destructor     =       tcp_v4_reqsk_destructor,
1384         .send_reset     =       tcp_v4_send_reset,
1385         .syn_ack_timeout =      tcp_syn_ack_timeout,
1386 };
1387
1388 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1389         .mss_clamp      =       TCP_MSS_DEFAULT,
1390 #ifdef CONFIG_TCP_MD5SIG
1391         .req_md5_lookup =       tcp_v4_md5_lookup,
1392         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1393 #endif
1394         .init_req       =       tcp_v4_init_req,
1395 #ifdef CONFIG_SYN_COOKIES
1396         .cookie_init_seq =      cookie_v4_init_sequence,
1397 #endif
1398         .route_req      =       tcp_v4_route_req,
1399         .init_seq       =       tcp_v4_init_seq,
1400         .init_ts_off    =       tcp_v4_init_ts_off,
1401         .send_synack    =       tcp_v4_send_synack,
1402 };
1403
1404 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1405 {
1406         /* Never answer to SYNs send to broadcast or multicast */
1407         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1408                 goto drop;
1409
1410         return tcp_conn_request(&tcp_request_sock_ops,
1411                                 &tcp_request_sock_ipv4_ops, sk, skb);
1412
1413 drop:
1414         tcp_listendrop(sk);
1415         return 0;
1416 }
1417 EXPORT_SYMBOL(tcp_v4_conn_request);
1418
1419
1420 /*
1421  * The three way handshake has completed - we got a valid synack -
1422  * now create the new socket.
1423  */
1424 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1425                                   struct request_sock *req,
1426                                   struct dst_entry *dst,
1427                                   struct request_sock *req_unhash,
1428                                   bool *own_req)
1429 {
1430         struct inet_request_sock *ireq;
1431         bool found_dup_sk = false;
1432         struct inet_sock *newinet;
1433         struct tcp_sock *newtp;
1434         struct sock *newsk;
1435 #ifdef CONFIG_TCP_MD5SIG
1436         struct tcp_md5sig_key *key;
1437 #endif
1438         struct ip_options_rcu *inet_opt;
1439
1440         if (sk_acceptq_is_full(sk))
1441                 goto exit_overflow;
1442
1443         newsk = tcp_create_openreq_child(sk, req, skb);
1444         if (!newsk)
1445                 goto exit_nonewsk;
1446
1447         newsk->sk_gso_type = SKB_GSO_TCPV4;
1448         inet_sk_rx_dst_set(newsk, skb);
1449
1450         newtp                 = tcp_sk(newsk);
1451         newinet               = inet_sk(newsk);
1452         ireq                  = inet_rsk(req);
1453         sk_daddr_set(newsk, ireq->ir_rmt_addr);
1454         sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1455         newsk->sk_bound_dev_if = ireq->ir_iif;
1456         newinet->inet_saddr   = ireq->ir_loc_addr;
1457         inet_opt              = rcu_dereference(ireq->ireq_opt);
1458         RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1459         newinet->mc_index     = inet_iif(skb);
1460         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1461         newinet->rcv_tos      = ip_hdr(skb)->tos;
1462         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1463         if (inet_opt)
1464                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1465         newinet->inet_id = prandom_u32();
1466
1467         if (!dst) {
1468                 dst = inet_csk_route_child_sock(sk, newsk, req);
1469                 if (!dst)
1470                         goto put_and_exit;
1471         } else {
1472                 /* syncookie case : see end of cookie_v4_check() */
1473         }
1474         sk_setup_caps(newsk, dst);
1475
1476         tcp_ca_openreq_child(newsk, dst);
1477
1478         tcp_sync_mss(newsk, dst_mtu(dst));
1479         newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1480
1481         tcp_initialize_rcv_mss(newsk);
1482
1483 #ifdef CONFIG_TCP_MD5SIG
1484         /* Copy over the MD5 key from the original socket */
1485         key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1486                                 AF_INET);
1487         if (key) {
1488                 /*
1489                  * We're using one, so create a matching key
1490                  * on the newsk structure. If we fail to get
1491                  * memory, then we end up not copying the key
1492                  * across. Shucks.
1493                  */
1494                 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1495                                AF_INET, 32, key->key, key->keylen, GFP_ATOMIC);
1496                 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1497         }
1498 #endif
1499
1500         if (__inet_inherit_port(sk, newsk) < 0)
1501                 goto put_and_exit;
1502         *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1503                                        &found_dup_sk);
1504         if (likely(*own_req)) {
1505                 tcp_move_syn(newtp, req);
1506                 ireq->ireq_opt = NULL;
1507         } else {
1508                 newinet->inet_opt = NULL;
1509
1510                 if (!req_unhash && found_dup_sk) {
1511                         /* This code path should only be executed in the
1512                          * syncookie case only
1513                          */
1514                         bh_unlock_sock(newsk);
1515                         sock_put(newsk);
1516                         newsk = NULL;
1517                 }
1518         }
1519         return newsk;
1520
1521 exit_overflow:
1522         NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1523 exit_nonewsk:
1524         dst_release(dst);
1525 exit:
1526         tcp_listendrop(sk);
1527         return NULL;
1528 put_and_exit:
1529         newinet->inet_opt = NULL;
1530         inet_csk_prepare_forced_close(newsk);
1531         tcp_done(newsk);
1532         goto exit;
1533 }
1534 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1535
1536 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1537 {
1538 #ifdef CONFIG_SYN_COOKIES
1539         const struct tcphdr *th = tcp_hdr(skb);
1540
1541         if (!th->syn)
1542                 sk = cookie_v4_check(sk, skb);
1543 #endif
1544         return sk;
1545 }
1546
1547 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1548                          struct tcphdr *th, u32 *cookie)
1549 {
1550         u16 mss = 0;
1551 #ifdef CONFIG_SYN_COOKIES
1552         mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1553                                     &tcp_request_sock_ipv4_ops, sk, th);
1554         if (mss) {
1555                 *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1556                 tcp_synq_overflow(sk);
1557         }
1558 #endif
1559         return mss;
1560 }
1561
1562 /* The socket must have it's spinlock held when we get
1563  * here, unless it is a TCP_LISTEN socket.
1564  *
1565  * We have a potential double-lock case here, so even when
1566  * doing backlog processing we use the BH locking scheme.
1567  * This is because we cannot sleep with the original spinlock
1568  * held.
1569  */
1570 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1571 {
1572         struct sock *rsk;
1573
1574         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1575                 struct dst_entry *dst;
1576
1577                 dst = rcu_dereference_protected(sk->sk_rx_dst,
1578                                                 lockdep_sock_is_held(sk));
1579
1580                 sock_rps_save_rxhash(sk, skb);
1581                 sk_mark_napi_id(sk, skb);
1582                 if (dst) {
1583                         if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1584                             !dst->ops->check(dst, 0)) {
1585                                 RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1586                                 dst_release(dst);
1587                         }
1588                 }
1589                 tcp_rcv_established(sk, skb);
1590                 return 0;
1591         }
1592
1593         if (tcp_checksum_complete(skb))
1594                 goto csum_err;
1595
1596         if (sk->sk_state == TCP_LISTEN) {
1597                 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1598
1599                 if (!nsk)
1600                         goto discard;
1601                 if (nsk != sk) {
1602                         if (tcp_child_process(sk, nsk, skb)) {
1603                                 rsk = nsk;
1604                                 goto reset;
1605                         }
1606                         return 0;
1607                 }
1608         } else
1609                 sock_rps_save_rxhash(sk, skb);
1610
1611         if (tcp_rcv_state_process(sk, skb)) {
1612                 rsk = sk;
1613                 goto reset;
1614         }
1615         return 0;
1616
1617 reset:
1618         tcp_v4_send_reset(rsk, skb);
1619 discard:
1620         kfree_skb(skb);
1621         /* Be careful here. If this function gets more complicated and
1622          * gcc suffers from register pressure on the x86, sk (in %ebx)
1623          * might be destroyed here. This current version compiles correctly,
1624          * but you have been warned.
1625          */
1626         return 0;
1627
1628 csum_err:
1629         TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1630         TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1631         goto discard;
1632 }
1633 EXPORT_SYMBOL(tcp_v4_do_rcv);
1634
1635 int tcp_v4_early_demux(struct sk_buff *skb)
1636 {
1637         const struct iphdr *iph;
1638         const struct tcphdr *th;
1639         struct sock *sk;
1640
1641         if (skb->pkt_type != PACKET_HOST)
1642                 return 0;
1643
1644         if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1645                 return 0;
1646
1647         iph = ip_hdr(skb);
1648         th = tcp_hdr(skb);
1649
1650         if (th->doff < sizeof(struct tcphdr) / 4)
1651                 return 0;
1652
1653         sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1654                                        iph->saddr, th->source,
1655                                        iph->daddr, ntohs(th->dest),
1656                                        skb->skb_iif, inet_sdif(skb));
1657         if (sk) {
1658                 skb->sk = sk;
1659                 skb->destructor = sock_edemux;
1660                 if (sk_fullsock(sk)) {
1661                         struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1662
1663                         if (dst)
1664                                 dst = dst_check(dst, 0);
1665                         if (dst &&
1666                             inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1667                                 skb_dst_set_noref(skb, dst);
1668                 }
1669         }
1670         return 0;
1671 }
1672
1673 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1674 {
1675         u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf);
1676         u32 tail_gso_size, tail_gso_segs;
1677         struct skb_shared_info *shinfo;
1678         const struct tcphdr *th;
1679         struct tcphdr *thtail;
1680         struct sk_buff *tail;
1681         unsigned int hdrlen;
1682         bool fragstolen;
1683         u32 gso_segs;
1684         u32 gso_size;
1685         int delta;
1686
1687         /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1688          * we can fix skb->truesize to its real value to avoid future drops.
1689          * This is valid because skb is not yet charged to the socket.
1690          * It has been noticed pure SACK packets were sometimes dropped
1691          * (if cooked by drivers without copybreak feature).
1692          */
1693         skb_condense(skb);
1694
1695         skb_dst_drop(skb);
1696
1697         if (unlikely(tcp_checksum_complete(skb))) {
1698                 bh_unlock_sock(sk);
1699                 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1700                 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1701                 return true;
1702         }
1703
1704         /* Attempt coalescing to last skb in backlog, even if we are
1705          * above the limits.
1706          * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1707          */
1708         th = (const struct tcphdr *)skb->data;
1709         hdrlen = th->doff * 4;
1710
1711         tail = sk->sk_backlog.tail;
1712         if (!tail)
1713                 goto no_coalesce;
1714         thtail = (struct tcphdr *)tail->data;
1715
1716         if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1717             TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1718             ((TCP_SKB_CB(tail)->tcp_flags |
1719               TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1720             !((TCP_SKB_CB(tail)->tcp_flags &
1721               TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1722             ((TCP_SKB_CB(tail)->tcp_flags ^
1723               TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1724 #ifdef CONFIG_TLS_DEVICE
1725             tail->decrypted != skb->decrypted ||
1726 #endif
1727             thtail->doff != th->doff ||
1728             memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1729                 goto no_coalesce;
1730
1731         __skb_pull(skb, hdrlen);
1732
1733         shinfo = skb_shinfo(skb);
1734         gso_size = shinfo->gso_size ?: skb->len;
1735         gso_segs = shinfo->gso_segs ?: 1;
1736
1737         shinfo = skb_shinfo(tail);
1738         tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1739         tail_gso_segs = shinfo->gso_segs ?: 1;
1740
1741         if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1742                 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1743
1744                 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1745                         TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1746                         thtail->window = th->window;
1747                 }
1748
1749                 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1750                  * thtail->fin, so that the fast path in tcp_rcv_established()
1751                  * is not entered if we append a packet with a FIN.
1752                  * SYN, RST, URG are not present.
1753                  * ACK is set on both packets.
1754                  * PSH : we do not really care in TCP stack,
1755                  *       at least for 'GRO' packets.
1756                  */
1757                 thtail->fin |= th->fin;
1758                 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1759
1760                 if (TCP_SKB_CB(skb)->has_rxtstamp) {
1761                         TCP_SKB_CB(tail)->has_rxtstamp = true;
1762                         tail->tstamp = skb->tstamp;
1763                         skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1764                 }
1765
1766                 /* Not as strict as GRO. We only need to carry mss max value */
1767                 shinfo->gso_size = max(gso_size, tail_gso_size);
1768                 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
1769
1770                 sk->sk_backlog.len += delta;
1771                 __NET_INC_STATS(sock_net(sk),
1772                                 LINUX_MIB_TCPBACKLOGCOALESCE);
1773                 kfree_skb_partial(skb, fragstolen);
1774                 return false;
1775         }
1776         __skb_push(skb, hdrlen);
1777
1778 no_coalesce:
1779         /* Only socket owner can try to collapse/prune rx queues
1780          * to reduce memory overhead, so add a little headroom here.
1781          * Few sockets backlog are possibly concurrently non empty.
1782          */
1783         limit += 64*1024;
1784
1785         if (unlikely(sk_add_backlog(sk, skb, limit))) {
1786                 bh_unlock_sock(sk);
1787                 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1788                 return true;
1789         }
1790         return false;
1791 }
1792 EXPORT_SYMBOL(tcp_add_backlog);
1793
1794 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1795 {
1796         struct tcphdr *th = (struct tcphdr *)skb->data;
1797
1798         return sk_filter_trim_cap(sk, skb, th->doff * 4);
1799 }
1800 EXPORT_SYMBOL(tcp_filter);
1801
1802 static void tcp_v4_restore_cb(struct sk_buff *skb)
1803 {
1804         memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1805                 sizeof(struct inet_skb_parm));
1806 }
1807
1808 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1809                            const struct tcphdr *th)
1810 {
1811         /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
1812          * barrier() makes sure compiler wont play fool^Waliasing games.
1813          */
1814         memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1815                 sizeof(struct inet_skb_parm));
1816         barrier();
1817
1818         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1819         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1820                                     skb->len - th->doff * 4);
1821         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1822         TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1823         TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1824         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1825         TCP_SKB_CB(skb)->sacked  = 0;
1826         TCP_SKB_CB(skb)->has_rxtstamp =
1827                         skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1828 }
1829
1830 /*
1831  *      From tcp_input.c
1832  */
1833
1834 int tcp_v4_rcv(struct sk_buff *skb)
1835 {
1836         struct net *net = dev_net(skb->dev);
1837         struct sk_buff *skb_to_free;
1838         int sdif = inet_sdif(skb);
1839         const struct iphdr *iph;
1840         const struct tcphdr *th;
1841         bool refcounted;
1842         struct sock *sk;
1843         int ret;
1844
1845         if (skb->pkt_type != PACKET_HOST)
1846                 goto discard_it;
1847
1848         /* Count it even if it's bad */
1849         __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1850
1851         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1852                 goto discard_it;
1853
1854         th = (const struct tcphdr *)skb->data;
1855
1856         if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1857                 goto bad_packet;
1858         if (!pskb_may_pull(skb, th->doff * 4))
1859                 goto discard_it;
1860
1861         /* An explanation is required here, I think.
1862          * Packet length and doff are validated by header prediction,
1863          * provided case of th->doff==0 is eliminated.
1864          * So, we defer the checks. */
1865
1866         if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1867                 goto csum_error;
1868
1869         th = (const struct tcphdr *)skb->data;
1870         iph = ip_hdr(skb);
1871 lookup:
1872         sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1873                                th->dest, sdif, &refcounted);
1874         if (!sk)
1875                 goto no_tcp_socket;
1876
1877 process:
1878         if (sk->sk_state == TCP_TIME_WAIT)
1879                 goto do_time_wait;
1880
1881         if (sk->sk_state == TCP_NEW_SYN_RECV) {
1882                 struct request_sock *req = inet_reqsk(sk);
1883                 bool req_stolen = false;
1884                 struct sock *nsk;
1885
1886                 sk = req->rsk_listener;
1887                 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1888                         sk_drops_add(sk, skb);
1889                         reqsk_put(req);
1890                         goto discard_it;
1891                 }
1892                 if (tcp_checksum_complete(skb)) {
1893                         reqsk_put(req);
1894                         goto csum_error;
1895                 }
1896                 if (unlikely(sk->sk_state != TCP_LISTEN)) {
1897                         inet_csk_reqsk_queue_drop_and_put(sk, req);
1898                         goto lookup;
1899                 }
1900                 /* We own a reference on the listener, increase it again
1901                  * as we might lose it too soon.
1902                  */
1903                 sock_hold(sk);
1904                 refcounted = true;
1905                 nsk = NULL;
1906                 if (!tcp_filter(sk, skb)) {
1907                         th = (const struct tcphdr *)skb->data;
1908                         iph = ip_hdr(skb);
1909                         tcp_v4_fill_cb(skb, iph, th);
1910                         nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
1911                 }
1912                 if (!nsk) {
1913                         reqsk_put(req);
1914                         if (req_stolen) {
1915                                 /* Another cpu got exclusive access to req
1916                                  * and created a full blown socket.
1917                                  * Try to feed this packet to this socket
1918                                  * instead of discarding it.
1919                                  */
1920                                 tcp_v4_restore_cb(skb);
1921                                 sock_put(sk);
1922                                 goto lookup;
1923                         }
1924                         goto discard_and_relse;
1925                 }
1926                 if (nsk == sk) {
1927                         reqsk_put(req);
1928                         tcp_v4_restore_cb(skb);
1929                 } else if (tcp_child_process(sk, nsk, skb)) {
1930                         tcp_v4_send_reset(nsk, skb);
1931                         goto discard_and_relse;
1932                 } else {
1933                         sock_put(sk);
1934                         return 0;
1935                 }
1936         }
1937         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1938                 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
1939                 goto discard_and_relse;
1940         }
1941
1942         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1943                 goto discard_and_relse;
1944
1945         if (tcp_v4_inbound_md5_hash(sk, skb))
1946                 goto discard_and_relse;
1947
1948         nf_reset_ct(skb);
1949
1950         if (tcp_filter(sk, skb))
1951                 goto discard_and_relse;
1952         th = (const struct tcphdr *)skb->data;
1953         iph = ip_hdr(skb);
1954         tcp_v4_fill_cb(skb, iph, th);
1955
1956         skb->dev = NULL;
1957
1958         if (sk->sk_state == TCP_LISTEN) {
1959                 ret = tcp_v4_do_rcv(sk, skb);
1960                 goto put_and_return;
1961         }
1962
1963         sk_incoming_cpu_update(sk);
1964
1965         bh_lock_sock_nested(sk);
1966         tcp_segs_in(tcp_sk(sk), skb);
1967         ret = 0;
1968         if (!sock_owned_by_user(sk)) {
1969                 skb_to_free = sk->sk_rx_skb_cache;
1970                 sk->sk_rx_skb_cache = NULL;
1971                 ret = tcp_v4_do_rcv(sk, skb);
1972         } else {
1973                 if (tcp_add_backlog(sk, skb))
1974                         goto discard_and_relse;
1975                 skb_to_free = NULL;
1976         }
1977         bh_unlock_sock(sk);
1978         if (skb_to_free)
1979                 __kfree_skb(skb_to_free);
1980
1981 put_and_return:
1982         if (refcounted)
1983                 sock_put(sk);
1984
1985         return ret;
1986
1987 no_tcp_socket:
1988         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1989                 goto discard_it;
1990
1991         tcp_v4_fill_cb(skb, iph, th);
1992
1993         if (tcp_checksum_complete(skb)) {
1994 csum_error:
1995                 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
1996 bad_packet:
1997                 __TCP_INC_STATS(net, TCP_MIB_INERRS);
1998         } else {
1999                 tcp_v4_send_reset(NULL, skb);
2000         }
2001
2002 discard_it:
2003         /* Discard frame. */
2004         kfree_skb(skb);
2005         return 0;
2006
2007 discard_and_relse:
2008         sk_drops_add(sk, skb);
2009         if (refcounted)
2010                 sock_put(sk);
2011         goto discard_it;
2012
2013 do_time_wait:
2014         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2015                 inet_twsk_put(inet_twsk(sk));
2016                 goto discard_it;
2017         }
2018
2019         tcp_v4_fill_cb(skb, iph, th);
2020
2021         if (tcp_checksum_complete(skb)) {
2022                 inet_twsk_put(inet_twsk(sk));
2023                 goto csum_error;
2024         }
2025         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2026         case TCP_TW_SYN: {
2027                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2028                                                         &tcp_hashinfo, skb,
2029                                                         __tcp_hdrlen(th),
2030                                                         iph->saddr, th->source,
2031                                                         iph->daddr, th->dest,
2032                                                         inet_iif(skb),
2033                                                         sdif);
2034                 if (sk2) {
2035                         inet_twsk_deschedule_put(inet_twsk(sk));
2036                         sk = sk2;
2037                         tcp_v4_restore_cb(skb);
2038                         refcounted = false;
2039                         goto process;
2040                 }
2041         }
2042                 /* to ACK */
2043                 /* fall through */
2044         case TCP_TW_ACK:
2045                 tcp_v4_timewait_ack(sk, skb);
2046                 break;
2047         case TCP_TW_RST:
2048                 tcp_v4_send_reset(sk, skb);
2049                 inet_twsk_deschedule_put(inet_twsk(sk));
2050                 goto discard_it;
2051         case TCP_TW_SUCCESS:;
2052         }
2053         goto discard_it;
2054 }
2055
2056 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2057         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
2058         .twsk_unique    = tcp_twsk_unique,
2059         .twsk_destructor= tcp_twsk_destructor,
2060 };
2061
2062 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2063 {
2064         struct dst_entry *dst = skb_dst(skb);
2065
2066         if (dst && dst_hold_safe(dst)) {
2067                 rcu_assign_pointer(sk->sk_rx_dst, dst);
2068                 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2069         }
2070 }
2071 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2072
2073 const struct inet_connection_sock_af_ops ipv4_specific = {
2074         .queue_xmit        = ip_queue_xmit,
2075         .send_check        = tcp_v4_send_check,
2076         .rebuild_header    = inet_sk_rebuild_header,
2077         .sk_rx_dst_set     = inet_sk_rx_dst_set,
2078         .conn_request      = tcp_v4_conn_request,
2079         .syn_recv_sock     = tcp_v4_syn_recv_sock,
2080         .net_header_len    = sizeof(struct iphdr),
2081         .setsockopt        = ip_setsockopt,
2082         .getsockopt        = ip_getsockopt,
2083         .addr2sockaddr     = inet_csk_addr2sockaddr,
2084         .sockaddr_len      = sizeof(struct sockaddr_in),
2085 #ifdef CONFIG_COMPAT
2086         .compat_setsockopt = compat_ip_setsockopt,
2087         .compat_getsockopt = compat_ip_getsockopt,
2088 #endif
2089         .mtu_reduced       = tcp_v4_mtu_reduced,
2090 };
2091 EXPORT_SYMBOL(ipv4_specific);
2092
2093 #ifdef CONFIG_TCP_MD5SIG
2094 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2095         .md5_lookup             = tcp_v4_md5_lookup,
2096         .calc_md5_hash          = tcp_v4_md5_hash_skb,
2097         .md5_parse              = tcp_v4_parse_md5_keys,
2098 };
2099 #endif
2100
2101 /* NOTE: A lot of things set to zero explicitly by call to
2102  *       sk_alloc() so need not be done here.
2103  */
2104 static int tcp_v4_init_sock(struct sock *sk)
2105 {
2106         struct inet_connection_sock *icsk = inet_csk(sk);
2107
2108         tcp_init_sock(sk);
2109
2110         icsk->icsk_af_ops = &ipv4_specific;
2111
2112 #ifdef CONFIG_TCP_MD5SIG
2113         tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2114 #endif
2115
2116         return 0;
2117 }
2118
2119 void tcp_v4_destroy_sock(struct sock *sk)
2120 {
2121         struct tcp_sock *tp = tcp_sk(sk);
2122
2123         trace_tcp_destroy_sock(sk);
2124
2125         tcp_clear_xmit_timers(sk);
2126
2127         tcp_cleanup_congestion_control(sk);
2128
2129         tcp_cleanup_ulp(sk);
2130
2131         /* Cleanup up the write buffer. */
2132         tcp_write_queue_purge(sk);
2133
2134         /* Check if we want to disable active TFO */
2135         tcp_fastopen_active_disable_ofo_check(sk);
2136
2137         /* Cleans up our, hopefully empty, out_of_order_queue. */
2138         skb_rbtree_purge(&tp->out_of_order_queue);
2139
2140 #ifdef CONFIG_TCP_MD5SIG
2141         /* Clean up the MD5 key list, if any */
2142         if (tp->md5sig_info) {
2143                 tcp_clear_md5_list(sk);
2144                 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2145                 tp->md5sig_info = NULL;
2146         }
2147 #endif
2148
2149         /* Clean up a referenced TCP bind bucket. */
2150         if (inet_csk(sk)->icsk_bind_hash)
2151                 inet_put_port(sk);
2152
2153         BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2154
2155         /* If socket is aborted during connect operation */
2156         tcp_free_fastopen_req(tp);
2157         tcp_fastopen_destroy_cipher(sk);
2158         tcp_saved_syn_free(tp);
2159
2160         sk_sockets_allocated_dec(sk);
2161 }
2162 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2163
2164 #ifdef CONFIG_PROC_FS
2165 /* Proc filesystem TCP sock list dumping. */
2166
2167 /*
2168  * Get next listener socket follow cur.  If cur is NULL, get first socket
2169  * starting from bucket given in st->bucket; when st->bucket is zero the
2170  * very first socket in the hash table is returned.
2171  */
2172 static void *listening_get_next(struct seq_file *seq, void *cur)
2173 {
2174         struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2175         struct tcp_iter_state *st = seq->private;
2176         struct net *net = seq_file_net(seq);
2177         struct inet_listen_hashbucket *ilb;
2178         struct hlist_nulls_node *node;
2179         struct sock *sk = cur;
2180
2181         if (!sk) {
2182 get_head:
2183                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2184                 spin_lock(&ilb->lock);
2185                 sk = sk_nulls_head(&ilb->nulls_head);
2186                 st->offset = 0;
2187                 goto get_sk;
2188         }
2189         ilb = &tcp_hashinfo.listening_hash[st->bucket];
2190         ++st->num;
2191         ++st->offset;
2192
2193         sk = sk_nulls_next(sk);
2194 get_sk:
2195         sk_nulls_for_each_from(sk, node) {
2196                 if (!net_eq(sock_net(sk), net))
2197                         continue;
2198                 if (sk->sk_family == afinfo->family)
2199                         return sk;
2200         }
2201         spin_unlock(&ilb->lock);
2202         st->offset = 0;
2203         if (++st->bucket < INET_LHTABLE_SIZE)
2204                 goto get_head;
2205         return NULL;
2206 }
2207
2208 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2209 {
2210         struct tcp_iter_state *st = seq->private;
2211         void *rc;
2212
2213         st->bucket = 0;
2214         st->offset = 0;
2215         rc = listening_get_next(seq, NULL);
2216
2217         while (rc && *pos) {
2218                 rc = listening_get_next(seq, rc);
2219                 --*pos;
2220         }
2221         return rc;
2222 }
2223
2224 static inline bool empty_bucket(const struct tcp_iter_state *st)
2225 {
2226         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2227 }
2228
2229 /*
2230  * Get first established socket starting from bucket given in st->bucket.
2231  * If st->bucket is zero, the very first socket in the hash is returned.
2232  */
2233 static void *established_get_first(struct seq_file *seq)
2234 {
2235         struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2236         struct tcp_iter_state *st = seq->private;
2237         struct net *net = seq_file_net(seq);
2238         void *rc = NULL;
2239
2240         st->offset = 0;
2241         for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2242                 struct sock *sk;
2243                 struct hlist_nulls_node *node;
2244                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2245
2246                 /* Lockless fast path for the common case of empty buckets */
2247                 if (empty_bucket(st))
2248                         continue;
2249
2250                 spin_lock_bh(lock);
2251                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2252                         if (sk->sk_family != afinfo->family ||
2253                             !net_eq(sock_net(sk), net)) {
2254                                 continue;
2255                         }
2256                         rc = sk;
2257                         goto out;
2258                 }
2259                 spin_unlock_bh(lock);
2260         }
2261 out:
2262         return rc;
2263 }
2264
2265 static void *established_get_next(struct seq_file *seq, void *cur)
2266 {
2267         struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2268         struct sock *sk = cur;
2269         struct hlist_nulls_node *node;
2270         struct tcp_iter_state *st = seq->private;
2271         struct net *net = seq_file_net(seq);
2272
2273         ++st->num;
2274         ++st->offset;
2275
2276         sk = sk_nulls_next(sk);
2277
2278         sk_nulls_for_each_from(sk, node) {
2279                 if (sk->sk_family == afinfo->family &&
2280                     net_eq(sock_net(sk), net))
2281                         return sk;
2282         }
2283
2284         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2285         ++st->bucket;
2286         return established_get_first(seq);
2287 }
2288
2289 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2290 {
2291         struct tcp_iter_state *st = seq->private;
2292         void *rc;
2293
2294         st->bucket = 0;
2295         rc = established_get_first(seq);
2296
2297         while (rc && pos) {
2298                 rc = established_get_next(seq, rc);
2299                 --pos;
2300         }
2301         return rc;
2302 }
2303
2304 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2305 {
2306         void *rc;
2307         struct tcp_iter_state *st = seq->private;
2308
2309         st->state = TCP_SEQ_STATE_LISTENING;
2310         rc        = listening_get_idx(seq, &pos);
2311
2312         if (!rc) {
2313                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2314                 rc        = established_get_idx(seq, pos);
2315         }
2316
2317         return rc;
2318 }
2319
2320 static void *tcp_seek_last_pos(struct seq_file *seq)
2321 {
2322         struct tcp_iter_state *st = seq->private;
2323         int bucket = st->bucket;
2324         int offset = st->offset;
2325         int orig_num = st->num;
2326         void *rc = NULL;
2327
2328         switch (st->state) {
2329         case TCP_SEQ_STATE_LISTENING:
2330                 if (st->bucket >= INET_LHTABLE_SIZE)
2331                         break;
2332                 st->state = TCP_SEQ_STATE_LISTENING;
2333                 rc = listening_get_next(seq, NULL);
2334                 while (offset-- && rc && bucket == st->bucket)
2335                         rc = listening_get_next(seq, rc);
2336                 if (rc)
2337                         break;
2338                 st->bucket = 0;
2339                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2340                 /* Fallthrough */
2341         case TCP_SEQ_STATE_ESTABLISHED:
2342                 if (st->bucket > tcp_hashinfo.ehash_mask)
2343                         break;
2344                 rc = established_get_first(seq);
2345                 while (offset-- && rc && bucket == st->bucket)
2346                         rc = established_get_next(seq, rc);
2347         }
2348
2349         st->num = orig_num;
2350
2351         return rc;
2352 }
2353
2354 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2355 {
2356         struct tcp_iter_state *st = seq->private;
2357         void *rc;
2358
2359         if (*pos && *pos == st->last_pos) {
2360                 rc = tcp_seek_last_pos(seq);
2361                 if (rc)
2362                         goto out;
2363         }
2364
2365         st->state = TCP_SEQ_STATE_LISTENING;
2366         st->num = 0;
2367         st->bucket = 0;
2368         st->offset = 0;
2369         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2370
2371 out:
2372         st->last_pos = *pos;
2373         return rc;
2374 }
2375 EXPORT_SYMBOL(tcp_seq_start);
2376
2377 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2378 {
2379         struct tcp_iter_state *st = seq->private;
2380         void *rc = NULL;
2381
2382         if (v == SEQ_START_TOKEN) {
2383                 rc = tcp_get_idx(seq, 0);
2384                 goto out;
2385         }
2386
2387         switch (st->state) {
2388         case TCP_SEQ_STATE_LISTENING:
2389                 rc = listening_get_next(seq, v);
2390                 if (!rc) {
2391                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2392                         st->bucket = 0;
2393                         st->offset = 0;
2394                         rc        = established_get_first(seq);
2395                 }
2396                 break;
2397         case TCP_SEQ_STATE_ESTABLISHED:
2398                 rc = established_get_next(seq, v);
2399                 break;
2400         }
2401 out:
2402         ++*pos;
2403         st->last_pos = *pos;
2404         return rc;
2405 }
2406 EXPORT_SYMBOL(tcp_seq_next);
2407
2408 void tcp_seq_stop(struct seq_file *seq, void *v)
2409 {
2410         struct tcp_iter_state *st = seq->private;
2411
2412         switch (st->state) {
2413         case TCP_SEQ_STATE_LISTENING:
2414                 if (v != SEQ_START_TOKEN)
2415                         spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2416                 break;
2417         case TCP_SEQ_STATE_ESTABLISHED:
2418                 if (v)
2419                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2420                 break;
2421         }
2422 }
2423 EXPORT_SYMBOL(tcp_seq_stop);
2424
2425 static void get_openreq4(const struct request_sock *req,
2426                          struct seq_file *f, int i)
2427 {
2428         const struct inet_request_sock *ireq = inet_rsk(req);
2429         long delta = req->rsk_timer.expires - jiffies;
2430
2431         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2432                 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2433                 i,
2434                 ireq->ir_loc_addr,
2435                 ireq->ir_num,
2436                 ireq->ir_rmt_addr,
2437                 ntohs(ireq->ir_rmt_port),
2438                 TCP_SYN_RECV,
2439                 0, 0, /* could print option size, but that is af dependent. */
2440                 1,    /* timers active (only the expire timer) */
2441                 jiffies_delta_to_clock_t(delta),
2442                 req->num_timeout,
2443                 from_kuid_munged(seq_user_ns(f),
2444                                  sock_i_uid(req->rsk_listener)),
2445                 0,  /* non standard timer */
2446                 0, /* open_requests have no inode */
2447                 0,
2448                 req);
2449 }
2450
2451 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2452 {
2453         int timer_active;
2454         unsigned long timer_expires;
2455         const struct tcp_sock *tp = tcp_sk(sk);
2456         const struct inet_connection_sock *icsk = inet_csk(sk);
2457         const struct inet_sock *inet = inet_sk(sk);
2458         const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2459         __be32 dest = inet->inet_daddr;
2460         __be32 src = inet->inet_rcv_saddr;
2461         __u16 destp = ntohs(inet->inet_dport);
2462         __u16 srcp = ntohs(inet->inet_sport);
2463         int rx_queue;
2464         int state;
2465
2466         if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2467             icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2468             icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2469                 timer_active    = 1;
2470                 timer_expires   = icsk->icsk_timeout;
2471         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2472                 timer_active    = 4;
2473                 timer_expires   = icsk->icsk_timeout;
2474         } else if (timer_pending(&sk->sk_timer)) {
2475                 timer_active    = 2;
2476                 timer_expires   = sk->sk_timer.expires;
2477         } else {
2478                 timer_active    = 0;
2479                 timer_expires = jiffies;
2480         }
2481
2482         state = inet_sk_state_load(sk);
2483         if (state == TCP_LISTEN)
2484                 rx_queue = sk->sk_ack_backlog;
2485         else
2486                 /* Because we don't lock the socket,
2487                  * we might find a transient negative value.
2488                  */
2489                 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2490                                       READ_ONCE(tp->copied_seq), 0);
2491
2492         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2493                         "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2494                 i, src, srcp, dest, destp, state,
2495                 READ_ONCE(tp->write_seq) - tp->snd_una,
2496                 rx_queue,
2497                 timer_active,
2498                 jiffies_delta_to_clock_t(timer_expires - jiffies),
2499                 icsk->icsk_retransmits,
2500                 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2501                 icsk->icsk_probes_out,
2502                 sock_i_ino(sk),
2503                 refcount_read(&sk->sk_refcnt), sk,
2504                 jiffies_to_clock_t(icsk->icsk_rto),
2505                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2506                 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2507                 tp->snd_cwnd,
2508                 state == TCP_LISTEN ?
2509                     fastopenq->max_qlen :
2510                     (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2511 }
2512
2513 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2514                                struct seq_file *f, int i)
2515 {
2516         long delta = tw->tw_timer.expires - jiffies;
2517         __be32 dest, src;
2518         __u16 destp, srcp;
2519
2520         dest  = tw->tw_daddr;
2521         src   = tw->tw_rcv_saddr;
2522         destp = ntohs(tw->tw_dport);
2523         srcp  = ntohs(tw->tw_sport);
2524
2525         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2526                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2527                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2528                 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2529                 refcount_read(&tw->tw_refcnt), tw);
2530 }
2531
2532 #define TMPSZ 150
2533
2534 static int tcp4_seq_show(struct seq_file *seq, void *v)
2535 {
2536         struct tcp_iter_state *st;
2537         struct sock *sk = v;
2538
2539         seq_setwidth(seq, TMPSZ - 1);
2540         if (v == SEQ_START_TOKEN) {
2541                 seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2542                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2543                            "inode");
2544                 goto out;
2545         }
2546         st = seq->private;
2547
2548         if (sk->sk_state == TCP_TIME_WAIT)
2549                 get_timewait4_sock(v, seq, st->num);
2550         else if (sk->sk_state == TCP_NEW_SYN_RECV)
2551                 get_openreq4(v, seq, st->num);
2552         else
2553                 get_tcp4_sock(v, seq, st->num);
2554 out:
2555         seq_pad(seq, '\n');
2556         return 0;
2557 }
2558
2559 static const struct seq_operations tcp4_seq_ops = {
2560         .show           = tcp4_seq_show,
2561         .start          = tcp_seq_start,
2562         .next           = tcp_seq_next,
2563         .stop           = tcp_seq_stop,
2564 };
2565
2566 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2567         .family         = AF_INET,
2568 };
2569
2570 static int __net_init tcp4_proc_init_net(struct net *net)
2571 {
2572         if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2573                         sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2574                 return -ENOMEM;
2575         return 0;
2576 }
2577
2578 static void __net_exit tcp4_proc_exit_net(struct net *net)
2579 {
2580         remove_proc_entry("tcp", net->proc_net);
2581 }
2582
2583 static struct pernet_operations tcp4_net_ops = {
2584         .init = tcp4_proc_init_net,
2585         .exit = tcp4_proc_exit_net,
2586 };
2587
2588 int __init tcp4_proc_init(void)
2589 {
2590         return register_pernet_subsys(&tcp4_net_ops);
2591 }
2592
2593 void tcp4_proc_exit(void)
2594 {
2595         unregister_pernet_subsys(&tcp4_net_ops);
2596 }
2597 #endif /* CONFIG_PROC_FS */
2598
2599 struct proto tcp_prot = {
2600         .name                   = "TCP",
2601         .owner                  = THIS_MODULE,
2602         .close                  = tcp_close,
2603         .pre_connect            = tcp_v4_pre_connect,
2604         .connect                = tcp_v4_connect,
2605         .disconnect             = tcp_disconnect,
2606         .accept                 = inet_csk_accept,
2607         .ioctl                  = tcp_ioctl,
2608         .init                   = tcp_v4_init_sock,
2609         .destroy                = tcp_v4_destroy_sock,
2610         .shutdown               = tcp_shutdown,
2611         .setsockopt             = tcp_setsockopt,
2612         .getsockopt             = tcp_getsockopt,
2613         .keepalive              = tcp_set_keepalive,
2614         .recvmsg                = tcp_recvmsg,
2615         .sendmsg                = tcp_sendmsg,
2616         .sendpage               = tcp_sendpage,
2617         .backlog_rcv            = tcp_v4_do_rcv,
2618         .release_cb             = tcp_release_cb,
2619         .hash                   = inet_hash,
2620         .unhash                 = inet_unhash,
2621         .get_port               = inet_csk_get_port,
2622         .enter_memory_pressure  = tcp_enter_memory_pressure,
2623         .leave_memory_pressure  = tcp_leave_memory_pressure,
2624         .stream_memory_free     = tcp_stream_memory_free,
2625         .sockets_allocated      = &tcp_sockets_allocated,
2626         .orphan_count           = &tcp_orphan_count,
2627         .memory_allocated       = &tcp_memory_allocated,
2628         .memory_pressure        = &tcp_memory_pressure,
2629         .sysctl_mem             = sysctl_tcp_mem,
2630         .sysctl_wmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_wmem),
2631         .sysctl_rmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_rmem),
2632         .max_header             = MAX_TCP_HEADER,
2633         .obj_size               = sizeof(struct tcp_sock),
2634         .slab_flags             = SLAB_TYPESAFE_BY_RCU,
2635         .twsk_prot              = &tcp_timewait_sock_ops,
2636         .rsk_prot               = &tcp_request_sock_ops,
2637         .h.hashinfo             = &tcp_hashinfo,
2638         .no_autobind            = true,
2639 #ifdef CONFIG_COMPAT
2640         .compat_setsockopt      = compat_tcp_setsockopt,
2641         .compat_getsockopt      = compat_tcp_getsockopt,
2642 #endif
2643         .diag_destroy           = tcp_abort,
2644 };
2645 EXPORT_SYMBOL(tcp_prot);
2646
2647 static void __net_exit tcp_sk_exit(struct net *net)
2648 {
2649         int cpu;
2650
2651         if (net->ipv4.tcp_congestion_control)
2652                 module_put(net->ipv4.tcp_congestion_control->owner);
2653
2654         for_each_possible_cpu(cpu)
2655                 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2656         free_percpu(net->ipv4.tcp_sk);
2657 }
2658
2659 static int __net_init tcp_sk_init(struct net *net)
2660 {
2661         int res, cpu, cnt;
2662
2663         net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2664         if (!net->ipv4.tcp_sk)
2665                 return -ENOMEM;
2666
2667         for_each_possible_cpu(cpu) {
2668                 struct sock *sk;
2669
2670                 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2671                                            IPPROTO_TCP, net);
2672                 if (res)
2673                         goto fail;
2674                 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2675
2676                 /* Please enforce IP_DF and IPID==0 for RST and
2677                  * ACK sent in SYN-RECV and TIME-WAIT state.
2678                  */
2679                 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
2680
2681                 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2682         }
2683
2684         net->ipv4.sysctl_tcp_ecn = 2;
2685         net->ipv4.sysctl_tcp_ecn_fallback = 1;
2686
2687         net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2688         net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
2689         net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2690         net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2691         net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
2692
2693         net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2694         net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2695         net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2696
2697         net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2698         net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2699         net->ipv4.sysctl_tcp_syncookies = 1;
2700         net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2701         net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2702         net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2703         net->ipv4.sysctl_tcp_orphan_retries = 0;
2704         net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2705         net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2706         net->ipv4.sysctl_tcp_tw_reuse = 2;
2707
2708         cnt = tcp_hashinfo.ehash_mask + 1;
2709         net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
2710         net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2711
2712         net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
2713         net->ipv4.sysctl_tcp_sack = 1;
2714         net->ipv4.sysctl_tcp_window_scaling = 1;
2715         net->ipv4.sysctl_tcp_timestamps = 1;
2716         net->ipv4.sysctl_tcp_early_retrans = 3;
2717         net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
2718         net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
2719         net->ipv4.sysctl_tcp_retrans_collapse = 1;
2720         net->ipv4.sysctl_tcp_max_reordering = 300;
2721         net->ipv4.sysctl_tcp_dsack = 1;
2722         net->ipv4.sysctl_tcp_app_win = 31;
2723         net->ipv4.sysctl_tcp_adv_win_scale = 1;
2724         net->ipv4.sysctl_tcp_frto = 2;
2725         net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
2726         /* This limits the percentage of the congestion window which we
2727          * will allow a single TSO frame to consume.  Building TSO frames
2728          * which are too large can cause TCP streams to be bursty.
2729          */
2730         net->ipv4.sysctl_tcp_tso_win_divisor = 3;
2731         /* Default TSQ limit of 16 TSO segments */
2732         net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
2733         /* rfc5961 challenge ack rate limiting */
2734         net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
2735         net->ipv4.sysctl_tcp_min_tso_segs = 2;
2736         net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
2737         net->ipv4.sysctl_tcp_autocorking = 1;
2738         net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
2739         net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
2740         net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
2741         if (net != &init_net) {
2742                 memcpy(net->ipv4.sysctl_tcp_rmem,
2743                        init_net.ipv4.sysctl_tcp_rmem,
2744                        sizeof(init_net.ipv4.sysctl_tcp_rmem));
2745                 memcpy(net->ipv4.sysctl_tcp_wmem,
2746                        init_net.ipv4.sysctl_tcp_wmem,
2747                        sizeof(init_net.ipv4.sysctl_tcp_wmem));
2748         }
2749         net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
2750         net->ipv4.sysctl_tcp_comp_sack_nr = 44;
2751         net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
2752         spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
2753         net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
2754         atomic_set(&net->ipv4.tfo_active_disable_times, 0);
2755
2756         /* Reno is always built in */
2757         if (!net_eq(net, &init_net) &&
2758             try_module_get(init_net.ipv4.tcp_congestion_control->owner))
2759                 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
2760         else
2761                 net->ipv4.tcp_congestion_control = &tcp_reno;
2762
2763         return 0;
2764 fail:
2765         tcp_sk_exit(net);
2766
2767         return res;
2768 }
2769
2770 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2771 {
2772         struct net *net;
2773
2774         inet_twsk_purge(&tcp_hashinfo, AF_INET);
2775
2776         list_for_each_entry(net, net_exit_list, exit_list)
2777                 tcp_fastopen_ctx_destroy(net);
2778 }
2779
2780 static struct pernet_operations __net_initdata tcp_sk_ops = {
2781        .init       = tcp_sk_init,
2782        .exit       = tcp_sk_exit,
2783        .exit_batch = tcp_sk_exit_batch,
2784 };
2785
2786 void __init tcp_v4_init(void)
2787 {
2788         if (register_pernet_subsys(&tcp_sk_ops))
2789                 panic("Failed to create the TCP control socket.\n");
2790 }