GNU Linux-libre 5.10.153-gnu1
[releases.git] / net / ipv4 / tcp_ipv4.c
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET         An implementation of the TCP/IP protocol suite for the LINUX
4  *              operating system.  INET is implemented using the  BSD Socket
5  *              interface as the means of communication with the user level.
6  *
7  *              Implementation of the Transmission Control Protocol(TCP).
8  *
9  *              IPv4 specific functions
10  *
11  *              code split from:
12  *              linux/ipv4/tcp.c
13  *              linux/ipv4/tcp_input.c
14  *              linux/ipv4/tcp_output.c
15  *
16  *              See tcp.c for author information
17  */
18
19 /*
20  * Changes:
21  *              David S. Miller :       New socket lookup architecture.
22  *                                      This code is dedicated to John Dyson.
23  *              David S. Miller :       Change semantics of established hash,
24  *                                      half is devoted to TIME_WAIT sockets
25  *                                      and the rest go in the other half.
26  *              Andi Kleen :            Add support for syncookies and fixed
27  *                                      some bugs: ip options weren't passed to
28  *                                      the TCP layer, missed a check for an
29  *                                      ACK bit.
30  *              Andi Kleen :            Implemented fast path mtu discovery.
31  *                                      Fixed many serious bugs in the
32  *                                      request_sock handling and moved
33  *                                      most of it into the af independent code.
34  *                                      Added tail drop and some other bugfixes.
35  *                                      Added new listen semantics.
36  *              Mike McLagan    :       Routing by source
37  *      Juan Jose Ciarlante:            ip_dynaddr bits
38  *              Andi Kleen:             various fixes.
39  *      Vitaly E. Lavrov        :       Transparent proxy revived after a year
40  *                                      in a coma.
41  *      Andi Kleen              :       Fix new listen.
42  *      Andi Kleen              :       Fix accept error reporting.
43  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
44  *      Alexey Kuznetsov                allows both IPv4 and IPv6 sockets to bind
45  *                                      to a single port at the same time.
46  */
47
48 #define pr_fmt(fmt) "TCP: " fmt
49
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60
61 #include <net/net_namespace.h>
62 #include <net/icmp.h>
63 #include <net/inet_hashtables.h>
64 #include <net/tcp.h>
65 #include <net/transp_v6.h>
66 #include <net/ipv6.h>
67 #include <net/inet_common.h>
68 #include <net/timewait_sock.h>
69 #include <net/xfrm.h>
70 #include <net/secure_seq.h>
71 #include <net/busy_poll.h>
72
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/inetdevice.h>
79 #include <linux/btf_ids.h>
80
81 #include <crypto/hash.h>
82 #include <linux/scatterlist.h>
83
84 #include <trace/events/tcp.h>
85
86 #ifdef CONFIG_TCP_MD5SIG
87 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
88                                __be32 daddr, __be32 saddr, const struct tcphdr *th);
89 #endif
90
91 struct inet_hashinfo tcp_hashinfo;
92 EXPORT_SYMBOL(tcp_hashinfo);
93
94 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
95 {
96         return secure_tcp_seq(ip_hdr(skb)->daddr,
97                               ip_hdr(skb)->saddr,
98                               tcp_hdr(skb)->dest,
99                               tcp_hdr(skb)->source);
100 }
101
102 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
103 {
104         return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
105 }
106
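/*
 * For context: secure_tcp_seq(), used by tcp_v4_init_seq() above, follows the
 * RFC 6528 scheme of mixing a keyed hash of the connection 4-tuple with a
 * clock component.  A minimal illustrative sketch (example_keyed_hash() is a
 * hypothetical placeholder, not a kernel function):
 *
 *	u32 example_isn(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport)
 *	{
 *		u32 hash = example_keyed_hash(saddr, daddr, sport, dport);
 *
 *		return hash + (u32)(ktime_get_real_ns() >> 6);
 *	}
 */
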
107 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
108 {
109         int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
110         const struct inet_timewait_sock *tw = inet_twsk(sktw);
111         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
112         struct tcp_sock *tp = tcp_sk(sk);
113
114         if (reuse == 2) {
115                 /* Still does not detect *everything* that goes through
116                  * lo, since we require a loopback src or dst address
117                  * or direct binding to the 'lo' interface.
118                  */
119                 bool loopback = false;
120                 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
121                         loopback = true;
122 #if IS_ENABLED(CONFIG_IPV6)
123                 if (tw->tw_family == AF_INET6) {
124                         if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
125                             ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
126                             ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
127                             ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
128                                 loopback = true;
129                 } else
130 #endif
131                 {
132                         if (ipv4_is_loopback(tw->tw_daddr) ||
133                             ipv4_is_loopback(tw->tw_rcv_saddr))
134                                 loopback = true;
135                 }
136                 if (!loopback)
137                         reuse = 0;
138         }
139
140         /* With PAWS, it is safe from the viewpoint
141            of data integrity. Even without PAWS it is safe provided sequence
142            spaces do not overlap i.e. at data rates <= 80Mbit/sec.
143
144            Actually, the idea is close to VJ's: only the timestamp cache is
145            held not per host but per port pair, and the TW bucket is used as
146            the state holder.
147
148            If the TW bucket has already been destroyed, we fall back to VJ's
149            scheme and use the initial timestamp retrieved from the peer table.
150          */
151         if (tcptw->tw_ts_recent_stamp &&
152             (!twp || (reuse && time_after32(ktime_get_seconds(),
153                                             tcptw->tw_ts_recent_stamp)))) {
154                 /* In case of repair and re-using TIME-WAIT sockets we still
155                  * want to be sure that it is safe as above but honor the
156                  * sequence numbers and time stamps set as part of the repair
157                  * process.
158                  *
159                  * Without this check re-using a TIME-WAIT socket with TCP
160                  * repair would accumulate a -1 on the repair assigned
161                  * sequence number. The first time it is reused the sequence
162                  * is -1, the second time -2, etc. This fixes that issue
163                  * without appearing to create any others.
164                  */
165                 if (likely(!tp->repair)) {
166                         u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
167
168                         if (!seq)
169                                 seq = 1;
170                         WRITE_ONCE(tp->write_seq, seq);
171                         tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
172                         tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
173                 }
174                 sock_hold(sktw);
175                 return 1;
176         }
177
178         return 0;
179 }
180 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
181
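/*
 * Usage note: the reuse values tested in tcp_twsk_unique() come from the
 * net.ipv4.tcp_tw_reuse sysctl.  A small illustrative configuration:
 *
 *	# 0 - never reuse TIME-WAIT sockets for new outgoing connections
 *	# 1 - reuse when it is safe from the protocol point of view
 *	# 2 - reuse for loopback traffic only (the reuse == 2 branch above)
 *	echo 2 > /proc/sys/net/ipv4/tcp_tw_reuse
 */
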
182 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
183                               int addr_len)
184 {
185         /* This check is replicated from tcp_v4_connect() and intended to
186          * prevent BPF program called below from accessing bytes that are out
187          * of the bound specified by user in addr_len.
188          */
189         if (addr_len < sizeof(struct sockaddr_in))
190                 return -EINVAL;
191
192         sock_owned_by_me(sk);
193
194         return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
195 }
196
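/*
 * BPF_CGROUP_RUN_PROG_INET4_CONNECT() above runs any program of attach type
 * BPF_CGROUP_INET4_CONNECT attached to the socket's cgroup.  A hypothetical
 * way to attach an already-pinned program with bpftool (cgroup and pin paths
 * are examples only):
 *
 *	bpftool cgroup attach /sys/fs/cgroup connect4 pinned /sys/fs/bpf/connect4
 */
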
197 /* This will initiate an outgoing connection. */
198 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
199 {
200         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
201         struct inet_sock *inet = inet_sk(sk);
202         struct tcp_sock *tp = tcp_sk(sk);
203         __be16 orig_sport, orig_dport;
204         __be32 daddr, nexthop;
205         struct flowi4 *fl4;
206         struct rtable *rt;
207         int err;
208         struct ip_options_rcu *inet_opt;
209         struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
210
211         if (addr_len < sizeof(struct sockaddr_in))
212                 return -EINVAL;
213
214         if (usin->sin_family != AF_INET)
215                 return -EAFNOSUPPORT;
216
217         nexthop = daddr = usin->sin_addr.s_addr;
218         inet_opt = rcu_dereference_protected(inet->inet_opt,
219                                              lockdep_sock_is_held(sk));
220         if (inet_opt && inet_opt->opt.srr) {
221                 if (!daddr)
222                         return -EINVAL;
223                 nexthop = inet_opt->opt.faddr;
224         }
225
226         orig_sport = inet->inet_sport;
227         orig_dport = usin->sin_port;
228         fl4 = &inet->cork.fl.u.ip4;
229         rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
230                               RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
231                               IPPROTO_TCP,
232                               orig_sport, orig_dport, sk);
233         if (IS_ERR(rt)) {
234                 err = PTR_ERR(rt);
235                 if (err == -ENETUNREACH)
236                         IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
237                 return err;
238         }
239
240         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
241                 ip_rt_put(rt);
242                 return -ENETUNREACH;
243         }
244
245         if (!inet_opt || !inet_opt->opt.srr)
246                 daddr = fl4->daddr;
247
248         if (!inet->inet_saddr)
249                 inet->inet_saddr = fl4->saddr;
250         sk_rcv_saddr_set(sk, inet->inet_saddr);
251
252         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
253                 /* Reset inherited state */
254                 tp->rx_opt.ts_recent       = 0;
255                 tp->rx_opt.ts_recent_stamp = 0;
256                 if (likely(!tp->repair))
257                         WRITE_ONCE(tp->write_seq, 0);
258         }
259
260         inet->inet_dport = usin->sin_port;
261         sk_daddr_set(sk, daddr);
262
263         inet_csk(sk)->icsk_ext_hdr_len = 0;
264         if (inet_opt)
265                 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
266
267         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
268
269         /* Socket identity is still unknown (sport may be zero).
270          * However we set state to SYN-SENT and, without releasing the socket
271          * lock, select a source port, enter ourselves into the hash tables and
272          * complete initialization after this.
273          */
274         tcp_set_state(sk, TCP_SYN_SENT);
275         err = inet_hash_connect(tcp_death_row, sk);
276         if (err)
277                 goto failure;
278
279         sk_set_txhash(sk);
280
281         rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
282                                inet->inet_sport, inet->inet_dport, sk);
283         if (IS_ERR(rt)) {
284                 err = PTR_ERR(rt);
285                 rt = NULL;
286                 goto failure;
287         }
288         /* OK, now commit destination to socket.  */
289         sk->sk_gso_type = SKB_GSO_TCPV4;
290         sk_setup_caps(sk, &rt->dst);
291         rt = NULL;
292
293         if (likely(!tp->repair)) {
294                 if (!tp->write_seq)
295                         WRITE_ONCE(tp->write_seq,
296                                    secure_tcp_seq(inet->inet_saddr,
297                                                   inet->inet_daddr,
298                                                   inet->inet_sport,
299                                                   usin->sin_port));
300                 tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
301                                                  inet->inet_saddr,
302                                                  inet->inet_daddr);
303         }
304
305         inet->inet_id = prandom_u32();
306
307         if (tcp_fastopen_defer_connect(sk, &err))
308                 return err;
309         if (err)
310                 goto failure;
311
312         err = tcp_connect(sk);
313
314         if (err)
315                 goto failure;
316
317         return 0;
318
319 failure:
320         /*
321          * This unhashes the socket and releases the local port,
322          * if necessary.
323          */
324         tcp_set_state(sk, TCP_CLOSE);
325         ip_rt_put(rt);
326         sk->sk_route_caps = 0;
327         inet->inet_dport = 0;
328         return err;
329 }
330 EXPORT_SYMBOL(tcp_v4_connect);
331
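/*
 * For orientation, a hypothetical userspace caller: a plain connect(2) on an
 * AF_INET stream socket is what ultimately reaches tcp_v4_connect() above
 * (with tcp_v4_pre_connect() run first for the BPF hook):
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in dst = {
 *		.sin_family = AF_INET,
 *		.sin_port   = htons(80),
 *	};
 *
 *	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
 *	connect(fd, (struct sockaddr *)&dst, sizeof(dst));
 */
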
332 /*
333  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
334  * It can be called through tcp_release_cb() if the socket was owned by the
335  * user at the time tcp_v4_err() was called to handle the ICMP message.
336  */
337 void tcp_v4_mtu_reduced(struct sock *sk)
338 {
339         struct inet_sock *inet = inet_sk(sk);
340         struct dst_entry *dst;
341         u32 mtu;
342
343         if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
344                 return;
345         mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
346         dst = inet_csk_update_pmtu(sk, mtu);
347         if (!dst)
348                 return;
349
350         /* Something is about to go wrong... Remember the soft error
351          * in case this connection is not able to recover.
352          */
353         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
354                 sk->sk_err_soft = EMSGSIZE;
355
356         mtu = dst_mtu(dst);
357
358         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
359             ip_sk_accept_pmtu(sk) &&
360             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
361                 tcp_sync_mss(sk, mtu);
362
363                 /* Resend the TCP packet because it's
364                  * clear that the old packet has been
365                  * dropped. This is the new "fast" path mtu
366                  * discovery.
367                  */
368                 tcp_simple_retransmit(sk);
369         } /* else let the usual retransmit timer handle it */
370 }
371 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
372
373 static void do_redirect(struct sk_buff *skb, struct sock *sk)
374 {
375         struct dst_entry *dst = __sk_dst_check(sk, 0);
376
377         if (dst)
378                 dst->ops->redirect(dst, sk, skb);
379 }
380
381
382 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
383 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
384 {
385         struct request_sock *req = inet_reqsk(sk);
386         struct net *net = sock_net(sk);
387
388         /* ICMPs are not backlogged, hence we cannot get
389          * an established socket here.
390          */
391         if (seq != tcp_rsk(req)->snt_isn) {
392                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
393         } else if (abort) {
394                 /*
395                  * Still in SYN_RECV, just remove it silently.
396                  * There is no good way to pass the error to the newly
397                  * created socket, and POSIX does not want network
398                  * errors returned from accept().
399                  */
400                 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
401                 tcp_listendrop(req->rsk_listener);
402         }
403         reqsk_put(req);
404 }
405 EXPORT_SYMBOL(tcp_req_err);
406
407 /* TCP-LD (RFC 6069) logic */
408 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
409 {
410         struct inet_connection_sock *icsk = inet_csk(sk);
411         struct tcp_sock *tp = tcp_sk(sk);
412         struct sk_buff *skb;
413         s32 remaining;
414         u32 delta_us;
415
416         if (sock_owned_by_user(sk))
417                 return;
418
419         if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
420             !icsk->icsk_backoff)
421                 return;
422
423         skb = tcp_rtx_queue_head(sk);
424         if (WARN_ON_ONCE(!skb))
425                 return;
426
427         icsk->icsk_backoff--;
428         icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
429         icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
430
431         tcp_mstamp_refresh(tp);
432         delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
433         remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
434
435         if (remaining > 0) {
436                 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
437                                           remaining, TCP_RTO_MAX);
438         } else {
439                 /* RTO revert clocked out retransmission.
440                  * Will retransmit now.
441                  */
442                 tcp_retransmit_timer(sk);
443         }
444 }
445 EXPORT_SYMBOL(tcp_ld_RTO_revert);
446
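/*
 * A worked example of the revert above, with assumed numbers: if the base RTO
 * derived from srtt is 200ms and icsk_backoff is 3, the pending retransmit
 * timer was armed for 200ms << 3 = 1.6s.  An ICMP unreachable for the
 * outstanding data drops the backoff to 2, so the timer is re-armed for
 * 200ms << 2 = 800ms minus the time already elapsed since the head of the
 * retransmit queue was last (re)sent; if that is already overdue, the
 * retransmit fires immediately via tcp_retransmit_timer().
 */
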
447 /*
448  * This routine is called by the ICMP module when it gets some
449  * sort of error condition.  If err < 0 then the socket should
450  * be closed and the error returned to the user.  If err > 0
451  * it's just the icmp type << 8 | icmp code.  After adjustment the
452  * header points to the first 8 bytes of the tcp header.  We need
453  * to find the appropriate port.
454  *
455  * The locking strategy used here is very "optimistic". When
456  * someone else accesses the socket the ICMP is just dropped
457  * and for some paths there is no check at all.
458  * A more general error queue to queue errors for later handling
459  * is probably better.
460  *
461  */
462
463 int tcp_v4_err(struct sk_buff *skb, u32 info)
464 {
465         const struct iphdr *iph = (const struct iphdr *)skb->data;
466         struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
467         struct tcp_sock *tp;
468         struct inet_sock *inet;
469         const int type = icmp_hdr(skb)->type;
470         const int code = icmp_hdr(skb)->code;
471         struct sock *sk;
472         struct request_sock *fastopen;
473         u32 seq, snd_una;
474         int err;
475         struct net *net = dev_net(skb->dev);
476
477         sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
478                                        th->dest, iph->saddr, ntohs(th->source),
479                                        inet_iif(skb), 0);
480         if (!sk) {
481                 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
482                 return -ENOENT;
483         }
484         if (sk->sk_state == TCP_TIME_WAIT) {
485                 inet_twsk_put(inet_twsk(sk));
486                 return 0;
487         }
488         seq = ntohl(th->seq);
489         if (sk->sk_state == TCP_NEW_SYN_RECV) {
490                 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
491                                      type == ICMP_TIME_EXCEEDED ||
492                                      (type == ICMP_DEST_UNREACH &&
493                                       (code == ICMP_NET_UNREACH ||
494                                        code == ICMP_HOST_UNREACH)));
495                 return 0;
496         }
497
498         bh_lock_sock(sk);
499         /* If too many ICMPs get dropped on busy
500          * servers this needs to be solved differently.
501          * We do take care of the PMTU discovery (RFC1191) special case:
502          * we can receive locally generated ICMP messages while the socket is held.
503          */
504         if (sock_owned_by_user(sk)) {
505                 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
506                         __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
507         }
508         if (sk->sk_state == TCP_CLOSE)
509                 goto out;
510
511         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
512                 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
513                 goto out;
514         }
515
516         tp = tcp_sk(sk);
517         /* XXX (TFO) - tp->snd_una should be the ISN (tcp_create_openreq_child()) */
518         fastopen = rcu_dereference(tp->fastopen_rsk);
519         snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
520         if (sk->sk_state != TCP_LISTEN &&
521             !between(seq, snd_una, tp->snd_nxt)) {
522                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
523                 goto out;
524         }
525
526         switch (type) {
527         case ICMP_REDIRECT:
528                 if (!sock_owned_by_user(sk))
529                         do_redirect(skb, sk);
530                 goto out;
531         case ICMP_SOURCE_QUENCH:
532                 /* Just silently ignore these. */
533                 goto out;
534         case ICMP_PARAMETERPROB:
535                 err = EPROTO;
536                 break;
537         case ICMP_DEST_UNREACH:
538                 if (code > NR_ICMP_UNREACH)
539                         goto out;
540
541                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
542                         /* We are not interested in TCP_LISTEN and open_requests
543                          * (SYN-ACKs sent out by Linux are always < 576 bytes so
544                          * they should go through unfragmented).
545                          */
546                         if (sk->sk_state == TCP_LISTEN)
547                                 goto out;
548
549                         WRITE_ONCE(tp->mtu_info, info);
550                         if (!sock_owned_by_user(sk)) {
551                                 tcp_v4_mtu_reduced(sk);
552                         } else {
553                                 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
554                                         sock_hold(sk);
555                         }
556                         goto out;
557                 }
558
559                 err = icmp_err_convert[code].errno;
560                 /* check if this ICMP message allows revert of backoff.
561                  * (see RFC 6069)
562                  */
563                 if (!fastopen &&
564                     (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
565                         tcp_ld_RTO_revert(sk, seq);
566                 break;
567         case ICMP_TIME_EXCEEDED:
568                 err = EHOSTUNREACH;
569                 break;
570         default:
571                 goto out;
572         }
573
574         switch (sk->sk_state) {
575         case TCP_SYN_SENT:
576         case TCP_SYN_RECV:
577                 /* Only in fast or simultaneous open. If a fast open socket is
578                  * already accepted it is treated as a connected one below.
579                  */
580                 if (fastopen && !fastopen->sk)
581                         break;
582
583                 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
584
585                 if (!sock_owned_by_user(sk)) {
586                         sk->sk_err = err;
587
588                         sk->sk_error_report(sk);
589
590                         tcp_done(sk);
591                 } else {
592                         sk->sk_err_soft = err;
593                 }
594                 goto out;
595         }
596
597         /* If we've already connected we will keep trying
598          * until we time out, or the user gives up.
599          *
600          * rfc1122 4.2.3.9 allows us to consider as hard errors
601          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
602          * but it is obsoleted by pmtu discovery).
603          *
604          * Note that in the modern internet, where routing is unreliable
605          * and broken firewalls sit in every dark corner, sending random
606          * errors ordered by their masters, even these two messages finally
607          * lose their original sense (even Linux sends invalid PORT_UNREACHs).
608          *
609          * Now we are in compliance with RFCs.
610          *                                                      --ANK (980905)
611          */
612
613         inet = inet_sk(sk);
614         if (!sock_owned_by_user(sk) && inet->recverr) {
615                 sk->sk_err = err;
616                 sk->sk_error_report(sk);
617         } else  { /* Only an error on timeout */
618                 sk->sk_err_soft = err;
619         }
620
621 out:
622         bh_unlock_sock(sk);
623         sock_put(sk);
624         return 0;
625 }
626
627 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
628 {
629         struct tcphdr *th = tcp_hdr(skb);
630
631         th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
632         skb->csum_start = skb_transport_header(skb) - skb->head;
633         skb->csum_offset = offsetof(struct tcphdr, check);
634 }
635
636 /* This routine computes an IPv4 TCP checksum. */
637 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
638 {
639         const struct inet_sock *inet = inet_sk(sk);
640
641         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
642 }
643 EXPORT_SYMBOL(tcp_v4_send_check);
644
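/*
 * __tcp_v4_send_check() above sets the skb up for checksum offload
 * (CHECKSUM_PARTIAL): th->check is seeded with the pseudo-header sum over
 * saddr, daddr, IPPROTO_TCP and the segment length, while csum_start and
 * csum_offset tell the NIC (or the software fallback) where to start summing
 * the TCP header plus payload and where to store the final checksum.  As an
 * assumed example, for a 20-byte header plus 20 bytes of payload the seed
 * covers the two addresses, protocol 6 and length 40; the device then folds
 * in those 40 bytes and writes the result at offsetof(struct tcphdr, check).
 */
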
645 /*
646  *      This routine will send an RST to the other tcp.
647  *
648  *      Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
649  *                    for the reset?
650  *      Answer: if a packet caused an RST, it is not for a socket
651  *              existing in our system; if it is matched to a socket,
652  *              it is just a duplicate segment or a bug in the other side's TCP.
653  *              So we build the reply based only on the parameters
654  *              that arrived with the segment.
655  *      Exception: precedence violation. We do not implement it in any case.
656  */
657
658 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
659 {
660         const struct tcphdr *th = tcp_hdr(skb);
661         struct {
662                 struct tcphdr th;
663 #ifdef CONFIG_TCP_MD5SIG
664                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
665 #endif
666         } rep;
667         struct ip_reply_arg arg;
668 #ifdef CONFIG_TCP_MD5SIG
669         struct tcp_md5sig_key *key = NULL;
670         const __u8 *hash_location = NULL;
671         unsigned char newhash[16];
672         int genhash;
673         struct sock *sk1 = NULL;
674 #endif
675         u64 transmit_time = 0;
676         struct sock *ctl_sk;
677         struct net *net;
678
679         /* Never send a reset in response to a reset. */
680         if (th->rst)
681                 return;
682
683         /* If sk is not NULL, it means we did a successful lookup and the
684          * incoming route had to be correct. prequeue might have dropped our dst.
685          */
686         if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
687                 return;
688
689         /* Swap the send and the receive. */
690         memset(&rep, 0, sizeof(rep));
691         rep.th.dest   = th->source;
692         rep.th.source = th->dest;
693         rep.th.doff   = sizeof(struct tcphdr) / 4;
694         rep.th.rst    = 1;
695
696         if (th->ack) {
697                 rep.th.seq = th->ack_seq;
698         } else {
699                 rep.th.ack = 1;
700                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
701                                        skb->len - (th->doff << 2));
702         }
703
704         memset(&arg, 0, sizeof(arg));
705         arg.iov[0].iov_base = (unsigned char *)&rep;
706         arg.iov[0].iov_len  = sizeof(rep.th);
707
708         net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
709 #ifdef CONFIG_TCP_MD5SIG
710         rcu_read_lock();
711         hash_location = tcp_parse_md5sig_option(th);
712         if (sk && sk_fullsock(sk)) {
713                 const union tcp_md5_addr *addr;
714                 int l3index;
715
716                 /* If sdif is set, the packet ingressed via a device
717                  * in an L3 domain and inet_iif is set to it.
718                  */
719                 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
720                 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
721                 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
722         } else if (hash_location) {
723                 const union tcp_md5_addr *addr;
724                 int sdif = tcp_v4_sdif(skb);
725                 int dif = inet_iif(skb);
726                 int l3index;
727
728                 /*
729                  * The active side is lost. Try to find the listening socket via the
730                  * source port, and then find the md5 key through the listening socket.
731                  * We do not loosen security here:
732                  * the incoming packet is checked against the md5 hash of the key found,
733                  * and no RST is generated if the md5 hash doesn't match.
734                  */
735                 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
736                                              ip_hdr(skb)->saddr,
737                                              th->source, ip_hdr(skb)->daddr,
738                                              ntohs(th->source), dif, sdif);
739                 /* don't send rst if it can't find key */
740                 if (!sk1)
741                         goto out;
742
743                 /* If sdif is set, the packet ingressed via a device
744                  * in an L3 domain and dif is set to it.
745                  */
746                 l3index = sdif ? dif : 0;
747                 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
748                 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
749                 if (!key)
750                         goto out;
751
752
753                 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
754                 if (genhash || memcmp(hash_location, newhash, 16) != 0)
755                         goto out;
756
757         }
758
759         if (key) {
760                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
761                                    (TCPOPT_NOP << 16) |
762                                    (TCPOPT_MD5SIG << 8) |
763                                    TCPOLEN_MD5SIG);
764                 /* Update length and the length the header thinks exists */
765                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
766                 rep.th.doff = arg.iov[0].iov_len / 4;
767
768                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
769                                      key, ip_hdr(skb)->saddr,
770                                      ip_hdr(skb)->daddr, &rep.th);
771         }
772 #endif
773         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
774                                       ip_hdr(skb)->saddr, /* XXX */
775                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
776         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
777         arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
778
779         /* When the socket is gone, all binding information is lost and
780          * routing might fail in this case. No choice here: if we force the
781          * input interface, we will misroute in the case of an asymmetric route.
782          */
783         if (sk) {
784                 arg.bound_dev_if = sk->sk_bound_dev_if;
785                 if (sk_fullsock(sk))
786                         trace_tcp_send_reset(sk, skb);
787         }
788
789         BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
790                      offsetof(struct inet_timewait_sock, tw_bound_dev_if));
791
792         arg.tos = ip_hdr(skb)->tos;
793         arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
794         local_bh_disable();
795         ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
796         if (sk) {
797                 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
798                                    inet_twsk(sk)->tw_mark : sk->sk_mark;
799                 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
800                                    inet_twsk(sk)->tw_priority : sk->sk_priority;
801                 transmit_time = tcp_transmit_time(sk);
802         }
803         ip_send_unicast_reply(ctl_sk,
804                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
805                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
806                               &arg, arg.iov[0].iov_len,
807                               transmit_time);
808
809         ctl_sk->sk_mark = 0;
810         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
811         __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
812         local_bh_enable();
813
814 #ifdef CONFIG_TCP_MD5SIG
815 out:
816         rcu_read_unlock();
817 #endif
818 }
819
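/*
 * A worked example of the sequence/ack selection in tcp_v4_send_reset()
 * above, per RFC 793 (numbers assumed): for an incoming segment carrying an
 * ACK, the RST reuses its ack_seq as its own sequence number.  For a segment
 * without an ACK, e.g. a bare SYN with seq = 1000 and no payload, the RST is
 * sent with seq = 0, ACK set, and ack_seq = 1001, i.e.
 * ntohl(th->seq) + th->syn + th->fin + payload length as computed above.
 */
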
820 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
821    outside of socket context, is ugly, certainly. What can I do?
822  */
823
824 static void tcp_v4_send_ack(const struct sock *sk,
825                             struct sk_buff *skb, u32 seq, u32 ack,
826                             u32 win, u32 tsval, u32 tsecr, int oif,
827                             struct tcp_md5sig_key *key,
828                             int reply_flags, u8 tos)
829 {
830         const struct tcphdr *th = tcp_hdr(skb);
831         struct {
832                 struct tcphdr th;
833                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
834 #ifdef CONFIG_TCP_MD5SIG
835                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
836 #endif
837                         ];
838         } rep;
839         struct net *net = sock_net(sk);
840         struct ip_reply_arg arg;
841         struct sock *ctl_sk;
842         u64 transmit_time;
843
844         memset(&rep.th, 0, sizeof(struct tcphdr));
845         memset(&arg, 0, sizeof(arg));
846
847         arg.iov[0].iov_base = (unsigned char *)&rep;
848         arg.iov[0].iov_len  = sizeof(rep.th);
849         if (tsecr) {
850                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
851                                    (TCPOPT_TIMESTAMP << 8) |
852                                    TCPOLEN_TIMESTAMP);
853                 rep.opt[1] = htonl(tsval);
854                 rep.opt[2] = htonl(tsecr);
855                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
856         }
857
858         /* Swap the send and the receive. */
859         rep.th.dest    = th->source;
860         rep.th.source  = th->dest;
861         rep.th.doff    = arg.iov[0].iov_len / 4;
862         rep.th.seq     = htonl(seq);
863         rep.th.ack_seq = htonl(ack);
864         rep.th.ack     = 1;
865         rep.th.window  = htons(win);
866
867 #ifdef CONFIG_TCP_MD5SIG
868         if (key) {
869                 int offset = (tsecr) ? 3 : 0;
870
871                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
872                                           (TCPOPT_NOP << 16) |
873                                           (TCPOPT_MD5SIG << 8) |
874                                           TCPOLEN_MD5SIG);
875                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
876                 rep.th.doff = arg.iov[0].iov_len/4;
877
878                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
879                                     key, ip_hdr(skb)->saddr,
880                                     ip_hdr(skb)->daddr, &rep.th);
881         }
882 #endif
883         arg.flags = reply_flags;
884         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
885                                       ip_hdr(skb)->saddr, /* XXX */
886                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
887         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
888         if (oif)
889                 arg.bound_dev_if = oif;
890         arg.tos = tos;
891         arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
892         local_bh_disable();
893         ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
894         ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
895                            inet_twsk(sk)->tw_mark : sk->sk_mark;
896         ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
897                            inet_twsk(sk)->tw_priority : sk->sk_priority;
898         transmit_time = tcp_transmit_time(sk);
899         ip_send_unicast_reply(ctl_sk,
900                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
901                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
902                               &arg, arg.iov[0].iov_len,
903                               transmit_time);
904
905         ctl_sk->sk_mark = 0;
906         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
907         local_bh_enable();
908 }
909
910 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
911 {
912         struct inet_timewait_sock *tw = inet_twsk(sk);
913         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
914
915         tcp_v4_send_ack(sk, skb,
916                         tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
917                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
918                         tcp_time_stamp_raw() + tcptw->tw_ts_offset,
919                         tcptw->tw_ts_recent,
920                         tw->tw_bound_dev_if,
921                         tcp_twsk_md5_key(tcptw),
922                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
923                         tw->tw_tos
924                         );
925
926         inet_twsk_put(tw);
927 }
928
929 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
930                                   struct request_sock *req)
931 {
932         const union tcp_md5_addr *addr;
933         int l3index;
934
935         /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
936          * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
937          */
938         u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
939                                              tcp_sk(sk)->snd_nxt;
940
941         /* RFC 7323 2.3
942          * The window field (SEG.WND) of every outgoing segment, with the
943          * exception of <SYN> segments, MUST be right-shifted by
944          * Rcv.Wind.Shift bits:
945          */
946         addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
947         l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
948         tcp_v4_send_ack(sk, skb, seq,
949                         tcp_rsk(req)->rcv_nxt,
950                         req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
951                         tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
952                         req->ts_recent,
953                         0,
954                         tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
955                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
956                         ip_hdr(skb)->tos);
957 }
958
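/*
 * A small worked example of the RFC 7323 shift used in tcp_v4_reqsk_send_ack()
 * above (numbers assumed): with req->rsk_rcv_wnd = 262144 bytes and
 * rcv_wscale = 7, the window field carried in the ACK is 262144 >> 7 = 2048,
 * which the peer scales back up by the same factor.
 */
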
959 /*
960  *      Send a SYN-ACK after having received a SYN.
961  *      This still operates on a request_sock only, not on a big
962  *      socket.
963  */
964 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
965                               struct flowi *fl,
966                               struct request_sock *req,
967                               struct tcp_fastopen_cookie *foc,
968                               enum tcp_synack_type synack_type,
969                               struct sk_buff *syn_skb)
970 {
971         const struct inet_request_sock *ireq = inet_rsk(req);
972         struct flowi4 fl4;
973         int err = -1;
974         struct sk_buff *skb;
975         u8 tos;
976
977         /* First, grab a route. */
978         if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
979                 return -1;
980
981         skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
982
983         if (skb) {
984                 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
985
986                 tos = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ?
987                                 (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
988                                 (inet_sk(sk)->tos & INET_ECN_MASK) :
989                                 inet_sk(sk)->tos;
990
991                 if (!INET_ECN_is_capable(tos) &&
992                     tcp_bpf_ca_needs_ecn((struct sock *)req))
993                         tos |= INET_ECN_ECT_0;
994
995                 rcu_read_lock();
996                 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
997                                             ireq->ir_rmt_addr,
998                                             rcu_dereference(ireq->ireq_opt),
999                                             tos);
1000                 rcu_read_unlock();
1001                 err = net_xmit_eval(err);
1002         }
1003
1004         return err;
1005 }
1006
1007 /*
1008  *      IPv4 request_sock destructor.
1009  */
1010 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1011 {
1012         kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1013 }
1014
1015 #ifdef CONFIG_TCP_MD5SIG
1016 /*
1017  * RFC2385 MD5 checksumming requires a mapping of
1018  * IP address->MD5 Key.
1019  * We need to maintain these in the sk structure.
1020  */
1021
1022 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
1023 EXPORT_SYMBOL(tcp_md5_needed);
1024
1025 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1026 {
1027         if (!old)
1028                 return true;
1029
1030         /* l3index always overrides non-l3index */
1031         if (old->l3index && new->l3index == 0)
1032                 return false;
1033         if (old->l3index == 0 && new->l3index)
1034                 return true;
1035
1036         return old->prefixlen < new->prefixlen;
1037 }
1038
1039 /* Find the Key structure for an address.  */
1040 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1041                                            const union tcp_md5_addr *addr,
1042                                            int family)
1043 {
1044         const struct tcp_sock *tp = tcp_sk(sk);
1045         struct tcp_md5sig_key *key;
1046         const struct tcp_md5sig_info *md5sig;
1047         __be32 mask;
1048         struct tcp_md5sig_key *best_match = NULL;
1049         bool match;
1050
1051         /* caller either holds rcu_read_lock() or socket lock */
1052         md5sig = rcu_dereference_check(tp->md5sig_info,
1053                                        lockdep_sock_is_held(sk));
1054         if (!md5sig)
1055                 return NULL;
1056
1057         hlist_for_each_entry_rcu(key, &md5sig->head, node,
1058                                  lockdep_sock_is_held(sk)) {
1059                 if (key->family != family)
1060                         continue;
1061                 if (key->l3index && key->l3index != l3index)
1062                         continue;
1063                 if (family == AF_INET) {
1064                         mask = inet_make_mask(key->prefixlen);
1065                         match = (key->addr.a4.s_addr & mask) ==
1066                                 (addr->a4.s_addr & mask);
1067 #if IS_ENABLED(CONFIG_IPV6)
1068                 } else if (family == AF_INET6) {
1069                         match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1070                                                   key->prefixlen);
1071 #endif
1072                 } else {
1073                         match = false;
1074                 }
1075
1076                 if (match && better_md5_match(best_match, key))
1077                         best_match = key;
1078         }
1079         return best_match;
1080 }
1081 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1082
1083 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1084                                                       const union tcp_md5_addr *addr,
1085                                                       int family, u8 prefixlen,
1086                                                       int l3index)
1087 {
1088         const struct tcp_sock *tp = tcp_sk(sk);
1089         struct tcp_md5sig_key *key;
1090         unsigned int size = sizeof(struct in_addr);
1091         const struct tcp_md5sig_info *md5sig;
1092
1093         /* caller either holds rcu_read_lock() or socket lock */
1094         md5sig = rcu_dereference_check(tp->md5sig_info,
1095                                        lockdep_sock_is_held(sk));
1096         if (!md5sig)
1097                 return NULL;
1098 #if IS_ENABLED(CONFIG_IPV6)
1099         if (family == AF_INET6)
1100                 size = sizeof(struct in6_addr);
1101 #endif
1102         hlist_for_each_entry_rcu(key, &md5sig->head, node,
1103                                  lockdep_sock_is_held(sk)) {
1104                 if (key->family != family)
1105                         continue;
1106                 if (key->l3index != l3index)
1107                         continue;
1108                 if (!memcmp(&key->addr, addr, size) &&
1109                     key->prefixlen == prefixlen)
1110                         return key;
1111         }
1112         return NULL;
1113 }
1114
1115 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1116                                          const struct sock *addr_sk)
1117 {
1118         const union tcp_md5_addr *addr;
1119         int l3index;
1120
1121         l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1122                                                  addr_sk->sk_bound_dev_if);
1123         addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1124         return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1125 }
1126 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1127
1128 /* This can be called on a newly created socket, from other files */
1129 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1130                    int family, u8 prefixlen, int l3index,
1131                    const u8 *newkey, u8 newkeylen, gfp_t gfp)
1132 {
1133         /* Add Key to the list */
1134         struct tcp_md5sig_key *key;
1135         struct tcp_sock *tp = tcp_sk(sk);
1136         struct tcp_md5sig_info *md5sig;
1137
1138         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
1139         if (key) {
1140                 /* Pre-existing entry - just update that one.
1141                  * Note that the key might be used concurrently.
1142                  * data_race() is telling kcsan that we do not care about
1143                  * key mismatches, since changing the MD5 key on live flows
1144                  * can lead to packet drops.
1145                  */
1146                 data_race(memcpy(key->key, newkey, newkeylen));
1147
1148                 /* Pairs with READ_ONCE() in tcp_md5_hash_key().
1149                  * Also note that a reader could catch the new key->keylen value
1150                  * but the old key->key[]; this is the reason we use __GFP_ZERO
1151                  * at sock_kmalloc() time below these lines.
1152                  */
1153                 WRITE_ONCE(key->keylen, newkeylen);
1154
1155                 return 0;
1156         }
1157
1158         md5sig = rcu_dereference_protected(tp->md5sig_info,
1159                                            lockdep_sock_is_held(sk));
1160         if (!md5sig) {
1161                 md5sig = kmalloc(sizeof(*md5sig), gfp);
1162                 if (!md5sig)
1163                         return -ENOMEM;
1164
1165                 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1166                 INIT_HLIST_HEAD(&md5sig->head);
1167                 rcu_assign_pointer(tp->md5sig_info, md5sig);
1168         }
1169
1170         key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1171         if (!key)
1172                 return -ENOMEM;
1173         if (!tcp_alloc_md5sig_pool()) {
1174                 sock_kfree_s(sk, key, sizeof(*key));
1175                 return -ENOMEM;
1176         }
1177
1178         memcpy(key->key, newkey, newkeylen);
1179         key->keylen = newkeylen;
1180         key->family = family;
1181         key->prefixlen = prefixlen;
1182         key->l3index = l3index;
1183         memcpy(&key->addr, addr,
1184                (family == AF_INET6) ? sizeof(struct in6_addr) :
1185                                       sizeof(struct in_addr));
1186         hlist_add_head_rcu(&key->node, &md5sig->head);
1187         return 0;
1188 }
1189 EXPORT_SYMBOL(tcp_md5_do_add);
1190
1191 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1192                    u8 prefixlen, int l3index)
1193 {
1194         struct tcp_md5sig_key *key;
1195
1196         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
1197         if (!key)
1198                 return -ENOENT;
1199         hlist_del_rcu(&key->node);
1200         atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1201         kfree_rcu(key, rcu);
1202         return 0;
1203 }
1204 EXPORT_SYMBOL(tcp_md5_do_del);
1205
1206 static void tcp_clear_md5_list(struct sock *sk)
1207 {
1208         struct tcp_sock *tp = tcp_sk(sk);
1209         struct tcp_md5sig_key *key;
1210         struct hlist_node *n;
1211         struct tcp_md5sig_info *md5sig;
1212
1213         md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1214
1215         hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1216                 hlist_del_rcu(&key->node);
1217                 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1218                 kfree_rcu(key, rcu);
1219         }
1220 }
1221
1222 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1223                                  sockptr_t optval, int optlen)
1224 {
1225         struct tcp_md5sig cmd;
1226         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1227         const union tcp_md5_addr *addr;
1228         u8 prefixlen = 32;
1229         int l3index = 0;
1230
1231         if (optlen < sizeof(cmd))
1232                 return -EINVAL;
1233
1234         if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1235                 return -EFAULT;
1236
1237         if (sin->sin_family != AF_INET)
1238                 return -EINVAL;
1239
1240         if (optname == TCP_MD5SIG_EXT &&
1241             cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1242                 prefixlen = cmd.tcpm_prefixlen;
1243                 if (prefixlen > 32)
1244                         return -EINVAL;
1245         }
1246
1247         if (optname == TCP_MD5SIG_EXT &&
1248             cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1249                 struct net_device *dev;
1250
1251                 rcu_read_lock();
1252                 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1253                 if (dev && netif_is_l3_master(dev))
1254                         l3index = dev->ifindex;
1255
1256                 rcu_read_unlock();
1257
1258                 /* ok to check whether l3index got set or not outside of rcu;
1259                  * right now the device MUST be an L3 master
1260                  */
1261                 if (!dev || !l3index)
1262                         return -EINVAL;
1263         }
1264
1265         addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1266
1267         if (!cmd.tcpm_keylen)
1268                 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index);
1269
1270         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1271                 return -EINVAL;
1272
1273         return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index,
1274                               cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1275 }
1276
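/*
 * A hypothetical userspace counterpart of the option parsed above: installing
 * an MD5 key for a specific peer looks roughly like this (error handling
 * omitted, address and key are examples):
 *
 *	struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
 *	struct sockaddr_in *peer = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	peer->sin_family = AF_INET;
 *	inet_pton(AF_INET, "192.0.2.1", &peer->sin_addr);
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 */
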
1277 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1278                                    __be32 daddr, __be32 saddr,
1279                                    const struct tcphdr *th, int nbytes)
1280 {
1281         struct tcp4_pseudohdr *bp;
1282         struct scatterlist sg;
1283         struct tcphdr *_th;
1284
1285         bp = hp->scratch;
1286         bp->saddr = saddr;
1287         bp->daddr = daddr;
1288         bp->pad = 0;
1289         bp->protocol = IPPROTO_TCP;
1290         bp->len = cpu_to_be16(nbytes);
1291
1292         _th = (struct tcphdr *)(bp + 1);
1293         memcpy(_th, th, sizeof(*th));
1294         _th->check = 0;
1295
1296         sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1297         ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1298                                 sizeof(*bp) + sizeof(*th));
1299         return crypto_ahash_update(hp->md5_req);
1300 }
1301
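/*
 * For reference, RFC 2385 defines the digest input order that the helper
 * above and the two hash functions below follow: the TCP pseudo-header
 * (struct tcp4_pseudohdr as filled in by tcp_v4_md5_hash_headers()), the
 * fixed TCP header with its checksum zeroed, the segment data (only in the
 * skb variant), and finally the connection key itself.
 */
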
1302 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1303                                __be32 daddr, __be32 saddr, const struct tcphdr *th)
1304 {
1305         struct tcp_md5sig_pool *hp;
1306         struct ahash_request *req;
1307
1308         hp = tcp_get_md5sig_pool();
1309         if (!hp)
1310                 goto clear_hash_noput;
1311         req = hp->md5_req;
1312
1313         if (crypto_ahash_init(req))
1314                 goto clear_hash;
1315         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1316                 goto clear_hash;
1317         if (tcp_md5_hash_key(hp, key))
1318                 goto clear_hash;
1319         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1320         if (crypto_ahash_final(req))
1321                 goto clear_hash;
1322
1323         tcp_put_md5sig_pool();
1324         return 0;
1325
1326 clear_hash:
1327         tcp_put_md5sig_pool();
1328 clear_hash_noput:
1329         memset(md5_hash, 0, 16);
1330         return 1;
1331 }
1332
1333 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1334                         const struct sock *sk,
1335                         const struct sk_buff *skb)
1336 {
1337         struct tcp_md5sig_pool *hp;
1338         struct ahash_request *req;
1339         const struct tcphdr *th = tcp_hdr(skb);
1340         __be32 saddr, daddr;
1341
1342         if (sk) { /* valid for establish/request sockets */
1343                 saddr = sk->sk_rcv_saddr;
1344                 daddr = sk->sk_daddr;
1345         } else {
1346                 const struct iphdr *iph = ip_hdr(skb);
1347                 saddr = iph->saddr;
1348                 daddr = iph->daddr;
1349         }
1350
1351         hp = tcp_get_md5sig_pool();
1352         if (!hp)
1353                 goto clear_hash_noput;
1354         req = hp->md5_req;
1355
1356         if (crypto_ahash_init(req))
1357                 goto clear_hash;
1358
1359         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1360                 goto clear_hash;
1361         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1362                 goto clear_hash;
1363         if (tcp_md5_hash_key(hp, key))
1364                 goto clear_hash;
1365         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1366         if (crypto_ahash_final(req))
1367                 goto clear_hash;
1368
1369         tcp_put_md5sig_pool();
1370         return 0;
1371
1372 clear_hash:
1373         tcp_put_md5sig_pool();
1374 clear_hash_noput:
1375         memset(md5_hash, 0, 16);
1376         return 1;
1377 }
1378 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1379
1380 #endif
1381
1382 /* Called with rcu_read_lock() */
1383 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1384                                     const struct sk_buff *skb,
1385                                     int dif, int sdif)
1386 {
1387 #ifdef CONFIG_TCP_MD5SIG
1388         /*
1389          * This gets called for each TCP segment that arrives
1390          * so we want to be efficient.
1391          * We have 3 drop cases:
1392          * o No MD5 hash and one expected.
1393          * o MD5 hash and we're not expecting one.
1394          * o MD5 hash and it's wrong.
1395          */
1396         const __u8 *hash_location = NULL;
1397         struct tcp_md5sig_key *hash_expected;
1398         const struct iphdr *iph = ip_hdr(skb);
1399         const struct tcphdr *th = tcp_hdr(skb);
1400         const union tcp_md5_addr *addr;
1401         unsigned char newhash[16];
1402         int genhash, l3index;
1403
1404         /* If sdif is set, the packet ingressed via a device
1405          * in an L3 domain and dif is set to the l3mdev.
1406          */
1407         l3index = sdif ? dif : 0;
1408
1409         addr = (union tcp_md5_addr *)&iph->saddr;
1410         hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1411         hash_location = tcp_parse_md5sig_option(th);
1412
1413         /* We've parsed the options - do we have a hash? */
1414         if (!hash_expected && !hash_location)
1415                 return false;
1416
1417         if (hash_expected && !hash_location) {
1418                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1419                 return true;
1420         }
1421
1422         if (!hash_expected && hash_location) {
1423                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1424                 return true;
1425         }
1426
1427         /* Both hash_expected and hash_location are present,
1428          * so we need to compute the hash and compare.
1429          */
1430         genhash = tcp_v4_md5_hash_skb(newhash,
1431                                       hash_expected,
1432                                       NULL, skb);
1433
1434         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1435                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1436                 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n",
1437                                      &iph->saddr, ntohs(th->source),
1438                                      &iph->daddr, ntohs(th->dest),
1439                                      genhash ? " tcp_v4_calc_md5_hash failed"
1440                                      : "", l3index);
1441                 return true;
1442         }
1443         return false;
1444 #endif
1445         return false;
1446 }
1447
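/* Fill in the IPv4 parts of a freshly minted request sock from the
 * incoming SYN: remote/local addresses and a copy of any IP options
 * carried by the SYN.
 */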
1448 static void tcp_v4_init_req(struct request_sock *req,
1449                             const struct sock *sk_listener,
1450                             struct sk_buff *skb)
1451 {
1452         struct inet_request_sock *ireq = inet_rsk(req);
1453         struct net *net = sock_net(sk_listener);
1454
1455         sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1456         sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1457         RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1458 }
1459
1460 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1461                                           struct flowi *fl,
1462                                           const struct request_sock *req)
1463 {
1464         return inet_csk_route_req(sk, &fl->u.ip4, req);
1465 }
1466
1467 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1468         .family         =       PF_INET,
1469         .obj_size       =       sizeof(struct tcp_request_sock),
1470         .rtx_syn_ack    =       tcp_rtx_synack,
1471         .send_ack       =       tcp_v4_reqsk_send_ack,
1472         .destructor     =       tcp_v4_reqsk_destructor,
1473         .send_reset     =       tcp_v4_send_reset,
1474         .syn_ack_timeout =      tcp_syn_ack_timeout,
1475 };
1476
1477 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1478         .mss_clamp      =       TCP_MSS_DEFAULT,
1479 #ifdef CONFIG_TCP_MD5SIG
1480         .req_md5_lookup =       tcp_v4_md5_lookup,
1481         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1482 #endif
1483         .init_req       =       tcp_v4_init_req,
1484 #ifdef CONFIG_SYN_COOKIES
1485         .cookie_init_seq =      cookie_v4_init_sequence,
1486 #endif
1487         .route_req      =       tcp_v4_route_req,
1488         .init_seq       =       tcp_v4_init_seq,
1489         .init_ts_off    =       tcp_v4_init_ts_off,
1490         .send_synack    =       tcp_v4_send_synack,
1491 };
1492
1493 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1494 {
1495         /* Never answer SYNs sent to broadcast or multicast addresses */
1496         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1497                 goto drop;
1498
1499         return tcp_conn_request(&tcp_request_sock_ops,
1500                                 &tcp_request_sock_ipv4_ops, sk, skb);
1501
1502 drop:
1503         tcp_listendrop(sk);
1504         return 0;
1505 }
1506 EXPORT_SYMBOL(tcp_v4_conn_request);
1507
1508
1509 /*
1510  * The three way handshake has completed - we got a valid synack -
1511  * now create the new socket.
1512  */
1513 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1514                                   struct request_sock *req,
1515                                   struct dst_entry *dst,
1516                                   struct request_sock *req_unhash,
1517                                   bool *own_req)
1518 {
1519         struct inet_request_sock *ireq;
1520         bool found_dup_sk = false;
1521         struct inet_sock *newinet;
1522         struct tcp_sock *newtp;
1523         struct sock *newsk;
1524 #ifdef CONFIG_TCP_MD5SIG
1525         const union tcp_md5_addr *addr;
1526         struct tcp_md5sig_key *key;
1527         int l3index;
1528 #endif
1529         struct ip_options_rcu *inet_opt;
1530
1531         if (sk_acceptq_is_full(sk))
1532                 goto exit_overflow;
1533
1534         newsk = tcp_create_openreq_child(sk, req, skb);
1535         if (!newsk)
1536                 goto exit_nonewsk;
1537
1538         newsk->sk_gso_type = SKB_GSO_TCPV4;
1539         inet_sk_rx_dst_set(newsk, skb);
1540
1541         newtp                 = tcp_sk(newsk);
1542         newinet               = inet_sk(newsk);
1543         ireq                  = inet_rsk(req);
1544         sk_daddr_set(newsk, ireq->ir_rmt_addr);
1545         sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1546         newsk->sk_bound_dev_if = ireq->ir_iif;
1547         newinet->inet_saddr   = ireq->ir_loc_addr;
1548         inet_opt              = rcu_dereference(ireq->ireq_opt);
1549         RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1550         newinet->mc_index     = inet_iif(skb);
1551         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1552         newinet->rcv_tos      = ip_hdr(skb)->tos;
1553         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1554         if (inet_opt)
1555                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1556         newinet->inet_id = prandom_u32();
1557
1558         /* Set ToS of the new socket based upon the value of incoming SYN.
1559          * ECT bits are set later in tcp_init_transfer().
1560          */
1561         if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1562                 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1563
1564         if (!dst) {
1565                 dst = inet_csk_route_child_sock(sk, newsk, req);
1566                 if (!dst)
1567                         goto put_and_exit;
1568         } else {
1569                 /* syncookie case : see end of cookie_v4_check() */
1570         }
1571         sk_setup_caps(newsk, dst);
1572
1573         tcp_ca_openreq_child(newsk, dst);
1574
1575         tcp_sync_mss(newsk, dst_mtu(dst));
1576         newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1577
1578         tcp_initialize_rcv_mss(newsk);
1579
1580 #ifdef CONFIG_TCP_MD5SIG
1581         l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1582         /* Copy over the MD5 key from the original socket */
1583         addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1584         key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1585         if (key) {
1586                 /*
1587                  * We're using one, so create a matching key
1588                  * on the newsk structure. If we fail to get
1589                  * memory, then we end up not copying the key
1590                  * across. Shucks.
1591                  */
1592                 tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index,
1593                                key->key, key->keylen, GFP_ATOMIC);
1594                 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1595         }
1596 #endif
1597
1598         if (__inet_inherit_port(sk, newsk) < 0)
1599                 goto put_and_exit;
1600         *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1601                                        &found_dup_sk);
1602         if (likely(*own_req)) {
1603                 tcp_move_syn(newtp, req);
1604                 ireq->ireq_opt = NULL;
1605         } else {
1606                 newinet->inet_opt = NULL;
1607
1608                 if (!req_unhash && found_dup_sk) {
1609                         /* This code path should only be executed in the
1610                          * syncookie case
1611                          */
1612                         bh_unlock_sock(newsk);
1613                         sock_put(newsk);
1614                         newsk = NULL;
1615                 }
1616         }
1617         return newsk;
1618
1619 exit_overflow:
1620         NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1621 exit_nonewsk:
1622         dst_release(dst);
1623 exit:
1624         tcp_listendrop(sk);
1625         return NULL;
1626 put_and_exit:
1627         newinet->inet_opt = NULL;
1628         inet_csk_prepare_forced_close(newsk);
1629         tcp_done(newsk);
1630         goto exit;
1631 }
1632 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1633
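/* Syncookies are only ever validated on the bare ACK that completes the
 * handshake, hence the !th->syn test below.
 */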
1634 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1635 {
1636 #ifdef CONFIG_SYN_COOKIES
1637         const struct tcphdr *th = tcp_hdr(skb);
1638
1639         if (!th->syn)
1640                 sk = cookie_v4_check(sk, skb);
1641 #endif
1642         return sk;
1643 }
1644
1645 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1646                          struct tcphdr *th, u32 *cookie)
1647 {
1648         u16 mss = 0;
1649 #ifdef CONFIG_SYN_COOKIES
1650         mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1651                                     &tcp_request_sock_ipv4_ops, sk, th);
1652         if (mss) {
1653                 *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1654                 tcp_synq_overflow(sk);
1655         }
1656 #endif
1657         return mss;
1658 }
1659
1660 /* The socket must have its spinlock held when we get
1661  * here, unless it is a TCP_LISTEN socket.
1662  *
1663  * We have a potential double-lock case here, so even when
1664  * doing backlog processing we use the BH locking scheme.
1665  * This is because we cannot sleep with the original spinlock
1666  * held.
1667  */
1668 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1669 {
1670         struct sock *rsk;
1671
1672         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1673                 struct dst_entry *dst;
1674
1675                 dst = rcu_dereference_protected(sk->sk_rx_dst,
1676                                                 lockdep_sock_is_held(sk));
1677
1678                 sock_rps_save_rxhash(sk, skb);
1679                 sk_mark_napi_id(sk, skb);
1680                 if (dst) {
1681                         if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1682                             !dst->ops->check(dst, 0)) {
1683                                 RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1684                                 dst_release(dst);
1685                         }
1686                 }
1687                 tcp_rcv_established(sk, skb);
1688                 return 0;
1689         }
1690
1691         if (tcp_checksum_complete(skb))
1692                 goto csum_err;
1693
1694         if (sk->sk_state == TCP_LISTEN) {
1695                 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1696
1697                 if (!nsk)
1698                         goto discard;
1699                 if (nsk != sk) {
1700                         if (tcp_child_process(sk, nsk, skb)) {
1701                                 rsk = nsk;
1702                                 goto reset;
1703                         }
1704                         return 0;
1705                 }
1706         } else
1707                 sock_rps_save_rxhash(sk, skb);
1708
1709         if (tcp_rcv_state_process(sk, skb)) {
1710                 rsk = sk;
1711                 goto reset;
1712         }
1713         return 0;
1714
1715 reset:
1716         tcp_v4_send_reset(rsk, skb);
1717 discard:
1718         kfree_skb(skb);
1719         /* Be careful here. If this function gets more complicated and
1720          * gcc suffers from register pressure on the x86, sk (in %ebx)
1721          * might be destroyed here. This current version compiles correctly,
1722          * but you have been warned.
1723          */
1724         return 0;
1725
1726 csum_err:
1727         TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1728         TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1729         goto discard;
1730 }
1731 EXPORT_SYMBOL(tcp_v4_do_rcv);
1732
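/* Early demux: look up an established socket for this segment before
 * routing and, if found, attach it to the skb along with its cached
 * rx dst (when the dst is still valid for the incoming interface).
 */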
1733 int tcp_v4_early_demux(struct sk_buff *skb)
1734 {
1735         const struct iphdr *iph;
1736         const struct tcphdr *th;
1737         struct sock *sk;
1738
1739         if (skb->pkt_type != PACKET_HOST)
1740                 return 0;
1741
1742         if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1743                 return 0;
1744
1745         iph = ip_hdr(skb);
1746         th = tcp_hdr(skb);
1747
1748         if (th->doff < sizeof(struct tcphdr) / 4)
1749                 return 0;
1750
1751         sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1752                                        iph->saddr, th->source,
1753                                        iph->daddr, ntohs(th->dest),
1754                                        skb->skb_iif, inet_sdif(skb));
1755         if (sk) {
1756                 skb->sk = sk;
1757                 skb->destructor = sock_edemux;
1758                 if (sk_fullsock(sk)) {
1759                         struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1760
1761                         if (dst)
1762                                 dst = dst_check(dst, 0);
1763                         if (dst &&
1764                             inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1765                                 skb_dst_set_noref(skb, dst);
1766                 }
1767         }
1768         return 0;
1769 }
1770
1771 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1772 {
1773         u32 limit, tail_gso_size, tail_gso_segs;
1774         struct skb_shared_info *shinfo;
1775         const struct tcphdr *th;
1776         struct tcphdr *thtail;
1777         struct sk_buff *tail;
1778         unsigned int hdrlen;
1779         bool fragstolen;
1780         u32 gso_segs;
1781         u32 gso_size;
1782         int delta;
1783
1784         /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1785          * we can fix skb->truesize to its real value to avoid future drops.
1786          * This is valid because skb is not yet charged to the socket.
1787          * It has been noticed that pure SACK packets were sometimes dropped
1788          * (when cooked by drivers without the copybreak feature).
1789          */
1790         skb_condense(skb);
1791
1792         skb_dst_drop(skb);
1793
1794         if (unlikely(tcp_checksum_complete(skb))) {
1795                 bh_unlock_sock(sk);
1796                 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1797                 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1798                 return true;
1799         }
1800
1801         /* Attempt coalescing to last skb in backlog, even if we are
1802          * above the limits.
1803          * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1804          */
1805         th = (const struct tcphdr *)skb->data;
1806         hdrlen = th->doff * 4;
1807
1808         tail = sk->sk_backlog.tail;
1809         if (!tail)
1810                 goto no_coalesce;
1811         thtail = (struct tcphdr *)tail->data;
1812
1813         if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1814             TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1815             ((TCP_SKB_CB(tail)->tcp_flags |
1816               TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1817             !((TCP_SKB_CB(tail)->tcp_flags &
1818               TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1819             ((TCP_SKB_CB(tail)->tcp_flags ^
1820               TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1821 #ifdef CONFIG_TLS_DEVICE
1822             tail->decrypted != skb->decrypted ||
1823 #endif
1824             thtail->doff != th->doff ||
1825             memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1826                 goto no_coalesce;
1827
1828         __skb_pull(skb, hdrlen);
1829
1830         shinfo = skb_shinfo(skb);
1831         gso_size = shinfo->gso_size ?: skb->len;
1832         gso_segs = shinfo->gso_segs ?: 1;
1833
1834         shinfo = skb_shinfo(tail);
1835         tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1836         tail_gso_segs = shinfo->gso_segs ?: 1;
1837
1838         if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1839                 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1840
1841                 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1842                         TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1843                         thtail->window = th->window;
1844                 }
1845
1846                 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1847                  * thtail->fin, so that the fast path in tcp_rcv_established()
1848                  * is not entered if we append a packet with a FIN.
1849                  * SYN, RST, URG are not present.
1850                  * ACK is set on both packets.
1851                  * PSH : we do not really care in TCP stack,
1852                  *       at least for 'GRO' packets.
1853                  */
1854                 thtail->fin |= th->fin;
1855                 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1856
1857                 if (TCP_SKB_CB(skb)->has_rxtstamp) {
1858                         TCP_SKB_CB(tail)->has_rxtstamp = true;
1859                         tail->tstamp = skb->tstamp;
1860                         skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1861                 }
1862
1863                 /* Not as strict as GRO. We only need to carry the max mss value */
1864                 shinfo->gso_size = max(gso_size, tail_gso_size);
1865                 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
1866
1867                 sk->sk_backlog.len += delta;
1868                 __NET_INC_STATS(sock_net(sk),
1869                                 LINUX_MIB_TCPBACKLOGCOALESCE);
1870                 kfree_skb_partial(skb, fragstolen);
1871                 return false;
1872         }
1873         __skb_push(skb, hdrlen);
1874
1875 no_coalesce:
1876         limit = (u32)READ_ONCE(sk->sk_rcvbuf) + (u32)(READ_ONCE(sk->sk_sndbuf) >> 1);
1877
1878         /* Only the socket owner can try to collapse/prune rx queues
1879          * to reduce memory overhead, so add a little headroom here.
1880          * Only a few socket backlogs are likely to be non-empty at once.
1881          */
1882         limit += 64 * 1024;
1883
1884         if (unlikely(sk_add_backlog(sk, skb, limit))) {
1885                 bh_unlock_sock(sk);
1886                 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1887                 return true;
1888         }
1889         return false;
1890 }
1891 EXPORT_SYMBOL(tcp_add_backlog);
1892
1893 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1894 {
1895         struct tcphdr *th = (struct tcphdr *)skb->data;
1896
1897         return sk_filter_trim_cap(sk, skb, th->doff * 4);
1898 }
1899 EXPORT_SYMBOL(tcp_filter);
1900
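/* Undo tcp_v4_fill_cb(): move the IP control block back to its usual
 * location before the skb is handed to another socket.
 */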
1901 static void tcp_v4_restore_cb(struct sk_buff *skb)
1902 {
1903         memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1904                 sizeof(struct inet_skb_parm));
1905 }
1906
1907 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1908                            const struct tcphdr *th)
1909 {
1910         /* This is tricky: we move the IPCB to its correct location inside TCP_SKB_CB().
1911          * barrier() makes sure the compiler won't play fool^Waliasing games.
1912          */
1913         memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1914                 sizeof(struct inet_skb_parm));
1915         barrier();
1916
1917         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1918         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1919                                     skb->len - th->doff * 4);
1920         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1921         TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1922         TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1923         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1924         TCP_SKB_CB(skb)->sacked  = 0;
1925         TCP_SKB_CB(skb)->has_rxtstamp =
1926                         skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1927 }
1928
1929 /*
1930  *      From tcp_input.c
1931  */
1932
1933 int tcp_v4_rcv(struct sk_buff *skb)
1934 {
1935         struct net *net = dev_net(skb->dev);
1936         struct sk_buff *skb_to_free;
1937         int sdif = inet_sdif(skb);
1938         int dif = inet_iif(skb);
1939         const struct iphdr *iph;
1940         const struct tcphdr *th;
1941         bool refcounted;
1942         struct sock *sk;
1943         int ret;
1944
1945         if (skb->pkt_type != PACKET_HOST)
1946                 goto discard_it;
1947
1948         /* Count it even if it's bad */
1949         __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1950
1951         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1952                 goto discard_it;
1953
1954         th = (const struct tcphdr *)skb->data;
1955
1956         if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1957                 goto bad_packet;
1958         if (!pskb_may_pull(skb, th->doff * 4))
1959                 goto discard_it;
1960
1961         /* An explanation is required here, I think.
1962          * Packet length and doff are validated by header prediction,
1963          * provided the th->doff==0 case is eliminated.
1964          * So, we defer the checks. */
1965
1966         if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1967                 goto csum_error;
1968
1969         th = (const struct tcphdr *)skb->data;
1970         iph = ip_hdr(skb);
1971 lookup:
1972         sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1973                                th->dest, sdif, &refcounted);
1974         if (!sk)
1975                 goto no_tcp_socket;
1976
1977 process:
1978         if (sk->sk_state == TCP_TIME_WAIT)
1979                 goto do_time_wait;
1980
1981         if (sk->sk_state == TCP_NEW_SYN_RECV) {
1982                 struct request_sock *req = inet_reqsk(sk);
1983                 bool req_stolen = false;
1984                 struct sock *nsk;
1985
1986                 sk = req->rsk_listener;
1987                 if (unlikely(!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb) ||
1988                              tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) {
1989                         sk_drops_add(sk, skb);
1990                         reqsk_put(req);
1991                         goto discard_it;
1992                 }
1993                 if (tcp_checksum_complete(skb)) {
1994                         reqsk_put(req);
1995                         goto csum_error;
1996                 }
1997                 if (unlikely(sk->sk_state != TCP_LISTEN)) {
1998                         inet_csk_reqsk_queue_drop_and_put(sk, req);
1999                         goto lookup;
2000                 }
2001                 /* We own a reference on the listener, increase it again
2002                  * as we might lose it too soon.
2003                  */
2004                 sock_hold(sk);
2005                 refcounted = true;
2006                 nsk = NULL;
2007                 if (!tcp_filter(sk, skb)) {
2008                         th = (const struct tcphdr *)skb->data;
2009                         iph = ip_hdr(skb);
2010                         tcp_v4_fill_cb(skb, iph, th);
2011                         nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2012                 }
2013                 if (!nsk) {
2014                         reqsk_put(req);
2015                         if (req_stolen) {
2016                                 /* Another cpu got exclusive access to req
2017                                  * and created a full blown socket.
2018                                  * Try to feed this packet to this socket
2019                                  * instead of discarding it.
2020                                  */
2021                                 tcp_v4_restore_cb(skb);
2022                                 sock_put(sk);
2023                                 goto lookup;
2024                         }
2025                         goto discard_and_relse;
2026                 }
2027                 nf_reset_ct(skb);
2028                 if (nsk == sk) {
2029                         reqsk_put(req);
2030                         tcp_v4_restore_cb(skb);
2031                 } else if (tcp_child_process(sk, nsk, skb)) {
2032                         tcp_v4_send_reset(nsk, skb);
2033                         goto discard_and_relse;
2034                 } else {
2035                         sock_put(sk);
2036                         return 0;
2037                 }
2038         }
2039         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
2040                 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2041                 goto discard_and_relse;
2042         }
2043
2044         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2045                 goto discard_and_relse;
2046
2047         if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))
2048                 goto discard_and_relse;
2049
2050         nf_reset_ct(skb);
2051
2052         if (tcp_filter(sk, skb))
2053                 goto discard_and_relse;
2054         th = (const struct tcphdr *)skb->data;
2055         iph = ip_hdr(skb);
2056         tcp_v4_fill_cb(skb, iph, th);
2057
2058         skb->dev = NULL;
2059
2060         if (sk->sk_state == TCP_LISTEN) {
2061                 ret = tcp_v4_do_rcv(sk, skb);
2062                 goto put_and_return;
2063         }
2064
2065         sk_incoming_cpu_update(sk);
2066
2067         bh_lock_sock_nested(sk);
2068         tcp_segs_in(tcp_sk(sk), skb);
2069         ret = 0;
2070         if (!sock_owned_by_user(sk)) {
2071                 skb_to_free = sk->sk_rx_skb_cache;
2072                 sk->sk_rx_skb_cache = NULL;
2073                 ret = tcp_v4_do_rcv(sk, skb);
2074         } else {
2075                 if (tcp_add_backlog(sk, skb))
2076                         goto discard_and_relse;
2077                 skb_to_free = NULL;
2078         }
2079         bh_unlock_sock(sk);
2080         if (skb_to_free)
2081                 __kfree_skb(skb_to_free);
2082
2083 put_and_return:
2084         if (refcounted)
2085                 sock_put(sk);
2086
2087         return ret;
2088
2089 no_tcp_socket:
2090         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2091                 goto discard_it;
2092
2093         tcp_v4_fill_cb(skb, iph, th);
2094
2095         if (tcp_checksum_complete(skb)) {
2096 csum_error:
2097                 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2098 bad_packet:
2099                 __TCP_INC_STATS(net, TCP_MIB_INERRS);
2100         } else {
2101                 tcp_v4_send_reset(NULL, skb);
2102         }
2103
2104 discard_it:
2105         /* Discard frame. */
2106         kfree_skb(skb);
2107         return 0;
2108
2109 discard_and_relse:
2110         sk_drops_add(sk, skb);
2111         if (refcounted)
2112                 sock_put(sk);
2113         goto discard_it;
2114
2115 do_time_wait:
2116         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2117                 inet_twsk_put(inet_twsk(sk));
2118                 goto discard_it;
2119         }
2120
2121         tcp_v4_fill_cb(skb, iph, th);
2122
2123         if (tcp_checksum_complete(skb)) {
2124                 inet_twsk_put(inet_twsk(sk));
2125                 goto csum_error;
2126         }
2127         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2128         case TCP_TW_SYN: {
2129                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2130                                                         &tcp_hashinfo, skb,
2131                                                         __tcp_hdrlen(th),
2132                                                         iph->saddr, th->source,
2133                                                         iph->daddr, th->dest,
2134                                                         inet_iif(skb),
2135                                                         sdif);
2136                 if (sk2) {
2137                         inet_twsk_deschedule_put(inet_twsk(sk));
2138                         sk = sk2;
2139                         tcp_v4_restore_cb(skb);
2140                         refcounted = false;
2141                         goto process;
2142                 }
2143         }
2144                 /* to ACK */
2145                 fallthrough;
2146         case TCP_TW_ACK:
2147                 tcp_v4_timewait_ack(sk, skb);
2148                 break;
2149         case TCP_TW_RST:
2150                 tcp_v4_send_reset(sk, skb);
2151                 inet_twsk_deschedule_put(inet_twsk(sk));
2152                 goto discard_it;
2153         case TCP_TW_SUCCESS:;
2154         }
2155         goto discard_it;
2156 }
2157
2158 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2159         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
2160         .twsk_unique    = tcp_twsk_unique,
2161         .twsk_destructor= tcp_twsk_destructor,
2162 };
2163
2164 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2165 {
2166         struct dst_entry *dst = skb_dst(skb);
2167
2168         if (dst && dst_hold_safe(dst)) {
2169                 rcu_assign_pointer(sk->sk_rx_dst, dst);
2170                 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2171         }
2172 }
2173 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2174
2175 const struct inet_connection_sock_af_ops ipv4_specific = {
2176         .queue_xmit        = ip_queue_xmit,
2177         .send_check        = tcp_v4_send_check,
2178         .rebuild_header    = inet_sk_rebuild_header,
2179         .sk_rx_dst_set     = inet_sk_rx_dst_set,
2180         .conn_request      = tcp_v4_conn_request,
2181         .syn_recv_sock     = tcp_v4_syn_recv_sock,
2182         .net_header_len    = sizeof(struct iphdr),
2183         .setsockopt        = ip_setsockopt,
2184         .getsockopt        = ip_getsockopt,
2185         .addr2sockaddr     = inet_csk_addr2sockaddr,
2186         .sockaddr_len      = sizeof(struct sockaddr_in),
2187         .mtu_reduced       = tcp_v4_mtu_reduced,
2188 };
2189 EXPORT_SYMBOL(ipv4_specific);
2190
2191 #ifdef CONFIG_TCP_MD5SIG
2192 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2193         .md5_lookup             = tcp_v4_md5_lookup,
2194         .calc_md5_hash          = tcp_v4_md5_hash_skb,
2195         .md5_parse              = tcp_v4_parse_md5_keys,
2196 };
2197 #endif
2198
2199 /* NOTE: A lot of things are set to zero explicitly by the call to
2200  *       sk_alloc(), so they need not be done here.
2201  */
2202 static int tcp_v4_init_sock(struct sock *sk)
2203 {
2204         struct inet_connection_sock *icsk = inet_csk(sk);
2205
2206         tcp_init_sock(sk);
2207
2208         icsk->icsk_af_ops = &ipv4_specific;
2209
2210 #ifdef CONFIG_TCP_MD5SIG
2211         tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2212 #endif
2213
2214         return 0;
2215 }
2216
2217 void tcp_v4_destroy_sock(struct sock *sk)
2218 {
2219         struct tcp_sock *tp = tcp_sk(sk);
2220
2221         trace_tcp_destroy_sock(sk);
2222
2223         tcp_clear_xmit_timers(sk);
2224
2225         tcp_cleanup_congestion_control(sk);
2226
2227         tcp_cleanup_ulp(sk);
2228
2229         /* Clean up the write buffer. */
2230         tcp_write_queue_purge(sk);
2231
2232         /* Check if we want to disable active TFO */
2233         tcp_fastopen_active_disable_ofo_check(sk);
2234
2235         /* Cleans up our, hopefully empty, out_of_order_queue. */
2236         skb_rbtree_purge(&tp->out_of_order_queue);
2237
2238 #ifdef CONFIG_TCP_MD5SIG
2239         /* Clean up the MD5 key list, if any */
2240         if (tp->md5sig_info) {
2241                 tcp_clear_md5_list(sk);
2242                 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2243                 tp->md5sig_info = NULL;
2244         }
2245 #endif
2246
2247         /* Clean up a referenced TCP bind bucket. */
2248         if (inet_csk(sk)->icsk_bind_hash)
2249                 inet_put_port(sk);
2250
2251         BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2252
2253         /* If socket is aborted during connect operation */
2254         tcp_free_fastopen_req(tp);
2255         tcp_fastopen_destroy_cipher(sk);
2256         tcp_saved_syn_free(tp);
2257
2258         sk_sockets_allocated_dec(sk);
2259 }
2260 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2261
2262 #ifdef CONFIG_PROC_FS
2263 /* Proc filesystem TCP sock list dumping. */
2264
2265 /*
2266  * Get the next listener socket following cur.  If cur is NULL, get the first
2267  * socket starting from the bucket given in st->bucket; when st->bucket is
2268  * zero the very first socket in the hash table is returned.
2269  */
2270 static void *listening_get_next(struct seq_file *seq, void *cur)
2271 {
2272         struct tcp_seq_afinfo *afinfo;
2273         struct tcp_iter_state *st = seq->private;
2274         struct net *net = seq_file_net(seq);
2275         struct inet_listen_hashbucket *ilb;
2276         struct hlist_nulls_node *node;
2277         struct sock *sk = cur;
2278
2279         if (st->bpf_seq_afinfo)
2280                 afinfo = st->bpf_seq_afinfo;
2281         else
2282                 afinfo = PDE_DATA(file_inode(seq->file));
2283
2284         if (!sk) {
2285 get_head:
2286                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2287                 spin_lock(&ilb->lock);
2288                 sk = sk_nulls_head(&ilb->nulls_head);
2289                 st->offset = 0;
2290                 goto get_sk;
2291         }
2292         ilb = &tcp_hashinfo.listening_hash[st->bucket];
2293         ++st->num;
2294         ++st->offset;
2295
2296         sk = sk_nulls_next(sk);
2297 get_sk:
2298         sk_nulls_for_each_from(sk, node) {
2299                 if (!net_eq(sock_net(sk), net))
2300                         continue;
2301                 if (afinfo->family == AF_UNSPEC ||
2302                     sk->sk_family == afinfo->family)
2303                         return sk;
2304         }
2305         spin_unlock(&ilb->lock);
2306         st->offset = 0;
2307         if (++st->bucket < INET_LHTABLE_SIZE)
2308                 goto get_head;
2309         return NULL;
2310 }
2311
2312 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2313 {
2314         struct tcp_iter_state *st = seq->private;
2315         void *rc;
2316
2317         st->bucket = 0;
2318         st->offset = 0;
2319         rc = listening_get_next(seq, NULL);
2320
2321         while (rc && *pos) {
2322                 rc = listening_get_next(seq, rc);
2323                 --*pos;
2324         }
2325         return rc;
2326 }
2327
2328 static inline bool empty_bucket(const struct tcp_iter_state *st)
2329 {
2330         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2331 }
2332
2333 /*
2334  * Get first established socket starting from bucket given in st->bucket.
2335  * If st->bucket is zero, the very first socket in the hash is returned.
2336  */
2337 static void *established_get_first(struct seq_file *seq)
2338 {
2339         struct tcp_seq_afinfo *afinfo;
2340         struct tcp_iter_state *st = seq->private;
2341         struct net *net = seq_file_net(seq);
2342         void *rc = NULL;
2343
2344         if (st->bpf_seq_afinfo)
2345                 afinfo = st->bpf_seq_afinfo;
2346         else
2347                 afinfo = PDE_DATA(file_inode(seq->file));
2348
2349         st->offset = 0;
2350         for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2351                 struct sock *sk;
2352                 struct hlist_nulls_node *node;
2353                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2354
2355                 /* Lockless fast path for the common case of empty buckets */
2356                 if (empty_bucket(st))
2357                         continue;
2358
2359                 spin_lock_bh(lock);
2360                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2361                         if ((afinfo->family != AF_UNSPEC &&
2362                              sk->sk_family != afinfo->family) ||
2363                             !net_eq(sock_net(sk), net)) {
2364                                 continue;
2365                         }
2366                         rc = sk;
2367                         goto out;
2368                 }
2369                 spin_unlock_bh(lock);
2370         }
2371 out:
2372         return rc;
2373 }
2374
2375 static void *established_get_next(struct seq_file *seq, void *cur)
2376 {
2377         struct tcp_seq_afinfo *afinfo;
2378         struct sock *sk = cur;
2379         struct hlist_nulls_node *node;
2380         struct tcp_iter_state *st = seq->private;
2381         struct net *net = seq_file_net(seq);
2382
2383         if (st->bpf_seq_afinfo)
2384                 afinfo = st->bpf_seq_afinfo;
2385         else
2386                 afinfo = PDE_DATA(file_inode(seq->file));
2387
2388         ++st->num;
2389         ++st->offset;
2390
2391         sk = sk_nulls_next(sk);
2392
2393         sk_nulls_for_each_from(sk, node) {
2394                 if ((afinfo->family == AF_UNSPEC ||
2395                      sk->sk_family == afinfo->family) &&
2396                     net_eq(sock_net(sk), net))
2397                         return sk;
2398         }
2399
2400         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2401         ++st->bucket;
2402         return established_get_first(seq);
2403 }
2404
2405 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2406 {
2407         struct tcp_iter_state *st = seq->private;
2408         void *rc;
2409
2410         st->bucket = 0;
2411         rc = established_get_first(seq);
2412
2413         while (rc && pos) {
2414                 rc = established_get_next(seq, rc);
2415                 --pos;
2416         }
2417         return rc;
2418 }
2419
2420 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2421 {
2422         void *rc;
2423         struct tcp_iter_state *st = seq->private;
2424
2425         st->state = TCP_SEQ_STATE_LISTENING;
2426         rc        = listening_get_idx(seq, &pos);
2427
2428         if (!rc) {
2429                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2430                 rc        = established_get_idx(seq, pos);
2431         }
2432
2433         return rc;
2434 }
2435
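/* Resume iteration at the bucket/offset saved in the iterator state so a
 * subsequent read does not have to rescan from the first bucket.  st->num
 * is preserved across the reseek.
 */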
2436 static void *tcp_seek_last_pos(struct seq_file *seq)
2437 {
2438         struct tcp_iter_state *st = seq->private;
2439         int bucket = st->bucket;
2440         int offset = st->offset;
2441         int orig_num = st->num;
2442         void *rc = NULL;
2443
2444         switch (st->state) {
2445         case TCP_SEQ_STATE_LISTENING:
2446                 if (st->bucket >= INET_LHTABLE_SIZE)
2447                         break;
2448                 st->state = TCP_SEQ_STATE_LISTENING;
2449                 rc = listening_get_next(seq, NULL);
2450                 while (offset-- && rc && bucket == st->bucket)
2451                         rc = listening_get_next(seq, rc);
2452                 if (rc)
2453                         break;
2454                 st->bucket = 0;
2455                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2456                 fallthrough;
2457         case TCP_SEQ_STATE_ESTABLISHED:
2458                 if (st->bucket > tcp_hashinfo.ehash_mask)
2459                         break;
2460                 rc = established_get_first(seq);
2461                 while (offset-- && rc && bucket == st->bucket)
2462                         rc = established_get_next(seq, rc);
2463         }
2464
2465         st->num = orig_num;
2466
2467         return rc;
2468 }
2469
2470 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2471 {
2472         struct tcp_iter_state *st = seq->private;
2473         void *rc;
2474
2475         if (*pos && *pos == st->last_pos) {
2476                 rc = tcp_seek_last_pos(seq);
2477                 if (rc)
2478                         goto out;
2479         }
2480
2481         st->state = TCP_SEQ_STATE_LISTENING;
2482         st->num = 0;
2483         st->bucket = 0;
2484         st->offset = 0;
2485         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2486
2487 out:
2488         st->last_pos = *pos;
2489         return rc;
2490 }
2491 EXPORT_SYMBOL(tcp_seq_start);
2492
2493 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2494 {
2495         struct tcp_iter_state *st = seq->private;
2496         void *rc = NULL;
2497
2498         if (v == SEQ_START_TOKEN) {
2499                 rc = tcp_get_idx(seq, 0);
2500                 goto out;
2501         }
2502
2503         switch (st->state) {
2504         case TCP_SEQ_STATE_LISTENING:
2505                 rc = listening_get_next(seq, v);
2506                 if (!rc) {
2507                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2508                         st->bucket = 0;
2509                         st->offset = 0;
2510                         rc        = established_get_first(seq);
2511                 }
2512                 break;
2513         case TCP_SEQ_STATE_ESTABLISHED:
2514                 rc = established_get_next(seq, v);
2515                 break;
2516         }
2517 out:
2518         ++*pos;
2519         st->last_pos = *pos;
2520         return rc;
2521 }
2522 EXPORT_SYMBOL(tcp_seq_next);
2523
2524 void tcp_seq_stop(struct seq_file *seq, void *v)
2525 {
2526         struct tcp_iter_state *st = seq->private;
2527
2528         switch (st->state) {
2529         case TCP_SEQ_STATE_LISTENING:
2530                 if (v != SEQ_START_TOKEN)
2531                         spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2532                 break;
2533         case TCP_SEQ_STATE_ESTABLISHED:
2534                 if (v)
2535                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2536                 break;
2537         }
2538 }
2539 EXPORT_SYMBOL(tcp_seq_stop);
2540
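/* Format one SYN_RECV request sock as a /proc/net/tcp line. */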
2541 static void get_openreq4(const struct request_sock *req,
2542                          struct seq_file *f, int i)
2543 {
2544         const struct inet_request_sock *ireq = inet_rsk(req);
2545         long delta = req->rsk_timer.expires - jiffies;
2546
2547         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2548                 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2549                 i,
2550                 ireq->ir_loc_addr,
2551                 ireq->ir_num,
2552                 ireq->ir_rmt_addr,
2553                 ntohs(ireq->ir_rmt_port),
2554                 TCP_SYN_RECV,
2555                 0, 0, /* could print option size, but that is af dependent. */
2556                 1,    /* timers active (only the expire timer) */
2557                 jiffies_delta_to_clock_t(delta),
2558                 req->num_timeout,
2559                 from_kuid_munged(seq_user_ns(f),
2560                                  sock_i_uid(req->rsk_listener)),
2561                 0,  /* non standard timer */
2562                 0, /* open_requests have no inode */
2563                 0,
2564                 req);
2565 }
2566
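/* Format one full TCP socket as a /proc/net/tcp line. */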
2567 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2568 {
2569         int timer_active;
2570         unsigned long timer_expires;
2571         const struct tcp_sock *tp = tcp_sk(sk);
2572         const struct inet_connection_sock *icsk = inet_csk(sk);
2573         const struct inet_sock *inet = inet_sk(sk);
2574         const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2575         __be32 dest = inet->inet_daddr;
2576         __be32 src = inet->inet_rcv_saddr;
2577         __u16 destp = ntohs(inet->inet_dport);
2578         __u16 srcp = ntohs(inet->inet_sport);
2579         int rx_queue;
2580         int state;
2581
2582         if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2583             icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2584             icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2585                 timer_active    = 1;
2586                 timer_expires   = icsk->icsk_timeout;
2587         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2588                 timer_active    = 4;
2589                 timer_expires   = icsk->icsk_timeout;
2590         } else if (timer_pending(&sk->sk_timer)) {
2591                 timer_active    = 2;
2592                 timer_expires   = sk->sk_timer.expires;
2593         } else {
2594                 timer_active    = 0;
2595                 timer_expires = jiffies;
2596         }
2597
2598         state = inet_sk_state_load(sk);
2599         if (state == TCP_LISTEN)
2600                 rx_queue = READ_ONCE(sk->sk_ack_backlog);
2601         else
2602                 /* Because we don't lock the socket,
2603                  * we might find a transient negative value.
2604                  */
2605                 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2606                                       READ_ONCE(tp->copied_seq), 0);
2607
2608         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2609                         "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2610                 i, src, srcp, dest, destp, state,
2611                 READ_ONCE(tp->write_seq) - tp->snd_una,
2612                 rx_queue,
2613                 timer_active,
2614                 jiffies_delta_to_clock_t(timer_expires - jiffies),
2615                 icsk->icsk_retransmits,
2616                 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2617                 icsk->icsk_probes_out,
2618                 sock_i_ino(sk),
2619                 refcount_read(&sk->sk_refcnt), sk,
2620                 jiffies_to_clock_t(icsk->icsk_rto),
2621                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2622                 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2623                 tp->snd_cwnd,
2624                 state == TCP_LISTEN ?
2625                     fastopenq->max_qlen :
2626                     (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2627 }
2628
2629 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2630                                struct seq_file *f, int i)
2631 {
2632         long delta = tw->tw_timer.expires - jiffies;
2633         __be32 dest, src;
2634         __u16 destp, srcp;
2635
2636         dest  = tw->tw_daddr;
2637         src   = tw->tw_rcv_saddr;
2638         destp = ntohs(tw->tw_dport);
2639         srcp  = ntohs(tw->tw_sport);
2640
2641         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2642                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2643                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2644                 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2645                 refcount_read(&tw->tw_refcnt), tw);
2646 }
2647
2648 #define TMPSZ 150
2649
2650 static int tcp4_seq_show(struct seq_file *seq, void *v)
2651 {
2652         struct tcp_iter_state *st;
2653         struct sock *sk = v;
2654
2655         seq_setwidth(seq, TMPSZ - 1);
2656         if (v == SEQ_START_TOKEN) {
2657                 seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2658                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2659                            "inode");
2660                 goto out;
2661         }
2662         st = seq->private;
2663
2664         if (sk->sk_state == TCP_TIME_WAIT)
2665                 get_timewait4_sock(v, seq, st->num);
2666         else if (sk->sk_state == TCP_NEW_SYN_RECV)
2667                 get_openreq4(v, seq, st->num);
2668         else
2669                 get_tcp4_sock(v, seq, st->num);
2670 out:
2671         seq_pad(seq, '\n');
2672         return 0;
2673 }
2674
2675 #ifdef CONFIG_BPF_SYSCALL
2676 struct bpf_iter__tcp {
2677         __bpf_md_ptr(struct bpf_iter_meta *, meta);
2678         __bpf_md_ptr(struct sock_common *, sk_common);
2679         uid_t uid __aligned(8);
2680 };
2681
2682 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2683                              struct sock_common *sk_common, uid_t uid)
2684 {
2685         struct bpf_iter__tcp ctx;
2686
2687         meta->seq_num--;  /* skip SEQ_START_TOKEN */
2688         ctx.meta = meta;
2689         ctx.sk_common = sk_common;
2690         ctx.uid = uid;
2691         return bpf_iter_run_prog(prog, &ctx);
2692 }
2693
2694 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2695 {
2696         struct bpf_iter_meta meta;
2697         struct bpf_prog *prog;
2698         struct sock *sk = v;
2699         uid_t uid;
2700
2701         if (v == SEQ_START_TOKEN)
2702                 return 0;
2703
2704         if (sk->sk_state == TCP_TIME_WAIT) {
2705                 uid = 0;
2706         } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
2707                 const struct request_sock *req = v;
2708
2709                 uid = from_kuid_munged(seq_user_ns(seq),
2710                                        sock_i_uid(req->rsk_listener));
2711         } else {
2712                 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
2713         }
2714
2715         meta.seq = seq;
2716         prog = bpf_iter_get_info(&meta, false);
2717         return tcp_prog_seq_show(prog, &meta, v, uid);
2718 }
2719
2720 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
2721 {
2722         struct bpf_iter_meta meta;
2723         struct bpf_prog *prog;
2724
2725         if (!v) {
2726                 meta.seq = seq;
2727                 prog = bpf_iter_get_info(&meta, true);
2728                 if (prog)
2729                         (void)tcp_prog_seq_show(prog, &meta, v, 0);
2730         }
2731
2732         tcp_seq_stop(seq, v);
2733 }
2734
2735 static const struct seq_operations bpf_iter_tcp_seq_ops = {
2736         .show           = bpf_iter_tcp_seq_show,
2737         .start          = tcp_seq_start,
2738         .next           = tcp_seq_next,
2739         .stop           = bpf_iter_tcp_seq_stop,
2740 };
2741 #endif
2742
2743 static const struct seq_operations tcp4_seq_ops = {
2744         .show           = tcp4_seq_show,
2745         .start          = tcp_seq_start,
2746         .next           = tcp_seq_next,
2747         .stop           = tcp_seq_stop,
2748 };
2749
2750 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2751         .family         = AF_INET,
2752 };
2753
2754 static int __net_init tcp4_proc_init_net(struct net *net)
2755 {
2756         if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2757                         sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2758                 return -ENOMEM;
2759         return 0;
2760 }
2761
2762 static void __net_exit tcp4_proc_exit_net(struct net *net)
2763 {
2764         remove_proc_entry("tcp", net->proc_net);
2765 }
2766
2767 static struct pernet_operations tcp4_net_ops = {
2768         .init = tcp4_proc_init_net,
2769         .exit = tcp4_proc_exit_net,
2770 };
2771
2772 int __init tcp4_proc_init(void)
2773 {
2774         return register_pernet_subsys(&tcp4_net_ops);
2775 }
2776
2777 void tcp4_proc_exit(void)
2778 {
2779         unregister_pernet_subsys(&tcp4_net_ops);
2780 }
2781 #endif /* CONFIG_PROC_FS */
2782
2783 struct proto tcp_prot = {
2784         .name                   = "TCP",
2785         .owner                  = THIS_MODULE,
2786         .close                  = tcp_close,
2787         .pre_connect            = tcp_v4_pre_connect,
2788         .connect                = tcp_v4_connect,
2789         .disconnect             = tcp_disconnect,
2790         .accept                 = inet_csk_accept,
2791         .ioctl                  = tcp_ioctl,
2792         .init                   = tcp_v4_init_sock,
2793         .destroy                = tcp_v4_destroy_sock,
2794         .shutdown               = tcp_shutdown,
2795         .setsockopt             = tcp_setsockopt,
2796         .getsockopt             = tcp_getsockopt,
2797         .keepalive              = tcp_set_keepalive,
2798         .recvmsg                = tcp_recvmsg,
2799         .sendmsg                = tcp_sendmsg,
2800         .sendpage               = tcp_sendpage,
2801         .backlog_rcv            = tcp_v4_do_rcv,
2802         .release_cb             = tcp_release_cb,
2803         .hash                   = inet_hash,
2804         .unhash                 = inet_unhash,
2805         .get_port               = inet_csk_get_port,
2806         .enter_memory_pressure  = tcp_enter_memory_pressure,
2807         .leave_memory_pressure  = tcp_leave_memory_pressure,
2808         .stream_memory_free     = tcp_stream_memory_free,
2809         .sockets_allocated      = &tcp_sockets_allocated,
2810         .orphan_count           = &tcp_orphan_count,
2811         .memory_allocated       = &tcp_memory_allocated,
2812         .memory_pressure        = &tcp_memory_pressure,
2813         .sysctl_mem             = sysctl_tcp_mem,
2814         .sysctl_wmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_wmem),
2815         .sysctl_rmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_rmem),
2816         .max_header             = MAX_TCP_HEADER,
2817         .obj_size               = sizeof(struct tcp_sock),
2818         .slab_flags             = SLAB_TYPESAFE_BY_RCU,
2819         .twsk_prot              = &tcp_timewait_sock_ops,
2820         .rsk_prot               = &tcp_request_sock_ops,
2821         .h.hashinfo             = &tcp_hashinfo,
2822         .no_autobind            = true,
2823         .diag_destroy           = tcp_abort,
2824 };
2825 EXPORT_SYMBOL(tcp_prot);
2826
2827 static void __net_exit tcp_sk_exit(struct net *net)
2828 {
2829         int cpu;
2830
2831         if (net->ipv4.tcp_congestion_control)
2832                 bpf_module_put(net->ipv4.tcp_congestion_control,
2833                                net->ipv4.tcp_congestion_control->owner);
2834
2835         for_each_possible_cpu(cpu)
2836                 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2837         free_percpu(net->ipv4.tcp_sk);
2838 }
2839
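/* Per-netns initialization: create one control socket per possible CPU
 * (used to transmit RST and ACK segments on behalf of non-full sockets)
 * and set the default values of the per-netns TCP sysctls.
 */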
2840 static int __net_init tcp_sk_init(struct net *net)
2841 {
2842         int res, cpu, cnt;
2843
2844         net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2845         if (!net->ipv4.tcp_sk)
2846                 return -ENOMEM;
2847
2848         for_each_possible_cpu(cpu) {
2849                 struct sock *sk;
2850
2851                 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2852                                            IPPROTO_TCP, net);
2853                 if (res)
2854                         goto fail;
2855                 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2856
2857                 /* Please enforce IP_DF and IPID==0 for RST and
2858                  * ACK sent in SYN-RECV and TIME-WAIT state.
2859                  */
                inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;

                *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
        }

        net->ipv4.sysctl_tcp_ecn = 2;
        net->ipv4.sysctl_tcp_ecn_fallback = 1;

        net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
        net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
        net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
        net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
        net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;

        net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
        net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
        net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;

        net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
        net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
        net->ipv4.sysctl_tcp_syncookies = 1;
        net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
        net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
        net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
        net->ipv4.sysctl_tcp_orphan_retries = 0;
        net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
        net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
        net->ipv4.sysctl_tcp_tw_reuse = 2;
        net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;

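        /*
         * The TIME-WAIT and SYN-backlog limits below scale with the size of
         * the established hash table, which is sized at boot from available
         * memory.  Illustrative figures only (not fixed values): with
         * ehash_mask + 1 == 65536 buckets, max_tw_buckets becomes 32768 and
         * max_syn_backlog becomes max(128, 65536 / 128) == 512.
         */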
        cnt = tcp_hashinfo.ehash_mask + 1;
        net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
        net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;

        net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
        net->ipv4.sysctl_tcp_sack = 1;
        net->ipv4.sysctl_tcp_window_scaling = 1;
        net->ipv4.sysctl_tcp_timestamps = 1;
        net->ipv4.sysctl_tcp_early_retrans = 3;
        net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
        net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
        net->ipv4.sysctl_tcp_retrans_collapse = 1;
        net->ipv4.sysctl_tcp_max_reordering = 300;
        net->ipv4.sysctl_tcp_dsack = 1;
        net->ipv4.sysctl_tcp_app_win = 31;
        net->ipv4.sysctl_tcp_adv_win_scale = 1;
        net->ipv4.sysctl_tcp_frto = 2;
        net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
        /* Cap the share of the congestion window that a single TSO frame may
         * consume; a divisor of 3 allows at most one third.  Overly large
         * TSO frames make TCP streams bursty.
         */
        net->ipv4.sysctl_tcp_tso_win_divisor = 3;
        /* Default TSQ limit of 16 TSO segments */
        net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
        /* RFC 5961 challenge ACK rate limiting */
        net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
        net->ipv4.sysctl_tcp_min_tso_segs = 2;
        net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
        net->ipv4.sysctl_tcp_autocorking = 1;
        net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
        net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
        net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
        if (net != &init_net) {
                memcpy(net->ipv4.sysctl_tcp_rmem,
                       init_net.ipv4.sysctl_tcp_rmem,
                       sizeof(init_net.ipv4.sysctl_tcp_rmem));
                memcpy(net->ipv4.sysctl_tcp_wmem,
                       init_net.ipv4.sysctl_tcp_wmem,
                       sizeof(init_net.ipv4.sysctl_tcp_wmem));
        }
        net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
        net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
        net->ipv4.sysctl_tcp_comp_sack_nr = 44;
        net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
        spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
        net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
        atomic_set(&net->ipv4.tfo_active_disable_times, 0);

        /* Child namespaces inherit init_net's congestion control when a
         * reference on its module can be taken; otherwise fall back to
         * Reno, which is always built in.
         */
        if (!net_eq(net, &init_net) &&
            bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
                               init_net.ipv4.tcp_congestion_control->owner))
                net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
        else
                net->ipv4.tcp_congestion_control = &tcp_reno;

        return 0;
fail:
        tcp_sk_exit(net);

        return res;
}

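/*
 * Batched netns teardown: walk the established hash once per exit batch to
 * kill the TIME-WAIT sockets left behind by the exiting namespaces, then
 * destroy each namespace's TCP Fast Open context.
 */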
static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
{
        struct net *net;

        inet_twsk_purge(&tcp_hashinfo, AF_INET);

        list_for_each_entry(net, net_exit_list, exit_list)
                tcp_fastopen_ctx_destroy(net);
}

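/*
 * Hook the per-netns init/exit handlers above into the network namespace
 * lifecycle; tcp_v4_init() registers this at boot.
 */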
static struct pernet_operations __net_initdata tcp_sk_ops = {
        .init       = tcp_sk_init,
        .exit       = tcp_sk_exit,
        .exit_batch = tcp_sk_exit_batch,
};

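/*
 * When both BPF syscalls and procfs are enabled, expose TCP sockets to BPF
 * programs through a "tcp" bpf_iter target, reusing the seq_file machinery
 * that backs /proc/net/tcp.
 */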
#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
                     struct sock_common *sk_common, uid_t uid)

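/* Per-iterator setup: allocate an AF_UNSPEC afinfo so the walk covers both
 * IPv4 and IPv6 sockets, then initialize the netns-aware seq state.
 */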
static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
{
        struct tcp_iter_state *st = priv_data;
        struct tcp_seq_afinfo *afinfo;
        int ret;

        afinfo = kmalloc(sizeof(*afinfo), GFP_USER | __GFP_NOWARN);
        if (!afinfo)
                return -ENOMEM;

        afinfo->family = AF_UNSPEC;
        st->bpf_seq_afinfo = afinfo;
        ret = bpf_iter_init_seq_net(priv_data, aux);
        if (ret)
                kfree(afinfo);
        return ret;
}

static void bpf_iter_fini_tcp(void *priv_data)
{
        struct tcp_iter_state *st = priv_data;

        kfree(st->bpf_seq_afinfo);
        bpf_iter_fini_seq_net(priv_data);
}

static const struct bpf_iter_seq_info tcp_seq_info = {
        .seq_ops                = &bpf_iter_tcp_seq_ops,
        .init_seq_private       = bpf_iter_init_tcp,
        .fini_seq_private       = bpf_iter_fini_tcp,
        .seq_priv_size          = sizeof(struct tcp_iter_state),
};

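/* Registration descriptor for the "tcp" iterator target: a single context
 * argument carrying the socket as a BTF-typed struct sock_common pointer,
 * which may be NULL.
 */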
static struct bpf_iter_reg tcp_reg_info = {
        .target                 = "tcp",
        .ctx_arg_info_size      = 1,
        .ctx_arg_info           = {
                { offsetof(struct bpf_iter__tcp, sk_common),
                  PTR_TO_BTF_ID_OR_NULL },
        },
        .seq_info               = &tcp_seq_info,
};

static void __init bpf_iter_register(void)
{
        tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
        if (bpf_iter_reg_target(&tcp_reg_info))
                pr_warn("Warning: could not register bpf iterator tcp\n");
}

#endif

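/*
 * Boot-time TCP/IPv4 setup: register the pernet operations (panicking if the
 * per-netns control sockets cannot be created) and, when configured, the BPF
 * "tcp" iterator target.
 */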
void __init tcp_v4_init(void)
{
        if (register_pernet_subsys(&tcp_sk_ops))
                panic("Failed to create the TCP control socket.\n");

#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
        bpf_iter_register();
#endif
}