tcp_ipv4.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   4  *              operating system.  INET is implemented using the  BSD Socket
   5  *              interface as the means of communication with the user level.
   6  *
   7  *              Implementation of the Transmission Control Protocol(TCP).
   8  *
   9  *              IPv4 specific functions
  10  *
  11  *              code split from:
  12  *              linux/ipv4/tcp.c
  13  *              linux/ipv4/tcp_input.c
  14  *              linux/ipv4/tcp_output.c
  15  *
  16  *              See tcp.c for author information
  17  */
  18
  19 /*
  20  * Changes:
  21  *              David S. Miller :       New socket lookup architecture.
  22  *                                      This code is dedicated to John Dyson.
  23  *              David S. Miller :       Change semantics of established hash,
  24  *                                      half is devoted to TIME_WAIT sockets
  25  *                                      and the rest go in the other half.
  26  *              Andi Kleen :            Add support for syncookies and fixed
  27  *                                      some bugs: ip options weren't passed to
  28  *                                      the TCP layer, missed a check for an
  29  *                                      ACK bit.
  30  *              Andi Kleen :            Implemented fast path mtu discovery.
  31  *                                      Fixed many serious bugs in the
  32  *                                      request_sock handling and moved
  33  *                                      most of it into the af independent code.
  34  *                                      Added tail drop and some other bugfixes.
  35  *                                      Added new listen semantics.
  36  *              Mike McLagan    :       Routing by source
  37  *      Juan Jose Ciarlante:            ip_dynaddr bits
  38  *              Andi Kleen:             various fixes.
  39  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
  40  *                                      coma.
  41  *      Andi Kleen              :       Fix new listen.
  42  *      Andi Kleen              :       Fix accept error reporting.
  43  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
  44  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
  45  *                                      a single port at the same time.
  46  */
  47
  48 #define pr_fmt(fmt) "TCP: " fmt
  49
  50 #include <linux/bottom_half.h>
  51 #include <linux/types.h>
  52 #include <linux/fcntl.h>
  53 #include <linux/module.h>
  54 #include <linux/random.h>
  55 #include <linux/cache.h>
  56 #include <linux/jhash.h>
  57 #include <linux/init.h>
  58 #include <linux/times.h>
  59 #include <linux/slab.h>
  60 #include <linux/sched.h>
  61
  62 #include <net/net_namespace.h>
  63 #include <net/icmp.h>
  64 #include <net/inet_hashtables.h>
  65 #include <net/tcp.h>
  66 #include <net/transp_v6.h>
  67 #include <net/ipv6.h>
  68 #include <net/inet_common.h>
  69 #include <net/timewait_sock.h>
  70 #include <net/xfrm.h>
  71 #include <net/secure_seq.h>
  72 #include <net/busy_poll.h>
  73
  74 #include <linux/inet.h>
  75 #include <linux/ipv6.h>
  76 #include <linux/stddef.h>
  77 #include <linux/proc_fs.h>
  78 #include <linux/seq_file.h>
  79 #include <linux/inetdevice.h>
  80 #include <linux/btf_ids.h>
  81
  82 #include <crypto/hash.h>
  83 #include <linux/scatterlist.h>
  84
  85 #include <trace/events/tcp.h>
  86
  87 #ifdef CONFIG_TCP_MD5SIG
  88 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
  89                                __be32 daddr, __be32 saddr, const struct tcphdr *th);
  90 #endif
  91
  92 struct inet_hashinfo tcp_hashinfo;
  93 EXPORT_SYMBOL(tcp_hashinfo);
  94
  95 static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);
  96
  97 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
  98 {
  99         return secure_tcp_seq(ip_hdr(skb)->daddr,
 100                               ip_hdr(skb)->saddr,
 101                               tcp_hdr(skb)->dest,
 102                               tcp_hdr(skb)->source);
 103 }
 104
 105 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
 106 {
 107         return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
 108 }
 109
 110 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
 111 {
 112         int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
 113         const struct inet_timewait_sock *tw = inet_twsk(sktw);
 114         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
 115         struct tcp_sock *tp = tcp_sk(sk);
 116
 117         if (reuse == 2) {
 118                 /* Still does not detect *everything* that goes through
 119                  * lo, since we require a loopback src or dst address
 120                  * or direct binding to 'lo' interface.
 121                  */
 122                 bool loopback = false;
 123                 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
 124                         loopback = true;
 125 #if IS_ENABLED(CONFIG_IPV6)
 126                 if (tw->tw_family == AF_INET6) {
 127                         if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
 128                             ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
 129                             ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
 130                             ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
 131                                 loopback = true;
 132                 } else
 133 #endif
 134                 {
 135                         if (ipv4_is_loopback(tw->tw_daddr) ||
 136                             ipv4_is_loopback(tw->tw_rcv_saddr))
 137                                 loopback = true;
 138                 }
 139                 if (!loopback)
 140                         reuse = 0;
 141         }
 142
 143         /* With PAWS, it is safe from the viewpoint
 144            of data integrity. Even without PAWS it is safe provided sequence
 145            spaces do not overlap i.e. at data rates <= 80Mbit/sec.
 146
 147            Actually, the idea is close to VJ's one, only timestamp cache is
 148            held not per host, but per port pair and TW bucket is used as state
 149            holder.
 150
 151            If TW bucket has been already destroyed we fall back to VJ's scheme
 152            and use initial timestamp retrieved from peer table.
 153          */
 154         if (tcptw->tw_ts_recent_stamp &&
 155             (!twp || (reuse && time_after32(ktime_get_seconds(),
 156                                             tcptw->tw_ts_recent_stamp)))) {
 157                 /* In case of repair and re-using TIME-WAIT sockets we still
 158                  * want to be sure that it is safe as above but honor the
 159                  * sequence numbers and time stamps set as part of the repair
 160                  * process.
 161                  *
 162                  * Without this check re-using a TIME-WAIT socket with TCP
 163                  * repair would accumulate a -1 on the repair assigned
 164                  * sequence number. The first time it is reused the sequence
 165                  * is -1, the second time -2, etc. This fixes that issue
 166                  * without appearing to create any others.
 167                  */
 168                 if (likely(!tp->repair)) {
 169                         u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
 170
 171                         if (!seq)
 172                                 seq = 1;
 173                         WRITE_ONCE(tp->write_seq, seq);
 174                         tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
 175                         tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
 176                 }
 177                 sock_hold(sktw);
 178                 return 1;
 179         }
 180
 181         return 0;
 182 }
 183 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
 184
 185 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
 186                               int addr_len)
 187 {
 188         /* This check is replicated from tcp_v4_connect() and intended to
 189          * prevent BPF program called below from accessing bytes that are out
 190          * of the bound specified by user in addr_len.
 191          */
 192         if (addr_len < sizeof(struct sockaddr_in))
 193                 return -EINVAL;
 194
 195         sock_owned_by_me(sk);
 196
 197         return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len);
 198 }
 199
 200 /* This will initiate an outgoing connection. */
 201 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 202 {
 203         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
 204         struct inet_timewait_death_row *tcp_death_row;
 205         struct inet_sock *inet = inet_sk(sk);
 206         struct tcp_sock *tp = tcp_sk(sk);
 207         struct ip_options_rcu *inet_opt;
 208         struct net *net = sock_net(sk);
 209         __be16 orig_sport, orig_dport;
 210         __be32 daddr, nexthop;
 211         struct flowi4 *fl4;
 212         struct rtable *rt;
 213         int err;
 214
 215         if (addr_len < sizeof(struct sockaddr_in))
 216                 return -EINVAL;
 217
 218         if (usin->sin_family != AF_INET)
 219                 return -EAFNOSUPPORT;
 220
 221         nexthop = daddr = usin->sin_addr.s_addr;
 222         inet_opt = rcu_dereference_protected(inet->inet_opt,
 223                                              lockdep_sock_is_held(sk));
 224         if (inet_opt && inet_opt->opt.srr) {
 225                 if (!daddr)
 226                         return -EINVAL;
 227                 nexthop = inet_opt->opt.faddr;
 228         }
 229
 230         orig_sport = inet->inet_sport;
 231         orig_dport = usin->sin_port;
 232         fl4 = &inet->cork.fl.u.ip4;
 233         rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
 234                               sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
 235                               orig_dport, sk);
 236         if (IS_ERR(rt)) {
 237                 err = PTR_ERR(rt);
 238                 if (err == -ENETUNREACH)
 239                         IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
 240                 return err;
 241         }
 242
 243         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
 244                 ip_rt_put(rt);
 245                 return -ENETUNREACH;
 246         }
 247
 248         if (!inet_opt || !inet_opt->opt.srr)
 249                 daddr = fl4->daddr;
 250
 251         tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
 252
 253         if (!inet->inet_saddr) {
 254                 err = inet_bhash2_update_saddr(sk,  &fl4->saddr, AF_INET);
 255                 if (err) {
 256                         ip_rt_put(rt);
 257                         return err;
 258                 }
 259         } else {
 260                 sk_rcv_saddr_set(sk, inet->inet_saddr);
 261         }
 262
 263         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
 264                 /* Reset inherited state */
 265                 tp->rx_opt.ts_recent       = 0;
 266                 tp->rx_opt.ts_recent_stamp = 0;
 267                 if (likely(!tp->repair))
 268                         WRITE_ONCE(tp->write_seq, 0);
 269         }
 270
 271         inet->inet_dport = usin->sin_port;
 272         sk_daddr_set(sk, daddr);
 273
 274         inet_csk(sk)->icsk_ext_hdr_len = 0;
 275         if (inet_opt)
 276                 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
 277
 278         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
 279
 280         /* Socket identity is still unknown (sport may be zero).
 281          * However we set state to SYN-SENT and not releasing socket
 282          * lock select source port, enter ourselves into the hash tables and
 283          * complete initialization after this.
 284          */
 285         tcp_set_state(sk, TCP_SYN_SENT);
 286         err = inet_hash_connect(tcp_death_row, sk);
 287         if (err)
 288                 goto failure;
 289
 290         sk_set_txhash(sk);
 291
 292         rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
 293                                inet->inet_sport, inet->inet_dport, sk);
 294         if (IS_ERR(rt)) {
 295                 err = PTR_ERR(rt);
 296                 rt = NULL;
 297                 goto failure;
 298         }
 299         tp->tcp_usec_ts = dst_tcp_usec_ts(&rt->dst);
 300         /* OK, now commit destination to socket.  */
 301         sk->sk_gso_type = SKB_GSO_TCPV4;
 302         sk_setup_caps(sk, &rt->dst);
 303         rt = NULL;
 304
 305         if (likely(!tp->repair)) {
 306                 if (!tp->write_seq)
 307                         WRITE_ONCE(tp->write_seq,
 308                                    secure_tcp_seq(inet->inet_saddr,
 309                                                   inet->inet_daddr,
 310                                                   inet->inet_sport,
 311                                                   usin->sin_port));
 312                 WRITE_ONCE(tp->tsoffset,
 313                            secure_tcp_ts_off(net, inet->inet_saddr,
 314                                              inet->inet_daddr));
 315         }
 316
 317         atomic_set(&inet->inet_id, get_random_u16());
 318
 319         if (tcp_fastopen_defer_connect(sk, &err))
 320                 return err;
 321         if (err)
 322                 goto failure;
 323
 324         err = tcp_connect(sk);
 325
 326         if (err)
 327                 goto failure;
 328
 329         return 0;
 330
 331 failure:
 332         /*
 333          * This unhashes the socket and releases the local port,
 334          * if necessary.
 335          */
 336         tcp_set_state(sk, TCP_CLOSE);
 337         inet_bhash2_reset_saddr(sk);
 338         ip_rt_put(rt);
 339         sk->sk_route_caps = 0;
 340         inet->inet_dport = 0;
 341         return err;
 342 }
 343 EXPORT_SYMBOL(tcp_v4_connect);
 344
 345 /*
 346  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 347  * It can be called through tcp_release_cb() if socket was owned by user
 348  * at the time tcp_v4_err() was called to handle ICMP message.
 349  */
 350 void tcp_v4_mtu_reduced(struct sock *sk)
 351 {
 352         struct inet_sock *inet = inet_sk(sk);
 353         struct dst_entry *dst;
 354         u32 mtu;
 355
 356         if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
 357                 return;
 358         mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
 359         dst = inet_csk_update_pmtu(sk, mtu);
 360         if (!dst)
 361                 return;
 362
 363         /* Something is about to be wrong... Remember soft error
 364          * for the case, if this connection will not able to recover.
 365          */
 366         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
 367                 WRITE_ONCE(sk->sk_err_soft, EMSGSIZE);
 368
 369         mtu = dst_mtu(dst);
 370
 371         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
 372             ip_sk_accept_pmtu(sk) &&
 373             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
 374                 tcp_sync_mss(sk, mtu);
 375
 376                 /* Resend the TCP packet because it's
 377                  * clear that the old packet has been
 378                  * dropped. This is the new "fast" path mtu
 379                  * discovery.
 380                  */
 381                 tcp_simple_retransmit(sk);
 382         } /* else let the usual retransmit timer handle it */
 383 }
 384 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
 385
 386 static void do_redirect(struct sk_buff *skb, struct sock *sk)
 387 {
 388         struct dst_entry *dst = __sk_dst_check(sk, 0);
 389
 390         if (dst)
 391                 dst->ops->redirect(dst, sk, skb);
 392 }
 393
 394
 395 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
 396 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
 397 {
 398         struct request_sock *req = inet_reqsk(sk);
 399         struct net *net = sock_net(sk);
 400
 401         /* ICMPs are not backlogged, hence we cannot get
 402          * an established socket here.
 403          */
 404         if (seq != tcp_rsk(req)->snt_isn) {
 405                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
 406         } else if (abort) {
 407                 /*
 408                  * Still in SYN_RECV, just remove it silently.
 409                  * There is no good way to pass the error to the newly
 410                  * created socket, and POSIX does not want network
 411                  * errors returned from accept().
 412                  */
 413                 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
 414                 tcp_listendrop(req->rsk_listener);
 415         }
 416         reqsk_put(req);
 417 }
 418 EXPORT_SYMBOL(tcp_req_err);
 419
 420 /* TCP-LD (RFC 6069) logic */
 421 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
 422 {
 423         struct inet_connection_sock *icsk = inet_csk(sk);
 424         struct tcp_sock *tp = tcp_sk(sk);
 425         struct sk_buff *skb;
 426         s32 remaining;
 427         u32 delta_us;
 428
 429         if (sock_owned_by_user(sk))
 430                 return;
 431
 432         if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
 433             !icsk->icsk_backoff)
 434                 return;
 435
 436         skb = tcp_rtx_queue_head(sk);
 437         if (WARN_ON_ONCE(!skb))
 438                 return;
 439
 440         icsk->icsk_backoff--;
 441         icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
 442         icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
 443
 444         tcp_mstamp_refresh(tp);
 445         delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
 446         remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
 447
 448         if (remaining > 0) {
 449                 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
 450                                           remaining, TCP_RTO_MAX);
 451         } else {
 452                 /* RTO revert clocked out retransmission.
 453                  * Will retransmit now.
 454                  */
 455                 tcp_retransmit_timer(sk);
 456         }
 457 }
 458 EXPORT_SYMBOL(tcp_ld_RTO_revert);
 459
 460 /*
 461  * This routine is called by the ICMP module when it gets some
 462  * sort of error condition.  If err < 0 then the socket should
 463  * be closed and the error returned to the user.  If err > 0
 464  * it's just the icmp type << 8 | icmp code.  After adjustment
 465  * header points to the first 8 bytes of the tcp header.  We need
 466  * to find the appropriate port.
 467  *
 468  * The locking strategy used here is very "optimistic". When
 469  * someone else accesses the socket the ICMP is just dropped
 470  * and for some paths there is no check at all.
 471  * A more general error queue to queue errors for later handling
 472  * is probably better.
 473  *
 474  */
 475
 476 int tcp_v4_err(struct sk_buff *skb, u32 info)
 477 {
 478         const struct iphdr *iph = (const struct iphdr *)skb->data;
 479         struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
 480         struct tcp_sock *tp;
 481         const int type = icmp_hdr(skb)->type;
 482         const int code = icmp_hdr(skb)->code;
 483         struct sock *sk;
 484         struct request_sock *fastopen;
 485         u32 seq, snd_una;
 486         int err;
 487         struct net *net = dev_net(skb->dev);
 488
 489         sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
 490                                        iph->daddr, th->dest, iph->saddr,
 491                                        ntohs(th->source), inet_iif(skb), 0);
 492         if (!sk) {
 493                 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
 494                 return -ENOENT;
 495         }
 496         if (sk->sk_state == TCP_TIME_WAIT) {
 497                 /* To increase the counter of ignored icmps for TCP-AO */
 498                 tcp_ao_ignore_icmp(sk, AF_INET, type, code);
 499                 inet_twsk_put(inet_twsk(sk));
 500                 return 0;
 501         }
 502         seq = ntohl(th->seq);
 503         if (sk->sk_state == TCP_NEW_SYN_RECV) {
 504                 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
 505                                      type == ICMP_TIME_EXCEEDED ||
 506                                      (type == ICMP_DEST_UNREACH &&
 507                                       (code == ICMP_NET_UNREACH ||
 508                                        code == ICMP_HOST_UNREACH)));
 509                 return 0;
 510         }
 511
 512         if (tcp_ao_ignore_icmp(sk, AF_INET, type, code)) {
 513                 sock_put(sk);
 514                 return 0;
 515         }
 516
 517         bh_lock_sock(sk);
 518         /* If too many ICMPs get dropped on busy
 519          * servers this needs to be solved differently.
 520          * We do take care of PMTU discovery (RFC1191) special case :
 521          * we can receive locally generated ICMP messages while socket is held.
 522          */
 523         if (sock_owned_by_user(sk)) {
 524                 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
 525                         __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
 526         }
 527         if (sk->sk_state == TCP_CLOSE)
 528                 goto out;
 529
 530         if (static_branch_unlikely(&ip4_min_ttl)) {
 531                 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
 532                 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
 533                         __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
 534                         goto out;
 535                 }
 536         }
 537
 538         tp = tcp_sk(sk);
 539         /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
 540         fastopen = rcu_dereference(tp->fastopen_rsk);
 541         snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
 542         if (sk->sk_state != TCP_LISTEN &&
 543             !between(seq, snd_una, tp->snd_nxt)) {
 544                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
 545                 goto out;
 546         }
 547
 548         switch (type) {
 549         case ICMP_REDIRECT:
 550                 if (!sock_owned_by_user(sk))
 551                         do_redirect(skb, sk);
 552                 goto out;
 553         case ICMP_SOURCE_QUENCH:
 554                 /* Just silently ignore these. */
 555                 goto out;
 556         case ICMP_PARAMETERPROB:
 557                 err = EPROTO;
 558                 break;
 559         case ICMP_DEST_UNREACH:
 560                 if (code > NR_ICMP_UNREACH)
 561                         goto out;
 562
 563                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
 564                         /* We are not interested in TCP_LISTEN and open_requests
 565                          * (SYN-ACKs send out by Linux are always <576bytes so
 566                          * they should go through unfragmented).
 567                          */
 568                         if (sk->sk_state == TCP_LISTEN)
 569                                 goto out;
 570
 571                         WRITE_ONCE(tp->mtu_info, info);
 572                         if (!sock_owned_by_user(sk)) {
 573                                 tcp_v4_mtu_reduced(sk);
 574                         } else {
 575                                 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
 576                                         sock_hold(sk);
 577                         }
 578                         goto out;
 579                 }
 580
 581                 err = icmp_err_convert[code].errno;
 582                 /* check if this ICMP message allows revert of backoff.
 583                  * (see RFC 6069)
 584                  */
 585                 if (!fastopen &&
 586                     (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
 587                         tcp_ld_RTO_revert(sk, seq);
 588                 break;
 589         case ICMP_TIME_EXCEEDED:
 590                 err = EHOSTUNREACH;
 591                 break;
 592         default:
 593                 goto out;
 594         }
 595
 596         switch (sk->sk_state) {
 597         case TCP_SYN_SENT:
 598         case TCP_SYN_RECV:
 599                 /* Only in fast or simultaneous open. If a fast open socket is
 600                  * already accepted it is treated as a connected one below.
 601                  */
 602                 if (fastopen && !fastopen->sk)
 603                         break;
 604
 605                 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
 606
 607                 if (!sock_owned_by_user(sk)) {
 608                         WRITE_ONCE(sk->sk_err, err);
 609
 610                         sk_error_report(sk);
 611
 612                         tcp_done(sk);
 613                 } else {
 614                         WRITE_ONCE(sk->sk_err_soft, err);
 615                 }
 616                 goto out;
 617         }
 618
 619         /* If we've already connected we will keep trying
 620          * until we time out, or the user gives up.
 621          *
 622          * rfc1122 4.2.3.9 allows to consider as hard errors
 623          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
 624          * but it is obsoleted by pmtu discovery).
 625          *
 626          * Note, that in modern internet, where routing is unreliable
 627          * and in each dark corner broken firewalls sit, sending random
 628          * errors ordered by their masters even this two messages finally lose
 629          * their original sense (even Linux sends invalid PORT_UNREACHs)
 630          *
 631          * Now we are in compliance with RFCs.
 632          *                                                      --ANK (980905)
 633          */
 634
 635         if (!sock_owned_by_user(sk) &&
 636             inet_test_bit(RECVERR, sk)) {
 637                 WRITE_ONCE(sk->sk_err, err);
 638                 sk_error_report(sk);
 639         } else  { /* Only an error on timeout */
 640                 WRITE_ONCE(sk->sk_err_soft, err);
 641         }
 642
 643 out:
 644         bh_unlock_sock(sk);
 645         sock_put(sk);
 646         return 0;
 647 }
 648
 649 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
 650 {
 651         struct tcphdr *th = tcp_hdr(skb);
 652
 653         th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
 654         skb->csum_start = skb_transport_header(skb) - skb->head;
 655         skb->csum_offset = offsetof(struct tcphdr, check);
 656 }
 657
 658 /* This routine computes an IPv4 TCP checksum. */
 659 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
 660 {
 661         const struct inet_sock *inet = inet_sk(sk);
 662
 663         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
 664 }
 665 EXPORT_SYMBOL(tcp_v4_send_check);
 666
 667 #define REPLY_OPTIONS_LEN      (MAX_TCP_OPTION_SPACE / sizeof(__be32))
 668
 669 static bool tcp_v4_ao_sign_reset(const struct sock *sk, struct sk_buff *skb,
 670                                  const struct tcp_ao_hdr *aoh,
 671                                  struct ip_reply_arg *arg, struct tcphdr *reply,
 672                                  __be32 reply_options[REPLY_OPTIONS_LEN])
 673 {
 674 #ifdef CONFIG_TCP_AO
 675         int sdif = tcp_v4_sdif(skb);
 676         int dif = inet_iif(skb);
 677         int l3index = sdif ? dif : 0;
 678         bool allocated_traffic_key;
 679         struct tcp_ao_key *key;
 680         char *traffic_key;
 681         bool drop = true;
 682         u32 ao_sne = 0;
 683         u8 keyid;
 684
 685         rcu_read_lock();
 686         if (tcp_ao_prepare_reset(sk, skb, aoh, l3index, ntohl(reply->seq),
 687                                  &key, &traffic_key, &allocated_traffic_key,
 688                                  &keyid, &ao_sne))
 689                 goto out;
 690
 691         reply_options[0] = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key) << 16) |
 692                                  (aoh->rnext_keyid << 8) | keyid);
 693         arg->iov[0].iov_len += tcp_ao_len_aligned(key);
 694         reply->doff = arg->iov[0].iov_len / 4;
 695
 696         if (tcp_ao_hash_hdr(AF_INET, (char *)&reply_options[1],
 697                             key, traffic_key,
 698                             (union tcp_ao_addr *)&ip_hdr(skb)->saddr,
 699                             (union tcp_ao_addr *)&ip_hdr(skb)->daddr,
 700                             reply, ao_sne))
 701                 goto out;
 702         drop = false;
 703 out:
 704         rcu_read_unlock();
 705         if (allocated_traffic_key)
 706                 kfree(traffic_key);
 707         return drop;
 708 #else
 709         return true;
 710 #endif
 711 }
 712
 713 /*
 714  *      This routine will send an RST to the other tcp.
 715  *
 716  *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 717  *                    for reset.
 718  *      Answer: if a packet caused RST, it is not for a socket
 719  *              existing in our system, if it is matched to a socket,
 720  *              it is just duplicate segment or bug in other side's TCP.
 721  *              So that we build reply only basing on parameters
 722  *              arrived with segment.
 723  *      Exception: precedence violation. We do not implement it in any case.
 724  */
 725
 726 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
 727 {
 728         const struct tcphdr *th = tcp_hdr(skb);
 729         struct {
 730                 struct tcphdr th;
 731                 __be32 opt[REPLY_OPTIONS_LEN];
 732         } rep;
 733         const __u8 *md5_hash_location = NULL;
 734         const struct tcp_ao_hdr *aoh;
 735         struct ip_reply_arg arg;
 736 #ifdef CONFIG_TCP_MD5SIG
 737         struct tcp_md5sig_key *key = NULL;
 738         unsigned char newhash[16];
 739         struct sock *sk1 = NULL;
 740         int genhash;
 741 #endif
 742         u64 transmit_time = 0;
 743         struct sock *ctl_sk;
 744         struct net *net;
 745         u32 txhash = 0;
 746
 747         /* Never send a reset in response to a reset. */
 748         if (th->rst)
 749                 return;
 750
 751         /* If sk not NULL, it means we did a successful lookup and incoming
 752          * route had to be correct. prequeue might have dropped our dst.
 753          */
 754         if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
 755                 return;
 756
 757         /* Swap the send and the receive. */
 758         memset(&rep, 0, sizeof(rep));
 759         rep.th.dest   = th->source;
 760         rep.th.source = th->dest;
 761         rep.th.doff   = sizeof(struct tcphdr) / 4;
 762         rep.th.rst    = 1;
 763
 764         if (th->ack) {
 765                 rep.th.seq = th->ack_seq;
 766         } else {
 767                 rep.th.ack = 1;
 768                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
 769                                        skb->len - (th->doff << 2));
 770         }
 771
 772         memset(&arg, 0, sizeof(arg));
 773         arg.iov[0].iov_base = (unsigned char *)&rep;
 774         arg.iov[0].iov_len  = sizeof(rep.th);
 775
 776         net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
 777
 778         /* Invalid TCP option size or twice included auth */
 779         if (tcp_parse_auth_options(tcp_hdr(skb), &md5_hash_location, &aoh))
 780                 return;
 781
 782         if (aoh && tcp_v4_ao_sign_reset(sk, skb, aoh, &arg, &rep.th, rep.opt))
 783                 return;
 784
 785 #ifdef CONFIG_TCP_MD5SIG
 786         rcu_read_lock();
 787         if (sk && sk_fullsock(sk)) {
 788                 const union tcp_md5_addr *addr;
 789                 int l3index;
 790
 791                 /* sdif set, means packet ingressed via a device
 792                  * in an L3 domain and inet_iif is set to it.
 793                  */
 794                 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
 795                 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
 796                 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
 797         } else if (md5_hash_location) {
 798                 const union tcp_md5_addr *addr;
 799                 int sdif = tcp_v4_sdif(skb);
 800                 int dif = inet_iif(skb);
 801                 int l3index;
 802
 803                 /*
 804                  * active side is lost. Try to find listening socket through
 805                  * source port, and then find md5 key through listening socket.
 806                  * we are not loose security here:
 807                  * Incoming packet is checked with md5 hash with finding key,
 808                  * no RST generated if md5 hash doesn't match.
 809                  */
 810                 sk1 = __inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo,
 811                                              NULL, 0, ip_hdr(skb)->saddr,
 812                                              th->source, ip_hdr(skb)->daddr,
 813                                              ntohs(th->source), dif, sdif);
 814                 /* don't send rst if it can't find key */
 815                 if (!sk1)
 816                         goto out;
 817
 818                 /* sdif set, means packet ingressed via a device
 819                  * in an L3 domain and dif is set to it.
 820                  */
 821                 l3index = sdif ? dif : 0;
 822                 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
 823                 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
 824                 if (!key)
 825                         goto out;
 826
 827
 828                 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
 829                 if (genhash || memcmp(md5_hash_location, newhash, 16) != 0)
 830                         goto out;
 831
 832         }
 833
 834         if (key) {
 835                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
 836                                    (TCPOPT_NOP << 16) |
 837                                    (TCPOPT_MD5SIG << 8) |
 838                                    TCPOLEN_MD5SIG);
 839                 /* Update length and the length the header thinks exists */
 840                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 841                 rep.th.doff = arg.iov[0].iov_len / 4;
 842
 843                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
 844                                      key, ip_hdr(skb)->saddr,
 845                                      ip_hdr(skb)->daddr, &rep.th);
 846         }
 847 #endif
 848         /* Can't co-exist with TCPMD5, hence check rep.opt[0] */
 849         if (rep.opt[0] == 0) {
 850                 __be32 mrst = mptcp_reset_option(skb);
 851
 852                 if (mrst) {
 853                         rep.opt[0] = mrst;
 854                         arg.iov[0].iov_len += sizeof(mrst);
 855                         rep.th.doff = arg.iov[0].iov_len / 4;
 856                 }
 857         }
 858
 859         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 860                                       ip_hdr(skb)->saddr, /* XXX */
 861                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
 862         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 863         arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
 864
 865         /* When socket is gone, all binding information is lost.
 866          * routing might fail in this case. No choice here, if we choose to force
 867          * input interface, we will misroute in case of asymmetric route.
 868          */
 869         if (sk) {
 870                 arg.bound_dev_if = sk->sk_bound_dev_if;
 871                 if (sk_fullsock(sk))
 872                         trace_tcp_send_reset(sk, skb);
 873         }
 874
 875         BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
 876                      offsetof(struct inet_timewait_sock, tw_bound_dev_if));
 877
 878         arg.tos = ip_hdr(skb)->tos;
 879         arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
 880         local_bh_disable();
 881         ctl_sk = this_cpu_read(ipv4_tcp_sk);
 882         sock_net_set(ctl_sk, net);
 883         if (sk) {
 884                 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
 885                                    inet_twsk(sk)->tw_mark : sk->sk_mark;
 886                 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
 887                                    inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
 888                 transmit_time = tcp_transmit_time(sk);
 889                 xfrm_sk_clone_policy(ctl_sk, sk);
 890                 txhash = (sk->sk_state == TCP_TIME_WAIT) ?
 891                          inet_twsk(sk)->tw_txhash : sk->sk_txhash;
 892         } else {
 893                 ctl_sk->sk_mark = 0;
 894                 ctl_sk->sk_priority = 0;
 895         }
 896         ip_send_unicast_reply(ctl_sk,
 897                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
 898                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
 899                               &arg, arg.iov[0].iov_len,
 900                               transmit_time, txhash);
 901
 902         xfrm_sk_free_policy(ctl_sk);
 903         sock_net_set(ctl_sk, &init_net);
 904         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
 905         __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
 906         local_bh_enable();
 907
 908 #ifdef CONFIG_TCP_MD5SIG
 909 out:
 910         rcu_read_unlock();
 911 #endif
 912 }
 913
 914 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
 915    outside socket context is ugly, certainly. What can I do?
 916  */
 917
 918 static void tcp_v4_send_ack(const struct sock *sk,
 919                             struct sk_buff *skb, u32 seq, u32 ack,
 920                             u32 win, u32 tsval, u32 tsecr, int oif,
 921                             struct tcp_key *key,
 922                             int reply_flags, u8 tos, u32 txhash)
 923 {
 924         const struct tcphdr *th = tcp_hdr(skb);
 925         struct {
 926                 struct tcphdr th;
 927                 __be32 opt[(MAX_TCP_OPTION_SPACE  >> 2)];
 928         } rep;
 929         struct net *net = sock_net(sk);
 930         struct ip_reply_arg arg;
 931         struct sock *ctl_sk;
 932         u64 transmit_time;
 933
 934         memset(&rep.th, 0, sizeof(struct tcphdr));
 935         memset(&arg, 0, sizeof(arg));
 936
 937         arg.iov[0].iov_base = (unsigned char *)&rep;
 938         arg.iov[0].iov_len  = sizeof(rep.th);
 939         if (tsecr) {
 940                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
 941                                    (TCPOPT_TIMESTAMP << 8) |
 942                                    TCPOLEN_TIMESTAMP);
 943                 rep.opt[1] = htonl(tsval);
 944                 rep.opt[2] = htonl(tsecr);
 945                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
 946         }
 947
 948         /* Swap the send and the receive. */
 949         rep.th.dest    = th->source;
 950         rep.th.source  = th->dest;
 951         rep.th.doff    = arg.iov[0].iov_len / 4;
 952         rep.th.seq     = htonl(seq);
 953         rep.th.ack_seq = htonl(ack);
 954         rep.th.ack     = 1;
 955         rep.th.window  = htons(win);
 956
 957 #ifdef CONFIG_TCP_MD5SIG
 958         if (tcp_key_is_md5(key)) {
 959                 int offset = (tsecr) ? 3 : 0;
 960
 961                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
 962                                           (TCPOPT_NOP << 16) |
 963                                           (TCPOPT_MD5SIG << 8) |
 964                                           TCPOLEN_MD5SIG);
 965                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 966                 rep.th.doff = arg.iov[0].iov_len/4;
 967
 968                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
 969                                     key->md5_key, ip_hdr(skb)->saddr,
 970                                     ip_hdr(skb)->daddr, &rep.th);
 971         }
 972 #endif
 973 #ifdef CONFIG_TCP_AO
 974         if (tcp_key_is_ao(key)) {
 975                 int offset = (tsecr) ? 3 : 0;
 976
 977                 rep.opt[offset++] = htonl((TCPOPT_AO << 24) |
 978                                           (tcp_ao_len(key->ao_key) << 16) |
 979                                           (key->ao_key->sndid << 8) |
 980                                           key->rcv_next);
 981                 arg.iov[0].iov_len += tcp_ao_len_aligned(key->ao_key);
 982                 rep.th.doff = arg.iov[0].iov_len / 4;
 983
 984                 tcp_ao_hash_hdr(AF_INET, (char *)&rep.opt[offset],
 985                                 key->ao_key, key->traffic_key,
 986                                 (union tcp_ao_addr *)&ip_hdr(skb)->saddr,
 987                                 (union tcp_ao_addr *)&ip_hdr(skb)->daddr,
 988                                 &rep.th, key->sne);
 989         }
 990 #endif
 991         arg.flags = reply_flags;
 992         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 993                                       ip_hdr(skb)->saddr, /* XXX */
 994                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
 995         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 996         if (oif)
 997                 arg.bound_dev_if = oif;
 998         arg.tos = tos;
 999         arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
1000         local_bh_disable();
1001         ctl_sk = this_cpu_read(ipv4_tcp_sk);
1002         sock_net_set(ctl_sk, net);
1003         ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
1004                            inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
1005         ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
1006                            inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
1007         transmit_time = tcp_transmit_time(sk);
1008         ip_send_unicast_reply(ctl_sk,
1009                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
1010                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
1011                               &arg, arg.iov[0].iov_len,
1012                               transmit_time, txhash);
1013
1014         sock_net_set(ctl_sk, &init_net);
1015         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
1016         local_bh_enable();
1017 }
1018
1019 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
1020 {
1021         struct inet_timewait_sock *tw = inet_twsk(sk);
1022         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
1023         struct tcp_key key = {};
1024 #ifdef CONFIG_TCP_AO
1025         struct tcp_ao_info *ao_info;
1026
1027         if (static_branch_unlikely(&tcp_ao_needed.key)) {
1028                 /* FIXME: the segment to-be-acked is not verified yet */
1029                 ao_info = rcu_dereference(tcptw->ao_info);
1030                 if (ao_info) {
1031                         const struct tcp_ao_hdr *aoh;
1032
1033                         if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) {
1034                                 inet_twsk_put(tw);
1035                                 return;
1036                         }
1037
1038                         if (aoh)
1039                                 key.ao_key = tcp_ao_established_key(ao_info, aoh->rnext_keyid, -1);
1040                 }
1041         }
1042         if (key.ao_key) {
1043                 struct tcp_ao_key *rnext_key;
1044
1045                 key.traffic_key = snd_other_key(key.ao_key);
1046                 key.sne = READ_ONCE(ao_info->snd_sne);
1047                 rnext_key = READ_ONCE(ao_info->rnext_key);
1048                 key.rcv_next = rnext_key->rcvid;
1049                 key.type = TCP_KEY_AO;
1050 #else
1051         if (0) {
1052 #endif
1053 #ifdef CONFIG_TCP_MD5SIG
1054         } else if (static_branch_unlikely(&tcp_md5_needed.key)) {
1055                 key.md5_key = tcp_twsk_md5_key(tcptw);
1056                 if (key.md5_key)
1057                         key.type = TCP_KEY_MD5;
1058 #endif
1059         }
1060
1061         tcp_v4_send_ack(sk, skb,
1062                         tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
1063                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
1064                         tcp_tw_tsval(tcptw),
1065                         tcptw->tw_ts_recent,
1066                         tw->tw_bound_dev_if, &key,
1067                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
1068                         tw->tw_tos,
1069                         tw->tw_txhash);
1070
1071         inet_twsk_put(tw);
1072 }
1073
1074 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
1075                                   struct request_sock *req)
1076 {
1077         struct tcp_key key = {};
1078
1079         /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
1080          * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
1081          */
1082         u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
1083                                              tcp_sk(sk)->snd_nxt;
1084
1085 #ifdef CONFIG_TCP_AO
1086         if (static_branch_unlikely(&tcp_ao_needed.key) &&
1087             tcp_rsk_used_ao(req)) {
1088                 const union tcp_md5_addr *addr;
1089                 const struct tcp_ao_hdr *aoh;
1090                 int l3index;
1091
1092                 /* Invalid TCP option size or twice included auth */
1093                 if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh))
1094                         return;
1095                 if (!aoh)
1096                         return;
1097
1098                 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
1099                 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
1100                 key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET,
1101                                               aoh->rnext_keyid, -1);
1102                 if (unlikely(!key.ao_key)) {
1103                         /* Send ACK with any matching MKT for the peer */
1104                         key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, -1, -1);
1105                         /* Matching key disappeared (user removed the key?)
1106                          * let the handshake timeout.
1107                          */
1108                         if (!key.ao_key) {
1109                                 net_info_ratelimited("TCP-AO key for (%pI4, %d)->(%pI4, %d) suddenly disappeared, won't ACK new connection\n",
1110                                                      addr,
1111                                                      ntohs(tcp_hdr(skb)->source),
1112                                                      &ip_hdr(skb)->daddr,
1113                                                      ntohs(tcp_hdr(skb)->dest));
1114                                 return;
1115                         }
1116                 }
1117                 key.traffic_key = kmalloc(tcp_ao_digest_size(key.ao_key), GFP_ATOMIC);
1118                 if (!key.traffic_key)
1119                         return;
1120
1121                 key.type = TCP_KEY_AO;
1122                 key.rcv_next = aoh->keyid;
1123                 tcp_v4_ao_calc_key_rsk(key.ao_key, key.traffic_key, req);
1124 #else
1125         if (0) {
1126 #endif
1127 #ifdef CONFIG_TCP_MD5SIG
1128         } else if (static_branch_unlikely(&tcp_md5_needed.key)) {
1129                 const union tcp_md5_addr *addr;
1130                 int l3index;
1131
1132                 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
1133                 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
1134                 key.md5_key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1135                 if (key.md5_key)
1136                         key.type = TCP_KEY_MD5;
1137 #endif
1138         }
1139
1140         /* RFC 7323 2.3
1141          * The window field (SEG.WND) of every outgoing segment, with the
1142          * exception of <SYN> segments, MUST be right-shifted by
1143          * Rcv.Wind.Shift bits:
1144          */
1145         tcp_v4_send_ack(sk, skb, seq,
1146                         tcp_rsk(req)->rcv_nxt,
1147                         req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
1148                         tcp_rsk_tsval(tcp_rsk(req)),
1149                         READ_ONCE(req->ts_recent),
1150                         0, &key,
1151                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
1152                         ip_hdr(skb)->tos,
1153                         READ_ONCE(tcp_rsk(req)->txhash));
1154         if (tcp_key_is_ao(&key))
1155                 kfree(key.traffic_key);
1156 }
1157
1158 /*
1159  *      Send a SYN-ACK after having received a SYN.
1160  *      This still operates on a request_sock only, not on a big
1161  *      socket.
1162  */
1163 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
1164                               struct flowi *fl,
1165                               struct request_sock *req,
1166                               struct tcp_fastopen_cookie *foc,
1167                               enum tcp_synack_type synack_type,
1168                               struct sk_buff *syn_skb)
1169 {
1170         const struct inet_request_sock *ireq = inet_rsk(req);
1171         struct flowi4 fl4;
1172         int err = -1;
1173         struct sk_buff *skb;
1174         u8 tos;
1175
1176         /* First, grab a route. */
1177         if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1178                 return -1;
1179
1180         skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1181
1182         if (skb) {
1183                 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1184
1185                 tos = READ_ONCE(inet_sk(sk)->tos);
1186
1187                 if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1188                         tos = (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1189                               (tos & INET_ECN_MASK);
1190
1191                 if (!INET_ECN_is_capable(tos) &&
1192                     tcp_bpf_ca_needs_ecn((struct sock *)req))
1193                         tos |= INET_ECN_ECT_0;
1194
1195                 rcu_read_lock();
1196                 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1197                                             ireq->ir_rmt_addr,
1198                                             rcu_dereference(ireq->ireq_opt),
1199                                             tos);
1200                 rcu_read_unlock();
1201                 err = net_xmit_eval(err);
1202         }
1203
1204         return err;
1205 }
1206
1207 /*
1208  *      IPv4 request_sock destructor.
1209  */
1210 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1211 {
1212         kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1213 }
1214
1215 #ifdef CONFIG_TCP_MD5SIG
1216 /*
1217  * RFC2385 MD5 checksumming requires a mapping of
1218  * IP address->MD5 Key.
1219  * We need to maintain these in the sk structure.
1220  */
1221
1222 DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ);
1223 EXPORT_SYMBOL(tcp_md5_needed);
1224
1225 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1226 {
1227         if (!old)
1228                 return true;
1229
1230         /* l3index always overrides non-l3index */
1231         if (old->l3index && new->l3index == 0)
1232                 return false;
1233         if (old->l3index == 0 && new->l3index)
1234                 return true;
1235
1236         return old->prefixlen < new->prefixlen;
1237 }
1238
1239 /* Find the Key structure for an address.  */
1240 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1241                                            const union tcp_md5_addr *addr,
1242                                            int family, bool any_l3index)
1243 {
1244         const struct tcp_sock *tp = tcp_sk(sk);
1245         struct tcp_md5sig_key *key;
1246         const struct tcp_md5sig_info *md5sig;
1247         __be32 mask;
1248         struct tcp_md5sig_key *best_match = NULL;
1249         bool match;
1250
1251         /* caller either holds rcu_read_lock() or socket lock */
1252         md5sig = rcu_dereference_check(tp->md5sig_info,
1253                                        lockdep_sock_is_held(sk));
1254         if (!md5sig)
1255                 return NULL;
1256
1257         hlist_for_each_entry_rcu(key, &md5sig->head, node,
1258                                  lockdep_sock_is_held(sk)) {
1259                 if (key->family != family)
1260                         continue;
1261                 if (!any_l3index && key->flags & TCP_MD5SIG_FLAG_IFINDEX &&
1262                     key->l3index != l3index)
1263                         continue;
1264                 if (family == AF_INET) {
1265                         mask = inet_make_mask(key->prefixlen);
1266                         match = (key->addr.a4.s_addr & mask) ==
1267                                 (addr->a4.s_addr & mask);
1268 #if IS_ENABLED(CONFIG_IPV6)
1269                 } else if (family == AF_INET6) {
1270                         match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1271                                                   key->prefixlen);
1272 #endif
1273                 } else {
1274                         match = false;
1275                 }
1276
1277                 if (match && better_md5_match(best_match, key))
1278                         best_match = key;
1279         }
1280         return best_match;
1281 }
1282 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1283
1284 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1285                                                       const union tcp_md5_addr *addr,
1286                                                       int family, u8 prefixlen,
1287                                                       int l3index, u8 flags)
1288 {
1289         const struct tcp_sock *tp = tcp_sk(sk);
1290         struct tcp_md5sig_key *key;
1291         unsigned int size = sizeof(struct in_addr);
1292         const struct tcp_md5sig_info *md5sig;
1293
1294         /* caller either holds rcu_read_lock() or socket lock */
1295         md5sig = rcu_dereference_check(tp->md5sig_info,
1296                                        lockdep_sock_is_held(sk));
1297         if (!md5sig)
1298                 return NULL;
1299 #if IS_ENABLED(CONFIG_IPV6)
1300         if (family == AF_INET6)
1301                 size = sizeof(struct in6_addr);
1302 #endif
1303         hlist_for_each_entry_rcu(key, &md5sig->head, node,
1304                                  lockdep_sock_is_held(sk)) {
1305                 if (key->family != family)
1306                         continue;
1307                 if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1308                         continue;
1309                 if (key->l3index != l3index)
1310                         continue;
1311                 if (!memcmp(&key->addr, addr, size) &&
1312                     key->prefixlen == prefixlen)
1313                         return key;
1314         }
1315         return NULL;
1316 }
1317
1318 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1319                                          const struct sock *addr_sk)
1320 {
1321         const union tcp_md5_addr *addr;
1322         int l3index;
1323
1324         l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1325                                                  addr_sk->sk_bound_dev_if);
1326         addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1327         return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1328 }
1329 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1330
1331 static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp)
1332 {
1333         struct tcp_sock *tp = tcp_sk(sk);
1334         struct tcp_md5sig_info *md5sig;
1335
1336         md5sig = kmalloc(sizeof(*md5sig), gfp);
1337         if (!md5sig)
1338                 return -ENOMEM;
1339
1340         sk_gso_disable(sk);
1341         INIT_HLIST_HEAD(&md5sig->head);
1342         rcu_assign_pointer(tp->md5sig_info, md5sig);
1343         return 0;
1344 }
1345
1346 /* This can be called on a newly created socket, from other files */
1347 static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1348                             int family, u8 prefixlen, int l3index, u8 flags,
1349                             const u8 *newkey, u8 newkeylen, gfp_t gfp)
1350 {
1351         /* Add Key to the list */
1352         struct tcp_md5sig_key *key;
1353         struct tcp_sock *tp = tcp_sk(sk);
1354         struct tcp_md5sig_info *md5sig;
1355
1356         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1357         if (key) {
1358                 /* Pre-existing entry - just update that one.
1359                  * Note that the key might be used concurrently.
1360                  * data_race() is telling kcsan that we do not care of
1361                  * key mismatches, since changing MD5 key on live flows
1362                  * can lead to packet drops.
1363                  */
1364                 data_race(memcpy(key->key, newkey, newkeylen));
1365
1366                 /* Pairs with READ_ONCE() in tcp_md5_hash_key().
1367                  * Also note that a reader could catch new key->keylen value
1368                  * but old key->key[], this is the reason we use __GFP_ZERO
1369                  * at sock_kmalloc() time below these lines.
1370                  */
1371                 WRITE_ONCE(key->keylen, newkeylen);
1372
1373                 return 0;
1374         }
1375
1376         md5sig = rcu_dereference_protected(tp->md5sig_info,
1377                                            lockdep_sock_is_held(sk));
1378
1379         key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1380         if (!key)
1381                 return -ENOMEM;
1382
1383         memcpy(key->key, newkey, newkeylen);
1384         key->keylen = newkeylen;
1385         key->family = family;
1386         key->prefixlen = prefixlen;
1387         key->l3index = l3index;
1388         key->flags = flags;
1389         memcpy(&key->addr, addr,
1390                (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
1391                                                                  sizeof(struct in_addr));
1392         hlist_add_head_rcu(&key->node, &md5sig->head);
1393         return 0;
1394 }
1395
1396 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1397                    int family, u8 prefixlen, int l3index, u8 flags,
1398                    const u8 *newkey, u8 newkeylen)
1399 {
1400         struct tcp_sock *tp = tcp_sk(sk);
1401
1402         if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1403                 if (tcp_md5_alloc_sigpool())
1404                         return -ENOMEM;
1405
1406                 if (tcp_md5sig_info_add(sk, GFP_KERNEL)) {
1407                         tcp_md5_release_sigpool();
1408                         return -ENOMEM;
1409                 }
1410
1411                 if (!static_branch_inc(&tcp_md5_needed.key)) {
1412                         struct tcp_md5sig_info *md5sig;
1413
1414                         md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1415                         rcu_assign_pointer(tp->md5sig_info, NULL);
1416                         kfree_rcu(md5sig, rcu);
1417                         tcp_md5_release_sigpool();
1418                         return -EUSERS;
1419                 }
1420         }
1421
1422         return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags,
1423                                 newkey, newkeylen, GFP_KERNEL);
1424 }
1425 EXPORT_SYMBOL(tcp_md5_do_add);
1426
1427 int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
1428                      int family, u8 prefixlen, int l3index,
1429                      struct tcp_md5sig_key *key)
1430 {
1431         struct tcp_sock *tp = tcp_sk(sk);
1432
1433         if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1434                 tcp_md5_add_sigpool();
1435
1436                 if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC))) {
1437                         tcp_md5_release_sigpool();
1438                         return -ENOMEM;
1439                 }
1440
1441                 if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) {
1442                         struct tcp_md5sig_info *md5sig;
1443
1444                         md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1445                         net_warn_ratelimited("Too many TCP-MD5 keys in the system\n");
1446                         rcu_assign_pointer(tp->md5sig_info, NULL);
1447                         kfree_rcu(md5sig, rcu);
1448                         tcp_md5_release_sigpool();
1449                         return -EUSERS;
1450                 }
1451         }
1452
1453         return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index,
1454                                 key->flags, key->key, key->keylen,
1455                                 sk_gfp_mask(sk, GFP_ATOMIC));
1456 }
1457 EXPORT_SYMBOL(tcp_md5_key_copy);
1458
1459 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1460                    u8 prefixlen, int l3index, u8 flags)
1461 {
1462         struct tcp_md5sig_key *key;
1463
1464         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1465         if (!key)
1466                 return -ENOENT;
1467         hlist_del_rcu(&key->node);
1468         atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1469         kfree_rcu(key, rcu);
1470         return 0;
1471 }
1472 EXPORT_SYMBOL(tcp_md5_do_del);
1473
1474 void tcp_clear_md5_list(struct sock *sk)
1475 {
1476         struct tcp_sock *tp = tcp_sk(sk);
1477         struct tcp_md5sig_key *key;
1478         struct hlist_node *n;
1479         struct tcp_md5sig_info *md5sig;
1480
1481         md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1482
1483         hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1484                 hlist_del_rcu(&key->node);
1485                 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1486                 kfree_rcu(key, rcu);
1487         }
1488 }
1489
1490 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1491                                  sockptr_t optval, int optlen)
1492 {
1493         struct tcp_md5sig cmd;
1494         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1495         const union tcp_md5_addr *addr;
1496         u8 prefixlen = 32;
1497         int l3index = 0;
1498         bool l3flag;
1499         u8 flags;
1500
1501         if (optlen < sizeof(cmd))
1502                 return -EINVAL;
1503
1504         if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1505                 return -EFAULT;
1506
1507         if (sin->sin_family != AF_INET)
1508                 return -EINVAL;
1509
1510         flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1511         l3flag = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1512
1513         if (optname == TCP_MD5SIG_EXT &&
1514             cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1515                 prefixlen = cmd.tcpm_prefixlen;
1516                 if (prefixlen > 32)
1517                         return -EINVAL;
1518         }
1519
1520         if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1521             cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1522                 struct net_device *dev;
1523
1524                 rcu_read_lock();
1525                 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1526                 if (dev && netif_is_l3_master(dev))
1527                         l3index = dev->ifindex;
1528
1529                 rcu_read_unlock();
1530
1531                 /* ok to reference set/not set outside of rcu;
1532                  * right now device MUST be an L3 master
1533                  */
1534                 if (!dev || !l3index)
1535                         return -EINVAL;
1536         }
1537
1538         addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1539
1540         if (!cmd.tcpm_keylen)
1541                 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1542
1543         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1544                 return -EINVAL;
1545
1546         /* Don't allow keys for peers that have a matching TCP-AO key.
1547          * See the comment in tcp_ao_add_cmd()
1548          */
1549         if (tcp_ao_required(sk, addr, AF_INET, l3flag ? l3index : -1, false))
1550                 return -EKEYREJECTED;
1551
1552         return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1553                               cmd.tcpm_key, cmd.tcpm_keylen);
1554 }
1555
1556 static int tcp_v4_md5_hash_headers(struct tcp_sigpool *hp,
1557                                    __be32 daddr, __be32 saddr,
1558                                    const struct tcphdr *th, int nbytes)
1559 {
1560         struct tcp4_pseudohdr *bp;
1561         struct scatterlist sg;
1562         struct tcphdr *_th;
1563
1564         bp = hp->scratch;
1565         bp->saddr = saddr;
1566         bp->daddr = daddr;
1567         bp->pad = 0;
1568         bp->protocol = IPPROTO_TCP;
1569         bp->len = cpu_to_be16(nbytes);
1570
1571         _th = (struct tcphdr *)(bp + 1);
1572         memcpy(_th, th, sizeof(*th));
1573         _th->check = 0;
1574
1575         sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1576         ahash_request_set_crypt(hp->req, &sg, NULL,
1577                                 sizeof(*bp) + sizeof(*th));
1578         return crypto_ahash_update(hp->req);
1579 }
1580
1581 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1582                                __be32 daddr, __be32 saddr, const struct tcphdr *th)
1583 {
1584         struct tcp_sigpool hp;
1585
1586         if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
1587                 goto clear_hash_nostart;
1588
1589         if (crypto_ahash_init(hp.req))
1590                 goto clear_hash;
1591         if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, th->doff << 2))
1592                 goto clear_hash;
1593         if (tcp_md5_hash_key(&hp, key))
1594                 goto clear_hash;
1595         ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
1596         if (crypto_ahash_final(hp.req))
1597                 goto clear_hash;
1598
1599         tcp_sigpool_end(&hp);
1600         return 0;
1601
1602 clear_hash:
1603         tcp_sigpool_end(&hp);
1604 clear_hash_nostart:
1605         memset(md5_hash, 0, 16);
1606         return 1;
1607 }
1608
1609 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1610                         const struct sock *sk,
1611                         const struct sk_buff *skb)
1612 {
1613         const struct tcphdr *th = tcp_hdr(skb);
1614         struct tcp_sigpool hp;
1615         __be32 saddr, daddr;
1616
1617         if (sk) { /* valid for establish/request sockets */
1618                 saddr = sk->sk_rcv_saddr;
1619                 daddr = sk->sk_daddr;
1620         } else {
1621                 const struct iphdr *iph = ip_hdr(skb);
1622                 saddr = iph->saddr;
1623                 daddr = iph->daddr;
1624         }
1625
1626         if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
1627                 goto clear_hash_nostart;
1628
1629         if (crypto_ahash_init(hp.req))
1630                 goto clear_hash;
1631
1632         if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, skb->len))
1633                 goto clear_hash;
1634         if (tcp_sigpool_hash_skb_data(&hp, skb, th->doff << 2))
1635                 goto clear_hash;
1636         if (tcp_md5_hash_key(&hp, key))
1637                 goto clear_hash;
1638         ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
1639         if (crypto_ahash_final(hp.req))
1640                 goto clear_hash;
1641
1642         tcp_sigpool_end(&hp);
1643         return 0;
1644
1645 clear_hash:
1646         tcp_sigpool_end(&hp);
1647 clear_hash_nostart:
1648         memset(md5_hash, 0, 16);
1649         return 1;
1650 }
1651 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1652
1653 #endif
1654
1655 static void tcp_v4_init_req(struct request_sock *req,
1656                             const struct sock *sk_listener,
1657                             struct sk_buff *skb)
1658 {
1659         struct inet_request_sock *ireq = inet_rsk(req);
1660         struct net *net = sock_net(sk_listener);
1661
1662         sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1663         sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1664         RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1665 }
1666
1667 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1668                                           struct sk_buff *skb,
1669                                           struct flowi *fl,
1670                                           struct request_sock *req)
1671 {
1672         tcp_v4_init_req(req, sk, skb);
1673
1674         if (security_inet_conn_request(sk, skb, req))
1675                 return NULL;
1676
1677         return inet_csk_route_req(sk, &fl->u.ip4, req);
1678 }
1679
1680 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1681         .family         =       PF_INET,
1682         .obj_size       =       sizeof(struct tcp_request_sock),
1683         .rtx_syn_ack    =       tcp_rtx_synack,
1684         .send_ack       =       tcp_v4_reqsk_send_ack,
1685         .destructor     =       tcp_v4_reqsk_destructor,
1686         .send_reset     =       tcp_v4_send_reset,
1687         .syn_ack_timeout =      tcp_syn_ack_timeout,
1688 };
1689
1690 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1691         .mss_clamp      =       TCP_MSS_DEFAULT,
1692 #ifdef CONFIG_TCP_MD5SIG
1693         .req_md5_lookup =       tcp_v4_md5_lookup,
1694         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1695 #endif
1696 #ifdef CONFIG_TCP_AO
1697         .ao_lookup      =       tcp_v4_ao_lookup_rsk,
1698         .ao_calc_key    =       tcp_v4_ao_calc_key_rsk,
1699         .ao_synack_hash =       tcp_v4_ao_synack_hash,
1700 #endif
1701 #ifdef CONFIG_SYN_COOKIES
1702         .cookie_init_seq =      cookie_v4_init_sequence,
1703 #endif
1704         .route_req      =       tcp_v4_route_req,
1705         .init_seq       =       tcp_v4_init_seq,
1706         .init_ts_off    =       tcp_v4_init_ts_off,
1707         .send_synack    =       tcp_v4_send_synack,
1708 };
1709
1710 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1711 {
1712         /* Never answer to SYNs send to broadcast or multicast */
1713         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1714                 goto drop;
1715
1716         return tcp_conn_request(&tcp_request_sock_ops,
1717                                 &tcp_request_sock_ipv4_ops, sk, skb);
1718
1719 drop:
1720         tcp_listendrop(sk);
1721         return 0;
1722 }
1723 EXPORT_SYMBOL(tcp_v4_conn_request);
1724
1725
1726 /*
1727  * The three way handshake has completed - we got a valid synack -
1728  * now create the new socket.
1729  */
1730 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1731                                   struct request_sock *req,
1732                                   struct dst_entry *dst,
1733                                   struct request_sock *req_unhash,
1734                                   bool *own_req)
1735 {
1736         struct inet_request_sock *ireq;
1737         bool found_dup_sk = false;
1738         struct inet_sock *newinet;
1739         struct tcp_sock *newtp;
1740         struct sock *newsk;
1741 #ifdef CONFIG_TCP_MD5SIG
1742         const union tcp_md5_addr *addr;
1743         struct tcp_md5sig_key *key;
1744         int l3index;
1745 #endif
1746         struct ip_options_rcu *inet_opt;
1747
1748         if (sk_acceptq_is_full(sk))
1749                 goto exit_overflow;
1750
1751         newsk = tcp_create_openreq_child(sk, req, skb);
1752         if (!newsk)
1753                 goto exit_nonewsk;
1754
1755         newsk->sk_gso_type = SKB_GSO_TCPV4;
1756         inet_sk_rx_dst_set(newsk, skb);
1757
1758         newtp                 = tcp_sk(newsk);
1759         newinet               = inet_sk(newsk);
1760         ireq                  = inet_rsk(req);
1761         sk_daddr_set(newsk, ireq->ir_rmt_addr);
1762         sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1763         newsk->sk_bound_dev_if = ireq->ir_iif;
1764         newinet->inet_saddr   = ireq->ir_loc_addr;
1765         inet_opt              = rcu_dereference(ireq->ireq_opt);
1766         RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1767         newinet->mc_index     = inet_iif(skb);
1768         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1769         newinet->rcv_tos      = ip_hdr(skb)->tos;
1770         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1771         if (inet_opt)
1772                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1773         atomic_set(&newinet->inet_id, get_random_u16());
1774
1775         /* Set ToS of the new socket based upon the value of incoming SYN.
1776          * ECT bits are set later in tcp_init_transfer().
1777          */
1778         if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1779                 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1780
1781         if (!dst) {
1782                 dst = inet_csk_route_child_sock(sk, newsk, req);
1783                 if (!dst)
1784                         goto put_and_exit;
1785         } else {
1786                 /* syncookie case : see end of cookie_v4_check() */
1787         }
1788         sk_setup_caps(newsk, dst);
1789
1790         tcp_ca_openreq_child(newsk, dst);
1791
1792         tcp_sync_mss(newsk, dst_mtu(dst));
1793         newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1794
1795         tcp_initialize_rcv_mss(newsk);
1796
1797 #ifdef CONFIG_TCP_MD5SIG
1798         l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1799         /* Copy over the MD5 key from the original socket */
1800         addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1801         key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1802         if (key && !tcp_rsk_used_ao(req)) {
1803                 if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key))
1804                         goto put_and_exit;
1805                 sk_gso_disable(newsk);
1806         }
1807 #endif
1808 #ifdef CONFIG_TCP_AO
1809         if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET))
1810                 goto put_and_exit; /* OOM, release back memory */
1811 #endif
1812
1813         if (__inet_inherit_port(sk, newsk) < 0)
1814                 goto put_and_exit;
1815         *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1816                                        &found_dup_sk);
1817         if (likely(*own_req)) {
1818                 tcp_move_syn(newtp, req);
1819                 ireq->ireq_opt = NULL;
1820         } else {
1821                 newinet->inet_opt = NULL;
1822
1823                 if (!req_unhash && found_dup_sk) {
1824                         /* This code path should only be executed in the
1825                          * syncookie case only
1826                          */
1827                         bh_unlock_sock(newsk);
1828                         sock_put(newsk);
1829                         newsk = NULL;
1830                 }
1831         }
1832         return newsk;
1833
1834 exit_overflow:
1835         NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1836 exit_nonewsk:
1837         dst_release(dst);
1838 exit:
1839         tcp_listendrop(sk);
1840         return NULL;
1841 put_and_exit:
1842         newinet->inet_opt = NULL;
1843         inet_csk_prepare_forced_close(newsk);
1844         tcp_done(newsk);
1845         goto exit;
1846 }
1847 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1848
1849 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1850 {
1851 #ifdef CONFIG_SYN_COOKIES
1852         const struct tcphdr *th = tcp_hdr(skb);
1853
1854         if (!th->syn)
1855                 sk = cookie_v4_check(sk, skb);
1856 #endif
1857         return sk;
1858 }
1859
1860 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1861                          struct tcphdr *th, u32 *cookie)
1862 {
1863         u16 mss = 0;
1864 #ifdef CONFIG_SYN_COOKIES
1865         mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1866                                     &tcp_request_sock_ipv4_ops, sk, th);
1867         if (mss) {
1868                 *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1869                 tcp_synq_overflow(sk);
1870         }
1871 #endif
1872         return mss;
1873 }
1874
1875 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1876                                                            u32));
1877 /* The socket must have it's spinlock held when we get
1878  * here, unless it is a TCP_LISTEN socket.
1879  *
1880  * We have a potential double-lock case here, so even when
1881  * doing backlog processing we use the BH locking scheme.
1882  * This is because we cannot sleep with the original spinlock
1883  * held.
1884  */
1885 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1886 {
1887         enum skb_drop_reason reason;
1888         struct sock *rsk;
1889
1890         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1891                 struct dst_entry *dst;
1892
1893                 dst = rcu_dereference_protected(sk->sk_rx_dst,
1894                                                 lockdep_sock_is_held(sk));
1895
1896                 sock_rps_save_rxhash(sk, skb);
1897                 sk_mark_napi_id(sk, skb);
1898                 if (dst) {
1899                         if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1900                             !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1901                                              dst, 0)) {
1902                                 RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1903                                 dst_release(dst);
1904                         }
1905                 }
1906                 tcp_rcv_established(sk, skb);
1907                 return 0;
1908         }
1909
1910         reason = SKB_DROP_REASON_NOT_SPECIFIED;
1911         if (tcp_checksum_complete(skb))
1912                 goto csum_err;
1913
1914         if (sk->sk_state == TCP_LISTEN) {
1915                 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1916
1917                 if (!nsk)
1918                         goto discard;
1919                 if (nsk != sk) {
1920                         if (tcp_child_process(sk, nsk, skb)) {
1921                                 rsk = nsk;
1922                                 goto reset;
1923                         }
1924                         return 0;
1925                 }
1926         } else
1927                 sock_rps_save_rxhash(sk, skb);
1928
1929         if (tcp_rcv_state_process(sk, skb)) {
1930                 rsk = sk;
1931                 goto reset;
1932         }
1933         return 0;
1934
1935 reset:
1936         tcp_v4_send_reset(rsk, skb);
1937 discard:
1938         kfree_skb_reason(skb, reason);
1939         /* Be careful here. If this function gets more complicated and
1940          * gcc suffers from register pressure on the x86, sk (in %ebx)
1941          * might be destroyed here. This current version compiles correctly,
1942          * but you have been warned.
1943          */
1944         return 0;
1945
1946 csum_err:
1947         reason = SKB_DROP_REASON_TCP_CSUM;
1948         trace_tcp_bad_csum(skb);
1949         TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1950         TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1951         goto discard;
1952 }
1953 EXPORT_SYMBOL(tcp_v4_do_rcv);
1954
1955 int tcp_v4_early_demux(struct sk_buff *skb)
1956 {
1957         struct net *net = dev_net(skb->dev);
1958         const struct iphdr *iph;
1959         const struct tcphdr *th;
1960         struct sock *sk;
1961
1962         if (skb->pkt_type != PACKET_HOST)
1963                 return 0;
1964
1965         if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1966                 return 0;
1967
1968         iph = ip_hdr(skb);
1969         th = tcp_hdr(skb);
1970
1971         if (th->doff < sizeof(struct tcphdr) / 4)
1972                 return 0;
1973
1974         sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
1975                                        iph->saddr, th->source,
1976                                        iph->daddr, ntohs(th->dest),
1977                                        skb->skb_iif, inet_sdif(skb));
1978         if (sk) {
1979                 skb->sk = sk;
1980                 skb->destructor = sock_edemux;
1981                 if (sk_fullsock(sk)) {
1982                         struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1983
1984                         if (dst)
1985                                 dst = dst_check(dst, 0);
1986                         if (dst &&
1987                             sk->sk_rx_dst_ifindex == skb->skb_iif)
1988                                 skb_dst_set_noref(skb, dst);
1989                 }
1990         }
1991         return 0;
1992 }
1993
1994 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
1995                      enum skb_drop_reason *reason)
1996 {
1997         u32 limit, tail_gso_size, tail_gso_segs;
1998         struct skb_shared_info *shinfo;
1999         const struct tcphdr *th;
2000         struct tcphdr *thtail;
2001         struct sk_buff *tail;
2002         unsigned int hdrlen;
2003         bool fragstolen;
2004         u32 gso_segs;
2005         u32 gso_size;
2006         int delta;
2007
2008         /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
2009          * we can fix skb->truesize to its real value to avoid future drops.
2010          * This is valid because skb is not yet charged to the socket.
2011          * It has been noticed pure SACK packets were sometimes dropped
2012          * (if cooked by drivers without copybreak feature).
2013          */
2014         skb_condense(skb);
2015
2016         skb_dst_drop(skb);
2017
2018         if (unlikely(tcp_checksum_complete(skb))) {
2019                 bh_unlock_sock(sk);
2020                 trace_tcp_bad_csum(skb);
2021                 *reason = SKB_DROP_REASON_TCP_CSUM;
2022                 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
2023                 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
2024                 return true;
2025         }
2026
2027         /* Attempt coalescing to last skb in backlog, even if we are
2028          * above the limits.
2029          * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
2030          */
2031         th = (const struct tcphdr *)skb->data;
2032         hdrlen = th->doff * 4;
2033
2034         tail = sk->sk_backlog.tail;
2035         if (!tail)
2036                 goto no_coalesce;
2037         thtail = (struct tcphdr *)tail->data;
2038
2039         if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
2040             TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
2041             ((TCP_SKB_CB(tail)->tcp_flags |
2042               TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
2043             !((TCP_SKB_CB(tail)->tcp_flags &
2044               TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
2045             ((TCP_SKB_CB(tail)->tcp_flags ^
2046               TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
2047 #ifdef CONFIG_TLS_DEVICE
2048             tail->decrypted != skb->decrypted ||
2049 #endif
2050             !mptcp_skb_can_collapse(tail, skb) ||
2051             thtail->doff != th->doff ||
2052             memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
2053                 goto no_coalesce;
2054
2055         __skb_pull(skb, hdrlen);
2056
2057         shinfo = skb_shinfo(skb);
2058         gso_size = shinfo->gso_size ?: skb->len;
2059         gso_segs = shinfo->gso_segs ?: 1;
2060
2061         shinfo = skb_shinfo(tail);
2062         tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
2063         tail_gso_segs = shinfo->gso_segs ?: 1;
2064
2065         if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
2066                 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
2067
2068                 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
2069                         TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
2070                         thtail->window = th->window;
2071                 }
2072
2073                 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
2074                  * thtail->fin, so that the fast path in tcp_rcv_established()
2075                  * is not entered if we append a packet with a FIN.
2076                  * SYN, RST, URG are not present.
2077                  * ACK is set on both packets.
2078                  * PSH : we do not really care in TCP stack,
2079                  *       at least for 'GRO' packets.
2080                  */
2081                 thtail->fin |= th->fin;
2082                 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
2083
2084                 if (TCP_SKB_CB(skb)->has_rxtstamp) {
2085                         TCP_SKB_CB(tail)->has_rxtstamp = true;
2086                         tail->tstamp = skb->tstamp;
2087                         skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
2088                 }
2089
2090                 /* Not as strict as GRO. We only need to carry mss max value */
2091                 shinfo->gso_size = max(gso_size, tail_gso_size);
2092                 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
2093
2094                 sk->sk_backlog.len += delta;
2095                 __NET_INC_STATS(sock_net(sk),
2096                                 LINUX_MIB_TCPBACKLOGCOALESCE);
2097                 kfree_skb_partial(skb, fragstolen);
2098                 return false;
2099         }
2100         __skb_push(skb, hdrlen);
2101
2102 no_coalesce:
2103         limit = (u32)READ_ONCE(sk->sk_rcvbuf) + (u32)(READ_ONCE(sk->sk_sndbuf) >> 1);
2104
2105         /* Only socket owner can try to collapse/prune rx queues
2106          * to reduce memory overhead, so add a little headroom here.
2107          * Few sockets backlog are possibly concurrently non empty.
2108          */
2109         limit += 64 * 1024;
2110
2111         if (unlikely(sk_add_backlog(sk, skb, limit))) {
2112                 bh_unlock_sock(sk);
2113                 *reason = SKB_DROP_REASON_SOCKET_BACKLOG;
2114                 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
2115                 return true;
2116         }
2117         return false;
2118 }
2119 EXPORT_SYMBOL(tcp_add_backlog);
2120
2121 int tcp_filter(struct sock *sk, struct sk_buff *skb)
2122 {
2123         struct tcphdr *th = (struct tcphdr *)skb->data;
2124
2125         return sk_filter_trim_cap(sk, skb, th->doff * 4);
2126 }
2127 EXPORT_SYMBOL(tcp_filter);
2128
2129 static void tcp_v4_restore_cb(struct sk_buff *skb)
2130 {
2131         memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
2132                 sizeof(struct inet_skb_parm));
2133 }
2134
2135 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
2136                            const struct tcphdr *th)
2137 {
2138         /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
2139          * barrier() makes sure compiler wont play fool^Waliasing games.
2140          */
2141         memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
2142                 sizeof(struct inet_skb_parm));
2143         barrier();
2144
2145         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
2146         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
2147                                     skb->len - th->doff * 4);
2148         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
2149         TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
2150         TCP_SKB_CB(skb)->tcp_tw_isn = 0;
2151         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
2152         TCP_SKB_CB(skb)->sacked  = 0;
2153         TCP_SKB_CB(skb)->has_rxtstamp =
2154                         skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
2155 }
2156
2157 /*
2158  *      From tcp_input.c
2159  */
2160
2161 int tcp_v4_rcv(struct sk_buff *skb)
2162 {
2163         struct net *net = dev_net(skb->dev);
2164         enum skb_drop_reason drop_reason;
2165         int sdif = inet_sdif(skb);
2166         int dif = inet_iif(skb);
2167         const struct iphdr *iph;
2168         const struct tcphdr *th;
2169         bool refcounted;
2170         struct sock *sk;
2171         int ret;
2172
2173         drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
2174         if (skb->pkt_type != PACKET_HOST)
2175                 goto discard_it;
2176
2177         /* Count it even if it's bad */
2178         __TCP_INC_STATS(net, TCP_MIB_INSEGS);
2179
2180         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
2181                 goto discard_it;
2182
2183         th = (const struct tcphdr *)skb->data;
2184
2185         if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
2186                 drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
2187                 goto bad_packet;
2188         }
2189         if (!pskb_may_pull(skb, th->doff * 4))
2190                 goto discard_it;
2191
2192         /* An explanation is required here, I think.
2193          * Packet length and doff are validated by header prediction,
2194          * provided case of th->doff==0 is eliminated.
2195          * So, we defer the checks. */
2196
2197         if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
2198                 goto csum_error;
2199
2200         th = (const struct tcphdr *)skb->data;
2201         iph = ip_hdr(skb);
2202 lookup:
2203         sk = __inet_lookup_skb(net->ipv4.tcp_death_row.hashinfo,
2204                                skb, __tcp_hdrlen(th), th->source,
2205                                th->dest, sdif, &refcounted);
2206         if (!sk)
2207                 goto no_tcp_socket;
2208
2209 process:
2210         if (sk->sk_state == TCP_TIME_WAIT)
2211                 goto do_time_wait;
2212
2213         if (sk->sk_state == TCP_NEW_SYN_RECV) {
2214                 struct request_sock *req = inet_reqsk(sk);
2215                 bool req_stolen = false;
2216                 struct sock *nsk;
2217
2218                 sk = req->rsk_listener;
2219                 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2220                         drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2221                 else
2222                         drop_reason = tcp_inbound_hash(sk, req, skb,
2223                                                        &iph->saddr, &iph->daddr,
2224                                                        AF_INET, dif, sdif);
2225                 if (unlikely(drop_reason)) {
2226                         sk_drops_add(sk, skb);
2227                         reqsk_put(req);
2228                         goto discard_it;
2229                 }
2230                 if (tcp_checksum_complete(skb)) {
2231                         reqsk_put(req);
2232                         goto csum_error;
2233                 }
2234                 if (unlikely(sk->sk_state != TCP_LISTEN)) {
2235                         nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
2236                         if (!nsk) {
2237                                 inet_csk_reqsk_queue_drop_and_put(sk, req);
2238                                 goto lookup;
2239                         }
2240                         sk = nsk;
2241                         /* reuseport_migrate_sock() has already held one sk_refcnt
2242                          * before returning.
2243                          */
2244                 } else {
2245                         /* We own a reference on the listener, increase it again
2246                          * as we might lose it too soon.
2247                          */
2248                         sock_hold(sk);
2249                 }
2250                 refcounted = true;
2251                 nsk = NULL;
2252                 if (!tcp_filter(sk, skb)) {
2253                         th = (const struct tcphdr *)skb->data;
2254                         iph = ip_hdr(skb);
2255                         tcp_v4_fill_cb(skb, iph, th);
2256                         nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2257                 } else {
2258                         drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2259                 }
2260                 if (!nsk) {
2261                         reqsk_put(req);
2262                         if (req_stolen) {
2263                                 /* Another cpu got exclusive access to req
2264                                  * and created a full blown socket.
2265                                  * Try to feed this packet to this socket
2266                                  * instead of discarding it.
2267                                  */
2268                                 tcp_v4_restore_cb(skb);
2269                                 sock_put(sk);
2270                                 goto lookup;
2271                         }
2272                         goto discard_and_relse;
2273                 }
2274                 nf_reset_ct(skb);
2275                 if (nsk == sk) {
2276                         reqsk_put(req);
2277                         tcp_v4_restore_cb(skb);
2278                 } else if (tcp_child_process(sk, nsk, skb)) {
2279                         tcp_v4_send_reset(nsk, skb);
2280                         goto discard_and_relse;
2281                 } else {
2282                         sock_put(sk);
2283                         return 0;
2284                 }
2285         }
2286
2287         if (static_branch_unlikely(&ip4_min_ttl)) {
2288                 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
2289                 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
2290                         __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2291                         drop_reason = SKB_DROP_REASON_TCP_MINTTL;
2292                         goto discard_and_relse;
2293                 }
2294         }
2295
2296         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
2297                 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2298                 goto discard_and_relse;
2299         }
2300
2301         drop_reason = tcp_inbound_hash(sk, NULL, skb, &iph->saddr, &iph->daddr,
2302                                        AF_INET, dif, sdif);
2303         if (drop_reason)
2304                 goto discard_and_relse;
2305
2306         nf_reset_ct(skb);
2307
2308         if (tcp_filter(sk, skb)) {
2309                 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2310                 goto discard_and_relse;
2311         }
2312         th = (const struct tcphdr *)skb->data;
2313         iph = ip_hdr(skb);
2314         tcp_v4_fill_cb(skb, iph, th);
2315
2316         skb->dev = NULL;
2317
2318         if (sk->sk_state == TCP_LISTEN) {
2319                 ret = tcp_v4_do_rcv(sk, skb);
2320                 goto put_and_return;
2321         }
2322
2323         sk_incoming_cpu_update(sk);
2324
2325         bh_lock_sock_nested(sk);
2326         tcp_segs_in(tcp_sk(sk), skb);
2327         ret = 0;
2328         if (!sock_owned_by_user(sk)) {
2329                 ret = tcp_v4_do_rcv(sk, skb);
2330         } else {
2331                 if (tcp_add_backlog(sk, skb, &drop_reason))
2332                         goto discard_and_relse;
2333         }
2334         bh_unlock_sock(sk);
2335
2336 put_and_return:
2337         if (refcounted)
2338                 sock_put(sk);
2339
2340         return ret;
2341
2342 no_tcp_socket:
2343         drop_reason = SKB_DROP_REASON_NO_SOCKET;
2344         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2345                 goto discard_it;
2346
2347         tcp_v4_fill_cb(skb, iph, th);
2348
2349         if (tcp_checksum_complete(skb)) {
2350 csum_error:
2351                 drop_reason = SKB_DROP_REASON_TCP_CSUM;
2352                 trace_tcp_bad_csum(skb);
2353                 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2354 bad_packet:
2355                 __TCP_INC_STATS(net, TCP_MIB_INERRS);
2356         } else {
2357                 tcp_v4_send_reset(NULL, skb);
2358         }
2359
2360 discard_it:
2361         SKB_DR_OR(drop_reason, NOT_SPECIFIED);
2362         /* Discard frame. */
2363         kfree_skb_reason(skb, drop_reason);
2364         return 0;
2365
2366 discard_and_relse:
2367         sk_drops_add(sk, skb);
2368         if (refcounted)
2369                 sock_put(sk);
2370         goto discard_it;
2371
2372 do_time_wait:
2373         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2374                 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2375                 inet_twsk_put(inet_twsk(sk));
2376                 goto discard_it;
2377         }
2378
2379         tcp_v4_fill_cb(skb, iph, th);
2380
2381         if (tcp_checksum_complete(skb)) {
2382                 inet_twsk_put(inet_twsk(sk));
2383                 goto csum_error;
2384         }
2385         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2386         case TCP_TW_SYN: {
2387                 struct sock *sk2 = inet_lookup_listener(net,
2388                                                         net->ipv4.tcp_death_row.hashinfo,
2389                                                         skb, __tcp_hdrlen(th),
2390                                                         iph->saddr, th->source,
2391                                                         iph->daddr, th->dest,
2392                                                         inet_iif(skb),
2393                                                         sdif);
2394                 if (sk2) {
2395                         inet_twsk_deschedule_put(inet_twsk(sk));
2396                         sk = sk2;
2397                         tcp_v4_restore_cb(skb);
2398                         refcounted = false;
2399                         goto process;
2400                 }
2401         }
2402                 /* to ACK */
2403                 fallthrough;
2404         case TCP_TW_ACK:
2405                 tcp_v4_timewait_ack(sk, skb);
2406                 break;
2407         case TCP_TW_RST:
2408                 tcp_v4_send_reset(sk, skb);
2409                 inet_twsk_deschedule_put(inet_twsk(sk));
2410                 goto discard_it;
2411         case TCP_TW_SUCCESS:;
2412         }
2413         goto discard_it;
2414 }
2415
2416 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2417         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
2418         .twsk_unique    = tcp_twsk_unique,
2419         .twsk_destructor= tcp_twsk_destructor,
2420 };
2421
2422 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2423 {
2424         struct dst_entry *dst = skb_dst(skb);
2425
2426         if (dst && dst_hold_safe(dst)) {
2427                 rcu_assign_pointer(sk->sk_rx_dst, dst);
2428                 sk->sk_rx_dst_ifindex = skb->skb_iif;
2429         }
2430 }
2431 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2432
2433 const struct inet_connection_sock_af_ops ipv4_specific = {
2434         .queue_xmit        = ip_queue_xmit,
2435         .send_check        = tcp_v4_send_check,
2436         .rebuild_header    = inet_sk_rebuild_header,
2437         .sk_rx_dst_set     = inet_sk_rx_dst_set,
2438         .conn_request      = tcp_v4_conn_request,
2439         .syn_recv_sock     = tcp_v4_syn_recv_sock,
2440         .net_header_len    = sizeof(struct iphdr),
2441         .setsockopt        = ip_setsockopt,
2442         .getsockopt        = ip_getsockopt,
2443         .addr2sockaddr     = inet_csk_addr2sockaddr,
2444         .sockaddr_len      = sizeof(struct sockaddr_in),
2445         .mtu_reduced       = tcp_v4_mtu_reduced,
2446 };
2447 EXPORT_SYMBOL(ipv4_specific);
2448
2449 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
2450 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2451 #ifdef CONFIG_TCP_MD5SIG
2452         .md5_lookup             = tcp_v4_md5_lookup,
2453         .calc_md5_hash          = tcp_v4_md5_hash_skb,
2454         .md5_parse              = tcp_v4_parse_md5_keys,
2455 #endif
2456 #ifdef CONFIG_TCP_AO
2457         .ao_lookup              = tcp_v4_ao_lookup,
2458         .calc_ao_hash           = tcp_v4_ao_hash_skb,
2459         .ao_parse               = tcp_v4_parse_ao,
2460         .ao_calc_key_sk         = tcp_v4_ao_calc_key_sk,
2461 #endif
2462 };
2463 #endif
2464
2465 /* NOTE: A lot of things set to zero explicitly by call to
2466  *       sk_alloc() so need not be done here.
2467  */
2468 static int tcp_v4_init_sock(struct sock *sk)
2469 {
2470         struct inet_connection_sock *icsk = inet_csk(sk);
2471
2472         tcp_init_sock(sk);
2473
2474         icsk->icsk_af_ops = &ipv4_specific;
2475
2476 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
2477         tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2478 #endif
2479
2480         return 0;
2481 }
2482
2483 #ifdef CONFIG_TCP_MD5SIG
2484 static void tcp_md5sig_info_free_rcu(struct rcu_head *head)
2485 {
2486         struct tcp_md5sig_info *md5sig;
2487
2488         md5sig = container_of(head, struct tcp_md5sig_info, rcu);
2489         kfree(md5sig);
2490         static_branch_slow_dec_deferred(&tcp_md5_needed);
2491         tcp_md5_release_sigpool();
2492 }
2493 #endif
2494
2495 void tcp_v4_destroy_sock(struct sock *sk)
2496 {
2497         struct tcp_sock *tp = tcp_sk(sk);
2498
2499         trace_tcp_destroy_sock(sk);
2500
2501         tcp_clear_xmit_timers(sk);
2502
2503         tcp_cleanup_congestion_control(sk);
2504
2505         tcp_cleanup_ulp(sk);
2506
2507         /* Cleanup up the write buffer. */
2508         tcp_write_queue_purge(sk);
2509
2510         /* Check if we want to disable active TFO */
2511         tcp_fastopen_active_disable_ofo_check(sk);
2512
2513         /* Cleans up our, hopefully empty, out_of_order_queue. */
2514         skb_rbtree_purge(&tp->out_of_order_queue);
2515
2516 #ifdef CONFIG_TCP_MD5SIG
2517         /* Clean up the MD5 key list, if any */
2518         if (tp->md5sig_info) {
2519                 struct tcp_md5sig_info *md5sig;
2520
2521                 md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
2522                 tcp_clear_md5_list(sk);
2523                 call_rcu(&md5sig->rcu, tcp_md5sig_info_free_rcu);
2524                 rcu_assign_pointer(tp->md5sig_info, NULL);
2525         }
2526 #endif
2527         tcp_ao_destroy_sock(sk, false);
2528
2529         /* Clean up a referenced TCP bind bucket. */
2530         if (inet_csk(sk)->icsk_bind_hash)
2531                 inet_put_port(sk);
2532
2533         BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2534
2535         /* If socket is aborted during connect operation */
2536         tcp_free_fastopen_req(tp);
2537         tcp_fastopen_destroy_cipher(sk);
2538         tcp_saved_syn_free(tp);
2539
2540         sk_sockets_allocated_dec(sk);
2541 }
2542 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2543
2544 #ifdef CONFIG_PROC_FS
2545 /* Proc filesystem TCP sock list dumping. */
2546
2547 static unsigned short seq_file_family(const struct seq_file *seq);
2548
2549 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2550 {
2551         unsigned short family = seq_file_family(seq);
2552
2553         /* AF_UNSPEC is used as a match all */
2554         return ((family == AF_UNSPEC || family == sk->sk_family) &&
2555                 net_eq(sock_net(sk), seq_file_net(seq)));
2556 }
2557
2558 /* Find a non empty bucket (starting from st->bucket)
2559  * and return the first sk from it.
2560  */
2561 static void *listening_get_first(struct seq_file *seq)
2562 {
2563         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2564         struct tcp_iter_state *st = seq->private;
2565
2566         st->offset = 0;
2567         for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) {
2568                 struct inet_listen_hashbucket *ilb2;
2569                 struct hlist_nulls_node *node;
2570                 struct sock *sk;
2571
2572                 ilb2 = &hinfo->lhash2[st->bucket];
2573                 if (hlist_nulls_empty(&ilb2->nulls_head))
2574                         continue;
2575
2576                 spin_lock(&ilb2->lock);
2577                 sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
2578                         if (seq_sk_match(seq, sk))
2579                                 return sk;
2580                 }
2581                 spin_unlock(&ilb2->lock);
2582         }
2583
2584         return NULL;
2585 }
2586
2587 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2588  * If "cur" is the last one in the st->bucket,
2589  * call listening_get_first() to return the first sk of the next
2590  * non empty bucket.
2591  */
2592 static void *listening_get_next(struct seq_file *seq, void *cur)
2593 {
2594         struct tcp_iter_state *st = seq->private;
2595         struct inet_listen_hashbucket *ilb2;
2596         struct hlist_nulls_node *node;
2597         struct inet_hashinfo *hinfo;
2598         struct sock *sk = cur;
2599
2600         ++st->num;
2601         ++st->offset;
2602
2603         sk = sk_nulls_next(sk);
2604         sk_nulls_for_each_from(sk, node) {
2605                 if (seq_sk_match(seq, sk))
2606                         return sk;
2607         }
2608
2609         hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2610         ilb2 = &hinfo->lhash2[st->bucket];
2611         spin_unlock(&ilb2->lock);
2612         ++st->bucket;
2613         return listening_get_first(seq);
2614 }
2615
2616 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2617 {
2618         struct tcp_iter_state *st = seq->private;
2619         void *rc;
2620
2621         st->bucket = 0;
2622         st->offset = 0;
2623         rc = listening_get_first(seq);
2624
2625         while (rc && *pos) {
2626                 rc = listening_get_next(seq, rc);
2627                 --*pos;
2628         }
2629         return rc;
2630 }
2631
2632 static inline bool empty_bucket(struct inet_hashinfo *hinfo,
2633                                 const struct tcp_iter_state *st)
2634 {
2635         return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain);
2636 }
2637
2638 /*
2639  * Get first established socket starting from bucket given in st->bucket.
2640  * If st->bucket is zero, the very first socket in the hash is returned.
2641  */
2642 static void *established_get_first(struct seq_file *seq)
2643 {
2644         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2645         struct tcp_iter_state *st = seq->private;
2646
2647         st->offset = 0;
2648         for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) {
2649                 struct sock *sk;
2650                 struct hlist_nulls_node *node;
2651                 spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket);
2652
2653                 cond_resched();
2654
2655                 /* Lockless fast path for the common case of empty buckets */
2656                 if (empty_bucket(hinfo, st))
2657                         continue;
2658
2659                 spin_lock_bh(lock);
2660                 sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) {
2661                         if (seq_sk_match(seq, sk))
2662                                 return sk;
2663                 }
2664                 spin_unlock_bh(lock);
2665         }
2666
2667         return NULL;
2668 }
2669
2670 static void *established_get_next(struct seq_file *seq, void *cur)
2671 {
2672         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2673         struct tcp_iter_state *st = seq->private;
2674         struct hlist_nulls_node *node;
2675         struct sock *sk = cur;
2676
2677         ++st->num;
2678         ++st->offset;
2679
2680         sk = sk_nulls_next(sk);
2681
2682         sk_nulls_for_each_from(sk, node) {
2683                 if (seq_sk_match(seq, sk))
2684                         return sk;
2685         }
2686
2687         spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2688         ++st->bucket;
2689         return established_get_first(seq);
2690 }
2691
2692 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2693 {
2694         struct tcp_iter_state *st = seq->private;
2695         void *rc;
2696
2697         st->bucket = 0;
2698         rc = established_get_first(seq);
2699
2700         while (rc && pos) {
2701                 rc = established_get_next(seq, rc);
2702                 --pos;
2703         }
2704         return rc;
2705 }
2706
2707 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2708 {
2709         void *rc;
2710         struct tcp_iter_state *st = seq->private;
2711
2712         st->state = TCP_SEQ_STATE_LISTENING;
2713         rc        = listening_get_idx(seq, &pos);
2714
2715         if (!rc) {
2716                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2717                 rc        = established_get_idx(seq, pos);
2718         }
2719
2720         return rc;
2721 }
2722
2723 static void *tcp_seek_last_pos(struct seq_file *seq)
2724 {
2725         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2726         struct tcp_iter_state *st = seq->private;
2727         int bucket = st->bucket;
2728         int offset = st->offset;
2729         int orig_num = st->num;
2730         void *rc = NULL;
2731
2732         switch (st->state) {
2733         case TCP_SEQ_STATE_LISTENING:
2734                 if (st->bucket > hinfo->lhash2_mask)
2735                         break;
2736                 rc = listening_get_first(seq);
2737                 while (offset-- && rc && bucket == st->bucket)
2738                         rc = listening_get_next(seq, rc);
2739                 if (rc)
2740                         break;
2741                 st->bucket = 0;
2742                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2743                 fallthrough;
2744         case TCP_SEQ_STATE_ESTABLISHED:
2745                 if (st->bucket > hinfo->ehash_mask)
2746                         break;
2747                 rc = established_get_first(seq);
2748                 while (offset-- && rc && bucket == st->bucket)
2749                         rc = established_get_next(seq, rc);
2750         }
2751
2752         st->num = orig_num;
2753
2754         return rc;
2755 }
2756
2757 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2758 {
2759         struct tcp_iter_state *st = seq->private;
2760         void *rc;
2761
2762         if (*pos && *pos == st->last_pos) {
2763                 rc = tcp_seek_last_pos(seq);
2764                 if (rc)
2765                         goto out;
2766         }
2767
2768         st->state = TCP_SEQ_STATE_LISTENING;
2769         st->num = 0;
2770         st->bucket = 0;
2771         st->offset = 0;
2772         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2773
2774 out:
2775         st->last_pos = *pos;
2776         return rc;
2777 }
2778 EXPORT_SYMBOL(tcp_seq_start);
2779
2780 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2781 {
2782         struct tcp_iter_state *st = seq->private;
2783         void *rc = NULL;
2784
2785         if (v == SEQ_START_TOKEN) {
2786                 rc = tcp_get_idx(seq, 0);
2787                 goto out;
2788         }
2789
2790         switch (st->state) {
2791         case TCP_SEQ_STATE_LISTENING:
2792                 rc = listening_get_next(seq, v);
2793                 if (!rc) {
2794                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2795                         st->bucket = 0;
2796                         st->offset = 0;
2797                         rc        = established_get_first(seq);
2798                 }
2799                 break;
2800         case TCP_SEQ_STATE_ESTABLISHED:
2801                 rc = established_get_next(seq, v);
2802                 break;
2803         }
2804 out:
2805         ++*pos;
2806         st->last_pos = *pos;
2807         return rc;
2808 }
2809 EXPORT_SYMBOL(tcp_seq_next);
2810
2811 void tcp_seq_stop(struct seq_file *seq, void *v)
2812 {
2813         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2814         struct tcp_iter_state *st = seq->private;
2815
2816         switch (st->state) {
2817         case TCP_SEQ_STATE_LISTENING:
2818                 if (v != SEQ_START_TOKEN)
2819                         spin_unlock(&hinfo->lhash2[st->bucket].lock);
2820                 break;
2821         case TCP_SEQ_STATE_ESTABLISHED:
2822                 if (v)
2823                         spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2824                 break;
2825         }
2826 }
2827 EXPORT_SYMBOL(tcp_seq_stop);
2828
2829 static void get_openreq4(const struct request_sock *req,
2830                          struct seq_file *f, int i)
2831 {
2832         const struct inet_request_sock *ireq = inet_rsk(req);
2833         long delta = req->rsk_timer.expires - jiffies;
2834
2835         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2836                 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2837                 i,
2838                 ireq->ir_loc_addr,
2839                 ireq->ir_num,
2840                 ireq->ir_rmt_addr,
2841                 ntohs(ireq->ir_rmt_port),
2842                 TCP_SYN_RECV,
2843                 0, 0, /* could print option size, but that is af dependent. */
2844                 1,    /* timers active (only the expire timer) */
2845                 jiffies_delta_to_clock_t(delta),
2846                 req->num_timeout,
2847                 from_kuid_munged(seq_user_ns(f),
2848                                  sock_i_uid(req->rsk_listener)),
2849                 0,  /* non standard timer */
2850                 0, /* open_requests have no inode */
2851                 0,
2852                 req);
2853 }
2854
2855 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2856 {
2857         int timer_active;
2858         unsigned long timer_expires;
2859         const struct tcp_sock *tp = tcp_sk(sk);
2860         const struct inet_connection_sock *icsk = inet_csk(sk);
2861         const struct inet_sock *inet = inet_sk(sk);
2862         const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2863         __be32 dest = inet->inet_daddr;
2864         __be32 src = inet->inet_rcv_saddr;
2865         __u16 destp = ntohs(inet->inet_dport);
2866         __u16 srcp = ntohs(inet->inet_sport);
2867         int rx_queue;
2868         int state;
2869
2870         if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2871             icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2872             icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2873                 timer_active    = 1;
2874                 timer_expires   = icsk->icsk_timeout;
2875         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2876                 timer_active    = 4;
2877                 timer_expires   = icsk->icsk_timeout;
2878         } else if (timer_pending(&sk->sk_timer)) {
2879                 timer_active    = 2;
2880                 timer_expires   = sk->sk_timer.expires;
2881         } else {
2882                 timer_active    = 0;
2883                 timer_expires = jiffies;
2884         }
2885
2886         state = inet_sk_state_load(sk);
2887         if (state == TCP_LISTEN)
2888                 rx_queue = READ_ONCE(sk->sk_ack_backlog);
2889         else
2890                 /* Because we don't lock the socket,
2891                  * we might find a transient negative value.
2892                  */
2893                 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2894                                       READ_ONCE(tp->copied_seq), 0);
2895
2896         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2897                         "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2898                 i, src, srcp, dest, destp, state,
2899                 READ_ONCE(tp->write_seq) - tp->snd_una,
2900                 rx_queue,
2901                 timer_active,
2902                 jiffies_delta_to_clock_t(timer_expires - jiffies),
2903                 icsk->icsk_retransmits,
2904                 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2905                 icsk->icsk_probes_out,
2906                 sock_i_ino(sk),
2907                 refcount_read(&sk->sk_refcnt), sk,
2908                 jiffies_to_clock_t(icsk->icsk_rto),
2909                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2910                 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2911                 tcp_snd_cwnd(tp),
2912                 state == TCP_LISTEN ?
2913                     fastopenq->max_qlen :
2914                     (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2915 }
2916
2917 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2918                                struct seq_file *f, int i)
2919 {
2920         long delta = tw->tw_timer.expires - jiffies;
2921         __be32 dest, src;
2922         __u16 destp, srcp;
2923
2924         dest  = tw->tw_daddr;
2925         src   = tw->tw_rcv_saddr;
2926         destp = ntohs(tw->tw_dport);
2927         srcp  = ntohs(tw->tw_sport);
2928
2929         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2930                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2931                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2932                 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2933                 refcount_read(&tw->tw_refcnt), tw);
2934 }
2935
2936 #define TMPSZ 150
2937
2938 static int tcp4_seq_show(struct seq_file *seq, void *v)
2939 {
2940         struct tcp_iter_state *st;
2941         struct sock *sk = v;
2942
2943         seq_setwidth(seq, TMPSZ - 1);
2944         if (v == SEQ_START_TOKEN) {
2945                 seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2946                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2947                            "inode");
2948                 goto out;
2949         }
2950         st = seq->private;
2951
2952         if (sk->sk_state == TCP_TIME_WAIT)
2953                 get_timewait4_sock(v, seq, st->num);
2954         else if (sk->sk_state == TCP_NEW_SYN_RECV)
2955                 get_openreq4(v, seq, st->num);
2956         else
2957                 get_tcp4_sock(v, seq, st->num);
2958 out:
2959         seq_pad(seq, '\n');
2960         return 0;
2961 }
2962
2963 #ifdef CONFIG_BPF_SYSCALL
2964 struct bpf_tcp_iter_state {
2965         struct tcp_iter_state state;
2966         unsigned int cur_sk;
2967         unsigned int end_sk;
2968         unsigned int max_sk;
2969         struct sock **batch;
2970         bool st_bucket_done;
2971 };
2972
2973 struct bpf_iter__tcp {
2974         __bpf_md_ptr(struct bpf_iter_meta *, meta);
2975         __bpf_md_ptr(struct sock_common *, sk_common);
2976         uid_t uid __aligned(8);
2977 };
2978
2979 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2980                              struct sock_common *sk_common, uid_t uid)
2981 {
2982         struct bpf_iter__tcp ctx;
2983
2984         meta->seq_num--;  /* skip SEQ_START_TOKEN */
2985         ctx.meta = meta;
2986         ctx.sk_common = sk_common;
2987         ctx.uid = uid;
2988         return bpf_iter_run_prog(prog, &ctx);
2989 }
2990
2991 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
2992 {
2993         while (iter->cur_sk < iter->end_sk)
2994                 sock_gen_put(iter->batch[iter->cur_sk++]);
2995 }
2996
2997 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
2998                                       unsigned int new_batch_sz)
2999 {
3000         struct sock **new_batch;
3001
3002         new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3003                              GFP_USER | __GFP_NOWARN);
3004         if (!new_batch)
3005                 return -ENOMEM;
3006
3007         bpf_iter_tcp_put_batch(iter);
3008         kvfree(iter->batch);
3009         iter->batch = new_batch;
3010         iter->max_sk = new_batch_sz;
3011
3012         return 0;
3013 }
3014
3015 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
3016                                                  struct sock *start_sk)
3017 {
3018         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3019         struct bpf_tcp_iter_state *iter = seq->private;
3020         struct tcp_iter_state *st = &iter->state;
3021         struct hlist_nulls_node *node;
3022         unsigned int expected = 1;
3023         struct sock *sk;
3024
3025         sock_hold(start_sk);
3026         iter->batch[iter->end_sk++] = start_sk;
3027
3028         sk = sk_nulls_next(start_sk);
3029         sk_nulls_for_each_from(sk, node) {
3030                 if (seq_sk_match(seq, sk)) {
3031                         if (iter->end_sk < iter->max_sk) {
3032                                 sock_hold(sk);
3033                                 iter->batch[iter->end_sk++] = sk;
3034                         }
3035                         expected++;
3036                 }
3037         }
3038         spin_unlock(&hinfo->lhash2[st->bucket].lock);
3039
3040         return expected;
3041 }
3042
3043 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
3044                                                    struct sock *start_sk)
3045 {
3046         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3047         struct bpf_tcp_iter_state *iter = seq->private;
3048         struct tcp_iter_state *st = &iter->state;
3049         struct hlist_nulls_node *node;
3050         unsigned int expected = 1;
3051         struct sock *sk;
3052
3053         sock_hold(start_sk);
3054         iter->batch[iter->end_sk++] = start_sk;
3055
3056         sk = sk_nulls_next(start_sk);
3057         sk_nulls_for_each_from(sk, node) {
3058                 if (seq_sk_match(seq, sk)) {
3059                         if (iter->end_sk < iter->max_sk) {
3060                                 sock_hold(sk);
3061                                 iter->batch[iter->end_sk++] = sk;
3062                         }
3063                         expected++;
3064                 }
3065         }
3066         spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
3067
3068         return expected;
3069 }
3070
3071 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
3072 {
3073         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3074         struct bpf_tcp_iter_state *iter = seq->private;
3075         struct tcp_iter_state *st = &iter->state;
3076         unsigned int expected;
3077         bool resized = false;
3078         struct sock *sk;
3079
3080         /* The st->bucket is done.  Directly advance to the next
3081          * bucket instead of having the tcp_seek_last_pos() to skip
3082          * one by one in the current bucket and eventually find out
3083          * it has to advance to the next bucket.
3084          */
3085         if (iter->st_bucket_done) {
3086                 st->offset = 0;
3087                 st->bucket++;
3088                 if (st->state == TCP_SEQ_STATE_LISTENING &&
3089                     st->bucket > hinfo->lhash2_mask) {
3090                         st->state = TCP_SEQ_STATE_ESTABLISHED;
3091                         st->bucket = 0;
3092                 }
3093         }
3094
3095 again:
3096         /* Get a new batch */
3097         iter->cur_sk = 0;
3098         iter->end_sk = 0;
3099         iter->st_bucket_done = false;
3100
3101         sk = tcp_seek_last_pos(seq);
3102         if (!sk)
3103                 return NULL; /* Done */
3104
3105         if (st->state == TCP_SEQ_STATE_LISTENING)
3106                 expected = bpf_iter_tcp_listening_batch(seq, sk);
3107         else
3108                 expected = bpf_iter_tcp_established_batch(seq, sk);
3109
3110         if (iter->end_sk == expected) {
3111                 iter->st_bucket_done = true;
3112                 return sk;
3113         }
3114
3115         if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
3116                 resized = true;
3117                 goto again;
3118         }
3119
3120         return sk;
3121 }
3122
3123 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
3124 {
3125         /* bpf iter does not support lseek, so it always
3126          * continue from where it was stop()-ped.
3127          */
3128         if (*pos)
3129                 return bpf_iter_tcp_batch(seq);
3130
3131         return SEQ_START_TOKEN;
3132 }
3133
3134 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3135 {
3136         struct bpf_tcp_iter_state *iter = seq->private;
3137         struct tcp_iter_state *st = &iter->state;
3138         struct sock *sk;
3139
3140         /* Whenever seq_next() is called, the iter->cur_sk is
3141          * done with seq_show(), so advance to the next sk in
3142          * the batch.
3143          */
3144         if (iter->cur_sk < iter->end_sk) {
3145                 /* Keeping st->num consistent in tcp_iter_state.
3146                  * bpf_iter_tcp does not use st->num.
3147                  * meta.seq_num is used instead.
3148                  */
3149                 st->num++;
3150                 /* Move st->offset to the next sk in the bucket such that
3151                  * the future start() will resume at st->offset in
3152                  * st->bucket.  See tcp_seek_last_pos().
3153                  */
3154                 st->offset++;
3155                 sock_gen_put(iter->batch[iter->cur_sk++]);
3156         }
3157
3158         if (iter->cur_sk < iter->end_sk)
3159                 sk = iter->batch[iter->cur_sk];
3160         else
3161                 sk = bpf_iter_tcp_batch(seq);
3162
3163         ++*pos;
3164         /* Keeping st->last_pos consistent in tcp_iter_state.
3165          * bpf iter does not do lseek, so st->last_pos always equals to *pos.
3166          */
3167         st->last_pos = *pos;
3168         return sk;
3169 }
3170
3171 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
3172 {
3173         struct bpf_iter_meta meta;
3174         struct bpf_prog *prog;
3175         struct sock *sk = v;
3176         uid_t uid;
3177         int ret;
3178
3179         if (v == SEQ_START_TOKEN)
3180                 return 0;
3181
3182         if (sk_fullsock(sk))
3183                 lock_sock(sk);
3184
3185         if (unlikely(sk_unhashed(sk))) {
3186                 ret = SEQ_SKIP;
3187                 goto unlock;
3188         }
3189
3190         if (sk->sk_state == TCP_TIME_WAIT) {
3191                 uid = 0;
3192         } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
3193                 const struct request_sock *req = v;
3194
3195                 uid = from_kuid_munged(seq_user_ns(seq),
3196                                        sock_i_uid(req->rsk_listener));
3197         } else {
3198                 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3199         }
3200
3201         meta.seq = seq;
3202         prog = bpf_iter_get_info(&meta, false);
3203         ret = tcp_prog_seq_show(prog, &meta, v, uid);
3204
3205 unlock:
3206         if (sk_fullsock(sk))
3207                 release_sock(sk);
3208         return ret;
3209
3210 }
3211
3212 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
3213 {
3214         struct bpf_tcp_iter_state *iter = seq->private;
3215         struct bpf_iter_meta meta;
3216         struct bpf_prog *prog;
3217
3218         if (!v) {
3219                 meta.seq = seq;
3220                 prog = bpf_iter_get_info(&meta, true);
3221                 if (prog)
3222                         (void)tcp_prog_seq_show(prog, &meta, v, 0);
3223         }
3224
3225         if (iter->cur_sk < iter->end_sk) {
3226                 bpf_iter_tcp_put_batch(iter);
3227                 iter->st_bucket_done = false;
3228         }
3229 }
3230
3231 static const struct seq_operations bpf_iter_tcp_seq_ops = {
3232         .show           = bpf_iter_tcp_seq_show,
3233         .start          = bpf_iter_tcp_seq_start,
3234         .next           = bpf_iter_tcp_seq_next,
3235         .stop           = bpf_iter_tcp_seq_stop,
3236 };
3237 #endif
3238 static unsigned short seq_file_family(const struct seq_file *seq)
3239 {
3240         const struct tcp_seq_afinfo *afinfo;
3241
3242 #ifdef CONFIG_BPF_SYSCALL
3243         /* Iterated from bpf_iter.  Let the bpf prog to filter instead. */
3244         if (seq->op == &bpf_iter_tcp_seq_ops)
3245                 return AF_UNSPEC;
3246 #endif
3247
3248         /* Iterated from proc fs */
3249         afinfo = pde_data(file_inode(seq->file));
3250         return afinfo->family;
3251 }
3252
3253 static const struct seq_operations tcp4_seq_ops = {
3254         .show           = tcp4_seq_show,
3255         .start          = tcp_seq_start,
3256         .next           = tcp_seq_next,
3257         .stop           = tcp_seq_stop,
3258 };
3259
3260 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
3261         .family         = AF_INET,
3262 };
3263
3264 static int __net_init tcp4_proc_init_net(struct net *net)
3265 {
3266         if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
3267                         sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
3268                 return -ENOMEM;
3269         return 0;
3270 }
3271
3272 static void __net_exit tcp4_proc_exit_net(struct net *net)
3273 {
3274         remove_proc_entry("tcp", net->proc_net);
3275 }
3276
3277 static struct pernet_operations tcp4_net_ops = {
3278         .init = tcp4_proc_init_net,
3279         .exit = tcp4_proc_exit_net,
3280 };
3281
3282 int __init tcp4_proc_init(void)
3283 {
3284         return register_pernet_subsys(&tcp4_net_ops);
3285 }
3286
3287 void tcp4_proc_exit(void)
3288 {
3289         unregister_pernet_subsys(&tcp4_net_ops);
3290 }
3291 #endif /* CONFIG_PROC_FS */
3292
3293 /* @wake is one when sk_stream_write_space() calls us.
3294  * This sends EPOLLOUT only if notsent_bytes is half the limit.
3295  * This mimics the strategy used in sock_def_write_space().
3296  */
3297 bool tcp_stream_memory_free(const struct sock *sk, int wake)
3298 {
3299         const struct tcp_sock *tp = tcp_sk(sk);
3300         u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3301                             READ_ONCE(tp->snd_nxt);
3302
3303         return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3304 }
3305 EXPORT_SYMBOL(tcp_stream_memory_free);
3306
3307 struct proto tcp_prot = {
3308         .name                   = "TCP",
3309         .owner                  = THIS_MODULE,
3310         .close                  = tcp_close,
3311         .pre_connect            = tcp_v4_pre_connect,
3312         .connect                = tcp_v4_connect,
3313         .disconnect             = tcp_disconnect,
3314         .accept                 = inet_csk_accept,
3315         .ioctl                  = tcp_ioctl,
3316         .init                   = tcp_v4_init_sock,
3317         .destroy                = tcp_v4_destroy_sock,
3318         .shutdown               = tcp_shutdown,
3319         .setsockopt             = tcp_setsockopt,
3320         .getsockopt             = tcp_getsockopt,
3321         .bpf_bypass_getsockopt  = tcp_bpf_bypass_getsockopt,
3322         .keepalive              = tcp_set_keepalive,
3323         .recvmsg                = tcp_recvmsg,
3324         .sendmsg                = tcp_sendmsg,
3325         .splice_eof             = tcp_splice_eof,
3326         .backlog_rcv            = tcp_v4_do_rcv,
3327         .release_cb             = tcp_release_cb,
3328         .hash                   = inet_hash,
3329         .unhash                 = inet_unhash,
3330         .get_port               = inet_csk_get_port,
3331         .put_port               = inet_put_port,
3332 #ifdef CONFIG_BPF_SYSCALL
3333         .psock_update_sk_prot   = tcp_bpf_update_proto,
3334 #endif
3335         .enter_memory_pressure  = tcp_enter_memory_pressure,
3336         .leave_memory_pressure  = tcp_leave_memory_pressure,
3337         .stream_memory_free     = tcp_stream_memory_free,
3338         .sockets_allocated      = &tcp_sockets_allocated,
3339         .orphan_count           = &tcp_orphan_count,
3340
3341         .memory_allocated       = &tcp_memory_allocated,
3342         .per_cpu_fw_alloc       = &tcp_memory_per_cpu_fw_alloc,
3343
3344         .memory_pressure        = &tcp_memory_pressure,
3345         .sysctl_mem             = sysctl_tcp_mem,
3346         .sysctl_wmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_wmem),
3347         .sysctl_rmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_rmem),
3348         .max_header             = MAX_TCP_HEADER,
3349         .obj_size               = sizeof(struct tcp_sock),
3350         .slab_flags             = SLAB_TYPESAFE_BY_RCU,
3351         .twsk_prot              = &tcp_timewait_sock_ops,
3352         .rsk_prot               = &tcp_request_sock_ops,
3353         .h.hashinfo             = NULL,
3354         .no_autobind            = true,
3355         .diag_destroy           = tcp_abort,
3356 };
3357 EXPORT_SYMBOL(tcp_prot);
3358
3359 static void __net_exit tcp_sk_exit(struct net *net)
3360 {
3361         if (net->ipv4.tcp_congestion_control)
3362                 bpf_module_put(net->ipv4.tcp_congestion_control,
3363                                net->ipv4.tcp_congestion_control->owner);
3364 }
3365
3366 static void __net_init tcp_set_hashinfo(struct net *net)
3367 {
3368         struct inet_hashinfo *hinfo;
3369         unsigned int ehash_entries;
3370         struct net *old_net;
3371
3372         if (net_eq(net, &init_net))
3373                 goto fallback;
3374
3375         old_net = current->nsproxy->net_ns;
3376         ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries);
3377         if (!ehash_entries)
3378                 goto fallback;
3379
3380         ehash_entries = roundup_pow_of_two(ehash_entries);
3381         hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries);
3382         if (!hinfo) {
3383                 pr_warn("Failed to allocate TCP ehash (entries: %u) "
3384                         "for a netns, fallback to the global one\n",
3385                         ehash_entries);
3386 fallback:
3387                 hinfo = &tcp_hashinfo;
3388                 ehash_entries = tcp_hashinfo.ehash_mask + 1;
3389         }
3390
3391         net->ipv4.tcp_death_row.hashinfo = hinfo;
3392         net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2;
3393         net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128);
3394 }
3395
3396 static int __net_init tcp_sk_init(struct net *net)
3397 {
3398         net->ipv4.sysctl_tcp_ecn = 2;
3399         net->ipv4.sysctl_tcp_ecn_fallback = 1;
3400
3401         net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3402         net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3403         net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3404         net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3405         net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3406
3407         net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3408         net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3409         net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3410
3411         net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3412         net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3413         net->ipv4.sysctl_tcp_syncookies = 1;
3414         net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3415         net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3416         net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3417         net->ipv4.sysctl_tcp_orphan_retries = 0;
3418         net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3419         net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3420         net->ipv4.sysctl_tcp_tw_reuse = 2;
3421         net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3422
3423         refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);
3424         tcp_set_hashinfo(net);
3425
3426         net->ipv4.sysctl_tcp_sack = 1;
3427         net->ipv4.sysctl_tcp_window_scaling = 1;
3428         net->ipv4.sysctl_tcp_timestamps = 1;
3429         net->ipv4.sysctl_tcp_early_retrans = 3;
3430         net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3431         net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
3432         net->ipv4.sysctl_tcp_retrans_collapse = 1;
3433         net->ipv4.sysctl_tcp_max_reordering = 300;
3434         net->ipv4.sysctl_tcp_dsack = 1;
3435         net->ipv4.sysctl_tcp_app_win = 31;
3436         net->ipv4.sysctl_tcp_adv_win_scale = 1;
3437         net->ipv4.sysctl_tcp_frto = 2;
3438         net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3439         /* This limits the percentage of the congestion window which we
3440          * will allow a single TSO frame to consume.  Building TSO frames
3441          * which are too large can cause TCP streams to be bursty.
3442          */
3443         net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3444         /* Default TSQ limit of 16 TSO segments */
3445         net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
3446
3447         /* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */
3448         net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;
3449
3450         net->ipv4.sysctl_tcp_min_tso_segs = 2;
3451         net->ipv4.sysctl_tcp_tso_rtt_log = 9;  /* 2^9 = 512 usec */
3452         net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3453         net->ipv4.sysctl_tcp_autocorking = 1;
3454         net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3455         net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3456         net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3457         if (net != &init_net) {
3458                 memcpy(net->ipv4.sysctl_tcp_rmem,
3459                        init_net.ipv4.sysctl_tcp_rmem,
3460                        sizeof(init_net.ipv4.sysctl_tcp_rmem));
3461                 memcpy(net->ipv4.sysctl_tcp_wmem,
3462                        init_net.ipv4.sysctl_tcp_wmem,
3463                        sizeof(init_net.ipv4.sysctl_tcp_wmem));
3464         }
3465         net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3466         net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3467         net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3468         net->ipv4.sysctl_tcp_backlog_ack_defer = 1;
3469         net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3470         net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3471         atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3472
3473         /* Set default values for PLB */
3474         net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */
3475         net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3;
3476         net->ipv4.sysctl_tcp_plb_rehash_rounds = 12;
3477         net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60;
3478         /* Default congestion threshold for PLB to mark a round is 50% */
3479         net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2;
3480
3481         /* Reno is always built in */
3482         if (!net_eq(net, &init_net) &&
3483             bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3484                                init_net.ipv4.tcp_congestion_control->owner))
3485                 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3486         else
3487                 net->ipv4.tcp_congestion_control = &tcp_reno;
3488
3489         net->ipv4.sysctl_tcp_syn_linear_timeouts = 4;
3490         net->ipv4.sysctl_tcp_shrink_window = 0;
3491
3492         net->ipv4.sysctl_tcp_pingpong_thresh = 1;
3493
3494         return 0;
3495 }
3496
3497 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3498 {
3499         struct net *net;
3500
3501         tcp_twsk_purge(net_exit_list, AF_INET);
3502
3503         list_for_each_entry(net, net_exit_list, exit_list) {
3504                 inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo);
3505                 WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount));
3506                 tcp_fastopen_ctx_destroy(net);
3507         }
3508 }
3509
3510 static struct pernet_operations __net_initdata tcp_sk_ops = {
3511        .init       = tcp_sk_init,
3512        .exit       = tcp_sk_exit,
3513        .exit_batch = tcp_sk_exit_batch,
3514 };
3515
3516 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3517 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3518                      struct sock_common *sk_common, uid_t uid)
3519
3520 #define INIT_BATCH_SZ 16
3521
3522 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3523 {
3524         struct bpf_tcp_iter_state *iter = priv_data;
3525         int err;
3526
3527         err = bpf_iter_init_seq_net(priv_data, aux);
3528         if (err)
3529                 return err;
3530
3531         err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
3532         if (err) {
3533                 bpf_iter_fini_seq_net(priv_data);
3534                 return err;
3535         }
3536
3537         return 0;
3538 }
3539
3540 static void bpf_iter_fini_tcp(void *priv_data)
3541 {
3542         struct bpf_tcp_iter_state *iter = priv_data;
3543
3544         bpf_iter_fini_seq_net(priv_data);
3545         kvfree(iter->batch);
3546 }
3547
3548 static const struct bpf_iter_seq_info tcp_seq_info = {
3549         .seq_ops                = &bpf_iter_tcp_seq_ops,
3550         .init_seq_private       = bpf_iter_init_tcp,
3551         .fini_seq_private       = bpf_iter_fini_tcp,
3552         .seq_priv_size          = sizeof(struct bpf_tcp_iter_state),
3553 };
3554
3555 static const struct bpf_func_proto *
3556 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3557                             const struct bpf_prog *prog)
3558 {
3559         switch (func_id) {
3560         case BPF_FUNC_setsockopt:
3561                 return &bpf_sk_setsockopt_proto;
3562         case BPF_FUNC_getsockopt:
3563                 return &bpf_sk_getsockopt_proto;
3564         default:
3565                 return NULL;
3566         }
3567 }
3568
3569 static struct bpf_iter_reg tcp_reg_info = {
3570         .target                 = "tcp",
3571         .ctx_arg_info_size      = 1,
3572         .ctx_arg_info           = {
3573                 { offsetof(struct bpf_iter__tcp, sk_common),
3574                   PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
3575         },
3576         .get_func_proto         = bpf_iter_tcp_get_func_proto,
3577         .seq_info               = &tcp_seq_info,
3578 };
3579
3580 static void __init bpf_iter_register(void)
3581 {
3582         tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3583         if (bpf_iter_reg_target(&tcp_reg_info))
3584                 pr_warn("Warning: could not register bpf iterator tcp\n");
3585 }
3586
3587 #endif
3588
3589 void __init tcp_v4_init(void)
3590 {
3591         int cpu, res;
3592
3593         for_each_possible_cpu(cpu) {
3594                 struct sock *sk;
3595
3596                 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3597                                            IPPROTO_TCP, &init_net);
3598                 if (res)
3599                         panic("Failed to create the TCP control socket.\n");
3600                 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3601
3602                 /* Please enforce IP_DF and IPID==0 for RST and
3603                  * ACK sent in SYN-RECV and TIME-WAIT state.
3604                  */
3605                 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3606
3607                 per_cpu(ipv4_tcp_sk, cpu) = sk;
3608         }
3609         if (register_pernet_subsys(&tcp_sk_ops))
3610                 panic("Failed to create the TCP control socket.\n");
3611
3612 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3613         bpf_iter_register();
3614 #endif
3615 }