1 // SPDX-License-Identifier: GPL-2.0-or-later
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
7 * Implementation of the Transmission Control Protocol(TCP).
9 * IPv4 specific functions
13 * linux/ipv4/tcp_input.c
14 * linux/ipv4/tcp_output.c
16 * See tcp.c for author information
21 * David S. Miller : New socket lookup architecture.
22 * This code is dedicated to John Dyson.
23 * David S. Miller : Change semantics of established hash,
24 * half is devoted to TIME_WAIT sockets
25 * and the rest go in the other half.
26 * Andi Kleen : Add support for syncookies and fixed
27 * some bugs: ip options weren't passed to
28 * the TCP layer, missed a check for an
30 * Andi Kleen : Implemented fast path mtu discovery.
31 * Fixed many serious bugs in the
32 * request_sock handling and moved
33 * most of it into the af independent code.
34 * Added tail drop and some other bugfixes.
35 * Added new listen semantics.
36 * Mike McLagan : Routing by source
37 * Juan Jose Ciarlante: ip_dynaddr bits
38 * Andi Kleen: various fixes.
39 * Vitaly E. Lavrov : Transparent proxy revived after year
41 * Andi Kleen : Fix new listen.
42 * Andi Kleen : Fix accept error reporting.
43 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
44 * Alexey Kuznetsov allows both IPv4 and IPv6 sockets to bind
45 * a single port at the same time.
48 #define pr_fmt(fmt) "TCP: " fmt
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
61 #include <net/net_namespace.h>
63 #include <net/inet_hashtables.h>
65 #include <net/transp_v6.h>
67 #include <net/inet_common.h>
68 #include <net/timewait_sock.h>
70 #include <net/secure_seq.h>
71 #include <net/busy_poll.h>
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/inetdevice.h>
79 #include <linux/btf_ids.h>
81 #include <crypto/hash.h>
82 #include <linux/scatterlist.h>
84 #include <trace/events/tcp.h>
86 #ifdef CONFIG_TCP_MD5SIG
87 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
88 __be32 daddr, __be32 saddr, const struct tcphdr *th);
91 struct inet_hashinfo tcp_hashinfo;
92 EXPORT_SYMBOL(tcp_hashinfo);
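/* Per-CPU kernel control socket used by tcp_v4_send_reset() and
 * tcp_v4_send_ack() below to emit replies (RSTs and out-of-socket ACKs)
 * without needing a full socket context.
 */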
94 static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);
96 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
98 return secure_tcp_seq(ip_hdr(skb)->daddr,
101 tcp_hdr(skb)->source);
104 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
106 return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
109 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
111 int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
112 const struct inet_timewait_sock *tw = inet_twsk(sktw);
113 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
114 struct tcp_sock *tp = tcp_sk(sk);
117 /* Still does not detect *everything* that goes through
118 * lo, since we require a loopback src or dst address
119 * or direct binding to 'lo' interface.
121 bool loopback = false;
122 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
124 #if IS_ENABLED(CONFIG_IPV6)
125 if (tw->tw_family == AF_INET6) {
126 if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
127 ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
128 ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
129 ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
134 if (ipv4_is_loopback(tw->tw_daddr) ||
135 ipv4_is_loopback(tw->tw_rcv_saddr))
142 /* With PAWS, it is safe from the viewpoint
143 of data integrity. Even without PAWS it is safe provided the sequence
144 spaces do not overlap, i.e. at data rates <= 80Mbit/sec.
146 Actually, the idea is close to VJ's: only the timestamp cache is held
147 not per host but per port pair, and the TW bucket is used as the state holder.
150 If the TW bucket has already been destroyed we fall back to VJ's scheme
151 and use the initial timestamp retrieved from the peer table.
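/* Reuse of a TIME-WAIT port pair for an outgoing connection is gated by the
 * net.ipv4.tcp_tw_reuse sysctl (read into 'reuse' above); the timestamp
 * comparison below is what proves old duplicate segments cannot be mistaken
 * for segments of the new connection.
 */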
153 if (tcptw->tw_ts_recent_stamp &&
154 (!twp || (reuse && time_after32(ktime_get_seconds(),
155 tcptw->tw_ts_recent_stamp)))) {
156 /* In case of repair and re-using TIME-WAIT sockets we still
157 * want to be sure that it is safe as above but honor the
158 * sequence numbers and time stamps set as part of the repair
161 * Without this check re-using a TIME-WAIT socket with TCP
162 * repair would accumulate a -1 on the repair assigned
163 * sequence number. The first time it is reused the sequence
164 * is -1, the second time -2, etc. This fixes that issue
165 * without appearing to create any others.
167 if (likely(!tp->repair)) {
168 u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
172 WRITE_ONCE(tp->write_seq, seq);
173 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
174 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
182 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
184 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
187 /* This check is replicated from tcp_v4_connect() and is intended to
188 * prevent the BPF program called below from accessing bytes that are
189 * outside the bound specified by the user in addr_len.
191 if (addr_len < sizeof(struct sockaddr_in))
194 sock_owned_by_me(sk);
196 return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
199 /* This will initiate an outgoing connection. */
200 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
202 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
203 struct inet_timewait_death_row *tcp_death_row;
204 struct inet_sock *inet = inet_sk(sk);
205 struct tcp_sock *tp = tcp_sk(sk);
206 struct ip_options_rcu *inet_opt;
207 struct net *net = sock_net(sk);
208 __be16 orig_sport, orig_dport;
209 __be32 daddr, nexthop;
214 if (addr_len < sizeof(struct sockaddr_in))
217 if (usin->sin_family != AF_INET)
218 return -EAFNOSUPPORT;
220 nexthop = daddr = usin->sin_addr.s_addr;
221 inet_opt = rcu_dereference_protected(inet->inet_opt,
222 lockdep_sock_is_held(sk));
223 if (inet_opt && inet_opt->opt.srr) {
226 nexthop = inet_opt->opt.faddr;
229 orig_sport = inet->inet_sport;
230 orig_dport = usin->sin_port;
231 fl4 = &inet->cork.fl.u.ip4;
232 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
233 sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
237 if (err == -ENETUNREACH)
238 IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
242 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
247 if (!inet_opt || !inet_opt->opt.srr)
250 tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
252 if (!inet->inet_saddr) {
253 err = inet_bhash2_update_saddr(sk, &fl4->saddr, AF_INET);
259 sk_rcv_saddr_set(sk, inet->inet_saddr);
262 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
263 /* Reset inherited state */
264 tp->rx_opt.ts_recent = 0;
265 tp->rx_opt.ts_recent_stamp = 0;
266 if (likely(!tp->repair))
267 WRITE_ONCE(tp->write_seq, 0);
270 inet->inet_dport = usin->sin_port;
271 sk_daddr_set(sk, daddr);
273 inet_csk(sk)->icsk_ext_hdr_len = 0;
275 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
277 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
279 /* Socket identity is still unknown (sport may be zero).
280 * However we set state to SYN-SENT and, without releasing the socket
281 * lock, select a source port, enter ourselves into the hash tables and
282 * complete initialization after this.
284 tcp_set_state(sk, TCP_SYN_SENT);
285 err = inet_hash_connect(tcp_death_row, sk);
291 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
292 inet->inet_sport, inet->inet_dport, sk);
298 /* OK, now commit destination to socket. */
299 sk->sk_gso_type = SKB_GSO_TCPV4;
300 sk_setup_caps(sk, &rt->dst);
303 if (likely(!tp->repair)) {
305 WRITE_ONCE(tp->write_seq,
306 secure_tcp_seq(inet->inet_saddr,
310 WRITE_ONCE(tp->tsoffset,
311 secure_tcp_ts_off(net, inet->inet_saddr,
315 atomic_set(&inet->inet_id, get_random_u16());
317 if (tcp_fastopen_defer_connect(sk, &err))
322 err = tcp_connect(sk);
331 * This unhashes the socket and releases the local port,
334 tcp_set_state(sk, TCP_CLOSE);
335 inet_bhash2_reset_saddr(sk);
337 sk->sk_route_caps = 0;
338 inet->inet_dport = 0;
341 EXPORT_SYMBOL(tcp_v4_connect);
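/* A userspace connect() on a TCP socket reaches tcp_v4_connect() via
 * inet_stream_connect() -> sk->sk_prot->connect(); tcp_v4_pre_connect()
 * above runs first so that BPF cgroup/connect4 programs may rewrite the
 * destination address before the route is looked up.
 */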
344 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC 1191.
345 * It can be called through tcp_release_cb() if the socket was owned by the user
346 * at the time tcp_v4_err() was called to handle the ICMP message.
348 void tcp_v4_mtu_reduced(struct sock *sk)
350 struct inet_sock *inet = inet_sk(sk);
351 struct dst_entry *dst;
354 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
356 mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
357 dst = inet_csk_update_pmtu(sk, mtu);
361 /* Something is about to go wrong... Remember the soft error
362 * for the case where this connection is not able to recover.
364 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
365 sk->sk_err_soft = EMSGSIZE;
369 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
370 ip_sk_accept_pmtu(sk) &&
371 inet_csk(sk)->icsk_pmtu_cookie > mtu) {
372 tcp_sync_mss(sk, mtu);
374 /* Resend the TCP packet because it's
375 * clear that the old packet has been
376 * dropped. This is the new "fast" path mtu discovery.
379 tcp_simple_retransmit(sk);
380 } /* else let the usual retransmit timer handle it */
382 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
384 static void do_redirect(struct sk_buff *skb, struct sock *sk)
386 struct dst_entry *dst = __sk_dst_check(sk, 0);
389 dst->ops->redirect(dst, sk, skb);
393 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
394 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
396 struct request_sock *req = inet_reqsk(sk);
397 struct net *net = sock_net(sk);
399 /* ICMPs are not backlogged, hence we cannot get
400 * an established socket here.
402 if (seq != tcp_rsk(req)->snt_isn) {
403 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
406 * Still in SYN_RECV, just remove it silently.
407 * There is no good way to pass the error to the newly
408 * created socket, and POSIX does not want network
409 * errors returned from accept().
411 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
412 tcp_listendrop(req->rsk_listener);
416 EXPORT_SYMBOL(tcp_req_err);
418 /* TCP-LD (RFC 6069) logic */
419 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
421 struct inet_connection_sock *icsk = inet_csk(sk);
422 struct tcp_sock *tp = tcp_sk(sk);
427 if (sock_owned_by_user(sk))
430 if (seq != tp->snd_una || !icsk->icsk_retransmits ||
434 skb = tcp_rtx_queue_head(sk);
435 if (WARN_ON_ONCE(!skb))
438 icsk->icsk_backoff--;
439 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
440 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
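/* RFC 6069: one ICMP that indicates the path is actually working undoes
 * exactly one backoff step - recompute the base RTO from SRTT, then
 * re-apply the (now reduced) exponential backoff.
 */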
442 tcp_mstamp_refresh(tp);
443 delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
444 remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
447 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
448 remaining, TCP_RTO_MAX);
450 /* RTO revert clocked out retransmission.
451 * Will retransmit now.
453 tcp_retransmit_timer(sk);
456 EXPORT_SYMBOL(tcp_ld_RTO_revert);
459 * This routine is called by the ICMP module when it gets some
460 * sort of error condition. If err < 0 then the socket should
461 * be closed and the error returned to the user. If err > 0
462 * it's just the icmp type << 8 | icmp code. After adjustment
463 * header points to the first 8 bytes of the tcp header. We need
464 * to find the appropriate port.
466 * The locking strategy used here is very "optimistic". When
467 * someone else accesses the socket the ICMP is just dropped
468 * and for some paths there is no check at all.
469 * A more general error queue to queue errors for later handling
470 * is probably better.
474 int tcp_v4_err(struct sk_buff *skb, u32 info)
476 const struct iphdr *iph = (const struct iphdr *)skb->data;
477 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
479 struct inet_sock *inet;
480 const int type = icmp_hdr(skb)->type;
481 const int code = icmp_hdr(skb)->code;
483 struct request_sock *fastopen;
486 struct net *net = dev_net(skb->dev);
488 sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
489 iph->daddr, th->dest, iph->saddr,
490 ntohs(th->source), inet_iif(skb), 0);
492 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
495 if (sk->sk_state == TCP_TIME_WAIT) {
496 inet_twsk_put(inet_twsk(sk));
499 seq = ntohl(th->seq);
500 if (sk->sk_state == TCP_NEW_SYN_RECV) {
501 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
502 type == ICMP_TIME_EXCEEDED ||
503 (type == ICMP_DEST_UNREACH &&
504 (code == ICMP_NET_UNREACH ||
505 code == ICMP_HOST_UNREACH)));
510 /* If too many ICMPs get dropped on busy
511 * servers this needs to be solved differently.
512 * We do take care of the PMTU discovery (RFC 1191) special case:
513 * we can receive locally generated ICMP messages while the socket is held.
515 if (sock_owned_by_user(sk)) {
516 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
517 __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
519 if (sk->sk_state == TCP_CLOSE)
522 if (static_branch_unlikely(&ip4_min_ttl)) {
523 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
524 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
525 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
531 /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
532 fastopen = rcu_dereference(tp->fastopen_rsk);
533 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
534 if (sk->sk_state != TCP_LISTEN &&
535 !between(seq, snd_una, tp->snd_nxt)) {
536 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
542 if (!sock_owned_by_user(sk))
543 do_redirect(skb, sk);
545 case ICMP_SOURCE_QUENCH:
546 /* Just silently ignore these. */
548 case ICMP_PARAMETERPROB:
551 case ICMP_DEST_UNREACH:
552 if (code > NR_ICMP_UNREACH)
555 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
556 /* We are not interested in TCP_LISTEN and open_requests
557 * (SYN-ACKs sent out by Linux are always < 576 bytes so
558 * they should go through unfragmented).
560 if (sk->sk_state == TCP_LISTEN)
563 WRITE_ONCE(tp->mtu_info, info);
564 if (!sock_owned_by_user(sk)) {
565 tcp_v4_mtu_reduced(sk);
567 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
573 err = icmp_err_convert[code].errno;
574 /* check if this ICMP message allows revert of backoff.
578 (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
579 tcp_ld_RTO_revert(sk, seq);
581 case ICMP_TIME_EXCEEDED:
588 switch (sk->sk_state) {
591 /* Only in fast or simultaneous open. If a fast open socket is
592 * already accepted it is treated as a connected one below.
594 if (fastopen && !fastopen->sk)
597 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
599 if (!sock_owned_by_user(sk)) {
606 sk->sk_err_soft = err;
611 /* If we've already connected we will keep trying
612 * until we time out, or the user gives up.
614 * RFC 1122 4.2.3.9 allows only PROTO_UNREACH and PORT_UNREACH to be
615 * considered hard errors (well, FRAG_FAILED too,
616 * but it is obsoleted by PMTU discovery).
618 * Note that in the modern internet, where routing is unreliable
619 * and broken firewalls sit in every dark corner sending random
620 * errors ordered by their masters, even these two messages finally lose
621 * their original sense (even Linux sends invalid PORT_UNREACHs).
623 * Now we are in compliance with the RFCs.
628 if (!sock_owned_by_user(sk) && inet->recverr) {
631 } else { /* Only an error on timeout */
632 sk->sk_err_soft = err;
641 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
643 struct tcphdr *th = tcp_hdr(skb);
645 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
646 skb->csum_start = skb_transport_header(skb) - skb->head;
647 skb->csum_offset = offsetof(struct tcphdr, check);
650 /* This routine computes an IPv4 TCP checksum. */
651 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
653 const struct inet_sock *inet = inet_sk(sk);
655 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
657 EXPORT_SYMBOL(tcp_v4_send_check);
660 * This routine will send an RST to the other tcp.
662 * Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
664 * Answer: if a packet caused the RST, it is not for a socket
665 * existing in our system; if it does match a socket,
666 * it is just a duplicate segment or a bug in the other side's TCP.
667 * So we build the reply based only on the parameters
668 * that arrived with the segment.
669 * Exception: precedence violation. We do not implement it in any case.
672 #ifdef CONFIG_TCP_MD5SIG
673 #define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
675 #define OPTION_BYTES sizeof(__be32)
678 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
680 const struct tcphdr *th = tcp_hdr(skb);
683 __be32 opt[OPTION_BYTES / sizeof(__be32)];
685 struct ip_reply_arg arg;
686 #ifdef CONFIG_TCP_MD5SIG
687 struct tcp_md5sig_key *key = NULL;
688 const __u8 *hash_location = NULL;
689 unsigned char newhash[16];
691 struct sock *sk1 = NULL;
693 u64 transmit_time = 0;
698 /* Never send a reset in response to a reset. */
702 /* If sk is not NULL, it means we did a successful lookup and the incoming
703 * route had to be correct. prequeue might have dropped our dst.
705 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
708 /* Swap the send and the receive. */
709 memset(&rep, 0, sizeof(rep));
710 rep.th.dest = th->source;
711 rep.th.source = th->dest;
712 rep.th.doff = sizeof(struct tcphdr) / 4;
716 rep.th.seq = th->ack_seq;
719 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
720 skb->len - (th->doff << 2));
723 memset(&arg, 0, sizeof(arg));
724 arg.iov[0].iov_base = (unsigned char *)&rep;
725 arg.iov[0].iov_len = sizeof(rep.th);
727 net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
728 #ifdef CONFIG_TCP_MD5SIG
730 hash_location = tcp_parse_md5sig_option(th);
731 if (sk && sk_fullsock(sk)) {
732 const union tcp_md5_addr *addr;
735 /* sdif set means the packet ingressed via a device
736 * in an L3 domain and inet_iif is set to it.
738 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
739 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
740 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
741 } else if (hash_location) {
742 const union tcp_md5_addr *addr;
743 int sdif = tcp_v4_sdif(skb);
744 int dif = inet_iif(skb);
748 * active side is lost. Try to find the listening socket through the
749 * source port, and then find the md5 key through the listening socket.
750 * We are not losing security here:
751 * the incoming packet is checked against the md5 hash of the found key;
752 * no RST is generated if the md5 hash doesn't match.
754 sk1 = __inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo,
755 NULL, 0, ip_hdr(skb)->saddr,
756 th->source, ip_hdr(skb)->daddr,
757 ntohs(th->source), dif, sdif);
758 /* don't send rst if it can't find key */
762 /* sdif set means the packet ingressed via a device
763 * in an L3 domain and dif is set to it.
765 l3index = sdif ? dif : 0;
766 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
767 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
772 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
773 if (genhash || memcmp(hash_location, newhash, 16) != 0)
779 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
781 (TCPOPT_MD5SIG << 8) |
783 /* Update length and the length the header thinks exists */
784 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
785 rep.th.doff = arg.iov[0].iov_len / 4;
787 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
788 key, ip_hdr(skb)->saddr,
789 ip_hdr(skb)->daddr, &rep.th);
792 /* Can't co-exist with TCPMD5, hence check rep.opt[0] */
793 if (rep.opt[0] == 0) {
794 __be32 mrst = mptcp_reset_option(skb);
798 arg.iov[0].iov_len += sizeof(mrst);
799 rep.th.doff = arg.iov[0].iov_len / 4;
803 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
804 ip_hdr(skb)->saddr, /* XXX */
805 arg.iov[0].iov_len, IPPROTO_TCP, 0);
806 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
807 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
809 /* When the socket is gone, all binding information is lost.
810 * Routing might fail in this case. No choice here: if we choose to force the
811 * input interface, we will misroute in case of an asymmetric route.
814 arg.bound_dev_if = sk->sk_bound_dev_if;
816 trace_tcp_send_reset(sk, skb);
819 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
820 offsetof(struct inet_timewait_sock, tw_bound_dev_if));
822 arg.tos = ip_hdr(skb)->tos;
823 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
825 ctl_sk = this_cpu_read(ipv4_tcp_sk);
826 sock_net_set(ctl_sk, net);
828 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
829 inet_twsk(sk)->tw_mark : sk->sk_mark;
830 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
831 inet_twsk(sk)->tw_priority : sk->sk_priority;
832 transmit_time = tcp_transmit_time(sk);
833 xfrm_sk_clone_policy(ctl_sk, sk);
834 txhash = (sk->sk_state == TCP_TIME_WAIT) ?
835 inet_twsk(sk)->tw_txhash : sk->sk_txhash;
838 ctl_sk->sk_priority = 0;
840 ip_send_unicast_reply(ctl_sk,
841 skb, &TCP_SKB_CB(skb)->header.h4.opt,
842 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
843 &arg, arg.iov[0].iov_len,
844 transmit_time, txhash);
846 xfrm_sk_free_policy(ctl_sk);
847 sock_net_set(ctl_sk, &init_net);
848 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
849 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
852 #ifdef CONFIG_TCP_MD5SIG
858 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
859 outside of socket context, is certainly ugly. What can I do?
862 static void tcp_v4_send_ack(const struct sock *sk,
863 struct sk_buff *skb, u32 seq, u32 ack,
864 u32 win, u32 tsval, u32 tsecr, int oif,
865 struct tcp_md5sig_key *key,
866 int reply_flags, u8 tos, u32 txhash)
868 const struct tcphdr *th = tcp_hdr(skb);
871 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
872 #ifdef CONFIG_TCP_MD5SIG
873 + (TCPOLEN_MD5SIG_ALIGNED >> 2)
877 struct net *net = sock_net(sk);
878 struct ip_reply_arg arg;
882 memset(&rep.th, 0, sizeof(struct tcphdr));
883 memset(&arg, 0, sizeof(arg));
885 arg.iov[0].iov_base = (unsigned char *)&rep;
886 arg.iov[0].iov_len = sizeof(rep.th);
888 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
889 (TCPOPT_TIMESTAMP << 8) |
891 rep.opt[1] = htonl(tsval);
892 rep.opt[2] = htonl(tsecr);
893 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
896 /* Swap the send and the receive. */
897 rep.th.dest = th->source;
898 rep.th.source = th->dest;
899 rep.th.doff = arg.iov[0].iov_len / 4;
900 rep.th.seq = htonl(seq);
901 rep.th.ack_seq = htonl(ack);
903 rep.th.window = htons(win);
905 #ifdef CONFIG_TCP_MD5SIG
907 int offset = (tsecr) ? 3 : 0;
909 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
911 (TCPOPT_MD5SIG << 8) |
913 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
914 rep.th.doff = arg.iov[0].iov_len/4;
916 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
917 key, ip_hdr(skb)->saddr,
918 ip_hdr(skb)->daddr, &rep.th);
921 arg.flags = reply_flags;
922 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
923 ip_hdr(skb)->saddr, /* XXX */
924 arg.iov[0].iov_len, IPPROTO_TCP, 0);
925 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
927 arg.bound_dev_if = oif;
929 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
931 ctl_sk = this_cpu_read(ipv4_tcp_sk);
932 sock_net_set(ctl_sk, net);
933 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
934 inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
935 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
936 inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
937 transmit_time = tcp_transmit_time(sk);
938 ip_send_unicast_reply(ctl_sk,
939 skb, &TCP_SKB_CB(skb)->header.h4.opt,
940 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
941 &arg, arg.iov[0].iov_len,
942 transmit_time, txhash);
944 sock_net_set(ctl_sk, &init_net);
945 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
949 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
951 struct inet_timewait_sock *tw = inet_twsk(sk);
952 struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
954 tcp_v4_send_ack(sk, skb,
955 tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
956 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
957 tcp_time_stamp_raw() + tcptw->tw_ts_offset,
960 tcp_twsk_md5_key(tcptw),
961 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
969 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
970 struct request_sock *req)
972 const union tcp_md5_addr *addr;
975 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
976 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
978 u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
982 * The window field (SEG.WND) of every outgoing segment, with the
983 * exception of <SYN> segments, MUST be right-shifted by
984 * Rcv.Wind.Shift bits:
986 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
987 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
988 tcp_v4_send_ack(sk, skb, seq,
989 tcp_rsk(req)->rcv_nxt,
990 req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
991 tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
992 READ_ONCE(req->ts_recent),
994 tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
995 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
997 READ_ONCE(tcp_rsk(req)->txhash));
1001 * Send a SYN-ACK after having received a SYN.
1002 * This still operates on a request_sock only, not on a big
1005 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
1007 struct request_sock *req,
1008 struct tcp_fastopen_cookie *foc,
1009 enum tcp_synack_type synack_type,
1010 struct sk_buff *syn_skb)
1012 const struct inet_request_sock *ireq = inet_rsk(req);
1015 struct sk_buff *skb;
1018 /* First, grab a route. */
1019 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1022 skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1025 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1027 tos = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ?
1028 (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1029 (inet_sk(sk)->tos & INET_ECN_MASK) :
1032 if (!INET_ECN_is_capable(tos) &&
1033 tcp_bpf_ca_needs_ecn((struct sock *)req))
1034 tos |= INET_ECN_ECT_0;
1037 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1039 rcu_dereference(ireq->ireq_opt),
1042 err = net_xmit_eval(err);
1049 * IPv4 request_sock destructor.
1051 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1053 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1056 #ifdef CONFIG_TCP_MD5SIG
1058 * RFC2385 MD5 checksumming requires a mapping of
1059 * IP address->MD5 Key.
1060 * We need to maintain these in the sk structure.
1063 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
1064 EXPORT_SYMBOL(tcp_md5_needed);
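/* tcp_md5_needed is a static branch, so sockets that never configure a
 * TCP-MD5 key pay no lookup cost on the fast path; it is flipped on when the
 * first key is installed via the TCP_MD5SIG/TCP_MD5SIG_EXT socket options.
 */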
1066 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1071 /* l3index always overrides non-l3index */
1072 if (old->l3index && new->l3index == 0)
1074 if (old->l3index == 0 && new->l3index)
1077 return old->prefixlen < new->prefixlen;
1080 /* Find the Key structure for an address. */
1081 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1082 const union tcp_md5_addr *addr,
1085 const struct tcp_sock *tp = tcp_sk(sk);
1086 struct tcp_md5sig_key *key;
1087 const struct tcp_md5sig_info *md5sig;
1089 struct tcp_md5sig_key *best_match = NULL;
1092 /* caller either holds rcu_read_lock() or socket lock */
1093 md5sig = rcu_dereference_check(tp->md5sig_info,
1094 lockdep_sock_is_held(sk));
1098 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1099 lockdep_sock_is_held(sk)) {
1100 if (key->family != family)
1102 if (key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index)
1104 if (family == AF_INET) {
1105 mask = inet_make_mask(key->prefixlen);
1106 match = (key->addr.a4.s_addr & mask) ==
1107 (addr->a4.s_addr & mask);
1108 #if IS_ENABLED(CONFIG_IPV6)
1109 } else if (family == AF_INET6) {
1110 match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1117 if (match && better_md5_match(best_match, key))
1122 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1124 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1125 const union tcp_md5_addr *addr,
1126 int family, u8 prefixlen,
1127 int l3index, u8 flags)
1129 const struct tcp_sock *tp = tcp_sk(sk);
1130 struct tcp_md5sig_key *key;
1131 unsigned int size = sizeof(struct in_addr);
1132 const struct tcp_md5sig_info *md5sig;
1134 /* caller either holds rcu_read_lock() or socket lock */
1135 md5sig = rcu_dereference_check(tp->md5sig_info,
1136 lockdep_sock_is_held(sk));
1139 #if IS_ENABLED(CONFIG_IPV6)
1140 if (family == AF_INET6)
1141 size = sizeof(struct in6_addr);
1143 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1144 lockdep_sock_is_held(sk)) {
1145 if (key->family != family)
1147 if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1149 if (key->l3index != l3index)
1151 if (!memcmp(&key->addr, addr, size) &&
1152 key->prefixlen == prefixlen)
1158 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1159 const struct sock *addr_sk)
1161 const union tcp_md5_addr *addr;
1164 l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1165 addr_sk->sk_bound_dev_if);
1166 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1167 return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1169 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1171 /* This can be called on a newly created socket, from other files */
1172 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1173 int family, u8 prefixlen, int l3index, u8 flags,
1174 const u8 *newkey, u8 newkeylen, gfp_t gfp)
1176 /* Add Key to the list */
1177 struct tcp_md5sig_key *key;
1178 struct tcp_sock *tp = tcp_sk(sk);
1179 struct tcp_md5sig_info *md5sig;
1181 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1183 /* Pre-existing entry - just update that one.
1184 * Note that the key might be used concurrently.
1185 * data_race() is telling KCSAN that we do not care about
1186 * key mismatches, since changing the MD5 key on live flows
1187 * can lead to packet drops.
1189 data_race(memcpy(key->key, newkey, newkeylen));
1191 /* Pairs with READ_ONCE() in tcp_md5_hash_key().
1192 * Also note that a reader could catch the new key->keylen value
1193 * but the old key->key[]; this is the reason we use __GFP_ZERO
1194 * at sock_kmalloc() time below these lines.
1196 WRITE_ONCE(key->keylen, newkeylen);
1201 md5sig = rcu_dereference_protected(tp->md5sig_info,
1202 lockdep_sock_is_held(sk));
1204 md5sig = kmalloc(sizeof(*md5sig), gfp);
1209 INIT_HLIST_HEAD(&md5sig->head);
1210 rcu_assign_pointer(tp->md5sig_info, md5sig);
1213 key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1216 if (!tcp_alloc_md5sig_pool()) {
1217 sock_kfree_s(sk, key, sizeof(*key));
1221 memcpy(key->key, newkey, newkeylen);
1222 key->keylen = newkeylen;
1223 key->family = family;
1224 key->prefixlen = prefixlen;
1225 key->l3index = l3index;
1227 memcpy(&key->addr, addr,
1228 (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
1229 sizeof(struct in_addr));
1230 hlist_add_head_rcu(&key->node, &md5sig->head);
1233 EXPORT_SYMBOL(tcp_md5_do_add);
1235 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1236 u8 prefixlen, int l3index, u8 flags)
1238 struct tcp_md5sig_key *key;
1240 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1243 hlist_del_rcu(&key->node);
1244 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1245 kfree_rcu(key, rcu);
1248 EXPORT_SYMBOL(tcp_md5_do_del);
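/* Userspace configuration sketch (illustrative only; fd, peer, secret and
 * secret_len are placeholders): keys handled by tcp_md5_do_add()/del() above
 * are normally installed with the TCP_MD5SIG socket option, e.g.
 *
 *	struct tcp_md5sig md5 = { .tcpm_keylen = secret_len };
 *
 *	memcpy(&md5.tcpm_addr, &peer, sizeof(peer));
 *	memcpy(md5.tcpm_key, secret, secret_len);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * which reaches tcp_v4_parse_md5_keys() below.
 */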
1250 static void tcp_clear_md5_list(struct sock *sk)
1252 struct tcp_sock *tp = tcp_sk(sk);
1253 struct tcp_md5sig_key *key;
1254 struct hlist_node *n;
1255 struct tcp_md5sig_info *md5sig;
1257 md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1259 hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1260 hlist_del_rcu(&key->node);
1261 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1262 kfree_rcu(key, rcu);
1266 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1267 sockptr_t optval, int optlen)
1269 struct tcp_md5sig cmd;
1270 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1271 const union tcp_md5_addr *addr;
1276 if (optlen < sizeof(cmd))
1279 if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1282 if (sin->sin_family != AF_INET)
1285 flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1287 if (optname == TCP_MD5SIG_EXT &&
1288 cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1289 prefixlen = cmd.tcpm_prefixlen;
1294 if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1295 cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1296 struct net_device *dev;
1299 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1300 if (dev && netif_is_l3_master(dev))
1301 l3index = dev->ifindex;
1305 /* ok to reference set/not set outside of rcu;
1306 * right now device MUST be an L3 master
1308 if (!dev || !l3index)
1312 addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1314 if (!cmd.tcpm_keylen)
1315 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1317 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1320 return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1321 cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1324 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1325 __be32 daddr, __be32 saddr,
1326 const struct tcphdr *th, int nbytes)
1328 struct tcp4_pseudohdr *bp;
1329 struct scatterlist sg;
1336 bp->protocol = IPPROTO_TCP;
1337 bp->len = cpu_to_be16(nbytes);
1339 _th = (struct tcphdr *)(bp + 1);
1340 memcpy(_th, th, sizeof(*th));
1343 sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1344 ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1345 sizeof(*bp) + sizeof(*th));
1346 return crypto_ahash_update(hp->md5_req);
1349 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1350 __be32 daddr, __be32 saddr, const struct tcphdr *th)
1352 struct tcp_md5sig_pool *hp;
1353 struct ahash_request *req;
1355 hp = tcp_get_md5sig_pool();
1357 goto clear_hash_noput;
1360 if (crypto_ahash_init(req))
1362 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1364 if (tcp_md5_hash_key(hp, key))
1366 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1367 if (crypto_ahash_final(req))
1370 tcp_put_md5sig_pool();
1374 tcp_put_md5sig_pool();
1376 memset(md5_hash, 0, 16);
1380 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1381 const struct sock *sk,
1382 const struct sk_buff *skb)
1384 struct tcp_md5sig_pool *hp;
1385 struct ahash_request *req;
1386 const struct tcphdr *th = tcp_hdr(skb);
1387 __be32 saddr, daddr;
1389 if (sk) { /* valid for establish/request sockets */
1390 saddr = sk->sk_rcv_saddr;
1391 daddr = sk->sk_daddr;
1393 const struct iphdr *iph = ip_hdr(skb);
1398 hp = tcp_get_md5sig_pool();
1400 goto clear_hash_noput;
1403 if (crypto_ahash_init(req))
1406 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1408 if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1410 if (tcp_md5_hash_key(hp, key))
1412 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1413 if (crypto_ahash_final(req))
1416 tcp_put_md5sig_pool();
1420 tcp_put_md5sig_pool();
1422 memset(md5_hash, 0, 16);
1425 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1429 static void tcp_v4_init_req(struct request_sock *req,
1430 const struct sock *sk_listener,
1431 struct sk_buff *skb)
1433 struct inet_request_sock *ireq = inet_rsk(req);
1434 struct net *net = sock_net(sk_listener);
1436 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1437 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1438 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1441 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1442 struct sk_buff *skb,
1444 struct request_sock *req)
1446 tcp_v4_init_req(req, sk, skb);
1448 if (security_inet_conn_request(sk, skb, req))
1451 return inet_csk_route_req(sk, &fl->u.ip4, req);
1454 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1456 .obj_size = sizeof(struct tcp_request_sock),
1457 .rtx_syn_ack = tcp_rtx_synack,
1458 .send_ack = tcp_v4_reqsk_send_ack,
1459 .destructor = tcp_v4_reqsk_destructor,
1460 .send_reset = tcp_v4_send_reset,
1461 .syn_ack_timeout = tcp_syn_ack_timeout,
1464 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1465 .mss_clamp = TCP_MSS_DEFAULT,
1466 #ifdef CONFIG_TCP_MD5SIG
1467 .req_md5_lookup = tcp_v4_md5_lookup,
1468 .calc_md5_hash = tcp_v4_md5_hash_skb,
1470 #ifdef CONFIG_SYN_COOKIES
1471 .cookie_init_seq = cookie_v4_init_sequence,
1473 .route_req = tcp_v4_route_req,
1474 .init_seq = tcp_v4_init_seq,
1475 .init_ts_off = tcp_v4_init_ts_off,
1476 .send_synack = tcp_v4_send_synack,
1479 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1481 /* Never answer SYNs sent to broadcast or multicast */
1482 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1485 return tcp_conn_request(&tcp_request_sock_ops,
1486 &tcp_request_sock_ipv4_ops, sk, skb);
1492 EXPORT_SYMBOL(tcp_v4_conn_request);
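/* tcp_v4_conn_request() only filters out broadcast/multicast SYNs; the
 * protocol-independent part of SYN processing (request sock allocation,
 * syncookie fallback, SYN-ACK transmission) lives in tcp_conn_request(),
 * driven by the two ops structures above.
 */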
1496 * The three way handshake has completed - we got a valid ACK for our
1497 * SYN-ACK - now create the new socket.
1499 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1500 struct request_sock *req,
1501 struct dst_entry *dst,
1502 struct request_sock *req_unhash,
1505 struct inet_request_sock *ireq;
1506 bool found_dup_sk = false;
1507 struct inet_sock *newinet;
1508 struct tcp_sock *newtp;
1510 #ifdef CONFIG_TCP_MD5SIG
1511 const union tcp_md5_addr *addr;
1512 struct tcp_md5sig_key *key;
1515 struct ip_options_rcu *inet_opt;
1517 if (sk_acceptq_is_full(sk))
1520 newsk = tcp_create_openreq_child(sk, req, skb);
1524 newsk->sk_gso_type = SKB_GSO_TCPV4;
1525 inet_sk_rx_dst_set(newsk, skb);
1527 newtp = tcp_sk(newsk);
1528 newinet = inet_sk(newsk);
1529 ireq = inet_rsk(req);
1530 sk_daddr_set(newsk, ireq->ir_rmt_addr);
1531 sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1532 newsk->sk_bound_dev_if = ireq->ir_iif;
1533 newinet->inet_saddr = ireq->ir_loc_addr;
1534 inet_opt = rcu_dereference(ireq->ireq_opt);
1535 RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1536 newinet->mc_index = inet_iif(skb);
1537 newinet->mc_ttl = ip_hdr(skb)->ttl;
1538 newinet->rcv_tos = ip_hdr(skb)->tos;
1539 inet_csk(newsk)->icsk_ext_hdr_len = 0;
1541 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1542 atomic_set(&newinet->inet_id, get_random_u16());
1544 /* Set ToS of the new socket based upon the value of incoming SYN.
1545 * ECT bits are set later in tcp_init_transfer().
1547 if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1548 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1551 dst = inet_csk_route_child_sock(sk, newsk, req);
1555 /* syncookie case : see end of cookie_v4_check() */
1557 sk_setup_caps(newsk, dst);
1559 tcp_ca_openreq_child(newsk, dst);
1561 tcp_sync_mss(newsk, dst_mtu(dst));
1562 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1564 tcp_initialize_rcv_mss(newsk);
1566 #ifdef CONFIG_TCP_MD5SIG
1567 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1568 /* Copy over the MD5 key from the original socket */
1569 addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1570 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1573 * We're using one, so create a matching key
1574 * on the newsk structure. If we fail to get
1575 * memory, then we end up not copying the key
1578 tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index, key->flags,
1579 key->key, key->keylen, GFP_ATOMIC);
1580 sk_gso_disable(newsk);
1584 if (__inet_inherit_port(sk, newsk) < 0)
1586 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1588 if (likely(*own_req)) {
1589 tcp_move_syn(newtp, req);
1590 ireq->ireq_opt = NULL;
1592 newinet->inet_opt = NULL;
1594 if (!req_unhash && found_dup_sk) {
1595 /* This code path should be executed only in the
1596 * syncookie case
1598 bh_unlock_sock(newsk);
1606 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1613 newinet->inet_opt = NULL;
1614 inet_csk_prepare_forced_close(newsk);
1618 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
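/* If the listener's SYN queue overflowed, no request_sock exists for this
 * connection: cookie_v4_check() below reconstructs the connection parameters
 * from the sequence number cookie carried in the final ACK.
 */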
1620 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1622 #ifdef CONFIG_SYN_COOKIES
1623 const struct tcphdr *th = tcp_hdr(skb);
1626 sk = cookie_v4_check(sk, skb);
1631 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1632 struct tcphdr *th, u32 *cookie)
1635 #ifdef CONFIG_SYN_COOKIES
1636 mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1637 &tcp_request_sock_ipv4_ops, sk, th);
1639 *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1640 tcp_synq_overflow(sk);
1646 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1648 /* The socket must have its spinlock held when we get
1649 * here, unless it is a TCP_LISTEN socket.
1651 * We have a potential double-lock case here, so even when
1652 * doing backlog processing we use the BH locking scheme.
1653 * This is because we cannot sleep with the original spinlock
1656 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1658 enum skb_drop_reason reason;
1661 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1662 struct dst_entry *dst;
1664 dst = rcu_dereference_protected(sk->sk_rx_dst,
1665 lockdep_sock_is_held(sk));
1667 sock_rps_save_rxhash(sk, skb);
1668 sk_mark_napi_id(sk, skb);
1670 if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1671 !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1673 RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1677 tcp_rcv_established(sk, skb);
1681 reason = SKB_DROP_REASON_NOT_SPECIFIED;
1682 if (tcp_checksum_complete(skb))
1685 if (sk->sk_state == TCP_LISTEN) {
1686 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1691 if (tcp_child_process(sk, nsk, skb)) {
1698 sock_rps_save_rxhash(sk, skb);
1700 if (tcp_rcv_state_process(sk, skb)) {
1707 tcp_v4_send_reset(rsk, skb);
1709 kfree_skb_reason(skb, reason);
1710 /* Be careful here. If this function gets more complicated and
1711 * gcc suffers from register pressure on the x86, sk (in %ebx)
1712 * might be destroyed here. This current version compiles correctly,
1713 * but you have been warned.
1718 reason = SKB_DROP_REASON_TCP_CSUM;
1719 trace_tcp_bad_csum(skb);
1720 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1721 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1724 EXPORT_SYMBOL(tcp_v4_do_rcv);
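/* Early demux runs from the IP receive path before routing: finding an
 * established socket here lets us reuse its cached dst entry and skip a
 * separate route lookup for the packet.
 */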
1726 int tcp_v4_early_demux(struct sk_buff *skb)
1728 struct net *net = dev_net(skb->dev);
1729 const struct iphdr *iph;
1730 const struct tcphdr *th;
1733 if (skb->pkt_type != PACKET_HOST)
1736 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1742 if (th->doff < sizeof(struct tcphdr) / 4)
1745 sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
1746 iph->saddr, th->source,
1747 iph->daddr, ntohs(th->dest),
1748 skb->skb_iif, inet_sdif(skb));
1751 skb->destructor = sock_edemux;
1752 if (sk_fullsock(sk)) {
1753 struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1756 dst = dst_check(dst, 0);
1758 sk->sk_rx_dst_ifindex == skb->skb_iif)
1759 skb_dst_set_noref(skb, dst);
1765 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
1766 enum skb_drop_reason *reason)
1768 u32 limit, tail_gso_size, tail_gso_segs;
1769 struct skb_shared_info *shinfo;
1770 const struct tcphdr *th;
1771 struct tcphdr *thtail;
1772 struct sk_buff *tail;
1773 unsigned int hdrlen;
1779 /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1780 * we can fix skb->truesize to its real value to avoid future drops.
1781 * This is valid because skb is not yet charged to the socket.
1782 * It has been noticed pure SACK packets were sometimes dropped
1783 * (if cooked by drivers without copybreak feature).
1789 if (unlikely(tcp_checksum_complete(skb))) {
1791 trace_tcp_bad_csum(skb);
1792 *reason = SKB_DROP_REASON_TCP_CSUM;
1793 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1794 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1798 /* Attempt coalescing to last skb in backlog, even if we are
1800 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1802 th = (const struct tcphdr *)skb->data;
1803 hdrlen = th->doff * 4;
1805 tail = sk->sk_backlog.tail;
1808 thtail = (struct tcphdr *)tail->data;
1810 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1811 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1812 ((TCP_SKB_CB(tail)->tcp_flags |
1813 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1814 !((TCP_SKB_CB(tail)->tcp_flags &
1815 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1816 ((TCP_SKB_CB(tail)->tcp_flags ^
1817 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1818 #ifdef CONFIG_TLS_DEVICE
1819 tail->decrypted != skb->decrypted ||
1821 !mptcp_skb_can_collapse(tail, skb) ||
1822 thtail->doff != th->doff ||
1823 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1826 __skb_pull(skb, hdrlen);
1828 shinfo = skb_shinfo(skb);
1829 gso_size = shinfo->gso_size ?: skb->len;
1830 gso_segs = shinfo->gso_segs ?: 1;
1832 shinfo = skb_shinfo(tail);
1833 tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1834 tail_gso_segs = shinfo->gso_segs ?: 1;
1836 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1837 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1839 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1840 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1841 thtail->window = th->window;
1844 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1845 * thtail->fin, so that the fast path in tcp_rcv_established()
1846 * is not entered if we append a packet with a FIN.
1847 * SYN, RST, URG are not present.
1848 * ACK is set on both packets.
1849 * PSH : we do not really care in TCP stack,
1850 * at least for 'GRO' packets.
1852 thtail->fin |= th->fin;
1853 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1855 if (TCP_SKB_CB(skb)->has_rxtstamp) {
1856 TCP_SKB_CB(tail)->has_rxtstamp = true;
1857 tail->tstamp = skb->tstamp;
1858 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1861 /* Not as strict as GRO. We only need to carry mss max value */
1862 shinfo->gso_size = max(gso_size, tail_gso_size);
1863 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
1865 sk->sk_backlog.len += delta;
1866 __NET_INC_STATS(sock_net(sk),
1867 LINUX_MIB_TCPBACKLOGCOALESCE);
1868 kfree_skb_partial(skb, fragstolen);
1871 __skb_push(skb, hdrlen);
1874 limit = (u32)READ_ONCE(sk->sk_rcvbuf) + (u32)(READ_ONCE(sk->sk_sndbuf) >> 1);
1876 /* Only the socket owner can try to collapse/prune rx queues
1877 * to reduce memory overhead, so add a little headroom here.
1878 * Only a few socket backlogs are likely to be non-empty concurrently.
1882 if (unlikely(sk_add_backlog(sk, skb, limit))) {
1884 *reason = SKB_DROP_REASON_SOCKET_BACKLOG;
1885 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1890 EXPORT_SYMBOL(tcp_add_backlog);
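/* Run the socket's BPF filter on the packet; the cap passed to
 * sk_filter_trim_cap() prevents the filter from trimming the skb below the
 * TCP header. A non-zero return means the packet must be dropped.
 */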
1892 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1894 struct tcphdr *th = (struct tcphdr *)skb->data;
1896 return sk_filter_trim_cap(sk, skb, th->doff * 4);
1898 EXPORT_SYMBOL(tcp_filter);
1900 static void tcp_v4_restore_cb(struct sk_buff *skb)
1902 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1903 sizeof(struct inet_skb_parm));
1906 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1907 const struct tcphdr *th)
1909 /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
1910 * barrier() makes sure the compiler won't play fool^Waliasing games.
1912 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1913 sizeof(struct inet_skb_parm));
1916 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1917 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1918 skb->len - th->doff * 4);
1919 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1920 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1921 TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1922 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1923 TCP_SKB_CB(skb)->sacked = 0;
1924 TCP_SKB_CB(skb)->has_rxtstamp =
1925 skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
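/*
 * Main entry point for TCP segments handed up from the IP layer
 * (registered as the protocol handler for IPPROTO_TCP).
 */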
1932 int tcp_v4_rcv(struct sk_buff *skb)
1934 struct net *net = dev_net(skb->dev);
1935 enum skb_drop_reason drop_reason;
1936 int sdif = inet_sdif(skb);
1937 int dif = inet_iif(skb);
1938 const struct iphdr *iph;
1939 const struct tcphdr *th;
1944 drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
1945 if (skb->pkt_type != PACKET_HOST)
1948 /* Count it even if it's bad */
1949 __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1951 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1954 th = (const struct tcphdr *)skb->data;
1956 if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
1957 drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
1960 if (!pskb_may_pull(skb, th->doff * 4))
1963 /* An explanation is required here, I think.
1964 * Packet length and doff are validated by header prediction,
1965 * provided the case of th->doff == 0 is eliminated.
1966 * So, we defer the checks. */
1968 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1971 th = (const struct tcphdr *)skb->data;
1974 sk = __inet_lookup_skb(net->ipv4.tcp_death_row.hashinfo,
1975 skb, __tcp_hdrlen(th), th->source,
1976 th->dest, sdif, &refcounted);
1981 if (sk->sk_state == TCP_TIME_WAIT)
1984 if (sk->sk_state == TCP_NEW_SYN_RECV) {
1985 struct request_sock *req = inet_reqsk(sk);
1986 bool req_stolen = false;
1989 sk = req->rsk_listener;
1990 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1991 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
1993 drop_reason = tcp_inbound_md5_hash(sk, skb,
1994 &iph->saddr, &iph->daddr,
1995 AF_INET, dif, sdif);
1996 if (unlikely(drop_reason)) {
1997 sk_drops_add(sk, skb);
2001 if (tcp_checksum_complete(skb)) {
2005 if (unlikely(sk->sk_state != TCP_LISTEN)) {
2006 nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
2008 inet_csk_reqsk_queue_drop_and_put(sk, req);
2012 /* reuseport_migrate_sock() has already held one sk_refcnt
2016 /* We own a reference on the listener, increase it again
2017 * as we might lose it too soon.
2023 if (!tcp_filter(sk, skb)) {
2024 th = (const struct tcphdr *)skb->data;
2026 tcp_v4_fill_cb(skb, iph, th);
2027 nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2029 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2034 /* Another cpu got exclusive access to req
2035 * and created a full blown socket.
2036 * Try to feed this packet to this socket
2037 * instead of discarding it.
2039 tcp_v4_restore_cb(skb);
2043 goto discard_and_relse;
2048 tcp_v4_restore_cb(skb);
2049 } else if (tcp_child_process(sk, nsk, skb)) {
2050 tcp_v4_send_reset(nsk, skb);
2051 goto discard_and_relse;
2058 if (static_branch_unlikely(&ip4_min_ttl)) {
2059 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
2060 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
2061 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2062 goto discard_and_relse;
2066 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
2067 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2068 goto discard_and_relse;
2071 drop_reason = tcp_inbound_md5_hash(sk, skb, &iph->saddr,
2072 &iph->daddr, AF_INET, dif, sdif);
2074 goto discard_and_relse;
2078 if (tcp_filter(sk, skb)) {
2079 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2080 goto discard_and_relse;
2082 th = (const struct tcphdr *)skb->data;
2084 tcp_v4_fill_cb(skb, iph, th);
2088 if (sk->sk_state == TCP_LISTEN) {
2089 ret = tcp_v4_do_rcv(sk, skb);
2090 goto put_and_return;
2093 sk_incoming_cpu_update(sk);
2095 bh_lock_sock_nested(sk);
2096 tcp_segs_in(tcp_sk(sk), skb);
2098 if (!sock_owned_by_user(sk)) {
2099 ret = tcp_v4_do_rcv(sk, skb);
2101 if (tcp_add_backlog(sk, skb, &drop_reason))
2102 goto discard_and_relse;
2113 drop_reason = SKB_DROP_REASON_NO_SOCKET;
2114 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2117 tcp_v4_fill_cb(skb, iph, th);
2119 if (tcp_checksum_complete(skb)) {
2121 drop_reason = SKB_DROP_REASON_TCP_CSUM;
2122 trace_tcp_bad_csum(skb);
2123 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2125 __TCP_INC_STATS(net, TCP_MIB_INERRS);
2127 tcp_v4_send_reset(NULL, skb);
2131 SKB_DR_OR(drop_reason, NOT_SPECIFIED);
2132 /* Discard frame. */
2133 kfree_skb_reason(skb, drop_reason);
2137 sk_drops_add(sk, skb);
2143 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2144 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2145 inet_twsk_put(inet_twsk(sk));
2149 tcp_v4_fill_cb(skb, iph, th);
2151 if (tcp_checksum_complete(skb)) {
2152 inet_twsk_put(inet_twsk(sk));
2155 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2157 struct sock *sk2 = inet_lookup_listener(net,
2158 net->ipv4.tcp_death_row.hashinfo,
2159 skb, __tcp_hdrlen(th),
2160 iph->saddr, th->source,
2161 iph->daddr, th->dest,
2165 inet_twsk_deschedule_put(inet_twsk(sk));
2167 tcp_v4_restore_cb(skb);
2175 tcp_v4_timewait_ack(sk, skb);
2178 tcp_v4_send_reset(sk, skb);
2179 inet_twsk_deschedule_put(inet_twsk(sk));
2181 case TCP_TW_SUCCESS:;
2186 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2187 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
2188 .twsk_unique = tcp_twsk_unique,
2189 .twsk_destructor= tcp_twsk_destructor,
2192 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2194 struct dst_entry *dst = skb_dst(skb);
2196 if (dst && dst_hold_safe(dst)) {
2197 rcu_assign_pointer(sk->sk_rx_dst, dst);
2198 sk->sk_rx_dst_ifindex = skb->skb_iif;
2201 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2203 const struct inet_connection_sock_af_ops ipv4_specific = {
2204 .queue_xmit = ip_queue_xmit,
2205 .send_check = tcp_v4_send_check,
2206 .rebuild_header = inet_sk_rebuild_header,
2207 .sk_rx_dst_set = inet_sk_rx_dst_set,
2208 .conn_request = tcp_v4_conn_request,
2209 .syn_recv_sock = tcp_v4_syn_recv_sock,
2210 .net_header_len = sizeof(struct iphdr),
2211 .setsockopt = ip_setsockopt,
2212 .getsockopt = ip_getsockopt,
2213 .addr2sockaddr = inet_csk_addr2sockaddr,
2214 .sockaddr_len = sizeof(struct sockaddr_in),
2215 .mtu_reduced = tcp_v4_mtu_reduced,
2217 EXPORT_SYMBOL(ipv4_specific);
2219 #ifdef CONFIG_TCP_MD5SIG
2220 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2221 .md5_lookup = tcp_v4_md5_lookup,
2222 .calc_md5_hash = tcp_v4_md5_hash_skb,
2223 .md5_parse = tcp_v4_parse_md5_keys,
2227 /* NOTE: A lot of things are set to zero explicitly by the call to
2228 * sk_alloc(), so they need not be done here.
2230 static int tcp_v4_init_sock(struct sock *sk)
2232 struct inet_connection_sock *icsk = inet_csk(sk);
2236 icsk->icsk_af_ops = &ipv4_specific;
2238 #ifdef CONFIG_TCP_MD5SIG
2239 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2245 void tcp_v4_destroy_sock(struct sock *sk)
2247 struct tcp_sock *tp = tcp_sk(sk);
2249 trace_tcp_destroy_sock(sk);
2251 tcp_clear_xmit_timers(sk);
2253 tcp_cleanup_congestion_control(sk);
2255 tcp_cleanup_ulp(sk);
2257 /* Clean up the write buffer. */
2258 tcp_write_queue_purge(sk);
2260 /* Check if we want to disable active TFO */
2261 tcp_fastopen_active_disable_ofo_check(sk);
2263 /* Cleans up our, hopefully empty, out_of_order_queue. */
2264 skb_rbtree_purge(&tp->out_of_order_queue);
2266 #ifdef CONFIG_TCP_MD5SIG
2267 /* Clean up the MD5 key list, if any */
2268 if (tp->md5sig_info) {
2269 tcp_clear_md5_list(sk);
2270 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2271 tp->md5sig_info = NULL;
2275 /* Clean up a referenced TCP bind bucket. */
2276 if (inet_csk(sk)->icsk_bind_hash)
2279 BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2281 /* If socket is aborted during connect operation */
2282 tcp_free_fastopen_req(tp);
2283 tcp_fastopen_destroy_cipher(sk);
2284 tcp_saved_syn_free(tp);
2286 sk_sockets_allocated_dec(sk);
2288 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2290 #ifdef CONFIG_PROC_FS
2291 /* Proc filesystem TCP sock list dumping. */
2293 static unsigned short seq_file_family(const struct seq_file *seq);
2295 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2297 unsigned short family = seq_file_family(seq);
2299 /* AF_UNSPEC is used as a match all */
2300 return ((family == AF_UNSPEC || family == sk->sk_family) &&
2301 net_eq(sock_net(sk), seq_file_net(seq)));
2304 /* Find a non empty bucket (starting from st->bucket)
2305 * and return the first sk from it.
2307 static void *listening_get_first(struct seq_file *seq)
2309 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2310 struct tcp_iter_state *st = seq->private;
2313 for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) {
2314 struct inet_listen_hashbucket *ilb2;
2315 struct hlist_nulls_node *node;
2318 ilb2 = &hinfo->lhash2[st->bucket];
2319 if (hlist_nulls_empty(&ilb2->nulls_head))
2322 spin_lock(&ilb2->lock);
2323 sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
2324 if (seq_sk_match(seq, sk))
2327 spin_unlock(&ilb2->lock);
2333 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2334 * If "cur" is the last one in the st->bucket,
2335 * call listening_get_first() to return the first sk of the next
2338 static void *listening_get_next(struct seq_file *seq, void *cur)
2340 struct tcp_iter_state *st = seq->private;
2341 struct inet_listen_hashbucket *ilb2;
2342 struct hlist_nulls_node *node;
2343 struct inet_hashinfo *hinfo;
2344 struct sock *sk = cur;
2349 sk = sk_nulls_next(sk);
2350 sk_nulls_for_each_from(sk, node) {
2351 if (seq_sk_match(seq, sk))
2355 hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2356 ilb2 = &hinfo->lhash2[st->bucket];
2357 spin_unlock(&ilb2->lock);
2359 return listening_get_first(seq);
2362 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2364 struct tcp_iter_state *st = seq->private;
2369 rc = listening_get_first(seq);
2371 while (rc && *pos) {
2372 rc = listening_get_next(seq, rc);
2378 static inline bool empty_bucket(struct inet_hashinfo *hinfo,
2379 const struct tcp_iter_state *st)
2381 return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain);
2385 * Get first established socket starting from bucket given in st->bucket.
2386 * If st->bucket is zero, the very first socket in the hash is returned.
2388 static void *established_get_first(struct seq_file *seq)
2390 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2391 struct tcp_iter_state *st = seq->private;
2393 st->offset = 0;
2394 for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) {
2395 struct sock *sk;
2396 struct hlist_nulls_node *node;
2397 spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket);
2399 /* Lockless fast path for the common case of empty buckets */
2400 if (empty_bucket(hinfo, st))
2401 continue;
2403 spin_lock_bh(lock);
2404 sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) {
2405 if (seq_sk_match(seq, sk))
2406 return sk;
2407 }
2408 spin_unlock_bh(lock);
2414 static void *established_get_next(struct seq_file *seq, void *cur)
2416 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2417 struct tcp_iter_state *st = seq->private;
2418 struct hlist_nulls_node *node;
2419 struct sock *sk = cur;
2424 sk = sk_nulls_next(sk);
2426 sk_nulls_for_each_from(sk, node) {
2427 if (seq_sk_match(seq, sk))
2431 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2433 return established_get_first(seq);
2436 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2438 struct tcp_iter_state *st = seq->private;
2442 rc = established_get_first(seq);
2445 rc = established_get_next(seq, rc);
2451 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2454 struct tcp_iter_state *st = seq->private;
2456 st->state = TCP_SEQ_STATE_LISTENING;
2457 rc = listening_get_idx(seq, &pos);
2459 if (!rc) {
2460 st->state = TCP_SEQ_STATE_ESTABLISHED;
2461 rc = established_get_idx(seq, pos);
2462 }
2467 static void *tcp_seek_last_pos(struct seq_file *seq)
2469 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2470 struct tcp_iter_state *st = seq->private;
2471 int bucket = st->bucket;
2472 int offset = st->offset;
2473 int orig_num = st->num;
2476 switch (st->state) {
2477 case TCP_SEQ_STATE_LISTENING:
2478 if (st->bucket > hinfo->lhash2_mask)
2479 break;
2480 st->state = TCP_SEQ_STATE_LISTENING;
2481 rc = listening_get_first(seq);
2482 while (offset-- && rc && bucket == st->bucket)
2483 rc = listening_get_next(seq, rc);
2487 st->state = TCP_SEQ_STATE_ESTABLISHED;
2489 case TCP_SEQ_STATE_ESTABLISHED:
2490 if (st->bucket > hinfo->ehash_mask)
2491 break;
2492 rc = established_get_first(seq);
2493 while (offset-- && rc && bucket == st->bucket)
2494 rc = established_get_next(seq, rc);
2502 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2504 struct tcp_iter_state *st = seq->private;
2507 if (*pos && *pos == st->last_pos) {
2508 rc = tcp_seek_last_pos(seq);
2513 st->state = TCP_SEQ_STATE_LISTENING;
2517 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2520 st->last_pos = *pos;
2523 EXPORT_SYMBOL(tcp_seq_start);
2525 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2527 struct tcp_iter_state *st = seq->private;
2530 if (v == SEQ_START_TOKEN) {
2531 rc = tcp_get_idx(seq, 0);
2535 switch (st->state) {
2536 case TCP_SEQ_STATE_LISTENING:
2537 rc = listening_get_next(seq, v);
2538 if (!rc) {
2539 st->state = TCP_SEQ_STATE_ESTABLISHED;
2540 st->bucket = 0;
2541 st->offset = 0;
2542 rc = established_get_first(seq);
2543 }
2544 break;
2545 case TCP_SEQ_STATE_ESTABLISHED:
2546 rc = established_get_next(seq, v);
2547 break;
2551 st->last_pos = *pos;
2554 EXPORT_SYMBOL(tcp_seq_next);
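/* stop() releases whichever bucket lock start()/next() may still hold:
 * the lhash2 bucket spinlock in the LISTENING state, or the ehash
 * bucket lock (with BHs disabled) in the ESTABLISHED state.
 */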
2556 void tcp_seq_stop(struct seq_file *seq, void *v)
2558 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2559 struct tcp_iter_state *st = seq->private;
2561 switch (st->state) {
2562 case TCP_SEQ_STATE_LISTENING:
2563 if (v != SEQ_START_TOKEN)
2564 spin_unlock(&hinfo->lhash2[st->bucket].lock);
2566 case TCP_SEQ_STATE_ESTABLISHED:
2568 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2572 EXPORT_SYMBOL(tcp_seq_stop);
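/* get_openreq4(), get_tcp4_sock() and get_timewait4_sock() each format
 * one row of /proc/net/tcp.  Addresses are printed as the raw hex of
 * the __be32 value (so 127.0.0.1 reads 0100007F on little-endian),
 * ports via ntohs(), and "st" is the numeric TCP_* state.  A sketch of
 * one row, with made-up values:
 *
 *	1: 0100007F:1F90 0100007F:A3D2 01 00000000:00000000 00:00000000 ...
 *
 * i.e. 127.0.0.1:8080 <-> 127.0.0.1:41938 in TCP_ESTABLISHED.
 */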
2574 static void get_openreq4(const struct request_sock *req,
2575 struct seq_file *f, int i)
2577 const struct inet_request_sock *ireq = inet_rsk(req);
2578 long delta = req->rsk_timer.expires - jiffies;
2580 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2581 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2586 ntohs(ireq->ir_rmt_port),
2588 0, 0, /* could print option size, but that is af dependent. */
2589 1, /* timers active (only the expire timer) */
2590 jiffies_delta_to_clock_t(delta),
2592 from_kuid_munged(seq_user_ns(f),
2593 sock_i_uid(req->rsk_listener)),
2594 0, /* non standard timer */
2595 0, /* open_requests have no inode */
2600 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2602 int timer_active;
2603 unsigned long timer_expires;
2604 const struct tcp_sock *tp = tcp_sk(sk);
2605 const struct inet_connection_sock *icsk = inet_csk(sk);
2606 const struct inet_sock *inet = inet_sk(sk);
2607 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2608 __be32 dest = inet->inet_daddr;
2609 __be32 src = inet->inet_rcv_saddr;
2610 __u16 destp = ntohs(inet->inet_dport);
2611 __u16 srcp = ntohs(inet->inet_sport);
2612 int rx_queue;
2613 int state;
2615 if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2616 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2617 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2618 timer_active = 1;
2619 timer_expires = icsk->icsk_timeout;
2620 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2621 timer_active = 4;
2622 timer_expires = icsk->icsk_timeout;
2623 } else if (timer_pending(&sk->sk_timer)) {
2624 timer_active = 2;
2625 timer_expires = sk->sk_timer.expires;
2626 } else {
2627 timer_active = 0;
2628 timer_expires = jiffies;
2631 state = inet_sk_state_load(sk);
2632 if (state == TCP_LISTEN)
2633 rx_queue = READ_ONCE(sk->sk_ack_backlog);
2634 else
2635 /* Because we don't lock the socket,
2636 * we might find a transient negative value.
2637 */
2638 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2639 READ_ONCE(tp->copied_seq), 0);
2641 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2642 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2643 i, src, srcp, dest, destp, state,
2644 READ_ONCE(tp->write_seq) - tp->snd_una,
2645 rx_queue,
2646 timer_active,
2647 jiffies_delta_to_clock_t(timer_expires - jiffies),
2648 icsk->icsk_retransmits,
2649 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2650 icsk->icsk_probes_out,
2651 sock_i_ino(sk),
2652 refcount_read(&sk->sk_refcnt), sk,
2653 jiffies_to_clock_t(icsk->icsk_rto),
2654 jiffies_to_clock_t(icsk->icsk_ack.ato),
2655 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2656 tcp_snd_cwnd(tp),
2657 state == TCP_LISTEN ?
2658 fastopenq->max_qlen :
2659 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2662 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2663 struct seq_file *f, int i)
2665 long delta = tw->tw_timer.expires - jiffies;
2669 dest = tw->tw_daddr;
2670 src = tw->tw_rcv_saddr;
2671 destp = ntohs(tw->tw_dport);
2672 srcp = ntohs(tw->tw_sport);
2674 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2675 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2676 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2677 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2678 refcount_read(&tw->tw_refcnt), tw);
2681 #define TMPSZ 150
2683 static int tcp4_seq_show(struct seq_file *seq, void *v)
2685 struct tcp_iter_state *st;
2686 struct sock *sk = v;
2688 seq_setwidth(seq, TMPSZ - 1);
2689 if (v == SEQ_START_TOKEN) {
2690 seq_puts(seq, " sl local_address rem_address st tx_queue "
2691 "rx_queue tr tm->when retrnsmt uid timeout "
2692 "inode");
2693 goto out;
2694 }
2695 st = seq->private;
2697 if (sk->sk_state == TCP_TIME_WAIT)
2698 get_timewait4_sock(v, seq, st->num);
2699 else if (sk->sk_state == TCP_NEW_SYN_RECV)
2700 get_openreq4(v, seq, st->num);
2701 else
2702 get_tcp4_sock(v, seq, st->num);
2703 out:
2704 seq_pad(seq, '\n');
2705 return 0;
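/* The BPF iterator below reuses the same bucket walk, but batches all
 * matching sockets of a bucket (taking a reference on each) so the
 * bucket lock can be dropped before the BPF program runs; the program
 * may then sleep or call setsockopt() on the socket it is shown.
 *
 * A minimal sketch of a consumer, with illustrative names only:
 *
 *	SEC("iter/tcp")
 *	int dump_tcp(struct bpf_iter__tcp *ctx)
 *	{
 *		struct sock_common *skc = ctx->sk_common;
 *
 *		if (!skc)
 *			return 0;
 *		BPF_SEQ_PRINTF(ctx->meta->seq, "uid %u port %u\n",
 *			       ctx->uid, skc->skc_num);
 *		return 0;
 *	}
 */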
2708 #ifdef CONFIG_BPF_SYSCALL
2709 struct bpf_tcp_iter_state {
2710 struct tcp_iter_state state;
2711 unsigned int cur_sk;
2712 unsigned int end_sk;
2713 unsigned int max_sk;
2714 struct sock **batch;
2715 bool st_bucket_done;
2718 struct bpf_iter__tcp {
2719 __bpf_md_ptr(struct bpf_iter_meta *, meta);
2720 __bpf_md_ptr(struct sock_common *, sk_common);
2721 uid_t uid __aligned(8);
2724 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2725 struct sock_common *sk_common, uid_t uid)
2727 struct bpf_iter__tcp ctx;
2729 meta->seq_num--; /* skip SEQ_START_TOKEN */
2730 ctx.meta = meta;
2731 ctx.sk_common = sk_common;
2732 ctx.uid = uid;
2733 return bpf_iter_run_prog(prog, &ctx);
2736 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
2738 while (iter->cur_sk < iter->end_sk)
2739 sock_gen_put(iter->batch[iter->cur_sk++]);
2742 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
2743 unsigned int new_batch_sz)
2745 struct sock **new_batch;
2747 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
2748 GFP_USER | __GFP_NOWARN);
2749 if (!new_batch)
2750 return -ENOMEM;
2752 bpf_iter_tcp_put_batch(iter);
2753 kvfree(iter->batch);
2754 iter->batch = new_batch;
2755 iter->max_sk = new_batch_sz;
2757 return 0;
2760 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
2761 struct sock *start_sk)
2763 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2764 struct bpf_tcp_iter_state *iter = seq->private;
2765 struct tcp_iter_state *st = &iter->state;
2766 struct hlist_nulls_node *node;
2767 unsigned int expected = 1;
2770 sock_hold(start_sk);
2771 iter->batch[iter->end_sk++] = start_sk;
2773 sk = sk_nulls_next(start_sk);
2774 sk_nulls_for_each_from(sk, node) {
2775 if (seq_sk_match(seq, sk)) {
2776 if (iter->end_sk < iter->max_sk) {
2778 iter->batch[iter->end_sk++] = sk;
2783 spin_unlock(&hinfo->lhash2[st->bucket].lock);
2788 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
2789 struct sock *start_sk)
2791 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2792 struct bpf_tcp_iter_state *iter = seq->private;
2793 struct tcp_iter_state *st = &iter->state;
2794 struct hlist_nulls_node *node;
2795 unsigned int expected = 1;
2798 sock_hold(start_sk);
2799 iter->batch[iter->end_sk++] = start_sk;
2801 sk = sk_nulls_next(start_sk);
2802 sk_nulls_for_each_from(sk, node) {
2803 if (seq_sk_match(seq, sk)) {
2804 if (iter->end_sk < iter->max_sk) {
2806 iter->batch[iter->end_sk++] = sk;
2811 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2816 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
2818 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2819 struct bpf_tcp_iter_state *iter = seq->private;
2820 struct tcp_iter_state *st = &iter->state;
2821 unsigned int expected;
2822 bool resized = false;
2825 /* The st->bucket is done. Directly advance to the next
2826 * bucket instead of letting tcp_seek_last_pos() skip sockets
2827 * one by one in the current bucket, only to find out that it
2828 * has to advance to the next bucket anyway.
2830 if (iter->st_bucket_done) {
2833 if (st->state == TCP_SEQ_STATE_LISTENING &&
2834 st->bucket > hinfo->lhash2_mask) {
2835 st->state = TCP_SEQ_STATE_ESTABLISHED;
2841 /* Get a new batch */
2844 iter->st_bucket_done = false;
2846 sk = tcp_seek_last_pos(seq);
2848 return NULL; /* Done */
2850 if (st->state == TCP_SEQ_STATE_LISTENING)
2851 expected = bpf_iter_tcp_listening_batch(seq, sk);
2853 expected = bpf_iter_tcp_established_batch(seq, sk);
2855 if (iter->end_sk == expected) {
2856 iter->st_bucket_done = true;
2860 if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
2868 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
2870 /* bpf iter does not support lseek, so it always
2871 * continues from where it was stop()-ped.
2872 */
2873 if (*pos)
2874 return bpf_iter_tcp_batch(seq);
2876 return SEQ_START_TOKEN;
2879 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2881 struct bpf_tcp_iter_state *iter = seq->private;
2882 struct tcp_iter_state *st = &iter->state;
2885 /* Whenever seq_next() is called, the iter->cur_sk is
2886 * done with seq_show(), so advance to the next sk in
2887 * the batch.
2888 */
2889 if (iter->cur_sk < iter->end_sk) {
2890 /* Keeping st->num consistent in tcp_iter_state.
2891 * bpf_iter_tcp does not use st->num.
2892 * meta.seq_num is used instead.
2893 */
2894 st->num++;
2895 /* Move st->offset to the next sk in the bucket such that
2896 * the future start() will resume at st->offset in
2897 * st->bucket. See tcp_seek_last_pos().
2898 */
2899 st->offset++;
2900 sock_gen_put(iter->batch[iter->cur_sk++]);
2903 if (iter->cur_sk < iter->end_sk)
2904 sk = iter->batch[iter->cur_sk];
2906 sk = bpf_iter_tcp_batch(seq);
2909 /* Keeping st->last_pos consistent in tcp_iter_state.
2910 * bpf iter does not do lseek, so st->last_pos always equals to *pos.
2912 st->last_pos = *pos;
2916 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2918 struct bpf_iter_meta meta;
2919 struct bpf_prog *prog;
2920 struct sock *sk = v;
2921 uid_t uid;
2922 int ret;
2924 if (v == SEQ_START_TOKEN)
2925 return 0;
2927 if (sk_fullsock(sk))
2928 lock_sock(sk);
2930 if (unlikely(sk_unhashed(sk))) {
2931 ret = SEQ_SKIP;
2932 goto unlock;
2933 }
2935 if (sk->sk_state == TCP_TIME_WAIT) {
2936 uid = 0;
2937 } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
2938 const struct request_sock *req = v;
2940 uid = from_kuid_munged(seq_user_ns(seq),
2941 sock_i_uid(req->rsk_listener));
2942 } else {
2943 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
2946 meta.seq = seq;
2947 prog = bpf_iter_get_info(&meta, false);
2948 ret = tcp_prog_seq_show(prog, &meta, v, uid);
2950 unlock:
2951 if (sk_fullsock(sk))
2952 release_sock(sk);
2953 return ret;
2957 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
2959 struct bpf_tcp_iter_state *iter = seq->private;
2960 struct bpf_iter_meta meta;
2961 struct bpf_prog *prog;
2963 if (!v) {
2964 meta.seq = seq;
2965 prog = bpf_iter_get_info(&meta, true);
2966 if (prog)
2967 (void)tcp_prog_seq_show(prog, &meta, v, 0);
2970 if (iter->cur_sk < iter->end_sk) {
2971 bpf_iter_tcp_put_batch(iter);
2972 iter->st_bucket_done = false;
2976 static const struct seq_operations bpf_iter_tcp_seq_ops = {
2977 .show = bpf_iter_tcp_seq_show,
2978 .start = bpf_iter_tcp_seq_start,
2979 .next = bpf_iter_tcp_seq_next,
2980 .stop = bpf_iter_tcp_seq_stop,
2983 static unsigned short seq_file_family(const struct seq_file *seq)
2985 const struct tcp_seq_afinfo *afinfo;
2987 #ifdef CONFIG_BPF_SYSCALL
2988 /* Iterated from bpf_iter. Let the bpf prog filter instead. */
2989 if (seq->op == &bpf_iter_tcp_seq_ops)
2990 return AF_UNSPEC;
2991 #endif
2993 /* Iterated from proc fs */
2994 afinfo = pde_data(file_inode(seq->file));
2995 return afinfo->family;
2998 static const struct seq_operations tcp4_seq_ops = {
2999 .show = tcp4_seq_show,
3000 .start = tcp_seq_start,
3001 .next = tcp_seq_next,
3002 .stop = tcp_seq_stop,
3003 };
3005 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
3006 .family = AF_INET,
3007 };
3009 static int __net_init tcp4_proc_init_net(struct net *net)
3011 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
3012 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
3013 return -ENOMEM;
3014 return 0;
3017 static void __net_exit tcp4_proc_exit_net(struct net *net)
3019 remove_proc_entry("tcp", net->proc_net);
3022 static struct pernet_operations tcp4_net_ops = {
3023 .init = tcp4_proc_init_net,
3024 .exit = tcp4_proc_exit_net,
3027 int __init tcp4_proc_init(void)
3029 return register_pernet_subsys(&tcp4_net_ops);
3032 void tcp4_proc_exit(void)
3034 unregister_pernet_subsys(&tcp4_net_ops);
3036 #endif /* CONFIG_PROC_FS */
3038 /* @wake is one when sk_stream_write_space() calls us.
3039 * This sends EPOLLOUT only if notsent_bytes is below half the limit.
3040 * This mimics the strategy used in sock_def_write_space().
3042 bool tcp_stream_memory_free(const struct sock *sk, int wake)
3044 const struct tcp_sock *tp = tcp_sk(sk);
3045 u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3046 READ_ONCE(tp->snd_nxt);
3048 return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3050 EXPORT_SYMBOL(tcp_stream_memory_free);
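/* Worked example with made-up numbers: with tcp_notsent_lowat at 128kB,
 * a plain poll() sees EPOLLOUT while less than 128kB is queued but not
 * yet sent; when called from sk_stream_write_space() (wake == 1) the
 * shift doubles notsent_bytes, so the wakeup only fires once the
 * unsent backlog drops below 64kB.
 */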
3052 struct proto tcp_prot = {
3053 .name = "TCP",
3054 .owner = THIS_MODULE,
3055 .close = tcp_close,
3056 .pre_connect = tcp_v4_pre_connect,
3057 .connect = tcp_v4_connect,
3058 .disconnect = tcp_disconnect,
3059 .accept = inet_csk_accept,
3060 .ioctl = tcp_ioctl,
3061 .init = tcp_v4_init_sock,
3062 .destroy = tcp_v4_destroy_sock,
3063 .shutdown = tcp_shutdown,
3064 .setsockopt = tcp_setsockopt,
3065 .getsockopt = tcp_getsockopt,
3066 .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt,
3067 .keepalive = tcp_set_keepalive,
3068 .recvmsg = tcp_recvmsg,
3069 .sendmsg = tcp_sendmsg,
3070 .splice_eof = tcp_splice_eof,
3071 .sendpage = tcp_sendpage,
3072 .backlog_rcv = tcp_v4_do_rcv,
3073 .release_cb = tcp_release_cb,
3074 .hash = inet_hash,
3075 .unhash = inet_unhash,
3076 .get_port = inet_csk_get_port,
3077 .put_port = inet_put_port,
3078 #ifdef CONFIG_BPF_SYSCALL
3079 .psock_update_sk_prot = tcp_bpf_update_proto,
3080 #endif
3081 .enter_memory_pressure = tcp_enter_memory_pressure,
3082 .leave_memory_pressure = tcp_leave_memory_pressure,
3083 .stream_memory_free = tcp_stream_memory_free,
3084 .sockets_allocated = &tcp_sockets_allocated,
3085 .orphan_count = &tcp_orphan_count,
3087 .memory_allocated = &tcp_memory_allocated,
3088 .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc,
3090 .memory_pressure = &tcp_memory_pressure,
3091 .sysctl_mem = sysctl_tcp_mem,
3092 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
3093 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
3094 .max_header = MAX_TCP_HEADER,
3095 .obj_size = sizeof(struct tcp_sock),
3096 .slab_flags = SLAB_TYPESAFE_BY_RCU,
3097 .twsk_prot = &tcp_timewait_sock_ops,
3098 .rsk_prot = &tcp_request_sock_ops,
3100 .no_autobind = true,
3101 .diag_destroy = tcp_abort,
3103 EXPORT_SYMBOL(tcp_prot);
3105 static void __net_exit tcp_sk_exit(struct net *net)
3107 if (net->ipv4.tcp_congestion_control)
3108 bpf_module_put(net->ipv4.tcp_congestion_control,
3109 net->ipv4.tcp_congestion_control->owner);
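/* tcp_set_hashinfo() decides whether a new netns gets a private ehash
 * or keeps sharing the global one.  The child hash size is read from
 * the netns of the task creating the new namespace.  For illustration
 * (sizes made up):
 *
 *	# sysctl -w net.ipv4.tcp_child_ehash_entries=4096
 *	# unshare --net ...	(the child netns gets its own 4096-entry ehash)
 */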
3112 static void __net_init tcp_set_hashinfo(struct net *net)
3114 struct inet_hashinfo *hinfo;
3115 unsigned int ehash_entries;
3116 struct net *old_net;
3118 if (net_eq(net, &init_net))
3119 goto fallback;
3121 old_net = current->nsproxy->net_ns;
3122 ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries);
3123 if (!ehash_entries)
3124 goto fallback;
3126 ehash_entries = roundup_pow_of_two(ehash_entries);
3127 hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries);
3128 if (!hinfo) {
3129 pr_warn("Failed to allocate TCP ehash (entries: %u) "
3130 "for a netns, fallback to the global one\n",
3131 ehash_entries);
3132 fallback:
3133 hinfo = &tcp_hashinfo;
3134 ehash_entries = tcp_hashinfo.ehash_mask + 1;
3137 net->ipv4.tcp_death_row.hashinfo = hinfo;
3138 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2;
3139 net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128);
3142 static int __net_init tcp_sk_init(struct net *net)
3144 net->ipv4.sysctl_tcp_ecn = 2;
3145 net->ipv4.sysctl_tcp_ecn_fallback = 1;
3147 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3148 net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3149 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3150 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3151 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3153 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3154 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3155 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3157 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3158 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3159 net->ipv4.sysctl_tcp_syncookies = 1;
3160 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3161 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3162 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3163 net->ipv4.sysctl_tcp_orphan_retries = 0;
3164 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3165 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3166 net->ipv4.sysctl_tcp_tw_reuse = 2;
3167 net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3169 refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);
3170 tcp_set_hashinfo(net);
3172 net->ipv4.sysctl_tcp_sack = 1;
3173 net->ipv4.sysctl_tcp_window_scaling = 1;
3174 net->ipv4.sysctl_tcp_timestamps = 1;
3175 net->ipv4.sysctl_tcp_early_retrans = 3;
3176 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3177 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */
3178 net->ipv4.sysctl_tcp_retrans_collapse = 1;
3179 net->ipv4.sysctl_tcp_max_reordering = 300;
3180 net->ipv4.sysctl_tcp_dsack = 1;
3181 net->ipv4.sysctl_tcp_app_win = 31;
3182 net->ipv4.sysctl_tcp_adv_win_scale = 1;
3183 net->ipv4.sysctl_tcp_frto = 2;
3184 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3185 /* This limits the percentage of the congestion window which we
3186 * will allow a single TSO frame to consume. Building TSO frames
3187 * which are too large can cause TCP streams to be bursty.
3189 net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3190 /* Default TSQ limit of 16 TSO segments */
3191 net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
3193 /* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */
3194 net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;
3196 net->ipv4.sysctl_tcp_min_tso_segs = 2;
3197 net->ipv4.sysctl_tcp_tso_rtt_log = 9; /* 2^9 = 512 usec */
3198 net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3199 net->ipv4.sysctl_tcp_autocorking = 1;
3200 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3201 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3202 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3203 if (net != &init_net) {
3204 memcpy(net->ipv4.sysctl_tcp_rmem,
3205 init_net.ipv4.sysctl_tcp_rmem,
3206 sizeof(init_net.ipv4.sysctl_tcp_rmem));
3207 memcpy(net->ipv4.sysctl_tcp_wmem,
3208 init_net.ipv4.sysctl_tcp_wmem,
3209 sizeof(init_net.ipv4.sysctl_tcp_wmem));
3211 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3212 net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3213 net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3214 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3215 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3216 atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3218 /* Reno is always built in */
3219 if (!net_eq(net, &init_net) &&
3220 bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3221 init_net.ipv4.tcp_congestion_control->owner))
3222 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3223 else
3224 net->ipv4.tcp_congestion_control = &tcp_reno;
3226 net->ipv4.sysctl_tcp_shrink_window = 0;
3228 return 0;
3231 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3233 struct net *net;
3235 tcp_twsk_purge(net_exit_list, AF_INET);
3237 list_for_each_entry(net, net_exit_list, exit_list) {
3238 inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo);
3239 WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount));
3240 tcp_fastopen_ctx_destroy(net);
3244 static struct pernet_operations __net_initdata tcp_sk_ops = {
3245 .init = tcp_sk_init,
3246 .exit = tcp_sk_exit,
3247 .exit_batch = tcp_sk_exit_batch,
3250 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3251 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3252 struct sock_common *sk_common, uid_t uid)
3254 #define INIT_BATCH_SZ 16
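/* The batch starts at INIT_BATCH_SZ sockets.  When a bucket holds more
 * matching sockets than currently fit, bpf_iter_tcp_batch() grows the
 * array to 1.5x the bucket's population and retries that bucket once.
 */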
3256 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3258 struct bpf_tcp_iter_state *iter = priv_data;
3261 err = bpf_iter_init_seq_net(priv_data, aux);
3265 err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
3267 bpf_iter_fini_seq_net(priv_data);
3274 static void bpf_iter_fini_tcp(void *priv_data)
3276 struct bpf_tcp_iter_state *iter = priv_data;
3278 bpf_iter_fini_seq_net(priv_data);
3279 kvfree(iter->batch);
3282 static const struct bpf_iter_seq_info tcp_seq_info = {
3283 .seq_ops = &bpf_iter_tcp_seq_ops,
3284 .init_seq_private = bpf_iter_init_tcp,
3285 .fini_seq_private = bpf_iter_fini_tcp,
3286 .seq_priv_size = sizeof(struct bpf_tcp_iter_state),
3289 static const struct bpf_func_proto *
3290 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3291 const struct bpf_prog *prog)
3293 switch (func_id) {
3294 case BPF_FUNC_setsockopt:
3295 return &bpf_sk_setsockopt_proto;
3296 case BPF_FUNC_getsockopt:
3297 return &bpf_sk_getsockopt_proto;
3298 default:
3299 return NULL;
3303 static struct bpf_iter_reg tcp_reg_info = {
3304 .target = "tcp",
3305 .ctx_arg_info_size = 1,
3306 .ctx_arg_info = {
3307 { offsetof(struct bpf_iter__tcp, sk_common),
3308 PTR_TO_BTF_ID_OR_NULL },
3310 .get_func_proto = bpf_iter_tcp_get_func_proto,
3311 .seq_info = &tcp_seq_info,
3314 static void __init bpf_iter_register(void)
3316 tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3317 if (bpf_iter_reg_target(&tcp_reg_info))
3318 pr_warn("Warning: could not register bpf iterator tcp\n");
3320 #endif
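/* tcp_v4_init() creates one kernel control socket per possible CPU.
 * These ipv4_tcp_sk sockets never appear in any hash table; they are
 * only used by tcp_v4_send_reset() and tcp_v4_send_ack() to transmit
 * RST/ACK segments when no full socket is available.
 */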
3323 void __init tcp_v4_init(void)
3325 int cpu, res;
3327 for_each_possible_cpu(cpu) {
3328 struct sock *sk;
3330 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3331 IPPROTO_TCP, &init_net);
3332 if (res)
3333 panic("Failed to create the TCP control socket.\n");
3334 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3336 /* Please enforce IP_DF and IPID==0 for RST and
3337 * ACK sent in SYN-RECV and TIME-WAIT state.
3339 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3341 per_cpu(ipv4_tcp_sk, cpu) = sk;
3343 if (register_pernet_subsys(&tcp_sk_ops))
3344 panic("Failed to create the TCP control socket.\n");
3346 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3347 bpf_iter_register();