GNU Linux-libre 5.19-rc6-gnu
net/ipv4/tcp_ipv4.c
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET         An implementation of the TCP/IP protocol suite for the LINUX
4  *              operating system.  INET is implemented using the  BSD Socket
5  *              interface as the means of communication with the user level.
6  *
7  *              Implementation of the Transmission Control Protocol(TCP).
8  *
9  *              IPv4 specific functions
10  *
11  *              code split from:
12  *              linux/ipv4/tcp.c
13  *              linux/ipv4/tcp_input.c
14  *              linux/ipv4/tcp_output.c
15  *
16  *              See tcp.c for author information
17  */
18
19 /*
20  * Changes:
21  *              David S. Miller :       New socket lookup architecture.
22  *                                      This code is dedicated to John Dyson.
23  *              David S. Miller :       Change semantics of established hash,
24  *                                      half is devoted to TIME_WAIT sockets
25  *                                      and the rest go in the other half.
26  *              Andi Kleen :            Add support for syncookies and fixed
27  *                                      some bugs: ip options weren't passed to
28  *                                      the TCP layer, missed a check for an
29  *                                      ACK bit.
30  *              Andi Kleen :            Implemented fast path mtu discovery.
31  *                                      Fixed many serious bugs in the
32  *                                      request_sock handling and moved
33  *                                      most of it into the af independent code.
34  *                                      Added tail drop and some other bugfixes.
35  *                                      Added new listen semantics.
36  *              Mike McLagan    :       Routing by source
37  *      Juan Jose Ciarlante:            ip_dynaddr bits
38  *              Andi Kleen:             various fixes.
39  *      Vitaly E. Lavrov        :       Transparent proxy revived after a year
40  *                                      in a coma.
41  *      Andi Kleen              :       Fix new listen.
42  *      Andi Kleen              :       Fix accept error reporting.
43  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
44  *      Alexey Kuznetsov                allows both IPv4 and IPv6 sockets to bind
45  *                                      a single port at the same time.
46  */
47
48 #define pr_fmt(fmt) "TCP: " fmt
49
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60
61 #include <net/net_namespace.h>
62 #include <net/icmp.h>
63 #include <net/inet_hashtables.h>
64 #include <net/tcp.h>
65 #include <net/transp_v6.h>
66 #include <net/ipv6.h>
67 #include <net/inet_common.h>
68 #include <net/timewait_sock.h>
69 #include <net/xfrm.h>
70 #include <net/secure_seq.h>
71 #include <net/busy_poll.h>
72
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/inetdevice.h>
79 #include <linux/btf_ids.h>
80
81 #include <crypto/hash.h>
82 #include <linux/scatterlist.h>
83
84 #include <trace/events/tcp.h>
85
86 #ifdef CONFIG_TCP_MD5SIG
87 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
88                                __be32 daddr, __be32 saddr, const struct tcphdr *th);
89 #endif
90
91 struct inet_hashinfo tcp_hashinfo;
92 EXPORT_SYMBOL(tcp_hashinfo);
93
94 static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);
95
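/* Derive the initial sequence number for an incoming connection from
 * the addresses and ports of the received SYN.
 */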
96 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
97 {
98         return secure_tcp_seq(ip_hdr(skb)->daddr,
99                               ip_hdr(skb)->saddr,
100                               tcp_hdr(skb)->dest,
101                               tcp_hdr(skb)->source);
102 }
103
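/* Derive the per-connection timestamp offset from the address pair. */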
104 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
105 {
106         return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
107 }
108
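/* Decide whether a TIME-WAIT socket occupying the desired 4-tuple may be
 * reused for a new outgoing connection; returns 1 if the caller can proceed.
 */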
109 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
110 {
111         const struct inet_timewait_sock *tw = inet_twsk(sktw);
112         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
113         struct tcp_sock *tp = tcp_sk(sk);
114         int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
115
116         if (reuse == 2) {
117                 /* Still does not detect *everything* that goes through
118                  * lo, since we require a loopback src or dst address
 119                  * or direct binding to the 'lo' interface.
120                  */
121                 bool loopback = false;
122                 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
123                         loopback = true;
124 #if IS_ENABLED(CONFIG_IPV6)
125                 if (tw->tw_family == AF_INET6) {
126                         if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
127                             ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
128                             ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
129                             ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
130                                 loopback = true;
131                 } else
132 #endif
133                 {
134                         if (ipv4_is_loopback(tw->tw_daddr) ||
135                             ipv4_is_loopback(tw->tw_rcv_saddr))
136                                 loopback = true;
137                 }
138                 if (!loopback)
139                         reuse = 0;
140         }
141
 142         /* With PAWS, this is safe from the viewpoint
 143            of data integrity. Even without PAWS it is safe provided the sequence
 144            spaces do not overlap, i.e. at data rates <= 80 Mbit/sec.
 145
 146            The idea is close to VJ's, except that the timestamp cache is
 147            held per port pair rather than per host, and the TW bucket is used
 148            as the state holder.
 149
 150            If the TW bucket has already been destroyed we fall back to VJ's
 151            scheme and use the initial timestamp retrieved from the peer table.
 152          */
153         if (tcptw->tw_ts_recent_stamp &&
154             (!twp || (reuse && time_after32(ktime_get_seconds(),
155                                             tcptw->tw_ts_recent_stamp)))) {
156                 /* In case of repair and re-using TIME-WAIT sockets we still
157                  * want to be sure that it is safe as above but honor the
158                  * sequence numbers and time stamps set as part of the repair
159                  * process.
160                  *
161                  * Without this check re-using a TIME-WAIT socket with TCP
162                  * repair would accumulate a -1 on the repair assigned
163                  * sequence number. The first time it is reused the sequence
164                  * is -1, the second time -2, etc. This fixes that issue
165                  * without appearing to create any others.
166                  */
167                 if (likely(!tp->repair)) {
168                         u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
169
170                         if (!seq)
171                                 seq = 1;
172                         WRITE_ONCE(tp->write_seq, seq);
173                         tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
174                         tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
175                 }
176                 sock_hold(sktw);
177                 return 1;
178         }
179
180         return 0;
181 }
182 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
183
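/* Called before tcp_v4_connect(): validate the address length and run
 * the cgroup BPF INET4_CONNECT hook.
 */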
184 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
185                               int addr_len)
186 {
 187         /* This check is replicated from tcp_v4_connect() and is intended to
 188          * prevent the BPF program called below from accessing bytes that are
 189          * outside the bound specified by the user in addr_len.
190          */
191         if (addr_len < sizeof(struct sockaddr_in))
192                 return -EINVAL;
193
194         sock_owned_by_me(sk);
195
196         return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
197 }
198
199 /* This will initiate an outgoing connection. */
200 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
201 {
202         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
203         struct inet_sock *inet = inet_sk(sk);
204         struct tcp_sock *tp = tcp_sk(sk);
205         __be16 orig_sport, orig_dport;
206         __be32 daddr, nexthop;
207         struct flowi4 *fl4;
208         struct rtable *rt;
209         int err;
210         struct ip_options_rcu *inet_opt;
211         struct inet_timewait_death_row *tcp_death_row = sock_net(sk)->ipv4.tcp_death_row;
212
213         if (addr_len < sizeof(struct sockaddr_in))
214                 return -EINVAL;
215
216         if (usin->sin_family != AF_INET)
217                 return -EAFNOSUPPORT;
218
219         nexthop = daddr = usin->sin_addr.s_addr;
220         inet_opt = rcu_dereference_protected(inet->inet_opt,
221                                              lockdep_sock_is_held(sk));
222         if (inet_opt && inet_opt->opt.srr) {
223                 if (!daddr)
224                         return -EINVAL;
225                 nexthop = inet_opt->opt.faddr;
226         }
227
228         orig_sport = inet->inet_sport;
229         orig_dport = usin->sin_port;
230         fl4 = &inet->cork.fl.u.ip4;
231         rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
232                               sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
233                               orig_dport, sk);
234         if (IS_ERR(rt)) {
235                 err = PTR_ERR(rt);
236                 if (err == -ENETUNREACH)
237                         IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
238                 return err;
239         }
240
241         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
242                 ip_rt_put(rt);
243                 return -ENETUNREACH;
244         }
245
246         if (!inet_opt || !inet_opt->opt.srr)
247                 daddr = fl4->daddr;
248
249         if (!inet->inet_saddr)
250                 inet->inet_saddr = fl4->saddr;
251         sk_rcv_saddr_set(sk, inet->inet_saddr);
252
253         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
254                 /* Reset inherited state */
255                 tp->rx_opt.ts_recent       = 0;
256                 tp->rx_opt.ts_recent_stamp = 0;
257                 if (likely(!tp->repair))
258                         WRITE_ONCE(tp->write_seq, 0);
259         }
260
261         inet->inet_dport = usin->sin_port;
262         sk_daddr_set(sk, daddr);
263
264         inet_csk(sk)->icsk_ext_hdr_len = 0;
265         if (inet_opt)
266                 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
267
268         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
269
 270         /* Socket identity is still unknown (sport may be zero).
 271          * However we set the state to SYN-SENT and, without releasing the
 272          * socket lock, select a source port, enter ourselves into the hash
 273          * tables and complete initialization afterwards.
 274          */
275         tcp_set_state(sk, TCP_SYN_SENT);
276         err = inet_hash_connect(tcp_death_row, sk);
277         if (err)
278                 goto failure;
279
280         sk_set_txhash(sk);
281
282         rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
283                                inet->inet_sport, inet->inet_dport, sk);
284         if (IS_ERR(rt)) {
285                 err = PTR_ERR(rt);
286                 rt = NULL;
287                 goto failure;
288         }
289         /* OK, now commit destination to socket.  */
290         sk->sk_gso_type = SKB_GSO_TCPV4;
291         sk_setup_caps(sk, &rt->dst);
292         rt = NULL;
293
294         if (likely(!tp->repair)) {
295                 if (!tp->write_seq)
296                         WRITE_ONCE(tp->write_seq,
297                                    secure_tcp_seq(inet->inet_saddr,
298                                                   inet->inet_daddr,
299                                                   inet->inet_sport,
300                                                   usin->sin_port));
301                 tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
302                                                  inet->inet_saddr,
303                                                  inet->inet_daddr);
304         }
305
306         inet->inet_id = prandom_u32();
307
308         if (tcp_fastopen_defer_connect(sk, &err))
309                 return err;
310         if (err)
311                 goto failure;
312
313         err = tcp_connect(sk);
314
315         if (err)
316                 goto failure;
317
318         return 0;
319
320 failure:
321         /*
322          * This unhashes the socket and releases the local port,
323          * if necessary.
324          */
325         tcp_set_state(sk, TCP_CLOSE);
326         ip_rt_put(rt);
327         sk->sk_route_caps = 0;
328         inet->inet_dport = 0;
329         return err;
330 }
331 EXPORT_SYMBOL(tcp_v4_connect);
332
333 /*
 334  * This routine reacts to ICMP_FRAG_NEEDED MTU indications as defined in RFC 1191.
 335  * It can be called through tcp_release_cb() if the socket was owned by the user
 336  * at the time tcp_v4_err() was called to handle the ICMP message.
337  */
338 void tcp_v4_mtu_reduced(struct sock *sk)
339 {
340         struct inet_sock *inet = inet_sk(sk);
341         struct dst_entry *dst;
342         u32 mtu;
343
344         if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
345                 return;
346         mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
347         dst = inet_csk_update_pmtu(sk, mtu);
348         if (!dst)
349                 return;
350
 351         /* Something is about to go wrong... Remember the soft error
 352          * in case this connection is not able to recover.
353          */
354         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
355                 sk->sk_err_soft = EMSGSIZE;
356
357         mtu = dst_mtu(dst);
358
359         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
360             ip_sk_accept_pmtu(sk) &&
361             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
362                 tcp_sync_mss(sk, mtu);
363
364                 /* Resend the TCP packet because it's
365                  * clear that the old packet has been
366                  * dropped. This is the new "fast" path mtu
367                  * discovery.
368                  */
369                 tcp_simple_retransmit(sk);
370         } /* else let the usual retransmit timer handle it */
371 }
372 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
373
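/* Handle an ICMP redirect by passing it to the cached route's ->redirect() handler. */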
374 static void do_redirect(struct sk_buff *skb, struct sock *sk)
375 {
376         struct dst_entry *dst = __sk_dst_check(sk, 0);
377
378         if (dst)
379                 dst->ops->redirect(dst, sk, skb);
380 }
381
382
383 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
384 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
385 {
386         struct request_sock *req = inet_reqsk(sk);
387         struct net *net = sock_net(sk);
388
389         /* ICMPs are not backlogged, hence we cannot get
390          * an established socket here.
391          */
392         if (seq != tcp_rsk(req)->snt_isn) {
393                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
394         } else if (abort) {
395                 /*
396                  * Still in SYN_RECV, just remove it silently.
397                  * There is no good way to pass the error to the newly
398                  * created socket, and POSIX does not want network
399                  * errors returned from accept().
400                  */
401                 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
402                 tcp_listendrop(req->rsk_listener);
403         }
404         reqsk_put(req);
405 }
406 EXPORT_SYMBOL(tcp_req_err);
407
408 /* TCP-LD (RFC 6069) logic */
409 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
410 {
411         struct inet_connection_sock *icsk = inet_csk(sk);
412         struct tcp_sock *tp = tcp_sk(sk);
413         struct sk_buff *skb;
414         s32 remaining;
415         u32 delta_us;
416
417         if (sock_owned_by_user(sk))
418                 return;
419
420         if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
421             !icsk->icsk_backoff)
422                 return;
423
424         skb = tcp_rtx_queue_head(sk);
425         if (WARN_ON_ONCE(!skb))
426                 return;
427
428         icsk->icsk_backoff--;
429         icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
430         icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
431
432         tcp_mstamp_refresh(tp);
433         delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
434         remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
435
436         if (remaining > 0) {
437                 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
438                                           remaining, TCP_RTO_MAX);
439         } else {
440                 /* RTO revert clocked out retransmission.
441                  * Will retransmit now.
442                  */
443                 tcp_retransmit_timer(sk);
444         }
445 }
446 EXPORT_SYMBOL(tcp_ld_RTO_revert);
447
448 /*
449  * This routine is called by the ICMP module when it gets some
450  * sort of error condition.  If err < 0 then the socket should
451  * be closed and the error returned to the user.  If err > 0
 452  * it's just the icmp type << 8 | icmp code.  After adjustment,
 453  * the header points to the first 8 bytes of the TCP header.  We need
 454  * to find the appropriate port.
455  *
456  * The locking strategy used here is very "optimistic". When
457  * someone else accesses the socket the ICMP is just dropped
458  * and for some paths there is no check at all.
459  * A more general error queue to queue errors for later handling
460  * is probably better.
461  *
462  */
463
464 int tcp_v4_err(struct sk_buff *skb, u32 info)
465 {
466         const struct iphdr *iph = (const struct iphdr *)skb->data;
467         struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
468         struct tcp_sock *tp;
469         struct inet_sock *inet;
470         const int type = icmp_hdr(skb)->type;
471         const int code = icmp_hdr(skb)->code;
472         struct sock *sk;
473         struct request_sock *fastopen;
474         u32 seq, snd_una;
475         int err;
476         struct net *net = dev_net(skb->dev);
477
478         sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
479                                        th->dest, iph->saddr, ntohs(th->source),
480                                        inet_iif(skb), 0);
481         if (!sk) {
482                 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
483                 return -ENOENT;
484         }
485         if (sk->sk_state == TCP_TIME_WAIT) {
486                 inet_twsk_put(inet_twsk(sk));
487                 return 0;
488         }
489         seq = ntohl(th->seq);
490         if (sk->sk_state == TCP_NEW_SYN_RECV) {
491                 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
492                                      type == ICMP_TIME_EXCEEDED ||
493                                      (type == ICMP_DEST_UNREACH &&
494                                       (code == ICMP_NET_UNREACH ||
495                                        code == ICMP_HOST_UNREACH)));
496                 return 0;
497         }
498
499         bh_lock_sock(sk);
500         /* If too many ICMPs get dropped on busy
501          * servers this needs to be solved differently.
 502          * We do take care of the PMTU discovery (RFC 1191) special case:
 503          * we can receive locally generated ICMP messages while the socket is held.
504          */
505         if (sock_owned_by_user(sk)) {
506                 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
507                         __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
508         }
509         if (sk->sk_state == TCP_CLOSE)
510                 goto out;
511
512         if (static_branch_unlikely(&ip4_min_ttl)) {
513                 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
514                 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
515                         __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
516                         goto out;
517                 }
518         }
519
520         tp = tcp_sk(sk);
521         /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
522         fastopen = rcu_dereference(tp->fastopen_rsk);
523         snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
524         if (sk->sk_state != TCP_LISTEN &&
525             !between(seq, snd_una, tp->snd_nxt)) {
526                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
527                 goto out;
528         }
529
530         switch (type) {
531         case ICMP_REDIRECT:
532                 if (!sock_owned_by_user(sk))
533                         do_redirect(skb, sk);
534                 goto out;
535         case ICMP_SOURCE_QUENCH:
536                 /* Just silently ignore these. */
537                 goto out;
538         case ICMP_PARAMETERPROB:
539                 err = EPROTO;
540                 break;
541         case ICMP_DEST_UNREACH:
542                 if (code > NR_ICMP_UNREACH)
543                         goto out;
544
545                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
546                         /* We are not interested in TCP_LISTEN and open_requests
 547                          * (SYN-ACKs sent out by Linux are always < 576 bytes, so
548                          * they should go through unfragmented).
549                          */
550                         if (sk->sk_state == TCP_LISTEN)
551                                 goto out;
552
553                         WRITE_ONCE(tp->mtu_info, info);
554                         if (!sock_owned_by_user(sk)) {
555                                 tcp_v4_mtu_reduced(sk);
556                         } else {
557                                 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
558                                         sock_hold(sk);
559                         }
560                         goto out;
561                 }
562
563                 err = icmp_err_convert[code].errno;
 564                 /* Check whether this ICMP message allows reverting the backoff.
565                  * (see RFC 6069)
566                  */
567                 if (!fastopen &&
568                     (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
569                         tcp_ld_RTO_revert(sk, seq);
570                 break;
571         case ICMP_TIME_EXCEEDED:
572                 err = EHOSTUNREACH;
573                 break;
574         default:
575                 goto out;
576         }
577
578         switch (sk->sk_state) {
579         case TCP_SYN_SENT:
580         case TCP_SYN_RECV:
581                 /* Only in fast or simultaneous open. If a fast open socket is
582                  * already accepted it is treated as a connected one below.
583                  */
584                 if (fastopen && !fastopen->sk)
585                         break;
586
587                 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
588
589                 if (!sock_owned_by_user(sk)) {
590                         sk->sk_err = err;
591
592                         sk_error_report(sk);
593
594                         tcp_done(sk);
595                 } else {
596                         sk->sk_err_soft = err;
597                 }
598                 goto out;
599         }
600
601         /* If we've already connected we will keep trying
602          * until we time out, or the user gives up.
603          *
 604          * RFC 1122 4.2.3.9 allows only PROTO_UNREACH and PORT_UNREACH
 605          * to be considered hard errors (well, FRAG_FAILED too,
 606          * but it is obsoleted by PMTU discovery).
 607          *
 608          * Note that in the modern Internet, where routing is unreliable
 609          * and broken firewalls sit in every dark corner sending random
 610          * errors as ordered by their masters, even these two messages have
 611          * finally lost their original sense (even Linux sends invalid PORT_UNREACHs).
612          *
613          * Now we are in compliance with RFCs.
614          *                                                      --ANK (980905)
615          */
616
617         inet = inet_sk(sk);
618         if (!sock_owned_by_user(sk) && inet->recverr) {
619                 sk->sk_err = err;
620                 sk_error_report(sk);
621         } else  { /* Only an error on timeout */
622                 sk->sk_err_soft = err;
623         }
624
625 out:
626         bh_unlock_sock(sk);
627         sock_put(sk);
628         return 0;
629 }
630
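/* Prepare an outgoing segment for checksum offload: store the pseudo-header
 * portion of the checksum in th->check and record where the final checksum
 * must be written.
 */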
631 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
632 {
633         struct tcphdr *th = tcp_hdr(skb);
634
635         th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
636         skb->csum_start = skb_transport_header(skb) - skb->head;
637         skb->csum_offset = offsetof(struct tcphdr, check);
638 }
639
640 /* This routine computes an IPv4 TCP checksum. */
641 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
642 {
643         const struct inet_sock *inet = inet_sk(sk);
644
645         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
646 }
647 EXPORT_SYMBOL(tcp_v4_send_check);
648
649 /*
 650  *      This routine will send an RST to the other TCP.
 651  *
 652  *      Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
 653  *                    for the reset?
 654  *      Answer: if a packet caused an RST, it is not for a socket
 655  *              existing in our system; if it does match a socket,
 656  *              it is just a duplicate segment or a bug in the other side's TCP.
 657  *              So we build the reply based only on the parameters
 658  *              that arrived with the segment.
659  *      Exception: precedence violation. We do not implement it in any case.
660  */
661
662 #ifdef CONFIG_TCP_MD5SIG
663 #define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
664 #else
665 #define OPTION_BYTES sizeof(__be32)
666 #endif
667
668 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
669 {
670         const struct tcphdr *th = tcp_hdr(skb);
671         struct {
672                 struct tcphdr th;
673                 __be32 opt[OPTION_BYTES / sizeof(__be32)];
674         } rep;
675         struct ip_reply_arg arg;
676 #ifdef CONFIG_TCP_MD5SIG
677         struct tcp_md5sig_key *key = NULL;
678         const __u8 *hash_location = NULL;
679         unsigned char newhash[16];
680         int genhash;
681         struct sock *sk1 = NULL;
682 #endif
683         u64 transmit_time = 0;
684         struct sock *ctl_sk;
685         struct net *net;
686
687         /* Never send a reset in response to a reset. */
688         if (th->rst)
689                 return;
690
 691         /* If sk is not NULL, it means we did a successful lookup and the
 692          * incoming route had to be correct. prequeue might have dropped our dst.
693          */
694         if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
695                 return;
696
697         /* Swap the send and the receive. */
698         memset(&rep, 0, sizeof(rep));
699         rep.th.dest   = th->source;
700         rep.th.source = th->dest;
701         rep.th.doff   = sizeof(struct tcphdr) / 4;
702         rep.th.rst    = 1;
703
704         if (th->ack) {
705                 rep.th.seq = th->ack_seq;
706         } else {
707                 rep.th.ack = 1;
708                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
709                                        skb->len - (th->doff << 2));
710         }
711
712         memset(&arg, 0, sizeof(arg));
713         arg.iov[0].iov_base = (unsigned char *)&rep;
714         arg.iov[0].iov_len  = sizeof(rep.th);
715
716         net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
717 #ifdef CONFIG_TCP_MD5SIG
718         rcu_read_lock();
719         hash_location = tcp_parse_md5sig_option(th);
720         if (sk && sk_fullsock(sk)) {
721                 const union tcp_md5_addr *addr;
722                 int l3index;
723
 724                 /* sdif set means the packet ingressed via a device
725                  * in an L3 domain and inet_iif is set to it.
726                  */
727                 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
728                 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
729                 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
730         } else if (hash_location) {
731                 const union tcp_md5_addr *addr;
732                 int sdif = tcp_v4_sdif(skb);
733                 int dif = inet_iif(skb);
734                 int l3index;
735
736                 /*
 737                  * The active side is gone. Try to find the listening socket
 738                  * through the source port, and then find the md5 key through
 739                  * that listening socket. We do not loosen security here:
 740                  * the incoming packet is checked against the md5 hash of the
 741                  * key we found, and no RST is generated if the hash doesn't match.
742                  */
743                 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
744                                              ip_hdr(skb)->saddr,
745                                              th->source, ip_hdr(skb)->daddr,
746                                              ntohs(th->source), dif, sdif);
 747                 /* don't send an RST if we can't find a key */
748                 if (!sk1)
749                         goto out;
750
 751                 /* sdif set means the packet ingressed via a device
752                  * in an L3 domain and dif is set to it.
753                  */
754                 l3index = sdif ? dif : 0;
755                 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
756                 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
757                 if (!key)
758                         goto out;
759
760
761                 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
762                 if (genhash || memcmp(hash_location, newhash, 16) != 0)
763                         goto out;
764
765         }
766
767         if (key) {
768                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
769                                    (TCPOPT_NOP << 16) |
770                                    (TCPOPT_MD5SIG << 8) |
771                                    TCPOLEN_MD5SIG);
772                 /* Update length and the length the header thinks exists */
773                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
774                 rep.th.doff = arg.iov[0].iov_len / 4;
775
776                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
777                                      key, ip_hdr(skb)->saddr,
778                                      ip_hdr(skb)->daddr, &rep.th);
779         }
780 #endif
781         /* Can't co-exist with TCPMD5, hence check rep.opt[0] */
782         if (rep.opt[0] == 0) {
783                 __be32 mrst = mptcp_reset_option(skb);
784
785                 if (mrst) {
786                         rep.opt[0] = mrst;
787                         arg.iov[0].iov_len += sizeof(mrst);
788                         rep.th.doff = arg.iov[0].iov_len / 4;
789                 }
790         }
791
792         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
793                                       ip_hdr(skb)->saddr, /* XXX */
794                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
795         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
796         arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
797
 798         /* When the socket is gone, all binding information is lost and
 799          * routing might fail in this case. No choice here: if we choose to force
 800          * the input interface, we will misroute in the case of an asymmetric route.
801          */
802         if (sk) {
803                 arg.bound_dev_if = sk->sk_bound_dev_if;
804                 if (sk_fullsock(sk))
805                         trace_tcp_send_reset(sk, skb);
806         }
807
808         BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
809                      offsetof(struct inet_timewait_sock, tw_bound_dev_if));
810
811         arg.tos = ip_hdr(skb)->tos;
812         arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
813         local_bh_disable();
814         ctl_sk = this_cpu_read(ipv4_tcp_sk);
815         sock_net_set(ctl_sk, net);
816         if (sk) {
817                 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
818                                    inet_twsk(sk)->tw_mark : sk->sk_mark;
819                 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
820                                    inet_twsk(sk)->tw_priority : sk->sk_priority;
821                 transmit_time = tcp_transmit_time(sk);
822         }
823         ip_send_unicast_reply(ctl_sk,
824                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
825                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
826                               &arg, arg.iov[0].iov_len,
827                               transmit_time);
828
829         ctl_sk->sk_mark = 0;
830         sock_net_set(ctl_sk, &init_net);
831         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
832         __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
833         local_bh_enable();
834
835 #ifdef CONFIG_TCP_MD5SIG
836 out:
837         rcu_read_unlock();
838 #endif
839 }
840
 841 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
 842    outside of socket context, is certainly ugly. What can I do?
843  */
844
845 static void tcp_v4_send_ack(const struct sock *sk,
846                             struct sk_buff *skb, u32 seq, u32 ack,
847                             u32 win, u32 tsval, u32 tsecr, int oif,
848                             struct tcp_md5sig_key *key,
849                             int reply_flags, u8 tos)
850 {
851         const struct tcphdr *th = tcp_hdr(skb);
852         struct {
853                 struct tcphdr th;
854                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
855 #ifdef CONFIG_TCP_MD5SIG
856                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
857 #endif
858                         ];
859         } rep;
860         struct net *net = sock_net(sk);
861         struct ip_reply_arg arg;
862         struct sock *ctl_sk;
863         u64 transmit_time;
864
865         memset(&rep.th, 0, sizeof(struct tcphdr));
866         memset(&arg, 0, sizeof(arg));
867
868         arg.iov[0].iov_base = (unsigned char *)&rep;
869         arg.iov[0].iov_len  = sizeof(rep.th);
870         if (tsecr) {
871                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
872                                    (TCPOPT_TIMESTAMP << 8) |
873                                    TCPOLEN_TIMESTAMP);
874                 rep.opt[1] = htonl(tsval);
875                 rep.opt[2] = htonl(tsecr);
876                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
877         }
878
879         /* Swap the send and the receive. */
880         rep.th.dest    = th->source;
881         rep.th.source  = th->dest;
882         rep.th.doff    = arg.iov[0].iov_len / 4;
883         rep.th.seq     = htonl(seq);
884         rep.th.ack_seq = htonl(ack);
885         rep.th.ack     = 1;
886         rep.th.window  = htons(win);
887
888 #ifdef CONFIG_TCP_MD5SIG
889         if (key) {
890                 int offset = (tsecr) ? 3 : 0;
891
892                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
893                                           (TCPOPT_NOP << 16) |
894                                           (TCPOPT_MD5SIG << 8) |
895                                           TCPOLEN_MD5SIG);
896                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
897                 rep.th.doff = arg.iov[0].iov_len/4;
898
899                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
900                                     key, ip_hdr(skb)->saddr,
901                                     ip_hdr(skb)->daddr, &rep.th);
902         }
903 #endif
904         arg.flags = reply_flags;
905         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
906                                       ip_hdr(skb)->saddr, /* XXX */
907                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
908         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
909         if (oif)
910                 arg.bound_dev_if = oif;
911         arg.tos = tos;
912         arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
913         local_bh_disable();
914         ctl_sk = this_cpu_read(ipv4_tcp_sk);
915         sock_net_set(ctl_sk, net);
916         ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
917                            inet_twsk(sk)->tw_mark : sk->sk_mark;
918         ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
919                            inet_twsk(sk)->tw_priority : sk->sk_priority;
920         transmit_time = tcp_transmit_time(sk);
921         ip_send_unicast_reply(ctl_sk,
922                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
923                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
924                               &arg, arg.iov[0].iov_len,
925                               transmit_time);
926
927         ctl_sk->sk_mark = 0;
928         sock_net_set(ctl_sk, &init_net);
929         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
930         local_bh_enable();
931 }
932
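/* Send an ACK on behalf of a TIME-WAIT socket, using the state saved in
 * the timewait bucket.
 */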
933 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
934 {
935         struct inet_timewait_sock *tw = inet_twsk(sk);
936         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
937
938         tcp_v4_send_ack(sk, skb,
939                         tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
940                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
941                         tcp_time_stamp_raw() + tcptw->tw_ts_offset,
942                         tcptw->tw_ts_recent,
943                         tw->tw_bound_dev_if,
944                         tcp_twsk_md5_key(tcptw),
945                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
946                         tw->tw_tos
947                         );
948
949         inet_twsk_put(tw);
950 }
951
952 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
953                                   struct request_sock *req)
954 {
955         const union tcp_md5_addr *addr;
956         int l3index;
957
958         /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
959          * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
960          */
961         u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
962                                              tcp_sk(sk)->snd_nxt;
963
964         /* RFC 7323 2.3
965          * The window field (SEG.WND) of every outgoing segment, with the
966          * exception of <SYN> segments, MUST be right-shifted by
967          * Rcv.Wind.Shift bits:
968          */
969         addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
970         l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
971         tcp_v4_send_ack(sk, skb, seq,
972                         tcp_rsk(req)->rcv_nxt,
973                         req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
974                         tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
975                         req->ts_recent,
976                         0,
977                         tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
978                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
979                         ip_hdr(skb)->tos);
980 }
981
982 /*
983  *      Send a SYN-ACK after having received a SYN.
984  *      This still operates on a request_sock only, not on a big
985  *      socket.
986  */
987 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
988                               struct flowi *fl,
989                               struct request_sock *req,
990                               struct tcp_fastopen_cookie *foc,
991                               enum tcp_synack_type synack_type,
992                               struct sk_buff *syn_skb)
993 {
994         const struct inet_request_sock *ireq = inet_rsk(req);
995         struct flowi4 fl4;
996         int err = -1;
997         struct sk_buff *skb;
998         u8 tos;
999
1000         /* First, grab a route. */
1001         if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1002                 return -1;
1003
1004         skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1005
1006         if (skb) {
1007                 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1008
1009                 tos = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ?
1010                                 (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1011                                 (inet_sk(sk)->tos & INET_ECN_MASK) :
1012                                 inet_sk(sk)->tos;
1013
1014                 if (!INET_ECN_is_capable(tos) &&
1015                     tcp_bpf_ca_needs_ecn((struct sock *)req))
1016                         tos |= INET_ECN_ECT_0;
1017
1018                 rcu_read_lock();
1019                 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1020                                             ireq->ir_rmt_addr,
1021                                             rcu_dereference(ireq->ireq_opt),
1022                                             tos);
1023                 rcu_read_unlock();
1024                 err = net_xmit_eval(err);
1025         }
1026
1027         return err;
1028 }
1029
1030 /*
1031  *      IPv4 request_sock destructor.
1032  */
1033 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1034 {
1035         kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1036 }
1037
1038 #ifdef CONFIG_TCP_MD5SIG
1039 /*
1040  * RFC2385 MD5 checksumming requires a mapping of
1041  * IP address->MD5 Key.
1042  * We need to maintain these in the sk structure.
1043  */
1044
1045 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
1046 EXPORT_SYMBOL(tcp_md5_needed);
1047
1048 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1049 {
1050         if (!old)
1051                 return true;
1052
1053         /* l3index always overrides non-l3index */
1054         if (old->l3index && new->l3index == 0)
1055                 return false;
1056         if (old->l3index == 0 && new->l3index)
1057                 return true;
1058
1059         return old->prefixlen < new->prefixlen;
1060 }
1061
1062 /* Find the Key structure for an address.  */
1063 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1064                                            const union tcp_md5_addr *addr,
1065                                            int family)
1066 {
1067         const struct tcp_sock *tp = tcp_sk(sk);
1068         struct tcp_md5sig_key *key;
1069         const struct tcp_md5sig_info *md5sig;
1070         __be32 mask;
1071         struct tcp_md5sig_key *best_match = NULL;
1072         bool match;
1073
1074         /* caller either holds rcu_read_lock() or socket lock */
1075         md5sig = rcu_dereference_check(tp->md5sig_info,
1076                                        lockdep_sock_is_held(sk));
1077         if (!md5sig)
1078                 return NULL;
1079
1080         hlist_for_each_entry_rcu(key, &md5sig->head, node,
1081                                  lockdep_sock_is_held(sk)) {
1082                 if (key->family != family)
1083                         continue;
1084                 if (key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index)
1085                         continue;
1086                 if (family == AF_INET) {
1087                         mask = inet_make_mask(key->prefixlen);
1088                         match = (key->addr.a4.s_addr & mask) ==
1089                                 (addr->a4.s_addr & mask);
1090 #if IS_ENABLED(CONFIG_IPV6)
1091                 } else if (family == AF_INET6) {
1092                         match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1093                                                   key->prefixlen);
1094 #endif
1095                 } else {
1096                         match = false;
1097                 }
1098
1099                 if (match && better_md5_match(best_match, key))
1100                         best_match = key;
1101         }
1102         return best_match;
1103 }
1104 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1105
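/* Exact-match key lookup: family, address, prefix length, L3 index and
 * the ifindex flag must all match. Used when adding or deleting keys.
 */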
1106 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1107                                                       const union tcp_md5_addr *addr,
1108                                                       int family, u8 prefixlen,
1109                                                       int l3index, u8 flags)
1110 {
1111         const struct tcp_sock *tp = tcp_sk(sk);
1112         struct tcp_md5sig_key *key;
1113         unsigned int size = sizeof(struct in_addr);
1114         const struct tcp_md5sig_info *md5sig;
1115
1116         /* caller either holds rcu_read_lock() or socket lock */
1117         md5sig = rcu_dereference_check(tp->md5sig_info,
1118                                        lockdep_sock_is_held(sk));
1119         if (!md5sig)
1120                 return NULL;
1121 #if IS_ENABLED(CONFIG_IPV6)
1122         if (family == AF_INET6)
1123                 size = sizeof(struct in6_addr);
1124 #endif
1125         hlist_for_each_entry_rcu(key, &md5sig->head, node,
1126                                  lockdep_sock_is_held(sk)) {
1127                 if (key->family != family)
1128                         continue;
1129                 if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1130                         continue;
1131                 if (key->l3index != l3index)
1132                         continue;
1133                 if (!memcmp(&key->addr, addr, size) &&
1134                     key->prefixlen == prefixlen)
1135                         return key;
1136         }
1137         return NULL;
1138 }
1139
1140 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1141                                          const struct sock *addr_sk)
1142 {
1143         const union tcp_md5_addr *addr;
1144         int l3index;
1145
1146         l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1147                                                  addr_sk->sk_bound_dev_if);
1148         addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1149         return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1150 }
1151 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1152
1153 /* This can be called on a newly created socket, from other files */
1154 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1155                    int family, u8 prefixlen, int l3index, u8 flags,
1156                    const u8 *newkey, u8 newkeylen, gfp_t gfp)
1157 {
1158         /* Add Key to the list */
1159         struct tcp_md5sig_key *key;
1160         struct tcp_sock *tp = tcp_sk(sk);
1161         struct tcp_md5sig_info *md5sig;
1162
1163         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1164         if (key) {
1165                 /* Pre-existing entry - just update that one.
1166                  * Note that the key might be used concurrently.
 1167                  * data_race() is telling KCSAN that we do not care about
 1168                  * key mismatches, since changing the MD5 key on live flows
1169                  * can lead to packet drops.
1170                  */
1171                 data_race(memcpy(key->key, newkey, newkeylen));
1172
1173                 /* Pairs with READ_ONCE() in tcp_md5_hash_key().
 1174                  * Also note that a reader could catch the new key->keylen value
 1175                  * but the old key->key[]; this is the reason we use __GFP_ZERO
1176                  * at sock_kmalloc() time below these lines.
1177                  */
1178                 WRITE_ONCE(key->keylen, newkeylen);
1179
1180                 return 0;
1181         }
1182
1183         md5sig = rcu_dereference_protected(tp->md5sig_info,
1184                                            lockdep_sock_is_held(sk));
1185         if (!md5sig) {
1186                 md5sig = kmalloc(sizeof(*md5sig), gfp);
1187                 if (!md5sig)
1188                         return -ENOMEM;
1189
1190                 sk_gso_disable(sk);
1191                 INIT_HLIST_HEAD(&md5sig->head);
1192                 rcu_assign_pointer(tp->md5sig_info, md5sig);
1193         }
1194
1195         key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1196         if (!key)
1197                 return -ENOMEM;
1198         if (!tcp_alloc_md5sig_pool()) {
1199                 sock_kfree_s(sk, key, sizeof(*key));
1200                 return -ENOMEM;
1201         }
1202
1203         memcpy(key->key, newkey, newkeylen);
1204         key->keylen = newkeylen;
1205         key->family = family;
1206         key->prefixlen = prefixlen;
1207         key->l3index = l3index;
1208         key->flags = flags;
1209         memcpy(&key->addr, addr,
1210                (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
1211                                                                  sizeof(struct in_addr));
1212         hlist_add_head_rcu(&key->node, &md5sig->head);
1213         return 0;
1214 }
1215 EXPORT_SYMBOL(tcp_md5_do_add);
1216
1217 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1218                    u8 prefixlen, int l3index, u8 flags)
1219 {
1220         struct tcp_md5sig_key *key;
1221
1222         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1223         if (!key)
1224                 return -ENOENT;
1225         hlist_del_rcu(&key->node);
1226         atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1227         kfree_rcu(key, rcu);
1228         return 0;
1229 }
1230 EXPORT_SYMBOL(tcp_md5_do_del);
1231
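/* Remove and free every MD5 key attached to this socket. */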
1232 static void tcp_clear_md5_list(struct sock *sk)
1233 {
1234         struct tcp_sock *tp = tcp_sk(sk);
1235         struct tcp_md5sig_key *key;
1236         struct hlist_node *n;
1237         struct tcp_md5sig_info *md5sig;
1238
1239         md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1240
1241         hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1242                 hlist_del_rcu(&key->node);
1243                 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1244                 kfree_rcu(key, rcu);
1245         }
1246 }
1247
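/* Handle the TCP_MD5SIG/TCP_MD5SIG_EXT socket options: validate the
 * request, then add or delete the corresponding key.
 */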
1248 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1249                                  sockptr_t optval, int optlen)
1250 {
1251         struct tcp_md5sig cmd;
1252         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1253         const union tcp_md5_addr *addr;
1254         u8 prefixlen = 32;
1255         int l3index = 0;
1256         u8 flags;
1257
1258         if (optlen < sizeof(cmd))
1259                 return -EINVAL;
1260
1261         if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1262                 return -EFAULT;
1263
1264         if (sin->sin_family != AF_INET)
1265                 return -EINVAL;
1266
1267         flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1268
1269         if (optname == TCP_MD5SIG_EXT &&
1270             cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1271                 prefixlen = cmd.tcpm_prefixlen;
1272                 if (prefixlen > 32)
1273                         return -EINVAL;
1274         }
1275
1276         if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1277             cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1278                 struct net_device *dev;
1279
1280                 rcu_read_lock();
1281                 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1282                 if (dev && netif_is_l3_master(dev))
1283                         l3index = dev->ifindex;
1284
1285                 rcu_read_unlock();
1286
1287                 /* ok to reference set/not set outside of rcu;
1288                  * right now device MUST be an L3 master
1289                  */
1290                 if (!dev || !l3index)
1291                         return -EINVAL;
1292         }
1293
1294         addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1295
1296         if (!cmd.tcpm_keylen)
1297                 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1298
1299         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1300                 return -EINVAL;
1301
1302         return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1303                               cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1304 }
1305
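/* Feed the IPv4 pseudo-header and the TCP header (with its checksum field
 * zeroed) into the MD5 hash.
 */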
1306 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1307                                    __be32 daddr, __be32 saddr,
1308                                    const struct tcphdr *th, int nbytes)
1309 {
1310         struct tcp4_pseudohdr *bp;
1311         struct scatterlist sg;
1312         struct tcphdr *_th;
1313
1314         bp = hp->scratch;
1315         bp->saddr = saddr;
1316         bp->daddr = daddr;
1317         bp->pad = 0;
1318         bp->protocol = IPPROTO_TCP;
1319         bp->len = cpu_to_be16(nbytes);
1320
1321         _th = (struct tcphdr *)(bp + 1);
1322         memcpy(_th, th, sizeof(*th));
1323         _th->check = 0;
1324
1325         sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1326         ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1327                                 sizeof(*bp) + sizeof(*th));
1328         return crypto_ahash_update(hp->md5_req);
1329 }
1330
1331 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1332                                __be32 daddr, __be32 saddr, const struct tcphdr *th)
1333 {
1334         struct tcp_md5sig_pool *hp;
1335         struct ahash_request *req;
1336
1337         hp = tcp_get_md5sig_pool();
1338         if (!hp)
1339                 goto clear_hash_noput;
1340         req = hp->md5_req;
1341
1342         if (crypto_ahash_init(req))
1343                 goto clear_hash;
1344         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1345                 goto clear_hash;
1346         if (tcp_md5_hash_key(hp, key))
1347                 goto clear_hash;
1348         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1349         if (crypto_ahash_final(req))
1350                 goto clear_hash;
1351
1352         tcp_put_md5sig_pool();
1353         return 0;
1354
1355 clear_hash:
1356         tcp_put_md5sig_pool();
1357 clear_hash_noput:
1358         memset(md5_hash, 0, 16);
1359         return 1;
1360 }
1361
1362 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1363                         const struct sock *sk,
1364                         const struct sk_buff *skb)
1365 {
1366         struct tcp_md5sig_pool *hp;
1367         struct ahash_request *req;
1368         const struct tcphdr *th = tcp_hdr(skb);
1369         __be32 saddr, daddr;
1370
1371         if (sk) { /* valid for establish/request sockets */
1372                 saddr = sk->sk_rcv_saddr;
1373                 daddr = sk->sk_daddr;
1374         } else {
1375                 const struct iphdr *iph = ip_hdr(skb);
1376                 saddr = iph->saddr;
1377                 daddr = iph->daddr;
1378         }
1379
1380         hp = tcp_get_md5sig_pool();
1381         if (!hp)
1382                 goto clear_hash_noput;
1383         req = hp->md5_req;
1384
1385         if (crypto_ahash_init(req))
1386                 goto clear_hash;
1387
1388         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1389                 goto clear_hash;
1390         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1391                 goto clear_hash;
1392         if (tcp_md5_hash_key(hp, key))
1393                 goto clear_hash;
1394         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1395         if (crypto_ahash_final(req))
1396                 goto clear_hash;
1397
1398         tcp_put_md5sig_pool();
1399         return 0;
1400
1401 clear_hash:
1402         tcp_put_md5sig_pool();
1403 clear_hash_noput:
1404         memset(md5_hash, 0, 16);
1405         return 1;
1406 }
1407 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1408
1409 #endif
1410
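     /* Record the endpoints of the incoming SYN in the request sock (note
      * the swap: the SYN's daddr is our local address) and save any IP
      * options so they can be used for the SYN-ACK and inherited by the
      * child socket.
      */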
1411 static void tcp_v4_init_req(struct request_sock *req,
1412                             const struct sock *sk_listener,
1413                             struct sk_buff *skb)
1414 {
1415         struct inet_request_sock *ireq = inet_rsk(req);
1416         struct net *net = sock_net(sk_listener);
1417
1418         sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1419         sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1420         RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1421 }
1422
1423 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1424                                           struct sk_buff *skb,
1425                                           struct flowi *fl,
1426                                           struct request_sock *req)
1427 {
1428         tcp_v4_init_req(req, sk, skb);
1429
1430         if (security_inet_conn_request(sk, skb, req))
1431                 return NULL;
1432
1433         return inet_csk_route_req(sk, &fl->u.ip4, req);
1434 }
1435
1436 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1437         .family         =       PF_INET,
1438         .obj_size       =       sizeof(struct tcp_request_sock),
1439         .rtx_syn_ack    =       tcp_rtx_synack,
1440         .send_ack       =       tcp_v4_reqsk_send_ack,
1441         .destructor     =       tcp_v4_reqsk_destructor,
1442         .send_reset     =       tcp_v4_send_reset,
1443         .syn_ack_timeout =      tcp_syn_ack_timeout,
1444 };
1445
1446 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1447         .mss_clamp      =       TCP_MSS_DEFAULT,
1448 #ifdef CONFIG_TCP_MD5SIG
1449         .req_md5_lookup =       tcp_v4_md5_lookup,
1450         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1451 #endif
1452 #ifdef CONFIG_SYN_COOKIES
1453         .cookie_init_seq =      cookie_v4_init_sequence,
1454 #endif
1455         .route_req      =       tcp_v4_route_req,
1456         .init_seq       =       tcp_v4_init_seq,
1457         .init_ts_off    =       tcp_v4_init_ts_off,
1458         .send_synack    =       tcp_v4_send_synack,
1459 };
1460
1461 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1462 {
1463         /* Never answer SYNs sent to broadcast or multicast */
1464         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1465                 goto drop;
1466
1467         return tcp_conn_request(&tcp_request_sock_ops,
1468                                 &tcp_request_sock_ipv4_ops, sk, skb);
1469
1470 drop:
1471         tcp_listendrop(sk);
1472         return 0;
1473 }
1474 EXPORT_SYMBOL(tcp_v4_conn_request);
1475
1476
1477 /*
1478  * The three-way handshake has completed - we got a valid ACK -
1479  * now create the new socket.
1480  */
1481 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1482                                   struct request_sock *req,
1483                                   struct dst_entry *dst,
1484                                   struct request_sock *req_unhash,
1485                                   bool *own_req)
1486 {
1487         struct inet_request_sock *ireq;
1488         bool found_dup_sk = false;
1489         struct inet_sock *newinet;
1490         struct tcp_sock *newtp;
1491         struct sock *newsk;
1492 #ifdef CONFIG_TCP_MD5SIG
1493         const union tcp_md5_addr *addr;
1494         struct tcp_md5sig_key *key;
1495         int l3index;
1496 #endif
1497         struct ip_options_rcu *inet_opt;
1498
1499         if (sk_acceptq_is_full(sk))
1500                 goto exit_overflow;
1501
1502         newsk = tcp_create_openreq_child(sk, req, skb);
1503         if (!newsk)
1504                 goto exit_nonewsk;
1505
1506         newsk->sk_gso_type = SKB_GSO_TCPV4;
1507         inet_sk_rx_dst_set(newsk, skb);
1508
1509         newtp                 = tcp_sk(newsk);
1510         newinet               = inet_sk(newsk);
1511         ireq                  = inet_rsk(req);
1512         sk_daddr_set(newsk, ireq->ir_rmt_addr);
1513         sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1514         newsk->sk_bound_dev_if = ireq->ir_iif;
1515         newinet->inet_saddr   = ireq->ir_loc_addr;
1516         inet_opt              = rcu_dereference(ireq->ireq_opt);
1517         RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1518         newinet->mc_index     = inet_iif(skb);
1519         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1520         newinet->rcv_tos      = ip_hdr(skb)->tos;
1521         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1522         if (inet_opt)
1523                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1524         newinet->inet_id = prandom_u32();
1525
1526         /* Set ToS of the new socket based upon the value of the incoming SYN.
1527          * ECT bits are set later in tcp_init_transfer().
1528          */
1529         if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)
1530                 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1531
1532         if (!dst) {
1533                 dst = inet_csk_route_child_sock(sk, newsk, req);
1534                 if (!dst)
1535                         goto put_and_exit;
1536         } else {
1537                 /* syncookie case: see end of cookie_v4_check() */
1538         }
1539         sk_setup_caps(newsk, dst);
1540
1541         tcp_ca_openreq_child(newsk, dst);
1542
1543         tcp_sync_mss(newsk, dst_mtu(dst));
1544         newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1545
1546         tcp_initialize_rcv_mss(newsk);
1547
1548 #ifdef CONFIG_TCP_MD5SIG
1549         l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1550         /* Copy over the MD5 key from the original socket */
1551         addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1552         key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1553         if (key) {
1554                 /*
1555                  * We're using one, so create a matching key
1556                  * The listener has an MD5 key for this peer, so create
1557                  * a matching key on the newsk structure. If we fail to
1558                  * get memory, we simply end up not copying the key
1559                  * across.
1560                 tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index, key->flags,
1561                                key->key, key->keylen, GFP_ATOMIC);
1562                 sk_gso_disable(newsk);
1563         }
1564 #endif
1565
1566         if (__inet_inherit_port(sk, newsk) < 0)
1567                 goto put_and_exit;
1568         *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1569                                        &found_dup_sk);
1570         if (likely(*own_req)) {
1571                 tcp_move_syn(newtp, req);
1572                 ireq->ireq_opt = NULL;
1573         } else {
1574                 newinet->inet_opt = NULL;
1575
1576                 if (!req_unhash && found_dup_sk) {
1577                         /* This code path should only be executed in the
1578                          * syncookie case
1579                          */
1580                         bh_unlock_sock(newsk);
1581                         sock_put(newsk);
1582                         newsk = NULL;
1583                 }
1584         }
1585         return newsk;
1586
1587 exit_overflow:
1588         NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1589 exit_nonewsk:
1590         dst_release(dst);
1591 exit:
1592         tcp_listendrop(sk);
1593         return NULL;
1594 put_and_exit:
1595         newinet->inet_opt = NULL;
1596         inet_csk_prepare_forced_close(newsk);
1597         tcp_done(newsk);
1598         goto exit;
1599 }
1600 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1601
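     /* On a listener, a non-SYN segment may be the ACK that completes a
      * syncookie handshake, so give cookie_v4_check() a chance to rebuild
      * the connection from the cookie.  Without CONFIG_SYN_COOKIES this is
      * a no-op.
      */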
1602 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1603 {
1604 #ifdef CONFIG_SYN_COOKIES
1605         const struct tcphdr *th = tcp_hdr(skb);
1606
1607         if (!th->syn)
1608                 sk = cookie_v4_check(sk, skb);
1609 #endif
1610         return sk;
1611 }
1612
1613 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1614                          struct tcphdr *th, u32 *cookie)
1615 {
1616         u16 mss = 0;
1617 #ifdef CONFIG_SYN_COOKIES
1618         mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1619                                     &tcp_request_sock_ipv4_ops, sk, th);
1620         if (mss) {
1621                 *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1622                 tcp_synq_overflow(sk);
1623         }
1624 #endif
1625         return mss;
1626 }
1627
1628 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1629                                                            u32));
1630 /* The socket must have its spinlock held when we get
1631  * here, unless it is a TCP_LISTEN socket.
1632  *
1633  * We have a potential double-lock case here, so even when
1634  * doing backlog processing we use the BH locking scheme.
1635  * This is because we cannot sleep with the original spinlock
1636  * held.
1637  */
1638 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1639 {
1640         enum skb_drop_reason reason;
1641         struct sock *rsk;
1642
1643         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1644                 struct dst_entry *dst;
1645
1646                 dst = rcu_dereference_protected(sk->sk_rx_dst,
1647                                                 lockdep_sock_is_held(sk));
1648
1649                 sock_rps_save_rxhash(sk, skb);
1650                 sk_mark_napi_id(sk, skb);
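                     /* Validate the cached RX route: drop it if the packet
                      * arrived on a different interface or the dst has
                      * become stale.
                      */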
1651                 if (dst) {
1652                         if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1653                             !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1654                                              dst, 0)) {
1655                                 RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1656                                 dst_release(dst);
1657                         }
1658                 }
1659                 tcp_rcv_established(sk, skb);
1660                 return 0;
1661         }
1662
1663         reason = SKB_DROP_REASON_NOT_SPECIFIED;
1664         if (tcp_checksum_complete(skb))
1665                 goto csum_err;
1666
1667         if (sk->sk_state == TCP_LISTEN) {
1668                 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1669
1670                 if (!nsk)
1671                         goto discard;
1672                 if (nsk != sk) {
1673                         if (tcp_child_process(sk, nsk, skb)) {
1674                                 rsk = nsk;
1675                                 goto reset;
1676                         }
1677                         return 0;
1678                 }
1679         } else
1680                 sock_rps_save_rxhash(sk, skb);
1681
1682         if (tcp_rcv_state_process(sk, skb)) {
1683                 rsk = sk;
1684                 goto reset;
1685         }
1686         return 0;
1687
1688 reset:
1689         tcp_v4_send_reset(rsk, skb);
1690 discard:
1691         kfree_skb_reason(skb, reason);
1692         /* Be careful here. If this function gets more complicated and
1693          * gcc suffers from register pressure on the x86, sk (in %ebx)
1694          * might be destroyed here. This current version compiles correctly,
1695          * but you have been warned.
1696          */
1697         return 0;
1698
1699 csum_err:
1700         reason = SKB_DROP_REASON_TCP_CSUM;
1701         trace_tcp_bad_csum(skb);
1702         TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1703         TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1704         goto discard;
1705 }
1706 EXPORT_SYMBOL(tcp_v4_do_rcv);
1707
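     /* Early demux: look up the established socket directly from the RX
      * path, before routing, so that the socket's cached input route can
      * be reused via skb_dst_set_noref() below.
      */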
1708 int tcp_v4_early_demux(struct sk_buff *skb)
1709 {
1710         const struct iphdr *iph;
1711         const struct tcphdr *th;
1712         struct sock *sk;
1713
1714         if (skb->pkt_type != PACKET_HOST)
1715                 return 0;
1716
1717         if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1718                 return 0;
1719
1720         iph = ip_hdr(skb);
1721         th = tcp_hdr(skb);
1722
1723         if (th->doff < sizeof(struct tcphdr) / 4)
1724                 return 0;
1725
1726         sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1727                                        iph->saddr, th->source,
1728                                        iph->daddr, ntohs(th->dest),
1729                                        skb->skb_iif, inet_sdif(skb));
1730         if (sk) {
1731                 skb->sk = sk;
1732                 skb->destructor = sock_edemux;
1733                 if (sk_fullsock(sk)) {
1734                         struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1735
1736                         if (dst)
1737                                 dst = dst_check(dst, 0);
1738                         if (dst &&
1739                             sk->sk_rx_dst_ifindex == skb->skb_iif)
1740                                 skb_dst_set_noref(skb, dst);
1741                 }
1742         }
1743         return 0;
1744 }
1745
1746 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
1747                      enum skb_drop_reason *reason)
1748 {
1749         u32 limit, tail_gso_size, tail_gso_segs;
1750         struct skb_shared_info *shinfo;
1751         const struct tcphdr *th;
1752         struct tcphdr *thtail;
1753         struct sk_buff *tail;
1754         unsigned int hdrlen;
1755         bool fragstolen;
1756         u32 gso_segs;
1757         u32 gso_size;
1758         int delta;
1759
1760         /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1761          * we can fix skb->truesize to its real value to avoid future drops.
1762          * This is valid because skb is not yet charged to the socket.
1763          * It has been noticed that pure SACK packets were sometimes dropped
1764          * (when built by drivers without the copybreak feature).
1765          */
1766         skb_condense(skb);
1767
1768         skb_dst_drop(skb);
1769
1770         if (unlikely(tcp_checksum_complete(skb))) {
1771                 bh_unlock_sock(sk);
1772                 trace_tcp_bad_csum(skb);
1773                 *reason = SKB_DROP_REASON_TCP_CSUM;
1774                 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1775                 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1776                 return true;
1777         }
1778
1779         /* Attempt coalescing to last skb in backlog, even if we are
1780          * above the limits.
1781          * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1782          */
1783         th = (const struct tcphdr *)skb->data;
1784         hdrlen = th->doff * 4;
1785
1786         tail = sk->sk_backlog.tail;
1787         if (!tail)
1788                 goto no_coalesce;
1789         thtail = (struct tcphdr *)tail->data;
1790
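             /* Coalesce only if this segment directly follows the tail in
              * sequence space, both are plain ACKs (no SYN/RST/URG), the IP
              * DS field and ECN flags match, and the TCP options are
              * identical.
              */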
1791         if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1792             TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1793             ((TCP_SKB_CB(tail)->tcp_flags |
1794               TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1795             !((TCP_SKB_CB(tail)->tcp_flags &
1796               TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1797             ((TCP_SKB_CB(tail)->tcp_flags ^
1798               TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1799 #ifdef CONFIG_TLS_DEVICE
1800             tail->decrypted != skb->decrypted ||
1801 #endif
1802             thtail->doff != th->doff ||
1803             memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1804                 goto no_coalesce;
1805
1806         __skb_pull(skb, hdrlen);
1807
1808         shinfo = skb_shinfo(skb);
1809         gso_size = shinfo->gso_size ?: skb->len;
1810         gso_segs = shinfo->gso_segs ?: 1;
1811
1812         shinfo = skb_shinfo(tail);
1813         tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1814         tail_gso_segs = shinfo->gso_segs ?: 1;
1815
1816         if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1817                 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1818
1819                 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1820                         TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1821                         thtail->window = th->window;
1822                 }
1823
1824                 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1825                  * thtail->fin, so that the fast path in tcp_rcv_established()
1826                  * is not entered if we append a packet with a FIN.
1827                  * SYN, RST, URG are not present.
1828                  * ACK is set on both packets.
1829                  * PSH: we do not really care in the TCP stack,
1830                  *      at least for 'GRO' packets.
1831                  */
1832                 thtail->fin |= th->fin;
1833                 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1834
1835                 if (TCP_SKB_CB(skb)->has_rxtstamp) {
1836                         TCP_SKB_CB(tail)->has_rxtstamp = true;
1837                         tail->tstamp = skb->tstamp;
1838                         skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1839                 }
1840
1841                 /* Not as strict as GRO. We only need to carry the max mss value */
1842                 shinfo->gso_size = max(gso_size, tail_gso_size);
1843                 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
1844
1845                 sk->sk_backlog.len += delta;
1846                 __NET_INC_STATS(sock_net(sk),
1847                                 LINUX_MIB_TCPBACKLOGCOALESCE);
1848                 kfree_skb_partial(skb, fragstolen);
1849                 return false;
1850         }
1851         __skb_push(skb, hdrlen);
1852
1853 no_coalesce:
1854         /* Only the socket owner can try to collapse/prune rx queues
1855          * to reduce memory overhead, so add a little headroom here.
1856          * Only a few socket backlogs are likely to be non-empty at once.
1857          */
1858         limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf) + 64*1024;
1859
1860         if (unlikely(sk_add_backlog(sk, skb, limit))) {
1861                 bh_unlock_sock(sk);
1862                 *reason = SKB_DROP_REASON_SOCKET_BACKLOG;
1863                 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1864                 return true;
1865         }
1866         return false;
1867 }
1868 EXPORT_SYMBOL(tcp_add_backlog);
1869
1870 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1871 {
1872         struct tcphdr *th = (struct tcphdr *)skb->data;
1873
1874         return sk_filter_trim_cap(sk, skb, th->doff * 4);
1875 }
1876 EXPORT_SYMBOL(tcp_filter);
1877
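     /* Undo tcp_v4_fill_cb(): put the IP control block back in place so
      * the skb can be re-fed to the lookup path or handed to another
      * socket.
      */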
1878 static void tcp_v4_restore_cb(struct sk_buff *skb)
1879 {
1880         memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1881                 sizeof(struct inet_skb_parm));
1882 }
1883
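     /* Populate TCP_SKB_CB() from the IP/TCP headers.  IPCB() shares
      * skb->cb[] with TCP_SKB_CB(), so the IP control block is moved aside
      * into header.h4 first (see tcp_v4_restore_cb()).
      */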
1884 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1885                            const struct tcphdr *th)
1886 {
1887         /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
1888          * barrier() makes sure the compiler won't play aliasing games.
1889          */
1890         memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1891                 sizeof(struct inet_skb_parm));
1892         barrier();
1893
1894         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1895         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1896                                     skb->len - th->doff * 4);
1897         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1898         TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1899         TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1900         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1901         TCP_SKB_CB(skb)->sacked  = 0;
1902         TCP_SKB_CB(skb)->has_rxtstamp =
1903                         skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1904 }
1905
1906 /*
1907  *      From tcp_input.c
1908  */
1909
1910 int tcp_v4_rcv(struct sk_buff *skb)
1911 {
1912         struct net *net = dev_net(skb->dev);
1913         enum skb_drop_reason drop_reason;
1914         int sdif = inet_sdif(skb);
1915         int dif = inet_iif(skb);
1916         const struct iphdr *iph;
1917         const struct tcphdr *th;
1918         bool refcounted;
1919         struct sock *sk;
1920         int ret;
1921
1922         drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
1923         if (skb->pkt_type != PACKET_HOST)
1924                 goto discard_it;
1925
1926         /* Count it even if it's bad */
1927         __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1928
1929         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1930                 goto discard_it;
1931
1932         th = (const struct tcphdr *)skb->data;
1933
1934         if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
1935                 drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
1936                 goto bad_packet;
1937         }
1938         if (!pskb_may_pull(skb, th->doff * 4))
1939                 goto discard_it;
1940
1941         /* An explanation is required here, I think.
1942          * Packet length and doff are validated by header prediction,
1943          * provided the th->doff == 0 case is eliminated.
1944          * So, we defer the checks. */
1945
1946         if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1947                 goto csum_error;
1948
1949         th = (const struct tcphdr *)skb->data;
1950         iph = ip_hdr(skb);
1951 lookup:
1952         sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1953                                th->dest, sdif, &refcounted);
1954         if (!sk)
1955                 goto no_tcp_socket;
1956
1957 process:
1958         if (sk->sk_state == TCP_TIME_WAIT)
1959                 goto do_time_wait;
1960
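             /* The segment matched a request (mini) socket: validate it
              * and let tcp_check_req() either promote the request to a
              * full child socket or drop it.
              */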
1961         if (sk->sk_state == TCP_NEW_SYN_RECV) {
1962                 struct request_sock *req = inet_reqsk(sk);
1963                 bool req_stolen = false;
1964                 struct sock *nsk;
1965
1966                 sk = req->rsk_listener;
1967                 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1968                         drop_reason = SKB_DROP_REASON_XFRM_POLICY;
1969                 else
1970                         drop_reason = tcp_inbound_md5_hash(sk, skb,
1971                                                    &iph->saddr, &iph->daddr,
1972                                                    AF_INET, dif, sdif);
1973                 if (unlikely(drop_reason)) {
1974                         sk_drops_add(sk, skb);
1975                         reqsk_put(req);
1976                         goto discard_it;
1977                 }
1978                 if (tcp_checksum_complete(skb)) {
1979                         reqsk_put(req);
1980                         goto csum_error;
1981                 }
1982                 if (unlikely(sk->sk_state != TCP_LISTEN)) {
1983                         nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
1984                         if (!nsk) {
1985                                 inet_csk_reqsk_queue_drop_and_put(sk, req);
1986                                 goto lookup;
1987                         }
1988                         sk = nsk;
1989                         /* reuseport_migrate_sock() has already taken one sk_refcnt
1990                          * before returning.
1991                          */
1992                 } else {
1993                         /* We own a reference on the listener, increase it again
1994                          * as we might lose it too soon.
1995                          */
1996                         sock_hold(sk);
1997                 }
1998                 refcounted = true;
1999                 nsk = NULL;
2000                 if (!tcp_filter(sk, skb)) {
2001                         th = (const struct tcphdr *)skb->data;
2002                         iph = ip_hdr(skb);
2003                         tcp_v4_fill_cb(skb, iph, th);
2004                         nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2005                 } else {
2006                         drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2007                 }
2008                 if (!nsk) {
2009                         reqsk_put(req);
2010                         if (req_stolen) {
2011                                 /* Another CPU got exclusive access to req
2012                                  * and created a full-blown socket.
2013                                  * Try to feed this packet to that socket
2014                                  * instead of discarding it.
2015                                  */
2016                                 tcp_v4_restore_cb(skb);
2017                                 sock_put(sk);
2018                                 goto lookup;
2019                         }
2020                         goto discard_and_relse;
2021                 }
2022                 nf_reset_ct(skb);
2023                 if (nsk == sk) {
2024                         reqsk_put(req);
2025                         tcp_v4_restore_cb(skb);
2026                 } else if (tcp_child_process(sk, nsk, skb)) {
2027                         tcp_v4_send_reset(nsk, skb);
2028                         goto discard_and_relse;
2029                 } else {
2030                         sock_put(sk);
2031                         return 0;
2032                 }
2033         }
2034
2035         if (static_branch_unlikely(&ip4_min_ttl)) {
2036                 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
2037                 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
2038                         __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2039                         goto discard_and_relse;
2040                 }
2041         }
2042
2043         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
2044                 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2045                 goto discard_and_relse;
2046         }
2047
2048         drop_reason = tcp_inbound_md5_hash(sk, skb, &iph->saddr,
2049                                            &iph->daddr, AF_INET, dif, sdif);
2050         if (drop_reason)
2051                 goto discard_and_relse;
2052
2053         nf_reset_ct(skb);
2054
2055         if (tcp_filter(sk, skb)) {
2056                 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2057                 goto discard_and_relse;
2058         }
2059         th = (const struct tcphdr *)skb->data;
2060         iph = ip_hdr(skb);
2061         tcp_v4_fill_cb(skb, iph, th);
2062
2063         skb->dev = NULL;
2064
2065         if (sk->sk_state == TCP_LISTEN) {
2066                 ret = tcp_v4_do_rcv(sk, skb);
2067                 goto put_and_return;
2068         }
2069
2070         sk_incoming_cpu_update(sk);
2071
2072         bh_lock_sock_nested(sk);
2073         tcp_segs_in(tcp_sk(sk), skb);
2074         ret = 0;
2075         if (!sock_owned_by_user(sk)) {
2076                 ret = tcp_v4_do_rcv(sk, skb);
2077         } else {
2078                 if (tcp_add_backlog(sk, skb, &drop_reason))
2079                         goto discard_and_relse;
2080         }
2081         bh_unlock_sock(sk);
2082
2083 put_and_return:
2084         if (refcounted)
2085                 sock_put(sk);
2086
2087         return ret;
2088
2089 no_tcp_socket:
2090         drop_reason = SKB_DROP_REASON_NO_SOCKET;
2091         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2092                 goto discard_it;
2093
2094         tcp_v4_fill_cb(skb, iph, th);
2095
2096         if (tcp_checksum_complete(skb)) {
2097 csum_error:
2098                 drop_reason = SKB_DROP_REASON_TCP_CSUM;
2099                 trace_tcp_bad_csum(skb);
2100                 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2101 bad_packet:
2102                 __TCP_INC_STATS(net, TCP_MIB_INERRS);
2103         } else {
2104                 tcp_v4_send_reset(NULL, skb);
2105         }
2106
2107 discard_it:
2108         SKB_DR_OR(drop_reason, NOT_SPECIFIED);
2109         /* Discard frame. */
2110         kfree_skb_reason(skb, drop_reason);
2111         return 0;
2112
2113 discard_and_relse:
2114         sk_drops_add(sk, skb);
2115         if (refcounted)
2116                 sock_put(sk);
2117         goto discard_it;
2118
2119 do_time_wait:
2120         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2121                 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2122                 inet_twsk_put(inet_twsk(sk));
2123                 goto discard_it;
2124         }
2125
2126         tcp_v4_fill_cb(skb, iph, th);
2127
2128         if (tcp_checksum_complete(skb)) {
2129                 inet_twsk_put(inet_twsk(sk));
2130                 goto csum_error;
2131         }
2132         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2133         case TCP_TW_SYN: {
2134                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2135                                                         &tcp_hashinfo, skb,
2136                                                         __tcp_hdrlen(th),
2137                                                         iph->saddr, th->source,
2138                                                         iph->daddr, th->dest,
2139                                                         inet_iif(skb),
2140                                                         sdif);
2141                 if (sk2) {
2142                         inet_twsk_deschedule_put(inet_twsk(sk));
2143                         sk = sk2;
2144                         tcp_v4_restore_cb(skb);
2145                         refcounted = false;
2146                         goto process;
2147                 }
2148         }
2149                 /* to ACK */
2150                 fallthrough;
2151         case TCP_TW_ACK:
2152                 tcp_v4_timewait_ack(sk, skb);
2153                 break;
2154         case TCP_TW_RST:
2155                 tcp_v4_send_reset(sk, skb);
2156                 inet_twsk_deschedule_put(inet_twsk(sk));
2157                 goto discard_it;
2158         case TCP_TW_SUCCESS:;
2159         }
2160         goto discard_it;
2161 }
2162
2163 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2164         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
2165         .twsk_unique    = tcp_twsk_unique,
2166         .twsk_destructor= tcp_twsk_destructor,
2167 };
2168
2169 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2170 {
2171         struct dst_entry *dst = skb_dst(skb);
2172
2173         if (dst && dst_hold_safe(dst)) {
2174                 rcu_assign_pointer(sk->sk_rx_dst, dst);
2175                 sk->sk_rx_dst_ifindex = skb->skb_iif;
2176         }
2177 }
2178 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2179
2180 const struct inet_connection_sock_af_ops ipv4_specific = {
2181         .queue_xmit        = ip_queue_xmit,
2182         .send_check        = tcp_v4_send_check,
2183         .rebuild_header    = inet_sk_rebuild_header,
2184         .sk_rx_dst_set     = inet_sk_rx_dst_set,
2185         .conn_request      = tcp_v4_conn_request,
2186         .syn_recv_sock     = tcp_v4_syn_recv_sock,
2187         .net_header_len    = sizeof(struct iphdr),
2188         .setsockopt        = ip_setsockopt,
2189         .getsockopt        = ip_getsockopt,
2190         .addr2sockaddr     = inet_csk_addr2sockaddr,
2191         .sockaddr_len      = sizeof(struct sockaddr_in),
2192         .mtu_reduced       = tcp_v4_mtu_reduced,
2193 };
2194 EXPORT_SYMBOL(ipv4_specific);
2195
2196 #ifdef CONFIG_TCP_MD5SIG
2197 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2198         .md5_lookup             = tcp_v4_md5_lookup,
2199         .calc_md5_hash          = tcp_v4_md5_hash_skb,
2200         .md5_parse              = tcp_v4_parse_md5_keys,
2201 };
2202 #endif
2203
2204 /* NOTE: A lot of things are set to zero explicitly by the call to
2205  *       sk_alloc(), so they need not be done here.
2206  */
2207 static int tcp_v4_init_sock(struct sock *sk)
2208 {
2209         struct inet_connection_sock *icsk = inet_csk(sk);
2210
2211         tcp_init_sock(sk);
2212
2213         icsk->icsk_af_ops = &ipv4_specific;
2214
2215 #ifdef CONFIG_TCP_MD5SIG
2216         tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2217 #endif
2218
2219         return 0;
2220 }
2221
2222 void tcp_v4_destroy_sock(struct sock *sk)
2223 {
2224         struct tcp_sock *tp = tcp_sk(sk);
2225
2226         trace_tcp_destroy_sock(sk);
2227
2228         tcp_clear_xmit_timers(sk);
2229
2230         tcp_cleanup_congestion_control(sk);
2231
2232         tcp_cleanup_ulp(sk);
2233
2234         /* Clean up the write buffer. */
2235         tcp_write_queue_purge(sk);
2236
2237         /* Check if we want to disable active TFO */
2238         tcp_fastopen_active_disable_ofo_check(sk);
2239
2240         /* Cleans up our, hopefully empty, out_of_order_queue. */
2241         skb_rbtree_purge(&tp->out_of_order_queue);
2242
2243 #ifdef CONFIG_TCP_MD5SIG
2244         /* Clean up the MD5 key list, if any */
2245         if (tp->md5sig_info) {
2246                 tcp_clear_md5_list(sk);
2247                 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2248                 tp->md5sig_info = NULL;
2249         }
2250 #endif
2251
2252         /* Clean up a referenced TCP bind bucket. */
2253         if (inet_csk(sk)->icsk_bind_hash)
2254                 inet_put_port(sk);
2255
2256         BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2257
2258         /* If socket is aborted during connect operation */
2259         tcp_free_fastopen_req(tp);
2260         tcp_fastopen_destroy_cipher(sk);
2261         tcp_saved_syn_free(tp);
2262
2263         sk_sockets_allocated_dec(sk);
2264 }
2265 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2266
2267 #ifdef CONFIG_PROC_FS
2268 /* Proc filesystem TCP sock list dumping. */
2269
2270 static unsigned short seq_file_family(const struct seq_file *seq);
2271
2272 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2273 {
2274         unsigned short family = seq_file_family(seq);
2275
2276         /* AF_UNSPEC is used as a match-all */
2277         return ((family == AF_UNSPEC || family == sk->sk_family) &&
2278                 net_eq(sock_net(sk), seq_file_net(seq)));
2279 }
2280
2281 /* Find a non-empty bucket (starting from st->bucket)
2282  * and return the first sk from it.
2283  */
2284 static void *listening_get_first(struct seq_file *seq)
2285 {
2286         struct tcp_iter_state *st = seq->private;
2287
2288         st->offset = 0;
2289         for (; st->bucket <= tcp_hashinfo.lhash2_mask; st->bucket++) {
2290                 struct inet_listen_hashbucket *ilb2;
2291                 struct hlist_nulls_node *node;
2292                 struct sock *sk;
2293
2294                 ilb2 = &tcp_hashinfo.lhash2[st->bucket];
2295                 if (hlist_nulls_empty(&ilb2->nulls_head))
2296                         continue;
2297
2298                 spin_lock(&ilb2->lock);
2299                 sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
2300                         if (seq_sk_match(seq, sk))
2301                                 return sk;
2302                 }
2303                 spin_unlock(&ilb2->lock);
2304         }
2305
2306         return NULL;
2307 }
2308
2309 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2310  * If "cur" is the last one in st->bucket,
2311  * call listening_get_first() to return the first sk of the next
2312  * non-empty bucket.
2313  */
2314 static void *listening_get_next(struct seq_file *seq, void *cur)
2315 {
2316         struct tcp_iter_state *st = seq->private;
2317         struct inet_listen_hashbucket *ilb2;
2318         struct hlist_nulls_node *node;
2319         struct sock *sk = cur;
2320
2321         ++st->num;
2322         ++st->offset;
2323
2324         sk = sk_nulls_next(sk);
2325         sk_nulls_for_each_from(sk, node) {
2326                 if (seq_sk_match(seq, sk))
2327                         return sk;
2328         }
2329
2330         ilb2 = &tcp_hashinfo.lhash2[st->bucket];
2331         spin_unlock(&ilb2->lock);
2332         ++st->bucket;
2333         return listening_get_first(seq);
2334 }
2335
2336 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2337 {
2338         struct tcp_iter_state *st = seq->private;
2339         void *rc;
2340
2341         st->bucket = 0;
2342         st->offset = 0;
2343         rc = listening_get_first(seq);
2344
2345         while (rc && *pos) {
2346                 rc = listening_get_next(seq, rc);
2347                 --*pos;
2348         }
2349         return rc;
2350 }
2351
2352 static inline bool empty_bucket(const struct tcp_iter_state *st)
2353 {
2354         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2355 }
2356
2357 /*
2358  * Get first established socket starting from bucket given in st->bucket.
2359  * If st->bucket is zero, the very first socket in the hash is returned.
2360  */
2361 static void *established_get_first(struct seq_file *seq)
2362 {
2363         struct tcp_iter_state *st = seq->private;
2364
2365         st->offset = 0;
2366         for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2367                 struct sock *sk;
2368                 struct hlist_nulls_node *node;
2369                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2370
2371                 /* Lockless fast path for the common case of empty buckets */
2372                 if (empty_bucket(st))
2373                         continue;
2374
2375                 spin_lock_bh(lock);
2376                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2377                         if (seq_sk_match(seq, sk))
2378                                 return sk;
2379                 }
2380                 spin_unlock_bh(lock);
2381         }
2382
2383         return NULL;
2384 }
2385
2386 static void *established_get_next(struct seq_file *seq, void *cur)
2387 {
2388         struct sock *sk = cur;
2389         struct hlist_nulls_node *node;
2390         struct tcp_iter_state *st = seq->private;
2391
2392         ++st->num;
2393         ++st->offset;
2394
2395         sk = sk_nulls_next(sk);
2396
2397         sk_nulls_for_each_from(sk, node) {
2398                 if (seq_sk_match(seq, sk))
2399                         return sk;
2400         }
2401
2402         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2403         ++st->bucket;
2404         return established_get_first(seq);
2405 }
2406
2407 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2408 {
2409         struct tcp_iter_state *st = seq->private;
2410         void *rc;
2411
2412         st->bucket = 0;
2413         rc = established_get_first(seq);
2414
2415         while (rc && pos) {
2416                 rc = established_get_next(seq, rc);
2417                 --pos;
2418         }
2419         return rc;
2420 }
2421
2422 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2423 {
2424         void *rc;
2425         struct tcp_iter_state *st = seq->private;
2426
2427         st->state = TCP_SEQ_STATE_LISTENING;
2428         rc        = listening_get_idx(seq, &pos);
2429
2430         if (!rc) {
2431                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2432                 rc        = established_get_idx(seq, pos);
2433         }
2434
2435         return rc;
2436 }
2437
2438 static void *tcp_seek_last_pos(struct seq_file *seq)
2439 {
2440         struct tcp_iter_state *st = seq->private;
2441         int bucket = st->bucket;
2442         int offset = st->offset;
2443         int orig_num = st->num;
2444         void *rc = NULL;
2445
2446         switch (st->state) {
2447         case TCP_SEQ_STATE_LISTENING:
2448                 if (st->bucket > tcp_hashinfo.lhash2_mask)
2449                         break;
2450                 st->state = TCP_SEQ_STATE_LISTENING;
2451                 rc = listening_get_first(seq);
2452                 while (offset-- && rc && bucket == st->bucket)
2453                         rc = listening_get_next(seq, rc);
2454                 if (rc)
2455                         break;
2456                 st->bucket = 0;
2457                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2458                 fallthrough;
2459         case TCP_SEQ_STATE_ESTABLISHED:
2460                 if (st->bucket > tcp_hashinfo.ehash_mask)
2461                         break;
2462                 rc = established_get_first(seq);
2463                 while (offset-- && rc && bucket == st->bucket)
2464                         rc = established_get_next(seq, rc);
2465         }
2466
2467         st->num = orig_num;
2468
2469         return rc;
2470 }
2471
2472 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2473 {
2474         struct tcp_iter_state *st = seq->private;
2475         void *rc;
2476
2477         if (*pos && *pos == st->last_pos) {
2478                 rc = tcp_seek_last_pos(seq);
2479                 if (rc)
2480                         goto out;
2481         }
2482
2483         st->state = TCP_SEQ_STATE_LISTENING;
2484         st->num = 0;
2485         st->bucket = 0;
2486         st->offset = 0;
2487         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2488
2489 out:
2490         st->last_pos = *pos;
2491         return rc;
2492 }
2493 EXPORT_SYMBOL(tcp_seq_start);
2494
2495 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2496 {
2497         struct tcp_iter_state *st = seq->private;
2498         void *rc = NULL;
2499
2500         if (v == SEQ_START_TOKEN) {
2501                 rc = tcp_get_idx(seq, 0);
2502                 goto out;
2503         }
2504
2505         switch (st->state) {
2506         case TCP_SEQ_STATE_LISTENING:
2507                 rc = listening_get_next(seq, v);
2508                 if (!rc) {
2509                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2510                         st->bucket = 0;
2511                         st->offset = 0;
2512                         rc        = established_get_first(seq);
2513                 }
2514                 break;
2515         case TCP_SEQ_STATE_ESTABLISHED:
2516                 rc = established_get_next(seq, v);
2517                 break;
2518         }
2519 out:
2520         ++*pos;
2521         st->last_pos = *pos;
2522         return rc;
2523 }
2524 EXPORT_SYMBOL(tcp_seq_next);
2525
2526 void tcp_seq_stop(struct seq_file *seq, void *v)
2527 {
2528         struct tcp_iter_state *st = seq->private;
2529
2530         switch (st->state) {
2531         case TCP_SEQ_STATE_LISTENING:
2532                 if (v != SEQ_START_TOKEN)
2533                         spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
2534                 break;
2535         case TCP_SEQ_STATE_ESTABLISHED:
2536                 if (v)
2537                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2538                 break;
2539         }
2540 }
2541 EXPORT_SYMBOL(tcp_seq_stop);
2542
2543 static void get_openreq4(const struct request_sock *req,
2544                          struct seq_file *f, int i)
2545 {
2546         const struct inet_request_sock *ireq = inet_rsk(req);
2547         long delta = req->rsk_timer.expires - jiffies;
2548
2549         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2550                 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2551                 i,
2552                 ireq->ir_loc_addr,
2553                 ireq->ir_num,
2554                 ireq->ir_rmt_addr,
2555                 ntohs(ireq->ir_rmt_port),
2556                 TCP_SYN_RECV,
2557                 0, 0, /* could print option size, but that is af dependent. */
2558                 1,    /* timers active (only the expire timer) */
2559                 jiffies_delta_to_clock_t(delta),
2560                 req->num_timeout,
2561                 from_kuid_munged(seq_user_ns(f),
2562                                  sock_i_uid(req->rsk_listener)),
2563                 0,  /* non standard timer */
2564                 0, /* open_requests have no inode */
2565                 0,
2566                 req);
2567 }
2568
2569 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2570 {
2571         int timer_active;
2572         unsigned long timer_expires;
2573         const struct tcp_sock *tp = tcp_sk(sk);
2574         const struct inet_connection_sock *icsk = inet_csk(sk);
2575         const struct inet_sock *inet = inet_sk(sk);
2576         const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2577         __be32 dest = inet->inet_daddr;
2578         __be32 src = inet->inet_rcv_saddr;
2579         __u16 destp = ntohs(inet->inet_dport);
2580         __u16 srcp = ntohs(inet->inet_sport);
2581         int rx_queue;
2582         int state;
2583
2584         if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2585             icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2586             icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2587                 timer_active    = 1;
2588                 timer_expires   = icsk->icsk_timeout;
2589         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2590                 timer_active    = 4;
2591                 timer_expires   = icsk->icsk_timeout;
2592         } else if (timer_pending(&sk->sk_timer)) {
2593                 timer_active    = 2;
2594                 timer_expires   = sk->sk_timer.expires;
2595         } else {
2596                 timer_active    = 0;
2597                 timer_expires = jiffies;
2598         }
2599
2600         state = inet_sk_state_load(sk);
2601         if (state == TCP_LISTEN)
2602                 rx_queue = READ_ONCE(sk->sk_ack_backlog);
2603         else
2604                 /* Because we don't lock the socket,
2605                  * we might find a transient negative value.
2606                  */
2607                 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2608                                       READ_ONCE(tp->copied_seq), 0);
2609
2610         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2611                         "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2612                 i, src, srcp, dest, destp, state,
2613                 READ_ONCE(tp->write_seq) - tp->snd_una,
2614                 rx_queue,
2615                 timer_active,
2616                 jiffies_delta_to_clock_t(timer_expires - jiffies),
2617                 icsk->icsk_retransmits,
2618                 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2619                 icsk->icsk_probes_out,
2620                 sock_i_ino(sk),
2621                 refcount_read(&sk->sk_refcnt), sk,
2622                 jiffies_to_clock_t(icsk->icsk_rto),
2623                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2624                 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2625                 tcp_snd_cwnd(tp),
2626                 state == TCP_LISTEN ?
2627                     fastopenq->max_qlen :
2628                     (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2629 }
2630
2631 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2632                                struct seq_file *f, int i)
2633 {
2634         long delta = tw->tw_timer.expires - jiffies;
2635         __be32 dest, src;
2636         __u16 destp, srcp;
2637
2638         dest  = tw->tw_daddr;
2639         src   = tw->tw_rcv_saddr;
2640         destp = ntohs(tw->tw_dport);
2641         srcp  = ntohs(tw->tw_sport);
2642
2643         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2644                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2645                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2646                 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2647                 refcount_read(&tw->tw_refcnt), tw);
2648 }
2649
2650 #define TMPSZ 150
2651
2652 static int tcp4_seq_show(struct seq_file *seq, void *v)
2653 {
2654         struct tcp_iter_state *st;
2655         struct sock *sk = v;
2656
2657         seq_setwidth(seq, TMPSZ - 1);
2658         if (v == SEQ_START_TOKEN) {
2659                 seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2660                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2661                            "inode");
2662                 goto out;
2663         }
2664         st = seq->private;
2665
2666         if (sk->sk_state == TCP_TIME_WAIT)
2667                 get_timewait4_sock(v, seq, st->num);
2668         else if (sk->sk_state == TCP_NEW_SYN_RECV)
2669                 get_openreq4(v, seq, st->num);
2670         else
2671                 get_tcp4_sock(v, seq, st->num);
2672 out:
2673         seq_pad(seq, '\n');
2674         return 0;
2675 }
2676
2677 #ifdef CONFIG_BPF_SYSCALL
2678 struct bpf_tcp_iter_state {
2679         struct tcp_iter_state state;
2680         unsigned int cur_sk;
2681         unsigned int end_sk;
2682         unsigned int max_sk;
2683         struct sock **batch;
2684         bool st_bucket_done;
2685 };
2686
2687 struct bpf_iter__tcp {
2688         __bpf_md_ptr(struct bpf_iter_meta *, meta);
2689         __bpf_md_ptr(struct sock_common *, sk_common);
2690         uid_t uid __aligned(8);
2691 };
2692
2693 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2694                              struct sock_common *sk_common, uid_t uid)
2695 {
2696         struct bpf_iter__tcp ctx;
2697
2698         meta->seq_num--;  /* skip SEQ_START_TOKEN */
2699         ctx.meta = meta;
2700         ctx.sk_common = sk_common;
2701         ctx.uid = uid;
2702         return bpf_iter_run_prog(prog, &ctx);
2703 }
2704
2705 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
2706 {
2707         while (iter->cur_sk < iter->end_sk)
2708                 sock_put(iter->batch[iter->cur_sk++]);
2709 }
2710
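     /* Grow the batch array to new_batch_sz entries.  References still
      * held on sockets in the old batch are dropped first, since the old
      * array is freed and the bucket will be re-walked to refill the new
      * one.
      */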
2711 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
2712                                       unsigned int new_batch_sz)
2713 {
2714         struct sock **new_batch;
2715
2716         new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
2717                              GFP_USER | __GFP_NOWARN);
2718         if (!new_batch)
2719                 return -ENOMEM;
2720
2721         bpf_iter_tcp_put_batch(iter);
2722         kvfree(iter->batch);
2723         iter->batch = new_batch;
2724         iter->max_sk = new_batch_sz;
2725
2726         return 0;
2727 }
2728
2729 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
2730                                                  struct sock *start_sk)
2731 {
2732         struct bpf_tcp_iter_state *iter = seq->private;
2733         struct tcp_iter_state *st = &iter->state;
2734         struct hlist_nulls_node *node;
2735         unsigned int expected = 1;
2736         struct sock *sk;
2737
2738         sock_hold(start_sk);
2739         iter->batch[iter->end_sk++] = start_sk;
2740
2741         sk = sk_nulls_next(start_sk);
2742         sk_nulls_for_each_from(sk, node) {
2743                 if (seq_sk_match(seq, sk)) {
2744                         if (iter->end_sk < iter->max_sk) {
2745                                 sock_hold(sk);
2746                                 iter->batch[iter->end_sk++] = sk;
2747                         }
2748                         expected++;
2749                 }
2750         }
2751         spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
2752
2753         return expected;
2754 }
2755
2756 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
2757                                                    struct sock *start_sk)
2758 {
2759         struct bpf_tcp_iter_state *iter = seq->private;
2760         struct tcp_iter_state *st = &iter->state;
2761         struct hlist_nulls_node *node;
2762         unsigned int expected = 1;
2763         struct sock *sk;
2764
2765         sock_hold(start_sk);
2766         iter->batch[iter->end_sk++] = start_sk;
2767
2768         sk = sk_nulls_next(start_sk);
2769         sk_nulls_for_each_from(sk, node) {
2770                 if (seq_sk_match(seq, sk)) {
2771                         if (iter->end_sk < iter->max_sk) {
2772                                 sock_hold(sk);
2773                                 iter->batch[iter->end_sk++] = sk;
2774                         }
2775                         expected++;
2776                 }
2777         }
2778         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2779
2780         return expected;
2781 }
2782
2783 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
2784 {
2785         struct bpf_tcp_iter_state *iter = seq->private;
2786         struct tcp_iter_state *st = &iter->state;
2787         unsigned int expected;
2788         bool resized = false;
2789         struct sock *sk;
2790
2791         /* The st->bucket is done.  Directly advance to the next
2792          * bucket instead of having tcp_seek_last_pos() skip entries
2793          * one by one in the current bucket only to find out
2794          * it has to advance to the next bucket.
2795          */
2796         if (iter->st_bucket_done) {
2797                 st->offset = 0;
2798                 st->bucket++;
2799                 if (st->state == TCP_SEQ_STATE_LISTENING &&
2800                     st->bucket > tcp_hashinfo.lhash2_mask) {
2801                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2802                         st->bucket = 0;
2803                 }
2804         }
2805
2806 again:
2807         /* Get a new batch */
2808         iter->cur_sk = 0;
2809         iter->end_sk = 0;
2810         iter->st_bucket_done = false;
2811
2812         sk = tcp_seek_last_pos(seq);
2813         if (!sk)
2814                 return NULL; /* Done */
2815
2816         if (st->state == TCP_SEQ_STATE_LISTENING)
2817                 expected = bpf_iter_tcp_listening_batch(seq, sk);
2818         else
2819                 expected = bpf_iter_tcp_established_batch(seq, sk);
2820
2821         if (iter->end_sk == expected) {
2822                 iter->st_bucket_done = true;
2823                 return sk;
2824         }
2825
2826         if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
2827                 resized = true;
2828                 goto again;
2829         }
2830
2831         return sk;
2832 }
2833
2834 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
2835 {
2836         /* bpf iter does not support lseek, so it always
2837          * continues from where it was stop()-ped.
2838          */
2839         if (*pos)
2840                 return bpf_iter_tcp_batch(seq);
2841
2842         return SEQ_START_TOKEN;
2843 }
2844
2845 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2846 {
2847         struct bpf_tcp_iter_state *iter = seq->private;
2848         struct tcp_iter_state *st = &iter->state;
2849         struct sock *sk;
2850
2851         /* Whenever seq_next() is called, iter->cur_sk has already been
2852          * through seq_show(), so advance to the next sk in
2853          * the batch.
2854          */
2855         if (iter->cur_sk < iter->end_sk) {
2856                 /* Keeping st->num consistent in tcp_iter_state.
2857                  * bpf_iter_tcp does not use st->num.
2858                  * meta.seq_num is used instead.
2859                  */
2860                 st->num++;
2861                 /* Move st->offset to the next sk in the bucket such that
2862                  * the future start() will resume at st->offset in
2863                  * st->bucket.  See tcp_seek_last_pos().
2864                  */
2865                 st->offset++;
2866                 sock_put(iter->batch[iter->cur_sk++]);
2867         }
2868
2869         if (iter->cur_sk < iter->end_sk)
2870                 sk = iter->batch[iter->cur_sk];
2871         else
2872                 sk = bpf_iter_tcp_batch(seq);
2873
2874         ++*pos;
2875         /* Keeping st->last_pos consistent in tcp_iter_state.
2876          * bpf iter does not do lseek, so st->last_pos always equals *pos.
2877          */
2878         st->last_pos = *pos;
2879         return sk;
2880 }
2881
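     /*
      * show() below handles three kinds of entries: full sockets are locked
      * with lock_sock_fast() before the bpf prog runs, while TCP_TIME_WAIT
      * and TCP_NEW_SYN_RECV entries are not full sockets and are shown
      * without the socket lock.  The uid passed to the prog is 0 for a
      * TIME-WAIT socket, the listener's uid for a request socket, and the
      * socket's own uid otherwise.
      */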
2882 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2883 {
2884         struct bpf_iter_meta meta;
2885         struct bpf_prog *prog;
2886         struct sock *sk = v;
2887         bool slow;
2888         uid_t uid;
2889         int ret;
2890
2891         if (v == SEQ_START_TOKEN)
2892                 return 0;
2893
2894         if (sk_fullsock(sk))
2895                 slow = lock_sock_fast(sk);
2896
2897         if (unlikely(sk_unhashed(sk))) {
2898                 ret = SEQ_SKIP;
2899                 goto unlock;
2900         }
2901
2902         if (sk->sk_state == TCP_TIME_WAIT) {
2903                 uid = 0;
2904         } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
2905                 const struct request_sock *req = v;
2906
2907                 uid = from_kuid_munged(seq_user_ns(seq),
2908                                        sock_i_uid(req->rsk_listener));
2909         } else {
2910                 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
2911         }
2912
2913         meta.seq = seq;
2914         prog = bpf_iter_get_info(&meta, false);
2915         ret = tcp_prog_seq_show(prog, &meta, v, uid);
2916
2917 unlock:
2918         if (sk_fullsock(sk))
2919                 unlock_sock_fast(sk, slow);
2920         return ret;
2921
2922 }
2923
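     /*
      * stop() is called with v == NULL once the whole iteration is done;
      * that final call gives the bpf prog one last run (with a NULL socket)
      * before any unconsumed sockets still sitting in the batch are
      * released.
      */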
2924 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
2925 {
2926         struct bpf_tcp_iter_state *iter = seq->private;
2927         struct bpf_iter_meta meta;
2928         struct bpf_prog *prog;
2929
2930         if (!v) {
2931                 meta.seq = seq;
2932                 prog = bpf_iter_get_info(&meta, true);
2933                 if (prog)
2934                         (void)tcp_prog_seq_show(prog, &meta, v, 0);
2935         }
2936
2937         if (iter->cur_sk < iter->end_sk) {
2938                 bpf_iter_tcp_put_batch(iter);
2939                 iter->st_bucket_done = false;
2940         }
2941 }
2942
2943 static const struct seq_operations bpf_iter_tcp_seq_ops = {
2944         .show           = bpf_iter_tcp_seq_show,
2945         .start          = bpf_iter_tcp_seq_start,
2946         .next           = bpf_iter_tcp_seq_next,
2947         .stop           = bpf_iter_tcp_seq_stop,
2948 };
2949 #endif
2950 static unsigned short seq_file_family(const struct seq_file *seq)
2951 {
2952         const struct tcp_seq_afinfo *afinfo;
2953
2954 #ifdef CONFIG_BPF_SYSCALL
2955         /* Iterated from bpf_iter.  Let the bpf prog filter instead. */
2956         if (seq->op == &bpf_iter_tcp_seq_ops)
2957                 return AF_UNSPEC;
2958 #endif
2959
2960         /* Iterated from proc fs */
2961         afinfo = pde_data(file_inode(seq->file));
2962         return afinfo->family;
2963 }
2964
2965 static const struct seq_operations tcp4_seq_ops = {
2966         .show           = tcp4_seq_show,
2967         .start          = tcp_seq_start,
2968         .next           = tcp_seq_next,
2969         .stop           = tcp_seq_stop,
2970 };
2971
2972 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2973         .family         = AF_INET,
2974 };
2975
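     /*
      * /proc/net/tcp is created per network namespace; tcp4_seq_afinfo
      * restricts the generic tcp seq_file walker to AF_INET sockets (see
      * seq_file_family() above).
      */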
2976 static int __net_init tcp4_proc_init_net(struct net *net)
2977 {
2978         if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2979                         sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2980                 return -ENOMEM;
2981         return 0;
2982 }
2983
2984 static void __net_exit tcp4_proc_exit_net(struct net *net)
2985 {
2986         remove_proc_entry("tcp", net->proc_net);
2987 }
2988
2989 static struct pernet_operations tcp4_net_ops = {
2990         .init = tcp4_proc_init_net,
2991         .exit = tcp4_proc_exit_net,
2992 };
2993
2994 int __init tcp4_proc_init(void)
2995 {
2996         return register_pernet_subsys(&tcp4_net_ops);
2997 }
2998
2999 void tcp4_proc_exit(void)
3000 {
3001         unregister_pernet_subsys(&tcp4_net_ops);
3002 }
3003 #endif /* CONFIG_PROC_FS */
3004
3005 /* @wake is one when sk_stream_write_space() calls us.
3006  * In that case EPOLLOUT is sent only if notsent_bytes is below half
3007  * the limit.  This mimics the strategy used in sock_def_write_space().
3008  */
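     /*
      * Worked example, assuming a hypothetical tcp_notsent_lowat of 128 KB:
      * when called from sk_stream_write_space() (wake == 1), notsent_bytes
      * is doubled before the comparison, so EPOLLOUT is signalled only once
      * fewer than 64 KB remain unsent; a plain poll() (wake == 0) compares
      * against the full 128 KB.
      */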
3009 bool tcp_stream_memory_free(const struct sock *sk, int wake)
3010 {
3011         const struct tcp_sock *tp = tcp_sk(sk);
3012         u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3013                             READ_ONCE(tp->snd_nxt);
3014
3015         return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3016 }
3017 EXPORT_SYMBOL(tcp_stream_memory_free);
3018
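     /*
      * tcp_prot wires the IPv4 TCP implementation into the generic socket
      * layer: inet dispatches connect(), accept(), sendmsg() and friends
      * through these callbacks, while memory accounting state and the
      * hash tables are shared via the pointers near the bottom of the
      * structure (sysctl_mem, memory_allocated, h.hashinfo, ...).
      */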
3019 struct proto tcp_prot = {
3020         .name                   = "TCP",
3021         .owner                  = THIS_MODULE,
3022         .close                  = tcp_close,
3023         .pre_connect            = tcp_v4_pre_connect,
3024         .connect                = tcp_v4_connect,
3025         .disconnect             = tcp_disconnect,
3026         .accept                 = inet_csk_accept,
3027         .ioctl                  = tcp_ioctl,
3028         .init                   = tcp_v4_init_sock,
3029         .destroy                = tcp_v4_destroy_sock,
3030         .shutdown               = tcp_shutdown,
3031         .setsockopt             = tcp_setsockopt,
3032         .getsockopt             = tcp_getsockopt,
3033         .bpf_bypass_getsockopt  = tcp_bpf_bypass_getsockopt,
3034         .keepalive              = tcp_set_keepalive,
3035         .recvmsg                = tcp_recvmsg,
3036         .sendmsg                = tcp_sendmsg,
3037         .sendpage               = tcp_sendpage,
3038         .backlog_rcv            = tcp_v4_do_rcv,
3039         .release_cb             = tcp_release_cb,
3040         .hash                   = inet_hash,
3041         .unhash                 = inet_unhash,
3042         .get_port               = inet_csk_get_port,
3043         .put_port               = inet_put_port,
3044 #ifdef CONFIG_BPF_SYSCALL
3045         .psock_update_sk_prot   = tcp_bpf_update_proto,
3046 #endif
3047         .enter_memory_pressure  = tcp_enter_memory_pressure,
3048         .leave_memory_pressure  = tcp_leave_memory_pressure,
3049         .stream_memory_free     = tcp_stream_memory_free,
3050         .sockets_allocated      = &tcp_sockets_allocated,
3051         .orphan_count           = &tcp_orphan_count,
3052         .memory_allocated       = &tcp_memory_allocated,
3053         .memory_pressure        = &tcp_memory_pressure,
3054         .sysctl_mem             = sysctl_tcp_mem,
3055         .sysctl_wmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_wmem),
3056         .sysctl_rmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_rmem),
3057         .max_header             = MAX_TCP_HEADER,
3058         .obj_size               = sizeof(struct tcp_sock),
3059         .slab_flags             = SLAB_TYPESAFE_BY_RCU,
3060         .twsk_prot              = &tcp_timewait_sock_ops,
3061         .rsk_prot               = &tcp_request_sock_ops,
3062         .h.hashinfo             = &tcp_hashinfo,
3063         .no_autobind            = true,
3064         .diag_destroy           = tcp_abort,
3065 };
3066 EXPORT_SYMBOL(tcp_prot);
3067
3068 static void __net_exit tcp_sk_exit(struct net *net)
3069 {
3070         struct inet_timewait_death_row *tcp_death_row = net->ipv4.tcp_death_row;
3071
3072         if (net->ipv4.tcp_congestion_control)
3073                 bpf_module_put(net->ipv4.tcp_congestion_control,
3074                                net->ipv4.tcp_congestion_control->owner);
3075         if (refcount_dec_and_test(&tcp_death_row->tw_refcount))
3076                 kfree(tcp_death_row);
3077 }
3078
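     /*
      * Per-netns initialisation: every new network namespace starts from
      * the compile-time defaults below and gets its own TIME-WAIT death
      * row.  Child namespaces additionally inherit the tcp_rmem/tcp_wmem
      * limits and, when the module reference can be taken, the congestion
      * control of init_net; otherwise they fall back to reno.
      */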
3079 static int __net_init tcp_sk_init(struct net *net)
3080 {
3081         int cnt;
3082
3083         net->ipv4.sysctl_tcp_ecn = 2;
3084         net->ipv4.sysctl_tcp_ecn_fallback = 1;
3085
3086         net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3087         net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3088         net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3089         net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3090         net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3091
3092         net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3093         net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3094         net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3095
3096         net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3097         net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3098         net->ipv4.sysctl_tcp_syncookies = 1;
3099         net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3100         net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3101         net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3102         net->ipv4.sysctl_tcp_orphan_retries = 0;
3103         net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3104         net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3105         net->ipv4.sysctl_tcp_tw_reuse = 2;
3106         net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3107
3108         net->ipv4.tcp_death_row = kzalloc(sizeof(struct inet_timewait_death_row), GFP_KERNEL);
3109         if (!net->ipv4.tcp_death_row)
3110                 return -ENOMEM;
3111         refcount_set(&net->ipv4.tcp_death_row->tw_refcount, 1);
3112         cnt = tcp_hashinfo.ehash_mask + 1;
3113         net->ipv4.tcp_death_row->sysctl_max_tw_buckets = cnt / 2;
3114         net->ipv4.tcp_death_row->hashinfo = &tcp_hashinfo;
3115
3116         net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
3117         net->ipv4.sysctl_tcp_sack = 1;
3118         net->ipv4.sysctl_tcp_window_scaling = 1;
3119         net->ipv4.sysctl_tcp_timestamps = 1;
3120         net->ipv4.sysctl_tcp_early_retrans = 3;
3121         net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3122         net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
3123         net->ipv4.sysctl_tcp_retrans_collapse = 1;
3124         net->ipv4.sysctl_tcp_max_reordering = 300;
3125         net->ipv4.sysctl_tcp_dsack = 1;
3126         net->ipv4.sysctl_tcp_app_win = 31;
3127         net->ipv4.sysctl_tcp_adv_win_scale = 1;
3128         net->ipv4.sysctl_tcp_frto = 2;
3129         net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3130         /* This limits the percentage of the congestion window which we
3131          * will allow a single TSO frame to consume.  Building TSO frames
3132          * which are too large can cause TCP streams to be bursty.
3133          */
3134         net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3135         /* Default TSQ limit of 16 TSO segments */
3136         net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
3137         /* rfc5961 challenge ack rate limiting */
3138         net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
3139         net->ipv4.sysctl_tcp_min_tso_segs = 2;
3140         net->ipv4.sysctl_tcp_tso_rtt_log = 9;  /* 2^9 = 512 usec */
3141         net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3142         net->ipv4.sysctl_tcp_autocorking = 1;
3143         net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3144         net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3145         net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3146         if (net != &init_net) {
3147                 memcpy(net->ipv4.sysctl_tcp_rmem,
3148                        init_net.ipv4.sysctl_tcp_rmem,
3149                        sizeof(init_net.ipv4.sysctl_tcp_rmem));
3150                 memcpy(net->ipv4.sysctl_tcp_wmem,
3151                        init_net.ipv4.sysctl_tcp_wmem,
3152                        sizeof(init_net.ipv4.sysctl_tcp_wmem));
3153         }
3154         net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3155         net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3156         net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3157         net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3158         net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3159         atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3160
3161         /* Reno is always built in */
3162         if (!net_eq(net, &init_net) &&
3163             bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3164                                init_net.ipv4.tcp_congestion_control->owner))
3165                 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3166         else
3167                 net->ipv4.tcp_congestion_control = &tcp_reno;
3168
3169         return 0;
3170 }
3171
3172 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3173 {
3174         struct net *net;
3175
3176         inet_twsk_purge(&tcp_hashinfo, AF_INET);
3177
3178         list_for_each_entry(net, net_exit_list, exit_list)
3179                 tcp_fastopen_ctx_destroy(net);
3180 }
3181
3182 static struct pernet_operations __net_initdata tcp_sk_ops = {
3183        .init       = tcp_sk_init,
3184        .exit       = tcp_sk_exit,
3185        .exit_batch = tcp_sk_exit_batch,
3186 };
3187
3188 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3189 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3190                      struct sock_common *sk_common, uid_t uid)
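     /*
      * The macro above generates the bpf_iter__tcp context seen by attached
      * programs (meta, sk_common and uid).  A minimal sketch of a consumer,
      * built separately against libbpf/vmlinux.h and shown here only for
      * illustration (the function name is arbitrary):
      *
      *	SEC("iter/tcp")
      *	int dump_tcp(struct bpf_iter__tcp *ctx)
      *	{
      *		struct sock_common *skc = ctx->sk_common;
      *
      *		if (!skc)		// NULL on the final call from stop()
      *			return 0;
      *		BPF_SEQ_PRINTF(ctx->meta->seq, "local port %u\n", skc->skc_num);
      *		return 0;
      *	}
      */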
3191
3192 #define INIT_BATCH_SZ 16
3193
3194 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3195 {
3196         struct bpf_tcp_iter_state *iter = priv_data;
3197         int err;
3198
3199         err = bpf_iter_init_seq_net(priv_data, aux);
3200         if (err)
3201                 return err;
3202
3203         err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
3204         if (err) {
3205                 bpf_iter_fini_seq_net(priv_data);
3206                 return err;
3207         }
3208
3209         return 0;
3210 }
3211
3212 static void bpf_iter_fini_tcp(void *priv_data)
3213 {
3214         struct bpf_tcp_iter_state *iter = priv_data;
3215
3216         bpf_iter_fini_seq_net(priv_data);
3217         kvfree(iter->batch);
3218 }
3219
3220 static const struct bpf_iter_seq_info tcp_seq_info = {
3221         .seq_ops                = &bpf_iter_tcp_seq_ops,
3222         .init_seq_private       = bpf_iter_init_tcp,
3223         .fini_seq_private       = bpf_iter_fini_tcp,
3224         .seq_priv_size          = sizeof(struct bpf_tcp_iter_state),
3225 };
3226
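     /*
      * Beyond the generic iterator helpers, programs attached to this
      * iterator may call bpf_setsockopt()/bpf_getsockopt() on the socket
      * currently being visited, e.g. to switch TCP_CONGESTION on sockets
      * that already exist.
      */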
3227 static const struct bpf_func_proto *
3228 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3229                             const struct bpf_prog *prog)
3230 {
3231         switch (func_id) {
3232         case BPF_FUNC_setsockopt:
3233                 return &bpf_sk_setsockopt_proto;
3234         case BPF_FUNC_getsockopt:
3235                 return &bpf_sk_getsockopt_proto;
3236         default:
3237                 return NULL;
3238         }
3239 }
3240
3241 static struct bpf_iter_reg tcp_reg_info = {
3242         .target                 = "tcp",
3243         .ctx_arg_info_size      = 1,
3244         .ctx_arg_info           = {
3245                 { offsetof(struct bpf_iter__tcp, sk_common),
3246                   PTR_TO_BTF_ID_OR_NULL },
3247         },
3248         .get_func_proto         = bpf_iter_tcp_get_func_proto,
3249         .seq_info               = &tcp_seq_info,
3250 };
3251
3252 static void __init bpf_iter_register(void)
3253 {
3254         tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3255         if (bpf_iter_reg_target(&tcp_reg_info))
3256                 pr_warn("Warning: could not register bpf iterator tcp\n");
3257 }
3258
3259 #endif
3260
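     /*
      * tcp_v4_init() creates one kernel control socket per possible CPU
      * (ipv4_tcp_sk); these are the sockets used elsewhere in this file to
      * transmit RSTs and non-data ACKs on behalf of connections that have
      * no full socket (SYN-RECV and TIME-WAIT handling), and then registers
      * the per-netns init/exit hooks above.
      */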
3261 void __init tcp_v4_init(void)
3262 {
3263         int cpu, res;
3264
3265         for_each_possible_cpu(cpu) {
3266                 struct sock *sk;
3267
3268                 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3269                                            IPPROTO_TCP, &init_net);
3270                 if (res)
3271                         panic("Failed to create the TCP control socket.\n");
3272                 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3273
3274                 /* Enforce IP_DF and IPID==0 for RSTs and ACKs
3275                  * sent in SYN-RECV and TIME-WAIT state.
3276                  */
3277                 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3278
3279                 per_cpu(ipv4_tcp_sk, cpu) = sk;
3280         }
3281         if (register_pernet_subsys(&tcp_sk_ops))
3282                 panic("Failed to create the TCP control socket.\n");
3283
3284 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3285         bpf_iter_register();
3286 #endif
3287 }