1 // SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 *		IPv4 specific functions
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 */

/*
 * Changes:
 *		David S. Miller :	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *		David S. Miller :	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *		Andi Kleen :		Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *		Andi Kleen :		Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					request_sock handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *		Mike McLagan :		Routing by source
 *		Juan Jose Ciarlante :	ip_dynaddr bits
 *		Andi Kleen :		various fixes.
 *		Vitaly E. Lavrov :	Transparent proxy revived after year
 *					coma.
 *		Andi Kleen :		Fix new listen.
 *		Andi Kleen :		Fix accept error reporting.
 *		YOSHIFUJI Hideaki @USAGI and :	Support IPV6_V6ONLY socket option,
 *		Alexey Kuznetsov		which allows both IPv4 and IPv6 sockets
 *						to bind a single port at the same time.
 */
48 #define pr_fmt(fmt) "TCP: " fmt
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
61 #include <net/net_namespace.h>
63 #include <net/inet_hashtables.h>
65 #include <net/transp_v6.h>
67 #include <net/inet_common.h>
68 #include <net/timewait_sock.h>
70 #include <net/secure_seq.h>
71 #include <net/busy_poll.h>
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/inetdevice.h>
79 #include <linux/btf_ids.h>
81 #include <crypto/hash.h>
82 #include <linux/scatterlist.h>
84 #include <trace/events/tcp.h>
86 #ifdef CONFIG_TCP_MD5SIG
87 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
88 __be32 daddr, __be32 saddr, const struct tcphdr *th);
91 struct inet_hashinfo tcp_hashinfo;
92 EXPORT_SYMBOL(tcp_hashinfo);
94 static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);
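/* Note: ipv4_tcp_sk is a per-CPU control socket. tcp_v4_send_reset() and
 * tcp_v4_send_ack() below fetch it with this_cpu_read() to emit replies
 * (RSTs and ACKs) without a full socket context.
 */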
96 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
98 return secure_tcp_seq(ip_hdr(skb)->daddr,
101 tcp_hdr(skb)->source);
104 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
106 return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
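/* Decide whether a TIME-WAIT socket can be reused for a new outgoing
 * connection to the same peer (sysctl_tcp_tw_reuse). Roughly: reuse needs
 * a recent timestamp from the TIME-WAIT peer, and with tcp_tw_reuse == 2
 * it is further restricted to loopback traffic; see the comments below.
 */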
109 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
111 int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
112 const struct inet_timewait_sock *tw = inet_twsk(sktw);
113 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
114 struct tcp_sock *tp = tcp_sk(sk);
/* Still does not detect *everything* that goes through
 * lo, since we require a loopback src or dst address
 * or direct binding to 'lo' interface.
 */
121 bool loopback = false;
122 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
124 #if IS_ENABLED(CONFIG_IPV6)
125 if (tw->tw_family == AF_INET6) {
126 if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
127 ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
128 ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
129 ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
134 if (ipv4_is_loopback(tw->tw_daddr) ||
135 ipv4_is_loopback(tw->tw_rcv_saddr))
142 /* With PAWS, it is safe from the viewpoint
143 of data integrity. Even without PAWS it is safe provided sequence
144 spaces do not overlap i.e. at data rates <= 80Mbit/sec.
146 Actually, the idea is close to VJ's one, only timestamp cache is
   held not per host, but per port pair and TW bucket is used as state
   holder.

   If TW bucket has been already destroyed we fall back to VJ's scheme
   and use initial timestamp retrieved from peer table.
 */
153 if (tcptw->tw_ts_recent_stamp &&
154 (!twp || (reuse && time_after32(ktime_get_seconds(),
155 tcptw->tw_ts_recent_stamp)))) {
/* In case of repair and re-using TIME-WAIT sockets we still
 * want to be sure that it is safe as above but honor the
 * sequence numbers and time stamps set as part of the repair
 * process.
 *
 * Without this check re-using a TIME-WAIT socket with TCP
 * repair would accumulate a -1 on the repair assigned
 * sequence number. The first time it is reused the sequence
 * is -1, the second time -2, etc. This fixes that issue
 * without appearing to create any others.
 */
167 if (likely(!tp->repair)) {
168 u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
172 WRITE_ONCE(tp->write_seq, seq);
173 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
174 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
182 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
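/* Runs before tcp_v4_connect(): check the user-supplied address length,
 * then let the BPF cgroup INET4_CONNECT hook inspect or rewrite the
 * destination address.
 */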
184 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
/* This check is replicated from tcp_v4_connect() and intended to
 * prevent the BPF program called below from accessing bytes that are
 * outside the bound specified by the user in addr_len.
 */
191 if (addr_len < sizeof(struct sockaddr_in))
194 sock_owned_by_me(sk);
196 return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
199 /* This will initiate an outgoing connection. */
200 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
202 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
203 struct inet_sock *inet = inet_sk(sk);
204 struct tcp_sock *tp = tcp_sk(sk);
205 __be16 orig_sport, orig_dport;
206 __be32 daddr, nexthop;
210 struct ip_options_rcu *inet_opt;
211 struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
213 if (addr_len < sizeof(struct sockaddr_in))
216 if (usin->sin_family != AF_INET)
217 return -EAFNOSUPPORT;
219 nexthop = daddr = usin->sin_addr.s_addr;
220 inet_opt = rcu_dereference_protected(inet->inet_opt,
221 lockdep_sock_is_held(sk));
222 if (inet_opt && inet_opt->opt.srr) {
225 nexthop = inet_opt->opt.faddr;
228 orig_sport = inet->inet_sport;
229 orig_dport = usin->sin_port;
230 fl4 = &inet->cork.fl.u.ip4;
231 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
232 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
234 orig_sport, orig_dport, sk);
237 if (err == -ENETUNREACH)
238 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
242 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
247 if (!inet_opt || !inet_opt->opt.srr)
250 if (!inet->inet_saddr)
251 inet->inet_saddr = fl4->saddr;
252 sk_rcv_saddr_set(sk, inet->inet_saddr);
254 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
255 /* Reset inherited state */
256 tp->rx_opt.ts_recent = 0;
257 tp->rx_opt.ts_recent_stamp = 0;
258 if (likely(!tp->repair))
259 WRITE_ONCE(tp->write_seq, 0);
262 inet->inet_dport = usin->sin_port;
263 sk_daddr_set(sk, daddr);
265 inet_csk(sk)->icsk_ext_hdr_len = 0;
267 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
269 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
/* Socket identity is still unknown (sport may be zero).
 * However we set state to SYN-SENT and, without releasing the socket
 * lock, select a source port, enter ourselves into the hash tables and
 * complete initialization after this.
 */
276 tcp_set_state(sk, TCP_SYN_SENT);
277 err = inet_hash_connect(tcp_death_row, sk);
283 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
284 inet->inet_sport, inet->inet_dport, sk);
290 /* OK, now commit destination to socket. */
291 sk->sk_gso_type = SKB_GSO_TCPV4;
292 sk_setup_caps(sk, &rt->dst);
295 if (likely(!tp->repair)) {
297 WRITE_ONCE(tp->write_seq,
298 secure_tcp_seq(inet->inet_saddr,
302 tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
307 inet->inet_id = prandom_u32();
309 if (tcp_fastopen_defer_connect(sk, &err))
314 err = tcp_connect(sk);
323 * This unhashes the socket and releases the local port,
326 tcp_set_state(sk, TCP_CLOSE);
327 if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
328 inet_reset_saddr(sk);
330 sk->sk_route_caps = 0;
331 inet->inet_dport = 0;
334 EXPORT_SYMBOL(tcp_v4_connect);
/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 * It can be called through tcp_release_cb() if socket was owned by user
 * at the time tcp_v4_err() was called to handle ICMP message.
 */
341 void tcp_v4_mtu_reduced(struct sock *sk)
343 struct inet_sock *inet = inet_sk(sk);
344 struct dst_entry *dst;
347 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
349 mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
350 dst = inet_csk_update_pmtu(sk, mtu);
/* Something is about to be wrong... Remember soft error
 * for the case, if this connection will not be able to recover.
 */
357 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
358 sk->sk_err_soft = EMSGSIZE;
362 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
363 ip_sk_accept_pmtu(sk) &&
364 inet_csk(sk)->icsk_pmtu_cookie > mtu) {
365 tcp_sync_mss(sk, mtu);
/* Resend the TCP packet because it's
 * clear that the old packet has been
 * dropped. This is the new "fast" path mtu
 * discovery.
 */
372 tcp_simple_retransmit(sk);
373 } /* else let the usual retransmit timer handle it */
375 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
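/* Hand an ICMP redirect to the cached route, if the socket still has a
 * valid dst entry.
 */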
377 static void do_redirect(struct sk_buff *skb, struct sock *sk)
379 struct dst_entry *dst = __sk_dst_check(sk, 0);
382 dst->ops->redirect(dst, sk, skb);
386 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
387 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
389 struct request_sock *req = inet_reqsk(sk);
390 struct net *net = sock_net(sk);
/* ICMPs are not backlogged, hence we cannot get
 * an established socket here.
 */
395 if (seq != tcp_rsk(req)->snt_isn) {
396 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
/*
 * Still in SYN_RECV, just remove it silently.
 * There is no good way to pass the error to the newly
 * created socket, and POSIX does not want network
 * errors returned from accept().
 */
404 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
405 tcp_listendrop(req->rsk_listener);
409 EXPORT_SYMBOL(tcp_req_err);
411 /* TCP-LD (RFC 6069) logic */
412 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
414 struct inet_connection_sock *icsk = inet_csk(sk);
415 struct tcp_sock *tp = tcp_sk(sk);
420 if (sock_owned_by_user(sk))
423 if (seq != tp->snd_una || !icsk->icsk_retransmits ||
427 skb = tcp_rtx_queue_head(sk);
428 if (WARN_ON_ONCE(!skb))
431 icsk->icsk_backoff--;
432 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
433 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
435 tcp_mstamp_refresh(tp);
436 delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
437 remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
440 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
441 remaining, TCP_RTO_MAX);
443 /* RTO revert clocked out retransmission.
444 * Will retransmit now.
446 tcp_retransmit_timer(sk);
449 EXPORT_SYMBOL(tcp_ld_RTO_revert);
/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 */
467 int tcp_v4_err(struct sk_buff *skb, u32 info)
469 const struct iphdr *iph = (const struct iphdr *)skb->data;
470 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
472 struct inet_sock *inet;
473 const int type = icmp_hdr(skb)->type;
474 const int code = icmp_hdr(skb)->code;
476 struct request_sock *fastopen;
479 struct net *net = dev_net(skb->dev);
481 sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
482 th->dest, iph->saddr, ntohs(th->source),
485 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
488 if (sk->sk_state == TCP_TIME_WAIT) {
489 inet_twsk_put(inet_twsk(sk));
492 seq = ntohl(th->seq);
493 if (sk->sk_state == TCP_NEW_SYN_RECV) {
494 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
495 type == ICMP_TIME_EXCEEDED ||
496 (type == ICMP_DEST_UNREACH &&
497 (code == ICMP_NET_UNREACH ||
498 code == ICMP_HOST_UNREACH)));
/* If too many ICMPs get dropped on busy
 * servers this needs to be solved differently.
 * We do take care of PMTU discovery (RFC1191) special case :
 * we can receive locally generated ICMP messages while socket is held.
 */
508 if (sock_owned_by_user(sk)) {
509 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
510 __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
512 if (sk->sk_state == TCP_CLOSE)
515 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
516 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
521 /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
522 fastopen = rcu_dereference(tp->fastopen_rsk);
523 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
524 if (sk->sk_state != TCP_LISTEN &&
525 !between(seq, snd_una, tp->snd_nxt)) {
526 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
532 if (!sock_owned_by_user(sk))
533 do_redirect(skb, sk);
535 case ICMP_SOURCE_QUENCH:
536 /* Just silently ignore these. */
538 case ICMP_PARAMETERPROB:
541 case ICMP_DEST_UNREACH:
542 if (code > NR_ICMP_UNREACH)
545 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
/* We are not interested in TCP_LISTEN and open_requests
 * (SYN-ACKs sent out by Linux are always < 576 bytes so
 * they should go through unfragmented).
 */
550 if (sk->sk_state == TCP_LISTEN)
553 WRITE_ONCE(tp->mtu_info, info);
554 if (!sock_owned_by_user(sk)) {
555 tcp_v4_mtu_reduced(sk);
557 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
563 err = icmp_err_convert[code].errno;
/* check if this ICMP message allows revert of backoff.
 * (see RFC 6069)
 */
if (!fastopen &&
    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
569 tcp_ld_RTO_revert(sk, seq);
571 case ICMP_TIME_EXCEEDED:
578 switch (sk->sk_state) {
581 /* Only in fast or simultaneous open. If a fast open socket is
582 * already accepted it is treated as a connected one below.
584 if (fastopen && !fastopen->sk)
587 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
589 if (!sock_owned_by_user(sk)) {
596 sk->sk_err_soft = err;
/* If we've already connected we will keep trying
 * until we time out, or the user gives up.
 *
 * rfc1122 4.2.3.9 allows us to consider as hard errors
 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
 * but it is obsoleted by pmtu discovery).
 *
 * Note that in the modern internet, where routing is unreliable
 * and broken firewalls sit in every dark corner sending random
 * errors ordered by their masters, even these two messages have lost
 * their original sense (even Linux sends invalid PORT_UNREACHs).
 *
 * Now we are in compliance with RFCs.
 */
618 if (!sock_owned_by_user(sk) && inet->recverr) {
621 } else { /* Only an error on timeout */
622 sk->sk_err_soft = err;
631 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
633 struct tcphdr *th = tcp_hdr(skb);
635 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
636 skb->csum_start = skb_transport_header(skb) - skb->head;
637 skb->csum_offset = offsetof(struct tcphdr, check);
640 /* This routine computes an IPv4 TCP checksum. */
641 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
643 const struct inet_sock *inet = inet_sk(sk);
645 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
647 EXPORT_SYMBOL(tcp_v4_send_check);
/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *		      for reset.
 *	Answer: if a packet caused RST, it is not for a socket
 *		existing in our system, if it is matched to a socket,
 *		it is just duplicate segment or bug in other side's TCP.
 *		So we build the reply based only on parameters
 *		that arrived with the segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */
662 #ifdef CONFIG_TCP_MD5SIG
663 #define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
665 #define OPTION_BYTES sizeof(__be32)
668 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
670 const struct tcphdr *th = tcp_hdr(skb);
673 __be32 opt[OPTION_BYTES / sizeof(__be32)];
675 struct ip_reply_arg arg;
676 #ifdef CONFIG_TCP_MD5SIG
677 struct tcp_md5sig_key *key = NULL;
678 const __u8 *hash_location = NULL;
679 unsigned char newhash[16];
681 struct sock *sk1 = NULL;
683 u64 transmit_time = 0;
687 /* Never send a reset in response to a reset. */
/* If sk not NULL, it means we did a successful lookup and incoming
 * route had to be correct. prequeue might have dropped our dst.
 */
694 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
697 /* Swap the send and the receive. */
698 memset(&rep, 0, sizeof(rep));
699 rep.th.dest = th->source;
700 rep.th.source = th->dest;
701 rep.th.doff = sizeof(struct tcphdr) / 4;
705 rep.th.seq = th->ack_seq;
708 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
709 skb->len - (th->doff << 2));
712 memset(&arg, 0, sizeof(arg));
713 arg.iov[0].iov_base = (unsigned char *)&rep;
714 arg.iov[0].iov_len = sizeof(rep.th);
716 net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
717 #ifdef CONFIG_TCP_MD5SIG
719 hash_location = tcp_parse_md5sig_option(th);
720 if (sk && sk_fullsock(sk)) {
721 const union tcp_md5_addr *addr;
/* sdif set, means packet ingressed via a device
 * in an L3 domain and inet_iif is set to it.
 */
727 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
728 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
729 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
730 } else if (hash_location) {
731 const union tcp_md5_addr *addr;
732 int sdif = tcp_v4_sdif(skb);
733 int dif = inet_iif(skb);
/*
 * active side is lost. Try to find listening socket through
 * source port, and then find md5 key through listening socket.
 * We are not losing security here:
 * Incoming packet is checked with md5 hash with finding key,
 * no RST generated if md5 hash doesn't match.
 */
743 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
745 th->source, ip_hdr(skb)->daddr,
746 ntohs(th->source), dif, sdif);
747 /* don't send rst if it can't find key */
/* sdif set, means packet ingressed via a device
 * in an L3 domain and dif is set to it.
 */
754 l3index = sdif ? dif : 0;
755 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
756 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
761 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
762 if (genhash || memcmp(hash_location, newhash, 16) != 0)
rep.opt[0] = htonl((TCPOPT_NOP << 24) |
		   (TCPOPT_NOP << 16) |
		   (TCPOPT_MD5SIG << 8) |
		   TCPOLEN_MD5SIG);
772 /* Update length and the length the header thinks exists */
773 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
774 rep.th.doff = arg.iov[0].iov_len / 4;
776 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
777 key, ip_hdr(skb)->saddr,
778 ip_hdr(skb)->daddr, &rep.th);
781 /* Can't co-exist with TCPMD5, hence check rep.opt[0] */
782 if (rep.opt[0] == 0) {
783 __be32 mrst = mptcp_reset_option(skb);
787 arg.iov[0].iov_len += sizeof(mrst);
788 rep.th.doff = arg.iov[0].iov_len / 4;
792 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
793 ip_hdr(skb)->saddr, /* XXX */
794 arg.iov[0].iov_len, IPPROTO_TCP, 0);
795 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
796 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
/* When socket is gone, all binding information is lost.
 * routing might fail in this case. No choice here, if we choose to force
 * input interface, we will misroute in case of asymmetric route.
 */
803 arg.bound_dev_if = sk->sk_bound_dev_if;
805 trace_tcp_send_reset(sk, skb);
808 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
809 offsetof(struct inet_timewait_sock, tw_bound_dev_if));
811 arg.tos = ip_hdr(skb)->tos;
812 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
814 ctl_sk = this_cpu_read(ipv4_tcp_sk);
815 sock_net_set(ctl_sk, net);
817 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
818 inet_twsk(sk)->tw_mark : sk->sk_mark;
819 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
820 inet_twsk(sk)->tw_priority : sk->sk_priority;
821 transmit_time = tcp_transmit_time(sk);
822 xfrm_sk_clone_policy(ctl_sk, sk);
825 ctl_sk->sk_priority = 0;
827 ip_send_unicast_reply(ctl_sk,
828 skb, &TCP_SKB_CB(skb)->header.h4.opt,
829 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
830 &arg, arg.iov[0].iov_len,
833 xfrm_sk_free_policy(ctl_sk);
834 sock_net_set(ctl_sk, &init_net);
835 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
836 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
839 #ifdef CONFIG_TCP_MD5SIG
/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
 * outside socket context, is ugly, certainly. What can I do?
 */
849 static void tcp_v4_send_ack(const struct sock *sk,
850 struct sk_buff *skb, u32 seq, u32 ack,
851 u32 win, u32 tsval, u32 tsecr, int oif,
852 struct tcp_md5sig_key *key,
853 int reply_flags, u8 tos)
855 const struct tcphdr *th = tcp_hdr(skb);
858 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
859 #ifdef CONFIG_TCP_MD5SIG
860 + (TCPOLEN_MD5SIG_ALIGNED >> 2)
864 struct net *net = sock_net(sk);
865 struct ip_reply_arg arg;
869 memset(&rep.th, 0, sizeof(struct tcphdr));
870 memset(&arg, 0, sizeof(arg));
872 arg.iov[0].iov_base = (unsigned char *)&rep;
873 arg.iov[0].iov_len = sizeof(rep.th);
875 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
		   (TCPOPT_TIMESTAMP << 8) |
		   TCPOLEN_TIMESTAMP);
878 rep.opt[1] = htonl(tsval);
879 rep.opt[2] = htonl(tsecr);
880 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
883 /* Swap the send and the receive. */
884 rep.th.dest = th->source;
885 rep.th.source = th->dest;
886 rep.th.doff = arg.iov[0].iov_len / 4;
887 rep.th.seq = htonl(seq);
888 rep.th.ack_seq = htonl(ack);
890 rep.th.window = htons(win);
892 #ifdef CONFIG_TCP_MD5SIG
894 int offset = (tsecr) ? 3 : 0;
rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
			  (TCPOPT_NOP << 16) |
			  (TCPOPT_MD5SIG << 8) |
			  TCPOLEN_MD5SIG);
900 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
901 rep.th.doff = arg.iov[0].iov_len/4;
903 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
904 key, ip_hdr(skb)->saddr,
905 ip_hdr(skb)->daddr, &rep.th);
908 arg.flags = reply_flags;
909 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
910 ip_hdr(skb)->saddr, /* XXX */
911 arg.iov[0].iov_len, IPPROTO_TCP, 0);
912 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
914 arg.bound_dev_if = oif;
916 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
918 ctl_sk = this_cpu_read(ipv4_tcp_sk);
919 sock_net_set(ctl_sk, net);
920 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
921 inet_twsk(sk)->tw_mark : sk->sk_mark;
922 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
923 inet_twsk(sk)->tw_priority : sk->sk_priority;
924 transmit_time = tcp_transmit_time(sk);
925 ip_send_unicast_reply(ctl_sk,
926 skb, &TCP_SKB_CB(skb)->header.h4.opt,
927 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
928 &arg, arg.iov[0].iov_len,
931 sock_net_set(ctl_sk, &init_net);
932 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
936 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
938 struct inet_timewait_sock *tw = inet_twsk(sk);
939 struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
941 tcp_v4_send_ack(sk, skb,
942 tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
943 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
944 tcp_time_stamp_raw() + tcptw->tw_ts_offset,
947 tcp_twsk_md5_key(tcptw),
948 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
955 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
956 struct request_sock *req)
958 const union tcp_md5_addr *addr;
/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
 */
u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
					 tcp_rsk(req)->snt_nxt;

/* RFC 7323 2.3
 * The window field (SEG.WND) of every outgoing segment, with the
 * exception of <SYN> segments, MUST be right-shifted by
 * Rcv.Wind.Shift bits:
 */
972 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
973 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
974 tcp_v4_send_ack(sk, skb, seq,
975 tcp_rsk(req)->rcv_nxt,
976 req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
977 tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
978 READ_ONCE(req->ts_recent),
980 tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
981 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
/*
 *	Send a SYN-ACK after having received a SYN.
 *	This still operates on a request_sock only, not on a big
 *	socket.
 */
990 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
992 struct request_sock *req,
993 struct tcp_fastopen_cookie *foc,
994 enum tcp_synack_type synack_type,
995 struct sk_buff *syn_skb)
997 const struct inet_request_sock *ireq = inet_rsk(req);
1000 struct sk_buff *skb;
1003 /* First, grab a route. */
1004 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1007 skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1010 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1012 tos = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ?
1013 (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1014 (inet_sk(sk)->tos & INET_ECN_MASK) :
1017 if (!INET_ECN_is_capable(tos) &&
1018 tcp_bpf_ca_needs_ecn((struct sock *)req))
1019 tos |= INET_ECN_ECT_0;
1022 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1024 rcu_dereference(ireq->ireq_opt),
1027 err = net_xmit_eval(err);
/*
 *	IPv4 request_sock destructor.
 */
1036 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1038 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1041 #ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */
1048 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
1049 EXPORT_SYMBOL(tcp_md5_needed);
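/* Preference order used by __tcp_md5_do_lookup() when several keys match:
 * a key bound to an L3 domain (l3index != 0) beats an unbound one,
 * otherwise the key with the longer prefix wins.
 */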
1051 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1056 /* l3index always overrides non-l3index */
1057 if (old->l3index && new->l3index == 0)
1059 if (old->l3index == 0 && new->l3index)
1062 return old->prefixlen < new->prefixlen;
1065 /* Find the Key structure for an address. */
1066 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1067 const union tcp_md5_addr *addr,
1070 const struct tcp_sock *tp = tcp_sk(sk);
1071 struct tcp_md5sig_key *key;
1072 const struct tcp_md5sig_info *md5sig;
1074 struct tcp_md5sig_key *best_match = NULL;
1077 /* caller either holds rcu_read_lock() or socket lock */
1078 md5sig = rcu_dereference_check(tp->md5sig_info,
1079 lockdep_sock_is_held(sk));
1083 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1084 lockdep_sock_is_held(sk)) {
1085 if (key->family != family)
1087 if (key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index)
1089 if (family == AF_INET) {
1090 mask = inet_make_mask(key->prefixlen);
1091 match = (key->addr.a4.s_addr & mask) ==
1092 (addr->a4.s_addr & mask);
1093 #if IS_ENABLED(CONFIG_IPV6)
1094 } else if (family == AF_INET6) {
1095 match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1102 if (match && better_md5_match(best_match, key))
1107 EXPORT_SYMBOL(__tcp_md5_do_lookup);
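/* Exact-match lookup: family, ifindex flag, l3index, prefix length and
 * address must all match. Used by tcp_md5_do_add() and tcp_md5_do_del()
 * to locate the precise key configured by the user.
 */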
1109 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1110 const union tcp_md5_addr *addr,
1111 int family, u8 prefixlen,
1112 int l3index, u8 flags)
1114 const struct tcp_sock *tp = tcp_sk(sk);
1115 struct tcp_md5sig_key *key;
1116 unsigned int size = sizeof(struct in_addr);
1117 const struct tcp_md5sig_info *md5sig;
1119 /* caller either holds rcu_read_lock() or socket lock */
1120 md5sig = rcu_dereference_check(tp->md5sig_info,
1121 lockdep_sock_is_held(sk));
1124 #if IS_ENABLED(CONFIG_IPV6)
1125 if (family == AF_INET6)
1126 size = sizeof(struct in6_addr);
1128 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1129 lockdep_sock_is_held(sk)) {
1130 if (key->family != family)
1132 if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1134 if (key->l3index != l3index)
1136 if (!memcmp(&key->addr, addr, size) &&
1137 key->prefixlen == prefixlen)
1143 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1144 const struct sock *addr_sk)
1146 const union tcp_md5_addr *addr;
1149 l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1150 addr_sk->sk_bound_dev_if);
1151 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1152 return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1154 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1156 /* This can be called on a newly created socket, from other files */
1157 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1158 int family, u8 prefixlen, int l3index, u8 flags,
1159 const u8 *newkey, u8 newkeylen, gfp_t gfp)
1161 /* Add Key to the list */
1162 struct tcp_md5sig_key *key;
1163 struct tcp_sock *tp = tcp_sk(sk);
1164 struct tcp_md5sig_info *md5sig;
1166 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
/* Pre-existing entry - just update that one.
 * Note that the key might be used concurrently.
 * data_race() is telling kcsan that we do not care about
 * key mismatches, since changing MD5 key on live flows
 * can lead to packet drops.
 */
1174 data_race(memcpy(key->key, newkey, newkeylen));
/* Pairs with READ_ONCE() in tcp_md5_hash_key().
 * Also note that a reader could catch new key->keylen value
 * but old key->key[], this is the reason we use __GFP_ZERO
 * at sock_kmalloc() time below these lines.
 */
1181 WRITE_ONCE(key->keylen, newkeylen);
1186 md5sig = rcu_dereference_protected(tp->md5sig_info,
1187 lockdep_sock_is_held(sk));
1189 md5sig = kmalloc(sizeof(*md5sig), gfp);
1193 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1194 INIT_HLIST_HEAD(&md5sig->head);
1195 rcu_assign_pointer(tp->md5sig_info, md5sig);
1198 key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1201 if (!tcp_alloc_md5sig_pool()) {
1202 sock_kfree_s(sk, key, sizeof(*key));
1206 memcpy(key->key, newkey, newkeylen);
1207 key->keylen = newkeylen;
1208 key->family = family;
1209 key->prefixlen = prefixlen;
1210 key->l3index = l3index;
1212 memcpy(&key->addr, addr,
1213 (family == AF_INET6) ? sizeof(struct in6_addr) :
1214 sizeof(struct in_addr));
1215 hlist_add_head_rcu(&key->node, &md5sig->head);
1218 EXPORT_SYMBOL(tcp_md5_do_add);
1220 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1221 u8 prefixlen, int l3index, u8 flags)
1223 struct tcp_md5sig_key *key;
1225 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1228 hlist_del_rcu(&key->node);
1229 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1230 kfree_rcu(key, rcu);
1233 EXPORT_SYMBOL(tcp_md5_do_del);
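/* Drop every MD5 key attached to the socket; called from
 * tcp_v4_destroy_sock() below.
 */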
1235 static void tcp_clear_md5_list(struct sock *sk)
1237 struct tcp_sock *tp = tcp_sk(sk);
1238 struct tcp_md5sig_key *key;
1239 struct hlist_node *n;
1240 struct tcp_md5sig_info *md5sig;
1242 md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1244 hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1245 hlist_del_rcu(&key->node);
1246 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1247 kfree_rcu(key, rcu);
1251 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1252 sockptr_t optval, int optlen)
1254 struct tcp_md5sig cmd;
1255 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1256 const union tcp_md5_addr *addr;
1261 if (optlen < sizeof(cmd))
1264 if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1267 if (sin->sin_family != AF_INET)
1270 flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1272 if (optname == TCP_MD5SIG_EXT &&
1273 cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1274 prefixlen = cmd.tcpm_prefixlen;
1279 if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1280 cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1281 struct net_device *dev;
1284 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1285 if (dev && netif_is_l3_master(dev))
1286 l3index = dev->ifindex;
/* ok to reference set/not set outside of rcu;
 * right now device MUST be an L3 master
 */
1293 if (!dev || !l3index)
1297 addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1299 if (!cmd.tcpm_keylen)
1300 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1302 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1305 return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1306 cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
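/* Feed the IPv4 pseudo-header and a copy of the TCP header into the
 * per-CPU MD5 hash request, as RFC 2385 signs the pseudo-header and
 * TCP header along with the payload.
 */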
1309 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1310 __be32 daddr, __be32 saddr,
1311 const struct tcphdr *th, int nbytes)
1313 struct tcp4_pseudohdr *bp;
1314 struct scatterlist sg;
1321 bp->protocol = IPPROTO_TCP;
1322 bp->len = cpu_to_be16(nbytes);
1324 _th = (struct tcphdr *)(bp + 1);
1325 memcpy(_th, th, sizeof(*th));
1328 sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1329 ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1330 sizeof(*bp) + sizeof(*th));
1331 return crypto_ahash_update(hp->md5_req);
1334 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1335 __be32 daddr, __be32 saddr, const struct tcphdr *th)
1337 struct tcp_md5sig_pool *hp;
1338 struct ahash_request *req;
1340 hp = tcp_get_md5sig_pool();
1342 goto clear_hash_noput;
1345 if (crypto_ahash_init(req))
1347 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1349 if (tcp_md5_hash_key(hp, key))
1351 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1352 if (crypto_ahash_final(req))
1355 tcp_put_md5sig_pool();
1359 tcp_put_md5sig_pool();
1361 memset(md5_hash, 0, 16);
1365 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1366 const struct sock *sk,
1367 const struct sk_buff *skb)
1369 struct tcp_md5sig_pool *hp;
1370 struct ahash_request *req;
1371 const struct tcphdr *th = tcp_hdr(skb);
1372 __be32 saddr, daddr;
1374 if (sk) { /* valid for establish/request sockets */
1375 saddr = sk->sk_rcv_saddr;
1376 daddr = sk->sk_daddr;
1378 const struct iphdr *iph = ip_hdr(skb);
1383 hp = tcp_get_md5sig_pool();
1385 goto clear_hash_noput;
1388 if (crypto_ahash_init(req))
1391 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1393 if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1395 if (tcp_md5_hash_key(hp, key))
1397 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1398 if (crypto_ahash_final(req))
1401 tcp_put_md5sig_pool();
1405 tcp_put_md5sig_pool();
1407 memset(md5_hash, 0, 16);
1410 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1414 /* Called with rcu_read_lock() */
1415 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1416 const struct sk_buff *skb,
1419 #ifdef CONFIG_TCP_MD5SIG
/*
 * This gets called for each TCP segment that arrives
 * so we want to be efficient.
 * We have 3 drop cases:
 * o No MD5 hash and one expected.
 * o MD5 hash and we're not expecting one.
 * o MD5 hash and it's wrong.
 */
1428 const __u8 *hash_location = NULL;
1429 struct tcp_md5sig_key *hash_expected;
1430 const struct iphdr *iph = ip_hdr(skb);
1431 const struct tcphdr *th = tcp_hdr(skb);
1432 const union tcp_md5_addr *addr;
1433 unsigned char newhash[16];
1434 int genhash, l3index;
/* sdif set, means packet ingressed via a device
 * in an L3 domain and dif is set to the l3mdev
 */
1439 l3index = sdif ? dif : 0;
1441 addr = (union tcp_md5_addr *)&iph->saddr;
1442 hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1443 hash_location = tcp_parse_md5sig_option(th);
1445 /* We've parsed the options - do we have a hash? */
1446 if (!hash_expected && !hash_location)
1449 if (hash_expected && !hash_location) {
1450 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1454 if (!hash_expected && hash_location) {
1455 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
/* Okay, so this is hash_expected and hash_location -
 * so we need to calculate the checksum.
 */
1462 genhash = tcp_v4_md5_hash_skb(newhash,
1466 if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1467 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1468 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n",
1469 &iph->saddr, ntohs(th->source),
1470 &iph->daddr, ntohs(th->dest),
1471 genhash ? " tcp_v4_calc_md5_hash failed"
1480 static void tcp_v4_init_req(struct request_sock *req,
1481 const struct sock *sk_listener,
1482 struct sk_buff *skb)
1484 struct inet_request_sock *ireq = inet_rsk(req);
1485 struct net *net = sock_net(sk_listener);
1487 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1488 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1489 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1492 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1493 struct sk_buff *skb,
1495 struct request_sock *req)
1497 tcp_v4_init_req(req, sk, skb);
1499 if (security_inet_conn_request(sk, skb, req))
1502 return inet_csk_route_req(sk, &fl->u.ip4, req);
1505 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1507 .obj_size = sizeof(struct tcp_request_sock),
1508 .rtx_syn_ack = tcp_rtx_synack,
1509 .send_ack = tcp_v4_reqsk_send_ack,
1510 .destructor = tcp_v4_reqsk_destructor,
1511 .send_reset = tcp_v4_send_reset,
1512 .syn_ack_timeout = tcp_syn_ack_timeout,
1515 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1516 .mss_clamp = TCP_MSS_DEFAULT,
1517 #ifdef CONFIG_TCP_MD5SIG
1518 .req_md5_lookup = tcp_v4_md5_lookup,
1519 .calc_md5_hash = tcp_v4_md5_hash_skb,
1521 #ifdef CONFIG_SYN_COOKIES
1522 .cookie_init_seq = cookie_v4_init_sequence,
1524 .route_req = tcp_v4_route_req,
1525 .init_seq = tcp_v4_init_seq,
1526 .init_ts_off = tcp_v4_init_ts_off,
1527 .send_synack = tcp_v4_send_synack,
1530 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
/* Never answer to SYNs sent to broadcast or multicast */
1533 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1536 return tcp_conn_request(&tcp_request_sock_ops,
1537 &tcp_request_sock_ipv4_ops, sk, skb);
1543 EXPORT_SYMBOL(tcp_v4_conn_request);
/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 */
1550 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1551 struct request_sock *req,
1552 struct dst_entry *dst,
1553 struct request_sock *req_unhash,
1556 struct inet_request_sock *ireq;
1557 bool found_dup_sk = false;
1558 struct inet_sock *newinet;
1559 struct tcp_sock *newtp;
1561 #ifdef CONFIG_TCP_MD5SIG
1562 const union tcp_md5_addr *addr;
1563 struct tcp_md5sig_key *key;
1566 struct ip_options_rcu *inet_opt;
1568 if (sk_acceptq_is_full(sk))
1571 newsk = tcp_create_openreq_child(sk, req, skb);
1575 newsk->sk_gso_type = SKB_GSO_TCPV4;
1576 inet_sk_rx_dst_set(newsk, skb);
1578 newtp = tcp_sk(newsk);
1579 newinet = inet_sk(newsk);
1580 ireq = inet_rsk(req);
1581 sk_daddr_set(newsk, ireq->ir_rmt_addr);
1582 sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1583 newsk->sk_bound_dev_if = ireq->ir_iif;
1584 newinet->inet_saddr = ireq->ir_loc_addr;
1585 inet_opt = rcu_dereference(ireq->ireq_opt);
1586 RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1587 newinet->mc_index = inet_iif(skb);
1588 newinet->mc_ttl = ip_hdr(skb)->ttl;
1589 newinet->rcv_tos = ip_hdr(skb)->tos;
1590 inet_csk(newsk)->icsk_ext_hdr_len = 0;
1592 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1593 newinet->inet_id = prandom_u32();
/* Set ToS of the new socket based upon the value of incoming SYN.
 * ECT bits are set later in tcp_init_transfer().
 */
1598 if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1599 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1602 dst = inet_csk_route_child_sock(sk, newsk, req);
1606 /* syncookie case : see end of cookie_v4_check() */
1608 sk_setup_caps(newsk, dst);
1610 tcp_ca_openreq_child(newsk, dst);
1612 tcp_sync_mss(newsk, dst_mtu(dst));
1613 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1615 tcp_initialize_rcv_mss(newsk);
1617 #ifdef CONFIG_TCP_MD5SIG
1618 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1619 /* Copy over the MD5 key from the original socket */
1620 addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1621 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
/*
 * We're using one, so create a matching key
 * on the newsk structure. If we fail to get
 * memory, then we end up not copying the key
 * across. Shucks.
 */
1629 tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index, key->flags,
1630 key->key, key->keylen, GFP_ATOMIC);
1631 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1635 if (__inet_inherit_port(sk, newsk) < 0)
1637 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1639 if (likely(*own_req)) {
1640 tcp_move_syn(newtp, req);
1641 ireq->ireq_opt = NULL;
1643 newinet->inet_opt = NULL;
1645 if (!req_unhash && found_dup_sk) {
/* This code path should only be executed in the
 * syncookie case.
 */
1649 bh_unlock_sock(newsk);
1657 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1664 newinet->inet_opt = NULL;
1665 inet_csk_prepare_forced_close(newsk);
1669 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
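/* With CONFIG_SYN_COOKIES, an ACK that matches no request sock may still
 * carry a valid syncookie; cookie_v4_check() validates it and, if it is
 * good, creates the child socket directly.
 */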
1671 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1673 #ifdef CONFIG_SYN_COOKIES
1674 const struct tcphdr *th = tcp_hdr(skb);
1677 sk = cookie_v4_check(sk, skb);
1682 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1683 struct tcphdr *th, u32 *cookie)
1686 #ifdef CONFIG_SYN_COOKIES
1687 mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1688 &tcp_request_sock_ipv4_ops, sk, th);
1690 *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1691 tcp_synq_overflow(sk);
1697 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
/* The socket must have its spinlock held when we get
 * here, unless it is a TCP_LISTEN socket.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
1707 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1711 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1712 struct dst_entry *dst;
1714 dst = rcu_dereference_protected(sk->sk_rx_dst,
1715 lockdep_sock_is_held(sk));
1717 sock_rps_save_rxhash(sk, skb);
1718 sk_mark_napi_id(sk, skb);
1720 if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1721 !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1723 RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1727 tcp_rcv_established(sk, skb);
1731 if (tcp_checksum_complete(skb))
1734 if (sk->sk_state == TCP_LISTEN) {
1735 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1740 if (tcp_child_process(sk, nsk, skb)) {
1747 sock_rps_save_rxhash(sk, skb);
1749 if (tcp_rcv_state_process(sk, skb)) {
1756 tcp_v4_send_reset(rsk, skb);
/* Be careful here. If this function gets more complicated and
 * gcc suffers from register pressure on the x86, sk (in %ebx)
 * might be destroyed here. This current version compiles correctly,
 * but you have been warned.
 */
1767 trace_tcp_bad_csum(skb);
1768 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1769 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1772 EXPORT_SYMBOL(tcp_v4_do_rcv);
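/* Early demux: at IP receive time, look up an established socket by the
 * packet's 4-tuple and, if found, steer the skb to it and reuse the
 * socket's cached rx dst to skip a routing lookup.
 */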
1774 int tcp_v4_early_demux(struct sk_buff *skb)
1776 const struct iphdr *iph;
1777 const struct tcphdr *th;
1780 if (skb->pkt_type != PACKET_HOST)
1783 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1789 if (th->doff < sizeof(struct tcphdr) / 4)
1792 sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1793 iph->saddr, th->source,
1794 iph->daddr, ntohs(th->dest),
1795 skb->skb_iif, inet_sdif(skb));
1798 skb->destructor = sock_edemux;
1799 if (sk_fullsock(sk)) {
1800 struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1803 dst = dst_check(dst, 0);
1805 sk->sk_rx_dst_ifindex == skb->skb_iif)
1806 skb_dst_set_noref(skb, dst);
1812 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1814 u32 limit, tail_gso_size, tail_gso_segs;
1815 struct skb_shared_info *shinfo;
1816 const struct tcphdr *th;
1817 struct tcphdr *thtail;
1818 struct sk_buff *tail;
1819 unsigned int hdrlen;
/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
 * we can fix skb->truesize to its real value to avoid future drops.
 * This is valid because skb is not yet charged to the socket.
 * It has been noticed pure SACK packets were sometimes dropped
 * (if cooked by drivers without copybreak feature).
 */
1835 if (unlikely(tcp_checksum_complete(skb))) {
1837 trace_tcp_bad_csum(skb);
1838 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1839 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
/* Attempt coalescing to last skb in backlog, even if we are
 * above the limits.
 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
 */
1847 th = (const struct tcphdr *)skb->data;
1848 hdrlen = th->doff * 4;
1850 tail = sk->sk_backlog.tail;
1853 thtail = (struct tcphdr *)tail->data;
1855 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1856 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1857 ((TCP_SKB_CB(tail)->tcp_flags |
1858 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1859 !((TCP_SKB_CB(tail)->tcp_flags &
1860 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1861 ((TCP_SKB_CB(tail)->tcp_flags ^
1862 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1863 #ifdef CONFIG_TLS_DEVICE
1864 tail->decrypted != skb->decrypted ||
1866 !mptcp_skb_can_collapse(tail, skb) ||
1867 thtail->doff != th->doff ||
1868 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1871 __skb_pull(skb, hdrlen);
1873 shinfo = skb_shinfo(skb);
1874 gso_size = shinfo->gso_size ?: skb->len;
1875 gso_segs = shinfo->gso_segs ?: 1;
1877 shinfo = skb_shinfo(tail);
1878 tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1879 tail_gso_segs = shinfo->gso_segs ?: 1;
1881 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1882 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1884 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1885 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1886 thtail->window = th->window;
1889 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1890 * thtail->fin, so that the fast path in tcp_rcv_established()
1891 * is not entered if we append a packet with a FIN.
1892 * SYN, RST, URG are not present.
1893 * ACK is set on both packets.
1894 * PSH : we do not really care in TCP stack,
 * at least for 'GRO' packets.
 */
1897 thtail->fin |= th->fin;
1898 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1900 if (TCP_SKB_CB(skb)->has_rxtstamp) {
1901 TCP_SKB_CB(tail)->has_rxtstamp = true;
1902 tail->tstamp = skb->tstamp;
1903 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1906 /* Not as strict as GRO. We only need to carry mss max value */
1907 shinfo->gso_size = max(gso_size, tail_gso_size);
1908 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
1910 sk->sk_backlog.len += delta;
1911 __NET_INC_STATS(sock_net(sk),
1912 LINUX_MIB_TCPBACKLOGCOALESCE);
1913 kfree_skb_partial(skb, fragstolen);
1916 __skb_push(skb, hdrlen);
1919 limit = (u32)READ_ONCE(sk->sk_rcvbuf) + (u32)(READ_ONCE(sk->sk_sndbuf) >> 1);
1921 /* Only socket owner can try to collapse/prune rx queues
1922 * to reduce memory overhead, so add a little headroom here.
 * Only a few socket backlogs are likely to be non-empty concurrently.
 */
1927 if (unlikely(sk_add_backlog(sk, skb, limit))) {
1929 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1934 EXPORT_SYMBOL(tcp_add_backlog);
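/* Run the attached socket filter, never trimming the skb below the TCP
 * header length so later header accesses remain valid.
 */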
1936 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1938 struct tcphdr *th = (struct tcphdr *)skb->data;
1940 return sk_filter_trim_cap(sk, skb, th->doff * 4);
1942 EXPORT_SYMBOL(tcp_filter);
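/* Undo tcp_v4_fill_cb(): move the IP control block back to its usual
 * place before the skb is handed to another socket or layer.
 */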
1944 static void tcp_v4_restore_cb(struct sk_buff *skb)
1946 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1947 sizeof(struct inet_skb_parm));
1950 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1951 const struct tcphdr *th)
/* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
 * barrier() makes sure compiler won't play fool^Waliasing games.
 */
1956 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1957 sizeof(struct inet_skb_parm));
1960 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1961 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1962 skb->len - th->doff * 4);
1963 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1964 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1965 TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1966 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1967 TCP_SKB_CB(skb)->sacked = 0;
1968 TCP_SKB_CB(skb)->has_rxtstamp =
1969 skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1976 int tcp_v4_rcv(struct sk_buff *skb)
1978 struct net *net = dev_net(skb->dev);
1979 struct sk_buff *skb_to_free;
1980 int sdif = inet_sdif(skb);
1981 int dif = inet_iif(skb);
1982 const struct iphdr *iph;
1983 const struct tcphdr *th;
1989 drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
1990 if (skb->pkt_type != PACKET_HOST)
1993 /* Count it even if it's bad */
1994 __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1996 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1999 th = (const struct tcphdr *)skb->data;
2001 if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
2002 drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
2005 if (!pskb_may_pull(skb, th->doff * 4))
2008 /* An explanation is required here, I think.
2009 * Packet length and doff are validated by header prediction,
2010 * provided case of th->doff==0 is eliminated.
2011 * So, we defer the checks. */
2013 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
2016 th = (const struct tcphdr *)skb->data;
2019 sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
2020 th->dest, sdif, &refcounted);
2025 if (sk->sk_state == TCP_TIME_WAIT)
2028 if (sk->sk_state == TCP_NEW_SYN_RECV) {
2029 struct request_sock *req = inet_reqsk(sk);
2030 bool req_stolen = false;
2033 sk = req->rsk_listener;
2034 if (unlikely(!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb) ||
2035 tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) {
2036 sk_drops_add(sk, skb);
2040 if (tcp_checksum_complete(skb)) {
2044 if (unlikely(sk->sk_state != TCP_LISTEN)) {
2045 nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
2047 inet_csk_reqsk_queue_drop_and_put(sk, req);
/* reuseport_migrate_sock() has already held one sk_refcnt
 * before returning.
 */
/* We own a reference on the listener, increase it again
 * as we might lose it too soon.
 */
2062 if (!tcp_filter(sk, skb)) {
2063 th = (const struct tcphdr *)skb->data;
2065 tcp_v4_fill_cb(skb, iph, th);
2066 nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
/* Another cpu got exclusive access to req
 * and created a full blown socket.
 * Try to feed this packet to this socket
 * instead of discarding it.
 */
2076 tcp_v4_restore_cb(skb);
2080 goto discard_and_relse;
2085 tcp_v4_restore_cb(skb);
2086 } else if (tcp_child_process(sk, nsk, skb)) {
2087 tcp_v4_send_reset(nsk, skb);
2088 goto discard_and_relse;
2094 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
2095 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2096 goto discard_and_relse;
2099 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2100 goto discard_and_relse;
2102 if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))
2103 goto discard_and_relse;
2107 if (tcp_filter(sk, skb)) {
2108 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2109 goto discard_and_relse;
2111 th = (const struct tcphdr *)skb->data;
2113 tcp_v4_fill_cb(skb, iph, th);
2117 if (sk->sk_state == TCP_LISTEN) {
2118 ret = tcp_v4_do_rcv(sk, skb);
2119 goto put_and_return;
2122 sk_incoming_cpu_update(sk);
2124 bh_lock_sock_nested(sk);
2125 tcp_segs_in(tcp_sk(sk), skb);
2127 if (!sock_owned_by_user(sk)) {
2128 skb_to_free = sk->sk_rx_skb_cache;
2129 sk->sk_rx_skb_cache = NULL;
2130 ret = tcp_v4_do_rcv(sk, skb);
2132 if (tcp_add_backlog(sk, skb))
2133 goto discard_and_relse;
2138 __kfree_skb(skb_to_free);
2147 drop_reason = SKB_DROP_REASON_NO_SOCKET;
2148 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2151 tcp_v4_fill_cb(skb, iph, th);
2153 if (tcp_checksum_complete(skb)) {
2155 drop_reason = SKB_DROP_REASON_TCP_CSUM;
2156 trace_tcp_bad_csum(skb);
2157 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2159 __TCP_INC_STATS(net, TCP_MIB_INERRS);
2161 tcp_v4_send_reset(NULL, skb);
2165 /* Discard frame. */
2166 kfree_skb_reason(skb, drop_reason);
2170 sk_drops_add(sk, skb);
2176 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2177 inet_twsk_put(inet_twsk(sk));
2181 tcp_v4_fill_cb(skb, iph, th);
2183 if (tcp_checksum_complete(skb)) {
2184 inet_twsk_put(inet_twsk(sk));
2187 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2189 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2192 iph->saddr, th->source,
2193 iph->daddr, th->dest,
2197 inet_twsk_deschedule_put(inet_twsk(sk));
2199 tcp_v4_restore_cb(skb);
2207 tcp_v4_timewait_ack(sk, skb);
2210 tcp_v4_send_reset(sk, skb);
2211 inet_twsk_deschedule_put(inet_twsk(sk));
2213 case TCP_TW_SUCCESS:;
2218 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2219 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
2220 .twsk_unique = tcp_twsk_unique,
2221 .twsk_destructor= tcp_twsk_destructor,
2224 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2226 struct dst_entry *dst = skb_dst(skb);
2228 if (dst && dst_hold_safe(dst)) {
2229 rcu_assign_pointer(sk->sk_rx_dst, dst);
2230 sk->sk_rx_dst_ifindex = skb->skb_iif;
2233 EXPORT_SYMBOL(inet_sk_rx_dst_set);
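/* Address-family operations used by IPv4 TCP sockets; the protocol-
 * independent TCP code dispatches transmit, header and connection-setup
 * work through this table.
 */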
2235 const struct inet_connection_sock_af_ops ipv4_specific = {
2236 .queue_xmit = ip_queue_xmit,
2237 .send_check = tcp_v4_send_check,
2238 .rebuild_header = inet_sk_rebuild_header,
2239 .sk_rx_dst_set = inet_sk_rx_dst_set,
2240 .conn_request = tcp_v4_conn_request,
2241 .syn_recv_sock = tcp_v4_syn_recv_sock,
2242 .net_header_len = sizeof(struct iphdr),
2243 .setsockopt = ip_setsockopt,
2244 .getsockopt = ip_getsockopt,
2245 .addr2sockaddr = inet_csk_addr2sockaddr,
2246 .sockaddr_len = sizeof(struct sockaddr_in),
2247 .mtu_reduced = tcp_v4_mtu_reduced,
2249 EXPORT_SYMBOL(ipv4_specific);
2251 #ifdef CONFIG_TCP_MD5SIG
2252 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2253 .md5_lookup = tcp_v4_md5_lookup,
2254 .calc_md5_hash = tcp_v4_md5_hash_skb,
2255 .md5_parse = tcp_v4_parse_md5_keys,
/* NOTE: A lot of things set to zero explicitly by call to
 * sk_alloc() so need not be done here.
 */
2262 static int tcp_v4_init_sock(struct sock *sk)
2264 struct inet_connection_sock *icsk = inet_csk(sk);
2268 icsk->icsk_af_ops = &ipv4_specific;
2270 #ifdef CONFIG_TCP_MD5SIG
2271 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2277 void tcp_v4_destroy_sock(struct sock *sk)
2279 struct tcp_sock *tp = tcp_sk(sk);
2281 trace_tcp_destroy_sock(sk);
2283 tcp_clear_xmit_timers(sk);
2285 tcp_cleanup_congestion_control(sk);
2287 tcp_cleanup_ulp(sk);
/* Clean up the write buffer. */
2290 tcp_write_queue_purge(sk);
2292 /* Check if we want to disable active TFO */
2293 tcp_fastopen_active_disable_ofo_check(sk);
2295 /* Cleans up our, hopefully empty, out_of_order_queue. */
2296 skb_rbtree_purge(&tp->out_of_order_queue);
2298 #ifdef CONFIG_TCP_MD5SIG
2299 /* Clean up the MD5 key list, if any */
2300 if (tp->md5sig_info) {
2301 tcp_clear_md5_list(sk);
2302 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2303 tp->md5sig_info = NULL;
2307 /* Clean up a referenced TCP bind bucket. */
2308 if (inet_csk(sk)->icsk_bind_hash)
2311 BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2313 /* If socket is aborted during connect operation */
2314 tcp_free_fastopen_req(tp);
2315 tcp_fastopen_destroy_cipher(sk);
2316 tcp_saved_syn_free(tp);
2318 sk_sockets_allocated_dec(sk);
2320 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2322 #ifdef CONFIG_PROC_FS
2323 /* Proc filesystem TCP sock list dumping. */
2325 static unsigned short seq_file_family(const struct seq_file *seq);
2327 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2329 unsigned short family = seq_file_family(seq);
2331 /* AF_UNSPEC is used as a match all */
2332 return ((family == AF_UNSPEC || family == sk->sk_family) &&
2333 net_eq(sock_net(sk), seq_file_net(seq)));
/* Find a non-empty bucket (starting from st->bucket)
 * and return the first sk from it.
 */
2339 static void *listening_get_first(struct seq_file *seq)
2341 struct tcp_iter_state *st = seq->private;
2344 for (; st->bucket <= tcp_hashinfo.lhash2_mask; st->bucket++) {
2345 struct inet_listen_hashbucket *ilb2;
2346 struct inet_connection_sock *icsk;
2349 ilb2 = &tcp_hashinfo.lhash2[st->bucket];
2350 if (hlist_empty(&ilb2->head))
2353 spin_lock(&ilb2->lock);
2354 inet_lhash2_for_each_icsk(icsk, &ilb2->head) {
2355 sk = (struct sock *)icsk;
2356 if (seq_sk_match(seq, sk))
2359 spin_unlock(&ilb2->lock);
2365 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2366 * If "cur" is the last one in the st->bucket,
2367 * call listening_get_first() to return the first sk of the next non-empty bucket.
2370 static void *listening_get_next(struct seq_file *seq, void *cur)
2372 struct tcp_iter_state *st = seq->private;
2373 struct inet_listen_hashbucket *ilb2;
2374 struct inet_connection_sock *icsk;
2375 struct sock *sk = cur;
2380 icsk = inet_csk(sk);
2381 inet_lhash2_for_each_icsk_continue(icsk) {
2382 sk = (struct sock *)icsk;
2383 if (seq_sk_match(seq, sk))
2387 ilb2 = &tcp_hashinfo.lhash2[st->bucket];
2388 spin_unlock(&ilb2->lock);
2390 return listening_get_first(seq);
2393 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2395 struct tcp_iter_state *st = seq->private;
2400 rc = listening_get_first(seq);
2402 while (rc && *pos) {
2403 rc = listening_get_next(seq, rc);
2409 static inline bool empty_bucket(const struct tcp_iter_state *st)
2411 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2415 * Get first established socket starting from bucket given in st->bucket.
2416 * If st->bucket is zero, the very first socket in the hash is returned.
2418 static void *established_get_first(struct seq_file *seq)
2420 struct tcp_iter_state *st = seq->private;
2423 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2425 struct hlist_nulls_node *node;
2426 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2428 /* Lockless fast path for the common case of empty buckets */
2429 if (empty_bucket(st))
2433 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2434 if (seq_sk_match(seq, sk))
2437 spin_unlock_bh(lock);
2443 static void *established_get_next(struct seq_file *seq, void *cur)
2445 struct sock *sk = cur;
2446 struct hlist_nulls_node *node;
2447 struct tcp_iter_state *st = seq->private;
2452 sk = sk_nulls_next(sk);
2454 sk_nulls_for_each_from(sk, node) {
2455 if (seq_sk_match(seq, sk))
2459 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2461 return established_get_first(seq);
2464 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2466 struct tcp_iter_state *st = seq->private;
2470 rc = established_get_first(seq);
2473 rc = established_get_next(seq, rc);
2479 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2482 struct tcp_iter_state *st = seq->private;
2484 st->state = TCP_SEQ_STATE_LISTENING;
2485 rc = listening_get_idx(seq, &pos);
2488 st->state = TCP_SEQ_STATE_ESTABLISHED;
2489 rc = established_get_idx(seq, pos);
2495 static void *tcp_seek_last_pos(struct seq_file *seq)
2497 struct tcp_iter_state *st = seq->private;
2498 int bucket = st->bucket;
2499 int offset = st->offset;
2500 int orig_num = st->num;
2503 switch (st->state) {
2504 case TCP_SEQ_STATE_LISTENING:
2505 if (st->bucket > tcp_hashinfo.lhash2_mask)
2507 st->state = TCP_SEQ_STATE_LISTENING;
2508 rc = listening_get_first(seq);
2509 while (offset-- && rc && bucket == st->bucket)
2510 rc = listening_get_next(seq, rc);
2514 st->state = TCP_SEQ_STATE_ESTABLISHED;
2516 case TCP_SEQ_STATE_ESTABLISHED:
2517 if (st->bucket > tcp_hashinfo.ehash_mask)
2519 rc = established_get_first(seq);
2520 while (offset-- && rc && bucket == st->bucket)
2521 rc = established_get_next(seq, rc);
2529 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2531 struct tcp_iter_state *st = seq->private;
2534 if (*pos && *pos == st->last_pos) {
2535 rc = tcp_seek_last_pos(seq);
2540 st->state = TCP_SEQ_STATE_LISTENING;
2544 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2547 st->last_pos = *pos;
2550 EXPORT_SYMBOL(tcp_seq_start);
2552 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2554 struct tcp_iter_state *st = seq->private;
2557 if (v == SEQ_START_TOKEN) {
2558 rc = tcp_get_idx(seq, 0);
2562 switch (st->state) {
2563 case TCP_SEQ_STATE_LISTENING:
2564 rc = listening_get_next(seq, v);
2566 st->state = TCP_SEQ_STATE_ESTABLISHED;
2569 rc = established_get_first(seq);
2572 case TCP_SEQ_STATE_ESTABLISHED:
2573 rc = established_get_next(seq, v);
2578 st->last_pos = *pos;
2581 EXPORT_SYMBOL(tcp_seq_next);
2583 void tcp_seq_stop(struct seq_file *seq, void *v)
2585 struct tcp_iter_state *st = seq->private;
2587 switch (st->state) {
2588 case TCP_SEQ_STATE_LISTENING:
2589 if (v != SEQ_START_TOKEN)
2590 spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
2592 case TCP_SEQ_STATE_ESTABLISHED:
2594 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2598 EXPORT_SYMBOL(tcp_seq_stop);
2600 static void get_openreq4(const struct request_sock *req,
2601 struct seq_file *f, int i)
2603 const struct inet_request_sock *ireq = inet_rsk(req);
2604 long delta = req->rsk_timer.expires - jiffies;
2606 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2607 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2612 ntohs(ireq->ir_rmt_port),
2614 0, 0, /* could print option size, but that is af dependent. */
2615 1, /* timers active (only the expire timer) */
2616 jiffies_delta_to_clock_t(delta),
2618 from_kuid_munged(seq_user_ns(f),
2619 sock_i_uid(req->rsk_listener)),
2620 0, /* non standard timer */
2621 0, /* open_requests have no inode */
2626 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2629 unsigned long timer_expires;
2630 const struct tcp_sock *tp = tcp_sk(sk);
2631 const struct inet_connection_sock *icsk = inet_csk(sk);
2632 const struct inet_sock *inet = inet_sk(sk);
2633 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2634 __be32 dest = inet->inet_daddr;
2635 __be32 src = inet->inet_rcv_saddr;
2636 __u16 destp = ntohs(inet->inet_dport);
2637 __u16 srcp = ntohs(inet->inet_sport);
2641 if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2642 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2643 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2645 timer_expires = icsk->icsk_timeout;
2646 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2648 timer_expires = icsk->icsk_timeout;
2649 } else if (timer_pending(&sk->sk_timer)) {
2651 timer_expires = sk->sk_timer.expires;
2654 timer_expires = jiffies;
2657 state = inet_sk_state_load(sk);
2658 if (state == TCP_LISTEN)
2659 rx_queue = READ_ONCE(sk->sk_ack_backlog);
2661 /* Because we don't lock the socket,
2662 * we might find a transient negative value.
2664 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2665 READ_ONCE(tp->copied_seq), 0);
2667 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2668 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2669 i, src, srcp, dest, destp, state,
2670 READ_ONCE(tp->write_seq) - tp->snd_una,
2673 jiffies_delta_to_clock_t(timer_expires - jiffies),
2674 icsk->icsk_retransmits,
2675 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2676 icsk->icsk_probes_out,
2678 refcount_read(&sk->sk_refcnt), sk,
2679 jiffies_to_clock_t(icsk->icsk_rto),
2680 jiffies_to_clock_t(icsk->icsk_ack.ato),
2681 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2683 state == TCP_LISTEN ?
2684 fastopenq->max_qlen :
2685 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2688 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2689 struct seq_file *f, int i)
2691 long delta = tw->tw_timer.expires - jiffies;
2695 dest = tw->tw_daddr;
2696 src = tw->tw_rcv_saddr;
2697 destp = ntohs(tw->tw_dport);
2698 srcp = ntohs(tw->tw_sport);
2700 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2701 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2702 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2703 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2704 refcount_read(&tw->tw_refcnt), tw);
2709 static int tcp4_seq_show(struct seq_file *seq, void *v)
2711 struct tcp_iter_state *st;
2712 struct sock *sk = v;
2714 seq_setwidth(seq, TMPSZ - 1);
2715 if (v == SEQ_START_TOKEN) {
2716 seq_puts(seq, " sl local_address rem_address st tx_queue "
2717 "rx_queue tr tm->when retrnsmt uid timeout "
2723 if (sk->sk_state == TCP_TIME_WAIT)
2724 get_timewait4_sock(v, seq, st->num);
2725 else if (sk->sk_state == TCP_NEW_SYN_RECV)
2726 get_openreq4(v, seq, st->num);
2728 get_tcp4_sock(v, seq, st->num);
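/* For illustration only (values are made up, not captured output): with the
 * show path above, a socket listening on 127.0.0.1:3306 would produce a
 * /proc/net/tcp line roughly like
 *
 *    0: 0100007F:0CEA 00000000:0000 0A 00000000:00000000 00:00000000 00000000  1000        0 12345 1 0000000000000000 100 0 0 10 0
 *
 * Addresses are the raw network-byte-order words printed as host integers
 * (hence 0100007F for 127.0.0.1 on little-endian), ports are hex (0CEA is
 * 3306), and "0A" is TCP_LISTEN. The fields after the inode come from the
 * tail of get_tcp4_sock(): refcount, opaque sock pointer, rto, ato,
 * quick/pingpong, snd_cwnd and the ssthresh/fastopen column.
 */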
2734 #ifdef CONFIG_BPF_SYSCALL
2735 struct bpf_tcp_iter_state {
2736 struct tcp_iter_state state;
2737 unsigned int cur_sk;
2738 unsigned int end_sk;
2739 unsigned int max_sk;
2740 struct sock **batch;
2741 bool st_bucket_done;
2744 struct bpf_iter__tcp {
2745 __bpf_md_ptr(struct bpf_iter_meta *, meta);
2746 __bpf_md_ptr(struct sock_common *, sk_common);
2747 uid_t uid __aligned(8);
2750 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2751 struct sock_common *sk_common, uid_t uid)
2753 struct bpf_iter__tcp ctx;
2755 meta->seq_num--; /* skip SEQ_START_TOKEN */
2757 ctx.sk_common = sk_common;
2759 return bpf_iter_run_prog(prog, &ctx);
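/* A minimal sketch (not part of this file) of a BPF program that could be
 * attached to this iterator; names, includes and the output format are
 * illustrative only, assuming a libbpf build with a generated vmlinux.h:
 *
 *	#include "vmlinux.h"
 *	#include <bpf/bpf_helpers.h>
 *
 *	char _license[] SEC("license") = "GPL";
 *
 *	SEC("iter/tcp")
 *	int dump_tcp(struct bpf_iter__tcp *ctx)
 *	{
 *		struct sock_common *sk_common = ctx->sk_common;
 *		struct seq_file *seq = ctx->meta->seq;
 *		char fmt[] = "family=%d uid=%u\n";
 *		__u64 args[2];
 *
 *		if (!sk_common)
 *			return 0;
 *
 *		args[0] = sk_common->skc_family;
 *		args[1] = ctx->uid;
 *		bpf_seq_printf(seq, fmt, sizeof(fmt), args, sizeof(args));
 *		return 0;
 *	}
 *
 * Once loaded and pinned (e.g. "bpftool iter pin ./dump_tcp.bpf.o
 * /sys/fs/bpf/tcp_dump" followed by "cat /sys/fs/bpf/tcp_dump"), the seq_file
 * machinery below drives bpf_iter_tcp_seq_show() for every matching socket.
 */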
2762 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
2764 while (iter->cur_sk < iter->end_sk)
2765 sock_gen_put(iter->batch[iter->cur_sk++]);
2768 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
2769 unsigned int new_batch_sz)
2771 struct sock **new_batch;
2773 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
2774 GFP_USER | __GFP_NOWARN);
2778 bpf_iter_tcp_put_batch(iter);
2779 kvfree(iter->batch);
2780 iter->batch = new_batch;
2781 iter->max_sk = new_batch_sz;
2786 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
2787 struct sock *start_sk)
2789 struct bpf_tcp_iter_state *iter = seq->private;
2790 struct tcp_iter_state *st = &iter->state;
2791 struct inet_connection_sock *icsk;
2792 unsigned int expected = 1;
2795 sock_hold(start_sk);
2796 iter->batch[iter->end_sk++] = start_sk;
2798 icsk = inet_csk(start_sk);
2799 inet_lhash2_for_each_icsk_continue(icsk) {
2800 sk = (struct sock *)icsk;
2801 if (seq_sk_match(seq, sk)) {
2802 if (iter->end_sk < iter->max_sk) {
2804 iter->batch[iter->end_sk++] = sk;
2809 spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
2814 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
2815 struct sock *start_sk)
2817 struct bpf_tcp_iter_state *iter = seq->private;
2818 struct tcp_iter_state *st = &iter->state;
2819 struct hlist_nulls_node *node;
2820 unsigned int expected = 1;
2823 sock_hold(start_sk);
2824 iter->batch[iter->end_sk++] = start_sk;
2826 sk = sk_nulls_next(start_sk);
2827 sk_nulls_for_each_from(sk, node) {
2828 if (seq_sk_match(seq, sk)) {
2829 if (iter->end_sk < iter->max_sk) {
2831 iter->batch[iter->end_sk++] = sk;
2836 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2841 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
2843 struct bpf_tcp_iter_state *iter = seq->private;
2844 struct tcp_iter_state *st = &iter->state;
2845 unsigned int expected;
2846 bool resized = false;
2849 /* The st->bucket is done. Directly advance to the next
2850 * bucket instead of having tcp_seek_last_pos() skip entries
2851 * one by one in the current bucket only to find out that
2852 * it has to advance to the next bucket.
2854 if (iter->st_bucket_done) {
2857 if (st->state == TCP_SEQ_STATE_LISTENING &&
2858 st->bucket > tcp_hashinfo.lhash2_mask) {
2859 st->state = TCP_SEQ_STATE_ESTABLISHED;
2865 /* Get a new batch */
2868 iter->st_bucket_done = false;
2870 sk = tcp_seek_last_pos(seq);
2872 return NULL; /* Done */
2874 if (st->state == TCP_SEQ_STATE_LISTENING)
2875 expected = bpf_iter_tcp_listening_batch(seq, sk);
2877 expected = bpf_iter_tcp_established_batch(seq, sk);
2879 if (iter->end_sk == expected) {
2880 iter->st_bucket_done = true;
2884 if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
2892 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
2894 /* bpf iter does not support lseek, so it always
2895 * continues from where it was stop()-ped.
2898 return bpf_iter_tcp_batch(seq);
2900 return SEQ_START_TOKEN;
2903 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2905 struct bpf_tcp_iter_state *iter = seq->private;
2906 struct tcp_iter_state *st = &iter->state;
2909 /* Whenever seq_next() is called, the iter->cur_sk is
2910 * done with seq_show(), so advance to the next sk in the batch.
2913 if (iter->cur_sk < iter->end_sk) {
2914 /* Keeping st->num consistent in tcp_iter_state.
2915 * bpf_iter_tcp does not use st->num.
2916 * meta.seq_num is used instead.
2919 /* Move st->offset to the next sk in the bucket such that
2920 * the future start() will resume at st->offset in
2921 * st->bucket. See tcp_seek_last_pos().
2924 sock_gen_put(iter->batch[iter->cur_sk++]);
2927 if (iter->cur_sk < iter->end_sk)
2928 sk = iter->batch[iter->cur_sk];
2930 sk = bpf_iter_tcp_batch(seq);
2933 /* Keeping st->last_pos consistent in tcp_iter_state.
2934 * bpf iter does not do lseek, so st->last_pos always equals *pos.
2936 st->last_pos = *pos;
2940 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2942 struct bpf_iter_meta meta;
2943 struct bpf_prog *prog;
2944 struct sock *sk = v;
2949 if (v == SEQ_START_TOKEN)
2952 if (sk_fullsock(sk))
2953 slow = lock_sock_fast(sk);
2955 if (unlikely(sk_unhashed(sk))) {
2960 if (sk->sk_state == TCP_TIME_WAIT) {
2962 } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
2963 const struct request_sock *req = v;
2965 uid = from_kuid_munged(seq_user_ns(seq),
2966 sock_i_uid(req->rsk_listener));
2968 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
2972 prog = bpf_iter_get_info(&meta, false);
2973 ret = tcp_prog_seq_show(prog, &meta, v, uid);
2976 if (sk_fullsock(sk))
2977 unlock_sock_fast(sk, slow);
2982 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
2984 struct bpf_tcp_iter_state *iter = seq->private;
2985 struct bpf_iter_meta meta;
2986 struct bpf_prog *prog;
2990 prog = bpf_iter_get_info(&meta, true);
2992 (void)tcp_prog_seq_show(prog, &meta, v, 0);
2995 if (iter->cur_sk < iter->end_sk) {
2996 bpf_iter_tcp_put_batch(iter);
2997 iter->st_bucket_done = false;
3001 static const struct seq_operations bpf_iter_tcp_seq_ops = {
3002 .show = bpf_iter_tcp_seq_show,
3003 .start = bpf_iter_tcp_seq_start,
3004 .next = bpf_iter_tcp_seq_next,
3005 .stop = bpf_iter_tcp_seq_stop,
3008 static unsigned short seq_file_family(const struct seq_file *seq)
3010 const struct tcp_seq_afinfo *afinfo;
3012 #ifdef CONFIG_BPF_SYSCALL
3013 /* Iterated from bpf_iter. Let the bpf prog do the filtering instead. */
3014 if (seq->op == &bpf_iter_tcp_seq_ops)
3018 /* Iterated from proc fs */
3019 afinfo = PDE_DATA(file_inode(seq->file));
3020 return afinfo->family;
3023 static const struct seq_operations tcp4_seq_ops = {
3024 .show = tcp4_seq_show,
3025 .start = tcp_seq_start,
3026 .next = tcp_seq_next,
3027 .stop = tcp_seq_stop,
3030 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
3034 static int __net_init tcp4_proc_init_net(struct net *net)
3036 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
3037 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
3042 static void __net_exit tcp4_proc_exit_net(struct net *net)
3044 remove_proc_entry("tcp", net->proc_net);
3047 static struct pernet_operations tcp4_net_ops = {
3048 .init = tcp4_proc_init_net,
3049 .exit = tcp4_proc_exit_net,
3052 int __init tcp4_proc_init(void)
3054 return register_pernet_subsys(&tcp4_net_ops);
3057 void tcp4_proc_exit(void)
3059 unregister_pernet_subsys(&tcp4_net_ops);
3061 #endif /* CONFIG_PROC_FS */
3063 /* @wake is one when sk_stream_write_space() calls us.
3064 * This sends EPOLLOUT only once notsent_bytes drops below half the limit.
3065 * This mimics the strategy used in sock_def_write_space().
3067 bool tcp_stream_memory_free(const struct sock *sk, int wake)
3069 const struct tcp_sock *tp = tcp_sk(sk);
3070 u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3071 READ_ONCE(tp->snd_nxt);
3073 return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3075 EXPORT_SYMBOL(tcp_stream_memory_free);
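/* Worked example of the check above: when called from
 * sk_stream_write_space() (wake == 1), notsent_bytes is doubled before the
 * comparison, so EPOLLOUT is signalled only once the unsent backlog drops
 * below half of tcp_notsent_lowat(); with a 128 KB lowat that means fewer
 * than 64 KB left unsent.
 *
 * An application can cap its unsent backlog per socket (illustrative
 * userspace sketch, error handling omitted):
 *
 *	int lowat = 128 * 1024;
 *
 *	setsockopt(fd, IPPROTO_TCP, TCP_NOTSENT_LOWAT, &lowat, sizeof(lowat));
 *
 * or system-wide through the net.ipv4.tcp_notsent_lowat sysctl, which
 * tcp_sk_init() below leaves at UINT_MAX (effectively unlimited).
 */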
3077 struct proto tcp_prot = {
3079 .owner = THIS_MODULE,
3081 .pre_connect = tcp_v4_pre_connect,
3082 .connect = tcp_v4_connect,
3083 .disconnect = tcp_disconnect,
3084 .accept = inet_csk_accept,
3086 .init = tcp_v4_init_sock,
3087 .destroy = tcp_v4_destroy_sock,
3088 .shutdown = tcp_shutdown,
3089 .setsockopt = tcp_setsockopt,
3090 .getsockopt = tcp_getsockopt,
3091 .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt,
3092 .keepalive = tcp_set_keepalive,
3093 .recvmsg = tcp_recvmsg,
3094 .sendmsg = tcp_sendmsg,
3095 .sendpage = tcp_sendpage,
3096 .backlog_rcv = tcp_v4_do_rcv,
3097 .release_cb = tcp_release_cb,
3099 .unhash = inet_unhash,
3100 .get_port = inet_csk_get_port,
3101 #ifdef CONFIG_BPF_SYSCALL
3102 .psock_update_sk_prot = tcp_bpf_update_proto,
3104 .enter_memory_pressure = tcp_enter_memory_pressure,
3105 .leave_memory_pressure = tcp_leave_memory_pressure,
3106 .stream_memory_free = tcp_stream_memory_free,
3107 .sockets_allocated = &tcp_sockets_allocated,
3108 .orphan_count = &tcp_orphan_count,
3109 .memory_allocated = &tcp_memory_allocated,
3110 .memory_pressure = &tcp_memory_pressure,
3111 .sysctl_mem = sysctl_tcp_mem,
3112 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
3113 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
3114 .max_header = MAX_TCP_HEADER,
3115 .obj_size = sizeof(struct tcp_sock),
3116 .slab_flags = SLAB_TYPESAFE_BY_RCU,
3117 .twsk_prot = &tcp_timewait_sock_ops,
3118 .rsk_prot = &tcp_request_sock_ops,
3119 .h.hashinfo = &tcp_hashinfo,
3120 .no_autobind = true,
3121 .diag_destroy = tcp_abort,
3123 EXPORT_SYMBOL(tcp_prot);
3125 static void __net_exit tcp_sk_exit(struct net *net)
3127 if (net->ipv4.tcp_congestion_control)
3128 bpf_module_put(net->ipv4.tcp_congestion_control,
3129 net->ipv4.tcp_congestion_control->owner);
3132 static int __net_init tcp_sk_init(struct net *net)
3136 net->ipv4.sysctl_tcp_ecn = 2;
3137 net->ipv4.sysctl_tcp_ecn_fallback = 1;
3139 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3140 net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3141 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3142 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3143 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3145 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3146 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3147 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3149 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3150 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3151 net->ipv4.sysctl_tcp_syncookies = 1;
3152 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3153 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3154 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3155 net->ipv4.sysctl_tcp_orphan_retries = 0;
3156 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3157 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3158 net->ipv4.sysctl_tcp_tw_reuse = 2;
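/* tcp_tw_reuse: 0 disables reuse of TIME-WAIT sockets for new outgoing
 * connections, 1 enables it globally, 2 (the default chosen here) enables
 * it for loopback traffic only; see tcp_twsk_unique() and
 * Documentation/networking/ip-sysctl.rst.
 */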
3159 net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3161 cnt = tcp_hashinfo.ehash_mask + 1;
3162 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
3163 net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
3165 net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
3166 net->ipv4.sysctl_tcp_sack = 1;
3167 net->ipv4.sysctl_tcp_window_scaling = 1;
3168 net->ipv4.sysctl_tcp_timestamps = 1;
3169 net->ipv4.sysctl_tcp_early_retrans = 3;
3170 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3171 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */
3172 net->ipv4.sysctl_tcp_retrans_collapse = 1;
3173 net->ipv4.sysctl_tcp_max_reordering = 300;
3174 net->ipv4.sysctl_tcp_dsack = 1;
3175 net->ipv4.sysctl_tcp_app_win = 31;
3176 net->ipv4.sysctl_tcp_adv_win_scale = 1;
3177 net->ipv4.sysctl_tcp_frto = 2;
3178 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3179 /* This limits the percentage of the congestion window which we
3180 * will allow a single TSO frame to consume. Building TSO frames
3181 * which are too large can cause TCP streams to be bursty.
3183 net->ipv4.sysctl_tcp_tso_win_divisor = 3;
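/* Worked example (illustrative): with the default divisor of 3 and a
 * congestion window of 45 packets, a single TSO frame may cover roughly
 * 45 / 3 = 15 packets, so one burst consumes at most about a third of
 * the window.
 */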
3184 /* Default TSQ limit of 16 TSO segments */
3185 net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
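/* i.e. 16 * 65536 bytes = 1 MiB of data may sit queued below the socket
 * (in qdiscs/device queues) before TCP Small Queues throttles the flow.
 */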
3186 /* RFC 5961 challenge ACK rate limiting */
3187 net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
3188 net->ipv4.sysctl_tcp_min_tso_segs = 2;
3189 net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3190 net->ipv4.sysctl_tcp_autocorking = 1;
3191 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3192 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3193 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3194 if (net != &init_net) {
3195 memcpy(net->ipv4.sysctl_tcp_rmem,
3196 init_net.ipv4.sysctl_tcp_rmem,
3197 sizeof(init_net.ipv4.sysctl_tcp_rmem));
3198 memcpy(net->ipv4.sysctl_tcp_wmem,
3199 init_net.ipv4.sysctl_tcp_wmem,
3200 sizeof(init_net.ipv4.sysctl_tcp_wmem));
3202 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3203 net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3204 net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3205 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3206 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3207 atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3209 /* Reno is always built in */
3210 if (!net_eq(net, &init_net) &&
3211 bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3212 init_net.ipv4.tcp_congestion_control->owner))
3213 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3215 net->ipv4.tcp_congestion_control = &tcp_reno;
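/* For illustration (userspace, not part of this function): the per-netns
 * default picked above is what new sockets start with; a process may still
 * override it per socket, e.g.
 *
 *	setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, "cubic", 5);
 *
 * while the namespace-wide default is normally changed through the
 * net.ipv4.tcp_congestion_control sysctl.
 */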
3220 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3224 inet_twsk_purge(&tcp_hashinfo, AF_INET);
3226 list_for_each_entry(net, net_exit_list, exit_list)
3227 tcp_fastopen_ctx_destroy(net);
3230 static struct pernet_operations __net_initdata tcp_sk_ops = {
3231 .init = tcp_sk_init,
3232 .exit = tcp_sk_exit,
3233 .exit_batch = tcp_sk_exit_batch,
3236 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3237 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3238 struct sock_common *sk_common, uid_t uid)
3240 #define INIT_BATCH_SZ 16
3242 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3244 struct bpf_tcp_iter_state *iter = priv_data;
3247 err = bpf_iter_init_seq_net(priv_data, aux);
3251 err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
3253 bpf_iter_fini_seq_net(priv_data);
3260 static void bpf_iter_fini_tcp(void *priv_data)
3262 struct bpf_tcp_iter_state *iter = priv_data;
3264 bpf_iter_fini_seq_net(priv_data);
3265 kvfree(iter->batch);
3268 static const struct bpf_iter_seq_info tcp_seq_info = {
3269 .seq_ops = &bpf_iter_tcp_seq_ops,
3270 .init_seq_private = bpf_iter_init_tcp,
3271 .fini_seq_private = bpf_iter_fini_tcp,
3272 .seq_priv_size = sizeof(struct bpf_tcp_iter_state),
3275 static const struct bpf_func_proto *
3276 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3277 const struct bpf_prog *prog)
3280 case BPF_FUNC_setsockopt:
3281 return &bpf_sk_setsockopt_proto;
3282 case BPF_FUNC_getsockopt:
3283 return &bpf_sk_getsockopt_proto;
3289 static struct bpf_iter_reg tcp_reg_info = {
3291 .ctx_arg_info_size = 1,
3293 { offsetof(struct bpf_iter__tcp, sk_common),
3294 PTR_TO_BTF_ID_OR_NULL },
3296 .get_func_proto = bpf_iter_tcp_get_func_proto,
3297 .seq_info = &tcp_seq_info,
3300 static void __init bpf_iter_register(void)
3302 tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3303 if (bpf_iter_reg_target(&tcp_reg_info))
3304 pr_warn("Warning: could not register bpf iterator tcp\n");
3309 void __init tcp_v4_init(void)
3313 for_each_possible_cpu(cpu) {
3316 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3317 IPPROTO_TCP, &init_net);
3319 panic("Failed to create the TCP control socket.\n");
3320 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3322 /* Please enforce IP_DF and IPID==0 for RST and
3323 * ACK sent in SYN-RECV and TIME-WAIT state.
3325 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3327 per_cpu(ipv4_tcp_sk, cpu) = sk;
3329 if (register_pernet_subsys(&tcp_sk_ops))
3330 panic("Failed to create the TCP control socket.\n");
3332 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3333 bpf_iter_register();