// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 *		IPv4 specific functions
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 */

/*
 * Changes:
 *		David S. Miller	:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *		David S. Miller :	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *		Andi Kleen :		Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *		Andi Kleen :		Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					request_sock handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *		Mike McLagan	:	Routing by source
 *	Juan Jose Ciarlante:		ip_dynaddr bits
 *		Andi Kleen:		various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 */

#define pr_fmt(fmt) "TCP: " fmt
#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/secure_seq.h>
#include <net/busy_poll.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/inetdevice.h>
#include <linux/btf_ids.h>

#include <crypto/hash.h>
#include <linux/scatterlist.h>

#include <trace/events/tcp.h>
#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);

static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);
static u32 tcp_v4_init_seq(const struct sk_buff *skb)
{
	return secure_tcp_seq(ip_hdr(skb)->daddr,
			      ip_hdr(skb)->saddr,
			      tcp_hdr(skb)->dest,
			      tcp_hdr(skb)->source);
}
static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
{
	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
}
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
	const struct inet_timewait_sock *tw = inet_twsk(sktw);
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);

	if (reuse == 2) {
		/* Still does not detect *everything* that goes through
		 * lo, since we require a loopback src or dst address
		 * or direct binding to 'lo' interface.
		 */
		bool loopback = false;

		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
			loopback = true;
#if IS_ENABLED(CONFIG_IPV6)
		if (tw->tw_family == AF_INET6) {
			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
				loopback = true;
		} else
#endif
		{
			if (ipv4_is_loopback(tw->tw_daddr) ||
			    ipv4_is_loopback(tw->tw_rcv_saddr))
				loopback = true;
		}
		if (!loopback)
			reuse = 0;
	}

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.

	   Actually, the idea is close to VJ's one, only timestamp cache is
	   held not per host, but per port pair and TW bucket is used as state
	   holder.

	   If TW bucket has been already destroyed we fall back to VJ's scheme
	   and use initial timestamp retrieved from peer table.
	 */
	if (tcptw->tw_ts_recent_stamp &&
	    (!twp || (reuse && time_after32(ktime_get_seconds(),
					    tcptw->tw_ts_recent_stamp)))) {
		/* inet_twsk_hashdance() sets sk_refcnt after putting twsk
		 * and releasing the bucket lock.
		 */
		if (unlikely(!refcount_inc_not_zero(&sktw->sk_refcnt)))
			return 0;

		/* In case of repair and re-using TIME-WAIT sockets we still
		 * want to be sure that it is safe as above but honor the
		 * sequence numbers and time stamps set as part of the repair
		 * process.
		 *
		 * Without this check re-using a TIME-WAIT socket with TCP
		 * repair would accumulate a -1 on the repair assigned
		 * sequence number. The first time it is reused the sequence
		 * is -1, the second time -2, etc. This fixes that issue
		 * without appearing to create any others.
		 */
		if (likely(!tp->repair)) {
			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;

			if (!seq)
				seq = 1;
			WRITE_ONCE(tp->write_seq, seq);
			tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
			tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
		}

		return 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);
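/* Illustrative sketch (not from the kernel sources): a worked example of
 * the sequence selection above, with hypothetical values.  If the
 * TIME-WAIT socket left off at tw_snd_nxt == 0x1000, the connection that
 * reuses the 4-tuple starts at:
 *
 *	u32 seq = 0x1000 + 65535 + 2;	// 0x11001
 *
 * i.e. a full 64K window past the old send sequence, so stray segments
 * from the previous incarnation cannot fall inside the new sequence
 * space.  If the addition happens to wrap to exactly 0, seq is forced
 * to 1, because a zero write_seq still means "pick a fresh ISN".
 */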
static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
			      int addr_len)
{
	/* This check is replicated from tcp_v4_connect() and intended to
	 * prevent BPF program called below from accessing bytes that are out
	 * of the bound specified by user in addr_len.
	 */
	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	sock_owned_by_me(sk);

	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
}
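/* Illustrative sketch (not from the kernel sources): the hook run by
 * BPF_CGROUP_RUN_PROG_INET4_CONNECT() above corresponds to the
 * "cgroup/connect4" attach point.  A minimal BPF program that observes
 * (and could rewrite) the destination, using standard libbpf section
 * conventions, might look like:
 *
 *	SEC("cgroup/connect4")
 *	int observe_connect4(struct bpf_sock_addr *ctx)
 *	{
 *		// ctx->user_ip4 / ctx->user_port hold the sockaddr_in
 *		// fields the caller passed; addr_len was validated above.
 *		return 1;	// 1 = allow the connect()
 *	}
 */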
/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;
	struct ip_options_rcu *inet_opt;
	struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     lockdep_sock_is_held(sk));
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			      IPPROTO_TCP,
			      orig_sport, orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	if (!inet->inet_saddr)
		inet->inet_saddr = fl4->saddr;
	sk_rcv_saddr_set(sk, inet->inet_saddr);

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			WRITE_ONCE(tp->write_seq, 0);
	}

	inet->inet_dport = usin->sin_port;
	sk_daddr_set(sk, daddr);

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and, without releasing the socket
	 * lock, select a source port, enter ourselves into the hash tables
	 * and complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(tcp_death_row, sk);
	if (err)
		goto failure;

	sk_set_txhash(sk);

	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}
	/* OK, now commit destination to socket. */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);
	rt = NULL;

	if (likely(!tp->repair)) {
		if (!tp->write_seq)
			WRITE_ONCE(tp->write_seq,
				   secure_tcp_seq(inet->inet_saddr,
						  inet->inet_daddr,
						  inet->inet_sport,
						  usin->sin_port));
		tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
						 inet->inet_saddr,
						 inet->inet_daddr);
	}

	inet->inet_id = prandom_u32();

	if (tcp_fastopen_defer_connect(sk, &err))
		return err;
	if (err)
		goto failure;

	err = tcp_connect(sk);

	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
		inet_reset_saddr(sk);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_SYMBOL(tcp_v4_connect);
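/* Illustrative userspace sketch (not from the kernel sources): the path
 * above is what a plain connect() on an AF_INET stream socket exercises
 * end to end:
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in dst = {
 *		.sin_family = AF_INET,
 *		.sin_port   = htons(80),
 *	};
 *	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
 *	connect(fd, (struct sockaddr *)&dst, sizeof(dst));
 *
 * tcp_v4_connect() routes the destination, picks a source port in
 * inet_hash_connect() and sends the SYN from tcp_connect().
 */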
/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 * It can be called through tcp_release_cb() if socket was owned by user
 * at the time tcp_v4_err() was called to handle ICMP message.
 */
void tcp_v4_mtu_reduced(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct dst_entry *dst;
	u32 mtu;

	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
		return;
	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
	dst = inet_csk_update_pmtu(sk, mtu);
	if (!dst)
		return;

	/* Something is about to be wrong... Remember soft error
	 * for the case, if this connection will not be able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    ip_sk_accept_pmtu(sk) &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
EXPORT_SYMBOL(tcp_v4_mtu_reduced);
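/* Illustrative userspace sketch (not from the kernel sources): the
 * inet->pmtudisc test above is controlled per socket with the
 * IP_MTU_DISCOVER option; a socket that opts out of PMTU discovery does
 * not get its MSS clamped here:
 *
 *	int val = IP_PMTUDISC_DONT;	// or IP_PMTUDISC_DO / _WANT
 *	setsockopt(fd, IPPROTO_IP, IP_MTU_DISCOVER, &val, sizeof(val));
 */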
static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_check(sk, 0);

	if (dst)
		dst->ops->redirect(dst, sk, skb);
}

/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
void tcp_req_err(struct sock *sk, u32 seq, bool abort)
{
	struct request_sock *req = inet_reqsk(sk);
	struct net *net = sock_net(sk);

	/* ICMPs are not backlogged, hence we cannot get
	 * an established socket here.
	 */
	if (seq != tcp_rsk(req)->snt_isn) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
	} else if (abort) {
		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
		tcp_listendrop(req->rsk_listener);
	}
	reqsk_put(req);
}
EXPORT_SYMBOL(tcp_req_err);
/* TCP-LD (RFC 6069) logic */
void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	s32 remaining;
	u32 delta_us;

	if (sock_owned_by_user(sk))
		return;

	if (seq != tp->snd_una || !icsk->icsk_retransmits ||
	    !icsk->icsk_backoff)
		return;

	skb = tcp_rtx_queue_head(sk);
	if (WARN_ON_ONCE(!skb))
		return;

	icsk->icsk_backoff--;
	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
	icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);

	tcp_mstamp_refresh(tp);
	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);

	if (remaining > 0) {
		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
					  remaining, TCP_RTO_MAX);
	} else {
		/* RTO revert clocked out retransmission.
		 * Will retransmit now.
		 */
		tcp_retransmit_timer(sk);
	}
}
EXPORT_SYMBOL(tcp_ld_RTO_revert);
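/* Illustrative sketch (not from the kernel sources): a worked example of
 * the revert arithmetic above, with hypothetical numbers.  Suppose
 * backoff had doubled the RTO twice (icsk_backoff == 2) from a 200ms
 * base.  After icsk_backoff-- and the recomputation through
 * inet_csk_rto_backoff() (rto << backoff), icsk_rto is 200ms << 1 =
 * 400ms.  If the head of the retransmit queue was sent 150ms ago
 * (delta_us), the timer is re-armed for the remaining 250ms; had it been
 * sent 450ms ago, remaining would be <= 0 and tcp_retransmit_timer()
 * retransmits immediately.
 */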
/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 */
int tcp_v4_err(struct sk_buff *skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
	struct tcp_sock *tp;
	struct inet_sock *inet;
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct sock *sk;
	struct request_sock *fastopen;
	u32 seq, snd_una;
	int err;
	struct net *net = dev_net(skb->dev);

	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
				       th->dest, iph->saddr, ntohs(th->source),
				       inet_iif(skb), 0);
	if (!sk) {
		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
		return -ENOENT;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(sk));
		return 0;
	}
	seq = ntohl(th->seq);
	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
				     type == ICMP_TIME_EXCEEDED ||
				     (type == ICMP_DEST_UNREACH &&
				      (code == ICMP_NET_UNREACH ||
				       code == ICMP_HOST_UNREACH)));
		return 0;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of PMTU discovery (RFC1191) special case :
	 * we can receive locally generated ICMP messages while socket is held.
	 */
	if (sock_owned_by_user(sk)) {
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
	}
	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
		goto out;
	}

	tp = tcp_sk(sk);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
	fastopen = rcu_dereference(tp->fastopen_rsk);
	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, snd_una, tp->snd_nxt)) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_REDIRECT:
		if (!sock_owned_by_user(sk))
			do_redirect(skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs sent out by Linux are always <576bytes so
			 * they should go through unfragmented).
			 */
			if (sk->sk_state == TCP_LISTEN)
				goto out;

			WRITE_ONCE(tp->mtu_info, info);
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
			} else {
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
					sock_hold(sk);
			}
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if this ICMP message allows revert of backoff.
		 * (see RFC 6069)
		 */
		if (!fastopen &&
		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
			tcp_ld_RTO_revert(sk, seq);
		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		/* Only in fast or simultaneous open. If a fast open socket is
		 * already accepted it is treated as a connected one below.
		 */
		if (fastopen && !fastopen->sk)
			break;

		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);

		if (!sock_owned_by_user(sk)) {
			sk->sk_err = err;

			sk->sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters, even these two messages finally
	 * lose their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk->sk_error_report(sk);
	} else { /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
	return 0;
}
void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
	struct tcphdr *th = tcp_hdr(skb);

	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
	skb->csum_start = skb_transport_header(skb) - skb->head;
	skb->csum_offset = offsetof(struct tcphdr, check);
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
	const struct inet_sock *inet = inet_sk(sk);

	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_send_check);
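/* Illustrative note (not from the kernel sources): what the lines in
 * __tcp_v4_send_check() set up.  For checksum offload, only the IPv4
 * pseudo-header sum is stored in th->check; the device (or
 * skb_checksum_help() as a fallback) later folds the TCP header and
 * payload in, starting at csum_start and writing the result at
 * csum_offset.  Per RFC 793 the pseudo-header covers:
 *
 *	saddr (4 bytes) + daddr (4 bytes) + zero (1 byte) +
 *	IPPROTO_TCP (1 byte) + TCP length (2 bytes)
 *
 * which is the same quantity csum_tcpudp_nofold() computes for the
 * non-offload RST/ACK reply paths later in this file.
 */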
/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *		      for reset.
 *	Answer: if a packet caused RST, it is not for a socket
 *		existing in our system, if it is matched to a socket,
 *		it is just duplicate segment or bug in other side's TCP.
 *		So that we build reply only basing on parameters
 *		arrived with segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
	} rep;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key = NULL;
	const __u8 *hash_location = NULL;
	unsigned char newhash[16];
	int genhash;
	struct sock *sk1 = NULL;
#endif
	u64 transmit_time = 0;
	struct sock *ctl_sk;
	struct net *net;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	/* If sk not NULL, it means we did a successful lookup and incoming
	 * route had to be correct. prequeue might have dropped our dst.
	 */
	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest   = th->source;
	rep.th.source = th->dest;
	rep.th.doff   = sizeof(struct tcphdr) / 4;
	rep.th.rst    = 1;

	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
#ifdef CONFIG_TCP_MD5SIG
	rcu_read_lock();
	hash_location = tcp_parse_md5sig_option(th);
	if (sk && sk_fullsock(sk)) {
		const union tcp_md5_addr *addr;
		int l3index;

		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and inet_iif is set to it.
		 */
		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	} else if (hash_location) {
		const union tcp_md5_addr *addr;
		int sdif = tcp_v4_sdif(skb);
		int dif = inet_iif(skb);
		int l3index;

		/*
		 * active side is lost. Try to find listening socket through
		 * source port, and then find md5 key through listening socket.
		 * We do not loosen security here:
		 * Incoming packet is checked with md5 hash with finding key,
		 * no RST generated if md5 hash doesn't match.
		 */
		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
					     ip_hdr(skb)->saddr,
					     th->source, ip_hdr(skb)->daddr,
					     ntohs(th->source), dif, sdif);
		/* don't send rst if it can't find key */
		if (!sk1)
			goto out;

		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and dif is set to it.
		 */
		l3index = sdif ? dif : 0;
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
		if (!key)
			goto out;

		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
		if (genhash || memcmp(hash_location, newhash, 16) != 0)
			goto out;

	}

	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;

	/* When socket is gone, all binding information is lost.
	 * routing might fail in this case. No choice here, if we choose to force
	 * input interface, we will misroute in case of asymmetric route.
	 */
	if (sk) {
		arg.bound_dev_if = sk->sk_bound_dev_if;

		if (sk_fullsock(sk))
			trace_tcp_send_reset(sk, skb);
	}

	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));

	arg.tos = ip_hdr(skb)->tos;
	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	ctl_sk = this_cpu_read(ipv4_tcp_sk);
	sock_net_set(ctl_sk, net);
	if (sk) {
		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_mark : sk->sk_mark;
		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_priority : sk->sk_priority;
		transmit_time = tcp_transmit_time(sk);
		xfrm_sk_clone_policy(ctl_sk, sk);
	} else {
		ctl_sk->sk_mark = 0;
		ctl_sk->sk_priority = 0;
	}
	ip_send_unicast_reply(ctl_sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time);

	xfrm_sk_free_policy(ctl_sk);
	sock_net_set(ctl_sk, &init_net);
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
	local_bh_enable();

#ifdef CONFIG_TCP_MD5SIG
out:
	rcu_read_unlock();
#endif
}
/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
   outside socket context is ugly, certainly. What can I do?
 */
static void tcp_v4_send_ack(const struct sock *sk,
			    struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 tsval, u32 tsecr, int oif,
			    struct tcp_md5sig_key *key,
			    int reply_flags, u8 tos)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
			];
	} rep;
	struct net *net = sock_net(sk);
	struct ip_reply_arg arg;
	struct sock *ctl_sk;
	u64 transmit_time;

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (tsecr) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tsval);
		rep.opt[2] = htonl(tsecr);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (key) {
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len/4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;
	arg.tos = tos;
	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	ctl_sk = this_cpu_read(ipv4_tcp_sk);
	sock_net_set(ctl_sk, net);
	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
			   inet_twsk(sk)->tw_mark : sk->sk_mark;
	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
			   inet_twsk(sk)->tw_priority : sk->sk_priority;
	transmit_time = tcp_transmit_time(sk);
	ip_send_unicast_reply(ctl_sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time);

	sock_net_set(ctl_sk, &init_net);
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	local_bh_enable();
}
static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

	tcp_v4_send_ack(sk, skb,
			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
			tcptw->tw_ts_recent,
			tw->tw_bound_dev_if,
			tcp_twsk_md5_key(tcptw),
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
			tw->tw_tos
			);

	inet_twsk_put(tw);
}
static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
{
	const union tcp_md5_addr *addr;
	int l3index;

	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
	 */
	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
					     tcp_sk(sk)->snd_nxt;

	/* RFC 7323 2.3
	 * The window field (SEG.WND) of every outgoing segment, with the
	 * exception of <SYN> segments, MUST be right-shifted by
	 * Rcv.Wind.Shift bits:
	 */
	addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
	l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
	tcp_v4_send_ack(sk, skb, seq,
			tcp_rsk(req)->rcv_nxt,
			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
			READ_ONCE(req->ts_recent),
			0,
			tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
			ip_hdr(skb)->tos);
}
/*
 *	Send a SYN-ACK after having received a SYN.
 *	This still operates on a request_sock only, not on a big
 *	socket.
 */
static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
			      struct flowi *fl,
			      struct request_sock *req,
			      struct tcp_fastopen_cookie *foc,
			      enum tcp_synack_type synack_type,
			      struct sk_buff *syn_skb)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct flowi4 fl4;
	int err = -1;
	struct sk_buff *skb;
	u8 tos;

	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
		return -1;

	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);

	if (skb) {
		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);

		tos = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ?
				(tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
				(inet_sk(sk)->tos & INET_ECN_MASK) :
				inet_sk(sk)->tos;

		if (!INET_ECN_is_capable(tos) &&
		    tcp_bpf_ca_needs_ecn((struct sock *)req))
			tos |= INET_ECN_ECT_0;

		rcu_read_lock();
		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
					    ireq->ir_rmt_addr,
					    rcu_dereference(ireq->ireq_opt),
					    tos);
		rcu_read_unlock();
		err = net_xmit_eval(err);
	}

	return err;
}
/*
 *	IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
}
#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
EXPORT_SYMBOL(tcp_md5_needed);
static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
{
	if (!old)
		return true;

	/* l3index always overrides non-l3index */
	if (old->l3index && new->l3index == 0)
		return false;
	if (old->l3index == 0 && new->l3index)
		return true;

	return old->prefixlen < new->prefixlen;
}
/* Find the Key structure for an address.  */
struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
					   const union tcp_md5_addr *addr,
					   int family)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	const struct tcp_md5sig_info *md5sig;
	__be32 mask;
	struct tcp_md5sig_key *best_match = NULL;
	bool match;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;

	hlist_for_each_entry_rcu(key, &md5sig->head, node,
				 lockdep_sock_is_held(sk)) {
		if (key->family != family)
			continue;
		if (key->l3index && key->l3index != l3index)
			continue;
		if (family == AF_INET) {
			mask = inet_make_mask(key->prefixlen);
			match = (key->addr.a4.s_addr & mask) ==
				(addr->a4.s_addr & mask);
#if IS_ENABLED(CONFIG_IPV6)
		} else if (family == AF_INET6) {
			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
						  key->prefixlen);
#endif
		} else {
			match = false;
		}

		if (match && better_md5_match(best_match, key))
			best_match = key;
	}
	return best_match;
}
EXPORT_SYMBOL(__tcp_md5_do_lookup);
static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
						      const union tcp_md5_addr *addr,
						      int family, u8 prefixlen,
						      int l3index)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	unsigned int size = sizeof(struct in_addr);
	const struct tcp_md5sig_info *md5sig;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;
#if IS_ENABLED(CONFIG_IPV6)
	if (family == AF_INET6)
		size = sizeof(struct in6_addr);
#endif
	hlist_for_each_entry_rcu(key, &md5sig->head, node,
				 lockdep_sock_is_held(sk)) {
		if (key->family != family)
			continue;
		if (key->l3index != l3index)
			continue;
		if (!memcmp(&key->addr, addr, size) &&
		    key->prefixlen == prefixlen)
			return key;
	}
	return NULL;
}
struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
					 const struct sock *addr_sk)
{
	const union tcp_md5_addr *addr;
	int l3index;

	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
						 addr_sk->sk_bound_dev_if);
	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);
/* This can be called on a newly created socket, from other files */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
		   int family, u8 prefixlen, int l3index,
		   const u8 *newkey, u8 newkeylen, gfp_t gfp)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
	if (key) {
		/* Pre-existing entry - just update that one.
		 * Note that the key might be used concurrently.
		 * data_race() is telling kcsan that we do not care about
		 * key mismatches, since changing MD5 key on live flows
		 * can lead to packet drops.
		 */
		data_race(memcpy(key->key, newkey, newkeylen));

		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
		 * Also note that a reader could catch new key->keylen value
		 * but old key->key[], this is the reason we use __GFP_ZERO
		 * at sock_kmalloc() time below these lines.
		 */
		WRITE_ONCE(key->keylen, newkeylen);

		return 0;
	}

	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   lockdep_sock_is_held(sk));
	if (!md5sig) {
		md5sig = kmalloc(sizeof(*md5sig), gfp);
		if (!md5sig)
			return -ENOMEM;

		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
		INIT_HLIST_HEAD(&md5sig->head);
		rcu_assign_pointer(tp->md5sig_info, md5sig);
	}

	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
	if (!key)
		return -ENOMEM;
	if (!tcp_alloc_md5sig_pool()) {
		sock_kfree_s(sk, key, sizeof(*key));
		return -ENOMEM;
	}

	memcpy(key->key, newkey, newkeylen);
	key->keylen = newkeylen;
	key->family = family;
	key->prefixlen = prefixlen;
	key->l3index = l3index;
	memcpy(&key->addr, addr,
	       (family == AF_INET6) ? sizeof(struct in6_addr) :
				      sizeof(struct in_addr));
	hlist_add_head_rcu(&key->node, &md5sig->head);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_add);
int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
		   u8 prefixlen, int l3index)
{
	struct tcp_md5sig_key *key;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
	if (!key)
		return -ENOENT;
	hlist_del_rcu(&key->node);
	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
	kfree_rcu(key, rcu);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_del);
static void tcp_clear_md5_list(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	struct hlist_node *n;
	struct tcp_md5sig_info *md5sig;

	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);

	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
		hlist_del_rcu(&key->node);
		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
		kfree_rcu(key, rcu);
	}
}
static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
				 sockptr_t optval, int optlen)
{
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
	const union tcp_md5_addr *addr;
	u8 prefixlen = 32;
	int l3index = 0;

	if (optlen < sizeof(cmd))
		return -EINVAL;

	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
		return -EFAULT;

	if (sin->sin_family != AF_INET)
		return -EINVAL;

	if (optname == TCP_MD5SIG_EXT &&
	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
		prefixlen = cmd.tcpm_prefixlen;
		if (prefixlen > 32)
			return -EINVAL;
	}

	if (optname == TCP_MD5SIG_EXT &&
	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
		struct net_device *dev;

		rcu_read_lock();
		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
		if (dev && netif_is_l3_master(dev))
			l3index = dev->ifindex;

		rcu_read_unlock();

		/* ok to reference set/not set outside of rcu;
		 * right now device MUST be an L3 master
		 */
		if (!dev || !l3index)
			return -EINVAL;
	}

	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;

	if (!cmd.tcpm_keylen)
		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index);

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index,
			      cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
}
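/* Illustrative userspace sketch (not from the kernel sources): the option
 * parsed above is installed with setsockopt(); struct tcp_md5sig comes
 * from <linux/tcp.h>:
 *
 *	struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
 *	struct sockaddr_in *peer = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	peer->sin_family = AF_INET;
 *	inet_pton(AF_INET, "192.0.2.1", &peer->sin_addr);
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * TCP_MD5SIG_EXT additionally honours tcpm_flags/tcpm_prefixlen
 * (TCP_MD5SIG_FLAG_PREFIX) and tcpm_ifindex (TCP_MD5SIG_FLAG_IFINDEX),
 * matching the two branches above; a zero tcpm_keylen deletes the key.
 */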
static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
				   __be32 daddr, __be32 saddr,
				   const struct tcphdr *th, int nbytes)
{
	struct tcp4_pseudohdr *bp;
	struct scatterlist sg;
	struct tcphdr *_th;

	bp = hp->scratch;
	bp->saddr = saddr;
	bp->daddr = daddr;
	bp->pad = 0;
	bp->protocol = IPPROTO_TCP;
	bp->len = cpu_to_be16(nbytes);

	_th = (struct tcphdr *)(bp + 1);
	memcpy(_th, th, sizeof(*th));
	_th->check = 0;

	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
				sizeof(*bp) + sizeof(*th));
	return crypto_ahash_update(hp->md5_req);
}

static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
	struct tcp_md5sig_pool *hp;
	struct ahash_request *req;

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	req = hp->md5_req;

	if (crypto_ahash_init(req))
		goto clear_hash;
	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	ahash_request_set_crypt(req, NULL, md5_hash, 0);
	if (crypto_ahash_final(req))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}
int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
			const struct sock *sk,
			const struct sk_buff *skb)
{
	struct tcp_md5sig_pool *hp;
	struct ahash_request *req;
	const struct tcphdr *th = tcp_hdr(skb);
	__be32 saddr, daddr;

	if (sk) { /* valid for establish/request sockets */
		saddr = sk->sk_rcv_saddr;
		daddr = sk->sk_daddr;
	} else {
		const struct iphdr *iph = ip_hdr(skb);

		saddr = iph->saddr;
		daddr = iph->daddr;
	}

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	req = hp->md5_req;

	if (crypto_ahash_init(req))
		goto clear_hash;

	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
		goto clear_hash;
	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	ahash_request_set_crypt(req, NULL, md5_hash, 0);
	if (crypto_ahash_final(req))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);

#endif
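/* Illustrative note (not from the kernel sources): per RFC 2385, the
 * digest computed above covers, in order:
 *
 *	1. the IPv4 pseudo-header (tcp_v4_md5_hash_headers),
 *	2. the TCP header with its checksum field zeroed,
 *	3. the TCP segment data (tcp_md5_hash_skb_data),
 *	4. the connection key itself (tcp_md5_hash_key),
 *
 * and the 16-byte result travels in the TCPOPT_MD5SIG option that the
 * reply paths earlier in this file append to rep.opt[].
 */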
/* Called with rcu_read_lock() */
static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
				    const struct sk_buff *skb,
				    int dif, int sdif)
{
#ifdef CONFIG_TCP_MD5SIG
	/*
	 * This gets called for each TCP segment that arrives
	 * so we want to be efficient.
	 * We have 3 drop cases:
	 * o No MD5 hash and one expected.
	 * o MD5 hash and we're not expecting one.
	 * o MD5 hash and it's wrong.
	 */
	const __u8 *hash_location = NULL;
	struct tcp_md5sig_key *hash_expected;
	const struct iphdr *iph = ip_hdr(skb);
	const struct tcphdr *th = tcp_hdr(skb);
	const union tcp_md5_addr *addr;
	unsigned char newhash[16];
	int genhash, l3index;

	/* sdif set, means packet ingressed via a device
	 * in an L3 domain and dif is set to the l3mdev
	 */
	l3index = sdif ? dif : 0;

	addr = (union tcp_md5_addr *)&iph->saddr;
	hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	hash_location = tcp_parse_md5sig_option(th);

	/* We've parsed the options - do we have a hash? */
	if (!hash_expected && !hash_location)
		return false;

	if (hash_expected && !hash_location) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
		return true;
	}

	if (!hash_expected && hash_location) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
		return true;
	}

	/* Okay, so this is hash_expected and hash_location -
	 * so we need to calculate the checksum.
	 */
	genhash = tcp_v4_md5_hash_skb(newhash,
				      hash_expected,
				      NULL, skb);

	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n",
				     &iph->saddr, ntohs(th->source),
				     &iph->daddr, ntohs(th->dest),
				     genhash ? " tcp_v4_calc_md5_hash failed"
					     : "", l3index);
		return true;
	}
	return false;
#endif
	return false;
}
static void tcp_v4_init_req(struct request_sock *req,
			    const struct sock *sk_listener,
			    struct sk_buff *skb)
{
	struct inet_request_sock *ireq = inet_rsk(req);
	struct net *net = sock_net(sk_listener);

	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
}

static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
					  struct flowi *fl,
					  const struct request_sock *req)
{
	return inet_csk_route_req(sk, &fl->u.ip4, req);
}
struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.rtx_syn_ack	=	tcp_rtx_synack,
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
	.syn_ack_timeout =	tcp_syn_ack_timeout,
};

const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.mss_clamp	=	TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
	.req_md5_lookup	=	tcp_v4_md5_lookup,
	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
#endif
	.init_req	=	tcp_v4_init_req,
#ifdef CONFIG_SYN_COOKIES
	.cookie_init_seq =	cookie_v4_init_sequence,
#endif
	.route_req	=	tcp_v4_route_req,
	.init_seq	=	tcp_v4_init_seq,
	.init_ts_off	=	tcp_v4_init_ts_off,
	.send_synack	=	tcp_v4_send_synack,
};

int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	/* Never answer to SYNs sent to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	return tcp_conn_request(&tcp_request_sock_ops,
				&tcp_request_sock_ipv4_ops, sk, skb);

drop:
	tcp_listendrop(sk);
	return 0;
}
EXPORT_SYMBOL(tcp_v4_conn_request);
/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst,
				  struct request_sock *req_unhash,
				  bool *own_req)
{
	struct inet_request_sock *ireq;
	bool found_dup_sk = false;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	const union tcp_md5_addr *addr;
	struct tcp_md5sig_key *key;
	int l3index;
#endif
	struct ip_options_rcu *inet_opt;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	inet_sk_rx_dst_set(newsk, skb);

	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	ireq		      = inet_rsk(req);
	sk_daddr_set(newsk, ireq->ir_rmt_addr);
	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
	newsk->sk_bound_dev_if = ireq->ir_iif;
	newinet->inet_saddr   = ireq->ir_loc_addr;
	inet_opt	      = rcu_dereference(ireq->ireq_opt);
	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
	newinet->mc_index     = inet_iif(skb);
	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
	newinet->rcv_tos      = ip_hdr(skb)->tos;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	newinet->inet_id = prandom_u32();

	/* Set ToS of the new socket based upon the value of incoming SYN.
	 * ECT bits are set later in tcp_init_transfer().
	 */
	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
		newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;

	if (!dst) {
		dst = inet_csk_route_child_sock(sk, newsk, req);
		if (!dst)
			goto put_and_exit;
	} else {
		/* syncookie case : see end of cookie_v4_check() */
	}
	sk_setup_caps(newsk, dst);

	tcp_ca_openreq_child(newsk, dst);

	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));

	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
	/* Copy over the MD5 key from the original socket */
	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	if (key) {
		/*
		 * We're using one, so create a matching key
		 * on the newsk structure. If we fail to get
		 * memory, then we end up not copying the key
		 * across. Shucks.
		 */
		tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index,
			       key->key, key->keylen, GFP_ATOMIC);
		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
	}
#endif

	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
				       &found_dup_sk);
	if (likely(*own_req)) {
		tcp_move_syn(newtp, req);
		ireq->ireq_opt = NULL;
	} else {
		newinet->inet_opt = NULL;

		if (!req_unhash && found_dup_sk) {
			/* This code path should only be executed in the
			 * syncookie case
			 */
			bh_unlock_sock(newsk);
			sock_put(newsk);
			newsk = NULL;
		}
	}
	return newsk;

exit_overflow:
	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
	dst_release(dst);
exit:
	tcp_listendrop(sk);
	return NULL;
put_and_exit:
	newinet->inet_opt = NULL;
	inet_csk_prepare_forced_close(newsk);
	tcp_done(newsk);
	goto exit;
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
	const struct tcphdr *th = tcp_hdr(skb);

	if (!th->syn)
		sk = cookie_v4_check(sk, skb);
#endif
	return sk;
}

u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
			 struct tcphdr *th, u32 *cookie)
{
	u16 mss = 0;
#ifdef CONFIG_SYN_COOKIES
	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
				    &tcp_request_sock_ipv4_ops, sk, th);
	if (mss) {
		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
		tcp_synq_overflow(sk);
	}
#endif
	return mss;
}
/* The socket must have its spinlock held when we get
 * here, unless it is a TCP_LISTEN socket.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	struct sock *rsk;

	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		struct dst_entry *dst;

		dst = rcu_dereference_protected(sk->sk_rx_dst,
						lockdep_sock_is_held(sk));

		sock_rps_save_rxhash(sk, skb);
		sk_mark_napi_id(sk, skb);
		if (dst) {
			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
			    !dst->ops->check(dst, 0)) {
				RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
				dst_release(dst);
			}
		}
		tcp_rcv_established(sk, skb);
		return 0;
	}

	if (tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_cookie_check(sk, skb);

		if (!nsk)
			goto discard;
		if (nsk != sk) {
			if (tcp_child_process(sk, nsk, skb)) {
				rsk = nsk;
				goto reset;
			}
			return 0;
		}
	} else
		sock_rps_save_rxhash(sk, skb);

	if (tcp_rcv_state_process(sk, skb)) {
		rsk = sk;
		goto reset;
	}
	return 0;

reset:
	tcp_v4_send_reset(rsk, skb);
discard:
	kfree_skb(skb);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
	goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);
int tcp_v4_early_demux(struct sk_buff *skb)
{
	const struct iphdr *iph;
	const struct tcphdr *th;
	struct sock *sk;

	if (skb->pkt_type != PACKET_HOST)
		return 0;

	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
		return 0;

	iph = ip_hdr(skb);
	th = tcp_hdr(skb);

	if (th->doff < sizeof(struct tcphdr) / 4)
		return 0;

	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
				       iph->saddr, th->source,
				       iph->daddr, ntohs(th->dest),
				       skb->skb_iif, inet_sdif(skb));
	if (sk) {
		skb->sk = sk;
		skb->destructor = sock_edemux;
		if (sk_fullsock(sk)) {
			struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);

			if (dst)
				dst = dst_check(dst, 0);
			if (dst &&
			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
				skb_dst_set_noref(skb, dst);
		}
	}
	return 0;
}
bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
{
	u32 limit, tail_gso_size, tail_gso_segs;
	struct skb_shared_info *shinfo;
	const struct tcphdr *th;
	struct tcphdr *thtail;
	struct sk_buff *tail;
	unsigned int hdrlen;
	bool fragstolen;
	u32 gso_segs;
	u32 gso_size;
	int delta;

	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
	 * we can fix skb->truesize to its real value to avoid future drops.
	 * This is valid because skb is not yet charged to the socket.
	 * It has been noticed pure SACK packets were sometimes dropped
	 * (if cooked by drivers without copybreak feature).
	 */
	skb_condense(skb);

	skb_dst_drop(skb);

	if (unlikely(tcp_checksum_complete(skb))) {
		bh_unlock_sock(sk);
		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
		return true;
	}

	/* Attempt coalescing to last skb in backlog, even if we are
	 * above the limits.
	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
	 */
	th = (const struct tcphdr *)skb->data;
	hdrlen = th->doff * 4;

	tail = sk->sk_backlog.tail;
	if (!tail)
		goto no_coalesce;
	thtail = (struct tcphdr *)tail->data;

	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
	    ((TCP_SKB_CB(tail)->tcp_flags |
	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
	    !((TCP_SKB_CB(tail)->tcp_flags &
	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
	    ((TCP_SKB_CB(tail)->tcp_flags ^
	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
#ifdef CONFIG_TLS_DEVICE
	    tail->decrypted != skb->decrypted ||
#endif
	    !mptcp_skb_can_collapse(tail, skb) ||
	    thtail->doff != th->doff ||
	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
		goto no_coalesce;

	__skb_pull(skb, hdrlen);

	shinfo = skb_shinfo(skb);
	gso_size = shinfo->gso_size ?: skb->len;
	gso_segs = shinfo->gso_segs ?: 1;

	shinfo = skb_shinfo(tail);
	tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
	tail_gso_segs = shinfo->gso_segs ?: 1;

	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;

		if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
			thtail->window = th->window;
		}

		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
		 * thtail->fin, so that the fast path in tcp_rcv_established()
		 * is not entered if we append a packet with a FIN.
		 * SYN, RST, URG are not present.
		 * ACK is set on both packets.
		 * PSH : we do not really care in TCP stack,
		 *	 at least for 'GRO' packets.
		 */
		thtail->fin |= th->fin;
		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;

		if (TCP_SKB_CB(skb)->has_rxtstamp) {
			TCP_SKB_CB(tail)->has_rxtstamp = true;
			tail->tstamp = skb->tstamp;
			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
		}

		/* Not as strict as GRO. We only need to carry mss max value */
		shinfo->gso_size = max(gso_size, tail_gso_size);
		shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);

		sk->sk_backlog.len += delta;
		__NET_INC_STATS(sock_net(sk),
				LINUX_MIB_TCPBACKLOGCOALESCE);
		kfree_skb_partial(skb, fragstolen);
		return false;
	}
	__skb_push(skb, hdrlen);

no_coalesce:
	limit = (u32)READ_ONCE(sk->sk_rcvbuf) + (u32)(READ_ONCE(sk->sk_sndbuf) >> 1);

	/* Only socket owner can try to collapse/prune rx queues
	 * to reduce memory overhead, so add a little headroom here.
	 * Only a few socket backlogs are likely to be non-empty concurrently.
	 */
	limit += 64 * 1024;

	if (unlikely(sk_add_backlog(sk, skb, limit))) {
		bh_unlock_sock(sk);
		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
		return true;
	}
	return false;
}
EXPORT_SYMBOL(tcp_add_backlog);
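/* Illustrative sketch (not from the kernel sources): the backlog limit
 * above, worked through with hypothetical buffer sizes.  With
 * sk_rcvbuf == 131072 and sk_sndbuf == 16384, the cap is
 *
 *	131072 + (16384 >> 1) + 64*1024 = 204800 bytes
 *
 * of queued-but-unprocessed segments; beyond that sk_add_backlog()
 * fails and the packet is dropped and counted in
 * LINUX_MIB_TCPBACKLOGDROP.
 */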
int tcp_filter(struct sock *sk, struct sk_buff *skb)
{
	struct tcphdr *th = (struct tcphdr *)skb->data;

	return sk_filter_trim_cap(sk, skb, th->doff * 4);
}
EXPORT_SYMBOL(tcp_filter);

static void tcp_v4_restore_cb(struct sk_buff *skb)
{
	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
		sizeof(struct inet_skb_parm));
}
static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
			   const struct tcphdr *th)
{
	/* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
	 * barrier() makes sure compiler won't play fool^Waliasing games.
	 */
	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
		sizeof(struct inet_skb_parm));
	barrier();

	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
	TCP_SKB_CB(skb)->sacked	 = 0;
	TCP_SKB_CB(skb)->has_rxtstamp =
			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
}
/*
 *	From tcp_input.c
 */

int tcp_v4_rcv(struct sk_buff *skb)
{
	struct net *net = dev_net(skb->dev);
	struct sk_buff *skb_to_free;
	int sdif = inet_sdif(skb);
	int dif = inet_iif(skb);
	const struct iphdr *iph;
	const struct tcphdr *th;
	bool refcounted;
	struct sock *sk;
	int ret;

	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	__TCP_INC_STATS(net, TCP_MIB_INSEGS);

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = (const struct tcphdr *)skb->data;

	if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
		goto bad_packet;
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided case of th->doff==0 is eliminated.
	 * So, we defer the checks. */

	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
		goto csum_error;

	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);
lookup:
	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
			       th->dest, sdif, &refcounted);
	if (!sk)
		goto no_tcp_socket;

process:
	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		struct request_sock *req = inet_reqsk(sk);
		bool req_stolen = false;
		struct sock *nsk;

		sk = req->rsk_listener;
		if (unlikely(!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb) ||
			     tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) {
			sk_drops_add(sk, skb);
			reqsk_put(req);
			goto discard_it;
		}
		if (tcp_checksum_complete(skb)) {
			reqsk_put(req);
			goto csum_error;
		}
		if (unlikely(sk->sk_state != TCP_LISTEN)) {
			inet_csk_reqsk_queue_drop_and_put(sk, req);
			goto lookup;
		}
		/* We own a reference on the listener, increase it again
		 * as we might lose it too soon.
		 */
		sock_hold(sk);
		refcounted = true;
		nsk = NULL;
		if (!tcp_filter(sk, skb)) {
			th = (const struct tcphdr *)skb->data;
			iph = ip_hdr(skb);
			tcp_v4_fill_cb(skb, iph, th);
			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
		}
		if (!nsk) {
			reqsk_put(req);
			if (req_stolen) {
				/* Another cpu got exclusive access to req
				 * and created a full blown socket.
				 * Try to feed this packet to this socket
				 * instead of discarding it.
				 */
				tcp_v4_restore_cb(skb);
				sock_put(sk);
				goto lookup;
			}
			goto discard_and_relse;
		}
		if (nsk == sk) {
			reqsk_put(req);
			tcp_v4_restore_cb(skb);
		} else if (tcp_child_process(sk, nsk, skb)) {
			tcp_v4_send_reset(nsk, skb);
			goto discard_and_relse;
		} else {
			sock_put(sk);
			return 0;
		}
	}
	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
		goto discard_and_relse;
	}

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
		goto discard_and_relse;

	if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))
		goto discard_and_relse;

	nf_reset_ct(skb);

	if (tcp_filter(sk, skb))
		goto discard_and_relse;
	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);
	tcp_v4_fill_cb(skb, iph, th);

	skb->dev = NULL;

	if (sk->sk_state == TCP_LISTEN) {
		ret = tcp_v4_do_rcv(sk, skb);
		goto put_and_return;
	}

	sk_incoming_cpu_update(sk);

	bh_lock_sock_nested(sk);
	tcp_segs_in(tcp_sk(sk), skb);
	ret = 0;
	if (!sock_owned_by_user(sk)) {
		skb_to_free = sk->sk_rx_skb_cache;
		sk->sk_rx_skb_cache = NULL;
		ret = tcp_v4_do_rcv(sk, skb);
	} else {
		if (tcp_add_backlog(sk, skb))
			goto discard_and_relse;
		skb_to_free = NULL;
	}
	bh_unlock_sock(sk);
	if (skb_to_free)
		__kfree_skb(skb_to_free);

put_and_return:
	if (refcounted)
		sock_put(sk);

	return ret;

no_tcp_socket:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	tcp_v4_fill_cb(skb, iph, th);

	if (tcp_checksum_complete(skb)) {
csum_error:
		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
bad_packet:
		__TCP_INC_STATS(net, TCP_MIB_INERRS);
	} else {
		tcp_v4_send_reset(NULL, skb);
	}

discard_it:
	/* Discard frame. */
	kfree_skb(skb);
	return 0;

discard_and_relse:
	sk_drops_add(sk, skb);
	if (refcounted)
		sock_put(sk);
	goto discard_it;

do_time_wait:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}

	tcp_v4_fill_cb(skb, iph, th);

	if (tcp_checksum_complete(skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto csum_error;
	}
	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
	case TCP_TW_SYN: {
		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
							&tcp_hashinfo, skb,
							__tcp_hdrlen(th),
							iph->saddr, th->source,
							iph->daddr, th->dest,
							inet_iif(skb),
							sdif);
		if (sk2) {
			inet_twsk_deschedule_put(inet_twsk(sk));
			sk = sk2;
			tcp_v4_restore_cb(skb);
			refcounted = false;
			goto process;
		}
	}
		/* to ACK */
		fallthrough;
	case TCP_TW_ACK:
		tcp_v4_timewait_ack(sk, skb);
		break;
	case TCP_TW_RST:
		tcp_v4_send_reset(sk, skb);
		inet_twsk_deschedule_put(inet_twsk(sk));
		goto discard_it;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}
static struct timewait_sock_ops tcp_timewait_sock_ops = {
	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
	.twsk_unique	= tcp_twsk_unique,
	.twsk_destructor= tcp_twsk_destructor,
};

void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	if (dst && dst_hold_safe(dst)) {
		rcu_assign_pointer(sk->sk_rx_dst, dst);
		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
	}
}
EXPORT_SYMBOL(inet_sk_rx_dst_set);
const struct inet_connection_sock_af_ops ipv4_specific = {
	.queue_xmit	   = ip_queue_xmit,
	.send_check	   = tcp_v4_send_check,
	.rebuild_header	   = inet_sk_rebuild_header,
	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
	.conn_request	   = tcp_v4_conn_request,
	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
	.net_header_len	   = sizeof(struct iphdr),
	.setsockopt	   = ip_setsockopt,
	.getsockopt	   = ip_getsockopt,
	.addr2sockaddr	   = inet_csk_addr2sockaddr,
	.sockaddr_len	   = sizeof(struct sockaddr_in),
	.mtu_reduced	   = tcp_v4_mtu_reduced,
};
EXPORT_SYMBOL(ipv4_specific);

#ifdef CONFIG_TCP_MD5SIG
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
	.md5_lookup	= tcp_v4_md5_lookup,
	.calc_md5_hash	= tcp_v4_md5_hash_skb,
	.md5_parse	= tcp_v4_parse_md5_keys,
};
#endif

/* NOTE: A lot of things set to zero explicitly by call to
 *       sk_alloc() so need not be done here.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	tcp_init_sock(sk);

	icsk->icsk_af_ops = &ipv4_specific;

#ifdef CONFIG_TCP_MD5SIG
	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
#endif

	return 0;
}
void tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	trace_tcp_destroy_sock(sk);

	tcp_clear_xmit_timers(sk);

	tcp_cleanup_congestion_control(sk);

	tcp_cleanup_ulp(sk);

	/* Cleanup up the write buffer. */
	tcp_write_queue_purge(sk);

	/* Check if we want to disable active TFO */
	tcp_fastopen_active_disable_ofo_check(sk);

	/* Cleans up our, hopefully empty, out_of_order_queue. */
	skb_rbtree_purge(&tp->out_of_order_queue);

#ifdef CONFIG_TCP_MD5SIG
	/* Clean up the MD5 key list, if any */
	if (tp->md5sig_info) {
		tcp_clear_md5_list(sk);
		kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
		tp->md5sig_info = NULL;
	}
#endif

	/* Clean up a referenced TCP bind bucket. */
	if (inet_csk(sk)->icsk_bind_hash)
		inet_put_port(sk);

	BUG_ON(rcu_access_pointer(tp->fastopen_rsk));

	/* If socket is aborted during connect operation */
	tcp_free_fastopen_req(tp);
	tcp_fastopen_destroy_cipher(sk);
	tcp_saved_syn_free(tp);

	sk_sockets_allocated_dec(sk);
}
EXPORT_SYMBOL(tcp_v4_destroy_sock);
#ifdef CONFIG_PROC_FS
/* Proc filesystem TCP sock list dumping. */

/*
 * Get the next listener socket following cur.  If cur is NULL, get the
 * first socket starting from the bucket given in st->bucket; when
 * st->bucket is zero the very first socket in the hash table is returned.
 */
static void *listening_get_next(struct seq_file *seq, void *cur)
{
	struct tcp_seq_afinfo *afinfo;
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);
	struct inet_listen_hashbucket *ilb;
	struct hlist_nulls_node *node;
	struct sock *sk = cur;

	if (st->bpf_seq_afinfo)
		afinfo = st->bpf_seq_afinfo;
	else
		afinfo = PDE_DATA(file_inode(seq->file));

	if (!sk) {
get_head:
		ilb = &tcp_hashinfo.listening_hash[st->bucket];
		spin_lock(&ilb->lock);
		sk = sk_nulls_head(&ilb->nulls_head);
		st->offset = 0;
		goto get_sk;
	}
	ilb = &tcp_hashinfo.listening_hash[st->bucket];
	++st->num;
	++st->offset;

	sk = sk_nulls_next(sk);
get_sk:
	sk_nulls_for_each_from(sk, node) {
		if (!net_eq(sock_net(sk), net))
			continue;
		if (afinfo->family == AF_UNSPEC ||
		    sk->sk_family == afinfo->family)
			return sk;
	}
	spin_unlock(&ilb->lock);
	st->offset = 0;
	if (++st->bucket < INET_LHTABLE_SIZE)
		goto get_head;
	return NULL;
}
2330 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2332 struct tcp_iter_state *st = seq->private;
2337 rc = listening_get_next(seq, NULL);
2339 while (rc && *pos) {
2340 rc = listening_get_next(seq, rc);
2346 static inline bool empty_bucket(const struct tcp_iter_state *st)
2348 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2352 * Get first established socket starting from bucket given in st->bucket.
2353 * If st->bucket is zero, the very first socket in the hash is returned.
static void *established_get_first(struct seq_file *seq)
{
	struct tcp_seq_afinfo *afinfo;
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);
	void *rc = NULL;

	if (st->bpf_seq_afinfo)
		afinfo = st->bpf_seq_afinfo;
	else
		afinfo = PDE_DATA(file_inode(seq->file));

	st->offset = 0;
	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
		struct sock *sk;
		struct hlist_nulls_node *node;
		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);

		/* Lockless fast path for the common case of empty buckets */
		if (empty_bucket(st))
			continue;

		spin_lock_bh(lock);
		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
			if ((afinfo->family != AF_UNSPEC &&
			     sk->sk_family != afinfo->family) ||
			    !net_eq(sock_net(sk), net)) {
				continue;
			}
			rc = sk;
			goto out;
		}
		spin_unlock_bh(lock);
	}
out:
	return rc;
}
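/* The empty_bucket() test above runs without the bucket lock.  That is
 * safe: a stale "non-empty" answer just means we take the lock and
 * find nothing, and a socket inserted right after an "empty" answer is
 * merely missed by this best-effort dump.
 */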
static void *established_get_next(struct seq_file *seq, void *cur)
{
	struct tcp_seq_afinfo *afinfo;
	struct sock *sk = cur;
	struct hlist_nulls_node *node;
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);

	if (st->bpf_seq_afinfo)
		afinfo = st->bpf_seq_afinfo;
	else
		afinfo = PDE_DATA(file_inode(seq->file));

	++st->num;
	++st->offset;

	sk = sk_nulls_next(sk);

	sk_nulls_for_each_from(sk, node) {
		if ((afinfo->family == AF_UNSPEC ||
		     sk->sk_family == afinfo->family) &&
		    net_eq(sock_net(sk), net))
			return sk;
	}

	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
	++st->bucket;
	return established_get_first(seq);
}
static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	rc = established_get_first(seq);

	while (rc && pos) {
		rc = established_get_next(seq, rc);
		--pos;
	}
	return rc;
}
static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc;
	struct tcp_iter_state *st = seq->private;

	st->state = TCP_SEQ_STATE_LISTENING;
	rc	  = listening_get_idx(seq, &pos);

	if (!rc) {
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		rc	  = established_get_idx(seq, pos);
	}

	return rc;
}
static void *tcp_seek_last_pos(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	int bucket = st->bucket;
	int offset = st->offset;
	int orig_num = st->num;
	void *rc = NULL;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (st->bucket >= INET_LHTABLE_SIZE)
			break;
		st->state = TCP_SEQ_STATE_LISTENING;
		rc = listening_get_next(seq, NULL);
		while (offset-- && rc && bucket == st->bucket)
			rc = listening_get_next(seq, rc);
		if (rc)
			break;
		st->bucket = 0;
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		fallthrough;
	case TCP_SEQ_STATE_ESTABLISHED:
		if (st->bucket > tcp_hashinfo.ehash_mask)
			break;
		rc = established_get_first(seq);
		while (offset-- && rc && bucket == st->bucket)
			rc = established_get_next(seq, rc);
	}

	st->num = orig_num;

	return rc;
}
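/* tcp_seek_last_pos() exists because seq_file re-enters the iterator
 * at an arbitrary *pos on every read() chunk.  st->bucket/st->offset
 * remember where the previous chunk stopped, so we can resume there
 * instead of rescanning all buckets from the start each time.
 */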
void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	if (*pos && *pos == st->last_pos) {
		rc = tcp_seek_last_pos(seq);
		if (rc)
			goto out;
	}

	st->state = TCP_SEQ_STATE_LISTENING;
	st->num = 0;
	st->bucket = 0;
	st->offset = 0;
	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;

out:
	st->last_pos = *pos;
	return rc;
}
EXPORT_SYMBOL(tcp_seq_start);
void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc = NULL;

	if (v == SEQ_START_TOKEN) {
		rc = tcp_get_idx(seq, 0);
		goto out;
	}

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
		if (!rc) {
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			st->bucket = 0;
			st->offset = 0;
			rc	  = established_get_first(seq);
		}
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		rc = established_get_next(seq, v);
		break;
	}
out:
	++*pos;
	st->last_pos = *pos;

	return rc;
}
EXPORT_SYMBOL(tcp_seq_next);
void tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		if (v)
			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
		break;
	}
}
EXPORT_SYMBOL(tcp_seq_stop);
static void get_openreq4(const struct request_sock *req,
			 struct seq_file *f, int i)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	long delta = req->rsk_timer.expires - jiffies;

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
		i,
		ireq->ir_loc_addr,
		ireq->ir_num,
		ireq->ir_rmt_addr,
		ntohs(ireq->ir_rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af dependent. */
		1,    /* timers active (only the expire timer) */
		jiffies_delta_to_clock_t(delta),
		req->num_timeout,
		from_kuid_munged(seq_user_ns(f),
				 sock_i_uid(req->rsk_listener)),
		0,  /* non standard timer */
		0, /* open_requests have no inode */
		0,
		req);
}
static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
{
	int timer_active;
	unsigned long timer_expires;
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	const struct inet_sock *inet = inet_sk(sk);
	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
	__be32 dest = inet->inet_daddr;
	__be32 src = inet->inet_rcv_saddr;
	__u16 destp = ntohs(inet->inet_dport);
	__u16 srcp = ntohs(inet->inet_sport);
	int rx_queue;
	int state;

	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
		timer_active	= 1;
		timer_expires	= icsk->icsk_timeout;
	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
		timer_active	= 4;
		timer_expires	= icsk->icsk_timeout;
	} else if (timer_pending(&sk->sk_timer)) {
		timer_active	= 2;
		timer_expires	= sk->sk_timer.expires;
	} else {
		timer_active	= 0;
		timer_expires = jiffies;
	}

	state = inet_sk_state_load(sk);
	if (state == TCP_LISTEN)
		rx_queue = READ_ONCE(sk->sk_ack_backlog);
	else
		/* Because we don't lock the socket,
		 * we might find a transient negative value.
		 */
		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
				      READ_ONCE(tp->copied_seq), 0);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
		i, src, srcp, dest, destp, state,
		READ_ONCE(tp->write_seq) - tp->snd_una,
		rx_queue,
		timer_active,
		jiffies_delta_to_clock_t(timer_expires - jiffies),
		icsk->icsk_retransmits,
		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
		icsk->icsk_probes_out,
		sock_i_ino(sk),
		refcount_read(&sk->sk_refcnt), sk,
		jiffies_to_clock_t(icsk->icsk_rto),
		jiffies_to_clock_t(icsk->icsk_ack.ato),
		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
		tp->snd_cwnd,
		state == TCP_LISTEN ?
		    fastopenq->max_qlen :
		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
}
static void get_timewait4_sock(const struct inet_timewait_sock *tw,
			       struct seq_file *f, int i)
{
	long delta = tw->tw_timer.expires - jiffies;
	__be32 dest, src;
	__u16 destp, srcp;

	dest  = tw->tw_daddr;
	src   = tw->tw_rcv_saddr;
	destp = ntohs(tw->tw_dport);
	srcp  = ntohs(tw->tw_sport);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
		refcount_read(&tw->tw_refcnt), tw);
}
#define TMPSZ 150

static int tcp4_seq_show(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st;
	struct sock *sk = v;

	seq_setwidth(seq, TMPSZ - 1);
	if (v == SEQ_START_TOKEN) {
		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
			   "rx_queue tr tm->when retrnsmt   uid  timeout "
			   "inode");
		goto out;
	}
	st = seq->private;

	if (sk->sk_state == TCP_TIME_WAIT)
		get_timewait4_sock(v, seq, st->num);
	else if (sk->sk_state == TCP_NEW_SYN_RECV)
		get_openreq4(v, seq, st->num);
	else
		get_tcp4_sock(v, seq, st->num);
out:
	seq_pad(seq, '\n');
	return 0;
}
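/* For reference, an entry emitted by the functions above looks roughly
 * like this (illustrative values for a socket listening on
 * 127.0.0.1:22; not literal output of this file):
 *
 *   0: 0100007F:0016 00000000:0000 0A 00000000:00000000 00:00000000 00000000     0        0 12345 1 0000000000000000 100 0 0 10 0
 *
 * Addresses are raw __be32 values printed with %X, so they appear
 * byte-swapped on little-endian hosts; ports are hex, and "0A" is
 * TCP_LISTEN.
 */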
#ifdef CONFIG_BPF_SYSCALL
struct bpf_iter__tcp {
	__bpf_md_ptr(struct bpf_iter_meta *, meta);
	__bpf_md_ptr(struct sock_common *, sk_common);
	uid_t uid __aligned(8);
};
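/* A BPF program attached to this iterator receives one bpf_iter__tcp
 * context per socket.  A minimal consumer, sketched here purely for
 * illustration (modeled on the bpf_iter selftests, not part of this
 * file), would look like:
 *
 *	SEC("iter/tcp")
 *	int dump_tcp(struct bpf_iter__tcp *ctx)
 *	{
 *		struct sock_common *sk_common = ctx->sk_common;
 *
 *		if (!sk_common)
 *			return 0;
 *		BPF_SEQ_PRINTF(ctx->meta->seq, "family %d uid %u\n",
 *			       sk_common->skc_family, ctx->uid);
 *		return 0;
 *	}
 */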
static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
			     struct sock_common *sk_common, uid_t uid)
{
	struct bpf_iter__tcp ctx;

	meta->seq_num--;  /* skip SEQ_START_TOKEN */
	ctx.meta = meta;
	ctx.sk_common = sk_common;
	ctx.uid = uid;
	return bpf_iter_run_prog(prog, &ctx);
}
static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
{
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;
	struct sock *sk = v;
	uid_t uid;

	if (v == SEQ_START_TOKEN)
		return 0;

	if (sk->sk_state == TCP_TIME_WAIT) {
		uid = 0;
	} else if (sk->sk_state == TCP_NEW_SYN_RECV) {
		const struct request_sock *req = v;

		uid = from_kuid_munged(seq_user_ns(seq),
				       sock_i_uid(req->rsk_listener));
	} else {
		uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
	}

	meta.seq = seq;
	prog = bpf_iter_get_info(&meta, false);
	return tcp_prog_seq_show(prog, &meta, v, uid);
}
static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;

	if (!v) {
		meta.seq = seq;
		prog = bpf_iter_get_info(&meta, true);
		if (prog)
			(void)tcp_prog_seq_show(prog, &meta, v, 0);
	}

	tcp_seq_stop(seq, v);
}
static const struct seq_operations bpf_iter_tcp_seq_ops = {
	.show		= bpf_iter_tcp_seq_show,
	.start		= tcp_seq_start,
	.next		= tcp_seq_next,
	.stop		= bpf_iter_tcp_seq_stop,
};
#endif
static const struct seq_operations tcp4_seq_ops = {
	.show		= tcp4_seq_show,
	.start		= tcp_seq_start,
	.next		= tcp_seq_next,
	.stop		= tcp_seq_stop,
};

static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.family		= AF_INET,
};
static int __net_init tcp4_proc_init_net(struct net *net)
{
	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
		return -ENOMEM;
	return 0;
}
static void __net_exit tcp4_proc_exit_net(struct net *net)
{
	remove_proc_entry("tcp", net->proc_net);
}
static struct pernet_operations tcp4_net_ops = {
	.init = tcp4_proc_init_net,
	.exit = tcp4_proc_exit_net,
};
int __init tcp4_proc_init(void)
{
	return register_pernet_subsys(&tcp4_net_ops);
}

void tcp4_proc_exit(void)
{
	unregister_pernet_subsys(&tcp4_net_ops);
}
#endif /* CONFIG_PROC_FS */
struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.pre_connect		= tcp_v4_pre_connect,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.bpf_bypass_getsockopt	= tcp_bpf_bypass_getsockopt,
	.keepalive		= tcp_set_keepalive,
	.recvmsg		= tcp_recvmsg,
	.sendmsg		= tcp_sendmsg,
	.sendpage		= tcp_sendpage,
	.backlog_rcv		= tcp_v4_do_rcv,
	.release_cb		= tcp_release_cb,
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.leave_memory_pressure	= tcp_leave_memory_pressure,
	.stream_memory_free	= tcp_stream_memory_free,
	.sockets_allocated	= &tcp_sockets_allocated,
	.orphan_count		= &tcp_orphan_count,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
	.h.hashinfo		= &tcp_hashinfo,
	.no_autobind		= true,
	.diag_destroy		= tcp_abort,
};
EXPORT_SYMBOL(tcp_prot);
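/* tcp_prot is what af_inet.c installs for AF_INET/SOCK_STREAM sockets;
 * the generic socket layer dispatches through these pointers, e.g.
 * connect(2) reaches tcp_v4_connect() via inet_stream_connect().
 */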
static void __net_exit tcp_sk_exit(struct net *net)
{
	if (net->ipv4.tcp_congestion_control)
		bpf_module_put(net->ipv4.tcp_congestion_control,
			       net->ipv4.tcp_congestion_control->owner);
}
static int __net_init tcp_sk_init(struct net *net)
{
	int cnt;

	net->ipv4.sysctl_tcp_ecn = 2;
	net->ipv4.sysctl_tcp_ecn_fallback = 1;

	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;

	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;

	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
	net->ipv4.sysctl_tcp_syncookies = 1;
	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
	net->ipv4.sysctl_tcp_orphan_retries = 0;
	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
	net->ipv4.sysctl_tcp_tw_reuse = 2;
	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;

	cnt = tcp_hashinfo.ehash_mask + 1;
	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
	net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;

	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
	net->ipv4.sysctl_tcp_sack = 1;
	net->ipv4.sysctl_tcp_window_scaling = 1;
	net->ipv4.sysctl_tcp_timestamps = 1;
	net->ipv4.sysctl_tcp_early_retrans = 3;
	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */
	net->ipv4.sysctl_tcp_retrans_collapse = 1;
	net->ipv4.sysctl_tcp_max_reordering = 300;
	net->ipv4.sysctl_tcp_dsack = 1;
	net->ipv4.sysctl_tcp_app_win = 31;
	net->ipv4.sysctl_tcp_adv_win_scale = 1;
	net->ipv4.sysctl_tcp_frto = 2;
	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
	/* This limits the percentage of the congestion window which we
	 * will allow a single TSO frame to consume.  Building TSO frames
	 * which are too large can cause TCP streams to be bursty.
	 */
	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
	/* Default TSQ limit of 16 TSO segments */
	net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
	/* rfc5961 challenge ack rate limiting */
	net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
	net->ipv4.sysctl_tcp_min_tso_segs = 2;
	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
	net->ipv4.sysctl_tcp_autocorking = 1;
	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
	if (net != &init_net) {
		memcpy(net->ipv4.sysctl_tcp_rmem,
		       init_net.ipv4.sysctl_tcp_rmem,
		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
		memcpy(net->ipv4.sysctl_tcp_wmem,
		       init_net.ipv4.sysctl_tcp_wmem,
		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
	}
	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
	net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
	spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
	atomic_set(&net->ipv4.tfo_active_disable_times, 0);

	/* Reno is always built in */
	if (!net_eq(net, &init_net) &&
	    bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
			       init_net.ipv4.tcp_congestion_control->owner))
		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
	else
		net->ipv4.tcp_congestion_control = &tcp_reno;

	return 0;
}
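/* Everything initialized above is a per-netns default; the values stay
 * tunable at runtime through /proc/sys/net/ipv4/.  Illustrative shell
 * usage (not part of this file):
 *
 *	sysctl -w net.ipv4.tcp_syncookies=1
 *	sysctl -w net.ipv4.tcp_fin_timeout=60
 */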
static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
{
	struct net *net;

	inet_twsk_purge(&tcp_hashinfo, AF_INET);

	list_for_each_entry(net, net_exit_list, exit_list)
		tcp_fastopen_ctx_destroy(net);
}
static struct pernet_operations __net_initdata tcp_sk_ops = {
       .init	   = tcp_sk_init,
       .exit	   = tcp_sk_exit,
       .exit_batch = tcp_sk_exit_batch,
};
#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
		     struct sock_common *sk_common, uid_t uid)
static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
{
	struct tcp_iter_state *st = priv_data;
	struct tcp_seq_afinfo *afinfo;
	int ret;

	afinfo = kmalloc(sizeof(*afinfo), GFP_USER | __GFP_NOWARN);
	if (!afinfo)
		return -ENOMEM;

	afinfo->family = AF_UNSPEC;
	st->bpf_seq_afinfo = afinfo;
	ret = bpf_iter_init_seq_net(priv_data, aux);
	if (ret)
		kfree(afinfo);
	return ret;
}
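/* Using AF_UNSPEC here makes the bpf iterator walk IPv4 and IPv6
 * sockets alike: the family checks in listening_get_next() and
 * established_get_first()/established_get_next() treat AF_UNSPEC as a
 * wildcard, whereas the /proc/net/tcp dump pins the family to AF_INET
 * via tcp4_seq_afinfo.
 */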
static void bpf_iter_fini_tcp(void *priv_data)
{
	struct tcp_iter_state *st = priv_data;

	kfree(st->bpf_seq_afinfo);
	bpf_iter_fini_seq_net(priv_data);
}
static const struct bpf_iter_seq_info tcp_seq_info = {
	.seq_ops		= &bpf_iter_tcp_seq_ops,
	.init_seq_private	= bpf_iter_init_tcp,
	.fini_seq_private	= bpf_iter_fini_tcp,
	.seq_priv_size		= sizeof(struct tcp_iter_state),
};
static struct bpf_iter_reg tcp_reg_info = {
	.target			= "tcp",
	.ctx_arg_info_size	= 1,
	.ctx_arg_info		= {
		{ offsetof(struct bpf_iter__tcp, sk_common),
		  PTR_TO_BTF_ID_OR_NULL },
	},
	.seq_info		= &tcp_seq_info,
};
static void __init bpf_iter_register(void)
{
	tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
	if (bpf_iter_reg_target(&tcp_reg_info))
		pr_warn("Warning: could not register bpf iterator tcp\n");
}
#endif
void __init tcp_v4_init(void)
{
	int cpu, res;

	/* Build one control socket per possible CPU; these are used to
	 * send RSTs and ACKs on behalf of connections that have no full
	 * socket (SYN-RECV and TIME-WAIT).
	 */
	for_each_possible_cpu(cpu) {
		struct sock *sk;

		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
					   IPPROTO_TCP, &init_net);
		if (res)
			panic("Failed to create the TCP control socket.\n");
		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

		/* Please enforce IP_DF and IPID==0 for RST and
		 * ACK sent in SYN-RECV and TIME-WAIT state.
		 */
		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;

		per_cpu(ipv4_tcp_sk, cpu) = sk;
	}
	if (register_pernet_subsys(&tcp_sk_ops))
		panic("Failed to create the TCP control socket.\n");

#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
	bpf_iter_register();
#endif
}