1 // SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 *		IPv4 specific functions
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 */

/*
 * Changes:
 *		David S. Miller :	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *		David S. Miller :	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *		Andi Kleen :		Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *		Andi Kleen :		Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					request_sock handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *		Mike McLagan :		Routing by source
 *		Juan Jose Ciarlante :	ip_dynaddr bits
 *		Andi Kleen :		various fixes.
 *		Vitaly E. Lavrov :	Transparent proxy revived after year
 *					coma.
 *		Andi Kleen :		Fix new listen.
 *		Andi Kleen :		Fix accept error reporting.
 *		YOSHIFUJI Hideaki @USAGI and :	Support IPV6_V6ONLY socket option,
 *		Alexey Kuznetsov		which allows both IPv4 and IPv6 sockets
 *						to bind a single port at the same time.
 */
48 #define pr_fmt(fmt) "TCP: " fmt
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
61 #include <net/net_namespace.h>
63 #include <net/inet_hashtables.h>
65 #include <net/transp_v6.h>
67 #include <net/inet_common.h>
68 #include <net/timewait_sock.h>
70 #include <net/secure_seq.h>
71 #include <net/busy_poll.h>
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/inetdevice.h>
79 #include <linux/btf_ids.h>
81 #include <crypto/hash.h>
82 #include <linux/scatterlist.h>
84 #include <trace/events/tcp.h>
86 #ifdef CONFIG_TCP_MD5SIG
87 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
88 __be32 daddr, __be32 saddr, const struct tcphdr *th);
91 struct inet_hashinfo tcp_hashinfo;
92 EXPORT_SYMBOL(tcp_hashinfo);
94 static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);
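/* Note: ipv4_tcp_sk is a per-CPU control socket. tcp_v4_send_reset() and
 * tcp_v4_send_ack() below fetch it with this_cpu_read() to emit replies
 * (RSTs and ACKs) without a full socket context.
 */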
96 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
98 return secure_tcp_seq(ip_hdr(skb)->daddr,
101 tcp_hdr(skb)->source);
104 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
106 return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
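/* Decide whether a TIME-WAIT socket can be reused for a new outgoing
 * connection to the same peer (sysctl_tcp_tw_reuse). Roughly: reuse needs
 * a recent timestamp from the TIME-WAIT peer, and with tcp_tw_reuse == 2
 * it is further restricted to loopback traffic; see the comments below.
 */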
109 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
111 int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
112 const struct inet_timewait_sock *tw = inet_twsk(sktw);
113 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
114 struct tcp_sock *tp = tcp_sk(sk);
/* Still does not detect *everything* that goes through
 * lo, since we require a loopback src or dst address
 * or direct binding to 'lo' interface.
 */
121 bool loopback = false;
122 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
124 #if IS_ENABLED(CONFIG_IPV6)
125 if (tw->tw_family == AF_INET6) {
126 if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
127 ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
128 ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
129 ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
134 if (ipv4_is_loopback(tw->tw_daddr) ||
135 ipv4_is_loopback(tw->tw_rcv_saddr))
142 /* With PAWS, it is safe from the viewpoint
143 of data integrity. Even without PAWS it is safe provided sequence
144 spaces do not overlap i.e. at data rates <= 80Mbit/sec.
146 Actually, the idea is close to VJ's one, only timestamp cache is
   held not per host, but per port pair and TW bucket is used as state
   holder.

   If TW bucket has been already destroyed we fall back to VJ's scheme
   and use initial timestamp retrieved from peer table.
 */
153 if (tcptw->tw_ts_recent_stamp &&
154 (!twp || (reuse && time_after32(ktime_get_seconds(),
155 tcptw->tw_ts_recent_stamp)))) {
/* In case of repair and re-using TIME-WAIT sockets we still
 * want to be sure that it is safe as above but honor the
 * sequence numbers and time stamps set as part of the repair
 * process.
 *
 * Without this check re-using a TIME-WAIT socket with TCP
 * repair would accumulate a -1 on the repair assigned
 * sequence number. The first time it is reused the sequence
 * is -1, the second time -2, etc. This fixes that issue
 * without appearing to create any others.
 */
167 if (likely(!tp->repair)) {
168 u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
172 WRITE_ONCE(tp->write_seq, seq);
173 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
174 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
182 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
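/* Runs before tcp_v4_connect(): check the user-supplied address length,
 * then let the BPF cgroup INET4_CONNECT hook inspect or rewrite the
 * destination address.
 */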
184 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
/* This check is replicated from tcp_v4_connect() and intended to
 * prevent the BPF program called below from accessing bytes that are
 * outside the bound specified by the user in addr_len.
 */
191 if (addr_len < sizeof(struct sockaddr_in))
194 sock_owned_by_me(sk);
196 return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
199 /* This will initiate an outgoing connection. */
200 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
202 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
203 struct inet_sock *inet = inet_sk(sk);
204 struct tcp_sock *tp = tcp_sk(sk);
205 __be16 orig_sport, orig_dport;
206 __be32 daddr, nexthop;
210 struct ip_options_rcu *inet_opt;
211 struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
213 if (addr_len < sizeof(struct sockaddr_in))
216 if (usin->sin_family != AF_INET)
217 return -EAFNOSUPPORT;
219 nexthop = daddr = usin->sin_addr.s_addr;
220 inet_opt = rcu_dereference_protected(inet->inet_opt,
221 lockdep_sock_is_held(sk));
222 if (inet_opt && inet_opt->opt.srr) {
225 nexthop = inet_opt->opt.faddr;
228 orig_sport = inet->inet_sport;
229 orig_dport = usin->sin_port;
230 fl4 = &inet->cork.fl.u.ip4;
231 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
232 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
234 orig_sport, orig_dport, sk);
237 if (err == -ENETUNREACH)
238 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
242 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
247 if (!inet_opt || !inet_opt->opt.srr)
250 if (!inet->inet_saddr)
251 inet->inet_saddr = fl4->saddr;
252 sk_rcv_saddr_set(sk, inet->inet_saddr);
254 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
255 /* Reset inherited state */
256 tp->rx_opt.ts_recent = 0;
257 tp->rx_opt.ts_recent_stamp = 0;
258 if (likely(!tp->repair))
259 WRITE_ONCE(tp->write_seq, 0);
262 inet->inet_dport = usin->sin_port;
263 sk_daddr_set(sk, daddr);
265 inet_csk(sk)->icsk_ext_hdr_len = 0;
267 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
269 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
/* Socket identity is still unknown (sport may be zero).
 * However we set state to SYN-SENT and, without releasing the socket
 * lock, select a source port, enter ourselves into the hash tables and
 * complete initialization after this.
 */
276 tcp_set_state(sk, TCP_SYN_SENT);
277 err = inet_hash_connect(tcp_death_row, sk);
283 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
284 inet->inet_sport, inet->inet_dport, sk);
290 /* OK, now commit destination to socket. */
291 sk->sk_gso_type = SKB_GSO_TCPV4;
292 sk_setup_caps(sk, &rt->dst);
295 if (likely(!tp->repair)) {
297 WRITE_ONCE(tp->write_seq,
298 secure_tcp_seq(inet->inet_saddr,
302 tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
307 inet->inet_id = prandom_u32();
309 if (tcp_fastopen_defer_connect(sk, &err))
314 err = tcp_connect(sk);
323 * This unhashes the socket and releases the local port,
326 tcp_set_state(sk, TCP_CLOSE);
327 if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
328 inet_reset_saddr(sk);
330 sk->sk_route_caps = 0;
331 inet->inet_dport = 0;
334 EXPORT_SYMBOL(tcp_v4_connect);
/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 * It can be called through tcp_release_cb() if socket was owned by user
 * at the time tcp_v4_err() was called to handle ICMP message.
 */
341 void tcp_v4_mtu_reduced(struct sock *sk)
343 struct inet_sock *inet = inet_sk(sk);
344 struct dst_entry *dst;
347 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
349 mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
350 dst = inet_csk_update_pmtu(sk, mtu);
/* Something is about to be wrong... Remember soft error
 * for the case, if this connection will not be able to recover.
 */
357 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
358 sk->sk_err_soft = EMSGSIZE;
362 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
363 ip_sk_accept_pmtu(sk) &&
364 inet_csk(sk)->icsk_pmtu_cookie > mtu) {
365 tcp_sync_mss(sk, mtu);
/* Resend the TCP packet because it's
 * clear that the old packet has been
 * dropped. This is the new "fast" path mtu
 * discovery.
 */
372 tcp_simple_retransmit(sk);
373 } /* else let the usual retransmit timer handle it */
375 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
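/* Hand an ICMP redirect to the cached route, if the socket still has a
 * valid dst entry.
 */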
377 static void do_redirect(struct sk_buff *skb, struct sock *sk)
379 struct dst_entry *dst = __sk_dst_check(sk, 0);
382 dst->ops->redirect(dst, sk, skb);
386 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
387 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
389 struct request_sock *req = inet_reqsk(sk);
390 struct net *net = sock_net(sk);
/* ICMPs are not backlogged, hence we cannot get
 * an established socket here.
 */
395 if (seq != tcp_rsk(req)->snt_isn) {
396 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
/*
 * Still in SYN_RECV, just remove it silently.
 * There is no good way to pass the error to the newly
 * created socket, and POSIX does not want network
 * errors returned from accept().
 */
404 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
405 tcp_listendrop(req->rsk_listener);
409 EXPORT_SYMBOL(tcp_req_err);
411 /* TCP-LD (RFC 6069) logic */
412 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
414 struct inet_connection_sock *icsk = inet_csk(sk);
415 struct tcp_sock *tp = tcp_sk(sk);
420 if (sock_owned_by_user(sk))
423 if (seq != tp->snd_una || !icsk->icsk_retransmits ||
427 skb = tcp_rtx_queue_head(sk);
428 if (WARN_ON_ONCE(!skb))
431 icsk->icsk_backoff--;
432 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
433 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
435 tcp_mstamp_refresh(tp);
436 delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
437 remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
440 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
441 remaining, TCP_RTO_MAX);
443 /* RTO revert clocked out retransmission.
444 * Will retransmit now.
446 tcp_retransmit_timer(sk);
449 EXPORT_SYMBOL(tcp_ld_RTO_revert);
/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 */
467 int tcp_v4_err(struct sk_buff *skb, u32 info)
469 const struct iphdr *iph = (const struct iphdr *)skb->data;
470 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
472 struct inet_sock *inet;
473 const int type = icmp_hdr(skb)->type;
474 const int code = icmp_hdr(skb)->code;
476 struct request_sock *fastopen;
479 struct net *net = dev_net(skb->dev);
481 sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
482 th->dest, iph->saddr, ntohs(th->source),
485 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
488 if (sk->sk_state == TCP_TIME_WAIT) {
489 inet_twsk_put(inet_twsk(sk));
492 seq = ntohl(th->seq);
493 if (sk->sk_state == TCP_NEW_SYN_RECV) {
494 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
495 type == ICMP_TIME_EXCEEDED ||
496 (type == ICMP_DEST_UNREACH &&
497 (code == ICMP_NET_UNREACH ||
498 code == ICMP_HOST_UNREACH)));
/* If too many ICMPs get dropped on busy
 * servers this needs to be solved differently.
 * We do take care of PMTU discovery (RFC1191) special case :
 * we can receive locally generated ICMP messages while socket is held.
 */
508 if (sock_owned_by_user(sk)) {
509 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
510 __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
512 if (sk->sk_state == TCP_CLOSE)
515 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
516 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
521 /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
522 fastopen = rcu_dereference(tp->fastopen_rsk);
523 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
524 if (sk->sk_state != TCP_LISTEN &&
525 !between(seq, snd_una, tp->snd_nxt)) {
526 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
532 if (!sock_owned_by_user(sk))
533 do_redirect(skb, sk);
535 case ICMP_SOURCE_QUENCH:
536 /* Just silently ignore these. */
538 case ICMP_PARAMETERPROB:
541 case ICMP_DEST_UNREACH:
542 if (code > NR_ICMP_UNREACH)
545 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
/* We are not interested in TCP_LISTEN and open_requests
 * (SYN-ACKs sent out by Linux are always < 576 bytes so
 * they should go through unfragmented).
 */
550 if (sk->sk_state == TCP_LISTEN)
553 WRITE_ONCE(tp->mtu_info, info);
554 if (!sock_owned_by_user(sk)) {
555 tcp_v4_mtu_reduced(sk);
557 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
563 err = icmp_err_convert[code].errno;
/* check if this ICMP message allows revert of backoff.
 * (see RFC 6069)
 */
if (!fastopen &&
    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
569 tcp_ld_RTO_revert(sk, seq);
571 case ICMP_TIME_EXCEEDED:
578 switch (sk->sk_state) {
581 /* Only in fast or simultaneous open. If a fast open socket is
582 * already accepted it is treated as a connected one below.
584 if (fastopen && !fastopen->sk)
587 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
589 if (!sock_owned_by_user(sk)) {
596 sk->sk_err_soft = err;
/* If we've already connected we will keep trying
 * until we time out, or the user gives up.
 *
 * rfc1122 4.2.3.9 allows us to consider as hard errors
 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
 * but it is obsoleted by pmtu discovery).
 *
 * Note that in the modern internet, where routing is unreliable
 * and broken firewalls sit in every dark corner sending random
 * errors ordered by their masters, even these two messages have lost
 * their original sense (even Linux sends invalid PORT_UNREACHs).
 *
 * Now we are in compliance with RFCs.
 */
618 if (!sock_owned_by_user(sk) && inet->recverr) {
621 } else { /* Only an error on timeout */
622 sk->sk_err_soft = err;
631 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
633 struct tcphdr *th = tcp_hdr(skb);
635 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
636 skb->csum_start = skb_transport_header(skb) - skb->head;
637 skb->csum_offset = offsetof(struct tcphdr, check);
640 /* This routine computes an IPv4 TCP checksum. */
641 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
643 const struct inet_sock *inet = inet_sk(sk);
645 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
647 EXPORT_SYMBOL(tcp_v4_send_check);
/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *		      for reset.
 *	Answer: if a packet caused RST, it is not for a socket
 *		existing in our system, if it is matched to a socket,
 *		it is just duplicate segment or bug in other side's TCP.
 *		So we build the reply based only on parameters
 *		that arrived with the segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */
662 #ifdef CONFIG_TCP_MD5SIG
663 #define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
665 #define OPTION_BYTES sizeof(__be32)
668 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
670 const struct tcphdr *th = tcp_hdr(skb);
673 __be32 opt[OPTION_BYTES / sizeof(__be32)];
675 struct ip_reply_arg arg;
676 #ifdef CONFIG_TCP_MD5SIG
677 struct tcp_md5sig_key *key = NULL;
678 const __u8 *hash_location = NULL;
679 unsigned char newhash[16];
681 struct sock *sk1 = NULL;
683 u64 transmit_time = 0;
687 /* Never send a reset in response to a reset. */
/* If sk not NULL, it means we did a successful lookup and incoming
 * route had to be correct. prequeue might have dropped our dst.
 */
694 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
697 /* Swap the send and the receive. */
698 memset(&rep, 0, sizeof(rep));
699 rep.th.dest = th->source;
700 rep.th.source = th->dest;
701 rep.th.doff = sizeof(struct tcphdr) / 4;
705 rep.th.seq = th->ack_seq;
708 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
709 skb->len - (th->doff << 2));
712 memset(&arg, 0, sizeof(arg));
713 arg.iov[0].iov_base = (unsigned char *)&rep;
714 arg.iov[0].iov_len = sizeof(rep.th);
716 net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
717 #ifdef CONFIG_TCP_MD5SIG
719 hash_location = tcp_parse_md5sig_option(th);
720 if (sk && sk_fullsock(sk)) {
721 const union tcp_md5_addr *addr;
/* sdif set, means packet ingressed via a device
 * in an L3 domain and inet_iif is set to it.
 */
727 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
728 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
729 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
730 } else if (hash_location) {
731 const union tcp_md5_addr *addr;
732 int sdif = tcp_v4_sdif(skb);
733 int dif = inet_iif(skb);
/*
 * active side is lost. Try to find listening socket through
 * source port, and then find md5 key through listening socket.
 * We are not losing security here:
 * Incoming packet is checked with md5 hash with finding key,
 * no RST generated if md5 hash doesn't match.
 */
743 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
745 th->source, ip_hdr(skb)->daddr,
746 ntohs(th->source), dif, sdif);
747 /* don't send rst if it can't find key */
/* sdif set, means packet ingressed via a device
 * in an L3 domain and dif is set to it.
 */
754 l3index = sdif ? dif : 0;
755 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
756 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
761 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
762 if (genhash || memcmp(hash_location, newhash, 16) != 0)
rep.opt[0] = htonl((TCPOPT_NOP << 24) |
		   (TCPOPT_NOP << 16) |
		   (TCPOPT_MD5SIG << 8) |
		   TCPOLEN_MD5SIG);
772 /* Update length and the length the header thinks exists */
773 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
774 rep.th.doff = arg.iov[0].iov_len / 4;
776 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
777 key, ip_hdr(skb)->saddr,
778 ip_hdr(skb)->daddr, &rep.th);
781 /* Can't co-exist with TCPMD5, hence check rep.opt[0] */
782 if (rep.opt[0] == 0) {
783 __be32 mrst = mptcp_reset_option(skb);
787 arg.iov[0].iov_len += sizeof(mrst);
788 rep.th.doff = arg.iov[0].iov_len / 4;
792 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
793 ip_hdr(skb)->saddr, /* XXX */
794 arg.iov[0].iov_len, IPPROTO_TCP, 0);
795 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
796 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
/* When socket is gone, all binding information is lost.
 * routing might fail in this case. No choice here, if we choose to force
 * input interface, we will misroute in case of asymmetric route.
 */
803 arg.bound_dev_if = sk->sk_bound_dev_if;
805 trace_tcp_send_reset(sk, skb);
808 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
809 offsetof(struct inet_timewait_sock, tw_bound_dev_if));
811 arg.tos = ip_hdr(skb)->tos;
812 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
814 ctl_sk = this_cpu_read(ipv4_tcp_sk);
815 sock_net_set(ctl_sk, net);
817 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
818 inet_twsk(sk)->tw_mark : sk->sk_mark;
819 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
820 inet_twsk(sk)->tw_priority : sk->sk_priority;
821 transmit_time = tcp_transmit_time(sk);
822 xfrm_sk_clone_policy(ctl_sk, sk);
825 ctl_sk->sk_priority = 0;
827 ip_send_unicast_reply(ctl_sk,
828 skb, &TCP_SKB_CB(skb)->header.h4.opt,
829 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
830 &arg, arg.iov[0].iov_len,
833 xfrm_sk_free_policy(ctl_sk);
834 sock_net_set(ctl_sk, &init_net);
835 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
836 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
839 #ifdef CONFIG_TCP_MD5SIG
/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
 * outside socket context, is ugly, certainly. What can I do?
 */
849 static void tcp_v4_send_ack(const struct sock *sk,
850 struct sk_buff *skb, u32 seq, u32 ack,
851 u32 win, u32 tsval, u32 tsecr, int oif,
852 struct tcp_md5sig_key *key,
853 int reply_flags, u8 tos)
855 const struct tcphdr *th = tcp_hdr(skb);
858 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
859 #ifdef CONFIG_TCP_MD5SIG
860 + (TCPOLEN_MD5SIG_ALIGNED >> 2)
864 struct net *net = sock_net(sk);
865 struct ip_reply_arg arg;
869 memset(&rep.th, 0, sizeof(struct tcphdr));
870 memset(&arg, 0, sizeof(arg));
872 arg.iov[0].iov_base = (unsigned char *)&rep;
873 arg.iov[0].iov_len = sizeof(rep.th);
875 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
		   (TCPOPT_TIMESTAMP << 8) |
		   TCPOLEN_TIMESTAMP);
878 rep.opt[1] = htonl(tsval);
879 rep.opt[2] = htonl(tsecr);
880 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
883 /* Swap the send and the receive. */
884 rep.th.dest = th->source;
885 rep.th.source = th->dest;
886 rep.th.doff = arg.iov[0].iov_len / 4;
887 rep.th.seq = htonl(seq);
888 rep.th.ack_seq = htonl(ack);
890 rep.th.window = htons(win);
892 #ifdef CONFIG_TCP_MD5SIG
894 int offset = (tsecr) ? 3 : 0;
rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
			  (TCPOPT_NOP << 16) |
			  (TCPOPT_MD5SIG << 8) |
			  TCPOLEN_MD5SIG);
900 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
901 rep.th.doff = arg.iov[0].iov_len/4;
903 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
904 key, ip_hdr(skb)->saddr,
905 ip_hdr(skb)->daddr, &rep.th);
908 arg.flags = reply_flags;
909 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
910 ip_hdr(skb)->saddr, /* XXX */
911 arg.iov[0].iov_len, IPPROTO_TCP, 0);
912 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
914 arg.bound_dev_if = oif;
916 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
918 ctl_sk = this_cpu_read(ipv4_tcp_sk);
919 sock_net_set(ctl_sk, net);
920 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
921 inet_twsk(sk)->tw_mark : sk->sk_mark;
922 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
923 inet_twsk(sk)->tw_priority : sk->sk_priority;
924 transmit_time = tcp_transmit_time(sk);
925 ip_send_unicast_reply(ctl_sk,
926 skb, &TCP_SKB_CB(skb)->header.h4.opt,
927 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
928 &arg, arg.iov[0].iov_len,
931 sock_net_set(ctl_sk, &init_net);
932 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
936 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
938 struct inet_timewait_sock *tw = inet_twsk(sk);
939 struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
941 tcp_v4_send_ack(sk, skb,
942 tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
943 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
944 tcp_time_stamp_raw() + tcptw->tw_ts_offset,
947 tcp_twsk_md5_key(tcptw),
948 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
955 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
956 struct request_sock *req)
958 const union tcp_md5_addr *addr;
/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
 */
u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
					 tcp_rsk(req)->snt_nxt;

/* RFC 7323 2.3
 * The window field (SEG.WND) of every outgoing segment, with the
 * exception of <SYN> segments, MUST be right-shifted by
 * Rcv.Wind.Shift bits:
 */
972 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
973 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
974 tcp_v4_send_ack(sk, skb, seq,
975 tcp_rsk(req)->rcv_nxt,
976 req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
977 tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
978 READ_ONCE(req->ts_recent),
980 tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
981 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
/*
 *	Send a SYN-ACK after having received a SYN.
 *	This still operates on a request_sock only, not on a big
 *	socket.
 */
990 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
992 struct request_sock *req,
993 struct tcp_fastopen_cookie *foc,
994 enum tcp_synack_type synack_type,
995 struct sk_buff *syn_skb)
997 const struct inet_request_sock *ireq = inet_rsk(req);
1000 struct sk_buff *skb;
1003 /* First, grab a route. */
1004 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1007 skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1010 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1012 tos = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ?
1013 (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1014 (inet_sk(sk)->tos & INET_ECN_MASK) :
1017 if (!INET_ECN_is_capable(tos) &&
1018 tcp_bpf_ca_needs_ecn((struct sock *)req))
1019 tos |= INET_ECN_ECT_0;
1022 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1024 rcu_dereference(ireq->ireq_opt),
1027 err = net_xmit_eval(err);
/*
 *	IPv4 request_sock destructor.
 */
1036 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1038 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1041 #ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */
1048 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
1049 EXPORT_SYMBOL(tcp_md5_needed);
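/* Preference order used by __tcp_md5_do_lookup() when several keys match:
 * a key bound to an L3 domain (l3index != 0) beats an unbound one,
 * otherwise the key with the longer prefix wins.
 */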
1051 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1056 /* l3index always overrides non-l3index */
1057 if (old->l3index && new->l3index == 0)
1059 if (old->l3index == 0 && new->l3index)
1062 return old->prefixlen < new->prefixlen;
1065 /* Find the Key structure for an address. */
1066 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1067 const union tcp_md5_addr *addr,
1070 const struct tcp_sock *tp = tcp_sk(sk);
1071 struct tcp_md5sig_key *key;
1072 const struct tcp_md5sig_info *md5sig;
1074 struct tcp_md5sig_key *best_match = NULL;
1077 /* caller either holds rcu_read_lock() or socket lock */
1078 md5sig = rcu_dereference_check(tp->md5sig_info,
1079 lockdep_sock_is_held(sk));
1083 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1084 lockdep_sock_is_held(sk)) {
1085 if (key->family != family)
1087 if (key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index)
1089 if (family == AF_INET) {
1090 mask = inet_make_mask(key->prefixlen);
1091 match = (key->addr.a4.s_addr & mask) ==
1092 (addr->a4.s_addr & mask);
1093 #if IS_ENABLED(CONFIG_IPV6)
1094 } else if (family == AF_INET6) {
1095 match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1102 if (match && better_md5_match(best_match, key))
1107 EXPORT_SYMBOL(__tcp_md5_do_lookup);
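/* Exact-match lookup: family, ifindex flag, l3index, prefix length and
 * address must all match. Used by tcp_md5_do_add() and tcp_md5_do_del()
 * to locate the precise key configured by the user.
 */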
1109 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1110 const union tcp_md5_addr *addr,
1111 int family, u8 prefixlen,
1112 int l3index, u8 flags)
1114 const struct tcp_sock *tp = tcp_sk(sk);
1115 struct tcp_md5sig_key *key;
1116 unsigned int size = sizeof(struct in_addr);
1117 const struct tcp_md5sig_info *md5sig;
1119 /* caller either holds rcu_read_lock() or socket lock */
1120 md5sig = rcu_dereference_check(tp->md5sig_info,
1121 lockdep_sock_is_held(sk));
1124 #if IS_ENABLED(CONFIG_IPV6)
1125 if (family == AF_INET6)
1126 size = sizeof(struct in6_addr);
1128 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1129 lockdep_sock_is_held(sk)) {
1130 if (key->family != family)
1132 if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1134 if (key->l3index != l3index)
1136 if (!memcmp(&key->addr, addr, size) &&
1137 key->prefixlen == prefixlen)
1143 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1144 const struct sock *addr_sk)
1146 const union tcp_md5_addr *addr;
1149 l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1150 addr_sk->sk_bound_dev_if);
1151 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1152 return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1154 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1156 /* This can be called on a newly created socket, from other files */
1157 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1158 int family, u8 prefixlen, int l3index, u8 flags,
1159 const u8 *newkey, u8 newkeylen, gfp_t gfp)
1161 /* Add Key to the list */
1162 struct tcp_md5sig_key *key;
1163 struct tcp_sock *tp = tcp_sk(sk);
1164 struct tcp_md5sig_info *md5sig;
1166 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
/* Pre-existing entry - just update that one.
 * Note that the key might be used concurrently.
 * data_race() is telling kcsan that we do not care about
 * key mismatches, since changing MD5 key on live flows
 * can lead to packet drops.
 */
1174 data_race(memcpy(key->key, newkey, newkeylen));
/* Pairs with READ_ONCE() in tcp_md5_hash_key().
 * Also note that a reader could catch new key->keylen value
 * but old key->key[], this is the reason we use __GFP_ZERO
 * at sock_kmalloc() time below these lines.
 */
1181 WRITE_ONCE(key->keylen, newkeylen);
1186 md5sig = rcu_dereference_protected(tp->md5sig_info,
1187 lockdep_sock_is_held(sk));
1189 md5sig = kmalloc(sizeof(*md5sig), gfp);
1193 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1194 INIT_HLIST_HEAD(&md5sig->head);
1195 rcu_assign_pointer(tp->md5sig_info, md5sig);
1198 key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1201 if (!tcp_alloc_md5sig_pool()) {
1202 sock_kfree_s(sk, key, sizeof(*key));
1206 memcpy(key->key, newkey, newkeylen);
1207 key->keylen = newkeylen;
1208 key->family = family;
1209 key->prefixlen = prefixlen;
1210 key->l3index = l3index;
1212 memcpy(&key->addr, addr,
1213 (family == AF_INET6) ? sizeof(struct in6_addr) :
1214 sizeof(struct in_addr));
1215 hlist_add_head_rcu(&key->node, &md5sig->head);
1218 EXPORT_SYMBOL(tcp_md5_do_add);
1220 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1221 u8 prefixlen, int l3index, u8 flags)
1223 struct tcp_md5sig_key *key;
1225 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1228 hlist_del_rcu(&key->node);
1229 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1230 kfree_rcu(key, rcu);
1233 EXPORT_SYMBOL(tcp_md5_do_del);
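/* Drop every MD5 key attached to the socket; called from
 * tcp_v4_destroy_sock() below.
 */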
1235 static void tcp_clear_md5_list(struct sock *sk)
1237 struct tcp_sock *tp = tcp_sk(sk);
1238 struct tcp_md5sig_key *key;
1239 struct hlist_node *n;
1240 struct tcp_md5sig_info *md5sig;
1242 md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1244 hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1245 hlist_del_rcu(&key->node);
1246 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1247 kfree_rcu(key, rcu);
1251 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1252 sockptr_t optval, int optlen)
1254 struct tcp_md5sig cmd;
1255 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1256 const union tcp_md5_addr *addr;
1261 if (optlen < sizeof(cmd))
1264 if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1267 if (sin->sin_family != AF_INET)
1270 flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1272 if (optname == TCP_MD5SIG_EXT &&
1273 cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1274 prefixlen = cmd.tcpm_prefixlen;
1279 if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1280 cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1281 struct net_device *dev;
1284 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1285 if (dev && netif_is_l3_master(dev))
1286 l3index = dev->ifindex;
/* ok to reference set/not set outside of rcu;
 * right now device MUST be an L3 master
 */
1293 if (!dev || !l3index)
1297 addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1299 if (!cmd.tcpm_keylen)
1300 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1302 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1305 return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1306 cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
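/* Feed the IPv4 pseudo-header and a copy of the TCP header into the
 * per-CPU MD5 hash request, as RFC 2385 signs the pseudo-header and
 * TCP header along with the payload.
 */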
1309 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1310 __be32 daddr, __be32 saddr,
1311 const struct tcphdr *th, int nbytes)
1313 struct tcp4_pseudohdr *bp;
1314 struct scatterlist sg;
1321 bp->protocol = IPPROTO_TCP;
1322 bp->len = cpu_to_be16(nbytes);
1324 _th = (struct tcphdr *)(bp + 1);
1325 memcpy(_th, th, sizeof(*th));
1328 sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1329 ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1330 sizeof(*bp) + sizeof(*th));
1331 return crypto_ahash_update(hp->md5_req);
1334 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1335 __be32 daddr, __be32 saddr, const struct tcphdr *th)
1337 struct tcp_md5sig_pool *hp;
1338 struct ahash_request *req;
1340 hp = tcp_get_md5sig_pool();
1342 goto clear_hash_noput;
1345 if (crypto_ahash_init(req))
1347 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1349 if (tcp_md5_hash_key(hp, key))
1351 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1352 if (crypto_ahash_final(req))
1355 tcp_put_md5sig_pool();
1359 tcp_put_md5sig_pool();
1361 memset(md5_hash, 0, 16);
1365 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1366 const struct sock *sk,
1367 const struct sk_buff *skb)
1369 struct tcp_md5sig_pool *hp;
1370 struct ahash_request *req;
1371 const struct tcphdr *th = tcp_hdr(skb);
1372 __be32 saddr, daddr;
1374 if (sk) { /* valid for establish/request sockets */
1375 saddr = sk->sk_rcv_saddr;
1376 daddr = sk->sk_daddr;
1378 const struct iphdr *iph = ip_hdr(skb);
1383 hp = tcp_get_md5sig_pool();
1385 goto clear_hash_noput;
1388 if (crypto_ahash_init(req))
1391 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1393 if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1395 if (tcp_md5_hash_key(hp, key))
1397 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1398 if (crypto_ahash_final(req))
1401 tcp_put_md5sig_pool();
1405 tcp_put_md5sig_pool();
1407 memset(md5_hash, 0, 16);
1410 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1414 /* Called with rcu_read_lock() */
1415 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1416 const struct sk_buff *skb,
1419 #ifdef CONFIG_TCP_MD5SIG
/*
 * This gets called for each TCP segment that arrives
 * so we want to be efficient.
 * We have 3 drop cases:
 * o No MD5 hash and one expected.
 * o MD5 hash and we're not expecting one.
 * o MD5 hash and it's wrong.
 */
1428 const __u8 *hash_location = NULL;
1429 struct tcp_md5sig_key *hash_expected;
1430 const struct iphdr *iph = ip_hdr(skb);
1431 const struct tcphdr *th = tcp_hdr(skb);
1432 const union tcp_md5_addr *addr;
1433 unsigned char newhash[16];
1434 int genhash, l3index;
/* sdif set, means packet ingressed via a device
 * in an L3 domain and dif is set to the l3mdev
 */
1439 l3index = sdif ? dif : 0;
1441 addr = (union tcp_md5_addr *)&iph->saddr;
1442 hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1443 hash_location = tcp_parse_md5sig_option(th);
1445 /* We've parsed the options - do we have a hash? */
1446 if (!hash_expected && !hash_location)
1449 if (hash_expected && !hash_location) {
1450 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1454 if (!hash_expected && hash_location) {
1455 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
/* Okay, so this is hash_expected and hash_location -
 * so we need to calculate the checksum.
 */
1462 genhash = tcp_v4_md5_hash_skb(newhash,
1466 if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1467 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1468 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n",
1469 &iph->saddr, ntohs(th->source),
1470 &iph->daddr, ntohs(th->dest),
1471 genhash ? " tcp_v4_calc_md5_hash failed"
1480 static void tcp_v4_init_req(struct request_sock *req,
1481 const struct sock *sk_listener,
1482 struct sk_buff *skb)
1484 struct inet_request_sock *ireq = inet_rsk(req);
1485 struct net *net = sock_net(sk_listener);
1487 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1488 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1489 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1492 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1493 struct sk_buff *skb,
1495 struct request_sock *req)
1497 tcp_v4_init_req(req, sk, skb);
1499 if (security_inet_conn_request(sk, skb, req))
1502 return inet_csk_route_req(sk, &fl->u.ip4, req);
1505 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1507 .obj_size = sizeof(struct tcp_request_sock),
1508 .rtx_syn_ack = tcp_rtx_synack,
1509 .send_ack = tcp_v4_reqsk_send_ack,
1510 .destructor = tcp_v4_reqsk_destructor,
1511 .send_reset = tcp_v4_send_reset,
1512 .syn_ack_timeout = tcp_syn_ack_timeout,
1515 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1516 .mss_clamp = TCP_MSS_DEFAULT,
1517 #ifdef CONFIG_TCP_MD5SIG
1518 .req_md5_lookup = tcp_v4_md5_lookup,
1519 .calc_md5_hash = tcp_v4_md5_hash_skb,
1521 #ifdef CONFIG_SYN_COOKIES
1522 .cookie_init_seq = cookie_v4_init_sequence,
1524 .route_req = tcp_v4_route_req,
1525 .init_seq = tcp_v4_init_seq,
1526 .init_ts_off = tcp_v4_init_ts_off,
1527 .send_synack = tcp_v4_send_synack,
1530 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
/* Never answer to SYNs sent to broadcast or multicast */
1533 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1536 return tcp_conn_request(&tcp_request_sock_ops,
1537 &tcp_request_sock_ipv4_ops, sk, skb);
1543 EXPORT_SYMBOL(tcp_v4_conn_request);
/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 */
1550 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1551 struct request_sock *req,
1552 struct dst_entry *dst,
1553 struct request_sock *req_unhash,
1556 struct inet_request_sock *ireq;
1557 bool found_dup_sk = false;
1558 struct inet_sock *newinet;
1559 struct tcp_sock *newtp;
1561 #ifdef CONFIG_TCP_MD5SIG
1562 const union tcp_md5_addr *addr;
1563 struct tcp_md5sig_key *key;
1566 struct ip_options_rcu *inet_opt;
1568 if (sk_acceptq_is_full(sk))
1571 newsk = tcp_create_openreq_child(sk, req, skb);
1575 newsk->sk_gso_type = SKB_GSO_TCPV4;
1576 inet_sk_rx_dst_set(newsk, skb);
1578 newtp = tcp_sk(newsk);
1579 newinet = inet_sk(newsk);
1580 ireq = inet_rsk(req);
1581 sk_daddr_set(newsk, ireq->ir_rmt_addr);
1582 sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1583 newsk->sk_bound_dev_if = ireq->ir_iif;
1584 newinet->inet_saddr = ireq->ir_loc_addr;
1585 inet_opt = rcu_dereference(ireq->ireq_opt);
1586 RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1587 newinet->mc_index = inet_iif(skb);
1588 newinet->mc_ttl = ip_hdr(skb)->ttl;
1589 newinet->rcv_tos = ip_hdr(skb)->tos;
1590 inet_csk(newsk)->icsk_ext_hdr_len = 0;
1592 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1593 newinet->inet_id = prandom_u32();
/* Set ToS of the new socket based upon the value of incoming SYN.
 * ECT bits are set later in tcp_init_transfer().
 */
1598 if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1599 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1602 dst = inet_csk_route_child_sock(sk, newsk, req);
1606 /* syncookie case : see end of cookie_v4_check() */
1608 sk_setup_caps(newsk, dst);
1610 tcp_ca_openreq_child(newsk, dst);
1612 tcp_sync_mss(newsk, dst_mtu(dst));
1613 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1615 tcp_initialize_rcv_mss(newsk);
1617 #ifdef CONFIG_TCP_MD5SIG
1618 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1619 /* Copy over the MD5 key from the original socket */
1620 addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1621 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
/*
 * We're using one, so create a matching key
 * on the newsk structure. If we fail to get
 * memory, then we end up not copying the key
 * across. Shucks.
 */
1629 tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index, key->flags,
1630 key->key, key->keylen, GFP_ATOMIC);
1631 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1635 if (__inet_inherit_port(sk, newsk) < 0)
1637 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1639 if (likely(*own_req)) {
1640 tcp_move_syn(newtp, req);
1641 ireq->ireq_opt = NULL;
1643 newinet->inet_opt = NULL;
1645 if (!req_unhash && found_dup_sk) {
/* This code path should only be executed in the
 * syncookie case.
 */
1649 bh_unlock_sock(newsk);
1657 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1664 newinet->inet_opt = NULL;
1665 inet_csk_prepare_forced_close(newsk);
1669 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
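/* With CONFIG_SYN_COOKIES, an ACK that matches no request sock may still
 * carry a valid syncookie; cookie_v4_check() validates it and, if it is
 * good, creates the child socket directly.
 */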
1671 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1673 #ifdef CONFIG_SYN_COOKIES
1674 const struct tcphdr *th = tcp_hdr(skb);
1677 sk = cookie_v4_check(sk, skb);
1682 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1683 struct tcphdr *th, u32 *cookie)
1686 #ifdef CONFIG_SYN_COOKIES
1687 mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1688 &tcp_request_sock_ipv4_ops, sk, th);
1690 *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1691 tcp_synq_overflow(sk);
1697 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
/* The socket must have its spinlock held when we get
 * here, unless it is a TCP_LISTEN socket.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
1707 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1711 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1712 struct dst_entry *dst;
1714 dst = rcu_dereference_protected(sk->sk_rx_dst,
1715 lockdep_sock_is_held(sk));
1717 sock_rps_save_rxhash(sk, skb);
1718 sk_mark_napi_id(sk, skb);
1720 if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1721 !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1723 RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1727 tcp_rcv_established(sk, skb);
1731 if (tcp_checksum_complete(skb))
1734 if (sk->sk_state == TCP_LISTEN) {
1735 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1740 if (tcp_child_process(sk, nsk, skb)) {
1747 sock_rps_save_rxhash(sk, skb);
1749 if (tcp_rcv_state_process(sk, skb)) {
1756 tcp_v4_send_reset(rsk, skb);
/* Be careful here. If this function gets more complicated and
 * gcc suffers from register pressure on the x86, sk (in %ebx)
 * might be destroyed here. This current version compiles correctly,
 * but you have been warned.
 */
1767 trace_tcp_bad_csum(skb);
1768 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1769 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1772 EXPORT_SYMBOL(tcp_v4_do_rcv);
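/* Early demux: at IP receive time, look up an established socket by the
 * packet's 4-tuple and, if found, steer the skb to it and reuse the
 * socket's cached rx dst to skip a routing lookup.
 */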
1774 int tcp_v4_early_demux(struct sk_buff *skb)
1776 const struct iphdr *iph;
1777 const struct tcphdr *th;
1780 if (skb->pkt_type != PACKET_HOST)
1783 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1789 if (th->doff < sizeof(struct tcphdr) / 4)
1792 sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1793 iph->saddr, th->source,
1794 iph->daddr, ntohs(th->dest),
1795 skb->skb_iif, inet_sdif(skb));
1798 skb->destructor = sock_edemux;
1799 if (sk_fullsock(sk)) {
1800 struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1803 dst = dst_check(dst, 0);
1805 sk->sk_rx_dst_ifindex == skb->skb_iif)
1806 skb_dst_set_noref(skb, dst);
1812 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1814 u32 limit, tail_gso_size, tail_gso_segs;
1815 struct skb_shared_info *shinfo;
1816 const struct tcphdr *th;
1817 struct tcphdr *thtail;
1818 struct sk_buff *tail;
1819 unsigned int hdrlen;
/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
 * we can fix skb->truesize to its real value to avoid future drops.
 * This is valid because skb is not yet charged to the socket.
 * It has been noticed pure SACK packets were sometimes dropped
 * (if cooked by drivers without copybreak feature).
 */
1835 if (unlikely(tcp_checksum_complete(skb))) {
1837 trace_tcp_bad_csum(skb);
1838 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1839 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
/* Attempt coalescing to last skb in backlog, even if we are
 * above the limits.
 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
 */
1847 th = (const struct tcphdr *)skb->data;
1848 hdrlen = th->doff * 4;
1850 tail = sk->sk_backlog.tail;
1853 thtail = (struct tcphdr *)tail->data;
1855 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1856 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1857 ((TCP_SKB_CB(tail)->tcp_flags |
1858 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1859 !((TCP_SKB_CB(tail)->tcp_flags &
1860 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1861 ((TCP_SKB_CB(tail)->tcp_flags ^
1862 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1863 #ifdef CONFIG_TLS_DEVICE
1864 tail->decrypted != skb->decrypted ||
1866 !mptcp_skb_can_collapse(tail, skb) ||
1867 thtail->doff != th->doff ||
1868 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1871 __skb_pull(skb, hdrlen);
1873 shinfo = skb_shinfo(skb);
1874 gso_size = shinfo->gso_size ?: skb->len;
1875 gso_segs = shinfo->gso_segs ?: 1;
1877 shinfo = skb_shinfo(tail);
1878 tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1879 tail_gso_segs = shinfo->gso_segs ?: 1;
1881 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1882 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1884 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1885 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1886 thtail->window = th->window;
1889 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1890 * thtail->fin, so that the fast path in tcp_rcv_established()
1891 * is not entered if we append a packet with a FIN.
1892 * SYN, RST, URG are not present.
1893 * ACK is set on both packets.
1894 * PSH : we do not really care in TCP stack,
 * at least for 'GRO' packets.
 */
1897 thtail->fin |= th->fin;
1898 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1900 if (TCP_SKB_CB(skb)->has_rxtstamp) {
1901 TCP_SKB_CB(tail)->has_rxtstamp = true;
1902 tail->tstamp = skb->tstamp;
1903 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1906 /* Not as strict as GRO. We only need to carry mss max value */
1907 shinfo->gso_size = max(gso_size, tail_gso_size);
1908 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
1910 sk->sk_backlog.len += delta;
1911 __NET_INC_STATS(sock_net(sk),
1912 LINUX_MIB_TCPBACKLOGCOALESCE);
1913 kfree_skb_partial(skb, fragstolen);
1916 __skb_push(skb, hdrlen);
1919 limit = (u32)READ_ONCE(sk->sk_rcvbuf) + (u32)(READ_ONCE(sk->sk_sndbuf) >> 1);
1921 /* Only socket owner can try to collapse/prune rx queues
1922 * to reduce memory overhead, so add a little headroom here.
 * Only a few socket backlogs are likely to be non-empty concurrently.
 */
1927 if (unlikely(sk_add_backlog(sk, skb, limit))) {
1929 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1934 EXPORT_SYMBOL(tcp_add_backlog);
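/* Run the attached socket filter, never trimming the skb below the TCP
 * header length so later header accesses remain valid.
 */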
1936 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1938 struct tcphdr *th = (struct tcphdr *)skb->data;
1940 return sk_filter_trim_cap(sk, skb, th->doff * 4);
1942 EXPORT_SYMBOL(tcp_filter);
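/* Undo tcp_v4_fill_cb(): move the IP control block back to its usual
 * place before the skb is handed to another socket or layer.
 */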
1944 static void tcp_v4_restore_cb(struct sk_buff *skb)
1946 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1947 sizeof(struct inet_skb_parm));
1950 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1951 const struct tcphdr *th)
/* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
 * barrier() makes sure compiler won't play fool^Waliasing games.
 */
1956 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1957 sizeof(struct inet_skb_parm));
1960 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1961 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1962 skb->len - th->doff * 4);
1963 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1964 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1965 TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1966 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1967 TCP_SKB_CB(skb)->sacked = 0;
1968 TCP_SKB_CB(skb)->has_rxtstamp =
1969 skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1976 int tcp_v4_rcv(struct sk_buff *skb)
1978 struct net *net = dev_net(skb->dev);
1979 struct sk_buff *skb_to_free;
1980 int sdif = inet_sdif(skb);
1981 int dif = inet_iif(skb);
1982 const struct iphdr *iph;
1983 const struct tcphdr *th;
1989 drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
1990 if (skb->pkt_type != PACKET_HOST)
1993 /* Count it even if it's bad */
1994 __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1996 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1999 th = (const struct tcphdr *)skb->data;
2001 if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
2002 drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
2005 if (!pskb_may_pull(skb, th->doff * 4))
2008 /* An explanation is required here, I think.
2009 * Packet length and doff are validated by header prediction,
2010 * provided case of th->doff==0 is eliminated.
2011 * So, we defer the checks. */
2013 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
2016 th = (const struct tcphdr *)skb->data;
2019 sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
2020 th->dest, sdif, &refcounted);
2025 if (sk->sk_state == TCP_TIME_WAIT)
2028 if (sk->sk_state == TCP_NEW_SYN_RECV) {
2029 struct request_sock *req = inet_reqsk(sk);
2030 bool req_stolen = false;
2033 sk = req->rsk_listener;
2034 if (unlikely(!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb) ||
2035 tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) {
2036 sk_drops_add(sk, skb);
2040 if (tcp_checksum_complete(skb)) {
2044 if (unlikely(sk->sk_state != TCP_LISTEN)) {
2045 nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
2047 inet_csk_reqsk_queue_drop_and_put(sk, req);
/* reuseport_migrate_sock() has already held one sk_refcnt
 * before returning.
 */
/* We own a reference on the listener, increase it again
 * as we might lose it too soon.
 */
2062 if (!tcp_filter(sk, skb)) {
2063 th = (const struct tcphdr *)skb->data;
2065 tcp_v4_fill_cb(skb, iph, th);
2066 nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
/* Another cpu got exclusive access to req
 * and created a full blown socket.
 * Try to feed this packet to this socket
 * instead of discarding it.
 */
2076 tcp_v4_restore_cb(skb);
2080 goto discard_and_relse;
2085 tcp_v4_restore_cb(skb);
2086 } else if (tcp_child_process(sk, nsk, skb)) {
2087 tcp_v4_send_reset(nsk, skb);
2088 goto discard_and_relse;
2094 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
2095 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2096 goto discard_and_relse;
2099 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2100 goto discard_and_relse;
2102 if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))
2103 goto discard_and_relse;
2107 if (tcp_filter(sk, skb)) {
2108 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2109 goto discard_and_relse;
2111 th = (const struct tcphdr *)skb->data;
2113 tcp_v4_fill_cb(skb, iph, th);
2117 if (sk->sk_state == TCP_LISTEN) {
2118 ret = tcp_v4_do_rcv(sk, skb);
2119 goto put_and_return;
2122 sk_incoming_cpu_update(sk);
2124 bh_lock_sock_nested(sk);
2125 tcp_segs_in(tcp_sk(sk), skb);
2127 if (!sock_owned_by_user(sk)) {
2128 skb_to_free = sk->sk_rx_skb_cache;
2129 sk->sk_rx_skb_cache = NULL;
2130 ret = tcp_v4_do_rcv(sk, skb);
2132 if (tcp_add_backlog(sk, skb))
2133 goto discard_and_relse;
2138 __kfree_skb(skb_to_free);
2147 drop_reason = SKB_DROP_REASON_NO_SOCKET;
2148 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2151 tcp_v4_fill_cb(skb, iph, th);
2153 if (tcp_checksum_complete(skb)) {
2155 drop_reason = SKB_DROP_REASON_TCP_CSUM;
2156 trace_tcp_bad_csum(skb);
2157 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2159 __TCP_INC_STATS(net, TCP_MIB_INERRS);
2161 tcp_v4_send_reset(NULL, skb);
2165 /* Discard frame. */
2166 kfree_skb_reason(skb, drop_reason);
2170 sk_drops_add(sk, skb);
2176 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2177 inet_twsk_put(inet_twsk(sk));
2181 tcp_v4_fill_cb(skb, iph, th);
2183 if (tcp_checksum_complete(skb)) {
2184 inet_twsk_put(inet_twsk(sk));
2187 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2189 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2192 iph->saddr, th->source,
2193 iph->daddr, th->dest,
2197 inet_twsk_deschedule_put(inet_twsk(sk));
2199 tcp_v4_restore_cb(skb);
2207 tcp_v4_timewait_ack(sk, skb);
2210 tcp_v4_send_reset(sk, skb);
2211 inet_twsk_deschedule_put(inet_twsk(sk));
2213 case TCP_TW_SUCCESS:;
2218 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2219 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
2220 .twsk_unique = tcp_twsk_unique,
2221 .twsk_destructor= tcp_twsk_destructor,
2224 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2226 struct dst_entry *dst = skb_dst(skb);
2228 if (dst && dst_hold_safe(dst)) {
2229 rcu_assign_pointer(sk->sk_rx_dst, dst);
2230 sk->sk_rx_dst_ifindex = skb->skb_iif;
2233 EXPORT_SYMBOL(inet_sk_rx_dst_set);
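/* Address-family operations used by IPv4 TCP sockets; the protocol-
 * independent TCP code dispatches transmit, header and connection-setup
 * work through this table.
 */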
2235 const struct inet_connection_sock_af_ops ipv4_specific = {
2236 .queue_xmit = ip_queue_xmit,
2237 .send_check = tcp_v4_send_check,
2238 .rebuild_header = inet_sk_rebuild_header,
2239 .sk_rx_dst_set = inet_sk_rx_dst_set,
2240 .conn_request = tcp_v4_conn_request,
2241 .syn_recv_sock = tcp_v4_syn_recv_sock,
2242 .net_header_len = sizeof(struct iphdr),
2243 .setsockopt = ip_setsockopt,
2244 .getsockopt = ip_getsockopt,
2245 .addr2sockaddr = inet_csk_addr2sockaddr,
2246 .sockaddr_len = sizeof(struct sockaddr_in),
2247 .mtu_reduced = tcp_v4_mtu_reduced,
2249 EXPORT_SYMBOL(ipv4_specific);
2251 #ifdef CONFIG_TCP_MD5SIG
2252 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2253 .md5_lookup = tcp_v4_md5_lookup,
2254 .calc_md5_hash = tcp_v4_md5_hash_skb,
2255 .md5_parse = tcp_v4_parse_md5_keys,
/* NOTE: A lot of things set to zero explicitly by call to
 * sk_alloc() so need not be done here.
 */
2262 static int tcp_v4_init_sock(struct sock *sk)
2264 struct inet_connection_sock *icsk = inet_csk(sk);
2268 icsk->icsk_af_ops = &ipv4_specific;
2270 #ifdef CONFIG_TCP_MD5SIG
2271 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2277 void tcp_v4_destroy_sock(struct sock *sk)
2279 struct tcp_sock *tp = tcp_sk(sk);
2281 trace_tcp_destroy_sock(sk);
2283 tcp_clear_xmit_timers(sk);
2285 tcp_cleanup_congestion_control(sk);
2287 tcp_cleanup_ulp(sk);
/* Clean up the write buffer. */
2290 tcp_write_queue_purge(sk);
2292 /* Check if we want to disable active TFO */
2293 tcp_fastopen_active_disable_ofo_check(sk);
2295 /* Cleans up our, hopefully empty, out_of_order_queue. */
2296 skb_rbtree_purge(&tp->out_of_order_queue);
2298 #ifdef CONFIG_TCP_MD5SIG
2299 /* Clean up the MD5 key list, if any */
2300 if (tp->md5sig_info) {
2301 tcp_clear_md5_list(sk);
2302 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2303 tp->md5sig_info = NULL;
2307 /* Clean up a referenced TCP bind bucket. */
2308 if (inet_csk(sk)->icsk_bind_hash)
2311 BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2313 /* If socket is aborted during connect operation */
2314 tcp_free_fastopen_req(tp);
2315 tcp_fastopen_destroy_cipher(sk);
2316 tcp_saved_syn_free(tp);
2318 sk_sockets_allocated_dec(sk);
2320 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2322 #ifdef CONFIG_PROC_FS
2323 /* Proc filesystem TCP sock list dumping. */
2325 static unsigned short seq_file_family(const struct seq_file *seq);
2327 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2329 unsigned short family = seq_file_family(seq);
2331 /* AF_UNSPEC is used as a match all */
2332 return ((family == AF_UNSPEC || family == sk->sk_family) &&
2333 net_eq(sock_net(sk), seq_file_net(seq)));
/* Find a non-empty bucket (starting from st->bucket)
 * and return the first sk from it.
 */
2339 static void *listening_get_first(struct seq_file *seq)
2341 struct tcp_iter_state *st = seq->private;
2344 for (; st->bucket <= tcp_hashinfo.lhash2_mask; st->bucket++) {
2345 struct inet_listen_hashbucket *ilb2;
2346 struct inet_connection_sock *icsk;
2349 ilb2 = &tcp_hashinfo.lhash2[st->bucket];
2350 if (hlist_empty(&ilb2->head))
2353 spin_lock(&ilb2->lock);
2354 inet_lhash2_for_each_icsk(icsk, &ilb2->head) {
2355 sk = (struct sock *)icsk;
2356 if (seq_sk_match(seq, sk))
2359 spin_unlock(&ilb2->lock);
2365 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2366 * If "cur" is the last one in the st->bucket,
2367 * call listening_get_first() to return the first sk of the next non-empty bucket.
2370 static void *listening_get_next(struct seq_file *seq, void *cur)
2372 struct tcp_iter_state *st = seq->private;
2373 struct inet_listen_hashbucket *ilb2;
2374 struct inet_connection_sock *icsk;
2375 struct sock *sk = cur;
2380 icsk = inet_csk(sk);
2381 inet_lhash2_for_each_icsk_continue(icsk) {
2382 sk = (struct sock *)icsk;
2383 if (seq_sk_match(seq, sk))
2387 ilb2 = &tcp_hashinfo.lhash2[st->bucket];
2388 spin_unlock(&ilb2->lock);
2390 return listening_get_first(seq);
2393 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2395 struct tcp_iter_state *st = seq->private;
2400 rc = listening_get_first(seq);
2402 while (rc && *pos) {
2403 rc = listening_get_next(seq, rc);
2409 static inline bool empty_bucket(const struct tcp_iter_state *st)
2411 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2415 * Get first established socket starting from bucket given in st->bucket.
2416 * If st->bucket is zero, the very first socket in the hash is returned.
2418 static void *established_get_first(struct seq_file *seq)
2420 struct tcp_iter_state *st = seq->private;
2423 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2425 struct hlist_nulls_node *node;
2426 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2428 /* Lockless fast path for the common case of empty buckets */
2429 if (empty_bucket(st))
2433 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2434 if (seq_sk_match(seq, sk))
2437 spin_unlock_bh(lock);
2443 static void *established_get_next(struct seq_file *seq, void *cur)
2445 struct sock *sk = cur;
2446 struct hlist_nulls_node *node;
2447 struct tcp_iter_state *st = seq->private;
2452 sk = sk_nulls_next(sk);
2454 sk_nulls_for_each_from(sk, node) {
2455 if (seq_sk_match(seq, sk))
2459 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2461 return established_get_first(seq);
2464 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2466 struct tcp_iter_state *st = seq->private;
2470 rc = established_get_first(seq);
2473 rc = established_get_next(seq, rc);
2479 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2482 struct tcp_iter_state *st = seq->private;
2484 st->state = TCP_SEQ_STATE_LISTENING;
2485 rc = listening_get_idx(seq, &pos);
2488 st->state = TCP_SEQ_STATE_ESTABLISHED;
2489 rc = established_get_idx(seq, pos);
2495 static void *tcp_seek_last_pos(struct seq_file *seq)
2497 struct tcp_iter_state *st = seq->private;
2498 int bucket = st->bucket;
2499 int offset = st->offset;
2500 int orig_num = st->num;
2503 switch (st->state) {
2504 case TCP_SEQ_STATE_LISTENING:
2505 if (st->bucket > tcp_hashinfo.lhash2_mask)
2507 st->state = TCP_SEQ_STATE_LISTENING;
2508 rc = listening_get_first(seq);
2509 while (offset-- && rc && bucket == st->bucket)
2510 rc = listening_get_next(seq, rc);
2514 st->state = TCP_SEQ_STATE_ESTABLISHED;
2516 case TCP_SEQ_STATE_ESTABLISHED:
2517 if (st->bucket > tcp_hashinfo.ehash_mask)
2519 rc = established_get_first(seq);
2520 while (offset-- && rc && bucket == st->bucket)
2521 rc = established_get_next(seq, rc);
2529 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2531 struct tcp_iter_state *st = seq->private;
2534 if (*pos && *pos == st->last_pos) {
2535 rc = tcp_seek_last_pos(seq);
2540 st->state = TCP_SEQ_STATE_LISTENING;
2544 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2547 st->last_pos = *pos;
2550 EXPORT_SYMBOL(tcp_seq_start);
2552 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2554 struct tcp_iter_state *st = seq->private;
2557 if (v == SEQ_START_TOKEN) {
2558 rc = tcp_get_idx(seq, 0);
2562 switch (st->state) {
2563 case TCP_SEQ_STATE_LISTENING:
2564 rc = listening_get_next(seq, v);
2566 st->state = TCP_SEQ_STATE_ESTABLISHED;
2569 rc = established_get_first(seq);
2572 case TCP_SEQ_STATE_ESTABLISHED:
2573 rc = established_get_next(seq, v);
2578 st->last_pos = *pos;
2581 EXPORT_SYMBOL(tcp_seq_next);
2583 void tcp_seq_stop(struct seq_file *seq, void *v)
2585 struct tcp_iter_state *st = seq->private;
2587 switch (st->state) {
2588 case TCP_SEQ_STATE_LISTENING:
2589 if (v != SEQ_START_TOKEN)
2590 spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
2592 case TCP_SEQ_STATE_ESTABLISHED:
2594 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2598 EXPORT_SYMBOL(tcp_seq_stop);
2600 static void get_openreq4(const struct request_sock *req,
2601 struct seq_file *f, int i)
2603 const struct inet_request_sock *ireq = inet_rsk(req);
2604 long delta = req->rsk_timer.expires - jiffies;
2606 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2607 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2612 ntohs(ireq->ir_rmt_port),
2614 0, 0, /* could print option size, but that is af dependent. */
2615 1, /* timers active (only the expire timer) */
2616 jiffies_delta_to_clock_t(delta),
2618 from_kuid_munged(seq_user_ns(f),
2619 sock_i_uid(req->rsk_listener)),
2620 0, /* non standard timer */
2621 0, /* open_requests have no inode */
2626 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2629 unsigned long timer_expires;
2630 const struct tcp_sock *tp = tcp_sk(sk);
2631 const struct inet_connection_sock *icsk = inet_csk(sk);
2632 const struct inet_sock *inet = inet_sk(sk);
2633 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2634 __be32 dest = inet->inet_daddr;
2635 __be32 src = inet->inet_rcv_saddr;
2636 __u16 destp = ntohs(inet->inet_dport);
2637 __u16 srcp = ntohs(inet->inet_sport);
2641 if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2642 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2643 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2645 timer_expires = icsk->icsk_timeout;
2646 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2648 timer_expires = icsk->icsk_timeout;
2649 } else if (timer_pending(&sk->sk_timer)) {
2651 timer_expires = sk->sk_timer.expires;
2654 timer_expires = jiffies;
2657 state = inet_sk_state_load(sk);
2658 if (state == TCP_LISTEN)
2659 rx_queue = READ_ONCE(sk->sk_ack_backlog);
2661 /* Because we don't lock the socket,
2662 * we might find a transient negative value.
2664 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2665 READ_ONCE(tp->copied_seq), 0);
2667 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2668 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2669 i, src, srcp, dest, destp, state,
2670 READ_ONCE(tp->write_seq) - tp->snd_una,
2673 jiffies_delta_to_clock_t(timer_expires - jiffies),
2674 icsk->icsk_retransmits,
2675 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2676 icsk->icsk_probes_out,
2678 refcount_read(&sk->sk_refcnt), sk,
2679 jiffies_to_clock_t(icsk->icsk_rto),
2680 jiffies_to_clock_t(icsk->icsk_ack.ato),
2681 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2683 state == TCP_LISTEN ?
2684 fastopenq->max_qlen :
2685 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2688 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2689 struct seq_file *f, int i)
2691 long delta = tw->tw_timer.expires - jiffies;
2695 dest = tw->tw_daddr;
2696 src = tw->tw_rcv_saddr;
2697 destp = ntohs(tw->tw_dport);
2698 srcp = ntohs(tw->tw_sport);
2700 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2701 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2702 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2703 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2704 refcount_read(&tw->tw_refcnt), tw);
2709 static int tcp4_seq_show(struct seq_file *seq, void *v)
2711 struct tcp_iter_state *st;
2712 struct sock *sk = v;
2714 seq_setwidth(seq, TMPSZ - 1);
2715 if (v == SEQ_START_TOKEN) {
2716 seq_puts(seq, " sl local_address rem_address st tx_queue "
2717 "rx_queue tr tm->when retrnsmt uid timeout "
2723 if (sk->sk_state == TCP_TIME_WAIT)
2724 get_timewait4_sock(v, seq, st->num);
2725 else if (sk->sk_state == TCP_NEW_SYN_RECV)
2726 get_openreq4(v, seq, st->num);
2728 get_tcp4_sock(v, seq, st->num);
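/* For illustration only (values are made up, not captured output): with the
 * show path above, a socket listening on 127.0.0.1:3306 would produce a
 * /proc/net/tcp line roughly like
 *
 *    0: 0100007F:0CEA 00000000:0000 0A 00000000:00000000 00:00000000 00000000  1000        0 12345 1 0000000000000000 100 0 0 10 0
 *
 * Addresses are the raw network-byte-order words printed as host integers
 * (hence 0100007F for 127.0.0.1 on little-endian), ports are hex (0CEA is
 * 3306), and "0A" is TCP_LISTEN. The fields after the inode come from the
 * tail of get_tcp4_sock(): refcount, opaque sock pointer, rto, ato,
 * quick/pingpong, snd_cwnd and the ssthresh/fastopen column.
 */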
2734 #ifdef CONFIG_BPF_SYSCALL
2735 struct bpf_tcp_iter_state {
2736 struct tcp_iter_state state;
2737 unsigned int cur_sk;
2738 unsigned int end_sk;
2739 unsigned int max_sk;
2740 struct sock **batch;
2741 bool st_bucket_done;
2744 struct bpf_iter__tcp {
2745 __bpf_md_ptr(struct bpf_iter_meta *, meta);
2746 __bpf_md_ptr(struct sock_common *, sk_common);
2747 uid_t uid __aligned(8);
2750 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2751 struct sock_common *sk_common, uid_t uid)
2753 struct bpf_iter__tcp ctx;
2755 meta->seq_num--; /* skip SEQ_START_TOKEN */
2757 ctx.sk_common = sk_common;
2759 return bpf_iter_run_prog(prog, &ctx);
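/* A minimal sketch (not part of this file) of a BPF program that could be
 * attached to this iterator; names, includes and the output format are
 * illustrative only, assuming a libbpf build with a generated vmlinux.h:
 *
 *	#include "vmlinux.h"
 *	#include <bpf/bpf_helpers.h>
 *
 *	char _license[] SEC("license") = "GPL";
 *
 *	SEC("iter/tcp")
 *	int dump_tcp(struct bpf_iter__tcp *ctx)
 *	{
 *		struct sock_common *sk_common = ctx->sk_common;
 *		struct seq_file *seq = ctx->meta->seq;
 *		char fmt[] = "family=%d uid=%u\n";
 *		__u64 args[2];
 *
 *		if (!sk_common)
 *			return 0;
 *
 *		args[0] = sk_common->skc_family;
 *		args[1] = ctx->uid;
 *		bpf_seq_printf(seq, fmt, sizeof(fmt), args, sizeof(args));
 *		return 0;
 *	}
 *
 * Once loaded and pinned (e.g. "bpftool iter pin ./dump_tcp.bpf.o
 * /sys/fs/bpf/tcp_dump" followed by "cat /sys/fs/bpf/tcp_dump"), the seq_file
 * machinery below drives bpf_iter_tcp_seq_show() for every matching socket.
 */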
2762 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
2764 while (iter->cur_sk < iter->end_sk)
2765 sock_gen_put(iter->batch[iter->cur_sk++]);
2768 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
2769 unsigned int new_batch_sz)
2771 struct sock **new_batch;
2773 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
2774 GFP_USER | __GFP_NOWARN);
2778 bpf_iter_tcp_put_batch(iter);
2779 kvfree(iter->batch);
2780 iter->batch = new_batch;
2781 iter->max_sk = new_batch_sz;
2786 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
2787 struct sock *start_sk)
2789 struct bpf_tcp_iter_state *iter = seq->private;
2790 struct tcp_iter_state *st = &iter->state;
2791 struct inet_connection_sock *icsk;
2792 unsigned int expected = 1;
2795 sock_hold(start_sk);
2796 iter->batch[iter->end_sk++] = start_sk;
2798 icsk = inet_csk(start_sk);
2799 inet_lhash2_for_each_icsk_continue(icsk) {
2800 sk = (struct sock *)icsk;
2801 if (seq_sk_match(seq, sk)) {
2802 if (iter->end_sk < iter->max_sk) {
2804 iter->batch[iter->end_sk++] = sk;
2809 spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
2814 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
2815 struct sock *start_sk)
2817 struct bpf_tcp_iter_state *iter = seq->private;
2818 struct tcp_iter_state *st = &iter->state;
2819 struct hlist_nulls_node *node;
2820 unsigned int expected = 1;
2823 sock_hold(start_sk);
2824 iter->batch[iter->end_sk++] = start_sk;
2826 sk = sk_nulls_next(start_sk);
2827 sk_nulls_for_each_from(sk, node) {
2828 if (seq_sk_match(seq, sk)) {
2829 if (iter->end_sk < iter->max_sk) {
2831 iter->batch[iter->end_sk++] = sk;
2836 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2841 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
2843 struct bpf_tcp_iter_state *iter = seq->private;
2844 struct tcp_iter_state *st = &iter->state;
2845 unsigned int expected;
2846 bool resized = false;
2849 /* The st->bucket is done. Directly advance to the next
2850 * bucket instead of having tcp_seek_last_pos() skip entries
2851 * one by one in the current bucket only to find out that
2852 * it has to advance to the next bucket.
2854 if (iter->st_bucket_done) {
2857 if (st->state == TCP_SEQ_STATE_LISTENING &&
2858 st->bucket > tcp_hashinfo.lhash2_mask) {
2859 st->state = TCP_SEQ_STATE_ESTABLISHED;
2865 /* Get a new batch */
2868 iter->st_bucket_done = false;
2870 sk = tcp_seek_last_pos(seq);
2872 return NULL; /* Done */
2874 if (st->state == TCP_SEQ_STATE_LISTENING)
2875 expected = bpf_iter_tcp_listening_batch(seq, sk);
2877 expected = bpf_iter_tcp_established_batch(seq, sk);
2879 if (iter->end_sk == expected) {
2880 iter->st_bucket_done = true;
2884 if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
2892 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
2894 /* bpf iter does not support lseek, so it always
2895 * continues from where it was stop()-ped.
2898 return bpf_iter_tcp_batch(seq);
2900 return SEQ_START_TOKEN;
2903 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2905 struct bpf_tcp_iter_state *iter = seq->private;
2906 struct tcp_iter_state *st = &iter->state;
2909 /* Whenever seq_next() is called, the iter->cur_sk is
2910 * done with seq_show(), so advance to the next sk in the batch.
2913 if (iter->cur_sk < iter->end_sk) {
2914 /* Keeping st->num consistent in tcp_iter_state.
2915 * bpf_iter_tcp does not use st->num.
2916 * meta.seq_num is used instead.
2919 /* Move st->offset to the next sk in the bucket such that
2920 * the future start() will resume at st->offset in
2921 * st->bucket. See tcp_seek_last_pos().
2924 sock_gen_put(iter->batch[iter->cur_sk++]);
2927 if (iter->cur_sk < iter->end_sk)
2928 sk = iter->batch[iter->cur_sk];
2930 sk = bpf_iter_tcp_batch(seq);
2933 /* Keeping st->last_pos consistent in tcp_iter_state.
2934 * bpf iter does not do lseek, so st->last_pos always equals *pos.
2936 st->last_pos = *pos;
2940 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2942 struct bpf_iter_meta meta;
2943 struct bpf_prog *prog;
2944 struct sock *sk = v;
2949 if (v == SEQ_START_TOKEN)
2952 if (sk_fullsock(sk))
2953 slow = lock_sock_fast(sk);
2955 if (unlikely(sk_unhashed(sk))) {
2960 if (sk->sk_state == TCP_TIME_WAIT) {
2962 } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
2963 const struct request_sock *req = v;
2965 uid = from_kuid_munged(seq_user_ns(seq),
2966 sock_i_uid(req->rsk_listener));
2968 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
2972 prog = bpf_iter_get_info(&meta, false);
2973 ret = tcp_prog_seq_show(prog, &meta, v, uid);
2976 if (sk_fullsock(sk))
2977 unlock_sock_fast(sk, slow);
2982 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
2984 struct bpf_tcp_iter_state *iter = seq->private;
2985 struct bpf_iter_meta meta;
2986 struct bpf_prog *prog;
2990 prog = bpf_iter_get_info(&meta, true);
2992 (void)tcp_prog_seq_show(prog, &meta, v, 0);
2995 if (iter->cur_sk < iter->end_sk) {
2996 bpf_iter_tcp_put_batch(iter);
2997 iter->st_bucket_done = false;
3001 static const struct seq_operations bpf_iter_tcp_seq_ops = {
3002 .show = bpf_iter_tcp_seq_show,
3003 .start = bpf_iter_tcp_seq_start,
3004 .next = bpf_iter_tcp_seq_next,
3005 .stop = bpf_iter_tcp_seq_stop,
3008 static unsigned short seq_file_family(const struct seq_file *seq)
3010 const struct tcp_seq_afinfo *afinfo;
3012 #ifdef CONFIG_BPF_SYSCALL
3013 /* Iterated from bpf_iter. Let the bpf prog do the filtering instead. */
3014 if (seq->op == &bpf_iter_tcp_seq_ops)
3018 /* Iterated from proc fs */
3019 afinfo = PDE_DATA(file_inode(seq->file));
3020 return afinfo->family;
3023 static const struct seq_operations tcp4_seq_ops = {
3024 .show = tcp4_seq_show,
3025 .start = tcp_seq_start,
3026 .next = tcp_seq_next,
3027 .stop = tcp_seq_stop,
3030 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
3034 static int __net_init tcp4_proc_init_net(struct net *net)
3036 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
3037 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
3042 static void __net_exit tcp4_proc_exit_net(struct net *net)
3044 remove_proc_entry("tcp", net->proc_net);
3047 static struct pernet_operations tcp4_net_ops = {
3048 .init = tcp4_proc_init_net,
3049 .exit = tcp4_proc_exit_net,
3052 int __init tcp4_proc_init(void)
3054 return register_pernet_subsys(&tcp4_net_ops);
3057 void tcp4_proc_exit(void)
3059 unregister_pernet_subsys(&tcp4_net_ops);
3061 #endif /* CONFIG_PROC_FS */
3063 /* @wake is one when sk_stream_write_space() calls us.
3064 * This sends EPOLLOUT only once notsent_bytes drops below half the limit.
3065 * This mimics the strategy used in sock_def_write_space().
3067 bool tcp_stream_memory_free(const struct sock *sk, int wake)
3069 const struct tcp_sock *tp = tcp_sk(sk);
3070 u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3071 READ_ONCE(tp->snd_nxt);
3073 return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3075 EXPORT_SYMBOL(tcp_stream_memory_free);
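/* Worked example of the check above: when called from
 * sk_stream_write_space() (wake == 1), notsent_bytes is doubled before the
 * comparison, so EPOLLOUT is signalled only once the unsent backlog drops
 * below half of tcp_notsent_lowat(); with a 128 KB lowat that means fewer
 * than 64 KB left unsent.
 *
 * An application can cap its unsent backlog per socket (illustrative
 * userspace sketch, error handling omitted):
 *
 *	int lowat = 128 * 1024;
 *
 *	setsockopt(fd, IPPROTO_TCP, TCP_NOTSENT_LOWAT, &lowat, sizeof(lowat));
 *
 * or system-wide through the net.ipv4.tcp_notsent_lowat sysctl, which
 * tcp_sk_init() below leaves at UINT_MAX (effectively unlimited).
 */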
3077 struct proto tcp_prot = {
3079 .owner = THIS_MODULE,
3081 .pre_connect = tcp_v4_pre_connect,
3082 .connect = tcp_v4_connect,
3083 .disconnect = tcp_disconnect,
3084 .accept = inet_csk_accept,
3086 .init = tcp_v4_init_sock,
3087 .destroy = tcp_v4_destroy_sock,
3088 .shutdown = tcp_shutdown,
3089 .setsockopt = tcp_setsockopt,
3090 .getsockopt = tcp_getsockopt,
3091 .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt,
3092 .keepalive = tcp_set_keepalive,
3093 .recvmsg = tcp_recvmsg,
3094 .sendmsg = tcp_sendmsg,
3095 .sendpage = tcp_sendpage,
3096 .backlog_rcv = tcp_v4_do_rcv,
3097 .release_cb = tcp_release_cb,
3099 .unhash = inet_unhash,
3100 .get_port = inet_csk_get_port,
3101 #ifdef CONFIG_BPF_SYSCALL
3102 .psock_update_sk_prot = tcp_bpf_update_proto,
3104 .enter_memory_pressure = tcp_enter_memory_pressure,
3105 .leave_memory_pressure = tcp_leave_memory_pressure,
3106 .stream_memory_free = tcp_stream_memory_free,
3107 .sockets_allocated = &tcp_sockets_allocated,
3108 .orphan_count = &tcp_orphan_count,
3109 .memory_allocated = &tcp_memory_allocated,
3110 .memory_pressure = &tcp_memory_pressure,
3111 .sysctl_mem = sysctl_tcp_mem,
3112 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
3113 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
3114 .max_header = MAX_TCP_HEADER,
3115 .obj_size = sizeof(struct tcp_sock),
3116 .slab_flags = SLAB_TYPESAFE_BY_RCU,
3117 .twsk_prot = &tcp_timewait_sock_ops,
3118 .rsk_prot = &tcp_request_sock_ops,
3119 .h.hashinfo = &tcp_hashinfo,
3120 .no_autobind = true,
3121 .diag_destroy = tcp_abort,
3123 EXPORT_SYMBOL(tcp_prot);
3125 static void __net_exit tcp_sk_exit(struct net *net)
3127 if (net->ipv4.tcp_congestion_control)
3128 bpf_module_put(net->ipv4.tcp_congestion_control,
3129 net->ipv4.tcp_congestion_control->owner);
3132 static int __net_init tcp_sk_init(struct net *net)
3136 net->ipv4.sysctl_tcp_ecn = 2;
3137 net->ipv4.sysctl_tcp_ecn_fallback = 1;
3139 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3140 net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3141 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3142 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3143 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3145 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3146 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3147 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3149 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3150 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3151 net->ipv4.sysctl_tcp_syncookies = 1;
3152 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3153 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3154 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3155 net->ipv4.sysctl_tcp_orphan_retries = 0;
3156 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3157 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3158 net->ipv4.sysctl_tcp_tw_reuse = 2;
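/* tcp_tw_reuse: 0 disables reuse of TIME-WAIT sockets for new outgoing
 * connections, 1 enables it globally, 2 (the default chosen here) enables
 * it for loopback traffic only; see tcp_twsk_unique() and
 * Documentation/networking/ip-sysctl.rst.
 */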
3159 net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3161 cnt = tcp_hashinfo.ehash_mask + 1;
3162 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
3163 net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
3165 net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
3166 net->ipv4.sysctl_tcp_sack = 1;
3167 net->ipv4.sysctl_tcp_window_scaling = 1;
3168 net->ipv4.sysctl_tcp_timestamps = 1;
3169 net->ipv4.sysctl_tcp_early_retrans = 3;
3170 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3171 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */
3172 net->ipv4.sysctl_tcp_retrans_collapse = 1;
3173 net->ipv4.sysctl_tcp_max_reordering = 300;
3174 net->ipv4.sysctl_tcp_dsack = 1;
3175 net->ipv4.sysctl_tcp_app_win = 31;
3176 net->ipv4.sysctl_tcp_adv_win_scale = 1;
3177 net->ipv4.sysctl_tcp_frto = 2;
3178 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3179 /* This limits the percentage of the congestion window which we
3180 * will allow a single TSO frame to consume. Building TSO frames
3181 * which are too large can cause TCP streams to be bursty.
3183 net->ipv4.sysctl_tcp_tso_win_divisor = 3;
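/* Worked example (illustrative): with the default divisor of 3 and a
 * congestion window of 45 packets, a single TSO frame may cover roughly
 * 45 / 3 = 15 packets, so one burst consumes at most about a third of
 * the window.
 */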
3184 /* Default TSQ limit of 16 TSO segments */
3185 net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
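/* i.e. 16 * 65536 bytes = 1 MiB of data may sit queued below the socket
 * (in qdiscs/device queues) before TCP Small Queues throttles the flow.
 */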
3186 /* RFC 5961 challenge ACK rate limiting */
3187 net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
3188 net->ipv4.sysctl_tcp_min_tso_segs = 2;
3189 net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3190 net->ipv4.sysctl_tcp_autocorking = 1;
3191 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3192 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3193 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3194 if (net != &init_net) {
3195 memcpy(net->ipv4.sysctl_tcp_rmem,
3196 init_net.ipv4.sysctl_tcp_rmem,
3197 sizeof(init_net.ipv4.sysctl_tcp_rmem));
3198 memcpy(net->ipv4.sysctl_tcp_wmem,
3199 init_net.ipv4.sysctl_tcp_wmem,
3200 sizeof(init_net.ipv4.sysctl_tcp_wmem));
3202 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3203 net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3204 net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3205 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3206 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3207 atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3209 /* Reno is always built in */
3210 if (!net_eq(net, &init_net) &&
3211 bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3212 init_net.ipv4.tcp_congestion_control->owner))
3213 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3215 net->ipv4.tcp_congestion_control = &tcp_reno;
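/* For illustration (userspace, not part of this function): the per-netns
 * default picked above is what new sockets start with; a process may still
 * override it per socket, e.g.
 *
 *	setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, "cubic", 5);
 *
 * while the namespace-wide default is normally changed through the
 * net.ipv4.tcp_congestion_control sysctl.
 */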
3220 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3224 inet_twsk_purge(&tcp_hashinfo, AF_INET);
3226 list_for_each_entry(net, net_exit_list, exit_list)
3227 tcp_fastopen_ctx_destroy(net);
3230 static struct pernet_operations __net_initdata tcp_sk_ops = {
3231 .init = tcp_sk_init,
3232 .exit = tcp_sk_exit,
3233 .exit_batch = tcp_sk_exit_batch,
3236 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3237 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3238 struct sock_common *sk_common, uid_t uid)
3240 #define INIT_BATCH_SZ 16
3242 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3244 struct bpf_tcp_iter_state *iter = priv_data;
3247 err = bpf_iter_init_seq_net(priv_data, aux);
3251 err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
3253 bpf_iter_fini_seq_net(priv_data);
3260 static void bpf_iter_fini_tcp(void *priv_data)
3262 struct bpf_tcp_iter_state *iter = priv_data;
3264 bpf_iter_fini_seq_net(priv_data);
3265 kvfree(iter->batch);
3268 static const struct bpf_iter_seq_info tcp_seq_info = {
3269 .seq_ops = &bpf_iter_tcp_seq_ops,
3270 .init_seq_private = bpf_iter_init_tcp,
3271 .fini_seq_private = bpf_iter_fini_tcp,
3272 .seq_priv_size = sizeof(struct bpf_tcp_iter_state),
3275 static const struct bpf_func_proto *
3276 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3277 const struct bpf_prog *prog)
3280 case BPF_FUNC_setsockopt:
3281 return &bpf_sk_setsockopt_proto;
3282 case BPF_FUNC_getsockopt:
3283 return &bpf_sk_getsockopt_proto;
3289 static struct bpf_iter_reg tcp_reg_info = {
3291 .ctx_arg_info_size = 1,
3293 { offsetof(struct bpf_iter__tcp, sk_common),
3294 PTR_TO_BTF_ID_OR_NULL },
3296 .get_func_proto = bpf_iter_tcp_get_func_proto,
3297 .seq_info = &tcp_seq_info,
3300 static void __init bpf_iter_register(void)
3302 tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3303 if (bpf_iter_reg_target(&tcp_reg_info))
3304 pr_warn("Warning: could not register bpf iterator tcp\n");
3309 void __init tcp_v4_init(void)
3313 for_each_possible_cpu(cpu) {
3316 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3317 IPPROTO_TCP, &init_net);
3319 panic("Failed to create the TCP control socket.\n");
3320 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3322 /* Please enforce IP_DF and IPID==0 for RST and
3323 * ACK sent in SYN-RECV and TIME-WAIT state.
3325 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3327 per_cpu(ipv4_tcp_sk, cpu) = sk;
3329 if (register_pernet_subsys(&tcp_sk_ops))
3330 panic("Failed to create the TCP control socket.\n");
3332 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3333 bpf_iter_register();