// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 *		IPv4 specific functions
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 */

/*
 * Changes:
 *		David S. Miller	:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *		David S. Miller :	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *		Andi Kleen :		Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *		Andi Kleen :		Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					request_sock handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *		Mike McLagan	:	Routing by source
 *	Juan Jose Ciarlante:		ip_dynaddr bits
 *		Andi Kleen:		various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 */

#define pr_fmt(fmt) "TCP: " fmt
#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/secure_seq.h>
#include <net/busy_poll.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/inetdevice.h>
#include <linux/btf_ids.h>

#include <crypto/hash.h>
#include <linux/scatterlist.h>

#include <trace/events/tcp.h>
#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);
static u32 tcp_v4_init_seq(const struct sk_buff *skb)
{
	return secure_tcp_seq(ip_hdr(skb)->daddr,
			      ip_hdr(skb)->saddr,
			      tcp_hdr(skb)->dest,
			      tcp_hdr(skb)->source);
}

static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
{
	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
}
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
	const struct inet_timewait_sock *tw = inet_twsk(sktw);
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);

	if (reuse == 2) {
		/* Still does not detect *everything* that goes through
		 * lo, since we require a loopback src or dst address
		 * or direct binding to 'lo' interface.
		 */
		bool loopback = false;

		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
			loopback = true;
#if IS_ENABLED(CONFIG_IPV6)
		if (tw->tw_family == AF_INET6) {
			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
				loopback = true;
		} else
#endif
		{
			if (ipv4_is_loopback(tw->tw_daddr) ||
			    ipv4_is_loopback(tw->tw_rcv_saddr))
				loopback = true;
		}
		if (!loopback)
			reuse = 0;
	}

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.

	   Actually, the idea is close to VJ's one, only timestamp cache is
	   held not per host, but per port pair and TW bucket is used as state
	   holder.

	   If TW bucket has been already destroyed we fall back to VJ's scheme
	   and use initial timestamp retrieved from peer table.
	 */
	if (tcptw->tw_ts_recent_stamp &&
	    (!twp || (reuse && time_after32(ktime_get_seconds(),
					    tcptw->tw_ts_recent_stamp)))) {
		/* In case of repair and re-using TIME-WAIT sockets we still
		 * want to be sure that it is safe as above but honor the
		 * sequence numbers and time stamps set as part of the repair
		 * process.
		 *
		 * Without this check re-using a TIME-WAIT socket with TCP
		 * repair would accumulate a -1 on the repair assigned
		 * sequence number. The first time it is reused the sequence
		 * is -1, the second time -2, etc. This fixes that issue
		 * without appearing to create any others.
		 */
		if (likely(!tp->repair)) {
			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;

			if (!seq)
				seq = 1;
			WRITE_ONCE(tp->write_seq, seq);
			tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
			tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
		}
		sock_hold(sktw);
		return 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);
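
/* Illustrative note (not upstream code): the reuse test above allows a
 * TIME-WAIT port pair to be taken over once its last timestamp is at least
 * one second old and tcp_tw_reuse permits it, e.g. from userspace:
 *
 *	# sysctl -w net.ipv4.tcp_tw_reuse=1   (0=off, 1=on, 2=loopback only)
 *
 * The new connection then starts at write_seq = tw_snd_nxt + 65535 + 2, so
 * its sequence space cannot overlap the old incarnation's.
 */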
static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
			      int addr_len)
{
	/* This check is replicated from tcp_v4_connect() and intended to
	 * prevent BPF program called below from accessing bytes that are out
	 * of the bound specified by user in addr_len.
	 */
	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	sock_owned_by_me(sk);

	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
}
/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;
	struct ip_options_rcu *inet_opt;
	struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     lockdep_sock_is_held(sk));
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			      IPPROTO_TCP,
			      orig_sport, orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	if (!inet->inet_saddr)
		inet->inet_saddr = fl4->saddr;
	sk_rcv_saddr_set(sk, inet->inet_saddr);

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			WRITE_ONCE(tp->write_seq, 0);
	}

	inet->inet_dport = usin->sin_port;
	sk_daddr_set(sk, daddr);

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and, not releasing the socket
	 * lock, select a source port, enter ourselves into the hash tables
	 * and complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(tcp_death_row, sk);
	if (err)
		goto failure;

	sk_set_txhash(sk);

	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}
	/* OK, now commit destination to socket. */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);
	rt = NULL;

	if (likely(!tp->repair)) {
		if (!tp->write_seq)
			WRITE_ONCE(tp->write_seq,
				   secure_tcp_seq(inet->inet_saddr,
						  inet->inet_daddr,
						  inet->inet_sport,
						  usin->sin_port));
		tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
						 inet->inet_saddr,
						 inet->inet_daddr);
	}

	inet->inet_id = prandom_u32();

	if (tcp_fastopen_defer_connect(sk, &err))
		return err;
	if (err)
		goto failure;

	err = tcp_connect(sk);

	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_SYMBOL(tcp_v4_connect);
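
/* Illustrative usage sketch (userspace, not kernel code): tcp_v4_connect()
 * is what ultimately runs when an application connects an AF_INET stream
 * socket. The address is an example value; error handling is elided.
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in dst = {
 *		.sin_family = AF_INET,
 *		.sin_port   = htons(80),
 *	};
 *	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
 *	connect(fd, (struct sockaddr *)&dst, sizeof(dst));
 */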
/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 * It can be called through tcp_release_cb() if socket was owned by user
 * at the time tcp_v4_err() was called to handle ICMP message.
 */
void tcp_v4_mtu_reduced(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct dst_entry *dst;
	u32 mtu;

	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
		return;
	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
	dst = inet_csk_update_pmtu(sk, mtu);
	if (!dst)
		return;

	/* Something is about to be wrong... Remember the soft error
	 * for the case this connection will not be able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    ip_sk_accept_pmtu(sk) &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
EXPORT_SYMBOL(tcp_v4_mtu_reduced);
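
/* Worked example (illustrative): if icsk_pmtu_cookie is 1500 and an
 * ICMP_FRAG_NEEDED reports an MTU of 1400, tcp_sync_mss() shrinks the MSS
 * to roughly 1400 minus IP and TCP header overhead (1360 with no options),
 * and tcp_simple_retransmit() resends the now over-sized in-flight segments.
 */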
static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_check(sk, 0);

	if (dst)
		dst->ops->redirect(dst, sk, skb);
}
/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
void tcp_req_err(struct sock *sk, u32 seq, bool abort)
{
	struct request_sock *req = inet_reqsk(sk);
	struct net *net = sock_net(sk);

	/* ICMPs are not backlogged, hence we cannot get
	 * an established socket here.
	 */
	if (seq != tcp_rsk(req)->snt_isn) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
	} else if (abort) {
		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
		tcp_listendrop(req->rsk_listener);
	}
	reqsk_put(req);
}
EXPORT_SYMBOL(tcp_req_err);
/* TCP-LD (RFC 6069) logic */
void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	s32 remaining;
	u32 delta_us;

	if (sock_owned_by_user(sk))
		return;

	if (seq != tp->snd_una || !icsk->icsk_retransmits ||
	    !icsk->icsk_backoff)
		return;

	skb = tcp_rtx_queue_head(sk);
	if (WARN_ON_ONCE(!skb))
		return;

	icsk->icsk_backoff--;
	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
	icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);

	tcp_mstamp_refresh(tp);
	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);

	if (remaining > 0) {
		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
					  remaining, TCP_RTO_MAX);
	} else {
		/* RTO revert clocked out retransmission.
		 * Will retransmit now.
		 */
		tcp_retransmit_timer(sk);
	}
}
EXPORT_SYMBOL(tcp_ld_RTO_revert);
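
/* Worked example (illustrative): with an srtt-based RTO of 200ms and three
 * backed-off retransmits, icsk_rto reached 1600ms. An RFC 6069 qualifying
 * ICMP unreachable pops one backoff level (back to 800ms); if 500ms already
 * elapsed since the head skb was sent, the timer is re-armed for the
 * remaining 300ms, otherwise tcp_retransmit_timer() fires immediately.
 */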
/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 */

int tcp_v4_err(struct sk_buff *skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
	struct tcp_sock *tp;
	struct inet_sock *inet;
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct sock *sk;
	struct request_sock *fastopen;
	u32 seq, snd_una;
	int err;
	struct net *net = dev_net(skb->dev);

	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
				       th->dest, iph->saddr, ntohs(th->source),
				       inet_iif(skb), 0);
	if (!sk) {
		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
		return -ENOENT;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(sk));
		return 0;
	}
	seq = ntohl(th->seq);
	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
				     type == ICMP_TIME_EXCEEDED ||
				     (type == ICMP_DEST_UNREACH &&
				      (code == ICMP_NET_UNREACH ||
				       code == ICMP_HOST_UNREACH)));
		return 0;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of PMTU discovery (RFC1191) special case :
	 * we can receive locally generated ICMP messages while socket is held.
	 */
	if (sock_owned_by_user(sk)) {
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
	}
	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
		goto out;
	}

	tp = tcp_sk(sk);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
	fastopen = rcu_dereference(tp->fastopen_rsk);
	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, snd_una, tp->snd_nxt)) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_REDIRECT:
		if (!sock_owned_by_user(sk))
			do_redirect(skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs sent out by Linux are always <576bytes so
			 * they should go through unfragmented).
			 */
			if (sk->sk_state == TCP_LISTEN)
				goto out;

			WRITE_ONCE(tp->mtu_info, info);
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
			} else {
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
					sock_hold(sk);
			}
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if this ICMP message allows revert of backoff.
		 * (see RFC 6069)
		 */
		if (!fastopen &&
		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
			tcp_ld_RTO_revert(sk, seq);
		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		/* Only in fast or simultaneous open. If a fast open socket is
		 * already accepted it is treated as a connected one below.
		 */
		if (fastopen && !fastopen->sk)
			break;

		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);

		if (!sock_owned_by_user(sk)) {
			sk->sk_err = err;

			sk->sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters, even these two messages finally
	 * lose their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk->sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
	return 0;
}
void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
	struct tcphdr *th = tcp_hdr(skb);

	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
	skb->csum_start = skb_transport_header(skb) - skb->head;
	skb->csum_offset = offsetof(struct tcphdr, check);
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
	const struct inet_sock *inet = inet_sk(sk);

	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_send_check);
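
/* Note (illustrative): the ~tcp_v4_check(...) above seeds th->check with
 * the folded pseudo-header sum (saddr, daddr, IPPROTO_TCP, length), so the
 * device, directed by csum_start/csum_offset, only has to add the one's
 * complement sum of the TCP header and payload (CHECKSUM_PARTIAL offload).
 */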
/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *		      for reset.
 *	Answer: if a packet caused RST, it is not for a socket
 *		existing in our system, if it is matched to a socket,
 *		it is just duplicate segment or bug in other side's TCP.
 *		So we build the reply based only on parameters of the
 *		arriving segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
	} rep;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key = NULL;
	const __u8 *hash_location = NULL;
	unsigned char newhash[16];
	int genhash;
	struct sock *sk1 = NULL;
#endif
	u64 transmit_time = 0;
	struct sock *ctl_sk;
	struct net *net;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	/* If sk not NULL, it means we did a successful lookup and incoming
	 * route had to be correct. prequeue might have dropped our dst.
	 */
	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest   = th->source;
	rep.th.source = th->dest;
	rep.th.doff   = sizeof(struct tcphdr) / 4;
	rep.th.rst    = 1;

	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
#ifdef CONFIG_TCP_MD5SIG
	rcu_read_lock();
	hash_location = tcp_parse_md5sig_option(th);
	if (sk && sk_fullsock(sk)) {
		const union tcp_md5_addr *addr;
		int l3index;

		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and inet_iif is set to it.
		 */
		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	} else if (hash_location) {
		const union tcp_md5_addr *addr;
		int sdif = tcp_v4_sdif(skb);
		int dif = inet_iif(skb);
		int l3index;

		/*
		 * active side is lost. Try to find listening socket through
		 * source port, and then find md5 key through listening socket.
		 * We do not lose security here:
		 * Incoming packet is checked with md5 hash with finding key,
		 * no RST generated if md5 hash doesn't match.
		 */
		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
					     ip_hdr(skb)->saddr,
					     th->source, ip_hdr(skb)->daddr,
					     ntohs(th->source), dif, sdif);
		/* don't send rst if it can't find key */
		if (!sk1)
			goto out;

		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and dif is set to it.
		 */
		l3index = sdif ? dif : 0;
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
		if (!key)
			goto out;

		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
		if (genhash || memcmp(hash_location, newhash, 16) != 0)
			goto out;
	}

	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;

	/* When socket is gone, all binding information is lost.
	 * routing might fail in this case. No choice here, if we choose to force
	 * input interface, we will misroute in case of asymmetric route.
	 */
	if (sk) {
		arg.bound_dev_if = sk->sk_bound_dev_if;
		if (sk_fullsock(sk))
			trace_tcp_send_reset(sk, skb);
	}

	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));

	arg.tos = ip_hdr(skb)->tos;
	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
	if (sk) {
		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_mark : sk->sk_mark;
		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_priority : sk->sk_priority;
		transmit_time = tcp_transmit_time(sk);
	}
	ip_send_unicast_reply(ctl_sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time);

	ctl_sk->sk_mark = 0;
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
	local_bh_enable();

#ifdef CONFIG_TCP_MD5SIG
out:
	rcu_read_unlock();
#endif
}
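
/* Design note: the RST above is sent from a per-cpu control socket
 * (net->ipv4.tcp_sk) rather than a looked-up socket, because at this point
 * there may be no full socket at all; everything in the reply is derived
 * from the offending segment itself.
 */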
/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
   outside socket context is ugly, certainly. What can I do?
 */

static void tcp_v4_send_ack(const struct sock *sk,
			    struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 tsval, u32 tsecr, int oif,
			    struct tcp_md5sig_key *key,
			    int reply_flags, u8 tos)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
			];
	} rep;
	struct net *net = sock_net(sk);
	struct ip_reply_arg arg;
	struct sock *ctl_sk;
	u64 transmit_time;

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (tsecr) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tsval);
		rep.opt[2] = htonl(tsecr);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (key) {
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len/4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;
	arg.tos = tos;
	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
			   inet_twsk(sk)->tw_mark : sk->sk_mark;
	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
			   inet_twsk(sk)->tw_priority : sk->sk_priority;
	transmit_time = tcp_transmit_time(sk);
	ip_send_unicast_reply(ctl_sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time);

	ctl_sk->sk_mark = 0;
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	local_bh_enable();
}
static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

	tcp_v4_send_ack(sk, skb,
			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
			tcptw->tw_ts_recent,
			tw->tw_bound_dev_if,
			tcp_twsk_md5_key(tcptw),
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
			tw->tw_tos
			);

	inet_twsk_put(tw);
}
static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
{
	const union tcp_md5_addr *addr;
	int l3index;

	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
	 */
	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
					     tcp_sk(sk)->snd_nxt;

	/* RFC 7323 2.3
	 * The window field (SEG.WND) of every outgoing segment, with the
	 * exception of <SYN> segments, MUST be right-shifted by
	 * Rcv.Wind.Shift bits:
	 */
	addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
	l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
	tcp_v4_send_ack(sk, skb, seq,
			tcp_rsk(req)->rcv_nxt,
			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
			req->ts_recent,
			0,
			tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
			ip_hdr(skb)->tos);
}
/*
 *	Send a SYN-ACK after having received a SYN.
 *	This still operates on a request_sock only, not on a big
 *	socket.
 */
static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
			      struct flowi *fl,
			      struct request_sock *req,
			      struct tcp_fastopen_cookie *foc,
			      enum tcp_synack_type synack_type,
			      struct sk_buff *syn_skb)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct flowi4 fl4;
	int err = -1;
	struct sk_buff *skb;
	u8 tos;

	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
		return -1;

	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);

	if (skb) {
		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);

		tos = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ?
				(tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
				(inet_sk(sk)->tos & INET_ECN_MASK) :
				inet_sk(sk)->tos;

		if (!INET_ECN_is_capable(tos) &&
		    tcp_bpf_ca_needs_ecn((struct sock *)req))
			tos |= INET_ECN_ECT_0;

		rcu_read_lock();
		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
					    ireq->ir_rmt_addr,
					    rcu_dereference(ireq->ireq_opt),
					    tos);
		rcu_read_unlock();
		err = net_xmit_eval(err);
	}

	return err;
}

/*
 *	IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
}
#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
EXPORT_SYMBOL(tcp_md5_needed);

static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
{
	if (!old)
		return true;

	/* l3index always overrides non-l3index */
	if (old->l3index && new->l3index == 0)
		return false;
	if (old->l3index == 0 && new->l3index)
		return true;

	return old->prefixlen < new->prefixlen;
}
/* Find the Key structure for an address.  */
struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
					   const union tcp_md5_addr *addr,
					   int family)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	const struct tcp_md5sig_info *md5sig;
	__be32 mask;
	struct tcp_md5sig_key *best_match = NULL;
	bool match;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;

	hlist_for_each_entry_rcu(key, &md5sig->head, node,
				 lockdep_sock_is_held(sk)) {
		if (key->family != family)
			continue;
		if (key->l3index && key->l3index != l3index)
			continue;
		if (family == AF_INET) {
			mask = inet_make_mask(key->prefixlen);
			match = (key->addr.a4.s_addr & mask) ==
				(addr->a4.s_addr & mask);
#if IS_ENABLED(CONFIG_IPV6)
		} else if (family == AF_INET6) {
			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
						  key->prefixlen);
#endif
		} else {
			match = false;
		}

		if (match && better_md5_match(best_match, key))
			best_match = key;
	}
	return best_match;
}
EXPORT_SYMBOL(__tcp_md5_do_lookup);
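
/* Illustrative precedence example: with keys installed for 10.0.0.0/8 and
 * 10.1.0.0/16, a lookup for 10.1.2.3 matches both, and better_md5_match()
 * picks the /16. A key bound to an L3 master device (l3index != 0) beats
 * any unbound key regardless of prefix length.
 */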
static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
						      const union tcp_md5_addr *addr,
						      int family, u8 prefixlen,
						      int l3index)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	unsigned int size = sizeof(struct in_addr);
	const struct tcp_md5sig_info *md5sig;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;
#if IS_ENABLED(CONFIG_IPV6)
	if (family == AF_INET6)
		size = sizeof(struct in6_addr);
#endif
	hlist_for_each_entry_rcu(key, &md5sig->head, node,
				 lockdep_sock_is_held(sk)) {
		if (key->family != family)
			continue;
		if (key->l3index != l3index)
			continue;
		if (!memcmp(&key->addr, addr, size) &&
		    key->prefixlen == prefixlen)
			return key;
	}
	return NULL;
}

struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
					 const struct sock *addr_sk)
{
	const union tcp_md5_addr *addr;
	int l3index;

	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
						 addr_sk->sk_bound_dev_if);
	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);
/* This can be called on a newly created socket, from other files */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
		   int family, u8 prefixlen, int l3index,
		   const u8 *newkey, u8 newkeylen, gfp_t gfp)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
	if (key) {
		/* Pre-existing entry - just update that one.
		 * Note that the key might be used concurrently.
		 * data_race() tells KCSAN that we do not care about
		 * key mismatches, since changing MD5 key on live flows
		 * can lead to packet drops.
		 */
		data_race(memcpy(key->key, newkey, newkeylen));

		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
		 * Also note that a reader could catch new key->keylen value
		 * but old key->key[], this is the reason we use __GFP_ZERO
		 * at sock_kmalloc() time below these lines.
		 */
		WRITE_ONCE(key->keylen, newkeylen);

		return 0;
	}

	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   lockdep_sock_is_held(sk));
	if (!md5sig) {
		md5sig = kmalloc(sizeof(*md5sig), gfp);
		if (!md5sig)
			return -ENOMEM;

		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
		INIT_HLIST_HEAD(&md5sig->head);
		rcu_assign_pointer(tp->md5sig_info, md5sig);
	}

	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
	if (!key)
		return -ENOMEM;
	if (!tcp_alloc_md5sig_pool()) {
		sock_kfree_s(sk, key, sizeof(*key));
		return -ENOMEM;
	}

	memcpy(key->key, newkey, newkeylen);
	key->keylen = newkeylen;
	key->family = family;
	key->prefixlen = prefixlen;
	key->l3index = l3index;
	memcpy(&key->addr, addr,
	       (family == AF_INET6) ? sizeof(struct in6_addr) :
				      sizeof(struct in_addr));
	hlist_add_head_rcu(&key->node, &md5sig->head);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_add);
int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
		   u8 prefixlen, int l3index)
{
	struct tcp_md5sig_key *key;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
	if (!key)
		return -ENOENT;
	hlist_del_rcu(&key->node);
	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
	kfree_rcu(key, rcu);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_del);

static void tcp_clear_md5_list(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	struct hlist_node *n;
	struct tcp_md5sig_info *md5sig;

	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);

	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
		hlist_del_rcu(&key->node);
		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
		kfree_rcu(key, rcu);
	}
}
static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
				 sockptr_t optval, int optlen)
{
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
	const union tcp_md5_addr *addr;
	u8 prefixlen = 32;
	int l3index = 0;

	if (optlen < sizeof(cmd))
		return -EINVAL;

	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
		return -EFAULT;

	if (sin->sin_family != AF_INET)
		return -EINVAL;

	if (optname == TCP_MD5SIG_EXT &&
	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
		prefixlen = cmd.tcpm_prefixlen;
		if (prefixlen > 32)
			return -EINVAL;
	}

	if (optname == TCP_MD5SIG_EXT &&
	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
		struct net_device *dev;

		rcu_read_lock();
		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
		if (dev && netif_is_l3_master(dev))
			l3index = dev->ifindex;

		rcu_read_unlock();

		/* ok to reference set/not set outside of rcu;
		 * right now device MUST be an L3 master
		 */
		if (!dev || !l3index)
			return -EINVAL;
	}

	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;

	if (!cmd.tcpm_keylen)
		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index);

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index,
			      cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
}
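
/* Illustrative userspace sketch (not kernel code): installing the key the
 * parser above consumes, via setsockopt(TCP_MD5SIG). Field names follow
 * struct tcp_md5sig from <linux/tcp.h>; values are examples and error
 * handling is elided.
 *
 *	struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
 *	struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	sin->sin_family = AF_INET;
 *	inet_pton(AF_INET, "192.0.2.1", &sin->sin_addr);
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 */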
static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
				   __be32 daddr, __be32 saddr,
				   const struct tcphdr *th, int nbytes)
{
	struct tcp4_pseudohdr *bp;
	struct scatterlist sg;
	struct tcphdr *_th;

	bp = hp->scratch;
	bp->saddr = saddr;
	bp->daddr = daddr;
	bp->pad = 0;
	bp->protocol = IPPROTO_TCP;
	bp->len = cpu_to_be16(nbytes);

	_th = (struct tcphdr *)(bp + 1);
	memcpy(_th, th, sizeof(*th));
	_th->check = 0;

	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
				sizeof(*bp) + sizeof(*th));
	return crypto_ahash_update(hp->md5_req);
}

static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
	struct tcp_md5sig_pool *hp;
	struct ahash_request *req;

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	req = hp->md5_req;

	if (crypto_ahash_init(req))
		goto clear_hash;
	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	ahash_request_set_crypt(req, NULL, md5_hash, 0);
	if (crypto_ahash_final(req))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}
int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
			const struct sock *sk,
			const struct sk_buff *skb)
{
	struct tcp_md5sig_pool *hp;
	struct ahash_request *req;
	const struct tcphdr *th = tcp_hdr(skb);
	__be32 saddr, daddr;

	if (sk) { /* valid for establish/request sockets */
		saddr = sk->sk_rcv_saddr;
		daddr = sk->sk_daddr;
	} else {
		const struct iphdr *iph = ip_hdr(skb);

		saddr = iph->saddr;
		daddr = iph->daddr;
	}

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	req = hp->md5_req;

	if (crypto_ahash_init(req))
		goto clear_hash;

	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
		goto clear_hash;
	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	ahash_request_set_crypt(req, NULL, md5_hash, 0);
	if (crypto_ahash_final(req))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);

#endif
/* Called with rcu_read_lock() */
static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
				    const struct sk_buff *skb,
				    int dif, int sdif)
{
#ifdef CONFIG_TCP_MD5SIG
	/*
	 * This gets called for each TCP segment that arrives
	 * so we want to be efficient.
	 * We have 3 drop cases:
	 * o No MD5 hash and one expected.
	 * o MD5 hash and we're not expecting one.
	 * o MD5 hash and it's wrong.
	 */
	const __u8 *hash_location = NULL;
	struct tcp_md5sig_key *hash_expected;
	const struct iphdr *iph = ip_hdr(skb);
	const struct tcphdr *th = tcp_hdr(skb);
	const union tcp_md5_addr *addr;
	unsigned char newhash[16];
	int genhash, l3index;

	/* sdif set, means packet ingressed via a device
	 * in an L3 domain and dif is set to the l3mdev
	 */
	l3index = sdif ? dif : 0;

	addr = (union tcp_md5_addr *)&iph->saddr;
	hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	hash_location = tcp_parse_md5sig_option(th);

	/* We've parsed the options - do we have a hash? */
	if (!hash_expected && !hash_location)
		return false;

	if (hash_expected && !hash_location) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
		return true;
	}

	if (!hash_expected && hash_location) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
		return true;
	}

	/* Okay, so this is hash_expected and hash_location -
	 * so we need to calculate the checksum.
	 */
	genhash = tcp_v4_md5_hash_skb(newhash,
				      hash_expected,
				      NULL, skb);

	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n",
				     &iph->saddr, ntohs(th->source),
				     &iph->daddr, ntohs(th->dest),
				     genhash ? " tcp_v4_calc_md5_hash failed"
					     : "", l3index);
		return true;
	}
	return false;
#else
	return false;
#endif
}
static void tcp_v4_init_req(struct request_sock *req,
			    const struct sock *sk_listener,
			    struct sk_buff *skb)
{
	struct inet_request_sock *ireq = inet_rsk(req);
	struct net *net = sock_net(sk_listener);

	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
}

static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
					  struct flowi *fl,
					  const struct request_sock *req)
{
	return inet_csk_route_req(sk, &fl->u.ip4, req);
}

struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.rtx_syn_ack	=	tcp_rtx_synack,
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
	.syn_ack_timeout =	tcp_syn_ack_timeout,
};

const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.mss_clamp	=	TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
	.req_md5_lookup	=	tcp_v4_md5_lookup,
	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
#endif
	.init_req	=	tcp_v4_init_req,
#ifdef CONFIG_SYN_COOKIES
	.cookie_init_seq =	cookie_v4_init_sequence,
#endif
	.route_req	=	tcp_v4_route_req,
	.init_seq	=	tcp_v4_init_seq,
	.init_ts_off	=	tcp_v4_init_ts_off,
	.send_synack	=	tcp_v4_send_synack,
};

int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	/* Never answer SYNs sent to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	return tcp_conn_request(&tcp_request_sock_ops,
				&tcp_request_sock_ipv4_ops, sk, skb);

drop:
	tcp_listendrop(sk);
	return 0;
}
EXPORT_SYMBOL(tcp_v4_conn_request);
/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst,
				  struct request_sock *req_unhash,
				  bool *own_req)
{
	struct inet_request_sock *ireq;
	bool found_dup_sk = false;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	const union tcp_md5_addr *addr;
	struct tcp_md5sig_key *key;
	int l3index;
#endif
	struct ip_options_rcu *inet_opt;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	inet_sk_rx_dst_set(newsk, skb);

	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	ireq		      = inet_rsk(req);
	sk_daddr_set(newsk, ireq->ir_rmt_addr);
	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
	newsk->sk_bound_dev_if = ireq->ir_iif;
	newinet->inet_saddr   = ireq->ir_loc_addr;
	inet_opt	      = rcu_dereference(ireq->ireq_opt);
	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
	newinet->mc_index     = inet_iif(skb);
	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
	newinet->rcv_tos      = ip_hdr(skb)->tos;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	newinet->inet_id = prandom_u32();

	/* Set ToS of the new socket based upon the value of incoming SYN.
	 * ECT bits are set later in tcp_init_transfer().
	 */
	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
		newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;

	if (!dst) {
		dst = inet_csk_route_child_sock(sk, newsk, req);
		if (!dst)
			goto put_and_exit;
	} else {
		/* syncookie case : see end of cookie_v4_check() */
	}
	sk_setup_caps(newsk, dst);

	tcp_ca_openreq_child(newsk, dst);

	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));

	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
	/* Copy over the MD5 key from the original socket */
	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	if (key) {
		/*
		 * We're using one, so create a matching key
		 * on the newsk structure. If we fail to get
		 * memory, then we end up not copying the key
		 * across. Shucks.
		 */
		tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index,
			       key->key, key->keylen, GFP_ATOMIC);
		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
	}
#endif

	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
				       &found_dup_sk);
	if (likely(*own_req)) {
		tcp_move_syn(newtp, req);
		ireq->ireq_opt = NULL;
	} else {
		newinet->inet_opt = NULL;

		if (!req_unhash && found_dup_sk) {
			/* This code path should only be executed in the
			 * syncookie case only
			 */
			bh_unlock_sock(newsk);
			sock_put(newsk);
			newsk = NULL;
		}
	}
	return newsk;

exit_overflow:
	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
	dst_release(dst);
exit:
	tcp_listendrop(sk);
	return NULL;
put_and_exit:
	newinet->inet_opt = NULL;
	inet_csk_prepare_forced_close(newsk);
	tcp_done(newsk);
	goto exit;
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
	const struct tcphdr *th = tcp_hdr(skb);

	if (!th->syn)
		sk = cookie_v4_check(sk, skb);
#endif
	return sk;
}

u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
			 struct tcphdr *th, u32 *cookie)
{
	u16 mss = 0;
#ifdef CONFIG_SYN_COOKIES
	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
				    &tcp_request_sock_ipv4_ops, sk, th);
	if (mss) {
		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
		tcp_synq_overflow(sk);
	}
#endif
	return mss;
}
/* The socket must have its spinlock held when we get
 * here, unless it is a TCP_LISTEN socket.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	struct sock *rsk;

	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		struct dst_entry *dst;

		dst = rcu_dereference_protected(sk->sk_rx_dst,
						lockdep_sock_is_held(sk));

		sock_rps_save_rxhash(sk, skb);
		sk_mark_napi_id(sk, skb);
		if (dst) {
			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
			    !dst->ops->check(dst, 0)) {
				RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
				dst_release(dst);
			}
		}
		tcp_rcv_established(sk, skb);
		return 0;
	}

	if (tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_cookie_check(sk, skb);

		if (!nsk)
			goto discard;
		if (nsk != sk) {
			if (tcp_child_process(sk, nsk, skb)) {
				rsk = nsk;
				goto reset;
			}
			return 0;
		}
	} else
		sock_rps_save_rxhash(sk, skb);

	if (tcp_rcv_state_process(sk, skb)) {
		rsk = sk;
		goto reset;
	}
	return 0;

reset:
	tcp_v4_send_reset(rsk, skb);
discard:
	kfree_skb(skb);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
	goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);
int tcp_v4_early_demux(struct sk_buff *skb)
{
	const struct iphdr *iph;
	const struct tcphdr *th;
	struct sock *sk;

	if (skb->pkt_type != PACKET_HOST)
		return 0;

	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
		return 0;

	iph = ip_hdr(skb);
	th = tcp_hdr(skb);

	if (th->doff < sizeof(struct tcphdr) / 4)
		return 0;

	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
				       iph->saddr, th->source,
				       iph->daddr, ntohs(th->dest),
				       skb->skb_iif, inet_sdif(skb));
	if (sk) {
		skb->sk = sk;
		skb->destructor = sock_edemux;
		if (sk_fullsock(sk)) {
			struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);

			if (dst)
				dst = dst_check(dst, 0);
			if (dst &&
			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
				skb_dst_set_noref(skb, dst);
		}
	}
	return 0;
}
bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
{
	u32 limit, tail_gso_size, tail_gso_segs;
	struct skb_shared_info *shinfo;
	const struct tcphdr *th;
	struct tcphdr *thtail;
	struct sk_buff *tail;
	unsigned int hdrlen;
	bool fragstolen;
	u32 gso_segs;
	u32 gso_size;
	int delta;

	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
	 * we can fix skb->truesize to its real value to avoid future drops.
	 * This is valid because skb is not yet charged to the socket.
	 * It has been noticed pure SACK packets were sometimes dropped
	 * (if cooked by drivers without copybreak feature).
	 */
	skb_condense(skb);

	skb_dst_drop(skb);

	if (unlikely(tcp_checksum_complete(skb))) {
		bh_unlock_sock(sk);
		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
		return true;
	}

	/* Attempt coalescing to last skb in backlog, even if we are
	 * above the limits.
	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
	 */
	th = (const struct tcphdr *)skb->data;
	hdrlen = th->doff * 4;

	tail = sk->sk_backlog.tail;
	if (!tail)
		goto no_coalesce;
	thtail = (struct tcphdr *)tail->data;

	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
	    ((TCP_SKB_CB(tail)->tcp_flags |
	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
	    !((TCP_SKB_CB(tail)->tcp_flags &
	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
	    ((TCP_SKB_CB(tail)->tcp_flags ^
	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
#ifdef CONFIG_TLS_DEVICE
	    tail->decrypted != skb->decrypted ||
#endif
	    thtail->doff != th->doff ||
	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
		goto no_coalesce;

	__skb_pull(skb, hdrlen);

	shinfo = skb_shinfo(skb);
	gso_size = shinfo->gso_size ?: skb->len;
	gso_segs = shinfo->gso_segs ?: 1;

	shinfo = skb_shinfo(tail);
	tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
	tail_gso_segs = shinfo->gso_segs ?: 1;

	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;

		if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
			thtail->window = th->window;
		}

		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
		 * thtail->fin, so that the fast path in tcp_rcv_established()
		 * is not entered if we append a packet with a FIN.
		 * SYN, RST, URG are not present.
		 * ACK is set on both packets.
		 * PSH : we do not really care in TCP stack,
		 *       at least for 'GRO' packets.
		 */
		thtail->fin |= th->fin;
		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;

		if (TCP_SKB_CB(skb)->has_rxtstamp) {
			TCP_SKB_CB(tail)->has_rxtstamp = true;
			tail->tstamp = skb->tstamp;
			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
		}

		/* Not as strict as GRO. We only need to carry mss max value */
		shinfo->gso_size = max(gso_size, tail_gso_size);
		shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);

		sk->sk_backlog.len += delta;
		__NET_INC_STATS(sock_net(sk),
				LINUX_MIB_TCPBACKLOGCOALESCE);
		kfree_skb_partial(skb, fragstolen);
		return false;
	}
	__skb_push(skb, hdrlen);

no_coalesce:
	limit = (u32)READ_ONCE(sk->sk_rcvbuf) + (u32)(READ_ONCE(sk->sk_sndbuf) >> 1);

	/* Only the socket owner can try to collapse/prune rx queues
	 * to reduce memory overhead, so add a little headroom here.
	 * Only a few sockets' backlogs are likely to be concurrently
	 * non-empty.
	 */
	limit += 64 * 1024;

	if (unlikely(sk_add_backlog(sk, skb, limit))) {
		bh_unlock_sock(sk);
		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
		return true;
	}
	return false;
}
EXPORT_SYMBOL(tcp_add_backlog);
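
/* Worked example (illustrative): with sk_rcvbuf = 1 MB and sk_sndbuf =
 * 256 KB, the backlog limit above is 1 MB + 128 KB + 64 KB of headroom.
 * Coalescing keeps a burst of small in-order segments from one flow as a
 * single backlog skb, so the limit is charged once instead of per packet.
 */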
int tcp_filter(struct sock *sk, struct sk_buff *skb)
{
	struct tcphdr *th = (struct tcphdr *)skb->data;

	return sk_filter_trim_cap(sk, skb, th->doff * 4);
}
EXPORT_SYMBOL(tcp_filter);

static void tcp_v4_restore_cb(struct sk_buff *skb)
{
	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
		sizeof(struct inet_skb_parm));
}

static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
			   const struct tcphdr *th)
{
	/* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
	 * barrier() makes sure compiler won't play fool^Waliasing games.
	 */
	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
		sizeof(struct inet_skb_parm));
	barrier();

	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
	TCP_SKB_CB(skb)->sacked	 = 0;
	TCP_SKB_CB(skb)->has_rxtstamp =
			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
}
/*
 *	From tcp_input.c
 */

int tcp_v4_rcv(struct sk_buff *skb)
{
	struct net *net = dev_net(skb->dev);
	struct sk_buff *skb_to_free;
	int sdif = inet_sdif(skb);
	int dif = inet_iif(skb);
	const struct iphdr *iph;
	const struct tcphdr *th;
	bool refcounted;
	struct sock *sk;
	int ret;

	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	__TCP_INC_STATS(net, TCP_MIB_INSEGS);

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = (const struct tcphdr *)skb->data;

	if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
		goto bad_packet;
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided case of th->doff==0 is eliminated.
	 * So, we defer the checks. */

	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
		goto csum_error;

	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);
lookup:
	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
			       th->dest, sdif, &refcounted);
	if (!sk)
		goto no_tcp_socket;

process:
	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		struct request_sock *req = inet_reqsk(sk);
		bool req_stolen = false;
		struct sock *nsk;

		sk = req->rsk_listener;
		if (unlikely(!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb) ||
			     tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) {
			sk_drops_add(sk, skb);
			reqsk_put(req);
			goto discard_it;
		}
		if (tcp_checksum_complete(skb)) {
			reqsk_put(req);
			goto csum_error;
		}
		if (unlikely(sk->sk_state != TCP_LISTEN)) {
			inet_csk_reqsk_queue_drop_and_put(sk, req);
			goto lookup;
		}
		/* We own a reference on the listener, increase it again
		 * as we might lose it too soon.
		 */
		sock_hold(sk);
		refcounted = true;
		nsk = NULL;
		if (!tcp_filter(sk, skb)) {
			th = (const struct tcphdr *)skb->data;
			iph = ip_hdr(skb);
			tcp_v4_fill_cb(skb, iph, th);
			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
		}
		if (!nsk) {
			reqsk_put(req);
			if (req_stolen) {
				/* Another cpu got exclusive access to req
				 * and created a full blown socket.
				 * Try to feed this packet to this socket
				 * instead of discarding it.
				 */
				tcp_v4_restore_cb(skb);
				sock_put(sk);
				goto lookup;
			}
			goto discard_and_relse;
		}
		if (nsk == sk) {
			reqsk_put(req);
			tcp_v4_restore_cb(skb);
		} else if (tcp_child_process(sk, nsk, skb)) {
			tcp_v4_send_reset(nsk, skb);
			goto discard_and_relse;
		} else {
			sock_put(sk);
			return 0;
		}
	}
	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
		goto discard_and_relse;
	}

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
		goto discard_and_relse;

	if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))
		goto discard_and_relse;

	nf_reset_ct(skb);

	if (tcp_filter(sk, skb))
		goto discard_and_relse;
	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);
	tcp_v4_fill_cb(skb, iph, th);

	skb->dev = NULL;

	if (sk->sk_state == TCP_LISTEN) {
		ret = tcp_v4_do_rcv(sk, skb);
		goto put_and_return;
	}

	sk_incoming_cpu_update(sk);

	bh_lock_sock_nested(sk);
	tcp_segs_in(tcp_sk(sk), skb);
	ret = 0;
	if (!sock_owned_by_user(sk)) {
		skb_to_free = sk->sk_rx_skb_cache;
		sk->sk_rx_skb_cache = NULL;
		ret = tcp_v4_do_rcv(sk, skb);
	} else {
		if (tcp_add_backlog(sk, skb))
			goto discard_and_relse;
		skb_to_free = NULL;
	}
	bh_unlock_sock(sk);
	if (skb_to_free)
		__kfree_skb(skb_to_free);

put_and_return:
	if (refcounted)
		sock_put(sk);

	return ret;

no_tcp_socket:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	tcp_v4_fill_cb(skb, iph, th);

	if (tcp_checksum_complete(skb)) {
csum_error:
		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
bad_packet:
		__TCP_INC_STATS(net, TCP_MIB_INERRS);
	} else {
		tcp_v4_send_reset(NULL, skb);
	}

discard_it:
	/* Discard frame. */
	kfree_skb(skb);
	return 0;

discard_and_relse:
	sk_drops_add(sk, skb);
	if (refcounted)
		sock_put(sk);
	goto discard_it;

do_time_wait:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}

	tcp_v4_fill_cb(skb, iph, th);

	if (tcp_checksum_complete(skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto csum_error;
	}
	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
	case TCP_TW_SYN: {
		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
							&tcp_hashinfo, skb,
							__tcp_hdrlen(th),
							iph->saddr, th->source,
							iph->daddr, th->dest,
							inet_iif(skb),
							sdif);
		if (sk2) {
			inet_twsk_deschedule_put(inet_twsk(sk));
			sk = sk2;
			tcp_v4_restore_cb(skb);
			refcounted = false;
			goto process;
		}
	}
		/* to ACK */
		fallthrough;
	case TCP_TW_ACK:
		tcp_v4_timewait_ack(sk, skb);
		break;
	case TCP_TW_RST:
		tcp_v4_send_reset(sk, skb);
		inet_twsk_deschedule_put(inet_twsk(sk));
		goto discard_it;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}
static struct timewait_sock_ops tcp_timewait_sock_ops = {
	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
	.twsk_unique	= tcp_twsk_unique,
	.twsk_destructor= tcp_twsk_destructor,
};

void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	if (dst && dst_hold_safe(dst)) {
		rcu_assign_pointer(sk->sk_rx_dst, dst);
		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
	}
}
EXPORT_SYMBOL(inet_sk_rx_dst_set);
const struct inet_connection_sock_af_ops ipv4_specific = {
	.queue_xmit	   = ip_queue_xmit,
	.send_check	   = tcp_v4_send_check,
	.rebuild_header	   = inet_sk_rebuild_header,
	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
	.conn_request	   = tcp_v4_conn_request,
	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
	.net_header_len	   = sizeof(struct iphdr),
	.setsockopt	   = ip_setsockopt,
	.getsockopt	   = ip_getsockopt,
	.addr2sockaddr	   = inet_csk_addr2sockaddr,
	.sockaddr_len	   = sizeof(struct sockaddr_in),
	.mtu_reduced	   = tcp_v4_mtu_reduced,
};
EXPORT_SYMBOL(ipv4_specific);

#ifdef CONFIG_TCP_MD5SIG
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
	.md5_lookup	= tcp_v4_md5_lookup,
	.calc_md5_hash	= tcp_v4_md5_hash_skb,
	.md5_parse	= tcp_v4_parse_md5_keys,
};
#endif
/* NOTE: A lot of things set to zero explicitly by call to
 *       sk_alloc() so need not be done here.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	tcp_init_sock(sk);

	icsk->icsk_af_ops = &ipv4_specific;

#ifdef CONFIG_TCP_MD5SIG
	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
#endif

	return 0;
}
void tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	trace_tcp_destroy_sock(sk);

	tcp_clear_xmit_timers(sk);

	tcp_cleanup_congestion_control(sk);

	tcp_cleanup_ulp(sk);

	/* Cleanup up the write buffer. */
	tcp_write_queue_purge(sk);

	/* Check if we want to disable active TFO */
	tcp_fastopen_active_disable_ofo_check(sk);

	/* Cleans up our, hopefully empty, out_of_order_queue. */
	skb_rbtree_purge(&tp->out_of_order_queue);

#ifdef CONFIG_TCP_MD5SIG
	/* Clean up the MD5 key list, if any */
	if (tp->md5sig_info) {
		tcp_clear_md5_list(sk);
		kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
		tp->md5sig_info = NULL;
	}
#endif

	/* Clean up a referenced TCP bind bucket. */
	if (inet_csk(sk)->icsk_bind_hash)
		inet_put_port(sk);

	BUG_ON(rcu_access_pointer(tp->fastopen_rsk));

	/* If socket is aborted during connect operation */
	tcp_free_fastopen_req(tp);
	tcp_fastopen_destroy_cipher(sk);
	tcp_saved_syn_free(tp);

	sk_sockets_allocated_dec(sk);
}
EXPORT_SYMBOL(tcp_v4_destroy_sock);
#ifdef CONFIG_PROC_FS
/* Proc filesystem TCP sock list dumping. */

/*
 * Get the next listener socket after cur.  If cur is NULL, get the first
 * socket starting from bucket given in st->bucket; when st->bucket is zero
 * the very first socket in the hash table is returned.
 */
static void *listening_get_next(struct seq_file *seq, void *cur)
{
	struct tcp_seq_afinfo *afinfo;
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);
	struct inet_listen_hashbucket *ilb;
	struct hlist_nulls_node *node;
	struct sock *sk = cur;

	if (st->bpf_seq_afinfo)
		afinfo = st->bpf_seq_afinfo;
	else
		afinfo = PDE_DATA(file_inode(seq->file));

	if (!sk) {
get_head:
		ilb = &tcp_hashinfo.listening_hash[st->bucket];
		spin_lock(&ilb->lock);
		sk = sk_nulls_head(&ilb->nulls_head);
		st->offset = 0;
		goto get_sk;
	}
	ilb = &tcp_hashinfo.listening_hash[st->bucket];
	++st->num;
	++st->offset;

	sk = sk_nulls_next(sk);
get_sk:
	sk_nulls_for_each_from(sk, node) {
		if (!net_eq(sock_net(sk), net))
			continue;
		if (afinfo->family == AF_UNSPEC ||
		    sk->sk_family == afinfo->family)
			return sk;
	}
	spin_unlock(&ilb->lock);
	st->offset = 0;
	if (++st->bucket < INET_LHTABLE_SIZE)
		goto get_head;
	return NULL;
}
static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	st->offset = 0;
	rc = listening_get_next(seq, NULL);

	while (rc && *pos) {
		rc = listening_get_next(seq, rc);
		--*pos;
	}
	return rc;
}

static inline bool empty_bucket(const struct tcp_iter_state *st)
{
	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
}
/*
 * Get first established socket starting from bucket given in st->bucket.
 * If st->bucket is zero, the very first socket in the hash is returned.
 */
static void *established_get_first(struct seq_file *seq)
{
	struct tcp_seq_afinfo *afinfo;
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);
	void *rc = NULL;

	if (st->bpf_seq_afinfo)
		afinfo = st->bpf_seq_afinfo;
	else
		afinfo = PDE_DATA(file_inode(seq->file));

	st->offset = 0;
	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
		struct sock *sk;
		struct hlist_nulls_node *node;
		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);

		/* Lockless fast path for the common case of empty buckets */
		if (empty_bucket(st))
			continue;

		spin_lock_bh(lock);
		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
			if ((afinfo->family != AF_UNSPEC &&
			     sk->sk_family != afinfo->family) ||
			    !net_eq(sock_net(sk), net)) {
				continue;
			}
			rc = sk;
			goto out;
		}
		spin_unlock_bh(lock);
	}
out:
	return rc;
}
static void *established_get_next(struct seq_file *seq, void *cur)
{
	struct tcp_seq_afinfo *afinfo;
	struct sock *sk = cur;
	struct hlist_nulls_node *node;
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);

	if (st->bpf_seq_afinfo)
		afinfo = st->bpf_seq_afinfo;
	else
		afinfo = PDE_DATA(file_inode(seq->file));

	++st->num;
	++st->offset;

	sk = sk_nulls_next(sk);

	sk_nulls_for_each_from(sk, node) {
		if ((afinfo->family == AF_UNSPEC ||
		     sk->sk_family == afinfo->family) &&
		    net_eq(sock_net(sk), net))
			return sk;
	}

	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
	++st->bucket;
	return established_get_first(seq);
}
static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	rc = established_get_first(seq);

	while (rc && pos) {
		rc = established_get_next(seq, rc);
		--pos;
	}
	return rc;
}
static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc;
	struct tcp_iter_state *st = seq->private;

	st->state = TCP_SEQ_STATE_LISTENING;
	rc = listening_get_idx(seq, &pos);

	if (!rc) {
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		rc = established_get_idx(seq, pos);
	}

	return rc;
}
static void *tcp_seek_last_pos(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	int bucket = st->bucket;
	int offset = st->offset;
	int orig_num = st->num;
	void *rc = NULL;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (st->bucket >= INET_LHTABLE_SIZE)
			break;
		st->state = TCP_SEQ_STATE_LISTENING;
		rc = listening_get_next(seq, NULL);
		while (offset-- && rc && bucket == st->bucket)
			rc = listening_get_next(seq, rc);
		if (rc)
			break;
		st->bucket = 0;
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		fallthrough;
	case TCP_SEQ_STATE_ESTABLISHED:
		if (st->bucket > tcp_hashinfo.ehash_mask)
			break;
		rc = established_get_first(seq);
		while (offset-- && rc && bucket == st->bucket)
			rc = established_get_next(seq, rc);
	}

	st->num = orig_num;

	return rc;
}
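
/*
 * tcp_seq_start() below uses st->last_pos to detect sequential reads:
 * when a read resumes at the *pos where the previous chunk ended, the
 * iterator is rewound with tcp_seek_last_pos() from the saved bucket
 * and offset instead of walking every bucket again from the start.
 */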
void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	if (*pos && *pos == st->last_pos) {
		rc = tcp_seek_last_pos(seq);
		if (rc)
			goto out;
	}

	st->state = TCP_SEQ_STATE_LISTENING;
	st->num = 0;
	st->bucket = 0;
	st->offset = 0;
	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;

out:
	st->last_pos = *pos;
	return rc;
}
EXPORT_SYMBOL(tcp_seq_start);
void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc = NULL;

	if (v == SEQ_START_TOKEN) {
		rc = tcp_get_idx(seq, 0);
		goto out;
	}

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
		if (!rc) {
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			st->bucket = 0;
			st->offset = 0;
			rc = established_get_first(seq);
		}
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		rc = established_get_next(seq, v);
		break;
	}
out:
	++*pos;
	st->last_pos = *pos;
	return rc;
}
EXPORT_SYMBOL(tcp_seq_next);
void tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		if (v)
			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
		break;
	}
}
EXPORT_SYMBOL(tcp_seq_stop);
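
/*
 * Note the locking hand-off across the ->start/->next/->stop callbacks
 * above: listening_get_next() and established_get_first() return with
 * the current bucket lock held, tcp_seq_next() keeps it held while
 * walking within a bucket (dropping it only when it advances to the
 * next bucket), and tcp_seq_stop() releases whichever bucket lock the
 * iterator still owns when the read ends.
 */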
static void get_openreq4(const struct request_sock *req,
			 struct seq_file *f, int i)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	long delta = req->rsk_timer.expires - jiffies;

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
		i,
		ireq->ir_loc_addr,
		ireq->ir_num,
		ireq->ir_rmt_addr,
		ntohs(ireq->ir_rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af dependent. */
		1,    /* timers active (only the expire timer) */
		jiffies_delta_to_clock_t(delta),
		req->num_timeout,
		from_kuid_munged(seq_user_ns(f),
				 sock_i_uid(req->rsk_listener)),
		0,  /* non standard timer */
		0, /* open_requests have no inode */
		0,
		req);
}
static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
{
	int timer_active;
	unsigned long timer_expires;
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	const struct inet_sock *inet = inet_sk(sk);
	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
	__be32 dest = inet->inet_daddr;
	__be32 src = inet->inet_rcv_saddr;
	__u16 destp = ntohs(inet->inet_dport);
	__u16 srcp = ntohs(inet->inet_sport);
	int rx_queue;
	int state;

	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
		timer_active	= 1;
		timer_expires	= icsk->icsk_timeout;
	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
		timer_active	= 4;
		timer_expires	= icsk->icsk_timeout;
	} else if (timer_pending(&sk->sk_timer)) {
		timer_active	= 2;
		timer_expires	= sk->sk_timer.expires;
	} else {
		timer_active	= 0;
		timer_expires = jiffies;
	}

	state = inet_sk_state_load(sk);
	if (state == TCP_LISTEN)
		rx_queue = READ_ONCE(sk->sk_ack_backlog);
	else
		/* Because we don't lock the socket,
		 * we might find a transient negative value.
		 */
		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
				      READ_ONCE(tp->copied_seq), 0);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
		i, src, srcp, dest, destp, state,
		READ_ONCE(tp->write_seq) - tp->snd_una,
		rx_queue,
		timer_active,
		jiffies_delta_to_clock_t(timer_expires - jiffies),
		icsk->icsk_retransmits,
		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
		icsk->icsk_probes_out,
		sock_i_ino(sk),
		refcount_read(&sk->sk_refcnt), sk,
		jiffies_to_clock_t(icsk->icsk_rto),
		jiffies_to_clock_t(icsk->icsk_ack.ato),
		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
		tp->snd_cwnd,
		state == TCP_LISTEN ?
		    fastopenq->max_qlen :
		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
}
static void get_timewait4_sock(const struct inet_timewait_sock *tw,
			       struct seq_file *f, int i)
{
	long delta = tw->tw_timer.expires - jiffies;
	__be32 dest, src;
	__u16 destp, srcp;

	dest  = tw->tw_daddr;
	src   = tw->tw_rcv_saddr;
	destp = ntohs(tw->tw_dport);
	srcp  = ntohs(tw->tw_sport);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
		refcount_read(&tw->tw_refcnt), tw);
}
#define TMPSZ 150

static int tcp4_seq_show(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st;
	struct sock *sk = v;

	seq_setwidth(seq, TMPSZ - 1);
	if (v == SEQ_START_TOKEN) {
		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
			   "rx_queue tr tm->when retrnsmt   uid  timeout "
			   "inode");
		goto out;
	}
	st = seq->private;

	if (sk->sk_state == TCP_TIME_WAIT)
		get_timewait4_sock(v, seq, st->num);
	else if (sk->sk_state == TCP_NEW_SYN_RECV)
		get_openreq4(v, seq, st->num);
	else
		get_tcp4_sock(v, seq, st->num);
out:
	seq_pad(seq, '\n');
	return 0;
}
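
/*
 * For reference, tcp4_seq_show() produces /proc/net/tcp records like the
 * made-up line below. Addresses are the raw __be32 printed with %08X (so
 * the byte order shown depends on the host); ports and the state field
 * are hex, with 0A == TCP_LISTEN:
 *
 *	0: 0100007F:0016 00000000:0000 0A 00000000:00000000 00:00000000 ...
 *
 * i.e. on a little-endian host, a listener bound to 127.0.0.1:22.
 */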
#ifdef CONFIG_BPF_SYSCALL
struct bpf_iter__tcp {
	__bpf_md_ptr(struct bpf_iter_meta *, meta);
	__bpf_md_ptr(struct sock_common *, sk_common);
	uid_t uid __aligned(8);
};

static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
			     struct sock_common *sk_common, uid_t uid)
{
	struct bpf_iter__tcp ctx;

	meta->seq_num--;  /* skip SEQ_START_TOKEN */
	ctx.meta = meta;
	ctx.sk_common = sk_common;
	ctx.uid = uid;
	return bpf_iter_run_prog(prog, &ctx);
}
static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
{
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;
	struct sock *sk = v;
	uid_t uid;

	if (v == SEQ_START_TOKEN)
		return 0;

	if (sk->sk_state == TCP_TIME_WAIT) {
		uid = 0;
	} else if (sk->sk_state == TCP_NEW_SYN_RECV) {
		const struct request_sock *req = v;

		uid = from_kuid_munged(seq_user_ns(seq),
				       sock_i_uid(req->rsk_listener));
	} else {
		uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
	}

	meta.seq = seq;
	prog = bpf_iter_get_info(&meta, false);
	return tcp_prog_seq_show(prog, &meta, v, uid);
}
static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;

	if (!v) {
		meta.seq = seq;
		prog = bpf_iter_get_info(&meta, true);
		if (prog)
			(void)tcp_prog_seq_show(prog, &meta, v, 0);
	}

	tcp_seq_stop(seq, v);
}
static const struct seq_operations bpf_iter_tcp_seq_ops = {
	.show		= bpf_iter_tcp_seq_show,
	.start		= tcp_seq_start,
	.next		= tcp_seq_next,
	.stop		= bpf_iter_tcp_seq_stop,
};
#endif
static const struct seq_operations tcp4_seq_ops = {
	.show		= tcp4_seq_show,
	.start		= tcp_seq_start,
	.next		= tcp_seq_next,
	.stop		= tcp_seq_stop,
};

static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.family		= AF_INET,
};
static int __net_init tcp4_proc_init_net(struct net *net)
{
	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
		return -ENOMEM;
	return 0;
}

static void __net_exit tcp4_proc_exit_net(struct net *net)
{
	remove_proc_entry("tcp", net->proc_net);
}

static struct pernet_operations tcp4_net_ops = {
	.init = tcp4_proc_init_net,
	.exit = tcp4_proc_exit_net,
};

int __init tcp4_proc_init(void)
{
	return register_pernet_subsys(&tcp4_net_ops);
}

void tcp4_proc_exit(void)
{
	unregister_pernet_subsys(&tcp4_net_ops);
}
#endif /* CONFIG_PROC_FS */
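
/*
 * Example (a sketch, not part of this file): consuming the per-netns
 * "tcp" proc entry registered above from user space. The field layout
 * follows the tcp4_seq_show() format string; only the leading fields
 * are parsed here:
 *
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		char line[256];
 *		unsigned int laddr, lport, raddr, rport, state;
 *		FILE *f = fopen("/proc/net/tcp", "r");
 *
 *		if (!f)
 *			return 1;
 *		fgets(line, sizeof(line), f);	// skip the header line
 *		while (fgets(line, sizeof(line), f)) {
 *			if (sscanf(line, " %*d: %8X:%4X %8X:%4X %2X",
 *				   &laddr, &lport, &raddr, &rport,
 *				   &state) == 5)
 *				printf("state %#x local port %u\n",
 *				       state, lport);
 *		}
 *		fclose(f);
 *		return 0;
 *	}
 */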
struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.pre_connect		= tcp_v4_pre_connect,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.keepalive		= tcp_set_keepalive,
	.recvmsg		= tcp_recvmsg,
	.sendmsg		= tcp_sendmsg,
	.sendpage		= tcp_sendpage,
	.backlog_rcv		= tcp_v4_do_rcv,
	.release_cb		= tcp_release_cb,
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.leave_memory_pressure	= tcp_leave_memory_pressure,
	.stream_memory_free	= tcp_stream_memory_free,
	.sockets_allocated	= &tcp_sockets_allocated,
	.orphan_count		= &tcp_orphan_count,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
	.h.hashinfo		= &tcp_hashinfo,
	.no_autobind		= true,
	.diag_destroy		= tcp_abort,
};
EXPORT_SYMBOL(tcp_prot);
static void __net_exit tcp_sk_exit(struct net *net)
{
	int cpu;

	if (net->ipv4.tcp_congestion_control)
		bpf_module_put(net->ipv4.tcp_congestion_control,
			       net->ipv4.tcp_congestion_control->owner);

	for_each_possible_cpu(cpu)
		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
	free_percpu(net->ipv4.tcp_sk);
}
static int __net_init tcp_sk_init(struct net *net)
{
	int res, cpu, cnt;

	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
	if (!net->ipv4.tcp_sk)
		return -ENOMEM;

	for_each_possible_cpu(cpu) {
		struct sock *sk;

		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
					   IPPROTO_TCP, net);
		if (res)
			goto fail;
		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

		/* Enforce IP_DF and IPID==0 for RST and ACK packets
		 * sent in SYN-RECV and TIME-WAIT state.
		 */
		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;

		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
	}

	net->ipv4.sysctl_tcp_ecn = 2;
	net->ipv4.sysctl_tcp_ecn_fallback = 1;

	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;

	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;

	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
	net->ipv4.sysctl_tcp_syncookies = 1;
	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
	net->ipv4.sysctl_tcp_orphan_retries = 0;
	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
	net->ipv4.sysctl_tcp_tw_reuse = 2;
	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;

	cnt = tcp_hashinfo.ehash_mask + 1;
	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
	net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;

	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
	net->ipv4.sysctl_tcp_sack = 1;
	net->ipv4.sysctl_tcp_window_scaling = 1;
	net->ipv4.sysctl_tcp_timestamps = 1;
	net->ipv4.sysctl_tcp_early_retrans = 3;
	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */
	net->ipv4.sysctl_tcp_retrans_collapse = 1;
	net->ipv4.sysctl_tcp_max_reordering = 300;
	net->ipv4.sysctl_tcp_dsack = 1;
	net->ipv4.sysctl_tcp_app_win = 31;
	net->ipv4.sysctl_tcp_adv_win_scale = 1;
	net->ipv4.sysctl_tcp_frto = 2;
	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
	/* This limits the percentage of the congestion window which we
	 * will allow a single TSO frame to consume.  Building TSO frames
	 * which are too large can cause TCP streams to be bursty.
	 */
	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
	/* Default TSQ limit of 16 TSO segments */
	net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
	/* rfc5961 challenge ack rate limiting */
	net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
	net->ipv4.sysctl_tcp_min_tso_segs = 2;
	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
	net->ipv4.sysctl_tcp_autocorking = 1;
	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
	if (net != &init_net) {
		memcpy(net->ipv4.sysctl_tcp_rmem,
		       init_net.ipv4.sysctl_tcp_rmem,
		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
		memcpy(net->ipv4.sysctl_tcp_wmem,
		       init_net.ipv4.sysctl_tcp_wmem,
		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
	}
	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
	net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
	spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
	atomic_set(&net->ipv4.tfo_active_disable_times, 0);

	/* Reno is always built in */
	if (!net_eq(net, &init_net) &&
	    bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
			       init_net.ipv4.tcp_congestion_control->owner))
		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
	else
		net->ipv4.tcp_congestion_control = &tcp_reno;

	return 0;
fail:
	tcp_sk_exit(net);

	return res;
}
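
/*
 * Example (a sketch, not part of this file): the keepalive defaults
 * initialised above apply per network namespace; an individual socket
 * can override them from user space with the standard TCP socket
 * options. The helper name is illustrative only:
 *
 *	#include <netinet/in.h>
 *	#include <netinet/tcp.h>
 *	#include <sys/socket.h>
 *
 *	static int tune_keepalive(int fd)
 *	{
 *		int on = 1, idle = 60, intvl = 10, cnt = 5;
 *
 *		if (setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE,
 *			       &on, sizeof(on)))
 *			return -1;
 *		// Override sysctl_tcp_keepalive_time/_intvl/_probes
 *		// for this socket only.
 *		if (setsockopt(fd, IPPROTO_TCP, TCP_KEEPIDLE,
 *			       &idle, sizeof(idle)))
 *			return -1;
 *		if (setsockopt(fd, IPPROTO_TCP, TCP_KEEPINTVL,
 *			       &intvl, sizeof(intvl)))
 *			return -1;
 *		return setsockopt(fd, IPPROTO_TCP, TCP_KEEPCNT,
 *				  &cnt, sizeof(cnt));
 *	}
 */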
static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
{
	struct net *net;

	inet_twsk_purge(&tcp_hashinfo, AF_INET);

	list_for_each_entry(net, net_exit_list, exit_list)
		tcp_fastopen_ctx_destroy(net);
}
static struct pernet_operations __net_initdata tcp_sk_ops = {
	.init	    = tcp_sk_init,
	.exit	    = tcp_sk_exit,
	.exit_batch = tcp_sk_exit_batch,
};
#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
		     struct sock_common *sk_common, uid_t uid)

static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
{
	struct tcp_iter_state *st = priv_data;
	struct tcp_seq_afinfo *afinfo;
	int ret;

	afinfo = kmalloc(sizeof(*afinfo), GFP_USER | __GFP_NOWARN);
	if (!afinfo)
		return -ENOMEM;

	afinfo->family = AF_UNSPEC;
	st->bpf_seq_afinfo = afinfo;
	ret = bpf_iter_init_seq_net(priv_data, aux);
	if (ret)
		kfree(afinfo);
	return ret;
}

static void bpf_iter_fini_tcp(void *priv_data)
{
	struct tcp_iter_state *st = priv_data;

	kfree(st->bpf_seq_afinfo);
	bpf_iter_fini_seq_net(priv_data);
}
static const struct bpf_iter_seq_info tcp_seq_info = {
	.seq_ops		= &bpf_iter_tcp_seq_ops,
	.init_seq_private	= bpf_iter_init_tcp,
	.fini_seq_private	= bpf_iter_fini_tcp,
	.seq_priv_size		= sizeof(struct tcp_iter_state),
};

static struct bpf_iter_reg tcp_reg_info = {
	.target			= "tcp",
	.ctx_arg_info_size	= 1,
	.ctx_arg_info		= {
		{ offsetof(struct bpf_iter__tcp, sk_common),
		  PTR_TO_BTF_ID_OR_NULL },
	},
	.seq_info		= &tcp_seq_info,
};
static void __init bpf_iter_register(void)
{
	tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
	if (bpf_iter_reg_target(&tcp_reg_info))
		pr_warn("Warning: could not register bpf iterator tcp\n");
}
#endif
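
/*
 * Example (a sketch, not part of this file): once the "tcp" target is
 * registered above, a BPF iterator program can be attached to it. The
 * program and section names are illustrative; BPF_SEQ_PRINTF() comes
 * from libbpf's bpf_tracing.h (or the selftests' bpf_iter.h, depending
 * on version):
 *
 *	SEC("iter/tcp")
 *	int dump_tcp(struct bpf_iter__tcp *ctx)
 *	{
 *		struct sock_common *sk_common = ctx->sk_common;
 *		struct seq_file *seq = ctx->meta->seq;
 *
 *		// sk_common is NULL on the end-of-iteration call.
 *		if (!sk_common)
 *			return 0;
 *		BPF_SEQ_PRINTF(seq, "family %d uid %u\n",
 *			       sk_common->skc_family, ctx->uid);
 *		return 0;
 *	}
 *
 * Reading the pinned iterator link then replays tcp_seq_start() and
 * tcp_seq_next(), with bpf_iter_tcp_seq_show() invoking the program
 * once per socket.
 */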
void __init tcp_v4_init(void)
{
	if (register_pernet_subsys(&tcp_sk_ops))
		panic("Failed to create the TCP control socket.\n");

#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
	bpf_iter_register();
#endif
}