1 // SPDX-License-Identifier: GPL-2.0-or-later
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
7 * Implementation of the Transmission Control Protocol(TCP).
9 * IPv4 specific functions
13 * linux/ipv4/tcp_input.c
14 * linux/ipv4/tcp_output.c
16 * See tcp.c for author information
21 * David S. Miller : New socket lookup architecture.
22 * This code is dedicated to John Dyson.
23 * David S. Miller : Change semantics of established hash,
24 * half is devoted to TIME_WAIT sockets
25 * and the rest go in the other half.
26 * Andi Kleen : Add support for syncookies and fixed
27 * some bugs: ip options weren't passed to
28 * the TCP layer, missed a check for an
29 * ACK bit.
30 * Andi Kleen : Implemented fast path mtu discovery.
31 * Fixed many serious bugs in the
32 * request_sock handling and moved
33 * most of it into the af independent code.
34 * Added tail drop and some other bugfixes.
35 * Added new listen semantics.
36 * Mike McLagan : Routing by source
37 * Juan Jose Ciarlante: ip_dynaddr bits
38 * Andi Kleen: various fixes.
39 * Vitaly E. Lavrov : Transparent proxy revived after year
40 * coma.
41 * Andi Kleen : Fix new listen.
42 * Andi Kleen : Fix accept error reporting.
43 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
44 * Alexey Kuznetsov allows both IPv4 and IPv6 sockets to bind
45 * a single port at the same time.
48 #define pr_fmt(fmt) "TCP: " fmt
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
61 #include <net/net_namespace.h>
63 #include <net/inet_hashtables.h>
65 #include <net/transp_v6.h>
67 #include <net/inet_common.h>
68 #include <net/timewait_sock.h>
70 #include <net/secure_seq.h>
71 #include <net/busy_poll.h>
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/inetdevice.h>
80 #include <crypto/hash.h>
81 #include <linux/scatterlist.h>
83 #include <trace/events/tcp.h>
85 #ifdef CONFIG_TCP_MD5SIG
86 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
87 __be32 daddr, __be32 saddr, const struct tcphdr *th);
90 struct inet_hashinfo tcp_hashinfo;
91 EXPORT_SYMBOL(tcp_hashinfo);
93 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
95 return secure_tcp_seq(ip_hdr(skb)->daddr,
96 ip_hdr(skb)->saddr,
97 tcp_hdr(skb)->dest,
98 tcp_hdr(skb)->source);
101 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
103 return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
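/*
 * Editorial sketch (not part of this file): a simplified, user-space model of
 * what secure_tcp_seq()/secure_tcp_ts_off() provide - an RFC 6528 style ISN
 * built from a keyed hash of the 4-tuple plus a coarse clock.  The helper
 * names and the toy mixing function below are ours; the kernel actually uses
 * siphash keyed with a boot-time secret.
 */
#if 0
#include <stdint.h>
#include <time.h>

static uint32_t toy_tuple_hash(uint32_t saddr, uint32_t daddr,
			       uint16_t sport, uint16_t dport, uint64_t secret)
{
	/* Placeholder mix, standing in for the kernel's siphash. */
	uint64_t x = secret ^ (((uint64_t)saddr << 32) | daddr);

	x ^= ((uint64_t)sport << 16) | dport;
	x *= 0x9e3779b97f4a7c15ULL;
	return (uint32_t)(x >> 32);
}

static uint32_t toy_isn(uint32_t saddr, uint32_t daddr,
			uint16_t sport, uint16_t dport, uint64_t secret)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	/* tuple hash plus a clock component so the sequence space keeps moving */
	return toy_tuple_hash(saddr, daddr, sport, dport, secret) +
	       (uint32_t)(((uint64_t)ts.tv_sec * 1000000000ULL + ts.tv_nsec) >> 6);
}
#endif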
106 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
108 int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
109 const struct inet_timewait_sock *tw = inet_twsk(sktw);
110 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
111 struct tcp_sock *tp = tcp_sk(sk);
114 /* Still does not detect *everything* that goes through
115 * lo, since we require a loopback src or dst address
116 * or direct binding to 'lo' interface.
118 bool loopback = false;
119 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
120 loopback = true;
121 #if IS_ENABLED(CONFIG_IPV6)
122 if (tw->tw_family == AF_INET6) {
123 if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
124 (ipv6_addr_v4mapped(&tw->tw_v6_daddr) &&
125 (tw->tw_v6_daddr.s6_addr[12] == 127)) ||
126 ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
127 (ipv6_addr_v4mapped(&tw->tw_v6_rcv_saddr) &&
128 (tw->tw_v6_rcv_saddr.s6_addr[12] == 127)))
133 if (ipv4_is_loopback(tw->tw_daddr) ||
134 ipv4_is_loopback(tw->tw_rcv_saddr))
141 /* With PAWS, it is safe from the viewpoint
142 of data integrity. Even without PAWS it is safe provided sequence
143 spaces do not overlap i.e. at data rates <= 80Mbit/sec.
145 Actually, the idea is close to VJ's one, only timestamp cache is
146 held not per host, but per port pair and TW bucket is used as state
147 holder.
149 If TW bucket has been already destroyed we fall back to VJ's scheme
150 and use initial timestamp retrieved from peer table.
152 if (tcptw->tw_ts_recent_stamp &&
153 (!twp || (reuse && time_after32(ktime_get_seconds(),
154 tcptw->tw_ts_recent_stamp)))) {
155 /* In case of repair and re-using TIME-WAIT sockets we still
156 * want to be sure that it is safe as above but honor the
157 * sequence numbers and time stamps set as part of the repair
160 * Without this check re-using a TIME-WAIT socket with TCP
161 * repair would accumulate a -1 on the repair assigned
162 * sequence number. The first time it is reused the sequence
163 * is -1, the second time -2, etc. This fixes that issue
164 * without appearing to create any others.
166 if (likely(!tp->repair)) {
167 u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
169 if (!seq)
170 seq = 1;
171 WRITE_ONCE(tp->write_seq, seq);
172 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
173 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
181 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
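/*
 * Editorial sketch (not kernel code): the effect of the write_seq update in
 * tcp_twsk_unique() above.  Jumping at least 65535 + 2 past tw_snd_nxt keeps
 * the new connection's SYN outside the previous incarnation's receive window,
 * and 0 is avoided because write_seq == 0 means "not yet chosen".
 */
#if 0
#include <stdint.h>

static uint32_t pick_reuse_isn(uint32_t tw_snd_nxt)
{
	uint32_t seq = tw_snd_nxt + 65535 + 2;

	return seq ? seq : 1;
}
#endif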
183 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
186 /* This check is replicated from tcp_v4_connect() and intended to
187 * prevent BPF program called below from accessing bytes that are out
188 * of the bound specified by user in addr_len.
190 if (addr_len < sizeof(struct sockaddr_in))
193 sock_owned_by_me(sk);
195 return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
198 /* This will initiate an outgoing connection. */
199 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
201 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
202 struct inet_sock *inet = inet_sk(sk);
203 struct tcp_sock *tp = tcp_sk(sk);
204 __be16 orig_sport, orig_dport;
205 __be32 daddr, nexthop;
209 struct ip_options_rcu *inet_opt;
210 struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
212 if (addr_len < sizeof(struct sockaddr_in))
215 if (usin->sin_family != AF_INET)
216 return -EAFNOSUPPORT;
218 nexthop = daddr = usin->sin_addr.s_addr;
219 inet_opt = rcu_dereference_protected(inet->inet_opt,
220 lockdep_sock_is_held(sk));
221 if (inet_opt && inet_opt->opt.srr) {
224 nexthop = inet_opt->opt.faddr;
227 orig_sport = inet->inet_sport;
228 orig_dport = usin->sin_port;
229 fl4 = &inet->cork.fl.u.ip4;
230 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
231 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
232 IPPROTO_TCP,
233 orig_sport, orig_dport, sk);
236 if (err == -ENETUNREACH)
237 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
241 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
246 if (!inet_opt || !inet_opt->opt.srr)
247 daddr = fl4->daddr;
249 if (!inet->inet_saddr)
250 inet->inet_saddr = fl4->saddr;
251 sk_rcv_saddr_set(sk, inet->inet_saddr);
253 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
254 /* Reset inherited state */
255 tp->rx_opt.ts_recent = 0;
256 tp->rx_opt.ts_recent_stamp = 0;
257 if (likely(!tp->repair))
258 WRITE_ONCE(tp->write_seq, 0);
261 inet->inet_dport = usin->sin_port;
262 sk_daddr_set(sk, daddr);
264 inet_csk(sk)->icsk_ext_hdr_len = 0;
265 if (inet_opt)
266 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
268 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
270 /* Socket identity is still unknown (sport may be zero).
271 * However we set state to SYN-SENT and, without releasing the socket
272 * lock, select a source port, enter ourselves into the hash tables and
273 * complete initialization after this.
275 tcp_set_state(sk, TCP_SYN_SENT);
276 err = inet_hash_connect(tcp_death_row, sk);
282 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
283 inet->inet_sport, inet->inet_dport, sk);
289 /* OK, now commit destination to socket. */
290 sk->sk_gso_type = SKB_GSO_TCPV4;
291 sk_setup_caps(sk, &rt->dst);
294 if (likely(!tp->repair)) {
296 WRITE_ONCE(tp->write_seq,
297 secure_tcp_seq(inet->inet_saddr,
298 inet->inet_daddr,
299 inet->inet_sport,
300 usin->sin_port));
301 tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
302 inet->inet_saddr,
303 inet->inet_daddr);
306 inet->inet_id = prandom_u32();
308 if (tcp_fastopen_defer_connect(sk, &err))
313 err = tcp_connect(sk);
322 * This unhashes the socket and releases the local port,
325 tcp_set_state(sk, TCP_CLOSE);
326 if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
327 inet_reset_saddr(sk);
329 sk->sk_route_caps = 0;
330 inet->inet_dport = 0;
333 EXPORT_SYMBOL(tcp_v4_connect);
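/*
 * For context, a minimal user-space caller of the path above (sketch only):
 * connect() on an AF_INET stream socket is what ends up in tcp_v4_connect().
 * Error handling is trimmed and the helper name is ours.
 */
#if 0
#include <stdint.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <unistd.h>

static int toy_tcp_connect(const char *ip, uint16_t port)
{
	struct sockaddr_in sin = { .sin_family = AF_INET };
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd < 0)
		return -1;
	sin.sin_port = htons(port);
	if (inet_pton(AF_INET, ip, &sin.sin_addr) != 1 ||
	    connect(fd, (struct sockaddr *)&sin, sizeof(sin)) < 0) {
		close(fd);
		return -1;
	}
	return fd;	/* caller close()s when done */
}
#endif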
336 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
337 * It can be called through tcp_release_cb() if socket was owned by user
338 * at the time tcp_v4_err() was called to handle ICMP message.
340 void tcp_v4_mtu_reduced(struct sock *sk)
342 struct inet_sock *inet = inet_sk(sk);
343 struct dst_entry *dst;
346 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
347 return;
348 mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
349 dst = inet_csk_update_pmtu(sk, mtu);
353 /* Something is about to be wrong... Remember soft error
354 * for the case that this connection will not be able to recover.
356 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
357 sk->sk_err_soft = EMSGSIZE;
361 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
362 ip_sk_accept_pmtu(sk) &&
363 inet_csk(sk)->icsk_pmtu_cookie > mtu) {
364 tcp_sync_mss(sk, mtu);
366 /* Resend the TCP packet because it's
367 * clear that the old packet has been
368 * dropped. This is the new "fast" path mtu
371 tcp_simple_retransmit(sk);
372 } /* else let the usual retransmit timer handle it */
374 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
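/*
 * Related user-space view (sketch only): once the kernel has processed an
 * ICMP_FRAG_NEEDED and lowered the path MTU as above, a connected socket can
 * read the current value back with getsockopt(IP_MTU).
 */
#if 0
#include <netinet/in.h>
#include <sys/socket.h>

static int query_path_mtu(int connected_fd)
{
	int mtu = 0;
	socklen_t len = sizeof(mtu);

	if (getsockopt(connected_fd, IPPROTO_IP, IP_MTU, &mtu, &len) < 0)
		return -1;
	return mtu;	/* e.g. 1500, or less after a PMTU reduction */
}
#endif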
376 static void do_redirect(struct sk_buff *skb, struct sock *sk)
378 struct dst_entry *dst = __sk_dst_check(sk, 0);
380 if (dst)
381 dst->ops->redirect(dst, sk, skb);
385 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
386 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
388 struct request_sock *req = inet_reqsk(sk);
389 struct net *net = sock_net(sk);
391 /* ICMPs are not backlogged, hence we cannot get
392 * an established socket here.
394 if (seq != tcp_rsk(req)->snt_isn) {
395 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
398 * Still in SYN_RECV, just remove it silently.
399 * There is no good way to pass the error to the newly
400 * created socket, and POSIX does not want network
401 * errors returned from accept().
403 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
404 tcp_listendrop(req->rsk_listener);
408 EXPORT_SYMBOL(tcp_req_err);
411 * This routine is called by the ICMP module when it gets some
412 * sort of error condition. If err < 0 then the socket should
413 * be closed and the error returned to the user. If err > 0
414 * it's just the icmp type << 8 | icmp code. After adjustment
415 * header points to the first 8 bytes of the tcp header. We need
416 * to find the appropriate port.
418 * The locking strategy used here is very "optimistic". When
419 * someone else accesses the socket the ICMP is just dropped
420 * and for some paths there is no check at all.
421 * A more general error queue to queue errors for later handling
422 * is probably better.
426 int tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
428 const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
429 struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
430 struct inet_connection_sock *icsk;
432 struct inet_sock *inet;
433 const int type = icmp_hdr(icmp_skb)->type;
434 const int code = icmp_hdr(icmp_skb)->code;
437 struct request_sock *fastopen;
442 struct net *net = dev_net(icmp_skb->dev);
444 sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
445 th->dest, iph->saddr, ntohs(th->source),
446 inet_iif(icmp_skb), 0);
448 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
451 if (sk->sk_state == TCP_TIME_WAIT) {
452 inet_twsk_put(inet_twsk(sk));
455 seq = ntohl(th->seq);
456 if (sk->sk_state == TCP_NEW_SYN_RECV) {
457 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
458 type == ICMP_TIME_EXCEEDED ||
459 (type == ICMP_DEST_UNREACH &&
460 (code == ICMP_NET_UNREACH ||
461 code == ICMP_HOST_UNREACH)));
466 /* If too many ICMPs get dropped on busy
467 * servers this needs to be solved differently.
468 * We do take care of PMTU discovery (RFC1191) special case :
469 * we can receive locally generated ICMP messages while socket is held.
471 if (sock_owned_by_user(sk)) {
472 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
473 __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
475 if (sk->sk_state == TCP_CLOSE)
478 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
479 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
485 /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
486 fastopen = rcu_dereference(tp->fastopen_rsk);
487 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
488 if (sk->sk_state != TCP_LISTEN &&
489 !between(seq, snd_una, tp->snd_nxt)) {
490 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
496 if (!sock_owned_by_user(sk))
497 do_redirect(icmp_skb, sk);
499 case ICMP_SOURCE_QUENCH:
500 /* Just silently ignore these. */
502 case ICMP_PARAMETERPROB:
505 case ICMP_DEST_UNREACH:
506 if (code > NR_ICMP_UNREACH)
509 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
510 /* We are not interested in TCP_LISTEN and open_requests
511 * (SYN-ACKs sent out by Linux are always < 576 bytes so
512 * they should go through unfragmented).
514 if (sk->sk_state == TCP_LISTEN)
517 WRITE_ONCE(tp->mtu_info, info);
518 if (!sock_owned_by_user(sk)) {
519 tcp_v4_mtu_reduced(sk);
521 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
527 err = icmp_err_convert[code].errno;
528 /* check if icmp_skb allows revert of backoff
529 * (see draft-zimmermann-tcp-lcd) */
530 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
532 if (seq != tp->snd_una || !icsk->icsk_retransmits ||
533 !icsk->icsk_backoff || fastopen)
536 if (sock_owned_by_user(sk))
539 skb = tcp_rtx_queue_head(sk);
540 if (WARN_ON_ONCE(!skb))
543 icsk->icsk_backoff--;
544 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
545 TCP_TIMEOUT_INIT;
546 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
549 tcp_mstamp_refresh(tp);
550 delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
551 remaining = icsk->icsk_rto -
552 usecs_to_jiffies(delta_us);
554 if (remaining > 0) {
555 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
556 remaining, TCP_RTO_MAX);
558 /* RTO revert clocked out retransmission.
559 * Will retransmit now */
560 tcp_retransmit_timer(sk);
564 case ICMP_TIME_EXCEEDED:
571 switch (sk->sk_state) {
574 /* Only in fast or simultaneous open. If a fast open socket is
575 * already accepted it is treated as a connected one below.
577 if (fastopen && !fastopen->sk)
580 if (!sock_owned_by_user(sk)) {
583 sk->sk_error_report(sk);
587 sk->sk_err_soft = err;
592 /* If we've already connected we will keep trying
593 * until we time out, or the user gives up.
595 * rfc1122 4.2.3.9 allows us to consider as hard errors
596 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
597 * but it is obsoleted by pmtu discovery).
599 * Note, that in modern internet, where routing is unreliable
600 * and in each dark corner broken firewalls sit, sending random
601 * errors ordered by their masters, even these two messages finally lose
602 * their original sense (even Linux sends invalid PORT_UNREACHs)
604 * Now we are in compliance with RFCs.
609 if (!sock_owned_by_user(sk) && inet->recverr) {
611 sk->sk_error_report(sk);
612 } else { /* Only an error on timeout */
613 sk->sk_err_soft = err;
622 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
624 struct tcphdr *th = tcp_hdr(skb);
626 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
627 skb->csum_start = skb_transport_header(skb) - skb->head;
628 skb->csum_offset = offsetof(struct tcphdr, check);
631 /* This routine computes an IPv4 TCP checksum. */
632 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
634 const struct inet_sock *inet = inet_sk(sk);
636 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
638 EXPORT_SYMBOL(tcp_v4_send_check);
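/*
 * Editorial sketch (not kernel code) of the sum that tcp_v4_check() /
 * csum_tcpudp_nofold() build above: the RFC 793 ones' complement checksum
 * over the IPv4 pseudo-header followed by the TCP segment.  The kernel
 * variant leaves the final fold to the NIC when checksum offload is in use.
 */
#if 0
#include <stddef.h>
#include <stdint.h>
#include <netinet/in.h>

static uint32_t sum16be(const uint8_t *p, size_t n, uint32_t sum)
{
	size_t i;

	for (i = 0; i + 1 < n; i += 2)
		sum += ((uint32_t)p[i] << 8) | p[i + 1];
	if (n & 1)
		sum += (uint32_t)p[n - 1] << 8;	/* odd trailing byte */
	return sum;
}

static uint16_t tcp4_csum(const uint8_t saddr[4], const uint8_t daddr[4],
			  const uint8_t *tcp_seg, size_t tcp_len)
{
	uint32_t sum = 0;

	sum = sum16be(saddr, 4, sum);		/* pseudo-header: src addr */
	sum = sum16be(daddr, 4, sum);		/* dst addr */
	sum += IPPROTO_TCP;			/* zero byte + protocol */
	sum += (uint32_t)tcp_len;		/* TCP length (< 64KB assumed) */
	sum = sum16be(tcp_seg, tcp_len, sum);	/* header + payload */

	while (sum >> 16)			/* fold carries */
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}
#endif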
641 * This routine will send an RST to the other tcp.
643 * Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
644 * for reset.
645 * Answer: if a packet caused RST, it is not for a socket
646 * existing in our system, if it is matched to a socket,
647 * it is just duplicate segment or bug in other side's TCP.
648 * So we build the reply based only on the parameters
649 * that arrived with the segment.
650 * Exception: precedence violation. We do not implement it in any case.
653 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
655 const struct tcphdr *th = tcp_hdr(skb);
658 #ifdef CONFIG_TCP_MD5SIG
659 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
662 struct ip_reply_arg arg;
663 #ifdef CONFIG_TCP_MD5SIG
664 struct tcp_md5sig_key *key = NULL;
665 const __u8 *hash_location = NULL;
666 unsigned char newhash[16];
668 struct sock *sk1 = NULL;
670 u64 transmit_time = 0;
674 /* Never send a reset in response to a reset. */
678 /* If sk not NULL, it means we did a successful lookup and incoming
679 * route had to be correct. prequeue might have dropped our dst.
681 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
684 /* Swap the send and the receive. */
685 memset(&rep, 0, sizeof(rep));
686 rep.th.dest = th->source;
687 rep.th.source = th->dest;
688 rep.th.doff = sizeof(struct tcphdr) / 4;
692 rep.th.seq = th->ack_seq;
695 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
696 skb->len - (th->doff << 2));
699 memset(&arg, 0, sizeof(arg));
700 arg.iov[0].iov_base = (unsigned char *)&rep;
701 arg.iov[0].iov_len = sizeof(rep.th);
703 net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
704 #ifdef CONFIG_TCP_MD5SIG
706 hash_location = tcp_parse_md5sig_option(th);
707 if (sk && sk_fullsock(sk)) {
708 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
709 &ip_hdr(skb)->saddr, AF_INET);
710 } else if (hash_location) {
711 /*
712 * active side is lost. Try to find listening socket through
713 * source port, and then find md5 key through listening socket.
714 * We do not lose security here:
715 * the incoming packet is checked against the md5 hash of the found key,
716 * and no RST is generated if the md5 hash doesn't match.
718 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
720 th->source, ip_hdr(skb)->daddr,
721 ntohs(th->source), inet_iif(skb),
723 /* don't send rst if it can't find key */
727 key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
728 &ip_hdr(skb)->saddr, AF_INET);
733 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
734 if (genhash || memcmp(hash_location, newhash, 16) != 0)
740 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
742 (TCPOPT_MD5SIG << 8) |
744 /* Update length and the length the header thinks exists */
745 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
746 rep.th.doff = arg.iov[0].iov_len / 4;
748 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
749 key, ip_hdr(skb)->saddr,
750 ip_hdr(skb)->daddr, &rep.th);
753 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
754 ip_hdr(skb)->saddr, /* XXX */
755 arg.iov[0].iov_len, IPPROTO_TCP, 0);
756 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
757 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
759 /* When socket is gone, all binding information is lost.
760 * Routing might fail in this case. No choice here: if we choose to force the
761 * input interface, we will misroute in case of asymmetric route.
764 arg.bound_dev_if = sk->sk_bound_dev_if;
766 trace_tcp_send_reset(sk, skb);
769 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
770 offsetof(struct inet_timewait_sock, tw_bound_dev_if));
772 arg.tos = ip_hdr(skb)->tos;
773 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
775 ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
777 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
778 inet_twsk(sk)->tw_mark : sk->sk_mark;
779 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
780 inet_twsk(sk)->tw_priority : sk->sk_priority;
781 transmit_time = tcp_transmit_time(sk);
783 ip_send_unicast_reply(ctl_sk,
784 skb, &TCP_SKB_CB(skb)->header.h4.opt,
785 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
786 &arg, arg.iov[0].iov_len,
790 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
791 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
794 #ifdef CONFIG_TCP_MD5SIG
800 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
801 outside socket context, is certainly ugly. What can I do?
804 static void tcp_v4_send_ack(const struct sock *sk,
805 struct sk_buff *skb, u32 seq, u32 ack,
806 u32 win, u32 tsval, u32 tsecr, int oif,
807 struct tcp_md5sig_key *key,
808 int reply_flags, u8 tos)
810 const struct tcphdr *th = tcp_hdr(skb);
813 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
814 #ifdef CONFIG_TCP_MD5SIG
815 + (TCPOLEN_MD5SIG_ALIGNED >> 2)
819 struct net *net = sock_net(sk);
820 struct ip_reply_arg arg;
824 memset(&rep.th, 0, sizeof(struct tcphdr));
825 memset(&arg, 0, sizeof(arg));
827 arg.iov[0].iov_base = (unsigned char *)&rep;
828 arg.iov[0].iov_len = sizeof(rep.th);
830 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
831 (TCPOPT_TIMESTAMP << 8) |
833 rep.opt[1] = htonl(tsval);
834 rep.opt[2] = htonl(tsecr);
835 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
838 /* Swap the send and the receive. */
839 rep.th.dest = th->source;
840 rep.th.source = th->dest;
841 rep.th.doff = arg.iov[0].iov_len / 4;
842 rep.th.seq = htonl(seq);
843 rep.th.ack_seq = htonl(ack);
845 rep.th.window = htons(win);
847 #ifdef CONFIG_TCP_MD5SIG
849 int offset = (tsecr) ? 3 : 0;
851 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
853 (TCPOPT_MD5SIG << 8) |
855 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
856 rep.th.doff = arg.iov[0].iov_len/4;
858 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
859 key, ip_hdr(skb)->saddr,
860 ip_hdr(skb)->daddr, &rep.th);
863 arg.flags = reply_flags;
864 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
865 ip_hdr(skb)->saddr, /* XXX */
866 arg.iov[0].iov_len, IPPROTO_TCP, 0);
867 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
869 arg.bound_dev_if = oif;
871 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
873 ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
874 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
875 inet_twsk(sk)->tw_mark : sk->sk_mark;
876 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
877 inet_twsk(sk)->tw_priority : sk->sk_priority;
878 transmit_time = tcp_transmit_time(sk);
879 ip_send_unicast_reply(ctl_sk,
880 skb, &TCP_SKB_CB(skb)->header.h4.opt,
881 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
882 &arg, arg.iov[0].iov_len,
886 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
890 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
892 struct inet_timewait_sock *tw = inet_twsk(sk);
893 struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
895 tcp_v4_send_ack(sk, skb,
896 tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
897 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
898 tcp_time_stamp_raw() + tcptw->tw_ts_offset,
901 tcp_twsk_md5_key(tcptw),
902 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
909 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
910 struct request_sock *req)
912 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
913 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
915 u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
916 tcp_rsk(req)->rcv_nxt;
918 /* RFC 7323 2.3
919 * The window field (SEG.WND) of every outgoing segment, with the
920 * exception of <SYN> segments, MUST be right-shifted by
921 * Rcv.Wind.Shift bits:
923 tcp_v4_send_ack(sk, skb, seq,
924 tcp_rsk(req)->rcv_nxt,
925 req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
926 tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
929 tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
931 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
936 * Send a SYN-ACK after having received a SYN.
937 * This still operates on a request_sock only, not on a big
938 * socket.
940 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
942 struct request_sock *req,
943 struct tcp_fastopen_cookie *foc,
944 enum tcp_synack_type synack_type)
946 const struct inet_request_sock *ireq = inet_rsk(req);
951 /* First, grab a route. */
952 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
955 skb = tcp_make_synack(sk, dst, req, foc, synack_type);
958 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
961 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
963 rcu_dereference(ireq->ireq_opt));
965 err = net_xmit_eval(err);
972 * IPv4 request_sock destructor.
974 static void tcp_v4_reqsk_destructor(struct request_sock *req)
976 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
979 #ifdef CONFIG_TCP_MD5SIG
981 * RFC2385 MD5 checksumming requires a mapping of
982 * IP address->MD5 Key.
983 * We need to maintain these in the sk structure.
986 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
987 EXPORT_SYMBOL(tcp_md5_needed);
989 /* Find the Key structure for an address. */
990 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk,
991 const union tcp_md5_addr *addr,
994 const struct tcp_sock *tp = tcp_sk(sk);
995 struct tcp_md5sig_key *key;
996 const struct tcp_md5sig_info *md5sig;
998 struct tcp_md5sig_key *best_match = NULL;
1001 /* caller either holds rcu_read_lock() or socket lock */
1002 md5sig = rcu_dereference_check(tp->md5sig_info,
1003 lockdep_sock_is_held(sk));
1007 hlist_for_each_entry_rcu(key, &md5sig->head, node) {
1008 if (key->family != family)
1011 if (family == AF_INET) {
1012 mask = inet_make_mask(key->prefixlen);
1013 match = (key->addr.a4.s_addr & mask) ==
1014 (addr->a4.s_addr & mask);
1015 #if IS_ENABLED(CONFIG_IPV6)
1016 } else if (family == AF_INET6) {
1017 match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1024 if (match && (!best_match ||
1025 key->prefixlen > best_match->prefixlen))
1030 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1032 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1033 const union tcp_md5_addr *addr,
1034 int family, u8 prefixlen)
1036 const struct tcp_sock *tp = tcp_sk(sk);
1037 struct tcp_md5sig_key *key;
1038 unsigned int size = sizeof(struct in_addr);
1039 const struct tcp_md5sig_info *md5sig;
1041 /* caller either holds rcu_read_lock() or socket lock */
1042 md5sig = rcu_dereference_check(tp->md5sig_info,
1043 lockdep_sock_is_held(sk));
1046 #if IS_ENABLED(CONFIG_IPV6)
1047 if (family == AF_INET6)
1048 size = sizeof(struct in6_addr);
1050 hlist_for_each_entry_rcu(key, &md5sig->head, node) {
1051 if (key->family != family)
1053 if (!memcmp(&key->addr, addr, size) &&
1054 key->prefixlen == prefixlen)
1060 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1061 const struct sock *addr_sk)
1063 const union tcp_md5_addr *addr;
1065 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1066 return tcp_md5_do_lookup(sk, addr, AF_INET);
1068 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1070 /* This can be called on a newly created socket, from other files */
1071 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1072 int family, u8 prefixlen, const u8 *newkey, u8 newkeylen,
1075 /* Add Key to the list */
1076 struct tcp_md5sig_key *key;
1077 struct tcp_sock *tp = tcp_sk(sk);
1078 struct tcp_md5sig_info *md5sig;
1080 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1082 /* Pre-existing entry - just update that one.
1083 * Note that the key might be used concurrently.
1085 memcpy(key->key, newkey, newkeylen);
1087 /* Pairs with READ_ONCE() in tcp_md5_hash_key().
1088 * Also note that a reader could catch new key->keylen value
1089 * but old key->key[], this is the reason we use __GFP_ZERO
1090 * at sock_kmalloc() time below these lines.
1092 WRITE_ONCE(key->keylen, newkeylen);
1097 md5sig = rcu_dereference_protected(tp->md5sig_info,
1098 lockdep_sock_is_held(sk));
1100 md5sig = kmalloc(sizeof(*md5sig), gfp);
1104 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1105 INIT_HLIST_HEAD(&md5sig->head);
1106 rcu_assign_pointer(tp->md5sig_info, md5sig);
1109 key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1112 if (!tcp_alloc_md5sig_pool()) {
1113 sock_kfree_s(sk, key, sizeof(*key));
1117 memcpy(key->key, newkey, newkeylen);
1118 key->keylen = newkeylen;
1119 key->family = family;
1120 key->prefixlen = prefixlen;
1121 memcpy(&key->addr, addr,
1122 (family == AF_INET6) ? sizeof(struct in6_addr) :
1123 sizeof(struct in_addr));
1124 hlist_add_head_rcu(&key->node, &md5sig->head);
1127 EXPORT_SYMBOL(tcp_md5_do_add);
1129 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1132 struct tcp_md5sig_key *key;
1134 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1137 hlist_del_rcu(&key->node);
1138 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1139 kfree_rcu(key, rcu);
1142 EXPORT_SYMBOL(tcp_md5_do_del);
1144 static void tcp_clear_md5_list(struct sock *sk)
1146 struct tcp_sock *tp = tcp_sk(sk);
1147 struct tcp_md5sig_key *key;
1148 struct hlist_node *n;
1149 struct tcp_md5sig_info *md5sig;
1151 md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1153 hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1154 hlist_del_rcu(&key->node);
1155 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1156 kfree_rcu(key, rcu);
1160 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1161 char __user *optval, int optlen)
1163 struct tcp_md5sig cmd;
1164 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1167 if (optlen < sizeof(cmd))
1170 if (copy_from_user(&cmd, optval, sizeof(cmd)))
1173 if (sin->sin_family != AF_INET)
1176 if (optname == TCP_MD5SIG_EXT &&
1177 cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1178 prefixlen = cmd.tcpm_prefixlen;
1183 if (!cmd.tcpm_keylen)
1184 return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1185 AF_INET, prefixlen);
1187 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1190 return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1191 AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen,
1192 GFP_KERNEL);
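/*
 * User-space counterpart of the option parsing above (sketch only):
 * installing an RFC 2385 key for a peer with setsockopt(TCP_MD5SIG), which
 * lands in tcp_v4_parse_md5_keys().  Peer address and key are placeholders.
 */
#if 0
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/tcp.h>

static int set_tcp_md5_key(int fd, const struct sockaddr_in *peer,
			   const void *key, int keylen)
{
	struct tcp_md5sig md5;

	if (keylen > TCP_MD5SIG_MAXKEYLEN)
		return -1;
	memset(&md5, 0, sizeof(md5));
	memcpy(&md5.tcpm_addr, peer, sizeof(*peer));
	md5.tcpm_keylen = keylen;
	memcpy(md5.tcpm_key, key, keylen);
	/* keylen == 0 would instead delete the key, as handled above */
	return setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
}
#endif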
1195 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1196 __be32 daddr, __be32 saddr,
1197 const struct tcphdr *th, int nbytes)
1199 struct tcp4_pseudohdr *bp;
1200 struct scatterlist sg;
1207 bp->protocol = IPPROTO_TCP;
1208 bp->len = cpu_to_be16(nbytes);
1210 _th = (struct tcphdr *)(bp + 1);
1211 memcpy(_th, th, sizeof(*th));
1214 sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1215 ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1216 sizeof(*bp) + sizeof(*th));
1217 return crypto_ahash_update(hp->md5_req);
1220 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1221 __be32 daddr, __be32 saddr, const struct tcphdr *th)
1223 struct tcp_md5sig_pool *hp;
1224 struct ahash_request *req;
1226 hp = tcp_get_md5sig_pool();
1228 goto clear_hash_noput;
1231 if (crypto_ahash_init(req))
1233 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1235 if (tcp_md5_hash_key(hp, key))
1237 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1238 if (crypto_ahash_final(req))
1241 tcp_put_md5sig_pool();
1245 tcp_put_md5sig_pool();
1247 memset(md5_hash, 0, 16);
1251 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1252 const struct sock *sk,
1253 const struct sk_buff *skb)
1255 struct tcp_md5sig_pool *hp;
1256 struct ahash_request *req;
1257 const struct tcphdr *th = tcp_hdr(skb);
1258 __be32 saddr, daddr;
1260 if (sk) { /* valid for establish/request sockets */
1261 saddr = sk->sk_rcv_saddr;
1262 daddr = sk->sk_daddr;
1264 const struct iphdr *iph = ip_hdr(skb);
1269 hp = tcp_get_md5sig_pool();
1271 goto clear_hash_noput;
1274 if (crypto_ahash_init(req))
1277 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1279 if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1281 if (tcp_md5_hash_key(hp, key))
1283 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1284 if (crypto_ahash_final(req))
1287 tcp_put_md5sig_pool();
1291 tcp_put_md5sig_pool();
1293 memset(md5_hash, 0, 16);
1296 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1300 /* Called with rcu_read_lock() */
1301 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1302 const struct sk_buff *skb)
1304 #ifdef CONFIG_TCP_MD5SIG
1306 * This gets called for each TCP segment that arrives
1307 * so we want to be efficient.
1308 * We have 3 drop cases:
1309 * o No MD5 hash and one expected.
1310 * o MD5 hash and we're not expecting one.
1311 * o MD5 hash and it's wrong.
1313 const __u8 *hash_location = NULL;
1314 struct tcp_md5sig_key *hash_expected;
1315 const struct iphdr *iph = ip_hdr(skb);
1316 const struct tcphdr *th = tcp_hdr(skb);
1318 unsigned char newhash[16];
1320 hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1322 hash_location = tcp_parse_md5sig_option(th);
1324 /* We've parsed the options - do we have a hash? */
1325 if (!hash_expected && !hash_location)
1328 if (hash_expected && !hash_location) {
1329 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1333 if (!hash_expected && hash_location) {
1334 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1338 /* Okay, so this is hash_expected and hash_location -
1339 * so we need to calculate the checksum.
1341 genhash = tcp_v4_md5_hash_skb(newhash,
1345 if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1346 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1347 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1348 &iph->saddr, ntohs(th->source),
1349 &iph->daddr, ntohs(th->dest),
1350 genhash ? " tcp_v4_calc_md5_hash failed"
1359 static void tcp_v4_init_req(struct request_sock *req,
1360 const struct sock *sk_listener,
1361 struct sk_buff *skb)
1363 struct inet_request_sock *ireq = inet_rsk(req);
1364 struct net *net = sock_net(sk_listener);
1366 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1367 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1368 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1371 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1373 const struct request_sock *req)
1375 return inet_csk_route_req(sk, &fl->u.ip4, req);
1378 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1380 .obj_size = sizeof(struct tcp_request_sock),
1381 .rtx_syn_ack = tcp_rtx_synack,
1382 .send_ack = tcp_v4_reqsk_send_ack,
1383 .destructor = tcp_v4_reqsk_destructor,
1384 .send_reset = tcp_v4_send_reset,
1385 .syn_ack_timeout = tcp_syn_ack_timeout,
1388 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1389 .mss_clamp = TCP_MSS_DEFAULT,
1390 #ifdef CONFIG_TCP_MD5SIG
1391 .req_md5_lookup = tcp_v4_md5_lookup,
1392 .calc_md5_hash = tcp_v4_md5_hash_skb,
1394 .init_req = tcp_v4_init_req,
1395 #ifdef CONFIG_SYN_COOKIES
1396 .cookie_init_seq = cookie_v4_init_sequence,
1398 .route_req = tcp_v4_route_req,
1399 .init_seq = tcp_v4_init_seq,
1400 .init_ts_off = tcp_v4_init_ts_off,
1401 .send_synack = tcp_v4_send_synack,
1404 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1406 /* Never answer SYNs sent to broadcast or multicast */
1407 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1410 return tcp_conn_request(&tcp_request_sock_ops,
1411 &tcp_request_sock_ipv4_ops, sk, skb);
1417 EXPORT_SYMBOL(tcp_v4_conn_request);
1421 * The three way handshake has completed - we got a valid synack -
1422 * now create the new socket.
1424 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1425 struct request_sock *req,
1426 struct dst_entry *dst,
1427 struct request_sock *req_unhash,
1430 struct inet_request_sock *ireq;
1431 bool found_dup_sk = false;
1432 struct inet_sock *newinet;
1433 struct tcp_sock *newtp;
1435 #ifdef CONFIG_TCP_MD5SIG
1436 struct tcp_md5sig_key *key;
1438 struct ip_options_rcu *inet_opt;
1440 if (sk_acceptq_is_full(sk))
1443 newsk = tcp_create_openreq_child(sk, req, skb);
1447 newsk->sk_gso_type = SKB_GSO_TCPV4;
1448 inet_sk_rx_dst_set(newsk, skb);
1450 newtp = tcp_sk(newsk);
1451 newinet = inet_sk(newsk);
1452 ireq = inet_rsk(req);
1453 sk_daddr_set(newsk, ireq->ir_rmt_addr);
1454 sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1455 newsk->sk_bound_dev_if = ireq->ir_iif;
1456 newinet->inet_saddr = ireq->ir_loc_addr;
1457 inet_opt = rcu_dereference(ireq->ireq_opt);
1458 RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1459 newinet->mc_index = inet_iif(skb);
1460 newinet->mc_ttl = ip_hdr(skb)->ttl;
1461 newinet->rcv_tos = ip_hdr(skb)->tos;
1462 inet_csk(newsk)->icsk_ext_hdr_len = 0;
1464 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1465 newinet->inet_id = prandom_u32();
1468 dst = inet_csk_route_child_sock(sk, newsk, req);
1472 /* syncookie case : see end of cookie_v4_check() */
1474 sk_setup_caps(newsk, dst);
1476 tcp_ca_openreq_child(newsk, dst);
1478 tcp_sync_mss(newsk, dst_mtu(dst));
1479 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1481 tcp_initialize_rcv_mss(newsk);
1483 #ifdef CONFIG_TCP_MD5SIG
1484 /* Copy over the MD5 key from the original socket */
1485 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1489 * We're using one, so create a matching key
1490 * on the newsk structure. If we fail to get
1491 * memory, then we end up not copying the key
1492 * across. Shucks.
1493 */
1494 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1495 AF_INET, 32, key->key, key->keylen, GFP_ATOMIC);
1496 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1500 if (__inet_inherit_port(sk, newsk) < 0)
1502 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1504 if (likely(*own_req)) {
1505 tcp_move_syn(newtp, req);
1506 ireq->ireq_opt = NULL;
1508 newinet->inet_opt = NULL;
1510 if (!req_unhash && found_dup_sk) {
1511 /* This code path should only be executed in the
1512 * syncookie case
1514 bh_unlock_sock(newsk);
1522 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1529 newinet->inet_opt = NULL;
1530 inet_csk_prepare_forced_close(newsk);
1534 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1536 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1538 #ifdef CONFIG_SYN_COOKIES
1539 const struct tcphdr *th = tcp_hdr(skb);
1542 sk = cookie_v4_check(sk, skb);
1547 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1548 struct tcphdr *th, u32 *cookie)
1551 #ifdef CONFIG_SYN_COOKIES
1552 mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1553 &tcp_request_sock_ipv4_ops, sk, th);
1555 *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1556 tcp_synq_overflow(sk);
1562 /* The socket must have its spinlock held when we get
1563 * here, unless it is a TCP_LISTEN socket.
1565 * We have a potential double-lock case here, so even when
1566 * doing backlog processing we use the BH locking scheme.
1567 * This is because we cannot sleep with the original spinlock
1568 * held.
1570 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1574 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1575 struct dst_entry *dst;
1577 dst = rcu_dereference_protected(sk->sk_rx_dst,
1578 lockdep_sock_is_held(sk));
1580 sock_rps_save_rxhash(sk, skb);
1581 sk_mark_napi_id(sk, skb);
1583 if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1584 !dst->ops->check(dst, 0)) {
1585 RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1589 tcp_rcv_established(sk, skb);
1593 if (tcp_checksum_complete(skb))
1596 if (sk->sk_state == TCP_LISTEN) {
1597 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1602 if (tcp_child_process(sk, nsk, skb)) {
1609 sock_rps_save_rxhash(sk, skb);
1611 if (tcp_rcv_state_process(sk, skb)) {
1618 tcp_v4_send_reset(rsk, skb);
1621 /* Be careful here. If this function gets more complicated and
1622 * gcc suffers from register pressure on the x86, sk (in %ebx)
1623 * might be destroyed here. This current version compiles correctly,
1624 * but you have been warned.
1629 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1630 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1633 EXPORT_SYMBOL(tcp_v4_do_rcv);
1635 int tcp_v4_early_demux(struct sk_buff *skb)
1637 const struct iphdr *iph;
1638 const struct tcphdr *th;
1641 if (skb->pkt_type != PACKET_HOST)
1644 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1650 if (th->doff < sizeof(struct tcphdr) / 4)
1653 sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1654 iph->saddr, th->source,
1655 iph->daddr, ntohs(th->dest),
1656 skb->skb_iif, inet_sdif(skb));
1659 skb->destructor = sock_edemux;
1660 if (sk_fullsock(sk)) {
1661 struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1664 dst = dst_check(dst, 0);
1666 inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1667 skb_dst_set_noref(skb, dst);
1673 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1675 u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf);
1676 u32 tail_gso_size, tail_gso_segs;
1677 struct skb_shared_info *shinfo;
1678 const struct tcphdr *th;
1679 struct tcphdr *thtail;
1680 struct sk_buff *tail;
1681 unsigned int hdrlen;
1687 /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1688 * we can fix skb->truesize to its real value to avoid future drops.
1689 * This is valid because skb is not yet charged to the socket.
1690 * It has been noticed pure SACK packets were sometimes dropped
1691 * (if cooked by drivers without copybreak feature).
1697 if (unlikely(tcp_checksum_complete(skb))) {
1699 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1700 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1704 /* Attempt coalescing to last skb in backlog, even if we are
1705 * above the limits.
1706 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1708 th = (const struct tcphdr *)skb->data;
1709 hdrlen = th->doff * 4;
1711 tail = sk->sk_backlog.tail;
1714 thtail = (struct tcphdr *)tail->data;
1716 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1717 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1718 ((TCP_SKB_CB(tail)->tcp_flags |
1719 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1720 !((TCP_SKB_CB(tail)->tcp_flags &
1721 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1722 ((TCP_SKB_CB(tail)->tcp_flags ^
1723 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1724 #ifdef CONFIG_TLS_DEVICE
1725 tail->decrypted != skb->decrypted ||
1727 thtail->doff != th->doff ||
1728 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1731 __skb_pull(skb, hdrlen);
1733 shinfo = skb_shinfo(skb);
1734 gso_size = shinfo->gso_size ?: skb->len;
1735 gso_segs = shinfo->gso_segs ?: 1;
1737 shinfo = skb_shinfo(tail);
1738 tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1739 tail_gso_segs = shinfo->gso_segs ?: 1;
1741 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1742 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1744 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1745 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1746 thtail->window = th->window;
1749 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1750 * thtail->fin, so that the fast path in tcp_rcv_established()
1751 * is not entered if we append a packet with a FIN.
1752 * SYN, RST, URG are not present.
1753 * ACK is set on both packets.
1754 * PSH : we do not really care in TCP stack,
1755 * at least for 'GRO' packets.
1757 thtail->fin |= th->fin;
1758 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1760 if (TCP_SKB_CB(skb)->has_rxtstamp) {
1761 TCP_SKB_CB(tail)->has_rxtstamp = true;
1762 tail->tstamp = skb->tstamp;
1763 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1766 /* Not as strict as GRO. We only need to carry mss max value */
1767 shinfo->gso_size = max(gso_size, tail_gso_size);
1768 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
1770 sk->sk_backlog.len += delta;
1771 __NET_INC_STATS(sock_net(sk),
1772 LINUX_MIB_TCPBACKLOGCOALESCE);
1773 kfree_skb_partial(skb, fragstolen);
1776 __skb_push(skb, hdrlen);
1779 /* Only socket owner can try to collapse/prune rx queues
1780 * to reduce memory overhead, so add a little headroom here.
1781 * Only a few sockets' backlogs are likely to be non-empty at the same time.
1785 if (unlikely(sk_add_backlog(sk, skb, limit))) {
1787 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1792 EXPORT_SYMBOL(tcp_add_backlog);
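/*
 * Editorial sketch (not kernel code) of the GSO bookkeeping rule used when
 * two backlog skbs are coalesced above: keep the largest segment size seen
 * and the summed segment count, capped at the 16-bit gso_segs limit.
 */
#if 0
#include <stdint.h>

struct toy_gso {
	uint32_t size;	/* largest MSS observed */
	uint32_t segs;	/* number of segments represented */
};

static struct toy_gso merge_gso(struct toy_gso tail, struct toy_gso skb)
{
	struct toy_gso out;

	out.size = tail.size > skb.size ? tail.size : skb.size;
	out.segs = tail.segs + skb.segs;
	if (out.segs > 0xFFFF)
		out.segs = 0xFFFF;
	return out;
}
#endif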
1794 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1796 struct tcphdr *th = (struct tcphdr *)skb->data;
1798 return sk_filter_trim_cap(sk, skb, th->doff * 4);
1800 EXPORT_SYMBOL(tcp_filter);
1802 static void tcp_v4_restore_cb(struct sk_buff *skb)
1804 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1805 sizeof(struct inet_skb_parm));
1808 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1809 const struct tcphdr *th)
1811 /* This is tricky: we move IPCB to its correct location into TCP_SKB_CB().
1812 * barrier() makes sure the compiler won't play fool^Waliasing games.
1814 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1815 sizeof(struct inet_skb_parm));
1818 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1819 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1820 skb->len - th->doff * 4);
1821 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1822 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1823 TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1824 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1825 TCP_SKB_CB(skb)->sacked = 0;
1826 TCP_SKB_CB(skb)->has_rxtstamp =
1827 skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
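/*
 * Editorial sketch (not kernel code) of the sequence-space arithmetic used in
 * tcp_v4_fill_cb() above: SYN and FIN each occupy one sequence number, so a
 * segment ends at its start plus payload length plus those flag "octets".
 */
#if 0
#include <stdint.h>

static uint32_t seg_end_seq(uint32_t seq, uint32_t payload_len, int syn, int fin)
{
	return seq + payload_len + (syn ? 1 : 0) + (fin ? 1 : 0);
}
#endif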
1834 int tcp_v4_rcv(struct sk_buff *skb)
1836 struct net *net = dev_net(skb->dev);
1837 struct sk_buff *skb_to_free;
1838 int sdif = inet_sdif(skb);
1839 const struct iphdr *iph;
1840 const struct tcphdr *th;
1845 if (skb->pkt_type != PACKET_HOST)
1848 /* Count it even if it's bad */
1849 __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1851 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1854 th = (const struct tcphdr *)skb->data;
1856 if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1858 if (!pskb_may_pull(skb, th->doff * 4))
1861 /* An explanation is required here, I think.
1862 * Packet length and doff are validated by header prediction,
1863 * provided case of th->doff==0 is eliminated.
1864 * So, we defer the checks. */
1866 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1869 th = (const struct tcphdr *)skb->data;
1872 sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1873 th->dest, sdif, &refcounted);
1878 if (sk->sk_state == TCP_TIME_WAIT)
1881 if (sk->sk_state == TCP_NEW_SYN_RECV) {
1882 struct request_sock *req = inet_reqsk(sk);
1883 bool req_stolen = false;
1886 sk = req->rsk_listener;
1887 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1888 sk_drops_add(sk, skb);
1892 if (tcp_checksum_complete(skb)) {
1896 if (unlikely(sk->sk_state != TCP_LISTEN)) {
1897 inet_csk_reqsk_queue_drop_and_put(sk, req);
1900 /* We own a reference on the listener, increase it again
1901 * as we might lose it too soon.
1906 if (!tcp_filter(sk, skb)) {
1907 th = (const struct tcphdr *)skb->data;
1909 tcp_v4_fill_cb(skb, iph, th);
1910 nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
1915 /* Another cpu got exclusive access to req
1916 * and created a full blown socket.
1917 * Try to feed this packet to this socket
1918 * instead of discarding it.
1920 tcp_v4_restore_cb(skb);
1924 goto discard_and_relse;
1928 tcp_v4_restore_cb(skb);
1929 } else if (tcp_child_process(sk, nsk, skb)) {
1930 tcp_v4_send_reset(nsk, skb);
1931 goto discard_and_relse;
1937 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1938 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
1939 goto discard_and_relse;
1942 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1943 goto discard_and_relse;
1945 if (tcp_v4_inbound_md5_hash(sk, skb))
1946 goto discard_and_relse;
1950 if (tcp_filter(sk, skb))
1951 goto discard_and_relse;
1952 th = (const struct tcphdr *)skb->data;
1954 tcp_v4_fill_cb(skb, iph, th);
1958 if (sk->sk_state == TCP_LISTEN) {
1959 ret = tcp_v4_do_rcv(sk, skb);
1960 goto put_and_return;
1963 sk_incoming_cpu_update(sk);
1965 bh_lock_sock_nested(sk);
1966 tcp_segs_in(tcp_sk(sk), skb);
1968 if (!sock_owned_by_user(sk)) {
1969 skb_to_free = sk->sk_rx_skb_cache;
1970 sk->sk_rx_skb_cache = NULL;
1971 ret = tcp_v4_do_rcv(sk, skb);
1973 if (tcp_add_backlog(sk, skb))
1974 goto discard_and_relse;
1979 __kfree_skb(skb_to_free);
1988 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1991 tcp_v4_fill_cb(skb, iph, th);
1993 if (tcp_checksum_complete(skb)) {
1995 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
1997 __TCP_INC_STATS(net, TCP_MIB_INERRS);
1999 tcp_v4_send_reset(NULL, skb);
2003 /* Discard frame. */
2008 sk_drops_add(sk, skb);
2014 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2015 inet_twsk_put(inet_twsk(sk));
2019 tcp_v4_fill_cb(skb, iph, th);
2021 if (tcp_checksum_complete(skb)) {
2022 inet_twsk_put(inet_twsk(sk));
2025 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2027 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2030 iph->saddr, th->source,
2031 iph->daddr, th->dest,
2035 inet_twsk_deschedule_put(inet_twsk(sk));
2037 tcp_v4_restore_cb(skb);
2045 tcp_v4_timewait_ack(sk, skb);
2048 tcp_v4_send_reset(sk, skb);
2049 inet_twsk_deschedule_put(inet_twsk(sk));
2051 case TCP_TW_SUCCESS:;
2056 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2057 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
2058 .twsk_unique = tcp_twsk_unique,
2059 .twsk_destructor= tcp_twsk_destructor,
2062 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2064 struct dst_entry *dst = skb_dst(skb);
2066 if (dst && dst_hold_safe(dst)) {
2067 rcu_assign_pointer(sk->sk_rx_dst, dst);
2068 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2071 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2073 const struct inet_connection_sock_af_ops ipv4_specific = {
2074 .queue_xmit = ip_queue_xmit,
2075 .send_check = tcp_v4_send_check,
2076 .rebuild_header = inet_sk_rebuild_header,
2077 .sk_rx_dst_set = inet_sk_rx_dst_set,
2078 .conn_request = tcp_v4_conn_request,
2079 .syn_recv_sock = tcp_v4_syn_recv_sock,
2080 .net_header_len = sizeof(struct iphdr),
2081 .setsockopt = ip_setsockopt,
2082 .getsockopt = ip_getsockopt,
2083 .addr2sockaddr = inet_csk_addr2sockaddr,
2084 .sockaddr_len = sizeof(struct sockaddr_in),
2085 #ifdef CONFIG_COMPAT
2086 .compat_setsockopt = compat_ip_setsockopt,
2087 .compat_getsockopt = compat_ip_getsockopt,
2089 .mtu_reduced = tcp_v4_mtu_reduced,
2091 EXPORT_SYMBOL(ipv4_specific);
2093 #ifdef CONFIG_TCP_MD5SIG
2094 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2095 .md5_lookup = tcp_v4_md5_lookup,
2096 .calc_md5_hash = tcp_v4_md5_hash_skb,
2097 .md5_parse = tcp_v4_parse_md5_keys,
2101 /* NOTE: A lot of things are set to zero explicitly by the call to
2102 * sk_alloc(), so they need not be done here.
2104 static int tcp_v4_init_sock(struct sock *sk)
2106 struct inet_connection_sock *icsk = inet_csk(sk);
2110 icsk->icsk_af_ops = &ipv4_specific;
2112 #ifdef CONFIG_TCP_MD5SIG
2113 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2119 void tcp_v4_destroy_sock(struct sock *sk)
2121 struct tcp_sock *tp = tcp_sk(sk);
2123 trace_tcp_destroy_sock(sk);
2125 tcp_clear_xmit_timers(sk);
2127 tcp_cleanup_congestion_control(sk);
2129 tcp_cleanup_ulp(sk);
2131 /* Clean up the write buffer. */
2132 tcp_write_queue_purge(sk);
2134 /* Check if we want to disable active TFO */
2135 tcp_fastopen_active_disable_ofo_check(sk);
2137 /* Cleans up our, hopefully empty, out_of_order_queue. */
2138 skb_rbtree_purge(&tp->out_of_order_queue);
2140 #ifdef CONFIG_TCP_MD5SIG
2141 /* Clean up the MD5 key list, if any */
2142 if (tp->md5sig_info) {
2143 tcp_clear_md5_list(sk);
2144 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2145 tp->md5sig_info = NULL;
2149 /* Clean up a referenced TCP bind bucket. */
2150 if (inet_csk(sk)->icsk_bind_hash)
2153 BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2155 /* If socket is aborted during connect operation */
2156 tcp_free_fastopen_req(tp);
2157 tcp_fastopen_destroy_cipher(sk);
2158 tcp_saved_syn_free(tp);
2160 sk_sockets_allocated_dec(sk);
2162 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2164 #ifdef CONFIG_PROC_FS
2165 /* Proc filesystem TCP sock list dumping. */
2168 * Get next listener socket, following cur. If cur is NULL, get the first socket
2169 * starting from bucket given in st->bucket; when st->bucket is zero the
2170 * very first socket in the hash table is returned.
2172 static void *listening_get_next(struct seq_file *seq, void *cur)
2174 struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2175 struct tcp_iter_state *st = seq->private;
2176 struct net *net = seq_file_net(seq);
2177 struct inet_listen_hashbucket *ilb;
2178 struct hlist_nulls_node *node;
2179 struct sock *sk = cur;
2183 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2184 spin_lock(&ilb->lock);
2185 sk = sk_nulls_head(&ilb->nulls_head);
2189 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2193 sk = sk_nulls_next(sk);
2195 sk_nulls_for_each_from(sk, node) {
2196 if (!net_eq(sock_net(sk), net))
2198 if (sk->sk_family == afinfo->family)
2201 spin_unlock(&ilb->lock);
2203 if (++st->bucket < INET_LHTABLE_SIZE)
2208 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2210 struct tcp_iter_state *st = seq->private;
2215 rc = listening_get_next(seq, NULL);
2217 while (rc && *pos) {
2218 rc = listening_get_next(seq, rc);
2224 static inline bool empty_bucket(const struct tcp_iter_state *st)
2226 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2230 * Get first established socket starting from bucket given in st->bucket.
2231 * If st->bucket is zero, the very first socket in the hash is returned.
2233 static void *established_get_first(struct seq_file *seq)
2235 struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2236 struct tcp_iter_state *st = seq->private;
2237 struct net *net = seq_file_net(seq);
2241 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2243 struct hlist_nulls_node *node;
2244 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2246 /* Lockless fast path for the common case of empty buckets */
2247 if (empty_bucket(st))
2251 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2252 if (sk->sk_family != afinfo->family ||
2253 !net_eq(sock_net(sk), net)) {
2259 spin_unlock_bh(lock);
2265 static void *established_get_next(struct seq_file *seq, void *cur)
2267 struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2268 struct sock *sk = cur;
2269 struct hlist_nulls_node *node;
2270 struct tcp_iter_state *st = seq->private;
2271 struct net *net = seq_file_net(seq);
2276 sk = sk_nulls_next(sk);
2278 sk_nulls_for_each_from(sk, node) {
2279 if (sk->sk_family == afinfo->family &&
2280 net_eq(sock_net(sk), net))
2284 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2286 return established_get_first(seq);
2289 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2291 struct tcp_iter_state *st = seq->private;
2295 rc = established_get_first(seq);
2298 rc = established_get_next(seq, rc);
2304 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2307 struct tcp_iter_state *st = seq->private;
2309 st->state = TCP_SEQ_STATE_LISTENING;
2310 rc = listening_get_idx(seq, &pos);
2313 st->state = TCP_SEQ_STATE_ESTABLISHED;
2314 rc = established_get_idx(seq, pos);
2320 static void *tcp_seek_last_pos(struct seq_file *seq)
2322 struct tcp_iter_state *st = seq->private;
2323 int bucket = st->bucket;
2324 int offset = st->offset;
2325 int orig_num = st->num;
2328 switch (st->state) {
2329 case TCP_SEQ_STATE_LISTENING:
2330 if (st->bucket >= INET_LHTABLE_SIZE)
2332 st->state = TCP_SEQ_STATE_LISTENING;
2333 rc = listening_get_next(seq, NULL);
2334 while (offset-- && rc && bucket == st->bucket)
2335 rc = listening_get_next(seq, rc);
2339 st->state = TCP_SEQ_STATE_ESTABLISHED;
2341 case TCP_SEQ_STATE_ESTABLISHED:
2342 if (st->bucket > tcp_hashinfo.ehash_mask)
2344 rc = established_get_first(seq);
2345 while (offset-- && rc && bucket == st->bucket)
2346 rc = established_get_next(seq, rc);
2354 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2356 struct tcp_iter_state *st = seq->private;
2359 if (*pos && *pos == st->last_pos) {
2360 rc = tcp_seek_last_pos(seq);
2365 st->state = TCP_SEQ_STATE_LISTENING;
2369 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2372 st->last_pos = *pos;
2375 EXPORT_SYMBOL(tcp_seq_start);
2377 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2379 struct tcp_iter_state *st = seq->private;
2382 if (v == SEQ_START_TOKEN) {
2383 rc = tcp_get_idx(seq, 0);
2387 switch (st->state) {
2388 case TCP_SEQ_STATE_LISTENING:
2389 rc = listening_get_next(seq, v);
2391 st->state = TCP_SEQ_STATE_ESTABLISHED;
2394 rc = established_get_first(seq);
2397 case TCP_SEQ_STATE_ESTABLISHED:
2398 rc = established_get_next(seq, v);
2403 st->last_pos = *pos;
2406 EXPORT_SYMBOL(tcp_seq_next);
2408 void tcp_seq_stop(struct seq_file *seq, void *v)
2410 struct tcp_iter_state *st = seq->private;
2412 switch (st->state) {
2413 case TCP_SEQ_STATE_LISTENING:
2414 if (v != SEQ_START_TOKEN)
2415 spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2417 case TCP_SEQ_STATE_ESTABLISHED:
2419 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2423 EXPORT_SYMBOL(tcp_seq_stop);
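/*
 * Descriptive note (added): get_openreq4 - format one /proc/net/tcp line for
 * a request socket (NEW_SYN_RECV): addresses and ports, the expiry of the
 * request's pending retransmit timer, and the listener's uid.  The inode
 * column is 0 because request socks have no inode of their own.
 */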
2425 static void get_openreq4(const struct request_sock *req,
2426 struct seq_file *f, int i)
2428 const struct inet_request_sock *ireq = inet_rsk(req);
2429 long delta = req->rsk_timer.expires - jiffies;
2431 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2432 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2437 ntohs(ireq->ir_rmt_port),
2439 0, 0, /* could print option size, but that is af dependent. */
2440 1, /* timers active (only the expire timer) */
2441 jiffies_delta_to_clock_t(delta),
2443 from_kuid_munged(seq_user_ns(f),
2444 sock_i_uid(req->rsk_listener)),
2445 0, /* non-standard timer */
2446 0, /* open_requests have no inode */
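/*
 * Descriptive note (added): get_tcp4_sock - format one /proc/net/tcp line for
 * a full socket.  The timer column reports whichever of the retransmit,
 * zero-window probe or keepalive timers is pending, and rx_queue is the
 * current accept queue length for listeners or rcv_nxt - copied_seq otherwise
 * (read without the socket lock, so clamped to zero).
 */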
2451 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2454 unsigned long timer_expires;
2455 const struct tcp_sock *tp = tcp_sk(sk);
2456 const struct inet_connection_sock *icsk = inet_csk(sk);
2457 const struct inet_sock *inet = inet_sk(sk);
2458 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2459 __be32 dest = inet->inet_daddr;
2460 __be32 src = inet->inet_rcv_saddr;
2461 __u16 destp = ntohs(inet->inet_dport);
2462 __u16 srcp = ntohs(inet->inet_sport);
2466 if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2467 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2468 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2470 timer_expires = icsk->icsk_timeout;
2471 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2473 timer_expires = icsk->icsk_timeout;
2474 } else if (timer_pending(&sk->sk_timer)) {
2476 timer_expires = sk->sk_timer.expires;
2479 timer_expires = jiffies;
2482 state = inet_sk_state_load(sk);
2483 if (state == TCP_LISTEN)
2484 rx_queue = sk->sk_ack_backlog;
2486 /* Because we don't lock the socket,
2487 * we might find a transient negative value.
2489 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2490 READ_ONCE(tp->copied_seq), 0);
2492 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2493 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2494 i, src, srcp, dest, destp, state,
2495 READ_ONCE(tp->write_seq) - tp->snd_una,
2498 jiffies_delta_to_clock_t(timer_expires - jiffies),
2499 icsk->icsk_retransmits,
2500 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2501 icsk->icsk_probes_out,
2503 refcount_read(&sk->sk_refcnt), sk,
2504 jiffies_to_clock_t(icsk->icsk_rto),
2505 jiffies_to_clock_t(icsk->icsk_ack.ato),
2506 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2508 state == TCP_LISTEN ?
2509 fastopenq->max_qlen :
2510 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
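/*
 * Descriptive note (added): get_timewait4_sock - format one /proc/net/tcp
 * line for a TIME_WAIT socket: the substate and the remaining timewait timer
 * are reported, while the uid and inode columns are 0.
 */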
2513 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2514 struct seq_file *f, int i)
2516 long delta = tw->tw_timer.expires - jiffies;
2520 dest = tw->tw_daddr;
2521 src = tw->tw_rcv_saddr;
2522 destp = ntohs(tw->tw_dport);
2523 srcp = ntohs(tw->tw_sport);
2525 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2526 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2527 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2528 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2529 refcount_read(&tw->tw_refcnt), tw);
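/*
 * Descriptive note (added): tcp4_seq_show - ->show() callback: pad every line
 * to TMPSZ - 1 characters, print the column header for SEQ_START_TOKEN, and
 * otherwise dispatch on sk->sk_state to the TIME_WAIT, NEW_SYN_RECV or
 * full-socket formatter above.
 */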
2534 static int tcp4_seq_show(struct seq_file *seq, void *v)
2536 struct tcp_iter_state *st;
2537 struct sock *sk = v;
2539 seq_setwidth(seq, TMPSZ - 1);
2540 if (v == SEQ_START_TOKEN) {
2541 seq_puts(seq, " sl local_address rem_address st tx_queue "
2542 "rx_queue tr tm->when retrnsmt uid timeout "
2548 if (sk->sk_state == TCP_TIME_WAIT)
2549 get_timewait4_sock(v, seq, st->num);
2550 else if (sk->sk_state == TCP_NEW_SYN_RECV)
2551 get_openreq4(v, seq, st->num);
2553 get_tcp4_sock(v, seq, st->num);
2559 static const struct seq_operations tcp4_seq_ops = {
2560 .show = tcp4_seq_show,
2561 .start = tcp_seq_start,
2562 .next = tcp_seq_next,
2563 .stop = tcp_seq_stop,
2566 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2570 static int __net_init tcp4_proc_init_net(struct net *net)
2572 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2573 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2578 static void __net_exit tcp4_proc_exit_net(struct net *net)
2580 remove_proc_entry("tcp", net->proc_net);
2583 static struct pernet_operations tcp4_net_ops = {
2584 .init = tcp4_proc_init_net,
2585 .exit = tcp4_proc_exit_net,
2588 int __init tcp4_proc_init(void)
2590 return register_pernet_subsys(&tcp4_net_ops);
2593 void tcp4_proc_exit(void)
2595 unregister_pernet_subsys(&tcp4_net_ops);
2597 #endif /* CONFIG_PROC_FS */
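/*
 * Descriptive note (added): tcp_prot - protocol operations for IPv4 TCP
 * sockets.  The socket layer dispatches through this table, so the generic
 * socket calls (connect, sendmsg, setsockopt, ...) land in the TCP
 * implementation referenced below.
 */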
2599 struct proto tcp_prot = {
2601 .owner = THIS_MODULE,
2603 .pre_connect = tcp_v4_pre_connect,
2604 .connect = tcp_v4_connect,
2605 .disconnect = tcp_disconnect,
2606 .accept = inet_csk_accept,
2608 .init = tcp_v4_init_sock,
2609 .destroy = tcp_v4_destroy_sock,
2610 .shutdown = tcp_shutdown,
2611 .setsockopt = tcp_setsockopt,
2612 .getsockopt = tcp_getsockopt,
2613 .keepalive = tcp_set_keepalive,
2614 .recvmsg = tcp_recvmsg,
2615 .sendmsg = tcp_sendmsg,
2616 .sendpage = tcp_sendpage,
2617 .backlog_rcv = tcp_v4_do_rcv,
2618 .release_cb = tcp_release_cb,
2620 .unhash = inet_unhash,
2621 .get_port = inet_csk_get_port,
2622 .enter_memory_pressure = tcp_enter_memory_pressure,
2623 .leave_memory_pressure = tcp_leave_memory_pressure,
2624 .stream_memory_free = tcp_stream_memory_free,
2625 .sockets_allocated = &tcp_sockets_allocated,
2626 .orphan_count = &tcp_orphan_count,
2627 .memory_allocated = &tcp_memory_allocated,
2628 .memory_pressure = &tcp_memory_pressure,
2629 .sysctl_mem = sysctl_tcp_mem,
2630 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
2631 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
2632 .max_header = MAX_TCP_HEADER,
2633 .obj_size = sizeof(struct tcp_sock),
2634 .slab_flags = SLAB_TYPESAFE_BY_RCU,
2635 .twsk_prot = &tcp_timewait_sock_ops,
2636 .rsk_prot = &tcp_request_sock_ops,
2637 .h.hashinfo = &tcp_hashinfo,
2638 .no_autobind = true,
2639 #ifdef CONFIG_COMPAT
2640 .compat_setsockopt = compat_tcp_setsockopt,
2641 .compat_getsockopt = compat_tcp_getsockopt,
2643 .diag_destroy = tcp_abort,
2645 EXPORT_SYMBOL(tcp_prot);
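/*
 * Descriptive note (added): tcp_sk_exit - per-netns teardown: drop the
 * reference on the congestion control module selected for this netns and
 * destroy the per-CPU control sockets allocated in tcp_sk_init().
 */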
2647 static void __net_exit tcp_sk_exit(struct net *net)
2651 if (net->ipv4.tcp_congestion_control)
2652 module_put(net->ipv4.tcp_congestion_control->owner);
2654 for_each_possible_cpu(cpu)
2655 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2656 free_percpu(net->ipv4.tcp_sk);
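/*
 * Descriptive note (added): tcp_sk_init - per-netns setup: allocate one
 * kernel control socket per CPU (used to transmit resets and ACKs on behalf
 * of SYN-RECV and TIME-WAIT sockets) and initialise the netns-local TCP
 * sysctl defaults.
 */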
2659 static int __net_init tcp_sk_init(struct net *net)
2663 net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2664 if (!net->ipv4.tcp_sk)
2667 for_each_possible_cpu(cpu) {
2670 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2674 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2676 /* Enforce IP_DF and IPID==0 on the RST and
2677 * ACK packets sent in SYN-RECV and TIME-WAIT states.
2679 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
2681 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
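/* tcp_ecn == 2: enable ECN when requested by incoming connections,
 * but do not request ECN on outgoing connections.
 */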
2684 net->ipv4.sysctl_tcp_ecn = 2;
2685 net->ipv4.sysctl_tcp_ecn_fallback = 1;
2687 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2688 net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
2689 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2690 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2691 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
2693 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2694 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2695 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2697 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2698 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2699 net->ipv4.sysctl_tcp_syncookies = 1;
2700 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2701 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2702 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2703 net->ipv4.sysctl_tcp_orphan_retries = 0;
2704 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2705 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
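/* tcp_tw_reuse == 2: allow TIME-WAIT reuse for loopback traffic only. */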
2706 net->ipv4.sysctl_tcp_tw_reuse = 2;
2708 cnt = tcp_hashinfo.ehash_mask + 1;
2709 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
2710 net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2712 net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
2713 net->ipv4.sysctl_tcp_sack = 1;
2714 net->ipv4.sysctl_tcp_window_scaling = 1;
2715 net->ipv4.sysctl_tcp_timestamps = 1;
2716 net->ipv4.sysctl_tcp_early_retrans = 3;
2717 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
2718 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */
2719 net->ipv4.sysctl_tcp_retrans_collapse = 1;
2720 net->ipv4.sysctl_tcp_max_reordering = 300;
2721 net->ipv4.sysctl_tcp_dsack = 1;
2722 net->ipv4.sysctl_tcp_app_win = 31;
2723 net->ipv4.sysctl_tcp_adv_win_scale = 1;
2724 net->ipv4.sysctl_tcp_frto = 2;
2725 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
2726 /* This limits the fraction of the congestion window that a single
2727 * TSO frame is allowed to consume. Building TSO frames that are
2728 * too large can make TCP streams bursty.
2730 net->ipv4.sysctl_tcp_tso_win_divisor = 3;
2731 /* Default TSQ limit of 16 TSO segments */
2732 net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
2734 /* RFC 5961 challenge ACK rate limiting */
2734 net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
2735 net->ipv4.sysctl_tcp_min_tso_segs = 2;
2736 net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
2737 net->ipv4.sysctl_tcp_autocorking = 1;
2738 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
2739 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
2740 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
2741 if (net != &init_net) {
2742 memcpy(net->ipv4.sysctl_tcp_rmem,
2743 init_net.ipv4.sysctl_tcp_rmem,
2744 sizeof(init_net.ipv4.sysctl_tcp_rmem));
2745 memcpy(net->ipv4.sysctl_tcp_wmem,
2746 init_net.ipv4.sysctl_tcp_wmem,
2747 sizeof(init_net.ipv4.sysctl_tcp_wmem));
2749 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
2750 net->ipv4.sysctl_tcp_comp_sack_nr = 44;
2751 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
2752 spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
2753 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
2754 atomic_set(&net->ipv4.tfo_active_disable_times, 0);
2756 /* Reno is always built in */
2757 if (!net_eq(net, &init_net) &&
2758 try_module_get(init_net.ipv4.tcp_congestion_control->owner))
2759 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
2761 net->ipv4.tcp_congestion_control = &tcp_reno;
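/*
 * Descriptive note (added): tcp_sk_exit_batch - batched netns exit: purge the
 * IPv4 TIME_WAIT sockets belonging to the namespaces being torn down, then
 * destroy each namespace's TCP Fast Open context.
 */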
2770 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2774 inet_twsk_purge(&tcp_hashinfo, AF_INET);
2776 list_for_each_entry(net, net_exit_list, exit_list)
2777 tcp_fastopen_ctx_destroy(net);
2780 static struct pernet_operations __net_initdata tcp_sk_ops = {
2781 .init = tcp_sk_init,
2782 .exit = tcp_sk_exit,
2783 .exit_batch = tcp_sk_exit_batch,
2786 void __init tcp_v4_init(void)
2788 if (register_pernet_subsys(&tcp_sk_ops))
2789 panic("Failed to create the TCP control socket.\n");