// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2013 Nicira, Inc.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/rculist.h>
#include <linux/err.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ip_tunnels.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/udp.h>
#include <net/dst_metadata.h>

#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#endif

static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
{
	return hash_32((__force u32)key ^ (__force u32)remote,
			IP_TNL_HASH_BITS);
}

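/* Usage sketch (illustrative only, not built): bucket selection for a
 * tunnel hashed by (key, remote). The addresses and key below are made-up
 * example values; IP_TNL_HASH_BITS is 7, so h indexes one of 128 buckets.
 *
 *	__be32 remote = htonl(0xc0a80001);	// 192.168.0.1
 *	__be32 key = htonl(42);
 *	unsigned int h = ip_tunnel_hash(key, remote);
 *	struct hlist_head *head = &itn->tunnels[h];
 */
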
static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
				__be16 flags, __be32 key)
{
	if (p->i_flags & TUNNEL_KEY) {
		if (flags & TUNNEL_KEY)
			return key == p->i_key;
		else
			/* key expected, none present */
			return false;
	} else
		return !(flags & TUNNEL_KEY);
}

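/* Example (sketch, made-up values): a tunnel configured with TUNNEL_KEY and
 * i_key htonl(42) matches only packets that carry TUNNEL_KEY with that key:
 *
 *	ip_tunnel_key_match(&p, TUNNEL_KEY, htonl(42));	// true iff p->i_key == htonl(42)
 *	ip_tunnel_key_match(&p, 0, 0);			// false: key expected, none present
 */
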
/* Fallback tunnel: no source, no destination, no key, no options

   Tunnel hash table:
   We require an exact key match, i.e. if a key is present in the packet
   it will match only a tunnel with the same key; if it is not present,
   it will match only a keyless tunnel.

   All keyless packets, if not matched against a configured keyless tunnel,
   will match the fallback tunnel.
   Given src, dst and key, find the appropriate tunnel for the incoming packet.
*/
struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
				   int link, __be16 flags,
				   __be32 remote, __be32 local,
				   __be32 key)
{
	struct ip_tunnel *t, *cand = NULL;
	struct hlist_head *head;
	struct net_device *ndev;
	unsigned int hash;

	hash = ip_tunnel_hash(key, remote);
	head = &itn->tunnels[hash];

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local != t->parms.iph.saddr ||
		    remote != t->parms.iph.daddr ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else
			cand = t;
	}

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (remote != t->parms.iph.daddr ||
		    t->parms.iph.saddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	hash = ip_tunnel_hash(key, 0);
	head = &itn->tunnels[hash];

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
		    (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
			continue;

		if (!(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((!(flags & TUNNEL_NO_KEY) && t->parms.i_key != key) ||
		    t->parms.iph.saddr != 0 ||
		    t->parms.iph.daddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	if (cand)
		return cand;

	t = rcu_dereference(itn->collect_md_tun);
	if (t && t->dev->flags & IFF_UP)
		return t;

	ndev = READ_ONCE(itn->fb_tunnel_dev);
	if (ndev && ndev->flags & IFF_UP)
		return netdev_priv(ndev);

	return NULL;
}
EXPORT_SYMBOL_GPL(ip_tunnel_lookup);

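/* Usage sketch (hypothetical caller, not part of this file): a protocol
 * receive handler resolves the tunnel and hands the packet to
 * ip_tunnel_rcv(), roughly as ipip does. Protocol handlers already run
 * inside an RCU read-side section, which the hlist_*_rcu walks above rely on.
 *
 *	t = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
 *			     iph->saddr, iph->daddr, 0);
 *	if (t)
 *		return ip_tunnel_rcv(t, skb, &tpi, NULL, log_ecn_error);
 */
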
static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
				    struct ip_tunnel_parm *parms)
{
	unsigned int h;
	__be32 remote;
	__be32 i_key = parms->i_key;

	if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
		remote = parms->iph.daddr;
	else
		remote = 0;

	if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
		i_key = 0;

	h = ip_tunnel_hash(i_key, remote);
	return &itn->tunnels[h];
}

static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	struct hlist_head *head = ip_bucket(itn, &t->parms);

	if (t->collect_md)
		rcu_assign_pointer(itn->collect_md_tun, t);
	hlist_add_head_rcu(&t->hash_node, head);
}

static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	if (t->collect_md)
		rcu_assign_pointer(itn->collect_md_tun, NULL);
	hlist_del_init_rcu(&t->hash_node);
}

static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
					struct ip_tunnel_parm *parms,
					int type)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	__be32 key = parms->i_key;
	__be16 flags = parms->i_flags;
	int link = parms->link;
	struct ip_tunnel *t = NULL;
	struct hlist_head *head = ip_bucket(itn, parms);

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local == t->parms.iph.saddr &&
		    remote == t->parms.iph.daddr &&
		    link == t->parms.link &&
		    type == t->dev->type &&
		    ip_tunnel_key_match(&t->parms, flags, key))
			break;
	}
	return t;
}

static struct net_device *__ip_tunnel_create(struct net *net,
					     const struct rtnl_link_ops *ops,
					     struct ip_tunnel_parm *parms)
{
	int err;
	struct ip_tunnel *tunnel;
	struct net_device *dev;
	char name[IFNAMSIZ];

	err = -E2BIG;
	if (parms->name[0]) {
		if (!dev_valid_name(parms->name))
			goto failed;
		strscpy(name, parms->name, IFNAMSIZ);
	} else {
		if (strlen(ops->kind) > (IFNAMSIZ - 3))
			goto failed;
		strcpy(name, ops->kind);
		strcat(name, "%d");
	}

	ASSERT_RTNL();
	dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
	if (!dev) {
		err = -ENOMEM;
		goto failed;
	}
	dev_net_set(dev, net);

	dev->rtnl_link_ops = ops;

	tunnel = netdev_priv(dev);
	tunnel->parms = *parms;
	tunnel->net = net;

	err = register_netdevice(dev);
	if (err)
		goto failed_free;

	return dev;

failed_free:
	free_netdev(dev);
failed:
	return ERR_PTR(err);
}

static int ip_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);

	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and needed_headroom */
	if (iph->daddr) {
		struct flowi4 fl4;
		struct rtable *rt;

		ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr,
				    iph->saddr, tunnel->parms.o_key,
				    RT_TOS(iph->tos), dev_net(dev),
				    tunnel->parms.link, tunnel->fwmark, 0, 0);
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (!IS_ERR(rt)) {
			tdev = rt->dst.dev;
			ip_rt_put(rt);
		}
		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;

		dst_cache_reset(&tunnel->dst_cache);
	}

	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = min(tdev->mtu, IP_MAX_MTU);
	}

	dev->needed_headroom = t_hlen + hlen;
	mtu -= t_hlen + (dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0);

	if (mtu < IPV4_MIN_MTU)
		mtu = IPV4_MIN_MTU;

	return mtu;
}

static struct ip_tunnel *ip_tunnel_create(struct net *net,
					  struct ip_tunnel_net *itn,
					  struct ip_tunnel_parm *parms)
{
	struct ip_tunnel *nt;
	struct net_device *dev;
	int t_hlen;
	int mtu;
	int err;

	dev = __ip_tunnel_create(net, itn->rtnl_link_ops, parms);
	if (IS_ERR(dev))
		return ERR_CAST(dev);

	mtu = ip_tunnel_bind_dev(dev);
	err = dev_set_mtu(dev, mtu);
	if (err)
		goto err_dev_set_mtu;

	nt = netdev_priv(dev);
	t_hlen = nt->hlen + sizeof(struct iphdr);
	dev->min_mtu = ETH_MIN_MTU;
	dev->max_mtu = IP_MAX_MTU - t_hlen;
	if (dev->type == ARPHRD_ETHER)
		dev->max_mtu -= dev->hard_header_len;

	ip_tunnel_add(itn, nt);
	return nt;

err_dev_set_mtu:
	unregister_netdevice(dev);
	return ERR_PTR(err);
}

void ip_tunnel_md_udp_encap(struct sk_buff *skb, struct ip_tunnel_info *info)
{
	const struct iphdr *iph = ip_hdr(skb);
	const struct udphdr *udph;

	if (iph->protocol != IPPROTO_UDP)
		return;

	udph = (struct udphdr *)((__u8 *)iph + (iph->ihl << 2));
	info->encap.sport = udph->source;
	info->encap.dport = udph->dest;
}
EXPORT_SYMBOL(ip_tunnel_md_udp_encap);

int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
		  const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
		  bool log_ecn_error)
{
	const struct iphdr *iph = ip_hdr(skb);
	int nh, err;

#ifdef CONFIG_NET_IPGRE_BROADCAST
	if (ipv4_is_multicast(iph->daddr)) {
		DEV_STATS_INC(tunnel->dev, multicast);
		skb->pkt_type = PACKET_BROADCAST;
	}
#endif

	if ((!(tpi->flags&TUNNEL_CSUM) && (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
	     ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
		DEV_STATS_INC(tunnel->dev, rx_crc_errors);
		DEV_STATS_INC(tunnel->dev, rx_errors);
		goto drop;
	}

	if (tunnel->parms.i_flags&TUNNEL_SEQ) {
		if (!(tpi->flags&TUNNEL_SEQ) ||
		    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
			DEV_STATS_INC(tunnel->dev, rx_fifo_errors);
			DEV_STATS_INC(tunnel->dev, rx_errors);
			goto drop;
		}
		tunnel->i_seqno = ntohl(tpi->seq) + 1;
	}

	/* Save the offset of the outer header relative to skb->head,
	 * because we are going to reset the network header to the inner header
	 * and might change skb->head.
	 */
	nh = skb_network_header(skb) - skb->head;

	skb_set_network_header(skb, (tunnel->dev->type == ARPHRD_ETHER) ? ETH_HLEN : 0);

	if (!pskb_inet_may_pull(skb)) {
		DEV_STATS_INC(tunnel->dev, rx_length_errors);
		DEV_STATS_INC(tunnel->dev, rx_errors);
		goto drop;
	}
	iph = (struct iphdr *)(skb->head + nh);

	err = IP_ECN_decapsulate(iph, skb);
	if (unlikely(err)) {
		if (log_ecn_error)
			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
					&iph->saddr, iph->tos);
		if (err > 1) {
			DEV_STATS_INC(tunnel->dev, rx_frame_errors);
			DEV_STATS_INC(tunnel->dev, rx_errors);
			goto drop;
		}
	}

	dev_sw_netstats_rx_add(tunnel->dev, skb->len);
	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));

	if (tunnel->dev->type == ARPHRD_ETHER) {
		skb->protocol = eth_type_trans(skb, tunnel->dev);
		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
	} else {
		skb->dev = tunnel->dev;
	}

	if (tun_dst)
		skb_dst_set(skb, (struct dst_entry *)tun_dst);

	gro_cells_receive(&tunnel->gro_cells, skb);
	return 0;

drop:
	if (tun_dst)
		dst_release((struct dst_entry *)tun_dst);
	kfree_skb(skb);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_rcv);

int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
			    unsigned int num)
{
	if (num >= MAX_IPTUN_ENCAP_OPS)
		return -ERANGE;

	return !cmpxchg((const struct ip_tunnel_encap_ops **)
			&iptun_encaps[num],
			NULL, ops) ? 0 : -1;
}
EXPORT_SYMBOL(ip_tunnel_encap_add_ops);

int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
			    unsigned int num)
{
	int ret;

	if (num >= MAX_IPTUN_ENCAP_OPS)
		return -ERANGE;

	ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
		       &iptun_encaps[num],
		       ops, NULL) == ops) ? 0 : -1;

	synchronize_net();

	return ret;
}
EXPORT_SYMBOL(ip_tunnel_encap_del_ops);

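/* Registration sketch (hypothetical encap module "foo"; the ops names are
 * illustrative, the slot index is the real TUNNEL_ENCAP_FOU that FOU itself
 * registers at). The cmpxchg above makes a slot claimable exactly once:
 *
 *	static const struct ip_tunnel_encap_ops foo_encap_ops = {
 *		.encap_hlen	= foo_encap_hlen,
 *		.build_header	= foo_build_header,
 *	};
 *
 *	err = ip_tunnel_encap_add_ops(&foo_encap_ops, TUNNEL_ENCAP_FOU);
 *	...
 *	ip_tunnel_encap_del_ops(&foo_encap_ops, TUNNEL_ENCAP_FOU);
 */
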
int ip_tunnel_encap_setup(struct ip_tunnel *t,
			  struct ip_tunnel_encap *ipencap)
{
	int hlen;

	memset(&t->encap, 0, sizeof(t->encap));

	hlen = ip_encap_hlen(ipencap);
	if (hlen < 0)
		return hlen;

	t->encap.type = ipencap->type;
	t->encap.sport = ipencap->sport;
	t->encap.dport = ipencap->dport;
	t->encap.flags = ipencap->flags;

	t->encap_hlen = hlen;
	t->hlen = t->encap_hlen + t->tun_hlen;

	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);

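/* Sketch (hypothetical driver code, values made up): wiring UDP
 * encapsulation parsed from netlink into a tunnel; afterwards t->hlen
 * covers both the encap header and the tunnel header:
 *
 *	struct ip_tunnel_encap ipencap = {
 *		.type  = TUNNEL_ENCAP_FOU,	// example encap type
 *		.sport = 0,			// 0 lets the stack pick
 *		.dport = htons(5555),		// made-up port
 *	};
 *	err = ip_tunnel_encap_setup(t, &ipencap);
 */
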
static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
			    struct rtable *rt, __be16 df,
			    const struct iphdr *inner_iph,
			    int tunnel_hlen, __be32 dst, bool md)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int pkt_size;
	int mtu;

	tunnel_hlen = md ? tunnel_hlen : tunnel->hlen;
	pkt_size = skb->len - tunnel_hlen;
	pkt_size -= dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0;

	if (df) {
		mtu = dst_mtu(&rt->dst) - (sizeof(struct iphdr) + tunnel_hlen);
		mtu -= dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0;
	} else {
		mtu = skb_valid_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
	}

	if (skb_valid_dst(skb))
		skb_dst_update_pmtu_no_confirm(skb, mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		if (!skb_is_gso(skb) &&
		    (inner_iph->frag_off & htons(IP_DF)) &&
		    mtu < pkt_size) {
			icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			return -E2BIG;
		}
	}
#if IS_ENABLED(CONFIG_IPV6)
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6;
		__be32 daddr;

		rt6 = skb_valid_dst(skb) ? (struct rt6_info *)skb_dst(skb) :
					   NULL;
		daddr = md ? dst : tunnel->parms.iph.daddr;

		if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
			   mtu >= IPV6_MIN_MTU) {
			if ((daddr && !ipv4_is_multicast(daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
			}
		}

		if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
					mtu < pkt_size) {
			icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			return -E2BIG;
		}
	}
#endif
	return 0;
}

static void ip_tunnel_adj_headroom(struct net_device *dev, unsigned int headroom)
{
	/* we must cap headroom to some upper limit, else pskb_expand_head
	 * will overflow header offsets in skb_headers_offset_update().
	 */
	static const unsigned int max_allowed = 512;

	if (headroom > max_allowed)
		headroom = max_allowed;

	if (headroom > READ_ONCE(dev->needed_headroom))
		WRITE_ONCE(dev->needed_headroom, headroom);
}

void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
		       u8 proto, int tunnel_hlen)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	u32 headroom = sizeof(struct iphdr);
	struct ip_tunnel_info *tun_info;
	const struct ip_tunnel_key *key;
	const struct iphdr *inner_iph;
	struct rtable *rt = NULL;
	struct flowi4 fl4;
	__be16 df = 0;
	u8 tos, ttl;
	bool use_cache;

	tun_info = skb_tunnel_info(skb);
	if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
		     ip_tunnel_info_af(tun_info) != AF_INET))
		goto tx_error;
	key = &tun_info->key;
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	tos = key->tos;
	if (tos == 1) {
		if (skb->protocol == htons(ETH_P_IP))
			tos = inner_iph->tos;
		else if (skb->protocol == htons(ETH_P_IPV6))
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
	}
	ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src,
			    tunnel_id_to_key32(key->tun_id), RT_TOS(tos),
			    dev_net(dev), 0, skb->mark, skb_get_hash(skb),
			    key->flow_flags);

	if (!tunnel_hlen)
		tunnel_hlen = ip_encap_hlen(&tun_info->encap);

	if (ip_tunnel_encap(skb, &tun_info->encap, &proto, &fl4) < 0)
		goto tx_error;

	use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
	if (use_cache)
		rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl4.saddr);
	if (!rt) {
		rt = ip_route_output_key(tunnel->net, &fl4);
		if (IS_ERR(rt)) {
			DEV_STATS_INC(dev, tx_carrier_errors);
			goto tx_error;
		}
		if (use_cache)
			dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
					  fl4.saddr);
	}
	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		DEV_STATS_INC(dev, collisions);
		goto tx_error;
	}

	if (key->tun_flags & TUNNEL_DONT_FRAGMENT)
		df = htons(IP_DF);
	if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, tunnel_hlen,
			    key->u.ipv4.dst, true)) {
		ip_rt_put(rt);
		goto tx_error;
	}

	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = key->ttl;
	if (ttl == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}

	headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
	if (skb_cow_head(skb, headroom)) {
		ip_rt_put(rt);
		goto tx_dropped;
	}

	ip_tunnel_adj_headroom(dev, headroom);

	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl,
		      df, !net_eq(tunnel->net, dev_net(dev)));
	return;
tx_error:
	DEV_STATS_INC(dev, tx_errors);
	goto kfree;
tx_dropped:
	DEV_STATS_INC(dev, tx_dropped);
kfree:
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit);

void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
		    const struct iphdr *tnl_params, u8 protocol)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct ip_tunnel_info *tun_info = NULL;
	const struct iphdr *inner_iph;
	unsigned int max_headroom;	/* The extra header space needed */
	struct rtable *rt = NULL;	/* Route to the other host */
	__be16 payload_protocol;
	bool use_cache = false;
	struct flowi4 fl4;
	bool md = false;
	bool connected;
	u8 tos, ttl;
	__be32 dst;
	__be16 df;

	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	connected = (tunnel->parms.iph.daddr != 0);
	payload_protocol = skb_protocol(skb, true);

	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));

	dst = tnl_params->daddr;
	if (dst == 0) {
		/* NBMA tunnel */

		if (!skb_dst(skb)) {
			DEV_STATS_INC(dev, tx_fifo_errors);
			goto tx_error;
		}

		tun_info = skb_tunnel_info(skb);
		if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX) &&
		    ip_tunnel_info_af(tun_info) == AF_INET &&
		    tun_info->key.u.ipv4.dst) {
			dst = tun_info->key.u.ipv4.dst;
			md = true;
			connected = true;
		} else if (payload_protocol == htons(ETH_P_IP)) {
			rt = skb_rtable(skb);
			dst = rt_nexthop(rt, inner_iph->daddr);
		}
#if IS_ENABLED(CONFIG_IPV6)
		else if (payload_protocol == htons(ETH_P_IPV6)) {
			const struct in6_addr *addr6;
			struct neighbour *neigh;
			bool do_tx_error_icmp;
			int addr_type;

			neigh = dst_neigh_lookup(skb_dst(skb),
						 &ipv6_hdr(skb)->daddr);
			if (!neigh)
				goto tx_error;

			addr6 = (const struct in6_addr *)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				do_tx_error_icmp = true;
			else {
				do_tx_error_icmp = false;
				dst = addr6->s6_addr32[3];
			}
			neigh_release(neigh);
			if (do_tx_error_icmp)
				goto tx_error_icmp;
		}
#endif
		else
			goto tx_error;

		if (!md)
			connected = false;
	}

	tos = tnl_params->tos;
	if (tos & 0x1) {
		tos &= ~0x1;
		if (payload_protocol == htons(ETH_P_IP)) {
			tos = inner_iph->tos;
			connected = false;
		} else if (payload_protocol == htons(ETH_P_IPV6)) {
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
			connected = false;
		}
	}

	ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr,
			    tunnel->parms.o_key, RT_TOS(tos),
			    dev_net(dev), tunnel->parms.link,
			    tunnel->fwmark, skb_get_hash(skb), 0);

	if (ip_tunnel_encap(skb, &tunnel->encap, &protocol, &fl4) < 0)
		goto tx_error;

	if (connected && md) {
		use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
		if (use_cache)
			rt = dst_cache_get_ip4(&tun_info->dst_cache,
					       &fl4.saddr);
	} else {
		rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache,
						&fl4.saddr) : NULL;
	}

	if (!rt) {
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (IS_ERR(rt)) {
			DEV_STATS_INC(dev, tx_carrier_errors);
			goto tx_error;
		}
		if (use_cache)
			dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
					  fl4.saddr);
		else if (!md && connected)
			dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
					  fl4.saddr);
	}

	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		DEV_STATS_INC(dev, collisions);
		goto tx_error;
	}

	df = tnl_params->frag_off;
	if (payload_protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
		df |= (inner_iph->frag_off & htons(IP_DF));

	if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, 0, 0, false)) {
		ip_rt_put(rt);
		goto tx_error;
	}

	if (tunnel->err_count > 0) {
		if (time_before(jiffies,
				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
			tunnel->err_count--;

			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = tnl_params->ttl;
	if (ttl == 0) {
		if (payload_protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
		else if (payload_protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
#endif
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}

	max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
			+ rt->dst.header_len + ip_encap_hlen(&tunnel->encap);

	if (skb_cow_head(skb, max_headroom)) {
		ip_rt_put(rt);
		DEV_STATS_INC(dev, tx_dropped);
		kfree_skb(skb);
		return;
	}

	ip_tunnel_adj_headroom(dev, max_headroom);

	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
		      df, !net_eq(tunnel->net, dev_net(dev)));
	return;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
	dst_link_failure(skb);
#endif
tx_error:
	DEV_STATS_INC(dev, tx_errors);
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_tunnel_xmit);

static void ip_tunnel_update(struct ip_tunnel_net *itn,
			     struct ip_tunnel *t,
			     struct net_device *dev,
			     struct ip_tunnel_parm *p,
			     bool set_mtu,
			     __u32 fwmark)
{
	ip_tunnel_del(itn, t);
	t->parms.iph.saddr = p->iph.saddr;
	t->parms.iph.daddr = p->iph.daddr;
	t->parms.i_key = p->i_key;
	t->parms.o_key = p->o_key;
	if (dev->type != ARPHRD_ETHER) {
		__dev_addr_set(dev, &p->iph.saddr, 4);
		memcpy(dev->broadcast, &p->iph.daddr, 4);
	}
	ip_tunnel_add(itn, t);

	t->parms.iph.ttl = p->iph.ttl;
	t->parms.iph.tos = p->iph.tos;
	t->parms.iph.frag_off = p->iph.frag_off;

	if (t->parms.link != p->link || t->fwmark != fwmark) {
		int mtu;

		t->parms.link = p->link;
		t->fwmark = fwmark;
		mtu = ip_tunnel_bind_dev(dev);
		if (set_mtu)
			dev->mtu = mtu;
	}
	dst_cache_reset(&t->dst_cache);
	netdev_state_change(dev);
}

int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
{
	int err = 0;
	struct ip_tunnel *t = netdev_priv(dev);
	struct net *net = t->net;
	struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);

	switch (cmd) {
	case SIOCGETTUNNEL:
		if (dev == itn->fb_tunnel_dev) {
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				t = netdev_priv(dev);
		}
		memcpy(p, &t->parms, sizeof(*p));
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;
		if (p->iph.ttl)
			p->iph.frag_off |= htons(IP_DF);
		if (!(p->i_flags & VTI_ISVTI)) {
			if (!(p->i_flags & TUNNEL_KEY))
				p->i_key = 0;
			if (!(p->o_flags & TUNNEL_KEY))
				p->o_key = 0;
		}

		t = ip_tunnel_find(itn, p, itn->type);

		if (cmd == SIOCADDTUNNEL) {
			if (!t) {
				t = ip_tunnel_create(net, itn, p);
				err = PTR_ERR_OR_ZERO(t);
				break;
			}

			err = -EEXIST;
			break;
		}
		if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t) {
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned int nflags = 0;

				if (ipv4_is_multicast(p->iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p->iph.daddr)
					nflags = IFF_POINTOPOINT;

				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}

				t = netdev_priv(dev);
			}
		}

		if (t) {
			err = 0;
			ip_tunnel_update(itn, t, dev, p, true, 0);
		} else {
			err = -ENOENT;
		}
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;

		if (dev == itn->fb_tunnel_dev) {
			err = -ENOENT;
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				goto done;
			err = -EPERM;
			if (t == netdev_priv(itn->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_ctl);

int ip_tunnel_siocdevprivate(struct net_device *dev, struct ifreq *ifr,
			     void __user *data, int cmd)
{
	struct ip_tunnel_parm p;
	int err;

	if (copy_from_user(&p, data, sizeof(p)))
		return -EFAULT;
	err = dev->netdev_ops->ndo_tunnel_ctl(dev, &p, cmd);
	if (!err && copy_to_user(data, &p, sizeof(p)))
		return -EFAULT;
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_siocdevprivate);

int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);
	int max_mtu = IP_MAX_MTU - t_hlen;

	if (dev->type == ARPHRD_ETHER)
		max_mtu -= dev->hard_header_len;

	if (new_mtu < ETH_MIN_MTU)
		return -EINVAL;

	if (new_mtu > max_mtu) {
		if (strict)
			return -EINVAL;

		new_mtu = max_mtu;
	}

	dev->mtu = new_mtu;
	return 0;
}
EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);

int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
{
	return __ip_tunnel_change_mtu(dev, new_mtu, true);
}
EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);

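/* Worked example (illustrative numbers): for a tunnel whose encapsulation
 * adds tunnel->hlen = 8 bytes (e.g. a keyed GRE-style header), the outer
 * IPv4 header brings t_hlen to 8 + 20 = 28, so the largest accepted MTU is
 * IP_MAX_MTU - 28 = 65535 - 28 = 65507 (further reduced by hard_header_len
 * on Ethernet-type devices). With strict == false, a larger request is
 * silently clamped to that bound instead of rejected.
 */
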
static void ip_tunnel_dev_free(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	gro_cells_destroy(&tunnel->gro_cells);
	dst_cache_destroy(&tunnel->dst_cache);
	free_percpu(dev->tstats);
}

void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct ip_tunnel_net *itn;

	itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);

	if (itn->fb_tunnel_dev != dev) {
		ip_tunnel_del(itn, netdev_priv(dev));
		unregister_netdevice_queue(dev, head);
	}
}
EXPORT_SYMBOL_GPL(ip_tunnel_dellink);

struct net *ip_tunnel_get_link_net(const struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	return tunnel->net;
}
EXPORT_SYMBOL(ip_tunnel_get_link_net);

int ip_tunnel_get_iflink(const struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	return tunnel->parms.link;
}
EXPORT_SYMBOL(ip_tunnel_get_iflink);

int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
		       struct rtnl_link_ops *ops, char *devname)
{
	struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
	struct ip_tunnel_parm parms;
	unsigned int i;

	itn->rtnl_link_ops = ops;
	for (i = 0; i < IP_TNL_HASH_SIZE; i++)
		INIT_HLIST_HEAD(&itn->tunnels[i]);

	if (!ops || !net_has_fallback_tunnels(net)) {
		struct ip_tunnel_net *it_init_net;

		it_init_net = net_generic(&init_net, ip_tnl_net_id);
		itn->type = it_init_net->type;
		itn->fb_tunnel_dev = NULL;
		return 0;
	}

	memset(&parms, 0, sizeof(parms));
	if (devname)
		strscpy(parms.name, devname, IFNAMSIZ);

	rtnl_lock();
	itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
	/* FB netdevice is special: we have one, and only one per netns.
	 * Allowing to move it to another netns is clearly unsafe.
	 */
	if (!IS_ERR(itn->fb_tunnel_dev)) {
		itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
		itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
		ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
		itn->type = itn->fb_tunnel_dev->type;
	}
	rtnl_unlock();

	return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
}
EXPORT_SYMBOL_GPL(ip_tunnel_init_net);

static void ip_tunnel_destroy(struct net *net, struct ip_tunnel_net *itn,
			      struct list_head *head,
			      struct rtnl_link_ops *ops)
{
	struct net_device *dev, *aux;
	int h;

	for_each_netdev_safe(net, dev, aux)
		if (dev->rtnl_link_ops == ops)
			unregister_netdevice_queue(dev, head);

	for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
		struct ip_tunnel *t;
		struct hlist_node *n;
		struct hlist_head *thead = &itn->tunnels[h];

		hlist_for_each_entry_safe(t, n, thead, hash_node)
			/* If dev is in the same netns, it has already
			 * been added to the list by the previous loop.
			 */
			if (!net_eq(dev_net(t->dev), net))
				unregister_netdevice_queue(t->dev, head);
	}
}

void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id,
			   struct rtnl_link_ops *ops)
{
	struct ip_tunnel_net *itn;
	struct net *net;
	LIST_HEAD(list);

	rtnl_lock();
	list_for_each_entry(net, net_list, exit_list) {
		itn = net_generic(net, id);
		ip_tunnel_destroy(net, itn, &list, ops);
	}
	unregister_netdevice_many(&list);
	rtnl_unlock();
}
EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets);

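/* Pernet wiring sketch (hypothetical "foo" tunnel module, modelled on how
 * ipip uses this library; all foo_* names are illustrative):
 *
 *	static int __net_init foo_init_net(struct net *net)
 *	{
 *		return ip_tunnel_init_net(net, foo_net_id, &foo_link_ops, "foo0");
 *	}
 *
 *	static void __net_exit foo_exit_batch_net(struct list_head *list_net)
 *	{
 *		ip_tunnel_delete_nets(list_net, foo_net_id, &foo_link_ops);
 *	}
 *
 *	static struct pernet_operations foo_net_ops = {
 *		.init		= foo_init_net,
 *		.exit_batch	= foo_exit_batch_net,
 *		.id		= &foo_net_id,
 *		.size		= sizeof(struct ip_tunnel_net),
 *	};
 */
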
int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
		      struct ip_tunnel_parm *p, __u32 fwmark)
{
	struct ip_tunnel *nt;
	struct net *net = dev_net(dev);
	struct ip_tunnel_net *itn;
	int mtu;
	int err;

	nt = netdev_priv(dev);
	itn = net_generic(net, nt->ip_tnl_net_id);

	if (nt->collect_md) {
		if (rtnl_dereference(itn->collect_md_tun))
			return -EEXIST;
	} else {
		if (ip_tunnel_find(itn, p, dev->type))
			return -EEXIST;
	}

	nt->net = net;
	nt->parms = *p;
	nt->fwmark = fwmark;
	err = register_netdevice(dev);
	if (err)
		goto err_register_netdevice;

	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
		eth_hw_addr_random(dev);

	mtu = ip_tunnel_bind_dev(dev);
	if (tb[IFLA_MTU]) {
		unsigned int max = IP_MAX_MTU - (nt->hlen + sizeof(struct iphdr));

		if (dev->type == ARPHRD_ETHER)
			max -= dev->hard_header_len;

		mtu = clamp(dev->mtu, (unsigned int)ETH_MIN_MTU, max);
	}

	err = dev_set_mtu(dev, mtu);
	if (err)
		goto err_dev_set_mtu;

	ip_tunnel_add(itn, nt);
	return 0;

err_dev_set_mtu:
	unregister_netdevice(dev);
err_register_netdevice:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_newlink);

int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
			 struct ip_tunnel_parm *p, __u32 fwmark)
{
	struct ip_tunnel *t;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);

	if (dev == itn->fb_tunnel_dev)
		return -EINVAL;

	t = ip_tunnel_find(itn, p, dev->type);

	if (t) {
		if (t->dev != dev)
			return -EEXIST;
	} else {
		t = tunnel;

		if (dev->type != ARPHRD_ETHER) {
			unsigned int nflags = 0;

			if (ipv4_is_multicast(p->iph.daddr))
				nflags = IFF_BROADCAST;
			else if (p->iph.daddr)
				nflags = IFF_POINTOPOINT;

			if ((dev->flags ^ nflags) &
			    (IFF_POINTOPOINT | IFF_BROADCAST))
				return -EINVAL;
		}
	}

	ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU], fwmark);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_changelink);

int ip_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;
	int err;

	dev->needs_free_netdev = true;
	dev->priv_destructor = ip_tunnel_dev_free;
	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
	if (!dev->tstats)
		return -ENOMEM;

	err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
	if (err) {
		free_percpu(dev->tstats);
		return err;
	}

	err = gro_cells_init(&tunnel->gro_cells, dev);
	if (err) {
		dst_cache_destroy(&tunnel->dst_cache);
		free_percpu(dev->tstats);
		return err;
	}

	tunnel->dev = dev;
	tunnel->net = dev_net(dev);
	strcpy(tunnel->parms.name, dev->name);
	iph->version		= 4;
	iph->ihl		= 5;

	if (tunnel->collect_md)
		netif_keep_dst(dev);
	netdev_lockdep_set_classes(dev);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_init);

void ip_tunnel_uninit(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn;

	itn = net_generic(net, tunnel->ip_tnl_net_id);
	ip_tunnel_del(itn, netdev_priv(dev));
	if (itn->fb_tunnel_dev == dev)
		WRITE_ONCE(itn->fb_tunnel_dev, NULL);

	dst_cache_reset(&tunnel->dst_cache);
}
EXPORT_SYMBOL_GPL(ip_tunnel_uninit);

/* Do the least required initialization; the rest is done in the tunnel_init call */
void ip_tunnel_setup(struct net_device *dev, unsigned int net_id)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	tunnel->ip_tnl_net_id = net_id;
}
EXPORT_SYMBOL_GPL(ip_tunnel_setup);

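/* Sketch (hypothetical driver .setup callback, modelled on ipip; all foo_*
 * names are illustrative): the device is associated with its module's
 * per-netns tunnel table here, before ip_tunnel_init() runs:
 *
 *	static void foo_tunnel_setup(struct net_device *dev)
 *	{
 *		dev->netdev_ops = &foo_netdev_ops;
 *		ip_tunnel_setup(dev, foo_net_id);
 *	}
 */
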
MODULE_DESCRIPTION("IPv4 tunnel implementation library");
MODULE_LICENSE("GPL");