1 // SPDX-License-Identifier: GPL-2.0-only
3 * Copyright (c) 2013 Nicira, Inc.
6 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
8 #include <linux/capability.h>
9 #include <linux/module.h>
10 #include <linux/types.h>
11 #include <linux/kernel.h>
12 #include <linux/slab.h>
13 #include <linux/uaccess.h>
14 #include <linux/skbuff.h>
15 #include <linux/netdevice.h>
17 #include <linux/tcp.h>
18 #include <linux/udp.h>
19 #include <linux/if_arp.h>
20 #include <linux/init.h>
21 #include <linux/in6.h>
22 #include <linux/inetdevice.h>
23 #include <linux/igmp.h>
24 #include <linux/netfilter_ipv4.h>
25 #include <linux/etherdevice.h>
26 #include <linux/if_ether.h>
27 #include <linux/if_vlan.h>
28 #include <linux/rculist.h>
29 #include <linux/err.h>
34 #include <net/protocol.h>
35 #include <net/ip_tunnels.h>
37 #include <net/checksum.h>
38 #include <net/dsfield.h>
39 #include <net/inet_ecn.h>
41 #include <net/net_namespace.h>
42 #include <net/netns/generic.h>
43 #include <net/rtnetlink.h>
45 #include <net/dst_metadata.h>
47 #if IS_ENABLED(CONFIG_IPV6)
49 #include <net/ip6_fib.h>
50 #include <net/ip6_route.h>
/* Fold the tunnel key and remote IPv4 address into a hash bucket index
 * for itn->tunnels.  NOTE(review): the width argument to hash_32()
 * continues on a line elided from this chunk -- presumably
 * IP_TNL_HASH_BITS; confirm against the full file.
 */
53 static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
55 return hash_32((__force u32)key ^ (__force u32)remote,
/* Decide whether a received packet's GRE-style flags/key satisfy the
 * tunnel's configured input-key policy in @p:
 *  - tunnel expects a key (TUNNEL_KEY in i_flags): packet must carry the
 *    identical key;
 *  - tunnel is keyless: packet must carry no key at all.
 */
59 static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
60 __be16 flags, __be32 key)
62 if (p->i_flags & TUNNEL_KEY) {
63 if (flags & TUNNEL_KEY)
64 return key == p->i_key;
66 /* key expected, none present */
/* keyless tunnel: match only keyless packets */
69 return !(flags & TUNNEL_KEY);
72 /* Fallback tunnel: no source, no destination, no key, no options
75 We require exact key match i.e. if a key is present in packet
76 it will match only tunnel with the same key; if it is not present,
77 it will match only keyless tunnel.
79 All keyless packets, if not matching configured keyless tunnels,
80 will match the fallback tunnel.
81 Given src, dst and key, find appropriate for input tunnel.
/* RCU lookup of the receiving tunnel for (link, flags, remote, local, key).
 * Four passes of decreasing specificity over the hash buckets:
 *   1) exact (local, remote) match,
 *   2) remote-only (wildcard source address),
 *   3) local-only, or local multicast destination,
 *   4) fully wildcarded addresses, key-only.
 * Exact-link matches win; looser candidates are remembered in @cand
 * (the cand-assignment lines are elided from this chunk).  Falls back to
 * the collect_md tunnel, then the per-netns fallback device.
 * Caller must hold rcu_read_lock().
 */
83 struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
84 int link, __be16 flags,
85 __be32 remote, __be32 local,
88 struct ip_tunnel *t, *cand = NULL;
89 struct hlist_head *head;
90 struct net_device *ndev;
/* pass 1: exact source and destination */
93 hash = ip_tunnel_hash(key, remote);
94 head = &itn->tunnels[hash];
96 hlist_for_each_entry_rcu(t, head, hash_node) {
97 if (local != t->parms.iph.saddr ||
98 remote != t->parms.iph.daddr ||
99 !(t->dev->flags & IFF_UP))
102 if (!ip_tunnel_key_match(&t->parms, flags, key))
105 if (t->parms.link == link)
/* pass 2: same remote, wildcard (zero) source */
111 hlist_for_each_entry_rcu(t, head, hash_node) {
112 if (remote != t->parms.iph.daddr ||
113 t->parms.iph.saddr != 0 ||
114 !(t->dev->flags & IFF_UP))
117 if (!ip_tunnel_key_match(&t->parms, flags, key))
120 if (t->parms.link == link)
/* pass 3: rehash with zero remote; match on local address only,
 * or on a multicast destination equal to @local
 */
126 hash = ip_tunnel_hash(key, 0);
127 head = &itn->tunnels[hash];
129 hlist_for_each_entry_rcu(t, head, hash_node) {
130 if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
131 (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
134 if (!(t->dev->flags & IFF_UP))
137 if (!ip_tunnel_key_match(&t->parms, flags, key))
140 if (t->parms.link == link)
/* pass 4: fully wildcarded addresses, key comparison only */
146 hlist_for_each_entry_rcu(t, head, hash_node) {
147 if ((!(flags & TUNNEL_NO_KEY) && t->parms.i_key != key) ||
148 t->parms.iph.saddr != 0 ||
149 t->parms.iph.daddr != 0 ||
150 !(t->dev->flags & IFF_UP))
153 if (t->parms.link == link)
/* no configured tunnel matched: try collect_md, then fallback dev */
162 t = rcu_dereference(itn->collect_md_tun);
163 if (t && t->dev->flags & IFF_UP)
/* READ_ONCE pairs with the WRITE_ONCE in ip_tunnel_uninit() */
166 ndev = READ_ONCE(itn->fb_tunnel_dev);
167 if (ndev && ndev->flags & IFF_UP)
168 return netdev_priv(ndev);
172 EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
/* Select the itn->tunnels hash bucket for a tunnel's parameters.
 * Unicast destinations hash on the remote address; multicast/zero
 * destinations hash on remote == 0.  Keyless VTI tunnels discard the
 * i_key before hashing (the assignment line is elided from this chunk).
 */
174 static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
175 struct ip_tunnel_parm *parms)
179 __be32 i_key = parms->i_key;
181 if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
182 remote = parms->iph.daddr;
186 if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
189 h = ip_tunnel_hash(i_key, remote);
190 return &itn->tunnels[h];
/* Publish tunnel @t in the per-netns hash table (RCU-safe insertion).
 * collect_md tunnels are additionally published via itn->collect_md_tun
 * (the guarding condition line is elided from this chunk).
 */
193 static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
195 struct hlist_head *head = ip_bucket(itn, &t->parms);
198 rcu_assign_pointer(itn->collect_md_tun, t);
199 hlist_add_head_rcu(&t->hash_node, head);
/* Unpublish tunnel @t: clear the collect_md pointer (when applicable --
 * the condition line is elided here) and RCU-remove it from the hash.
 */
202 static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
205 rcu_assign_pointer(itn->collect_md_tun, NULL);
206 hlist_del_init_rcu(&t->hash_node);
/* Find a configured tunnel exactly matching @parms (addresses, link,
 * device type and key policy).  Used by the management paths (ioctl /
 * netlink), not the fast path.  Returns NULL when nothing matches
 * (return statements are elided from this chunk).
 */
209 static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
210 struct ip_tunnel_parm *parms,
213 __be32 remote = parms->iph.daddr;
214 __be32 local = parms->iph.saddr;
215 __be32 key = parms->i_key;
216 __be16 flags = parms->i_flags;
217 int link = parms->link;
218 struct ip_tunnel *t = NULL;
219 struct hlist_head *head = ip_bucket(itn, parms);
221 hlist_for_each_entry_rcu(t, head, hash_node) {
222 if (local == t->parms.iph.saddr &&
223 remote == t->parms.iph.daddr &&
224 link == t->parms.link &&
225 type == t->dev->type &&
226 ip_tunnel_key_match(&t->parms, flags, key))
/* Allocate and register a tunnel net_device for @parms under RTNL.
 * The device name comes from parms->name when set (validated first),
 * otherwise it is derived from ops->kind (a "%d" template suffix is
 * appended on a line elided from this chunk).  On success the device's
 * ip_tunnel private area holds a copy of @parms.  Error unwinding paths
 * are also elided here.
 */
232 static struct net_device *__ip_tunnel_create(struct net *net,
233 const struct rtnl_link_ops *ops,
234 struct ip_tunnel_parm *parms)
237 struct ip_tunnel *tunnel;
238 struct net_device *dev;
242 if (parms->name[0]) {
243 if (!dev_valid_name(parms->name))
245 strlcpy(name, parms->name, IFNAMSIZ);
/* reserve room for the "%d" instance suffix */
247 if (strlen(ops->kind) > (IFNAMSIZ - 3))
249 strcpy(name, ops->kind);
254 dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
259 dev_net_set(dev, net);
261 dev->rtnl_link_ops = ops;
263 tunnel = netdev_priv(dev);
264 tunnel->parms = *parms;
267 err = register_netdevice(dev);
/* Derive a reasonable MTU and needed_headroom for the tunnel device by
 * routing toward the configured endpoint (or falling back to the bound
 * lower device, tunnel->parms.link).  Returns the computed MTU, clamped
 * to at least IPV4_MIN_MTU (the clamp/return lines are elided from this
 * chunk).
 */
279 static int ip_tunnel_bind_dev(struct net_device *dev)
281 struct net_device *tdev = NULL;
282 struct ip_tunnel *tunnel = netdev_priv(dev);
283 const struct iphdr *iph;
284 int hlen = LL_MAX_HEADER;
285 int mtu = ETH_DATA_LEN;
/* outer IP header plus tunnel-specific encapsulation overhead */
286 int t_hlen = tunnel->hlen + sizeof(struct iphdr);
288 iph = &tunnel->parms.iph;
290 /* Guess output device to choose reasonable mtu and needed_headroom */
295 ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr,
296 iph->saddr, tunnel->parms.o_key,
297 RT_TOS(iph->tos), tunnel->parms.link,
299 rt = ip_route_output_key(tunnel->net, &fl4);
305 if (dev->type != ARPHRD_ETHER)
306 dev->flags |= IFF_POINTOPOINT;
308 dst_cache_reset(&tunnel->dst_cache);
/* no route found: fall back to the explicitly bound lower device */
311 if (!tdev && tunnel->parms.link)
312 tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);
315 hlen = tdev->hard_header_len + tdev->needed_headroom;
316 mtu = min(tdev->mtu, IP_MAX_MTU);
319 dev->needed_headroom = t_hlen + hlen;
/* Ethernet-type tunnels also carry an inner MAC header */
320 mtu -= t_hlen + (dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0);
322 if (mtu < IPV4_MIN_MTU)
/* Create, configure and hash a new tunnel for @parms (ioctl path).
 * Builds the device, binds it to a route to size the MTU, sets the
 * min/max MTU bounds, then publishes it via ip_tunnel_add().  On
 * dev_set_mtu() failure the device is unregistered again (label and
 * return lines are elided from this chunk).
 */
328 static struct ip_tunnel *ip_tunnel_create(struct net *net,
329 struct ip_tunnel_net *itn,
330 struct ip_tunnel_parm *parms)
332 struct ip_tunnel *nt;
333 struct net_device *dev;
338 dev = __ip_tunnel_create(net, itn->rtnl_link_ops, parms);
340 return ERR_CAST(dev);
342 mtu = ip_tunnel_bind_dev(dev);
343 err = dev_set_mtu(dev, mtu);
345 goto err_dev_set_mtu;
347 nt = netdev_priv(dev);
348 t_hlen = nt->hlen + sizeof(struct iphdr);
349 dev->min_mtu = ETH_MIN_MTU;
350 dev->max_mtu = IP_MAX_MTU - t_hlen;
351 if (dev->type == ARPHRD_ETHER)
352 dev->max_mtu -= dev->hard_header_len;
354 ip_tunnel_add(itn, nt);
/* err_dev_set_mtu unwind path */
358 unregister_netdevice(dev);
/* Receive path for a decapsulated tunnel packet.  Validates the
 * checksum and sequence-number policy against the tunnel's i_flags,
 * applies ECN decapsulation, updates per-CPU rx stats, scrubs the skb
 * on netns crossings and hands it to GRO.  Error paths (drop labels,
 * kfree_skb) are elided from this chunk.
 */
362 int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
363 const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
366 struct pcpu_sw_netstats *tstats;
367 const struct iphdr *iph = ip_hdr(skb);
370 #ifdef CONFIG_NET_IPGRE_BROADCAST
371 if (ipv4_is_multicast(iph->daddr)) {
372 tunnel->dev->stats.multicast++;
373 skb->pkt_type = PACKET_BROADCAST;
/* checksum presence must agree with the tunnel's TUNNEL_CSUM policy */
377 if ((!(tpi->flags&TUNNEL_CSUM) && (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
378 ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
379 tunnel->dev->stats.rx_crc_errors++;
380 tunnel->dev->stats.rx_errors++;
/* strict in-order sequence check when TUNNEL_SEQ is configured */
384 if (tunnel->parms.i_flags&TUNNEL_SEQ) {
385 if (!(tpi->flags&TUNNEL_SEQ) ||
386 (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
387 tunnel->dev->stats.rx_fifo_errors++;
388 tunnel->dev->stats.rx_errors++;
391 tunnel->i_seqno = ntohl(tpi->seq) + 1;
394 skb_set_network_header(skb, (tunnel->dev->type == ARPHRD_ETHER) ? ETH_HLEN : 0);
396 err = IP_ECN_decapsulate(iph, skb);
399 net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
400 &iph->saddr, iph->tos);
402 ++tunnel->dev->stats.rx_frame_errors;
403 ++tunnel->dev->stats.rx_errors;
408 tstats = this_cpu_ptr(tunnel->dev->tstats);
409 u64_stats_update_begin(&tstats->syncp);
410 tstats->rx_packets++;
411 tstats->rx_bytes += skb->len;
412 u64_stats_update_end(&tstats->syncp);
/* drop cross-netns state (mark, secpath, ...) when tunnels span netns */
414 skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));
416 if (tunnel->dev->type == ARPHRD_ETHER) {
417 skb->protocol = eth_type_trans(skb, tunnel->dev);
418 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
420 skb->dev = tunnel->dev;
424 skb_dst_set(skb, (struct dst_entry *)tun_dst);
426 gro_cells_receive(&tunnel->gro_cells, skb);
/* error path: release the metadata dst taken for this packet */
431 dst_release((struct dst_entry *)tun_dst);
435 EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
/* Register an encapsulation ops table in slot @num.  The cmpxchg()
 * succeeds only when the slot is currently empty, making registration
 * race-free without a lock (the iptun_encaps array operand is on a line
 * elided from this chunk).
 */
437 int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
440 if (num >= MAX_IPTUN_ENCAP_OPS)
443 return !cmpxchg((const struct ip_tunnel_encap_ops **)
447 EXPORT_SYMBOL(ip_tunnel_encap_add_ops);
/* Unregister the encapsulation ops in slot @num.  Only succeeds (ret 0)
 * when the slot still holds @ops; otherwise returns -1.  An RCU grace
 * period is waited for on a line elided from this chunk.
 */
449 int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
454 if (num >= MAX_IPTUN_ENCAP_OPS)
457 ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
459 ops, NULL) == ops) ? 0 : -1;
465 EXPORT_SYMBOL(ip_tunnel_encap_del_ops);
/* Apply an encapsulation configuration to tunnel @t: copy the
 * type/port/flags fields and recompute the cached header lengths.
 * ip_encap_hlen() error handling is elided from this chunk.
 */
467 int ip_tunnel_encap_setup(struct ip_tunnel *t,
468 struct ip_tunnel_encap *ipencap)
472 memset(&t->encap, 0, sizeof(t->encap));
474 hlen = ip_encap_hlen(ipencap);
478 t->encap.type = ipencap->type;
479 t->encap.sport = ipencap->sport;
480 t->encap.dport = ipencap->dport;
481 t->encap.flags = ipencap->flags;
483 t->encap_hlen = hlen;
/* total tunnel header = encap header + protocol tunnel header */
484 t->hlen = t->encap_hlen + t->tun_hlen;
488 EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);
/* Path-MTU handling for the transmit path: compute the effective tunnel
 * MTU from the outer route, propagate it to the inner dst, and emit
 * ICMP FRAG_NEEDED / ICMPV6 PKT_TOOBIG toward the sender when the inner
 * packet does not fit and DF semantics require it.  Return statements
 * and some guard lines are elided from this chunk.
 * @md: true for metadata (collect_md) tunnels; use @tunnel_hlen/@dst
 *      instead of the tunnel's own configuration.
 */
490 static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
491 struct rtable *rt, __be16 df,
492 const struct iphdr *inner_iph,
493 int tunnel_hlen, __be32 dst, bool md)
495 struct ip_tunnel *tunnel = netdev_priv(dev);
499 tunnel_hlen = md ? tunnel_hlen : tunnel->hlen;
500 pkt_size = skb->len - tunnel_hlen;
501 pkt_size -= dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0;
/* DF set: MTU is the outer route's MTU minus encapsulation overhead */
504 mtu = dst_mtu(&rt->dst) - (sizeof(struct iphdr) + tunnel_hlen);
505 mtu -= dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0;
507 mtu = skb_valid_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
510 if (skb_valid_dst(skb))
511 skb_dst_update_pmtu_no_confirm(skb, mtu);
513 if (skb->protocol == htons(ETH_P_IP)) {
514 if (!skb_is_gso(skb) &&
515 (inner_iph->frag_off & htons(IP_DF)) &&
517 memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
518 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
522 #if IS_ENABLED(CONFIG_IPV6)
523 else if (skb->protocol == htons(ETH_P_IPV6)) {
524 struct rt6_info *rt6;
527 rt6 = skb_valid_dst(skb) ? (struct rt6_info *)skb_dst(skb) :
529 daddr = md ? dst : tunnel->parms.iph.daddr;
/* record the reduced MTU on the IPv6 route when it is host-specific */
531 if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
532 mtu >= IPV6_MIN_MTU) {
533 if ((daddr && !ipv4_is_multicast(daddr)) ||
534 rt6->rt6i_dst.plen == 128) {
535 rt6->rt6i_flags |= RTF_MODIFIED;
536 dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
540 if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
542 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
/* Transmit path for metadata-mode (collect_md) tunnels: all outer
 * parameters come from the skb's attached tun_info rather than the
 * device configuration.  Routes the outer flow (with dst-cache reuse),
 * enforces PMTU, maps inner TOS/TTL to the outer header and hands off
 * to iptunnel_xmit().  Drop/error labels are elided from this chunk.
 */
550 void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
551 u8 proto, int tunnel_hlen)
553 struct ip_tunnel *tunnel = netdev_priv(dev);
554 u32 headroom = sizeof(struct iphdr);
555 struct ip_tunnel_info *tun_info;
556 const struct ip_tunnel_key *key;
557 const struct iphdr *inner_iph;
558 struct rtable *rt = NULL;
/* require IPv4 TX metadata; anything else is dropped */
564 tun_info = skb_tunnel_info(skb);
565 if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
566 ip_tunnel_info_af(tun_info) != AF_INET))
568 key = &tun_info->key;
569 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
570 inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
/* inherit TOS/DSCP from the inner packet when possible */
573 if (skb->protocol == htons(ETH_P_IP))
574 tos = inner_iph->tos;
575 else if (skb->protocol == htons(ETH_P_IPV6))
576 tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
578 ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src,
579 tunnel_id_to_key32(key->tun_id), RT_TOS(tos),
580 0, skb->mark, skb_get_hash(skb));
581 if (tunnel->encap.type != TUNNEL_ENCAP_NONE)
584 use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
586 rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl4.saddr);
588 rt = ip_route_output_key(tunnel->net, &fl4);
590 dev->stats.tx_carrier_errors++;
594 dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
/* routing loop: packet would re-enter this same device */
597 if (rt->dst.dev == dev) {
599 dev->stats.collisions++;
603 if (key->tun_flags & TUNNEL_DONT_FRAGMENT)
605 if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, tunnel_hlen,
606 key->u.ipv4.dst, true)) {
611 tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
614 if (skb->protocol == htons(ETH_P_IP))
615 ttl = inner_iph->ttl;
616 else if (skb->protocol == htons(ETH_P_IPV6))
617 ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
619 ttl = ip4_dst_hoplimit(&rt->dst);
/* grow (never shrink) the cached headroom requirement */
622 headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
623 if (headroom > READ_ONCE(dev->needed_headroom))
624 WRITE_ONCE(dev->needed_headroom, headroom);
626 if (skb_cow_head(skb, READ_ONCE(dev->needed_headroom))) {
630 iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl,
631 df, !net_eq(tunnel->net, dev_net(dev)));
/* error accounting paths */
634 dev->stats.tx_errors++;
637 dev->stats.tx_dropped++;
641 EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit);
/* Main transmit path for configured (non-metadata) tunnels.  Resolves
 * the outer destination -- from @tnl_params, from TX metadata when
 * present, or (for NBMA-style tunnels with no configured daddr) from
 * the inner IPv4 route / IPv6 neighbour entry.  Then routes the outer
 * flow with dst-cache reuse, enforces DF/PMTU, rate-limits after link
 * errors, derives outer TOS/TTL and emits via iptunnel_xmit().
 * Numerous lines (labels, some assignments) are elided from this chunk.
 */
643 void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
644 const struct iphdr *tnl_params, u8 protocol)
646 struct ip_tunnel *tunnel = netdev_priv(dev);
647 struct ip_tunnel_info *tun_info = NULL;
648 const struct iphdr *inner_iph;
649 unsigned int max_headroom; /* The extra header space needed */
650 struct rtable *rt = NULL; /* Route to the other host */
651 bool use_cache = false;
659 inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
660 connected = (tunnel->parms.iph.daddr != 0);
662 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
664 dst = tnl_params->daddr;
/* no configured destination: must discover one per packet */
669 dev->stats.tx_fifo_errors++;
673 tun_info = skb_tunnel_info(skb);
674 if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX) &&
675 ip_tunnel_info_af(tun_info) == AF_INET &&
676 tun_info->key.u.ipv4.dst) {
677 dst = tun_info->key.u.ipv4.dst;
681 else if (skb->protocol == htons(ETH_P_IP)) {
682 rt = skb_rtable(skb);
683 dst = rt_nexthop(rt, inner_iph->daddr);
685 #if IS_ENABLED(CONFIG_IPV6)
/* IPv6 payload: derive an IPv4-compatible destination from the
 * neighbour entry (6to4-style), else signal an error back
 */
686 else if (skb->protocol == htons(ETH_P_IPV6)) {
687 const struct in6_addr *addr6;
688 struct neighbour *neigh;
689 bool do_tx_error_icmp;
692 neigh = dst_neigh_lookup(skb_dst(skb),
693 &ipv6_hdr(skb)->daddr);
697 addr6 = (const struct in6_addr *)&neigh->primary_key;
698 addr_type = ipv6_addr_type(addr6);
700 if (addr_type == IPV6_ADDR_ANY) {
701 addr6 = &ipv6_hdr(skb)->daddr;
702 addr_type = ipv6_addr_type(addr6);
705 if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
706 do_tx_error_icmp = true;
708 do_tx_error_icmp = false;
709 dst = addr6->s6_addr32[3];
711 neigh_release(neigh);
712 if (do_tx_error_icmp)
723 tos = tnl_params->tos;
/* tos == 1 means "inherit from inner packet" (flag handling elided) */
726 if (skb->protocol == htons(ETH_P_IP)) {
727 tos = inner_iph->tos;
729 } else if (skb->protocol == htons(ETH_P_IPV6)) {
730 tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
735 ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr,
736 tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link,
737 tunnel->fwmark, skb_get_hash(skb));
739 if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
/* prefer the metadata dst cache, then the tunnel's own cache */
742 if (connected && md) {
743 use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
745 rt = dst_cache_get_ip4(&tun_info->dst_cache,
748 rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache,
753 rt = ip_route_output_key(tunnel->net, &fl4);
756 dev->stats.tx_carrier_errors++;
760 dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
762 else if (!md && connected)
763 dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
/* routing loop back onto this device */
767 if (rt->dst.dev == dev) {
769 dev->stats.collisions++;
773 df = tnl_params->frag_off;
774 if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
775 df |= (inner_iph->frag_off & htons(IP_DF));
777 if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, 0, 0, false)) {
/* suppress transmission for a while after repeated link errors */
782 if (tunnel->err_count > 0) {
783 if (time_before(jiffies,
784 tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
787 dst_link_failure(skb);
789 tunnel->err_count = 0;
792 tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
793 ttl = tnl_params->ttl;
795 if (skb->protocol == htons(ETH_P_IP))
796 ttl = inner_iph->ttl;
797 #if IS_ENABLED(CONFIG_IPV6)
798 else if (skb->protocol == htons(ETH_P_IPV6))
799 ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
802 ttl = ip4_dst_hoplimit(&rt->dst);
805 max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
806 + rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
807 if (max_headroom > READ_ONCE(dev->needed_headroom))
808 WRITE_ONCE(dev->needed_headroom, max_headroom);
810 if (skb_cow_head(skb, READ_ONCE(dev->needed_headroom))) {
812 dev->stats.tx_dropped++;
817 iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
818 df, !net_eq(tunnel->net, dev_net(dev)));
821 #if IS_ENABLED(CONFIG_IPV6)
823 dst_link_failure(skb);
826 dev->stats.tx_errors++;
829 EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
/* Apply new parameters @p to an existing tunnel @t.  The tunnel is
 * removed from and re-added to the hash table because the addresses/key
 * determine its bucket.  Non-Ethernet devices mirror the endpoint
 * addresses into dev_addr/broadcast.  A link or fwmark change forces a
 * re-bind to recompute the MTU (set_mtu handling is elided from this
 * chunk).  Caller holds RTNL.
 */
831 static void ip_tunnel_update(struct ip_tunnel_net *itn,
833 struct net_device *dev,
834 struct ip_tunnel_parm *p,
838 ip_tunnel_del(itn, t);
839 t->parms.iph.saddr = p->iph.saddr;
840 t->parms.iph.daddr = p->iph.daddr;
841 t->parms.i_key = p->i_key;
842 t->parms.o_key = p->o_key;
843 if (dev->type != ARPHRD_ETHER) {
844 memcpy(dev->dev_addr, &p->iph.saddr, 4);
845 memcpy(dev->broadcast, &p->iph.daddr, 4);
847 ip_tunnel_add(itn, t);
849 t->parms.iph.ttl = p->iph.ttl;
850 t->parms.iph.tos = p->iph.tos;
851 t->parms.iph.frag_off = p->iph.frag_off;
853 if (t->parms.link != p->link || t->fwmark != fwmark) {
856 t->parms.link = p->link;
858 mtu = ip_tunnel_bind_dev(dev);
862 dst_cache_reset(&t->dst_cache);
863 netdev_state_change(dev);
/* Legacy ioctl-based tunnel management (SIOCGETTUNNEL / SIOCADDTUNNEL /
 * SIOCCHGTUNNEL / SIOCDELTUNNEL -- the switch statement itself is
 * elided from this chunk).  GET reports parameters; ADD/CHG require
 * CAP_NET_ADMIN, normalize the request and create or update the tunnel;
 * DEL unregisters it, refusing to delete the per-netns fallback device.
 */
866 int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
869 struct ip_tunnel *t = netdev_priv(dev);
870 struct net *net = t->net;
871 struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);
/* GET on the fallback device looks up the tunnel described by @p */
875 if (dev == itn->fb_tunnel_dev) {
876 t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
878 t = netdev_priv(dev);
880 memcpy(p, &t->parms, sizeof(*p));
/* ADD / CHG */
886 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
889 p->iph.frag_off |= htons(IP_DF);
/* non-VTI tunnels: a zero key only matters when TUNNEL_KEY is set */
890 if (!(p->i_flags & VTI_ISVTI)) {
891 if (!(p->i_flags & TUNNEL_KEY))
893 if (!(p->o_flags & TUNNEL_KEY))
897 t = ip_tunnel_find(itn, p, itn->type);
899 if (cmd == SIOCADDTUNNEL) {
901 t = ip_tunnel_create(net, itn, p);
902 err = PTR_ERR_OR_ZERO(t);
909 if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
/* reject changes that would flip broadcast/pointopoint mode */
916 unsigned int nflags = 0;
918 if (ipv4_is_multicast(p->iph.daddr))
919 nflags = IFF_BROADCAST;
920 else if (p->iph.daddr)
921 nflags = IFF_POINTOPOINT;
923 if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
928 t = netdev_priv(dev);
934 ip_tunnel_update(itn, t, dev, p, true, 0);
/* DEL */
942 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
945 if (dev == itn->fb_tunnel_dev) {
947 t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
/* never delete the fallback tunnel itself */
951 if (t == netdev_priv(itn->fb_tunnel_dev))
955 unregister_netdevice(dev);
966 EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
/* Validate and apply a new MTU, accounting for tunnel header overhead
 * (and the inner MAC header for Ethernet-type tunnels).  @strict decides
 * whether an over-limit request is rejected or clamped (the clamp and
 * assignment lines are elided from this chunk).
 */
968 int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
970 struct ip_tunnel *tunnel = netdev_priv(dev);
971 int t_hlen = tunnel->hlen + sizeof(struct iphdr);
972 int max_mtu = IP_MAX_MTU - t_hlen;
974 if (dev->type == ARPHRD_ETHER)
975 max_mtu -= dev->hard_header_len;
977 if (new_mtu < ETH_MIN_MTU)
980 if (new_mtu > max_mtu) {
990 EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);
/* ndo_change_mtu helper: strict variant of __ip_tunnel_change_mtu(). */
992 int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
994 return __ip_tunnel_change_mtu(dev, new_mtu, true);
996 EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
/* priv_destructor: release per-tunnel resources (GRO cells, dst cache,
 * per-CPU stats) when the netdev is freed.
 */
998 static void ip_tunnel_dev_free(struct net_device *dev)
1000 struct ip_tunnel *tunnel = netdev_priv(dev);
1002 gro_cells_destroy(&tunnel->gro_cells);
1003 dst_cache_destroy(&tunnel->dst_cache);
1004 free_percpu(dev->tstats);
/* rtnl dellink handler: unhash and queue the device for unregistration.
 * The per-netns fallback device is deliberately skipped -- it is torn
 * down only with its namespace.
 */
1007 void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
1009 struct ip_tunnel *tunnel = netdev_priv(dev);
1010 struct ip_tunnel_net *itn;
1012 itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);
1014 if (itn->fb_tunnel_dev != dev) {
1015 ip_tunnel_del(itn, netdev_priv(dev));
1016 unregister_netdevice_queue(dev, head);
1019 EXPORT_SYMBOL_GPL(ip_tunnel_dellink);
/* Report the netns the tunnel transmits in (tunnel->net; the return
 * line is elided from this chunk).
 */
1021 struct net *ip_tunnel_get_link_net(const struct net_device *dev)
1023 struct ip_tunnel *tunnel = netdev_priv(dev);
1027 EXPORT_SYMBOL(ip_tunnel_get_link_net);
/* ndo_get_iflink: the ifindex of the lower device this tunnel is bound
 * to (0 when unbound).
 */
1029 int ip_tunnel_get_iflink(const struct net_device *dev)
1031 struct ip_tunnel *tunnel = netdev_priv(dev);
1033 return tunnel->parms.link;
1035 EXPORT_SYMBOL(ip_tunnel_get_iflink);
/* Per-netns initialization for a tunnel type: set up the hash table and,
 * when fallback tunnels are enabled for this netns, create the special
 * per-netns fallback device named @devname.  When fallback tunnels are
 * disabled, the type is inherited from init_net's instance instead.
 */
1037 int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
1038 struct rtnl_link_ops *ops, char *devname)
1040 struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
1041 struct ip_tunnel_parm parms;
1044 itn->rtnl_link_ops = ops;
1045 for (i = 0; i < IP_TNL_HASH_SIZE; i++)
1046 INIT_HLIST_HEAD(&itn->tunnels[i]);
1048 if (!ops || !net_has_fallback_tunnels(net)) {
1049 struct ip_tunnel_net *it_init_net;
1051 it_init_net = net_generic(&init_net, ip_tnl_net_id);
1052 itn->type = it_init_net->type;
1053 itn->fb_tunnel_dev = NULL;
/* create the fallback device from an all-zero parameter set */
1057 memset(&parms, 0, sizeof(parms));
1059 strlcpy(parms.name, devname, IFNAMSIZ);
1062 itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
1063 /* FB netdevice is special: we have one, and only one per netns.
1064 * Allowing to move it to another netns is clearly unsafe.
1066 if (!IS_ERR(itn->fb_tunnel_dev)) {
1067 itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
1068 itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
1069 ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
1070 itn->type = itn->fb_tunnel_dev->type;
1074 return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
1076 EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
/* Queue every tunnel device of this type in @net for unregistration:
 * first all netdevs owned by @ops, then any hashed tunnels whose netdev
 * lives in a different netns (and so was missed by the first loop).
 * Caller batches the actual unregistration via @head.
 */
1078 static void ip_tunnel_destroy(struct net *net, struct ip_tunnel_net *itn,
1079 struct list_head *head,
1080 struct rtnl_link_ops *ops)
1082 struct net_device *dev, *aux;
1085 for_each_netdev_safe(net, dev, aux)
1086 if (dev->rtnl_link_ops == ops)
1087 unregister_netdevice_queue(dev, head);
1089 for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
1090 struct ip_tunnel *t;
1091 struct hlist_node *n;
1092 struct hlist_head *thead = &itn->tunnels[h];
1094 hlist_for_each_entry_safe(t, n, thead, hash_node)
1095 /* If dev is in the same netns, it has already
1096 * been added to the list by the previous loop.
1098 if (!net_eq(dev_net(t->dev), net))
1099 unregister_netdevice_queue(t->dev, head);
/* Batch-exit handler: destroy all tunnels of type @id in every netns on
 * @net_list, then unregister them in one RTNL transaction.  (RTNL
 * lock/unlock lines are elided from this chunk.)
 */
1103 void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id,
1104 struct rtnl_link_ops *ops)
1106 struct ip_tunnel_net *itn;
1111 list_for_each_entry(net, net_list, exit_list) {
1112 itn = net_generic(net, id);
1113 ip_tunnel_destroy(net, itn, &list, ops);
1115 unregister_netdevice_many(&list);
1118 EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets);
/* rtnl newlink handler: register a new tunnel device with parameters @p.
 * At most one collect_md tunnel per netns is allowed, and duplicate
 * parameter sets are rejected.  When no explicit MTU attribute was
 * given (check elided from this chunk), the bound-device MTU is clamped
 * into the valid range before being applied.
 */
1120 int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
1121 struct ip_tunnel_parm *p, __u32 fwmark)
1123 struct ip_tunnel *nt;
1124 struct net *net = dev_net(dev);
1125 struct ip_tunnel_net *itn;
1129 nt = netdev_priv(dev);
1130 itn = net_generic(net, nt->ip_tnl_net_id);
1132 if (nt->collect_md) {
1133 if (rtnl_dereference(itn->collect_md_tun))
1136 if (ip_tunnel_find(itn, p, dev->type))
1142 nt->fwmark = fwmark;
1143 err = register_netdevice(dev);
1145 goto err_register_netdevice;
1147 if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1148 eth_hw_addr_random(dev);
1150 mtu = ip_tunnel_bind_dev(dev);
1152 unsigned int max = IP_MAX_MTU - (nt->hlen + sizeof(struct iphdr));
1154 if (dev->type == ARPHRD_ETHER)
1155 max -= dev->hard_header_len;
1157 mtu = clamp(dev->mtu, (unsigned int)ETH_MIN_MTU, max);
1160 err = dev_set_mtu(dev, mtu);
1162 goto err_dev_set_mtu;
1164 ip_tunnel_add(itn, nt);
/* unwind paths */
1168 unregister_netdevice(dev);
1169 err_register_netdevice:
1172 EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
/* rtnl changelink handler: validate the new parameters and apply them
 * via ip_tunnel_update().  The fallback device cannot be reconfigured,
 * and (as in the ioctl path) a change that would flip the device's
 * broadcast/pointopoint mode is rejected.  Some validation/return lines
 * are elided from this chunk.
 */
1174 int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
1175 struct ip_tunnel_parm *p, __u32 fwmark)
1177 struct ip_tunnel *t;
1178 struct ip_tunnel *tunnel = netdev_priv(dev);
1179 struct net *net = tunnel->net;
1180 struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);
1182 if (dev == itn->fb_tunnel_dev)
1185 t = ip_tunnel_find(itn, p, dev->type);
1193 if (dev->type != ARPHRD_ETHER) {
1194 unsigned int nflags = 0;
1196 if (ipv4_is_multicast(p->iph.daddr))
1197 nflags = IFF_BROADCAST;
1198 else if (p->iph.daddr)
1199 nflags = IFF_POINTOPOINT;
1201 if ((dev->flags ^ nflags) &
1202 (IFF_POINTOPOINT | IFF_BROADCAST))
/* keep the user-supplied MTU when IFLA_MTU was present */
1207 ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU], fwmark);
1210 EXPORT_SYMBOL_GPL(ip_tunnel_changelink);
/* ndo_init: allocate per-tunnel resources (per-CPU stats, dst cache,
 * GRO cells), record the owning netns and device name, and keep the dst
 * on collect_md tunnels so metadata survives to transmit time.  Each
 * allocation failure unwinds the earlier ones (some error-return lines
 * are elided from this chunk).
 */
1212 int ip_tunnel_init(struct net_device *dev)
1214 struct ip_tunnel *tunnel = netdev_priv(dev);
1215 struct iphdr *iph = &tunnel->parms.iph;
1218 dev->needs_free_netdev = true;
1219 dev->priv_destructor = ip_tunnel_dev_free;
1220 dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
1224 err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
1226 free_percpu(dev->tstats);
1230 err = gro_cells_init(&tunnel->gro_cells, dev);
1232 dst_cache_destroy(&tunnel->dst_cache);
1233 free_percpu(dev->tstats);
1238 tunnel->net = dev_net(dev);
1239 strcpy(tunnel->parms.name, dev->name);
1243 if (tunnel->collect_md)
1244 netif_keep_dst(dev);
1247 EXPORT_SYMBOL_GPL(ip_tunnel_init);
/* ndo_uninit: unhash the tunnel and, when tearing down the fallback
 * device, clear itn->fb_tunnel_dev.  WRITE_ONCE pairs with the
 * READ_ONCE in ip_tunnel_lookup() so concurrent RCU readers see either
 * the old device or NULL.
 */
1249 void ip_tunnel_uninit(struct net_device *dev)
1251 struct ip_tunnel *tunnel = netdev_priv(dev);
1252 struct net *net = tunnel->net;
1253 struct ip_tunnel_net *itn;
1255 itn = net_generic(net, tunnel->ip_tnl_net_id);
1256 ip_tunnel_del(itn, netdev_priv(dev));
1257 if (itn->fb_tunnel_dev == dev)
1258 WRITE_ONCE(itn->fb_tunnel_dev, NULL);
1260 dst_cache_reset(&tunnel->dst_cache);
1262 EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
1264 /* Do least required initialization, rest of init is done in tunnel_init call */
1265 void ip_tunnel_setup(struct net_device *dev, unsigned int net_id)
1267 struct ip_tunnel *tunnel = netdev_priv(dev);
/* remember which per-netns generic slot this tunnel type uses */
1268 tunnel->ip_tnl_net_id = net_id;
1270 EXPORT_SYMBOL_GPL(ip_tunnel_setup);
1272 MODULE_LICENSE("GPL");