2 * Copyright (c) 2013 Nicira, Inc.
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of version 2 of the GNU General Public
6 * License as published by the Free Software Foundation.
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
13 * You should have received a copy of the GNU General Public License
14 * along with this program; if not, write to the Free Software
15 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
21 #include <linux/capability.h>
22 #include <linux/module.h>
23 #include <linux/types.h>
24 #include <linux/kernel.h>
25 #include <linux/slab.h>
26 #include <linux/uaccess.h>
27 #include <linux/skbuff.h>
28 #include <linux/netdevice.h>
30 #include <linux/tcp.h>
31 #include <linux/udp.h>
32 #include <linux/if_arp.h>
33 #include <linux/init.h>
34 #include <linux/in6.h>
35 #include <linux/inetdevice.h>
36 #include <linux/igmp.h>
37 #include <linux/netfilter_ipv4.h>
38 #include <linux/etherdevice.h>
39 #include <linux/if_ether.h>
40 #include <linux/if_vlan.h>
41 #include <linux/rculist.h>
42 #include <linux/err.h>
47 #include <net/protocol.h>
48 #include <net/ip_tunnels.h>
50 #include <net/checksum.h>
51 #include <net/dsfield.h>
52 #include <net/inet_ecn.h>
54 #include <net/net_namespace.h>
55 #include <net/netns/generic.h>
56 #include <net/rtnetlink.h>
58 #include <net/dst_metadata.h>
60 #if IS_ENABLED(CONFIG_IPV6)
62 #include <net/ip6_fib.h>
63 #include <net/ip6_route.h>
66 static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
68 return hash_32((__force u32)key ^ (__force u32)remote,
72 static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
73 __be16 flags, __be32 key)
75 if (p->i_flags & TUNNEL_KEY) {
76 if (flags & TUNNEL_KEY)
77 return key == p->i_key;
79 /* key expected, none present */
82 return !(flags & TUNNEL_KEY);
85 /* Fallback tunnel: no source, no destination, no key, no options
88 We require exact key match i.e. if a key is present in packet
89 it will match only tunnel with the same key; if it is not present,
90 it will match only keyless tunnel.
92 All keyless packets, if not matched against a configured keyless tunnel,
93 will match the fallback tunnel.
94 Given src, dst and key, find appropriate for input tunnel.
/* RX-path tunnel lookup: given the input link index, tunnel flags, outer
 * addresses and key, find the best-matching tunnel device.  Buckets are
 * scanned in decreasing order of specificity:
 *   1) exact saddr + daddr,  2) daddr only,  3) our saddr or multicast
 *   daddr,  4) fully wildcarded addresses (key only);
 * then the per-netns collect_md tunnel, then the fallback device.
 * A tunnel matching everything but the link index is remembered in "cand".
 * Must run under rcu_read_lock().
 * NOTE(review): several lines (continue statements, cand bookkeeping,
 * returns, braces) appear to have been dropped from this extraction —
 * compare against upstream net/ipv4/ip_tunnel.c before building.
 */
96 struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
97 int link, __be16 flags,
98 __be32 remote, __be32 local,
101 struct ip_tunnel *t, *cand = NULL;
102 struct hlist_head *head;
103 struct net_device *ndev;
106 hash = ip_tunnel_hash(key, remote);
107 head = &itn->tunnels[hash];
/* Pass 1: exact source and destination address match. */
109 hlist_for_each_entry_rcu(t, head, hash_node) {
110 if (local != t->parms.iph.saddr ||
111 remote != t->parms.iph.daddr ||
112 !(t->dev->flags & IFF_UP))
115 if (!ip_tunnel_key_match(&t->parms, flags, key))
118 if (t->parms.link == link)
/* Pass 2: destination address only (tunnel has wildcard source). */
124 hlist_for_each_entry_rcu(t, head, hash_node) {
125 if (remote != t->parms.iph.daddr ||
126 t->parms.iph.saddr != 0 ||
127 !(t->dev->flags & IFF_UP))
130 if (!ip_tunnel_key_match(&t->parms, flags, key))
133 if (t->parms.link == link)
/* Passes 3 and 4 re-hash with a wildcard (zero) remote address. */
139 hash = ip_tunnel_hash(key, 0);
140 head = &itn->tunnels[hash];
/* Pass 3: tunnel bound to our source address, or multicast destination. */
142 hlist_for_each_entry_rcu(t, head, hash_node) {
143 if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
144 (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
147 if (!(t->dev->flags & IFF_UP))
150 if (!ip_tunnel_key_match(&t->parms, flags, key))
153 if (t->parms.link == link)
/* Pass 4: fully wildcarded addresses — key match only. */
159 hlist_for_each_entry_rcu(t, head, hash_node) {
160 if ((!(flags & TUNNEL_NO_KEY) && t->parms.i_key != key) ||
161 t->parms.iph.saddr != 0 ||
162 t->parms.iph.daddr != 0 ||
163 !(t->dev->flags & IFF_UP))
166 if (t->parms.link == link)
/* No per-address match: try the metadata-collecting tunnel... */
175 t = rcu_dereference(itn->collect_md_tun);
176 if (t && t->dev->flags & IFF_UP)
/* ...and finally the netns fallback device, if it is up. */
179 ndev = READ_ONCE(itn->fb_tunnel_dev);
180 if (ndev && ndev->flags & IFF_UP)
181 return netdev_priv(ndev);
185 EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
/* Select the hash bucket for a tunnel's configured parameters.  The remote
 * address participates in the hash only when it is a unicast daddr; a VTI
 * tunnel without TUNNEL_KEY hashes with a zeroed key (the zeroing
 * assignment and local declarations appear truncated in this extraction —
 * confirm against upstream). */
187 static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
188 struct ip_tunnel_parm *parms)
192 __be32 i_key = parms->i_key;
/* Only a unicast destination contributes to the hash. */
194 if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
195 remote = parms->iph.daddr;
/* VTI special case: i_key is used as a mark, not a tunnel key. */
199 if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
202 h = ip_tunnel_hash(i_key, remote);
203 return &itn->tunnels[h];
/* Insert tunnel @t into its hash bucket and publish it as the netns
 * collect_md tunnel.  NOTE(review): upstream guards the collect_md_tun
 * assignment with "if (t->collect_md)" — that conditional appears dropped
 * from this extraction. */
206 static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
208 struct hlist_head *head = ip_bucket(itn, &t->parms);
211 rcu_assign_pointer(itn->collect_md_tun, t);
212 hlist_add_head_rcu(&t->hash_node, head);
/* Remove tunnel @t from its hash bucket and clear the collect_md pointer.
 * NOTE(review): upstream guards the NULL assignment with
 * "if (t->collect_md)" — the conditional appears dropped here. */
215 static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
218 rcu_assign_pointer(itn->collect_md_tun, NULL);
219 hlist_del_init_rcu(&t->hash_node);
/* Control-path lookup: find an existing tunnel whose parameters exactly
 * match @parms (addresses, link, key) and whose device type equals @type.
 * Unlike ip_tunnel_lookup() this is used for create/change, not RX.
 * NOTE(review): the loop's "break" and the final "return t" appear
 * truncated in this extraction. */
222 static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
223 struct ip_tunnel_parm *parms,
226 __be32 remote = parms->iph.daddr;
227 __be32 local = parms->iph.saddr;
228 __be32 key = parms->i_key;
229 __be16 flags = parms->i_flags;
230 int link = parms->link;
231 struct ip_tunnel *t = NULL;
232 struct hlist_head *head = ip_bucket(itn, parms);
234 hlist_for_each_entry_rcu(t, head, hash_node) {
235 if (local == t->parms.iph.saddr &&
236 remote == t->parms.iph.daddr &&
237 link == t->parms.link &&
238 type == t->dev->type &&
239 ip_tunnel_key_match(&t->parms, flags, key))
/* Allocate and register a tunnel net_device.  The name comes from
 * parms->name if set (must be valid), otherwise from ops->kind plus a
 * "%d" template.  Returns the new device or an ERR_PTR.
 * NOTE(review): the error-unwind lines (failed label, free_netdev,
 * ERR_PTR returns, the "%d" strcat) appear truncated in this extraction. */
245 static struct net_device *__ip_tunnel_create(struct net *net,
246 const struct rtnl_link_ops *ops,
247 struct ip_tunnel_parm *parms)
250 struct ip_tunnel *tunnel;
251 struct net_device *dev;
255 if (parms->name[0]) {
256 if (!dev_valid_name(parms->name))
258 strlcpy(name, parms->name, IFNAMSIZ);
/* Leave room for the "%d" instance suffix. */
260 if (strlen(ops->kind) > (IFNAMSIZ - 3))
262 strcpy(name, ops->kind);
267 dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
272 dev_net_set(dev, net);
274 dev->rtnl_link_ops = ops;
276 tunnel = netdev_priv(dev);
277 tunnel->parms = *parms;
280 err = register_netdevice(dev);
292 static inline void init_tunnel_flow(struct flowi4 *fl4,
294 __be32 daddr, __be32 saddr,
295 __be32 key, __u8 tos, int oif,
298 memset(fl4, 0, sizeof(*fl4));
299 fl4->flowi4_oif = oif;
302 fl4->flowi4_tos = tos;
303 fl4->flowi4_proto = proto;
304 fl4->fl4_gre_key = key;
305 fl4->flowi4_mark = mark;
/* Bind the tunnel to an underlying device by probing the route toward its
 * configured destination.  Used to derive a sensible needed_headroom and
 * MTU for @dev; returns the computed MTU (clamped to at least
 * IPV4_MIN_MTU).  NOTE(review): the flowi4/rtable declarations, the
 * ip_rt_put/tdev assignment after the route lookup, and the final return
 * appear truncated in this extraction. */
308 static int ip_tunnel_bind_dev(struct net_device *dev)
310 struct net_device *tdev = NULL;
311 struct ip_tunnel *tunnel = netdev_priv(dev);
312 const struct iphdr *iph;
313 int hlen = LL_MAX_HEADER;
314 int mtu = ETH_DATA_LEN;
/* Outer IPv4 header plus any tunnel-specific header. */
315 int t_hlen = tunnel->hlen + sizeof(struct iphdr);
317 iph = &tunnel->parms.iph;
319 /* Guess output device to choose reasonable mtu and needed_headroom */
324 init_tunnel_flow(&fl4, iph->protocol, iph->daddr,
325 iph->saddr, tunnel->parms.o_key,
326 RT_TOS(iph->tos), tunnel->parms.link,
328 rt = ip_route_output_key(tunnel->net, &fl4);
334 if (dev->type != ARPHRD_ETHER)
335 dev->flags |= IFF_POINTOPOINT;
337 dst_cache_reset(&tunnel->dst_cache);
/* No route found: fall back to the explicitly configured link. */
340 if (!tdev && tunnel->parms.link)
341 tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);
344 hlen = tdev->hard_header_len + tdev->needed_headroom;
348 dev->needed_headroom = t_hlen + hlen;
349 mtu -= (dev->hard_header_len + t_hlen);
351 if (mtu < IPV4_MIN_MTU)
/* ioctl-path tunnel creation: clone a device of the same kind as the
 * netns fallback device from @parms, bind it to compute its MTU, set the
 * min/max MTU bounds (0xFFF8 = largest 8-byte-aligned IPv4 payload), and
 * hash it into @itn.  Returns the new tunnel or an ERR_PTR. */
357 static struct ip_tunnel *ip_tunnel_create(struct net *net,
358 struct ip_tunnel_net *itn,
359 struct ip_tunnel_parm *parms)
361 struct ip_tunnel *nt;
362 struct net_device *dev;
365 BUG_ON(!itn->fb_tunnel_dev);
366 dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
368 return ERR_CAST(dev);
370 dev->mtu = ip_tunnel_bind_dev(dev);
372 nt = netdev_priv(dev);
373 t_hlen = nt->hlen + sizeof(struct iphdr);
374 dev->min_mtu = ETH_MIN_MTU;
375 dev->max_mtu = 0xFFF8 - dev->hard_header_len - t_hlen;
376 ip_tunnel_add(itn, nt);
/* Common IPv4 tunnel receive path.  Validates the packet's checksum and
 * sequence flags against the tunnel's configuration, decapsulates ECN,
 * bumps per-CPU RX stats, attaches collected metadata (if any) and hands
 * the skb to the tunnel's GRO cells.
 * NOTE(review): the drop/error labels and kfree_skb path appear truncated
 * in this extraction — compare against upstream before building. */
380 int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
381 const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
384 struct pcpu_sw_netstats *tstats;
385 const struct iphdr *iph = ip_hdr(skb);
388 #ifdef CONFIG_NET_IPGRE_BROADCAST
/* Broadcast GRE: account and mark multicast-addressed packets. */
389 if (ipv4_is_multicast(iph->daddr)) {
390 tunnel->dev->stats.multicast++;
391 skb->pkt_type = PACKET_BROADCAST;
/* Packet and tunnel must agree on TUNNEL_CSUM (both set or both clear). */
395 if ((!(tpi->flags&TUNNEL_CSUM) && (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
396 ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
397 tunnel->dev->stats.rx_crc_errors++;
398 tunnel->dev->stats.rx_errors++;
/* Enforce in-order delivery when TUNNEL_SEQ is configured. */
402 if (tunnel->parms.i_flags&TUNNEL_SEQ) {
403 if (!(tpi->flags&TUNNEL_SEQ) ||
404 (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
405 tunnel->dev->stats.rx_fifo_errors++;
406 tunnel->dev->stats.rx_errors++;
409 tunnel->i_seqno = ntohl(tpi->seq) + 1;
412 skb_reset_network_header(skb);
/* Propagate outer ECN bits into the inner header; drop on bad combos. */
414 err = IP_ECN_decapsulate(iph, skb);
417 net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
418 &iph->saddr, iph->tos);
420 ++tunnel->dev->stats.rx_frame_errors;
421 ++tunnel->dev->stats.rx_errors;
426 tstats = this_cpu_ptr(tunnel->dev->tstats);
427 u64_stats_update_begin(&tstats->syncp);
428 tstats->rx_packets++;
429 tstats->rx_bytes += skb->len;
430 u64_stats_update_end(&tstats->syncp);
/* Scrub state when the packet crosses a netns boundary. */
432 skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));
434 if (tunnel->dev->type == ARPHRD_ETHER) {
435 skb->protocol = eth_type_trans(skb, tunnel->dev);
436 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
438 skb->dev = tunnel->dev;
/* Attach collect_md tunnel metadata for upper layers, if present. */
442 skb_dst_set(skb, (struct dst_entry *)tun_dst);
444 gro_cells_receive(&tunnel->gro_cells, skb);
449 dst_release((struct dst_entry *)tun_dst);
453 EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
/* Register an encapsulation ops table in slot @num of the global
 * iptun_encaps array.  cmpxchg against NULL makes registration atomic;
 * the !cmpxchg result is 0 on success, nonzero if the slot was taken
 * (upstream maps this to -EBUSY — surrounding lines appear truncated). */
455 int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
458 if (num >= MAX_IPTUN_ENCAP_OPS)
461 return !cmpxchg((const struct ip_tunnel_encap_ops **)
465 EXPORT_SYMBOL(ip_tunnel_encap_add_ops);
/* Unregister the encapsulation ops in slot @num, but only if the slot
 * still holds @ops (cmpxchg ops -> NULL).  Returns 0 on success, -1 if
 * the slot held something else.  Upstream follows with synchronize_net()
 * — that line appears truncated in this extraction. */
467 int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
472 if (num >= MAX_IPTUN_ENCAP_OPS)
475 ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
477 ops, NULL) == ops) ? 0 : -1;
483 EXPORT_SYMBOL(ip_tunnel_encap_del_ops);
/* Configure tunnel @t's secondary encapsulation (e.g. FOU/GUE) from
 * @ipencap: copy type/ports/flags and recompute the cached header
 * lengths.  hlen = encap header + tunnel header. */
485 int ip_tunnel_encap_setup(struct ip_tunnel *t,
486 struct ip_tunnel_encap *ipencap)
490 memset(&t->encap, 0, sizeof(t->encap));
492 hlen = ip_encap_hlen(ipencap);
496 t->encap.type = ipencap->type;
497 t->encap.sport = ipencap->sport;
498 t->encap.dport = ipencap->dport;
499 t->encap.flags = ipencap->flags;
501 t->encap_hlen = hlen;
502 t->hlen = t->encap_hlen + t->tun_hlen;
506 EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);
/* Path-MTU handling on the tunnel TX path.  Computes the usable MTU from
 * the outer route, records it on the inner dst, and — when the inner
 * packet is too big and DF applies — sends ICMP "frag needed" (IPv4) or
 * ICMPv6 "packet too big" and signals the caller to drop.
 * NOTE(review): several lines (the pkt_size/mtu guards and error returns)
 * appear truncated in this extraction. */
508 static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
509 struct rtable *rt, __be16 df,
510 const struct iphdr *inner_iph)
512 struct ip_tunnel *tunnel = netdev_priv(dev);
513 int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len;
/* With DF set, the tunnel overhead comes out of the route MTU. */
517 mtu = dst_mtu(&rt->dst) - dev->hard_header_len
518 - sizeof(struct iphdr) - tunnel->hlen;
520 mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
522 skb_dst_update_pmtu_no_confirm(skb, mtu);
524 if (skb->protocol == htons(ETH_P_IP)) {
525 if (!skb_is_gso(skb) &&
526 (inner_iph->frag_off & htons(IP_DF)) &&
528 memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
529 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
533 #if IS_ENABLED(CONFIG_IPV6)
534 else if (skb->protocol == htons(ETH_P_IPV6)) {
535 struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
/* Only pin the lower MTU on host routes or explicitly bound tunnels. */
537 if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
538 mtu >= IPV6_MIN_MTU) {
539 if ((tunnel->parms.iph.daddr &&
540 !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
541 rt6->rt6i_dst.plen == 128) {
542 rt6->rt6i_flags |= RTF_MODIFIED;
543 dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
547 if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
549 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
/* TX path for metadata-based (collect_md) tunnels: the outer header is
 * built entirely from the skb's attached ip_tunnel_info rather than from
 * per-device parameters.  Resolves the route, derives tos/ttl/df from the
 * metadata and inner headers, grows headroom as needed, and emits via
 * iptunnel_xmit().
 * NOTE(review): local declarations (fl4, rt, tos, ttl, df) and the
 * tx_error/kfree_skb labels appear truncated in this extraction. */
557 void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, u8 proto)
559 struct ip_tunnel *tunnel = netdev_priv(dev);
560 u32 headroom = sizeof(struct iphdr);
561 struct ip_tunnel_info *tun_info;
562 const struct ip_tunnel_key *key;
563 const struct iphdr *inner_iph;
/* Metadata must exist, be marked for TX, and be IPv4. */
569 tun_info = skb_tunnel_info(skb);
570 if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
571 ip_tunnel_info_af(tun_info) != AF_INET))
573 key = &tun_info->key;
574 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
575 inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
/* Inherit TOS/DSCP from the inner packet. */
578 if (skb->protocol == htons(ETH_P_IP))
579 tos = inner_iph->tos;
580 else if (skb->protocol == htons(ETH_P_IPV6))
581 tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
583 init_tunnel_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src, 0,
584 RT_TOS(tos), tunnel->parms.link, tunnel->fwmark);
585 if (tunnel->encap.type != TUNNEL_ENCAP_NONE)
587 rt = ip_route_output_key(tunnel->net, &fl4);
589 dev->stats.tx_carrier_errors++;
/* Routing back to ourselves would loop forever. */
592 if (rt->dst.dev == dev) {
594 dev->stats.collisions++;
597 tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
/* TTL: inner packet's, else the route's default hop limit. */
600 if (skb->protocol == htons(ETH_P_IP))
601 ttl = inner_iph->ttl;
602 else if (skb->protocol == htons(ETH_P_IPV6))
603 ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
605 ttl = ip4_dst_hoplimit(&rt->dst);
607 if (key->tun_flags & TUNNEL_DONT_FRAGMENT)
609 else if (skb->protocol == htons(ETH_P_IP))
610 df = inner_iph->frag_off & htons(IP_DF);
611 headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
/* needed_headroom only ever grows; READ/WRITE_ONCE for lockless racers. */
612 if (headroom > READ_ONCE(dev->needed_headroom))
613 WRITE_ONCE(dev->needed_headroom, headroom);
615 if (skb_cow_head(skb, READ_ONCE(dev->needed_headroom))) {
619 iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl,
620 df, !net_eq(tunnel->net, dev_net(dev)));
623 dev->stats.tx_errors++;
626 dev->stats.tx_dropped++;
630 EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit);
/* Main TX path for classically-configured IPv4 tunnels.  Determines the
 * outer destination (from device parms, skb metadata, the inner route's
 * nexthop, or an IPv6 neighbour for 6-over-4), resolves and caches the
 * outer route, enforces PMTU, derives tos/ttl/df, grows headroom, and
 * emits the packet via iptunnel_xmit().
 * NOTE(review): many lines (local tos/ttl/df/dst declarations, gotos,
 * labels, closing braces) appear truncated in this extraction — compare
 * against upstream net/ipv4/ip_tunnel.c before building. */
632 void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
633 const struct iphdr *tnl_params, u8 protocol)
635 struct ip_tunnel *tunnel = netdev_priv(dev);
636 unsigned int inner_nhdr_len = 0;
637 const struct iphdr *inner_iph;
641 struct rtable *rt; /* Route to the other host */
642 unsigned int max_headroom; /* The extra header space needed */
646 /* ensure we can access the inner net header, for several users below */
647 if (skb->protocol == htons(ETH_P_IP))
648 inner_nhdr_len = sizeof(struct iphdr);
649 else if (skb->protocol == htons(ETH_P_IPV6))
650 inner_nhdr_len = sizeof(struct ipv6hdr);
651 if (unlikely(!pskb_may_pull(skb, inner_nhdr_len)))
654 inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
/* "connected" = destination fixed by configuration (enables dst cache). */
655 connected = (tunnel->parms.iph.daddr != 0);
657 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
659 dst = tnl_params->daddr;
/* NBMA tunnel (no configured daddr): derive the destination per packet. */
662 struct ip_tunnel_info *tun_info;
665 dev->stats.tx_fifo_errors++;
669 tun_info = skb_tunnel_info(skb);
670 if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX) &&
671 ip_tunnel_info_af(tun_info) == AF_INET &&
672 tun_info->key.u.ipv4.dst)
673 dst = tun_info->key.u.ipv4.dst;
674 else if (skb->protocol == htons(ETH_P_IP)) {
675 rt = skb_rtable(skb);
676 dst = rt_nexthop(rt, inner_iph->daddr);
678 #if IS_ENABLED(CONFIG_IPV6)
/* 6-over-4: extract a v4 destination from the IPv6 neighbour entry. */
679 else if (skb->protocol == htons(ETH_P_IPV6)) {
680 const struct in6_addr *addr6;
681 struct neighbour *neigh;
682 bool do_tx_error_icmp;
685 neigh = dst_neigh_lookup(skb_dst(skb),
686 &ipv6_hdr(skb)->daddr);
690 addr6 = (const struct in6_addr *)&neigh->primary_key;
691 addr_type = ipv6_addr_type(addr6);
693 if (addr_type == IPV6_ADDR_ANY) {
694 addr6 = &ipv6_hdr(skb)->daddr;
695 addr_type = ipv6_addr_type(addr6);
698 if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
699 do_tx_error_icmp = true;
/* v4-compatible address: the low 32 bits are the v4 destination. */
701 do_tx_error_icmp = false;
702 dst = addr6->s6_addr32[3];
704 neigh_release(neigh);
705 if (do_tx_error_icmp)
/* TOS: configured value, or inherited from the inner packet. */
715 tos = tnl_params->tos;
718 if (skb->protocol == htons(ETH_P_IP)) {
719 tos = inner_iph->tos;
721 } else if (skb->protocol == htons(ETH_P_IPV6)) {
722 tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
727 init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr,
728 tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link,
731 if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
/* Use the per-tunnel dst cache only for connected tunnels. */
734 rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache, &fl4.saddr) :
738 rt = ip_route_output_key(tunnel->net, &fl4);
741 dev->stats.tx_carrier_errors++;
745 dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
/* Routing back to ourselves would loop forever. */
749 if (rt->dst.dev == dev) {
751 dev->stats.collisions++;
755 df = tnl_params->frag_off;
756 if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
757 df |= (inner_iph->frag_off & htons(IP_DF));
759 if (tnl_update_pmtu(dev, skb, rt, df, inner_iph)) {
/* Rate-limit link-failure notifications after ICMP errors. */
764 if (tunnel->err_count > 0) {
765 if (time_before(jiffies,
766 tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
769 dst_link_failure(skb);
771 tunnel->err_count = 0;
774 tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
775 ttl = tnl_params->ttl;
/* TTL 0 means inherit: inner packet's, else the route's hop limit. */
777 if (skb->protocol == htons(ETH_P_IP))
778 ttl = inner_iph->ttl;
779 #if IS_ENABLED(CONFIG_IPV6)
780 else if (skb->protocol == htons(ETH_P_IPV6))
781 ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
784 ttl = ip4_dst_hoplimit(&rt->dst);
787 max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
788 + rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
/* needed_headroom only ever grows; READ/WRITE_ONCE for lockless racers. */
789 if (max_headroom > READ_ONCE(dev->needed_headroom))
790 WRITE_ONCE(dev->needed_headroom, max_headroom);
792 if (skb_cow_head(skb, READ_ONCE(dev->needed_headroom))) {
794 dev->stats.tx_dropped++;
799 iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
800 df, !net_eq(tunnel->net, dev_net(dev)));
803 #if IS_ENABLED(CONFIG_IPV6)
805 dst_link_failure(skb);
808 dev->stats.tx_errors++;
811 EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
/* Apply new parameters @p to an existing tunnel @t.  The tunnel is
 * unhashed and rehashed because the addresses/key feed the hash; for
 * non-Ethernet tunnels the device hw/broadcast addresses mirror the outer
 * saddr/daddr.  If the link or fwmark changed the device is rebound to
 * recompute its MTU (upstream applies it when set_mtu — truncated here).
 * Ends by flushing the dst cache and notifying userspace. */
813 static void ip_tunnel_update(struct ip_tunnel_net *itn,
815 struct net_device *dev,
816 struct ip_tunnel_parm *p,
820 ip_tunnel_del(itn, t);
821 t->parms.iph.saddr = p->iph.saddr;
822 t->parms.iph.daddr = p->iph.daddr;
823 t->parms.i_key = p->i_key;
824 t->parms.o_key = p->o_key;
825 if (dev->type != ARPHRD_ETHER) {
826 memcpy(dev->dev_addr, &p->iph.saddr, 4);
827 memcpy(dev->broadcast, &p->iph.daddr, 4);
829 ip_tunnel_add(itn, t);
831 t->parms.iph.ttl = p->iph.ttl;
832 t->parms.iph.tos = p->iph.tos;
833 t->parms.iph.frag_off = p->iph.frag_off;
835 if (t->parms.link != p->link || t->fwmark != fwmark) {
838 t->parms.link = p->link;
840 mtu = ip_tunnel_bind_dev(dev);
844 dst_cache_reset(&t->dst_cache);
845 netdev_state_change(dev);
/* SIOC{GET,ADD,CHG,DEL}TUNNEL ioctl handler shared by all IPv4 tunnel
 * drivers.  GET copies the tunnel's parms out; ADD/CHG require
 * CAP_NET_ADMIN, normalise the key flags, create or update the tunnel;
 * DEL unregisters it (the fallback device itself cannot be deleted).
 * NOTE(review): the switch/case labels, err assignments and the final
 * return appear truncated in this extraction. */
848 int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
851 struct ip_tunnel *t = netdev_priv(dev);
852 struct net *net = t->net;
853 struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);
855 BUG_ON(!itn->fb_tunnel_dev);
/* GET on the fallback device looks up by the supplied parms. */
858 if (dev == itn->fb_tunnel_dev) {
859 t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
861 t = netdev_priv(dev);
863 memcpy(p, &t->parms, sizeof(*p));
869 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
872 p->iph.frag_off |= htons(IP_DF);
/* Without TUNNEL_KEY the corresponding key value is meaningless. */
873 if (!(p->i_flags & VTI_ISVTI)) {
874 if (!(p->i_flags & TUNNEL_KEY))
876 if (!(p->o_flags & TUNNEL_KEY))
880 t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
882 if (cmd == SIOCADDTUNNEL) {
884 t = ip_tunnel_create(net, itn, p);
885 err = PTR_ERR_OR_ZERO(t);
/* CHG on a specific device: validate the new parms against it. */
892 if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
899 unsigned int nflags = 0;
901 if (ipv4_is_multicast(p->iph.daddr))
902 nflags = IFF_BROADCAST;
903 else if (p->iph.daddr)
904 nflags = IFF_POINTOPOINT;
/* Cannot flip between broadcast and point-to-point on the fly. */
906 if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
911 t = netdev_priv(dev);
917 ip_tunnel_update(itn, t, dev, p, true, 0);
925 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
/* DEL on the fallback device deletes the tunnel named by parms... */
928 if (dev == itn->fb_tunnel_dev) {
930 t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
/* ...but never the fallback device itself. */
934 if (t == netdev_priv(itn->fb_tunnel_dev))
938 unregister_netdevice(dev);
949 EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
/* Validate and apply a new MTU for a tunnel device.  The ceiling is the
 * largest 8-byte-aligned IPv4 payload (0xFFF8) minus link and tunnel
 * header overhead; @strict rejects over-limit values while non-strict
 * clamps (the clamp/assignment/return lines appear truncated here). */
951 int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
953 struct ip_tunnel *tunnel = netdev_priv(dev);
954 int t_hlen = tunnel->hlen + sizeof(struct iphdr);
955 int max_mtu = 0xFFF8 - dev->hard_header_len - t_hlen;
957 if (new_mtu < ETH_MIN_MTU)
960 if (new_mtu > max_mtu) {
970 EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);
972 int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
974 return __ip_tunnel_change_mtu(dev, new_mtu, true);
976 EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
978 static void ip_tunnel_dev_free(struct net_device *dev)
980 struct ip_tunnel *tunnel = netdev_priv(dev);
982 gro_cells_destroy(&tunnel->gro_cells);
983 dst_cache_destroy(&tunnel->dst_cache);
984 free_percpu(dev->tstats);
/* rtnl dellink handler: unhash the tunnel and queue its device for
 * unregistration — except for the netns fallback device, which is never
 * deleted this way. */
987 void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
989 struct ip_tunnel *tunnel = netdev_priv(dev);
990 struct ip_tunnel_net *itn;
992 itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);
994 if (itn->fb_tunnel_dev != dev) {
995 ip_tunnel_del(itn, netdev_priv(dev));
996 unregister_netdevice_queue(dev, head);
999 EXPORT_SYMBOL_GPL(ip_tunnel_dellink);
1001 struct net *ip_tunnel_get_link_net(const struct net_device *dev)
1003 struct ip_tunnel *tunnel = netdev_priv(dev);
1007 EXPORT_SYMBOL(ip_tunnel_get_link_net);
1009 int ip_tunnel_get_iflink(const struct net_device *dev)
1011 struct ip_tunnel *tunnel = netdev_priv(dev);
1013 return tunnel->parms.link;
1015 EXPORT_SYMBOL(ip_tunnel_get_iflink);
/* Per-netns init for a tunnel type: initialise the hash table and, when
 * @devname is given, create the netns fallback device.  The fallback
 * device is pinned to its netns (NETIF_F_NETNS_LOCAL).  Returns 0 or the
 * fallback-creation error.
 * NOTE(review): the rtnl_lock/unlock pair and the early "if (!ops)"
 * return appear truncated in this extraction. */
1017 int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
1018 struct rtnl_link_ops *ops, char *devname)
1020 struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
1021 struct ip_tunnel_parm parms;
1024 for (i = 0; i < IP_TNL_HASH_SIZE; i++)
1025 INIT_HLIST_HEAD(&itn->tunnels[i]);
1028 itn->fb_tunnel_dev = NULL;
1032 memset(&parms, 0, sizeof(parms));
1034 strlcpy(parms.name, devname, IFNAMSIZ);
1037 itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
1038 /* FB netdevice is special: we have one, and only one per netns.
1039 * Allowing to move it to another netns is clearly unsafe.
1041 if (!IS_ERR(itn->fb_tunnel_dev)) {
1042 itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
1043 itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
1044 ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
1048 return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
/* Queue every device of this tunnel type for unregistration at netns
 * teardown: first all matching devices in the dying netns, then tunnels
 * hashed here whose device lives in a different netns (cross-netns
 * tunnels would otherwise be missed by the first loop). */
1052 static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head,
1053 struct rtnl_link_ops *ops)
1055 struct net *net = dev_net(itn->fb_tunnel_dev);
1056 struct net_device *dev, *aux;
1059 for_each_netdev_safe(net, dev, aux)
1060 if (dev->rtnl_link_ops == ops)
1061 unregister_netdevice_queue(dev, head);
1063 for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
1064 struct ip_tunnel *t;
1065 struct hlist_node *n;
1066 struct hlist_head *thead = &itn->tunnels[h];
1068 hlist_for_each_entry_safe(t, n, thead, hash_node)
1069 /* If dev is in the same netns, it has already
1070 * been added to the list by the previous loop.
1072 if (!net_eq(dev_net(t->dev), net))
1073 unregister_netdevice_queue(t->dev, head);
/* Public netns-exit helper: collect all tunnels of this type onto a list
 * and unregister them in one batch.  NOTE(review): the LIST_HEAD
 * declaration and the rtnl_lock/unlock pair appear truncated in this
 * extraction. */
1077 void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops)
1082 ip_tunnel_destroy(itn, &list, ops);
1083 unregister_netdevice_many(&list);
1086 EXPORT_SYMBOL_GPL(ip_tunnel_delete_net);
/* rtnl newlink handler: register a new tunnel device with parameters @p
 * and fwmark.  Only one collect_md tunnel per netns is allowed, and the
 * (addresses, key, link, type) tuple must not already exist.  On success
 * the device gets a random MAC if Ethernet, is bound to compute its MTU
 * (clamped unless userspace supplied IFLA_MTU — the tb[IFLA_MTU] guard
 * appears truncated here), and is hashed into @itn. */
1088 int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
1089 struct ip_tunnel_parm *p, __u32 fwmark)
1091 struct ip_tunnel *nt;
1092 struct net *net = dev_net(dev);
1093 struct ip_tunnel_net *itn;
1097 nt = netdev_priv(dev);
1098 itn = net_generic(net, nt->ip_tnl_net_id);
/* At most one metadata-collecting tunnel per netns. */
1100 if (nt->collect_md) {
1101 if (rtnl_dereference(itn->collect_md_tun))
1104 if (ip_tunnel_find(itn, p, dev->type))
1110 nt->fwmark = fwmark;
1111 err = register_netdevice(dev);
1115 if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1116 eth_hw_addr_random(dev);
1118 mtu = ip_tunnel_bind_dev(dev);
/* 0xfff8: largest 8-byte-aligned IPv4 payload. */
1120 unsigned int max = 0xfff8 - dev->hard_header_len - nt->hlen;
1122 dev->mtu = clamp(dev->mtu, (unsigned int)ETH_MIN_MTU,
1123 (unsigned int)(max - sizeof(struct iphdr)));
1128 ip_tunnel_add(itn, nt);
1132 EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
/* rtnl changelink handler: apply new parameters @p to an existing tunnel.
 * The fallback device cannot be reconfigured; the new parms must not
 * collide with a different existing tunnel; and a non-Ethernet device
 * cannot switch between broadcast and point-to-point modes.  MTU is
 * recomputed unless userspace supplied IFLA_MTU. */
1134 int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
1135 struct ip_tunnel_parm *p, __u32 fwmark)
1137 struct ip_tunnel *t;
1138 struct ip_tunnel *tunnel = netdev_priv(dev);
1139 struct net *net = tunnel->net;
1140 struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);
1142 if (dev == itn->fb_tunnel_dev)
1145 t = ip_tunnel_find(itn, p, dev->type);
1153 if (dev->type != ARPHRD_ETHER) {
1154 unsigned int nflags = 0;
1156 if (ipv4_is_multicast(p->iph.daddr))
1157 nflags = IFF_BROADCAST;
1158 else if (p->iph.daddr)
1159 nflags = IFF_POINTOPOINT;
/* Cannot flip between broadcast and point-to-point on the fly. */
1161 if ((dev->flags ^ nflags) &
1162 (IFF_POINTOPOINT | IFF_BROADCAST))
1167 ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU], fwmark);
1170 EXPORT_SYMBOL_GPL(ip_tunnel_changelink);
/* ndo_init for tunnel devices: allocate per-CPU stats, the dst cache and
 * GRO cells (each later failure unwinds the earlier allocations), record
 * the device's netns and name, and keep the dst for collect_md tunnels.
 * NOTE(review): the -ENOMEM check after the tstats allocation and the
 * iph->version/ihl initialisation appear truncated in this extraction. */
1172 int ip_tunnel_init(struct net_device *dev)
1174 struct ip_tunnel *tunnel = netdev_priv(dev);
1175 struct iphdr *iph = &tunnel->parms.iph;
1178 dev->needs_free_netdev = true;
1179 dev->priv_destructor = ip_tunnel_dev_free;
1180 dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
1184 err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
1186 free_percpu(dev->tstats);
1190 err = gro_cells_init(&tunnel->gro_cells, dev);
1192 dst_cache_destroy(&tunnel->dst_cache);
1193 free_percpu(dev->tstats);
1198 tunnel->net = dev_net(dev);
1199 strcpy(tunnel->parms.name, dev->name);
1203 if (tunnel->collect_md)
1204 netif_keep_dst(dev);
1207 EXPORT_SYMBOL_GPL(ip_tunnel_init);
/* ndo_uninit: unhash the tunnel; if it was the netns fallback device,
 * clear the published pointer (WRITE_ONCE pairs with the READ_ONCE in
 * ip_tunnel_lookup()); finally flush the cached route. */
1209 void ip_tunnel_uninit(struct net_device *dev)
1211 struct ip_tunnel *tunnel = netdev_priv(dev);
1212 struct net *net = tunnel->net;
1213 struct ip_tunnel_net *itn;
1215 itn = net_generic(net, tunnel->ip_tnl_net_id);
1216 ip_tunnel_del(itn, netdev_priv(dev));
1217 if (itn->fb_tunnel_dev == dev)
1218 WRITE_ONCE(itn->fb_tunnel_dev, NULL);
1220 dst_cache_reset(&tunnel->dst_cache);
1224 /* Do least required initialization, rest of init is done in tunnel_init call */
1225 void ip_tunnel_setup(struct net_device *dev, unsigned int net_id)
1227 struct ip_tunnel *tunnel = netdev_priv(dev);
1228 tunnel->ip_tnl_net_id = net_id;
1230 EXPORT_SYMBOL_GPL(ip_tunnel_setup);
1232 MODULE_LICENSE("GPL");