// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2013 Nicira, Inc.
 */
6 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
8 #include <linux/capability.h>
9 #include <linux/module.h>
10 #include <linux/types.h>
11 #include <linux/kernel.h>
12 #include <linux/slab.h>
13 #include <linux/uaccess.h>
14 #include <linux/skbuff.h>
15 #include <linux/netdevice.h>
17 #include <linux/tcp.h>
18 #include <linux/udp.h>
19 #include <linux/if_arp.h>
20 #include <linux/init.h>
21 #include <linux/in6.h>
22 #include <linux/inetdevice.h>
23 #include <linux/igmp.h>
24 #include <linux/netfilter_ipv4.h>
25 #include <linux/etherdevice.h>
26 #include <linux/if_ether.h>
27 #include <linux/if_vlan.h>
28 #include <linux/rculist.h>
29 #include <linux/err.h>
34 #include <net/protocol.h>
35 #include <net/ip_tunnels.h>
37 #include <net/checksum.h>
38 #include <net/dsfield.h>
39 #include <net/inet_ecn.h>
41 #include <net/net_namespace.h>
42 #include <net/netns/generic.h>
43 #include <net/rtnetlink.h>
45 #include <net/dst_metadata.h>
47 #if IS_ENABLED(CONFIG_IPV6)
49 #include <net/ip6_fib.h>
50 #include <net/ip6_route.h>
53 static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
55 return hash_32((__force u32)key ^ (__force u32)remote,
59 static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
60 __be16 flags, __be32 key)
62 if (p->i_flags & TUNNEL_KEY) {
63 if (flags & TUNNEL_KEY)
64 return key == p->i_key;
66 /* key expected, none present */
69 return !(flags & TUNNEL_KEY);
/* Fallback tunnel: no source, no destination, no key, no options.

   We require an exact key match, i.e. if a key is present in the packet
   it will match only a tunnel with the same key; if it is not present,
   it will match only a keyless tunnel.

   All keyless packets, if not matched against configured keyless tunnels,
   will match the fallback tunnel.
   Given src, dst and key, find the appropriate tunnel for input.
*/
/* Receive-path tunnel lookup, called under RCU.
 *
 * Tries progressively weaker matches: (saddr, daddr) exact; daddr only;
 * saddr-only or local-multicast bucket; fully wildcard bucket. Within each
 * pass an exact-link match wins outright; `cand` apparently holds a
 * link-mismatched fallback candidate. Finally falls back to the
 * collect_md tunnel and then the per-netns fallback device.
 *
 * NOTE(review): this listing is missing lines (opening braces, `continue`
 * statements, `cand` bookkeeping, returns) — compare against the original
 * file before editing; code below is kept byte-identical.
 */
83 struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
84 int link, __be16 flags,
85 __be32 remote, __be32 local,
88 struct ip_tunnel *t, *cand = NULL;
89 struct hlist_head *head;
90 struct net_device *ndev;
/* Pass 1: exact (key, remote) bucket, exact saddr+daddr, device up. */
93 hash = ip_tunnel_hash(key, remote);
94 head = &itn->tunnels[hash];
96 hlist_for_each_entry_rcu(t, head, hash_node) {
97 if (local != t->parms.iph.saddr ||
98 remote != t->parms.iph.daddr ||
99 !(t->dev->flags & IFF_UP))
102 if (!ip_tunnel_key_match(&t->parms, flags, key))
105 if (t->parms.link == link)
/* Pass 2: same bucket, daddr matches and tunnel has no local address. */
111 hlist_for_each_entry_rcu(t, head, hash_node) {
112 if (remote != t->parms.iph.daddr ||
113 t->parms.iph.saddr != 0 ||
114 !(t->dev->flags & IFF_UP))
117 if (!ip_tunnel_key_match(&t->parms, flags, key))
120 if (t->parms.link == link)
/* Pass 3: remote-wildcard bucket — saddr-only match, or packet sent to a
 * local multicast address.
 */
126 hash = ip_tunnel_hash(key, 0);
127 head = &itn->tunnels[hash];
129 hlist_for_each_entry_rcu(t, head, hash_node) {
130 if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
131 (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
134 if (!(t->dev->flags & IFF_UP))
137 if (!ip_tunnel_key_match(&t->parms, flags, key))
140 if (t->parms.link == link)
/* Pass 4: fully wildcard tunnels (no saddr, no daddr), keyed or keyless. */
146 hlist_for_each_entry_rcu(t, head, hash_node) {
147 if ((!(flags & TUNNEL_NO_KEY) && t->parms.i_key != key) ||
148 t->parms.iph.saddr != 0 ||
149 t->parms.iph.daddr != 0 ||
150 !(t->dev->flags & IFF_UP))
153 if (t->parms.link == link)
/* Last resorts: metadata-collection tunnel, then the fallback device.
 * fb_tunnel_dev is read with READ_ONCE because ip_tunnel_uninit clears it
 * with WRITE_ONCE.
 */
162 t = rcu_dereference(itn->collect_md_tun);
163 if (t && t->dev->flags & IFF_UP)
166 ndev = READ_ONCE(itn->fb_tunnel_dev);
167 if (ndev && ndev->flags & IFF_UP)
168 return netdev_priv(ndev);
172 EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
174 static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
175 struct ip_tunnel_parm *parms)
179 __be32 i_key = parms->i_key;
181 if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
182 remote = parms->iph.daddr;
186 if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
189 h = ip_tunnel_hash(i_key, remote);
190 return &itn->tunnels[h];
193 static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
195 struct hlist_head *head = ip_bucket(itn, &t->parms);
198 rcu_assign_pointer(itn->collect_md_tun, t);
199 hlist_add_head_rcu(&t->hash_node, head);
202 static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
205 rcu_assign_pointer(itn->collect_md_tun, NULL);
206 hlist_del_init_rcu(&t->hash_node);
209 static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
210 struct ip_tunnel_parm *parms,
213 __be32 remote = parms->iph.daddr;
214 __be32 local = parms->iph.saddr;
215 __be32 key = parms->i_key;
216 __be16 flags = parms->i_flags;
217 int link = parms->link;
218 struct ip_tunnel *t = NULL;
219 struct hlist_head *head = ip_bucket(itn, parms);
221 hlist_for_each_entry_rcu(t, head, hash_node) {
222 if (local == t->parms.iph.saddr &&
223 remote == t->parms.iph.daddr &&
224 link == t->parms.link &&
225 type == t->dev->type &&
226 ip_tunnel_key_match(&t->parms, flags, key))
/* Allocate and register a tunnel net_device.
 *
 * Uses the caller-supplied name if valid, otherwise derives a template
 * from ops->kind (presumably "<kind>%d" — the format line is not visible
 * here). Copies *parms into the new device's private ip_tunnel.
 *
 * NOTE(review): error-path lines (invalid-name / ENOMEM returns, the
 * register failure cleanup and the final return) are missing from this
 * listing; code below is kept byte-identical.
 */
232 static struct net_device *__ip_tunnel_create(struct net *net,
233 const struct rtnl_link_ops *ops,
234 struct ip_tunnel_parm *parms)
237 struct ip_tunnel *tunnel;
238 struct net_device *dev;
242 if (parms->name[0]) {
243 if (!dev_valid_name(parms->name))
245 strscpy(name, parms->name, IFNAMSIZ);
/* -3 leaves room for a "%d" instance suffix in the derived name. */
247 if (strlen(ops->kind) > (IFNAMSIZ - 3))
249 strcpy(name, ops->kind);
254 dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
259 dev_net_set(dev, net);
261 dev->rtnl_link_ops = ops;
263 tunnel = netdev_priv(dev);
264 tunnel->parms = *parms;
267 err = register_netdevice(dev);
/* Bind the tunnel to an underlying output device to size mtu and
 * needed_headroom. Routes toward the configured remote to guess the
 * egress device; falls back to the configured link ifindex. Returns the
 * computed tunnel MTU (clamped to at least IPV4_MIN_MTU — clamp line not
 * visible in this listing).
 *
 * NOTE(review): several lines are missing here (fl4/rt declarations, the
 * route error check, braces); code below is kept byte-identical.
 */
279 static int ip_tunnel_bind_dev(struct net_device *dev)
281 struct net_device *tdev = NULL;
282 struct ip_tunnel *tunnel = netdev_priv(dev);
283 const struct iphdr *iph;
284 int hlen = LL_MAX_HEADER;
285 int mtu = ETH_DATA_LEN;
286 int t_hlen = tunnel->hlen + sizeof(struct iphdr);
288 iph = &tunnel->parms.iph;
290 /* Guess output device to choose reasonable mtu and needed_headroom */
295 ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr,
296 iph->saddr, tunnel->parms.o_key,
297 RT_TOS(iph->tos), dev_net(dev),
298 tunnel->parms.link, tunnel->fwmark, 0, 0);
299 rt = ip_route_output_key(tunnel->net, &fl4);
305 if (dev->type != ARPHRD_ETHER)
306 dev->flags |= IFF_POINTOPOINT;
/* Rebinding invalidates any cached route. */
308 dst_cache_reset(&tunnel->dst_cache);
311 if (!tdev && tunnel->parms.link)
312 tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);
315 hlen = tdev->hard_header_len + tdev->needed_headroom;
316 mtu = min(tdev->mtu, IP_MAX_MTU);
319 dev->needed_headroom = t_hlen + hlen;
/* Ethernet-type tunnels also carry an inner MAC header. */
320 mtu -= t_hlen + (dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0);
322 if (mtu < IPV4_MIN_MTU)
/* Create, register and hash a new tunnel for SIOCADDTUNNEL. Sets MTU
 * bounds (max accounts for the tunnel header, and the MAC header on
 * Ethernet-type devices) and inserts the tunnel into the hash table.
 * Returns the new ip_tunnel or ERR_PTR; on dev_set_mtu() failure the
 * device is unregistered again.
 *
 * NOTE(review): error-label and return lines are missing from this
 * listing; code below is kept byte-identical.
 */
328 static struct ip_tunnel *ip_tunnel_create(struct net *net,
329 struct ip_tunnel_net *itn,
330 struct ip_tunnel_parm *parms)
332 struct ip_tunnel *nt;
333 struct net_device *dev;
338 dev = __ip_tunnel_create(net, itn->rtnl_link_ops, parms);
340 return ERR_CAST(dev);
342 mtu = ip_tunnel_bind_dev(dev);
343 err = dev_set_mtu(dev, mtu);
345 goto err_dev_set_mtu;
347 nt = netdev_priv(dev);
348 t_hlen = nt->hlen + sizeof(struct iphdr);
349 dev->min_mtu = ETH_MIN_MTU;
350 dev->max_mtu = IP_MAX_MTU - t_hlen;
351 if (dev->type == ARPHRD_ETHER)
352 dev->max_mtu -= dev->hard_header_len;
354 ip_tunnel_add(itn, nt);
358 unregister_netdevice(dev);
/* Generic tunnel receive path. Validates checksum/sequence expectations
 * against the tunnel's configured i_flags, resets the network header to
 * the inner packet, performs ECN decapsulation, updates rx stats, scrubs
 * the skb on netns crossing, attaches the metadata dst (if any) and hands
 * the packet to GRO cells. Returns 0 (error/drop paths are in lines
 * missing from this listing).
 */
362 int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
363 const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
366 const struct iphdr *iph = ip_hdr(skb);
369 #ifdef CONFIG_NET_IPGRE_BROADCAST
370 if (ipv4_is_multicast(iph->daddr)) {
371 tunnel->dev->stats.multicast++;
372 skb->pkt_type = PACKET_BROADCAST;
/* Packet and tunnel must agree on whether a checksum is present. */
376 if ((!(tpi->flags&TUNNEL_CSUM) && (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
377 ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
378 tunnel->dev->stats.rx_crc_errors++;
379 tunnel->dev->stats.rx_errors++;
/* Serial-number mode: drop packets that arrive out of order (signed
 * wraparound-safe comparison).
 */
383 if (tunnel->parms.i_flags&TUNNEL_SEQ) {
384 if (!(tpi->flags&TUNNEL_SEQ) ||
385 (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
386 tunnel->dev->stats.rx_fifo_errors++;
387 tunnel->dev->stats.rx_errors++;
390 tunnel->i_seqno = ntohl(tpi->seq) + 1;
393 /* Save offset of outer header relative to skb->head,
394 * because we are going to reset the network header to the inner header
395 * and might change skb->head.
397 nh = skb_network_header(skb) - skb->head;
399 skb_set_network_header(skb, (tunnel->dev->type == ARPHRD_ETHER) ? ETH_HLEN : 0);
401 if (!pskb_inet_may_pull(skb)) {
402 DEV_STATS_INC(tunnel->dev, rx_length_errors);
403 DEV_STATS_INC(tunnel->dev, rx_errors);
/* Recompute iph: pskb_inet_may_pull() may have reallocated skb->head. */
406 iph = (struct iphdr *)(skb->head + nh);
408 err = IP_ECN_decapsulate(iph, skb);
411 net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
412 &iph->saddr, iph->tos);
414 ++tunnel->dev->stats.rx_frame_errors;
415 ++tunnel->dev->stats.rx_errors;
420 dev_sw_netstats_rx_add(tunnel->dev, skb->len);
/* Scrub packet state when crossing a netns boundary. */
421 skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));
423 if (tunnel->dev->type == ARPHRD_ETHER) {
424 skb->protocol = eth_type_trans(skb, tunnel->dev);
425 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
427 skb->dev = tunnel->dev;
431 skb_dst_set(skb, (struct dst_entry *)tun_dst);
433 gro_cells_receive(&tunnel->gro_cells, skb);
/* Error path: release the metadata dst before dropping. */
438 dst_release((struct dst_entry *)tun_dst);
442 EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
444 int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
447 if (num >= MAX_IPTUN_ENCAP_OPS)
450 return !cmpxchg((const struct ip_tunnel_encap_ops **)
454 EXPORT_SYMBOL(ip_tunnel_encap_add_ops);
456 int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
461 if (num >= MAX_IPTUN_ENCAP_OPS)
464 ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
466 ops, NULL) == ops) ? 0 : -1;
472 EXPORT_SYMBOL(ip_tunnel_encap_del_ops);
474 int ip_tunnel_encap_setup(struct ip_tunnel *t,
475 struct ip_tunnel_encap *ipencap)
479 memset(&t->encap, 0, sizeof(t->encap));
481 hlen = ip_encap_hlen(ipencap);
485 t->encap.type = ipencap->type;
486 t->encap.sport = ipencap->sport;
487 t->encap.dport = ipencap->dport;
488 t->encap.flags = ipencap->flags;
490 t->encap_hlen = hlen;
491 t->hlen = t->encap_hlen + t->tun_hlen;
495 EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);
/* Path-MTU handling for the transmit path. Computes the effective tunnel
 * MTU from the outer route (or the cached inner dst / device MTU when DF
 * is not forced), updates the inner dst's PMTU, and emits ICMP
 * FRAG_NEEDED / ICMPv6 PKT_TOOBIG toward the sender when the inner packet
 * will not fit. Non-zero return means "drop" (the oversize checks and
 * returns are in lines missing from this listing).
 */
497 static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
498 struct rtable *rt, __be16 df,
499 const struct iphdr *inner_iph,
500 int tunnel_hlen, __be32 dst, bool md)
502 struct ip_tunnel *tunnel = netdev_priv(dev);
/* In metadata mode the caller supplies the header length; otherwise use
 * the tunnel's cached value.
 */
506 tunnel_hlen = md ? tunnel_hlen : tunnel->hlen;
507 pkt_size = skb->len - tunnel_hlen;
508 pkt_size -= dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0;
511 mtu = dst_mtu(&rt->dst) - (sizeof(struct iphdr) + tunnel_hlen);
512 mtu -= dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0;
514 mtu = skb_valid_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
517 if (skb_valid_dst(skb))
518 skb_dst_update_pmtu_no_confirm(skb, mtu);
520 if (skb->protocol == htons(ETH_P_IP)) {
/* GSO packets are segmented later; only non-GSO DF packets that exceed
 * the MTU trigger ICMP feedback.
 */
521 if (!skb_is_gso(skb) &&
522 (inner_iph->frag_off & htons(IP_DF)) &&
524 icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
528 #if IS_ENABLED(CONFIG_IPV6)
529 else if (skb->protocol == htons(ETH_P_IPV6)) {
530 struct rt6_info *rt6;
533 rt6 = skb_valid_dst(skb) ? (struct rt6_info *)skb_dst(skb) :
535 daddr = md ? dst : tunnel->parms.iph.daddr;
537 if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
538 mtu >= IPV6_MIN_MTU) {
/* Record the reduced MTU on the IPv6 route only for well-defined
 * destinations (unicast outer daddr, or a host route).
 */
539 if ((daddr && !ipv4_is_multicast(daddr)) ||
540 rt6->rt6i_dst.plen == 128) {
541 rt6->rt6i_flags |= RTF_MODIFIED;
542 dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
546 if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
548 icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
556 static void ip_tunnel_adj_headroom(struct net_device *dev, unsigned int headroom)
558 /* we must cap headroom to some upperlimit, else pskb_expand_head
559 * will overflow header offsets in skb_headers_offset_update().
561 static const unsigned int max_allowed = 512;
563 if (headroom > max_allowed)
564 headroom = max_allowed;
566 if (headroom > READ_ONCE(dev->needed_headroom))
567 WRITE_ONCE(dev->needed_headroom, headroom);
/* Transmit path for metadata-collection (external-mode) tunnels: all
 * per-packet parameters (endpoints, key, tos, df) come from the skb's
 * attached tunnel metadata rather than the device configuration. Routes
 * the outer packet (with dst-cache when usable), enforces PMTU, builds
 * ECN/tos/ttl and hands off to iptunnel_xmit().
 *
 * NOTE(review): declaration lines (tos, ttl, df, fl4, use_cache), error
 * labels and several braces are missing from this listing; code below is
 * kept byte-identical.
 */
570 void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
571 u8 proto, int tunnel_hlen)
573 struct ip_tunnel *tunnel = netdev_priv(dev);
574 u32 headroom = sizeof(struct iphdr);
575 struct ip_tunnel_info *tun_info;
576 const struct ip_tunnel_key *key;
577 const struct iphdr *inner_iph;
578 struct rtable *rt = NULL;
/* Require valid IPv4 TX metadata on the skb; otherwise drop. */
584 tun_info = skb_tunnel_info(skb);
585 if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
586 ip_tunnel_info_af(tun_info) != AF_INET))
588 key = &tun_info->key;
589 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
590 inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
/* Inherit tos from the inner packet when configured to do so. */
593 if (skb->protocol == htons(ETH_P_IP))
594 tos = inner_iph->tos;
595 else if (skb->protocol == htons(ETH_P_IPV6))
596 tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
598 ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src,
599 tunnel_id_to_key32(key->tun_id), RT_TOS(tos),
600 dev_net(dev), 0, skb->mark, skb_get_hash(skb),
602 if (tunnel->encap.type != TUNNEL_ENCAP_NONE)
/* Prefer the metadata dst-cache; fall back to a fresh route lookup. */
605 use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
607 rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl4.saddr);
609 rt = ip_route_output_key(tunnel->net, &fl4);
611 dev->stats.tx_carrier_errors++;
615 dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
/* Routing back to ourselves would loop forever. */
618 if (rt->dst.dev == dev) {
620 dev->stats.collisions++;
624 if (key->tun_flags & TUNNEL_DONT_FRAGMENT)
626 if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, tunnel_hlen,
627 key->u.ipv4.dst, true)) {
632 tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
635 if (skb->protocol == htons(ETH_P_IP))
636 ttl = inner_iph->ttl;
637 else if (skb->protocol == htons(ETH_P_IPV6))
638 ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
640 ttl = ip4_dst_hoplimit(&rt->dst);
643 headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
644 if (skb_cow_head(skb, headroom)) {
649 ip_tunnel_adj_headroom(dev, headroom);
651 iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl,
652 df, !net_eq(tunnel->net, dev_net(dev)));
/* Error accounting paths. */
655 dev->stats.tx_errors++;
658 dev->stats.tx_dropped++;
662 EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit);
/* Main transmit path for classically-configured tunnels (tnl_params comes
 * from the device). Resolves the outer destination — from per-skb
 * metadata, the inner IPv4 route's nexthop, or an IPv6 neighbour's
 * IPv4-compatible address — then routes, enforces PMTU, rate-limits after
 * link errors, derives tos/ttl/df and transmits via iptunnel_xmit().
 *
 * NOTE(review): many lines are missing from this listing (tos/ttl/df/dst
 * declarations, NBMA handling, goto labels, braces); code below is kept
 * byte-identical.
 */
664 void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
665 const struct iphdr *tnl_params, u8 protocol)
667 struct ip_tunnel *tunnel = netdev_priv(dev);
668 struct ip_tunnel_info *tun_info = NULL;
669 const struct iphdr *inner_iph;
670 unsigned int max_headroom; /* The extra header space needed */
671 struct rtable *rt = NULL; /* Route to the other host */
672 __be16 payload_protocol;
673 bool use_cache = false;
681 inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
/* "connected" == remote endpoint fixed at configuration time. */
682 connected = (tunnel->parms.iph.daddr != 0);
683 payload_protocol = skb_protocol(skb, true);
685 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
687 dst = tnl_params->daddr;
/* No configured remote (NBMA mode): find the destination per-packet. */
692 dev->stats.tx_fifo_errors++;
696 tun_info = skb_tunnel_info(skb);
697 if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX) &&
698 ip_tunnel_info_af(tun_info) == AF_INET &&
699 tun_info->key.u.ipv4.dst) {
700 dst = tun_info->key.u.ipv4.dst;
703 } else if (payload_protocol == htons(ETH_P_IP)) {
704 rt = skb_rtable(skb);
705 dst = rt_nexthop(rt, inner_iph->daddr);
707 #if IS_ENABLED(CONFIG_IPV6)
708 else if (payload_protocol == htons(ETH_P_IPV6)) {
709 const struct in6_addr *addr6;
710 struct neighbour *neigh;
711 bool do_tx_error_icmp;
714 neigh = dst_neigh_lookup(skb_dst(skb),
715 &ipv6_hdr(skb)->daddr);
/* Use the neighbour's IPv4-compatible IPv6 address as outer dst. */
719 addr6 = (const struct in6_addr *)&neigh->primary_key;
720 addr_type = ipv6_addr_type(addr6);
722 if (addr_type == IPV6_ADDR_ANY) {
723 addr6 = &ipv6_hdr(skb)->daddr;
724 addr_type = ipv6_addr_type(addr6);
727 if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
728 do_tx_error_icmp = true;
730 do_tx_error_icmp = false;
731 dst = addr6->s6_addr32[3];
733 neigh_release(neigh);
734 if (do_tx_error_icmp)
745 tos = tnl_params->tos;
/* tos==1 (inherit) takes the inner packet's DSCP. */
748 if (payload_protocol == htons(ETH_P_IP)) {
749 tos = inner_iph->tos;
751 } else if (payload_protocol == htons(ETH_P_IPV6)) {
752 tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
757 ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr,
758 tunnel->parms.o_key, RT_TOS(tos),
759 dev_net(dev), tunnel->parms.link,
760 tunnel->fwmark, skb_get_hash(skb), 0);
762 if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
/* Route lookup with dst-cache: per-metadata cache when md, per-tunnel
 * cache when the remote is fixed.
 */
765 if (connected && md) {
766 use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
768 rt = dst_cache_get_ip4(&tun_info->dst_cache,
771 rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache,
776 rt = ip_route_output_key(tunnel->net, &fl4);
779 dev->stats.tx_carrier_errors++;
783 dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
785 else if (!md && connected)
786 dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
/* Routing back to ourselves would loop forever. */
790 if (rt->dst.dev == dev) {
792 dev->stats.collisions++;
796 df = tnl_params->frag_off;
797 if (payload_protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
798 df |= (inner_iph->frag_off & htons(IP_DF));
800 if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, 0, 0, false)) {
/* Suppress link-failure feedback for a while after repeated errors. */
805 if (tunnel->err_count > 0) {
806 if (time_before(jiffies,
807 tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
810 dst_link_failure(skb);
812 tunnel->err_count = 0;
815 tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
816 ttl = tnl_params->ttl;
818 if (payload_protocol == htons(ETH_P_IP))
819 ttl = inner_iph->ttl;
820 #if IS_ENABLED(CONFIG_IPV6)
821 else if (payload_protocol == htons(ETH_P_IPV6))
822 ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
825 ttl = ip4_dst_hoplimit(&rt->dst);
828 max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
829 + rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
831 if (skb_cow_head(skb, max_headroom)) {
833 dev->stats.tx_dropped++;
838 ip_tunnel_adj_headroom(dev, max_headroom);
840 iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
841 df, !net_eq(tunnel->net, dev_net(dev)));
/* Error paths: ICMP feedback (IPv6 only) and stats accounting. */
844 #if IS_ENABLED(CONFIG_IPV6)
846 dst_link_failure(skb);
849 dev->stats.tx_errors++;
852 EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
/* Apply new parameters to an existing tunnel (SIOCCHGTUNNEL /
 * changelink). The tunnel is removed from and re-inserted into the hash
 * table because the endpoints/key determine its bucket. Rebinds the
 * device (and apparently updates MTU — the set_mtu line is missing from
 * this listing) when link or fwmark changed, then resets the dst cache
 * and notifies userspace.
 */
854 static void ip_tunnel_update(struct ip_tunnel_net *itn,
856 struct net_device *dev,
857 struct ip_tunnel_parm *p,
861 ip_tunnel_del(itn, t);
862 t->parms.iph.saddr = p->iph.saddr;
863 t->parms.iph.daddr = p->iph.daddr;
864 t->parms.i_key = p->i_key;
865 t->parms.o_key = p->o_key;
866 if (dev->type != ARPHRD_ETHER) {
/* Non-Ethernet tunnels expose the endpoints as dev/broadcast addrs. */
867 __dev_addr_set(dev, &p->iph.saddr, 4);
868 memcpy(dev->broadcast, &p->iph.daddr, 4);
870 ip_tunnel_add(itn, t);
872 t->parms.iph.ttl = p->iph.ttl;
873 t->parms.iph.tos = p->iph.tos;
874 t->parms.iph.frag_off = p->iph.frag_off;
876 if (t->parms.link != p->link || t->fwmark != fwmark) {
879 t->parms.link = p->link;
881 mtu = ip_tunnel_bind_dev(dev);
885 dst_cache_reset(&t->dst_cache);
886 netdev_state_change(dev);
/* Tunnel ioctl backend (SIOCGETTUNNEL / SIOCADDTUNNEL / SIOCCHGTUNNEL /
 * SIOCDELTUNNEL). GET copies current parms out; ADD/CHG require
 * CAP_NET_ADMIN, validate key/flag combinations, create or update the
 * tunnel; DEL unregisters it, refusing to delete the fallback device.
 *
 * NOTE(review): the cmd switch/case lines, error assignments and several
 * returns are missing from this listing; code below is kept
 * byte-identical.
 */
889 int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
892 struct ip_tunnel *t = netdev_priv(dev);
893 struct net *net = t->net;
894 struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);
/* GET: on the fallback device, look up the tunnel described by *p. */
898 if (dev == itn->fb_tunnel_dev) {
899 t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
901 t = netdev_priv(dev);
903 memcpy(p, &t->parms, sizeof(*p));
/* ADD/CHG: privileged; normalize key/DF semantics first. */
909 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
912 p->iph.frag_off |= htons(IP_DF);
913 if (!(p->i_flags & VTI_ISVTI)) {
914 if (!(p->i_flags & TUNNEL_KEY))
916 if (!(p->o_flags & TUNNEL_KEY))
920 t = ip_tunnel_find(itn, p, itn->type);
922 if (cmd == SIOCADDTUNNEL) {
924 t = ip_tunnel_create(net, itn, p);
925 err = PTR_ERR_OR_ZERO(t);
932 if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
939 unsigned int nflags = 0;
941 if (ipv4_is_multicast(p->iph.daddr))
942 nflags = IFF_BROADCAST;
943 else if (p->iph.daddr)
944 nflags = IFF_POINTOPOINT;
/* Cannot toggle BROADCAST/POINTOPOINT on a live device. */
946 if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
951 t = netdev_priv(dev);
957 ip_tunnel_update(itn, t, dev, p, true, 0);
/* DEL: privileged; the fallback device itself may not be deleted. */
965 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
968 if (dev == itn->fb_tunnel_dev) {
970 t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
974 if (t == netdev_priv(itn->fb_tunnel_dev))
978 unregister_netdevice(dev);
989 EXPORT_SYMBOL_GPL(ip_tunnel_ctl);
991 int ip_tunnel_siocdevprivate(struct net_device *dev, struct ifreq *ifr,
992 void __user *data, int cmd)
994 struct ip_tunnel_parm p;
997 if (copy_from_user(&p, data, sizeof(p)))
999 err = dev->netdev_ops->ndo_tunnel_ctl(dev, &p, cmd);
1000 if (!err && copy_to_user(data, &p, sizeof(p)))
1004 EXPORT_SYMBOL_GPL(ip_tunnel_siocdevprivate);
1006 int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
1008 struct ip_tunnel *tunnel = netdev_priv(dev);
1009 int t_hlen = tunnel->hlen + sizeof(struct iphdr);
1010 int max_mtu = IP_MAX_MTU - t_hlen;
1012 if (dev->type == ARPHRD_ETHER)
1013 max_mtu -= dev->hard_header_len;
1015 if (new_mtu < ETH_MIN_MTU)
1018 if (new_mtu > max_mtu) {
1028 EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);
1030 int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1032 return __ip_tunnel_change_mtu(dev, new_mtu, true);
1034 EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
1036 static void ip_tunnel_dev_free(struct net_device *dev)
1038 struct ip_tunnel *tunnel = netdev_priv(dev);
1040 gro_cells_destroy(&tunnel->gro_cells);
1041 dst_cache_destroy(&tunnel->dst_cache);
1042 free_percpu(dev->tstats);
1045 void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
1047 struct ip_tunnel *tunnel = netdev_priv(dev);
1048 struct ip_tunnel_net *itn;
1050 itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);
1052 if (itn->fb_tunnel_dev != dev) {
1053 ip_tunnel_del(itn, netdev_priv(dev));
1054 unregister_netdevice_queue(dev, head);
1057 EXPORT_SYMBOL_GPL(ip_tunnel_dellink);
1059 struct net *ip_tunnel_get_link_net(const struct net_device *dev)
1061 struct ip_tunnel *tunnel = netdev_priv(dev);
1065 EXPORT_SYMBOL(ip_tunnel_get_link_net);
1067 int ip_tunnel_get_iflink(const struct net_device *dev)
1069 struct ip_tunnel *tunnel = netdev_priv(dev);
1071 return tunnel->parms.link;
1073 EXPORT_SYMBOL(ip_tunnel_get_iflink);
/* Per-netns initialization for a tunnel type: set up the hash table and,
 * when fallback tunnels are enabled for this netns, create the special
 * fallback device (one per netns, pinned to it via NETIF_F_NETNS_LOCAL).
 * When fallback creation is skipped, the device type is inherited from
 * init_net's instance. Returns 0 or the fallback-creation error.
 *
 * NOTE(review): the early-return after the no-fallback branch and the
 * rtnl lock/unlock lines are missing from this listing; code below is
 * kept byte-identical.
 */
1075 int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
1076 struct rtnl_link_ops *ops, char *devname)
1078 struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
1079 struct ip_tunnel_parm parms;
1082 itn->rtnl_link_ops = ops;
1083 for (i = 0; i < IP_TNL_HASH_SIZE; i++)
1084 INIT_HLIST_HEAD(&itn->tunnels[i]);
1086 if (!ops || !net_has_fallback_tunnels(net)) {
1087 struct ip_tunnel_net *it_init_net;
/* No fallback device here: mirror init_net's device type. */
1089 it_init_net = net_generic(&init_net, ip_tnl_net_id);
1090 itn->type = it_init_net->type;
1091 itn->fb_tunnel_dev = NULL;
/* Fallback device: all-zero parms except the requested name. */
1095 memset(&parms, 0, sizeof(parms));
1097 strscpy(parms.name, devname, IFNAMSIZ);
1100 itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
1101 /* FB netdevice is special: we have one, and only one per netns.
1102 * Allowing to move it to another netns is clearly unsafe.
1104 if (!IS_ERR(itn->fb_tunnel_dev)) {
1105 itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
1106 itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
1107 ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
1108 itn->type = itn->fb_tunnel_dev->type;
1112 return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
1114 EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
/* Queue every tunnel device of this netns for unregistration during netns
 * teardown: first all devices created through @ops, then any hashed
 * tunnels that live in a different netns (cross-namespace tunnels not
 * caught by the first loop).
 */
1116 static void ip_tunnel_destroy(struct net *net, struct ip_tunnel_net *itn,
1117 struct list_head *head,
1118 struct rtnl_link_ops *ops)
1120 struct net_device *dev, *aux;
1123 for_each_netdev_safe(net, dev, aux)
1124 if (dev->rtnl_link_ops == ops)
1125 unregister_netdevice_queue(dev, head);
1127 for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
1128 struct ip_tunnel *t;
1129 struct hlist_node *n;
1130 struct hlist_head *thead = &itn->tunnels[h];
1132 hlist_for_each_entry_safe(t, n, thead, hash_node)
1133 /* If dev is in the same netns, it has already
1134 * been added to the list by the previous loop.
1136 if (!net_eq(dev_net(t->dev), net))
1137 unregister_netdevice_queue(t->dev, head);
1141 void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id,
1142 struct rtnl_link_ops *ops)
1144 struct ip_tunnel_net *itn;
1149 list_for_each_entry(net, net_list, exit_list) {
1150 itn = net_generic(net, id);
1151 ip_tunnel_destroy(net, itn, &list, ops);
1153 unregister_netdevice_many(&list);
1156 EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets);
/* rtnl newlink handler: validate uniqueness (only one collect_md tunnel
 * per netns; no duplicate classic tunnel with the same parms/type),
 * register the device, pick a random MAC for Ethernet-type tunnels when
 * none was given, bind to the lower device and set the MTU (clamped to
 * the tunnel maximum unless userspace supplied IFLA_MTU — the branch
 * lines are missing from this listing), then hash the tunnel.
 */
1158 int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
1159 struct ip_tunnel_parm *p, __u32 fwmark)
1161 struct ip_tunnel *nt;
1162 struct net *net = dev_net(dev);
1163 struct ip_tunnel_net *itn;
1167 nt = netdev_priv(dev);
1168 itn = net_generic(net, nt->ip_tnl_net_id);
/* At most one metadata-collection tunnel per netns. */
1170 if (nt->collect_md) {
1171 if (rtnl_dereference(itn->collect_md_tun))
1174 if (ip_tunnel_find(itn, p, dev->type))
1180 nt->fwmark = fwmark;
1181 err = register_netdevice(dev);
1183 goto err_register_netdevice;
1185 if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1186 eth_hw_addr_random(dev);
1188 mtu = ip_tunnel_bind_dev(dev);
1190 unsigned int max = IP_MAX_MTU - (nt->hlen + sizeof(struct iphdr));
1192 if (dev->type == ARPHRD_ETHER)
1193 max -= dev->hard_header_len;
1195 mtu = clamp(dev->mtu, (unsigned int)ETH_MIN_MTU, max);
1198 err = dev_set_mtu(dev, mtu);
1200 goto err_dev_set_mtu;
1202 ip_tunnel_add(itn, nt);
/* Unwind: a failed MTU change must also unregister the device. */
1206 unregister_netdevice(dev);
1207 err_register_netdevice:
1210 EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
/* rtnl changelink handler: reject changes on the fallback device, refuse
 * parms that would collide with a different existing tunnel, and refuse
 * flipping BROADCAST/POINTOPOINT on a live non-Ethernet device; otherwise
 * apply via ip_tunnel_update() (MTU is rebound only when userspace did
 * not pin it with IFLA_MTU).
 *
 * NOTE(review): the duplicate-tunnel/-EEXIST handling lines between the
 * find and the type check are missing from this listing; code below is
 * kept byte-identical.
 */
1212 int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
1213 struct ip_tunnel_parm *p, __u32 fwmark)
1215 struct ip_tunnel *t;
1216 struct ip_tunnel *tunnel = netdev_priv(dev);
1217 struct net *net = tunnel->net;
1218 struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);
1220 if (dev == itn->fb_tunnel_dev)
1223 t = ip_tunnel_find(itn, p, dev->type);
1231 if (dev->type != ARPHRD_ETHER) {
1232 unsigned int nflags = 0;
1234 if (ipv4_is_multicast(p->iph.daddr))
1235 nflags = IFF_BROADCAST;
1236 else if (p->iph.daddr)
1237 nflags = IFF_POINTOPOINT;
1239 if ((dev->flags ^ nflags) &
1240 (IFF_POINTOPOINT | IFF_BROADCAST))
1245 ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU], fwmark);
1248 EXPORT_SYMBOL_GPL(ip_tunnel_changelink);
/* ndo_init: allocate per-tunnel resources — per-CPU stats, dst cache and
 * GRO cells — unwinding earlier allocations on failure. Records the
 * device netns and name in the tunnel parms; metadata-collection tunnels
 * keep their dst attached across xmit.
 *
 * NOTE(review): the -ENOMEM check after the tstats allocation and the
 * error returns are in lines missing from this listing; code below is
 * kept byte-identical.
 */
1250 int ip_tunnel_init(struct net_device *dev)
1252 struct ip_tunnel *tunnel = netdev_priv(dev);
1253 struct iphdr *iph = &tunnel->parms.iph;
1256 dev->needs_free_netdev = true;
1257 dev->priv_destructor = ip_tunnel_dev_free;
1258 dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
1262 err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
1264 free_percpu(dev->tstats);
1268 err = gro_cells_init(&tunnel->gro_cells, dev);
1270 dst_cache_destroy(&tunnel->dst_cache);
1271 free_percpu(dev->tstats);
1276 tunnel->net = dev_net(dev);
1277 strcpy(tunnel->parms.name, dev->name);
1281 if (tunnel->collect_md)
1282 netif_keep_dst(dev);
1285 EXPORT_SYMBOL_GPL(ip_tunnel_init);
1287 void ip_tunnel_uninit(struct net_device *dev)
1289 struct ip_tunnel *tunnel = netdev_priv(dev);
1290 struct net *net = tunnel->net;
1291 struct ip_tunnel_net *itn;
1293 itn = net_generic(net, tunnel->ip_tnl_net_id);
1294 ip_tunnel_del(itn, netdev_priv(dev));
1295 if (itn->fb_tunnel_dev == dev)
1296 WRITE_ONCE(itn->fb_tunnel_dev, NULL);
1298 dst_cache_reset(&tunnel->dst_cache);
1300 EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
1302 /* Do least required initialization, rest of init is done in tunnel_init call */
1303 void ip_tunnel_setup(struct net_device *dev, unsigned int net_id)
1305 struct ip_tunnel *tunnel = netdev_priv(dev);
1306 tunnel->ip_tnl_net_id = net_id;
1308 EXPORT_SYMBOL_GPL(ip_tunnel_setup);
1310 MODULE_LICENSE("GPL");