GNU Linux-libre 4.19.245-gnu1
[releases.git] / net / ipv4 / ip_tunnel.c
1 /*
2  * Copyright (c) 2013 Nicira, Inc.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of version 2 of the GNU General Public
6  * License as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful, but
9  * WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public License
14  * along with this program; if not, write to the Free Software
15  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16  * 02110-1301, USA
17  */
18
19 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
20
21 #include <linux/capability.h>
22 #include <linux/module.h>
23 #include <linux/types.h>
24 #include <linux/kernel.h>
25 #include <linux/slab.h>
26 #include <linux/uaccess.h>
27 #include <linux/skbuff.h>
28 #include <linux/netdevice.h>
29 #include <linux/in.h>
30 #include <linux/tcp.h>
31 #include <linux/udp.h>
32 #include <linux/if_arp.h>
33 #include <linux/init.h>
34 #include <linux/in6.h>
35 #include <linux/inetdevice.h>
36 #include <linux/igmp.h>
37 #include <linux/netfilter_ipv4.h>
38 #include <linux/etherdevice.h>
39 #include <linux/if_ether.h>
40 #include <linux/if_vlan.h>
41 #include <linux/rculist.h>
42 #include <linux/err.h>
43
44 #include <net/sock.h>
45 #include <net/ip.h>
46 #include <net/icmp.h>
47 #include <net/protocol.h>
48 #include <net/ip_tunnels.h>
49 #include <net/arp.h>
50 #include <net/checksum.h>
51 #include <net/dsfield.h>
52 #include <net/inet_ecn.h>
53 #include <net/xfrm.h>
54 #include <net/net_namespace.h>
55 #include <net/netns/generic.h>
56 #include <net/rtnetlink.h>
57 #include <net/udp.h>
58 #include <net/dst_metadata.h>
59
60 #if IS_ENABLED(CONFIG_IPV6)
61 #include <net/ipv6.h>
62 #include <net/ip6_fib.h>
63 #include <net/ip6_route.h>
64 #endif
65
66 static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
67 {
68         return hash_32((__force u32)key ^ (__force u32)remote,
69                          IP_TNL_HASH_BITS);
70 }
71
72 static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
73                                 __be16 flags, __be32 key)
74 {
75         if (p->i_flags & TUNNEL_KEY) {
76                 if (flags & TUNNEL_KEY)
77                         return key == p->i_key;
78                 else
79                         /* key expected, none present */
80                         return false;
81         } else
82                 return !(flags & TUNNEL_KEY);
83 }
84
/* Fallback tunnel: no source, no destination, no key, no options

   Tunnel hash table:
   We require an exact key match, i.e. if a key is present in the packet
   it will match only a tunnel with the same key; if it is not present,
   it will match only a keyless tunnel.

   All keyless packets, if not matched against a configured keyless
   tunnel, will match the fallback tunnel.
   Given src, dst and key, find the appropriate tunnel for input.
*/
/* Find the tunnel device that should receive a packet with the given
 * (remote, local, key) triple arriving on interface index @link.
 *
 * Matching proceeds from most specific to least specific:
 *   1) both endpoints match exactly,
 *   2) remote matches a tunnel with a wildcard source,
 *   3) local matches the tunnel source (or a multicast destination),
 *   4) fully wildcarded tunnels, matched on key alone.
 * Within each pass an exact link match wins immediately; otherwise the
 * first tunnel matching everything but the link is remembered in @cand.
 * If nothing matches, fall back to the collect_md tunnel and finally to
 * the per-netns fallback device.  Runs under RCU; may return NULL.
 */
struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
				   int link, __be16 flags,
				   __be32 remote, __be32 local,
				   __be32 key)
{
	struct ip_tunnel *t, *cand = NULL;
	struct hlist_head *head;
	struct net_device *ndev;
	unsigned int hash;

	hash = ip_tunnel_hash(key, remote);
	head = &itn->tunnels[hash];

	/* Pass 1: both endpoints match exactly. */
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local != t->parms.iph.saddr ||
		    remote != t->parms.iph.daddr ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else
			cand = t;
	}

	/* Pass 2: remote matches, tunnel source is wildcarded. */
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (remote != t->parms.iph.daddr ||
		    t->parms.iph.saddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	/* Passes 3 and 4 hash with a wildcard remote address. */
	hash = ip_tunnel_hash(key, 0);
	head = &itn->tunnels[hash];

	/* Pass 3: local matches the tunnel source and the destination is
	 * wildcarded, or the packet was addressed to a multicast group
	 * configured as the tunnel destination.
	 */
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
		    (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
			continue;

		if (!(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	/* Pass 4: fully wildcarded tunnels, matched on key alone. */
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((!(flags & TUNNEL_NO_KEY) && t->parms.i_key != key) ||
		    t->parms.iph.saddr != 0 ||
		    t->parms.iph.daddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	if (cand)
		return cand;

	/* No addressed tunnel matched: try a metadata-collecting tunnel. */
	t = rcu_dereference(itn->collect_md_tun);
	if (t && t->dev->flags & IFF_UP)
		return t;

	/* READ_ONCE: fb_tunnel_dev may be cleared concurrently (e.g. on
	 * netns teardown), so read it exactly once before testing/using.
	 */
	ndev = READ_ONCE(itn->fb_tunnel_dev);
	if (ndev && ndev->flags & IFF_UP)
		return netdev_priv(ndev);

	return NULL;
}
EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
186
187 static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
188                                     struct ip_tunnel_parm *parms)
189 {
190         unsigned int h;
191         __be32 remote;
192         __be32 i_key = parms->i_key;
193
194         if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
195                 remote = parms->iph.daddr;
196         else
197                 remote = 0;
198
199         if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
200                 i_key = 0;
201
202         h = ip_tunnel_hash(i_key, remote);
203         return &itn->tunnels[h];
204 }
205
/* Insert tunnel @t into the per-netns hash table; additionally publish
 * it as the netns's single collect_md tunnel when it runs in metadata
 * mode.
 */
static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	struct hlist_head *head = ip_bucket(itn, &t->parms);

	if (t->collect_md)
		rcu_assign_pointer(itn->collect_md_tun, t);
	hlist_add_head_rcu(&t->hash_node, head);
}
214
/* Remove tunnel @t from the hash table, clearing the collect_md pointer
 * first if @t was published there.
 */
static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	if (t->collect_md)
		rcu_assign_pointer(itn->collect_md_tun, NULL);
	hlist_del_init_rcu(&t->hash_node);
}
221
/* Look up a configured tunnel whose endpoints, link, key policy and
 * device type all match @parms exactly.  Unlike ip_tunnel_lookup(),
 * no wildcard or fallback matching is done.  Returns NULL when no such
 * tunnel exists.
 */
static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
					struct ip_tunnel_parm *parms,
					int type)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	__be32 key = parms->i_key;
	__be16 flags = parms->i_flags;
	int link = parms->link;
	struct ip_tunnel *t = NULL;
	struct hlist_head *head = ip_bucket(itn, parms);

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local == t->parms.iph.saddr &&
		    remote == t->parms.iph.daddr &&
		    link == t->parms.link &&
		    type == t->dev->type &&
		    ip_tunnel_key_match(&t->parms, flags, key))
			break;
	}
	return t;
}
244
/* Allocate and register a new tunnel net_device in @net.
 *
 * When parms->name is set it must be a valid interface name; otherwise
 * a "<kind>%d" template is derived from the rtnl ops.  Both name
 * failures report -E2BIG.  Returns the registered device with its
 * ip_tunnel priv initialized from @parms, or an ERR_PTR on failure.
 * Must be called with RTNL held (asserted below).
 */
static struct net_device *__ip_tunnel_create(struct net *net,
					     const struct rtnl_link_ops *ops,
					     struct ip_tunnel_parm *parms)
{
	int err;
	struct ip_tunnel *tunnel;
	struct net_device *dev;
	char name[IFNAMSIZ];

	err = -E2BIG;
	if (parms->name[0]) {
		if (!dev_valid_name(parms->name))
			goto failed;
		strlcpy(name, parms->name, IFNAMSIZ);
	} else {
		/* Leave room for the "%d" suffix and the NUL. */
		if (strlen(ops->kind) > (IFNAMSIZ - 3))
			goto failed;
		strcpy(name, ops->kind);
		strcat(name, "%d");
	}

	ASSERT_RTNL();
	dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
	if (!dev) {
		err = -ENOMEM;
		goto failed;
	}
	dev_net_set(dev, net);

	dev->rtnl_link_ops = ops;

	tunnel = netdev_priv(dev);
	tunnel->parms = *parms;
	tunnel->net = net;

	err = register_netdevice(dev);
	if (err)
		goto failed_free;

	return dev;

failed_free:
	free_netdev(dev);
failed:
	return ERR_PTR(err);
}
291
/* Guess the underlying output device for tunnel @dev and derive the
 * needed headroom and a suitable MTU from it.
 *
 * For tunnels with a fixed destination, the route towards daddr picks
 * the lower device; otherwise parms.link is tried if set.  Returns the
 * MTU to use, reduced by the tunnel header size and clamped to at least
 * IPV4_MIN_MTU.
 */
static int ip_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);

	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and needed_headroom */
	if (iph->daddr) {
		struct flowi4 fl4;
		struct rtable *rt;

		ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr,
				    iph->saddr, tunnel->parms.o_key,
				    RT_TOS(iph->tos), tunnel->parms.link,
				    tunnel->fwmark);
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (!IS_ERR(rt)) {
			tdev = rt->dst.dev;
			ip_rt_put(rt);
		}
		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;

		/* Endpoint changed; cached routes are stale. */
		dst_cache_reset(&tunnel->dst_cache);
	}

	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = min(tdev->mtu, IP_MAX_MTU);
	}

	dev->needed_headroom = t_hlen + hlen;
	/* Ethernet-type tunnels also carry an inner MAC header. */
	mtu -= t_hlen + (dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0);

	if (mtu < IPV4_MIN_MTU)
		mtu = IPV4_MIN_MTU;

	return mtu;
}
340
/* Create, register, MTU-configure and hash a new tunnel built from
 * @parms.  Device MTU bounds are derived from the tunnel header size.
 * Returns the new tunnel or an ERR_PTR; on dev_set_mtu() failure the
 * just-registered device is unregistered again.
 */
static struct ip_tunnel *ip_tunnel_create(struct net *net,
					  struct ip_tunnel_net *itn,
					  struct ip_tunnel_parm *parms)
{
	struct ip_tunnel *nt;
	struct net_device *dev;
	int t_hlen;
	int mtu;
	int err;

	dev = __ip_tunnel_create(net, itn->rtnl_link_ops, parms);
	if (IS_ERR(dev))
		return ERR_CAST(dev);

	mtu = ip_tunnel_bind_dev(dev);
	err = dev_set_mtu(dev, mtu);
	if (err)
		goto err_dev_set_mtu;

	nt = netdev_priv(dev);
	t_hlen = nt->hlen + sizeof(struct iphdr);
	dev->min_mtu = ETH_MIN_MTU;
	dev->max_mtu = IP_MAX_MTU - t_hlen;
	if (dev->type == ARPHRD_ETHER)
		dev->max_mtu -= dev->hard_header_len;

	ip_tunnel_add(itn, nt);
	return nt;

err_dev_set_mtu:
	unregister_netdevice(dev);
	return ERR_PTR(err);
}
374
/* Generic receive path for decapsulated IP tunnel packets.
 *
 * Validates checksum/sequence-number flags against the tunnel's
 * configuration, decapsulates ECN, updates per-CPU rx stats, and hands
 * the skb to the tunnel device via GRO cells.  Consumes @skb (and
 * releases @tun_dst on the drop path); always returns 0.
 */
int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
		  const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
		  bool log_ecn_error)
{
	struct pcpu_sw_netstats *tstats;
	const struct iphdr *iph = ip_hdr(skb);
	int err;

#ifdef CONFIG_NET_IPGRE_BROADCAST
	if (ipv4_is_multicast(iph->daddr)) {
		tunnel->dev->stats.multicast++;
		skb->pkt_type = PACKET_BROADCAST;
	}
#endif

	/* Drop if the packet and the tunnel disagree on whether a
	 * checksum is present.
	 */
	if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
	     ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
		tunnel->dev->stats.rx_crc_errors++;
		tunnel->dev->stats.rx_errors++;
		goto drop;
	}

	if (tunnel->parms.i_flags&TUNNEL_SEQ) {
		/* Enforce in-order sequence numbers (signed wraparound
		 * comparison); out-of-order packets count as fifo errors.
		 */
		if (!(tpi->flags&TUNNEL_SEQ) ||
		    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
			tunnel->dev->stats.rx_fifo_errors++;
			tunnel->dev->stats.rx_errors++;
			goto drop;
		}
		tunnel->i_seqno = ntohl(tpi->seq) + 1;
	}

	skb_reset_network_header(skb);

	/* Fold outer ECN bits into the inner header; err > 1 means the
	 * combination is invalid and the packet must be dropped.
	 */
	err = IP_ECN_decapsulate(iph, skb);
	if (unlikely(err)) {
		if (log_ecn_error)
			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
					&iph->saddr, iph->tos);
		if (err > 1) {
			++tunnel->dev->stats.rx_frame_errors;
			++tunnel->dev->stats.rx_errors;
			goto drop;
		}
	}

	tstats = this_cpu_ptr(tunnel->dev->tstats);
	u64_stats_update_begin(&tstats->syncp);
	tstats->rx_packets++;
	tstats->rx_bytes += skb->len;
	u64_stats_update_end(&tstats->syncp);

	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));

	if (tunnel->dev->type == ARPHRD_ETHER) {
		skb->protocol = eth_type_trans(skb, tunnel->dev);
		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
	} else {
		skb->dev = tunnel->dev;
	}

	if (tun_dst)
		skb_dst_set(skb, (struct dst_entry *)tun_dst);

	gro_cells_receive(&tunnel->gro_cells, skb);
	return 0;

drop:
	if (tun_dst)
		dst_release((struct dst_entry *)tun_dst);
	kfree_skb(skb);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
449
/* Register @ops as encapsulation handler number @num.  Returns 0 on
 * success, -ERANGE for an out-of-range slot, or -1 when the slot is
 * already occupied (the cmpxchg only succeeds against a NULL slot).
 */
int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
			    unsigned int num)
{
	if (num >= MAX_IPTUN_ENCAP_OPS)
		return -ERANGE;

	return !cmpxchg((const struct ip_tunnel_encap_ops **)
			&iptun_encaps[num],
			NULL, ops) ? 0 : -1;
}
EXPORT_SYMBOL(ip_tunnel_encap_add_ops);
461
/* Unregister @ops from encapsulation slot @num.  Returns -ERANGE for an
 * out-of-range slot and -1 when the slot holds a different handler.
 * synchronize_net() ensures no concurrent reader still uses @ops once
 * this returns.
 */
int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
			    unsigned int num)
{
	int ret;

	if (num >= MAX_IPTUN_ENCAP_OPS)
		return -ERANGE;

	ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
		       &iptun_encaps[num],
		       ops, NULL) == ops) ? 0 : -1;

	synchronize_net();

	return ret;
}
EXPORT_SYMBOL(ip_tunnel_encap_del_ops);
479
480 int ip_tunnel_encap_setup(struct ip_tunnel *t,
481                           struct ip_tunnel_encap *ipencap)
482 {
483         int hlen;
484
485         memset(&t->encap, 0, sizeof(t->encap));
486
487         hlen = ip_encap_hlen(ipencap);
488         if (hlen < 0)
489                 return hlen;
490
491         t->encap.type = ipencap->type;
492         t->encap.sport = ipencap->sport;
493         t->encap.dport = ipencap->dport;
494         t->encap.flags = ipencap->flags;
495
496         t->encap_hlen = hlen;
497         t->hlen = t->encap_hlen + t->tun_hlen;
498
499         return 0;
500 }
501 EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);
502
/* Check @skb against the path MTU of the tunnel route and propagate the
 * reduced MTU to the inner dst.
 *
 * When a non-GSO packet exceeds the MTU and must not be fragmented,
 * sends ICMP "fragmentation needed" (IPv4) or ICMPv6 "packet too big"
 * and returns -E2BIG; otherwise returns 0.
 */
static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
			    struct rtable *rt, __be16 df,
			    const struct iphdr *inner_iph)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int pkt_size;
	int mtu;

	/* Inner payload size, excluding tunnel (and inner MAC) headers. */
	pkt_size = skb->len - tunnel->hlen;
	pkt_size -= dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0;

	if (df) {
		mtu = dst_mtu(&rt->dst) - (sizeof(struct iphdr) + tunnel->hlen);
		mtu -= dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0;
	} else {
		mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
	}

	skb_dst_update_pmtu_no_confirm(skb, mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		if (!skb_is_gso(skb) &&
		    (inner_iph->frag_off & htons(IP_DF)) &&
		    mtu < pkt_size) {
			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			return -E2BIG;
		}
	}
#if IS_ENABLED(CONFIG_IPV6)
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);

		/* Record the reduced MTU on host routes (or when the
		 * tunnel has a fixed unicast destination).
		 */
		if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
			   mtu >= IPV6_MIN_MTU) {
			if ((tunnel->parms.iph.daddr &&
			    !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
			}
		}

		if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
					mtu < pkt_size) {
			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			return -E2BIG;
		}
	}
#endif
	return 0;
}
555
/* Transmit path for metadata-based (collect_md) tunnels: the outer
 * header parameters come from the skb's attached tunnel info rather
 * than from the device configuration.  Consumes @skb.
 */
void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, u8 proto)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	u32 headroom = sizeof(struct iphdr);
	struct ip_tunnel_info *tun_info;
	const struct ip_tunnel_key *key;
	const struct iphdr *inner_iph;
	struct rtable *rt;
	struct flowi4 fl4;
	__be16 df = 0;
	u8 tos, ttl;

	tun_info = skb_tunnel_info(skb);
	if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
		     ip_tunnel_info_af(tun_info) != AF_INET))
		goto tx_error;
	key = &tun_info->key;
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	tos = key->tos;
	if (tos == 1) {
		/* tos == 1 means "inherit" from the inner header. */
		if (skb->protocol == htons(ETH_P_IP))
			tos = inner_iph->tos;
		else if (skb->protocol == htons(ETH_P_IPV6))
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
	}
	ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src,
			    tunnel_id_to_key32(key->tun_id), RT_TOS(tos),
			    0, skb->mark);
	/* Extra encapsulation is not supported on the md path. */
	if (tunnel->encap.type != TUNNEL_ENCAP_NONE)
		goto tx_error;
	rt = ip_route_output_key(tunnel->net, &fl4);
	if (IS_ERR(rt)) {
		dev->stats.tx_carrier_errors++;
		goto tx_error;
	}
	/* A route back through the tunnel device would loop forever. */
	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}
	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = key->ttl;
	if (ttl == 0) {
		/* ttl == 0 means "inherit" from the inner header. */
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}
	if (key->tun_flags & TUNNEL_DONT_FRAGMENT)
		df = htons(IP_DF);
	else if (skb->protocol == htons(ETH_P_IP))
		df = inner_iph->frag_off & htons(IP_DF);
	headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
	if (headroom > dev->needed_headroom)
		dev->needed_headroom = headroom;

	if (skb_cow_head(skb, dev->needed_headroom)) {
		ip_rt_put(rt);
		goto tx_dropped;
	}
	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl,
		      df, !net_eq(tunnel->net, dev_net(dev)));
	return;
tx_error:
	dev->stats.tx_errors++;
	goto kfree;
tx_dropped:
	dev->stats.tx_dropped++;
kfree:
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit);
631
/* Transmit path for configured (non-metadata) tunnels.
 *
 * Resolves the outer destination (including the NBMA case where the
 * tunnel has no fixed daddr), builds the outer flow, routes it (with a
 * dst-cache fast path for connected tunnels), performs PMTU handling,
 * and emits the encapsulated packet.  Consumes @skb.
 */
void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
		    const struct iphdr *tnl_params, u8 protocol)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *inner_iph;
	struct flowi4 fl4;
	u8     tos, ttl;
	__be16 df;
	struct rtable *rt;		/* Route to the other host */
	unsigned int max_headroom;	/* The extra header space needed */
	__be32 dst;
	bool connected;

	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	/* "connected" tunnels have a fixed remote and may use the
	 * per-tunnel dst cache.
	 */
	connected = (tunnel->parms.iph.daddr != 0);

	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));

	dst = tnl_params->daddr;
	if (dst == 0) {
		/* NBMA tunnel: derive the outer destination per packet,
		 * from tunnel metadata, the inner route's next hop, or a
		 * v4-compatible IPv6 neighbour address.
		 */
		struct ip_tunnel_info *tun_info;

		if (!skb_dst(skb)) {
			dev->stats.tx_fifo_errors++;
			goto tx_error;
		}

		tun_info = skb_tunnel_info(skb);
		if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX) &&
		    ip_tunnel_info_af(tun_info) == AF_INET &&
		    tun_info->key.u.ipv4.dst)
			dst = tun_info->key.u.ipv4.dst;
		else if (skb->protocol == htons(ETH_P_IP)) {
			rt = skb_rtable(skb);
			dst = rt_nexthop(rt, inner_iph->daddr);
		}
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6)) {
			const struct in6_addr *addr6;
			struct neighbour *neigh;
			bool do_tx_error_icmp;
			int addr_type;

			neigh = dst_neigh_lookup(skb_dst(skb),
						 &ipv6_hdr(skb)->daddr);
			if (!neigh)
				goto tx_error;

			addr6 = (const struct in6_addr *)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			/* Only v4-compatible IPv6 addresses embed an
			 * IPv4 destination we can tunnel to.
			 */
			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				do_tx_error_icmp = true;
			else {
				do_tx_error_icmp = false;
				dst = addr6->s6_addr32[3];
			}
			neigh_release(neigh);
			if (do_tx_error_icmp)
				goto tx_error_icmp;
		}
#endif
		else
			goto tx_error;

		connected = false;
	}

	tos = tnl_params->tos;
	if (tos & 0x1) {
		/* Low bit set means "inherit" TOS from the inner packet;
		 * per-packet TOS disables the dst cache.
		 */
		tos &= ~0x1;
		if (skb->protocol == htons(ETH_P_IP)) {
			tos = inner_iph->tos;
			connected = false;
		} else if (skb->protocol == htons(ETH_P_IPV6)) {
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
			connected = false;
		}
	}

	ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr,
			    tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link,
			    tunnel->fwmark);

	if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
		goto tx_error;

	/* Connected tunnels may reuse the cached route. */
	rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache, &fl4.saddr) :
			 NULL;

	if (!rt) {
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (IS_ERR(rt)) {
			dev->stats.tx_carrier_errors++;
			goto tx_error;
		}
		if (connected)
			dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
					  fl4.saddr);
	}

	/* A route back through the tunnel device would loop forever. */
	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}

	df = tnl_params->frag_off;
	if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
		df |= (inner_iph->frag_off & htons(IP_DF));

	if (tnl_update_pmtu(dev, skb, rt, df, inner_iph)) {
		ip_rt_put(rt);
		goto tx_error;
	}

	/* Rate-limit link failure notifications after ICMP errors. */
	if (tunnel->err_count > 0) {
		if (time_before(jiffies,
				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
			tunnel->err_count--;

			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = tnl_params->ttl;
	if (ttl == 0) {
		/* ttl == 0 means "inherit" from the inner header. */
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
#endif
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}

	max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
			+ rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
	if (max_headroom > dev->needed_headroom)
		dev->needed_headroom = max_headroom;

	if (skb_cow_head(skb, dev->needed_headroom)) {
		ip_rt_put(rt);
		dev->stats.tx_dropped++;
		kfree_skb(skb);
		return;
	}

	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
		      df, !net_eq(tunnel->net, dev_net(dev)));
	return;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
	dst_link_failure(skb);
#endif
tx_error:
	dev->stats.tx_errors++;
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
803
/* Apply new parameters @p to existing tunnel @t: rehash it under the
 * new endpoints/keys, refresh the device hardware/broadcast addresses,
 * and rebind the lower device when the link or fwmark changed.  Resets
 * the cached route and notifies listeners of the device change.
 */
static void ip_tunnel_update(struct ip_tunnel_net *itn,
			     struct ip_tunnel *t,
			     struct net_device *dev,
			     struct ip_tunnel_parm *p,
			     bool set_mtu,
			     __u32 fwmark)
{
	/* Endpoints/keys feed the hash, so remove and re-add the tunnel
	 * around the update.
	 */
	ip_tunnel_del(itn, t);
	t->parms.iph.saddr = p->iph.saddr;
	t->parms.iph.daddr = p->iph.daddr;
	t->parms.i_key = p->i_key;
	t->parms.o_key = p->o_key;
	if (dev->type != ARPHRD_ETHER) {
		memcpy(dev->dev_addr, &p->iph.saddr, 4);
		memcpy(dev->broadcast, &p->iph.daddr, 4);
	}
	ip_tunnel_add(itn, t);

	t->parms.iph.ttl = p->iph.ttl;
	t->parms.iph.tos = p->iph.tos;
	t->parms.iph.frag_off = p->iph.frag_off;

	if (t->parms.link != p->link || t->fwmark != fwmark) {
		int mtu;

		t->parms.link = p->link;
		t->fwmark = fwmark;
		mtu = ip_tunnel_bind_dev(dev);
		if (set_mtu)
			dev->mtu = mtu;
	}
	dst_cache_reset(&t->dst_cache);
	netdev_state_change(dev);
}
838
/* ip_tunnel_ioctl - handle the legacy SIOC{GET,ADD,CHG,DEL}TUNNEL ioctls.
 * @dev: device the ioctl was issued on (may be the per-netns fallback device)
 * @p:   tunnel parameters copied in from user space
 * @cmd: SIOCGETTUNNEL, SIOCADDTUNNEL, SIOCCHGTUNNEL or SIOCDELTUNNEL
 *
 * Returns 0 on success or a negative errno.
 */
int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
{
	int err = 0;
	struct ip_tunnel *t = netdev_priv(dev);
	struct net *net = t->net;
	struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);

	switch (cmd) {
	case SIOCGETTUNNEL:
		if (dev == itn->fb_tunnel_dev) {
			/* On the fallback device look up the tunnel matching
			 * the requested parameters; fall back to the device's
			 * own parms when no match exists.
			 */
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				t = netdev_priv(dev);
		}
		memcpy(p, &t->parms, sizeof(*p));
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;
		/* A fixed outer TTL implies the DF bit in the outer header. */
		if (p->iph.ttl)
			p->iph.frag_off |= htons(IP_DF);
		/* Outside of VTI, keys are only meaningful when the matching
		 * TUNNEL_KEY flag is set; clear stale key values otherwise.
		 */
		if (!(p->i_flags & VTI_ISVTI)) {
			if (!(p->i_flags & TUNNEL_KEY))
				p->i_key = 0;
			if (!(p->o_flags & TUNNEL_KEY))
				p->o_key = 0;
		}

		t = ip_tunnel_find(itn, p, itn->type);

		if (cmd == SIOCADDTUNNEL) {
			if (!t) {
				t = ip_tunnel_create(net, itn, p);
				err = PTR_ERR_OR_ZERO(t);
				break;
			}

			/* Adding a tunnel that already exists fails. */
			err = -EEXIST;
			break;
		}
		if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t) {
				/* The parms matched some tunnel; it must be
				 * the very device being changed.
				 */
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				/* New parms match no tunnel: they must not
				 * require different device flags (broadcast
				 * vs point-to-point) than @dev already has.
				 */
				unsigned int nflags = 0;

				if (ipv4_is_multicast(p->iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p->iph.daddr)
					nflags = IFF_POINTOPOINT;

				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}

				t = netdev_priv(dev);
			}
		}

		if (t) {
			err = 0;
			ip_tunnel_update(itn, t, dev, p, true, 0);
		} else {
			err = -ENOENT;
		}
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;

		if (dev == itn->fb_tunnel_dev) {
			/* Deleting via the fallback device removes the tunnel
			 * matching @p; the fallback device itself may never
			 * be deleted this way.
			 */
			err = -ENOENT;
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				goto done;
			err = -EPERM;
			if (t == netdev_priv(itn->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
940
941 int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
942 {
943         struct ip_tunnel *tunnel = netdev_priv(dev);
944         int t_hlen = tunnel->hlen + sizeof(struct iphdr);
945         int max_mtu = IP_MAX_MTU - t_hlen;
946
947         if (dev->type == ARPHRD_ETHER)
948                 max_mtu -= dev->hard_header_len;
949
950         if (new_mtu < ETH_MIN_MTU)
951                 return -EINVAL;
952
953         if (new_mtu > max_mtu) {
954                 if (strict)
955                         return -EINVAL;
956
957                 new_mtu = max_mtu;
958         }
959
960         dev->mtu = new_mtu;
961         return 0;
962 }
963 EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);
964
/* ip_tunnel_change_mtu - ndo_change_mtu callback: strict MTU validation. */
int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
{
	return __ip_tunnel_change_mtu(dev, new_mtu, true);
}
EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
970
/* priv_destructor: release the per-tunnel resources allocated in
 * ip_tunnel_init(), in reverse order of their allocation.
 */
static void ip_tunnel_dev_free(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	gro_cells_destroy(&tunnel->gro_cells);
	dst_cache_destroy(&tunnel->dst_cache);
	free_percpu(dev->tstats);
}
979
980 void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
981 {
982         struct ip_tunnel *tunnel = netdev_priv(dev);
983         struct ip_tunnel_net *itn;
984
985         itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);
986
987         if (itn->fb_tunnel_dev != dev) {
988                 ip_tunnel_del(itn, netdev_priv(dev));
989                 unregister_netdevice_queue(dev, head);
990         }
991 }
992 EXPORT_SYMBOL_GPL(ip_tunnel_dellink);
993
994 struct net *ip_tunnel_get_link_net(const struct net_device *dev)
995 {
996         struct ip_tunnel *tunnel = netdev_priv(dev);
997
998         return tunnel->net;
999 }
1000 EXPORT_SYMBOL(ip_tunnel_get_link_net);
1001
1002 int ip_tunnel_get_iflink(const struct net_device *dev)
1003 {
1004         struct ip_tunnel *tunnel = netdev_priv(dev);
1005
1006         return tunnel->parms.link;
1007 }
1008 EXPORT_SYMBOL(ip_tunnel_get_iflink);
1009
/* ip_tunnel_init_net - per-netns initialisation for one tunnel type.
 * @net:           namespace being initialised
 * @ip_tnl_net_id: pernet generic id of this tunnel type
 * @ops:           rtnl_link_ops for the type, or NULL
 * @devname:       name for the fallback device, or NULL
 *
 * Initialises the per-netns hash table and, when fallback tunnels are
 * enabled for @net, creates and registers the fallback device.
 * Returns 0 or a negative errno.
 */
int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
				  struct rtnl_link_ops *ops, char *devname)
{
	struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
	struct ip_tunnel_parm parms;
	unsigned int i;

	itn->rtnl_link_ops = ops;
	for (i = 0; i < IP_TNL_HASH_SIZE; i++)
		INIT_HLIST_HEAD(&itn->tunnels[i]);

	if (!ops || !net_has_fallback_tunnels(net)) {
		struct ip_tunnel_net *it_init_net;

		/* No fallback device in this netns: copy the device type
		 * recorded for the init netns instead of probing our own.
		 */
		it_init_net = net_generic(&init_net, ip_tnl_net_id);
		itn->type = it_init_net->type;
		itn->fb_tunnel_dev = NULL;
		return 0;
	}

	memset(&parms, 0, sizeof(parms));
	if (devname)
		strlcpy(parms.name, devname, IFNAMSIZ);

	rtnl_lock();
	itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
	/* FB netdevice is special: we have one, and only one per netns.
	 * Allowing to move it to another netns is clearly unsafe.
	 */
	if (!IS_ERR(itn->fb_tunnel_dev)) {
		itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
		itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
		ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
		itn->type = itn->fb_tunnel_dev->type;
	}
	rtnl_unlock();

	return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
}
EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
1050
/* Collect every device of this tunnel type reachable from @net onto
 * @head for batched unregistration: first all matching devices living
 * in @net itself, then any tunnels hashed into @itn whose device lives
 * in a different netns.
 */
static void ip_tunnel_destroy(struct net *net, struct ip_tunnel_net *itn,
			      struct list_head *head,
			      struct rtnl_link_ops *ops)
{
	struct net_device *dev, *aux;
	int h;

	for_each_netdev_safe(net, dev, aux)
		if (dev->rtnl_link_ops == ops)
			unregister_netdevice_queue(dev, head);

	for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
		struct ip_tunnel *t;
		struct hlist_node *n;
		struct hlist_head *thead = &itn->tunnels[h];

		hlist_for_each_entry_safe(t, n, thead, hash_node)
			/* If dev is in the same netns, it has already
			 * been added to the list by the previous loop.
			 */
			if (!net_eq(dev_net(t->dev), net))
				unregister_netdevice_queue(t->dev, head);
	}
}
1075
1076 void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id,
1077                            struct rtnl_link_ops *ops)
1078 {
1079         struct ip_tunnel_net *itn;
1080         struct net *net;
1081         LIST_HEAD(list);
1082
1083         rtnl_lock();
1084         list_for_each_entry(net, net_list, exit_list) {
1085                 itn = net_generic(net, id);
1086                 ip_tunnel_destroy(net, itn, &list, ops);
1087         }
1088         unregister_netdevice_many(&list);
1089         rtnl_unlock();
1090 }
1091 EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets);
1092
/* ip_tunnel_newlink - rtnl newlink handler shared by the IPv4 tunnel types.
 * @dev:    freshly allocated, not yet registered tunnel device
 * @tb:     netlink attributes (IFLA_ADDRESS, IFLA_MTU, ...)
 * @p:      parsed tunnel parameters
 * @fwmark: firewall mark stored on the tunnel
 *
 * Rejects duplicates, registers the device, binds it to the underlay,
 * sets the MTU and inserts the tunnel into the per-netns hash.
 * Returns 0 or a negative errno; on failure the device is unregistered
 * again.
 */
int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
		      struct ip_tunnel_parm *p, __u32 fwmark)
{
	struct ip_tunnel *nt;
	struct net *net = dev_net(dev);
	struct ip_tunnel_net *itn;
	int mtu;
	int err;

	nt = netdev_priv(dev);
	itn = net_generic(net, nt->ip_tnl_net_id);

	if (nt->collect_md) {
		/* Only one metadata-collecting tunnel per netns and type. */
		if (rtnl_dereference(itn->collect_md_tun))
			return -EEXIST;
	} else {
		if (ip_tunnel_find(itn, p, dev->type))
			return -EEXIST;
	}

	nt->net = net;
	nt->parms = *p;
	nt->fwmark = fwmark;
	err = register_netdevice(dev);
	if (err)
		goto err_register_netdevice;

	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
		eth_hw_addr_random(dev);

	mtu = ip_tunnel_bind_dev(dev);
	if (tb[IFLA_MTU]) {
		/* Honour the user-requested MTU, clamped to the range the
		 * encapsulation overhead allows.
		 */
		unsigned int max = IP_MAX_MTU - (nt->hlen + sizeof(struct iphdr));

		if (dev->type == ARPHRD_ETHER)
			max -= dev->hard_header_len;

		mtu = clamp(dev->mtu, (unsigned int)ETH_MIN_MTU, max);
	}

	err = dev_set_mtu(dev, mtu);
	if (err)
		goto err_dev_set_mtu;

	ip_tunnel_add(itn, nt);
	return 0;

err_dev_set_mtu:
	unregister_netdevice(dev);
err_register_netdevice:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
1146
/* ip_tunnel_changelink - rtnl changelink handler shared by the IPv4
 * tunnel types.
 *
 * Rejects changes on the fallback device, parameter sets that collide
 * with a different existing tunnel, and (for non-Ethernet devices) new
 * destinations that would require flipping the device between broadcast
 * and point-to-point mode.  Otherwise applies @p via ip_tunnel_update().
 * Returns 0 or a negative errno.
 */
int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
			 struct ip_tunnel_parm *p, __u32 fwmark)
{
	struct ip_tunnel *t;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);

	if (dev == itn->fb_tunnel_dev)
		return -EINVAL;

	t = ip_tunnel_find(itn, p, dev->type);

	if (t) {
		/* The new parms matched some tunnel; it must be @dev. */
		if (t->dev != dev)
			return -EEXIST;
	} else {
		t = tunnel;

		if (dev->type != ARPHRD_ETHER) {
			unsigned int nflags = 0;

			if (ipv4_is_multicast(p->iph.daddr))
				nflags = IFF_BROADCAST;
			else if (p->iph.daddr)
				nflags = IFF_POINTOPOINT;

			if ((dev->flags ^ nflags) &
			    (IFF_POINTOPOINT | IFF_BROADCAST))
				return -EINVAL;
		}
	}

	/* Only let the update touch the MTU when the user did not pin
	 * one explicitly via IFLA_MTU.
	 */
	ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU], fwmark);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_changelink);
1184
/* ip_tunnel_init - common ndo_init for IPv4 tunnel devices.
 *
 * Allocates per-cpu stats, the dst cache and GRO cells, then fills in
 * the invariant fields of the tunnel and of its outer IP header
 * template.  Returns 0 or a negative errno; on failure everything
 * allocated here is released again.
 */
int ip_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;
	int err;

	dev->needs_free_netdev = true;
	dev->priv_destructor = ip_tunnel_dev_free;
	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
	if (!dev->tstats)
		return -ENOMEM;

	err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
	if (err) {
		free_percpu(dev->tstats);
		return err;
	}

	err = gro_cells_init(&tunnel->gro_cells, dev);
	if (err) {
		/* Unwind in reverse allocation order. */
		dst_cache_destroy(&tunnel->dst_cache);
		free_percpu(dev->tstats);
		return err;
	}

	tunnel->dev = dev;
	tunnel->net = dev_net(dev);
	strcpy(tunnel->parms.name, dev->name);
	iph->version		= 4;
	iph->ihl		= 5;

	if (tunnel->collect_md)
		netif_keep_dst(dev);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_init);
1221
/* ip_tunnel_uninit - common ndo_uninit: unhash the tunnel and drop its
 * cached route.  Also clears the per-netns fallback pointer when the
 * fallback device itself goes away; the WRITE_ONCE suggests lockless
 * readers elsewhere (presumably via READ_ONCE) — not visible here.
 */
void ip_tunnel_uninit(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn;

	itn = net_generic(net, tunnel->ip_tnl_net_id);
	ip_tunnel_del(itn, netdev_priv(dev));
	if (itn->fb_tunnel_dev == dev)
		WRITE_ONCE(itn->fb_tunnel_dev, NULL);

	dst_cache_reset(&tunnel->dst_cache);
}
EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
1236
1237 /* Do least required initialization, rest of init is done in tunnel_init call */
1238 void ip_tunnel_setup(struct net_device *dev, unsigned int net_id)
1239 {
1240         struct ip_tunnel *tunnel = netdev_priv(dev);
1241         tunnel->ip_tnl_net_id = net_id;
1242 }
1243 EXPORT_SYMBOL_GPL(ip_tunnel_setup);
1244
1245 MODULE_LICENSE("GPL");