GNU Linux-libre 4.9.304-gnu1
[releases.git] / net / ipv4 / ip_tunnel.c
1 /*
2  * Copyright (c) 2013 Nicira, Inc.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of version 2 of the GNU General Public
6  * License as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful, but
9  * WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public License
14  * along with this program; if not, write to the Free Software
15  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16  * 02110-1301, USA
17  */
18
19 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
20
21 #include <linux/capability.h>
22 #include <linux/module.h>
23 #include <linux/types.h>
24 #include <linux/kernel.h>
25 #include <linux/slab.h>
26 #include <linux/uaccess.h>
27 #include <linux/skbuff.h>
28 #include <linux/netdevice.h>
29 #include <linux/in.h>
30 #include <linux/tcp.h>
31 #include <linux/udp.h>
32 #include <linux/if_arp.h>
33 #include <linux/init.h>
34 #include <linux/in6.h>
35 #include <linux/inetdevice.h>
36 #include <linux/igmp.h>
37 #include <linux/netfilter_ipv4.h>
38 #include <linux/etherdevice.h>
39 #include <linux/if_ether.h>
40 #include <linux/if_vlan.h>
41 #include <linux/rculist.h>
42 #include <linux/err.h>
43
44 #include <net/sock.h>
45 #include <net/ip.h>
46 #include <net/icmp.h>
47 #include <net/protocol.h>
48 #include <net/ip_tunnels.h>
49 #include <net/arp.h>
50 #include <net/checksum.h>
51 #include <net/dsfield.h>
52 #include <net/inet_ecn.h>
53 #include <net/xfrm.h>
54 #include <net/net_namespace.h>
55 #include <net/netns/generic.h>
56 #include <net/rtnetlink.h>
57 #include <net/udp.h>
58 #include <net/dst_metadata.h>
59
60 #if IS_ENABLED(CONFIG_IPV6)
61 #include <net/ipv6.h>
62 #include <net/ip6_fib.h>
63 #include <net/ip6_route.h>
64 #endif
65
66 static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
67 {
68         return hash_32((__force u32)key ^ (__force u32)remote,
69                          IP_TNL_HASH_BITS);
70 }
71
72 static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
73                                 __be16 flags, __be32 key)
74 {
75         if (p->i_flags & TUNNEL_KEY) {
76                 if (flags & TUNNEL_KEY)
77                         return key == p->i_key;
78                 else
79                         /* key expected, none present */
80                         return false;
81         } else
82                 return !(flags & TUNNEL_KEY);
83 }
84
/* Fallback tunnel: no source, no destination, no key, no options

   Tunnel hash table:
   We require an exact key match, i.e. if a key is present in the packet
   it will match only a tunnel with the same key; if it is not present,
   it will match only a keyless tunnel.

   All keyless packets, if not matched by a configured keyless tunnel,
   will match the fallback tunnel.
   Given src, dst and key, find the appropriate tunnel for input.
*/
/* Given src, dst, key and incoming link, find the best-matching tunnel.
 *
 * Four passes over the hash table, from most to least specific:
 *   1. exact (saddr, daddr) match
 *   2. daddr-only match (tunnel has a wildcard source)
 *   3. local-address / multicast match (bucket hashed with zero remote)
 *   4. fully wildcarded tunnels, matched by key alone
 * Within each pass a tunnel bound to the same link wins outright;
 * otherwise the first cross-link match is remembered as a candidate.
 * Failing all passes, fall back to the collect_md tunnel, then to the
 * per-netns fallback device if it is up, else return NULL.
 *
 * Uses RCU list traversal; caller must be in an RCU read-side section.
 */
struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
				   int link, __be16 flags,
				   __be32 remote, __be32 local,
				   __be32 key)
{
	struct ip_tunnel *t, *cand = NULL;
	struct hlist_head *head;
	struct net_device *ndev;
	unsigned int hash;

	hash = ip_tunnel_hash(key, remote);
	head = &itn->tunnels[hash];

	/* Pass 1: fully specified tunnels (source and destination match). */
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local != t->parms.iph.saddr ||
		    remote != t->parms.iph.daddr ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else
			cand = t;
	}

	/* Pass 2: tunnels with a wildcard (zero) source address. */
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (remote != t->parms.iph.daddr ||
		    t->parms.iph.saddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	/* Remaining passes search the bucket hashed with a zero remote. */
	hash = ip_tunnel_hash(key, 0);
	head = &itn->tunnels[hash];

	/* Pass 3: tunnels whose configured source is our local address and
	 * that have no remote, or multicast tunnels whose group address is
	 * the packet's destination.
	 */
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
		    (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
			continue;

		if (!(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	/* Pass 4: tunnels with neither address configured, key-only match
	 * (skipped when the caller passed TUNNEL_NO_KEY).
	 */
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((!(flags & TUNNEL_NO_KEY) && t->parms.i_key != key) ||
		    t->parms.iph.saddr != 0 ||
		    t->parms.iph.daddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	if (cand)
		return cand;

	/* Metadata-collecting tunnel accepts anything that got this far. */
	t = rcu_dereference(itn->collect_md_tun);
	if (t)
		return t;

	/* Last resort: the per-netns fallback device, if present and up. */
	ndev = READ_ONCE(itn->fb_tunnel_dev);
	if (ndev && ndev->flags & IFF_UP)
		return netdev_priv(ndev);

	return NULL;
}
EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
186
187 static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
188                                     struct ip_tunnel_parm *parms)
189 {
190         unsigned int h;
191         __be32 remote;
192         __be32 i_key = parms->i_key;
193
194         if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
195                 remote = parms->iph.daddr;
196         else
197                 remote = 0;
198
199         if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
200                 i_key = 0;
201
202         h = ip_tunnel_hash(i_key, remote);
203         return &itn->tunnels[h];
204 }
205
/* Insert a tunnel into its hash bucket.  A collect_md tunnel is also
 * published as the per-netns metadata tunnel used as a lookup fallback.
 */
static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	struct hlist_head *head = ip_bucket(itn, &t->parms);

	if (t->collect_md)
		rcu_assign_pointer(itn->collect_md_tun, t);
	hlist_add_head_rcu(&t->hash_node, head);
}
214
/* Remove a tunnel from the hash table, clearing the per-netns
 * collect_md shortcut pointer if this tunnel owned it.
 */
static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	if (t->collect_md)
		rcu_assign_pointer(itn->collect_md_tun, NULL);
	hlist_del_init_rcu(&t->hash_node);
}
221
222 static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
223                                         struct ip_tunnel_parm *parms,
224                                         int type)
225 {
226         __be32 remote = parms->iph.daddr;
227         __be32 local = parms->iph.saddr;
228         __be32 key = parms->i_key;
229         __be16 flags = parms->i_flags;
230         int link = parms->link;
231         struct ip_tunnel *t = NULL;
232         struct hlist_head *head = ip_bucket(itn, parms);
233
234         hlist_for_each_entry_rcu(t, head, hash_node) {
235                 if (local == t->parms.iph.saddr &&
236                     remote == t->parms.iph.daddr &&
237                     link == t->parms.link &&
238                     type == t->dev->type &&
239                     ip_tunnel_key_match(&t->parms, flags, key))
240                         break;
241         }
242         return t;
243 }
244
/* Allocate and register a new tunnel net_device.
 *
 * The device name comes from @parms if set (and must be a valid netdev
 * name); otherwise "<kind>%d" is used as a template and the kernel
 * picks a free index.  The tunnel's private state is initialized from
 * @parms.  Returns the new device or an ERR_PTR on failure.
 * Caller must hold RTNL.
 */
static struct net_device *__ip_tunnel_create(struct net *net,
					     const struct rtnl_link_ops *ops,
					     struct ip_tunnel_parm *parms)
{
	int err;
	struct ip_tunnel *tunnel;
	struct net_device *dev;
	char name[IFNAMSIZ];

	err = -E2BIG;
	if (parms->name[0]) {
		if (!dev_valid_name(parms->name))
			goto failed;
		strlcpy(name, parms->name, IFNAMSIZ);
	} else {
		/* Leave room for the "%d" template appended below. */
		if (strlen(ops->kind) > (IFNAMSIZ - 3))
			goto failed;
		strcpy(name, ops->kind);
		strcat(name, "%d");
	}

	ASSERT_RTNL();
	dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
	if (!dev) {
		err = -ENOMEM;
		goto failed;
	}
	dev_net_set(dev, net);

	dev->rtnl_link_ops = ops;

	/* Tunnel state lives in the netdev's private area. */
	tunnel = netdev_priv(dev);
	tunnel->parms = *parms;
	tunnel->net = net;

	err = register_netdevice(dev);
	if (err)
		goto failed_free;

	return dev;

failed_free:
	free_netdev(dev);
failed:
	return ERR_PTR(err);
}
291
292 static inline void init_tunnel_flow(struct flowi4 *fl4,
293                                     int proto,
294                                     __be32 daddr, __be32 saddr,
295                                     __be32 key, __u8 tos, int oif)
296 {
297         memset(fl4, 0, sizeof(*fl4));
298         fl4->flowi4_oif = oif;
299         fl4->daddr = daddr;
300         fl4->saddr = saddr;
301         fl4->flowi4_tos = tos;
302         fl4->flowi4_proto = proto;
303         fl4->fl4_gre_key = key;
304 }
305
/* Probe the route toward the tunnel's destination to derive the MTU and
 * the headroom the tunnel device should reserve, and mark non-Ethernet
 * tunnels with a fixed destination as point-to-point.
 * Returns the MTU the device should use (clamped to IPV4_MIN_MTU).
 */
static int ip_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);

	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and needed_headroom */
	if (iph->daddr) {
		struct flowi4 fl4;
		struct rtable *rt;

		init_tunnel_flow(&fl4, iph->protocol, iph->daddr,
				 iph->saddr, tunnel->parms.o_key,
				 RT_TOS(iph->tos), tunnel->parms.link);
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (!IS_ERR(rt)) {
			tdev = rt->dst.dev;
			ip_rt_put(rt);
		}
		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;

		/* The underlying route may have changed: drop cached dsts. */
		dst_cache_reset(&tunnel->dst_cache);
	}

	/* No route found: fall back to the explicitly bound link, if any. */
	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = tdev->mtu;
	}

	/* Reserve space for the outer IP header plus the underlay's needs,
	 * and shrink the MTU by the same tunnel overhead.
	 */
	dev->needed_headroom = t_hlen + hlen;
	mtu -= (dev->hard_header_len + t_hlen);

	if (mtu < IPV4_MIN_MTU)
		mtu = IPV4_MIN_MTU;

	return mtu;
}
353
/* Create a new tunnel from @parms using the fallback device's
 * rtnl_link_ops, derive its MTU, and link it into the hash table.
 * Returns the new tunnel or an ERR_PTR on failure.
 */
static struct ip_tunnel *ip_tunnel_create(struct net *net,
					  struct ip_tunnel_net *itn,
					  struct ip_tunnel_parm *parms)
{
	struct ip_tunnel *nt;
	struct net_device *dev;

	BUG_ON(!itn->fb_tunnel_dev);
	dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
	if (IS_ERR(dev))
		return ERR_CAST(dev);

	dev->mtu = ip_tunnel_bind_dev(dev);

	nt = netdev_priv(dev);
	ip_tunnel_add(itn, nt);
	return nt;
}
372
/* Common receive path for decapsulated IPv4 tunnel packets.
 *
 * Validates the packet's checksum/sequence flags against the tunnel's
 * configured i_flags, performs ECN decapsulation, updates per-cpu
 * stats, and hands the inner packet to the tunnel device's GRO cell.
 * Consumes the skb in all cases and always returns 0.
 */
int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
		  const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
		  bool log_ecn_error)
{
	struct pcpu_sw_netstats *tstats;
	const struct iphdr *iph = ip_hdr(skb);
	int err;

#ifdef CONFIG_NET_IPGRE_BROADCAST
	if (ipv4_is_multicast(iph->daddr)) {
		tunnel->dev->stats.multicast++;
		skb->pkt_type = PACKET_BROADCAST;
	}
#endif

	/* The packet's TUNNEL_CSUM flag must agree with the tunnel's
	 * configuration; a mismatch in either direction is an error.
	 */
	if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
	     ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
		tunnel->dev->stats.rx_crc_errors++;
		tunnel->dev->stats.rx_errors++;
		goto drop;
	}

	/* When sequencing is enabled, drop packets that carry no sequence
	 * number or arrive out of order (signed wraparound-safe compare).
	 */
	if (tunnel->parms.i_flags&TUNNEL_SEQ) {
		if (!(tpi->flags&TUNNEL_SEQ) ||
		    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
			tunnel->dev->stats.rx_fifo_errors++;
			tunnel->dev->stats.rx_errors++;
			goto drop;
		}
		tunnel->i_seqno = ntohl(tpi->seq) + 1;
	}

	skb_reset_network_header(skb);

	/* Propagate ECN from the outer to the inner header; err > 1 means
	 * decapsulation failed and the packet must be dropped.
	 */
	err = IP_ECN_decapsulate(iph, skb);
	if (unlikely(err)) {
		if (log_ecn_error)
			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
					&iph->saddr, iph->tos);
		if (err > 1) {
			++tunnel->dev->stats.rx_frame_errors;
			++tunnel->dev->stats.rx_errors;
			goto drop;
		}
	}

	tstats = this_cpu_ptr(tunnel->dev->tstats);
	u64_stats_update_begin(&tstats->syncp);
	tstats->rx_packets++;
	tstats->rx_bytes += skb->len;
	u64_stats_update_end(&tstats->syncp);

	/* Scrub packet state when crossing a netns boundary. */
	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));

	if (tunnel->dev->type == ARPHRD_ETHER) {
		skb->protocol = eth_type_trans(skb, tunnel->dev);
		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
	} else {
		skb->dev = tunnel->dev;
	}

	/* Attach collected tunnel metadata, if any, for upper layers. */
	if (tun_dst)
		skb_dst_set(skb, (struct dst_entry *)tun_dst);

	gro_cells_receive(&tunnel->gro_cells, skb);
	return 0;

drop:
	kfree_skb(skb);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
445
446 int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
447                             unsigned int num)
448 {
449         if (num >= MAX_IPTUN_ENCAP_OPS)
450                 return -ERANGE;
451
452         return !cmpxchg((const struct ip_tunnel_encap_ops **)
453                         &iptun_encaps[num],
454                         NULL, ops) ? 0 : -1;
455 }
456 EXPORT_SYMBOL(ip_tunnel_encap_add_ops);
457
458 int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
459                             unsigned int num)
460 {
461         int ret;
462
463         if (num >= MAX_IPTUN_ENCAP_OPS)
464                 return -ERANGE;
465
466         ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
467                        &iptun_encaps[num],
468                        ops, NULL) == ops) ? 0 : -1;
469
470         synchronize_net();
471
472         return ret;
473 }
474 EXPORT_SYMBOL(ip_tunnel_encap_del_ops);
475
476 int ip_tunnel_encap_setup(struct ip_tunnel *t,
477                           struct ip_tunnel_encap *ipencap)
478 {
479         int hlen;
480
481         memset(&t->encap, 0, sizeof(t->encap));
482
483         hlen = ip_encap_hlen(ipencap);
484         if (hlen < 0)
485                 return hlen;
486
487         t->encap.type = ipencap->type;
488         t->encap.sport = ipencap->sport;
489         t->encap.dport = ipencap->dport;
490         t->encap.flags = ipencap->flags;
491
492         t->encap_hlen = hlen;
493         t->hlen = t->encap_hlen + t->tun_hlen;
494
495         return 0;
496 }
497 EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);
498
/* Check the packet against the tunnel path MTU and update PMTU state.
 * Sends an ICMP FRAG_NEEDED (IPv4) or ICMPv6 PKT_TOOBIG error back and
 * returns -E2BIG when a DF-constrained packet exceeds the usable MTU;
 * returns 0 otherwise.
 */
static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
			    struct rtable *rt, __be16 df,
			    const struct iphdr *inner_iph)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len;
	int mtu;

	/* With DF set, the usable MTU is the outer route's MTU minus all
	 * tunnel overhead; otherwise take it from the inner dst (or dev).
	 */
	if (df)
		mtu = dst_mtu(&rt->dst) - dev->hard_header_len
					- sizeof(struct iphdr) - tunnel->hlen;
	else
		mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;

	/* Propagate the tunnel MTU to the inner route's PMTU state. */
	if (skb_dst(skb))
		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		if (!skb_is_gso(skb) &&
		    (inner_iph->frag_off & htons(IP_DF)) &&
		    mtu < pkt_size) {
			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			return -E2BIG;
		}
	}
#if IS_ENABLED(CONFIG_IPV6)
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);

		/* Record the lowered MTU on the IPv6 route when the tunnel
		 * has a fixed unicast destination or the route is a host
		 * route (/128).
		 */
		if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
			   mtu >= IPV6_MIN_MTU) {
			if ((tunnel->parms.iph.daddr &&
			    !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
			}
		}

		if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
					mtu < pkt_size) {
			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			return -E2BIG;
		}
	}
#endif
	return 0;
}
548
/* Transmit path for metadata-based (collect_md) tunnels: every outer
 * header parameter comes from the tunnel info attached to the skb
 * rather than from the device configuration.  Consumes the skb.
 */
void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, u8 proto)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	u32 headroom = sizeof(struct iphdr);
	struct ip_tunnel_info *tun_info;
	const struct ip_tunnel_key *key;
	const struct iphdr *inner_iph;
	struct rtable *rt;
	struct flowi4 fl4;
	__be16 df = 0;
	u8 tos, ttl;

	/* Only IPv4 TX metadata can be used to build the outer header. */
	tun_info = skb_tunnel_info(skb);
	if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
		     ip_tunnel_info_af(tun_info) != AF_INET))
		goto tx_error;
	key = &tun_info->key;
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	/* tos == 1 means "inherit TOS/DSCP from the inner packet". */
	tos = key->tos;
	if (tos == 1) {
		if (skb->protocol == htons(ETH_P_IP))
			tos = inner_iph->tos;
		else if (skb->protocol == htons(ETH_P_IPV6))
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
	}
	init_tunnel_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src, 0,
			 RT_TOS(tos), tunnel->parms.link);
	/* Extra encapsulation (e.g. FOU/GUE) is unsupported on this path. */
	if (tunnel->encap.type != TUNNEL_ENCAP_NONE)
		goto tx_error;
	rt = ip_route_output_key(tunnel->net, &fl4);
	if (IS_ERR(rt)) {
		dev->stats.tx_carrier_errors++;
		goto tx_error;
	}
	/* A route back through ourselves would loop forever. */
	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}
	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	/* ttl == 0 means "inherit from the inner packet" (or the route). */
	ttl = key->ttl;
	if (ttl == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}
	if (key->tun_flags & TUNNEL_DONT_FRAGMENT)
		df = htons(IP_DF);
	else if (skb->protocol == htons(ETH_P_IP))
		df = inner_iph->frag_off & htons(IP_DF);
	/* Grow the device's reserved headroom if this route needs more. */
	headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
	if (headroom > dev->needed_headroom)
		dev->needed_headroom = headroom;

	if (skb_cow_head(skb, dev->needed_headroom)) {
		ip_rt_put(rt);
		goto tx_dropped;
	}
	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl,
		      df, !net_eq(tunnel->net, dev_net(dev)));
	return;
tx_error:
	dev->stats.tx_errors++;
	goto kfree;
tx_dropped:
	dev->stats.tx_dropped++;
kfree:
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit);
623
/* Standard transmit path for configured IPv4 tunnels.
 *
 * Builds the outer flow from @tnl_params, resolves (and, for tunnels
 * with a fixed destination, caches) the route, enforces path MTU, then
 * pushes the outer header via iptunnel_xmit().  NBMA tunnels (no fixed
 * destination) derive the outer destination from skb metadata, the
 * inner IPv4 route's next hop, or an IPv4-compatible IPv6 neighbour
 * address.  Consumes the skb.
 */
void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
		    const struct iphdr *tnl_params, u8 protocol)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	unsigned int inner_nhdr_len = 0;
	const struct iphdr *inner_iph;
	struct flowi4 fl4;
	u8     tos, ttl;
	__be16 df;
	struct rtable *rt;		/* Route to the other host */
	unsigned int max_headroom;	/* The extra header space needed */
	__be32 dst;
	bool connected;

	/* ensure we can access the inner net header, for several users below */
	if (skb->protocol == htons(ETH_P_IP))
		inner_nhdr_len = sizeof(struct iphdr);
	else if (skb->protocol == htons(ETH_P_IPV6))
		inner_nhdr_len = sizeof(struct ipv6hdr);
	if (unlikely(!pskb_may_pull(skb, inner_nhdr_len)))
		goto tx_error;

	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	connected = (tunnel->parms.iph.daddr != 0);

	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));

	dst = tnl_params->daddr;
	if (dst == 0) {
		/* NBMA tunnel */
		struct ip_tunnel_info *tun_info;

		if (!skb_dst(skb)) {
			dev->stats.tx_fifo_errors++;
			goto tx_error;
		}

		/* Prefer an explicit TX destination from tunnel metadata. */
		tun_info = skb_tunnel_info(skb);
		if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX) &&
		    ip_tunnel_info_af(tun_info) == AF_INET &&
		    tun_info->key.u.ipv4.dst)
			dst = tun_info->key.u.ipv4.dst;
		else if (skb->protocol == htons(ETH_P_IP)) {
			/* Use the inner route's next hop as outer dest. */
			rt = skb_rtable(skb);
			dst = rt_nexthop(rt, inner_iph->daddr);
		}
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6)) {
			const struct in6_addr *addr6;
			struct neighbour *neigh;
			bool do_tx_error_icmp;
			int addr_type;

			neigh = dst_neigh_lookup(skb_dst(skb),
						 &ipv6_hdr(skb)->daddr);
			if (!neigh)
				goto tx_error;

			addr6 = (const struct in6_addr *)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			/* Only IPv4-compatible IPv6 addresses embed a
			 * usable IPv4 destination in the low 32 bits.
			 */
			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				do_tx_error_icmp = true;
			else {
				do_tx_error_icmp = false;
				dst = addr6->s6_addr32[3];
			}
			neigh_release(neigh);
			if (do_tx_error_icmp)
				goto tx_error_icmp;
		}
#endif
		else
			goto tx_error;

		/* Per-packet destination: the dst cache can't be used. */
		connected = false;
	}

	/* Low TOS bit set means "inherit TOS from the inner packet". */
	tos = tnl_params->tos;
	if (tos & 0x1) {
		tos &= ~0x1;
		if (skb->protocol == htons(ETH_P_IP)) {
			tos = inner_iph->tos;
			connected = false;
		} else if (skb->protocol == htons(ETH_P_IPV6)) {
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
			connected = false;
		}
	}

	init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr,
			 tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link);

	if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
		goto tx_error;

	/* For fixed-destination tunnels, try the cached route first. */
	rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache, &fl4.saddr) :
			 NULL;

	if (!rt) {
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (IS_ERR(rt)) {
			dev->stats.tx_carrier_errors++;
			goto tx_error;
		}
		if (connected)
			dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
					  fl4.saddr);
	}

	/* A route back through the tunnel device itself would loop. */
	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}

	df = tnl_params->frag_off;
	if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
		df |= (inner_iph->frag_off & htons(IP_DF));

	if (tnl_update_pmtu(dev, skb, rt, df, inner_iph)) {
		ip_rt_put(rt);
		goto tx_error;
	}

	/* After a recent ICMP error, report link failure for a while
	 * (err_count packets within IPTUNNEL_ERR_TIMEO of err_time).
	 */
	if (tunnel->err_count > 0) {
		if (time_before(jiffies,
				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
			tunnel->err_count--;

			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	/* ttl == 0 means "inherit from the inner packet" (or the route). */
	ttl = tnl_params->ttl;
	if (ttl == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
#endif
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}

	/* Grow the device's reserved headroom if this route needs more. */
	max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
			+ rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
	if (max_headroom > dev->needed_headroom)
		dev->needed_headroom = max_headroom;

	if (skb_cow_head(skb, dev->needed_headroom)) {
		ip_rt_put(rt);
		dev->stats.tx_dropped++;
		kfree_skb(skb);
		return;
	}

	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
		      df, !net_eq(tunnel->net, dev_net(dev)));
	return;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
	dst_link_failure(skb);
#endif
tx_error:
	dev->stats.tx_errors++;
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
803
/* Apply new parameters @p to an existing tunnel @t.
 *
 * The tunnel is unhashed and re-added because address/key changes can
 * move it to a different bucket.  A link change re-binds the tunnel to
 * its underlying device and, when @set_mtu, adopts the derived MTU.
 */
static void ip_tunnel_update(struct ip_tunnel_net *itn,
			     struct ip_tunnel *t,
			     struct net_device *dev,
			     struct ip_tunnel_parm *p,
			     bool set_mtu)
{
	ip_tunnel_del(itn, t);
	t->parms.iph.saddr = p->iph.saddr;
	t->parms.iph.daddr = p->iph.daddr;
	t->parms.i_key = p->i_key;
	t->parms.o_key = p->o_key;
	if (dev->type != ARPHRD_ETHER) {
		/* Non-Ethernet tunnels expose the endpoint addresses as
		 * the device's hardware and broadcast addresses.
		 */
		memcpy(dev->dev_addr, &p->iph.saddr, 4);
		memcpy(dev->broadcast, &p->iph.daddr, 4);
	}
	ip_tunnel_add(itn, t);

	t->parms.iph.ttl = p->iph.ttl;
	t->parms.iph.tos = p->iph.tos;
	t->parms.iph.frag_off = p->iph.frag_off;

	if (t->parms.link != p->link) {
		int mtu;

		t->parms.link = p->link;
		mtu = ip_tunnel_bind_dev(dev);
		if (set_mtu)
			dev->mtu = mtu;
	}
	/* Cached routes may reference the old endpoints. */
	dst_cache_reset(&t->dst_cache);
	netdev_state_change(dev);
}
836
/* ip_tunnel_ioctl - handle SIOC{GET,ADD,CHG,DEL}TUNNEL for an IP tunnel.
 * @dev: tunnel device the ioctl was issued on.  The per-netns fallback
 *	 device doubles as a generic entry point: lookups are then done
 *	 by the parameters in @p rather than by @dev itself.
 * @p:   tunnel parameters already copied in from user space by the caller.
 * @cmd: SIOCGETTUNNEL / SIOCADDTUNNEL / SIOCCHGTUNNEL / SIOCDELTUNNEL.
 *
 * Returns 0 on success or a negative errno.
 * NOTE(review): the hash-table updates (ip_tunnel_update/create) and
 * unregister_netdevice() here rely on the caller holding RTNL, as is
 * standard for the tunnel ioctl path -- confirm against the callers.
 */
int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
{
	int err = 0;
	struct ip_tunnel *t = netdev_priv(dev);
	struct net *net = t->net;
	struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);

	BUG_ON(!itn->fb_tunnel_dev);
	switch (cmd) {
	case SIOCGETTUNNEL:
		/* On the fallback device, look the tunnel up by @p; fall
		 * back to the fallback device's own parms if none matches.
		 */
		if (dev == itn->fb_tunnel_dev) {
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				t = netdev_priv(dev);
		}
		memcpy(p, &t->parms, sizeof(*p));
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;
		/* A nonzero TTL forces DF (long-standing tunnel ioctl
		 * behaviour).
		 */
		if (p->iph.ttl)
			p->iph.frag_off |= htons(IP_DF);
		/* Non-VTI tunnels: keys are meaningful only with TUNNEL_KEY. */
		if (!(p->i_flags & VTI_ISVTI)) {
			if (!(p->i_flags & TUNNEL_KEY))
				p->i_key = 0;
			if (!(p->o_flags & TUNNEL_KEY))
				p->o_key = 0;
		}

		t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);

		if (cmd == SIOCADDTUNNEL) {
			if (!t) {
				t = ip_tunnel_create(net, itn, p);
				err = PTR_ERR_OR_ZERO(t);
				break;
			}

			err = -EEXIST;
			break;
		}
		if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t) {
				/* @p matches some tunnel, but not this
				 * device: refuse to duplicate its identity.
				 */
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				/* Re-parameterising this device: the
				 * link-layer flags implied by the new daddr
				 * must match how the device was created.
				 */
				unsigned int nflags = 0;

				if (ipv4_is_multicast(p->iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p->iph.daddr)
					nflags = IFF_POINTOPOINT;

				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}

				t = netdev_priv(dev);
			}
		}

		if (t) {
			err = 0;
			ip_tunnel_update(itn, t, dev, p, true);
		} else {
			err = -ENOENT;
		}
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;

		if (dev == itn->fb_tunnel_dev) {
			/* Deletion by parms via the fallback device; the
			 * fallback device itself may never be deleted.
			 */
			err = -ENOENT;
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				goto done;
			err = -EPERM;
			if (t == netdev_priv(itn->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
939
940 int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
941 {
942         struct ip_tunnel *tunnel = netdev_priv(dev);
943         int t_hlen = tunnel->hlen + sizeof(struct iphdr);
944         int max_mtu = 0xFFF8 - dev->hard_header_len - t_hlen;
945
946         if (new_mtu < 68)
947                 return -EINVAL;
948
949         if (new_mtu > max_mtu) {
950                 if (strict)
951                         return -EINVAL;
952
953                 new_mtu = max_mtu;
954         }
955
956         dev->mtu = new_mtu;
957         return 0;
958 }
959 EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);
960
/* ndo_change_mtu handler: strict variant of __ip_tunnel_change_mtu(),
 * i.e. an out-of-range MTU is rejected with -EINVAL rather than clamped.
 */
int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
{
	return __ip_tunnel_change_mtu(dev, new_mtu, true);
}
EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
966
/* netdev destructor: release the per-tunnel state allocated in
 * ip_tunnel_init(), then free the netdevice itself.  Teardown mirrors
 * the init order in reverse; free_netdev() must come last because it
 * frees the memory 'tunnel' (the priv area) points into.
 */
static void ip_tunnel_dev_free(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	gro_cells_destroy(&tunnel->gro_cells);
	dst_cache_destroy(&tunnel->dst_cache);
	free_percpu(dev->tstats);
	free_netdev(dev);
}
976
977 void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
978 {
979         struct ip_tunnel *tunnel = netdev_priv(dev);
980         struct ip_tunnel_net *itn;
981
982         itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);
983
984         if (itn->fb_tunnel_dev != dev) {
985                 ip_tunnel_del(itn, netdev_priv(dev));
986                 unregister_netdevice_queue(dev, head);
987         }
988 }
989 EXPORT_SYMBOL_GPL(ip_tunnel_dellink);
990
991 struct net *ip_tunnel_get_link_net(const struct net_device *dev)
992 {
993         struct ip_tunnel *tunnel = netdev_priv(dev);
994
995         return tunnel->net;
996 }
997 EXPORT_SYMBOL(ip_tunnel_get_link_net);
998
999 int ip_tunnel_get_iflink(const struct net_device *dev)
1000 {
1001         struct ip_tunnel *tunnel = netdev_priv(dev);
1002
1003         return tunnel->parms.link;
1004 }
1005 EXPORT_SYMBOL(ip_tunnel_get_iflink);
1006
/* ip_tunnel_init_net - per-netns initialisation for one tunnel type.
 * @net: namespace being set up.
 * @ip_tnl_net_id: pernet id under which this type's ip_tunnel_net lives.
 * @ops: rtnl_link_ops of the tunnel type, or NULL when the type has no
 *	 fallback device (then only the hash buckets are initialised).
 * @devname: name for the fallback device; may be NULL.
 *
 * Returns 0 on success, or the errno from fallback-device creation.
 */
int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
				  struct rtnl_link_ops *ops, char *devname)
{
	struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
	struct ip_tunnel_parm parms;
	unsigned int i;

	for (i = 0; i < IP_TNL_HASH_SIZE; i++)
		INIT_HLIST_HEAD(&itn->tunnels[i]);

	if (!ops) {
		/* No fallback device for this tunnel type. */
		itn->fb_tunnel_dev = NULL;
		return 0;
	}

	/* All-zero parms: the fallback device is a wildcard tunnel. */
	memset(&parms, 0, sizeof(parms));
	if (devname)
		strlcpy(parms.name, devname, IFNAMSIZ);

	rtnl_lock();
	itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
	/* FB netdevice is special: we have one, and only one per netns.
	 * Allowing to move it to another netns is clearly unsafe.
	 */
	if (!IS_ERR(itn->fb_tunnel_dev)) {
		itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
		itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
		ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
	}
	rtnl_unlock();

	return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
}
EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
1041
/* ip_tunnel_destroy - queue every device of one tunnel type for
 * unregistration on @head (helper for ip_tunnel_delete_net()).
 *
 * NOTE(review): dev_net(itn->fb_tunnel_dev) assumes the fallback device
 * exists; an itn initialised with ops == NULL in ip_tunnel_init_net()
 * would have fb_tunnel_dev == NULL here -- confirm no such itn ever
 * reaches this path.
 */
static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head,
			      struct rtnl_link_ops *ops)
{
	struct net *net = dev_net(itn->fb_tunnel_dev);
	struct net_device *dev, *aux;
	int h;

	/* Pass 1: devices of this type that live in @net itself. */
	for_each_netdev_safe(net, dev, aux)
		if (dev->rtnl_link_ops == ops)
			unregister_netdevice_queue(dev, head);

	/* Pass 2: tunnels hashed in this itn whose device was moved to
	 * another namespace.
	 */
	for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
		struct ip_tunnel *t;
		struct hlist_node *n;
		struct hlist_head *thead = &itn->tunnels[h];

		hlist_for_each_entry_safe(t, n, thead, hash_node)
			/* If dev is in the same netns, it has already
			 * been added to the list by the previous loop.
			 */
			if (!net_eq(dev_net(t->dev), net))
				unregister_netdevice_queue(t->dev, head);
	}
}
1066
/* ip_tunnel_delete_net - tear down all devices of one tunnel type in this
 * itn, batching them onto a local list so unregister_netdevice_many()
 * pays the notifier/rcu cost once for the whole set.
 */
void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops)
{
	LIST_HEAD(list);

	rtnl_lock();
	ip_tunnel_destroy(itn, &list, ops);
	unregister_netdevice_many(&list);
	rtnl_unlock();
}
EXPORT_SYMBOL_GPL(ip_tunnel_delete_net);
1077
1078 int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
1079                       struct ip_tunnel_parm *p)
1080 {
1081         struct ip_tunnel *nt;
1082         struct net *net = dev_net(dev);
1083         struct ip_tunnel_net *itn;
1084         int mtu;
1085         int err;
1086
1087         nt = netdev_priv(dev);
1088         itn = net_generic(net, nt->ip_tnl_net_id);
1089
1090         if (nt->collect_md) {
1091                 if (rtnl_dereference(itn->collect_md_tun))
1092                         return -EEXIST;
1093         } else {
1094                 if (ip_tunnel_find(itn, p, dev->type))
1095                         return -EEXIST;
1096         }
1097
1098         nt->net = net;
1099         nt->parms = *p;
1100         err = register_netdevice(dev);
1101         if (err)
1102                 goto out;
1103
1104         if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1105                 eth_hw_addr_random(dev);
1106
1107         mtu = ip_tunnel_bind_dev(dev);
1108         if (!tb[IFLA_MTU])
1109                 dev->mtu = mtu;
1110
1111         ip_tunnel_add(itn, nt);
1112 out:
1113         return err;
1114 }
1115 EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
1116
1117 int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
1118                          struct ip_tunnel_parm *p)
1119 {
1120         struct ip_tunnel *t;
1121         struct ip_tunnel *tunnel = netdev_priv(dev);
1122         struct net *net = tunnel->net;
1123         struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);
1124
1125         if (dev == itn->fb_tunnel_dev)
1126                 return -EINVAL;
1127
1128         t = ip_tunnel_find(itn, p, dev->type);
1129
1130         if (t) {
1131                 if (t->dev != dev)
1132                         return -EEXIST;
1133         } else {
1134                 t = tunnel;
1135
1136                 if (dev->type != ARPHRD_ETHER) {
1137                         unsigned int nflags = 0;
1138
1139                         if (ipv4_is_multicast(p->iph.daddr))
1140                                 nflags = IFF_BROADCAST;
1141                         else if (p->iph.daddr)
1142                                 nflags = IFF_POINTOPOINT;
1143
1144                         if ((dev->flags ^ nflags) &
1145                             (IFF_POINTOPOINT | IFF_BROADCAST))
1146                                 return -EINVAL;
1147                 }
1148         }
1149
1150         ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU]);
1151         return 0;
1152 }
1153 EXPORT_SYMBOL_GPL(ip_tunnel_changelink);
1154
1155 int ip_tunnel_init(struct net_device *dev)
1156 {
1157         struct ip_tunnel *tunnel = netdev_priv(dev);
1158         struct iphdr *iph = &tunnel->parms.iph;
1159         int err;
1160
1161         dev->destructor = ip_tunnel_dev_free;
1162         dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
1163         if (!dev->tstats)
1164                 return -ENOMEM;
1165
1166         err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
1167         if (err) {
1168                 free_percpu(dev->tstats);
1169                 return err;
1170         }
1171
1172         err = gro_cells_init(&tunnel->gro_cells, dev);
1173         if (err) {
1174                 dst_cache_destroy(&tunnel->dst_cache);
1175                 free_percpu(dev->tstats);
1176                 return err;
1177         }
1178
1179         tunnel->dev = dev;
1180         tunnel->net = dev_net(dev);
1181         strcpy(tunnel->parms.name, dev->name);
1182         iph->version            = 4;
1183         iph->ihl                = 5;
1184
1185         if (tunnel->collect_md)
1186                 netif_keep_dst(dev);
1187         return 0;
1188 }
1189 EXPORT_SYMBOL_GPL(ip_tunnel_init);
1190
/* ndo_uninit handler: unhash the tunnel and drop its cached routes.
 * NOTE(review): the WRITE_ONCE() clearing of itn->fb_tunnel_dev
 * presumably pairs with READ_ONCE() in a lockless lookup path not
 * visible in this chunk -- confirm against ip_tunnel_lookup().
 */
void ip_tunnel_uninit(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn;

	itn = net_generic(net, tunnel->ip_tnl_net_id);
	ip_tunnel_del(itn, netdev_priv(dev));
	/* If the fallback device itself is going away, make sure nobody
	 * can pick up a stale pointer to it.
	 */
	if (itn->fb_tunnel_dev == dev)
		WRITE_ONCE(itn->fb_tunnel_dev, NULL);

	dst_cache_reset(&tunnel->dst_cache);
}
EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
1205
/* Do least required initialization, rest of init is done in tunnel_init call */
void ip_tunnel_setup(struct net_device *dev, int net_id)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	/* Stash the pernet id so later ndo/rtnl callbacks can locate this
	 * type's ip_tunnel_net via net_generic().
	 */
	tunnel->ip_tnl_net_id = net_id;
}
EXPORT_SYMBOL_GPL(ip_tunnel_setup);
1213
1214 MODULE_LICENSE("GPL");