GNU Linux-libre 4.14.332-gnu1
[releases.git] / net / ipv4 / ip_tunnel.c
1 /*
2  * Copyright (c) 2013 Nicira, Inc.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of version 2 of the GNU General Public
6  * License as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful, but
9  * WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public License
14  * along with this program; if not, write to the Free Software
15  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16  * 02110-1301, USA
17  */
18
19 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
20
21 #include <linux/capability.h>
22 #include <linux/module.h>
23 #include <linux/types.h>
24 #include <linux/kernel.h>
25 #include <linux/slab.h>
26 #include <linux/uaccess.h>
27 #include <linux/skbuff.h>
28 #include <linux/netdevice.h>
29 #include <linux/in.h>
30 #include <linux/tcp.h>
31 #include <linux/udp.h>
32 #include <linux/if_arp.h>
33 #include <linux/init.h>
34 #include <linux/in6.h>
35 #include <linux/inetdevice.h>
36 #include <linux/igmp.h>
37 #include <linux/netfilter_ipv4.h>
38 #include <linux/etherdevice.h>
39 #include <linux/if_ether.h>
40 #include <linux/if_vlan.h>
41 #include <linux/rculist.h>
42 #include <linux/err.h>
43
44 #include <net/sock.h>
45 #include <net/ip.h>
46 #include <net/icmp.h>
47 #include <net/protocol.h>
48 #include <net/ip_tunnels.h>
49 #include <net/arp.h>
50 #include <net/checksum.h>
51 #include <net/dsfield.h>
52 #include <net/inet_ecn.h>
53 #include <net/xfrm.h>
54 #include <net/net_namespace.h>
55 #include <net/netns/generic.h>
56 #include <net/rtnetlink.h>
57 #include <net/udp.h>
58 #include <net/dst_metadata.h>
59
60 #if IS_ENABLED(CONFIG_IPV6)
61 #include <net/ipv6.h>
62 #include <net/ip6_fib.h>
63 #include <net/ip6_route.h>
64 #endif
65
66 static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
67 {
68         return hash_32((__force u32)key ^ (__force u32)remote,
69                          IP_TNL_HASH_BITS);
70 }
71
72 static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
73                                 __be16 flags, __be32 key)
74 {
75         if (p->i_flags & TUNNEL_KEY) {
76                 if (flags & TUNNEL_KEY)
77                         return key == p->i_key;
78                 else
79                         /* key expected, none present */
80                         return false;
81         } else
82                 return !(flags & TUNNEL_KEY);
83 }
84
/* Fallback tunnel: no source, no destination, no key, no options

   Tunnel hash table:
   We require exact key match i.e. if a key is present in packet
   it will match only tunnel with the same key; if it is not present,
   it will match only keyless tunnel.

   All keyless packets, if not matching any configured keyless tunnel,
   will match the fallback tunnel.
   Given src, dst and key, find appropriate for input tunnel.
*/
/* ip_tunnel_lookup - find the tunnel device that should receive a packet.
 * @itn:    per-netns tunnel table
 * @link:   ifindex the packet arrived on
 * @flags:  TUNNEL_* flags parsed from the packet header
 * @remote: outer source address (the remote peer, i.e. our daddr config)
 * @local:  outer destination address (our local address)
 * @key:    tunnel key from the packet (meaningful if TUNNEL_KEY in @flags)
 *
 * Walks the hash table with RCU list primitives, so the caller must be
 * in an RCU read-side section.  Tries progressively less specific
 * matches; within each pass a tunnel on a different link than @link is
 * only remembered as a candidate, preferring an exact-link match.
 * Finally falls back to the collect_md tunnel and then the fallback
 * device, if they are administratively up.  Returns NULL on no match.
 */
struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
				   int link, __be16 flags,
				   __be32 remote, __be32 local,
				   __be32 key)
{
	struct ip_tunnel *t, *cand = NULL;
	struct hlist_head *head;
	struct net_device *ndev;
	unsigned int hash;

	hash = ip_tunnel_hash(key, remote);
	head = &itn->tunnels[hash];

	/* Pass 1: exact (local, remote) address match. */
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local != t->parms.iph.saddr ||
		    remote != t->parms.iph.daddr ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else
			cand = t;
	}

	/* Pass 2: remote matches, tunnel has a wildcard local address. */
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (remote != t->parms.iph.daddr ||
		    t->parms.iph.saddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	/* Passes 3 and 4 live in the bucket hashed with remote == 0. */
	hash = ip_tunnel_hash(key, 0);
	head = &itn->tunnels[hash];

	/* Pass 3: tunnel bound to our local address with wildcard remote,
	 * or a multicast tunnel whose group address equals @local.
	 */
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
		    (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
			continue;

		if (!(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	/* Pass 4: fully wildcard tunnels, matched on key only (unless the
	 * caller asked for keyless lookup via TUNNEL_NO_KEY).
	 */
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((!(flags & TUNNEL_NO_KEY) && t->parms.i_key != key) ||
		    t->parms.iph.saddr != 0 ||
		    t->parms.iph.daddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	if (cand)
		return cand;

	/* collect_md (metadata-based) tunnel catches anything left. */
	t = rcu_dereference(itn->collect_md_tun);
	if (t && t->dev->flags & IFF_UP)
		return t;

	/* Last resort: the per-netns fallback device. */
	ndev = READ_ONCE(itn->fb_tunnel_dev);
	if (ndev && ndev->flags & IFF_UP)
		return netdev_priv(ndev);

	return NULL;
}
EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
185 EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
186
187 static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
188                                     struct ip_tunnel_parm *parms)
189 {
190         unsigned int h;
191         __be32 remote;
192         __be32 i_key = parms->i_key;
193
194         if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
195                 remote = parms->iph.daddr;
196         else
197                 remote = 0;
198
199         if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
200                 i_key = 0;
201
202         h = ip_tunnel_hash(i_key, remote);
203         return &itn->tunnels[h];
204 }
205
/* Link tunnel @t into its hash bucket.  For a collect_md tunnel, also
 * publish it as the per-netns collect_md shortcut (with RCU publication
 * semantics) before adding it to the list.
 */
static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	struct hlist_head *head = ip_bucket(itn, &t->parms);

	if (t->collect_md)
		rcu_assign_pointer(itn->collect_md_tun, t);
	hlist_add_head_rcu(&t->hash_node, head);
}
214
/* Unlink tunnel @t from its hash bucket, clearing the collect_md
 * shortcut first if @t was published there.  RCU readers may still
 * hold references until a grace period elapses.
 */
static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	if (t->collect_md)
		rcu_assign_pointer(itn->collect_md_tun, NULL);
	hlist_del_init_rcu(&t->hash_node);
}
221
/* Find a tunnel whose configuration matches @parms exactly: both outer
 * addresses, link, key semantics, and device @type.  Used by management
 * paths (ioctl/netlink), unlike ip_tunnel_lookup() it never falls back
 * to a less specific tunnel.  Returns NULL when no tunnel matches.
 */
static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
					struct ip_tunnel_parm *parms,
					int type)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	__be32 key = parms->i_key;
	__be16 flags = parms->i_flags;
	int link = parms->link;
	struct ip_tunnel *t = NULL;
	struct hlist_head *head = ip_bucket(itn, parms);

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local == t->parms.iph.saddr &&
		    remote == t->parms.iph.daddr &&
		    link == t->parms.link &&
		    type == t->dev->type &&
		    ip_tunnel_key_match(&t->parms, flags, key))
			break;
	}
	return t;
}
244
/* __ip_tunnel_create - allocate and register a tunnel net_device.
 * @net:   namespace the device lives in
 * @ops:   rtnl_link_ops providing priv_size and setup for the device
 * @parms: tunnel parameters, copied into the new tunnel's private data
 *
 * If @parms->name is set it must be a valid netdev name; otherwise the
 * name is built as "<kind>%d" and register_netdevice() fills in the
 * index.  Caller must hold RTNL.  Returns the new device, or an
 * ERR_PTR() (-E2BIG for a bad/too-long name, -ENOMEM, or the
 * register_netdevice() error).
 */
static struct net_device *__ip_tunnel_create(struct net *net,
					     const struct rtnl_link_ops *ops,
					     struct ip_tunnel_parm *parms)
{
	int err;
	struct ip_tunnel *tunnel;
	struct net_device *dev;
	char name[IFNAMSIZ];

	err = -E2BIG;
	if (parms->name[0]) {
		if (!dev_valid_name(parms->name))
			goto failed;
		strlcpy(name, parms->name, IFNAMSIZ);
	} else {
		/* Reserve room for "%d" plus the terminating NUL. */
		if (strlen(ops->kind) > (IFNAMSIZ - 3))
			goto failed;
		strcpy(name, ops->kind);
		strcat(name, "%d");
	}

	ASSERT_RTNL();
	dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
	if (!dev) {
		err = -ENOMEM;
		goto failed;
	}
	dev_net_set(dev, net);

	dev->rtnl_link_ops = ops;

	tunnel = netdev_priv(dev);
	tunnel->parms = *parms;
	tunnel->net = net;

	err = register_netdevice(dev);
	if (err)
		goto failed_free;

	return dev;

failed_free:
	free_netdev(dev);
failed:
	return ERR_PTR(err);
}
291
/* Initialize an IPv4 flow descriptor for tunnel route lookups.  The
 * whole structure is zeroed first so every field not set below
 * (including any padding bytes) reads as zero.
 */
static inline void init_tunnel_flow(struct flowi4 *fl4,
				    int proto,
				    __be32 daddr, __be32 saddr,
				    __be32 key, __u8 tos, int oif,
				    __u32 mark)
{
	memset(fl4, 0, sizeof(*fl4));
	fl4->flowi4_oif = oif;
	fl4->daddr = daddr;
	fl4->saddr = saddr;
	fl4->flowi4_tos = tos;
	fl4->flowi4_proto = proto;
	fl4->fl4_gre_key = key;
	fl4->flowi4_mark = mark;
}
307
/* ip_tunnel_bind_dev - guess the underlying device and compute the MTU.
 * @dev: the tunnel device
 *
 * Routes toward the configured remote (if any) to find the lower
 * device, sets needed_headroom accordingly, and returns the tunnel MTU
 * derived from the lower device's MTU minus the tunnel overhead,
 * clamped to at least IPV4_MIN_MTU.
 */
static int ip_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);

	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and needed_headroom */
	if (iph->daddr) {
		struct flowi4 fl4;
		struct rtable *rt;

		init_tunnel_flow(&fl4, iph->protocol, iph->daddr,
				 iph->saddr, tunnel->parms.o_key,
				 RT_TOS(iph->tos), tunnel->parms.link,
				 tunnel->fwmark);
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (!IS_ERR(rt)) {
			tdev = rt->dst.dev;
			ip_rt_put(rt);
		}
		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;

		/* Parameters may have changed; drop any cached route. */
		dst_cache_reset(&tunnel->dst_cache);
	}

	/* No route found: fall back to the explicitly bound link, if any. */
	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = tdev->mtu;
	}

	dev->needed_headroom = t_hlen + hlen;
	mtu -= (dev->hard_header_len + t_hlen);

	if (mtu < IPV4_MIN_MTU)
		mtu = IPV4_MIN_MTU;

	return mtu;
}
356
/* ip_tunnel_create - create a new tunnel from userspace parameters.
 * @net:   namespace for the new device
 * @itn:   per-netns tunnel table
 * @parms: tunnel configuration
 *
 * Uses the fallback device's rtnl_link_ops as the template, binds the
 * device to compute its MTU and MTU bounds, and inserts it into the
 * hash table.  Returns the new tunnel or an ERR_PTR().
 */
static struct ip_tunnel *ip_tunnel_create(struct net *net,
					  struct ip_tunnel_net *itn,
					  struct ip_tunnel_parm *parms)
{
	struct ip_tunnel *nt;
	struct net_device *dev;
	int t_hlen;

	BUG_ON(!itn->fb_tunnel_dev);
	dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
	if (IS_ERR(dev))
		return ERR_CAST(dev);

	dev->mtu = ip_tunnel_bind_dev(dev);

	nt = netdev_priv(dev);
	t_hlen = nt->hlen + sizeof(struct iphdr);
	dev->min_mtu = ETH_MIN_MTU;
	/* 0xFFF8 is the largest 8-aligned value an IP total length allows. */
	dev->max_mtu = 0xFFF8 - dev->hard_header_len - t_hlen;
	ip_tunnel_add(itn, nt);
	return nt;
}
379
/* ip_tunnel_rcv - common receive path after the outer header is parsed.
 * @tunnel:        tunnel the packet was matched to
 * @skb:           decapsulated packet (outer IP header still reachable)
 * @tpi:           parsed tunnel protocol info (flags, key, seq)
 * @tun_dst:       metadata dst to attach for collect_md tunnels, or NULL
 * @log_ecn_error: whether to ratelimit-log invalid ECN combinations
 *
 * Validates checksum/sequence expectations against the tunnel config,
 * decapsulates ECN, updates stats, and hands the packet to the GRO
 * cell.  Always consumes @skb and returns 0; on validation failure the
 * packet is dropped and error counters are bumped.
 */
int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
		  const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
		  bool log_ecn_error)
{
	struct pcpu_sw_netstats *tstats;
	const struct iphdr *iph = ip_hdr(skb);
	int err;

#ifdef CONFIG_NET_IPGRE_BROADCAST
	if (ipv4_is_multicast(iph->daddr)) {
		tunnel->dev->stats.multicast++;
		skb->pkt_type = PACKET_BROADCAST;
	}
#endif

	/* The packet's checksum flag must agree with the tunnel config:
	 * a checksum present when none is expected (or vice versa) is an
	 * error.
	 */
	if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
	     ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
		tunnel->dev->stats.rx_crc_errors++;
		tunnel->dev->stats.rx_errors++;
		goto drop;
	}

	/* Enforce in-order delivery when the tunnel uses sequencing. */
	if (tunnel->parms.i_flags&TUNNEL_SEQ) {
		if (!(tpi->flags&TUNNEL_SEQ) ||
		    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
			tunnel->dev->stats.rx_fifo_errors++;
			tunnel->dev->stats.rx_errors++;
			goto drop;
		}
		tunnel->i_seqno = ntohl(tpi->seq) + 1;
	}

	skb_reset_network_header(skb);

	/* Fold the outer ECN bits into the inner header; err > 1 means
	 * the combination is invalid and the packet must be dropped.
	 */
	err = IP_ECN_decapsulate(iph, skb);
	if (unlikely(err)) {
		if (log_ecn_error)
			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
					&iph->saddr, iph->tos);
		if (err > 1) {
			++tunnel->dev->stats.rx_frame_errors;
			++tunnel->dev->stats.rx_errors;
			goto drop;
		}
	}

	tstats = this_cpu_ptr(tunnel->dev->tstats);
	u64_stats_update_begin(&tstats->syncp);
	tstats->rx_packets++;
	tstats->rx_bytes += skb->len;
	u64_stats_update_end(&tstats->syncp);

	/* Scrub packet state when it crosses a netns boundary. */
	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));

	if (tunnel->dev->type == ARPHRD_ETHER) {
		skb->protocol = eth_type_trans(skb, tunnel->dev);
		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
	} else {
		skb->dev = tunnel->dev;
	}

	if (tun_dst)
		skb_dst_set(skb, (struct dst_entry *)tun_dst);

	gro_cells_receive(&tunnel->gro_cells, skb);
	return 0;

drop:
	if (tun_dst)
		dst_release((struct dst_entry *)tun_dst);
	kfree_skb(skb);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
454
/* Register encapsulation ops in slot @num of the iptun_encaps table.
 * The slot is claimed atomically with cmpxchg, so registration fails
 * (-1) if the slot is already taken; returns -ERANGE for a bad slot
 * and 0 on success.
 */
int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
			    unsigned int num)
{
	if (num >= MAX_IPTUN_ENCAP_OPS)
		return -ERANGE;

	return !cmpxchg((const struct ip_tunnel_encap_ops **)
			&iptun_encaps[num],
			NULL, ops) ? 0 : -1;
}
EXPORT_SYMBOL(ip_tunnel_encap_add_ops);
466
/* Unregister encapsulation ops from slot @num, but only if that slot
 * still holds @ops (atomic cmpxchg).  Waits for in-flight users via
 * synchronize_net() before returning.  Returns 0 on success, -1 if the
 * slot held different ops, -ERANGE for a bad slot.
 */
int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
			    unsigned int num)
{
	int ret;

	if (num >= MAX_IPTUN_ENCAP_OPS)
		return -ERANGE;

	ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
		       &iptun_encaps[num],
		       ops, NULL) == ops) ? 0 : -1;

	synchronize_net();

	return ret;
}
EXPORT_SYMBOL(ip_tunnel_encap_del_ops);
484
/* ip_tunnel_encap_setup - apply encapsulation settings to tunnel @t.
 * Copies the type/ports/flags from @ipencap and recomputes the cached
 * header lengths (encap_hlen and total hlen).  Returns 0, or the
 * negative error from ip_encap_hlen() if the encap type is invalid.
 */
int ip_tunnel_encap_setup(struct ip_tunnel *t,
			  struct ip_tunnel_encap *ipencap)
{
	int hlen;

	memset(&t->encap, 0, sizeof(t->encap));

	hlen = ip_encap_hlen(ipencap);
	if (hlen < 0)
		return hlen;

	t->encap.type = ipencap->type;
	t->encap.sport = ipencap->sport;
	t->encap.dport = ipencap->dport;
	t->encap.flags = ipencap->flags;

	t->encap_hlen = hlen;
	t->hlen = t->encap_hlen + t->tun_hlen;

	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);
507
/* tnl_update_pmtu - propagate the tunnel path MTU to the inner flow.
 * @dev:       tunnel device
 * @skb:       packet being transmitted
 * @rt:        route toward the tunnel endpoint
 * @df:        outer DF bit (nonzero means don't fragment)
 * @inner_iph: inner IPv4 header (used for DF/fragmentation decisions)
 *
 * Computes the usable MTU, records it on the skb's dst, and - when a
 * too-big, non-GSO packet cannot be fragmented - sends ICMP
 * FRAG_NEEDED (IPv4) or ICMPv6 Packet Too Big and returns -E2BIG.
 * Returns 0 when the packet may proceed.
 */
static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
			    struct rtable *rt, __be16 df,
			    const struct iphdr *inner_iph)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len;
	int mtu;

	if (df)
		mtu = dst_mtu(&rt->dst) - dev->hard_header_len
					- sizeof(struct iphdr) - tunnel->hlen;
	else
		mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;

	skb_dst_update_pmtu_no_confirm(skb, mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		/* Only DF-marked, non-GSO packets trigger FRAG_NEEDED. */
		if (!skb_is_gso(skb) &&
		    (inner_iph->frag_off & htons(IP_DF)) &&
		    mtu < pkt_size) {
			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			return -E2BIG;
		}
	}
#if IS_ENABLED(CONFIG_IPV6)
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);

		/* Lower the cached IPv6 route MTU for host routes (or when
		 * the tunnel endpoint is fixed and not multicast).
		 */
		if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
			   mtu >= IPV6_MIN_MTU) {
			if ((tunnel->parms.iph.daddr &&
			    !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
			}
		}

		if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
					mtu < pkt_size) {
			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			return -E2BIG;
		}
	}
#endif
	return 0;
}
556
/* ip_md_tunnel_xmit - transmit for metadata-based (collect_md) tunnels.
 * @dev:   tunnel device
 * @skb:   packet to encapsulate; all addressing comes from the skb's
 *         attached tunnel metadata rather than the device config
 * @proto: outer IP protocol number
 *
 * Routes to the metadata destination, derives tos/ttl/df (value 1 for
 * tos and 0 for ttl mean "inherit from the inner header"), grows
 * headroom if needed, and emits the packet.  Consumes @skb; errors are
 * counted on the device stats and the packet is freed.
 */
void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, u8 proto)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	u32 headroom = sizeof(struct iphdr);
	struct ip_tunnel_info *tun_info;
	const struct ip_tunnel_key *key;
	const struct iphdr *inner_iph;
	struct rtable *rt;
	struct flowi4 fl4;
	__be16 df = 0;
	u8 tos, ttl;

	tun_info = skb_tunnel_info(skb);
	if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
		     ip_tunnel_info_af(tun_info) != AF_INET))
		goto tx_error;
	key = &tun_info->key;
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	tos = key->tos;
	if (tos == 1) {
		/* tos == 1 means inherit the inner header's DSCP/TOS. */
		if (skb->protocol == htons(ETH_P_IP))
			tos = inner_iph->tos;
		else if (skb->protocol == htons(ETH_P_IPV6))
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
	}
	init_tunnel_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src, 0,
			 RT_TOS(tos), tunnel->parms.link, tunnel->fwmark);
	/* Extra encapsulation layers are not supported on this path. */
	if (tunnel->encap.type != TUNNEL_ENCAP_NONE)
		goto tx_error;
	rt = ip_route_output_key(tunnel->net, &fl4);
	if (IS_ERR(rt)) {
		dev->stats.tx_carrier_errors++;
		goto tx_error;
	}
	/* Routing back through ourselves would loop forever. */
	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}
	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = key->ttl;
	if (ttl == 0) {
		/* ttl == 0 means inherit, falling back to the route's
		 * default hop limit.
		 */
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}
	if (key->tun_flags & TUNNEL_DONT_FRAGMENT)
		df = htons(IP_DF);
	else if (skb->protocol == htons(ETH_P_IP))
		df = inner_iph->frag_off & htons(IP_DF);
	headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
	if (headroom > READ_ONCE(dev->needed_headroom))
		WRITE_ONCE(dev->needed_headroom, headroom);

	if (skb_cow_head(skb, READ_ONCE(dev->needed_headroom))) {
		ip_rt_put(rt);
		goto tx_dropped;
	}
	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl,
		      df, !net_eq(tunnel->net, dev_net(dev)));
	return;
tx_error:
	dev->stats.tx_errors++;
	goto kfree;
tx_dropped:
	dev->stats.tx_dropped++;
kfree:
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit);
631
/* ip_tunnel_xmit - main transmit path for configured IPv4 tunnels.
 * @dev:        tunnel device
 * @skb:        packet to encapsulate
 * @tnl_params: outer IP header template (tunnel configuration)
 * @protocol:   outer IP protocol number
 *
 * Resolves the destination (including NBMA tunnels with no configured
 * daddr), applies optional encapsulation, routes (using the per-tunnel
 * dst cache when the flow is "connected", i.e. fully determined by the
 * config), enforces PMTU, and emits the encapsulated packet.  Consumes
 * @skb; failures bump device stats and free the packet.
 */
void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
		    const struct iphdr *tnl_params, u8 protocol)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	unsigned int inner_nhdr_len = 0;
	const struct iphdr *inner_iph;
	struct flowi4 fl4;
	u8     tos, ttl;
	__be16 df;
	struct rtable *rt;		/* Route to the other host */
	unsigned int max_headroom;	/* The extra header space needed */
	__be32 dst;
	bool connected;

	/* ensure we can access the inner net header, for several users below */
	if (skb->protocol == htons(ETH_P_IP))
		inner_nhdr_len = sizeof(struct iphdr);
	else if (skb->protocol == htons(ETH_P_IPV6))
		inner_nhdr_len = sizeof(struct ipv6hdr);
	if (unlikely(!pskb_may_pull(skb, inner_nhdr_len)))
		goto tx_error;

	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	connected = (tunnel->parms.iph.daddr != 0);

	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));

	dst = tnl_params->daddr;
	if (dst == 0) {
		/* NBMA tunnel: no configured destination, so derive it from
		 * tunnel metadata, the inner route's next hop (IPv4), or
		 * the neighbour entry (IPv4-compatible IPv6 addresses).
		 */
		struct ip_tunnel_info *tun_info;

		if (!skb_dst(skb)) {
			dev->stats.tx_fifo_errors++;
			goto tx_error;
		}

		tun_info = skb_tunnel_info(skb);
		if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX) &&
		    ip_tunnel_info_af(tun_info) == AF_INET &&
		    tun_info->key.u.ipv4.dst)
			dst = tun_info->key.u.ipv4.dst;
		else if (skb->protocol == htons(ETH_P_IP)) {
			rt = skb_rtable(skb);
			dst = rt_nexthop(rt, inner_iph->daddr);
		}
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6)) {
			const struct in6_addr *addr6;
			struct neighbour *neigh;
			bool do_tx_error_icmp;
			int addr_type;

			neigh = dst_neigh_lookup(skb_dst(skb),
						 &ipv6_hdr(skb)->daddr);
			if (!neigh)
				goto tx_error;

			addr6 = (const struct in6_addr *)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			/* Only IPv4-compatible IPv6 addresses embed a usable
			 * IPv4 destination in their last 32 bits.
			 */
			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				do_tx_error_icmp = true;
			else {
				do_tx_error_icmp = false;
				dst = addr6->s6_addr32[3];
			}
			neigh_release(neigh);
			if (do_tx_error_icmp)
				goto tx_error_icmp;
		}
#endif
		else
			goto tx_error;

		/* Destination is per-packet: the dst cache can't be used. */
		connected = false;
	}

	tos = tnl_params->tos;
	if (tos & 0x1) {
		/* Low bit set means "inherit tos from the inner packet",
		 * which also makes the route per-packet.
		 */
		tos &= ~0x1;
		if (skb->protocol == htons(ETH_P_IP)) {
			tos = inner_iph->tos;
			connected = false;
		} else if (skb->protocol == htons(ETH_P_IPV6)) {
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
			connected = false;
		}
	}

	init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr,
			 tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link,
			 tunnel->fwmark);

	if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
		goto tx_error;

	/* Connected tunnels may reuse the cached route. */
	rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache, &fl4.saddr) :
			 NULL;

	if (!rt) {
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (IS_ERR(rt)) {
			dev->stats.tx_carrier_errors++;
			goto tx_error;
		}
		if (connected)
			dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
					  fl4.saddr);
	}

	/* Routing back through ourselves would loop forever. */
	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}

	df = tnl_params->frag_off;
	if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
		df |= (inner_iph->frag_off & htons(IP_DF));

	if (tnl_update_pmtu(dev, skb, rt, df, inner_iph)) {
		ip_rt_put(rt);
		goto tx_error;
	}

	/* After an ICMP error from the path, report link failure to the
	 * sender for a while (err_count decays over IPTUNNEL_ERR_TIMEO).
	 */
	if (tunnel->err_count > 0) {
		if (time_before(jiffies,
				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
			tunnel->err_count--;

			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = tnl_params->ttl;
	if (ttl == 0) {
		/* ttl == 0 means inherit, falling back to the route's
		 * default hop limit.
		 */
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
#endif
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}

	max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
			+ rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
	if (max_headroom > READ_ONCE(dev->needed_headroom))
		WRITE_ONCE(dev->needed_headroom, max_headroom);

	if (skb_cow_head(skb, READ_ONCE(dev->needed_headroom))) {
		ip_rt_put(rt);
		dev->stats.tx_dropped++;
		kfree_skb(skb);
		return;
	}

	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
		      df, !net_eq(tunnel->net, dev_net(dev)));
	return;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
	dst_link_failure(skb);
#endif
tx_error:
	dev->stats.tx_errors++;
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
812
/* ip_tunnel_update - apply new parameters @p to an existing tunnel @t.
 * The tunnel is removed from and re-added to the hash table because the
 * addresses/keys being changed determine its bucket.  Rebinds the
 * device (and optionally updates its MTU) when the link or fwmark
 * changed, then invalidates the cached route and notifies userspace.
 */
static void ip_tunnel_update(struct ip_tunnel_net *itn,
			     struct ip_tunnel *t,
			     struct net_device *dev,
			     struct ip_tunnel_parm *p,
			     bool set_mtu,
			     __u32 fwmark)
{
	ip_tunnel_del(itn, t);
	t->parms.iph.saddr = p->iph.saddr;
	t->parms.iph.daddr = p->iph.daddr;
	t->parms.i_key = p->i_key;
	t->parms.o_key = p->o_key;
	if (dev->type != ARPHRD_ETHER) {
		/* Point-to-point devices mirror the tunnel endpoints in
		 * their link-layer addresses.
		 */
		memcpy(dev->dev_addr, &p->iph.saddr, 4);
		memcpy(dev->broadcast, &p->iph.daddr, 4);
	}
	ip_tunnel_add(itn, t);

	t->parms.iph.ttl = p->iph.ttl;
	t->parms.iph.tos = p->iph.tos;
	t->parms.iph.frag_off = p->iph.frag_off;

	if (t->parms.link != p->link || t->fwmark != fwmark) {
		int mtu;

		t->parms.link = p->link;
		t->fwmark = fwmark;
		mtu = ip_tunnel_bind_dev(dev);
		if (set_mtu)
			dev->mtu = mtu;
	}
	dst_cache_reset(&t->dst_cache);
	netdev_state_change(dev);
}
847
/* Legacy ioctl-based tunnel configuration (SIOC{GET,ADD,CHG,DEL}TUNNEL).
 * The caller has already copied *p in from user space and copies it back
 * out after SIOCGETTUNNEL.  Returns 0 or a negative errno.
 */
int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
{
	int err = 0;
	struct ip_tunnel *t = netdev_priv(dev);
	struct net *net = t->net;
	struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);

	BUG_ON(!itn->fb_tunnel_dev);
	switch (cmd) {
	case SIOCGETTUNNEL:
		/* On the fallback device the request selects a tunnel by
		 * the parameters in *p; fall back to the fb device itself
		 * when nothing matches.
		 */
		if (dev == itn->fb_tunnel_dev) {
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				t = netdev_priv(dev);
		}
		memcpy(p, &t->parms, sizeof(*p));
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;
		/* A fixed outer TTL forces DF on the outer header */
		if (p->iph.ttl)
			p->iph.frag_off |= htons(IP_DF);
		/* Non-VTI tunnels: drop keys the user did not flag */
		if (!(p->i_flags & VTI_ISVTI)) {
			if (!(p->i_flags & TUNNEL_KEY))
				p->i_key = 0;
			if (!(p->o_flags & TUNNEL_KEY))
				p->o_key = 0;
		}

		t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);

		if (cmd == SIOCADDTUNNEL) {
			if (!t) {
				t = ip_tunnel_create(net, itn, p);
				err = PTR_ERR_OR_ZERO(t);
				break;
			}

			/* ADD of a tunnel that already exists */
			err = -EEXIST;
			break;
		}
		if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t) {
				/* New parameters collide with another device */
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				/* Changing the destination may not toggle the
				 * broadcast/point-to-point nature of the device.
				 */
				unsigned int nflags = 0;

				if (ipv4_is_multicast(p->iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p->iph.daddr)
					nflags = IFF_POINTOPOINT;

				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}

				t = netdev_priv(dev);
			}
		}

		if (t) {
			err = 0;
			ip_tunnel_update(itn, t, dev, p, true, 0);
		} else {
			err = -ENOENT;
		}
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;

		if (dev == itn->fb_tunnel_dev) {
			/* On the fb device, *p names the tunnel to delete;
			 * the fb device itself may never be removed this way.
			 */
			err = -ENOENT;
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				goto done;
			err = -EPERM;
			if (t == netdev_priv(itn->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
950
951 int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
952 {
953         struct ip_tunnel *tunnel = netdev_priv(dev);
954         int t_hlen = tunnel->hlen + sizeof(struct iphdr);
955         int max_mtu = 0xFFF8 - dev->hard_header_len - t_hlen;
956
957         if (new_mtu < ETH_MIN_MTU)
958                 return -EINVAL;
959
960         if (new_mtu > max_mtu) {
961                 if (strict)
962                         return -EINVAL;
963
964                 new_mtu = max_mtu;
965         }
966
967         dev->mtu = new_mtu;
968         return 0;
969 }
970 EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);
971
/* ndo_change_mtu for IPv4 tunnels: strict variant of
 * __ip_tunnel_change_mtu(), i.e. out-of-range values are rejected
 * with -EINVAL rather than clamped.
 */
int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
{
	return __ip_tunnel_change_mtu(dev, new_mtu, true);
}
EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
977
978 static void ip_tunnel_dev_free(struct net_device *dev)
979 {
980         struct ip_tunnel *tunnel = netdev_priv(dev);
981
982         gro_cells_destroy(&tunnel->gro_cells);
983         dst_cache_destroy(&tunnel->dst_cache);
984         free_percpu(dev->tstats);
985 }
986
987 void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
988 {
989         struct ip_tunnel *tunnel = netdev_priv(dev);
990         struct ip_tunnel_net *itn;
991
992         itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);
993
994         if (itn->fb_tunnel_dev != dev) {
995                 ip_tunnel_del(itn, netdev_priv(dev));
996                 unregister_netdevice_queue(dev, head);
997         }
998 }
999 EXPORT_SYMBOL_GPL(ip_tunnel_dellink);
1000
1001 struct net *ip_tunnel_get_link_net(const struct net_device *dev)
1002 {
1003         struct ip_tunnel *tunnel = netdev_priv(dev);
1004
1005         return tunnel->net;
1006 }
1007 EXPORT_SYMBOL(ip_tunnel_get_link_net);
1008
1009 int ip_tunnel_get_iflink(const struct net_device *dev)
1010 {
1011         struct ip_tunnel *tunnel = netdev_priv(dev);
1012
1013         return tunnel->parms.link;
1014 }
1015 EXPORT_SYMBOL(ip_tunnel_get_iflink);
1016
/* Per-netns init shared by tunnel modules: initialize the hash table
 * and, when @ops is given, create this netns's fallback ("fb") device
 * named @devname.  Returns 0 or the errno from device creation.
 */
int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
				  struct rtnl_link_ops *ops, char *devname)
{
	struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
	struct ip_tunnel_parm parms;
	unsigned int i;

	for (i = 0; i < IP_TNL_HASH_SIZE; i++)
		INIT_HLIST_HEAD(&itn->tunnels[i]);

	/* No rtnl_link_ops: the caller manages devices itself, so no
	 * fallback device is created for this netns.
	 */
	if (!ops) {
		itn->fb_tunnel_dev = NULL;
		return 0;
	}

	memset(&parms, 0, sizeof(parms));
	if (devname)
		strlcpy(parms.name, devname, IFNAMSIZ);

	rtnl_lock();
	itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
	/* FB netdevice is special: we have one, and only one per netns.
	 * Allowing to move it to another netns is clearly unsafe.
	 */
	if (!IS_ERR(itn->fb_tunnel_dev)) {
		itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
		itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
		ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
	}
	rtnl_unlock();

	return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
}
EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
1051
/* Queue every tunnel device belonging to @itn/@ops onto @head for a
 * batched unregister.  Called under RTNL (see ip_tunnel_delete_net()).
 */
static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head,
			      struct rtnl_link_ops *ops)
{
	struct net *net = dev_net(itn->fb_tunnel_dev);
	struct net_device *dev, *aux;
	int h;

	/* First pass: every netdev in this netns created through @ops
	 * (catches the fb device, which is not necessarily hashed).
	 */
	for_each_netdev_safe(net, dev, aux)
		if (dev->rtnl_link_ops == ops)
			unregister_netdevice_queue(dev, head);

	/* Second pass: hashed tunnels whose device moved to another netns */
	for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
		struct ip_tunnel *t;
		struct hlist_node *n;
		struct hlist_head *thead = &itn->tunnels[h];

		hlist_for_each_entry_safe(t, n, thead, hash_node)
			/* If dev is in the same netns, it has already
			 * been added to the list by the previous loop.
			 */
			if (!net_eq(dev_net(t->dev), net))
				unregister_netdevice_queue(t->dev, head);
	}
}
1076
1077 void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops)
1078 {
1079         LIST_HEAD(list);
1080
1081         rtnl_lock();
1082         ip_tunnel_destroy(itn, &list, ops);
1083         unregister_netdevice_many(&list);
1084         rtnl_unlock();
1085 }
1086 EXPORT_SYMBOL_GPL(ip_tunnel_delete_net);
1087
/* rtnl newlink back-half shared by tunnel modules: reject duplicates,
 * register the device, fix up MAC/MTU and insert the tunnel into the
 * per-netns hash table.  Returns 0 or the register_netdevice() errno.
 */
int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
		      struct ip_tunnel_parm *p, __u32 fwmark)
{
	struct ip_tunnel *nt;
	struct net *net = dev_net(dev);
	struct ip_tunnel_net *itn;
	int mtu;
	int err;

	nt = netdev_priv(dev);
	itn = net_generic(net, nt->ip_tnl_net_id);

	if (nt->collect_md) {
		/* At most one metadata-collecting tunnel per netns */
		if (rtnl_dereference(itn->collect_md_tun))
			return -EEXIST;
	} else {
		if (ip_tunnel_find(itn, p, dev->type))
			return -EEXIST;
	}

	nt->net = net;
	nt->parms = *p;
	nt->fwmark = fwmark;
	err = register_netdevice(dev);
	if (err)
		goto out;

	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
		eth_hw_addr_random(dev);

	mtu = ip_tunnel_bind_dev(dev);
	if (tb[IFLA_MTU]) {
		/* Keep the user-requested MTU (presumably already stored
		 * in dev->mtu by the caller — verify against rtnl core)
		 * but clamp it to the same upper bound that
		 * __ip_tunnel_change_mtu() enforces.
		 */
		unsigned int max = 0xfff8 - dev->hard_header_len - nt->hlen;

		dev->mtu = clamp(dev->mtu, (unsigned int)ETH_MIN_MTU,
				 (unsigned int)(max - sizeof(struct iphdr)));
	} else {
		dev->mtu = mtu;
	}

	ip_tunnel_add(itn, nt);
out:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
1133
/* rtnl changelink back-half: check that the new parameters do not
 * collide with another tunnel, then apply them via ip_tunnel_update().
 * The fallback device cannot be reconfigured this way.
 */
int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
			 struct ip_tunnel_parm *p, __u32 fwmark)
{
	struct ip_tunnel *t;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);

	if (dev == itn->fb_tunnel_dev)
		return -EINVAL;

	t = ip_tunnel_find(itn, p, dev->type);

	if (t) {
		/* Parameters already in use by a different device */
		if (t->dev != dev)
			return -EEXIST;
	} else {
		t = tunnel;

		if (dev->type != ARPHRD_ETHER) {
			/* Same rule as ip_tunnel_ioctl(): the new
			 * destination may not change the broadcast/
			 * point-to-point nature of a non-Ethernet device.
			 */
			unsigned int nflags = 0;

			if (ipv4_is_multicast(p->iph.daddr))
				nflags = IFF_BROADCAST;
			else if (p->iph.daddr)
				nflags = IFF_POINTOPOINT;

			if ((dev->flags ^ nflags) &
			    (IFF_POINTOPOINT | IFF_BROADCAST))
				return -EINVAL;
		}
	}

	/* Only let the update resize the device if no MTU was pinned */
	ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU], fwmark);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_changelink);
1171
/* ndo_init shared by IPv4 tunnels: allocate per-cpu stats, the dst
 * cache and GRO cells, and prime the outer IPv4 header template.
 * On failure everything allocated so far is released and an errno
 * (-ENOMEM or the sub-init's error) is returned.
 */
int ip_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;
	int err;

	/* netdev freed by the core; priv resources by ip_tunnel_dev_free() */
	dev->needs_free_netdev = true;
	dev->priv_destructor = ip_tunnel_dev_free;
	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
	if (!dev->tstats)
		return -ENOMEM;

	err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
	if (err) {
		free_percpu(dev->tstats);
		return err;
	}

	err = gro_cells_init(&tunnel->gro_cells, dev);
	if (err) {
		dst_cache_destroy(&tunnel->dst_cache);
		free_percpu(dev->tstats);
		return err;
	}

	tunnel->dev = dev;
	tunnel->net = dev_net(dev);
	strcpy(tunnel->parms.name, dev->name);
	/* Template for the outer header: IPv4, no options (ihl = 5) */
	iph->version		= 4;
	iph->ihl		= 5;

	if (tunnel->collect_md)
		netif_keep_dst(dev);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_init);
1208
/* ndo_uninit: remove the tunnel from the per-netns hash table and, if
 * this was the netns's fallback device, clear the fb pointer (the
 * WRITE_ONCE presumably pairs with lockless READ_ONCE readers — check
 * users of fb_tunnel_dev).  Finally drop any cached route.
 */
void ip_tunnel_uninit(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn;

	itn = net_generic(net, tunnel->ip_tnl_net_id);
	ip_tunnel_del(itn, netdev_priv(dev));
	if (itn->fb_tunnel_dev == dev)
		WRITE_ONCE(itn->fb_tunnel_dev, NULL);

	dst_cache_reset(&tunnel->dst_cache);
}
EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
1223
1224 /* Do least required initialization, rest of init is done in tunnel_init call */
1225 void ip_tunnel_setup(struct net_device *dev, unsigned int net_id)
1226 {
1227         struct ip_tunnel *tunnel = netdev_priv(dev);
1228         tunnel->ip_tnl_net_id = net_id;
1229 }
1230 EXPORT_SYMBOL_GPL(ip_tunnel_setup);
1231
1232 MODULE_LICENSE("GPL");