GNU Linux-libre 5.4.257-gnu1: net/ipv4/ip_tunnel.c
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2013 Nicira, Inc.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/rculist.h>
#include <linux/err.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ip_tunnels.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/udp.h>
#include <net/dst_metadata.h>

#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#endif

static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
{
        return hash_32((__force u32)key ^ (__force u32)remote,
                       IP_TNL_HASH_BITS);
}

static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
                                __be16 flags, __be32 key)
{
        if (p->i_flags & TUNNEL_KEY) {
                if (flags & TUNNEL_KEY)
                        return key == p->i_key;
                else
                        /* key expected, none present */
                        return false;
        } else
                return !(flags & TUNNEL_KEY);
}

/* Fallback tunnel: no source, no destination, no key, no options

   Tunnel hash table:
   We require an exact key match, i.e. if a key is present in the packet
   it will match only a tunnel with the same key; if it is not present,
   it will match only a keyless tunnel.

   All keyless packets, if not matched to a configured keyless tunnel,
   will match the fallback tunnel.
   Given src, dst and key, find the appropriate tunnel for input.
*/
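/* The lookup below proceeds from most to least specific: (1) exact
 * saddr/daddr match, (2) daddr only, (3) saddr only or a multicast
 * daddr, (4) key-only wildcard tunnels.  A match on the wrong link is
 * kept as a candidate rather than returned immediately, and the
 * collect_md tunnel or the fallback device is used as a last resort.
 */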
struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
                                   int link, __be16 flags,
                                   __be32 remote, __be32 local,
                                   __be32 key)
{
        struct ip_tunnel *t, *cand = NULL;
        struct hlist_head *head;
        struct net_device *ndev;
        unsigned int hash;

        hash = ip_tunnel_hash(key, remote);
        head = &itn->tunnels[hash];

        hlist_for_each_entry_rcu(t, head, hash_node) {
                if (local != t->parms.iph.saddr ||
                    remote != t->parms.iph.daddr ||
                    !(t->dev->flags & IFF_UP))
                        continue;

                if (!ip_tunnel_key_match(&t->parms, flags, key))
                        continue;

                if (t->parms.link == link)
                        return t;
                else
                        cand = t;
        }

        hlist_for_each_entry_rcu(t, head, hash_node) {
                if (remote != t->parms.iph.daddr ||
                    t->parms.iph.saddr != 0 ||
                    !(t->dev->flags & IFF_UP))
                        continue;

                if (!ip_tunnel_key_match(&t->parms, flags, key))
                        continue;

                if (t->parms.link == link)
                        return t;
                else if (!cand)
                        cand = t;
        }

        hash = ip_tunnel_hash(key, 0);
        head = &itn->tunnels[hash];

        hlist_for_each_entry_rcu(t, head, hash_node) {
                if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
                    (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
                        continue;

                if (!(t->dev->flags & IFF_UP))
                        continue;

                if (!ip_tunnel_key_match(&t->parms, flags, key))
                        continue;

                if (t->parms.link == link)
                        return t;
                else if (!cand)
                        cand = t;
        }

        hlist_for_each_entry_rcu(t, head, hash_node) {
                if ((!(flags & TUNNEL_NO_KEY) && t->parms.i_key != key) ||
                    t->parms.iph.saddr != 0 ||
                    t->parms.iph.daddr != 0 ||
                    !(t->dev->flags & IFF_UP))
                        continue;

                if (t->parms.link == link)
                        return t;
                else if (!cand)
                        cand = t;
        }

        if (cand)
                return cand;

        t = rcu_dereference(itn->collect_md_tun);
        if (t && t->dev->flags & IFF_UP)
                return t;

        ndev = READ_ONCE(itn->fb_tunnel_dev);
        if (ndev && ndev->flags & IFF_UP)
                return netdev_priv(ndev);

        return NULL;
}
EXPORT_SYMBOL_GPL(ip_tunnel_lookup);

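/* Pick the hash bucket for a tunnel's parms: the hash covers i_key and
 * a unicast daddr; multicast or unset destinations hash with remote == 0,
 * and VTI tunnels without TUNNEL_KEY ignore i_key.
 */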
static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
                                    struct ip_tunnel_parm *parms)
{
        unsigned int h;
        __be32 remote;
        __be32 i_key = parms->i_key;

        if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
                remote = parms->iph.daddr;
        else
                remote = 0;

        if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
                i_key = 0;

        h = ip_tunnel_hash(i_key, remote);
        return &itn->tunnels[h];
}

static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
        struct hlist_head *head = ip_bucket(itn, &t->parms);

        if (t->collect_md)
                rcu_assign_pointer(itn->collect_md_tun, t);
        hlist_add_head_rcu(&t->hash_node, head);
}

static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
        if (t->collect_md)
                rcu_assign_pointer(itn->collect_md_tun, NULL);
        hlist_del_init_rcu(&t->hash_node);
}

static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
                                        struct ip_tunnel_parm *parms,
                                        int type)
{
        __be32 remote = parms->iph.daddr;
        __be32 local = parms->iph.saddr;
        __be32 key = parms->i_key;
        __be16 flags = parms->i_flags;
        int link = parms->link;
        struct ip_tunnel *t = NULL;
        struct hlist_head *head = ip_bucket(itn, parms);

        hlist_for_each_entry_rcu(t, head, hash_node) {
                if (local == t->parms.iph.saddr &&
                    remote == t->parms.iph.daddr &&
                    link == t->parms.link &&
                    type == t->dev->type &&
                    ip_tunnel_key_match(&t->parms, flags, key))
                        break;
        }
        return t;
}

static struct net_device *__ip_tunnel_create(struct net *net,
                                             const struct rtnl_link_ops *ops,
                                             struct ip_tunnel_parm *parms)
{
        int err;
        struct ip_tunnel *tunnel;
        struct net_device *dev;
        char name[IFNAMSIZ];

        err = -E2BIG;
        if (parms->name[0]) {
                if (!dev_valid_name(parms->name))
                        goto failed;
                strlcpy(name, parms->name, IFNAMSIZ);
        } else {
                if (strlen(ops->kind) > (IFNAMSIZ - 3))
                        goto failed;
                strcpy(name, ops->kind);
                strcat(name, "%d");
        }

        ASSERT_RTNL();
        dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
        if (!dev) {
                err = -ENOMEM;
                goto failed;
        }
        dev_net_set(dev, net);

        dev->rtnl_link_ops = ops;

        tunnel = netdev_priv(dev);
        tunnel->parms = *parms;
        tunnel->net = net;

        err = register_netdevice(dev);
        if (err)
                goto failed_free;

        return dev;

failed_free:
        free_netdev(dev);
failed:
        return ERR_PTR(err);
}

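/* Probe the route to the tunnel destination to guess the underlying
 * device, then derive needed_headroom and a suggested MTU from that
 * device.  Returns the MTU to use for this tunnel device.
 */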
static int ip_tunnel_bind_dev(struct net_device *dev)
{
        struct net_device *tdev = NULL;
        struct ip_tunnel *tunnel = netdev_priv(dev);
        const struct iphdr *iph;
        int hlen = LL_MAX_HEADER;
        int mtu = ETH_DATA_LEN;
        int t_hlen = tunnel->hlen + sizeof(struct iphdr);

        iph = &tunnel->parms.iph;

        /* Guess output device to choose reasonable mtu and needed_headroom */
        if (iph->daddr) {
                struct flowi4 fl4;
                struct rtable *rt;

                ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr,
                                    iph->saddr, tunnel->parms.o_key,
                                    RT_TOS(iph->tos), tunnel->parms.link,
                                    tunnel->fwmark, 0);
                rt = ip_route_output_key(tunnel->net, &fl4);

                if (!IS_ERR(rt)) {
                        tdev = rt->dst.dev;
                        ip_rt_put(rt);
                }
                if (dev->type != ARPHRD_ETHER)
                        dev->flags |= IFF_POINTOPOINT;

                dst_cache_reset(&tunnel->dst_cache);
        }

        if (!tdev && tunnel->parms.link)
                tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);

        if (tdev) {
                hlen = tdev->hard_header_len + tdev->needed_headroom;
                mtu = min(tdev->mtu, IP_MAX_MTU);
        }

        dev->needed_headroom = t_hlen + hlen;
        mtu -= t_hlen + (dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0);

        if (mtu < IPV4_MIN_MTU)
                mtu = IPV4_MIN_MTU;

        return mtu;
}

static struct ip_tunnel *ip_tunnel_create(struct net *net,
                                          struct ip_tunnel_net *itn,
                                          struct ip_tunnel_parm *parms)
{
        struct ip_tunnel *nt;
        struct net_device *dev;
        int t_hlen;
        int mtu;
        int err;

        dev = __ip_tunnel_create(net, itn->rtnl_link_ops, parms);
        if (IS_ERR(dev))
                return ERR_CAST(dev);

        mtu = ip_tunnel_bind_dev(dev);
        err = dev_set_mtu(dev, mtu);
        if (err)
                goto err_dev_set_mtu;

        nt = netdev_priv(dev);
        t_hlen = nt->hlen + sizeof(struct iphdr);
        dev->min_mtu = ETH_MIN_MTU;
        dev->max_mtu = IP_MAX_MTU - t_hlen;
        if (dev->type == ARPHRD_ETHER)
                dev->max_mtu -= dev->hard_header_len;

        ip_tunnel_add(itn, nt);
        return nt;

err_dev_set_mtu:
        unregister_netdevice(dev);
        return ERR_PTR(err);
}

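/* Common receive path: check the received csum/seq flags against the
 * tunnel's configuration, decapsulate ECN, update stats, scrub the skb
 * when crossing netns, and hand the inner packet to GRO.
 */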
int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
                  const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
                  bool log_ecn_error)
{
        struct pcpu_sw_netstats *tstats;
        const struct iphdr *iph = ip_hdr(skb);
        int err;

#ifdef CONFIG_NET_IPGRE_BROADCAST
        if (ipv4_is_multicast(iph->daddr)) {
                tunnel->dev->stats.multicast++;
                skb->pkt_type = PACKET_BROADCAST;
        }
#endif

        if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
             ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
                tunnel->dev->stats.rx_crc_errors++;
                tunnel->dev->stats.rx_errors++;
                goto drop;
        }

        if (tunnel->parms.i_flags&TUNNEL_SEQ) {
                if (!(tpi->flags&TUNNEL_SEQ) ||
                    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
                        tunnel->dev->stats.rx_fifo_errors++;
                        tunnel->dev->stats.rx_errors++;
                        goto drop;
                }
                tunnel->i_seqno = ntohl(tpi->seq) + 1;
        }

        skb_set_network_header(skb, (tunnel->dev->type == ARPHRD_ETHER) ? ETH_HLEN : 0);

        err = IP_ECN_decapsulate(iph, skb);
        if (unlikely(err)) {
                if (log_ecn_error)
                        net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
                                        &iph->saddr, iph->tos);
                if (err > 1) {
                        ++tunnel->dev->stats.rx_frame_errors;
                        ++tunnel->dev->stats.rx_errors;
                        goto drop;
                }
        }

        tstats = this_cpu_ptr(tunnel->dev->tstats);
        u64_stats_update_begin(&tstats->syncp);
        tstats->rx_packets++;
        tstats->rx_bytes += skb->len;
        u64_stats_update_end(&tstats->syncp);

        skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));

        if (tunnel->dev->type == ARPHRD_ETHER) {
                skb->protocol = eth_type_trans(skb, tunnel->dev);
                skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
        } else {
                skb->dev = tunnel->dev;
        }

        if (tun_dst)
                skb_dst_set(skb, (struct dst_entry *)tun_dst);

        gro_cells_receive(&tunnel->gro_cells, skb);
        return 0;

drop:
        if (tun_dst)
                dst_release((struct dst_entry *)tun_dst);
        kfree_skb(skb);
        return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_rcv);

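/* iptun_encaps[] is a small registry of encapsulation ops (e.g. FOU/GUE),
 * slotted by encap type and updated locklessly with cmpxchg.
 */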
int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
                            unsigned int num)
{
        if (num >= MAX_IPTUN_ENCAP_OPS)
                return -ERANGE;

        return !cmpxchg((const struct ip_tunnel_encap_ops **)
                        &iptun_encaps[num],
                        NULL, ops) ? 0 : -1;
}
EXPORT_SYMBOL(ip_tunnel_encap_add_ops);

int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
                            unsigned int num)
{
        int ret;

        if (num >= MAX_IPTUN_ENCAP_OPS)
                return -ERANGE;

        ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
                       &iptun_encaps[num],
                       ops, NULL) == ops) ? 0 : -1;

        synchronize_net();

        return ret;
}
EXPORT_SYMBOL(ip_tunnel_encap_del_ops);

int ip_tunnel_encap_setup(struct ip_tunnel *t,
                          struct ip_tunnel_encap *ipencap)
{
        int hlen;

        memset(&t->encap, 0, sizeof(t->encap));

        hlen = ip_encap_hlen(ipencap);
        if (hlen < 0)
                return hlen;

        t->encap.type = ipencap->type;
        t->encap.sport = ipencap->sport;
        t->encap.dport = ipencap->dport;
        t->encap.flags = ipencap->flags;

        t->encap_hlen = hlen;
        t->hlen = t->encap_hlen + t->tun_hlen;

        return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);

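/* Check the packet against the path MTU of the outer route, update the
 * inner dst's PMTU, and send ICMP "fragmentation needed" (IPv4 with DF
 * set) or ICMPv6 "packet too big" back when the packet cannot fit.
 */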
static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
                            struct rtable *rt, __be16 df,
                            const struct iphdr *inner_iph,
                            int tunnel_hlen, __be32 dst, bool md)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        int pkt_size;
        int mtu;

        tunnel_hlen = md ? tunnel_hlen : tunnel->hlen;
        pkt_size = skb->len - tunnel_hlen;
        pkt_size -= dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0;

        if (df) {
                mtu = dst_mtu(&rt->dst) - (sizeof(struct iphdr) + tunnel_hlen);
                mtu -= dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0;
        } else {
                mtu = skb_valid_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
        }

        if (skb_valid_dst(skb))
                skb_dst_update_pmtu_no_confirm(skb, mtu);

        if (skb->protocol == htons(ETH_P_IP)) {
                if (!skb_is_gso(skb) &&
                    (inner_iph->frag_off & htons(IP_DF)) &&
                    mtu < pkt_size) {
                        memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
                        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
                        return -E2BIG;
                }
        }
#if IS_ENABLED(CONFIG_IPV6)
        else if (skb->protocol == htons(ETH_P_IPV6)) {
                struct rt6_info *rt6;
                __be32 daddr;

                rt6 = skb_valid_dst(skb) ? (struct rt6_info *)skb_dst(skb) :
                                           NULL;
                daddr = md ? dst : tunnel->parms.iph.daddr;

                if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
                           mtu >= IPV6_MIN_MTU) {
                        if ((daddr && !ipv4_is_multicast(daddr)) ||
                            rt6->rt6i_dst.plen == 128) {
                                rt6->rt6i_flags |= RTF_MODIFIED;
                                dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
                        }
                }

                if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
                                        mtu < pkt_size) {
                        icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
                        return -E2BIG;
                }
        }
#endif
        return 0;
}

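/* Transmit path for metadata-mode (collect_md) tunnels: every outer
 * header parameter is taken from the per-skb tunnel metadata instead of
 * the device's configured parms.  Encapsulation ops are not supported.
 */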
void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
                       u8 proto, int tunnel_hlen)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        u32 headroom = sizeof(struct iphdr);
        struct ip_tunnel_info *tun_info;
        const struct ip_tunnel_key *key;
        const struct iphdr *inner_iph;
        struct rtable *rt = NULL;
        struct flowi4 fl4;
        __be16 df = 0;
        u8 tos, ttl;
        bool use_cache;

        tun_info = skb_tunnel_info(skb);
        if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
                     ip_tunnel_info_af(tun_info) != AF_INET))
                goto tx_error;
        key = &tun_info->key;
        memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
        inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
        tos = key->tos;
        if (tos == 1) {
                if (skb->protocol == htons(ETH_P_IP))
                        tos = inner_iph->tos;
                else if (skb->protocol == htons(ETH_P_IPV6))
                        tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
        }
        ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src,
                            tunnel_id_to_key32(key->tun_id), RT_TOS(tos),
                            0, skb->mark, skb_get_hash(skb));
        if (tunnel->encap.type != TUNNEL_ENCAP_NONE)
                goto tx_error;

        use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
        if (use_cache)
                rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl4.saddr);
        if (!rt) {
                rt = ip_route_output_key(tunnel->net, &fl4);
                if (IS_ERR(rt)) {
                        dev->stats.tx_carrier_errors++;
                        goto tx_error;
                }
                if (use_cache)
                        dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
                                          fl4.saddr);
        }
        if (rt->dst.dev == dev) {
                ip_rt_put(rt);
                dev->stats.collisions++;
                goto tx_error;
        }

        if (key->tun_flags & TUNNEL_DONT_FRAGMENT)
                df = htons(IP_DF);
        if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, tunnel_hlen,
                            key->u.ipv4.dst, true)) {
                ip_rt_put(rt);
                goto tx_error;
        }

        tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
        ttl = key->ttl;
        if (ttl == 0) {
                if (skb->protocol == htons(ETH_P_IP))
                        ttl = inner_iph->ttl;
                else if (skb->protocol == htons(ETH_P_IPV6))
                        ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
                else
                        ttl = ip4_dst_hoplimit(&rt->dst);
        }

        headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
        if (headroom > READ_ONCE(dev->needed_headroom))
                WRITE_ONCE(dev->needed_headroom, headroom);

        if (skb_cow_head(skb, READ_ONCE(dev->needed_headroom))) {
                ip_rt_put(rt);
                goto tx_dropped;
        }
        iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl,
                      df, !net_eq(tunnel->net, dev_net(dev)));
        return;
tx_error:
        dev->stats.tx_errors++;
        goto kfree;
tx_dropped:
        dev->stats.tx_dropped++;
kfree:
        kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit);

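/* Transmit path for classical (configured) tunnels.  tnl_params is the
 * template for the outer IPv4 header; a zero daddr marks an NBMA tunnel
 * whose destination is resolved per packet from the inner headers or
 * from per-skb tunnel metadata.
 */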
void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
                    const struct iphdr *tnl_params, u8 protocol)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        struct ip_tunnel_info *tun_info = NULL;
        const struct iphdr *inner_iph;
        unsigned int max_headroom;      /* The extra header space needed */
        struct rtable *rt = NULL;               /* Route to the other host */
        bool use_cache = false;
        struct flowi4 fl4;
        bool md = false;
        bool connected;
        u8 tos, ttl;
        __be32 dst;
        __be16 df;

        inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
        connected = (tunnel->parms.iph.daddr != 0);

        memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));

        dst = tnl_params->daddr;
        if (dst == 0) {
                /* NBMA tunnel */

                if (!skb_dst(skb)) {
                        dev->stats.tx_fifo_errors++;
                        goto tx_error;
                }

                tun_info = skb_tunnel_info(skb);
                if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX) &&
                    ip_tunnel_info_af(tun_info) == AF_INET &&
                    tun_info->key.u.ipv4.dst) {
                        dst = tun_info->key.u.ipv4.dst;
                        md = true;
                        connected = true;
                }
                else if (skb->protocol == htons(ETH_P_IP)) {
                        rt = skb_rtable(skb);
                        dst = rt_nexthop(rt, inner_iph->daddr);
                }
#if IS_ENABLED(CONFIG_IPV6)
                else if (skb->protocol == htons(ETH_P_IPV6)) {
                        const struct in6_addr *addr6;
                        struct neighbour *neigh;
                        bool do_tx_error_icmp;
                        int addr_type;

                        neigh = dst_neigh_lookup(skb_dst(skb),
                                                 &ipv6_hdr(skb)->daddr);
                        if (!neigh)
                                goto tx_error;

                        addr6 = (const struct in6_addr *)&neigh->primary_key;
                        addr_type = ipv6_addr_type(addr6);

                        if (addr_type == IPV6_ADDR_ANY) {
                                addr6 = &ipv6_hdr(skb)->daddr;
                                addr_type = ipv6_addr_type(addr6);
                        }

                        if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
                                do_tx_error_icmp = true;
                        else {
                                do_tx_error_icmp = false;
                                dst = addr6->s6_addr32[3];
                        }
                        neigh_release(neigh);
                        if (do_tx_error_icmp)
                                goto tx_error_icmp;
                }
#endif
                else
                        goto tx_error;

                if (!md)
                        connected = false;
        }

        tos = tnl_params->tos;
        if (tos & 0x1) {
                tos &= ~0x1;
                if (skb->protocol == htons(ETH_P_IP)) {
                        tos = inner_iph->tos;
                        connected = false;
                } else if (skb->protocol == htons(ETH_P_IPV6)) {
                        tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
                        connected = false;
                }
        }

        ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr,
                            tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link,
                            tunnel->fwmark, skb_get_hash(skb));

        if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
                goto tx_error;

        if (connected && md) {
                use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
                if (use_cache)
                        rt = dst_cache_get_ip4(&tun_info->dst_cache,
                                               &fl4.saddr);
        } else {
                rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache,
                                                &fl4.saddr) : NULL;
        }

        if (!rt) {
                rt = ip_route_output_key(tunnel->net, &fl4);

                if (IS_ERR(rt)) {
                        dev->stats.tx_carrier_errors++;
                        goto tx_error;
                }
                if (use_cache)
                        dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
                                          fl4.saddr);
                else if (!md && connected)
                        dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
                                          fl4.saddr);
        }

        if (rt->dst.dev == dev) {
                ip_rt_put(rt);
                dev->stats.collisions++;
                goto tx_error;
        }

        df = tnl_params->frag_off;
        if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
                df |= (inner_iph->frag_off & htons(IP_DF));

        if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, 0, 0, false)) {
                ip_rt_put(rt);
                goto tx_error;
        }

        if (tunnel->err_count > 0) {
                if (time_before(jiffies,
                                tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
                        tunnel->err_count--;

                        dst_link_failure(skb);
                } else
                        tunnel->err_count = 0;
        }

        tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
        ttl = tnl_params->ttl;
        if (ttl == 0) {
                if (skb->protocol == htons(ETH_P_IP))
                        ttl = inner_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
                else if (skb->protocol == htons(ETH_P_IPV6))
                        ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
#endif
                else
                        ttl = ip4_dst_hoplimit(&rt->dst);
        }

        max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
                        + rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
        if (max_headroom > READ_ONCE(dev->needed_headroom))
                WRITE_ONCE(dev->needed_headroom, max_headroom);

        if (skb_cow_head(skb, READ_ONCE(dev->needed_headroom))) {
                ip_rt_put(rt);
                dev->stats.tx_dropped++;
                kfree_skb(skb);
                return;
        }

        iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
                      df, !net_eq(tunnel->net, dev_net(dev)));
        return;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
        dst_link_failure(skb);
#endif
tx_error:
        dev->stats.tx_errors++;
        kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_tunnel_xmit);

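/* Apply new parms to an existing tunnel.  The tunnel is removed from
 * and re-added to the hash table because changing addresses or keys may
 * move it to a different bucket.
 */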
static void ip_tunnel_update(struct ip_tunnel_net *itn,
                             struct ip_tunnel *t,
                             struct net_device *dev,
                             struct ip_tunnel_parm *p,
                             bool set_mtu,
                             __u32 fwmark)
{
        ip_tunnel_del(itn, t);
        t->parms.iph.saddr = p->iph.saddr;
        t->parms.iph.daddr = p->iph.daddr;
        t->parms.i_key = p->i_key;
        t->parms.o_key = p->o_key;
        if (dev->type != ARPHRD_ETHER) {
                memcpy(dev->dev_addr, &p->iph.saddr, 4);
                memcpy(dev->broadcast, &p->iph.daddr, 4);
        }
        ip_tunnel_add(itn, t);

        t->parms.iph.ttl = p->iph.ttl;
        t->parms.iph.tos = p->iph.tos;
        t->parms.iph.frag_off = p->iph.frag_off;

        if (t->parms.link != p->link || t->fwmark != fwmark) {
                int mtu;

                t->parms.link = p->link;
                t->fwmark = fwmark;
                mtu = ip_tunnel_bind_dev(dev);
                if (set_mtu)
                        dev->mtu = mtu;
        }
        dst_cache_reset(&t->dst_cache);
        netdev_state_change(dev);
}

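/* Handler for the legacy SIOC{GET,ADD,CHG,DEL}TUNNEL ioctls.  On the
 * fallback device, GET and DEL first look up the tunnel matching the
 * supplied parms instead of acting on the fallback device itself.
 */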
int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
{
        int err = 0;
        struct ip_tunnel *t = netdev_priv(dev);
        struct net *net = t->net;
        struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);

        switch (cmd) {
        case SIOCGETTUNNEL:
                if (dev == itn->fb_tunnel_dev) {
                        t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
                        if (!t)
                                t = netdev_priv(dev);
                }
                memcpy(p, &t->parms, sizeof(*p));
                break;

        case SIOCADDTUNNEL:
        case SIOCCHGTUNNEL:
                err = -EPERM;
                if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
                        goto done;
                if (p->iph.ttl)
                        p->iph.frag_off |= htons(IP_DF);
                if (!(p->i_flags & VTI_ISVTI)) {
                        if (!(p->i_flags & TUNNEL_KEY))
                                p->i_key = 0;
                        if (!(p->o_flags & TUNNEL_KEY))
                                p->o_key = 0;
                }

                t = ip_tunnel_find(itn, p, itn->type);

                if (cmd == SIOCADDTUNNEL) {
                        if (!t) {
                                t = ip_tunnel_create(net, itn, p);
                                err = PTR_ERR_OR_ZERO(t);
                                break;
                        }

                        err = -EEXIST;
                        break;
                }
                if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
                        if (t) {
                                if (t->dev != dev) {
                                        err = -EEXIST;
                                        break;
                                }
                        } else {
                                unsigned int nflags = 0;

                                if (ipv4_is_multicast(p->iph.daddr))
                                        nflags = IFF_BROADCAST;
                                else if (p->iph.daddr)
                                        nflags = IFF_POINTOPOINT;

                                if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
                                        err = -EINVAL;
                                        break;
                                }

                                t = netdev_priv(dev);
                        }
                }

                if (t) {
                        err = 0;
                        ip_tunnel_update(itn, t, dev, p, true, 0);
                } else {
                        err = -ENOENT;
                }
                break;

        case SIOCDELTUNNEL:
                err = -EPERM;
                if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
                        goto done;

                if (dev == itn->fb_tunnel_dev) {
                        err = -ENOENT;
                        t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
                        if (!t)
                                goto done;
                        err = -EPERM;
                        if (t == netdev_priv(itn->fb_tunnel_dev))
                                goto done;
                        dev = t->dev;
                }
                unregister_netdevice(dev);
                err = 0;
                break;

        default:
                err = -EINVAL;
        }

done:
        return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);

int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        int t_hlen = tunnel->hlen + sizeof(struct iphdr);
        int max_mtu = IP_MAX_MTU - t_hlen;

        if (dev->type == ARPHRD_ETHER)
                max_mtu -= dev->hard_header_len;

        if (new_mtu < ETH_MIN_MTU)
                return -EINVAL;

        if (new_mtu > max_mtu) {
                if (strict)
                        return -EINVAL;

                new_mtu = max_mtu;
        }

        dev->mtu = new_mtu;
        return 0;
}
EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);

int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
{
        return __ip_tunnel_change_mtu(dev, new_mtu, true);
}
EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);

static void ip_tunnel_dev_free(struct net_device *dev)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);

        gro_cells_destroy(&tunnel->gro_cells);
        dst_cache_destroy(&tunnel->dst_cache);
        free_percpu(dev->tstats);
}

void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        struct ip_tunnel_net *itn;

        itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);

        if (itn->fb_tunnel_dev != dev) {
                ip_tunnel_del(itn, netdev_priv(dev));
                unregister_netdevice_queue(dev, head);
        }
}
EXPORT_SYMBOL_GPL(ip_tunnel_dellink);

struct net *ip_tunnel_get_link_net(const struct net_device *dev)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);

        return tunnel->net;
}
EXPORT_SYMBOL(ip_tunnel_get_link_net);

int ip_tunnel_get_iflink(const struct net_device *dev)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);

        return tunnel->parms.link;
}
EXPORT_SYMBOL(ip_tunnel_get_iflink);

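/* Per-netns initialization: set up the hash table and, where a fallback
 * device is wanted (ops set and fallback tunnels enabled for this
 * netns), create it under RTNL.
 */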
int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
                                  struct rtnl_link_ops *ops, char *devname)
{
        struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
        struct ip_tunnel_parm parms;
        unsigned int i;

        itn->rtnl_link_ops = ops;
        for (i = 0; i < IP_TNL_HASH_SIZE; i++)
                INIT_HLIST_HEAD(&itn->tunnels[i]);

        if (!ops || !net_has_fallback_tunnels(net)) {
                struct ip_tunnel_net *it_init_net;

                it_init_net = net_generic(&init_net, ip_tnl_net_id);
                itn->type = it_init_net->type;
                itn->fb_tunnel_dev = NULL;
                return 0;
        }

        memset(&parms, 0, sizeof(parms));
        if (devname)
                strlcpy(parms.name, devname, IFNAMSIZ);

        rtnl_lock();
        itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
        /* FB netdevice is special: we have one, and only one per netns.
         * Allowing to move it to another netns is clearly unsafe.
         */
        if (!IS_ERR(itn->fb_tunnel_dev)) {
                itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
                itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
                ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
                itn->type = itn->fb_tunnel_dev->type;
        }
        rtnl_unlock();

        return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
}
EXPORT_SYMBOL_GPL(ip_tunnel_init_net);

static void ip_tunnel_destroy(struct net *net, struct ip_tunnel_net *itn,
                              struct list_head *head,
                              struct rtnl_link_ops *ops)
{
        struct net_device *dev, *aux;
        int h;

        for_each_netdev_safe(net, dev, aux)
                if (dev->rtnl_link_ops == ops)
                        unregister_netdevice_queue(dev, head);

        for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
                struct ip_tunnel *t;
                struct hlist_node *n;
                struct hlist_head *thead = &itn->tunnels[h];

                hlist_for_each_entry_safe(t, n, thead, hash_node)
                        /* If dev is in the same netns, it has already
                         * been added to the list by the previous loop.
                         */
                        if (!net_eq(dev_net(t->dev), net))
                                unregister_netdevice_queue(t->dev, head);
        }
}

void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id,
                           struct rtnl_link_ops *ops)
{
        struct ip_tunnel_net *itn;
        struct net *net;
        LIST_HEAD(list);

        rtnl_lock();
        list_for_each_entry(net, net_list, exit_list) {
                itn = net_generic(net, id);
                ip_tunnel_destroy(net, itn, &list, ops);
        }
        unregister_netdevice_many(&list);
        rtnl_unlock();
}
EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets);

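/* rtnl_link newlink helper: reject duplicates (only one collect_md
 * tunnel per netns, no two tunnels with the same parms), register the
 * device and clamp any requested MTU to what the headers leave room for.
 */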
int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
                      struct ip_tunnel_parm *p, __u32 fwmark)
{
        struct ip_tunnel *nt;
        struct net *net = dev_net(dev);
        struct ip_tunnel_net *itn;
        int mtu;
        int err;

        nt = netdev_priv(dev);
        itn = net_generic(net, nt->ip_tnl_net_id);

        if (nt->collect_md) {
                if (rtnl_dereference(itn->collect_md_tun))
                        return -EEXIST;
        } else {
                if (ip_tunnel_find(itn, p, dev->type))
                        return -EEXIST;
        }

        nt->net = net;
        nt->parms = *p;
        nt->fwmark = fwmark;
        err = register_netdevice(dev);
        if (err)
                goto err_register_netdevice;

        if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
                eth_hw_addr_random(dev);

        mtu = ip_tunnel_bind_dev(dev);
        if (tb[IFLA_MTU]) {
                unsigned int max = IP_MAX_MTU - (nt->hlen + sizeof(struct iphdr));

                if (dev->type == ARPHRD_ETHER)
                        max -= dev->hard_header_len;

                mtu = clamp(dev->mtu, (unsigned int)ETH_MIN_MTU, max);
        }

        err = dev_set_mtu(dev, mtu);
        if (err)
                goto err_dev_set_mtu;

        ip_tunnel_add(itn, nt);
        return 0;

err_dev_set_mtu:
        unregister_netdevice(dev);
err_register_netdevice:
        return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_newlink);

int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
                         struct ip_tunnel_parm *p, __u32 fwmark)
{
        struct ip_tunnel *t;
        struct ip_tunnel *tunnel = netdev_priv(dev);
        struct net *net = tunnel->net;
        struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);

        if (dev == itn->fb_tunnel_dev)
                return -EINVAL;

        t = ip_tunnel_find(itn, p, dev->type);

        if (t) {
                if (t->dev != dev)
                        return -EEXIST;
        } else {
                t = tunnel;

                if (dev->type != ARPHRD_ETHER) {
                        unsigned int nflags = 0;

                        if (ipv4_is_multicast(p->iph.daddr))
                                nflags = IFF_BROADCAST;
                        else if (p->iph.daddr)
                                nflags = IFF_POINTOPOINT;

                        if ((dev->flags ^ nflags) &
                            (IFF_POINTOPOINT | IFF_BROADCAST))
                                return -EINVAL;
                }
        }

        ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU], fwmark);
        return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_changelink);

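/* ndo_init helper: allocate the per-cpu stats, dst cache and GRO cells,
 * and seed the template outer IPv4 header.
 */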
int ip_tunnel_init(struct net_device *dev)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        struct iphdr *iph = &tunnel->parms.iph;
        int err;

        dev->needs_free_netdev = true;
        dev->priv_destructor = ip_tunnel_dev_free;
        dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
        if (!dev->tstats)
                return -ENOMEM;

        err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
        if (err) {
                free_percpu(dev->tstats);
                return err;
        }

        err = gro_cells_init(&tunnel->gro_cells, dev);
        if (err) {
                dst_cache_destroy(&tunnel->dst_cache);
                free_percpu(dev->tstats);
                return err;
        }

        tunnel->dev = dev;
        tunnel->net = dev_net(dev);
        strcpy(tunnel->parms.name, dev->name);
        iph->version            = 4;
        iph->ihl                = 5;

        if (tunnel->collect_md)
                netif_keep_dst(dev);
        return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_init);

void ip_tunnel_uninit(struct net_device *dev)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        struct net *net = tunnel->net;
        struct ip_tunnel_net *itn;

        itn = net_generic(net, tunnel->ip_tnl_net_id);
        ip_tunnel_del(itn, netdev_priv(dev));
        if (itn->fb_tunnel_dev == dev)
                WRITE_ONCE(itn->fb_tunnel_dev, NULL);

        dst_cache_reset(&tunnel->dst_cache);
}
EXPORT_SYMBOL_GPL(ip_tunnel_uninit);

/* Do the least required initialization; the rest is done in the tunnel_init call */
void ip_tunnel_setup(struct net_device *dev, unsigned int net_id)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        tunnel->ip_tnl_net_id = net_id;
}
EXPORT_SYMBOL_GPL(ip_tunnel_setup);

MODULE_LICENSE("GPL");