GNU Linux-libre 6.1.90-gnu
net/ipv4/ip_gre.c
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *      Linux NET3:     GRE over IP protocol decoder.
 *
 *      Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/if_vlan.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ip_tunnels.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/gre.h>
#include <net/dst_metadata.h>
#include <net/erspan.h>

/*
   Problems & solutions
   --------------------

   1. The most important issue is detecting local dead loops.
   They would cause a complete host lockup in transmit, which
   would be "resolved" by stack overflow or, if queueing is enabled,
   by infinite looping in net_bh.

   We cannot track such dead loops during route installation;
   it is an infeasible task. The most general solution would be
   to keep an skb->encapsulation counter (a sort of local ttl)
   and silently drop the packet when it expires. It is a good
   solution, but it requires maintaining a new variable in ALL
   skbs, even if no tunneling is used.

   Current solution: xmit_recursion breaks dead loops. This is a percpu
   counter, since when we enter the first ndo_xmit(), cpu migration is
   forbidden. We force an exit if this counter reaches RECURSION_LIMIT
   (see the illustrative sketch after this comment).

   2. Networking dead loops would not kill routers, but would really
   kill the network. The IP hop limit plays the role of "t->recursion"
   in this case, if we copy it from the packet being encapsulated to
   the upper header. It is a very good solution, but it introduces two
   problems:

   - Routing protocols using packets with ttl=1 (OSPF, RIP2)
     do not work over tunnels.
   - traceroute does not work. I planned to relay ICMP from the tunnel,
     so that this problem would be solved and traceroute output
     would be even more informative. This idea appeared to be wrong:
     only Linux complies with rfc1812 now (yes, guys, Linux is the only
     true router now :-)); all routers (at least, in the neighbourhood
     of mine) return only 8 bytes of payload. It is the end.

   Hence, if we want OSPF to work or traceroute to say something
   reasonable, we should search for another solution.

   One of them is to parse the packet, trying to detect an inner
   encapsulation made by our node. It is difficult or even impossible,
   especially taking fragmentation into account. To be short, ttl is
   not a solution at all.

   Current solution: The solution was UNEXPECTEDLY SIMPLE.
   We force the DF flag on tunnels with a preconfigured hop limit,
   that is ALL. :-) Well, it does not remove the problem completely,
   but exponential growth of network traffic is changed to linear
   (branches that exceed the pmtu are pruned) and the tunnel mtu
   rapidly degrades to a value <68, where looping stops.
   Yes, it is not good if there exists a router in the loop
   which does not force DF, even when the packets being encapsulated
   have DF set. But it is not our problem! Nobody could accuse us; we
   did all that we could. Even if it was your gated that injected the
   fatal route into the network, even if it was you who configured the
   fatal static route: you are innocent. :-)

   Alexey Kuznetsov.
 */
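
/* Editor's note: a minimal illustrative sketch, not part of this file.
 * The xmit_recursion guard described above lives in the core netdev
 * code; the helper names and the limit value below are simplified
 * assumptions, shown only to make the per-CPU counter idea concrete:
 *
 *      static DEFINE_PER_CPU(int, xmit_recursion);
 *      #define RECURSION_LIMIT 8
 *
 *      static netdev_tx_t guarded_start_xmit(struct sk_buff *skb,
 *                                            struct net_device *dev)
 *      {
 *              netdev_tx_t ret;
 *
 *              if (unlikely(__this_cpu_read(xmit_recursion) >=
 *                           RECURSION_LIMIT)) {
 *                      kfree_skb(skb);  <- dead loop detected: drop
 *                      return NETDEV_TX_OK;
 *              }
 *              __this_cpu_inc(xmit_recursion);
 *              ret = dev->netdev_ops->ndo_start_xmit(skb, dev);
 *              __this_cpu_dec(xmit_recursion);
 *              return ret;
 *      }
 *
 * Because the counter is per-CPU and migration is forbidden across
 * ndo_start_xmit(), nested tunnel transmissions on the same CPU are
 * counted and cut off once they reach the limit.
 */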

static bool log_ecn_error = true;
module_param(log_ecn_error, bool, 0644);
MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");

static struct rtnl_link_ops ipgre_link_ops __read_mostly;
static const struct header_ops ipgre_header_ops;

static int ipgre_tunnel_init(struct net_device *dev);
static void erspan_build_header(struct sk_buff *skb,
                                u32 id, u32 index,
                                bool truncate, bool is_ipv4);

static unsigned int ipgre_net_id __read_mostly;
static unsigned int gre_tap_net_id __read_mostly;
static unsigned int erspan_net_id __read_mostly;

static int ipgre_err(struct sk_buff *skb, u32 info,
                     const struct tnl_ptk_info *tpi)
{

        /* All the routers (except for Linux) return only
           8 bytes of packet payload. It means that precise relaying of
           ICMP in the real Internet is absolutely infeasible.

           Moreover, Cisco "wise men" put the GRE key in the third word
           of the GRE header. It makes it impossible to maintain even
           soft state for keyed GRE tunnels with checksum enabled. Tell
           them "thank you".

           Well, I wonder, rfc1812 was written by a Cisco employee;
           why the hell do these idiots break standards established
           by themselves???
         */
        struct net *net = dev_net(skb->dev);
        struct ip_tunnel_net *itn;
        const struct iphdr *iph;
        const int type = icmp_hdr(skb)->type;
        const int code = icmp_hdr(skb)->code;
        unsigned int data_len = 0;
        struct ip_tunnel *t;

        if (tpi->proto == htons(ETH_P_TEB))
                itn = net_generic(net, gre_tap_net_id);
        else if (tpi->proto == htons(ETH_P_ERSPAN) ||
                 tpi->proto == htons(ETH_P_ERSPAN2))
                itn = net_generic(net, erspan_net_id);
        else
                itn = net_generic(net, ipgre_net_id);

        iph = (const struct iphdr *)(icmp_hdr(skb) + 1);
        t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
                             iph->daddr, iph->saddr, tpi->key);

        if (!t)
                return -ENOENT;

        switch (type) {
        default:
        case ICMP_PARAMETERPROB:
                return 0;

        case ICMP_DEST_UNREACH:
                switch (code) {
                case ICMP_SR_FAILED:
                case ICMP_PORT_UNREACH:
                        /* Impossible event. */
                        return 0;
                default:
                        /* All others are translated to HOST_UNREACH.
                           rfc2003 contains "deep thoughts" about NET_UNREACH,
                           I believe they are just ether pollution. --ANK
                         */
                        break;
                }
                break;

        case ICMP_TIME_EXCEEDED:
                if (code != ICMP_EXC_TTL)
                        return 0;
                data_len = icmp_hdr(skb)->un.reserved[1] * 4; /* RFC 4884 4.1 */
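                /* The RFC 4884 length field counts the original datagram
                 * in 32-bit words, hence the multiplication by 4 above.
                 */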
                break;

        case ICMP_REDIRECT:
                break;
        }

#if IS_ENABLED(CONFIG_IPV6)
        if (tpi->proto == htons(ETH_P_IPV6) &&
            !ip6_err_gen_icmpv6_unreach(skb, iph->ihl * 4 + tpi->hdr_len,
                                        type, data_len))
                return 0;
#endif

        if (t->parms.iph.daddr == 0 ||
            ipv4_is_multicast(t->parms.iph.daddr))
                return 0;

        if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
                return 0;

        if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
                t->err_count++;
        else
                t->err_count = 1;
        t->err_time = jiffies;

        return 0;
}

static void gre_err(struct sk_buff *skb, u32 info)
{
        /* All the routers (except for Linux) return only
         * 8 bytes of packet payload. It means that precise relaying of
         * ICMP in the real Internet is absolutely infeasible.
         *
         * Moreover, Cisco "wise men" put the GRE key in the third word
         * of the GRE header. It makes it impossible to maintain even
         * soft state for keyed GRE tunnels with checksum enabled. Tell
         * them "thank you".
         *
         * Well, I wonder, rfc1812 was written by a Cisco employee;
         * why the hell do these idiots break standards established
         * by themselves???
         */

        const struct iphdr *iph = (struct iphdr *)skb->data;
        const int type = icmp_hdr(skb)->type;
        const int code = icmp_hdr(skb)->code;
        struct tnl_ptk_info tpi;

        if (gre_parse_header(skb, &tpi, NULL, htons(ETH_P_IP),
                             iph->ihl * 4) < 0)
                return;

        if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
                ipv4_update_pmtu(skb, dev_net(skb->dev), info,
                                 skb->dev->ifindex, IPPROTO_GRE);
                return;
        }
        if (type == ICMP_REDIRECT) {
                ipv4_redirect(skb, dev_net(skb->dev), skb->dev->ifindex,
                              IPPROTO_GRE);
                return;
        }

        ipgre_err(skb, info, &tpi);
}

static bool is_erspan_type1(int gre_hdr_len)
{
        /* Both ERSPAN type I (version 0) and type II (version 1) use
         * protocol 0x88BE, but type I has only a 4-byte GRE header,
         * while type II has an 8-byte one.
         */
        return gre_hdr_len == 4;
}
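
/* Editor's note, for reference: the base GRE header (struct
 * gre_base_hdr in include/net/gre.h) is just
 *
 *      struct gre_base_hdr {
 *              __be16 flags;
 *              __be16 protocol;
 *      };
 *
 * i.e. 4 bytes on the wire. ERSPAN type I sends this bare header,
 * while type II sets the sequence-present bit, so a 4-byte sequence
 * number follows and gre_hdr_len becomes 8, which is what the helper
 * above distinguishes.
 */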

static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi,
                      int gre_hdr_len)
{
        struct net *net = dev_net(skb->dev);
        struct metadata_dst *tun_dst = NULL;
        struct erspan_base_hdr *ershdr;
        struct ip_tunnel_net *itn;
        struct ip_tunnel *tunnel;
        const struct iphdr *iph;
        struct erspan_md2 *md2;
        int ver;
        int len;

        itn = net_generic(net, erspan_net_id);
        iph = ip_hdr(skb);
        if (is_erspan_type1(gre_hdr_len)) {
                ver = 0;
                tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex,
                                          tpi->flags | TUNNEL_NO_KEY,
                                          iph->saddr, iph->daddr, 0);
        } else {
                if (unlikely(!pskb_may_pull(skb,
                                            gre_hdr_len + sizeof(*ershdr))))
                        return PACKET_REJECT;

                ershdr = (struct erspan_base_hdr *)(skb->data + gre_hdr_len);
                ver = ershdr->ver;
                iph = ip_hdr(skb);
                tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex,
                                          tpi->flags | TUNNEL_KEY,
                                          iph->saddr, iph->daddr, tpi->key);
        }

        if (tunnel) {
                if (is_erspan_type1(gre_hdr_len))
                        len = gre_hdr_len;
                else
                        len = gre_hdr_len + erspan_hdr_len(ver);

                if (unlikely(!pskb_may_pull(skb, len)))
                        return PACKET_REJECT;

                if (__iptunnel_pull_header(skb,
                                           len,
                                           htons(ETH_P_TEB),
                                           false, false) < 0)
                        goto drop;

                if (tunnel->collect_md) {
                        struct erspan_metadata *pkt_md, *md;
                        struct ip_tunnel_info *info;
                        unsigned char *gh;
                        __be64 tun_id;
                        __be16 flags;

                        tpi->flags |= TUNNEL_KEY;
                        flags = tpi->flags;
                        tun_id = key32_to_tunnel_id(tpi->key);

                        tun_dst = ip_tun_rx_dst(skb, flags,
                                                tun_id, sizeof(*md));
                        if (!tun_dst)
                                return PACKET_REJECT;

                        /* skb can be uncloned in __iptunnel_pull_header, so
                         * the old pkt_md is no longer valid and we need to
                         * recompute it
                         */
                        gh = skb_network_header(skb) +
                             skb_network_header_len(skb);
                        pkt_md = (struct erspan_metadata *)(gh + gre_hdr_len +
                                                            sizeof(*ershdr));
                        md = ip_tunnel_info_opts(&tun_dst->u.tun_info);
                        md->version = ver;
                        md2 = &md->u.md2;
                        memcpy(md2, pkt_md, ver == 1 ? ERSPAN_V1_MDSIZE :
                                                       ERSPAN_V2_MDSIZE);

                        info = &tun_dst->u.tun_info;
                        info->key.tun_flags |= TUNNEL_ERSPAN_OPT;
                        info->options_len = sizeof(*md);
                }

                skb_reset_mac_header(skb);
                ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error);
                return PACKET_RCVD;
        }
        return PACKET_REJECT;

drop:
        kfree_skb(skb);
        return PACKET_RCVD;
}

static int __ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
                       struct ip_tunnel_net *itn, int hdr_len, bool raw_proto)
{
        struct metadata_dst *tun_dst = NULL;
        const struct iphdr *iph;
        struct ip_tunnel *tunnel;

        iph = ip_hdr(skb);
        tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
                                  iph->saddr, iph->daddr, tpi->key);

        if (tunnel) {
                const struct iphdr *tnl_params;

                if (__iptunnel_pull_header(skb, hdr_len, tpi->proto,
                                           raw_proto, false) < 0)
                        goto drop;

                /* Special case for ipgre_header_parse(), which expects the
                 * mac_header to point to the outer IP header.
                 */
                if (tunnel->dev->header_ops == &ipgre_header_ops)
                        skb_pop_mac_header(skb);
                else
                        skb_reset_mac_header(skb);

                tnl_params = &tunnel->parms.iph;
                if (tunnel->collect_md || tnl_params->daddr == 0) {
                        __be16 flags;
                        __be64 tun_id;

                        flags = tpi->flags & (TUNNEL_CSUM | TUNNEL_KEY);
                        tun_id = key32_to_tunnel_id(tpi->key);
                        tun_dst = ip_tun_rx_dst(skb, flags, tun_id, 0);
                        if (!tun_dst)
                                return PACKET_REJECT;
                }

                ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error);
                return PACKET_RCVD;
        }
        return PACKET_NEXT;

drop:
        kfree_skb(skb);
        return PACKET_RCVD;
}

static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
                     int hdr_len)
{
        struct net *net = dev_net(skb->dev);
        struct ip_tunnel_net *itn;
        int res;

        if (tpi->proto == htons(ETH_P_TEB))
                itn = net_generic(net, gre_tap_net_id);
        else
                itn = net_generic(net, ipgre_net_id);

        res = __ipgre_rcv(skb, tpi, itn, hdr_len, false);
        if (res == PACKET_NEXT && tpi->proto == htons(ETH_P_TEB)) {
                /* ipgre tunnels in collect metadata mode should also
                 * receive ETH_P_TEB traffic.
                 */
                itn = net_generic(net, ipgre_net_id);
                res = __ipgre_rcv(skb, tpi, itn, hdr_len, true);
        }
        return res;
}

static int gre_rcv(struct sk_buff *skb)
{
        struct tnl_ptk_info tpi;
        bool csum_err = false;
        int hdr_len;

#ifdef CONFIG_NET_IPGRE_BROADCAST
        if (ipv4_is_multicast(ip_hdr(skb)->daddr)) {
                /* Looped back packet, drop it! */
                if (rt_is_output_route(skb_rtable(skb)))
                        goto drop;
        }
#endif

        hdr_len = gre_parse_header(skb, &tpi, &csum_err, htons(ETH_P_IP), 0);
        if (hdr_len < 0)
                goto drop;

        if (unlikely(tpi.proto == htons(ETH_P_ERSPAN) ||
                     tpi.proto == htons(ETH_P_ERSPAN2))) {
                if (erspan_rcv(skb, &tpi, hdr_len) == PACKET_RCVD)
                        return 0;
                goto out;
        }

        if (ipgre_rcv(skb, &tpi, hdr_len) == PACKET_RCVD)
                return 0;

out:
        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
drop:
        kfree_skb(skb);
        return 0;
}

static void __gre_xmit(struct sk_buff *skb, struct net_device *dev,
                       const struct iphdr *tnl_params,
                       __be16 proto)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        __be16 flags = tunnel->parms.o_flags;

        /* Push GRE header. */
        gre_build_header(skb, tunnel->tun_hlen,
                         flags, proto, tunnel->parms.o_key,
                         (flags & TUNNEL_SEQ) ? htonl(atomic_fetch_inc(&tunnel->o_seqno)) : 0);

        ip_tunnel_xmit(skb, dev, tnl_params, tnl_params->protocol);
}

static int gre_handle_offloads(struct sk_buff *skb, bool csum)
{
        return iptunnel_handle_offloads(skb, csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE);
}

static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev,
                        __be16 proto)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        struct ip_tunnel_info *tun_info;
        const struct ip_tunnel_key *key;
        int tunnel_hlen;
        __be16 flags;

        tun_info = skb_tunnel_info(skb);
        if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
                     ip_tunnel_info_af(tun_info) != AF_INET))
                goto err_free_skb;

        key = &tun_info->key;
        tunnel_hlen = gre_calc_hlen(key->tun_flags);

        if (skb_cow_head(skb, dev->needed_headroom))
                goto err_free_skb;

        /* Push Tunnel header. */
        if (gre_handle_offloads(skb, !!(tun_info->key.tun_flags & TUNNEL_CSUM)))
                goto err_free_skb;

        flags = tun_info->key.tun_flags &
                (TUNNEL_CSUM | TUNNEL_KEY | TUNNEL_SEQ);
        gre_build_header(skb, tunnel_hlen, flags, proto,
                         tunnel_id_to_key32(tun_info->key.tun_id),
                         (flags & TUNNEL_SEQ) ? htonl(atomic_fetch_inc(&tunnel->o_seqno)) : 0);

        ip_md_tunnel_xmit(skb, dev, IPPROTO_GRE, tunnel_hlen);

        return;

err_free_skb:
        kfree_skb(skb);
        dev->stats.tx_dropped++;
}

static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        struct ip_tunnel_info *tun_info;
        const struct ip_tunnel_key *key;
        struct erspan_metadata *md;
        bool truncate = false;
        __be16 proto;
        int tunnel_hlen;
        int version;
        int nhoff;

        tun_info = skb_tunnel_info(skb);
        if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
                     ip_tunnel_info_af(tun_info) != AF_INET))
                goto err_free_skb;

        key = &tun_info->key;
        if (!(tun_info->key.tun_flags & TUNNEL_ERSPAN_OPT))
                goto err_free_skb;
        if (tun_info->options_len < sizeof(*md))
                goto err_free_skb;
        md = ip_tunnel_info_opts(tun_info);

        /* ERSPAN has a fixed 8-byte GRE header */
        version = md->version;
        tunnel_hlen = 8 + erspan_hdr_len(version);

        if (skb_cow_head(skb, dev->needed_headroom))
                goto err_free_skb;

        if (gre_handle_offloads(skb, false))
                goto err_free_skb;

        if (skb->len > dev->mtu + dev->hard_header_len) {
                pskb_trim(skb, dev->mtu + dev->hard_header_len);
                truncate = true;
        }

        nhoff = skb_network_offset(skb);
        if (skb->protocol == htons(ETH_P_IP) &&
            (ntohs(ip_hdr(skb)->tot_len) > skb->len - nhoff))
                truncate = true;

        if (skb->protocol == htons(ETH_P_IPV6)) {
                int thoff;

                if (skb_transport_header_was_set(skb))
                        thoff = skb_transport_offset(skb);
                else
                        thoff = nhoff + sizeof(struct ipv6hdr);
                if (ntohs(ipv6_hdr(skb)->payload_len) > skb->len - thoff)
                        truncate = true;
        }

        if (version == 1) {
                erspan_build_header(skb, ntohl(tunnel_id_to_key32(key->tun_id)),
                                    ntohl(md->u.index), truncate, true);
                proto = htons(ETH_P_ERSPAN);
        } else if (version == 2) {
                erspan_build_header_v2(skb,
                                       ntohl(tunnel_id_to_key32(key->tun_id)),
                                       md->u.md2.dir,
                                       get_hwid(&md->u.md2),
                                       truncate, true);
                proto = htons(ETH_P_ERSPAN2);
        } else {
                goto err_free_skb;
        }

        gre_build_header(skb, 8, TUNNEL_SEQ,
                         proto, 0, htonl(atomic_fetch_inc(&tunnel->o_seqno)));

        ip_md_tunnel_xmit(skb, dev, IPPROTO_GRE, tunnel_hlen);

        return;

err_free_skb:
        kfree_skb(skb);
        dev->stats.tx_dropped++;
}

static int gre_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
{
        struct ip_tunnel_info *info = skb_tunnel_info(skb);
        const struct ip_tunnel_key *key;
        struct rtable *rt;
        struct flowi4 fl4;

        if (ip_tunnel_info_af(info) != AF_INET)
                return -EINVAL;

        key = &info->key;
        ip_tunnel_init_flow(&fl4, IPPROTO_GRE, key->u.ipv4.dst, key->u.ipv4.src,
                            tunnel_id_to_key32(key->tun_id),
                            key->tos & ~INET_ECN_MASK, dev_net(dev), 0,
                            skb->mark, skb_get_hash(skb), key->flow_flags);
        rt = ip_route_output_key(dev_net(dev), &fl4);
        if (IS_ERR(rt))
                return PTR_ERR(rt);

        ip_rt_put(rt);
        info->key.u.ipv4.src = fl4.saddr;
        return 0;
}

static netdev_tx_t ipgre_xmit(struct sk_buff *skb,
                              struct net_device *dev)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        const struct iphdr *tnl_params;

        if (!pskb_inet_may_pull(skb))
                goto free_skb;

        if (tunnel->collect_md) {
                gre_fb_xmit(skb, dev, skb->protocol);
                return NETDEV_TX_OK;
        }

        if (dev->header_ops) {
                int pull_len = tunnel->hlen + sizeof(struct iphdr);

                if (skb_cow_head(skb, 0))
                        goto free_skb;

                tnl_params = (const struct iphdr *)skb->data;

                if (!pskb_network_may_pull(skb, pull_len))
                        goto free_skb;

                /* ip_tunnel_xmit() needs skb->data pointing to gre header. */
                skb_pull(skb, pull_len);
                skb_reset_mac_header(skb);

                if (skb->ip_summed == CHECKSUM_PARTIAL &&
                    skb_checksum_start(skb) < skb->data)
                        goto free_skb;
        } else {
                if (skb_cow_head(skb, dev->needed_headroom))
                        goto free_skb;

                tnl_params = &tunnel->parms.iph;
        }

        if (gre_handle_offloads(skb, !!(tunnel->parms.o_flags & TUNNEL_CSUM)))
                goto free_skb;

        __gre_xmit(skb, dev, tnl_params, skb->protocol);
        return NETDEV_TX_OK;

free_skb:
        kfree_skb(skb);
        dev->stats.tx_dropped++;
        return NETDEV_TX_OK;
}

static netdev_tx_t erspan_xmit(struct sk_buff *skb,
                               struct net_device *dev)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        bool truncate = false;
        __be16 proto;

        if (!pskb_inet_may_pull(skb))
                goto free_skb;

        if (tunnel->collect_md) {
                erspan_fb_xmit(skb, dev);
                return NETDEV_TX_OK;
        }

        if (gre_handle_offloads(skb, false))
                goto free_skb;

        if (skb_cow_head(skb, dev->needed_headroom))
                goto free_skb;

        if (skb->len > dev->mtu + dev->hard_header_len) {
                pskb_trim(skb, dev->mtu + dev->hard_header_len);
                truncate = true;
        }

        /* Push ERSPAN header */
        if (tunnel->erspan_ver == 0) {
                proto = htons(ETH_P_ERSPAN);
                tunnel->parms.o_flags &= ~TUNNEL_SEQ;
        } else if (tunnel->erspan_ver == 1) {
                erspan_build_header(skb, ntohl(tunnel->parms.o_key),
                                    tunnel->index,
                                    truncate, true);
                proto = htons(ETH_P_ERSPAN);
        } else if (tunnel->erspan_ver == 2) {
                erspan_build_header_v2(skb, ntohl(tunnel->parms.o_key),
                                       tunnel->dir, tunnel->hwid,
                                       truncate, true);
                proto = htons(ETH_P_ERSPAN2);
        } else {
                goto free_skb;
        }

        tunnel->parms.o_flags &= ~TUNNEL_KEY;
        __gre_xmit(skb, dev, &tunnel->parms.iph, proto);
        return NETDEV_TX_OK;

free_skb:
        kfree_skb(skb);
        dev->stats.tx_dropped++;
        return NETDEV_TX_OK;
}

static netdev_tx_t gre_tap_xmit(struct sk_buff *skb,
                                struct net_device *dev)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);

        if (!pskb_inet_may_pull(skb))
                goto free_skb;

        if (tunnel->collect_md) {
                gre_fb_xmit(skb, dev, htons(ETH_P_TEB));
                return NETDEV_TX_OK;
        }

        if (gre_handle_offloads(skb, !!(tunnel->parms.o_flags & TUNNEL_CSUM)))
                goto free_skb;

        if (skb_cow_head(skb, dev->needed_headroom))
                goto free_skb;

        __gre_xmit(skb, dev, &tunnel->parms.iph, htons(ETH_P_TEB));
        return NETDEV_TX_OK;

free_skb:
        kfree_skb(skb);
        dev->stats.tx_dropped++;
        return NETDEV_TX_OK;
}

static void ipgre_link_update(struct net_device *dev, bool set_mtu)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        __be16 flags;
        int len;

        len = tunnel->tun_hlen;
        tunnel->tun_hlen = gre_calc_hlen(tunnel->parms.o_flags);
        len = tunnel->tun_hlen - len;
        tunnel->hlen = tunnel->hlen + len;

        if (dev->header_ops)
                dev->hard_header_len += len;
        else
                dev->needed_headroom += len;

        if (set_mtu)
                dev->mtu = max_t(int, dev->mtu - len, 68);

        flags = tunnel->parms.o_flags;

        if (flags & TUNNEL_SEQ ||
            (flags & TUNNEL_CSUM && tunnel->encap.type != TUNNEL_ENCAP_NONE)) {
                dev->features &= ~NETIF_F_GSO_SOFTWARE;
                dev->hw_features &= ~NETIF_F_GSO_SOFTWARE;
        } else {
                dev->features |= NETIF_F_GSO_SOFTWARE;
                dev->hw_features |= NETIF_F_GSO_SOFTWARE;
        }
}
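
/* Editor's note, a worked example of the bookkeeping above (assuming
 * gre_calc_hlen() charges 4 bytes for the base header plus 4 bytes per
 * enabled option, as in include/net/gre.h): enabling TUNNEL_KEY on a
 * previously plain tunnel grows tun_hlen from 4 to 8, so len == 4,
 * hlen and the headroom grow by 4 bytes, and with set_mtu the device
 * mtu shrinks by 4 (but never below 68, the minimum IPv4 MTU).
 */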

static int ipgre_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p,
                            int cmd)
{
        int err;

        if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) {
                if (p->iph.version != 4 || p->iph.protocol != IPPROTO_GRE ||
                    p->iph.ihl != 5 || (p->iph.frag_off & htons(~IP_DF)) ||
                    ((p->i_flags | p->o_flags) & (GRE_VERSION | GRE_ROUTING)))
                        return -EINVAL;
        }

        p->i_flags = gre_flags_to_tnl_flags(p->i_flags);
        p->o_flags = gre_flags_to_tnl_flags(p->o_flags);

        err = ip_tunnel_ctl(dev, p, cmd);
        if (err)
                return err;

        if (cmd == SIOCCHGTUNNEL) {
                struct ip_tunnel *t = netdev_priv(dev);

                t->parms.i_flags = p->i_flags;
                t->parms.o_flags = p->o_flags;

                if (strcmp(dev->rtnl_link_ops->kind, "erspan"))
                        ipgre_link_update(dev, true);
        }

        p->i_flags = gre_tnl_flags_to_gre_flags(p->i_flags);
        p->o_flags = gre_tnl_flags_to_gre_flags(p->o_flags);
        return 0;
}

/* Nice toy. Unfortunately, useless in real life :-)
   It allows one to construct a virtual multiprotocol broadcast "LAN"
   over the Internet, provided multicast routing is tuned.

   I have no idea whether this bicycle was invented before me,
   so I had to set ARPHRD_IPGRE to a random value.
   I have an impression that Cisco could have made something similar,
   but this feature is apparently missing in IOS<=11.2(8).

   I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
   with broadcast 224.66.66.66. If you have access to mbone, play with me :-)

   ping -t 255 224.66.66.66

   If nobody answers, mbone does not work.

   ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
   ip addr add 10.66.66.<somewhat>/24 dev Universe
   ifconfig Universe up
   ifconfig Universe add fe80::<Your_real_addr>/10
   ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
   ftp 10.66.66.66
   ...
   ftp fec0:6666:6666::193.233.7.65
   ...
 */
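
/* Editor's note: the ifconfig lines above are the historical recipe;
 * rough modern iproute2 equivalents (untested, shown only for
 * illustration) would be:
 *
 *      ip link set Universe up
 *      ip addr add fe80::<Your_real_addr>/10 dev Universe
 *      ip addr add fec0:6666:6666::<Your_real_addr>/96 dev Universe
 */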
852 static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
853                         unsigned short type,
854                         const void *daddr, const void *saddr, unsigned int len)
855 {
856         struct ip_tunnel *t = netdev_priv(dev);
857         struct iphdr *iph;
858         struct gre_base_hdr *greh;
859
860         iph = skb_push(skb, t->hlen + sizeof(*iph));
861         greh = (struct gre_base_hdr *)(iph+1);
862         greh->flags = gre_tnl_flags_to_gre_flags(t->parms.o_flags);
863         greh->protocol = htons(type);
864
865         memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
866
867         /* Set the source hardware address. */
868         if (saddr)
869                 memcpy(&iph->saddr, saddr, 4);
870         if (daddr)
871                 memcpy(&iph->daddr, daddr, 4);
872         if (iph->daddr)
873                 return t->hlen + sizeof(*iph);
874
875         return -(t->hlen + sizeof(*iph));
876 }
877
878 static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
879 {
880         const struct iphdr *iph = (const struct iphdr *) skb_mac_header(skb);
881         memcpy(haddr, &iph->saddr, 4);
882         return 4;
883 }
884
885 static const struct header_ops ipgre_header_ops = {
886         .create = ipgre_header,
887         .parse  = ipgre_header_parse,
888 };
889
890 #ifdef CONFIG_NET_IPGRE_BROADCAST
891 static int ipgre_open(struct net_device *dev)
892 {
893         struct ip_tunnel *t = netdev_priv(dev);
894
895         if (ipv4_is_multicast(t->parms.iph.daddr)) {
896                 struct flowi4 fl4;
897                 struct rtable *rt;
898
899                 rt = ip_route_output_gre(t->net, &fl4,
900                                          t->parms.iph.daddr,
901                                          t->parms.iph.saddr,
902                                          t->parms.o_key,
903                                          RT_TOS(t->parms.iph.tos),
904                                          t->parms.link);
905                 if (IS_ERR(rt))
906                         return -EADDRNOTAVAIL;
907                 dev = rt->dst.dev;
908                 ip_rt_put(rt);
909                 if (!__in_dev_get_rtnl(dev))
910                         return -EADDRNOTAVAIL;
911                 t->mlink = dev->ifindex;
912                 ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
913         }
914         return 0;
915 }
916
917 static int ipgre_close(struct net_device *dev)
918 {
919         struct ip_tunnel *t = netdev_priv(dev);
920
921         if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
922                 struct in_device *in_dev;
923                 in_dev = inetdev_by_index(t->net, t->mlink);
924                 if (in_dev)
925                         ip_mc_dec_group(in_dev, t->parms.iph.daddr);
926         }
927         return 0;
928 }
929 #endif
930
931 static const struct net_device_ops ipgre_netdev_ops = {
932         .ndo_init               = ipgre_tunnel_init,
933         .ndo_uninit             = ip_tunnel_uninit,
934 #ifdef CONFIG_NET_IPGRE_BROADCAST
935         .ndo_open               = ipgre_open,
936         .ndo_stop               = ipgre_close,
937 #endif
938         .ndo_start_xmit         = ipgre_xmit,
939         .ndo_siocdevprivate     = ip_tunnel_siocdevprivate,
940         .ndo_change_mtu         = ip_tunnel_change_mtu,
941         .ndo_get_stats64        = dev_get_tstats64,
942         .ndo_get_iflink         = ip_tunnel_get_iflink,
943         .ndo_tunnel_ctl         = ipgre_tunnel_ctl,
944 };
945
946 #define GRE_FEATURES (NETIF_F_SG |              \
947                       NETIF_F_FRAGLIST |        \
948                       NETIF_F_HIGHDMA |         \
949                       NETIF_F_HW_CSUM)
950
951 static void ipgre_tunnel_setup(struct net_device *dev)
952 {
953         dev->netdev_ops         = &ipgre_netdev_ops;
954         dev->type               = ARPHRD_IPGRE;
955         ip_tunnel_setup(dev, ipgre_net_id);
956 }
957
958 static void __gre_tunnel_init(struct net_device *dev)
959 {
960         struct ip_tunnel *tunnel;
961         __be16 flags;
962
963         tunnel = netdev_priv(dev);
964         tunnel->tun_hlen = gre_calc_hlen(tunnel->parms.o_flags);
965         tunnel->parms.iph.protocol = IPPROTO_GRE;
966
967         tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen;
968         dev->needed_headroom = tunnel->hlen + sizeof(tunnel->parms.iph);
969
970         dev->features           |= GRE_FEATURES | NETIF_F_LLTX;
971         dev->hw_features        |= GRE_FEATURES;
972
973         flags = tunnel->parms.o_flags;
974
975         /* TCP offload with GRE SEQ is not supported, nor can we support 2
976          * levels of outer headers requiring an update.
977          */
978         if (flags & TUNNEL_SEQ)
979                 return;
980         if (flags & TUNNEL_CSUM && tunnel->encap.type != TUNNEL_ENCAP_NONE)
981                 return;
982
983         dev->features |= NETIF_F_GSO_SOFTWARE;
984         dev->hw_features |= NETIF_F_GSO_SOFTWARE;
985 }
986
987 static int ipgre_tunnel_init(struct net_device *dev)
988 {
989         struct ip_tunnel *tunnel = netdev_priv(dev);
990         struct iphdr *iph = &tunnel->parms.iph;
991
992         __gre_tunnel_init(dev);
993
994         __dev_addr_set(dev, &iph->saddr, 4);
995         memcpy(dev->broadcast, &iph->daddr, 4);
996
997         dev->flags              = IFF_NOARP;
998         netif_keep_dst(dev);
999         dev->addr_len           = 4;
1000
1001         if (iph->daddr && !tunnel->collect_md) {
1002 #ifdef CONFIG_NET_IPGRE_BROADCAST
1003                 if (ipv4_is_multicast(iph->daddr)) {
1004                         if (!iph->saddr)
1005                                 return -EINVAL;
1006                         dev->flags = IFF_BROADCAST;
1007                         dev->header_ops = &ipgre_header_ops;
1008                         dev->hard_header_len = tunnel->hlen + sizeof(*iph);
1009                         dev->needed_headroom = 0;
1010                 }
1011 #endif
1012         } else if (!tunnel->collect_md) {
1013                 dev->header_ops = &ipgre_header_ops;
1014                 dev->hard_header_len = tunnel->hlen + sizeof(*iph);
1015                 dev->needed_headroom = 0;
1016         }
1017
1018         return ip_tunnel_init(dev);
1019 }
1020
1021 static const struct gre_protocol ipgre_protocol = {
1022         .handler     = gre_rcv,
1023         .err_handler = gre_err,
1024 };
1025
1026 static int __net_init ipgre_init_net(struct net *net)
1027 {
1028         return ip_tunnel_init_net(net, ipgre_net_id, &ipgre_link_ops, NULL);
1029 }
1030
1031 static void __net_exit ipgre_exit_batch_net(struct list_head *list_net)
1032 {
1033         ip_tunnel_delete_nets(list_net, ipgre_net_id, &ipgre_link_ops);
1034 }
1035
1036 static struct pernet_operations ipgre_net_ops = {
1037         .init = ipgre_init_net,
1038         .exit_batch = ipgre_exit_batch_net,
1039         .id   = &ipgre_net_id,
1040         .size = sizeof(struct ip_tunnel_net),
1041 };
1042
1043 static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[],
1044                                  struct netlink_ext_ack *extack)
1045 {
1046         __be16 flags;
1047
1048         if (!data)
1049                 return 0;
1050
1051         flags = 0;
1052         if (data[IFLA_GRE_IFLAGS])
1053                 flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
1054         if (data[IFLA_GRE_OFLAGS])
1055                 flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
1056         if (flags & (GRE_VERSION|GRE_ROUTING))
1057                 return -EINVAL;
1058
1059         if (data[IFLA_GRE_COLLECT_METADATA] &&
1060             data[IFLA_GRE_ENCAP_TYPE] &&
1061             nla_get_u16(data[IFLA_GRE_ENCAP_TYPE]) != TUNNEL_ENCAP_NONE)
1062                 return -EINVAL;
1063
1064         return 0;
1065 }
1066
1067 static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[],
1068                               struct netlink_ext_ack *extack)
1069 {
1070         __be32 daddr;
1071
1072         if (tb[IFLA_ADDRESS]) {
1073                 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
1074                         return -EINVAL;
1075                 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
1076                         return -EADDRNOTAVAIL;
1077         }
1078
1079         if (!data)
1080                 goto out;
1081
1082         if (data[IFLA_GRE_REMOTE]) {
1083                 memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
1084                 if (!daddr)
1085                         return -EINVAL;
1086         }
1087
1088 out:
1089         return ipgre_tunnel_validate(tb, data, extack);
1090 }
1091
1092 static int erspan_validate(struct nlattr *tb[], struct nlattr *data[],
1093                            struct netlink_ext_ack *extack)
1094 {
1095         __be16 flags = 0;
1096         int ret;
1097
1098         if (!data)
1099                 return 0;
1100
1101         ret = ipgre_tap_validate(tb, data, extack);
1102         if (ret)
1103                 return ret;
1104
1105         if (data[IFLA_GRE_ERSPAN_VER] &&
1106             nla_get_u8(data[IFLA_GRE_ERSPAN_VER]) == 0)
1107                 return 0;
1108
1109         /* ERSPAN type II/III should only have GRE sequence and key flag */
1110         if (data[IFLA_GRE_OFLAGS])
1111                 flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
1112         if (data[IFLA_GRE_IFLAGS])
1113                 flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
1114         if (!data[IFLA_GRE_COLLECT_METADATA] &&
1115             flags != (GRE_SEQ | GRE_KEY))
1116                 return -EINVAL;
1117
1118         /* ERSPAN Session ID only has 10-bit. Since we reuse
1119          * 32-bit key field as ID, check it's range.
1120          */
1121         if (data[IFLA_GRE_IKEY] &&
1122             (ntohl(nla_get_be32(data[IFLA_GRE_IKEY])) & ~ID_MASK))
1123                 return -EINVAL;
1124
1125         if (data[IFLA_GRE_OKEY] &&
1126             (ntohl(nla_get_be32(data[IFLA_GRE_OKEY])) & ~ID_MASK))
1127                 return -EINVAL;
1128
1129         return 0;
1130 }
1131
1132 static int ipgre_netlink_parms(struct net_device *dev,
1133                                 struct nlattr *data[],
1134                                 struct nlattr *tb[],
1135                                 struct ip_tunnel_parm *parms,
1136                                 __u32 *fwmark)
1137 {
1138         struct ip_tunnel *t = netdev_priv(dev);
1139
1140         memset(parms, 0, sizeof(*parms));
1141
1142         parms->iph.protocol = IPPROTO_GRE;
1143
1144         if (!data)
1145                 return 0;
1146
1147         if (data[IFLA_GRE_LINK])
1148                 parms->link = nla_get_u32(data[IFLA_GRE_LINK]);
1149
1150         if (data[IFLA_GRE_IFLAGS])
1151                 parms->i_flags = gre_flags_to_tnl_flags(nla_get_be16(data[IFLA_GRE_IFLAGS]));
1152
1153         if (data[IFLA_GRE_OFLAGS])
1154                 parms->o_flags = gre_flags_to_tnl_flags(nla_get_be16(data[IFLA_GRE_OFLAGS]));
1155
1156         if (data[IFLA_GRE_IKEY])
1157                 parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);
1158
1159         if (data[IFLA_GRE_OKEY])
1160                 parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);
1161
1162         if (data[IFLA_GRE_LOCAL])
1163                 parms->iph.saddr = nla_get_in_addr(data[IFLA_GRE_LOCAL]);
1164
1165         if (data[IFLA_GRE_REMOTE])
1166                 parms->iph.daddr = nla_get_in_addr(data[IFLA_GRE_REMOTE]);
1167
1168         if (data[IFLA_GRE_TTL])
1169                 parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);
1170
1171         if (data[IFLA_GRE_TOS])
1172                 parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);
1173
1174         if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC])) {
1175                 if (t->ignore_df)
1176                         return -EINVAL;
1177                 parms->iph.frag_off = htons(IP_DF);
1178         }
1179
1180         if (data[IFLA_GRE_COLLECT_METADATA]) {
1181                 t->collect_md = true;
1182                 if (dev->type == ARPHRD_IPGRE)
1183                         dev->type = ARPHRD_NONE;
1184         }
1185
1186         if (data[IFLA_GRE_IGNORE_DF]) {
1187                 if (nla_get_u8(data[IFLA_GRE_IGNORE_DF])
1188                   && (parms->iph.frag_off & htons(IP_DF)))
1189                         return -EINVAL;
1190                 t->ignore_df = !!nla_get_u8(data[IFLA_GRE_IGNORE_DF]);
1191         }
1192
1193         if (data[IFLA_GRE_FWMARK])
1194                 *fwmark = nla_get_u32(data[IFLA_GRE_FWMARK]);
1195
1196         return 0;
1197 }
1198
1199 static int erspan_netlink_parms(struct net_device *dev,
1200                                 struct nlattr *data[],
1201                                 struct nlattr *tb[],
1202                                 struct ip_tunnel_parm *parms,
1203                                 __u32 *fwmark)
1204 {
1205         struct ip_tunnel *t = netdev_priv(dev);
1206         int err;
1207
1208         err = ipgre_netlink_parms(dev, data, tb, parms, fwmark);
1209         if (err)
1210                 return err;
1211         if (!data)
1212                 return 0;
1213
1214         if (data[IFLA_GRE_ERSPAN_VER]) {
1215                 t->erspan_ver = nla_get_u8(data[IFLA_GRE_ERSPAN_VER]);
1216
1217                 if (t->erspan_ver > 2)
1218                         return -EINVAL;
1219         }
1220
1221         if (t->erspan_ver == 1) {
1222                 if (data[IFLA_GRE_ERSPAN_INDEX]) {
1223                         t->index = nla_get_u32(data[IFLA_GRE_ERSPAN_INDEX]);
1224                         if (t->index & ~INDEX_MASK)
1225                                 return -EINVAL;
1226                 }
1227         } else if (t->erspan_ver == 2) {
1228                 if (data[IFLA_GRE_ERSPAN_DIR]) {
1229                         t->dir = nla_get_u8(data[IFLA_GRE_ERSPAN_DIR]);
1230                         if (t->dir & ~(DIR_MASK >> DIR_OFFSET))
1231                                 return -EINVAL;
1232                 }
1233                 if (data[IFLA_GRE_ERSPAN_HWID]) {
1234                         t->hwid = nla_get_u16(data[IFLA_GRE_ERSPAN_HWID]);
1235                         if (t->hwid & ~(HWID_MASK >> HWID_OFFSET))
1236                                 return -EINVAL;
1237                 }
1238         }
1239
1240         return 0;
1241 }
1242
1243 /* This function returns true when ENCAP attributes are present in the nl msg */
1244 static bool ipgre_netlink_encap_parms(struct nlattr *data[],
1245                                       struct ip_tunnel_encap *ipencap)
1246 {
1247         bool ret = false;
1248
1249         memset(ipencap, 0, sizeof(*ipencap));
1250
1251         if (!data)
1252                 return ret;
1253
1254         if (data[IFLA_GRE_ENCAP_TYPE]) {
1255                 ret = true;
1256                 ipencap->type = nla_get_u16(data[IFLA_GRE_ENCAP_TYPE]);
1257         }
1258
1259         if (data[IFLA_GRE_ENCAP_FLAGS]) {
1260                 ret = true;
1261                 ipencap->flags = nla_get_u16(data[IFLA_GRE_ENCAP_FLAGS]);
1262         }
1263
1264         if (data[IFLA_GRE_ENCAP_SPORT]) {
1265                 ret = true;
1266                 ipencap->sport = nla_get_be16(data[IFLA_GRE_ENCAP_SPORT]);
1267         }
1268
1269         if (data[IFLA_GRE_ENCAP_DPORT]) {
1270                 ret = true;
1271                 ipencap->dport = nla_get_be16(data[IFLA_GRE_ENCAP_DPORT]);
1272         }
1273
1274         return ret;
1275 }
1276
1277 static int gre_tap_init(struct net_device *dev)
1278 {
1279         __gre_tunnel_init(dev);
1280         dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
1281         netif_keep_dst(dev);
1282
1283         return ip_tunnel_init(dev);
1284 }
1285
1286 static const struct net_device_ops gre_tap_netdev_ops = {
1287         .ndo_init               = gre_tap_init,
1288         .ndo_uninit             = ip_tunnel_uninit,
1289         .ndo_start_xmit         = gre_tap_xmit,
1290         .ndo_set_mac_address    = eth_mac_addr,
1291         .ndo_validate_addr      = eth_validate_addr,
1292         .ndo_change_mtu         = ip_tunnel_change_mtu,
1293         .ndo_get_stats64        = dev_get_tstats64,
1294         .ndo_get_iflink         = ip_tunnel_get_iflink,
1295         .ndo_fill_metadata_dst  = gre_fill_metadata_dst,
1296 };
1297
1298 static int erspan_tunnel_init(struct net_device *dev)
1299 {
1300         struct ip_tunnel *tunnel = netdev_priv(dev);
1301
1302         if (tunnel->erspan_ver == 0)
1303                 tunnel->tun_hlen = 4; /* 4-byte GRE hdr. */
1304         else
1305                 tunnel->tun_hlen = 8; /* 8-byte GRE hdr. */
1306
1307         tunnel->parms.iph.protocol = IPPROTO_GRE;
1308         tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen +
1309                        erspan_hdr_len(tunnel->erspan_ver);
1310
1311         dev->features           |= GRE_FEATURES;
1312         dev->hw_features        |= GRE_FEATURES;
1313         dev->priv_flags         |= IFF_LIVE_ADDR_CHANGE;
1314         netif_keep_dst(dev);
1315
1316         return ip_tunnel_init(dev);
1317 }
1318
1319 static const struct net_device_ops erspan_netdev_ops = {
1320         .ndo_init               = erspan_tunnel_init,
1321         .ndo_uninit             = ip_tunnel_uninit,
1322         .ndo_start_xmit         = erspan_xmit,
1323         .ndo_set_mac_address    = eth_mac_addr,
1324         .ndo_validate_addr      = eth_validate_addr,
1325         .ndo_change_mtu         = ip_tunnel_change_mtu,
1326         .ndo_get_stats64        = dev_get_tstats64,
1327         .ndo_get_iflink         = ip_tunnel_get_iflink,
1328         .ndo_fill_metadata_dst  = gre_fill_metadata_dst,
1329 };
1330
1331 static void ipgre_tap_setup(struct net_device *dev)
1332 {
1333         ether_setup(dev);
1334         dev->max_mtu = 0;
1335         dev->netdev_ops = &gre_tap_netdev_ops;
1336         dev->priv_flags &= ~IFF_TX_SKB_SHARING;
1337         dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
1338         ip_tunnel_setup(dev, gre_tap_net_id);
1339 }
1340
1341 static int
1342 ipgre_newlink_encap_setup(struct net_device *dev, struct nlattr *data[])
1343 {
1344         struct ip_tunnel_encap ipencap;
1345
1346         if (ipgre_netlink_encap_parms(data, &ipencap)) {
1347                 struct ip_tunnel *t = netdev_priv(dev);
1348                 int err = ip_tunnel_encap_setup(t, &ipencap);
1349
1350                 if (err < 0)
1351                         return err;
1352         }
1353
1354         return 0;
1355 }
1356
1357 static int ipgre_newlink(struct net *src_net, struct net_device *dev,
1358                          struct nlattr *tb[], struct nlattr *data[],
1359                          struct netlink_ext_ack *extack)
1360 {
1361         struct ip_tunnel_parm p;
1362         __u32 fwmark = 0;
1363         int err;
1364
1365         err = ipgre_newlink_encap_setup(dev, data);
1366         if (err)
1367                 return err;
1368
1369         err = ipgre_netlink_parms(dev, data, tb, &p, &fwmark);
1370         if (err < 0)
1371                 return err;
1372         return ip_tunnel_newlink(dev, tb, &p, fwmark);
1373 }
1374
1375 static int erspan_newlink(struct net *src_net, struct net_device *dev,
1376                           struct nlattr *tb[], struct nlattr *data[],
1377                           struct netlink_ext_ack *extack)
1378 {
1379         struct ip_tunnel_parm p;
1380         __u32 fwmark = 0;
1381         int err;
1382
1383         err = ipgre_newlink_encap_setup(dev, data);
1384         if (err)
1385                 return err;
1386
1387         err = erspan_netlink_parms(dev, data, tb, &p, &fwmark);
1388         if (err)
1389                 return err;
1390         return ip_tunnel_newlink(dev, tb, &p, fwmark);
1391 }
1392
1393 static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
1394                             struct nlattr *data[],
1395                             struct netlink_ext_ack *extack)
1396 {
1397         struct ip_tunnel *t = netdev_priv(dev);
1398         __u32 fwmark = t->fwmark;
1399         struct ip_tunnel_parm p;
1400         int err;
1401
1402         err = ipgre_newlink_encap_setup(dev, data);
1403         if (err)
1404                 return err;
1405
1406         err = ipgre_netlink_parms(dev, data, tb, &p, &fwmark);
1407         if (err < 0)
1408                 return err;
1409
1410         err = ip_tunnel_changelink(dev, tb, &p, fwmark);
1411         if (err < 0)
1412                 return err;
1413
1414         t->parms.i_flags = p.i_flags;
1415         t->parms.o_flags = p.o_flags;
1416
1417         ipgre_link_update(dev, !tb[IFLA_MTU]);
1418
1419         return 0;
1420 }
1421
1422 static int erspan_changelink(struct net_device *dev, struct nlattr *tb[],
1423                              struct nlattr *data[],
1424                              struct netlink_ext_ack *extack)
1425 {
1426         struct ip_tunnel *t = netdev_priv(dev);
1427         __u32 fwmark = t->fwmark;
1428         struct ip_tunnel_parm p;
1429         int err;
1430
1431         err = ipgre_newlink_encap_setup(dev, data);
1432         if (err)
1433                 return err;
1434
1435         err = erspan_netlink_parms(dev, data, tb, &p, &fwmark);
1436         if (err < 0)
1437                 return err;
1438
1439         err = ip_tunnel_changelink(dev, tb, &p, fwmark);
1440         if (err < 0)
1441                 return err;
1442
1443         t->parms.i_flags = p.i_flags;
1444         t->parms.o_flags = p.o_flags;
1445
1446         return 0;
1447 }
1448
1449 static size_t ipgre_get_size(const struct net_device *dev)
1450 {
1451         return
1452                 /* IFLA_GRE_LINK */
1453                 nla_total_size(4) +
1454                 /* IFLA_GRE_IFLAGS */
1455                 nla_total_size(2) +
1456                 /* IFLA_GRE_OFLAGS */
1457                 nla_total_size(2) +
1458                 /* IFLA_GRE_IKEY */
1459                 nla_total_size(4) +
1460                 /* IFLA_GRE_OKEY */
1461                 nla_total_size(4) +
1462                 /* IFLA_GRE_LOCAL */
1463                 nla_total_size(4) +
1464                 /* IFLA_GRE_REMOTE */
1465                 nla_total_size(4) +
1466                 /* IFLA_GRE_TTL */
1467                 nla_total_size(1) +
1468                 /* IFLA_GRE_TOS */
1469                 nla_total_size(1) +
1470                 /* IFLA_GRE_PMTUDISC */
1471                 nla_total_size(1) +
1472                 /* IFLA_GRE_ENCAP_TYPE */
1473                 nla_total_size(2) +
1474                 /* IFLA_GRE_ENCAP_FLAGS */
1475                 nla_total_size(2) +
1476                 /* IFLA_GRE_ENCAP_SPORT */
1477                 nla_total_size(2) +
1478                 /* IFLA_GRE_ENCAP_DPORT */
1479                 nla_total_size(2) +
1480                 /* IFLA_GRE_COLLECT_METADATA */
1481                 nla_total_size(0) +
1482                 /* IFLA_GRE_IGNORE_DF */
1483                 nla_total_size(1) +
1484                 /* IFLA_GRE_FWMARK */
1485                 nla_total_size(4) +
1486                 /* IFLA_GRE_ERSPAN_INDEX */
1487                 nla_total_size(4) +
1488                 /* IFLA_GRE_ERSPAN_VER */
1489                 nla_total_size(1) +
1490                 /* IFLA_GRE_ERSPAN_DIR */
1491                 nla_total_size(1) +
1492                 /* IFLA_GRE_ERSPAN_HWID */
1493                 nla_total_size(2) +
1494                 0;
1495 }
1496
static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
        struct ip_tunnel *t = netdev_priv(dev);
        struct ip_tunnel_parm *p = &t->parms;
        __be16 o_flags = p->o_flags;

        if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) ||
            nla_put_be16(skb, IFLA_GRE_IFLAGS,
                         gre_tnl_flags_to_gre_flags(p->i_flags)) ||
            nla_put_be16(skb, IFLA_GRE_OFLAGS,
                         gre_tnl_flags_to_gre_flags(o_flags)) ||
            nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) ||
            nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) ||
            nla_put_in_addr(skb, IFLA_GRE_LOCAL, p->iph.saddr) ||
            nla_put_in_addr(skb, IFLA_GRE_REMOTE, p->iph.daddr) ||
            nla_put_u8(skb, IFLA_GRE_TTL, p->iph.ttl) ||
            nla_put_u8(skb, IFLA_GRE_TOS, p->iph.tos) ||
            nla_put_u8(skb, IFLA_GRE_PMTUDISC,
                       !!(p->iph.frag_off & htons(IP_DF))) ||
            nla_put_u32(skb, IFLA_GRE_FWMARK, t->fwmark))
                goto nla_put_failure;

        if (nla_put_u16(skb, IFLA_GRE_ENCAP_TYPE,
                        t->encap.type) ||
            nla_put_be16(skb, IFLA_GRE_ENCAP_SPORT,
                         t->encap.sport) ||
            nla_put_be16(skb, IFLA_GRE_ENCAP_DPORT,
                         t->encap.dport) ||
            nla_put_u16(skb, IFLA_GRE_ENCAP_FLAGS,
                        t->encap.flags))
                goto nla_put_failure;

        if (nla_put_u8(skb, IFLA_GRE_IGNORE_DF, t->ignore_df))
                goto nla_put_failure;

        if (t->collect_md) {
                if (nla_put_flag(skb, IFLA_GRE_COLLECT_METADATA))
                        goto nla_put_failure;
        }

        return 0;

nla_put_failure:
        return -EMSGSIZE;
}

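/* ERSPAN dump: emit the version-specific attributes (v1 carries a
 * session index, v2 a direction and hardware ID), then fall through to
 * the common GRE dump.  For a fixed-endpoint v1/v2 tunnel, TUNNEL_KEY is
 * forced on since the ERSPAN session ID is carried in the GRE key.
 */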
static int erspan_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
        struct ip_tunnel *t = netdev_priv(dev);

        if (t->erspan_ver <= 2) {
                if (t->erspan_ver != 0 && !t->collect_md)
                        t->parms.o_flags |= TUNNEL_KEY;

                if (nla_put_u8(skb, IFLA_GRE_ERSPAN_VER, t->erspan_ver))
                        goto nla_put_failure;

                if (t->erspan_ver == 1) {
                        if (nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, t->index))
                                goto nla_put_failure;
                } else if (t->erspan_ver == 2) {
                        if (nla_put_u8(skb, IFLA_GRE_ERSPAN_DIR, t->dir))
                                goto nla_put_failure;
                        if (nla_put_u16(skb, IFLA_GRE_ERSPAN_HWID, t->hwid))
                                goto nla_put_failure;
                }
        }

        return ipgre_fill_info(skb, dev);

nla_put_failure:
        return -EMSGSIZE;
}

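/* Device setup for "erspan": an Ethernet-like device, since ERSPAN
 * encapsulates mirrored Ethernet frames.  max_mtu = 0 leaves the upper
 * MTU bound unenforced by the core, and version 1 is the default
 * protocol version until netlink attributes say otherwise.
 */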
static void erspan_setup(struct net_device *dev)
{
        struct ip_tunnel *t = netdev_priv(dev);

        ether_setup(dev);
        dev->max_mtu = 0;
        dev->netdev_ops = &erspan_netdev_ops;
        dev->priv_flags &= ~IFF_TX_SKB_SHARING;
        dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
        ip_tunnel_setup(dev, erspan_net_id);
        t->erspan_ver = 1;
}

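/* Attribute policy shared by all three link types below.  rtnetlink
 * validates attribute types and lengths against this table before any
 * of the .validate/.newlink/.changelink callbacks see the attributes.
 */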
static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
        [IFLA_GRE_LINK]         = { .type = NLA_U32 },
        [IFLA_GRE_IFLAGS]       = { .type = NLA_U16 },
        [IFLA_GRE_OFLAGS]       = { .type = NLA_U16 },
        [IFLA_GRE_IKEY]         = { .type = NLA_U32 },
        [IFLA_GRE_OKEY]         = { .type = NLA_U32 },
        [IFLA_GRE_LOCAL]        = { .len = sizeof_field(struct iphdr, saddr) },
        [IFLA_GRE_REMOTE]       = { .len = sizeof_field(struct iphdr, daddr) },
        [IFLA_GRE_TTL]          = { .type = NLA_U8 },
        [IFLA_GRE_TOS]          = { .type = NLA_U8 },
        [IFLA_GRE_PMTUDISC]     = { .type = NLA_U8 },
        [IFLA_GRE_ENCAP_TYPE]   = { .type = NLA_U16 },
        [IFLA_GRE_ENCAP_FLAGS]  = { .type = NLA_U16 },
        [IFLA_GRE_ENCAP_SPORT]  = { .type = NLA_U16 },
        [IFLA_GRE_ENCAP_DPORT]  = { .type = NLA_U16 },
        [IFLA_GRE_COLLECT_METADATA]     = { .type = NLA_FLAG },
        [IFLA_GRE_IGNORE_DF]    = { .type = NLA_U8 },
        [IFLA_GRE_FWMARK]       = { .type = NLA_U32 },
        [IFLA_GRE_ERSPAN_INDEX] = { .type = NLA_U32 },
        [IFLA_GRE_ERSPAN_VER]   = { .type = NLA_U8 },
        [IFLA_GRE_ERSPAN_DIR]   = { .type = NLA_U8 },
        [IFLA_GRE_ERSPAN_HWID]  = { .type = NLA_U16 },
};

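/* The rtnl_link_ops below are what "ip link" and other netlink clients
 * drive.  Illustrative iproute2 invocations (device names and addresses
 * are examples only, not taken from this file):
 *
 *   ip link add gre1 type gre local 192.0.2.1 remote 198.51.100.2 ttl 64
 *   ip link add gretap1 type gretap external
 *   ip link add erspan1 type erspan seq key 10 erspan_ver 1 erspan 1 \
 *           local 192.0.2.1 remote 198.51.100.2
 */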
static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
        .kind           = "gre",
        .maxtype        = IFLA_GRE_MAX,
        .policy         = ipgre_policy,
        .priv_size      = sizeof(struct ip_tunnel),
        .setup          = ipgre_tunnel_setup,
        .validate       = ipgre_tunnel_validate,
        .newlink        = ipgre_newlink,
        .changelink     = ipgre_changelink,
        .dellink        = ip_tunnel_dellink,
        .get_size       = ipgre_get_size,
        .fill_info      = ipgre_fill_info,
        .get_link_net   = ip_tunnel_get_link_net,
};

static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
        .kind           = "gretap",
        .maxtype        = IFLA_GRE_MAX,
        .policy         = ipgre_policy,
        .priv_size      = sizeof(struct ip_tunnel),
        .setup          = ipgre_tap_setup,
        .validate       = ipgre_tap_validate,
        .newlink        = ipgre_newlink,
        .changelink     = ipgre_changelink,
        .dellink        = ip_tunnel_dellink,
        .get_size       = ipgre_get_size,
        .fill_info      = ipgre_fill_info,
        .get_link_net   = ip_tunnel_get_link_net,
};

static struct rtnl_link_ops erspan_link_ops __read_mostly = {
        .kind           = "erspan",
        .maxtype        = IFLA_GRE_MAX,
        .policy         = ipgre_policy,
        .priv_size      = sizeof(struct ip_tunnel),
        .setup          = erspan_setup,
        .validate       = erspan_validate,
        .newlink        = erspan_newlink,
        .changelink     = erspan_changelink,
        .dellink        = ip_tunnel_dellink,
        .get_size       = ipgre_get_size,
        .fill_info      = erspan_fill_info,
        .get_link_net   = ip_tunnel_get_link_net,
};

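/* Create a flow-based (collect_md) gretap device from within the kernel.
 * Callers such as openvswitch use this to get one metadata-mode device
 * instead of a netdev per tunnel endpoint.  A failure before the device
 * is registered only needs free_netdev(); later failures unwind through
 * ip_tunnel_dellink().
 */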
struct net_device *gretap_fb_dev_create(struct net *net, const char *name,
                                        u8 name_assign_type)
{
        struct nlattr *tb[IFLA_MAX + 1];
        struct net_device *dev;
        LIST_HEAD(list_kill);
        struct ip_tunnel *t;
        int err;

        memset(&tb, 0, sizeof(tb));

        dev = rtnl_create_link(net, name, name_assign_type,
                               &ipgre_tap_ops, tb, NULL);
        if (IS_ERR(dev))
                return dev;

        /* Configure flow based GRE device. */
        t = netdev_priv(dev);
        t->collect_md = true;

        err = ipgre_newlink(net, dev, tb, NULL, NULL);
        if (err < 0) {
                free_netdev(dev);
                return ERR_PTR(err);
        }

        /* openvswitch users expect packet sizes to be unrestricted,
         * so set the largest MTU we can.
         */
        err = __ip_tunnel_change_mtu(dev, IP_MAX_MTU, false);
        if (err)
                goto out;

        err = rtnl_configure_link(dev, NULL);
        if (err < 0)
                goto out;

        return dev;
out:
        ip_tunnel_dellink(dev, &list_kill);
        unregister_netdevice_many(&list_kill);
        return ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(gretap_fb_dev_create);

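/* Per-network-namespace state for the gretap and erspan types.  Each
 * .init creates the namespace's fallback device ("gretap0" / "erspan0");
 * .exit_batch destroys all tunnels of that type in the dying namespaces
 * under a single rtnl_lock/unregister_netdevice_many() pass.
 */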
static int __net_init ipgre_tap_init_net(struct net *net)
{
        return ip_tunnel_init_net(net, gre_tap_net_id, &ipgre_tap_ops, "gretap0");
}

static void __net_exit ipgre_tap_exit_batch_net(struct list_head *list_net)
{
        ip_tunnel_delete_nets(list_net, gre_tap_net_id, &ipgre_tap_ops);
}

static struct pernet_operations ipgre_tap_net_ops = {
        .init = ipgre_tap_init_net,
        .exit_batch = ipgre_tap_exit_batch_net,
        .id   = &gre_tap_net_id,
        .size = sizeof(struct ip_tunnel_net),
};

static int __net_init erspan_init_net(struct net *net)
{
        return ip_tunnel_init_net(net, erspan_net_id,
                                  &erspan_link_ops, "erspan0");
}

static void __net_exit erspan_exit_batch_net(struct list_head *net_list)
{
        ip_tunnel_delete_nets(net_list, erspan_net_id, &erspan_link_ops);
}

static struct pernet_operations erspan_net_ops = {
        .init = erspan_init_net,
        .exit_batch = erspan_exit_batch_net,
        .id   = &erspan_net_id,
        .size = sizeof(struct ip_tunnel_net),
};

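/* Module init: register the three per-netns subsystems, hook GREPROTO_CISCO
 * in the GRE protocol demultiplexer, then expose the three link types over
 * rtnetlink.  The error labels unwind in exactly the reverse order of
 * registration.
 */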
static int __init ipgre_init(void)
{
        int err;

        pr_info("GRE over IPv4 tunneling driver\n");

        err = register_pernet_device(&ipgre_net_ops);
        if (err < 0)
                return err;

        err = register_pernet_device(&ipgre_tap_net_ops);
        if (err < 0)
                goto pnet_tap_failed;

        err = register_pernet_device(&erspan_net_ops);
        if (err < 0)
                goto pnet_erspan_failed;

        err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
        if (err < 0) {
                pr_info("%s: can't add protocol\n", __func__);
                goto add_proto_failed;
        }

        err = rtnl_link_register(&ipgre_link_ops);
        if (err < 0)
                goto rtnl_link_failed;

        err = rtnl_link_register(&ipgre_tap_ops);
        if (err < 0)
                goto tap_ops_failed;

        err = rtnl_link_register(&erspan_link_ops);
        if (err < 0)
                goto erspan_link_failed;

        return 0;

erspan_link_failed:
        rtnl_link_unregister(&ipgre_tap_ops);
tap_ops_failed:
        rtnl_link_unregister(&ipgre_link_ops);
rtnl_link_failed:
        gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
add_proto_failed:
        unregister_pernet_device(&erspan_net_ops);
pnet_erspan_failed:
        unregister_pernet_device(&ipgre_tap_net_ops);
pnet_tap_failed:
        unregister_pernet_device(&ipgre_net_ops);
        return err;
}

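/* Module exit: unregister the rtnetlink ops first so no new devices can
 * appear, detach from the GRE demultiplexer, then release the per-netns
 * state, which destroys any remaining tunnel devices.
 */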
static void __exit ipgre_fini(void)
{
        rtnl_link_unregister(&ipgre_tap_ops);
        rtnl_link_unregister(&ipgre_link_ops);
        rtnl_link_unregister(&erspan_link_ops);
        gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
        unregister_pernet_device(&ipgre_tap_net_ops);
        unregister_pernet_device(&ipgre_net_ops);
        unregister_pernet_device(&erspan_net_ops);
}

module_init(ipgre_init);
module_exit(ipgre_fini);
MODULE_LICENSE("GPL");
MODULE_ALIAS_RTNL_LINK("gre");
MODULE_ALIAS_RTNL_LINK("gretap");
MODULE_ALIAS_RTNL_LINK("erspan");
MODULE_ALIAS_NETDEV("gre0");
MODULE_ALIAS_NETDEV("gretap0");
MODULE_ALIAS_NETDEV("erspan0");