1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  *      Linux NET3:     GRE over IP protocol decoder.
4  *
5  *      Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
6  */
7
8 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
9
10 #include <linux/capability.h>
11 #include <linux/module.h>
12 #include <linux/types.h>
13 #include <linux/kernel.h>
14 #include <linux/slab.h>
15 #include <linux/uaccess.h>
16 #include <linux/skbuff.h>
17 #include <linux/netdevice.h>
18 #include <linux/in.h>
19 #include <linux/tcp.h>
20 #include <linux/udp.h>
21 #include <linux/if_arp.h>
22 #include <linux/if_vlan.h>
23 #include <linux/init.h>
24 #include <linux/in6.h>
25 #include <linux/inetdevice.h>
26 #include <linux/igmp.h>
27 #include <linux/netfilter_ipv4.h>
28 #include <linux/etherdevice.h>
29 #include <linux/if_ether.h>
30
31 #include <net/sock.h>
32 #include <net/ip.h>
33 #include <net/icmp.h>
34 #include <net/protocol.h>
35 #include <net/ip_tunnels.h>
36 #include <net/arp.h>
37 #include <net/checksum.h>
38 #include <net/dsfield.h>
39 #include <net/inet_ecn.h>
40 #include <net/xfrm.h>
41 #include <net/net_namespace.h>
42 #include <net/netns/generic.h>
43 #include <net/rtnetlink.h>
44 #include <net/gre.h>
45 #include <net/dst_metadata.h>
46 #include <net/erspan.h>
47
48 /*
49    Problems & solutions
50    --------------------
51
52    1. The most important issue is detecting local dead loops.
53    They would cause complete host lockup in transmit, which
54    would be "resolved" by stack overflow or, if queueing is enabled,
55    by infinite looping in net_bh.
56
57    We cannot track such dead loops during route installation;
58    it is an infeasible task. The most general solution would be
59    to keep an skb->encapsulation counter (a sort of local ttl)
60    and silently drop the packet when it expires. It is a good
61    solution, but it would require maintaining a new variable in ALL
62    skbs, even if no tunneling is used.
63
64    Current solution: xmit_recursion breaks dead loops. This is a percpu
65    counter, since cpu migration is forbidden once we enter the first
66    ndo_xmit(). We force an exit if this counter reaches RECURSION_LIMIT.
67
68    2. Networking dead loops would not kill routers, but would really
69    kill the network. The IP hop limit plays the role of "t->recursion" in
70    this case, if we copy it from the packet being encapsulated to the
71    upper header. It is a very good solution, but it introduces two problems:
72
73    - Routing protocols using packets with ttl=1 (OSPF, RIP2)
74      do not work over tunnels.
75    - traceroute does not work. I planned to relay ICMP from the tunnel,
76      so that this problem would be solved and traceroute output
77      would be even more informative. This idea turned out to be wrong:
78      only Linux complies with rfc1812 now (yes, guys, Linux is the only
79      true router now :-)); all other routers (at least, in my neighbourhood)
80      return only 8 bytes of payload. That is the end of it.
81
82    Hence, if we want OSPF to work or traceroute to say something reasonable,
83    we should search for another solution.
84
85    One of them is to parse the packet, trying to detect inner encapsulation
86    made by our node. It is difficult or even impossible, especially
87    taking fragmentation into account. In short, ttl is no solution at all.
88
89    Current solution: the fix was UNEXPECTEDLY SIMPLE.
90    We force the DF flag on tunnels with a preconfigured hop limit,
91    that is ALL. :-) Well, it does not remove the problem completely,
92    but exponential growth of network traffic is changed to linear
93    (branches that exceed the pmtu are pruned) and the tunnel mtu
94    rapidly degrades to a value <68, where looping stops.
95    Yes, it is not good if there exists a router in the loop
96    which does not force DF, even when the packets being encapsulated
97    have DF set. But it is not our problem! Nobody could accuse us;
98    we did all that we could. Even if it was your gated that injected
99    a fatal route into the network, even if it was you who configured
100    a fatal static route: you are innocent. :-)
101
102    Alexey Kuznetsov.
103  */
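
/* A minimal sketch of the guard described in (1), assuming the
 * dev_xmit_recursion*() helpers from include/linux/netdevice.h;
 * guarded_xmit() is hypothetical, for illustration only. Note for (2)
 * that each GRE-in-IP nesting adds at least 24 bytes (20 IP + 4 GRE),
 * so with DF forced a looping tunnel's path MTU shrinks by >=24 per
 * pass and quickly drops below 68.
 */
#if 0
static netdev_tx_t guarded_xmit(struct sk_buff *skb, struct net_device *dev)
{
	if (dev_xmit_recursion()) {
		/* Nested deeper than the percpu limit: break the dead loop. */
		kfree_skb(skb);
		dev->stats.tx_dropped++;
		return NETDEV_TX_OK;
	}
	dev_xmit_recursion_inc();
	/* ... the real encapsulation and transmit would run here ... */
	dev_xmit_recursion_dec();
	return NETDEV_TX_OK;
}
#endif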
104
105 static bool log_ecn_error = true;
106 module_param(log_ecn_error, bool, 0644);
107 MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
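/* This knob is also writable at runtime through sysfs (mode 0644 above),
 * e.g., assuming the module is built as ip_gre:
 *
 *	echo 0 > /sys/module/ip_gre/parameters/log_ecn_error
 */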
108
109 static struct rtnl_link_ops ipgre_link_ops __read_mostly;
110 static const struct header_ops ipgre_header_ops;
111
112 static int ipgre_tunnel_init(struct net_device *dev);
113 static void erspan_build_header(struct sk_buff *skb,
114                                 u32 id, u32 index,
115                                 bool truncate, bool is_ipv4);
116
117 static unsigned int ipgre_net_id __read_mostly;
118 static unsigned int gre_tap_net_id __read_mostly;
119 static unsigned int erspan_net_id __read_mostly;
120
121 static int ipgre_err(struct sk_buff *skb, u32 info,
122                      const struct tnl_ptk_info *tpi)
123 {
124
125         /* All the routers (except for Linux) return only
126            8 bytes of packet payload. This means that precise relaying of
127            ICMP in the real Internet is absolutely infeasible.
128
129            Moreover, Cisco "wise men" put the GRE key in the third word
130            of the GRE header. This makes it impossible to maintain even
131            soft state for keyed GRE tunnels with checksums enabled. Tell
132            them "thank you".
133
134            Well, I wonder: rfc1812 was written by a Cisco employee, so
135            why the hell do these idiots break standards established by
136            themselves???
137          */
138         struct net *net = dev_net(skb->dev);
139         struct ip_tunnel_net *itn;
140         const struct iphdr *iph;
141         const int type = icmp_hdr(skb)->type;
142         const int code = icmp_hdr(skb)->code;
143         unsigned int data_len = 0;
144         struct ip_tunnel *t;
145
146         if (tpi->proto == htons(ETH_P_TEB))
147                 itn = net_generic(net, gre_tap_net_id);
148         else if (tpi->proto == htons(ETH_P_ERSPAN) ||
149                  tpi->proto == htons(ETH_P_ERSPAN2))
150                 itn = net_generic(net, erspan_net_id);
151         else
152                 itn = net_generic(net, ipgre_net_id);
153
154         iph = (const struct iphdr *)(icmp_hdr(skb) + 1);
155         t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
156                              iph->daddr, iph->saddr, tpi->key);
157
158         if (!t)
159                 return -ENOENT;
160
161         switch (type) {
162         default:
163         case ICMP_PARAMETERPROB:
164                 return 0;
165
166         case ICMP_DEST_UNREACH:
167                 switch (code) {
168                 case ICMP_SR_FAILED:
169                 case ICMP_PORT_UNREACH:
170                         /* Impossible event. */
171                         return 0;
172                 default:
173                         /* All others are translated to HOST_UNREACH.
174                            rfc2003 contains "deep thoughts" about NET_UNREACH,
175                            I believe they are just ether pollution. --ANK
176                          */
177                         break;
178                 }
179                 break;
180
181         case ICMP_TIME_EXCEEDED:
182                 if (code != ICMP_EXC_TTL)
183                         return 0;
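                        /* RFC 4884 expresses the original-datagram length in
                         * 32-bit words, hence the multiplication by 4 below.
                         */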
184                 data_len = icmp_hdr(skb)->un.reserved[1] * 4; /* RFC 4884 4.1 */
185                 break;
186
187         case ICMP_REDIRECT:
188                 break;
189         }
190
191 #if IS_ENABLED(CONFIG_IPV6)
192         if (tpi->proto == htons(ETH_P_IPV6) &&
193             !ip6_err_gen_icmpv6_unreach(skb, iph->ihl * 4 + tpi->hdr_len,
194                                         type, data_len))
195                 return 0;
196 #endif
197
198         if (t->parms.iph.daddr == 0 ||
199             ipv4_is_multicast(t->parms.iph.daddr))
200                 return 0;
201
202         if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
203                 return 0;
204
205         if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
206                 t->err_count++;
207         else
208                 t->err_count = 1;
209         t->err_time = jiffies;
210
211         return 0;
212 }
213
214 static void gre_err(struct sk_buff *skb, u32 info)
215 {
216         /* All the routers (except for Linux) return only
217          * 8 bytes of packet payload. This means that precise relaying of
218          * ICMP in the real Internet is absolutely infeasible.
219          *
220          * Moreover, Cisco "wise men" put the GRE key in the third word
221          * of the GRE header. This makes it impossible to maintain even
222          * soft state for keyed GRE tunnels with checksums enabled. Tell
223          * them "thank you".
224          *
225          * Well, I wonder: rfc1812 was written by a Cisco employee, so
226          * why the hell do these idiots break standards established by
227          * themselves???
228          */
229
230         const struct iphdr *iph = (struct iphdr *)skb->data;
231         const int type = icmp_hdr(skb)->type;
232         const int code = icmp_hdr(skb)->code;
233         struct tnl_ptk_info tpi;
234
235         if (gre_parse_header(skb, &tpi, NULL, htons(ETH_P_IP),
236                              iph->ihl * 4) < 0)
237                 return;
238
239         if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
240                 ipv4_update_pmtu(skb, dev_net(skb->dev), info,
241                                  skb->dev->ifindex, IPPROTO_GRE);
242                 return;
243         }
244         if (type == ICMP_REDIRECT) {
245                 ipv4_redirect(skb, dev_net(skb->dev), skb->dev->ifindex,
246                               IPPROTO_GRE);
247                 return;
248         }
249
250         ipgre_err(skb, info, &tpi);
251 }
252
253 static bool is_erspan_type1(int gre_hdr_len)
254 {
255         /* Both ERSPAN type I (version 0) and type II (version 1) use
256          * protocol 0x88BE, but type I has only a 4-byte GRE header,
257          * while type II has an 8-byte one.
258          */
259         return gre_hdr_len == 4;
260 }
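
/* A quick sanity sketch of where those two lengths come from, assuming
 * gre_calc_hlen() from include/net/gre.h; illustration only, not part
 * of this file's logic.
 */
#if 0
static void erspan_gre_hlen_example(void)
{
	/* Type I carries no optional GRE fields: 4-byte base header. */
	WARN_ON(gre_calc_hlen(0) != 4);
	/* Type II always carries the 4-byte sequence number as well. */
	WARN_ON(gre_calc_hlen(TUNNEL_SEQ) != 8);
}
#endif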
261
262 static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi,
263                       int gre_hdr_len)
264 {
265         struct net *net = dev_net(skb->dev);
266         struct metadata_dst *tun_dst = NULL;
267         struct erspan_base_hdr *ershdr;
268         struct ip_tunnel_net *itn;
269         struct ip_tunnel *tunnel;
270         const struct iphdr *iph;
271         struct erspan_md2 *md2;
272         int ver;
273         int len;
274
275         itn = net_generic(net, erspan_net_id);
276         iph = ip_hdr(skb);
277         if (is_erspan_type1(gre_hdr_len)) {
278                 ver = 0;
279                 tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex,
280                                           tpi->flags | TUNNEL_NO_KEY,
281                                           iph->saddr, iph->daddr, 0);
282         } else {
283                 ershdr = (struct erspan_base_hdr *)(skb->data + gre_hdr_len);
284                 ver = ershdr->ver;
285                 tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex,
286                                           tpi->flags | TUNNEL_KEY,
287                                           iph->saddr, iph->daddr, tpi->key);
288         }
289
290         if (tunnel) {
291                 if (is_erspan_type1(gre_hdr_len))
292                         len = gre_hdr_len;
293                 else
294                         len = gre_hdr_len + erspan_hdr_len(ver);
295
296                 if (unlikely(!pskb_may_pull(skb, len)))
297                         return PACKET_REJECT;
298
299                 if (__iptunnel_pull_header(skb,
300                                            len,
301                                            htons(ETH_P_TEB),
302                                            false, false) < 0)
303                         goto drop;
304
305                 if (tunnel->collect_md) {
306                         struct erspan_metadata *pkt_md, *md;
307                         struct ip_tunnel_info *info;
308                         unsigned char *gh;
309                         __be64 tun_id;
310                         __be16 flags;
311
312                         tpi->flags |= TUNNEL_KEY;
313                         flags = tpi->flags;
314                         tun_id = key32_to_tunnel_id(tpi->key);
315
316                         tun_dst = ip_tun_rx_dst(skb, flags,
317                                                 tun_id, sizeof(*md));
318                         if (!tun_dst)
319                                 return PACKET_REJECT;
320
321                         /* The skb can be uncloned in __iptunnel_pull_header,
322                          * so the old pkt_md is no longer valid and we need
323                          * to reset it.
324                          */
325                         gh = skb_network_header(skb) +
326                              skb_network_header_len(skb);
327                         pkt_md = (struct erspan_metadata *)(gh + gre_hdr_len +
328                                                             sizeof(*ershdr));
329                         md = ip_tunnel_info_opts(&tun_dst->u.tun_info);
330                         md->version = ver;
331                         md2 = &md->u.md2;
332                         memcpy(md2, pkt_md, ver == 1 ? ERSPAN_V1_MDSIZE :
333                                                        ERSPAN_V2_MDSIZE);
334
335                         info = &tun_dst->u.tun_info;
336                         info->key.tun_flags |= TUNNEL_ERSPAN_OPT;
337                         info->options_len = sizeof(*md);
338                 }
339
340                 skb_reset_mac_header(skb);
341                 ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error);
342                 return PACKET_RCVD;
343         }
344         return PACKET_REJECT;
345
346 drop:
347         kfree_skb(skb);
348         return PACKET_RCVD;
349 }
350
351 static int __ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
352                        struct ip_tunnel_net *itn, int hdr_len, bool raw_proto)
353 {
354         struct metadata_dst *tun_dst = NULL;
355         const struct iphdr *iph;
356         struct ip_tunnel *tunnel;
357
358         iph = ip_hdr(skb);
359         tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
360                                   iph->saddr, iph->daddr, tpi->key);
361
362         if (tunnel) {
363                 const struct iphdr *tnl_params;
364
365                 if (__iptunnel_pull_header(skb, hdr_len, tpi->proto,
366                                            raw_proto, false) < 0)
367                         goto drop;
368
369                 /* Special case for ipgre_header_parse(), which expects the
370                  * mac_header to point to the outer IP header.
371                  */
372                 if (tunnel->dev->header_ops == &ipgre_header_ops)
373                         skb_pop_mac_header(skb);
374                 else
375                         skb_reset_mac_header(skb);
376
377                 tnl_params = &tunnel->parms.iph;
378                 if (tunnel->collect_md || tnl_params->daddr == 0) {
379                         __be16 flags;
380                         __be64 tun_id;
381
382                         flags = tpi->flags & (TUNNEL_CSUM | TUNNEL_KEY);
383                         tun_id = key32_to_tunnel_id(tpi->key);
384                         tun_dst = ip_tun_rx_dst(skb, flags, tun_id, 0);
385                         if (!tun_dst)
386                                 return PACKET_REJECT;
387                 }
388
389                 ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error);
390                 return PACKET_RCVD;
391         }
392         return PACKET_NEXT;
393
394 drop:
395         kfree_skb(skb);
396         return PACKET_RCVD;
397 }
398
399 static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
400                      int hdr_len)
401 {
402         struct net *net = dev_net(skb->dev);
403         struct ip_tunnel_net *itn;
404         int res;
405
406         if (tpi->proto == htons(ETH_P_TEB))
407                 itn = net_generic(net, gre_tap_net_id);
408         else
409                 itn = net_generic(net, ipgre_net_id);
410
411         res = __ipgre_rcv(skb, tpi, itn, hdr_len, false);
412         if (res == PACKET_NEXT && tpi->proto == htons(ETH_P_TEB)) {
413                 /* ipgre tunnels in collect metadata mode should also
414                  * receive ETH_P_TEB traffic.
415                  */
416                 itn = net_generic(net, ipgre_net_id);
417                 res = __ipgre_rcv(skb, tpi, itn, hdr_len, true);
418         }
419         return res;
420 }
421
422 static int gre_rcv(struct sk_buff *skb)
423 {
424         struct tnl_ptk_info tpi;
425         bool csum_err = false;
426         int hdr_len;
427
428 #ifdef CONFIG_NET_IPGRE_BROADCAST
429         if (ipv4_is_multicast(ip_hdr(skb)->daddr)) {
430                 /* Looped back packet, drop it! */
431                 if (rt_is_output_route(skb_rtable(skb)))
432                         goto drop;
433         }
434 #endif
435
436         hdr_len = gre_parse_header(skb, &tpi, &csum_err, htons(ETH_P_IP), 0);
437         if (hdr_len < 0)
438                 goto drop;
439
440         if (unlikely(tpi.proto == htons(ETH_P_ERSPAN) ||
441                      tpi.proto == htons(ETH_P_ERSPAN2))) {
442                 if (erspan_rcv(skb, &tpi, hdr_len) == PACKET_RCVD)
443                         return 0;
444                 goto out;
445         }
446
447         if (ipgre_rcv(skb, &tpi, hdr_len) == PACKET_RCVD)
448                 return 0;
449
450 out:
451         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
452 drop:
453         kfree_skb(skb);
454         return 0;
455 }
456
457 static void __gre_xmit(struct sk_buff *skb, struct net_device *dev,
458                        const struct iphdr *tnl_params,
459                        __be16 proto)
460 {
461         struct ip_tunnel *tunnel = netdev_priv(dev);
462         __be16 flags = tunnel->parms.o_flags;
463
464         /* Push GRE header. */
465         gre_build_header(skb, tunnel->tun_hlen,
466                          flags, proto, tunnel->parms.o_key,
467                          (flags & TUNNEL_SEQ) ? htonl(atomic_fetch_inc(&tunnel->o_seqno)) : 0);
468
469         ip_tunnel_xmit(skb, dev, tnl_params, tnl_params->protocol);
470 }
471
472 static int gre_handle_offloads(struct sk_buff *skb, bool csum)
473 {
474         return iptunnel_handle_offloads(skb, csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE);
475 }
476
477 static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev,
478                         __be16 proto)
479 {
480         struct ip_tunnel *tunnel = netdev_priv(dev);
481         struct ip_tunnel_info *tun_info;
482         const struct ip_tunnel_key *key;
483         int tunnel_hlen;
484         __be16 flags;
485
486         tun_info = skb_tunnel_info(skb);
487         if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
488                      ip_tunnel_info_af(tun_info) != AF_INET))
489                 goto err_free_skb;
490
491         key = &tun_info->key;
492         tunnel_hlen = gre_calc_hlen(key->tun_flags);
493
494         if (skb_cow_head(skb, dev->needed_headroom))
495                 goto err_free_skb;
496
497         /* Push Tunnel header. */
498         if (gre_handle_offloads(skb, !!(tun_info->key.tun_flags & TUNNEL_CSUM)))
499                 goto err_free_skb;
500
501         flags = tun_info->key.tun_flags &
502                 (TUNNEL_CSUM | TUNNEL_KEY | TUNNEL_SEQ);
503         gre_build_header(skb, tunnel_hlen, flags, proto,
504                          tunnel_id_to_key32(tun_info->key.tun_id),
505                          (flags & TUNNEL_SEQ) ? htonl(atomic_fetch_inc(&tunnel->o_seqno)) : 0);
506
507         ip_md_tunnel_xmit(skb, dev, IPPROTO_GRE, tunnel_hlen);
508
509         return;
510
511 err_free_skb:
512         kfree_skb(skb);
513         dev->stats.tx_dropped++;
514 }
515
516 static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev)
517 {
518         struct ip_tunnel *tunnel = netdev_priv(dev);
519         struct ip_tunnel_info *tun_info;
520         const struct ip_tunnel_key *key;
521         struct erspan_metadata *md;
522         bool truncate = false;
523         __be16 proto;
524         int tunnel_hlen;
525         int version;
526         int nhoff;
527
528         tun_info = skb_tunnel_info(skb);
529         if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
530                      ip_tunnel_info_af(tun_info) != AF_INET))
531                 goto err_free_skb;
532
533         key = &tun_info->key;
534         if (!(tun_info->key.tun_flags & TUNNEL_ERSPAN_OPT))
535                 goto err_free_skb;
536         if (tun_info->options_len < sizeof(*md))
537                 goto err_free_skb;
538         md = ip_tunnel_info_opts(tun_info);
539
540         /* ERSPAN has a fixed 8-byte GRE header */
541         version = md->version;
542         tunnel_hlen = 8 + erspan_hdr_len(version);
543
544         if (skb_cow_head(skb, dev->needed_headroom))
545                 goto err_free_skb;
546
547         if (gre_handle_offloads(skb, false))
548                 goto err_free_skb;
549
550         if (skb->len > dev->mtu + dev->hard_header_len) {
551                 pskb_trim(skb, dev->mtu + dev->hard_header_len);
552                 truncate = true;
553         }
554
555         nhoff = skb_network_offset(skb);
556         if (skb->protocol == htons(ETH_P_IP) &&
557             (ntohs(ip_hdr(skb)->tot_len) > skb->len - nhoff))
558                 truncate = true;
559
560         if (skb->protocol == htons(ETH_P_IPV6)) {
561                 int thoff;
562
563                 if (skb_transport_header_was_set(skb))
564                         thoff = skb_transport_offset(skb);
565                 else
566                         thoff = nhoff + sizeof(struct ipv6hdr);
567                 if (ntohs(ipv6_hdr(skb)->payload_len) > skb->len - thoff)
568                         truncate = true;
569         }
570
571         if (version == 1) {
572                 erspan_build_header(skb, ntohl(tunnel_id_to_key32(key->tun_id)),
573                                     ntohl(md->u.index), truncate, true);
574                 proto = htons(ETH_P_ERSPAN);
575         } else if (version == 2) {
576                 erspan_build_header_v2(skb,
577                                        ntohl(tunnel_id_to_key32(key->tun_id)),
578                                        md->u.md2.dir,
579                                        get_hwid(&md->u.md2),
580                                        truncate, true);
581                 proto = htons(ETH_P_ERSPAN2);
582         } else {
583                 goto err_free_skb;
584         }
585
586         gre_build_header(skb, 8, TUNNEL_SEQ,
587                          proto, 0, htonl(atomic_fetch_inc(&tunnel->o_seqno)));
588
589         ip_md_tunnel_xmit(skb, dev, IPPROTO_GRE, tunnel_hlen);
590
591         return;
592
593 err_free_skb:
594         kfree_skb(skb);
595         dev->stats.tx_dropped++;
596 }
597
598 static int gre_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
599 {
600         struct ip_tunnel_info *info = skb_tunnel_info(skb);
601         const struct ip_tunnel_key *key;
602         struct rtable *rt;
603         struct flowi4 fl4;
604
605         if (ip_tunnel_info_af(info) != AF_INET)
606                 return -EINVAL;
607
608         key = &info->key;
609         ip_tunnel_init_flow(&fl4, IPPROTO_GRE, key->u.ipv4.dst, key->u.ipv4.src,
610                             tunnel_id_to_key32(key->tun_id),
611                             key->tos & ~INET_ECN_MASK, 0, skb->mark,
612                             skb_get_hash(skb));
613         rt = ip_route_output_key(dev_net(dev), &fl4);
614         if (IS_ERR(rt))
615                 return PTR_ERR(rt);
616
617         ip_rt_put(rt);
618         info->key.u.ipv4.src = fl4.saddr;
619         return 0;
620 }
621
622 static netdev_tx_t ipgre_xmit(struct sk_buff *skb,
623                               struct net_device *dev)
624 {
625         struct ip_tunnel *tunnel = netdev_priv(dev);
626         const struct iphdr *tnl_params;
627
628         if (!pskb_inet_may_pull(skb))
629                 goto free_skb;
630
631         if (tunnel->collect_md) {
632                 gre_fb_xmit(skb, dev, skb->protocol);
633                 return NETDEV_TX_OK;
634         }
635
636         if (dev->header_ops) {
637                 if (skb_cow_head(skb, 0))
638                         goto free_skb;
639
640                 tnl_params = (const struct iphdr *)skb->data;
641
642                 /* Pull skb since ip_tunnel_xmit() needs skb->data pointing
643                  * to the GRE header.
644                  */
645                 skb_pull(skb, tunnel->hlen + sizeof(struct iphdr));
646                 skb_reset_mac_header(skb);
647
648                 if (skb->ip_summed == CHECKSUM_PARTIAL &&
649                     skb_checksum_start(skb) < skb->data)
650                         goto free_skb;
651         } else {
652                 if (skb_cow_head(skb, dev->needed_headroom))
653                         goto free_skb;
654
655                 tnl_params = &tunnel->parms.iph;
656         }
657
658         if (gre_handle_offloads(skb, !!(tunnel->parms.o_flags & TUNNEL_CSUM)))
659                 goto free_skb;
660
661         __gre_xmit(skb, dev, tnl_params, skb->protocol);
662         return NETDEV_TX_OK;
663
664 free_skb:
665         kfree_skb(skb);
666         dev->stats.tx_dropped++;
667         return NETDEV_TX_OK;
668 }
669
670 static netdev_tx_t erspan_xmit(struct sk_buff *skb,
671                                struct net_device *dev)
672 {
673         struct ip_tunnel *tunnel = netdev_priv(dev);
674         bool truncate = false;
675         __be16 proto;
676
677         if (!pskb_inet_may_pull(skb))
678                 goto free_skb;
679
680         if (tunnel->collect_md) {
681                 erspan_fb_xmit(skb, dev);
682                 return NETDEV_TX_OK;
683         }
684
685         if (gre_handle_offloads(skb, false))
686                 goto free_skb;
687
688         if (skb_cow_head(skb, dev->needed_headroom))
689                 goto free_skb;
690
691         if (skb->len > dev->mtu + dev->hard_header_len) {
692                 pskb_trim(skb, dev->mtu + dev->hard_header_len);
693                 truncate = true;
694         }
695
696         /* Push ERSPAN header */
697         if (tunnel->erspan_ver == 0) {
698                 proto = htons(ETH_P_ERSPAN);
699                 tunnel->parms.o_flags &= ~TUNNEL_SEQ;
700         } else if (tunnel->erspan_ver == 1) {
701                 erspan_build_header(skb, ntohl(tunnel->parms.o_key),
702                                     tunnel->index,
703                                     truncate, true);
704                 proto = htons(ETH_P_ERSPAN);
705         } else if (tunnel->erspan_ver == 2) {
706                 erspan_build_header_v2(skb, ntohl(tunnel->parms.o_key),
707                                        tunnel->dir, tunnel->hwid,
708                                        truncate, true);
709                 proto = htons(ETH_P_ERSPAN2);
710         } else {
711                 goto free_skb;
712         }
713
714         tunnel->parms.o_flags &= ~TUNNEL_KEY;
715         __gre_xmit(skb, dev, &tunnel->parms.iph, proto);
716         return NETDEV_TX_OK;
717
718 free_skb:
719         kfree_skb(skb);
720         dev->stats.tx_dropped++;
721         return NETDEV_TX_OK;
722 }
723
724 static netdev_tx_t gre_tap_xmit(struct sk_buff *skb,
725                                 struct net_device *dev)
726 {
727         struct ip_tunnel *tunnel = netdev_priv(dev);
728
729         if (!pskb_inet_may_pull(skb))
730                 goto free_skb;
731
732         if (tunnel->collect_md) {
733                 gre_fb_xmit(skb, dev, htons(ETH_P_TEB));
734                 return NETDEV_TX_OK;
735         }
736
737         if (gre_handle_offloads(skb, !!(tunnel->parms.o_flags & TUNNEL_CSUM)))
738                 goto free_skb;
739
740         if (skb_cow_head(skb, dev->needed_headroom))
741                 goto free_skb;
742
743         __gre_xmit(skb, dev, &tunnel->parms.iph, htons(ETH_P_TEB));
744         return NETDEV_TX_OK;
745
746 free_skb:
747         kfree_skb(skb);
748         dev->stats.tx_dropped++;
749         return NETDEV_TX_OK;
750 }
751
752 static void ipgre_link_update(struct net_device *dev, bool set_mtu)
753 {
754         struct ip_tunnel *tunnel = netdev_priv(dev);
755         int len;
756
757         len = tunnel->tun_hlen;
758         tunnel->tun_hlen = gre_calc_hlen(tunnel->parms.o_flags);
759         len = tunnel->tun_hlen - len;
760         tunnel->hlen = tunnel->hlen + len;
761
762         if (dev->header_ops)
763                 dev->hard_header_len += len;
764         else
765                 dev->needed_headroom += len;
766
767         if (set_mtu)
768                 dev->mtu = max_t(int, dev->mtu - len, 68);
769
770         if (!(tunnel->parms.o_flags & TUNNEL_SEQ)) {
771                 if (!(tunnel->parms.o_flags & TUNNEL_CSUM) ||
772                     tunnel->encap.type == TUNNEL_ENCAP_NONE) {
773                         dev->features |= NETIF_F_GSO_SOFTWARE;
774                         dev->hw_features |= NETIF_F_GSO_SOFTWARE;
775                 } else {
776                         dev->features &= ~NETIF_F_GSO_SOFTWARE;
777                         dev->hw_features &= ~NETIF_F_GSO_SOFTWARE;
778                 }
779                 dev->features |= NETIF_F_LLTX;
780         } else {
781                 dev->hw_features &= ~NETIF_F_GSO_SOFTWARE;
782                 dev->features &= ~(NETIF_F_LLTX | NETIF_F_GSO_SOFTWARE);
783         }
784 }
785
786 static int ipgre_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p,
787                             int cmd)
788 {
789         int err;
790
791         if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) {
792                 if (p->iph.version != 4 || p->iph.protocol != IPPROTO_GRE ||
793                     p->iph.ihl != 5 || (p->iph.frag_off & htons(~IP_DF)) ||
794                     ((p->i_flags | p->o_flags) & (GRE_VERSION | GRE_ROUTING)))
795                         return -EINVAL;
796         }
797
798         p->i_flags = gre_flags_to_tnl_flags(p->i_flags);
799         p->o_flags = gre_flags_to_tnl_flags(p->o_flags);
800
801         err = ip_tunnel_ctl(dev, p, cmd);
802         if (err)
803                 return err;
804
805         if (cmd == SIOCCHGTUNNEL) {
806                 struct ip_tunnel *t = netdev_priv(dev);
807
808                 t->parms.i_flags = p->i_flags;
809                 t->parms.o_flags = p->o_flags;
810
811                 if (strcmp(dev->rtnl_link_ops->kind, "erspan"))
812                         ipgre_link_update(dev, true);
813         }
814
815         p->i_flags = gre_tnl_flags_to_gre_flags(p->i_flags);
816         p->o_flags = gre_tnl_flags_to_gre_flags(p->o_flags);
817         return 0;
818 }
819
820 /* Nice toy. Unfortunately, useless in real life :-)
821    It allows one to construct a virtual multiprotocol broadcast "LAN"
822    over the Internet, provided multicast routing is tuned.
823
824
825    I have no idea whether this bicycle was invented before me,
826    so I had to set ARPHRD_IPGRE to a random value.
827    I have the impression that Cisco could have made something similar,
828    but this feature is apparently missing in IOS<=11.2(8).
829
830    I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
831    with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
832
833    ping -t 255 224.66.66.66
834
835    If nobody answers, mbone does not work.
836
837    ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
838    ip addr add 10.66.66.<somewhat>/24 dev Universe
839    ifconfig Universe up
840    ifconfig Universe add fe80::<Your_real_addr>/10
841    ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
842    ftp 10.66.66.66
843    ...
844    ftp fec0:6666:6666::193.233.7.65
845    ...
846  */
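/* A modern iproute2 equivalent of the ifconfig lines in the example
 * above (sketch, same placeholder addresses):
 *
 *	ip link set Universe up
 *	ip addr add fe80::<Your_real_addr>/10 dev Universe
 *	ip addr add fec0:6666:6666::<Your_real_addr>/96 dev Universe
 */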
847 static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
848                         unsigned short type,
849                         const void *daddr, const void *saddr, unsigned int len)
850 {
851         struct ip_tunnel *t = netdev_priv(dev);
852         struct iphdr *iph;
853         struct gre_base_hdr *greh;
854
855         iph = skb_push(skb, t->hlen + sizeof(*iph));
856         greh = (struct gre_base_hdr *)(iph+1);
857         greh->flags = gre_tnl_flags_to_gre_flags(t->parms.o_flags);
858         greh->protocol = htons(type);
859
860         memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
861
862         /* Set the source hardware address. */
863         if (saddr)
864                 memcpy(&iph->saddr, saddr, 4);
865         if (daddr)
866                 memcpy(&iph->daddr, daddr, 4);
867         if (iph->daddr)
868                 return t->hlen + sizeof(*iph);
869
870         return -(t->hlen + sizeof(*iph));
871 }
872
873 static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
874 {
875         const struct iphdr *iph = (const struct iphdr *) skb_mac_header(skb);
876         memcpy(haddr, &iph->saddr, 4);
877         return 4;
878 }
879
880 static const struct header_ops ipgre_header_ops = {
881         .create = ipgre_header,
882         .parse  = ipgre_header_parse,
883 };
884
885 #ifdef CONFIG_NET_IPGRE_BROADCAST
886 static int ipgre_open(struct net_device *dev)
887 {
888         struct ip_tunnel *t = netdev_priv(dev);
889
890         if (ipv4_is_multicast(t->parms.iph.daddr)) {
891                 struct flowi4 fl4;
892                 struct rtable *rt;
893
894                 rt = ip_route_output_gre(t->net, &fl4,
895                                          t->parms.iph.daddr,
896                                          t->parms.iph.saddr,
897                                          t->parms.o_key,
898                                          RT_TOS(t->parms.iph.tos),
899                                          t->parms.link);
900                 if (IS_ERR(rt))
901                         return -EADDRNOTAVAIL;
902                 dev = rt->dst.dev;
903                 ip_rt_put(rt);
904                 if (!__in_dev_get_rtnl(dev))
905                         return -EADDRNOTAVAIL;
906                 t->mlink = dev->ifindex;
907                 ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
908         }
909         return 0;
910 }
911
912 static int ipgre_close(struct net_device *dev)
913 {
914         struct ip_tunnel *t = netdev_priv(dev);
915
916         if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
917                 struct in_device *in_dev;
918                 in_dev = inetdev_by_index(t->net, t->mlink);
919                 if (in_dev)
920                         ip_mc_dec_group(in_dev, t->parms.iph.daddr);
921         }
922         return 0;
923 }
924 #endif
925
926 static const struct net_device_ops ipgre_netdev_ops = {
927         .ndo_init               = ipgre_tunnel_init,
928         .ndo_uninit             = ip_tunnel_uninit,
929 #ifdef CONFIG_NET_IPGRE_BROADCAST
930         .ndo_open               = ipgre_open,
931         .ndo_stop               = ipgre_close,
932 #endif
933         .ndo_start_xmit         = ipgre_xmit,
934         .ndo_siocdevprivate     = ip_tunnel_siocdevprivate,
935         .ndo_change_mtu         = ip_tunnel_change_mtu,
936         .ndo_get_stats64        = dev_get_tstats64,
937         .ndo_get_iflink         = ip_tunnel_get_iflink,
938         .ndo_tunnel_ctl         = ipgre_tunnel_ctl,
939 };
940
941 #define GRE_FEATURES (NETIF_F_SG |              \
942                       NETIF_F_FRAGLIST |        \
943                       NETIF_F_HIGHDMA |         \
944                       NETIF_F_HW_CSUM)
945
946 static void ipgre_tunnel_setup(struct net_device *dev)
947 {
948         dev->netdev_ops         = &ipgre_netdev_ops;
949         dev->type               = ARPHRD_IPGRE;
950         ip_tunnel_setup(dev, ipgre_net_id);
951 }
952
953 static void __gre_tunnel_init(struct net_device *dev)
954 {
955         struct ip_tunnel *tunnel;
956
957         tunnel = netdev_priv(dev);
958         tunnel->tun_hlen = gre_calc_hlen(tunnel->parms.o_flags);
959         tunnel->parms.iph.protocol = IPPROTO_GRE;
960
961         tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen;
962         dev->needed_headroom = tunnel->hlen + sizeof(tunnel->parms.iph);
963
964         dev->features           |= GRE_FEATURES;
965         dev->hw_features        |= GRE_FEATURES;
966
967         if (!(tunnel->parms.o_flags & TUNNEL_SEQ)) {
968                 /* TCP offload with GRE SEQ is not supported, nor
969                  * can we support 2 levels of outer headers requiring
970                  * an update.
971                  */
972                 if (!(tunnel->parms.o_flags & TUNNEL_CSUM) ||
973                     (tunnel->encap.type == TUNNEL_ENCAP_NONE)) {
974                         dev->features    |= NETIF_F_GSO_SOFTWARE;
975                         dev->hw_features |= NETIF_F_GSO_SOFTWARE;
976                 }
977
978                 /* Can use a lockless transmit, unless we generate
979                  * output sequences.
980                  */
981                 dev->features |= NETIF_F_LLTX;
982         }
983 }
984
985 static int ipgre_tunnel_init(struct net_device *dev)
986 {
987         struct ip_tunnel *tunnel = netdev_priv(dev);
988         struct iphdr *iph = &tunnel->parms.iph;
989
990         __gre_tunnel_init(dev);
991
992         memcpy(dev->dev_addr, &iph->saddr, 4);
993         memcpy(dev->broadcast, &iph->daddr, 4);
994
995         dev->flags              = IFF_NOARP;
996         netif_keep_dst(dev);
997         dev->addr_len           = 4;
998
999         if (iph->daddr && !tunnel->collect_md) {
1000 #ifdef CONFIG_NET_IPGRE_BROADCAST
1001                 if (ipv4_is_multicast(iph->daddr)) {
1002                         if (!iph->saddr)
1003                                 return -EINVAL;
1004                         dev->flags = IFF_BROADCAST;
1005                         dev->header_ops = &ipgre_header_ops;
1006                         dev->hard_header_len = tunnel->hlen + sizeof(*iph);
1007                         dev->needed_headroom = 0;
1008                 }
1009 #endif
1010         } else if (!tunnel->collect_md) {
1011                 dev->header_ops = &ipgre_header_ops;
1012                 dev->hard_header_len = tunnel->hlen + sizeof(*iph);
1013                 dev->needed_headroom = 0;
1014         }
1015
1016         return ip_tunnel_init(dev);
1017 }
1018
1019 static const struct gre_protocol ipgre_protocol = {
1020         .handler     = gre_rcv,
1021         .err_handler = gre_err,
1022 };
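
/* This handler pair ties gre_rcv()/gre_err() into the GRE demultiplexer;
 * registration happens at module init, presumably via
 * gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO) (sketch of the call;
 * the registration itself is not shown in this excerpt).
 */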
1023
1024 static int __net_init ipgre_init_net(struct net *net)
1025 {
1026         return ip_tunnel_init_net(net, ipgre_net_id, &ipgre_link_ops, NULL);
1027 }
1028
1029 static void __net_exit ipgre_exit_batch_net(struct list_head *list_net)
1030 {
1031         ip_tunnel_delete_nets(list_net, ipgre_net_id, &ipgre_link_ops);
1032 }
1033
1034 static struct pernet_operations ipgre_net_ops = {
1035         .init = ipgre_init_net,
1036         .exit_batch = ipgre_exit_batch_net,
1037         .id   = &ipgre_net_id,
1038         .size = sizeof(struct ip_tunnel_net),
1039 };
1040
1041 static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[],
1042                                  struct netlink_ext_ack *extack)
1043 {
1044         __be16 flags;
1045
1046         if (!data)
1047                 return 0;
1048
1049         flags = 0;
1050         if (data[IFLA_GRE_IFLAGS])
1051                 flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
1052         if (data[IFLA_GRE_OFLAGS])
1053                 flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
1054         if (flags & (GRE_VERSION|GRE_ROUTING))
1055                 return -EINVAL;
1056
1057         if (data[IFLA_GRE_COLLECT_METADATA] &&
1058             data[IFLA_GRE_ENCAP_TYPE] &&
1059             nla_get_u16(data[IFLA_GRE_ENCAP_TYPE]) != TUNNEL_ENCAP_NONE)
1060                 return -EINVAL;
1061
1062         return 0;
1063 }
1064
1065 static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[],
1066                               struct netlink_ext_ack *extack)
1067 {
1068         __be32 daddr;
1069
1070         if (tb[IFLA_ADDRESS]) {
1071                 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
1072                         return -EINVAL;
1073                 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
1074                         return -EADDRNOTAVAIL;
1075         }
1076
1077         if (!data)
1078                 goto out;
1079
1080         if (data[IFLA_GRE_REMOTE]) {
1081                 memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
1082                 if (!daddr)
1083                         return -EINVAL;
1084         }
1085
1086 out:
1087         return ipgre_tunnel_validate(tb, data, extack);
1088 }
1089
1090 static int erspan_validate(struct nlattr *tb[], struct nlattr *data[],
1091                            struct netlink_ext_ack *extack)
1092 {
1093         __be16 flags = 0;
1094         int ret;
1095
1096         if (!data)
1097                 return 0;
1098
1099         ret = ipgre_tap_validate(tb, data, extack);
1100         if (ret)
1101                 return ret;
1102
1103         if (data[IFLA_GRE_ERSPAN_VER] &&
1104             nla_get_u8(data[IFLA_GRE_ERSPAN_VER]) == 0)
1105                 return 0;
1106
1107         /* ERSPAN type II/III should only have the GRE sequence and key flags */
1108         if (data[IFLA_GRE_OFLAGS])
1109                 flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
1110         if (data[IFLA_GRE_IFLAGS])
1111                 flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
1112         if (!data[IFLA_GRE_COLLECT_METADATA] &&
1113             flags != (GRE_SEQ | GRE_KEY))
1114                 return -EINVAL;
1115
1116         /* The ERSPAN session ID is only 10 bits wide. Since we reuse
1117          * the 32-bit key field as the ID, check its range.
1118          */
1119         if (data[IFLA_GRE_IKEY] &&
1120             (ntohl(nla_get_be32(data[IFLA_GRE_IKEY])) & ~ID_MASK))
1121                 return -EINVAL;
1122
1123         if (data[IFLA_GRE_OKEY] &&
1124             (ntohl(nla_get_be32(data[IFLA_GRE_OKEY])) & ~ID_MASK))
1125                 return -EINVAL;
1126
1127         return 0;
1128 }
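
/* Worked example for the range check above, assuming ID_MASK covers the
 * low 10 bits: an okey of htonl(1023) is accepted, while htonl(1024)
 * has bits outside ID_MASK and is rejected, since session IDs above
 * 1023 cannot be represented.
 */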
1129
1130 static int ipgre_netlink_parms(struct net_device *dev,
1131                                 struct nlattr *data[],
1132                                 struct nlattr *tb[],
1133                                 struct ip_tunnel_parm *parms,
1134                                 __u32 *fwmark)
1135 {
1136         struct ip_tunnel *t = netdev_priv(dev);
1137
1138         memset(parms, 0, sizeof(*parms));
1139
1140         parms->iph.protocol = IPPROTO_GRE;
1141
1142         if (!data)
1143                 return 0;
1144
1145         if (data[IFLA_GRE_LINK])
1146                 parms->link = nla_get_u32(data[IFLA_GRE_LINK]);
1147
1148         if (data[IFLA_GRE_IFLAGS])
1149                 parms->i_flags = gre_flags_to_tnl_flags(nla_get_be16(data[IFLA_GRE_IFLAGS]));
1150
1151         if (data[IFLA_GRE_OFLAGS])
1152                 parms->o_flags = gre_flags_to_tnl_flags(nla_get_be16(data[IFLA_GRE_OFLAGS]));
1153
1154         if (data[IFLA_GRE_IKEY])
1155                 parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);
1156
1157         if (data[IFLA_GRE_OKEY])
1158                 parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);
1159
1160         if (data[IFLA_GRE_LOCAL])
1161                 parms->iph.saddr = nla_get_in_addr(data[IFLA_GRE_LOCAL]);
1162
1163         if (data[IFLA_GRE_REMOTE])
1164                 parms->iph.daddr = nla_get_in_addr(data[IFLA_GRE_REMOTE]);
1165
1166         if (data[IFLA_GRE_TTL])
1167                 parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);
1168
1169         if (data[IFLA_GRE_TOS])
1170                 parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);
1171
1172         if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC])) {
1173                 if (t->ignore_df)
1174                         return -EINVAL;
1175                 parms->iph.frag_off = htons(IP_DF);
1176         }
1177
1178         if (data[IFLA_GRE_COLLECT_METADATA]) {
1179                 t->collect_md = true;
1180                 if (dev->type == ARPHRD_IPGRE)
1181                         dev->type = ARPHRD_NONE;
1182         }
1183
1184         if (data[IFLA_GRE_IGNORE_DF]) {
1185                 if (nla_get_u8(data[IFLA_GRE_IGNORE_DF])
1186                   && (parms->iph.frag_off & htons(IP_DF)))
1187                         return -EINVAL;
1188                 t->ignore_df = !!nla_get_u8(data[IFLA_GRE_IGNORE_DF]);
1189         }
1190
1191         if (data[IFLA_GRE_FWMARK])
1192                 *fwmark = nla_get_u32(data[IFLA_GRE_FWMARK]);
1193
1194         return 0;
1195 }
1196
1197 static int erspan_netlink_parms(struct net_device *dev,
1198                                 struct nlattr *data[],
1199                                 struct nlattr *tb[],
1200                                 struct ip_tunnel_parm *parms,
1201                                 __u32 *fwmark)
1202 {
1203         struct ip_tunnel *t = netdev_priv(dev);
1204         int err;
1205
1206         err = ipgre_netlink_parms(dev, data, tb, parms, fwmark);
1207         if (err)
1208                 return err;
1209         if (!data)
1210                 return 0;
1211
1212         if (data[IFLA_GRE_ERSPAN_VER]) {
1213                 t->erspan_ver = nla_get_u8(data[IFLA_GRE_ERSPAN_VER]);
1214
1215                 if (t->erspan_ver > 2)
1216                         return -EINVAL;
1217         }
1218
1219         if (t->erspan_ver == 1) {
1220                 if (data[IFLA_GRE_ERSPAN_INDEX]) {
1221                         t->index = nla_get_u32(data[IFLA_GRE_ERSPAN_INDEX]);
1222                         if (t->index & ~INDEX_MASK)
1223                                 return -EINVAL;
1224                 }
1225         } else if (t->erspan_ver == 2) {
1226                 if (data[IFLA_GRE_ERSPAN_DIR]) {
1227                         t->dir = nla_get_u8(data[IFLA_GRE_ERSPAN_DIR]);
1228                         if (t->dir & ~(DIR_MASK >> DIR_OFFSET))
1229                                 return -EINVAL;
1230                 }
1231                 if (data[IFLA_GRE_ERSPAN_HWID]) {
1232                         t->hwid = nla_get_u16(data[IFLA_GRE_ERSPAN_HWID]);
1233                         if (t->hwid & ~(HWID_MASK >> HWID_OFFSET))
1234                                 return -EINVAL;
1235                 }
1236         }
1237
1238         return 0;
1239 }
1240
1241 /* This function returns true when ENCAP attributes are present in the nl msg */
1242 static bool ipgre_netlink_encap_parms(struct nlattr *data[],
1243                                       struct ip_tunnel_encap *ipencap)
1244 {
1245         bool ret = false;
1246
1247         memset(ipencap, 0, sizeof(*ipencap));
1248
1249         if (!data)
1250                 return ret;
1251
1252         if (data[IFLA_GRE_ENCAP_TYPE]) {
1253                 ret = true;
1254                 ipencap->type = nla_get_u16(data[IFLA_GRE_ENCAP_TYPE]);
1255         }
1256
1257         if (data[IFLA_GRE_ENCAP_FLAGS]) {
1258                 ret = true;
1259                 ipencap->flags = nla_get_u16(data[IFLA_GRE_ENCAP_FLAGS]);
1260         }
1261
1262         if (data[IFLA_GRE_ENCAP_SPORT]) {
1263                 ret = true;
1264                 ipencap->sport = nla_get_be16(data[IFLA_GRE_ENCAP_SPORT]);
1265         }
1266
1267         if (data[IFLA_GRE_ENCAP_DPORT]) {
1268                 ret = true;
1269                 ipencap->dport = nla_get_be16(data[IFLA_GRE_ENCAP_DPORT]);
1270         }
1271
1272         return ret;
1273 }
1274
1275 static int gre_tap_init(struct net_device *dev)
1276 {
1277         __gre_tunnel_init(dev);
1278         dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
1279         netif_keep_dst(dev);
1280
1281         return ip_tunnel_init(dev);
1282 }
1283
1284 static const struct net_device_ops gre_tap_netdev_ops = {
1285         .ndo_init               = gre_tap_init,
1286         .ndo_uninit             = ip_tunnel_uninit,
1287         .ndo_start_xmit         = gre_tap_xmit,
1288         .ndo_set_mac_address    = eth_mac_addr,
1289         .ndo_validate_addr      = eth_validate_addr,
1290         .ndo_change_mtu         = ip_tunnel_change_mtu,
1291         .ndo_get_stats64        = dev_get_tstats64,
1292         .ndo_get_iflink         = ip_tunnel_get_iflink,
1293         .ndo_fill_metadata_dst  = gre_fill_metadata_dst,
1294 };
1295
1296 static int erspan_tunnel_init(struct net_device *dev)
1297 {
1298         struct ip_tunnel *tunnel = netdev_priv(dev);
1299
1300         if (tunnel->erspan_ver == 0)
1301                 tunnel->tun_hlen = 4; /* 4-byte GRE hdr. */
1302         else
1303                 tunnel->tun_hlen = 8; /* 8-byte GRE hdr. */
1304
1305         tunnel->parms.iph.protocol = IPPROTO_GRE;
1306         tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen +
1307                        erspan_hdr_len(tunnel->erspan_ver);
1308
1309         dev->features           |= GRE_FEATURES;
1310         dev->hw_features        |= GRE_FEATURES;
1311         dev->priv_flags         |= IFF_LIVE_ADDR_CHANGE;
1312         netif_keep_dst(dev);
1313
1314         return ip_tunnel_init(dev);
1315 }
1316
1317 static const struct net_device_ops erspan_netdev_ops = {
1318         .ndo_init               = erspan_tunnel_init,
1319         .ndo_uninit             = ip_tunnel_uninit,
1320         .ndo_start_xmit         = erspan_xmit,
1321         .ndo_set_mac_address    = eth_mac_addr,
1322         .ndo_validate_addr      = eth_validate_addr,
1323         .ndo_change_mtu         = ip_tunnel_change_mtu,
1324         .ndo_get_stats64        = dev_get_tstats64,
1325         .ndo_get_iflink         = ip_tunnel_get_iflink,
1326         .ndo_fill_metadata_dst  = gre_fill_metadata_dst,
1327 };
1328
1329 static void ipgre_tap_setup(struct net_device *dev)
1330 {
1331         ether_setup(dev);
1332         dev->max_mtu = 0;
1333         dev->netdev_ops = &gre_tap_netdev_ops;
1334         dev->priv_flags &= ~IFF_TX_SKB_SHARING;
1335         dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
1336         ip_tunnel_setup(dev, gre_tap_net_id);
1337 }
1338
1339 static int
1340 ipgre_newlink_encap_setup(struct net_device *dev, struct nlattr *data[])
1341 {
1342         struct ip_tunnel_encap ipencap;
1343
1344         if (ipgre_netlink_encap_parms(data, &ipencap)) {
1345                 struct ip_tunnel *t = netdev_priv(dev);
1346                 int err = ip_tunnel_encap_setup(t, &ipencap);
1347
1348                 if (err < 0)
1349                         return err;
1350         }
1351
1352         return 0;
1353 }
1354
1355 static int ipgre_newlink(struct net *src_net, struct net_device *dev,
1356                          struct nlattr *tb[], struct nlattr *data[],
1357                          struct netlink_ext_ack *extack)
1358 {
1359         struct ip_tunnel_parm p;
1360         __u32 fwmark = 0;
1361         int err;
1362
1363         err = ipgre_newlink_encap_setup(dev, data);
1364         if (err)
1365                 return err;
1366
1367         err = ipgre_netlink_parms(dev, data, tb, &p, &fwmark);
1368         if (err < 0)
1369                 return err;
1370         return ip_tunnel_newlink(dev, tb, &p, fwmark);
1371 }
1372
1373 static int erspan_newlink(struct net *src_net, struct net_device *dev,
1374                           struct nlattr *tb[], struct nlattr *data[],
1375                           struct netlink_ext_ack *extack)
1376 {
1377         struct ip_tunnel_parm p;
1378         __u32 fwmark = 0;
1379         int err;
1380
1381         err = ipgre_newlink_encap_setup(dev, data);
1382         if (err)
1383                 return err;
1384
1385         err = erspan_netlink_parms(dev, data, tb, &p, &fwmark);
1386         if (err)
1387                 return err;
1388         return ip_tunnel_newlink(dev, tb, &p, fwmark);
1389 }
1390
1391 static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
1392                             struct nlattr *data[],
1393                             struct netlink_ext_ack *extack)
1394 {
1395         struct ip_tunnel *t = netdev_priv(dev);
1396         __u32 fwmark = t->fwmark;
1397         struct ip_tunnel_parm p;
1398         int err;
1399
1400         err = ipgre_newlink_encap_setup(dev, data);
1401         if (err)
1402                 return err;
1403
1404         err = ipgre_netlink_parms(dev, data, tb, &p, &fwmark);
1405         if (err < 0)
1406                 return err;
1407
1408         err = ip_tunnel_changelink(dev, tb, &p, fwmark);
1409         if (err < 0)
1410                 return err;
1411
1412         t->parms.i_flags = p.i_flags;
1413         t->parms.o_flags = p.o_flags;
1414
1415         ipgre_link_update(dev, !tb[IFLA_MTU]);
1416
1417         return 0;
1418 }
1419
1420 static int erspan_changelink(struct net_device *dev, struct nlattr *tb[],
1421                              struct nlattr *data[],
1422                              struct netlink_ext_ack *extack)
1423 {
1424         struct ip_tunnel *t = netdev_priv(dev);
1425         __u32 fwmark = t->fwmark;
1426         struct ip_tunnel_parm p;
1427         int err;
1428
1429         err = ipgre_newlink_encap_setup(dev, data);
1430         if (err)
1431                 return err;
1432
1433         err = erspan_netlink_parms(dev, data, tb, &p, &fwmark);
1434         if (err < 0)
1435                 return err;
1436
1437         err = ip_tunnel_changelink(dev, tb, &p, fwmark);
1438         if (err < 0)
1439                 return err;
1440
1441         t->parms.i_flags = p.i_flags;
1442         t->parms.o_flags = p.o_flags;
1443
1444         return 0;
1445 }
1446
1447 static size_t ipgre_get_size(const struct net_device *dev)
1448 {
1449         return
1450                 /* IFLA_GRE_LINK */
1451                 nla_total_size(4) +
1452                 /* IFLA_GRE_IFLAGS */
1453                 nla_total_size(2) +
1454                 /* IFLA_GRE_OFLAGS */
1455                 nla_total_size(2) +
1456                 /* IFLA_GRE_IKEY */
1457                 nla_total_size(4) +
1458                 /* IFLA_GRE_OKEY */
1459                 nla_total_size(4) +
1460                 /* IFLA_GRE_LOCAL */
1461                 nla_total_size(4) +
1462                 /* IFLA_GRE_REMOTE */
1463                 nla_total_size(4) +
1464                 /* IFLA_GRE_TTL */
1465                 nla_total_size(1) +
1466                 /* IFLA_GRE_TOS */
1467                 nla_total_size(1) +
1468                 /* IFLA_GRE_PMTUDISC */
1469                 nla_total_size(1) +
1470                 /* IFLA_GRE_ENCAP_TYPE */
1471                 nla_total_size(2) +
1472                 /* IFLA_GRE_ENCAP_FLAGS */
1473                 nla_total_size(2) +
1474                 /* IFLA_GRE_ENCAP_SPORT */
1475                 nla_total_size(2) +
1476                 /* IFLA_GRE_ENCAP_DPORT */
1477                 nla_total_size(2) +
1478                 /* IFLA_GRE_COLLECT_METADATA */
1479                 nla_total_size(0) +
1480                 /* IFLA_GRE_IGNORE_DF */
1481                 nla_total_size(1) +
1482                 /* IFLA_GRE_FWMARK */
1483                 nla_total_size(4) +
1484                 /* IFLA_GRE_ERSPAN_INDEX */
1485                 nla_total_size(4) +
1486                 /* IFLA_GRE_ERSPAN_VER */
1487                 nla_total_size(1) +
1488                 /* IFLA_GRE_ERSPAN_DIR */
1489                 nla_total_size(1) +
1490                 /* IFLA_GRE_ERSPAN_HWID */
1491                 nla_total_size(2) +
1492                 0;
1493 }
1494
1495 static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
1496 {
1497         struct ip_tunnel *t = netdev_priv(dev);
1498         struct ip_tunnel_parm *p = &t->parms;
1499         __be16 o_flags = p->o_flags;
1500
1501         if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) ||
1502             nla_put_be16(skb, IFLA_GRE_IFLAGS,
1503                          gre_tnl_flags_to_gre_flags(p->i_flags)) ||
1504             nla_put_be16(skb, IFLA_GRE_OFLAGS,
1505                          gre_tnl_flags_to_gre_flags(o_flags)) ||
1506             nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) ||
1507             nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) ||
1508             nla_put_in_addr(skb, IFLA_GRE_LOCAL, p->iph.saddr) ||
1509             nla_put_in_addr(skb, IFLA_GRE_REMOTE, p->iph.daddr) ||
1510             nla_put_u8(skb, IFLA_GRE_TTL, p->iph.ttl) ||
1511             nla_put_u8(skb, IFLA_GRE_TOS, p->iph.tos) ||
1512             nla_put_u8(skb, IFLA_GRE_PMTUDISC,
1513                        !!(p->iph.frag_off & htons(IP_DF))) ||
1514             nla_put_u32(skb, IFLA_GRE_FWMARK, t->fwmark))
1515                 goto nla_put_failure;
1516
1517         if (nla_put_u16(skb, IFLA_GRE_ENCAP_TYPE,
1518                         t->encap.type) ||
1519             nla_put_be16(skb, IFLA_GRE_ENCAP_SPORT,
1520                          t->encap.sport) ||
1521             nla_put_be16(skb, IFLA_GRE_ENCAP_DPORT,
1522                          t->encap.dport) ||
1523             nla_put_u16(skb, IFLA_GRE_ENCAP_FLAGS,
1524                         t->encap.flags))
1525                 goto nla_put_failure;
1526
1527         if (nla_put_u8(skb, IFLA_GRE_IGNORE_DF, t->ignore_df))
1528                 goto nla_put_failure;
1529
1530         if (t->collect_md) {
1531                 if (nla_put_flag(skb, IFLA_GRE_COLLECT_METADATA))
1532                         goto nla_put_failure;
1533         }
1534
1535         return 0;
1536
1537 nla_put_failure:
1538         return -EMSGSIZE;
1539 }
1540
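/*
 * ERSPAN dump: emit the version-specific attributes (index for v1,
 * direction/hwid for v2) before falling through to the common GRE
 * attributes in ipgre_fill_info().
 */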
1541 static int erspan_fill_info(struct sk_buff *skb, const struct net_device *dev)
1542 {
1543         struct ip_tunnel *t = netdev_priv(dev);
1544
1545         if (t->erspan_ver <= 2) {
1546                 if (t->erspan_ver != 0 && !t->collect_md)
1547                         t->parms.o_flags |= TUNNEL_KEY;
1548
1549                 if (nla_put_u8(skb, IFLA_GRE_ERSPAN_VER, t->erspan_ver))
1550                         goto nla_put_failure;
1551
1552                 if (t->erspan_ver == 1) {
1553                         if (nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, t->index))
1554                                 goto nla_put_failure;
1555                 } else if (t->erspan_ver == 2) {
1556                         if (nla_put_u8(skb, IFLA_GRE_ERSPAN_DIR, t->dir))
1557                                 goto nla_put_failure;
1558                         if (nla_put_u16(skb, IFLA_GRE_ERSPAN_HWID, t->hwid))
1559                                 goto nla_put_failure;
1560                 }
1561         }
1562
1563         return ipgre_fill_info(skb, dev);
1564
1565 nla_put_failure:
1566         return -EMSGSIZE;
1567 }
1568
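/*
 * ERSPAN devices are Ethernet-like and default to ERSPAN version 1;
 * a max_mtu of 0 leaves the upper MTU bound to the tunnel code rather
 * than the core range check.
 */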
1569 static void erspan_setup(struct net_device *dev)
1570 {
1571         struct ip_tunnel *t = netdev_priv(dev);
1572
1573         ether_setup(dev);
1574         dev->max_mtu = 0;
1575         dev->netdev_ops = &erspan_netdev_ops;
1576         dev->priv_flags &= ~IFF_TX_SKB_SHARING;
1577         dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
1578         ip_tunnel_setup(dev, erspan_net_id);
1579         t->erspan_ver = 1;
1580 }
1581
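/*
 * Netlink attribute policy shared by the "gre", "gretap" and "erspan"
 * link types; rtnetlink validates incoming requests against it before
 * the newlink/changelink handlers run.
 */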
1582 static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
1583         [IFLA_GRE_LINK]         = { .type = NLA_U32 },
1584         [IFLA_GRE_IFLAGS]       = { .type = NLA_U16 },
1585         [IFLA_GRE_OFLAGS]       = { .type = NLA_U16 },
1586         [IFLA_GRE_IKEY]         = { .type = NLA_U32 },
1587         [IFLA_GRE_OKEY]         = { .type = NLA_U32 },
1588         [IFLA_GRE_LOCAL]        = { .len = sizeof_field(struct iphdr, saddr) },
1589         [IFLA_GRE_REMOTE]       = { .len = sizeof_field(struct iphdr, daddr) },
1590         [IFLA_GRE_TTL]          = { .type = NLA_U8 },
1591         [IFLA_GRE_TOS]          = { .type = NLA_U8 },
1592         [IFLA_GRE_PMTUDISC]     = { .type = NLA_U8 },
1593         [IFLA_GRE_ENCAP_TYPE]   = { .type = NLA_U16 },
1594         [IFLA_GRE_ENCAP_FLAGS]  = { .type = NLA_U16 },
1595         [IFLA_GRE_ENCAP_SPORT]  = { .type = NLA_U16 },
1596         [IFLA_GRE_ENCAP_DPORT]  = { .type = NLA_U16 },
1597         [IFLA_GRE_COLLECT_METADATA]     = { .type = NLA_FLAG },
1598         [IFLA_GRE_IGNORE_DF]    = { .type = NLA_U8 },
1599         [IFLA_GRE_FWMARK]       = { .type = NLA_U32 },
1600         [IFLA_GRE_ERSPAN_INDEX] = { .type = NLA_U32 },
1601         [IFLA_GRE_ERSPAN_VER]   = { .type = NLA_U8 },
1602         [IFLA_GRE_ERSPAN_DIR]   = { .type = NLA_U8 },
1603         [IFLA_GRE_ERSPAN_HWID]  = { .type = NLA_U16 },
1604 };
1605
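/*
 * These three link types are managed from userspace over rtnetlink;
 * with iproute2 that looks like, for example (illustrative only):
 *
 *   ip link add gre1 type gre local 192.0.2.1 remote 198.51.100.1 ttl 64
 *   ip link add gretap1 type gretap local 192.0.2.1 remote 198.51.100.1
 *   ip link add erspan1 type erspan local 192.0.2.1 remote 198.51.100.1 \
 *                       seq key 10 erspan_ver 1 erspan 123
 *
 * The attributes are validated against ipgre_policy above before the
 * newlink/changelink callbacks below run.
 */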
1606 static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
1607         .kind           = "gre",
1608         .maxtype        = IFLA_GRE_MAX,
1609         .policy         = ipgre_policy,
1610         .priv_size      = sizeof(struct ip_tunnel),
1611         .setup          = ipgre_tunnel_setup,
1612         .validate       = ipgre_tunnel_validate,
1613         .newlink        = ipgre_newlink,
1614         .changelink     = ipgre_changelink,
1615         .dellink        = ip_tunnel_dellink,
1616         .get_size       = ipgre_get_size,
1617         .fill_info      = ipgre_fill_info,
1618         .get_link_net   = ip_tunnel_get_link_net,
1619 };
1620
1621 static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
1622         .kind           = "gretap",
1623         .maxtype        = IFLA_GRE_MAX,
1624         .policy         = ipgre_policy,
1625         .priv_size      = sizeof(struct ip_tunnel),
1626         .setup          = ipgre_tap_setup,
1627         .validate       = ipgre_tap_validate,
1628         .newlink        = ipgre_newlink,
1629         .changelink     = ipgre_changelink,
1630         .dellink        = ip_tunnel_dellink,
1631         .get_size       = ipgre_get_size,
1632         .fill_info      = ipgre_fill_info,
1633         .get_link_net   = ip_tunnel_get_link_net,
1634 };
1635
1636 static struct rtnl_link_ops erspan_link_ops __read_mostly = {
1637         .kind           = "erspan",
1638         .maxtype        = IFLA_GRE_MAX,
1639         .policy         = ipgre_policy,
1640         .priv_size      = sizeof(struct ip_tunnel),
1641         .setup          = erspan_setup,
1642         .validate       = erspan_validate,
1643         .newlink        = erspan_newlink,
1644         .changelink     = erspan_changelink,
1645         .dellink        = ip_tunnel_dellink,
1646         .get_size       = ipgre_get_size,
1647         .fill_info      = erspan_fill_info,
1648         .get_link_net   = ip_tunnel_get_link_net,
1649 };
1650
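/*
 * Create a flow-based ("fallback") gretap device on behalf of an
 * in-kernel user such as openvswitch: per-packet tunnel parameters come
 * from the metadata dst (collect_md) rather than from the device
 * configuration.
 */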
1651 struct net_device *gretap_fb_dev_create(struct net *net, const char *name,
1652                                         u8 name_assign_type)
1653 {
1654         struct nlattr *tb[IFLA_MAX + 1];
1655         struct net_device *dev;
1656         LIST_HEAD(list_kill);
1657         struct ip_tunnel *t;
1658         int err;
1659
1660         memset(&tb, 0, sizeof(tb));
1661
1662         dev = rtnl_create_link(net, name, name_assign_type,
1663                                &ipgre_tap_ops, tb, NULL);
1664         if (IS_ERR(dev))
1665                 return dev;
1666
1667         /* Configure flow-based GRE device. */
1668         t = netdev_priv(dev);
1669         t->collect_md = true;
1670
1671         err = ipgre_newlink(net, dev, tb, NULL, NULL);
1672         if (err < 0) {
1673                 free_netdev(dev);
1674                 return ERR_PTR(err);
1675         }
1676
1677         /* openvswitch users expect packet sizes to be unrestricted,
1678          * so set the largest MTU we can.
1679          */
1680         err = __ip_tunnel_change_mtu(dev, IP_MAX_MTU, false);
1681         if (err)
1682                 goto out;
1683
1684         err = rtnl_configure_link(dev, NULL);
1685         if (err < 0)
1686                 goto out;
1687
1688         return dev;
1689 out:
1690         ip_tunnel_dellink(dev, &list_kill);
1691         unregister_netdevice_many(&list_kill);
1692         return ERR_PTR(err);
1693 }
1694 EXPORT_SYMBOL_GPL(gretap_fb_dev_create);
1695
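/*
 * Per-network-namespace state for the gretap and erspan types: each
 * namespace gets its own tunnel tables and a default fallback device
 * ("gretap0"/"erspan0"); exit_batch tears down all devices of the
 * dying namespaces together.
 */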
1696 static int __net_init ipgre_tap_init_net(struct net *net)
1697 {
1698         return ip_tunnel_init_net(net, gre_tap_net_id, &ipgre_tap_ops, "gretap0");
1699 }
1700
1701 static void __net_exit ipgre_tap_exit_batch_net(struct list_head *list_net)
1702 {
1703         ip_tunnel_delete_nets(list_net, gre_tap_net_id, &ipgre_tap_ops);
1704 }
1705
1706 static struct pernet_operations ipgre_tap_net_ops = {
1707         .init = ipgre_tap_init_net,
1708         .exit_batch = ipgre_tap_exit_batch_net,
1709         .id   = &gre_tap_net_id,
1710         .size = sizeof(struct ip_tunnel_net),
1711 };
1712
1713 static int __net_init erspan_init_net(struct net *net)
1714 {
1715         return ip_tunnel_init_net(net, erspan_net_id,
1716                                   &erspan_link_ops, "erspan0");
1717 }
1718
1719 static void __net_exit erspan_exit_batch_net(struct list_head *net_list)
1720 {
1721         ip_tunnel_delete_nets(net_list, erspan_net_id, &erspan_link_ops);
1722 }
1723
1724 static struct pernet_operations erspan_net_ops = {
1725         .init = erspan_init_net,
1726         .exit_batch = erspan_exit_batch_net,
1727         .id   = &erspan_net_id,
1728         .size = sizeof(struct ip_tunnel_net),
1729 };
1730
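/*
 * Module init: register per-netns state for all three device types,
 * hook the receive handler for GRE version 0 (GREPROTO_CISCO), then
 * expose the three rtnetlink link types.  Each error label unwinds
 * exactly the steps that had already succeeded, in reverse order.
 */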
1731 static int __init ipgre_init(void)
1732 {
1733         int err;
1734
1735         pr_info("GRE over IPv4 tunneling driver\n");
1736
1737         err = register_pernet_device(&ipgre_net_ops);
1738         if (err < 0)
1739                 return err;
1740
1741         err = register_pernet_device(&ipgre_tap_net_ops);
1742         if (err < 0)
1743                 goto pnet_tap_failed;
1744
1745         err = register_pernet_device(&erspan_net_ops);
1746         if (err < 0)
1747                 goto pnet_erspan_failed;
1748
1749         err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
1750         if (err < 0) {
1751                 pr_info("%s: can't add protocol\n", __func__);
1752                 goto add_proto_failed;
1753         }
1754
1755         err = rtnl_link_register(&ipgre_link_ops);
1756         if (err < 0)
1757                 goto rtnl_link_failed;
1758
1759         err = rtnl_link_register(&ipgre_tap_ops);
1760         if (err < 0)
1761                 goto tap_ops_failed;
1762
1763         err = rtnl_link_register(&erspan_link_ops);
1764         if (err < 0)
1765                 goto erspan_link_failed;
1766
1767         return 0;
1768
1769 erspan_link_failed:
1770         rtnl_link_unregister(&ipgre_tap_ops);
1771 tap_ops_failed:
1772         rtnl_link_unregister(&ipgre_link_ops);
1773 rtnl_link_failed:
1774         gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
1775 add_proto_failed:
1776         unregister_pernet_device(&erspan_net_ops);
1777 pnet_erspan_failed:
1778         unregister_pernet_device(&ipgre_tap_net_ops);
1779 pnet_tap_failed:
1780         unregister_pernet_device(&ipgre_net_ops);
1781         return err;
1782 }
1783
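/*
 * Module exit: undo ipgre_init().  The link ops go first so that no new
 * devices can be created while the protocol handler and the per-netns
 * state are being torn down.
 */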
1784 static void __exit ipgre_fini(void)
1785 {
1786         rtnl_link_unregister(&ipgre_tap_ops);
1787         rtnl_link_unregister(&ipgre_link_ops);
1788         rtnl_link_unregister(&erspan_link_ops);
1789         gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
1790         unregister_pernet_device(&ipgre_tap_net_ops);
1791         unregister_pernet_device(&ipgre_net_ops);
1792         unregister_pernet_device(&erspan_net_ops);
1793 }
1794
1795 module_init(ipgre_init);
1796 module_exit(ipgre_fini);
1797 MODULE_LICENSE("GPL");
1798 MODULE_ALIAS_RTNL_LINK("gre");
1799 MODULE_ALIAS_RTNL_LINK("gretap");
1800 MODULE_ALIAS_RTNL_LINK("erspan");
1801 MODULE_ALIAS_NETDEV("gre0");
1802 MODULE_ALIAS_NETDEV("gretap0");
1803 MODULE_ALIAS_NETDEV("erspan0");