drivers/net/vrf.c

   1 /*
   2  * vrf.c: device driver to encapsulate a VRF space
   3  *
   4  * Copyright (c) 2015 Cumulus Networks. All rights reserved.
   5  * Copyright (c) 2015 Shrijeet Mukherjee <shm@cumulusnetworks.com>
   6  * Copyright (c) 2015 David Ahern <dsa@cumulusnetworks.com>
   7  *
   8  * Based on dummy, team and ipvlan drivers
   9  *
  10  * This program is free software; you can redistribute it and/or modify
  11  * it under the terms of the GNU General Public License as published by
  12  * the Free Software Foundation; either version 2 of the License, or
  13  * (at your option) any later version.
  14  */
  15
  16 #include <linux/module.h>
  17 #include <linux/kernel.h>
  18 #include <linux/netdevice.h>
  19 #include <linux/etherdevice.h>
  20 #include <linux/ip.h>
  21 #include <linux/init.h>
  22 #include <linux/moduleparam.h>
  23 #include <linux/netfilter.h>
  24 #include <linux/rtnetlink.h>
  25 #include <net/rtnetlink.h>
  26 #include <linux/u64_stats_sync.h>
  27 #include <linux/hashtable.h>
  28
  29 #include <linux/inetdevice.h>
  30 #include <net/arp.h>
  31 #include <net/ip.h>
  32 #include <net/ip_fib.h>
  33 #include <net/ip6_fib.h>
  34 #include <net/ip6_route.h>
  35 #include <net/rtnetlink.h>
  36 #include <net/route.h>
  37 #include <net/addrconf.h>
  38 #include <net/l3mdev.h>
  39
  40 #define RT_FL_TOS(oldflp4) \
  41         ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
  42
  43 #define DRV_NAME        "vrf"
  44 #define DRV_VERSION     "1.0"
  45
  46 #define vrf_master_get_rcu(dev) \
  47         ((struct net_device *)rcu_dereference(dev->rx_handler_data))
  48
  49 struct slave {
  50         struct list_head        list;
  51         struct net_device       *dev;
  52 };
  53
  54 struct slave_queue {
  55         struct list_head        all_slaves;
  56 };
  57
  58 struct net_vrf {
  59         struct slave_queue      queue;
  60         struct rtable           *rth;
  61         struct rt6_info         *rt6;
  62         u32                     tb_id;
  63 };
  64
  65 struct pcpu_dstats {
  66         u64                     tx_pkts;
  67         u64                     tx_bytes;
  68         u64                     tx_drps;
  69         u64                     rx_pkts;
  70         u64                     rx_bytes;
  71         struct u64_stats_sync   syncp;
  72 };
  73
  74 /* neighbor handling is done with actual device; do not want
  75  * to flip skb->dev for those ndisc packets. This really fails
  76  * for multiple next protocols (e.g., NEXTHDR_HOP). But it is
  77  * a start.
  78  */
  79 #if IS_ENABLED(CONFIG_IPV6)
  80 static bool check_ipv6_frame(const struct sk_buff *skb)
  81 {
  82         const struct ipv6hdr *ipv6h;
  83         struct ipv6hdr _ipv6h;
  84         bool rc = true;
  85
  86         ipv6h = skb_header_pointer(skb, 0, sizeof(_ipv6h), &_ipv6h);
  87         if (!ipv6h)
  88                 goto out;
  89
  90         if (ipv6h->nexthdr == NEXTHDR_ICMP) {
  91                 const struct icmp6hdr *icmph;
  92                 struct icmp6hdr _icmph;
  93
  94                 icmph = skb_header_pointer(skb, sizeof(_ipv6h),
  95                                            sizeof(_icmph), &_icmph);
  96                 if (!icmph)
  97                         goto out;
  98
  99                 switch (icmph->icmp6_type) {
 100                 case NDISC_ROUTER_SOLICITATION:
 101                 case NDISC_ROUTER_ADVERTISEMENT:
 102                 case NDISC_NEIGHBOUR_SOLICITATION:
 103                 case NDISC_NEIGHBOUR_ADVERTISEMENT:
 104                 case NDISC_REDIRECT:
 105                         rc = false;
 106                         break;
 107                 }
 108         }
 109
 110 out:
 111         return rc;
 112 }
 113 #else
 114 static bool check_ipv6_frame(const struct sk_buff *skb)
 115 {
 116         return false;
 117 }
 118 #endif
 119
 120 static bool is_ip_rx_frame(struct sk_buff *skb)
 121 {
 122         switch (skb->protocol) {
 123         case htons(ETH_P_IP):
 124                 return true;
 125         case htons(ETH_P_IPV6):
 126                 return check_ipv6_frame(skb);
 127         }
 128         return false;
 129 }
 130
 131 static void vrf_tx_error(struct net_device *vrf_dev, struct sk_buff *skb)
 132 {
 133         vrf_dev->stats.tx_errors++;
 134         kfree_skb(skb);
 135 }
 136
 137 /* note: already called with rcu_read_lock */
 138 static rx_handler_result_t vrf_handle_frame(struct sk_buff **pskb)
 139 {
 140         struct sk_buff *skb = *pskb;
 141
 142         if (is_ip_rx_frame(skb)) {
 143                 struct net_device *dev = vrf_master_get_rcu(skb->dev);
 144                 struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats);
 145
 146                 u64_stats_update_begin(&dstats->syncp);
 147                 dstats->rx_pkts++;
 148                 dstats->rx_bytes += skb->len;
 149                 u64_stats_update_end(&dstats->syncp);
 150
 151                 skb->dev = dev;
 152
 153                 return RX_HANDLER_ANOTHER;
 154         }
 155         return RX_HANDLER_PASS;
 156 }
 157
 158 static struct rtnl_link_stats64 *vrf_get_stats64(struct net_device *dev,
 159                                                  struct rtnl_link_stats64 *stats)
 160 {
 161         int i;
 162
 163         for_each_possible_cpu(i) {
 164                 const struct pcpu_dstats *dstats;
 165                 u64 tbytes, tpkts, tdrops, rbytes, rpkts;
 166                 unsigned int start;
 167
 168                 dstats = per_cpu_ptr(dev->dstats, i);
 169                 do {
 170                         start = u64_stats_fetch_begin_irq(&dstats->syncp);
 171                         tbytes = dstats->tx_bytes;
 172                         tpkts = dstats->tx_pkts;
 173                         tdrops = dstats->tx_drps;
 174                         rbytes = dstats->rx_bytes;
 175                         rpkts = dstats->rx_pkts;
 176                 } while (u64_stats_fetch_retry_irq(&dstats->syncp, start));
 177                 stats->tx_bytes += tbytes;
 178                 stats->tx_packets += tpkts;
 179                 stats->tx_dropped += tdrops;
 180                 stats->rx_bytes += rbytes;
 181                 stats->rx_packets += rpkts;
 182         }
 183         return stats;
 184 }
 185
 186 #if IS_ENABLED(CONFIG_IPV6)
 187 static netdev_tx_t vrf_process_v6_outbound(struct sk_buff *skb,
 188                                            struct net_device *dev)
 189 {
 190         const struct ipv6hdr *iph = ipv6_hdr(skb);
 191         struct net *net = dev_net(skb->dev);
 192         struct flowi6 fl6 = {
 193                 /* needed to match OIF rule */
 194                 .flowi6_oif = dev->ifindex,
 195                 .flowi6_iif = LOOPBACK_IFINDEX,
 196                 .daddr = iph->daddr,
 197                 .saddr = iph->saddr,
 198                 .flowlabel = ip6_flowinfo(iph),
 199                 .flowi6_mark = skb->mark,
 200                 .flowi6_proto = iph->nexthdr,
 201                 .flowi6_flags = FLOWI_FLAG_L3MDEV_SRC | FLOWI_FLAG_SKIP_NH_OIF,
 202         };
 203         int ret = NET_XMIT_DROP;
 204         struct dst_entry *dst;
 205         struct dst_entry *dst_null = &net->ipv6.ip6_null_entry->dst;
 206
 207         dst = ip6_route_output(net, NULL, &fl6);
 208         if (dst == dst_null)
 209                 goto err;
 210
 211         skb_dst_drop(skb);
 212         skb_dst_set(skb, dst);
 213
 214         ret = ip6_local_out(net, skb->sk, skb);
 215         if (unlikely(net_xmit_eval(ret)))
 216                 dev->stats.tx_errors++;
 217         else
 218                 ret = NET_XMIT_SUCCESS;
 219
 220         return ret;
 221 err:
 222         vrf_tx_error(dev, skb);
 223         return NET_XMIT_DROP;
 224 }
 225 #else
 226 static netdev_tx_t vrf_process_v6_outbound(struct sk_buff *skb,
 227                                            struct net_device *dev)
 228 {
 229         vrf_tx_error(dev, skb);
 230         return NET_XMIT_DROP;
 231 }
 232 #endif
 233
 234 static int vrf_send_v4_prep(struct sk_buff *skb, struct flowi4 *fl4,
 235                             struct net_device *vrf_dev)
 236 {
 237         struct rtable *rt;
 238         int err = 1;
 239
 240         rt = ip_route_output_flow(dev_net(vrf_dev), fl4, NULL);
 241         if (IS_ERR(rt))
 242                 goto out;
 243
 244         /* TO-DO: what about broadcast ? */
 245         if (rt->rt_type != RTN_UNICAST && rt->rt_type != RTN_LOCAL) {
 246                 ip_rt_put(rt);
 247                 goto out;
 248         }
 249
 250         skb_dst_drop(skb);
 251         skb_dst_set(skb, &rt->dst);
 252         err = 0;
 253 out:
 254         return err;
 255 }
 256
 257 static netdev_tx_t vrf_process_v4_outbound(struct sk_buff *skb,
 258                                            struct net_device *vrf_dev)
 259 {
 260         struct iphdr *ip4h = ip_hdr(skb);
 261         int ret = NET_XMIT_DROP;
 262         struct flowi4 fl4 = {
 263                 /* needed to match OIF rule */
 264                 .flowi4_oif = vrf_dev->ifindex,
 265                 .flowi4_iif = LOOPBACK_IFINDEX,
 266                 .flowi4_tos = RT_TOS(ip4h->tos),
 267                 .flowi4_flags = FLOWI_FLAG_ANYSRC | FLOWI_FLAG_L3MDEV_SRC |
 268                                 FLOWI_FLAG_SKIP_NH_OIF,
 269                 .flowi4_proto = ip4h->protocol,
 270                 .daddr = ip4h->daddr,
 271                 .saddr = ip4h->saddr,
 272         };
 273
 274         if (vrf_send_v4_prep(skb, &fl4, vrf_dev))
 275                 goto err;
 276
 277         if (!ip4h->saddr) {
 278                 ip4h->saddr = inet_select_addr(skb_dst(skb)->dev, 0,
 279                                                RT_SCOPE_LINK);
 280         }
 281
 282         ret = ip_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb);
 283         if (unlikely(net_xmit_eval(ret)))
 284                 vrf_dev->stats.tx_errors++;
 285         else
 286                 ret = NET_XMIT_SUCCESS;
 287
 288 out:
 289         return ret;
 290 err:
 291         vrf_tx_error(vrf_dev, skb);
 292         goto out;
 293 }
 294
 295 static netdev_tx_t is_ip_tx_frame(struct sk_buff *skb, struct net_device *dev)
 296 {
 297         /* strip the ethernet header added for pass through VRF device */
 298         __skb_pull(skb, skb_network_offset(skb));
 299
 300         switch (skb->protocol) {
 301         case htons(ETH_P_IP):
 302                 return vrf_process_v4_outbound(skb, dev);
 303         case htons(ETH_P_IPV6):
 304                 return vrf_process_v6_outbound(skb, dev);
 305         default:
 306                 vrf_tx_error(dev, skb);
 307                 return NET_XMIT_DROP;
 308         }
 309 }
 310
 311 static netdev_tx_t vrf_xmit(struct sk_buff *skb, struct net_device *dev)
 312 {
 313         int len = skb->len;
 314         netdev_tx_t ret = is_ip_tx_frame(skb, dev);
 315
 316         if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) {
 317                 struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats);
 318
 319                 u64_stats_update_begin(&dstats->syncp);
 320                 dstats->tx_pkts++;
 321                 dstats->tx_bytes += len;
 322                 u64_stats_update_end(&dstats->syncp);
 323         } else {
 324                 this_cpu_inc(dev->dstats->tx_drps);
 325         }
 326
 327         return ret;
 328 }
 329
 330 #if IS_ENABLED(CONFIG_IPV6)
 331 /* modelled after ip6_finish_output2 */
 332 static int vrf_finish_output6(struct net *net, struct sock *sk,
 333                               struct sk_buff *skb)
 334 {
 335         struct dst_entry *dst = skb_dst(skb);
 336         struct net_device *dev = dst->dev;
 337         struct neighbour *neigh;
 338         struct in6_addr *nexthop;
 339         int ret;
 340
 341         nf_reset(skb);
 342
 343         skb->protocol = htons(ETH_P_IPV6);
 344         skb->dev = dev;
 345
 346         rcu_read_lock_bh();
 347         nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
 348         neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
 349         if (unlikely(!neigh))
 350                 neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
 351         if (!IS_ERR(neigh)) {
 352                 ret = dst_neigh_output(dst, neigh, skb);
 353                 rcu_read_unlock_bh();
 354                 return ret;
 355         }
 356         rcu_read_unlock_bh();
 357
 358         IP6_INC_STATS(dev_net(dst->dev),
 359                       ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
 360         kfree_skb(skb);
 361         return -EINVAL;
 362 }
 363
 364 /* modelled after ip6_output */
 365 static int vrf_output6(struct net *net, struct sock *sk, struct sk_buff *skb)
 366 {
 367         return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
 368                             net, sk, skb, NULL, skb_dst(skb)->dev,
 369                             vrf_finish_output6,
 370                             !(IP6CB(skb)->flags & IP6SKB_REROUTED));
 371 }
 372
 373 static void vrf_rt6_release(struct net_vrf *vrf)
 374 {
 375         dst_release(&vrf->rt6->dst);
 376         vrf->rt6 = NULL;
 377 }
 378
 379 static int vrf_rt6_create(struct net_device *dev)
 380 {
 381         struct net_vrf *vrf = netdev_priv(dev);
 382         struct net *net = dev_net(dev);
 383         struct rt6_info *rt6;
 384         int rc = -ENOMEM;
 385
 386         rt6 = ip6_dst_alloc(net, dev,
 387                             DST_HOST | DST_NOPOLICY | DST_NOXFRM | DST_NOCACHE);
 388         if (!rt6)
 389                 goto out;
 390
 391         rt6->dst.output = vrf_output6;
 392         rt6->rt6i_table = fib6_get_table(net, vrf->tb_id);
 393         dst_hold(&rt6->dst);
 394         vrf->rt6 = rt6;
 395         rc = 0;
 396 out:
 397         return rc;
 398 }
 399 #else
 400 static void vrf_rt6_release(struct net_vrf *vrf)
 401 {
 402 }
 403
 404 static int vrf_rt6_create(struct net_device *dev)
 405 {
 406         return 0;
 407 }
 408 #endif
 409
 410 /* modelled after ip_finish_output2 */
 411 static int vrf_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 412 {
 413         struct dst_entry *dst = skb_dst(skb);
 414         struct rtable *rt = (struct rtable *)dst;
 415         struct net_device *dev = dst->dev;
 416         unsigned int hh_len = LL_RESERVED_SPACE(dev);
 417         struct neighbour *neigh;
 418         u32 nexthop;
 419         int ret = -EINVAL;
 420
 421         nf_reset(skb);
 422
 423         /* Be paranoid, rather than too clever. */
 424         if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
 425                 struct sk_buff *skb2;
 426
 427                 skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
 428                 if (!skb2) {
 429                         ret = -ENOMEM;
 430                         goto err;
 431                 }
 432                 if (skb->sk)
 433                         skb_set_owner_w(skb2, skb->sk);
 434
 435                 consume_skb(skb);
 436                 skb = skb2;
 437         }
 438
 439         rcu_read_lock_bh();
 440
 441         nexthop = (__force u32)rt_nexthop(rt, ip_hdr(skb)->daddr);
 442         neigh = __ipv4_neigh_lookup_noref(dev, nexthop);
 443         if (unlikely(!neigh))
 444                 neigh = __neigh_create(&arp_tbl, &nexthop, dev, false);
 445         if (!IS_ERR(neigh)) {
 446                 ret = dst_neigh_output(dst, neigh, skb);
 447                 rcu_read_unlock_bh();
 448                 return ret;
 449         }
 450
 451         rcu_read_unlock_bh();
 452 err:
 453         vrf_tx_error(skb->dev, skb);
 454         return ret;
 455 }
 456
 457 static int vrf_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 458 {
 459         struct net_device *dev = skb_dst(skb)->dev;
 460
 461         IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len);
 462
 463         skb->dev = dev;
 464         skb->protocol = htons(ETH_P_IP);
 465
 466         return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
 467                             net, sk, skb, NULL, dev,
 468                             vrf_finish_output,
 469                             !(IPCB(skb)->flags & IPSKB_REROUTED));
 470 }
 471
 472 static void vrf_rtable_release(struct net_vrf *vrf)
 473 {
 474         struct dst_entry *dst = (struct dst_entry *)vrf->rth;
 475
 476         dst_release(dst);
 477         vrf->rth = NULL;
 478 }
 479
 480 static struct rtable *vrf_rtable_create(struct net_device *dev)
 481 {
 482         struct net_vrf *vrf = netdev_priv(dev);
 483         struct rtable *rth;
 484
 485         rth = rt_dst_alloc(dev, 0, RTN_UNICAST, 1, 1, 0);
 486         if (rth) {
 487                 rth->dst.output = vrf_output;
 488                 rth->rt_table_id = vrf->tb_id;
 489         }
 490
 491         return rth;
 492 }
 493
 494 /**************************** device handling ********************/
 495
 496 /* cycle interface to flush neighbor cache and move routes across tables */
 497 static void cycle_netdev(struct net_device *dev)
 498 {
 499         unsigned int flags = dev->flags;
 500         int ret;
 501
 502         if (!netif_running(dev))
 503                 return;
 504
 505         ret = dev_change_flags(dev, flags & ~IFF_UP);
 506         if (ret >= 0)
 507                 ret = dev_change_flags(dev, flags);
 508
 509         if (ret < 0) {
 510                 netdev_err(dev,
 511                            "Failed to cycle device %s; route tables might be wrong!\n",
 512                            dev->name);
 513         }
 514 }
 515
 516 static struct slave *__vrf_find_slave_dev(struct slave_queue *queue,
 517                                           struct net_device *dev)
 518 {
 519         struct list_head *head = &queue->all_slaves;
 520         struct slave *slave;
 521
 522         list_for_each_entry(slave, head, list) {
 523                 if (slave->dev == dev)
 524                         return slave;
 525         }
 526
 527         return NULL;
 528 }
 529
 530 /* inverse of __vrf_insert_slave */
 531 static void __vrf_remove_slave(struct slave_queue *queue, struct slave *slave)
 532 {
 533         list_del(&slave->list);
 534 }
 535
 536 static void __vrf_insert_slave(struct slave_queue *queue, struct slave *slave)
 537 {
 538         list_add(&slave->list, &queue->all_slaves);
 539 }
 540
 541 static int do_vrf_add_slave(struct net_device *dev, struct net_device *port_dev)
 542 {
 543         struct slave *slave = kzalloc(sizeof(*slave), GFP_KERNEL);
 544         struct net_vrf *vrf = netdev_priv(dev);
 545         struct slave_queue *queue = &vrf->queue;
 546         int ret = -ENOMEM;
 547
 548         if (!slave)
 549                 goto out_fail;
 550
 551         slave->dev = port_dev;
 552
 553         /* register the packet handler for slave ports */
 554         ret = netdev_rx_handler_register(port_dev, vrf_handle_frame, dev);
 555         if (ret) {
 556                 netdev_err(port_dev,
 557                            "Device %s failed to register rx_handler\n",
 558                            port_dev->name);
 559                 goto out_fail;
 560         }
 561
 562         ret = netdev_master_upper_dev_link(port_dev, dev);
 563         if (ret < 0)
 564                 goto out_unregister;
 565
 566         port_dev->priv_flags |= IFF_L3MDEV_SLAVE;
 567         __vrf_insert_slave(queue, slave);
 568         cycle_netdev(port_dev);
 569
 570         return 0;
 571
 572 out_unregister:
 573         netdev_rx_handler_unregister(port_dev);
 574 out_fail:
 575         kfree(slave);
 576         return ret;
 577 }
 578
 579 static int vrf_add_slave(struct net_device *dev, struct net_device *port_dev)
 580 {
 581         if (netif_is_l3_master(port_dev) || netif_is_l3_slave(port_dev))
 582                 return -EINVAL;
 583
 584         return do_vrf_add_slave(dev, port_dev);
 585 }
 586
 587 /* inverse of do_vrf_add_slave */
 588 static int do_vrf_del_slave(struct net_device *dev, struct net_device *port_dev)
 589 {
 590         struct net_vrf *vrf = netdev_priv(dev);
 591         struct slave_queue *queue = &vrf->queue;
 592         struct slave *slave;
 593
 594         netdev_upper_dev_unlink(port_dev, dev);
 595         port_dev->priv_flags &= ~IFF_L3MDEV_SLAVE;
 596
 597         netdev_rx_handler_unregister(port_dev);
 598
 599         cycle_netdev(port_dev);
 600
 601         slave = __vrf_find_slave_dev(queue, port_dev);
 602         if (slave)
 603                 __vrf_remove_slave(queue, slave);
 604
 605         kfree(slave);
 606
 607         return 0;
 608 }
 609
 610 static int vrf_del_slave(struct net_device *dev, struct net_device *port_dev)
 611 {
 612         return do_vrf_del_slave(dev, port_dev);
 613 }
 614
 615 static void vrf_dev_uninit(struct net_device *dev)
 616 {
 617         struct net_vrf *vrf = netdev_priv(dev);
 618 //      struct slave_queue *queue = &vrf->queue;
 619 //      struct list_head *head = &queue->all_slaves;
 620 //      struct slave *slave, *next;
 621
 622         vrf_rtable_release(vrf);
 623         vrf_rt6_release(vrf);
 624
 625 //      list_for_each_entry_safe(slave, next, head, list)
 626 //              vrf_del_slave(dev, slave->dev);
 627
 628         free_percpu(dev->dstats);
 629         dev->dstats = NULL;
 630 }
 631
 632 static int vrf_dev_init(struct net_device *dev)
 633 {
 634         struct net_vrf *vrf = netdev_priv(dev);
 635
 636         INIT_LIST_HEAD(&vrf->queue.all_slaves);
 637
 638         dev->dstats = netdev_alloc_pcpu_stats(struct pcpu_dstats);
 639         if (!dev->dstats)
 640                 goto out_nomem;
 641
 642         /* create the default dst which points back to us */
 643         vrf->rth = vrf_rtable_create(dev);
 644         if (!vrf->rth)
 645                 goto out_stats;
 646
 647         if (vrf_rt6_create(dev) != 0)
 648                 goto out_rth;
 649
 650         dev->flags = IFF_MASTER | IFF_NOARP;
 651
 652         return 0;
 653
 654 out_rth:
 655         vrf_rtable_release(vrf);
 656 out_stats:
 657         free_percpu(dev->dstats);
 658         dev->dstats = NULL;
 659 out_nomem:
 660         return -ENOMEM;
 661 }
 662
 663 static const struct net_device_ops vrf_netdev_ops = {
 664         .ndo_init               = vrf_dev_init,
 665         .ndo_uninit             = vrf_dev_uninit,
 666         .ndo_start_xmit         = vrf_xmit,
 667         .ndo_get_stats64        = vrf_get_stats64,
 668         .ndo_add_slave          = vrf_add_slave,
 669         .ndo_del_slave          = vrf_del_slave,
 670 };
 671
 672 static u32 vrf_fib_table(const struct net_device *dev)
 673 {
 674         struct net_vrf *vrf = netdev_priv(dev);
 675
 676         return vrf->tb_id;
 677 }
 678
 679 static struct rtable *vrf_get_rtable(const struct net_device *dev,
 680                                      const struct flowi4 *fl4)
 681 {
 682         struct rtable *rth = NULL;
 683
 684         if (!(fl4->flowi4_flags & FLOWI_FLAG_L3MDEV_SRC)) {
 685                 struct net_vrf *vrf = netdev_priv(dev);
 686
 687                 rth = vrf->rth;
 688                 dst_hold(&rth->dst);
 689         }
 690
 691         return rth;
 692 }
 693
 694 /* called under rcu_read_lock */
 695 static int vrf_get_saddr(struct net_device *dev, struct flowi4 *fl4)
 696 {
 697         struct fib_result res = { .tclassid = 0 };
 698         struct net *net = dev_net(dev);
 699         u32 orig_tos = fl4->flowi4_tos;
 700         u8 flags = fl4->flowi4_flags;
 701         u8 scope = fl4->flowi4_scope;
 702         u8 tos = RT_FL_TOS(fl4);
 703         int rc;
 704
 705         if (unlikely(!fl4->daddr))
 706                 return 0;
 707
 708         fl4->flowi4_flags |= FLOWI_FLAG_SKIP_NH_OIF;
 709         fl4->flowi4_iif = LOOPBACK_IFINDEX;
 710         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
 711         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
 712                              RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
 713
 714         rc = fib_lookup(net, fl4, &res, 0);
 715         if (!rc) {
 716                 if (res.type == RTN_LOCAL)
 717                         fl4->saddr = res.fi->fib_prefsrc ? : fl4->daddr;
 718                 else
 719                         fib_select_path(net, &res, fl4, -1);
 720         }
 721
 722         fl4->flowi4_flags = flags;
 723         fl4->flowi4_tos = orig_tos;
 724         fl4->flowi4_scope = scope;
 725
 726         return rc;
 727 }
 728
 729 #if IS_ENABLED(CONFIG_IPV6)
 730 static struct dst_entry *vrf_get_rt6_dst(const struct net_device *dev,
 731                                          const struct flowi6 *fl6)
 732 {
 733         struct rt6_info *rt = NULL;
 734
 735         if (!(fl6->flowi6_flags & FLOWI_FLAG_L3MDEV_SRC)) {
 736                 struct net_vrf *vrf = netdev_priv(dev);
 737
 738                 rt = vrf->rt6;
 739                 dst_hold(&rt->dst);
 740         }
 741
 742         return (struct dst_entry *)rt;
 743 }
 744 #endif
 745
 746 static const struct l3mdev_ops vrf_l3mdev_ops = {
 747         .l3mdev_fib_table       = vrf_fib_table,
 748         .l3mdev_get_rtable      = vrf_get_rtable,
 749         .l3mdev_get_saddr       = vrf_get_saddr,
 750 #if IS_ENABLED(CONFIG_IPV6)
 751         .l3mdev_get_rt6_dst     = vrf_get_rt6_dst,
 752 #endif
 753 };
 754
 755 static void vrf_get_drvinfo(struct net_device *dev,
 756                             struct ethtool_drvinfo *info)
 757 {
 758         strlcpy(info->driver, DRV_NAME, sizeof(info->driver));
 759         strlcpy(info->version, DRV_VERSION, sizeof(info->version));
 760 }
 761
 762 static const struct ethtool_ops vrf_ethtool_ops = {
 763         .get_drvinfo    = vrf_get_drvinfo,
 764 };
 765
 766 static void vrf_setup(struct net_device *dev)
 767 {
 768         ether_setup(dev);
 769
 770         /* Initialize the device structure. */
 771         dev->netdev_ops = &vrf_netdev_ops;
 772         dev->l3mdev_ops = &vrf_l3mdev_ops;
 773         dev->ethtool_ops = &vrf_ethtool_ops;
 774         dev->destructor = free_netdev;
 775
 776         /* Fill in device structure with ethernet-generic values. */
 777         eth_hw_addr_random(dev);
 778
 779         /* don't acquire vrf device's netif_tx_lock when transmitting */
 780         dev->features |= NETIF_F_LLTX;
 781
 782         /* don't allow vrf devices to change network namespaces. */
 783         dev->features |= NETIF_F_NETNS_LOCAL;
 784 }
 785
 786 static int vrf_validate(struct nlattr *tb[], struct nlattr *data[])
 787 {
 788         if (tb[IFLA_ADDRESS]) {
 789                 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
 790                         return -EINVAL;
 791                 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
 792                         return -EADDRNOTAVAIL;
 793         }
 794         return 0;
 795 }
 796
 797 static void vrf_dellink(struct net_device *dev, struct list_head *head)
 798 {
 799         struct net_vrf *vrf = netdev_priv(dev);
 800         struct slave_queue *queue = &vrf->queue;
 801         struct list_head *all_slaves = &queue->all_slaves;
 802         struct slave *slave, *next;
 803
 804         list_for_each_entry_safe(slave, next, all_slaves, list)
 805                 vrf_del_slave(dev, slave->dev);
 806
 807         unregister_netdevice_queue(dev, head);
 808 }
 809
 810 static int vrf_newlink(struct net *src_net, struct net_device *dev,
 811                        struct nlattr *tb[], struct nlattr *data[])
 812 {
 813         struct net_vrf *vrf = netdev_priv(dev);
 814
 815         if (!data || !data[IFLA_VRF_TABLE])
 816                 return -EINVAL;
 817
 818         vrf->tb_id = nla_get_u32(data[IFLA_VRF_TABLE]);
 819         if (vrf->tb_id == RT_TABLE_UNSPEC)
 820                 return -EINVAL;
 821
 822         dev->priv_flags |= IFF_L3MDEV_MASTER;
 823
 824         return register_netdevice(dev);
 825 }
 826
 827 static size_t vrf_nl_getsize(const struct net_device *dev)
 828 {
 829         return nla_total_size(sizeof(u32));  /* IFLA_VRF_TABLE */
 830 }
 831
 832 static int vrf_fillinfo(struct sk_buff *skb,
 833                         const struct net_device *dev)
 834 {
 835         struct net_vrf *vrf = netdev_priv(dev);
 836
 837         return nla_put_u32(skb, IFLA_VRF_TABLE, vrf->tb_id);
 838 }
 839
 840 static const struct nla_policy vrf_nl_policy[IFLA_VRF_MAX + 1] = {
 841         [IFLA_VRF_TABLE] = { .type = NLA_U32 },
 842 };
 843
 844 static struct rtnl_link_ops vrf_link_ops __read_mostly = {
 845         .kind           = DRV_NAME,
 846         .priv_size      = sizeof(struct net_vrf),
 847
 848         .get_size       = vrf_nl_getsize,
 849         .policy         = vrf_nl_policy,
 850         .validate       = vrf_validate,
 851         .fill_info      = vrf_fillinfo,
 852
 853         .newlink        = vrf_newlink,
 854         .dellink        = vrf_dellink,
 855         .setup          = vrf_setup,
 856         .maxtype        = IFLA_VRF_MAX,
 857 };
 858
 859 static int vrf_device_event(struct notifier_block *unused,
 860                             unsigned long event, void *ptr)
 861 {
 862         struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 863
 864         /* only care about unregister events to drop slave references */
 865         if (event == NETDEV_UNREGISTER) {
 866                 struct net_device *vrf_dev;
 867
 868                 if (!netif_is_l3_slave(dev))
 869                         goto out;
 870
 871                 vrf_dev = netdev_master_upper_dev_get(dev);
 872                 vrf_del_slave(vrf_dev, dev);
 873         }
 874 out:
 875         return NOTIFY_DONE;
 876 }
 877
 878 static struct notifier_block vrf_notifier_block __read_mostly = {
 879         .notifier_call = vrf_device_event,
 880 };
 881
 882 static int __init vrf_init_module(void)
 883 {
 884         int rc;
 885
 886         register_netdevice_notifier(&vrf_notifier_block);
 887
 888         rc = rtnl_link_register(&vrf_link_ops);
 889         if (rc < 0)
 890                 goto error;
 891
 892         return 0;
 893
 894 error:
 895         unregister_netdevice_notifier(&vrf_notifier_block);
 896         return rc;
 897 }
 898
 899 module_init(vrf_init_module);
 900 MODULE_AUTHOR("Shrijeet Mukherjee, David Ahern");
 901 MODULE_DESCRIPTION("Device driver to instantiate VRF domains");
 902 MODULE_LICENSE("GPL");
 903 MODULE_ALIAS_RTNL_LINK(DRV_NAME);
 904 MODULE_VERSION(DRV_VERSION);