net/ipv6/ip6_output.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  *      IPv6 output functions
   4  *      Linux INET6 implementation
   5  *
   6  *      Authors:
   7  *      Pedro Roque             <roque@di.fc.ul.pt>
   8  *
   9  *      Based on linux/net/ipv4/ip_output.c
  10  *
  11  *      Changes:
   12  *      A.N.Kuznetsov   :       arithmetics in fragmentation.
  13  *                              extension headers are implemented.
  14  *                              route changes now work.
  15  *                              ip6_forward does not confuse sniffers.
  16  *                              etc.
  17  *
  18  *      H. von Brand    :       Added missing #include <linux/string.h>
  19  *      Imran Patel     :       frag id should be in NBO
  20  *      Kazunori MIYAZAWA @USAGI
  21  *                      :       add ip6_append_data and related functions
  22  *                              for datagram xmit
  23  */
  24
  25 #include <linux/errno.h>
  26 #include <linux/kernel.h>
  27 #include <linux/string.h>
  28 #include <linux/socket.h>
  29 #include <linux/net.h>
  30 #include <linux/netdevice.h>
  31 #include <linux/if_arp.h>
  32 #include <linux/in6.h>
  33 #include <linux/tcp.h>
  34 #include <linux/route.h>
  35 #include <linux/module.h>
  36 #include <linux/slab.h>
  37
  38 #include <linux/bpf-cgroup.h>
  39 #include <linux/netfilter.h>
  40 #include <linux/netfilter_ipv6.h>
  41
  42 #include <net/sock.h>
  43 #include <net/snmp.h>
  44
  45 #include <net/ipv6.h>
  46 #include <net/ndisc.h>
  47 #include <net/protocol.h>
  48 #include <net/ip6_route.h>
  49 #include <net/addrconf.h>
  50 #include <net/rawv6.h>
  51 #include <net/icmp.h>
  52 #include <net/xfrm.h>
  53 #include <net/checksum.h>
  54 #include <linux/mroute6.h>
  55 #include <net/l3mdev.h>
  56 #include <net/lwtunnel.h>
  57 #include <net/ip_tunnels.h>
  58
  59 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
  60 {
  61         struct dst_entry *dst = skb_dst(skb);
  62         struct net_device *dev = dst->dev;
  63         unsigned int hh_len = LL_RESERVED_SPACE(dev);
  64         int delta = hh_len - skb_headroom(skb);
  65         const struct in6_addr *nexthop;
  66         struct neighbour *neigh;
  67         int ret;
  68
  69         /* Be paranoid, rather than too clever. */
  70         if (unlikely(delta > 0) && dev->header_ops) {
  71                 /* pskb_expand_head() might crash, if skb is shared */
  72                 if (skb_shared(skb)) {
  73                         struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
  74
  75                         if (likely(nskb)) {
  76                                 if (skb->sk)
  77                                         skb_set_owner_w(nskb, skb->sk);
  78                                 consume_skb(skb);
  79                         } else {
  80                                 kfree_skb(skb);
  81                         }
  82                         skb = nskb;
  83                 }
  84                 if (skb &&
  85                     pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
  86                         kfree_skb(skb);
  87                         skb = NULL;
  88                 }
  89                 if (!skb) {
  90                         IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
  91                         return -ENOMEM;
  92                 }
  93         }
  94
  95         if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
  96                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
  97
  98                 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
  99                     ((mroute6_is_socket(net, skb) &&
 100                      !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
 101                      ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
 102                                          &ipv6_hdr(skb)->saddr))) {
 103                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
 104
 105                         /* Do not check for IFF_ALLMULTI; multicast routing
 106                            is not supported in any case.
 107                          */
 108                         if (newskb)
 109                                 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
 110                                         net, sk, newskb, NULL, newskb->dev,
 111                                         dev_loopback_xmit);
 112
 113                         if (ipv6_hdr(skb)->hop_limit == 0) {
 114                                 IP6_INC_STATS(net, idev,
 115                                               IPSTATS_MIB_OUTDISCARDS);
 116                                 kfree_skb(skb);
 117                                 return 0;
 118                         }
 119                 }
 120
 121                 IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
 122
 123                 if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
 124                     IPV6_ADDR_SCOPE_NODELOCAL &&
 125                     !(dev->flags & IFF_LOOPBACK)) {
 126                         kfree_skb(skb);
 127                         return 0;
 128                 }
 129         }
 130
 131         if (lwtunnel_xmit_redirect(dst->lwtstate)) {
 132                 int res = lwtunnel_xmit(skb);
 133
 134                 if (res < 0 || res == LWTUNNEL_XMIT_DONE)
 135                         return res;
 136         }
 137
 138         rcu_read_lock_bh();
 139         nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
 140         neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
 141         if (unlikely(!neigh))
 142                 neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
 143         if (!IS_ERR(neigh)) {
 144                 sock_confirm_neigh(skb, neigh);
 145                 ret = neigh_output(neigh, skb, false);
 146                 rcu_read_unlock_bh();
 147                 return ret;
 148         }
 149         rcu_read_unlock_bh();
 150
 151         IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
 152         kfree_skb(skb);
 153         return -EINVAL;
 154 }
 155
 156 static int
 157 ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
 158                                     struct sk_buff *skb, unsigned int mtu)
 159 {
 160         struct sk_buff *segs, *nskb;
 161         netdev_features_t features;
 162         int ret = 0;
 163
 164         /* Please see corresponding comment in ip_finish_output_gso
 165          * describing the cases where GSO segment length exceeds the
 166          * egress MTU.
 167          */
 168         features = netif_skb_features(skb);
 169         segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
 170         if (IS_ERR_OR_NULL(segs)) {
 171                 kfree_skb(skb);
 172                 return -ENOMEM;
 173         }
 174
 175         consume_skb(skb);
 176
 177         skb_list_walk_safe(segs, segs, nskb) {
 178                 int err;
 179
 180                 skb_mark_not_on_list(segs);
 181                 err = ip6_fragment(net, sk, segs, ip6_finish_output2);
 182                 if (err && ret == 0)
 183                         ret = err;
 184         }
 185
 186         return ret;
 187 }
 188
 189 static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 190 {
 191         unsigned int mtu;
 192
 193 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
 194         /* Policy lookup after SNAT yielded a new policy */
 195         if (skb_dst(skb)->xfrm) {
 196                 IP6CB(skb)->flags |= IP6SKB_REROUTED;
 197                 return dst_output(net, sk, skb);
 198         }
 199 #endif
 200
 201         mtu = ip6_skb_dst_mtu(skb);
 202         if (skb_is_gso(skb) && !skb_gso_validate_network_len(skb, mtu))
 203                 return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);
 204
 205         if ((skb->len > mtu && !skb_is_gso(skb)) ||
 206             dst_allfrag(skb_dst(skb)) ||
 207             (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
 208                 return ip6_fragment(net, sk, skb, ip6_finish_output2);
 209         else
 210                 return ip6_finish_output2(net, sk, skb);
 211 }
 212
 213 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 214 {
 215         int ret;
 216
 217         ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
 218         switch (ret) {
 219         case NET_XMIT_SUCCESS:
 220                 return __ip6_finish_output(net, sk, skb);
 221         case NET_XMIT_CN:
 222                 return __ip6_finish_output(net, sk, skb) ? : ret;
 223         default:
 224                 kfree_skb(skb);
 225                 return ret;
 226         }
 227 }
 228
 229 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 230 {
 231         struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;
 232         struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
 233
 234         skb->protocol = htons(ETH_P_IPV6);
 235         skb->dev = dev;
 236
 237         if (unlikely(idev->cnf.disable_ipv6)) {
 238                 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
 239                 kfree_skb(skb);
 240                 return 0;
 241         }
 242
 243         return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
 244                             net, sk, skb, indev, dev,
 245                             ip6_finish_output,
 246                             !(IP6CB(skb)->flags & IP6SKB_REROUTED));
 247 }
 248
 249 bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
 250 {
 251         if (!np->autoflowlabel_set)
 252                 return ip6_default_np_autolabel(net);
 253         else
 254                 return np->autoflowlabel;
 255 }
 256
 257 /*
 258  * xmit an sk_buff (used by TCP, SCTP and DCCP)
 259  * Note : socket lock is not held for SYNACK packets, but might be modified
 260  * by calls to skb_set_owner_w() and ipv6_local_error(),
 261  * which are using proper atomic operations or spinlocks.
 262  */
 263 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
 264              __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
 265 {
 266         struct net *net = sock_net(sk);
 267         const struct ipv6_pinfo *np = inet6_sk(sk);
 268         struct in6_addr *first_hop = &fl6->daddr;
 269         struct dst_entry *dst = skb_dst(skb);
 270         unsigned int head_room;
 271         struct ipv6hdr *hdr;
 272         u8  proto = fl6->flowi6_proto;
 273         int seg_len = skb->len;
 274         int hlimit = -1;
 275         u32 mtu;
 276
 277         head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
 278         if (opt)
 279                 head_room += opt->opt_nflen + opt->opt_flen;
 280
 281         if (unlikely(skb_headroom(skb) < head_room)) {
 282                 struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
 283                 if (!skb2) {
 284                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 285                                       IPSTATS_MIB_OUTDISCARDS);
 286                         kfree_skb(skb);
 287                         return -ENOBUFS;
 288                 }
 289                 if (skb->sk)
 290                         skb_set_owner_w(skb2, skb->sk);
 291                 consume_skb(skb);
 292                 skb = skb2;
 293         }
 294
 295         if (opt) {
 296                 seg_len += opt->opt_nflen + opt->opt_flen;
 297
 298                 if (opt->opt_flen)
 299                         ipv6_push_frag_opts(skb, opt, &proto);
 300
 301                 if (opt->opt_nflen)
 302                         ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
 303                                              &fl6->saddr);
 304         }
 305
 306         skb_push(skb, sizeof(struct ipv6hdr));
 307         skb_reset_network_header(skb);
 308         hdr = ipv6_hdr(skb);
 309
 310         /*
 311          *      Fill in the IPv6 header
 312          */
 313         if (np)
 314                 hlimit = np->hop_limit;
 315         if (hlimit < 0)
 316                 hlimit = ip6_dst_hoplimit(dst);
 317
 318         ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
 319                                 ip6_autoflowlabel(net, np), fl6));
 320
 321         hdr->payload_len = htons(seg_len);
 322         hdr->nexthdr = proto;
 323         hdr->hop_limit = hlimit;
 324
 325         hdr->saddr = fl6->saddr;
 326         hdr->daddr = *first_hop;
 327
 328         skb->protocol = htons(ETH_P_IPV6);
 329         skb->priority = priority;
 330         skb->mark = mark;
 331
 332         mtu = dst_mtu(dst);
 333         if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
 334                 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
 335                               IPSTATS_MIB_OUT, skb->len);
 336
 337                 /* if egress device is enslaved to an L3 master device pass the
 338                  * skb to its handler for processing
 339                  */
 340                 skb = l3mdev_ip6_out((struct sock *)sk, skb);
 341                 if (unlikely(!skb))
 342                         return 0;
 343
 344                 /* hooks should never assume socket lock is held.
 345                  * we promote our socket to non const
 346                  */
 347                 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
 348                                net, (struct sock *)sk, skb, NULL, dst->dev,
 349                                dst_output);
 350         }
 351
 352         skb->dev = dst->dev;
 353         /* ipv6_local_error() does not require socket lock,
 354          * we promote our socket to non const
 355          */
 356         ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
 357
 358         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
 359         kfree_skb(skb);
 360         return -EMSGSIZE;
 361 }
 362 EXPORT_SYMBOL(ip6_xmit);
 363
 364 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
 365 {
 366         struct ip6_ra_chain *ra;
 367         struct sock *last = NULL;
 368
 369         read_lock(&ip6_ra_lock);
 370         for (ra = ip6_ra_chain; ra; ra = ra->next) {
 371                 struct sock *sk = ra->sk;
 372                 if (sk && ra->sel == sel &&
 373                     (!sk->sk_bound_dev_if ||
 374                      sk->sk_bound_dev_if == skb->dev->ifindex)) {
 375                         struct ipv6_pinfo *np = inet6_sk(sk);
 376
 377                         if (np && np->rtalert_isolate &&
 378                             !net_eq(sock_net(sk), dev_net(skb->dev))) {
 379                                 continue;
 380                         }
 381                         if (last) {
 382                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
 383                                 if (skb2)
 384                                         rawv6_rcv(last, skb2);
 385                         }
 386                         last = sk;
 387                 }
 388         }
 389
 390         if (last) {
 391                 rawv6_rcv(last, skb);
 392                 read_unlock(&ip6_ra_lock);
 393                 return 1;
 394         }
 395         read_unlock(&ip6_ra_lock);
 396         return 0;
 397 }
 398
 399 static int ip6_forward_proxy_check(struct sk_buff *skb)
 400 {
 401         struct ipv6hdr *hdr = ipv6_hdr(skb);
 402         u8 nexthdr = hdr->nexthdr;
 403         __be16 frag_off;
 404         int offset;
 405
 406         if (ipv6_ext_hdr(nexthdr)) {
 407                 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
 408                 if (offset < 0)
 409                         return 0;
 410         } else
 411                 offset = sizeof(struct ipv6hdr);
 412
 413         if (nexthdr == IPPROTO_ICMPV6) {
 414                 struct icmp6hdr *icmp6;
 415
 416                 if (!pskb_may_pull(skb, (skb_network_header(skb) +
 417                                          offset + 1 - skb->data)))
 418                         return 0;
 419
 420                 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
 421
 422                 switch (icmp6->icmp6_type) {
 423                 case NDISC_ROUTER_SOLICITATION:
 424                 case NDISC_ROUTER_ADVERTISEMENT:
 425                 case NDISC_NEIGHBOUR_SOLICITATION:
 426                 case NDISC_NEIGHBOUR_ADVERTISEMENT:
 427                 case NDISC_REDIRECT:
 428                         /* For reaction involving unicast neighbor discovery
 429                          * message destined to the proxied address, pass it to
 430                          * input function.
 431                          */
 432                         return 1;
 433                 default:
 434                         break;
 435                 }
 436         }
 437
 438         /*
 439          * The proxying router can't forward traffic sent to a link-local
 440          * address, so signal the sender and discard the packet. This
 441          * behavior is clarified by the MIPv6 specification.
 442          */
 443         if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
 444                 dst_link_failure(skb);
 445                 return -1;
 446         }
 447
 448         return 0;
 449 }
 450
 451 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
 452                                      struct sk_buff *skb)
 453 {
 454         struct dst_entry *dst = skb_dst(skb);
 455
 456         __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
 457         __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
 458
 459 #ifdef CONFIG_NET_SWITCHDEV
 460         if (skb->offload_l3_fwd_mark) {
 461                 consume_skb(skb);
 462                 return 0;
 463         }
 464 #endif
 465
 466         skb->tstamp = 0;
 467         return dst_output(net, sk, skb);
 468 }
 469
 470 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
 471 {
 472         if (skb->len <= mtu)
 473                 return false;
 474
 475         /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
 476         if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
 477                 return true;
 478
 479         if (skb->ignore_df)
 480                 return false;
 481
 482         if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
 483                 return false;
 484
 485         return true;
 486 }
 487
 488 int ip6_forward(struct sk_buff *skb)
 489 {
 490         struct dst_entry *dst = skb_dst(skb);
 491         struct ipv6hdr *hdr = ipv6_hdr(skb);
 492         struct inet6_skb_parm *opt = IP6CB(skb);
 493         struct net *net = dev_net(dst->dev);
 494         struct inet6_dev *idev;
 495         u32 mtu;
 496
 497         idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
 498         if (net->ipv6.devconf_all->forwarding == 0)
 499                 goto error;
 500
 501         if (skb->pkt_type != PACKET_HOST)
 502                 goto drop;
 503
 504         if (unlikely(skb->sk))
 505                 goto drop;
 506
 507         if (skb_warn_if_lro(skb))
 508                 goto drop;
 509
 510         if (!net->ipv6.devconf_all->disable_policy &&
 511             (!idev || !idev->cnf.disable_policy) &&
 512             !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
 513                 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
 514                 goto drop;
 515         }
 516
 517         skb_forward_csum(skb);
 518
 519         /*
 520          *      We DO NOT make any processing on
 521          *      RA packets, pushing them to user level AS IS
 522          *      without ane WARRANTY that application will be able
 523          *      to interpret them. The reason is that we
 524          *      cannot make anything clever here.
 525          *
 526          *      We are not end-node, so that if packet contains
 527          *      AH/ESP, we cannot make anything.
 528          *      Defragmentation also would be mistake, RA packets
 529          *      cannot be fragmented, because there is no warranty
 530          *      that different fragments will go along one path. --ANK
 531          */
 532         if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
 533                 if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
 534                         return 0;
 535         }
 536
 537         /*
 538          *      check and decrement ttl
 539          */
 540         if (hdr->hop_limit <= 1) {
 541                 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
 542                 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
 543
 544                 kfree_skb(skb);
 545                 return -ETIMEDOUT;
 546         }
 547
 548         /* XXX: idev->cnf.proxy_ndp? */
 549         if (net->ipv6.devconf_all->proxy_ndp &&
 550             pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
 551                 int proxied = ip6_forward_proxy_check(skb);
 552                 if (proxied > 0)
 553                         return ip6_input(skb);
 554                 else if (proxied < 0) {
 555                         __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
 556                         goto drop;
 557                 }
 558         }
 559
 560         if (!xfrm6_route_forward(skb)) {
 561                 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
 562                 goto drop;
 563         }
 564         dst = skb_dst(skb);
 565
 566         /* IPv6 specs say nothing about it, but it is clear that we cannot
 567            send redirects to source routed frames.
 568            We don't send redirects to frames decapsulated from IPsec.
 569          */
 570         if (IP6CB(skb)->iif == dst->dev->ifindex &&
 571             opt->srcrt == 0 && !skb_sec_path(skb)) {
 572                 struct in6_addr *target = NULL;
 573                 struct inet_peer *peer;
 574                 struct rt6_info *rt;
 575
 576                 /*
 577                  *      incoming and outgoing devices are the same
 578                  *      send a redirect.
 579                  */
 580
 581                 rt = (struct rt6_info *) dst;
 582                 if (rt->rt6i_flags & RTF_GATEWAY)
 583                         target = &rt->rt6i_gateway;
 584                 else
 585                         target = &hdr->daddr;
 586
 587                 peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
 588
 589                 /* Limit redirects both by destination (here)
 590                    and by source (inside ndisc_send_redirect)
 591                  */
 592                 if (inet_peer_xrlim_allow(peer, 1*HZ))
 593                         ndisc_send_redirect(skb, target);
 594                 if (peer)
 595                         inet_putpeer(peer);
 596         } else {
 597                 int addrtype = ipv6_addr_type(&hdr->saddr);
 598
 599                 /* This check is security critical. */
 600                 if (addrtype == IPV6_ADDR_ANY ||
 601                     addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
 602                         goto error;
 603                 if (addrtype & IPV6_ADDR_LINKLOCAL) {
 604                         icmpv6_send(skb, ICMPV6_DEST_UNREACH,
 605                                     ICMPV6_NOT_NEIGHBOUR, 0);
 606                         goto error;
 607                 }
 608         }
 609
 610         mtu = ip6_dst_mtu_forward(dst);
 611         if (mtu < IPV6_MIN_MTU)
 612                 mtu = IPV6_MIN_MTU;
 613
 614         if (ip6_pkt_too_big(skb, mtu)) {
 615                 /* Again, force OUTPUT device used as source address */
 616                 skb->dev = dst->dev;
 617                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 618                 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
 619                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
 620                                 IPSTATS_MIB_FRAGFAILS);
 621                 kfree_skb(skb);
 622                 return -EMSGSIZE;
 623         }
 624
 625         if (skb_cow(skb, dst->dev->hard_header_len)) {
 626                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
 627                                 IPSTATS_MIB_OUTDISCARDS);
 628                 goto drop;
 629         }
 630
 631         hdr = ipv6_hdr(skb);
 632
 633         /* Mangling hops number delayed to point after skb COW */
 634
 635         hdr->hop_limit--;
 636
 637         return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
 638                        net, NULL, skb, skb->dev, dst->dev,
 639                        ip6_forward_finish);
 640
 641 error:
 642         __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
 643 drop:
 644         kfree_skb(skb);
 645         return -EINVAL;
 646 }
 647
 648 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 649 {
 650         to->pkt_type = from->pkt_type;
 651         to->priority = from->priority;
 652         to->protocol = from->protocol;
 653         skb_dst_drop(to);
 654         skb_dst_set(to, dst_clone(skb_dst(from)));
 655         to->dev = from->dev;
 656         to->mark = from->mark;
 657
 658         skb_copy_hash(to, from);
 659
 660 #ifdef CONFIG_NET_SCHED
 661         to->tc_index = from->tc_index;
 662 #endif
 663         nf_copy(to, from);
 664         skb_ext_copy(to, from);
 665         skb_copy_secmark(to, from);
 666 }
 667
 668 int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
 669                       u8 nexthdr, __be32 frag_id,
 670                       struct ip6_fraglist_iter *iter)
 671 {
 672         unsigned int first_len;
 673         struct frag_hdr *fh;
 674
 675         /* BUILD HEADER */
 676         *prevhdr = NEXTHDR_FRAGMENT;
 677         iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
 678         if (!iter->tmp_hdr)
 679                 return -ENOMEM;
 680
 681         iter->frag = skb_shinfo(skb)->frag_list;
 682         skb_frag_list_init(skb);
 683
 684         iter->offset = 0;
 685         iter->hlen = hlen;
 686         iter->frag_id = frag_id;
 687         iter->nexthdr = nexthdr;
 688
 689         __skb_pull(skb, hlen);
 690         fh = __skb_push(skb, sizeof(struct frag_hdr));
 691         __skb_push(skb, hlen);
 692         skb_reset_network_header(skb);
 693         memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);
 694
 695         fh->nexthdr = nexthdr;
 696         fh->reserved = 0;
 697         fh->frag_off = htons(IP6_MF);
 698         fh->identification = frag_id;
 699
 700         first_len = skb_pagelen(skb);
 701         skb->data_len = first_len - skb_headlen(skb);
 702         skb->len = first_len;
 703         ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));
 704
 705         return 0;
 706 }
 707 EXPORT_SYMBOL(ip6_fraglist_init);
 708
 709 void ip6_fraglist_prepare(struct sk_buff *skb,
 710                           struct ip6_fraglist_iter *iter)
 711 {
 712         struct sk_buff *frag = iter->frag;
 713         unsigned int hlen = iter->hlen;
 714         struct frag_hdr *fh;
 715
 716         frag->ip_summed = CHECKSUM_NONE;
 717         skb_reset_transport_header(frag);
 718         fh = __skb_push(frag, sizeof(struct frag_hdr));
 719         __skb_push(frag, hlen);
 720         skb_reset_network_header(frag);
 721         memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
 722         iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
 723         fh->nexthdr = iter->nexthdr;
 724         fh->reserved = 0;
 725         fh->frag_off = htons(iter->offset);
 726         if (frag->next)
 727                 fh->frag_off |= htons(IP6_MF);
 728         fh->identification = iter->frag_id;
 729         ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
 730         ip6_copy_metadata(frag, skb);
 731 }
 732 EXPORT_SYMBOL(ip6_fraglist_prepare);
 733
 734 void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
 735                    unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
 736                    u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
 737 {
 738         state->prevhdr = prevhdr;
 739         state->nexthdr = nexthdr;
 740         state->frag_id = frag_id;
 741
 742         state->hlen = hlen;
 743         state->mtu = mtu;
 744
 745         state->left = skb->len - hlen;  /* Space per frame */
 746         state->ptr = hlen;              /* Where to start from */
 747
 748         state->hroom = hdr_room;
 749         state->troom = needed_tailroom;
 750
 751         state->offset = 0;
 752 }
 753 EXPORT_SYMBOL(ip6_frag_init);
 754
 755 struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
 756 {
 757         u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
 758         struct sk_buff *frag;
 759         struct frag_hdr *fh;
 760         unsigned int len;
 761
 762         len = state->left;
 763         /* IF: it doesn't fit, use 'mtu' - the data space left */
 764         if (len > state->mtu)
 765                 len = state->mtu;
 766         /* IF: we are not sending up to and including the packet end
 767            then align the next start on an eight byte boundary */
 768         if (len < state->left)
 769                 len &= ~7;
 770
 771         /* Allocate buffer */
 772         frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
 773                          state->hroom + state->troom, GFP_ATOMIC);
 774         if (!frag)
 775                 return ERR_PTR(-ENOMEM);
 776
 777         /*
 778          *      Set up data on packet
 779          */
 780
 781         ip6_copy_metadata(frag, skb);
 782         skb_reserve(frag, state->hroom);
 783         skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
 784         skb_reset_network_header(frag);
 785         fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
 786         frag->transport_header = (frag->network_header + state->hlen +
 787                                   sizeof(struct frag_hdr));
 788
 789         /*
 790          *      Charge the memory for the fragment to any owner
 791          *      it might possess
 792          */
 793         if (skb->sk)
 794                 skb_set_owner_w(frag, skb->sk);
 795
 796         /*
 797          *      Copy the packet header into the new buffer.
 798          */
 799         skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);
 800
 801         fragnexthdr_offset = skb_network_header(frag);
 802         fragnexthdr_offset += prevhdr - skb_network_header(skb);
 803         *fragnexthdr_offset = NEXTHDR_FRAGMENT;
 804
 805         /*
 806          *      Build fragment header.
 807          */
 808         fh->nexthdr = state->nexthdr;
 809         fh->reserved = 0;
 810         fh->identification = state->frag_id;
 811
 812         /*
 813          *      Copy a block of the IP datagram.
 814          */
 815         BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
 816                              len));
 817         state->left -= len;
 818
 819         fh->frag_off = htons(state->offset);
 820         if (state->left > 0)
 821                 fh->frag_off |= htons(IP6_MF);
 822         ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
 823
 824         state->ptr += len;
 825         state->offset += len;
 826
 827         return frag;
 828 }
 829 EXPORT_SYMBOL(ip6_frag_next);
 830
 831 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
 832                  int (*output)(struct net *, struct sock *, struct sk_buff *))
 833 {
             /*
              * Fragment @skb and pass each resulting packet to
              * output(net, sk, ...).
              *
              * Fast path: when @skb already carries a frag list whose geometry
              * fits, the head and the list members are sent as the fragments
              * through the ip6_fraglist_*() helpers.  Slow path: every fragment
              * is built as a new skb by ip6_frag_next() and the original is
              * released with consume_skb() once all of them went out.
              *
              * Returns 0 on success, otherwise the error that stopped the
              * fragmentation (-EMSGSIZE on the fail_toobig exit).  On the fail
              * and fail_toobig exits @skb is freed here.
              */
 834         struct sk_buff *frag;
 835         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
 836         struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
 837                                 inet6_sk(skb->sk) : NULL;
 838         struct ip6_frag_state state;
 839         unsigned int mtu, hlen, nexthdr_offset;
 840         ktime_t tstamp = skb->tstamp;
 841         int hroom, err = 0;
 842         __be32 frag_id;
 843         u8 *prevhdr, nexthdr = 0;
 844
             /*
              * A non-negative result becomes hlen, the length of the header
              * part that precedes the fragment header in every fragment;
              * prevhdr points at the next-header byte that will be rewritten.
              */
 845         err = ip6_find_1stfragopt(skb, &prevhdr);
 846         if (err < 0)
 847                 goto fail;
 848         hlen = err;
 849         nexthdr = *prevhdr;
             /* Keep prevhdr as an offset too; the pointer is rebuilt further down. */
 850         nexthdr_offset = prevhdr - skb_network_header(skb);
 851
 852         mtu = ip6_skb_dst_mtu(skb);
 853
 854         /* We must not fragment if the socket is set to force MTU discovery
 855          * or if the skb is not generated by a local socket.
 856          */
 857         if (unlikely(!skb->ignore_df && skb->len > mtu))
 858                 goto fail_toobig;
 859
 860         if (IP6CB(skb)->frag_max_size) {
 861                 if (IP6CB(skb)->frag_max_size > mtu)
 862                         goto fail_toobig;
 863
 864                 /* don't send fragments larger than what we received */
 865                 mtu = IP6CB(skb)->frag_max_size;
 866                 if (mtu < IPV6_MIN_MTU)
 867                         mtu = IPV6_MIN_MTU;
 868         }
 869
             /* A non-zero per-socket frag_size below the current mtu replaces it. */
 870         if (np && np->frag_size < mtu) {
 871                 if (np->frag_size)
 872                         mtu = np->frag_size;
 873         }
             /* Need room for the headers, a fragment header and 8 bytes of payload. */
 874         if (mtu < hlen + sizeof(struct frag_hdr) + 8)
 875                 goto fail_toobig;
             /* From here on mtu is the payload budget of a single fragment. */
 876         mtu -= hlen + sizeof(struct frag_hdr);
 877
 878         frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
 879                                     &ipv6_hdr(skb)->saddr);
 880
 881         if (skb->ip_summed == CHECKSUM_PARTIAL &&
 882             (err = skb_checksum_help(skb)))
 883                 goto fail;
 884
             /*
              * Rebuild prevhdr from the saved offset; presumably
              * skb_checksum_help() may have moved the header data - TODO confirm.
              */
 885         prevhdr = skb_network_header(skb) + nexthdr_offset;
 886         hroom = LL_RESERVED_SPACE(rt->dst.dev);
 887         if (skb_has_frag_list(skb)) {
 888                 unsigned int first_len = skb_pagelen(skb);
 889                 struct ip6_fraglist_iter iter;
 890                 struct sk_buff *frag2;
 891
                     /*
                      * The head qualifies only if its payload fits the budget,
                      * is a multiple of 8 bytes, the skb is not cloned and it
                      * has headroom for the link-layer reserve plus a fragment
                      * header.
                      */
 892                 if (first_len - hlen > mtu ||
 893                     ((first_len - hlen) & 7) ||
 894                     skb_cloned(skb) ||
 895                     skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
 896                         goto slow_path;
 897
 898                 skb_walk_frags(skb, frag) {
 899                         /* Correct geometry. */
 900                         if (frag->len > mtu ||
 901                             ((frag->len & 7) && frag->next) ||
 902                             skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
 903                                 goto slow_path_clean;
 904
 905                         /* Partially cloned skb? */
 906                         if (skb_shared(frag))
 907                                 goto slow_path_clean;
 908
                             /*
                              * Give the frag the head's socket as owner and
                              * move its truesize out of the head's accounting.
                              */
 909                         BUG_ON(frag->sk);
 910                         if (skb->sk) {
 911                                 frag->sk = skb->sk;
 912                                 frag->destructor = sock_wfree;
 913                         }
 914                         skb->truesize -= frag->truesize;
 915                 }
 916
                     /*
                      * NOTE(review): unlike slow_path_clean, the "goto fail"
                      * below does not undo the sk/destructor/truesize changes
                      * made in the loop above - confirm this is intended.
                      */
 917                 err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
 918                                         &iter);
 919                 if (err < 0)
 920                         goto fail;
 921
 922                 for (;;) {
 923                         /* Prepare header of the next frame,
 924                          * before previous one went down. */
 925                         if (iter.frag)
 926                                 ip6_fraglist_prepare(skb, &iter);
 927
 928                         skb->tstamp = tstamp;
 929                         err = output(net, sk, skb);
 930                         if (!err)
 931                                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 932                                               IPSTATS_MIB_FRAGCREATES);
 933
 934                         if (err || !iter.frag)
 935                                 break;
 936
 937                         skb = ip6_fraglist_next(&iter);
 938                 }
 939
 940                 kfree(iter.tmp_hdr);
 941
 942                 if (err == 0) {
 943                         IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 944                                       IPSTATS_MIB_FRAGOKS);
 945                         return 0;
 946                 }
 947
                     /* output() failed: drop the fragments that were not sent yet. */
 948                 kfree_skb_list(iter.frag);
 949
 950                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 951                               IPSTATS_MIB_FRAGFAILS);
 952                 return err;
 953
 954 slow_path_clean:
                     /*
                      * Revert the ownership and truesize changes for the frags
                      * visited before the one (frag) that failed the checks.
                      */
 955                 skb_walk_frags(skb, frag2) {
 956                         if (frag2 == frag)
 957                                 break;
 958                         frag2->sk = NULL;
 959                         frag2->destructor = NULL;
 960                         skb->truesize += frag2->truesize;
 961                 }
 962         }
 963
 964 slow_path:
 965         /*
 966          *      Fragment the datagram.
 967          */
 968
 969         ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
 970                       LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
 971                       &state);
 972
 973         /*
 974          *      Keep copying data until we run out.
 975          */
 976
 977         while (state.left > 0) {
 978                 frag = ip6_frag_next(skb, &state);
 979                 if (IS_ERR(frag)) {
 980                         err = PTR_ERR(frag);
 981                         goto fail;
 982                 }
 983
 984                 /*
 985                  *      Put this fragment into the sending queue.
 986                  */
 987                 frag->tstamp = tstamp;
 988                 err = output(net, sk, frag);
 989                 if (err)
 990                         goto fail;
 991
 992                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 993                               IPSTATS_MIB_FRAGCREATES);
 994         }
 995         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 996                       IPSTATS_MIB_FRAGOKS);
             /* Every fragment was copied out of skb, so the original can go. */
 997         consume_skb(skb);
 998         return err;
 999
1000 fail_toobig:
1001         if (skb->sk && dst_allfrag(skb_dst(skb)))
1002                 sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
1003
             /* Report the mtu value held at the point of the jump. */
1004         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
1005         err = -EMSGSIZE;
1006
1007 fail:
1008         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1009                       IPSTATS_MIB_FRAGFAILS);
1010         kfree_skb(skb);
1011         return err;
1012 }
1013
1014 static inline int ip6_rt_check(const struct rt6key *rt_key,
1015                                const struct in6_addr *fl_addr,
1016                                const struct in6_addr *addr_cache)
1017 {
             /*
              * 0 when fl_addr equals the key of a /128 route or equals the
              * cached address (if there is one); 1 otherwise.
              */
1018         return !((rt_key->plen == 128 && ipv6_addr_equal(fl_addr, &rt_key->addr)) ||
1019                  (addr_cache && ipv6_addr_equal(fl_addr, addr_cache)));
1020 }
1021
1022 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
1023                                           struct dst_entry *dst,
1024                                           const struct flowi6 *fl6)
1025 {
             /*
              * Decide whether @dst (may be NULL) can still be used for @fl6 on
              * @sk.  Returns @dst when it passes; otherwise the reference is
              * dropped with dst_release() and NULL is returned.
              */
1026         struct ipv6_pinfo *np = inet6_sk(sk);
1027         struct rt6_info *rt;
1028
1029         if (!dst)
1030                 goto out;
1031
             /* Only AF_INET6 entries are cast to rt6_info below. */
1032         if (dst->ops->family != AF_INET6) {
1033                 dst_release(dst);
1034                 return NULL;
1035         }
1036
1037         rt = (struct rt6_info *)dst;
1038         /* Yes, checking route validity in not connected
1039          * case is not very simple. Take into account,
1040          * that we do not support routing by source, TOS,
1041          * and MSG_DONTROUTE            --ANK (980726)
1042          *
1043          * 1. ip6_rt_check(): If route was host route,
1044          *    check that cached destination is current.
1045          *    If it is network route, we still may
1046          *    check its validity using saved pointer
1047          *    to the last used address: daddr_cache.
1048          *    We do not want to save whole address now,
1049          *    (because main consumer of this service
1050          *    is tcp, which has not this problem),
1051          *    so that the last trick works only on connected
1052          *    sockets.
1053          * 2. oif also should be the same.
1054          */
             /*
              * The entry is dropped if the destination check fails, if (with
              * CONFIG_IPV6_SUBTREES) the source check fails, or if the flow
              * names an oif other than dst->dev and FLOWI_FLAG_SKIP_NH_OIF
              * is not set.
              */
1055         if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
1056 #ifdef CONFIG_IPV6_SUBTREES
1057             ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
1058 #endif
1059            (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
1060               (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
1061                 dst_release(dst);
1062                 dst = NULL;
1063         }
1064
1065 out:
1066         return dst;
1067 }
1068
1069 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
1070                                struct dst_entry **dst, struct flowi6 *fl6)
1071 {
             /*
              * Resolve a route for @fl6 into *@dst (which may already hold a
              * candidate on entry) and, when fl6->saddr is unspecified, choose
              * a source address for the flow.
              *
              * Returns 0 with *@dst set.  On error *@dst is released, set to
              * NULL and the error code is returned; -ENETUNREACH additionally
              * bumps IPSTATS_MIB_OUTNOROUTES.
              */
1072 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1073         struct neighbour *n;
1074         struct rt6_info *rt;
1075 #endif
1076         int err;
1077         int flags = 0;
1078
1079         /* The correct way to handle this would be to do
1080          * ip6_route_get_saddr, and then ip6_route_output; however,
1081          * the route-specific preferred source forces the
1082          * ip6_route_output call _before_ ip6_route_get_saddr.
1083          *
1084          * In source specific routing (no src=any default route),
1085          * ip6_route_output will fail given src=any saddr, though, so
1086          * that's why we try it again later.
1087          */
1088         if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
1089                 struct fib6_info *from;
1090                 struct rt6_info *rt;
1091                 bool had_dst = *dst != NULL;
1092
1093                 if (!had_dst)
1094                         *dst = ip6_route_output(net, sk, fl6);
                     /* An erroneous lookup result contributes no route info. */
1095                 rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
1096
                     /* rt->from is read with rcu_dereference(), hence the RCU section. */
1097                 rcu_read_lock();
1098                 from = rt ? rcu_dereference(rt->from) : NULL;
1099                 err = ip6_route_get_saddr(net, from, &fl6->daddr,
1100                                           sk ? inet6_sk(sk)->srcprefs : 0,
1101                                           &fl6->saddr);
1102                 rcu_read_unlock();
1103
1104                 if (err)
1105                         goto out_err_release;
1106
1107                 /* If we had an erroneous initial result, pretend it
1108                  * never existed and let the SA-enabled version take
1109                  * over.
1110                  */
1111                 if (!had_dst && (*dst)->error) {
1112                         dst_release(*dst);
1113                         *dst = NULL;
1114                 }
1115
1116                 if (fl6->flowi6_oif)
1117                         flags |= RT6_LOOKUP_F_IFACE;
1118         }
1119
             /* No usable candidate yet: look up with the (now known) saddr. */
1120         if (!*dst)
1121                 *dst = ip6_route_output_flags(net, sk, fl6, flags);
1122
1123         err = (*dst)->error;
1124         if (err)
1125                 goto out_err_release;
1126
1127 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1128         /*
1129          * Here if the dst entry we've looked up
1130          * has a neighbour entry that is in the INCOMPLETE
1131          * state and the src address from the flow is
1132          * marked as OPTIMISTIC, we release the found
1133          * dst entry and replace it instead with the
1134          * dst entry of the nexthop router
1135          */
             /*
              * NOTE(review): the test below is "neighbour found and not
              * NUD_VALID", which is presumably broader than INCOMPLETE -
              * confirm against the NUD_* definitions.
              */
1136         rt = (struct rt6_info *) *dst;
1137         rcu_read_lock_bh();
1138         n = __ipv6_neigh_lookup_noref(rt->dst.dev,
1139                                       rt6_nexthop(rt, &fl6->daddr));
1140         err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
1141         rcu_read_unlock_bh();
1142
             /*
              * err only flags the neighbour state here; when no redirect
              * happens the function still reaches "return 0" below.
              */
1143         if (err) {
1144                 struct inet6_ifaddr *ifp;
1145                 struct flowi6 fl_gw6;
1146                 int redirect;
1147
1148                 ifp = ipv6_get_ifaddr(net, &fl6->saddr,
1149                                       (*dst)->dev, 1);
1150
1151                 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
1152                 if (ifp)
1153                         in6_ifa_put(ifp);
1154
1155                 if (redirect) {
1156                         /*
1157                          * We need to get the dst entry for the
1158                          * default router instead
1159                          */
                             /* Same flow with the destination zeroed out. */
1160                         dst_release(*dst);
1161                         memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1162                         memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1163                         *dst = ip6_route_output(net, sk, &fl_gw6);
1164                         err = (*dst)->error;
1165                         if (err)
1166                                 goto out_err_release;
1167                 }
1168         }
1169 #endif
             /*
              * A v4-mapped source is accepted only together with a v4-mapped
              * or unspecified destination.
              */
1170         if (ipv6_addr_v4mapped(&fl6->saddr) &&
1171             !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
1172                 err = -EAFNOSUPPORT;
1173                 goto out_err_release;
1174         }
1175
1176         return 0;
1177
1178 out_err_release:
             /* This also drops a dst the caller passed in through *dst. */
1179         dst_release(*dst);
1180         *dst = NULL;
1181
1182         if (err == -ENETUNREACH)
1183                 IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1184         return err;
1185 }
1186
1187 /**
1188  *      ip6_dst_lookup - look up the route for a flow from scratch
1189  *      @net: network namespace in which the lookup is done
1190  *      @sk: socket supplying route info
1191  *      @dst: result slot; cleared here, then filled in by the lookup
1192  *      @fl6: flow to look up
1193  *      Return: zero on success, or a standard errno code on error.
1194  */
1195 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1196                    struct flowi6 *fl6)
1197 {
1198         int err;
1199
1200         *dst = NULL;
1201         err = ip6_dst_lookup_tail(net, sk, dst, fl6);
1202         return err;
1203 }
1204 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1205
1206 /**
1207  *      ip6_dst_lookup_flow - route lookup for a flow, followed by xfrm lookup
1208  *      @net: network namespace in which the lookup is done
1209  *      @sk: socket supplying route info
1210  *      @fl6: flow to look up; its daddr is overwritten when @final_dst is set
1211  *      @final_dst: final destination address for the ipsec lookup, or NULL
1212  *
1213  *      The plain route lookup runs first; its result is then handed to
1214  *      xfrm_lookup_route() together with the (possibly updated) flow.
1215  *
1216  *      Return: a valid dst pointer on success, or a pointer encoded error code.
1217  */
1218 struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
1219                                       const struct in6_addr *final_dst)
1220 {
1221         struct dst_entry *route = NULL;
1222         int rc = ip6_dst_lookup_tail(net, sk, &route, fl6);
1223
1224         if (rc)
1225                 return ERR_PTR(rc);
1226
1227         /* From here on the flow targets the final destination, if one was given. */
1228         if (final_dst)
1229                 fl6->daddr = *final_dst;
1230         return xfrm_lookup_route(net, route, flowi6_to_flowi(fl6), sk, 0);
1231 }
1232 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1233
1234 /**
1235  *      ip6_sk_dst_lookup_flow - route lookup that tries the socket's cache first
1236  *      @sk: socket which provides the dst cache and route info
1237  *      @fl6: flow to look up
1238  *      @final_dst: final destination address for the ipsec lookup
1239  *      @connected: whether @sk is connected or not
1240  *
1241  *      The route cached in the socket is returned if it is still valid for
1242  *      the flow; otherwise a full lookup is done and, for a connected socket,
1243  *      the result is stored back into the socket.  The socket dst lock is
1244  *      taken when operating on the dst cache, so this function can only be
1245  *      used in process context.
1246  *
1247  *      Return: a valid dst pointer on success, or a pointer encoded
1248  *      error code.
1249  */
1250 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1251                                          const struct in6_addr *final_dst,
1252                                          bool connected)
1253 {
1254         struct dst_entry *cached, *fresh;
1255
1256         cached = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1257         cached = ip6_sk_dst_check(sk, cached, fl6);
1258         if (cached)
1259                 return cached;
1260
1261         fresh = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
1262         if (IS_ERR(fresh) || !connected)
1263                 return fresh;
1264
1265         ip6_sk_dst_store_flow(sk, dst_clone(fresh), fl6);
1266         return fresh;
1267 }
1268 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1269
1270 /**
1271  *      ip6_dst_lookup_tunnel - perform route lookup on tunnel
1272  *      @skb: Packet for which lookup is done
1273  *      @dev: Tunnel device
1274  *      @net: Network namespace of tunnel device
1275  *      @sock: Socket which provides route info
1276  *      @saddr: Memory to store the src ip address
1277  *      @info: Tunnel information
1278  *      @protocol: IP protocol
1279  *      @use_cache: Flag to enable cache usage
      *
1280  *      This function performs a route lookup on a tunnel.
1281  *
1282  *      It returns a valid dst pointer and stores src address to be used in
1283  *      tunnel in param saddr on success, else a pointer encoded error code:
      *      ERR_PTR(-ENETUNREACH) when the lookup fails and ERR_PTR(-ELOOP) when
      *      the route found leads back to @dev.
1284  */
1285
1286 struct dst_entry *ip6_dst_lookup_tunnel(struct sk_buff *skb,
1287                                         struct net_device *dev,
1288                                         struct net *net,
1289                                         struct socket *sock,
1290                                         struct in6_addr *saddr,
1291                                         const struct ip_tunnel_info *info,
1292                                         u8 protocol,
1293                                         bool use_cache)
1294 {
1295         struct dst_entry *dst = NULL;
1296 #ifdef CONFIG_DST_CACHE
1297         struct dst_cache *dst_cache;
1298 #endif
1299         struct flowi6 fl6;
1300         __u8 prio;
1301
1302 #ifdef CONFIG_DST_CACHE
             /* The cast drops the const that comes with @info. */
1303         dst_cache = (struct dst_cache *)&info->dst_cache;
1304         if (use_cache) {
                     /*
                      * Cache hit: return right away; @saddr is presumably
                      * filled in by dst_cache_get_ip6() - TODO confirm.
                      */
1305                 dst = dst_cache_get_ip6(dst_cache, saddr);
1306                 if (dst)
1307                         return dst;
1308         }
1309 #endif
             /* Build the flow from the skb mark, @protocol and the tunnel key. */
1310         memset(&fl6, 0, sizeof(fl6));
1311         fl6.flowi6_mark = skb->mark;
1312         fl6.flowi6_proto = protocol;
1313         fl6.daddr = info->key.u.ipv6.dst;
1314         fl6.saddr = info->key.u.ipv6.src;
1315         prio = info->key.tos;
1316         fl6.flowlabel = ip6_make_flowinfo(prio, info->key.label);
1317
             /* @sock is dereferenced unconditionally, so it must not be NULL. */
1318         dst = ipv6_stub->ipv6_dst_lookup_flow(net, sock->sk, &fl6,
1319                                               NULL);
             /* Whatever error the lookup encoded is reported as -ENETUNREACH. */
1320         if (IS_ERR(dst)) {
1321                 netdev_dbg(dev, "no route to %pI6\n", &fl6.daddr);
1322                 return ERR_PTR(-ENETUNREACH);
1323         }
1324         if (dst->dev == dev) { /* is this necessary? */
1325                 netdev_dbg(dev, "circular route to %pI6\n", &fl6.daddr);
1326                 dst_release(dst);
1327                 return ERR_PTR(-ELOOP);
1328         }
1329 #ifdef CONFIG_DST_CACHE
1330         if (use_cache)
1331                 dst_cache_set_ip6(dst_cache, dst, &fl6.saddr);
1332 #endif
             /* Hand back the source address the flow holds after the lookup. */
1333         *saddr = fl6.saddr;
1334         return dst;
1335 }
1336 EXPORT_SYMBOL_GPL(ip6_dst_lookup_tunnel);
1337
1338 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1339                                                gfp_t gfp)
1340 {
             /* NULL stays NULL; otherwise duplicate all (hdrlen + 1) * 8 bytes. */
1341         return !src ? NULL : kmemdup(src, 8 * (src->hdrlen + 1), gfp);
1342 }
1343
1344 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1345                                                 gfp_t gfp)
1346 {
             /* NULL stays NULL; otherwise duplicate all (hdrlen + 1) * 8 bytes. */
1347         return !src ? NULL : kmemdup(src, 8 * (src->hdrlen + 1), gfp);
1348 }
1349
1350 static void ip6_append_data_mtu(unsigned int *mtu,
1351                                 int *maxfraglen,
1352                                 unsigned int fragheaderlen,
1353                                 struct sk_buff *skb,
1354                                 struct rt6_info *rt,
1355                                 unsigned int orig_mtu)
1356 {
1357         /* XFRM tunnel routes: leave *mtu and *maxfraglen untouched. */
1358         if (rt->dst.flags & DST_XFRM_TUNNEL)
1359                 return;
1360
1361         /*
1362          * Only the first fragment (no skb yet) reserves header_len; for
1363          * any later fragment that space is regarded as data space.
1364          */
1365         *mtu = skb ? orig_mtu : orig_mtu - rt->dst.header_len;
1366
1367         /* The part past fragheaderlen is rounded down to a multiple of 8,
1368          * then the size of the fragment header is taken off the total.
1369          */
1370         *maxfraglen = ((*mtu - fragheaderlen) & ~7) +
1371                       fragheaderlen - sizeof(struct frag_hdr);
1372 }
1373
1374 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1375                           struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1376                           struct rt6_info *rt, struct flowi6 *fl6)
1377 {
             /*
              * Fill @cork and @v6_cork for a corked send on @sk: copy the tx
              * options from ipc6->opt (if any), take a reference on @rt, and
              * record the flow, hop limit, traffic class, fragment size, GSO
              * size, mark, timestamp flags and transmit time.
              *
              * Returns 0 on success, -EINVAL if @v6_cork already holds options,
              * or -ENOBUFS when an allocation fails.
              */
1378         struct ipv6_pinfo *np = inet6_sk(sk);
1379         unsigned int mtu;
1380         struct ipv6_txoptions *opt = ipc6->opt;
1381
1382         /*
1383          * setup for corking
1384          */
1385         if (opt) {
1386                 if (WARN_ON(v6_cork->opt))
1387                         return -EINVAL;
1388
                     /*
                      * NOTE(review): the -ENOBUFS returns below leave
                      * v6_cork->opt and any headers duplicated so far
                      * allocated; presumably the caller's cork release path
                      * frees them - confirm.
                      */
1389                 v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
1390                 if (unlikely(!v6_cork->opt))
1391                         return -ENOBUFS;
1392
1393                 v6_cork->opt->tot_len = sizeof(*opt);
1394                 v6_cork->opt->opt_flen = opt->opt_flen;
1395                 v6_cork->opt->opt_nflen = opt->opt_nflen;
1396
                     /*
                      * Each header is duplicated; a NULL copy counts as a
                      * failure only when the source header was present.
                      */
1397                 v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1398                                                     sk->sk_allocation);
1399                 if (opt->dst0opt && !v6_cork->opt->dst0opt)
1400                         return -ENOBUFS;
1401
1402                 v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1403                                                     sk->sk_allocation);
1404                 if (opt->dst1opt && !v6_cork->opt->dst1opt)
1405                         return -ENOBUFS;
1406
1407                 v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
1408                                                    sk->sk_allocation);
1409                 if (opt->hopopt && !v6_cork->opt->hopopt)
1410                         return -ENOBUFS;
1411
1412                 v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1413                                                     sk->sk_allocation);
1414                 if (opt->srcrt && !v6_cork->opt->srcrt)
1415                         return -ENOBUFS;
1416
1417                 /* need source address above miyazawa*/
1418         }
             /* The cork keeps its own reference on the route. */
1419         dst_hold(&rt->dst);
1420         cork->base.dst = &rt->dst;
1421         cork->fl.u.ip6 = *fl6;
1422         v6_cork->hop_limit = ipc6->hlimit;
1423         v6_cork->tclass = ipc6->tclass;
             /*
              * With pmtudisc >= IPV6_PMTUDISC_PROBE the device MTU is used.
              * Otherwise the MTU comes from dst_mtu() of the route itself for
              * an XFRM tunnel route, or of xfrm_dst_path() for any other route.
              */
1424         if (rt->dst.flags & DST_XFRM_TUNNEL)
1425                 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1426                       READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
1427         else
1428                 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1429                         READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
             /* A non-zero per-socket frag_size below that MTU replaces it. */
1430         if (np->frag_size < mtu) {
1431                 if (np->frag_size)
1432                         mtu = np->frag_size;
1433         }
1434         cork->base.fragsize = mtu;
1435         cork->base.gso_size = ipc6->gso_size;
1436         cork->base.tx_flags = 0;
1437         cork->base.mark = ipc6->sockc.mark;
1438         sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);
1439
1440         if (dst_allfrag(xfrm_dst_path(&rt->dst)))
1441                 cork->base.flags |= IPCORK_ALLFRAG;
1442         cork->base.length = 0;
1443
1444         cork->base.transmit_time = ipc6->sockc.transmit_time;
1445
1446         return 0;
1447 }
1448
1449 static int __ip6_append_data(struct sock *sk,
1450                              struct flowi6 *fl6,
1451                              struct sk_buff_head *queue,
1452                              struct inet_cork *cork,
1453                              struct inet6_cork *v6_cork,
1454                              struct page_frag *pfrag,
1455                              int getfrag(void *from, char *to, int offset,
1456                                          int len, int odd, struct sk_buff *skb),
1457                              void *from, int length, int transhdrlen,
1458                              unsigned int flags, struct ipcm6_cookie *ipc6)
1459 {
1460         struct sk_buff *skb, *skb_prev = NULL;
1461         unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1462         struct ubuf_info *uarg = NULL;
1463         int exthdrlen = 0;
1464         int dst_exthdrlen = 0;
1465         int hh_len;
1466         int copy;
1467         int err;
1468         int offset = 0;
1469         u32 tskey = 0;
1470         struct rt6_info *rt = (struct rt6_info *)cork->dst;
1471         struct ipv6_txoptions *opt = v6_cork->opt;
1472         int csummode = CHECKSUM_NONE;
1473         unsigned int maxnonfragsize, headersize;
1474         unsigned int wmem_alloc_delta = 0;
1475         bool paged, extra_uref = false;
1476
1477         skb = skb_peek_tail(queue);
1478         if (!skb) {
1479                 exthdrlen = opt ? opt->opt_flen : 0;
1480                 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1481         }
1482
1483         paged = !!cork->gso_size;
1484         mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
1485         orig_mtu = mtu;
1486
1487         if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
1488             sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1489                 tskey = sk->sk_tskey++;
1490
1491         hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1492
1493         fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1494                         (opt ? opt->opt_nflen : 0);
1495
1496         headersize = sizeof(struct ipv6hdr) +
1497                      (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1498                      (dst_allfrag(&rt->dst) ?
1499                       sizeof(struct frag_hdr) : 0) +
1500                      rt->rt6i_nfheader_len;
1501
1502         if (mtu <= fragheaderlen ||
1503             ((mtu - fragheaderlen) & ~7) + fragheaderlen <= sizeof(struct frag_hdr))
1504                 goto emsgsize;
1505
1506         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1507                      sizeof(struct frag_hdr);
1508
1509         /* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
1510          * the first fragment
1511          */
1512         if (headersize + transhdrlen > mtu)
1513                 goto emsgsize;
1514
1515         if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1516             (sk->sk_protocol == IPPROTO_UDP ||
1517              sk->sk_protocol == IPPROTO_RAW)) {
1518                 ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1519                                 sizeof(struct ipv6hdr));
1520                 goto emsgsize;
1521         }
1522
1523         if (ip6_sk_ignore_df(sk))
1524                 maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1525         else
1526                 maxnonfragsize = mtu;
1527
1528         if (cork->length + length > maxnonfragsize - headersize) {
1529 emsgsize:
1530                 pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1531                 ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
1532                 return -EMSGSIZE;
1533         }
1534
1535         /* CHECKSUM_PARTIAL only with no extension headers and when
1536          * we are not going to fragment
1537          */
1538         if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1539             headersize == sizeof(struct ipv6hdr) &&
1540             length <= mtu - headersize &&
1541             (!(flags & MSG_MORE) || cork->gso_size) &&
1542             rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1543                 csummode = CHECKSUM_PARTIAL;
1544
1545         if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
1546                 uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb));
1547                 if (!uarg)
1548                         return -ENOBUFS;
1549                 extra_uref = !skb_zcopy(skb);   /* only ref on new uarg */
1550                 if (rt->dst.dev->features & NETIF_F_SG &&
1551                     csummode == CHECKSUM_PARTIAL) {
1552                         paged = true;
1553                 } else {
1554                         uarg->zerocopy = 0;
1555                         skb_zcopy_set(skb, uarg, &extra_uref);
1556                 }
1557         }
1558
1559         /*
1560          * Let's try using as much space as possible.
1561          * Use MTU if total length of the message fits into the MTU.
1562          * Otherwise, we need to reserve fragment header and
1563          * fragment alignment (= 8-15 octects, in total).
1564          *
1565          * Note that we may need to "move" the data from the tail
1566          * of the buffer to the new fragment when we split
1567          * the message.
1568          *
1569          * FIXME: It may be fragmented into multiple chunks
1570          *        at once if non-fragmentable extension headers
1571          *        are too large.
1572          * --yoshfuji
1573          */
1574
1575         cork->length += length;
1576         if (!skb)
1577                 goto alloc_new_skb;
1578
1579         while (length > 0) {
1580                 /* Check if the remaining data fits into current packet. */
1581                 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1582                 if (copy < length)
1583                         copy = maxfraglen - skb->len;
1584
1585                 if (copy <= 0) {
1586                         char *data;
1587                         unsigned int datalen;
1588                         unsigned int fraglen;
1589                         unsigned int fraggap;
1590                         unsigned int alloclen, alloc_extra;
1591                         unsigned int pagedlen;
1592 alloc_new_skb:
1593                         /* There's no room in the current skb */
1594                         if (skb)
1595                                 fraggap = skb->len - maxfraglen;
1596                         else
1597                                 fraggap = 0;
1598                         /* update mtu and maxfraglen if necessary */
1599                         if (!skb || !skb_prev)
1600                                 ip6_append_data_mtu(&mtu, &maxfraglen,
1601                                                     fragheaderlen, skb, rt,
1602                                                     orig_mtu);
1603
1604                         skb_prev = skb;
1605
1606                         /*
1607                          * If remaining data exceeds the mtu,
1608                          * we know we need more fragment(s).
1609                          */
1610                         datalen = length + fraggap;
1611
1612                         if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1613                                 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1614                         fraglen = datalen + fragheaderlen;
1615                         pagedlen = 0;
1616
1617                         alloc_extra = hh_len;
1618                         alloc_extra += dst_exthdrlen;
1619                         alloc_extra += rt->dst.trailer_len;
1620
1621                         /* We just reserve space for fragment header.
1622                          * Note: this may be overallocation if the message
1623                          * (without MSG_MORE) fits into the MTU.
1624                          */
1625                         alloc_extra += sizeof(struct frag_hdr);
1626
1627                         if ((flags & MSG_MORE) &&
1628                             !(rt->dst.dev->features&NETIF_F_SG))
1629                                 alloclen = mtu;
1630                         else if (!paged &&
1631                                  (fraglen + alloc_extra < SKB_MAX_ALLOC ||
1632                                   !(rt->dst.dev->features & NETIF_F_SG)))
1633                                 alloclen = fraglen;
1634                         else {
1635                                 alloclen = min_t(int, fraglen, MAX_HEADER);
1636                                 pagedlen = fraglen - alloclen;
1637                         }
1638                         alloclen += alloc_extra;
1639
1640                         if (datalen != length + fraggap) {
1641                                 /*
1642                                  * this is not the last fragment, the trailer
1643                                  * space is regarded as data space.
1644                                  */
1645                                 datalen += rt->dst.trailer_len;
1646                         }
1647
1648                         fraglen = datalen + fragheaderlen;
1649
1650                         copy = datalen - transhdrlen - fraggap - pagedlen;
1651                         if (copy < 0) {
1652                                 err = -EINVAL;
1653                                 goto error;
1654                         }
1655                         if (transhdrlen) {
1656                                 skb = sock_alloc_send_skb(sk, alloclen,
1657                                                 (flags & MSG_DONTWAIT), &err);
1658                         } else {
1659                                 skb = NULL;
1660                                 if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
1661                                     2 * sk->sk_sndbuf)
1662                                         skb = alloc_skb(alloclen,
1663                                                         sk->sk_allocation);
1664                                 if (unlikely(!skb))
1665                                         err = -ENOBUFS;
1666                         }
1667                         if (!skb)
1668                                 goto error;
1669                         /*
1670                          *      Fill in the control structures
1671                          */
1672                         skb->protocol = htons(ETH_P_IPV6);
1673                         skb->ip_summed = csummode;
1674                         skb->csum = 0;
1675                         /* reserve for fragmentation and ipsec header */
1676                         skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1677                                     dst_exthdrlen);
1678
1679                         /*
1680                          *      Find where to start putting bytes
1681                          */
1682                         data = skb_put(skb, fraglen - pagedlen);
1683                         skb_set_network_header(skb, exthdrlen);
1684                         data += fragheaderlen;
1685                         skb->transport_header = (skb->network_header +
1686                                                  fragheaderlen);
1687                         if (fraggap) {
1688                                 skb->csum = skb_copy_and_csum_bits(
1689                                         skb_prev, maxfraglen,
1690                                         data + transhdrlen, fraggap);
1691                                 skb_prev->csum = csum_sub(skb_prev->csum,
1692                                                           skb->csum);
1693                                 data += fraggap;
1694                                 pskb_trim_unique(skb_prev, maxfraglen);
1695                         }
1696                         if (copy > 0 &&
1697                             getfrag(from, data + transhdrlen, offset,
1698                                     copy, fraggap, skb) < 0) {
1699                                 err = -EFAULT;
1700                                 kfree_skb(skb);
1701                                 goto error;
1702                         }
1703
1704                         offset += copy;
1705                         length -= copy + transhdrlen;
1706                         transhdrlen = 0;
1707                         exthdrlen = 0;
1708                         dst_exthdrlen = 0;
1709
1710                         /* Only the initial fragment is time stamped */
1711                         skb_shinfo(skb)->tx_flags = cork->tx_flags;
1712                         cork->tx_flags = 0;
1713                         skb_shinfo(skb)->tskey = tskey;
1714                         tskey = 0;
1715                         skb_zcopy_set(skb, uarg, &extra_uref);
1716
1717                         if ((flags & MSG_CONFIRM) && !skb_prev)
1718                                 skb_set_dst_pending_confirm(skb, 1);
1719
1720                         /*
1721                          * Put the packet on the pending queue
1722                          */
1723                         if (!skb->destructor) {
1724                                 skb->destructor = sock_wfree;
1725                                 skb->sk = sk;
1726                                 wmem_alloc_delta += skb->truesize;
1727                         }
1728                         __skb_queue_tail(queue, skb);
1729                         continue;
1730                 }
1731
1732                 if (copy > length)
1733                         copy = length;
1734
1735                 if (!(rt->dst.dev->features&NETIF_F_SG) &&
1736                     skb_tailroom(skb) >= copy) {
1737                         unsigned int off;
1738
1739                         off = skb->len;
1740                         if (getfrag(from, skb_put(skb, copy),
1741                                                 offset, copy, off, skb) < 0) {
1742                                 __skb_trim(skb, off);
1743                                 err = -EFAULT;
1744                                 goto error;
1745                         }
1746                 } else if (!uarg || !uarg->zerocopy) {
1747                         int i = skb_shinfo(skb)->nr_frags;
1748
1749                         err = -ENOMEM;
1750                         if (!sk_page_frag_refill(sk, pfrag))
1751                                 goto error;
1752
1753                         if (!skb_can_coalesce(skb, i, pfrag->page,
1754                                               pfrag->offset)) {
1755                                 err = -EMSGSIZE;
1756                                 if (i == MAX_SKB_FRAGS)
1757                                         goto error;
1758
1759                                 __skb_fill_page_desc(skb, i, pfrag->page,
1760                                                      pfrag->offset, 0);
1761                                 skb_shinfo(skb)->nr_frags = ++i;
1762                                 get_page(pfrag->page);
1763                         }
1764                         copy = min_t(int, copy, pfrag->size - pfrag->offset);
1765                         if (getfrag(from,
1766                                     page_address(pfrag->page) + pfrag->offset,
1767                                     offset, copy, skb->len, skb) < 0)
1768                                 goto error_efault;
1769
1770                         pfrag->offset += copy;
1771                         skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1772                         skb->len += copy;
1773                         skb->data_len += copy;
1774                         skb->truesize += copy;
1775                         wmem_alloc_delta += copy;
1776                 } else {
1777                         err = skb_zerocopy_iter_dgram(skb, from, copy);
1778                         if (err < 0)
1779                                 goto error;
1780                 }
1781                 offset += copy;
1782                 length -= copy;
1783         }
1784
1785         if (wmem_alloc_delta)
1786                 refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1787         return 0;
1788
1789 error_efault:
1790         err = -EFAULT;
1791 error:
1792         if (uarg)
1793                 sock_zerocopy_put_abort(uarg, extra_uref);
1794         cork->length -= length;
1795         IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1796         refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1797         return err;
1798 }
1799
1800 int ip6_append_data(struct sock *sk,
1801                     int getfrag(void *from, char *to, int offset, int len,
1802                                 int odd, struct sk_buff *skb),
1803                     void *from, int length, int transhdrlen,
1804                     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1805                     struct rt6_info *rt, unsigned int flags)
1806 {
1807         struct inet_sock *inet = inet_sk(sk);
1808         struct ipv6_pinfo *np = inet6_sk(sk);
1809         int exthdrlen;
1810         int err;
1811
1812         if (flags&MSG_PROBE)
1813                 return 0;
1814         if (skb_queue_empty(&sk->sk_write_queue)) {
1815                 /*
1816                  * setup for corking
1817                  */
1818                 err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1819                                      ipc6, rt, fl6);
1820                 if (err)
1821                         return err;
1822
1823                 exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1824                 length += exthdrlen;
1825                 transhdrlen += exthdrlen;
1826         } else {
1827                 fl6 = &inet->cork.fl.u.ip6;
1828                 transhdrlen = 0;
1829         }
1830
1831         return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1832                                  &np->cork, sk_page_frag(sk), getfrag,
1833                                  from, length, transhdrlen, flags, ipc6);
1834 }
1835 EXPORT_SYMBOL_GPL(ip6_append_data);
1836
1837 static void ip6_cork_release(struct inet_cork_full *cork,
1838                              struct inet6_cork *v6_cork)
1839 {
1840         if (v6_cork->opt) {
1841                 kfree(v6_cork->opt->dst0opt);
1842                 kfree(v6_cork->opt->dst1opt);
1843                 kfree(v6_cork->opt->hopopt);
1844                 kfree(v6_cork->opt->srcrt);
1845                 kfree(v6_cork->opt);
1846                 v6_cork->opt = NULL;
1847         }
1848
1849         if (cork->base.dst) {
1850                 dst_release(cork->base.dst);
1851                 cork->base.dst = NULL;
1852                 cork->base.flags &= ~IPCORK_ALLFRAG;
1853         }
1854         memset(&cork->fl, 0, sizeof(cork->fl));
1855 }
1856
1857 struct sk_buff *__ip6_make_skb(struct sock *sk,
1858                                struct sk_buff_head *queue,
1859                                struct inet_cork_full *cork,
1860                                struct inet6_cork *v6_cork)
1861 {
1862         struct sk_buff *skb, *tmp_skb;
1863         struct sk_buff **tail_skb;
1864         struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1865         struct ipv6_pinfo *np = inet6_sk(sk);
1866         struct net *net = sock_net(sk);
1867         struct ipv6hdr *hdr;
1868         struct ipv6_txoptions *opt = v6_cork->opt;
1869         struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1870         struct flowi6 *fl6 = &cork->fl.u.ip6;
1871         unsigned char proto = fl6->flowi6_proto;
1872
1873         skb = __skb_dequeue(queue);
1874         if (!skb)
1875                 goto out;
1876         tail_skb = &(skb_shinfo(skb)->frag_list);
1877
1878         /* move skb->data to ip header from ext header */
1879         if (skb->data < skb_network_header(skb))
1880                 __skb_pull(skb, skb_network_offset(skb));
1881         while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1882                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1883                 *tail_skb = tmp_skb;
1884                 tail_skb = &(tmp_skb->next);
1885                 skb->len += tmp_skb->len;
1886                 skb->data_len += tmp_skb->len;
1887                 skb->truesize += tmp_skb->truesize;
1888                 tmp_skb->destructor = NULL;
1889                 tmp_skb->sk = NULL;
1890         }
1891
1892         /* Allow local fragmentation. */
1893         skb->ignore_df = ip6_sk_ignore_df(sk);
1894
1895         *final_dst = fl6->daddr;
1896         __skb_pull(skb, skb_network_header_len(skb));
1897         if (opt && opt->opt_flen)
1898                 ipv6_push_frag_opts(skb, opt, &proto);
1899         if (opt && opt->opt_nflen)
1900                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1901
1902         skb_push(skb, sizeof(struct ipv6hdr));
1903         skb_reset_network_header(skb);
1904         hdr = ipv6_hdr(skb);
1905
1906         ip6_flow_hdr(hdr, v6_cork->tclass,
1907                      ip6_make_flowlabel(net, skb, fl6->flowlabel,
1908                                         ip6_autoflowlabel(net, np), fl6));
1909         hdr->hop_limit = v6_cork->hop_limit;
1910         hdr->nexthdr = proto;
1911         hdr->saddr = fl6->saddr;
1912         hdr->daddr = *final_dst;
1913
1914         skb->priority = sk->sk_priority;
1915         skb->mark = cork->base.mark;
1916
1917         skb->tstamp = cork->base.transmit_time;
1918
1919         skb_dst_set(skb, dst_clone(&rt->dst));
1920         IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1921         if (proto == IPPROTO_ICMPV6) {
1922                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1923
1924                 ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1925                 ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1926         }
1927
1928         ip6_cork_release(cork, v6_cork);
1929 out:
1930         return skb;
1931 }
1932
1933 int ip6_send_skb(struct sk_buff *skb)
1934 {
1935         struct net *net = sock_net(skb->sk);
1936         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1937         int err;
1938
1939         err = ip6_local_out(net, skb->sk, skb);
1940         if (err) {
1941                 if (err > 0)
1942                         err = net_xmit_errno(err);
1943                 if (err)
1944                         IP6_INC_STATS(net, rt->rt6i_idev,
1945                                       IPSTATS_MIB_OUTDISCARDS);
1946         }
1947
1948         return err;
1949 }
1950
/*
 * ip6_push_pending_frames - build and transmit whatever is pending on @sk
 *
 * Asks ip6_finish_skb() for the assembled packet.  If it yields nothing the
 * function returns 0; otherwise the packet is passed to ip6_send_skb() and
 * that function's result is returned.
 */
int ip6_push_pending_frames(struct sock *sk)
{
        struct sk_buff *pkt = ip6_finish_skb(sk);

        return pkt ? ip6_send_skb(pkt) : 0;
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1962
1963 static void __ip6_flush_pending_frames(struct sock *sk,
1964                                        struct sk_buff_head *queue,
1965                                        struct inet_cork_full *cork,
1966                                        struct inet6_cork *v6_cork)
1967 {
1968         struct sk_buff *skb;
1969
1970         while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1971                 if (skb_dst(skb))
1972                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1973                                       IPSTATS_MIB_OUTDISCARDS);
1974                 kfree_skb(skb);
1975         }
1976
1977         ip6_cork_release(cork, v6_cork);
1978 }
1979
1980 void ip6_flush_pending_frames(struct sock *sk)
1981 {
1982         __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1983                                    &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1984 }
1985 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1986
1987 struct sk_buff *ip6_make_skb(struct sock *sk,
1988                              int getfrag(void *from, char *to, int offset,
1989                                          int len, int odd, struct sk_buff *skb),
1990                              void *from, int length, int transhdrlen,
1991                              struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1992                              struct rt6_info *rt, unsigned int flags,
1993                              struct inet_cork_full *cork)
1994 {
1995         struct inet6_cork v6_cork;
1996         struct sk_buff_head queue;
1997         int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1998         int err;
1999
2000         if (flags & MSG_PROBE)
2001                 return NULL;
2002
2003         __skb_queue_head_init(&queue);
2004
2005         cork->base.flags = 0;
2006         cork->base.addr = 0;
2007         cork->base.opt = NULL;
2008         cork->base.dst = NULL;
2009         v6_cork.opt = NULL;
2010         err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6);
2011         if (err) {
2012                 ip6_cork_release(cork, &v6_cork);
2013                 return ERR_PTR(err);
2014         }
2015         if (ipc6->dontfrag < 0)
2016                 ipc6->dontfrag = inet6_sk(sk)->dontfrag;
2017
2018         err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork,
2019                                 &current->task_frag, getfrag, from,
2020                                 length + exthdrlen, transhdrlen + exthdrlen,
2021                                 flags, ipc6);
2022         if (err) {
2023                 __ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
2024                 return ERR_PTR(err);
2025         }
2026
2027         return __ip6_make_skb(sk, &queue, cork, &v6_cork);
2028 }