ip6_output.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  *      IPv6 output functions
   4  *      Linux INET6 implementation
   5  *
   6  *      Authors:
   7  *      Pedro Roque             <roque@di.fc.ul.pt>
   8  *
   9  *      Based on linux/net/ipv4/ip_output.c
  10  *
  11  *      Changes:
  12  *      A.N.Kuznetsov   :       arithmetics in fragmentation.
  13  *                              extension headers are implemented.
  14  *                              route changes now work.
  15  *                              ip6_forward does not confuse sniffers.
  16  *                              etc.
  17  *
  18  *      H. von Brand    :       Added missing #include <linux/string.h>
  19  *      Imran Patel     :       frag id should be in NBO
  20  *      Kazunori MIYAZAWA @USAGI
  21  *                      :       add ip6_append_data and related functions
  22  *                              for datagram xmit
  23  */
  24
  25 #include <linux/errno.h>
  26 #include <linux/kernel.h>
  27 #include <linux/string.h>
  28 #include <linux/socket.h>
  29 #include <linux/net.h>
  30 #include <linux/netdevice.h>
  31 #include <linux/if_arp.h>
  32 #include <linux/in6.h>
  33 #include <linux/tcp.h>
  34 #include <linux/route.h>
  35 #include <linux/module.h>
  36 #include <linux/slab.h>
  37
  38 #include <linux/bpf-cgroup.h>
  39 #include <linux/netfilter.h>
  40 #include <linux/netfilter_ipv6.h>
  41
  42 #include <net/sock.h>
  43 #include <net/snmp.h>
  44
  45 #include <net/ipv6.h>
  46 #include <net/ndisc.h>
  47 #include <net/protocol.h>
  48 #include <net/ip6_route.h>
  49 #include <net/addrconf.h>
  50 #include <net/rawv6.h>
  51 #include <net/icmp.h>
  52 #include <net/xfrm.h>
  53 #include <net/checksum.h>
  54 #include <linux/mroute6.h>
  55 #include <net/l3mdev.h>
  56 #include <net/lwtunnel.h>
  57
  58 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
  59 {
  60         struct dst_entry *dst = skb_dst(skb);
  61         struct net_device *dev = dst->dev;
  62         unsigned int hh_len = LL_RESERVED_SPACE(dev);
  63         int delta = hh_len - skb_headroom(skb);
  64         const struct in6_addr *nexthop;
  65         struct neighbour *neigh;
  66         int ret;
  67
  68         /* Be paranoid, rather than too clever. */
  69         if (unlikely(delta > 0) && dev->header_ops) {
  70                 /* pskb_expand_head() might crash, if skb is shared */
  71                 if (skb_shared(skb)) {
  72                         struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
  73
  74                         if (likely(nskb)) {
  75                                 if (skb->sk)
  76                                         skb_set_owner_w(nskb, skb->sk);
  77                                 consume_skb(skb);
  78                         } else {
  79                                 kfree_skb(skb);
  80                         }
  81                         skb = nskb;
  82                 }
  83                 if (skb &&
  84                     pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
  85                         kfree_skb(skb);
  86                         skb = NULL;
  87                 }
  88                 if (!skb) {
  89                         IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
  90                         return -ENOMEM;
  91                 }
  92         }
  93
  94         if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
  95                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
  96
  97                 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
  98                     ((mroute6_is_socket(net, skb) &&
  99                      !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
 100                      ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
 101                                          &ipv6_hdr(skb)->saddr))) {
 102                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
 103
 104                         /* Do not check for IFF_ALLMULTI; multicast routing
 105                            is not supported in any case.
 106                          */
 107                         if (newskb)
 108                                 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
 109                                         net, sk, newskb, NULL, newskb->dev,
 110                                         dev_loopback_xmit);
 111
 112                         if (ipv6_hdr(skb)->hop_limit == 0) {
 113                                 IP6_INC_STATS(net, idev,
 114                                               IPSTATS_MIB_OUTDISCARDS);
 115                                 kfree_skb(skb);
 116                                 return 0;
 117                         }
 118                 }
 119
 120                 IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
 121
 122                 if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
 123                     IPV6_ADDR_SCOPE_NODELOCAL &&
 124                     !(dev->flags & IFF_LOOPBACK)) {
 125                         kfree_skb(skb);
 126                         return 0;
 127                 }
 128         }
 129
 130         if (lwtunnel_xmit_redirect(dst->lwtstate)) {
 131                 int res = lwtunnel_xmit(skb);
 132
 133                 if (res < 0 || res == LWTUNNEL_XMIT_DONE)
 134                         return res;
 135         }
 136
 137         rcu_read_lock_bh();
 138         nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
 139         neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
 140         if (unlikely(!neigh))
 141                 neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
 142         if (!IS_ERR(neigh)) {
 143                 sock_confirm_neigh(skb, neigh);
 144                 ret = neigh_output(neigh, skb, false);
 145                 rcu_read_unlock_bh();
 146                 return ret;
 147         }
 148         rcu_read_unlock_bh();
 149
 150         IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
 151         kfree_skb(skb);
 152         return -EINVAL;
 153 }
 154
 155 static int
 156 ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
 157                                     struct sk_buff *skb, unsigned int mtu)
 158 {
 159         struct sk_buff *segs, *nskb;
 160         netdev_features_t features;
 161         int ret = 0;
 162
 163         /* Please see corresponding comment in ip_finish_output_gso
 164          * describing the cases where GSO segment length exceeds the
 165          * egress MTU.
 166          */
 167         features = netif_skb_features(skb);
 168         segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
 169         if (IS_ERR_OR_NULL(segs)) {
 170                 kfree_skb(skb);
 171                 return -ENOMEM;
 172         }
 173
 174         consume_skb(skb);
 175
 176         skb_list_walk_safe(segs, segs, nskb) {
 177                 int err;
 178
 179                 skb_mark_not_on_list(segs);
 180                 err = ip6_fragment(net, sk, segs, ip6_finish_output2);
 181                 if (err && ret == 0)
 182                         ret = err;
 183         }
 184
 185         return ret;
 186 }
 187
 188 static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 189 {
 190         unsigned int mtu;
 191
 192 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
 193         /* Policy lookup after SNAT yielded a new policy */
 194         if (skb_dst(skb)->xfrm) {
 195                 IP6CB(skb)->flags |= IP6SKB_REROUTED;
 196                 return dst_output(net, sk, skb);
 197         }
 198 #endif
 199
 200         mtu = ip6_skb_dst_mtu(skb);
 201         if (skb_is_gso(skb) && !skb_gso_validate_network_len(skb, mtu))
 202                 return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);
 203
 204         if ((skb->len > mtu && !skb_is_gso(skb)) ||
 205             dst_allfrag(skb_dst(skb)) ||
 206             (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
 207                 return ip6_fragment(net, sk, skb, ip6_finish_output2);
 208         else
 209                 return ip6_finish_output2(net, sk, skb);
 210 }
 211
 212 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 213 {
 214         int ret;
 215
 216         ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
 217         switch (ret) {
 218         case NET_XMIT_SUCCESS:
 219                 return __ip6_finish_output(net, sk, skb);
 220         case NET_XMIT_CN:
 221                 return __ip6_finish_output(net, sk, skb) ? : ret;
 222         default:
 223                 kfree_skb(skb);
 224                 return ret;
 225         }
 226 }
 227
 228 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 229 {
 230         struct net_device *dev = skb_dst(skb)->dev;
 231         struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
 232
 233         skb->protocol = htons(ETH_P_IPV6);
 234         skb->dev = dev;
 235
 236         if (unlikely(idev->cnf.disable_ipv6)) {
 237                 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
 238                 kfree_skb(skb);
 239                 return 0;
 240         }
 241
 242         return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
 243                             net, sk, skb, NULL, dev,
 244                             ip6_finish_output,
 245                             !(IP6CB(skb)->flags & IP6SKB_REROUTED));
 246 }
 247
 248 bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
 249 {
 250         if (!np->autoflowlabel_set)
 251                 return ip6_default_np_autolabel(net);
 252         else
 253                 return np->autoflowlabel;
 254 }
 255
 256 /*
 257  * xmit an sk_buff (used by TCP, SCTP and DCCP)
 258  * Note : socket lock is not held for SYNACK packets, but might be modified
 259  * by calls to skb_set_owner_w() and ipv6_local_error(),
 260  * which are using proper atomic operations or spinlocks.
 261  */
 262 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
 263              __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
 264 {
 265         struct net *net = sock_net(sk);
 266         const struct ipv6_pinfo *np = inet6_sk(sk);
 267         struct in6_addr *first_hop = &fl6->daddr;
 268         struct dst_entry *dst = skb_dst(skb);
 269         unsigned int head_room;
 270         struct ipv6hdr *hdr;
 271         u8  proto = fl6->flowi6_proto;
 272         int seg_len = skb->len;
 273         int hlimit = -1;
 274         u32 mtu;
 275
 276         head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
 277         if (opt)
 278                 head_room += opt->opt_nflen + opt->opt_flen;
 279
 280         if (unlikely(skb_headroom(skb) < head_room)) {
 281                 struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
 282                 if (!skb2) {
 283                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 284                                       IPSTATS_MIB_OUTDISCARDS);
 285                         kfree_skb(skb);
 286                         return -ENOBUFS;
 287                 }
 288                 if (skb->sk)
 289                         skb_set_owner_w(skb2, skb->sk);
 290                 consume_skb(skb);
 291                 skb = skb2;
 292         }
 293
 294         if (opt) {
 295                 seg_len += opt->opt_nflen + opt->opt_flen;
 296
 297                 if (opt->opt_flen)
 298                         ipv6_push_frag_opts(skb, opt, &proto);
 299
 300                 if (opt->opt_nflen)
 301                         ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
 302                                              &fl6->saddr);
 303         }
 304
 305         skb_push(skb, sizeof(struct ipv6hdr));
 306         skb_reset_network_header(skb);
 307         hdr = ipv6_hdr(skb);
 308
 309         /*
 310          *      Fill in the IPv6 header
 311          */
 312         if (np)
 313                 hlimit = np->hop_limit;
 314         if (hlimit < 0)
 315                 hlimit = ip6_dst_hoplimit(dst);
 316
 317         ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
 318                                 ip6_autoflowlabel(net, np), fl6));
 319
 320         hdr->payload_len = htons(seg_len);
 321         hdr->nexthdr = proto;
 322         hdr->hop_limit = hlimit;
 323
 324         hdr->saddr = fl6->saddr;
 325         hdr->daddr = *first_hop;
 326
 327         skb->protocol = htons(ETH_P_IPV6);
 328         skb->priority = priority;
 329         skb->mark = mark;
 330
 331         mtu = dst_mtu(dst);
 332         if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
 333                 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
 334                               IPSTATS_MIB_OUT, skb->len);
 335
 336                 /* if egress device is enslaved to an L3 master device pass the
 337                  * skb to its handler for processing
 338                  */
 339                 skb = l3mdev_ip6_out((struct sock *)sk, skb);
 340                 if (unlikely(!skb))
 341                         return 0;
 342
 343                 /* hooks should never assume socket lock is held.
 344                  * we promote our socket to non const
 345                  */
 346                 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
 347                                net, (struct sock *)sk, skb, NULL, dst->dev,
 348                                dst_output);
 349         }
 350
 351         skb->dev = dst->dev;
 352         /* ipv6_local_error() does not require socket lock,
 353          * we promote our socket to non const
 354          */
 355         ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
 356
 357         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
 358         kfree_skb(skb);
 359         return -EMSGSIZE;
 360 }
 361 EXPORT_SYMBOL(ip6_xmit);
 362
 363 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
 364 {
 365         struct ip6_ra_chain *ra;
 366         struct sock *last = NULL;
 367
 368         read_lock(&ip6_ra_lock);
 369         for (ra = ip6_ra_chain; ra; ra = ra->next) {
 370                 struct sock *sk = ra->sk;
 371                 if (sk && ra->sel == sel &&
 372                     (!sk->sk_bound_dev_if ||
 373                      sk->sk_bound_dev_if == skb->dev->ifindex)) {
 374                         struct ipv6_pinfo *np = inet6_sk(sk);
 375
 376                         if (np && np->rtalert_isolate &&
 377                             !net_eq(sock_net(sk), dev_net(skb->dev))) {
 378                                 continue;
 379                         }
 380                         if (last) {
 381                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
 382                                 if (skb2)
 383                                         rawv6_rcv(last, skb2);
 384                         }
 385                         last = sk;
 386                 }
 387         }
 388
 389         if (last) {
 390                 rawv6_rcv(last, skb);
 391                 read_unlock(&ip6_ra_lock);
 392                 return 1;
 393         }
 394         read_unlock(&ip6_ra_lock);
 395         return 0;
 396 }
 397
 398 static int ip6_forward_proxy_check(struct sk_buff *skb)
 399 {
 400         struct ipv6hdr *hdr = ipv6_hdr(skb);
 401         u8 nexthdr = hdr->nexthdr;
 402         __be16 frag_off;
 403         int offset;
 404
 405         if (ipv6_ext_hdr(nexthdr)) {
 406                 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
 407                 if (offset < 0)
 408                         return 0;
 409         } else
 410                 offset = sizeof(struct ipv6hdr);
 411
 412         if (nexthdr == IPPROTO_ICMPV6) {
 413                 struct icmp6hdr *icmp6;
 414
 415                 if (!pskb_may_pull(skb, (skb_network_header(skb) +
 416                                          offset + 1 - skb->data)))
 417                         return 0;
 418
 419                 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
 420
 421                 switch (icmp6->icmp6_type) {
 422                 case NDISC_ROUTER_SOLICITATION:
 423                 case NDISC_ROUTER_ADVERTISEMENT:
 424                 case NDISC_NEIGHBOUR_SOLICITATION:
 425                 case NDISC_NEIGHBOUR_ADVERTISEMENT:
 426                 case NDISC_REDIRECT:
 427                         /* For reaction involving unicast neighbor discovery
 428                          * message destined to the proxied address, pass it to
 429                          * input function.
 430                          */
 431                         return 1;
 432                 default:
 433                         break;
 434                 }
 435         }
 436
 437         /*
 438          * The proxying router can't forward traffic sent to a link-local
 439          * address, so signal the sender and discard the packet. This
 440          * behavior is clarified by the MIPv6 specification.
 441          */
 442         if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
 443                 dst_link_failure(skb);
 444                 return -1;
 445         }
 446
 447         return 0;
 448 }
 449
 450 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
 451                                      struct sk_buff *skb)
 452 {
 453         struct dst_entry *dst = skb_dst(skb);
 454
 455         __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
 456         __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
 457
 458 #ifdef CONFIG_NET_SWITCHDEV
 459         if (skb->offload_l3_fwd_mark) {
 460                 consume_skb(skb);
 461                 return 0;
 462         }
 463 #endif
 464
 465         skb->tstamp = 0;
 466         return dst_output(net, sk, skb);
 467 }
 468
 469 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
 470 {
 471         if (skb->len <= mtu)
 472                 return false;
 473
 474         /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
 475         if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
 476                 return true;
 477
 478         if (skb->ignore_df)
 479                 return false;
 480
 481         if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
 482                 return false;
 483
 484         return true;
 485 }
 486
 487 int ip6_forward(struct sk_buff *skb)
 488 {
 489         struct inet6_dev *idev = __in6_dev_get_safely(skb->dev);
 490         struct dst_entry *dst = skb_dst(skb);
 491         struct ipv6hdr *hdr = ipv6_hdr(skb);
 492         struct inet6_skb_parm *opt = IP6CB(skb);
 493         struct net *net = dev_net(dst->dev);
 494         u32 mtu;
 495
 496         if (net->ipv6.devconf_all->forwarding == 0)
 497                 goto error;
 498
 499         if (skb->pkt_type != PACKET_HOST)
 500                 goto drop;
 501
 502         if (unlikely(skb->sk))
 503                 goto drop;
 504
 505         if (skb_warn_if_lro(skb))
 506                 goto drop;
 507
 508         if (!net->ipv6.devconf_all->disable_policy &&
 509             (!idev || !idev->cnf.disable_policy) &&
 510             !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
 511                 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
 512                 goto drop;
 513         }
 514
 515         skb_forward_csum(skb);
 516
 517         /*
 518          *      We DO NOT make any processing on
 519          *      RA packets, pushing them to user level AS IS
 520          *      without ane WARRANTY that application will be able
 521          *      to interpret them. The reason is that we
 522          *      cannot make anything clever here.
 523          *
 524          *      We are not end-node, so that if packet contains
 525          *      AH/ESP, we cannot make anything.
 526          *      Defragmentation also would be mistake, RA packets
 527          *      cannot be fragmented, because there is no warranty
 528          *      that different fragments will go along one path. --ANK
 529          */
 530         if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
 531                 if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
 532                         return 0;
 533         }
 534
 535         /*
 536          *      check and decrement ttl
 537          */
 538         if (hdr->hop_limit <= 1) {
 539                 /* Force OUTPUT device used as source address */
 540                 skb->dev = dst->dev;
 541                 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
 542                 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
 543
 544                 kfree_skb(skb);
 545                 return -ETIMEDOUT;
 546         }
 547
 548         /* XXX: idev->cnf.proxy_ndp? */
 549         if (net->ipv6.devconf_all->proxy_ndp &&
 550             pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
 551                 int proxied = ip6_forward_proxy_check(skb);
 552                 if (proxied > 0)
 553                         return ip6_input(skb);
 554                 else if (proxied < 0) {
 555                         __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
 556                         goto drop;
 557                 }
 558         }
 559
 560         if (!xfrm6_route_forward(skb)) {
 561                 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
 562                 goto drop;
 563         }
 564         dst = skb_dst(skb);
 565
 566         /* IPv6 specs say nothing about it, but it is clear that we cannot
 567            send redirects to source routed frames.
 568            We don't send redirects to frames decapsulated from IPsec.
 569          */
 570         if (IP6CB(skb)->iif == dst->dev->ifindex &&
 571             opt->srcrt == 0 && !skb_sec_path(skb)) {
 572                 struct in6_addr *target = NULL;
 573                 struct inet_peer *peer;
 574                 struct rt6_info *rt;
 575
 576                 /*
 577                  *      incoming and outgoing devices are the same
 578                  *      send a redirect.
 579                  */
 580
 581                 rt = (struct rt6_info *) dst;
 582                 if (rt->rt6i_flags & RTF_GATEWAY)
 583                         target = &rt->rt6i_gateway;
 584                 else
 585                         target = &hdr->daddr;
 586
 587                 peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
 588
 589                 /* Limit redirects both by destination (here)
 590                    and by source (inside ndisc_send_redirect)
 591                  */
 592                 if (inet_peer_xrlim_allow(peer, 1*HZ))
 593                         ndisc_send_redirect(skb, target);
 594                 if (peer)
 595                         inet_putpeer(peer);
 596         } else {
 597                 int addrtype = ipv6_addr_type(&hdr->saddr);
 598
 599                 /* This check is security critical. */
 600                 if (addrtype == IPV6_ADDR_ANY ||
 601                     addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
 602                         goto error;
 603                 if (addrtype & IPV6_ADDR_LINKLOCAL) {
 604                         icmpv6_send(skb, ICMPV6_DEST_UNREACH,
 605                                     ICMPV6_NOT_NEIGHBOUR, 0);
 606                         goto error;
 607                 }
 608         }
 609
 610         mtu = ip6_dst_mtu_forward(dst);
 611         if (mtu < IPV6_MIN_MTU)
 612                 mtu = IPV6_MIN_MTU;
 613
 614         if (ip6_pkt_too_big(skb, mtu)) {
 615                 /* Again, force OUTPUT device used as source address */
 616                 skb->dev = dst->dev;
 617                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 618                 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
 619                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
 620                                 IPSTATS_MIB_FRAGFAILS);
 621                 kfree_skb(skb);
 622                 return -EMSGSIZE;
 623         }
 624
 625         if (skb_cow(skb, dst->dev->hard_header_len)) {
 626                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
 627                                 IPSTATS_MIB_OUTDISCARDS);
 628                 goto drop;
 629         }
 630
 631         hdr = ipv6_hdr(skb);
 632
 633         /* Mangling hops number delayed to point after skb COW */
 634
 635         hdr->hop_limit--;
 636
 637         return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
 638                        net, NULL, skb, skb->dev, dst->dev,
 639                        ip6_forward_finish);
 640
 641 error:
 642         __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
 643 drop:
 644         kfree_skb(skb);
 645         return -EINVAL;
 646 }
 647
 648 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 649 {
 650         to->pkt_type = from->pkt_type;
 651         to->priority = from->priority;
 652         to->protocol = from->protocol;
 653         skb_dst_drop(to);
 654         skb_dst_set(to, dst_clone(skb_dst(from)));
 655         to->dev = from->dev;
 656         to->mark = from->mark;
 657
 658         skb_copy_hash(to, from);
 659
 660 #ifdef CONFIG_NET_SCHED
 661         to->tc_index = from->tc_index;
 662 #endif
 663         nf_copy(to, from);
 664         skb_ext_copy(to, from);
 665         skb_copy_secmark(to, from);
 666 }
 667
 668 int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
 669                       u8 nexthdr, __be32 frag_id,
 670                       struct ip6_fraglist_iter *iter)
 671 {
 672         unsigned int first_len;
 673         struct frag_hdr *fh;
 674
 675         /* BUILD HEADER */
 676         *prevhdr = NEXTHDR_FRAGMENT;
 677         iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
 678         if (!iter->tmp_hdr)
 679                 return -ENOMEM;
 680
 681         iter->frag = skb_shinfo(skb)->frag_list;
 682         skb_frag_list_init(skb);
 683
 684         iter->offset = 0;
 685         iter->hlen = hlen;
 686         iter->frag_id = frag_id;
 687         iter->nexthdr = nexthdr;
 688
 689         __skb_pull(skb, hlen);
 690         fh = __skb_push(skb, sizeof(struct frag_hdr));
 691         __skb_push(skb, hlen);
 692         skb_reset_network_header(skb);
 693         memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);
 694
 695         fh->nexthdr = nexthdr;
 696         fh->reserved = 0;
 697         fh->frag_off = htons(IP6_MF);
 698         fh->identification = frag_id;
 699
 700         first_len = skb_pagelen(skb);
 701         skb->data_len = first_len - skb_headlen(skb);
 702         skb->len = first_len;
 703         ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));
 704
 705         return 0;
 706 }
 707 EXPORT_SYMBOL(ip6_fraglist_init);
 708
 709 void ip6_fraglist_prepare(struct sk_buff *skb,
 710                           struct ip6_fraglist_iter *iter)
 711 {
 712         struct sk_buff *frag = iter->frag;
 713         unsigned int hlen = iter->hlen;
 714         struct frag_hdr *fh;
 715
 716         frag->ip_summed = CHECKSUM_NONE;
 717         skb_reset_transport_header(frag);
 718         fh = __skb_push(frag, sizeof(struct frag_hdr));
 719         __skb_push(frag, hlen);
 720         skb_reset_network_header(frag);
 721         memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
 722         iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
 723         fh->nexthdr = iter->nexthdr;
 724         fh->reserved = 0;
 725         fh->frag_off = htons(iter->offset);
 726         if (frag->next)
 727                 fh->frag_off |= htons(IP6_MF);
 728         fh->identification = iter->frag_id;
 729         ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
 730         ip6_copy_metadata(frag, skb);
 731 }
 732 EXPORT_SYMBOL(ip6_fraglist_prepare);
 733
 734 void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
 735                    unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
 736                    u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
 737 {
 738         state->prevhdr = prevhdr;
 739         state->nexthdr = nexthdr;
 740         state->frag_id = frag_id;
 741
 742         state->hlen = hlen;
 743         state->mtu = mtu;
 744
 745         state->left = skb->len - hlen;  /* Space per frame */
 746         state->ptr = hlen;              /* Where to start from */
 747
 748         state->hroom = hdr_room;
 749         state->troom = needed_tailroom;
 750
 751         state->offset = 0;
 752 }
 753 EXPORT_SYMBOL(ip6_frag_init);
 754
 755 struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
 756 {
 757         u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
 758         struct sk_buff *frag;
 759         struct frag_hdr *fh;
 760         unsigned int len;
 761
 762         len = state->left;
 763         /* IF: it doesn't fit, use 'mtu' - the data space left */
 764         if (len > state->mtu)
 765                 len = state->mtu;
 766         /* IF: we are not sending up to and including the packet end
 767            then align the next start on an eight byte boundary */
 768         if (len < state->left)
 769                 len &= ~7;
 770
 771         /* Allocate buffer */
 772         frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
 773                          state->hroom + state->troom, GFP_ATOMIC);
 774         if (!frag)
 775                 return ERR_PTR(-ENOMEM);
 776
 777         /*
 778          *      Set up data on packet
 779          */
 780
 781         ip6_copy_metadata(frag, skb);
 782         skb_reserve(frag, state->hroom);
 783         skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
 784         skb_reset_network_header(frag);
 785         fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
 786         frag->transport_header = (frag->network_header + state->hlen +
 787                                   sizeof(struct frag_hdr));
 788
 789         /*
 790          *      Charge the memory for the fragment to any owner
 791          *      it might possess
 792          */
 793         if (skb->sk)
 794                 skb_set_owner_w(frag, skb->sk);
 795
 796         /*
 797          *      Copy the packet header into the new buffer.
 798          */
 799         skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);
 800
 801         fragnexthdr_offset = skb_network_header(frag);
 802         fragnexthdr_offset += prevhdr - skb_network_header(skb);
 803         *fragnexthdr_offset = NEXTHDR_FRAGMENT;
 804
 805         /*
 806          *      Build fragment header.
 807          */
 808         fh->nexthdr = state->nexthdr;
 809         fh->reserved = 0;
 810         fh->identification = state->frag_id;
 811
 812         /*
 813          *      Copy a block of the IP datagram.
 814          */
 815         BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
 816                              len));
 817         state->left -= len;
 818
 819         fh->frag_off = htons(state->offset);
 820         if (state->left > 0)
 821                 fh->frag_off |= htons(IP6_MF);
 822         ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
 823
 824         state->ptr += len;
 825         state->offset += len;
 826
 827         return frag;
 828 }
 829 EXPORT_SYMBOL(ip6_frag_next);
 830
     /**
      *      ip6_fragment - split an IPv6 packet into fragments and transmit them
      *      @net: network namespace used for statistics and fragment id selection
      *      @sk: socket handed through unchanged to @output
      *      @skb: packet to fragment; freed here on failure and after the slow
      *            path, handed to @output itself on the fast path
      *      @output: callback invoked once for every fragment produced
      *
      *      Fast path: when @skb carries a frag_list whose members already have
      *      a usable geometry, the list members are turned into fragments in
      *      place.  Otherwise the slow path copies the payload into newly
      *      allocated skbs obtained from ip6_frag_next().
      *
      *      Returns 0 on success or a negative errno.  When the packet may not
      *      be fragmented or cannot fit, ICMPV6_PKT_TOOBIG is sent and
      *      -EMSGSIZE is returned.
      */
 831 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
 832                  int (*output)(struct net *, struct sock *, struct sk_buff *))
 833 {
 834         struct sk_buff *frag;
 835         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
 836         struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
 837                                 inet6_sk(skb->sk) : NULL;
 838         struct ip6_frag_state state;
 839         unsigned int mtu, hlen, nexthdr_offset;
 840         ktime_t tstamp = skb->tstamp;
 841         int hroom, err = 0;
 842         __be32 frag_id;
 843         u8 *prevhdr, nexthdr = 0;
 844
 845         err = ip6_find_1stfragopt(skb, &prevhdr);
 846         if (err < 0)
 847                 goto fail;
 848         hlen = err;
 849         nexthdr = *prevhdr;
             /* Remember the position as an offset so prevhdr can be rebuilt below. */
 850         nexthdr_offset = prevhdr - skb_network_header(skb);
 851
 852         mtu = ip6_skb_dst_mtu(skb);
 853
 854         /* We must not fragment if the socket is set to force MTU discovery
 855          * or if the skb it not generated by a local socket.
 856          */
 857         if (unlikely(!skb->ignore_df && skb->len > mtu))
 858                 goto fail_toobig;
 859
 860         if (IP6CB(skb)->frag_max_size) {
 861                 if (IP6CB(skb)->frag_max_size > mtu)
 862                         goto fail_toobig;
 863
 864                 /* don't send fragments larger than what we received */
 865                 mtu = IP6CB(skb)->frag_max_size;
 866                 if (mtu < IPV6_MIN_MTU)
 867                         mtu = IPV6_MIN_MTU;
 868         }
 869
             /* A non-zero per-socket frag_size smaller than the MTU takes over. */
 870         if (np && np->frag_size < mtu) {
 871                 if (np->frag_size)
 872                         mtu = np->frag_size;
 873         }
             /* Each fragment needs room for the headers plus at least 8 bytes. */
 874         if (mtu < hlen + sizeof(struct frag_hdr) + 8)
 875                 goto fail_toobig;
             /* From here on mtu is the payload budget of a single fragment. */
 876         mtu -= hlen + sizeof(struct frag_hdr);
 877
 878         frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
 879                                     &ipv6_hdr(skb)->saddr);
 880
 881         if (skb->ip_summed == CHECKSUM_PARTIAL &&
 882             (err = skb_checksum_help(skb)))
 883                 goto fail;
 884
             /* Rebuild prevhdr from the saved offset; presumably
              * skb_checksum_help() can relocate the header - TODO confirm.
              */
 885         prevhdr = skb_network_header(skb) + nexthdr_offset;
 886         hroom = LL_RESERVED_SPACE(rt->dst.dev);
 887         if (skb_has_frag_list(skb)) {
 888                 unsigned int first_len = skb_pagelen(skb);
 889                 struct ip6_fraglist_iter iter;
 890                 struct sk_buff *frag2;
 891
 892                 if (first_len - hlen > mtu ||
 893                     ((first_len - hlen) & 7) ||
 894                     skb_cloned(skb) ||
 895                     skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
 896                         goto slow_path;
 897
 898                 skb_walk_frags(skb, frag) {
 899                         /* Correct geometry. */
 900                         if (frag->len > mtu ||
 901                             ((frag->len & 7) && frag->next) ||
 902                             skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
 903                                 goto slow_path_clean;
 904
 905                         /* Partially cloned skb? */
 906                         if (skb_shared(frag))
 907                                 goto slow_path_clean;
 908
 909                         BUG_ON(frag->sk);
 910                         if (skb->sk) {
 911                                 frag->sk = skb->sk;
 912                                 frag->destructor = sock_wfree;
 913                         }
 914                         skb->truesize -= frag->truesize;
 915                 }
 916
 917                 err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
 918                                         &iter);
 919                 if (err < 0)
 920                         goto fail;
 921
                     /* @rt is only pinned by the dst reference that @skb holds,
                      * and output() may free @skb.  Stay inside an RCU read-side
                      * section so @rt cannot be freed while the statistics
                      * below still dereference it.
                      */
                     rcu_read_lock();

 922                 for (;;) {
 923                         /* Prepare header of the next frame,
 924                          * before previous one went down. */
 925                         if (iter.frag)
 926                                 ip6_fraglist_prepare(skb, &iter);
 927
 928                         skb->tstamp = tstamp;
 929                         err = output(net, sk, skb);
 930                         if (!err)
 931                                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 932                                               IPSTATS_MIB_FRAGCREATES);
 933
 934                         if (err || !iter.frag)
 935                                 break;
 936
 937                         skb = ip6_fraglist_next(&iter);
 938                 }
 939
 940                 kfree(iter.tmp_hdr);
 941
 942                 if (err == 0) {
 943                         IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 944                                       IPSTATS_MIB_FRAGOKS);
                             rcu_read_unlock();
 945                         return 0;
 946                 }
 947
                     /* Drop the fragments that were never handed to output(). */
 948                 kfree_skb_list(iter.frag);
 949
 950                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 951                               IPSTATS_MIB_FRAGFAILS);
                     rcu_read_unlock();
 952                 return err;
 953
 954 slow_path_clean:
                     /* Undo the ownership/truesize changes made to the frags
                      * visited before the one that failed the checks.
                      */
 955                 skb_walk_frags(skb, frag2) {
 956                         if (frag2 == frag)
 957                                 break;
 958                         frag2->sk = NULL;
 959                         frag2->destructor = NULL;
 960                         skb->truesize += frag2->truesize;
 961                 }
 962         }
 963
 964 slow_path:
 965         /*
 966          *      Fragment the datagram.
 967          */
 968
 969         ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
 970                       LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
 971                       &state);
 972
 973         /*
 974          *      Keep copying data until we run out.
 975          */
 976
 977         while (state.left > 0) {
 978                 frag = ip6_frag_next(skb, &state);
 979                 if (IS_ERR(frag)) {
 980                         err = PTR_ERR(frag);
 981                         goto fail;
 982                 }
 983
 984                 /*
 985                  *      Put this fragment into the sending queue.
 986                  */
 987                 frag->tstamp = tstamp;
 988                 err = output(net, sk, frag);
 989                 if (err)
 990                         goto fail;
 991
                     /* The original @skb is still owned here, so its dst is valid. */
 992                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 993                               IPSTATS_MIB_FRAGCREATES);
 994         }
 995         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 996                       IPSTATS_MIB_FRAGOKS);
 997         consume_skb(skb);
 998         return err;
 999
1000 fail_toobig:
1001         if (skb->sk && dst_allfrag(skb_dst(skb)))
1002                 sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
1003
1004         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
1005         err = -EMSGSIZE;
1006
1007 fail:
1008         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1009                       IPSTATS_MIB_FRAGFAILS);
1010         kfree_skb(skb);
1011         return err;
1012 }
1013
     /*
      * Tell whether a cached route is stale for the address used by a flow.
      *
      * Returns non-zero (the cached route must not be reused) only when both
      * of the following hold:
      *  - @rt_key is not a /128 host route for exactly @fl_addr, and
      *  - @addr_cache is NULL or differs from @fl_addr.
      * Returns 0 as soon as either the host-route key or the cached address
      * matches @fl_addr.
      */
1014 static inline int ip6_rt_check(const struct rt6key *rt_key,
1015                                const struct in6_addr *fl_addr,
1016                                const struct in6_addr *addr_cache)
1017 {
1018         return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
1019                 (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
1020 }
1021
     /*
      * Validate a socket's cached route against the flow about to be sent.
      *
      * @sk:  socket whose ipv6_pinfo supplies daddr_cache / saddr_cache
      * @dst: cached route to validate, may be NULL
      * @fl6: flow describing the packet to be routed
      *
      * Returns @dst when it is still usable.  Returns NULL when @dst is NULL,
      * is not an AF_INET6 route, fails ip6_rt_check() for the destination
      * (and, with CONFIG_IPV6_SUBTREES, the source), or does not match the
      * requested output interface.  In the rejection cases the reference on
      * @dst is dropped with dst_release().
      */
1022 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
1023                                           struct dst_entry *dst,
1024                                           const struct flowi6 *fl6)
1025 {
1026         struct ipv6_pinfo *np = inet6_sk(sk);
1027         struct rt6_info *rt;
1028
1029         if (!dst)
1030                 goto out;
1031
             /* Only IPv6 routes may be cast to struct rt6_info below. */
1032         if (dst->ops->family != AF_INET6) {
1033                 dst_release(dst);
1034                 return NULL;
1035         }
1036
1037         rt = (struct rt6_info *)dst;
1038         /* Yes, checking route validity in not connected
1039          * case is not very simple. Take into account,
1040          * that we do not support routing by source, TOS,
1041          * and MSG_DONTROUTE            --ANK (980726)
1042          *
1043          * 1. ip6_rt_check(): If route was host route,
1044          *    check that cached destination is current.
1045          *    If it is network route, we still may
1046          *    check its validity using saved pointer
1047          *    to the last used address: daddr_cache.
1048          *    We do not want to save whole address now,
1049          *    (because main consumer of this service
1050          *    is tcp, which has not this problem),
1051          *    so that the last trick works only on connected
1052          *    sockets.
1053          * 2. oif also should be the same.
1054          */
1055         if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
1056 #ifdef CONFIG_IPV6_SUBTREES
1057             ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
1058 #endif
1059            (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
1060               (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
                     /* Stale: drop our reference and report "no cached route". */
1061                 dst_release(dst);
1062                 dst = NULL;
1063         }
1064
1065 out:
1066         return dst;
1067 }
1068
     /*
      * Common tail of the ip6_dst_lookup*() helpers.
      *
      * @net: network namespace the lookup is performed in
      * @sk:  socket providing source address preferences, may be NULL
      * @dst: in/out route pointer; *@dst may already hold a route on entry
      * @fl6: flow to look up; fl6->saddr is filled in when it is unspecified
      *
      * Steps, in order:
      *  1. If fl6->saddr is unspecified and *@dst is NULL or error-free, look
      *     up a route when none was supplied and pick a source address with
      *     ip6_route_get_saddr().
      *  2. Make sure *@dst is populated and error-free.
      *  3. With CONFIG_IPV6_OPTIMISTIC_DAD, possibly swap the route for the
      *     one obtained with an unspecified destination (see below).
      *  4. Reject a v4-mapped source combined with a destination that is
      *     neither v4-mapped nor unspecified.
      *
      * Returns 0 with *@dst set on success.  On failure the route is
      * released, *@dst is set to NULL and a negative errno is returned;
      * -ENETUNREACH additionally bumps IPSTATS_MIB_OUTNOROUTES.
      */
1069 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
1070                                struct dst_entry **dst, struct flowi6 *fl6)
1071 {
1072 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1073         struct neighbour *n;
1074         struct rt6_info *rt;
1075 #endif
1076         int err;
1077         int flags = 0;
1078
1079         /* The correct way to handle this would be to do
1080          * ip6_route_get_saddr, and then ip6_route_output; however,
1081          * the route-specific preferred source forces the
1082          * ip6_route_output call _before_ ip6_route_get_saddr.
1083          *
1084          * In source specific routing (no src=any default route),
1085          * ip6_route_output will fail given src=any saddr, though, so
1086          * that's why we try it again later.
1087          */
1088         if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
1089                 struct fib6_info *from;
1090                 struct rt6_info *rt;
1091                 bool had_dst = *dst != NULL;
1092
1093                 if (!had_dst)
1094                         *dst = ip6_route_output(net, sk, fl6);
                     /* An erroneous route contributes no hint to saddr selection. */
1095                 rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
1096
                     /* rt->from is read with rcu_dereference(), hence the RCU section. */
1097                 rcu_read_lock();
1098                 from = rt ? rcu_dereference(rt->from) : NULL;
1099                 err = ip6_route_get_saddr(net, from, &fl6->daddr,
1100                                           sk ? inet6_sk(sk)->srcprefs : 0,
1101                                           &fl6->saddr);
1102                 rcu_read_unlock();
1103
1104                 if (err)
1105                         goto out_err_release;
1106
1107                 /* If we had an erroneous initial result, pretend it
1108                  * never existed and let the SA-enabled version take
1109                  * over.
1110                  */
1111                 if (!had_dst && (*dst)->error) {
1112                         dst_release(*dst);
1113                         *dst = NULL;
1114                 }
1115
1116                 if (fl6->flowi6_oif)
1117                         flags |= RT6_LOOKUP_F_IFACE;
1118         }
1119
             /* Either nothing was looked up yet or the first attempt was dropped. */
1120         if (!*dst)
1121                 *dst = ip6_route_output_flags(net, sk, fl6, flags);
1122
1123         err = (*dst)->error;
1124         if (err)
1125                 goto out_err_release;
1126
1127 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1128         /*
1129          * Here if the dst entry we've looked up
1130          * has a neighbour entry that is in the INCOMPLETE
1131          * state and the src address from the flow is
1132          * marked as OPTIMISTIC, we release the found
1133          * dst entry and replace it instead with the
1134          * dst entry of the nexthop router
1135          */
1136         rt = (struct rt6_info *) *dst;
1137         rcu_read_lock_bh();
1138         n = __ipv6_neigh_lookup_noref(rt->dst.dev,
1139                                       rt6_nexthop(rt, &fl6->daddr));
             /* err is only a local flag here: set when a neighbour entry
              * exists but carries none of the NUD_VALID state bits.
              */
1140         err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
1141         rcu_read_unlock_bh();
1142
1143         if (err) {
1144                 struct inet6_ifaddr *ifp;
1145                 struct flowi6 fl_gw6;
1146                 int redirect;
1147
1148                 ifp = ipv6_get_ifaddr(net, &fl6->saddr,
1149                                       (*dst)->dev, 1);
1150
1151                 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
1152                 if (ifp)
1153                         in6_ifa_put(ifp);
1154
1155                 if (redirect) {
1156                         /*
1157                          * We need to get the dst entry for the
1158                          * default router instead
1159                          */
1160                         dst_release(*dst);
                             /* Same flow, but with the destination zeroed out. */
1161                         memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1162                         memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1163                         *dst = ip6_route_output(net, sk, &fl_gw6);
1164                         err = (*dst)->error;
1165                         if (err)
1166                                 goto out_err_release;
1167                 }
1168         }
1169 #endif
             /* A v4-mapped source is only accepted together with a v4-mapped
              * or unspecified destination.
              */
1170         if (ipv6_addr_v4mapped(&fl6->saddr) &&
1171             !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
1172                 err = -EAFNOSUPPORT;
1173                 goto out_err_release;
1174         }
1175
1176         return 0;
1177
1178 out_err_release:
1179         dst_release(*dst);
1180         *dst = NULL;
1181
1182         if (err == -ENETUNREACH)
1183                 IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1184         return err;
1185 }
1186
1187 /**
1188  *      ip6_dst_lookup - perform route lookup on flow
      *      @net: network namespace the lookup is performed in
1189  *      @sk: socket which provides route info
1190  *      @dst: pointer to dst_entry * for result
1191  *      @fl6: flow to lookup
1192  *
1193  *      This function performs a route lookup on the given flow.
      *      Any value previously stored in *@dst is overwritten without
      *      being released.
1194  *
1195  *      It returns zero on success, or a standard errno code on error.
      *      On error *@dst is left set to NULL by ip6_dst_lookup_tail().
1196  */
1197 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1198                    struct flowi6 *fl6)
1199 {
             /* Start from "no route supplied" so the tail always does a lookup. */
1200         *dst = NULL;
1201         return ip6_dst_lookup_tail(net, sk, dst, fl6);
1202 }
1203 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1204
1205 /**
1206  *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
      *      @net: network namespace the lookup is performed in
1207  *      @sk: socket which provides route info
1208  *      @fl6: flow to lookup
1209  *      @final_dst: final destination address for ipsec lookup
1210  *
1211  *      This function performs a route lookup on the given flow.
      *      When @final_dst is non-NULL it replaces fl6->daddr after the
      *      route lookup and before the result is passed to
      *      xfrm_lookup_route().
1212  *
1213  *      It returns a valid dst pointer on success, or a pointer encoded
1214  *      error code.
1215  */
1216 struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
1217                                       const struct in6_addr *final_dst)
1218 {
1219         struct dst_entry *dst = NULL;
1220         int err;
1221
1222         err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
1223         if (err)
1224                 return ERR_PTR(err);
             /* The routing lookup used fl6->daddr as given; the xfrm lookup
              * below sees @final_dst instead.
              */
1225         if (final_dst)
1226                 fl6->daddr = *final_dst;
1227
1228         return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
1229 }
1230 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1231
1232 /**
1233  *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1234  *      @sk: socket which provides the dst cache and route info
1235  *      @fl6: flow to lookup
1236  *      @final_dst: final destination address for ipsec lookup
1237  *      @connected: whether @sk is connected or not
1238  *
1239  *      This function performs a route lookup on the given flow with the
1240  *      possibility of using the cached route in the socket if it is valid.
1241  *      It will take the socket dst lock when operating on the dst cache.
1242  *      As a result, this function can only be used in process context.
1243  *
1244  *      In addition, for a connected socket, cache the dst in the socket
1245  *      if the current cache is not valid.
1246  *
1247  *      It returns a valid dst pointer on success, or a pointer encoded
1248  *      error code.
1249  */
1250 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1251                                          const struct in6_addr *final_dst,
1252                                          bool connected)
1253 {
1254         struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1255
             /* Returns the cached route if it still fits @fl6, NULL otherwise. */
1256         dst = ip6_sk_dst_check(sk, dst, fl6);
1257         if (dst)
1258                 return dst;
1259
             /* Cache miss: do a full lookup in the socket's namespace. */
1260         dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
             /* The cache gets its own reference via dst_clone(); the caller
              * receives the one returned by the lookup.
              */
1261         if (connected && !IS_ERR(dst))
1262                 ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);
1263
1264         return dst;
1265 }
1266 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1267
     /*
      * Duplicate an IPv6 option header with kmemdup().
      *
      * Copies (src->hdrlen + 1) * 8 bytes starting at @src, allocated with
      * @gfp.  Returns NULL when @src is NULL; callers that must tell this
      * apart from a failed duplication have to check @src themselves.
      */
1268 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1269                                                gfp_t gfp)
1270 {
1271         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1272 }
1273
     /*
      * Duplicate an IPv6 routing header with kmemdup().
      *
      * Copies (src->hdrlen + 1) * 8 bytes starting at @src, allocated with
      * @gfp.  Returns NULL when @src is NULL; callers that must tell this
      * apart from a failed duplication have to check @src themselves.
      */
1274 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1275                                                 gfp_t gfp)
1276 {
1277         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1278 }
1279
     /*
      * Recompute *@mtu and *@maxfraglen for the next skb of an append.
      *
      * @mtu:           out: usable MTU for the skb about to be allocated
      * @maxfraglen:    out: derived from *@mtu, see below
      * @fragheaderlen: length of the headers repeated in every fragment
      * @skb:           current tail skb, NULL when the first skb is being built
      * @rt:            route; its dst.flags and dst.header_len are consulted
      * @orig_mtu:      MTU before any header_len reservation
      *
      * Nothing is changed when the route has DST_XFRM_TUNNEL set.  Otherwise
      * rt->dst.header_len is subtracted from @orig_mtu only for the first skb,
      * and *@maxfraglen becomes the space after @fragheaderlen rounded down
      * to a multiple of 8, plus @fragheaderlen, minus sizeof(struct frag_hdr).
      */
1280 static void ip6_append_data_mtu(unsigned int *mtu,
1281                                 int *maxfraglen,
1282                                 unsigned int fragheaderlen,
1283                                 struct sk_buff *skb,
1284                                 struct rt6_info *rt,
1285                                 unsigned int orig_mtu)
1286 {
1287         if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1288                 if (!skb) {
1289                         /* first fragment, reserve header_len */
1290                         *mtu = orig_mtu - rt->dst.header_len;
1291
1292                 } else {
1293                         /*
1294                          * this fragment is not first, the headers
1295                          * space is regarded as data space.
1296                          */
1297                         *mtu = orig_mtu;
1298                 }
                     /* "& ~7" rounds the per-fragment data space down to 8 bytes. */
1299                 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1300                               + fragheaderlen - sizeof(struct frag_hdr);
1301         }
1302 }
1303
     /*
      * Initialise the cork state used while a datagram is being assembled.
      *
      * @sk:      sending socket; supplies sk_allocation and per-socket settings
      * @cork:    generic cork; receives dst, flow, fragsize, gso_size, tx_flags,
      *           mark, length and transmit_time
      * @v6_cork: IPv6 cork; receives a private copy of the tx options plus
      *           hop_limit and tclass
      * @ipc6:    per-call cookie providing options, hlimit, tclass, gso_size
      *           and the sockc fields
      * @rt:      route to pin in the cork; a reference is taken with dst_hold()
      * @fl6:     flow, copied by value into the cork
      *
      * Returns 0 on success, -EINVAL if @v6_cork already holds options, or
      * -ENOBUFS when an allocation fails.
      *
      * NOTE(review): the -ENOBUFS returns leave v6_cork->opt and any headers
      * duplicated so far allocated; presumably the caller releases the cork
      * on error - confirm against the callers.
      */
1304 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1305                           struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1306                           struct rt6_info *rt, struct flowi6 *fl6)
1307 {
1308         struct ipv6_pinfo *np = inet6_sk(sk);
1309         unsigned int mtu;
1310         struct ipv6_txoptions *opt = ipc6->opt;
1311
1312         /*
1313          * setup for corking
1314          */
1315         if (opt) {
1316                 if (WARN_ON(v6_cork->opt))
1317                         return -EINVAL;
1318
1319                 v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
1320                 if (unlikely(!v6_cork->opt))
1321                         return -ENOBUFS;
1322
                     /* Only the container is accounted in tot_len here; each
                      * extension header is duplicated separately below.
                      */
1323                 v6_cork->opt->tot_len = sizeof(*opt);
1324                 v6_cork->opt->opt_flen = opt->opt_flen;
1325                 v6_cork->opt->opt_nflen = opt->opt_nflen;
1326
                     /* The dup helpers return NULL for a NULL source too, so a
                      * failure is detected as "source set but copy missing".
                      */
1327                 v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1328                                                     sk->sk_allocation);
1329                 if (opt->dst0opt && !v6_cork->opt->dst0opt)
1330                         return -ENOBUFS;
1331
1332                 v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1333                                                     sk->sk_allocation);
1334                 if (opt->dst1opt && !v6_cork->opt->dst1opt)
1335                         return -ENOBUFS;
1336
1337                 v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
1338                                                    sk->sk_allocation);
1339                 if (opt->hopopt && !v6_cork->opt->hopopt)
1340                         return -ENOBUFS;
1341
1342                 v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1343                                                     sk->sk_allocation);
1344                 if (opt->srcrt && !v6_cork->opt->srcrt)
1345                         return -ENOBUFS;
1346
1347                 /* need source address above miyazawa*/
1348         }
             /* The cork keeps its own reference on the route. */
1349         dst_hold(&rt->dst);
1350         cork->base.dst = &rt->dst;
1351         cork->fl.u.ip6 = *fl6;
1352         v6_cork->hop_limit = ipc6->hlimit;
1353         v6_cork->tclass = ipc6->tclass;
             /* With pmtudisc >= IPV6_PMTUDISC_PROBE the device MTU is used;
              * otherwise the route MTU, taken from the dst itself for xfrm
              * tunnel routes and from xfrm_dst_path() for all others.
              */
1354         if (rt->dst.flags & DST_XFRM_TUNNEL)
1355                 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1356                       READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
1357         else
1358                 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1359                         READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
             /* A non-zero per-socket frag_size smaller than the MTU takes over. */
1360         if (np->frag_size < mtu) {
1361                 if (np->frag_size)
1362                         mtu = np->frag_size;
1363         }
1364         cork->base.fragsize = mtu;
1365         cork->base.gso_size = ipc6->gso_size;
1366         cork->base.tx_flags = 0;
1367         cork->base.mark = ipc6->sockc.mark;
1368         sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);
1369
1370         if (dst_allfrag(xfrm_dst_path(&rt->dst)))
1371                 cork->base.flags |= IPCORK_ALLFRAG;
1372         cork->base.length = 0;
1373
1374         cork->base.transmit_time = ipc6->sockc.transmit_time;
1375
1376         return 0;
1377 }
1378
1379 static int __ip6_append_data(struct sock *sk,
1380                              struct flowi6 *fl6,
1381                              struct sk_buff_head *queue,
1382                              struct inet_cork *cork,
1383                              struct inet6_cork *v6_cork,
1384                              struct page_frag *pfrag,
1385                              int getfrag(void *from, char *to, int offset,
1386                                          int len, int odd, struct sk_buff *skb),
1387                              void *from, int length, int transhdrlen,
1388                              unsigned int flags, struct ipcm6_cookie *ipc6)
1389 {
1390         struct sk_buff *skb, *skb_prev = NULL;
1391         unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1392         struct ubuf_info *uarg = NULL;
1393         int exthdrlen = 0;
1394         int dst_exthdrlen = 0;
1395         int hh_len;
1396         int copy;
1397         int err;
1398         int offset = 0;
1399         u32 tskey = 0;
1400         struct rt6_info *rt = (struct rt6_info *)cork->dst;
1401         struct ipv6_txoptions *opt = v6_cork->opt;
1402         int csummode = CHECKSUM_NONE;
1403         unsigned int maxnonfragsize, headersize;
1404         unsigned int wmem_alloc_delta = 0;
1405         bool paged, extra_uref = false;
1406
1407         skb = skb_peek_tail(queue);
1408         if (!skb) {
1409                 exthdrlen = opt ? opt->opt_flen : 0;
1410                 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1411         }
1412
1413         paged = !!cork->gso_size;
1414         mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
1415         orig_mtu = mtu;
1416
1417         if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
1418             sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1419                 tskey = sk->sk_tskey++;
1420
1421         hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1422
1423         fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1424                         (opt ? opt->opt_nflen : 0);
1425
1426         headersize = sizeof(struct ipv6hdr) +
1427                      (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1428                      (dst_allfrag(&rt->dst) ?
1429                       sizeof(struct frag_hdr) : 0) +
1430                      rt->rt6i_nfheader_len;
1431
1432         if (mtu <= fragheaderlen ||
1433             ((mtu - fragheaderlen) & ~7) + fragheaderlen <= sizeof(struct frag_hdr))
1434                 goto emsgsize;
1435
1436         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1437                      sizeof(struct frag_hdr);
1438
1439         /* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
1440          * the first fragment
1441          */
1442         if (headersize + transhdrlen > mtu)
1443                 goto emsgsize;
1444
1445         if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1446             (sk->sk_protocol == IPPROTO_UDP ||
1447              sk->sk_protocol == IPPROTO_RAW)) {
1448                 ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1449                                 sizeof(struct ipv6hdr));
1450                 goto emsgsize;
1451         }
1452
1453         if (ip6_sk_ignore_df(sk))
1454                 maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1455         else
1456                 maxnonfragsize = mtu;
1457
1458         if (cork->length + length > maxnonfragsize - headersize) {
1459 emsgsize:
1460                 pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1461                 ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
1462                 return -EMSGSIZE;
1463         }
1464
1465         /* CHECKSUM_PARTIAL only with no extension headers and when
1466          * we are not going to fragment
1467          */
1468         if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1469             headersize == sizeof(struct ipv6hdr) &&
1470             length <= mtu - headersize &&
1471             (!(flags & MSG_MORE) || cork->gso_size) &&
1472             rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1473                 csummode = CHECKSUM_PARTIAL;
1474
1475         if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
1476                 uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb));
1477                 if (!uarg)
1478                         return -ENOBUFS;
1479                 extra_uref = !skb_zcopy(skb);   /* only ref on new uarg */
1480                 if (rt->dst.dev->features & NETIF_F_SG &&
1481                     csummode == CHECKSUM_PARTIAL) {
1482                         paged = true;
1483                 } else {
1484                         uarg->zerocopy = 0;
1485                         skb_zcopy_set(skb, uarg, &extra_uref);
1486                 }
1487         }
1488
1489         /*
1490          * Let's try using as much space as possible.
1491          * Use MTU if total length of the message fits into the MTU.
1492          * Otherwise, we need to reserve fragment header and
1493          * fragment alignment (= 8-15 octects, in total).
1494          *
1495          * Note that we may need to "move" the data from the tail of
1496          * of the buffer to the new fragment when we split
1497          * the message.
1498          *
1499          * FIXME: It may be fragmented into multiple chunks
1500          *        at once if non-fragmentable extension headers
1501          *        are too large.
1502          * --yoshfuji
1503          */
1504
1505         cork->length += length;
1506         if (!skb)
1507                 goto alloc_new_skb;
1508
1509         while (length > 0) {
1510                 /* Check if the remaining data fits into current packet. */
1511                 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1512                 if (copy < length)
1513                         copy = maxfraglen - skb->len;
1514
1515                 if (copy <= 0) {
1516                         char *data;
1517                         unsigned int datalen;
1518                         unsigned int fraglen;
1519                         unsigned int fraggap;
1520                         unsigned int alloclen, alloc_extra;
1521                         unsigned int pagedlen;
1522 alloc_new_skb:
1523                         /* There's no room in the current skb */
1524                         if (skb)
1525                                 fraggap = skb->len - maxfraglen;
1526                         else
1527                                 fraggap = 0;
1528                         /* update mtu and maxfraglen if necessary */
1529                         if (!skb || !skb_prev)
1530                                 ip6_append_data_mtu(&mtu, &maxfraglen,
1531                                                     fragheaderlen, skb, rt,
1532                                                     orig_mtu);
1533
1534                         skb_prev = skb;
1535
1536                         /*
1537                          * If remaining data exceeds the mtu,
1538                          * we know we need more fragment(s).
1539                          */
1540                         datalen = length + fraggap;
1541
1542                         if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1543                                 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1544                         fraglen = datalen + fragheaderlen;
1545                         pagedlen = 0;
1546
1547                         alloc_extra = hh_len;
1548                         alloc_extra += dst_exthdrlen;
1549                         alloc_extra += rt->dst.trailer_len;
1550
1551                         /* We just reserve space for fragment header.
1552                          * Note: this may be overallocation if the message
1553                          * (without MSG_MORE) fits into the MTU.
1554                          */
1555                         alloc_extra += sizeof(struct frag_hdr);
1556
1557                         if ((flags & MSG_MORE) &&
1558                             !(rt->dst.dev->features&NETIF_F_SG))
1559                                 alloclen = mtu;
1560                         else if (!paged &&
1561                                  (fraglen + alloc_extra < SKB_MAX_ALLOC ||
1562                                   !(rt->dst.dev->features & NETIF_F_SG)))
1563                                 alloclen = fraglen;
1564                         else {
1565                                 alloclen = min_t(int, fraglen, MAX_HEADER);
1566                                 pagedlen = fraglen - alloclen;
1567                         }
1568                         alloclen += alloc_extra;
1569
1570                         if (datalen != length + fraggap) {
1571                                 /*
1572                                  * this is not the last fragment, the trailer
1573                                  * space is regarded as data space.
1574                                  */
1575                                 datalen += rt->dst.trailer_len;
1576                         }
1577
1578                         fraglen = datalen + fragheaderlen;
1579
1580                         copy = datalen - transhdrlen - fraggap - pagedlen;
1581                         if (copy < 0) {
1582                                 err = -EINVAL;
1583                                 goto error;
1584                         }
1585                         if (transhdrlen) {
1586                                 skb = sock_alloc_send_skb(sk, alloclen,
1587                                                 (flags & MSG_DONTWAIT), &err);
1588                         } else {
1589                                 skb = NULL;
1590                                 if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
1591                                     2 * sk->sk_sndbuf)
1592                                         skb = alloc_skb(alloclen,
1593                                                         sk->sk_allocation);
1594                                 if (unlikely(!skb))
1595                                         err = -ENOBUFS;
1596                         }
1597                         if (!skb)
1598                                 goto error;
1599                         /*
1600                          *      Fill in the control structures
1601                          */
1602                         skb->protocol = htons(ETH_P_IPV6);
1603                         skb->ip_summed = csummode;
1604                         skb->csum = 0;
1605                         /* reserve for fragmentation and ipsec header */
1606                         skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1607                                     dst_exthdrlen);
1608
1609                         /*
1610                          *      Find where to start putting bytes
1611                          */
1612                         data = skb_put(skb, fraglen - pagedlen);
1613                         skb_set_network_header(skb, exthdrlen);
1614                         data += fragheaderlen;
1615                         skb->transport_header = (skb->network_header +
1616                                                  fragheaderlen);
1617                         if (fraggap) {
1618                                 skb->csum = skb_copy_and_csum_bits(
1619                                         skb_prev, maxfraglen,
1620                                         data + transhdrlen, fraggap, 0);
1621                                 skb_prev->csum = csum_sub(skb_prev->csum,
1622                                                           skb->csum);
1623                                 data += fraggap;
1624                                 pskb_trim_unique(skb_prev, maxfraglen);
1625                         }
1626                         if (copy > 0 &&
1627                             getfrag(from, data + transhdrlen, offset,
1628                                     copy, fraggap, skb) < 0) {
1629                                 err = -EFAULT;
1630                                 kfree_skb(skb);
1631                                 goto error;
1632                         }
1633
1634                         offset += copy;
1635                         length -= copy + transhdrlen;
1636                         transhdrlen = 0;
1637                         exthdrlen = 0;
1638                         dst_exthdrlen = 0;
1639
1640                         /* Only the initial fragment is time stamped */
1641                         skb_shinfo(skb)->tx_flags = cork->tx_flags;
1642                         cork->tx_flags = 0;
1643                         skb_shinfo(skb)->tskey = tskey;
1644                         tskey = 0;
1645                         skb_zcopy_set(skb, uarg, &extra_uref);
1646
1647                         if ((flags & MSG_CONFIRM) && !skb_prev)
1648                                 skb_set_dst_pending_confirm(skb, 1);
1649
1650                         /*
1651                          * Put the packet on the pending queue
1652                          */
1653                         if (!skb->destructor) {
1654                                 skb->destructor = sock_wfree;
1655                                 skb->sk = sk;
1656                                 wmem_alloc_delta += skb->truesize;
1657                         }
1658                         __skb_queue_tail(queue, skb);
1659                         continue;
1660                 }
1661
1662                 if (copy > length)
1663                         copy = length;
1664
1665                 if (!(rt->dst.dev->features&NETIF_F_SG) &&
1666                     skb_tailroom(skb) >= copy) {
1667                         unsigned int off;
1668
1669                         off = skb->len;
1670                         if (getfrag(from, skb_put(skb, copy),
1671                                                 offset, copy, off, skb) < 0) {
1672                                 __skb_trim(skb, off);
1673                                 err = -EFAULT;
1674                                 goto error;
1675                         }
1676                 } else if (!uarg || !uarg->zerocopy) {
1677                         int i = skb_shinfo(skb)->nr_frags;
1678
1679                         err = -ENOMEM;
1680                         if (!sk_page_frag_refill(sk, pfrag))
1681                                 goto error;
1682
1683                         if (!skb_can_coalesce(skb, i, pfrag->page,
1684                                               pfrag->offset)) {
1685                                 err = -EMSGSIZE;
1686                                 if (i == MAX_SKB_FRAGS)
1687                                         goto error;
1688
1689                                 __skb_fill_page_desc(skb, i, pfrag->page,
1690                                                      pfrag->offset, 0);
1691                                 skb_shinfo(skb)->nr_frags = ++i;
1692                                 get_page(pfrag->page);
1693                         }
1694                         copy = min_t(int, copy, pfrag->size - pfrag->offset);
1695                         if (getfrag(from,
1696                                     page_address(pfrag->page) + pfrag->offset,
1697                                     offset, copy, skb->len, skb) < 0)
1698                                 goto error_efault;
1699
1700                         pfrag->offset += copy;
1701                         skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1702                         skb->len += copy;
1703                         skb->data_len += copy;
1704                         skb->truesize += copy;
1705                         wmem_alloc_delta += copy;
1706                 } else {
1707                         err = skb_zerocopy_iter_dgram(skb, from, copy);
1708                         if (err < 0)
1709                                 goto error;
1710                 }
1711                 offset += copy;
1712                 length -= copy;
1713         }
1714
1715         if (wmem_alloc_delta)
1716                 refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1717         return 0;
1718
1719 error_efault:
1720         err = -EFAULT;
1721 error:
1722         if (uarg)
1723                 sock_zerocopy_put_abort(uarg, extra_uref);
1724         cork->length -= length;
1725         IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1726         refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1727         return err;
1728 }
1729
1730 int ip6_append_data(struct sock *sk,
1731                     int getfrag(void *from, char *to, int offset, int len,
1732                                 int odd, struct sk_buff *skb),
1733                     void *from, int length, int transhdrlen,
1734                     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1735                     struct rt6_info *rt, unsigned int flags)
1736 {
1737         struct inet_sock *inet = inet_sk(sk);
1738         struct ipv6_pinfo *np = inet6_sk(sk);
1739         int exthdrlen;
1740         int err;
1741
1742         if (flags&MSG_PROBE)
1743                 return 0;
1744         if (skb_queue_empty(&sk->sk_write_queue)) {
1745                 /*
1746                  * setup for corking
1747                  */
1748                 err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1749                                      ipc6, rt, fl6);
1750                 if (err)
1751                         return err;
1752
1753                 exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1754                 length += exthdrlen;
1755                 transhdrlen += exthdrlen;
1756         } else {
1757                 fl6 = &inet->cork.fl.u.ip6;
1758                 transhdrlen = 0;
1759         }
1760
1761         return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1762                                  &np->cork, sk_page_frag(sk), getfrag,
1763                                  from, length, transhdrlen, flags, ipc6);
1764 }
1765 EXPORT_SYMBOL_GPL(ip6_append_data);
1766
1767 static void ip6_cork_release(struct inet_cork_full *cork,
1768                              struct inet6_cork *v6_cork)
1769 {
1770         if (v6_cork->opt) {
1771                 kfree(v6_cork->opt->dst0opt);
1772                 kfree(v6_cork->opt->dst1opt);
1773                 kfree(v6_cork->opt->hopopt);
1774                 kfree(v6_cork->opt->srcrt);
1775                 kfree(v6_cork->opt);
1776                 v6_cork->opt = NULL;
1777         }
1778
1779         if (cork->base.dst) {
1780                 dst_release(cork->base.dst);
1781                 cork->base.dst = NULL;
1782                 cork->base.flags &= ~IPCORK_ALLFRAG;
1783         }
1784         memset(&cork->fl, 0, sizeof(cork->fl));
1785 }
1786
1787 struct sk_buff *__ip6_make_skb(struct sock *sk,
1788                                struct sk_buff_head *queue,
1789                                struct inet_cork_full *cork,
1790                                struct inet6_cork *v6_cork)
1791 {
1792         struct sk_buff *skb, *tmp_skb;
1793         struct sk_buff **tail_skb;
1794         struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1795         struct ipv6_pinfo *np = inet6_sk(sk);
1796         struct net *net = sock_net(sk);
1797         struct ipv6hdr *hdr;
1798         struct ipv6_txoptions *opt = v6_cork->opt;
1799         struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1800         struct flowi6 *fl6 = &cork->fl.u.ip6;
1801         unsigned char proto = fl6->flowi6_proto;
1802
1803         skb = __skb_dequeue(queue);
1804         if (!skb)
1805                 goto out;
1806         tail_skb = &(skb_shinfo(skb)->frag_list);
1807
1808         /* move skb->data to ip header from ext header */
1809         if (skb->data < skb_network_header(skb))
1810                 __skb_pull(skb, skb_network_offset(skb));
1811         while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1812                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1813                 *tail_skb = tmp_skb;
1814                 tail_skb = &(tmp_skb->next);
1815                 skb->len += tmp_skb->len;
1816                 skb->data_len += tmp_skb->len;
1817                 skb->truesize += tmp_skb->truesize;
1818                 tmp_skb->destructor = NULL;
1819                 tmp_skb->sk = NULL;
1820         }
1821
1822         /* Allow local fragmentation. */
1823         skb->ignore_df = ip6_sk_ignore_df(sk);
1824
1825         *final_dst = fl6->daddr;
1826         __skb_pull(skb, skb_network_header_len(skb));
1827         if (opt && opt->opt_flen)
1828                 ipv6_push_frag_opts(skb, opt, &proto);
1829         if (opt && opt->opt_nflen)
1830                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1831
1832         skb_push(skb, sizeof(struct ipv6hdr));
1833         skb_reset_network_header(skb);
1834         hdr = ipv6_hdr(skb);
1835
1836         ip6_flow_hdr(hdr, v6_cork->tclass,
1837                      ip6_make_flowlabel(net, skb, fl6->flowlabel,
1838                                         ip6_autoflowlabel(net, np), fl6));
1839         hdr->hop_limit = v6_cork->hop_limit;
1840         hdr->nexthdr = proto;
1841         hdr->saddr = fl6->saddr;
1842         hdr->daddr = *final_dst;
1843
1844         skb->priority = sk->sk_priority;
1845         skb->mark = cork->base.mark;
1846
1847         skb->tstamp = cork->base.transmit_time;
1848
1849         skb_dst_set(skb, dst_clone(&rt->dst));
1850         IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1851         if (proto == IPPROTO_ICMPV6) {
1852                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1853
1854                 ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1855                 ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1856         }
1857
1858         ip6_cork_release(cork, v6_cork);
1859 out:
1860         return skb;
1861 }
1862
1863 int ip6_send_skb(struct sk_buff *skb)
1864 {
1865         struct net *net = sock_net(skb->sk);
1866         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1867         int err;
1868
1869         err = ip6_local_out(net, skb->sk, skb);
1870         if (err) {
1871                 if (err > 0)
1872                         err = net_xmit_errno(err);
1873                 if (err)
1874                         IP6_INC_STATS(net, rt->rt6i_idev,
1875                                       IPSTATS_MIB_OUTDISCARDS);
1876         }
1877
1878         return err;
1879 }
1880
/*
 * ip6_push_pending_frames - build the pending packet for @sk and send it.
 *
 * Returns 0 when ip6_finish_skb() yields no packet, otherwise the result
 * of ip6_send_skb().
 */
int ip6_push_pending_frames(struct sock *sk)
{
        struct sk_buff *skb = ip6_finish_skb(sk);

        return skb ? ip6_send_skb(skb) : 0;
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1892
1893 static void __ip6_flush_pending_frames(struct sock *sk,
1894                                        struct sk_buff_head *queue,
1895                                        struct inet_cork_full *cork,
1896                                        struct inet6_cork *v6_cork)
1897 {
1898         struct sk_buff *skb;
1899
1900         while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1901                 if (skb_dst(skb))
1902                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1903                                       IPSTATS_MIB_OUTDISCARDS);
1904                 kfree_skb(skb);
1905         }
1906
1907         ip6_cork_release(cork, v6_cork);
1908 }
1909
1910 void ip6_flush_pending_frames(struct sock *sk)
1911 {
1912         __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1913                                    &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1914 }
1915 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1916
1917 struct sk_buff *ip6_make_skb(struct sock *sk,
1918                              int getfrag(void *from, char *to, int offset,
1919                                          int len, int odd, struct sk_buff *skb),
1920                              void *from, int length, int transhdrlen,
1921                              struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1922                              struct rt6_info *rt, unsigned int flags,
1923                              struct inet_cork_full *cork)
1924 {
1925         struct inet6_cork v6_cork;
1926         struct sk_buff_head queue;
1927         int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1928         int err;
1929
1930         if (flags & MSG_PROBE)
1931                 return NULL;
1932
1933         __skb_queue_head_init(&queue);
1934
1935         cork->base.flags = 0;
1936         cork->base.addr = 0;
1937         cork->base.opt = NULL;
1938         cork->base.dst = NULL;
1939         v6_cork.opt = NULL;
1940         err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6);
1941         if (err) {
1942                 ip6_cork_release(cork, &v6_cork);
1943                 return ERR_PTR(err);
1944         }
1945         if (ipc6->dontfrag < 0)
1946                 ipc6->dontfrag = inet6_sk(sk)->dontfrag;
1947
1948         err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork,
1949                                 &current->task_frag, getfrag, from,
1950                                 length + exthdrlen, transhdrlen + exthdrlen,
1951                                 flags, ipc6);
1952         if (err) {
1953                 __ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
1954                 return ERR_PTR(err);
1955         }
1956
1957         return __ip6_make_skb(sk, &queue, cork, &v6_cork);
1958 }