net/ipv6/ip6_output.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  *      IPv6 output functions
   4  *      Linux INET6 implementation
   5  *
   6  *      Authors:
   7  *      Pedro Roque             <roque@di.fc.ul.pt>
   8  *
   9  *      Based on linux/net/ipv4/ip_output.c
  10  *
  11  *      Changes:
  12  *      A.N.Kuznetsov   :       airthmetics in fragmentation.
  13  *                              extension headers are implemented.
  14  *                              route changes now work.
  15  *                              ip6_forward does not confuse sniffers.
  16  *                              etc.
  17  *
  18  *      H. von Brand    :       Added missing #include <linux/string.h>
  19  *      Imran Patel     :       frag id should be in NBO
  20  *      Kazunori MIYAZAWA @USAGI
  21  *                      :       add ip6_append_data and related functions
  22  *                              for datagram xmit
  23  */
  24
  25 #include <linux/errno.h>
  26 #include <linux/kernel.h>
  27 #include <linux/string.h>
  28 #include <linux/socket.h>
  29 #include <linux/net.h>
  30 #include <linux/netdevice.h>
  31 #include <linux/if_arp.h>
  32 #include <linux/in6.h>
  33 #include <linux/tcp.h>
  34 #include <linux/route.h>
  35 #include <linux/module.h>
  36 #include <linux/slab.h>
  37
  38 #include <linux/bpf-cgroup.h>
  39 #include <linux/netfilter.h>
  40 #include <linux/netfilter_ipv6.h>
  41
  42 #include <net/sock.h>
  43 #include <net/snmp.h>
  44
  45 #include <net/ipv6.h>
  46 #include <net/ndisc.h>
  47 #include <net/protocol.h>
  48 #include <net/ip6_route.h>
  49 #include <net/addrconf.h>
  50 #include <net/rawv6.h>
  51 #include <net/icmp.h>
  52 #include <net/xfrm.h>
  53 #include <net/checksum.h>
  54 #include <linux/mroute6.h>
  55 #include <net/l3mdev.h>
  56 #include <net/lwtunnel.h>
  57 #include <net/ip_tunnels.h>
  58
  59 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
  60 {
  61         struct dst_entry *dst = skb_dst(skb);
  62         struct net_device *dev = dst->dev;
  63         struct inet6_dev *idev = ip6_dst_idev(dst);
  64         unsigned int hh_len = LL_RESERVED_SPACE(dev);
  65         const struct in6_addr *daddr, *nexthop;
  66         struct ipv6hdr *hdr;
  67         struct neighbour *neigh;
  68         int ret;
  69
  70         /* Be paranoid, rather than too clever. */
  71         if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) {
  72                 skb = skb_expand_head(skb, hh_len);
  73                 if (!skb) {
  74                         IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
  75                         return -ENOMEM;
  76                 }
  77         }
  78
  79         hdr = ipv6_hdr(skb);
  80         daddr = &hdr->daddr;
  81         if (ipv6_addr_is_multicast(daddr)) {
  82                 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
  83                     ((mroute6_is_socket(net, skb) &&
  84                      !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
  85                      ipv6_chk_mcast_addr(dev, daddr, &hdr->saddr))) {
  86                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
  87
  88                         /* Do not check for IFF_ALLMULTI; multicast routing
  89                            is not supported in any case.
  90                          */
  91                         if (newskb)
  92                                 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
  93                                         net, sk, newskb, NULL, newskb->dev,
  94                                         dev_loopback_xmit);
  95
  96                         if (hdr->hop_limit == 0) {
  97                                 IP6_INC_STATS(net, idev,
  98                                               IPSTATS_MIB_OUTDISCARDS);
  99                                 kfree_skb(skb);
 100                                 return 0;
 101                         }
 102                 }
 103
 104                 IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
 105                 if (IPV6_ADDR_MC_SCOPE(daddr) <= IPV6_ADDR_SCOPE_NODELOCAL &&
 106                     !(dev->flags & IFF_LOOPBACK)) {
 107                         kfree_skb(skb);
 108                         return 0;
 109                 }
 110         }
 111
 112         if (lwtunnel_xmit_redirect(dst->lwtstate)) {
 113                 int res = lwtunnel_xmit(skb);
 114
 115                 if (res != LWTUNNEL_XMIT_CONTINUE)
 116                         return res;
 117         }
 118
 119         rcu_read_lock_bh();
 120         nexthop = rt6_nexthop((struct rt6_info *)dst, daddr);
 121         neigh = __ipv6_neigh_lookup_noref(dev, nexthop);
 122         if (unlikely(!neigh))
 123                 neigh = __neigh_create(&nd_tbl, nexthop, dev, false);
 124         if (!IS_ERR(neigh)) {
 125                 sock_confirm_neigh(skb, neigh);
 126                 ret = neigh_output(neigh, skb, false);
 127                 rcu_read_unlock_bh();
 128                 return ret;
 129         }
 130         rcu_read_unlock_bh();
 131
 132         IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES);
 133         kfree_skb(skb);
 134         return -EINVAL;
 135 }
 136
 137 static int
 138 ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
 139                                     struct sk_buff *skb, unsigned int mtu)
 140 {
 141         struct sk_buff *segs, *nskb;
 142         netdev_features_t features;
 143         int ret = 0;
 144
 145         /* Please see corresponding comment in ip_finish_output_gso
 146          * describing the cases where GSO segment length exceeds the
 147          * egress MTU.
 148          */
 149         features = netif_skb_features(skb);
 150         segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
 151         if (IS_ERR_OR_NULL(segs)) {
 152                 kfree_skb(skb);
 153                 return -ENOMEM;
 154         }
 155
 156         consume_skb(skb);
 157
 158         skb_list_walk_safe(segs, segs, nskb) {
 159                 int err;
 160
 161                 skb_mark_not_on_list(segs);
 162                 err = ip6_fragment(net, sk, segs, ip6_finish_output2);
 163                 if (err && ret == 0)
 164                         ret = err;
 165         }
 166
 167         return ret;
 168 }
 169
 170 static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 171 {
 172         unsigned int mtu;
 173
 174 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
 175         /* Policy lookup after SNAT yielded a new policy */
 176         if (skb_dst(skb)->xfrm) {
 177                 IP6CB(skb)->flags |= IP6SKB_REROUTED;
 178                 return dst_output(net, sk, skb);
 179         }
 180 #endif
 181
 182         mtu = ip6_skb_dst_mtu(skb);
 183         if (skb_is_gso(skb) && !skb_gso_validate_network_len(skb, mtu))
 184                 return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);
 185
 186         if ((skb->len > mtu && !skb_is_gso(skb)) ||
 187             dst_allfrag(skb_dst(skb)) ||
 188             (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
 189                 return ip6_fragment(net, sk, skb, ip6_finish_output2);
 190         else
 191                 return ip6_finish_output2(net, sk, skb);
 192 }
 193
 194 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 195 {
 196         int ret;
 197
 198         ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
 199         switch (ret) {
 200         case NET_XMIT_SUCCESS:
 201                 return __ip6_finish_output(net, sk, skb);
 202         case NET_XMIT_CN:
 203                 return __ip6_finish_output(net, sk, skb) ? : ret;
 204         default:
 205                 kfree_skb(skb);
 206                 return ret;
 207         }
 208 }
 209
 210 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 211 {
 212         struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;
 213         struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
 214
 215         skb->protocol = htons(ETH_P_IPV6);
 216         skb->dev = dev;
 217
 218         if (unlikely(idev->cnf.disable_ipv6)) {
 219                 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
 220                 kfree_skb(skb);
 221                 return 0;
 222         }
 223
 224         return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
 225                             net, sk, skb, indev, dev,
 226                             ip6_finish_output,
 227                             !(IP6CB(skb)->flags & IP6SKB_REROUTED));
 228 }
 229 EXPORT_SYMBOL(ip6_output);
 230
 231 bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
 232 {
 233         if (!np->autoflowlabel_set)
 234                 return ip6_default_np_autolabel(net);
 235         else
 236                 return np->autoflowlabel;
 237 }
 238
 239 /*
 240  * xmit an sk_buff (used by TCP, SCTP and DCCP)
 241  * Note : socket lock is not held for SYNACK packets, but might be modified
 242  * by calls to skb_set_owner_w() and ipv6_local_error(),
 243  * which are using proper atomic operations or spinlocks.
 244  */
 245 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
 246              __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
 247 {
 248         struct net *net = sock_net(sk);
 249         const struct ipv6_pinfo *np = inet6_sk(sk);
 250         struct in6_addr *first_hop = &fl6->daddr;
 251         struct dst_entry *dst = skb_dst(skb);
 252         struct net_device *dev = dst->dev;
 253         struct inet6_dev *idev = ip6_dst_idev(dst);
 254         unsigned int head_room;
 255         struct ipv6hdr *hdr;
 256         u8  proto = fl6->flowi6_proto;
 257         int seg_len = skb->len;
 258         int hlimit = -1;
 259         u32 mtu;
 260
 261         head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dev);
 262         if (opt)
 263                 head_room += opt->opt_nflen + opt->opt_flen;
 264
 265         if (unlikely(head_room > skb_headroom(skb))) {
 266                 skb = skb_expand_head(skb, head_room);
 267                 if (!skb) {
 268                         IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
 269                         return -ENOBUFS;
 270                 }
 271         }
 272
 273         if (opt) {
 274                 seg_len += opt->opt_nflen + opt->opt_flen;
 275
 276                 if (opt->opt_flen)
 277                         ipv6_push_frag_opts(skb, opt, &proto);
 278
 279                 if (opt->opt_nflen)
 280                         ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
 281                                              &fl6->saddr);
 282         }
 283
 284         skb_push(skb, sizeof(struct ipv6hdr));
 285         skb_reset_network_header(skb);
 286         hdr = ipv6_hdr(skb);
 287
 288         /*
 289          *      Fill in the IPv6 header
 290          */
 291         if (np)
 292                 hlimit = np->hop_limit;
 293         if (hlimit < 0)
 294                 hlimit = ip6_dst_hoplimit(dst);
 295
 296         ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
 297                                 ip6_autoflowlabel(net, np), fl6));
 298
 299         hdr->payload_len = htons(seg_len);
 300         hdr->nexthdr = proto;
 301         hdr->hop_limit = hlimit;
 302
 303         hdr->saddr = fl6->saddr;
 304         hdr->daddr = *first_hop;
 305
 306         skb->protocol = htons(ETH_P_IPV6);
 307         skb->priority = priority;
 308         skb->mark = mark;
 309
 310         mtu = dst_mtu(dst);
 311         if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
 312                 IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len);
 313
 314                 /* if egress device is enslaved to an L3 master device pass the
 315                  * skb to its handler for processing
 316                  */
 317                 skb = l3mdev_ip6_out((struct sock *)sk, skb);
 318                 if (unlikely(!skb))
 319                         return 0;
 320
 321                 /* hooks should never assume socket lock is held.
 322                  * we promote our socket to non const
 323                  */
 324                 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
 325                                net, (struct sock *)sk, skb, NULL, dev,
 326                                dst_output);
 327         }
 328
 329         skb->dev = dev;
 330         /* ipv6_local_error() does not require socket lock,
 331          * we promote our socket to non const
 332          */
 333         ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
 334
 335         IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS);
 336         kfree_skb(skb);
 337         return -EMSGSIZE;
 338 }
 339 EXPORT_SYMBOL(ip6_xmit);
 340
 341 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
 342 {
 343         struct ip6_ra_chain *ra;
 344         struct sock *last = NULL;
 345
 346         read_lock(&ip6_ra_lock);
 347         for (ra = ip6_ra_chain; ra; ra = ra->next) {
 348                 struct sock *sk = ra->sk;
 349                 if (sk && ra->sel == sel &&
 350                     (!sk->sk_bound_dev_if ||
 351                      sk->sk_bound_dev_if == skb->dev->ifindex)) {
 352                         struct ipv6_pinfo *np = inet6_sk(sk);
 353
 354                         if (np && np->rtalert_isolate &&
 355                             !net_eq(sock_net(sk), dev_net(skb->dev))) {
 356                                 continue;
 357                         }
 358                         if (last) {
 359                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
 360                                 if (skb2)
 361                                         rawv6_rcv(last, skb2);
 362                         }
 363                         last = sk;
 364                 }
 365         }
 366
 367         if (last) {
 368                 rawv6_rcv(last, skb);
 369                 read_unlock(&ip6_ra_lock);
 370                 return 1;
 371         }
 372         read_unlock(&ip6_ra_lock);
 373         return 0;
 374 }
 375
 376 static int ip6_forward_proxy_check(struct sk_buff *skb)
 377 {
 378         struct ipv6hdr *hdr = ipv6_hdr(skb);
 379         u8 nexthdr = hdr->nexthdr;
 380         __be16 frag_off;
 381         int offset;
 382
 383         if (ipv6_ext_hdr(nexthdr)) {
 384                 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
 385                 if (offset < 0)
 386                         return 0;
 387         } else
 388                 offset = sizeof(struct ipv6hdr);
 389
 390         if (nexthdr == IPPROTO_ICMPV6) {
 391                 struct icmp6hdr *icmp6;
 392
 393                 if (!pskb_may_pull(skb, (skb_network_header(skb) +
 394                                          offset + 1 - skb->data)))
 395                         return 0;
 396
 397                 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
 398
 399                 switch (icmp6->icmp6_type) {
 400                 case NDISC_ROUTER_SOLICITATION:
 401                 case NDISC_ROUTER_ADVERTISEMENT:
 402                 case NDISC_NEIGHBOUR_SOLICITATION:
 403                 case NDISC_NEIGHBOUR_ADVERTISEMENT:
 404                 case NDISC_REDIRECT:
 405                         /* For reaction involving unicast neighbor discovery
 406                          * message destined to the proxied address, pass it to
 407                          * input function.
 408                          */
 409                         return 1;
 410                 default:
 411                         break;
 412                 }
 413         }
 414
 415         /*
 416          * The proxying router can't forward traffic sent to a link-local
 417          * address, so signal the sender and discard the packet. This
 418          * behavior is clarified by the MIPv6 specification.
 419          */
 420         if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
 421                 dst_link_failure(skb);
 422                 return -1;
 423         }
 424
 425         return 0;
 426 }
 427
 428 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
 429                                      struct sk_buff *skb)
 430 {
 431         struct dst_entry *dst = skb_dst(skb);
 432
 433         __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
 434         __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
 435
 436 #ifdef CONFIG_NET_SWITCHDEV
 437         if (skb->offload_l3_fwd_mark) {
 438                 consume_skb(skb);
 439                 return 0;
 440         }
 441 #endif
 442
 443         skb->tstamp = 0;
 444         return dst_output(net, sk, skb);
 445 }
 446
 447 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
 448 {
 449         if (skb->len <= mtu)
 450                 return false;
 451
 452         /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
 453         if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
 454                 return true;
 455
 456         if (skb->ignore_df)
 457                 return false;
 458
 459         if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
 460                 return false;
 461
 462         return true;
 463 }
 464
 465 int ip6_forward(struct sk_buff *skb)
 466 {
 467         struct dst_entry *dst = skb_dst(skb);
 468         struct ipv6hdr *hdr = ipv6_hdr(skb);
 469         struct inet6_skb_parm *opt = IP6CB(skb);
 470         struct net *net = dev_net(dst->dev);
 471         struct inet6_dev *idev;
 472         u32 mtu;
 473
 474         idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
 475         if (net->ipv6.devconf_all->forwarding == 0)
 476                 goto error;
 477
 478         if (skb->pkt_type != PACKET_HOST)
 479                 goto drop;
 480
 481         if (unlikely(skb->sk))
 482                 goto drop;
 483
 484         if (skb_warn_if_lro(skb))
 485                 goto drop;
 486
 487         if (!net->ipv6.devconf_all->disable_policy &&
 488             (!idev || !idev->cnf.disable_policy) &&
 489             !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
 490                 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
 491                 goto drop;
 492         }
 493
 494         skb_forward_csum(skb);
 495
 496         /*
 497          *      We DO NOT make any processing on
 498          *      RA packets, pushing them to user level AS IS
 499          *      without ane WARRANTY that application will be able
 500          *      to interpret them. The reason is that we
 501          *      cannot make anything clever here.
 502          *
 503          *      We are not end-node, so that if packet contains
 504          *      AH/ESP, we cannot make anything.
 505          *      Defragmentation also would be mistake, RA packets
 506          *      cannot be fragmented, because there is no warranty
 507          *      that different fragments will go along one path. --ANK
 508          */
 509         if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
 510                 if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
 511                         return 0;
 512         }
 513
 514         /*
 515          *      check and decrement ttl
 516          */
 517         if (hdr->hop_limit <= 1) {
 518                 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
 519                 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
 520
 521                 kfree_skb(skb);
 522                 return -ETIMEDOUT;
 523         }
 524
 525         /* XXX: idev->cnf.proxy_ndp? */
 526         if (net->ipv6.devconf_all->proxy_ndp &&
 527             pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
 528                 int proxied = ip6_forward_proxy_check(skb);
 529                 if (proxied > 0) {
 530                         /* It's tempting to decrease the hop limit
 531                          * here by 1, as we do at the end of the
 532                          * function too.
 533                          *
 534                          * But that would be incorrect, as proxying is
 535                          * not forwarding.  The ip6_input function
 536                          * will handle this packet locally, and it
 537                          * depends on the hop limit being unchanged.
 538                          *
 539                          * One example is the NDP hop limit, that
 540                          * always has to stay 255, but other would be
 541                          * similar checks around RA packets, where the
 542                          * user can even change the desired limit.
 543                          */
 544                         return ip6_input(skb);
 545                 } else if (proxied < 0) {
 546                         __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
 547                         goto drop;
 548                 }
 549         }
 550
 551         if (!xfrm6_route_forward(skb)) {
 552                 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
 553                 goto drop;
 554         }
 555         dst = skb_dst(skb);
 556
 557         /* IPv6 specs say nothing about it, but it is clear that we cannot
 558            send redirects to source routed frames.
 559            We don't send redirects to frames decapsulated from IPsec.
 560          */
 561         if (IP6CB(skb)->iif == dst->dev->ifindex &&
 562             opt->srcrt == 0 && !skb_sec_path(skb)) {
 563                 struct in6_addr *target = NULL;
 564                 struct inet_peer *peer;
 565                 struct rt6_info *rt;
 566
 567                 /*
 568                  *      incoming and outgoing devices are the same
 569                  *      send a redirect.
 570                  */
 571
 572                 rt = (struct rt6_info *) dst;
 573                 if (rt->rt6i_flags & RTF_GATEWAY)
 574                         target = &rt->rt6i_gateway;
 575                 else
 576                         target = &hdr->daddr;
 577
 578                 peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
 579
 580                 /* Limit redirects both by destination (here)
 581                    and by source (inside ndisc_send_redirect)
 582                  */
 583                 if (inet_peer_xrlim_allow(peer, 1*HZ))
 584                         ndisc_send_redirect(skb, target);
 585                 if (peer)
 586                         inet_putpeer(peer);
 587         } else {
 588                 int addrtype = ipv6_addr_type(&hdr->saddr);
 589
 590                 /* This check is security critical. */
 591                 if (addrtype == IPV6_ADDR_ANY ||
 592                     addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
 593                         goto error;
 594                 if (addrtype & IPV6_ADDR_LINKLOCAL) {
 595                         icmpv6_send(skb, ICMPV6_DEST_UNREACH,
 596                                     ICMPV6_NOT_NEIGHBOUR, 0);
 597                         goto error;
 598                 }
 599         }
 600
 601         mtu = ip6_dst_mtu_maybe_forward(dst, true);
 602         if (mtu < IPV6_MIN_MTU)
 603                 mtu = IPV6_MIN_MTU;
 604
 605         if (ip6_pkt_too_big(skb, mtu)) {
 606                 /* Again, force OUTPUT device used as source address */
 607                 skb->dev = dst->dev;
 608                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 609                 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
 610                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
 611                                 IPSTATS_MIB_FRAGFAILS);
 612                 kfree_skb(skb);
 613                 return -EMSGSIZE;
 614         }
 615
 616         if (skb_cow(skb, dst->dev->hard_header_len)) {
 617                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
 618                                 IPSTATS_MIB_OUTDISCARDS);
 619                 goto drop;
 620         }
 621
 622         hdr = ipv6_hdr(skb);
 623
 624         /* Mangling hops number delayed to point after skb COW */
 625
 626         hdr->hop_limit--;
 627
 628         return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
 629                        net, NULL, skb, skb->dev, dst->dev,
 630                        ip6_forward_finish);
 631
 632 error:
 633         __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
 634 drop:
 635         kfree_skb(skb);
 636         return -EINVAL;
 637 }
 638
 639 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 640 {
 641         to->pkt_type = from->pkt_type;
 642         to->priority = from->priority;
 643         to->protocol = from->protocol;
 644         skb_dst_drop(to);
 645         skb_dst_set(to, dst_clone(skb_dst(from)));
 646         to->dev = from->dev;
 647         to->mark = from->mark;
 648
 649         skb_copy_hash(to, from);
 650
 651 #ifdef CONFIG_NET_SCHED
 652         to->tc_index = from->tc_index;
 653 #endif
 654         nf_copy(to, from);
 655         skb_ext_copy(to, from);
 656         skb_copy_secmark(to, from);
 657 }
 658
 659 int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
 660                       u8 nexthdr, __be32 frag_id,
 661                       struct ip6_fraglist_iter *iter)
 662 {
 663         unsigned int first_len;
 664         struct frag_hdr *fh;
 665
 666         /* BUILD HEADER */
 667         *prevhdr = NEXTHDR_FRAGMENT;
 668         iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
 669         if (!iter->tmp_hdr)
 670                 return -ENOMEM;
 671
 672         iter->frag = skb_shinfo(skb)->frag_list;
 673         skb_frag_list_init(skb);
 674
 675         iter->offset = 0;
 676         iter->hlen = hlen;
 677         iter->frag_id = frag_id;
 678         iter->nexthdr = nexthdr;
 679
 680         __skb_pull(skb, hlen);
 681         fh = __skb_push(skb, sizeof(struct frag_hdr));
 682         __skb_push(skb, hlen);
 683         skb_reset_network_header(skb);
 684         memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);
 685
 686         fh->nexthdr = nexthdr;
 687         fh->reserved = 0;
 688         fh->frag_off = htons(IP6_MF);
 689         fh->identification = frag_id;
 690
 691         first_len = skb_pagelen(skb);
 692         skb->data_len = first_len - skb_headlen(skb);
 693         skb->len = first_len;
 694         ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));
 695
 696         return 0;
 697 }
 698 EXPORT_SYMBOL(ip6_fraglist_init);
 699
 700 void ip6_fraglist_prepare(struct sk_buff *skb,
 701                           struct ip6_fraglist_iter *iter)
 702 {
 703         struct sk_buff *frag = iter->frag;
 704         unsigned int hlen = iter->hlen;
 705         struct frag_hdr *fh;
 706
 707         frag->ip_summed = CHECKSUM_NONE;
 708         skb_reset_transport_header(frag);
 709         fh = __skb_push(frag, sizeof(struct frag_hdr));
 710         __skb_push(frag, hlen);
 711         skb_reset_network_header(frag);
 712         memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
 713         iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
 714         fh->nexthdr = iter->nexthdr;
 715         fh->reserved = 0;
 716         fh->frag_off = htons(iter->offset);
 717         if (frag->next)
 718                 fh->frag_off |= htons(IP6_MF);
 719         fh->identification = iter->frag_id;
 720         ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
 721         ip6_copy_metadata(frag, skb);
 722 }
 723 EXPORT_SYMBOL(ip6_fraglist_prepare);
 724
 725 void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
 726                    unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
 727                    u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
 728 {
 729         state->prevhdr = prevhdr;
 730         state->nexthdr = nexthdr;
 731         state->frag_id = frag_id;
 732
 733         state->hlen = hlen;
 734         state->mtu = mtu;
 735
 736         state->left = skb->len - hlen;  /* Space per frame */
 737         state->ptr = hlen;              /* Where to start from */
 738
 739         state->hroom = hdr_room;
 740         state->troom = needed_tailroom;
 741
 742         state->offset = 0;
 743 }
 744 EXPORT_SYMBOL(ip6_frag_init);
 745
 746 struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
 747 {
 748         u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
 749         struct sk_buff *frag;
 750         struct frag_hdr *fh;
 751         unsigned int len;
 752
 753         len = state->left;
 754         /* IF: it doesn't fit, use 'mtu' - the data space left */
 755         if (len > state->mtu)
 756                 len = state->mtu;
 757         /* IF: we are not sending up to and including the packet end
 758            then align the next start on an eight byte boundary */
 759         if (len < state->left)
 760                 len &= ~7;
 761
 762         /* Allocate buffer */
 763         frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
 764                          state->hroom + state->troom, GFP_ATOMIC);
 765         if (!frag)
 766                 return ERR_PTR(-ENOMEM);
 767
 768         /*
 769          *      Set up data on packet
 770          */
 771
 772         ip6_copy_metadata(frag, skb);
 773         skb_reserve(frag, state->hroom);
 774         skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
 775         skb_reset_network_header(frag);
 776         fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
 777         frag->transport_header = (frag->network_header + state->hlen +
 778                                   sizeof(struct frag_hdr));
 779
 780         /*
 781          *      Charge the memory for the fragment to any owner
 782          *      it might possess
 783          */
 784         if (skb->sk)
 785                 skb_set_owner_w(frag, skb->sk);
 786
 787         /*
 788          *      Copy the packet header into the new buffer.
 789          */
 790         skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);
 791
 792         fragnexthdr_offset = skb_network_header(frag);
 793         fragnexthdr_offset += prevhdr - skb_network_header(skb);
 794         *fragnexthdr_offset = NEXTHDR_FRAGMENT;
 795
 796         /*
 797          *      Build fragment header.
 798          */
 799         fh->nexthdr = state->nexthdr;
 800         fh->reserved = 0;
 801         fh->identification = state->frag_id;
 802
 803         /*
 804          *      Copy a block of the IP datagram.
 805          */
 806         BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
 807                              len));
 808         state->left -= len;
 809
 810         fh->frag_off = htons(state->offset);
 811         if (state->left > 0)
 812                 fh->frag_off |= htons(IP6_MF);
 813         ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
 814
 815         state->ptr += len;
 816         state->offset += len;
 817
 818         return frag;
 819 }
 820 EXPORT_SYMBOL(ip6_frag_next);
 821
 822 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
 823                  int (*output)(struct net *, struct sock *, struct sk_buff *))
 824 {
 825         struct sk_buff *frag;
 826         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
 827         struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
 828                                 inet6_sk(skb->sk) : NULL;
 829         struct ip6_frag_state state;
 830         unsigned int mtu, hlen, nexthdr_offset;
 831         ktime_t tstamp = skb->tstamp;
 832         int hroom, err = 0;
 833         __be32 frag_id;
 834         u8 *prevhdr, nexthdr = 0;
 835
 836         err = ip6_find_1stfragopt(skb, &prevhdr);
 837         if (err < 0)
 838                 goto fail;
 839         hlen = err;
 840         nexthdr = *prevhdr;
 841         nexthdr_offset = prevhdr - skb_network_header(skb);
 842
 843         mtu = ip6_skb_dst_mtu(skb);
 844
 845         /* We must not fragment if the socket is set to force MTU discovery
 846          * or if the skb it not generated by a local socket.
 847          */
 848         if (unlikely(!skb->ignore_df && skb->len > mtu))
 849                 goto fail_toobig;
 850
 851         if (IP6CB(skb)->frag_max_size) {
 852                 if (IP6CB(skb)->frag_max_size > mtu)
 853                         goto fail_toobig;
 854
 855                 /* don't send fragments larger than what we received */
 856                 mtu = IP6CB(skb)->frag_max_size;
 857                 if (mtu < IPV6_MIN_MTU)
 858                         mtu = IPV6_MIN_MTU;
 859         }
 860
 861         if (np && np->frag_size < mtu) {
 862                 if (np->frag_size)
 863                         mtu = np->frag_size;
 864         }
 865         if (mtu < hlen + sizeof(struct frag_hdr) + 8)
 866                 goto fail_toobig;
 867         mtu -= hlen + sizeof(struct frag_hdr);
 868
 869         frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
 870                                     &ipv6_hdr(skb)->saddr);
 871
 872         if (skb->ip_summed == CHECKSUM_PARTIAL &&
 873             (err = skb_checksum_help(skb)))
 874                 goto fail;
 875
 876         prevhdr = skb_network_header(skb) + nexthdr_offset;
 877         hroom = LL_RESERVED_SPACE(rt->dst.dev);
 878         if (skb_has_frag_list(skb)) {
 879                 unsigned int first_len = skb_pagelen(skb);
 880                 struct ip6_fraglist_iter iter;
 881                 struct sk_buff *frag2;
 882
 883                 if (first_len - hlen > mtu ||
 884                     ((first_len - hlen) & 7) ||
 885                     skb_cloned(skb) ||
 886                     skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
 887                         goto slow_path;
 888
 889                 skb_walk_frags(skb, frag) {
 890                         /* Correct geometry. */
 891                         if (frag->len > mtu ||
 892                             ((frag->len & 7) && frag->next) ||
 893                             skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
 894                                 goto slow_path_clean;
 895
 896                         /* Partially cloned skb? */
 897                         if (skb_shared(frag))
 898                                 goto slow_path_clean;
 899
 900                         BUG_ON(frag->sk);
 901                         if (skb->sk) {
 902                                 frag->sk = skb->sk;
 903                                 frag->destructor = sock_wfree;
 904                         }
 905                         skb->truesize -= frag->truesize;
 906                 }
 907
 908                 err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
 909                                         &iter);
 910                 if (err < 0)
 911                         goto fail;
 912
 913                 /* We prevent @rt from being freed. */
 914                 rcu_read_lock();
 915
 916                 for (;;) {
 917                         /* Prepare header of the next frame,
 918                          * before previous one went down. */
 919                         if (iter.frag)
 920                                 ip6_fraglist_prepare(skb, &iter);
 921
 922                         skb->tstamp = tstamp;
 923                         err = output(net, sk, skb);
 924                         if (!err)
 925                                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 926                                               IPSTATS_MIB_FRAGCREATES);
 927
 928                         if (err || !iter.frag)
 929                                 break;
 930
 931                         skb = ip6_fraglist_next(&iter);
 932                 }
 933
 934                 kfree(iter.tmp_hdr);
 935
 936                 if (err == 0) {
 937                         IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 938                                       IPSTATS_MIB_FRAGOKS);
 939                         rcu_read_unlock();
 940                         return 0;
 941                 }
 942
 943                 kfree_skb_list(iter.frag);
 944
 945                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 946                               IPSTATS_MIB_FRAGFAILS);
 947                 rcu_read_unlock();
 948                 return err;
 949
 950 slow_path_clean:
 951                 skb_walk_frags(skb, frag2) {
 952                         if (frag2 == frag)
 953                                 break;
 954                         frag2->sk = NULL;
 955                         frag2->destructor = NULL;
 956                         skb->truesize += frag2->truesize;
 957                 }
 958         }
 959
 960 slow_path:
 961         /*
 962          *      Fragment the datagram.
 963          */
 964
 965         ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
 966                       LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
 967                       &state);
 968
 969         /*
 970          *      Keep copying data until we run out.
 971          */
 972
 973         while (state.left > 0) {
 974                 frag = ip6_frag_next(skb, &state);
 975                 if (IS_ERR(frag)) {
 976                         err = PTR_ERR(frag);
 977                         goto fail;
 978                 }
 979
 980                 /*
 981                  *      Put this fragment into the sending queue.
 982                  */
 983                 frag->tstamp = tstamp;
 984                 err = output(net, sk, frag);
 985                 if (err)
 986                         goto fail;
 987
 988                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 989                               IPSTATS_MIB_FRAGCREATES);
 990         }
 991         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 992                       IPSTATS_MIB_FRAGOKS);
 993         consume_skb(skb);
 994         return err;
 995
 996 fail_toobig:
 997         if (skb->sk && dst_allfrag(skb_dst(skb)))
 998                 sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
 999
1000         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
1001         err = -EMSGSIZE;
1002
1003 fail:
1004         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1005                       IPSTATS_MIB_FRAGFAILS);
1006         kfree_skb(skb);
1007         return err;
1008 }
1009
1010 static inline int ip6_rt_check(const struct rt6key *rt_key,
1011                                const struct in6_addr *fl_addr,
1012                                const struct in6_addr *addr_cache)
1013 {
1014         return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
1015                 (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
1016 }
1017
1018 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
1019                                           struct dst_entry *dst,
1020                                           const struct flowi6 *fl6)
1021 {
1022         struct ipv6_pinfo *np = inet6_sk(sk);
1023         struct rt6_info *rt;
1024
1025         if (!dst)
1026                 goto out;
1027
1028         if (dst->ops->family != AF_INET6) {
1029                 dst_release(dst);
1030                 return NULL;
1031         }
1032
1033         rt = (struct rt6_info *)dst;
1034         /* Yes, checking route validity in not connected
1035          * case is not very simple. Take into account,
1036          * that we do not support routing by source, TOS,
1037          * and MSG_DONTROUTE            --ANK (980726)
1038          *
1039          * 1. ip6_rt_check(): If route was host route,
1040          *    check that cached destination is current.
1041          *    If it is network route, we still may
1042          *    check its validity using saved pointer
1043          *    to the last used address: daddr_cache.
1044          *    We do not want to save whole address now,
1045          *    (because main consumer of this service
1046          *    is tcp, which has not this problem),
1047          *    so that the last trick works only on connected
1048          *    sockets.
1049          * 2. oif also should be the same.
1050          */
1051         if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
1052 #ifdef CONFIG_IPV6_SUBTREES
1053             ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
1054 #endif
1055            (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
1056               (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
1057                 dst_release(dst);
1058                 dst = NULL;
1059         }
1060
1061 out:
1062         return dst;
1063 }
1064
1065 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
1066                                struct dst_entry **dst, struct flowi6 *fl6)
1067 {
1068 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1069         struct neighbour *n;
1070         struct rt6_info *rt;
1071 #endif
1072         int err;
1073         int flags = 0;
1074
1075         /* The correct way to handle this would be to do
1076          * ip6_route_get_saddr, and then ip6_route_output; however,
1077          * the route-specific preferred source forces the
1078          * ip6_route_output call _before_ ip6_route_get_saddr.
1079          *
1080          * In source specific routing (no src=any default route),
1081          * ip6_route_output will fail given src=any saddr, though, so
1082          * that's why we try it again later.
1083          */
1084         if (ipv6_addr_any(&fl6->saddr)) {
1085                 struct fib6_info *from;
1086                 struct rt6_info *rt;
1087
1088                 *dst = ip6_route_output(net, sk, fl6);
1089                 rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
1090
1091                 rcu_read_lock();
1092                 from = rt ? rcu_dereference(rt->from) : NULL;
1093                 err = ip6_route_get_saddr(net, from, &fl6->daddr,
1094                                           sk ? inet6_sk(sk)->srcprefs : 0,
1095                                           &fl6->saddr);
1096                 rcu_read_unlock();
1097
1098                 if (err)
1099                         goto out_err_release;
1100
1101                 /* If we had an erroneous initial result, pretend it
1102                  * never existed and let the SA-enabled version take
1103                  * over.
1104                  */
1105                 if ((*dst)->error) {
1106                         dst_release(*dst);
1107                         *dst = NULL;
1108                 }
1109
1110                 if (fl6->flowi6_oif)
1111                         flags |= RT6_LOOKUP_F_IFACE;
1112         }
1113
1114         if (!*dst)
1115                 *dst = ip6_route_output_flags(net, sk, fl6, flags);
1116
1117         err = (*dst)->error;
1118         if (err)
1119                 goto out_err_release;
1120
1121 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1122         /*
1123          * Here if the dst entry we've looked up
1124          * has a neighbour entry that is in the INCOMPLETE
1125          * state and the src address from the flow is
1126          * marked as OPTIMISTIC, we release the found
1127          * dst entry and replace it instead with the
1128          * dst entry of the nexthop router
1129          */
1130         rt = (struct rt6_info *) *dst;
1131         rcu_read_lock_bh();
1132         n = __ipv6_neigh_lookup_noref(rt->dst.dev,
1133                                       rt6_nexthop(rt, &fl6->daddr));
1134         err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
1135         rcu_read_unlock_bh();
1136
1137         if (err) {
1138                 struct inet6_ifaddr *ifp;
1139                 struct flowi6 fl_gw6;
1140                 int redirect;
1141
1142                 ifp = ipv6_get_ifaddr(net, &fl6->saddr,
1143                                       (*dst)->dev, 1);
1144
1145                 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
1146                 if (ifp)
1147                         in6_ifa_put(ifp);
1148
1149                 if (redirect) {
1150                         /*
1151                          * We need to get the dst entry for the
1152                          * default router instead
1153                          */
1154                         dst_release(*dst);
1155                         memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1156                         memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1157                         *dst = ip6_route_output(net, sk, &fl_gw6);
1158                         err = (*dst)->error;
1159                         if (err)
1160                                 goto out_err_release;
1161                 }
1162         }
1163 #endif
1164         if (ipv6_addr_v4mapped(&fl6->saddr) &&
1165             !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
1166                 err = -EAFNOSUPPORT;
1167                 goto out_err_release;
1168         }
1169
1170         return 0;
1171
1172 out_err_release:
1173         dst_release(*dst);
1174         *dst = NULL;
1175
1176         if (err == -ENETUNREACH)
1177                 IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1178         return err;
1179 }
1180
1181 /**
1182  *      ip6_dst_lookup - perform route lookup on flow
1183  *      @net: Network namespace to perform lookup in
1184  *      @sk: socket which provides route info
1185  *      @dst: pointer to dst_entry * for result
1186  *      @fl6: flow to lookup
1187  *
1188  *      This function performs a route lookup on the given flow.
1189  *
1190  *      It returns zero on success, or a standard errno code on error.
1191  */
1192 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1193                    struct flowi6 *fl6)
1194 {
1195         *dst = NULL;
1196         return ip6_dst_lookup_tail(net, sk, dst, fl6);
1197 }
1198 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1199
1200 /**
1201  *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1202  *      @net: Network namespace to perform lookup in
1203  *      @sk: socket which provides route info
1204  *      @fl6: flow to lookup
1205  *      @final_dst: final destination address for ipsec lookup
1206  *
1207  *      This function performs a route lookup on the given flow.
1208  *
1209  *      It returns a valid dst pointer on success, or a pointer encoded
1210  *      error code.
1211  */
1212 struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
1213                                       const struct in6_addr *final_dst)
1214 {
1215         struct dst_entry *dst = NULL;
1216         int err;
1217
1218         err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
1219         if (err)
1220                 return ERR_PTR(err);
1221         if (final_dst)
1222                 fl6->daddr = *final_dst;
1223
1224         return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
1225 }
1226 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1227
1228 /**
1229  *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1230  *      @sk: socket which provides the dst cache and route info
1231  *      @fl6: flow to lookup
1232  *      @final_dst: final destination address for ipsec lookup
1233  *      @connected: whether @sk is connected or not
1234  *
1235  *      This function performs a route lookup on the given flow with the
1236  *      possibility of using the cached route in the socket if it is valid.
1237  *      It will take the socket dst lock when operating on the dst cache.
1238  *      As a result, this function can only be used in process context.
1239  *
1240  *      In addition, for a connected socket, cache the dst in the socket
1241  *      if the current cache is not valid.
1242  *
1243  *      It returns a valid dst pointer on success, or a pointer encoded
1244  *      error code.
1245  */
1246 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1247                                          const struct in6_addr *final_dst,
1248                                          bool connected)
1249 {
1250         struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1251
1252         dst = ip6_sk_dst_check(sk, dst, fl6);
1253         if (dst)
1254                 return dst;
1255
1256         dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
1257         if (connected && !IS_ERR(dst))
1258                 ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);
1259
1260         return dst;
1261 }
1262 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1263
1264 /**
1265  *      ip6_dst_lookup_tunnel - perform route lookup on tunnel
1266  *      @skb: Packet for which lookup is done
1267  *      @dev: Tunnel device
1268  *      @net: Network namespace of tunnel device
1269  *      @sock: Socket which provides route info
1270  *      @saddr: Memory to store the src ip address
1271  *      @info: Tunnel information
1272  *      @protocol: IP protocol
1273  *      @use_cache: Flag to enable cache usage
1274  *      This function performs a route lookup on a tunnel
1275  *
1276  *      It returns a valid dst pointer and stores src address to be used in
1277  *      tunnel in param saddr on success, else a pointer encoded error code.
1278  */
1279
1280 struct dst_entry *ip6_dst_lookup_tunnel(struct sk_buff *skb,
1281                                         struct net_device *dev,
1282                                         struct net *net,
1283                                         struct socket *sock,
1284                                         struct in6_addr *saddr,
1285                                         const struct ip_tunnel_info *info,
1286                                         u8 protocol,
1287                                         bool use_cache)
1288 {
1289         struct dst_entry *dst = NULL;
1290 #ifdef CONFIG_DST_CACHE
1291         struct dst_cache *dst_cache;
1292 #endif
1293         struct flowi6 fl6;
1294         __u8 prio;
1295
1296 #ifdef CONFIG_DST_CACHE
1297         dst_cache = (struct dst_cache *)&info->dst_cache;
1298         if (use_cache) {
1299                 dst = dst_cache_get_ip6(dst_cache, saddr);
1300                 if (dst)
1301                         return dst;
1302         }
1303 #endif
1304         memset(&fl6, 0, sizeof(fl6));
1305         fl6.flowi6_mark = skb->mark;
1306         fl6.flowi6_proto = protocol;
1307         fl6.daddr = info->key.u.ipv6.dst;
1308         fl6.saddr = info->key.u.ipv6.src;
1309         prio = info->key.tos;
1310         fl6.flowlabel = ip6_make_flowinfo(prio, info->key.label);
1311
1312         dst = ipv6_stub->ipv6_dst_lookup_flow(net, sock->sk, &fl6,
1313                                               NULL);
1314         if (IS_ERR(dst)) {
1315                 netdev_dbg(dev, "no route to %pI6\n", &fl6.daddr);
1316                 return ERR_PTR(-ENETUNREACH);
1317         }
1318         if (dst->dev == dev) { /* is this necessary? */
1319                 netdev_dbg(dev, "circular route to %pI6\n", &fl6.daddr);
1320                 dst_release(dst);
1321                 return ERR_PTR(-ELOOP);
1322         }
1323 #ifdef CONFIG_DST_CACHE
1324         if (use_cache)
1325                 dst_cache_set_ip6(dst_cache, dst, &fl6.saddr);
1326 #endif
1327         *saddr = fl6.saddr;
1328         return dst;
1329 }
1330 EXPORT_SYMBOL_GPL(ip6_dst_lookup_tunnel);
1331
1332 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1333                                                gfp_t gfp)
1334 {
1335         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1336 }
1337
1338 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1339                                                 gfp_t gfp)
1340 {
1341         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1342 }
1343
1344 static void ip6_append_data_mtu(unsigned int *mtu,
1345                                 int *maxfraglen,
1346                                 unsigned int fragheaderlen,
1347                                 struct sk_buff *skb,
1348                                 struct rt6_info *rt,
1349                                 unsigned int orig_mtu)
1350 {
1351         if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1352                 if (!skb) {
1353                         /* first fragment, reserve header_len */
1354                         *mtu = orig_mtu - rt->dst.header_len;
1355
1356                 } else {
1357                         /*
1358                          * this fragment is not first, the headers
1359                          * space is regarded as data space.
1360                          */
1361                         *mtu = orig_mtu;
1362                 }
1363                 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1364                               + fragheaderlen - sizeof(struct frag_hdr);
1365         }
1366 }
1367
1368 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1369                           struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1370                           struct rt6_info *rt, struct flowi6 *fl6)
1371 {
1372         struct ipv6_pinfo *np = inet6_sk(sk);
1373         unsigned int mtu;
1374         struct ipv6_txoptions *opt = ipc6->opt;
1375
1376         /*
1377          * setup for corking
1378          */
1379         if (opt) {
1380                 if (WARN_ON(v6_cork->opt))
1381                         return -EINVAL;
1382
1383                 v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
1384                 if (unlikely(!v6_cork->opt))
1385                         return -ENOBUFS;
1386
1387                 v6_cork->opt->tot_len = sizeof(*opt);
1388                 v6_cork->opt->opt_flen = opt->opt_flen;
1389                 v6_cork->opt->opt_nflen = opt->opt_nflen;
1390
1391                 v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1392                                                     sk->sk_allocation);
1393                 if (opt->dst0opt && !v6_cork->opt->dst0opt)
1394                         return -ENOBUFS;
1395
1396                 v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1397                                                     sk->sk_allocation);
1398                 if (opt->dst1opt && !v6_cork->opt->dst1opt)
1399                         return -ENOBUFS;
1400
1401                 v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
1402                                                    sk->sk_allocation);
1403                 if (opt->hopopt && !v6_cork->opt->hopopt)
1404                         return -ENOBUFS;
1405
1406                 v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1407                                                     sk->sk_allocation);
1408                 if (opt->srcrt && !v6_cork->opt->srcrt)
1409                         return -ENOBUFS;
1410
1411                 /* need source address above miyazawa*/
1412         }
1413         dst_hold(&rt->dst);
1414         cork->base.dst = &rt->dst;
1415         cork->fl.u.ip6 = *fl6;
1416         v6_cork->hop_limit = ipc6->hlimit;
1417         v6_cork->tclass = ipc6->tclass;
1418         if (rt->dst.flags & DST_XFRM_TUNNEL)
1419                 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1420                       READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
1421         else
1422                 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1423                         READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
1424         if (np->frag_size < mtu) {
1425                 if (np->frag_size)
1426                         mtu = np->frag_size;
1427         }
1428         cork->base.fragsize = mtu;
1429         cork->base.gso_size = ipc6->gso_size;
1430         cork->base.tx_flags = 0;
1431         cork->base.mark = ipc6->sockc.mark;
1432         sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);
1433
1434         if (dst_allfrag(xfrm_dst_path(&rt->dst)))
1435                 cork->base.flags |= IPCORK_ALLFRAG;
1436         cork->base.length = 0;
1437
1438         cork->base.transmit_time = ipc6->sockc.transmit_time;
1439
1440         return 0;
1441 }
1442
1443 static int __ip6_append_data(struct sock *sk,
1444                              struct flowi6 *fl6,
1445                              struct sk_buff_head *queue,
1446                              struct inet_cork *cork,
1447                              struct inet6_cork *v6_cork,
1448                              struct page_frag *pfrag,
1449                              int getfrag(void *from, char *to, int offset,
1450                                          int len, int odd, struct sk_buff *skb),
1451                              void *from, int length, int transhdrlen,
1452                              unsigned int flags, struct ipcm6_cookie *ipc6)
1453 {
1454         struct sk_buff *skb, *skb_prev = NULL;
1455         unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1456         struct ubuf_info *uarg = NULL;
1457         int exthdrlen = 0;
1458         int dst_exthdrlen = 0;
1459         int hh_len;
1460         int copy;
1461         int err;
1462         int offset = 0;
1463         u32 tskey = 0;
1464         struct rt6_info *rt = (struct rt6_info *)cork->dst;
1465         struct ipv6_txoptions *opt = v6_cork->opt;
1466         int csummode = CHECKSUM_NONE;
1467         unsigned int maxnonfragsize, headersize;
1468         unsigned int wmem_alloc_delta = 0;
1469         bool paged, extra_uref = false;
1470
1471         skb = skb_peek_tail(queue);
1472         if (!skb) {
1473                 exthdrlen = opt ? opt->opt_flen : 0;
1474                 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1475         }
1476
1477         paged = !!cork->gso_size;
1478         mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
1479         orig_mtu = mtu;
1480
1481         if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
1482             sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1483                 tskey = atomic_inc_return(&sk->sk_tskey) - 1;
1484
1485         hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1486
1487         fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1488                         (opt ? opt->opt_nflen : 0);
1489
1490         headersize = sizeof(struct ipv6hdr) +
1491                      (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1492                      (dst_allfrag(&rt->dst) ?
1493                       sizeof(struct frag_hdr) : 0) +
1494                      rt->rt6i_nfheader_len;
1495
1496         if (mtu <= fragheaderlen ||
1497             ((mtu - fragheaderlen) & ~7) + fragheaderlen <= sizeof(struct frag_hdr))
1498                 goto emsgsize;
1499
1500         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1501                      sizeof(struct frag_hdr);
1502
1503         /* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
1504          * the first fragment
1505          */
1506         if (headersize + transhdrlen > mtu)
1507                 goto emsgsize;
1508
1509         if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1510             (sk->sk_protocol == IPPROTO_UDP ||
1511              sk->sk_protocol == IPPROTO_RAW)) {
1512                 ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1513                                 sizeof(struct ipv6hdr));
1514                 goto emsgsize;
1515         }
1516
1517         if (ip6_sk_ignore_df(sk))
1518                 maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1519         else
1520                 maxnonfragsize = mtu;
1521
1522         if (cork->length + length > maxnonfragsize - headersize) {
1523 emsgsize:
1524                 pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1525                 ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
1526                 return -EMSGSIZE;
1527         }
1528
1529         /* CHECKSUM_PARTIAL only with no extension headers and when
1530          * we are not going to fragment
1531          */
1532         if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1533             headersize == sizeof(struct ipv6hdr) &&
1534             length <= mtu - headersize &&
1535             (!(flags & MSG_MORE) || cork->gso_size) &&
1536             rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1537                 csummode = CHECKSUM_PARTIAL;
1538
1539         if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
1540                 uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb));
1541                 if (!uarg)
1542                         return -ENOBUFS;
1543                 extra_uref = !skb_zcopy(skb);   /* only ref on new uarg */
1544                 if (rt->dst.dev->features & NETIF_F_SG &&
1545                     csummode == CHECKSUM_PARTIAL) {
1546                         paged = true;
1547                 } else {
1548                         uarg->zerocopy = 0;
1549                         skb_zcopy_set(skb, uarg, &extra_uref);
1550                 }
1551         }
1552
1553         /*
1554          * Let's try using as much space as possible.
1555          * Use MTU if total length of the message fits into the MTU.
1556          * Otherwise, we need to reserve fragment header and
1557          * fragment alignment (= 8-15 octects, in total).
1558          *
1559          * Note that we may need to "move" the data from the tail
1560          * of the buffer to the new fragment when we split
1561          * the message.
1562          *
1563          * FIXME: It may be fragmented into multiple chunks
1564          *        at once if non-fragmentable extension headers
1565          *        are too large.
1566          * --yoshfuji
1567          */
1568
1569         cork->length += length;
1570         if (!skb)
1571                 goto alloc_new_skb;
1572
1573         while (length > 0) {
1574                 /* Check if the remaining data fits into current packet. */
1575                 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1576                 if (copy < length)
1577                         copy = maxfraglen - skb->len;
1578
1579                 if (copy <= 0) {
1580                         char *data;
1581                         unsigned int datalen;
1582                         unsigned int fraglen;
1583                         unsigned int fraggap;
1584                         unsigned int alloclen, alloc_extra;
1585                         unsigned int pagedlen;
1586 alloc_new_skb:
1587                         /* There's no room in the current skb */
1588                         if (skb)
1589                                 fraggap = skb->len - maxfraglen;
1590                         else
1591                                 fraggap = 0;
1592                         /* update mtu and maxfraglen if necessary */
1593                         if (!skb || !skb_prev)
1594                                 ip6_append_data_mtu(&mtu, &maxfraglen,
1595                                                     fragheaderlen, skb, rt,
1596                                                     orig_mtu);
1597
1598                         skb_prev = skb;
1599
1600                         /*
1601                          * If remaining data exceeds the mtu,
1602                          * we know we need more fragment(s).
1603                          */
1604                         datalen = length + fraggap;
1605
1606                         if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1607                                 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1608                         fraglen = datalen + fragheaderlen;
1609                         pagedlen = 0;
1610
1611                         alloc_extra = hh_len;
1612                         alloc_extra += dst_exthdrlen;
1613                         alloc_extra += rt->dst.trailer_len;
1614
1615                         /* We just reserve space for fragment header.
1616                          * Note: this may be overallocation if the message
1617                          * (without MSG_MORE) fits into the MTU.
1618                          */
1619                         alloc_extra += sizeof(struct frag_hdr);
1620
1621                         if ((flags & MSG_MORE) &&
1622                             !(rt->dst.dev->features&NETIF_F_SG))
1623                                 alloclen = mtu;
1624                         else if (!paged &&
1625                                  (fraglen + alloc_extra < SKB_MAX_ALLOC ||
1626                                   !(rt->dst.dev->features & NETIF_F_SG)))
1627                                 alloclen = fraglen;
1628                         else {
1629                                 alloclen = min_t(int, fraglen, MAX_HEADER);
1630                                 pagedlen = fraglen - alloclen;
1631                         }
1632                         alloclen += alloc_extra;
1633
1634                         if (datalen != length + fraggap) {
1635                                 /*
1636                                  * this is not the last fragment, the trailer
1637                                  * space is regarded as data space.
1638                                  */
1639                                 datalen += rt->dst.trailer_len;
1640                         }
1641
1642                         fraglen = datalen + fragheaderlen;
1643
1644                         copy = datalen - transhdrlen - fraggap - pagedlen;
1645                         if (copy < 0) {
1646                                 err = -EINVAL;
1647                                 goto error;
1648                         }
1649                         if (transhdrlen) {
1650                                 skb = sock_alloc_send_skb(sk, alloclen,
1651                                                 (flags & MSG_DONTWAIT), &err);
1652                         } else {
1653                                 skb = NULL;
1654                                 if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
1655                                     2 * sk->sk_sndbuf)
1656                                         skb = alloc_skb(alloclen,
1657                                                         sk->sk_allocation);
1658                                 if (unlikely(!skb))
1659                                         err = -ENOBUFS;
1660                         }
1661                         if (!skb)
1662                                 goto error;
1663                         /*
1664                          *      Fill in the control structures
1665                          */
1666                         skb->protocol = htons(ETH_P_IPV6);
1667                         skb->ip_summed = csummode;
1668                         skb->csum = 0;
1669                         /* reserve for fragmentation and ipsec header */
1670                         skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1671                                     dst_exthdrlen);
1672
1673                         /*
1674                          *      Find where to start putting bytes
1675                          */
1676                         data = skb_put(skb, fraglen - pagedlen);
1677                         skb_set_network_header(skb, exthdrlen);
1678                         data += fragheaderlen;
1679                         skb->transport_header = (skb->network_header +
1680                                                  fragheaderlen);
1681                         if (fraggap) {
1682                                 skb->csum = skb_copy_and_csum_bits(
1683                                         skb_prev, maxfraglen,
1684                                         data + transhdrlen, fraggap);
1685                                 skb_prev->csum = csum_sub(skb_prev->csum,
1686                                                           skb->csum);
1687                                 data += fraggap;
1688                                 pskb_trim_unique(skb_prev, maxfraglen);
1689                         }
1690                         if (copy > 0 &&
1691                             getfrag(from, data + transhdrlen, offset,
1692                                     copy, fraggap, skb) < 0) {
1693                                 err = -EFAULT;
1694                                 kfree_skb(skb);
1695                                 goto error;
1696                         }
1697
1698                         offset += copy;
1699                         length -= copy + transhdrlen;
1700                         transhdrlen = 0;
1701                         exthdrlen = 0;
1702                         dst_exthdrlen = 0;
1703
1704                         /* Only the initial fragment is time stamped */
1705                         skb_shinfo(skb)->tx_flags = cork->tx_flags;
1706                         cork->tx_flags = 0;
1707                         skb_shinfo(skb)->tskey = tskey;
1708                         tskey = 0;
1709                         skb_zcopy_set(skb, uarg, &extra_uref);
1710
1711                         if ((flags & MSG_CONFIRM) && !skb_prev)
1712                                 skb_set_dst_pending_confirm(skb, 1);
1713
1714                         /*
1715                          * Put the packet on the pending queue
1716                          */
1717                         if (!skb->destructor) {
1718                                 skb->destructor = sock_wfree;
1719                                 skb->sk = sk;
1720                                 wmem_alloc_delta += skb->truesize;
1721                         }
1722                         __skb_queue_tail(queue, skb);
1723                         continue;
1724                 }
1725
1726                 if (copy > length)
1727                         copy = length;
1728
1729                 if (!(rt->dst.dev->features&NETIF_F_SG) &&
1730                     skb_tailroom(skb) >= copy) {
1731                         unsigned int off;
1732
1733                         off = skb->len;
1734                         if (getfrag(from, skb_put(skb, copy),
1735                                                 offset, copy, off, skb) < 0) {
1736                                 __skb_trim(skb, off);
1737                                 err = -EFAULT;
1738                                 goto error;
1739                         }
1740                 } else if (!uarg || !uarg->zerocopy) {
1741                         int i = skb_shinfo(skb)->nr_frags;
1742
1743                         err = -ENOMEM;
1744                         if (!sk_page_frag_refill(sk, pfrag))
1745                                 goto error;
1746
1747                         if (!skb_can_coalesce(skb, i, pfrag->page,
1748                                               pfrag->offset)) {
1749                                 err = -EMSGSIZE;
1750                                 if (i == MAX_SKB_FRAGS)
1751                                         goto error;
1752
1753                                 __skb_fill_page_desc(skb, i, pfrag->page,
1754                                                      pfrag->offset, 0);
1755                                 skb_shinfo(skb)->nr_frags = ++i;
1756                                 get_page(pfrag->page);
1757                         }
1758                         copy = min_t(int, copy, pfrag->size - pfrag->offset);
1759                         if (getfrag(from,
1760                                     page_address(pfrag->page) + pfrag->offset,
1761                                     offset, copy, skb->len, skb) < 0)
1762                                 goto error_efault;
1763
1764                         pfrag->offset += copy;
1765                         skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1766                         skb->len += copy;
1767                         skb->data_len += copy;
1768                         skb->truesize += copy;
1769                         wmem_alloc_delta += copy;
1770                 } else {
1771                         err = skb_zerocopy_iter_dgram(skb, from, copy);
1772                         if (err < 0)
1773                                 goto error;
1774                 }
1775                 offset += copy;
1776                 length -= copy;
1777         }
1778
1779         if (wmem_alloc_delta)
1780                 refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1781         return 0;
1782
1783 error_efault:
1784         err = -EFAULT;
1785 error:
1786         net_zcopy_put_abort(uarg, extra_uref);
1787         cork->length -= length;
1788         IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1789         refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1790         return err;
1791 }
1792
1793 int ip6_append_data(struct sock *sk,
1794                     int getfrag(void *from, char *to, int offset, int len,
1795                                 int odd, struct sk_buff *skb),
1796                     void *from, int length, int transhdrlen,
1797                     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1798                     struct rt6_info *rt, unsigned int flags)
1799 {
1800         struct inet_sock *inet = inet_sk(sk);
1801         struct ipv6_pinfo *np = inet6_sk(sk);
1802         int exthdrlen;
1803         int err;
1804
1805         if (flags&MSG_PROBE)
1806                 return 0;
1807         if (skb_queue_empty(&sk->sk_write_queue)) {
1808                 /*
1809                  * setup for corking
1810                  */
1811                 err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1812                                      ipc6, rt, fl6);
1813                 if (err)
1814                         return err;
1815
1816                 exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1817                 length += exthdrlen;
1818                 transhdrlen += exthdrlen;
1819         } else {
1820                 fl6 = &inet->cork.fl.u.ip6;
1821                 transhdrlen = 0;
1822         }
1823
1824         return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1825                                  &np->cork, sk_page_frag(sk), getfrag,
1826                                  from, length, transhdrlen, flags, ipc6);
1827 }
1828 EXPORT_SYMBOL_GPL(ip6_append_data);
1829
1830 static void ip6_cork_release(struct inet_cork_full *cork,
1831                              struct inet6_cork *v6_cork)
1832 {
1833         if (v6_cork->opt) {
1834                 kfree(v6_cork->opt->dst0opt);
1835                 kfree(v6_cork->opt->dst1opt);
1836                 kfree(v6_cork->opt->hopopt);
1837                 kfree(v6_cork->opt->srcrt);
1838                 kfree(v6_cork->opt);
1839                 v6_cork->opt = NULL;
1840         }
1841
1842         if (cork->base.dst) {
1843                 dst_release(cork->base.dst);
1844                 cork->base.dst = NULL;
1845                 cork->base.flags &= ~IPCORK_ALLFRAG;
1846         }
1847         memset(&cork->fl, 0, sizeof(cork->fl));
1848 }
1849
1850 struct sk_buff *__ip6_make_skb(struct sock *sk,
1851                                struct sk_buff_head *queue,
1852                                struct inet_cork_full *cork,
1853                                struct inet6_cork *v6_cork)
1854 {
1855         struct sk_buff *skb, *tmp_skb;
1856         struct sk_buff **tail_skb;
1857         struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1858         struct ipv6_pinfo *np = inet6_sk(sk);
1859         struct net *net = sock_net(sk);
1860         struct ipv6hdr *hdr;
1861         struct ipv6_txoptions *opt = v6_cork->opt;
1862         struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1863         struct flowi6 *fl6 = &cork->fl.u.ip6;
1864         unsigned char proto = fl6->flowi6_proto;
1865
1866         skb = __skb_dequeue(queue);
1867         if (!skb)
1868                 goto out;
1869         tail_skb = &(skb_shinfo(skb)->frag_list);
1870
1871         /* move skb->data to ip header from ext header */
1872         if (skb->data < skb_network_header(skb))
1873                 __skb_pull(skb, skb_network_offset(skb));
1874         while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1875                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1876                 *tail_skb = tmp_skb;
1877                 tail_skb = &(tmp_skb->next);
1878                 skb->len += tmp_skb->len;
1879                 skb->data_len += tmp_skb->len;
1880                 skb->truesize += tmp_skb->truesize;
1881                 tmp_skb->destructor = NULL;
1882                 tmp_skb->sk = NULL;
1883         }
1884
1885         /* Allow local fragmentation. */
1886         skb->ignore_df = ip6_sk_ignore_df(sk);
1887
1888         *final_dst = fl6->daddr;
1889         __skb_pull(skb, skb_network_header_len(skb));
1890         if (opt && opt->opt_flen)
1891                 ipv6_push_frag_opts(skb, opt, &proto);
1892         if (opt && opt->opt_nflen)
1893                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1894
1895         skb_push(skb, sizeof(struct ipv6hdr));
1896         skb_reset_network_header(skb);
1897         hdr = ipv6_hdr(skb);
1898
1899         ip6_flow_hdr(hdr, v6_cork->tclass,
1900                      ip6_make_flowlabel(net, skb, fl6->flowlabel,
1901                                         ip6_autoflowlabel(net, np), fl6));
1902         hdr->hop_limit = v6_cork->hop_limit;
1903         hdr->nexthdr = proto;
1904         hdr->saddr = fl6->saddr;
1905         hdr->daddr = *final_dst;
1906
1907         skb->priority = sk->sk_priority;
1908         skb->mark = cork->base.mark;
1909
1910         skb->tstamp = cork->base.transmit_time;
1911
1912         skb_dst_set(skb, dst_clone(&rt->dst));
1913         IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1914         if (proto == IPPROTO_ICMPV6) {
1915                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1916                 u8 icmp6_type;
1917
1918                 if (sk->sk_socket->type == SOCK_RAW && !inet_sk(sk)->hdrincl)
1919                         icmp6_type = fl6->fl6_icmp_type;
1920                 else
1921                         icmp6_type = icmp6_hdr(skb)->icmp6_type;
1922                 ICMP6MSGOUT_INC_STATS(net, idev, icmp6_type);
1923                 ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1924         }
1925
1926         ip6_cork_release(cork, v6_cork);
1927 out:
1928         return skb;
1929 }
1930
1931 int ip6_send_skb(struct sk_buff *skb)
1932 {
1933         struct net *net = sock_net(skb->sk);
1934         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1935         int err;
1936
1937         err = ip6_local_out(net, skb->sk, skb);
1938         if (err) {
1939                 if (err > 0)
1940                         err = net_xmit_errno(err);
1941                 if (err)
1942                         IP6_INC_STATS(net, rt->rt6i_idev,
1943                                       IPSTATS_MIB_OUTDISCARDS);
1944         }
1945
1946         return err;
1947 }
1948
1949 int ip6_push_pending_frames(struct sock *sk)
1950 {
1951         struct sk_buff *skb;
1952
1953         skb = ip6_finish_skb(sk);
1954         if (!skb)
1955                 return 0;
1956
1957         return ip6_send_skb(skb);
1958 }
1959 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1960
1961 static void __ip6_flush_pending_frames(struct sock *sk,
1962                                        struct sk_buff_head *queue,
1963                                        struct inet_cork_full *cork,
1964                                        struct inet6_cork *v6_cork)
1965 {
1966         struct sk_buff *skb;
1967
1968         while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1969                 if (skb_dst(skb))
1970                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1971                                       IPSTATS_MIB_OUTDISCARDS);
1972                 kfree_skb(skb);
1973         }
1974
1975         ip6_cork_release(cork, v6_cork);
1976 }
1977
1978 void ip6_flush_pending_frames(struct sock *sk)
1979 {
1980         __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1981                                    &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1982 }
1983 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1984
1985 struct sk_buff *ip6_make_skb(struct sock *sk,
1986                              int getfrag(void *from, char *to, int offset,
1987                                          int len, int odd, struct sk_buff *skb),
1988                              void *from, int length, int transhdrlen,
1989                              struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1990                              struct rt6_info *rt, unsigned int flags,
1991                              struct inet_cork_full *cork)
1992 {
1993         struct inet6_cork v6_cork;
1994         struct sk_buff_head queue;
1995         int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1996         int err;
1997
1998         if (flags & MSG_PROBE)
1999                 return NULL;
2000
2001         __skb_queue_head_init(&queue);
2002
2003         cork->base.flags = 0;
2004         cork->base.addr = 0;
2005         cork->base.opt = NULL;
2006         cork->base.dst = NULL;
2007         v6_cork.opt = NULL;
2008         err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6);
2009         if (err) {
2010                 ip6_cork_release(cork, &v6_cork);
2011                 return ERR_PTR(err);
2012         }
2013         if (ipc6->dontfrag < 0)
2014                 ipc6->dontfrag = inet6_sk(sk)->dontfrag;
2015
2016         err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork,
2017                                 &current->task_frag, getfrag, from,
2018                                 length + exthdrlen, transhdrlen + exthdrlen,
2019                                 flags, ipc6);
2020         if (err) {
2021                 __ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
2022                 return ERR_PTR(err);
2023         }
2024
2025         return __ip6_make_skb(sk, &queue, cork, &v6_cork);
2026 }