GNU Linux-libre 4.14.302-gnu1
net/ipv6/ip6_output.c
/*
 *      IPv6 output functions
 *      Linux INET6 implementation
 *
 *      Authors:
 *      Pedro Roque             <roque@di.fc.ul.pt>
 *
 *      Based on linux/net/ipv4/ip_output.c
 *
 *      This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 *
 *      Changes:
 *      A.N.Kuznetsov   :       arithmetic in fragmentation.
 *                              extension headers are implemented.
 *                              route changes now work.
 *                              ip6_forward does not confuse sniffers.
 *                              etc.
 *
 *      H. von Brand    :       Added missing #include <linux/string.h>
 *      Imran Patel     :       frag id should be in NBO
 *      Kazunori MIYAZAWA @USAGI
 *                      :       add ip6_append_data and related functions
 *                              for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/bpf-cgroup.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>
#include <net/l3mdev.h>
#include <net/lwtunnel.h>

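/*
 * Final transmit step: loop back multicast copies where required, hand
 * the skb to a lightweight tunnel when one is attached to the route,
 * then resolve the nexthop neighbour and queue the packet on the
 * egress device.
 */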
static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);
        struct net_device *dev = dst->dev;
        struct neighbour *neigh;
        struct in6_addr *nexthop;
        int ret;

        if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
                struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

                if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
                    ((mroute6_socket(net, skb) &&
                     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
                     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
                                         &ipv6_hdr(skb)->saddr))) {
                        struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

                        /* Do not check for IFF_ALLMULTI; multicast routing
                           is not supported in any case.
                         */
                        if (newskb)
                                NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
                                        net, sk, newskb, NULL, newskb->dev,
                                        dev_loopback_xmit);

                        if (ipv6_hdr(skb)->hop_limit == 0) {
                                IP6_INC_STATS(net, idev,
                                              IPSTATS_MIB_OUTDISCARDS);
                                kfree_skb(skb);
                                return 0;
                        }
                }

                IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);

                if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
                    IPV6_ADDR_SCOPE_NODELOCAL &&
                    !(dev->flags & IFF_LOOPBACK)) {
                        kfree_skb(skb);
                        return 0;
                }
        }

        if (lwtunnel_xmit_redirect(dst->lwtstate)) {
                int res = lwtunnel_xmit(skb);

                if (res < 0 || res == LWTUNNEL_XMIT_DONE)
                        return res;
        }

        rcu_read_lock_bh();
        nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
        neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
        if (unlikely(!neigh))
                neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
        if (!IS_ERR(neigh)) {
                sock_confirm_neigh(skb, neigh);
                ret = neigh_output(neigh, skb);
                rcu_read_unlock_bh();
                return ret;
        }
        rcu_read_unlock_bh();

        IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
        kfree_skb(skb);
        return -EINVAL;
}

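/*
 * Slow path for GSO packets whose segments do not fit the egress MTU:
 * software-segment the skb and push every segment through
 * ip6_fragment() individually, returning the first error encountered.
 */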
static int
ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
                                    struct sk_buff *skb, unsigned int mtu)
{
        struct sk_buff *segs, *nskb;
        netdev_features_t features;
        int ret = 0;

        /* Please see corresponding comment in ip_finish_output_gso
         * describing the cases where GSO segment length exceeds the
         * egress MTU.
         */
        features = netif_skb_features(skb);
        segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
        if (IS_ERR_OR_NULL(segs)) {
                kfree_skb(skb);
                return -ENOMEM;
        }

        consume_skb(skb);

        skb_list_walk_safe(segs, segs, nskb) {
                int err;

                skb_mark_not_on_list(segs);
                err = ip6_fragment(net, sk, segs, ip6_finish_output2);
                if (err && ret == 0)
                        ret = err;
        }

        return ret;
}

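/*
 * Run the egress cgroup BPF program, re-route packets that picked up
 * an XFRM policy after SNAT, and fragment anything that still exceeds
 * the path MTU before handing off to ip6_finish_output2().
 */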
static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        unsigned int mtu;
        int ret;

        ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
        if (ret) {
                kfree_skb(skb);
                return ret;
        }

#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
        /* Policy lookup after SNAT yielded a new policy */
        if (skb_dst(skb)->xfrm) {
                IP6CB(skb)->flags |= IP6SKB_REROUTED;
                return dst_output(net, sk, skb);
        }
#endif

        mtu = ip6_skb_dst_mtu(skb);
        if (skb_is_gso(skb) && !skb_gso_validate_mtu(skb, mtu))
                return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);

        if ((skb->len > mtu && !skb_is_gso(skb)) ||
            dst_allfrag(skb_dst(skb)) ||
            (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
                return ip6_fragment(net, sk, skb, ip6_finish_output2);
        else
                return ip6_finish_output2(net, sk, skb);
}

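/*
 * Generic IPv6 output entry point installed in dst->output: drop the
 * packet when IPv6 is administratively disabled on the egress device,
 * otherwise run the NF_INET_POST_ROUTING hook (skipped for packets
 * already re-routed with IP6SKB_REROUTED).
 */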
int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        struct net_device *dev = skb_dst(skb)->dev;
        struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

        skb->protocol = htons(ETH_P_IPV6);
        skb->dev = dev;

        if (unlikely(idev->cnf.disable_ipv6)) {
                IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
                kfree_skb(skb);
                return 0;
        }

        return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
                            net, sk, skb, NULL, dev,
                            ip6_finish_output,
                            !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}

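/*
 * Report whether flow labels should be generated automatically: the
 * per-socket setting wins when it was set explicitly, otherwise fall
 * back to the per-namespace default.
 */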
bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
{
        if (!np->autoflowlabel_set)
                return ip6_default_np_autolabel(net);
        else
                return np->autoflowlabel;
}

/*
 * xmit an sk_buff (used by TCP, SCTP and DCCP)
 * Note : socket lock is not held for SYNACK packets, but the socket
 * might still be modified by calls to skb_set_owner_w() and
 * ipv6_local_error(), which use proper atomic operations or spinlocks.
 */
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
             __u32 mark, struct ipv6_txoptions *opt, int tclass)
{
        struct net *net = sock_net(sk);
        const struct ipv6_pinfo *np = inet6_sk(sk);
        struct in6_addr *first_hop = &fl6->daddr;
        struct dst_entry *dst = skb_dst(skb);
        unsigned int head_room;
        struct ipv6hdr *hdr;
        u8  proto = fl6->flowi6_proto;
        int seg_len = skb->len;
        int hlimit = -1;
        u32 mtu;

        head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
        if (opt)
                head_room += opt->opt_nflen + opt->opt_flen;

        if (unlikely(skb_headroom(skb) < head_room)) {
                struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
                if (!skb2) {
                        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                                      IPSTATS_MIB_OUTDISCARDS);
                        kfree_skb(skb);
                        return -ENOBUFS;
                }
                if (skb->sk)
                        skb_set_owner_w(skb2, skb->sk);
                consume_skb(skb);
                skb = skb2;
        }

        if (opt) {
                seg_len += opt->opt_nflen + opt->opt_flen;

                if (opt->opt_flen)
                        ipv6_push_frag_opts(skb, opt, &proto);

                if (opt->opt_nflen)
                        ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
                                             &fl6->saddr);
        }

        skb_push(skb, sizeof(struct ipv6hdr));
        skb_reset_network_header(skb);
        hdr = ipv6_hdr(skb);

        /*
         *      Fill in the IPv6 header
         */
        if (np)
                hlimit = np->hop_limit;
        if (hlimit < 0)
                hlimit = ip6_dst_hoplimit(dst);

        ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
                                ip6_autoflowlabel(net, np), fl6));

        hdr->payload_len = htons(seg_len);
        hdr->nexthdr = proto;
        hdr->hop_limit = hlimit;

        hdr->saddr = fl6->saddr;
        hdr->daddr = *first_hop;

        skb->protocol = htons(ETH_P_IPV6);
        skb->priority = sk->sk_priority;
        skb->mark = mark;

        mtu = dst_mtu(dst);
        if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
                IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
                              IPSTATS_MIB_OUT, skb->len);

                /* if egress device is enslaved to an L3 master device pass the
                 * skb to its handler for processing
                 */
                skb = l3mdev_ip6_out((struct sock *)sk, skb);
                if (unlikely(!skb))
                        return 0;

                /* hooks should never assume socket lock is held.
                 * we promote our socket to non const
                 */
                return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
                               net, (struct sock *)sk, skb, NULL, dst->dev,
                               dst_output);
        }

        skb->dev = dst->dev;
        /* ipv6_local_error() does not require socket lock,
         * we promote our socket to non const
         */
        ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);

        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
        kfree_skb(skb);
        return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);

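/*
 * Deliver a Router Alert packet to every raw socket that registered
 * on the RA chain (via IPV6_ROUTER_ALERT) with a matching selector
 * and device binding. Returns 1 if at least one socket consumed the
 * skb, 0 otherwise.
 */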
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
        struct ip6_ra_chain *ra;
        struct sock *last = NULL;

        read_lock(&ip6_ra_lock);
        for (ra = ip6_ra_chain; ra; ra = ra->next) {
                struct sock *sk = ra->sk;
                if (sk && ra->sel == sel &&
                    (!sk->sk_bound_dev_if ||
                     sk->sk_bound_dev_if == skb->dev->ifindex)) {
                        if (last) {
                                struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
                                if (skb2)
                                        rawv6_rcv(last, skb2);
                        }
                        last = sk;
                }
        }

        if (last) {
                rawv6_rcv(last, skb);
                read_unlock(&ip6_ra_lock);
                return 1;
        }
        read_unlock(&ip6_ra_lock);
        return 0;
}

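/*
 * Decide what to do with a packet destined to a proxied address:
 * returns 1 to hand an NDISC message to the input path, 0 to forward
 * as usual, and -1 when the packet has been discarded.
 */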
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
        struct ipv6hdr *hdr = ipv6_hdr(skb);
        u8 nexthdr = hdr->nexthdr;
        __be16 frag_off;
        int offset;

        if (ipv6_ext_hdr(nexthdr)) {
                offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
                if (offset < 0)
                        return 0;
        } else
                offset = sizeof(struct ipv6hdr);

        if (nexthdr == IPPROTO_ICMPV6) {
                struct icmp6hdr *icmp6;

                if (!pskb_may_pull(skb, (skb_network_header(skb) +
                                         offset + 1 - skb->data)))
                        return 0;

                icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

                switch (icmp6->icmp6_type) {
                case NDISC_ROUTER_SOLICITATION:
                case NDISC_ROUTER_ADVERTISEMENT:
                case NDISC_NEIGHBOUR_SOLICITATION:
                case NDISC_NEIGHBOUR_ADVERTISEMENT:
                case NDISC_REDIRECT:
                        /* For reaction involving unicast neighbor discovery
                         * message destined to the proxied address, pass it to
                         * input function.
                         */
                        return 1;
                default:
                        break;
                }
        }

        /*
         * The proxying router can't forward traffic sent to a link-local
         * address, so signal the sender and discard the packet. This
         * behavior is clarified by the MIPv6 specification.
         */
        if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
                dst_link_failure(skb);
                return -1;
        }

        return 0;
}

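/*
 * Account the forwarded packet and hand it to dst_output().
 */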
static inline int ip6_forward_finish(struct net *net, struct sock *sk,
                                     struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);

        __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
        __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);

        return dst_output(net, sk, skb);
}

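/*
 * MTU to enforce while forwarding: a locked route metric when one is
 * set, otherwise the egress device's IPv6 MTU, falling back to
 * IPV6_MIN_MTU when no inet6 device is attached.
 */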
static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
{
        unsigned int mtu;
        struct inet6_dev *idev;

        if (dst_metric_locked(dst, RTAX_MTU)) {
                mtu = dst_metric_raw(dst, RTAX_MTU);
                if (mtu)
                        return mtu;
        }

        mtu = IPV6_MIN_MTU;
        rcu_read_lock();
        idev = __in6_dev_get(dst->dev);
        if (idev)
                mtu = idev->cnf.mtu6;
        rcu_read_unlock();

        return mtu;
}

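/*
 * True when the packet cannot be sent over a path with the given MTU
 * without fragmentation; defragmented conntrack packets are judged by
 * the largest original fragment rather than by skb->len.
 */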
static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
{
        if (skb->len <= mtu)
                return false;

        /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
        if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
                return true;

        if (skb->ignore_df)
                return false;

        if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu))
                return false;

        return true;
}

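/*
 * Forward a received packet: check that forwarding is enabled and
 * permitted by policy, deliver Router Alert packets locally, enforce
 * the hop limit and path MTU (emitting ICMPv6 errors as needed), send
 * redirects when a packet would leave via its ingress device, then
 * decrement hop_limit and run the NF_INET_FORWARD hook.
 */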
int ip6_forward(struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);
        struct ipv6hdr *hdr = ipv6_hdr(skb);
        struct inet6_skb_parm *opt = IP6CB(skb);
        struct net *net = dev_net(dst->dev);
        u32 mtu;

        if (net->ipv6.devconf_all->forwarding == 0)
                goto error;

        if (skb->pkt_type != PACKET_HOST)
                goto drop;

        if (unlikely(skb->sk))
                goto drop;

        if (skb_warn_if_lro(skb))
                goto drop;

        if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
                __IP6_INC_STATS(net, ip6_dst_idev(dst),
                                IPSTATS_MIB_INDISCARDS);
                goto drop;
        }

        skb_forward_csum(skb);

        /*
         *      We DO NOT make any processing on
         *      RA packets, pushing them to user level AS IS
         *      without any WARRANTY that the application will be able
         *      to interpret them. The reason is that we
         *      cannot make anything clever here.
         *
         *      We are not an end node, so if the packet contains
         *      AH/ESP we cannot do anything.
         *      Defragmentation would also be a mistake; RA packets
         *      cannot be fragmented, because there is no guarantee
         *      that different fragments will go along one path. --ANK
         */
        if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
                if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
                        return 0;
        }

        /*
         *      check and decrement ttl
         */
        if (hdr->hop_limit <= 1) {
                /* Force OUTPUT device used as source address */
                skb->dev = dst->dev;
                icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
                __IP6_INC_STATS(net, ip6_dst_idev(dst),
                                IPSTATS_MIB_INHDRERRORS);

                kfree_skb(skb);
                return -ETIMEDOUT;
        }

        /* XXX: idev->cnf.proxy_ndp? */
        if (net->ipv6.devconf_all->proxy_ndp &&
            pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
                int proxied = ip6_forward_proxy_check(skb);
                if (proxied > 0)
                        return ip6_input(skb);
                else if (proxied < 0) {
                        __IP6_INC_STATS(net, ip6_dst_idev(dst),
                                        IPSTATS_MIB_INDISCARDS);
                        goto drop;
                }
        }

        if (!xfrm6_route_forward(skb)) {
                __IP6_INC_STATS(net, ip6_dst_idev(dst),
                                IPSTATS_MIB_INDISCARDS);
                goto drop;
        }
        dst = skb_dst(skb);

        /* IPv6 specs say nothing about it, but it is clear that we cannot
           send redirects to source-routed frames.
           We don't send redirects to frames decapsulated from IPsec.
         */
        if (IP6CB(skb)->iif == dst->dev->ifindex &&
            opt->srcrt == 0 && !skb_sec_path(skb)) {
                struct in6_addr *target = NULL;
                struct inet_peer *peer;
                struct rt6_info *rt;

                /*
                 *      incoming and outgoing devices are the same
                 *      send a redirect.
                 */

                rt = (struct rt6_info *) dst;
                if (rt->rt6i_flags & RTF_GATEWAY)
                        target = &rt->rt6i_gateway;
                else
                        target = &hdr->daddr;

                peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);

                /* Limit redirects both by destination (here)
                   and by source (inside ndisc_send_redirect)
                 */
                if (inet_peer_xrlim_allow(peer, 1*HZ))
                        ndisc_send_redirect(skb, target);
                if (peer)
                        inet_putpeer(peer);
        } else {
                int addrtype = ipv6_addr_type(&hdr->saddr);

                /* This check is security critical. */
                if (addrtype == IPV6_ADDR_ANY ||
                    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
                        goto error;
                if (addrtype & IPV6_ADDR_LINKLOCAL) {
                        icmpv6_send(skb, ICMPV6_DEST_UNREACH,
                                    ICMPV6_NOT_NEIGHBOUR, 0);
                        goto error;
                }
        }

        mtu = ip6_dst_mtu_forward(dst);
        if (mtu < IPV6_MIN_MTU)
                mtu = IPV6_MIN_MTU;

        if (ip6_pkt_too_big(skb, mtu)) {
                /* Again, force OUTPUT device used as source address */
                skb->dev = dst->dev;
                icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
                __IP6_INC_STATS(net, ip6_dst_idev(dst),
                                IPSTATS_MIB_INTOOBIGERRORS);
                __IP6_INC_STATS(net, ip6_dst_idev(dst),
                                IPSTATS_MIB_FRAGFAILS);
                kfree_skb(skb);
                return -EMSGSIZE;
        }

        if (skb_cow(skb, dst->dev->hard_header_len)) {
                __IP6_INC_STATS(net, ip6_dst_idev(dst),
                                IPSTATS_MIB_OUTDISCARDS);
                goto drop;
        }

        hdr = ipv6_hdr(skb);

        /* Mangling hops number delayed to point after skb COW */

        hdr->hop_limit--;

        return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
                       net, NULL, skb, skb->dev, dst->dev,
                       ip6_forward_finish);

error:
        __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
        kfree_skb(skb);
        return -EINVAL;
}

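/*
 * Copy the metadata a fragment must inherit from the original skb.
 */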
static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
        to->pkt_type = from->pkt_type;
        to->priority = from->priority;
        to->protocol = from->protocol;
        skb_dst_drop(to);
        skb_dst_set(to, dst_clone(skb_dst(from)));
        to->dev = from->dev;
        to->mark = from->mark;

        skb_copy_hash(to, from);

#ifdef CONFIG_NET_SCHED
        to->tc_index = from->tc_index;
#endif
        nf_copy(to, from);
        skb_copy_secmark(to, from);
}

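/*
 * Fragment an IPv6 packet and pass each fragment to @output. The fast
 * path reuses an existing frag_list when its geometry already matches
 * the MTU; otherwise the slow path allocates a fresh skb per fragment
 * and copies the payload in 8-byte-aligned chunks.
 */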
int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
                 int (*output)(struct net *, struct sock *, struct sk_buff *))
{
        struct sk_buff *frag;
        struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
        struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
                                inet6_sk(skb->sk) : NULL;
        struct ipv6hdr *tmp_hdr;
        struct frag_hdr *fh;
        unsigned int mtu, hlen, left, len, nexthdr_offset;
        int hroom, troom;
        __be32 frag_id;
        int ptr, offset = 0, err = 0;
        u8 *prevhdr, nexthdr = 0;

        err = ip6_find_1stfragopt(skb, &prevhdr);
        if (err < 0)
                goto fail;
        hlen = err;
        nexthdr = *prevhdr;
        nexthdr_offset = prevhdr - skb_network_header(skb);

        mtu = ip6_skb_dst_mtu(skb);

        /* We must not fragment if the socket is set to force MTU discovery
         * or if the skb is not generated by a local socket.
         */
        if (unlikely(!skb->ignore_df && skb->len > mtu))
                goto fail_toobig;

        if (IP6CB(skb)->frag_max_size) {
                if (IP6CB(skb)->frag_max_size > mtu)
                        goto fail_toobig;

                /* don't send fragments larger than what we received */
                mtu = IP6CB(skb)->frag_max_size;
                if (mtu < IPV6_MIN_MTU)
                        mtu = IPV6_MIN_MTU;
        }

        if (np && np->frag_size < mtu) {
                if (np->frag_size)
                        mtu = np->frag_size;
        }
        if (mtu < hlen + sizeof(struct frag_hdr) + 8)
                goto fail_toobig;
        mtu -= hlen + sizeof(struct frag_hdr);

        frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
                                    &ipv6_hdr(skb)->saddr);

        if (skb->ip_summed == CHECKSUM_PARTIAL &&
            (err = skb_checksum_help(skb)))
                goto fail;

        prevhdr = skb_network_header(skb) + nexthdr_offset;
        hroom = LL_RESERVED_SPACE(rt->dst.dev);
        if (skb_has_frag_list(skb)) {
                unsigned int first_len = skb_pagelen(skb);
                struct sk_buff *frag2;

                if (first_len - hlen > mtu ||
                    ((first_len - hlen) & 7) ||
                    skb_cloned(skb) ||
                    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
                        goto slow_path;

                skb_walk_frags(skb, frag) {
                        /* Correct geometry. */
                        if (frag->len > mtu ||
                            ((frag->len & 7) && frag->next) ||
                            skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
                                goto slow_path_clean;

                        /* Partially cloned skb? */
                        if (skb_shared(frag))
                                goto slow_path_clean;

                        BUG_ON(frag->sk);
                        if (skb->sk) {
                                frag->sk = skb->sk;
                                frag->destructor = sock_wfree;
                        }
                        skb->truesize -= frag->truesize;
                }

                err = 0;
                offset = 0;
                /* BUILD HEADER */

                *prevhdr = NEXTHDR_FRAGMENT;
                tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
                if (!tmp_hdr) {
                        err = -ENOMEM;
                        goto fail;
                }
                frag = skb_shinfo(skb)->frag_list;
                skb_frag_list_init(skb);

                __skb_pull(skb, hlen);
                fh = __skb_push(skb, sizeof(struct frag_hdr));
                __skb_push(skb, hlen);
                skb_reset_network_header(skb);
                memcpy(skb_network_header(skb), tmp_hdr, hlen);

                fh->nexthdr = nexthdr;
                fh->reserved = 0;
                fh->frag_off = htons(IP6_MF);
                fh->identification = frag_id;

                first_len = skb_pagelen(skb);
                skb->data_len = first_len - skb_headlen(skb);
                skb->len = first_len;
                ipv6_hdr(skb)->payload_len = htons(first_len -
                                                   sizeof(struct ipv6hdr));

                /* We prevent @rt from being freed. */
                rcu_read_lock();

                for (;;) {
                        /* Prepare header of the next frame,
                         * before previous one went down. */
                        if (frag) {
                                frag->ip_summed = CHECKSUM_NONE;
                                skb_reset_transport_header(frag);
                                fh = __skb_push(frag, sizeof(struct frag_hdr));
                                __skb_push(frag, hlen);
                                skb_reset_network_header(frag);
                                memcpy(skb_network_header(frag), tmp_hdr,
                                       hlen);
                                offset += skb->len - hlen - sizeof(struct frag_hdr);
                                fh->nexthdr = nexthdr;
                                fh->reserved = 0;
                                fh->frag_off = htons(offset);
                                if (frag->next)
                                        fh->frag_off |= htons(IP6_MF);
                                fh->identification = frag_id;
                                ipv6_hdr(frag)->payload_len =
                                                htons(frag->len -
                                                      sizeof(struct ipv6hdr));
                                ip6_copy_metadata(frag, skb);
                        }

                        err = output(net, sk, skb);
                        if (!err)
                                IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                                              IPSTATS_MIB_FRAGCREATES);

                        if (err || !frag)
                                break;

                        skb = frag;
                        frag = skb->next;
                        skb->next = NULL;
                }

                kfree(tmp_hdr);

                if (err == 0) {
                        IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                                      IPSTATS_MIB_FRAGOKS);
                        rcu_read_unlock();
                        return 0;
                }

                kfree_skb_list(frag);

                IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                              IPSTATS_MIB_FRAGFAILS);
                rcu_read_unlock();
                return err;

slow_path_clean:
                skb_walk_frags(skb, frag2) {
                        if (frag2 == frag)
                                break;
                        frag2->sk = NULL;
                        frag2->destructor = NULL;
                        skb->truesize += frag2->truesize;
                }
        }

slow_path:
        left = skb->len - hlen;         /* Space per frame */
        ptr = hlen;                     /* Where to start from */

        /*
         *      Fragment the datagram.
         */

        troom = rt->dst.dev->needed_tailroom;

        /*
         *      Keep copying data until we run out.
         */
        while (left > 0) {
                u8 *fragnexthdr_offset;

                len = left;
                /* IF: it doesn't fit, use 'mtu' - the data space left */
                if (len > mtu)
                        len = mtu;
                /* IF: we are not sending up to and including the packet end
                   then align the next start on an eight byte boundary */
                if (len < left) {
                        len &= ~7;
                }

                /* Allocate buffer */
                frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
                                 hroom + troom, GFP_ATOMIC);
                if (!frag) {
                        err = -ENOMEM;
                        goto fail;
                }

                /*
                 *      Set up data on packet
                 */

                ip6_copy_metadata(frag, skb);
                skb_reserve(frag, hroom);
                skb_put(frag, len + hlen + sizeof(struct frag_hdr));
                skb_reset_network_header(frag);
                fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
                frag->transport_header = (frag->network_header + hlen +
                                          sizeof(struct frag_hdr));

                /*
                 *      Charge the memory for the fragment to any owner
                 *      it might possess
                 */
                if (skb->sk)
                        skb_set_owner_w(frag, skb->sk);

                /*
                 *      Copy the packet header into the new buffer.
                 */
                skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

                fragnexthdr_offset = skb_network_header(frag);
                fragnexthdr_offset += prevhdr - skb_network_header(skb);
                *fragnexthdr_offset = NEXTHDR_FRAGMENT;

                /*
                 *      Build fragment header.
                 */
                fh->nexthdr = nexthdr;
                fh->reserved = 0;
                fh->identification = frag_id;

                /*
                 *      Copy a block of the IP datagram.
                 */
                BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
                                     len));
                left -= len;

                fh->frag_off = htons(offset);
                if (left > 0)
                        fh->frag_off |= htons(IP6_MF);
                ipv6_hdr(frag)->payload_len = htons(frag->len -
                                                    sizeof(struct ipv6hdr));

                ptr += len;
                offset += len;

                /*
                 *      Put this fragment into the sending queue.
                 */
                err = output(net, sk, frag);
                if (err)
                        goto fail;

                IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                              IPSTATS_MIB_FRAGCREATES);
        }
        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                      IPSTATS_MIB_FRAGOKS);
        consume_skb(skb);
        return err;

fail_toobig:
        if (skb->sk && dst_allfrag(skb_dst(skb)))
                sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);

        icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
        err = -EMSGSIZE;

fail:
        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                      IPSTATS_MIB_FRAGFAILS);
        kfree_skb(skb);
        return err;
}

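/*
 * Helper for ip6_sk_dst_check(): returns nonzero when the route key is
 * not a matching /128 host route for @fl_addr and @fl_addr does not
 * match the cached address either, i.e. the cached route is stale.
 */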
static inline int ip6_rt_check(const struct rt6key *rt_key,
                               const struct in6_addr *fl_addr,
                               const struct in6_addr *addr_cache)
{
        return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
                (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
}

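/*
 * Validate a dst cached on the socket against @fl6: return the dst if
 * it is still usable for this flow, or NULL (after dropping the
 * reference) when it is not.
 */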
static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
                                          struct dst_entry *dst,
                                          const struct flowi6 *fl6)
{
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct rt6_info *rt;

        if (!dst)
                goto out;

        if (dst->ops->family != AF_INET6) {
                dst_release(dst);
                return NULL;
        }

        rt = (struct rt6_info *)dst;
        /* Yes, checking route validity in the unconnected
         * case is not very simple. Take into account
         * that we do not support routing by source, TOS,
         * and MSG_DONTROUTE            --ANK (980726)
         *
         * 1. ip6_rt_check(): If route was host route,
         *    check that cached destination is current.
         *    If it is network route, we still may
         *    check its validity using saved pointer
         *    to the last used address: daddr_cache.
         *    We do not want to save whole address now,
         *    (because main consumer of this service
         *    is tcp, which does not have this problem),
         *    so that the last trick works only on connected
         *    sockets.
         * 2. oif also should be the same.
         */
        if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
            ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
           (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
              (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
                dst_release(dst);
                dst = NULL;
        }

out:
        return dst;
}

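/*
 * Common tail of the dst lookup helpers below: resolve a route and a
 * source address for @fl6, rerouting via the default router when the
 * chosen source address is still optimistic (CONFIG_IPV6_OPTIMISTIC_DAD)
 * and the nexthop neighbour is not yet valid.
 */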
static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
                               struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
        struct neighbour *n;
        struct rt6_info *rt;
#endif
        int err;
        int flags = 0;

        /* The correct way to handle this would be to do
         * ip6_route_get_saddr, and then ip6_route_output; however,
         * the route-specific preferred source forces the
         * ip6_route_output call _before_ ip6_route_get_saddr.
         *
         * In source specific routing (no src=any default route),
         * ip6_route_output will fail given src=any saddr, though, so
         * that's why we try it again later.
         */
        if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
                struct rt6_info *rt;
                bool had_dst = *dst != NULL;

                if (!had_dst)
                        *dst = ip6_route_output(net, sk, fl6);
                rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
                err = ip6_route_get_saddr(net, rt, &fl6->daddr,
                                          sk ? inet6_sk(sk)->srcprefs : 0,
                                          &fl6->saddr);
                if (err)
                        goto out_err_release;

                /* If we had an erroneous initial result, pretend it
                 * never existed and let the SA-enabled version take
                 * over.
                 */
                if (!had_dst && (*dst)->error) {
                        dst_release(*dst);
                        *dst = NULL;
                }

                if (fl6->flowi6_oif)
                        flags |= RT6_LOOKUP_F_IFACE;
        }

        if (!*dst)
                *dst = ip6_route_output_flags(net, sk, fl6, flags);

        err = (*dst)->error;
        if (err)
                goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
        /*
         * Here if the dst entry we've looked up
         * has a neighbour entry that is in the INCOMPLETE
         * state and the src address from the flow is
         * marked as OPTIMISTIC, we release the found
         * dst entry and replace it instead with the
         * dst entry of the nexthop router
         */
        rt = (struct rt6_info *) *dst;
        rcu_read_lock_bh();
        n = __ipv6_neigh_lookup_noref(rt->dst.dev,
                                      rt6_nexthop(rt, &fl6->daddr));
        err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
        rcu_read_unlock_bh();

        if (err) {
                struct inet6_ifaddr *ifp;
                struct flowi6 fl_gw6;
                int redirect;

                ifp = ipv6_get_ifaddr(net, &fl6->saddr,
                                      (*dst)->dev, 1);

                redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
                if (ifp)
                        in6_ifa_put(ifp);

                if (redirect) {
                        /*
                         * We need to get the dst entry for the
                         * default router instead
                         */
                        dst_release(*dst);
                        memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
                        memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
                        *dst = ip6_route_output(net, sk, &fl_gw6);
                        err = (*dst)->error;
                        if (err)
                                goto out_err_release;
                }
        }
#endif
        if (ipv6_addr_v4mapped(&fl6->saddr) &&
            !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
                err = -EAFNOSUPPORT;
                goto out_err_release;
        }

        return 0;

out_err_release:
        dst_release(*dst);
        *dst = NULL;

        if (err == -ENETUNREACH)
                IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
        return err;
}

/**
 *      ip6_dst_lookup - perform route lookup on flow
 *      @net: network namespace of the lookup
 *      @sk: socket which provides route info
 *      @dst: pointer to dst_entry * for result
 *      @fl6: flow to lookup
 *
 *      This function performs a route lookup on the given flow.
 *
 *      It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
                   struct flowi6 *fl6)
{
        *dst = NULL;
        return ip6_dst_lookup_tail(net, sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 *      @net: network namespace of the lookup
 *      @sk: socket which provides route info
 *      @fl6: flow to lookup
 *      @final_dst: final destination address for ipsec lookup
 *
 *      This function performs a route lookup on the given flow.
 *
 *      It returns a valid dst pointer on success, or a pointer encoded
 *      error code.
 */
struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
                                      const struct in6_addr *final_dst)
{
        struct dst_entry *dst = NULL;
        int err;

        err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
        if (err)
                return ERR_PTR(err);
        if (final_dst)
                fl6->daddr = *final_dst;

        return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);

/**
 *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 *      @sk: socket which provides the dst cache and route info
 *      @fl6: flow to lookup
 *      @final_dst: final destination address for ipsec lookup
 *
 *      This function performs a route lookup on the given flow with the
 *      possibility of using the cached route in the socket if it is valid.
 *      It will take the socket dst lock when operating on the dst cache.
 *      As a result, this function can only be used in process context.
 *
 *      It returns a valid dst pointer on success, or a pointer encoded
 *      error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
                                         const struct in6_addr *final_dst)
{
        struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);

        dst = ip6_sk_dst_check(sk, dst, fl6);
        if (!dst)
                dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);

        return dst;
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);

static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
                                               gfp_t gfp)
{
        return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
                                                gfp_t gfp)
{
        return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

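/*
 * Recompute mtu/maxfraglen for ip6_append_data() once the first
 * fragment has been emitted: header_len is only reserved in the first
 * fragment, later fragments may use that space as data space.
 */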
static void ip6_append_data_mtu(unsigned int *mtu,
                                int *maxfraglen,
                                unsigned int fragheaderlen,
                                struct sk_buff *skb,
                                struct rt6_info *rt,
                                unsigned int orig_mtu)
{
        if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
                if (!skb) {
                        /* first fragment, reserve header_len */
                        *mtu = orig_mtu - rt->dst.header_len;

                } else {
                        /*
                         * this fragment is not first, the headers
                         * space is regarded as data space.
                         */
                        *mtu = orig_mtu;
                }
                *maxfraglen = ((*mtu - fragheaderlen) & ~7)
                              + fragheaderlen - sizeof(struct frag_hdr);
        }
}

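/*
 * Initialise the cork for a corked send: duplicate the tx options so
 * they outlive the caller, take a reference on the route and record
 * hop limit, traffic class and fragment size.
 */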
static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
                          struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
                          struct rt6_info *rt, struct flowi6 *fl6)
{
        struct ipv6_pinfo *np = inet6_sk(sk);
        unsigned int mtu;
        struct ipv6_txoptions *opt = ipc6->opt;

        /*
         * setup for corking
         */
        if (opt) {
                if (WARN_ON(v6_cork->opt))
                        return -EINVAL;

                v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
                if (unlikely(!v6_cork->opt))
                        return -ENOBUFS;

                v6_cork->opt->tot_len = sizeof(*opt);
                v6_cork->opt->opt_flen = opt->opt_flen;
                v6_cork->opt->opt_nflen = opt->opt_nflen;

                v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
                                                    sk->sk_allocation);
                if (opt->dst0opt && !v6_cork->opt->dst0opt)
                        return -ENOBUFS;

                v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
                                                    sk->sk_allocation);
                if (opt->dst1opt && !v6_cork->opt->dst1opt)
                        return -ENOBUFS;

                v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
                                                   sk->sk_allocation);
                if (opt->hopopt && !v6_cork->opt->hopopt)
                        return -ENOBUFS;

                v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
                                                    sk->sk_allocation);
                if (opt->srcrt && !v6_cork->opt->srcrt)
                        return -ENOBUFS;

                /* need source address above miyazawa */
        }
        dst_hold(&rt->dst);
        cork->base.dst = &rt->dst;
        cork->fl.u.ip6 = *fl6;
        v6_cork->hop_limit = ipc6->hlimit;
        v6_cork->tclass = ipc6->tclass;
        if (rt->dst.flags & DST_XFRM_TUNNEL)
                mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
                      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
        else
                mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
                      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(rt->dst.path);
        if (np->frag_size < mtu) {
                if (np->frag_size)
                        mtu = np->frag_size;
        }
        cork->base.fragsize = mtu;
        if (dst_allfrag(rt->dst.path))
                cork->base.flags |= IPCORK_ALLFRAG;
        cork->base.length = 0;

        return 0;
}

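/*
 * Workhorse behind ip6_append_data(): grow the queued packet (or start
 * a new one) by @length bytes supplied via @getfrag, splitting the
 * data into MTU-sized fragments as needed.
 */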
1282 static int __ip6_append_data(struct sock *sk,
1283                              struct flowi6 *fl6,
1284                              struct sk_buff_head *queue,
1285                              struct inet_cork *cork,
1286                              struct inet6_cork *v6_cork,
1287                              struct page_frag *pfrag,
1288                              int getfrag(void *from, char *to, int offset,
1289                                          int len, int odd, struct sk_buff *skb),
1290                              void *from, int length, int transhdrlen,
1291                              unsigned int flags, struct ipcm6_cookie *ipc6,
1292                              const struct sockcm_cookie *sockc)
1293 {
1294         struct sk_buff *skb, *skb_prev = NULL;
1295         unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1296         int exthdrlen = 0;
1297         int dst_exthdrlen = 0;
1298         int hh_len;
1299         int copy;
1300         int err;
1301         int offset = 0;
1302         __u8 tx_flags = 0;
1303         u32 tskey = 0;
1304         struct rt6_info *rt = (struct rt6_info *)cork->dst;
1305         struct ipv6_txoptions *opt = v6_cork->opt;
1306         int csummode = CHECKSUM_NONE;
1307         unsigned int maxnonfragsize, headersize;
1308
1309         skb = skb_peek_tail(queue);
1310         if (!skb) {
1311                 exthdrlen = opt ? opt->opt_flen : 0;
1312                 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1313         }
1314
1315         mtu = cork->fragsize;
1316         orig_mtu = mtu;
1317
1318         hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1319
1320         fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1321                         (opt ? opt->opt_nflen : 0);
1322
1323         headersize = sizeof(struct ipv6hdr) +
1324                      (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1325                      (dst_allfrag(&rt->dst) ?
1326                       sizeof(struct frag_hdr) : 0) +
1327                      rt->rt6i_nfheader_len;
1328
1329         if (mtu <= fragheaderlen ||
1330             ((mtu - fragheaderlen) & ~7) + fragheaderlen <= sizeof(struct frag_hdr))
1331                 goto emsgsize;
1332
1333         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1334                      sizeof(struct frag_hdr);
1335
1336         /* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
1337          * the first fragment
1338          */
1339         if (headersize + transhdrlen > mtu)
1340                 goto emsgsize;
1341
1342         if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1343             (sk->sk_protocol == IPPROTO_UDP ||
1344              sk->sk_protocol == IPPROTO_RAW)) {
1345                 ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1346                                 sizeof(struct ipv6hdr));
1347                 goto emsgsize;
1348         }
1349
1350         if (ip6_sk_ignore_df(sk))
1351                 maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1352         else
1353                 maxnonfragsize = mtu;
1354
1355         if (cork->length + length > maxnonfragsize - headersize) {
1356 emsgsize:
1357                 pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1358                 ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
1359                 return -EMSGSIZE;
1360         }
1361
1362         /* CHECKSUM_PARTIAL only with no extension headers and when
1363          * we are not going to fragment
1364          */
1365         if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1366             headersize == sizeof(struct ipv6hdr) &&
1367             length <= mtu - headersize &&
1368             !(flags & MSG_MORE) &&
1369             rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1370                 csummode = CHECKSUM_PARTIAL;
1371
1372         if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
1373                 sock_tx_timestamp(sk, sockc->tsflags, &tx_flags);
1374                 if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
1375                     sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1376                         tskey = sk->sk_tskey++;
1377         }
1378
1379         /*
1380          * Let's try using as much space as possible.
1381          * Use MTU if total length of the message fits into the MTU.
1382          * Otherwise, we need to reserve fragment header and
1383          * fragment alignment (= 8-15 octects, in total).
1384          *
1385          * Note that we may need to "move" the data from the tail of
1386          * of the buffer to the new fragment when we split
1387          * the message.
1388          *
1389          * FIXME: It may be fragmented into multiple chunks
1390          *        at once if non-fragmentable extension headers
1391          *        are too large.
1392          * --yoshfuji
1393          */
1394
1395         cork->length += length;
1396         if (!skb)
1397                 goto alloc_new_skb;
1398
1399         while (length > 0) {
1400                 /* Check if the remaining data fits into current packet. */
1401                 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1402                 if (copy < length)
1403                         copy = maxfraglen - skb->len;
1404
1405                 if (copy <= 0) {
1406                         char *data;
1407                         unsigned int datalen;
1408                         unsigned int fraglen;
1409                         unsigned int fraggap;
1410                         unsigned int alloclen;
1411 alloc_new_skb:
1412                         /* There's no room in the current skb */
1413                         if (skb)
1414                                 fraggap = skb->len - maxfraglen;
1415                         else
1416                                 fraggap = 0;
1417                         /* update mtu and maxfraglen if necessary */
1418                         if (!skb || !skb_prev)
1419                                 ip6_append_data_mtu(&mtu, &maxfraglen,
1420                                                     fragheaderlen, skb, rt,
1421                                                     orig_mtu);
1422
1423                         skb_prev = skb;
1424
1425                         /*
1426                          * If remaining data exceeds the mtu,
1427                          * we know we need more fragment(s).
1428                          */
1429                         datalen = length + fraggap;
1430
1431                         if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1432                                 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
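                             /* Without scatter-gather, later MSG_MORE appends
                              * must be copied into this skb's linear tailroom,
                              * so allocate a full MTU now even if this write
                              * is shorter.
                              */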
1433                         if ((flags & MSG_MORE) &&
1434                             !(rt->dst.dev->features&NETIF_F_SG))
1435                                 alloclen = mtu;
1436                         else
1437                                 alloclen = datalen + fragheaderlen;
1438
1439                         alloclen += dst_exthdrlen;
1440
1441                         if (datalen != length + fraggap) {
1442                                 /*
1443                                  * this is not the last fragment; the trailer
1444                                  * space is regarded as data space.
1445                                  */
1446                                 datalen += rt->dst.trailer_len;
1447                         }
1448
1449                         alloclen += rt->dst.trailer_len;
1450                         fraglen = datalen + fragheaderlen;
1451
1452                         /*
1453                          * We just reserve space for the fragment header.
1454                          * Note: this may be an overallocation if the message
1455                          * (without MSG_MORE) fits into the MTU.
1456                          */
1457                         alloclen += sizeof(struct frag_hdr);
1458
1459                         copy = datalen - transhdrlen - fraggap;
1460                         if (copy < 0) {
1461                                 err = -EINVAL;
1462                                 goto error;
1463                         }
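                             /* The first skb (transhdrlen != 0) may sleep
                              * waiting for send-buffer space; follow-on
                              * fragments are only charged to the socket and
                              * give up with ENOBUFS once sk_wmem_alloc
                              * exceeds twice sk_sndbuf.
                              */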
1464                         if (transhdrlen) {
1465                                 skb = sock_alloc_send_skb(sk,
1466                                                 alloclen + hh_len,
1467                                                 (flags & MSG_DONTWAIT), &err);
1468                         } else {
1469                                 skb = NULL;
1470                                 if (refcount_read(&sk->sk_wmem_alloc) <=
1471                                     2 * sk->sk_sndbuf)
1472                                         skb = sock_wmalloc(sk,
1473                                                            alloclen + hh_len, 1,
1474                                                            sk->sk_allocation);
1475                                 if (unlikely(!skb))
1476                                         err = -ENOBUFS;
1477                         }
1478                         if (!skb)
1479                                 goto error;
1480                         /*
1481                          *      Fill in the control structures
1482                          */
1483                         skb->protocol = htons(ETH_P_IPV6);
1484                         skb->ip_summed = csummode;
1485                         skb->csum = 0;
1486                         /* reserve room for the fragment header and ipsec headers */
1487                         skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1488                                     dst_exthdrlen);
1489
1490                         /* Only the initial fragment is time stamped */
1491                         skb_shinfo(skb)->tx_flags = tx_flags;
1492                         tx_flags = 0;
1493                         skb_shinfo(skb)->tskey = tskey;
1494                         tskey = 0;
1495
1496                         /*
1497                          *      Find where to start putting bytes
1498                          */
1499                         data = skb_put(skb, fraglen);
1500                         skb_set_network_header(skb, exthdrlen);
1501                         data += fragheaderlen;
1502                         skb->transport_header = (skb->network_header +
1503                                                  fragheaderlen);
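                             /* Pull the overhanging tail of the previous skb
                              * into this fragment, moving its checksum
                              * contribution along with the data.
                              */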
1504                         if (fraggap) {
1505                                 skb->csum = skb_copy_and_csum_bits(
1506                                         skb_prev, maxfraglen,
1507                                         data + transhdrlen, fraggap, 0);
1508                                 skb_prev->csum = csum_sub(skb_prev->csum,
1509                                                           skb->csum);
1510                                 data += fraggap;
1511                                 pskb_trim_unique(skb_prev, maxfraglen);
1512                         }
1513                         if (copy > 0 &&
1514                             getfrag(from, data + transhdrlen, offset,
1515                                     copy, fraggap, skb) < 0) {
1516                                 err = -EFAULT;
1517                                 kfree_skb(skb);
1518                                 goto error;
1519                         }
1520
1521                         offset += copy;
1522                         length -= datalen - fraggap;
1523                         transhdrlen = 0;
1524                         exthdrlen = 0;
1525                         dst_exthdrlen = 0;
1526
1527                         if ((flags & MSG_CONFIRM) && !skb_prev)
1528                                 skb_set_dst_pending_confirm(skb, 1);
1529
1530                         /*
1531                          * Put the packet on the pending queue
1532                          */
1533                         __skb_queue_tail(queue, skb);
1534                         continue;
1535                 }
1536
1537                 if (copy > length)
1538                         copy = length;
1539
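                     /* Room left in the current skb: copy into linear
                      * tailroom when the device cannot do scatter-gather,
                      * otherwise land the bytes in the per-socket page
                      * fragment and hang it off the skb as a paged frag.
                      */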
1540                 if (!(rt->dst.dev->features&NETIF_F_SG) &&
1541                     skb_tailroom(skb) >= copy) {
1542                         unsigned int off;
1543
1544                         off = skb->len;
1545                         if (getfrag(from, skb_put(skb, copy),
1546                                                 offset, copy, off, skb) < 0) {
1547                                 __skb_trim(skb, off);
1548                                 err = -EFAULT;
1549                                 goto error;
1550                         }
1551                 } else {
1552                         int i = skb_shinfo(skb)->nr_frags;
1553
1554                         err = -ENOMEM;
1555                         if (!sk_page_frag_refill(sk, pfrag))
1556                                 goto error;
1557
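                             /* Coalesce with the last frag when the new bytes
                              * are contiguous with it; otherwise claim a fresh
                              * frag slot, bounded by MAX_SKB_FRAGS.
                              */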
1558                         if (!skb_can_coalesce(skb, i, pfrag->page,
1559                                               pfrag->offset)) {
1560                                 err = -EMSGSIZE;
1561                                 if (i == MAX_SKB_FRAGS)
1562                                         goto error;
1563
1564                                 __skb_fill_page_desc(skb, i, pfrag->page,
1565                                                      pfrag->offset, 0);
1566                                 skb_shinfo(skb)->nr_frags = ++i;
1567                                 get_page(pfrag->page);
1568                         }
1569                         copy = min_t(int, copy, pfrag->size - pfrag->offset);
1570                         if (getfrag(from,
1571                                     page_address(pfrag->page) + pfrag->offset,
1572                                     offset, copy, skb->len, skb) < 0)
1573                                 goto error_efault;
1574
1575                         pfrag->offset += copy;
1576                         skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1577                         skb->len += copy;
1578                         skb->data_len += copy;
1579                         skb->truesize += copy;
1580                         refcount_add(copy, &sk->sk_wmem_alloc);
1581                 }
1582                 offset += copy;
1583                 length -= copy;
1584         }
1585
1586         return 0;
1587
1588 error_efault:
1589         err = -EFAULT;
1590 error:
1591         cork->length -= length;
1592         IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1593         return err;
1594 }
1595
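/*
 *      ip6_append_data() buffers data on sk->sk_write_queue for a corked
 *      socket; the packet is only built and sent later, typically by
 *      ip6_push_pending_frames().  getfrag() copies the caller's data and
 *      must return a negative value on fault.
 */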
1596 int ip6_append_data(struct sock *sk,
1597                     int getfrag(void *from, char *to, int offset, int len,
1598                                 int odd, struct sk_buff *skb),
1599                     void *from, int length, int transhdrlen,
1600                     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1601                     struct rt6_info *rt, unsigned int flags,
1602                     const struct sockcm_cookie *sockc)
1603 {
1604         struct inet_sock *inet = inet_sk(sk);
1605         struct ipv6_pinfo *np = inet6_sk(sk);
1606         int exthdrlen;
1607         int err;
1608
1609         if (flags&MSG_PROBE)
1610                 return 0;
1611         if (skb_queue_empty(&sk->sk_write_queue)) {
1612                 /*
1613                  * setup for corking
1614                  */
1615                 err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1616                                      ipc6, rt, fl6);
1617                 if (err)
1618                         return err;
1619
1620                 exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1621                 length += exthdrlen;
1622                 transhdrlen += exthdrlen;
1623         } else {
1624                 fl6 = &inet->cork.fl.u.ip6;
1625                 transhdrlen = 0;
1626         }
1627
1628         return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1629                                  &np->cork, sk_page_frag(sk), getfrag,
1630                                  from, length, transhdrlen, flags, ipc6, sockc);
1631 }
1632 EXPORT_SYMBOL_GPL(ip6_append_data);
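
/*
 * Sketch of the corked datagram pattern (datagram protocols such as UDPv6
 * wrap the final push so their transport header gets filled in):
 *
 *      err = ip6_append_data(sk, getfrag, msg, len, transhdrlen,
 *                            &ipc6, &fl6, rt, msg->msg_flags, &sockc);
 *      if (err)
 *              ip6_flush_pending_frames(sk);
 *      else if (!(msg->msg_flags & MSG_MORE))
 *              err = ip6_push_pending_frames(sk);
 */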
1633
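/*
 *      ip6_cork_release() frees the per-cork copies of the tx options and
 *      drops the cached route so the socket can start a fresh corked
 *      sequence.
 */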
1634 static void ip6_cork_release(struct inet_cork_full *cork,
1635                              struct inet6_cork *v6_cork)
1636 {
1637         if (v6_cork->opt) {
1638                 kfree(v6_cork->opt->dst0opt);
1639                 kfree(v6_cork->opt->dst1opt);
1640                 kfree(v6_cork->opt->hopopt);
1641                 kfree(v6_cork->opt->srcrt);
1642                 kfree(v6_cork->opt);
1643                 v6_cork->opt = NULL;
1644         }
1645
1646         if (cork->base.dst) {
1647                 dst_release(cork->base.dst);
1648                 cork->base.dst = NULL;
1649                 cork->base.flags &= ~IPCORK_ALLFRAG;
1650         }
1651         memset(&cork->fl, 0, sizeof(cork->fl));
1652 }
1653
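/*
 *      __ip6_make_skb() collapses the queued skbs into the first one,
 *      chaining the rest via frag_list, pushes any extension headers and
 *      prepends the IPv6 header.  If the result exceeds the path MTU,
 *      ip6_fragment() later splits it along these frag_list boundaries.
 */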
1654 struct sk_buff *__ip6_make_skb(struct sock *sk,
1655                                struct sk_buff_head *queue,
1656                                struct inet_cork_full *cork,
1657                                struct inet6_cork *v6_cork)
1658 {
1659         struct sk_buff *skb, *tmp_skb;
1660         struct sk_buff **tail_skb;
1661         struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1662         struct ipv6_pinfo *np = inet6_sk(sk);
1663         struct net *net = sock_net(sk);
1664         struct ipv6hdr *hdr;
1665         struct ipv6_txoptions *opt = v6_cork->opt;
1666         struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1667         struct flowi6 *fl6 = &cork->fl.u.ip6;
1668         unsigned char proto = fl6->flowi6_proto;
1669
1670         skb = __skb_dequeue(queue);
1671         if (!skb)
1672                 goto out;
1673         tail_skb = &(skb_shinfo(skb)->frag_list);
1674
1675         /* move skb->data from the ext header area to the IP header */
1676         if (skb->data < skb_network_header(skb))
1677                 __skb_pull(skb, skb_network_offset(skb));
1678         while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1679                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1680                 *tail_skb = tmp_skb;
1681                 tail_skb = &(tmp_skb->next);
1682                 skb->len += tmp_skb->len;
1683                 skb->data_len += tmp_skb->len;
1684                 skb->truesize += tmp_skb->truesize;
1685                 tmp_skb->destructor = NULL;
1686                 tmp_skb->sk = NULL;
1687         }
1688
1689         /* Allow local fragmentation. */
1690         skb->ignore_df = ip6_sk_ignore_df(sk);
1691
1692         *final_dst = fl6->daddr;
1693         __skb_pull(skb, skb_network_header_len(skb));
1694         if (opt && opt->opt_flen)
1695                 ipv6_push_frag_opts(skb, opt, &proto);
1696         if (opt && opt->opt_nflen)
1697                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1698
1699         skb_push(skb, sizeof(struct ipv6hdr));
1700         skb_reset_network_header(skb);
1701         hdr = ipv6_hdr(skb);
1702
1703         ip6_flow_hdr(hdr, v6_cork->tclass,
1704                      ip6_make_flowlabel(net, skb, fl6->flowlabel,
1705                                         ip6_autoflowlabel(net, np), fl6));
1706         hdr->hop_limit = v6_cork->hop_limit;
1707         hdr->nexthdr = proto;
1708         hdr->saddr = fl6->saddr;
1709         hdr->daddr = *final_dst;
1710
1711         skb->priority = sk->sk_priority;
1712         skb->mark = sk->sk_mark;
1713
1714         skb_dst_set(skb, dst_clone(&rt->dst));
1715         IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1716         if (proto == IPPROTO_ICMPV6) {
1717                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1718
1719                 ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1720                 ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1721         }
1722
1723         ip6_cork_release(cork, v6_cork);
1724 out:
1725         return skb;
1726 }
1727
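/*
 *      ip6_send_skb() hands a finished packet to ip6_local_out() and folds
 *      positive NET_XMIT_* congestion codes into errno values.
 */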
1728 int ip6_send_skb(struct sk_buff *skb)
1729 {
1730         struct net *net = sock_net(skb->sk);
1731         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1732         int err;
1733
1734         err = ip6_local_out(net, skb->sk, skb);
1735         if (err) {
1736                 if (err > 0)
1737                         err = net_xmit_errno(err);
1738                 if (err)
1739                         IP6_INC_STATS(net, rt->rt6i_idev,
1740                                       IPSTATS_MIB_OUTDISCARDS);
1741         }
1742
1743         return err;
1744 }
1745
1746 int ip6_push_pending_frames(struct sock *sk)
1747 {
1748         struct sk_buff *skb;
1749
1750         skb = ip6_finish_skb(sk);
1751         if (!skb)
1752                 return 0;
1753
1754         return ip6_send_skb(skb);
1755 }
1756 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1757
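/*
 *      Drop any queued, unsent skbs (counting them as output discards) and
 *      release the cork state.
 */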
1758 static void __ip6_flush_pending_frames(struct sock *sk,
1759                                        struct sk_buff_head *queue,
1760                                        struct inet_cork_full *cork,
1761                                        struct inet6_cork *v6_cork)
1762 {
1763         struct sk_buff *skb;
1764
1765         while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1766                 if (skb_dst(skb))
1767                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1768                                       IPSTATS_MIB_OUTDISCARDS);
1769                 kfree_skb(skb);
1770         }
1771
1772         ip6_cork_release(cork, v6_cork);
1773 }
1774
1775 void ip6_flush_pending_frames(struct sock *sk)
1776 {
1777         __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1778                                    &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1779 }
1780 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1781
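/*
 *      ip6_make_skb() is the uncorked fast path: it builds the complete
 *      datagram on a private queue in a single call, leaving no pending
 *      state on the socket, and hands back the skb for ip6_send_skb().
 */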
1782 struct sk_buff *ip6_make_skb(struct sock *sk,
1783                              int getfrag(void *from, char *to, int offset,
1784                                          int len, int odd, struct sk_buff *skb),
1785                              void *from, int length, int transhdrlen,
1786                              struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1787                              struct rt6_info *rt, unsigned int flags,
1788                              const struct sockcm_cookie *sockc)
1789 {
1790         struct inet_cork_full cork;
1791         struct inet6_cork v6_cork;
1792         struct sk_buff_head queue;
1793         int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1794         int err;
1795
1796         if (flags & MSG_PROBE)
1797                 return NULL;
1798
1799         __skb_queue_head_init(&queue);
1800
1801         cork.base.flags = 0;
1802         cork.base.addr = 0;
1803         cork.base.opt = NULL;
1804         cork.base.dst = NULL;
1805         v6_cork.opt = NULL;
1806         err = ip6_setup_cork(sk, &cork, &v6_cork, ipc6, rt, fl6);
1807         if (err) {
1808                 ip6_cork_release(&cork, &v6_cork);
1809                 return ERR_PTR(err);
1810         }
1811         if (ipc6->dontfrag < 0)
1812                 ipc6->dontfrag = inet6_sk(sk)->dontfrag;
1813
1814         err = __ip6_append_data(sk, fl6, &queue, &cork.base, &v6_cork,
1815                                 &current->task_frag, getfrag, from,
1816                                 length + exthdrlen, transhdrlen + exthdrlen,
1817                                 flags, ipc6, sockc);
1818         if (err) {
1819                 __ip6_flush_pending_frames(sk, &queue, &cork, &v6_cork);
1820                 return ERR_PTR(err);
1821         }
1822
1823         return __ip6_make_skb(sk, &queue, &cork, &v6_cork);
1824 }