GNU Linux-libre 4.14.251-gnu1
[releases.git] / net / ipv6 / ip6_output.c
1 /*
2  *      IPv6 output functions
3  *      Linux INET6 implementation
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      Based on linux/net/ipv4/ip_output.c
9  *
10  *      This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  *
15  *      Changes:
16  *      A.N.Kuznetsov   :       airthmetics in fragmentation.
17  *                              extension headers are implemented.
18  *                              route changes now work.
19  *                              ip6_forward does not confuse sniffers.
20  *                              etc.
21  *
22  *      H. von Brand    :       Added missing #include <linux/string.h>
23  *      Imran Patel     :       frag id should be in NBO
24  *      Kazunori MIYAZAWA @USAGI
25  *                      :       add ip6_append_data and related functions
26  *                              for datagram xmit
27  */
28
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
41
42 #include <linux/bpf-cgroup.h>
43 #include <linux/netfilter.h>
44 #include <linux/netfilter_ipv6.h>
45
46 #include <net/sock.h>
47 #include <net/snmp.h>
48
49 #include <net/ipv6.h>
50 #include <net/ndisc.h>
51 #include <net/protocol.h>
52 #include <net/ip6_route.h>
53 #include <net/addrconf.h>
54 #include <net/rawv6.h>
55 #include <net/icmp.h>
56 #include <net/xfrm.h>
57 #include <net/checksum.h>
58 #include <linux/mroute6.h>
59 #include <net/l3mdev.h>
60 #include <net/lwtunnel.h>
61
/* Final IPv6 transmit step: resolve the L2 neighbour for the route's
 * nexthop and hand the skb to neigh_output().  Also handles multicast
 * loopback/scoping and lwtunnel output redirection on the way.
 *
 * Returns the device transmit result, 0 for a consumed packet, or
 * -EINVAL when no neighbour entry could be created (packet is freed).
 */
static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct neighbour *neigh;
	struct in6_addr *nexthop;
	int ret;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		/* Loop a copy back to local listeners when the socket asked
		 * for multicast loopback and either a multicast router is
		 * running (and the packet was not already forwarded) or this
		 * device is a member of the destination group.
		 */
		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
		    ((mroute6_socket(net, skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					net, sk, newskb, NULL, newskb->dev,
					dev_loopback_xmit);

			/* hop_limit 0: the looped-back copy above is all the
			 * sender wanted; do not transmit on the wire.
			 */
			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(net, idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);

		/* Node-local scope multicast must never leave the node. */
		if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
		    IPV6_ADDR_SCOPE_NODELOCAL &&
		    !(dev->flags & IFF_LOOPBACK)) {
			kfree_skb(skb);
			return 0;
		}
	}

	/* A light-weight tunnel may take over transmission entirely. */
	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
		int res = lwtunnel_xmit(skb);

		if (res < 0 || res == LWTUNNEL_XMIT_DONE)
			return res;
	}

	/* Neighbour lookup runs under rcu_read_lock_bh(); create an entry
	 * on demand if the nexthop is not yet resolved.
	 */
	rcu_read_lock_bh();
	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
	if (unlikely(!neigh))
		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
	if (!IS_ERR(neigh)) {
		sock_confirm_neigh(skb, neigh);
		ret = neigh_output(neigh, skb);
		rcu_read_unlock_bh();
		return ret;
	}
	rcu_read_unlock_bh();

	IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}
130
131 static int
132 ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
133                                     struct sk_buff *skb, unsigned int mtu)
134 {
135         struct sk_buff *segs, *nskb;
136         netdev_features_t features;
137         int ret = 0;
138
139         /* Please see corresponding comment in ip_finish_output_gso
140          * describing the cases where GSO segment length exceeds the
141          * egress MTU.
142          */
143         features = netif_skb_features(skb);
144         segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
145         if (IS_ERR_OR_NULL(segs)) {
146                 kfree_skb(skb);
147                 return -ENOMEM;
148         }
149
150         consume_skb(skb);
151
152         skb_list_walk_safe(segs, segs, nskb) {
153                 int err;
154
155                 skb_mark_not_on_list(segs);
156                 err = ip6_fragment(net, sk, segs, ip6_finish_output2);
157                 if (err && ret == 0)
158                         ret = err;
159         }
160
161         return ret;
162 }
163
/* Post-POSTROUTING output stage: run the cgroup egress BPF program,
 * then either fragment the packet or hand it to ip6_finish_output2().
 */
static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	unsigned int mtu;
	int ret;

	/* cgroup BPF egress filter may veto the packet. */
	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
	if (ret) {
		kfree_skb(skb);
		return ret;
	}

#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
	/* NOTE(review): the IPv4 IPCB/IPSKB_REROUTED macros are used here in
	 * IPv6 code; this matches mainline (both control blocks live in
	 * skb->cb), but confirm before changing.
	 */
	if (skb_dst(skb)->xfrm) {
		IPCB(skb)->flags |= IPSKB_REROUTED;
		return dst_output(net, sk, skb);
	}
#endif

	mtu = ip6_skb_dst_mtu(skb);
	/* GSO packets whose segments would exceed the egress MTU are
	 * software-segmented and fragmented segment by segment.
	 */
	if (skb_is_gso(skb) && !skb_gso_validate_mtu(skb, mtu))
		return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);

	if ((skb->len > mtu && !skb_is_gso(skb)) ||
	    dst_allfrag(skb_dst(skb)) ||
	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
		return ip6_fragment(net, sk, skb, ip6_finish_output2);
	else
		return ip6_finish_output2(net, sk, skb);
}
194
195 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
196 {
197         struct net_device *dev = skb_dst(skb)->dev;
198         struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
199
200         skb->protocol = htons(ETH_P_IPV6);
201         skb->dev = dev;
202
203         if (unlikely(idev->cnf.disable_ipv6)) {
204                 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
205                 kfree_skb(skb);
206                 return 0;
207         }
208
209         return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
210                             net, sk, skb, NULL, dev,
211                             ip6_finish_output,
212                             !(IP6CB(skb)->flags & IP6SKB_REROUTED));
213 }
214
215 bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
216 {
217         if (!np->autoflowlabel_set)
218                 return ip6_default_np_autolabel(net);
219         else
220                 return np->autoflowlabel;
221 }
222
/*
 * xmit an sk_buff (used by TCP, SCTP and DCCP)
 * Note : socket lock is not held for SYNACK packets, but might be modified
 * by calls to skb_set_owner_w() and ipv6_local_error(),
 * which are using proper atomic operations or spinlocks.
 *
 * Builds the IPv6 header (plus any extension headers in @opt) in front
 * of the payload already in @skb and sends it through NF_INET_LOCAL_OUT.
 * Returns 0/positive on success, -ENOBUFS on headroom allocation
 * failure, -EMSGSIZE when the packet exceeds the path MTU and may not
 * be fragmented.  The skb is consumed on every path.
 */
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     __u32 mark, struct ipv6_txoptions *opt, int tclass)
{
	struct net *net = sock_net(sk);
	const struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	unsigned int head_room;
	struct ipv6hdr *hdr;
	u8  proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int hlimit = -1;
	u32 mtu;

	/* Room needed in front of the payload: IPv6 header, link-layer
	 * header and any extension headers carried in @opt.
	 */
	head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
	if (opt)
		head_room += opt->opt_nflen + opt->opt_flen;

	if (unlikely(skb_headroom(skb) < head_room)) {
		struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
		if (!skb2) {
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
			kfree_skb(skb);
			return -ENOBUFS;
		}
		/* Keep socket write-memory accounting on the new skb. */
		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);
		consume_skb(skb);
		skb = skb2;
	}

	if (opt) {
		seg_len += opt->opt_nflen + opt->opt_flen;

		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);

		/* May rewrite first_hop when a routing header is present. */
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
					     &fl6->saddr);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
				ip6_autoflowlabel(net, np), fl6));

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	hdr->saddr = fl6->saddr;
	hdr->daddr = *first_hop;

	skb->protocol = htons(ETH_P_IPV6);
	skb->priority = sk->sk_priority;
	skb->mark = mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_OUT, skb->len);

		/* if egress device is enslaved to an L3 master device pass the
		 * skb to its handler for processing
		 */
		skb = l3mdev_ip6_out((struct sock *)sk, skb);
		if (unlikely(!skb))
			return 0;

		/* hooks should never assume socket lock is held.
		 * we promote our socket to non const
		 */
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
			       net, (struct sock *)sk, skb, NULL, dst->dev,
			       dst_output);
	}

	/* Too big and fragmentation not allowed: report EMSGSIZE. */
	skb->dev = dst->dev;
	/* ipv6_local_error() does not require socket lock,
	 * we promote our socket to non const
	 */
	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);

	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);
329
330 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
331 {
332         struct ip6_ra_chain *ra;
333         struct sock *last = NULL;
334
335         read_lock(&ip6_ra_lock);
336         for (ra = ip6_ra_chain; ra; ra = ra->next) {
337                 struct sock *sk = ra->sk;
338                 if (sk && ra->sel == sel &&
339                     (!sk->sk_bound_dev_if ||
340                      sk->sk_bound_dev_if == skb->dev->ifindex)) {
341                         if (last) {
342                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
343                                 if (skb2)
344                                         rawv6_rcv(last, skb2);
345                         }
346                         last = sk;
347                 }
348         }
349
350         if (last) {
351                 rawv6_rcv(last, skb);
352                 read_unlock(&ip6_ra_lock);
353                 return 1;
354         }
355         read_unlock(&ip6_ra_lock);
356         return 0;
357 }
358
359 static int ip6_forward_proxy_check(struct sk_buff *skb)
360 {
361         struct ipv6hdr *hdr = ipv6_hdr(skb);
362         u8 nexthdr = hdr->nexthdr;
363         __be16 frag_off;
364         int offset;
365
366         if (ipv6_ext_hdr(nexthdr)) {
367                 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
368                 if (offset < 0)
369                         return 0;
370         } else
371                 offset = sizeof(struct ipv6hdr);
372
373         if (nexthdr == IPPROTO_ICMPV6) {
374                 struct icmp6hdr *icmp6;
375
376                 if (!pskb_may_pull(skb, (skb_network_header(skb) +
377                                          offset + 1 - skb->data)))
378                         return 0;
379
380                 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
381
382                 switch (icmp6->icmp6_type) {
383                 case NDISC_ROUTER_SOLICITATION:
384                 case NDISC_ROUTER_ADVERTISEMENT:
385                 case NDISC_NEIGHBOUR_SOLICITATION:
386                 case NDISC_NEIGHBOUR_ADVERTISEMENT:
387                 case NDISC_REDIRECT:
388                         /* For reaction involving unicast neighbor discovery
389                          * message destined to the proxied address, pass it to
390                          * input function.
391                          */
392                         return 1;
393                 default:
394                         break;
395                 }
396         }
397
398         /*
399          * The proxying router can't forward traffic sent to a link-local
400          * address, so signal the sender and discard the packet. This
401          * behavior is clarified by the MIPv6 specification.
402          */
403         if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
404                 dst_link_failure(skb);
405                 return -1;
406         }
407
408         return 0;
409 }
410
411 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
412                                      struct sk_buff *skb)
413 {
414         struct dst_entry *dst = skb_dst(skb);
415
416         __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
417         __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
418
419         return dst_output(net, sk, skb);
420 }
421
422 static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
423 {
424         unsigned int mtu;
425         struct inet6_dev *idev;
426
427         if (dst_metric_locked(dst, RTAX_MTU)) {
428                 mtu = dst_metric_raw(dst, RTAX_MTU);
429                 if (mtu)
430                         return mtu;
431         }
432
433         mtu = IPV6_MIN_MTU;
434         rcu_read_lock();
435         idev = __in6_dev_get(dst->dev);
436         if (idev)
437                 mtu = idev->cnf.mtu6;
438         rcu_read_unlock();
439
440         return mtu;
441 }
442
443 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
444 {
445         if (skb->len <= mtu)
446                 return false;
447
448         /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
449         if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
450                 return true;
451
452         if (skb->ignore_df)
453                 return false;
454
455         if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu))
456                 return false;
457
458         return true;
459 }
460
/* Forward an IPv6 packet received on one interface out another.
 *
 * Applies the forwarding checks in order: forwarding enabled, packet
 * addressed to us at L2, not locally owned, xfrm FWD policy, router
 * alert delivery, hop limit, NDP proxying, xfrm rerouting, redirect
 * generation / source-address sanity, path MTU, then decrements the
 * hop limit and passes the packet through NF_INET_FORWARD.
 * The skb is consumed on every path.
 */
int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	u32 mtu;

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	/* Only packets addressed to this host at L2 are candidates. */
	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	/* Locally owned skbs must never take the forwarding path. */
	if (unlikely(skb->sk))
		goto drop;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We DO NOT make any processing on
	 *	RA packets, pushing them to user level AS IS
	 *	without ane WARRANTY that application will be able
	 *	to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not end-node, so that if packet contains
	 *	AH/ESP, we cannot make anything.
	 *	Defragmentation also would be mistake, RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		/* NOTE(review): ip6_forward_proxy_check() may pull skb data,
		 * which can reallocate the header; the cached hdr pointer is
		 * used again below — matches mainline, but worth confirming.
		 */
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			__IP6_INC_STATS(net, ip6_dst_idev(dst),
					IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	/* xfrm6_route_forward() may have installed a new dst. */
	dst = skb_dst(skb);

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	   We don't send redirects to frames decapsulated from IPsec.
	 */
	if (IP6CB(skb)->iif == dst->dev->ifindex &&
	    opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct inet_peer *peer;
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if (rt->rt6i_flags & RTF_GATEWAY)
			target = &rt->rt6i_gateway;
		else
			target = &hdr->daddr;

		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(peer, 1*HZ))
			ndisc_send_redirect(skb, target);
		if (peer)
			inet_putpeer(peer);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	mtu = ip6_dst_mtu_forward(dst);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (ip6_pkt_too_big(skb, mtu)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_INTOOBIGERRORS);
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	/* skb_cow() may have copied the header: re-read it. */
	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
		       net, NULL, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}
623
624 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
625 {
626         to->pkt_type = from->pkt_type;
627         to->priority = from->priority;
628         to->protocol = from->protocol;
629         skb_dst_drop(to);
630         skb_dst_set(to, dst_clone(skb_dst(from)));
631         to->dev = from->dev;
632         to->mark = from->mark;
633
634         skb_copy_hash(to, from);
635
636 #ifdef CONFIG_NET_SCHED
637         to->tc_index = from->tc_index;
638 #endif
639         nf_copy(to, from);
640         skb_copy_secmark(to, from);
641 }
642
643 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
644                  int (*output)(struct net *, struct sock *, struct sk_buff *))
645 {
646         struct sk_buff *frag;
647         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
648         struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
649                                 inet6_sk(skb->sk) : NULL;
650         struct ipv6hdr *tmp_hdr;
651         struct frag_hdr *fh;
652         unsigned int mtu, hlen, left, len, nexthdr_offset;
653         int hroom, troom;
654         __be32 frag_id;
655         int ptr, offset = 0, err = 0;
656         u8 *prevhdr, nexthdr = 0;
657
658         err = ip6_find_1stfragopt(skb, &prevhdr);
659         if (err < 0)
660                 goto fail;
661         hlen = err;
662         nexthdr = *prevhdr;
663         nexthdr_offset = prevhdr - skb_network_header(skb);
664
665         mtu = ip6_skb_dst_mtu(skb);
666
667         /* We must not fragment if the socket is set to force MTU discovery
668          * or if the skb it not generated by a local socket.
669          */
670         if (unlikely(!skb->ignore_df && skb->len > mtu))
671                 goto fail_toobig;
672
673         if (IP6CB(skb)->frag_max_size) {
674                 if (IP6CB(skb)->frag_max_size > mtu)
675                         goto fail_toobig;
676
677                 /* don't send fragments larger than what we received */
678                 mtu = IP6CB(skb)->frag_max_size;
679                 if (mtu < IPV6_MIN_MTU)
680                         mtu = IPV6_MIN_MTU;
681         }
682
683         if (np && np->frag_size < mtu) {
684                 if (np->frag_size)
685                         mtu = np->frag_size;
686         }
687         if (mtu < hlen + sizeof(struct frag_hdr) + 8)
688                 goto fail_toobig;
689         mtu -= hlen + sizeof(struct frag_hdr);
690
691         frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
692                                     &ipv6_hdr(skb)->saddr);
693
694         if (skb->ip_summed == CHECKSUM_PARTIAL &&
695             (err = skb_checksum_help(skb)))
696                 goto fail;
697
698         prevhdr = skb_network_header(skb) + nexthdr_offset;
699         hroom = LL_RESERVED_SPACE(rt->dst.dev);
700         if (skb_has_frag_list(skb)) {
701                 unsigned int first_len = skb_pagelen(skb);
702                 struct sk_buff *frag2;
703
704                 if (first_len - hlen > mtu ||
705                     ((first_len - hlen) & 7) ||
706                     skb_cloned(skb) ||
707                     skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
708                         goto slow_path;
709
710                 skb_walk_frags(skb, frag) {
711                         /* Correct geometry. */
712                         if (frag->len > mtu ||
713                             ((frag->len & 7) && frag->next) ||
714                             skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
715                                 goto slow_path_clean;
716
717                         /* Partially cloned skb? */
718                         if (skb_shared(frag))
719                                 goto slow_path_clean;
720
721                         BUG_ON(frag->sk);
722                         if (skb->sk) {
723                                 frag->sk = skb->sk;
724                                 frag->destructor = sock_wfree;
725                         }
726                         skb->truesize -= frag->truesize;
727                 }
728
729                 err = 0;
730                 offset = 0;
731                 /* BUILD HEADER */
732
733                 *prevhdr = NEXTHDR_FRAGMENT;
734                 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
735                 if (!tmp_hdr) {
736                         err = -ENOMEM;
737                         goto fail;
738                 }
739                 frag = skb_shinfo(skb)->frag_list;
740                 skb_frag_list_init(skb);
741
742                 __skb_pull(skb, hlen);
743                 fh = __skb_push(skb, sizeof(struct frag_hdr));
744                 __skb_push(skb, hlen);
745                 skb_reset_network_header(skb);
746                 memcpy(skb_network_header(skb), tmp_hdr, hlen);
747
748                 fh->nexthdr = nexthdr;
749                 fh->reserved = 0;
750                 fh->frag_off = htons(IP6_MF);
751                 fh->identification = frag_id;
752
753                 first_len = skb_pagelen(skb);
754                 skb->data_len = first_len - skb_headlen(skb);
755                 skb->len = first_len;
756                 ipv6_hdr(skb)->payload_len = htons(first_len -
757                                                    sizeof(struct ipv6hdr));
758
759                 for (;;) {
760                         /* Prepare header of the next frame,
761                          * before previous one went down. */
762                         if (frag) {
763                                 frag->ip_summed = CHECKSUM_NONE;
764                                 skb_reset_transport_header(frag);
765                                 fh = __skb_push(frag, sizeof(struct frag_hdr));
766                                 __skb_push(frag, hlen);
767                                 skb_reset_network_header(frag);
768                                 memcpy(skb_network_header(frag), tmp_hdr,
769                                        hlen);
770                                 offset += skb->len - hlen - sizeof(struct frag_hdr);
771                                 fh->nexthdr = nexthdr;
772                                 fh->reserved = 0;
773                                 fh->frag_off = htons(offset);
774                                 if (frag->next)
775                                         fh->frag_off |= htons(IP6_MF);
776                                 fh->identification = frag_id;
777                                 ipv6_hdr(frag)->payload_len =
778                                                 htons(frag->len -
779                                                       sizeof(struct ipv6hdr));
780                                 ip6_copy_metadata(frag, skb);
781                         }
782
783                         err = output(net, sk, skb);
784                         if (!err)
785                                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
786                                               IPSTATS_MIB_FRAGCREATES);
787
788                         if (err || !frag)
789                                 break;
790
791                         skb = frag;
792                         frag = skb->next;
793                         skb->next = NULL;
794                 }
795
796                 kfree(tmp_hdr);
797
798                 if (err == 0) {
799                         IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
800                                       IPSTATS_MIB_FRAGOKS);
801                         return 0;
802                 }
803
804                 kfree_skb_list(frag);
805
806                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
807                               IPSTATS_MIB_FRAGFAILS);
808                 return err;
809
810 slow_path_clean:
811                 skb_walk_frags(skb, frag2) {
812                         if (frag2 == frag)
813                                 break;
814                         frag2->sk = NULL;
815                         frag2->destructor = NULL;
816                         skb->truesize += frag2->truesize;
817                 }
818         }
819
820 slow_path:
821         left = skb->len - hlen;         /* Space per frame */
822         ptr = hlen;                     /* Where to start from */
823
824         /*
825          *      Fragment the datagram.
826          */
827
828         troom = rt->dst.dev->needed_tailroom;
829
830         /*
831          *      Keep copying data until we run out.
832          */
833         while (left > 0)        {
834                 u8 *fragnexthdr_offset;
835
836                 len = left;
837                 /* IF: it doesn't fit, use 'mtu' - the data space left */
838                 if (len > mtu)
839                         len = mtu;
840                 /* IF: we are not sending up to and including the packet end
841                    then align the next start on an eight byte boundary */
842                 if (len < left) {
843                         len &= ~7;
844                 }
845
846                 /* Allocate buffer */
847                 frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
848                                  hroom + troom, GFP_ATOMIC);
849                 if (!frag) {
850                         err = -ENOMEM;
851                         goto fail;
852                 }
853
854                 /*
855                  *      Set up data on packet
856                  */
857
858                 ip6_copy_metadata(frag, skb);
859                 skb_reserve(frag, hroom);
860                 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
861                 skb_reset_network_header(frag);
862                 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
863                 frag->transport_header = (frag->network_header + hlen +
864                                           sizeof(struct frag_hdr));
865
866                 /*
867                  *      Charge the memory for the fragment to any owner
868                  *      it might possess
869                  */
870                 if (skb->sk)
871                         skb_set_owner_w(frag, skb->sk);
872
873                 /*
874                  *      Copy the packet header into the new buffer.
875                  */
876                 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
877
878                 fragnexthdr_offset = skb_network_header(frag);
879                 fragnexthdr_offset += prevhdr - skb_network_header(skb);
880                 *fragnexthdr_offset = NEXTHDR_FRAGMENT;
881
882                 /*
883                  *      Build fragment header.
884                  */
885                 fh->nexthdr = nexthdr;
886                 fh->reserved = 0;
887                 fh->identification = frag_id;
888
889                 /*
890                  *      Copy a block of the IP datagram.
891                  */
892                 BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
893                                      len));
894                 left -= len;
895
896                 fh->frag_off = htons(offset);
897                 if (left > 0)
898                         fh->frag_off |= htons(IP6_MF);
899                 ipv6_hdr(frag)->payload_len = htons(frag->len -
900                                                     sizeof(struct ipv6hdr));
901
902                 ptr += len;
903                 offset += len;
904
905                 /*
906                  *      Put this fragment into the sending queue.
907                  */
908                 err = output(net, sk, frag);
909                 if (err)
910                         goto fail;
911
912                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
913                               IPSTATS_MIB_FRAGCREATES);
914         }
915         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
916                       IPSTATS_MIB_FRAGOKS);
917         consume_skb(skb);
918         return err;
919
920 fail_toobig:
921         if (skb->sk && dst_allfrag(skb_dst(skb)))
922                 sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
923
924         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
925         err = -EMSGSIZE;
926
927 fail:
928         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
929                       IPSTATS_MIB_FRAGFAILS);
930         kfree_skb(skb);
931         return err;
932 }
933
934 static inline int ip6_rt_check(const struct rt6key *rt_key,
935                                const struct in6_addr *fl_addr,
936                                const struct in6_addr *addr_cache)
937 {
938         return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
939                 (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
940 }
941
/* Validate a socket's cached dst entry against the flow @fl6.
 *
 * Returns @dst if it is still usable for this flow, or NULL after
 * dropping the reference when the cache must be discarded (non-IPv6
 * dst, destination/source key mismatch, or output interface mismatch)
 * so the caller performs a fresh route lookup.
 */
static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  const struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt;

	if (!dst)
		goto out;

	/* A cached dst of another family (e.g. from a v4-mapped path)
	 * can never satisfy an IPv6 flow.
	 */
	if (dst->ops->family != AF_INET6) {
		dst_release(dst);
		return NULL;
	}

	rt = (struct rt6_info *)dst;
	/* Yes, checking route validity in not connected
	 * case is not very simple. Take into account,
	 * that we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If route was host route,
	 *    check that cached destination is current.
	 *    If it is network route, we still may
	 *    check its validity using saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save whole address now,
	 *    (because main consumer of this service
	 *    is tcp, which has not this problem),
	 *    so that the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
	   (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
	      (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}
988
/* Common tail of the ip6_dst_lookup*() helpers.
 *
 * On entry *dst may already hold a candidate route or be NULL.  On
 * success returns 0 with *dst holding a referenced, usable route and
 * fl6->saddr filled in if it was unspecified; on failure returns a
 * negative errno with *dst released and reset to NULL.
 */
static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
	struct rt6_info *rt;
#endif
	int err;
	int flags = 0;

	/* The correct way to handle this would be to do
	 * ip6_route_get_saddr, and then ip6_route_output; however,
	 * the route-specific preferred source forces the
	 * ip6_route_output call _before_ ip6_route_get_saddr.
	 *
	 * In source specific routing (no src=any default route),
	 * ip6_route_output will fail given src=any saddr, though, so
	 * that's why we try it again later.
	 */
	if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
		struct rt6_info *rt;
		bool had_dst = *dst != NULL;

		if (!had_dst)
			*dst = ip6_route_output(net, sk, fl6);
		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
		err = ip6_route_get_saddr(net, rt, &fl6->daddr,
					  sk ? inet6_sk(sk)->srcprefs : 0,
					  &fl6->saddr);
		if (err)
			goto out_err_release;

		/* If we had an erroneous initial result, pretend it
		 * never existed and let the SA-enabled version take
		 * over.
		 */
		if (!had_dst && (*dst)->error) {
			dst_release(*dst);
			*dst = NULL;
		}

		if (fl6->flowi6_oif)
			flags |= RT6_LOOKUP_F_IFACE;
	}

	/* Retry (or do the first) lookup now that saddr is known. */
	if (!*dst)
		*dst = ip6_route_output_flags(net, sk, fl6, flags);

	err = (*dst)->error;
	if (err)
		goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	rt = (struct rt6_info *) *dst;
	rcu_read_lock_bh();
	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
				      rt6_nexthop(rt, &fl6->daddr));
	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
	rcu_read_unlock_bh();

	if (err) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			err = (*dst)->error;
			if (err)
				goto out_err_release;
		}
	}
#endif
	/* A v4-mapped source address is only consistent with a v4-mapped
	 * (or unspecified) destination; reject the mixed combination.
	 */
	if (ipv6_addr_v4mapped(&fl6->saddr) &&
	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
		err = -EAFNOSUPPORT;
		goto out_err_release;
	}

	return 0;

out_err_release:
	dst_release(*dst);
	*dst = NULL;

	if (err == -ENETUNREACH)
		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	return err;
}
1100
1101 /**
1102  *      ip6_dst_lookup - perform route lookup on flow
1103  *      @sk: socket which provides route info
1104  *      @dst: pointer to dst_entry * for result
1105  *      @fl6: flow to lookup
1106  *
1107  *      This function performs a route lookup on the given flow.
1108  *
1109  *      It returns zero on success, or a standard errno code on error.
1110  */
1111 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1112                    struct flowi6 *fl6)
1113 {
1114         *dst = NULL;
1115         return ip6_dst_lookup_tail(net, sk, dst, fl6);
1116 }
1117 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1118
1119 /**
1120  *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1121  *      @sk: socket which provides route info
1122  *      @fl6: flow to lookup
1123  *      @final_dst: final destination address for ipsec lookup
1124  *
1125  *      This function performs a route lookup on the given flow.
1126  *
1127  *      It returns a valid dst pointer on success, or a pointer encoded
1128  *      error code.
1129  */
1130 struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
1131                                       const struct in6_addr *final_dst)
1132 {
1133         struct dst_entry *dst = NULL;
1134         int err;
1135
1136         err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
1137         if (err)
1138                 return ERR_PTR(err);
1139         if (final_dst)
1140                 fl6->daddr = *final_dst;
1141
1142         return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
1143 }
1144 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1145
1146 /**
1147  *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1148  *      @sk: socket which provides the dst cache and route info
1149  *      @fl6: flow to lookup
1150  *      @final_dst: final destination address for ipsec lookup
1151  *
1152  *      This function performs a route lookup on the given flow with the
1153  *      possibility of using the cached route in the socket if it is valid.
1154  *      It will take the socket dst lock when operating on the dst cache.
1155  *      As a result, this function can only be used in process context.
1156  *
1157  *      It returns a valid dst pointer on success, or a pointer encoded
1158  *      error code.
1159  */
1160 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1161                                          const struct in6_addr *final_dst)
1162 {
1163         struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1164
1165         dst = ip6_sk_dst_check(sk, dst, fl6);
1166         if (!dst)
1167                 dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
1168
1169         return dst;
1170 }
1171 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1172
1173 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1174                                                gfp_t gfp)
1175 {
1176         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1177 }
1178
1179 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1180                                                 gfp_t gfp)
1181 {
1182         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1183 }
1184
1185 static void ip6_append_data_mtu(unsigned int *mtu,
1186                                 int *maxfraglen,
1187                                 unsigned int fragheaderlen,
1188                                 struct sk_buff *skb,
1189                                 struct rt6_info *rt,
1190                                 unsigned int orig_mtu)
1191 {
1192         if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1193                 if (!skb) {
1194                         /* first fragment, reserve header_len */
1195                         *mtu = orig_mtu - rt->dst.header_len;
1196
1197                 } else {
1198                         /*
1199                          * this fragment is not first, the headers
1200                          * space is regarded as data space.
1201                          */
1202                         *mtu = orig_mtu;
1203                 }
1204                 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1205                               + fragheaderlen - sizeof(struct frag_hdr);
1206         }
1207 }
1208
/* Initialize cork state for a corked send: deep-copy the tx options
 * into @v6_cork, grab a reference on the route and record it together
 * with the flow, hop limit, traffic class and fragment size in @cork.
 *
 * Returns 0 on success, -EINVAL if cork options are already set or the
 * effective MTU is below IPV6_MIN_MTU, -ENOBUFS on allocation failure.
 */
static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
			  struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
			  struct rt6_info *rt, struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	unsigned int mtu;
	struct ipv6_txoptions *opt = ipc6->opt;

	/*
	 * setup for corking
	 */
	if (opt) {
		if (WARN_ON(v6_cork->opt))
			return -EINVAL;

		v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
		if (unlikely(!v6_cork->opt))
			return -ENOBUFS;

		v6_cork->opt->tot_len = sizeof(*opt);
		v6_cork->opt->opt_flen = opt->opt_flen;
		v6_cork->opt->opt_nflen = opt->opt_nflen;

		/* NOTE(review): the -ENOBUFS returns below leave any
		 * already-duplicated buffers attached to v6_cork->opt;
		 * presumably the caller's cork release path frees them
		 * — verify against the cork teardown code.
		 */
		v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
						    sk->sk_allocation);
		if (opt->dst0opt && !v6_cork->opt->dst0opt)
			return -ENOBUFS;

		v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
						    sk->sk_allocation);
		if (opt->dst1opt && !v6_cork->opt->dst1opt)
			return -ENOBUFS;

		v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
						   sk->sk_allocation);
		if (opt->hopopt && !v6_cork->opt->hopopt)
			return -ENOBUFS;

		v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
						    sk->sk_allocation);
		if (opt->srcrt && !v6_cork->opt->srcrt)
			return -ENOBUFS;

		/* need source address above miyazawa*/
	}
	dst_hold(&rt->dst);
	cork->base.dst = &rt->dst;
	cork->fl.u.ip6 = *fl6;
	v6_cork->hop_limit = ipc6->hlimit;
	v6_cork->tclass = ipc6->tclass;
	/* Pick the path MTU unless the socket probes PMTU itself, in
	 * which case the device MTU is used directly.
	 */
	if (rt->dst.flags & DST_XFRM_TUNNEL)
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
	else
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(rt->dst.path);
	/* A smaller user-configured fragment size overrides the MTU. */
	if (np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	if (mtu < IPV6_MIN_MTU)
		return -EINVAL;
	cork->base.fragsize = mtu;
	if (dst_allfrag(rt->dst.path))
		cork->base.flags |= IPCORK_ALLFRAG;
	cork->base.length = 0;

	return 0;
}
1278
/* Append @length bytes (fetched via @getfrag from @from) to the pending
 * send queue @queue, growing the tail skb and/or allocating new skbs
 * sized against the corked MTU.  @transhdrlen is non-zero only on the
 * first call for a message (space for the transport header in the
 * first skb).  Returns 0 on success or a negative errno; on error the
 * data already queued is not freed here.
 */
static int __ip6_append_data(struct sock *sk,
			     struct flowi6 *fl6,
			     struct sk_buff_head *queue,
			     struct inet_cork *cork,
			     struct inet6_cork *v6_cork,
			     struct page_frag *pfrag,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     unsigned int flags, struct ipcm6_cookie *ipc6,
			     const struct sockcm_cookie *sockc)
{
	struct sk_buff *skb, *skb_prev = NULL;
	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
	int exthdrlen = 0;
	int dst_exthdrlen = 0;
	int hh_len;
	int copy;
	int err;
	int offset = 0;
	__u8 tx_flags = 0;
	u32 tskey = 0;
	struct rt6_info *rt = (struct rt6_info *)cork->dst;
	struct ipv6_txoptions *opt = v6_cork->opt;
	int csummode = CHECKSUM_NONE;
	unsigned int maxnonfragsize, headersize;

	/* Only the very first skb of the message carries the mutable
	 * extension headers and dst header space.
	 */
	skb = skb_peek_tail(queue);
	if (!skb) {
		exthdrlen = opt ? opt->opt_flen : 0;
		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
	}

	mtu = cork->fragsize;
	orig_mtu = mtu;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	/* Per-fragment header overhead: IPv6 header, non-fragmentable
	 * extension headers from the route and the tx options.
	 */
	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
		     sizeof(struct frag_hdr);

	headersize = sizeof(struct ipv6hdr) +
		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
		     (dst_allfrag(&rt->dst) ?
		      sizeof(struct frag_hdr) : 0) +
		     rt->rt6i_nfheader_len;

	/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
	 * the first fragment
	 */
	if (headersize + transhdrlen > mtu)
		goto emsgsize;

	if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
	    (sk->sk_protocol == IPPROTO_UDP ||
	     sk->sk_protocol == IPPROTO_RAW)) {
		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
				sizeof(struct ipv6hdr));
		goto emsgsize;
	}

	if (ip6_sk_ignore_df(sk))
		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
	else
		maxnonfragsize = mtu;

	if (cork->length + length > maxnonfragsize - headersize) {
emsgsize:
		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
		return -EMSGSIZE;
	}

	/* CHECKSUM_PARTIAL only with no extension headers and when
	 * we are not going to fragment
	 */
	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
	    headersize == sizeof(struct ipv6hdr) &&
	    length <= mtu - headersize &&
	    !(flags & MSG_MORE) &&
	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
		csummode = CHECKSUM_PARTIAL;

	/* NOTE(review): sk->sk_tskey++ is a plain (non-atomic)
	 * increment; concurrent senders on the same socket could race —
	 * verify whether callers serialize this path.
	 */
	if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
		sock_tx_timestamp(sk, sockc->tsflags, &tx_flags);
		if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
		    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
			tskey = sk->sk_tskey++;
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octects, in total).
	 *
	 * Note that we may need to "move" the data from the tail of
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	cork->length += length;
	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
alloc_new_skb:
			/* There's no room in the current skb */
			if (skb)
				fraggap = skb->len - maxfraglen;
			else
				fraggap = 0;
			/* update mtu and maxfraglen if necessary */
			if (!skb || !skb_prev)
				ip6_append_data_mtu(&mtu, &maxfraglen,
						    fragheaderlen, skb, rt,
						    orig_mtu);

			skb_prev = skb;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;

			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			alloclen += dst_exthdrlen;

			if (datalen != length + fraggap) {
				/*
				 * this is not the last fragment, the trailer
				 * space is regarded as data space.
				 */
				datalen += rt->dst.trailer_len;
			}

			alloclen += rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			copy = datalen - transhdrlen - fraggap;
			if (copy < 0) {
				err = -EINVAL;
				goto error;
			}
			/* Only the first skb (transhdrlen != 0) may block;
			 * later allocations are bounded by the send buffer.
			 */
			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (refcount_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(!skb))
					err = -ENOBUFS;
			}
			if (!skb)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->protocol = htons(ETH_P_IPV6);
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation and ipsec header */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
				    dst_exthdrlen);

			/* Only the initial fragment is time stamped */
			skb_shinfo(skb)->tx_flags = tx_flags;
			tx_flags = 0;
			skb_shinfo(skb)->tskey = tskey;
			tskey = 0;

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			/* Move the 8-byte-alignment overhang from the
			 * previous skb into this one, fixing checksums.
			 */
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			if (copy > 0 &&
			    getfrag(from, data + transhdrlen, offset,
				    copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;

			if ((flags & MSG_CONFIRM) && !skb_prev)
				skb_set_dst_pending_confirm(skb, 1);

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		/* Without scatter-gather, copy into the skb's linear
		 * tailroom; otherwise append via page fragments below.
		 */
		if (!(rt->dst.dev->features&NETIF_F_SG) &&
		    skb_tailroom(skb) >= copy) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
						offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;

			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
				goto error;

			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
			}
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (getfrag(from,
				    page_address(pfrag->page) + pfrag->offset,
				    offset, copy, skb->len, skb) < 0)
				goto error_efault;

			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			refcount_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}

	return 0;

error_efault:
	err = -EFAULT;
error:
	/* Undo the optimistic accounting done before the loop. */
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	return err;
}
1587
1588 int ip6_append_data(struct sock *sk,
1589                     int getfrag(void *from, char *to, int offset, int len,
1590                                 int odd, struct sk_buff *skb),
1591                     void *from, int length, int transhdrlen,
1592                     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1593                     struct rt6_info *rt, unsigned int flags,
1594                     const struct sockcm_cookie *sockc)
1595 {
1596         struct inet_sock *inet = inet_sk(sk);
1597         struct ipv6_pinfo *np = inet6_sk(sk);
1598         int exthdrlen;
1599         int err;
1600
1601         if (flags&MSG_PROBE)
1602                 return 0;
1603         if (skb_queue_empty(&sk->sk_write_queue)) {
1604                 /*
1605                  * setup for corking
1606                  */
1607                 err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1608                                      ipc6, rt, fl6);
1609                 if (err)
1610                         return err;
1611
1612                 exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1613                 length += exthdrlen;
1614                 transhdrlen += exthdrlen;
1615         } else {
1616                 fl6 = &inet->cork.fl.u.ip6;
1617                 transhdrlen = 0;
1618         }
1619
1620         return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1621                                  &np->cork, sk_page_frag(sk), getfrag,
1622                                  from, length, transhdrlen, flags, ipc6, sockc);
1623 }
1624 EXPORT_SYMBOL_GPL(ip6_append_data);
1625
1626 static void ip6_cork_release(struct inet_cork_full *cork,
1627                              struct inet6_cork *v6_cork)
1628 {
1629         if (v6_cork->opt) {
1630                 kfree(v6_cork->opt->dst0opt);
1631                 kfree(v6_cork->opt->dst1opt);
1632                 kfree(v6_cork->opt->hopopt);
1633                 kfree(v6_cork->opt->srcrt);
1634                 kfree(v6_cork->opt);
1635                 v6_cork->opt = NULL;
1636         }
1637
1638         if (cork->base.dst) {
1639                 dst_release(cork->base.dst);
1640                 cork->base.dst = NULL;
1641                 cork->base.flags &= ~IPCORK_ALLFRAG;
1642         }
1643         memset(&cork->fl, 0, sizeof(cork->fl));
1644 }
1645
/*
 *	Merge all fragments queued on @queue into one skb (subsequent skbs
 *	become the first one's frag_list), prepend the extension headers
 *	and the IPv6 header described by @cork/@v6_cork, bump the output
 *	MIB counters, and release the cork state.  Returns the finished
 *	(not yet transmitted) skb, or NULL when the queue is empty.
 */
struct sk_buff *__ip6_make_skb(struct sock *sk,
			       struct sk_buff_head *queue,
			       struct inet_cork_full *cork,
			       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = v6_cork->opt;
	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
	struct flowi6 *fl6 = &cork->fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;

	skb = __skb_dequeue(queue);
	if (!skb)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	/* Chain the remaining skbs onto the first one's frag_list, folding
	 * their length/truesize into it; they no longer hold their own
	 * socket reference (destructor and sk cleared).
	 */
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	skb->ignore_df = ip6_sk_ignore_df(sk);

	/* Work on a copy of daddr: &final_dst is passed below so
	 * ipv6_push_nfrag_opts() can redirect it (presumably for the
	 * routing-header case — the header's daddr is taken from it).
	 */
	*final_dst = fl6->daddr;
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);

	/* Prepend and fill the IPv6 header itself. */
	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	ip6_flow_hdr(hdr, v6_cork->tclass,
		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
					ip6_autoflowlabel(net, np), fl6));
	hdr->hop_limit = v6_cork->hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	/* Attach the cached route and account the finished packet. */
	skb_dst_set(skb, dst_clone(&rt->dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
	}

	/* Cork state (options, route, flow) is no longer needed. */
	ip6_cork_release(cork, v6_cork);
out:
	return skb;
}
1719
1720 int ip6_send_skb(struct sk_buff *skb)
1721 {
1722         struct net *net = sock_net(skb->sk);
1723         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1724         int err;
1725
1726         err = ip6_local_out(net, skb->sk, skb);
1727         if (err) {
1728                 if (err > 0)
1729                         err = net_xmit_errno(err);
1730                 if (err)
1731                         IP6_INC_STATS(net, rt->rt6i_idev,
1732                                       IPSTATS_MIB_OUTDISCARDS);
1733         }
1734
1735         return err;
1736 }
1737
/*
 *	Finalize whatever is corked on the socket's write queue into a
 *	single datagram and transmit it.  An empty queue is a successful
 *	no-op.
 */
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb = ip6_finish_skb(sk);

	return skb ? ip6_send_skb(skb) : 0;
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1749
1750 static void __ip6_flush_pending_frames(struct sock *sk,
1751                                        struct sk_buff_head *queue,
1752                                        struct inet_cork_full *cork,
1753                                        struct inet6_cork *v6_cork)
1754 {
1755         struct sk_buff *skb;
1756
1757         while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1758                 if (skb_dst(skb))
1759                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1760                                       IPSTATS_MIB_OUTDISCARDS);
1761                 kfree_skb(skb);
1762         }
1763
1764         ip6_cork_release(cork, v6_cork);
1765 }
1766
1767 void ip6_flush_pending_frames(struct sock *sk)
1768 {
1769         __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1770                                    &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1771 }
1772 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1773
/*
 *	Build a complete datagram in one call, using an on-stack queue and
 *	cork state instead of the socket's sk_write_queue/cork.  Returns
 *	the finished skb, NULL for MSG_PROBE, or an ERR_PTR() on failure
 *	(with all partial state already cleaned up).
 */
struct sk_buff *ip6_make_skb(struct sock *sk,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
			     struct rt6_info *rt, unsigned int flags,
			     const struct sockcm_cookie *sockc)
{
	struct inet_cork_full cork;
	struct inet6_cork v6_cork;
	struct sk_buff_head queue;
	/* Fragmentable extension headers are charged to both length and
	 * transhdrlen below, mirroring ip6_append_data()'s first-call path.
	 */
	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
	int err;

	if (flags & MSG_PROBE)
		return NULL;

	__skb_queue_head_init(&queue);

	/* Pre-clear the fields ip6_cork_release() inspects, so the
	 * error path below is safe even when ip6_setup_cork() fails.
	 */
	cork.base.flags = 0;
	cork.base.addr = 0;
	cork.base.opt = NULL;
	cork.base.dst = NULL;
	v6_cork.opt = NULL;
	err = ip6_setup_cork(sk, &cork, &v6_cork, ipc6, rt, fl6);
	if (err) {
		ip6_cork_release(&cork, &v6_cork);
		return ERR_PTR(err);
	}
	/* Negative dontfrag means "unset": fall back to the socket default. */
	if (ipc6->dontfrag < 0)
		ipc6->dontfrag = inet6_sk(sk)->dontfrag;

	err = __ip6_append_data(sk, fl6, &queue, &cork.base, &v6_cork,
				&current->task_frag, getfrag, from,
				length + exthdrlen, transhdrlen + exthdrlen,
				flags, ipc6, sockc);
	if (err) {
		/* Frees any queued skbs and releases the cork state. */
		__ip6_flush_pending_frames(sk, &queue, &cork, &v6_cork);
		return ERR_PTR(err);
	}

	return __ip6_make_skb(sk, &queue, &cork, &v6_cork);
}