/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetic in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 */
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>
#include <net/l3mdev.h>
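
/*
 * ip6_finish_output2 - final transmit step for a single, already-sized
 * IPv6 packet: resolve (or create) the neighbour entry for the route's
 * next hop and hand the skb to the neighbour output path. Multicast
 * destinations are looped back to local listeners and accounted first.
 */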
static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct neighbour *neigh;
	struct in6_addr *nexthop;
	int ret;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
		    ((mroute6_socket(net, skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			 * is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					net, sk, newskb, NULL, newskb->dev,
					dev_loopback_xmit);

			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(net, idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);

		if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
		    IPV6_ADDR_SCOPE_NODELOCAL &&
		    !(dev->flags & IFF_LOOPBACK)) {
			kfree_skb(skb);
			return 0;
		}
	}

	rcu_read_lock_bh();
	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
	if (unlikely(!neigh))
		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
	if (!IS_ERR(neigh)) {
		ret = dst_neigh_output(dst, neigh, skb);
		rcu_read_unlock_bh();
		return ret;
	}
	rcu_read_unlock_bh();

	IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}
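
/*
 * ip6_finish_output - decide between direct transmit and fragmentation:
 * oversized non-GSO packets, routes that require fragmentation on all
 * packets, and conntrack-defragmented packets whose recorded
 * frag_max_size is exceeded all take the fragmentation path.
 */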
static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
	    dst_allfrag(skb_dst(skb)) ||
	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
		return ip6_fragment(net, sk, skb, ip6_finish_output2);
	else
		return ip6_finish_output2(net, sk, skb);
}
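
/*
 * ip6_output - NF_INET_POST_ROUTING entry point for outgoing packets.
 * Drops everything when IPv6 is administratively disabled on the
 * egress device.
 */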
int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;
	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
		return 0;
	}

	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
			    net, sk, skb, NULL, dev,
			    ip6_finish_output,
			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}
bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
{
	if (!np->autoflowlabel_set)
		return ip6_default_np_autolabel(net);
	else
		return np->autoflowlabel;
}
/*
 * xmit an sk_buff (used by TCP, SCTP and DCCP)
 * Note: socket lock is not held for SYNACK packets, but might be modified
 * by calls to skb_set_owner_w() and ipv6_local_error(),
 * which are using proper atomic operations or spinlocks.
 */
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     struct ipv6_txoptions *opt, int tclass)
{
	struct net *net = sock_net(sk);
	const struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	unsigned int head_room;
	struct ipv6hdr *hdr;
	u8 proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int hlimit = -1;
	u32 mtu;

	head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
	if (opt)
		head_room += opt->opt_nflen + opt->opt_flen;

	if (unlikely(skb_headroom(skb) < head_room)) {
		struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
		if (!skb2) {
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
			kfree_skb(skb);
			return -ENOBUFS;
		}
		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);
		consume_skb(skb);
		skb = skb2;
	}

	if (opt) {
		seg_len += opt->opt_nflen + opt->opt_flen;

		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
						     ip6_autoflowlabel(net, np), fl6));

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	hdr->saddr = fl6->saddr;
	hdr->daddr = *first_hop;

	skb->protocol = htons(ETH_P_IPV6);
	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
				 IPSTATS_MIB_OUT, skb->len);
		/* hooks should never assume socket lock is held.
		 * we promote our socket to non const
		 */
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
			       net, (struct sock *)sk, skb, NULL, dst->dev,
			       dst_output);
	}

	skb->dev = dst->dev;
	/* ipv6_local_error() does not require socket lock,
	 * we promote our socket to non const
	 */
	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);

	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);
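
/*
 * ip6_call_ra_chain - deliver a Router Alert packet to every raw socket
 * registered for this alert value (honouring device binding). Returns
 * nonzero when the packet was consumed by at least one socket.
 */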
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}
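
/*
 * ip6_forward_proxy_check - decide what to do with a packet destined to
 * a proxied (NDP proxy) address: 1 = deliver locally, 0 = forward,
 * -1 = drop (a link-local destination cannot be proxied).
 */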
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	__be16 frag_off;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* For reaction involving unicast neighbor discovery
			 * message destined to the proxied address, pass it to
			 * input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}
static inline int ip6_forward_finish(struct net *net, struct sock *sk,
				     struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	IP6_ADD_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);

	skb_sender_cpu_clear(skb);
	return dst_output(net, sk, skb);
}
static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
{
	unsigned int mtu;
	struct inet6_dev *idev;

	if (dst_metric_locked(dst, RTAX_MTU)) {
		mtu = dst_metric_raw(dst, RTAX_MTU);
		if (mtu)
			return mtu;
	}

	mtu = IPV6_MIN_MTU;
	rcu_read_lock();
	idev = __in6_dev_get(dst->dev);
	if (idev)
		mtu = idev->cnf.mtu6;
	rcu_read_unlock();
	return mtu;
}
static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
{
	if (skb->len <= mtu)
		return false;

	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
		return true;

	if (skb->ignore_df)
		return false;

	if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu)
		return false;

	return true;
}
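
/*
 * ip6_forward - the forwarding path proper: policy checks, Router Alert
 * delivery, hop limit handling, NDP proxy handling, redirect generation,
 * MTU check, and finally the NF_INET_FORWARD hook.
 */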
int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	u32 mtu;

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	if (unlikely(skb->sk))
		goto drop;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
				 IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We DO NOT make any processing on
	 *	RA packets, pushing them to user level AS IS
	 *	without any WARRANTY that application will be able
	 *	to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not end-node, so that if packet contains
	 *	AH/ESP, we cannot make anything.
	 *	Defragmentation also would be mistake, RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
				 IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
					 IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
				 IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb_dst(skb);

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	 * send redirects to source routed frames.
	 * We don't send redirects to frames decapsulated from IPsec.
	 */
	if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct inet_peer *peer;
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if (rt->rt6i_flags & RTF_GATEWAY)
			target = &rt->rt6i_gateway;
		else
			target = &hdr->daddr;

		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);

		/* Limit redirects both by destination (here)
		 * and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(peer, 1*HZ))
			ndisc_send_redirect(skb, target);
		if (peer)
			inet_putpeer(peer);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	mtu = ip6_dst_mtu_forward(dst);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (ip6_pkt_too_big(skb, mtu)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
				 IPSTATS_MIB_INTOOBIGERRORS);
		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
				 IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
				 IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
		       net, NULL, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}
static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

	skb_copy_hash(to, from);

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
	skb_copy_secmark(to, from);
}
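
/*
 * ip6_fragment - fragment an IPv6 packet at the source.
 *
 * Two strategies: a fast path that reuses an existing frag_list (each
 * fragment already has the right geometry, so only headers need to be
 * prepended), and a slow path that allocates a fresh skb per fragment
 * and copies the payload into it.
 */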
int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
		 int (*output)(struct net *, struct sock *, struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
				inet6_sk(skb->sk) : NULL;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len, nexthdr_offset;
	int hroom, troom;
	__be32 frag_id;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;

	err = ip6_find_1stfragopt(skb, &prevhdr);
	if (err < 0)
		goto fail;
	hlen = err;
	nexthdr = *prevhdr;
	nexthdr_offset = prevhdr - skb_network_header(skb);

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb is not generated by a local socket.
	 */
	if (unlikely(!skb->ignore_df && skb->len > mtu))
		goto fail_toobig;

	if (IP6CB(skb)->frag_max_size) {
		if (IP6CB(skb)->frag_max_size > mtu)
			goto fail_toobig;

		/* don't send fragments larger than what we received */
		mtu = IP6CB(skb)->frag_max_size;
		if (mtu < IPV6_MIN_MTU)
			mtu = IPV6_MIN_MTU;
	}

	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
		goto fail_toobig;
	mtu -= hlen + sizeof(struct frag_hdr);

	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
				    &ipv6_hdr(skb)->saddr);

	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    (err = skb_checksum_help(skb)))
		goto fail;

	prevhdr = skb_network_header(skb) + nexthdr_offset;
	hroom = LL_RESERVED_SPACE(rt->dst.dev);
	if (skb_has_frag_list(skb)) {
		int first_len = skb_pagelen(skb);
		struct sk_buff *frag2;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb) ||
		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = 0;
		offset = 0;
		/* BUILD HEADER */

		*prevhdr = NEXTHDR_FRAGMENT;
		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			err = -ENOMEM;
			goto fail;
		}
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);

		__skb_pull(skb, hlen);
		fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
		__skb_push(skb, hlen);
		skb_reset_network_header(skb);
		memcpy(skb_network_header(skb), tmp_hdr, hlen);

		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		fh->identification = frag_id;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		ipv6_hdr(skb)->payload_len = htons(first_len -
						   sizeof(struct ipv6hdr));

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), tmp_hdr,
				       hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				ipv6_hdr(frag)->payload_len =
						htons(frag->len -
						      sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(net, sk, skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			return 0;
		}

		kfree_skb_list(frag);

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		return err;

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	troom = rt->dst.dev->needed_tailroom;

	/*
	 *	Keep copying data until we run out.
	 */
	while (left > 0) {
		u8 *fragnexthdr_offset;

		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		 * then align the next start on an eight byte boundary */
		if (len < left)
			len &= ~7;

		/* Allocate buffer */
		frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
				 hroom + troom, GFP_ATOMIC);
		if (!frag) {
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, hroom);
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		skb_reset_network_header(frag);
		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
		frag->transport_header = (frag->network_header + hlen +
					  sizeof(struct frag_hdr));

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

		fragnexthdr_offset = skb_network_header(frag);
		fragnexthdr_offset += prevhdr - skb_network_header(skb);
		*fragnexthdr_offset = NEXTHDR_FRAGMENT;

		/*
		 *	Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
				     len));
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		ipv6_hdr(frag)->payload_len = htons(frag->len -
						    sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		err = output(net, sk, frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	consume_skb(skb);
	return err;

fail_toobig:
	if (skb->sk && dst_allfrag(skb_dst(skb)))
		sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);

	skb->dev = skb_dst(skb)->dev;
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	err = -EMSGSIZE;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}
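
/*
 * ip6_rt_check - true when the cached route can no longer be trusted for
 * this flow address: the route key is not a matching /128 host entry and
 * the address also disagrees with the socket's cached address.
 */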
static inline int ip6_rt_check(const struct rt6key *rt_key,
			       const struct in6_addr *fl_addr,
			       const struct in6_addr *addr_cache)
{
	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
}
static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  const struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt;

	if (!dst)
		goto out;

	if (dst->ops->family != AF_INET6) {
		dst_release(dst);
		return NULL;
	}

	rt = (struct rt6_info *)dst;
	/* Yes, checking route validity in the unconnected
	 * case is not very simple. Take into account
	 * that we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If route was host route,
	 *    check that cached destination is current.
	 *    If it is network route, we still may
	 *    check its validity using saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save whole address now,
	 *    (because main consumer of this service
	 *    is tcp, which does not have this problem),
	 *    so that the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
	    (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
	     (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}
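
/*
 * ip6_dst_lookup_tail - common tail of the dst lookup helpers: pick a
 * source address when the caller left it unspecified, perform the route
 * lookup, and (with optimistic DAD) fall back to the default router's
 * dst entry while the next hop neighbour is still incomplete.
 */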
static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
	struct rt6_info *rt;
#endif
	int err;
	int flags = 0;

	/* The correct way to handle this would be to do
	 * ip6_route_get_saddr, and then ip6_route_output; however,
	 * the route-specific preferred source forces the
	 * ip6_route_output call _before_ ip6_route_get_saddr.
	 *
	 * In source specific routing (no src=any default route),
	 * ip6_route_output will fail given src=any saddr, though, so
	 * that's why we try it again later.
	 */
	if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
		struct rt6_info *rt;
		bool had_dst = *dst != NULL;

		if (!had_dst)
			*dst = ip6_route_output(net, sk, fl6);
		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
		err = ip6_route_get_saddr(net, rt, &fl6->daddr,
					  sk ? inet6_sk(sk)->srcprefs : 0,
					  &fl6->saddr);
		if (err)
			goto out_err_release;

		/* If we had an erroneous initial result, pretend it
		 * never existed and let the SA-enabled version take
		 * over.
		 */
		if (!had_dst && (*dst)->error) {
			dst_release(*dst);
			*dst = NULL;
		}

		if (fl6->flowi6_oif)
			flags |= RT6_LOOKUP_F_IFACE;
	}

	if (!*dst)
		*dst = ip6_route_output_flags(net, sk, fl6, flags);

	err = (*dst)->error;
	if (err)
		goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	rt = (struct rt6_info *) *dst;
	rcu_read_lock_bh();
	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
				      rt6_nexthop(rt, &fl6->daddr));
	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
	rcu_read_unlock_bh();

	if (err) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			err = (*dst)->error;
			if (err)
				goto out_err_release;
		}
	}
#endif
	if (ipv6_addr_v4mapped(&fl6->saddr) &&
	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
		err = -EAFNOSUPPORT;
		goto out_err_release;
	}

	return 0;

out_err_release:
	dst_release(*dst);
	*dst = NULL;

	if (err == -ENETUNREACH)
		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	return err;
}
/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@net: network namespace
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl6: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
		   struct flowi6 *fl6)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(net, sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);
/**
 *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 *	@net: network namespace
 *	@sk: socket which provides route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
				      const struct in6_addr *final_dst)
{
	struct dst_entry *dst = NULL;
	int err;

	err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		fl6->daddr = *final_dst;
	if (!fl6->flowi6_oif)
		fl6->flowi6_oif = l3mdev_fib_oif(dst->dev);

	return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
/**
 *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 *	@sk: socket which provides the dst cache and route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *
 *	This function performs a route lookup on the given flow with the
 *	possibility of using the cached route in the socket if it is valid.
 *	It will take the socket dst lock when operating on the dst cache.
 *	As a result, this function can only be used in process context.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
					 const struct in6_addr *final_dst)
{
	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);

	dst = ip6_sk_dst_check(sk, dst, fl6);
	if (!dst)
		dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);

	return dst;
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
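
/*
 * ip6_ufo_append_data - build one large skb for UDP fragmentation
 * offload instead of fragmenting in software; the gso_size set here
 * tells the device to cut 8-byte-aligned fragments.
 */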
static inline int ip6_ufo_append_data(struct sock *sk,
			struct sk_buff_head *queue,
			int getfrag(void *from, char *to, int offset, int len,
			int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int exthdrlen, int transhdrlen, int mtu,
			unsigned int flags, const struct flowi6 *fl6)
{
	struct sk_buff *skb;
	int err;

	/* There is support for UDP large send offload by network
	 * device, so create one single skb packet containing complete
	 * udp datagram
	 */
	skb = skb_peek_tail(queue);
	if (!skb) {
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);
		if (!skb)
			return err;

		/* reserve space for Hardware header */
		skb_reserve(skb, hh_len);

		/* create space for UDP/IP header */
		skb_put(skb, fragheaderlen + transhdrlen);

		/* initialize network header pointer */
		skb_set_network_header(skb, exthdrlen);

		/* initialize protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		skb->protocol = htons(ETH_P_IPV6);
		skb->csum = 0;

		__skb_queue_tail(queue, skb);
	} else if (skb_is_gso(skb)) {
		goto append;
	}

	skb->ip_summed = CHECKSUM_PARTIAL;
	/* Specify the length of each IPv6 datagram fragment.
	 * It has to be a multiple of 8.
	 */
	skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
				     sizeof(struct frag_hdr)) & ~7;
	skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
	skb_shinfo(skb)->ip6_frag_id = ipv6_select_ident(sock_net(sk),
							 &fl6->daddr,
							 &fl6->saddr);

append:
	return skb_append_datato_frags(sk, skb, getfrag, from,
				       (length - transhdrlen));
}
static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
					       gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
						gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}
static void ip6_append_data_mtu(unsigned int *mtu,
				int *maxfraglen,
				unsigned int fragheaderlen,
				struct sk_buff *skb,
				struct rt6_info *rt,
				unsigned int orig_mtu)
{
	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
		if (!skb) {
			/* first fragment, reserve header_len */
			*mtu = orig_mtu - rt->dst.header_len;

		} else {
			/*
			 * this fragment is not first, the headers
			 * space is regarded as data space.
			 */
			*mtu = orig_mtu;
		}
		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
			      + fragheaderlen - sizeof(struct frag_hdr);
	}
}
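
/*
 * ip6_setup_cork - initialise cork state for a corked send: deep-copy
 * the tx options (they may change while data is pending), pin the
 * route, and derive the fragment size from path MTU and socket
 * settings.
 */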
static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
			  struct inet6_cork *v6_cork,
			  int hlimit, int tclass, struct ipv6_txoptions *opt,
			  struct rt6_info *rt, struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	unsigned int mtu;

	/*
	 * setup for corking
	 */
	if (opt) {
		if (WARN_ON(v6_cork->opt))
			return -EINVAL;

		v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
		if (unlikely(!v6_cork->opt))
			return -ENOBUFS;

		v6_cork->opt->tot_len = sizeof(*opt);
		v6_cork->opt->opt_flen = opt->opt_flen;
		v6_cork->opt->opt_nflen = opt->opt_nflen;

		v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
						    sk->sk_allocation);
		if (opt->dst0opt && !v6_cork->opt->dst0opt)
			return -ENOBUFS;

		v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
						    sk->sk_allocation);
		if (opt->dst1opt && !v6_cork->opt->dst1opt)
			return -ENOBUFS;

		v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
						   sk->sk_allocation);
		if (opt->hopopt && !v6_cork->opt->hopopt)
			return -ENOBUFS;

		v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
						    sk->sk_allocation);
		if (opt->srcrt && !v6_cork->opt->srcrt)
			return -ENOBUFS;

		/* need source address above (miyazawa) */
	}
	dst_hold(&rt->dst);
	cork->base.dst = &rt->dst;
	cork->fl.u.ip6 = *fl6;
	v6_cork->hop_limit = hlimit;
	v6_cork->tclass = tclass;
	if (rt->dst.flags & DST_XFRM_TUNNEL)
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
	else
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(rt->dst.path);
	if (np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	if (mtu < IPV6_MIN_MTU)
		return -EINVAL;
	cork->base.fragsize = mtu;
	if (dst_allfrag(rt->dst.path))
		cork->base.flags |= IPCORK_ALLFRAG;
	cork->base.length = 0;

	return 0;
}
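
/*
 * __ip6_append_data - append user data to the pending queue, growing the
 * tail skb, adding page fragments, or starting a new fragment-sized skb
 * as needed; the actual headers are only built in __ip6_make_skb().
 */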
static int __ip6_append_data(struct sock *sk,
			     struct flowi6 *fl6,
			     struct sk_buff_head *queue,
			     struct inet_cork *cork,
			     struct inet6_cork *v6_cork,
			     struct page_frag *pfrag,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     unsigned int flags, int dontfrag)
{
	struct sk_buff *skb, *skb_prev = NULL;
	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
	int exthdrlen = 0;
	int dst_exthdrlen = 0;
	int hh_len;
	int copy;
	int err;
	int offset = 0;
	__u8 tx_flags = 0;
	u32 tskey = 0;
	struct rt6_info *rt = (struct rt6_info *)cork->dst;
	struct ipv6_txoptions *opt = v6_cork->opt;
	int csummode = CHECKSUM_NONE;
	unsigned int maxnonfragsize, headersize;

	skb = skb_peek_tail(queue);
	if (!skb) {
		exthdrlen = opt ? opt->opt_flen : 0;
		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
	}

	mtu = cork->fragsize;
	orig_mtu = mtu;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
		     sizeof(struct frag_hdr);

	headersize = sizeof(struct ipv6hdr) +
		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
		     (dst_allfrag(&rt->dst) ?
		      sizeof(struct frag_hdr) : 0) +
		     rt->rt6i_nfheader_len;

	/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
	 * the first fragment
	 */
	if (headersize + transhdrlen > mtu)
		goto emsgsize;

	if (cork->length + length > mtu - headersize && dontfrag &&
	    (sk->sk_protocol == IPPROTO_UDP ||
	     sk->sk_protocol == IPPROTO_RAW)) {
		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
				  sizeof(struct ipv6hdr));
		goto emsgsize;
	}

	if (ip6_sk_ignore_df(sk))
		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
	else
		maxnonfragsize = mtu;

	if (cork->length + length > maxnonfragsize - headersize) {
emsgsize:
		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
		return -EMSGSIZE;
	}

	/* CHECKSUM_PARTIAL only with no extension headers and when
	 * we are not going to fragment
	 */
	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
	    headersize == sizeof(struct ipv6hdr) &&
	    length < mtu - headersize &&
	    !(flags & MSG_MORE) &&
	    rt->dst.dev->features & NETIF_F_V6_CSUM)
		csummode = CHECKSUM_PARTIAL;

	if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
		sock_tx_timestamp(sk, &tx_flags);
		if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
		    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
			tskey = sk->sk_tskey++;
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	cork->length += length;
	if ((skb && skb_is_gso(skb)) ||
	    (((length + (skb ? skb->len : headersize)) > mtu) &&
	    (skb_queue_len(queue) <= 1) &&
	    (sk->sk_protocol == IPPROTO_UDP) &&
	    (rt->dst.dev->features & NETIF_F_UFO) &&
	    (sk->sk_type == SOCK_DGRAM) && !udp_get_no_check6_tx(sk))) {
		err = ip6_ufo_append_data(sk, queue, getfrag, from, length,
					  hh_len, fragheaderlen, exthdrlen,
					  transhdrlen, mtu, flags, fl6);
		if (err)
			goto error;
		return 0;
	}

	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
alloc_new_skb:
			/* There's no room in the current skb */
			if (skb)
				fraggap = skb->len - maxfraglen;
			else
				fraggap = 0;
			/* update mtu and maxfraglen if necessary */
			if (!skb || !skb_prev)
				ip6_append_data_mtu(&mtu, &maxfraglen,
						    fragheaderlen, skb, rt,
						    orig_mtu);

			skb_prev = skb;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;

			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			alloclen += dst_exthdrlen;

			if (datalen != length + fraggap) {
				/*
				 * this is not the last fragment, the trailer
				 * space is regarded as data space.
				 */
				datalen += rt->dst.trailer_len;
			}

			alloclen += rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			copy = datalen - transhdrlen - fraggap;
			if (copy < 0) {
				err = -EINVAL;
				goto error;
			}
			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(!skb))
					err = -ENOBUFS;
			}
			if (!skb)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->protocol = htons(ETH_P_IPV6);
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation and ipsec header */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
				    dst_exthdrlen);

			/* Only the initial fragment is time stamped */
			skb_shinfo(skb)->tx_flags = tx_flags;
			tx_flags = 0;
			skb_shinfo(skb)->tskey = tskey;
			tskey = 0;

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			if (copy > 0 &&
			    getfrag(from, data + transhdrlen, offset,
				    copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features&NETIF_F_SG) &&
		    skb_tailroom(skb) >= copy) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
						offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;

			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
				goto error;

			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
			}
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (getfrag(from,
				    page_address(pfrag->page) + pfrag->offset,
				    offset, copy, skb->len, skb) < 0)
				goto error_efault;

			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}

	return 0;

error_efault:
	err = -EFAULT;
error:
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	return err;
}
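
/*
 * ip6_append_data - queue data for transmission on a corked socket.
 *
 * A minimal sketch of the expected calling pattern (illustrative only;
 * the getfrag callback and the prepared flow/route belong to the
 * caller, not this file):
 *
 *	err = ip6_append_data(sk, getfrag, msg, len, 0, hlimit, tclass,
 *			      opt, &fl6, rt, MSG_MORE, dontfrag);
 *	if (!err)
 *		err = ip6_push_pending_frames(sk);
 */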
int ip6_append_data(struct sock *sk,
		    int getfrag(void *from, char *to, int offset, int len,
				int odd, struct sk_buff *skb),
		    void *from, int length, int transhdrlen, int hlimit,
		    int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
		    struct rt6_info *rt, unsigned int flags, int dontfrag)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	int exthdrlen;
	int err;

	if (flags&MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		err = ip6_setup_cork(sk, &inet->cork, &np->cork, hlimit,
				     tclass, opt, rt, fl6);
		if (err)
			return err;

		exthdrlen = (opt ? opt->opt_flen : 0);
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		fl6 = &inet->cork.fl.u.ip6;
		transhdrlen = 0;
	}

	return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
				 &np->cork, sk_page_frag(sk), getfrag,
				 from, length, transhdrlen, flags, dontfrag);
}
EXPORT_SYMBOL_GPL(ip6_append_data);
static void ip6_cork_release(struct inet_cork_full *cork,
			     struct inet6_cork *v6_cork)
{
	if (v6_cork->opt) {
		kfree(v6_cork->opt->dst0opt);
		kfree(v6_cork->opt->dst1opt);
		kfree(v6_cork->opt->hopopt);
		kfree(v6_cork->opt->srcrt);
		kfree(v6_cork->opt);
		v6_cork->opt = NULL;
	}

	if (cork->base.dst) {
		dst_release(cork->base.dst);
		cork->base.dst = NULL;
		cork->base.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&cork->fl, 0, sizeof(cork->fl));
}
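
/*
 * __ip6_make_skb - collapse the pending queue into one skb (any extra
 * skbs become its frag_list), push the extension headers, and fill in
 * the IPv6 header from the corked flow state.
 */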
struct sk_buff *__ip6_make_skb(struct sock *sk,
			       struct sk_buff_head *queue,
			       struct inet_cork_full *cork,
			       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = v6_cork->opt;
	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
	struct flowi6 *fl6 = &cork->fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;

	skb = __skb_dequeue(queue);
	if (!skb)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	skb->ignore_df = ip6_sk_ignore_df(sk);

	*final_dst = fl6->daddr;
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	ip6_flow_hdr(hdr, v6_cork->tclass,
		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
					ip6_autoflowlabel(net, np), fl6));
	hdr->hop_limit = v6_cork->hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb_dst_set(skb, dst_clone(&rt->dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
	}

	ip6_cork_release(cork, v6_cork);
out:
	return skb;
}
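
/*
 * ip6_send_skb - hand a finished skb to ip6_local_out() and account
 * drops against the route's idev.
 */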
int ip6_send_skb(struct sk_buff *skb)
{
	struct net *net = sock_net(skb->sk);
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	int err;

	err = ip6_local_out(net, skb->sk, skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			IP6_INC_STATS(net, rt->rt6i_idev,
				      IPSTATS_MIB_OUTDISCARDS);
	}

	return err;
}

int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	skb = ip6_finish_skb(sk);
	if (!skb)
		return 0;

	return ip6_send_skb(skb);
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
static void __ip6_flush_pending_frames(struct sock *sk,
				       struct sk_buff_head *queue,
				       struct inet_cork_full *cork,
				       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
		if (skb_dst(skb))
			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(cork, v6_cork);
}

void ip6_flush_pending_frames(struct sock *sk)
{
	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
}
EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
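
/*
 * ip6_make_skb - uncorked (single-shot) variant: set up a private cork,
 * append all data in one call, and return the finished skb without
 * touching sk->sk_write_queue.
 */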
struct sk_buff *ip6_make_skb(struct sock *sk,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     int hlimit, int tclass,
			     struct ipv6_txoptions *opt, struct flowi6 *fl6,
			     struct rt6_info *rt, unsigned int flags,
			     int dontfrag)
{
	struct inet_cork_full cork;
	struct inet6_cork v6_cork;
	struct sk_buff_head queue;
	int exthdrlen = (opt ? opt->opt_flen : 0);
	int err;

	if (flags & MSG_PROBE)
		return NULL;

	__skb_queue_head_init(&queue);

	cork.base.flags = 0;
	cork.base.addr = 0;
	cork.base.opt = NULL;
	cork.base.dst = NULL;
	v6_cork.opt = NULL;
	err = ip6_setup_cork(sk, &cork, &v6_cork, hlimit, tclass, opt, rt, fl6);
	if (err) {
		ip6_cork_release(&cork, &v6_cork);
		return ERR_PTR(err);
	}
	if (dontfrag < 0)
		dontfrag = inet6_sk(sk)->dontfrag;

	err = __ip6_append_data(sk, fl6, &queue, &cork.base, &v6_cork,
				&current->task_frag, getfrag, from,
				length + exthdrlen, transhdrlen + exthdrlen,
				flags, dontfrag);
	if (err) {
		__ip6_flush_pending_frames(sk, &queue, &cork, &v6_cork);
		return ERR_PTR(err);
	}

	return __ip6_make_skb(sk, &queue, &cork, &v6_cork);
}