GNU Linux-libre 4.14.330-gnu1
[releases.git] / net / ipv6 / ip6_output.c
1 /*
2  *      IPv6 output functions
3  *      Linux INET6 implementation
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      Based on linux/net/ipv4/ip_output.c
9  *
10  *      This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  *
15  *      Changes:
16  *      A.N.Kuznetsov   :       arithmetic in fragmentation.
17  *                              extension headers are implemented.
18  *                              route changes now work.
19  *                              ip6_forward does not confuse sniffers.
20  *                              etc.
21  *
22  *      H. von Brand    :       Added missing #include <linux/string.h>
23  *      Imran Patel     :       frag id should be in NBO
24  *      Kazunori MIYAZAWA @USAGI
25  *                      :       add ip6_append_data and related functions
26  *                              for datagram xmit
27  */
28
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
41
42 #include <linux/bpf-cgroup.h>
43 #include <linux/netfilter.h>
44 #include <linux/netfilter_ipv6.h>
45
46 #include <net/sock.h>
47 #include <net/snmp.h>
48
49 #include <net/ipv6.h>
50 #include <net/ndisc.h>
51 #include <net/protocol.h>
52 #include <net/ip6_route.h>
53 #include <net/addrconf.h>
54 #include <net/rawv6.h>
55 #include <net/icmp.h>
56 #include <net/xfrm.h>
57 #include <net/checksum.h>
58 #include <linux/mroute6.h>
59 #include <net/l3mdev.h>
60 #include <net/lwtunnel.h>
61
/* Final transmit step for an IPv6 packet that already has a dst:
 * handle multicast loopback and scope filtering, give lightweight
 * tunnels a chance to take over output, then resolve the neighbour
 * entry for the route's nexthop and hand the skb to neigh_output().
 * Consumes the skb on every path; returns 0 or a negative errno.
 */
static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct neighbour *neigh;
	struct in6_addr *nexthop;
	int ret;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		/* Loop a copy of the packet back to local listeners when
		 * the socket asked for multicast loopback and either a
		 * multicast-router socket is active (and the skb was not
		 * already forwarded) or this host is a member of the
		 * destination group on @dev.
		 */
		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
		    ((mroute6_socket(net, skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					net, sk, newskb, NULL, newskb->dev,
					dev_loopback_xmit);

			/* hop_limit 0: only the loopback copy above is
			 * wanted; never put such a packet on the wire.
			 */
			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(net, idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);

		/* Node-local scope multicast must never leave the host. */
		if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
		    IPV6_ADDR_SCOPE_NODELOCAL &&
		    !(dev->flags & IFF_LOOPBACK)) {
			kfree_skb(skb);
			return 0;
		}
	}

	/* A lightweight tunnel may take over transmission entirely. */
	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
		int res = lwtunnel_xmit(skb);

		if (res != LWTUNNEL_XMIT_CONTINUE)
			return res;
	}

	rcu_read_lock_bh();
	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
	if (unlikely(!neigh))
		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
	if (!IS_ERR(neigh)) {
		sock_confirm_neigh(skb, neigh);
		ret = neigh_output(neigh, skb);
		rcu_read_unlock_bh();
		return ret;
	}
	rcu_read_unlock_bh();

	/* No usable neighbour entry: account the drop and free the skb. */
	IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}
130
131 static int
132 ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
133                                     struct sk_buff *skb, unsigned int mtu)
134 {
135         struct sk_buff *segs, *nskb;
136         netdev_features_t features;
137         int ret = 0;
138
139         /* Please see corresponding comment in ip_finish_output_gso
140          * describing the cases where GSO segment length exceeds the
141          * egress MTU.
142          */
143         features = netif_skb_features(skb);
144         segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
145         if (IS_ERR_OR_NULL(segs)) {
146                 kfree_skb(skb);
147                 return -ENOMEM;
148         }
149
150         consume_skb(skb);
151
152         skb_list_walk_safe(segs, segs, nskb) {
153                 int err;
154
155                 skb_mark_not_on_list(segs);
156                 /* Last GSO segment can be smaller than gso_size (and MTU).
157                  * Adding a fragment header would produce an "atomic fragment",
158                  * which is considered harmful (RFC-8021). Avoid that.
159                  */
160                 err = segs->len > mtu ?
161                         ip6_fragment(net, sk, segs, ip6_finish_output2) :
162                         ip6_finish_output2(net, sk, segs);
163                 if (err && ret == 0)
164                         ret = err;
165         }
166
167         return ret;
168 }
169
/* NF_INET_POST_ROUTING continuation: run the cgroup-BPF egress filter,
 * loop back through dst_output() if SNAT attached a fresh xfrm policy,
 * then either fragment the packet or transmit it directly via
 * ip6_finish_output2().
 */
static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	unsigned int mtu;
	int ret;

	/* A cgroup eBPF egress program may veto the packet. */
	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
	if (ret) {
		kfree_skb(skb);
		return ret;
	}

#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
	if (skb_dst(skb)->xfrm) {
		IP6CB(skb)->flags |= IP6SKB_REROUTED;
		return dst_output(net, sk, skb);
	}
#endif

	mtu = ip6_skb_dst_mtu(skb);
	/* GSO packets whose segments would exceed the MTU are segmented
	 * in software and handled one segment at a time.
	 */
	if (skb_is_gso(skb) && !skb_gso_validate_mtu(skb, mtu))
		return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);

	/* Fragment oversized non-GSO packets, routes that always demand
	 * fragmentation, or packets reassembled from smaller fragments
	 * (frag_max_size) that are now larger than the original pieces.
	 */
	if ((skb->len > mtu && !skb_is_gso(skb)) ||
	    dst_allfrag(skb_dst(skb)) ||
	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
		return ip6_fragment(net, sk, skb, ip6_finish_output2);
	else
		return ip6_finish_output2(net, sk, skb);
}
200
/* dst_output() entry point for IPv6: tag the skb with protocol and
 * egress device, discard everything if IPv6 is administratively
 * disabled on that device, then traverse the NF_INET_POST_ROUTING
 * netfilter hook (skipped when the packet was already re-routed there,
 * IP6SKB_REROUTED) on its way to ip6_finish_output().
 */
int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;
	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
		return 0;
	}

	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
			    net, sk, skb, NULL, dev,
			    ip6_finish_output,
			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}
220
221 bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
222 {
223         if (!np->autoflowlabel_set)
224                 return ip6_default_np_autolabel(net);
225         else
226                 return np->autoflowlabel;
227 }
228
/*
 * xmit an sk_buff (used by TCP, SCTP and DCCP)
 * Note : socket lock is not held for SYNACK packets, but might be modified
 * by calls to skb_set_owner_w() and ipv6_local_error(),
 * which are using proper atomic operations or spinlocks.
 *
 * Builds the IPv6 header (and any extension headers from @opt) in front
 * of the payload and sends the packet through the NF_INET_LOCAL_OUT
 * hook towards dst_output().  Packets larger than the path MTU that
 * cannot go out as-is (not GSO, ignore_df unset) are dropped with a
 * local EMSGSIZE error.  Consumes the skb; returns 0 or -errno.
 */
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     __u32 mark, struct ipv6_txoptions *opt, int tclass)
{
	struct net *net = sock_net(sk);
	const struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	unsigned int head_room;
	struct ipv6hdr *hdr;
	u8  proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int hlimit = -1;
	u32 mtu;

	/* Room needed in front of the payload: IPv6 header, extension
	 * headers from @opt, and the device's link-layer header.
	 */
	head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
	if (opt)
		head_room += opt->opt_nflen + opt->opt_flen;

	if (unlikely(skb_headroom(skb) < head_room)) {
		struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
		if (!skb2) {
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
			kfree_skb(skb);
			return -ENOBUFS;
		}
		/* Keep socket write-memory accounting on the new skb. */
		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);
		consume_skb(skb);
		skb = skb2;
	}

	if (opt) {
		seg_len += opt->opt_nflen + opt->opt_flen;

		/* Fragmentable part of the extension headers is pushed
		 * first, then the non-fragmentable part (which may also
		 * rewrite first_hop, e.g. for a routing header).
		 */
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);

		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
					     &fl6->saddr);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
				ip6_autoflowlabel(net, np), fl6));

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	hdr->saddr = fl6->saddr;
	hdr->daddr = *first_hop;

	skb->protocol = htons(ETH_P_IPV6);
	skb->priority = sk->sk_priority;
	skb->mark = mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_OUT, skb->len);

		/* if egress device is enslaved to an L3 master device pass the
		 * skb to its handler for processing
		 */
		skb = l3mdev_ip6_out((struct sock *)sk, skb);
		if (unlikely(!skb))
			return 0;

		/* hooks should never assume socket lock is held.
		 * we promote our socket to non const
		 */
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
			       net, (struct sock *)sk, skb, NULL, dst->dev,
			       dst_output);
	}

	/* Too big and not allowed to fragment: report EMSGSIZE locally. */
	skb->dev = dst->dev;
	/* ipv6_local_error() does not require socket lock,
	 * we promote our socket to non const
	 */
	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);

	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);
335
/* Deliver a Router Alert packet to every raw socket registered in
 * ip6_ra_chain for alert value @sel.  Each matching socket except the
 * last receives a clone; the last one consumes the original skb.
 * Returns 1 when the skb was taken over (caller must not touch it),
 * 0 when no socket matched and normal forwarding should continue.
 */
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		/* Match on the alert value and, for device-bound sockets,
		 * on the ingress interface.
		 */
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}
364
/* Decide how a to-be-forwarded packet whose destination is a proxied
 * neighbour entry is handled.  Returns 1 when the packet is an NDISC
 * message that must be passed to local input, 0 when it may be
 * forwarded normally, and -1 when it must be dropped (link-local
 * destinations cannot be proxied).
 */
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	__be16 frag_off;
	int offset;

	/* Locate the transport header behind any extension headers. */
	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		/* Need at least the ICMPv6 type byte in linear data. */
		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* For reaction involving unicast neighbor discovery
			 * message destined to the proxied address, pass it to
			 * input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}
416
/* NF_INET_FORWARD continuation: account the forwarded datagram and its
 * octets, then pass the packet on to the output path via dst_output().
 */
static inline int ip6_forward_finish(struct net *net, struct sock *sk,
				     struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	__IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);

	return dst_output(net, sk, skb);
}
427
428 static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
429 {
430         unsigned int mtu;
431         struct inet6_dev *idev;
432
433         if (dst_metric_locked(dst, RTAX_MTU)) {
434                 mtu = dst_metric_raw(dst, RTAX_MTU);
435                 if (mtu)
436                         return mtu;
437         }
438
439         mtu = IPV6_MIN_MTU;
440         rcu_read_lock();
441         idev = __in6_dev_get(dst->dev);
442         if (idev)
443                 mtu = idev->cnf.mtu6;
444         rcu_read_unlock();
445
446         return mtu;
447 }
448
449 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
450 {
451         if (skb->len <= mtu)
452                 return false;
453
454         /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
455         if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
456                 return true;
457
458         if (skb->ignore_df)
459                 return false;
460
461         if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu))
462                 return false;
463
464         return true;
465 }
466
/* Forward a received IPv6 packet towards its next hop.
 *
 * Verifies that forwarding is enabled and permitted by XFRM policy,
 * hands Router Alert packets to registered raw sockets, enforces the
 * hop limit (ICMPv6 Time Exceeded) and path MTU (ICMPv6 Packet Too
 * Big), handles NDISC proxying, emits redirects when a packet exits
 * via its ingress interface, then decrements hop_limit and runs the
 * NF_INET_FORWARD hook with ip6_forward_finish() as continuation.
 * Consumes the skb on every path except when ip6_input() takes over
 * for proxied NDISC traffic.
 */
int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	u32 mtu;

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	/* Only packets addressed to this host at L2 are forwarded. */
	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	/* A packet to be forwarded should not be owned by a local socket. */
	if (unlikely(skb->sk))
		goto drop;

	/* LRO-merged frames are rejected here (skb_warn_if_lro also
	 * warns about them).
	 */
	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We DO NOT make any processing on
	 *	RA packets, pushing them to user level AS IS
	 *	without any WARRANTY that application will be able
	 *	to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not end-node, so that if packet contains
	 *	AH/ESP, we cannot make anything.
	 *	Defragmentation also would be mistake, RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			__IP6_INC_STATS(net, ip6_dst_idev(dst),
					IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	/* xfrm6_route_forward() may have replaced the dst; reload it. */
	dst = skb_dst(skb);

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	   We don't send redirects to frames decapsulated from IPsec.
	 */
	if (IP6CB(skb)->iif == dst->dev->ifindex &&
	    opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct inet_peer *peer;
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if (rt->rt6i_flags & RTF_GATEWAY)
			target = &rt->rt6i_gateway;
		else
			target = &hdr->daddr;

		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(peer, 1*HZ))
			ndisc_send_redirect(skb, target);
		if (peer)
			inet_putpeer(peer);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	mtu = ip6_dst_mtu_forward(dst);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (ip6_pkt_too_big(skb, mtu)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_INTOOBIGERRORS);
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	/* Make sure we own the headroom before mangling the header. */
	if (skb_cow(skb, dst->dev->hard_header_len)) {
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
		       net, NULL, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}
629
630 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
631 {
632         to->pkt_type = from->pkt_type;
633         to->priority = from->priority;
634         to->protocol = from->protocol;
635         skb_dst_drop(to);
636         skb_dst_set(to, dst_clone(skb_dst(from)));
637         to->dev = from->dev;
638         to->mark = from->mark;
639
640         skb_copy_hash(to, from);
641
642 #ifdef CONFIG_NET_SCHED
643         to->tc_index = from->tc_index;
644 #endif
645         nf_copy(to, from);
646         skb_copy_secmark(to, from);
647 }
648
649 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
650                  int (*output)(struct net *, struct sock *, struct sk_buff *))
651 {
652         struct sk_buff *frag;
653         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
654         struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
655                                 inet6_sk(skb->sk) : NULL;
656         struct ipv6hdr *tmp_hdr;
657         struct frag_hdr *fh;
658         unsigned int mtu, hlen, left, len, nexthdr_offset;
659         int hroom, troom;
660         __be32 frag_id;
661         int ptr, offset = 0, err = 0;
662         u8 *prevhdr, nexthdr = 0;
663
664         err = ip6_find_1stfragopt(skb, &prevhdr);
665         if (err < 0)
666                 goto fail;
667         hlen = err;
668         nexthdr = *prevhdr;
669         nexthdr_offset = prevhdr - skb_network_header(skb);
670
671         mtu = ip6_skb_dst_mtu(skb);
672
673         /* We must not fragment if the socket is set to force MTU discovery
674          * or if the skb it not generated by a local socket.
675          */
676         if (unlikely(!skb->ignore_df && skb->len > mtu))
677                 goto fail_toobig;
678
679         if (IP6CB(skb)->frag_max_size) {
680                 if (IP6CB(skb)->frag_max_size > mtu)
681                         goto fail_toobig;
682
683                 /* don't send fragments larger than what we received */
684                 mtu = IP6CB(skb)->frag_max_size;
685                 if (mtu < IPV6_MIN_MTU)
686                         mtu = IPV6_MIN_MTU;
687         }
688
689         if (np && np->frag_size < mtu) {
690                 if (np->frag_size)
691                         mtu = np->frag_size;
692         }
693         if (mtu < hlen + sizeof(struct frag_hdr) + 8)
694                 goto fail_toobig;
695         mtu -= hlen + sizeof(struct frag_hdr);
696
697         frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
698                                     &ipv6_hdr(skb)->saddr);
699
700         if (skb->ip_summed == CHECKSUM_PARTIAL &&
701             (err = skb_checksum_help(skb)))
702                 goto fail;
703
704         prevhdr = skb_network_header(skb) + nexthdr_offset;
705         hroom = LL_RESERVED_SPACE(rt->dst.dev);
706         if (skb_has_frag_list(skb)) {
707                 unsigned int first_len = skb_pagelen(skb);
708                 struct sk_buff *frag2;
709
710                 if (first_len - hlen > mtu ||
711                     ((first_len - hlen) & 7) ||
712                     skb_cloned(skb) ||
713                     skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
714                         goto slow_path;
715
716                 skb_walk_frags(skb, frag) {
717                         /* Correct geometry. */
718                         if (frag->len > mtu ||
719                             ((frag->len & 7) && frag->next) ||
720                             skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
721                                 goto slow_path_clean;
722
723                         /* Partially cloned skb? */
724                         if (skb_shared(frag))
725                                 goto slow_path_clean;
726
727                         BUG_ON(frag->sk);
728                         if (skb->sk) {
729                                 frag->sk = skb->sk;
730                                 frag->destructor = sock_wfree;
731                         }
732                         skb->truesize -= frag->truesize;
733                 }
734
735                 err = 0;
736                 offset = 0;
737                 /* BUILD HEADER */
738
739                 *prevhdr = NEXTHDR_FRAGMENT;
740                 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
741                 if (!tmp_hdr) {
742                         err = -ENOMEM;
743                         goto fail;
744                 }
745                 frag = skb_shinfo(skb)->frag_list;
746                 skb_frag_list_init(skb);
747
748                 __skb_pull(skb, hlen);
749                 fh = __skb_push(skb, sizeof(struct frag_hdr));
750                 __skb_push(skb, hlen);
751                 skb_reset_network_header(skb);
752                 memcpy(skb_network_header(skb), tmp_hdr, hlen);
753
754                 fh->nexthdr = nexthdr;
755                 fh->reserved = 0;
756                 fh->frag_off = htons(IP6_MF);
757                 fh->identification = frag_id;
758
759                 first_len = skb_pagelen(skb);
760                 skb->data_len = first_len - skb_headlen(skb);
761                 skb->len = first_len;
762                 ipv6_hdr(skb)->payload_len = htons(first_len -
763                                                    sizeof(struct ipv6hdr));
764
765                 /* We prevent @rt from being freed. */
766                 rcu_read_lock();
767
768                 for (;;) {
769                         /* Prepare header of the next frame,
770                          * before previous one went down. */
771                         if (frag) {
772                                 frag->ip_summed = CHECKSUM_NONE;
773                                 skb_reset_transport_header(frag);
774                                 fh = __skb_push(frag, sizeof(struct frag_hdr));
775                                 __skb_push(frag, hlen);
776                                 skb_reset_network_header(frag);
777                                 memcpy(skb_network_header(frag), tmp_hdr,
778                                        hlen);
779                                 offset += skb->len - hlen - sizeof(struct frag_hdr);
780                                 fh->nexthdr = nexthdr;
781                                 fh->reserved = 0;
782                                 fh->frag_off = htons(offset);
783                                 if (frag->next)
784                                         fh->frag_off |= htons(IP6_MF);
785                                 fh->identification = frag_id;
786                                 ipv6_hdr(frag)->payload_len =
787                                                 htons(frag->len -
788                                                       sizeof(struct ipv6hdr));
789                                 ip6_copy_metadata(frag, skb);
790                         }
791
792                         err = output(net, sk, skb);
793                         if (!err)
794                                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
795                                               IPSTATS_MIB_FRAGCREATES);
796
797                         if (err || !frag)
798                                 break;
799
800                         skb = frag;
801                         frag = skb->next;
802                         skb->next = NULL;
803                 }
804
805                 kfree(tmp_hdr);
806
807                 if (err == 0) {
808                         IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
809                                       IPSTATS_MIB_FRAGOKS);
810                         rcu_read_unlock();
811                         return 0;
812                 }
813
814                 kfree_skb_list(frag);
815
816                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
817                               IPSTATS_MIB_FRAGFAILS);
818                 rcu_read_unlock();
819                 return err;
820
821 slow_path_clean:
822                 skb_walk_frags(skb, frag2) {
823                         if (frag2 == frag)
824                                 break;
825                         frag2->sk = NULL;
826                         frag2->destructor = NULL;
827                         skb->truesize += frag2->truesize;
828                 }
829         }
830
831 slow_path:
832         left = skb->len - hlen;         /* Space per frame */
833         ptr = hlen;                     /* Where to start from */
834
835         /*
836          *      Fragment the datagram.
837          */
838
839         troom = rt->dst.dev->needed_tailroom;
840
841         /*
842          *      Keep copying data until we run out.
843          */
844         while (left > 0)        {
845                 u8 *fragnexthdr_offset;
846
847                 len = left;
848                 /* IF: it doesn't fit, use 'mtu' - the data space left */
849                 if (len > mtu)
850                         len = mtu;
851                 /* IF: we are not sending up to and including the packet end
852                    then align the next start on an eight byte boundary */
853                 if (len < left) {
854                         len &= ~7;
855                 }
856
857                 /* Allocate buffer */
858                 frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
859                                  hroom + troom, GFP_ATOMIC);
860                 if (!frag) {
861                         err = -ENOMEM;
862                         goto fail;
863                 }
864
865                 /*
866                  *      Set up data on packet
867                  */
868
869                 ip6_copy_metadata(frag, skb);
870                 skb_reserve(frag, hroom);
871                 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
872                 skb_reset_network_header(frag);
873                 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
874                 frag->transport_header = (frag->network_header + hlen +
875                                           sizeof(struct frag_hdr));
876
877                 /*
878                  *      Charge the memory for the fragment to any owner
879                  *      it might possess
880                  */
881                 if (skb->sk)
882                         skb_set_owner_w(frag, skb->sk);
883
884                 /*
885                  *      Copy the packet header into the new buffer.
886                  */
887                 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
888
889                 fragnexthdr_offset = skb_network_header(frag);
890                 fragnexthdr_offset += prevhdr - skb_network_header(skb);
891                 *fragnexthdr_offset = NEXTHDR_FRAGMENT;
892
893                 /*
894                  *      Build fragment header.
895                  */
896                 fh->nexthdr = nexthdr;
897                 fh->reserved = 0;
898                 fh->identification = frag_id;
899
900                 /*
901                  *      Copy a block of the IP datagram.
902                  */
903                 BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
904                                      len));
905                 left -= len;
906
907                 fh->frag_off = htons(offset);
908                 if (left > 0)
909                         fh->frag_off |= htons(IP6_MF);
910                 ipv6_hdr(frag)->payload_len = htons(frag->len -
911                                                     sizeof(struct ipv6hdr));
912
913                 ptr += len;
914                 offset += len;
915
916                 /*
917                  *      Put this fragment into the sending queue.
918                  */
919                 err = output(net, sk, frag);
920                 if (err)
921                         goto fail;
922
923                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
924                               IPSTATS_MIB_FRAGCREATES);
925         }
926         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
927                       IPSTATS_MIB_FRAGOKS);
928         consume_skb(skb);
929         return err;
930
931 fail_toobig:
932         if (skb->sk && dst_allfrag(skb_dst(skb)))
933                 sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
934
935         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
936         err = -EMSGSIZE;
937
938 fail:
939         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
940                       IPSTATS_MIB_FRAGFAILS);
941         kfree_skb(skb);
942         return err;
943 }
944
945 static inline int ip6_rt_check(const struct rt6key *rt_key,
946                                const struct in6_addr *fl_addr,
947                                const struct in6_addr *addr_cache)
948 {
949         return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
950                 (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
951 }
952
/* Validate a socket-cached dst against flow @fl6.
 *
 * Returns @dst if it is still usable for this flow; otherwise releases
 * it and returns NULL so the caller performs a fresh route lookup.
 */
static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  const struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt;

	if (!dst)
		goto out;

	/* A cached non-IPv6 dst (e.g. from v4-mapped traffic) can never
	 * satisfy an IPv6 flow; drop it.
	 */
	if (dst->ops->family != AF_INET6) {
		dst_release(dst);
		return NULL;
	}

	rt = (struct rt6_info *)dst;
	/* Yes, checking route validity in not connected
	 * case is not very simple. Take into account,
	 * that we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If route was host route,
	 *    check that cached destination is current.
	 *    If it is network route, we still may
	 *    check its validity using saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save whole address now,
	 *    (because main consumer of this service
	 *    is tcp, which has not this problem),
	 *    so that the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
	   (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
	      (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}
999
/* Core of the dst lookup: resolve a route for @fl6, selecting a source
 * address when the flow has none, and (with optimistic DAD) possibly
 * redirecting to the default router's dst.
 *
 * On success, *dst holds a referenced dst_entry and 0 is returned.  On
 * failure, *dst is released and set to NULL, and a negative errno is
 * returned.
 */
static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
	struct rt6_info *rt;
#endif
	int err;
	int flags = 0;

	/* The correct way to handle this would be to do
	 * ip6_route_get_saddr, and then ip6_route_output; however,
	 * the route-specific preferred source forces the
	 * ip6_route_output call _before_ ip6_route_get_saddr.
	 *
	 * In source specific routing (no src=any default route),
	 * ip6_route_output will fail given src=any saddr, though, so
	 * that's why we try it again later.
	 */
	if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
		struct rt6_info *rt;
		bool had_dst = *dst != NULL;

		if (!had_dst)
			*dst = ip6_route_output(net, sk, fl6);
		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
		err = ip6_route_get_saddr(net, rt, &fl6->daddr,
					  sk ? inet6_sk(sk)->srcprefs : 0,
					  &fl6->saddr);
		if (err)
			goto out_err_release;

		/* If we had an erroneous initial result, pretend it
		 * never existed and let the SA-enabled version take
		 * over.
		 */
		if (!had_dst && (*dst)->error) {
			dst_release(*dst);
			*dst = NULL;
		}

		if (fl6->flowi6_oif)
			flags |= RT6_LOOKUP_F_IFACE;
	}

	/* Second (or first, if no saddr selection happened) lookup with
	 * the now-known source address.
	 */
	if (!*dst)
		*dst = ip6_route_output_flags(net, sk, fl6, flags);

	err = (*dst)->error;
	if (err)
		goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	rt = (struct rt6_info *) *dst;
	rcu_read_lock_bh();
	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
				      rt6_nexthop(rt, &fl6->daddr));
	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
	rcu_read_unlock_bh();

	if (err) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			err = (*dst)->error;
			if (err)
				goto out_err_release;
		}
	}
#endif
	/* A v4-mapped source is only usable with a v4-mapped (or
	 * unspecified) destination; reject mixed address families.
	 */
	if (ipv6_addr_v4mapped(&fl6->saddr) &&
	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
		err = -EAFNOSUPPORT;
		goto out_err_release;
	}

	return 0;

out_err_release:
	dst_release(*dst);
	*dst = NULL;

	if (err == -ENETUNREACH)
		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	return err;
}
1111
1112 /**
1113  *      ip6_dst_lookup - perform route lookup on flow
1114  *      @sk: socket which provides route info
1115  *      @dst: pointer to dst_entry * for result
1116  *      @fl6: flow to lookup
1117  *
1118  *      This function performs a route lookup on the given flow.
1119  *
1120  *      It returns zero on success, or a standard errno code on error.
1121  */
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
		   struct flowi6 *fl6)
{
	/* Fresh lookup: deliberately ignore any caller-provided dst. */
	*dst = NULL;
	return ip6_dst_lookup_tail(net, sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1129
1130 /**
1131  *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1132  *      @sk: socket which provides route info
1133  *      @fl6: flow to lookup
1134  *      @final_dst: final destination address for ipsec lookup
1135  *
1136  *      This function performs a route lookup on the given flow.
1137  *
1138  *      It returns a valid dst pointer on success, or a pointer encoded
1139  *      error code.
1140  */
1141 struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
1142                                       const struct in6_addr *final_dst)
1143 {
1144         struct dst_entry *dst = NULL;
1145         int err;
1146
1147         err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
1148         if (err)
1149                 return ERR_PTR(err);
1150         if (final_dst)
1151                 fl6->daddr = *final_dst;
1152
1153         return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
1154 }
1155 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1156
1157 /**
1158  *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1159  *      @sk: socket which provides the dst cache and route info
1160  *      @fl6: flow to lookup
1161  *      @final_dst: final destination address for ipsec lookup
1162  *
1163  *      This function performs a route lookup on the given flow with the
1164  *      possibility of using the cached route in the socket if it is valid.
1165  *      It will take the socket dst lock when operating on the dst cache.
1166  *      As a result, this function can only be used in process context.
1167  *
1168  *      It returns a valid dst pointer on success, or a pointer encoded
1169  *      error code.
1170  */
1171 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1172                                          const struct in6_addr *final_dst)
1173 {
1174         struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1175
1176         dst = ip6_sk_dst_check(sk, dst, fl6);
1177         if (!dst)
1178                 dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
1179
1180         return dst;
1181 }
1182 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1183
1184 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1185                                                gfp_t gfp)
1186 {
1187         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1188 }
1189
1190 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1191                                                 gfp_t gfp)
1192 {
1193         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1194 }
1195
1196 static void ip6_append_data_mtu(unsigned int *mtu,
1197                                 int *maxfraglen,
1198                                 unsigned int fragheaderlen,
1199                                 struct sk_buff *skb,
1200                                 struct rt6_info *rt,
1201                                 unsigned int orig_mtu)
1202 {
1203         if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1204                 if (!skb) {
1205                         /* first fragment, reserve header_len */
1206                         *mtu = orig_mtu - rt->dst.header_len;
1207
1208                 } else {
1209                         /*
1210                          * this fragment is not first, the headers
1211                          * space is regarded as data space.
1212                          */
1213                         *mtu = orig_mtu;
1214                 }
1215                 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1216                               + fragheaderlen - sizeof(struct frag_hdr);
1217         }
1218 }
1219
/* Initialise cork state for a corked send: duplicate the tx options
 * into @v6_cork, take a reference on the route and record the per-cork
 * mtu, hop limit and traffic class.
 *
 * Returns 0 on success or a negative errno.  NOTE(review): on a
 * mid-way dup failure the partially filled v6_cork->opt is left in
 * place; presumably the caller's cork-release path frees it — verify.
 */
static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
			  struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
			  struct rt6_info *rt, struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	unsigned int mtu;
	struct ipv6_txoptions *opt = ipc6->opt;

	/*
	 * setup for corking
	 */
	if (opt) {
		/* Options must only be installed once per cork cycle. */
		if (WARN_ON(v6_cork->opt))
			return -EINVAL;

		v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
		if (unlikely(!v6_cork->opt))
			return -ENOBUFS;

		v6_cork->opt->tot_len = sizeof(*opt);
		v6_cork->opt->opt_flen = opt->opt_flen;
		v6_cork->opt->opt_nflen = opt->opt_nflen;

		v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
						    sk->sk_allocation);
		if (opt->dst0opt && !v6_cork->opt->dst0opt)
			return -ENOBUFS;

		v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
						    sk->sk_allocation);
		if (opt->dst1opt && !v6_cork->opt->dst1opt)
			return -ENOBUFS;

		v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
						   sk->sk_allocation);
		if (opt->hopopt && !v6_cork->opt->hopopt)
			return -ENOBUFS;

		v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
						    sk->sk_allocation);
		if (opt->srcrt && !v6_cork->opt->srcrt)
			return -ENOBUFS;

		/* need source address above miyazawa*/
	}
	dst_hold(&rt->dst);
	cork->base.dst = &rt->dst;
	cork->fl.u.ip6 = *fl6;
	v6_cork->hop_limit = ipc6->hlimit;
	v6_cork->tclass = ipc6->tclass;
	/* Pick the mtu: device mtu when probing PMTU, otherwise the
	 * path mtu (of the inner dst for XFRM tunnels).
	 */
	if (rt->dst.flags & DST_XFRM_TUNNEL)
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
	else
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(rt->dst.path);
	/* A smaller user-configured frag_size overrides the path mtu. */
	if (np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	cork->base.fragsize = mtu;
	if (dst_allfrag(rt->dst.path))
		cork->base.flags |= IPCORK_ALLFRAG;
	cork->base.length = 0;

	return 0;
}
1287
/* Append @length bytes, pulled in via @getfrag, to the socket's
 * pending queue: fill the tail skb first, then allocate new fragment
 * skbs sized to the cork mtu / fragmentation limits.
 *
 * Returns 0 on success or a negative errno; on error the unqueued
 * bytes are subtracted from cork->length again and OUTDISCARDS is
 * bumped.
 */
static int __ip6_append_data(struct sock *sk,
			     struct flowi6 *fl6,
			     struct sk_buff_head *queue,
			     struct inet_cork *cork,
			     struct inet6_cork *v6_cork,
			     struct page_frag *pfrag,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     unsigned int flags, struct ipcm6_cookie *ipc6,
			     const struct sockcm_cookie *sockc)
{
	struct sk_buff *skb, *skb_prev = NULL;
	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
	int exthdrlen = 0;
	int dst_exthdrlen = 0;
	int hh_len;
	int copy;
	int err;
	int offset = 0;
	__u8 tx_flags = 0;
	u32 tskey = 0;
	struct rt6_info *rt = (struct rt6_info *)cork->dst;
	struct ipv6_txoptions *opt = v6_cork->opt;
	int csummode = CHECKSUM_NONE;
	unsigned int maxnonfragsize, headersize;

	skb = skb_peek_tail(queue);
	if (!skb) {
		/* First chunk of this cork cycle: fragmentable option
		 * space and dst-provided header space still apply.
		 */
		exthdrlen = opt ? opt->opt_flen : 0;
		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
	}

	mtu = cork->fragsize;
	orig_mtu = mtu;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);

	headersize = sizeof(struct ipv6hdr) +
		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
		     (dst_allfrag(&rt->dst) ?
		      sizeof(struct frag_hdr) : 0) +
		     rt->rt6i_nfheader_len;

	/* Reject mtus too small to leave room for any payload after
	 * headers and 8-byte fragment alignment.
	 */
	if (mtu <= fragheaderlen ||
	    ((mtu - fragheaderlen) & ~7) + fragheaderlen <= sizeof(struct frag_hdr))
		goto emsgsize;

	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
		     sizeof(struct frag_hdr);

	/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
	 * the first fragment
	 */
	if (headersize + transhdrlen > mtu)
		goto emsgsize;

	if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
	    (sk->sk_protocol == IPPROTO_UDP ||
	     sk->sk_protocol == IPPROTO_RAW)) {
		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
				sizeof(struct ipv6hdr));
		goto emsgsize;
	}

	if (ip6_sk_ignore_df(sk))
		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
	else
		maxnonfragsize = mtu;

	if (cork->length + length > maxnonfragsize - headersize) {
emsgsize:
		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
		return -EMSGSIZE;
	}

	/* CHECKSUM_PARTIAL only with no extension headers and when
	 * we are not going to fragment
	 */
	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
	    headersize == sizeof(struct ipv6hdr) &&
	    length <= mtu - headersize &&
	    !(flags & MSG_MORE) &&
	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
		csummode = CHECKSUM_PARTIAL;

	if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
		sock_tx_timestamp(sk, sockc->tsflags, &tx_flags);
		if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
		    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
			tskey = sk->sk_tskey++;
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	cork->length += length;
	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
alloc_new_skb:
			/* There's no room in the current skb */
			if (skb)
				fraggap = skb->len - maxfraglen;
			else
				fraggap = 0;
			/* update mtu and maxfraglen if necessary */
			if (!skb || !skb_prev)
				ip6_append_data_mtu(&mtu, &maxfraglen,
						    fragheaderlen, skb, rt,
						    orig_mtu);

			skb_prev = skb;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;

			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			alloclen += dst_exthdrlen;

			if (datalen != length + fraggap) {
				/*
				 * this is not the last fragment, the trailer
				 * space is regarded as data space.
				 */
				datalen += rt->dst.trailer_len;
			}

			alloclen += rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			copy = datalen - transhdrlen - fraggap;
			if (copy < 0) {
				err = -EINVAL;
				goto error;
			}
			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				/* Only allocate while under twice the
				 * send-buffer limit; otherwise report ENOBUFS.
				 */
				if (refcount_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(!skb))
					err = -ENOBUFS;
			}
			if (!skb)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->protocol = htons(ETH_P_IPV6);
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation and ipsec header */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
				    dst_exthdrlen);

			/* Only the initial fragment is time stamped */
			skb_shinfo(skb)->tx_flags = tx_flags;
			tx_flags = 0;
			skb_shinfo(skb)->tskey = tskey;
			tskey = 0;

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				/* Move the overhanging tail of the previous
				 * skb into this fragment, fixing up checksums.
				 */
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			if (copy > 0 &&
			    getfrag(from, data + transhdrlen, offset,
				    copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;

			if ((flags & MSG_CONFIRM) && !skb_prev)
				skb_set_dst_pending_confirm(skb, 1);

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		/* Without SG, copy into the skb's linear tailroom;
		 * otherwise coalesce into page fragments below.
		 */
		if (!(rt->dst.dev->features&NETIF_F_SG) &&
		    skb_tailroom(skb) >= copy) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
						offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;

			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
				goto error;

			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
			}
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (getfrag(from,
				    page_address(pfrag->page) + pfrag->offset,
				    offset, copy, skb->len, skb) < 0)
				goto error_efault;

			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			refcount_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}

	return 0;

error_efault:
	err = -EFAULT;
error:
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	return err;
}
1601
1602 int ip6_append_data(struct sock *sk,
1603                     int getfrag(void *from, char *to, int offset, int len,
1604                                 int odd, struct sk_buff *skb),
1605                     void *from, int length, int transhdrlen,
1606                     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1607                     struct rt6_info *rt, unsigned int flags,
1608                     const struct sockcm_cookie *sockc)
1609 {
1610         struct inet_sock *inet = inet_sk(sk);
1611         struct ipv6_pinfo *np = inet6_sk(sk);
1612         int exthdrlen;
1613         int err;
1614
1615         if (flags&MSG_PROBE)
1616                 return 0;
1617         if (skb_queue_empty(&sk->sk_write_queue)) {
1618                 /*
1619                  * setup for corking
1620                  */
1621                 err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1622                                      ipc6, rt, fl6);
1623                 if (err)
1624                         return err;
1625
1626                 exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1627                 length += exthdrlen;
1628                 transhdrlen += exthdrlen;
1629         } else {
1630                 fl6 = &inet->cork.fl.u.ip6;
1631                 transhdrlen = 0;
1632         }
1633
1634         return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1635                                  &np->cork, sk_page_frag(sk), getfrag,
1636                                  from, length, transhdrlen, flags, ipc6, sockc);
1637 }
1638 EXPORT_SYMBOL_GPL(ip6_append_data);
1639
1640 static void ip6_cork_release(struct inet_cork_full *cork,
1641                              struct inet6_cork *v6_cork)
1642 {
1643         if (v6_cork->opt) {
1644                 kfree(v6_cork->opt->dst0opt);
1645                 kfree(v6_cork->opt->dst1opt);
1646                 kfree(v6_cork->opt->hopopt);
1647                 kfree(v6_cork->opt->srcrt);
1648                 kfree(v6_cork->opt);
1649                 v6_cork->opt = NULL;
1650         }
1651
1652         if (cork->base.dst) {
1653                 dst_release(cork->base.dst);
1654                 cork->base.dst = NULL;
1655                 cork->base.flags &= ~IPCORK_ALLFRAG;
1656         }
1657         memset(&cork->fl, 0, sizeof(cork->fl));
1658 }
1659
/*
 * Turn the queued fragments of a corked datagram into one finished skb:
 * chain all but the first skb onto the head's frag_list, push the
 * extension headers and the IPv6 header, attach the route, and bump
 * the output statistics.  Releases the cork state on success; returns
 * NULL (with the cork left intact) if the queue was empty.
 */
struct sk_buff *__ip6_make_skb(struct sock *sk,
			       struct sk_buff_head *queue,
			       struct inet_cork_full *cork,
			       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = v6_cork->opt;
	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
	struct flowi6 *fl6 = &cork->fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;

	skb = __skb_dequeue(queue);
	if (!skb)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	/* Fold every remaining queued skb into the head skb's frag_list,
	 * stripping their (duplicate) network headers and transferring
	 * their byte/truesize accounting and socket ownership to the head.
	 */
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	skb->ignore_df = ip6_sk_ignore_df(sk);

	/* Keep a private copy of the destination: pushing a routing
	 * header below may rewrite the address the header carries.
	 */
	*final_dst = fl6->daddr;
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/* Fill in version/tclass/flowlabel, then the remaining fields. */
	ip6_flow_hdr(hdr, v6_cork->tclass,
		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
					ip6_autoflowlabel(net, np), fl6));
	hdr->hop_limit = v6_cork->hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb_dst_set(skb, dst_clone(&rt->dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
		u8 icmp6_type;

		/* For raw sockets without IPV6_HDRINCL the ICMPv6 type comes
		 * from the flow; otherwise read it from the built header.
		 */
		if (sk->sk_socket->type == SOCK_RAW && !inet_sk(sk)->hdrincl)
			icmp6_type = fl6->fl6_icmp_type;
		else
			icmp6_type = icmp6_hdr(skb)->icmp6_type;
		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_type);
		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
	}

	ip6_cork_release(cork, v6_cork);
out:
	return skb;
}
1738
1739 int ip6_send_skb(struct sk_buff *skb)
1740 {
1741         struct net *net = sock_net(skb->sk);
1742         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1743         int err;
1744
1745         err = ip6_local_out(net, skb->sk, skb);
1746         if (err) {
1747                 if (err > 0)
1748                         err = net_xmit_errno(err);
1749                 if (err)
1750                         IP6_INC_STATS(net, rt->rt6i_idev,
1751                                       IPSTATS_MIB_OUTDISCARDS);
1752         }
1753
1754         return err;
1755 }
1756
/* Assemble the corked datagram on sk->sk_write_queue and transmit it.
 * Returns 0 when there was nothing queued, otherwise the result of
 * ip6_send_skb().
 */
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb = ip6_finish_skb(sk);

	return skb ? ip6_send_skb(skb) : 0;
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1768
1769 static void __ip6_flush_pending_frames(struct sock *sk,
1770                                        struct sk_buff_head *queue,
1771                                        struct inet_cork_full *cork,
1772                                        struct inet6_cork *v6_cork)
1773 {
1774         struct sk_buff *skb;
1775
1776         while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1777                 if (skb_dst(skb))
1778                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1779                                       IPSTATS_MIB_OUTDISCARDS);
1780                 kfree_skb(skb);
1781         }
1782
1783         ip6_cork_release(cork, v6_cork);
1784 }
1785
1786 void ip6_flush_pending_frames(struct sock *sk)
1787 {
1788         __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1789                                    &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1790 }
1791 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1792
1793 struct sk_buff *ip6_make_skb(struct sock *sk,
1794                              int getfrag(void *from, char *to, int offset,
1795                                          int len, int odd, struct sk_buff *skb),
1796                              void *from, int length, int transhdrlen,
1797                              struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1798                              struct rt6_info *rt, unsigned int flags,
1799                              const struct sockcm_cookie *sockc)
1800 {
1801         struct inet_cork_full cork;
1802         struct inet6_cork v6_cork;
1803         struct sk_buff_head queue;
1804         int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1805         int err;
1806
1807         if (flags & MSG_PROBE)
1808                 return NULL;
1809
1810         __skb_queue_head_init(&queue);
1811
1812         cork.base.flags = 0;
1813         cork.base.addr = 0;
1814         cork.base.opt = NULL;
1815         cork.base.dst = NULL;
1816         v6_cork.opt = NULL;
1817         err = ip6_setup_cork(sk, &cork, &v6_cork, ipc6, rt, fl6);
1818         if (err) {
1819                 ip6_cork_release(&cork, &v6_cork);
1820                 return ERR_PTR(err);
1821         }
1822         if (ipc6->dontfrag < 0)
1823                 ipc6->dontfrag = inet6_sk(sk)->dontfrag;
1824
1825         err = __ip6_append_data(sk, fl6, &queue, &cork.base, &v6_cork,
1826                                 &current->task_frag, getfrag, from,
1827                                 length + exthdrlen, transhdrlen + exthdrlen,
1828                                 flags, ipc6, sockc);
1829         if (err) {
1830                 __ip6_flush_pending_frames(sk, &queue, &cork, &v6_cork);
1831                 return ERR_PTR(err);
1832         }
1833
1834         return __ip6_make_skb(sk, &queue, &cork, &v6_cork);
1835 }