GNU Linux-libre 4.19.242-gnu1
net/ipv6/ip6_output.c
/*
 *      IPv6 output functions
 *      Linux INET6 implementation
 *
 *      Authors:
 *      Pedro Roque             <roque@di.fc.ul.pt>
 *
 *      Based on linux/net/ipv4/ip_output.c
 *
 *      This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 *
 *      Changes:
 *      A.N.Kuznetsov   :       arithmetic in fragmentation.
 *                              extension headers are implemented.
 *                              route changes now work.
 *                              ip6_forward does not confuse sniffers.
 *                              etc.
 *
 *      H. von Brand    :       Added missing #include <linux/string.h>
 *      Imran Patel     :       frag id should be in NBO
 *      Kazunori MIYAZAWA @USAGI
 *                      :       add ip6_append_data and related functions
 *                              for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/bpf-cgroup.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>
#include <net/l3mdev.h>
#include <net/lwtunnel.h>

static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);
        struct net_device *dev = dst->dev;
        struct neighbour *neigh;
        struct in6_addr *nexthop;
        int ret;

        if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
                struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

                if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
                    ((mroute6_is_socket(net, skb) &&
                     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
                     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
                                         &ipv6_hdr(skb)->saddr))) {
                        struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

                        /* Do not check for IFF_ALLMULTI; multicast routing
                           is not supported in any case.
                         */
                        if (newskb)
                                NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
                                        net, sk, newskb, NULL, newskb->dev,
                                        dev_loopback_xmit);

                        if (ipv6_hdr(skb)->hop_limit == 0) {
                                IP6_INC_STATS(net, idev,
                                              IPSTATS_MIB_OUTDISCARDS);
                                kfree_skb(skb);
                                return 0;
                        }
                }

                IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);

                if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
                    IPV6_ADDR_SCOPE_NODELOCAL &&
                    !(dev->flags & IFF_LOOPBACK)) {
                        kfree_skb(skb);
                        return 0;
                }
        }

        if (lwtunnel_xmit_redirect(dst->lwtstate)) {
                int res = lwtunnel_xmit(skb);

                if (res < 0 || res == LWTUNNEL_XMIT_DONE)
                        return res;
        }

        rcu_read_lock_bh();
        nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
        neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
        if (unlikely(!neigh))
                neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
        if (!IS_ERR(neigh)) {
                sock_confirm_neigh(skb, neigh);
                ret = neigh_output(neigh, skb);
                rcu_read_unlock_bh();
                return ret;
        }
        rcu_read_unlock_bh();

        IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
        kfree_skb(skb);
        return -EINVAL;
}

static int
ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
                                    struct sk_buff *skb, unsigned int mtu)
{
        struct sk_buff *segs, *nskb;
        netdev_features_t features;
        int ret = 0;

        /* Please see corresponding comment in ip_finish_output_gso
         * describing the cases where GSO segment length exceeds the
         * egress MTU.
         */
        features = netif_skb_features(skb);
        segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
        if (IS_ERR_OR_NULL(segs)) {
                kfree_skb(skb);
                return -ENOMEM;
        }

        consume_skb(skb);

        skb_list_walk_safe(segs, segs, nskb) {
                int err;

                skb_mark_not_on_list(segs);
                err = ip6_fragment(net, sk, segs, ip6_finish_output2);
                if (err && ret == 0)
                        ret = err;
        }

        return ret;
}

static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        unsigned int mtu;
        int ret;

        ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
        if (ret) {
                kfree_skb(skb);
                return ret;
        }

#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
        /* Policy lookup after SNAT yielded a new policy */
        if (skb_dst(skb)->xfrm) {
                IP6CB(skb)->flags |= IP6SKB_REROUTED;
                return dst_output(net, sk, skb);
        }
#endif

        mtu = ip6_skb_dst_mtu(skb);
        if (skb_is_gso(skb) && !skb_gso_validate_network_len(skb, mtu))
                return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);

        if ((skb->len > mtu && !skb_is_gso(skb)) ||
            dst_allfrag(skb_dst(skb)) ||
            (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
                return ip6_fragment(net, sk, skb, ip6_finish_output2);
        else
                return ip6_finish_output2(net, sk, skb);
}

int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        struct net_device *dev = skb_dst(skb)->dev;
        struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

        skb->protocol = htons(ETH_P_IPV6);
        skb->dev = dev;

        if (unlikely(idev->cnf.disable_ipv6)) {
                IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
                kfree_skb(skb);
                return 0;
        }

        return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
                            net, sk, skb, NULL, dev,
                            ip6_finish_output,
                            !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}
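
/* Output path summary: dst_output() on an IPv6 route invokes ip6_output()
 * above, which runs the NF_INET_POST_ROUTING netfilter hook and continues
 * in ip6_finish_output().  That function fragments oversized (or
 * GSO-slowpath) packets via ip6_fragment() and otherwise calls
 * ip6_finish_output2(), which resolves the nexthop neighbour and hands
 * the skb to the device through neigh_output().
 */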

bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
{
        if (!np->autoflowlabel_set)
                return ip6_default_np_autolabel(net);
        else
                return np->autoflowlabel;
}

/*
 * xmit an sk_buff (used by TCP, SCTP and DCCP)
 * Note: the socket lock is not held for SYNACK packets, but the socket
 * might be modified by calls to skb_set_owner_w() and ipv6_local_error(),
 * which use proper atomic operations or spinlocks.
 */
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
             __u32 mark, struct ipv6_txoptions *opt, int tclass)
{
        struct net *net = sock_net(sk);
        const struct ipv6_pinfo *np = inet6_sk(sk);
        struct in6_addr *first_hop = &fl6->daddr;
        struct dst_entry *dst = skb_dst(skb);
        unsigned int head_room;
        struct ipv6hdr *hdr;
        u8  proto = fl6->flowi6_proto;
        int seg_len = skb->len;
        int hlimit = -1;
        u32 mtu;

        head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
        if (opt)
                head_room += opt->opt_nflen + opt->opt_flen;

        if (unlikely(skb_headroom(skb) < head_room)) {
                struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
                if (!skb2) {
                        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                                      IPSTATS_MIB_OUTDISCARDS);
                        kfree_skb(skb);
                        return -ENOBUFS;
                }
                if (skb->sk)
                        skb_set_owner_w(skb2, skb->sk);
                consume_skb(skb);
                skb = skb2;
        }

        if (opt) {
                seg_len += opt->opt_nflen + opt->opt_flen;

                if (opt->opt_flen)
                        ipv6_push_frag_opts(skb, opt, &proto);

                if (opt->opt_nflen)
                        ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
                                             &fl6->saddr);
        }

        skb_push(skb, sizeof(struct ipv6hdr));
        skb_reset_network_header(skb);
        hdr = ipv6_hdr(skb);

        /*
         *      Fill in the IPv6 header
         */
        if (np)
                hlimit = np->hop_limit;
        if (hlimit < 0)
                hlimit = ip6_dst_hoplimit(dst);

        ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
                                ip6_autoflowlabel(net, np), fl6));

        hdr->payload_len = htons(seg_len);
        hdr->nexthdr = proto;
        hdr->hop_limit = hlimit;

        hdr->saddr = fl6->saddr;
        hdr->daddr = *first_hop;

        skb->protocol = htons(ETH_P_IPV6);
        skb->priority = sk->sk_priority;
        skb->mark = mark;

        mtu = dst_mtu(dst);
        if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
                IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
                              IPSTATS_MIB_OUT, skb->len);

                /* if egress device is enslaved to an L3 master device pass the
                 * skb to its handler for processing
                 */
                skb = l3mdev_ip6_out((struct sock *)sk, skb);
                if (unlikely(!skb))
                        return 0;

                /* hooks should never assume socket lock is held.
                 * we promote our socket to non const
                 */
                return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
                               net, (struct sock *)sk, skb, NULL, dst->dev,
                               dst_output);
        }

        skb->dev = dst->dev;
        /* ipv6_local_error() does not require socket lock,
         * we promote our socket to non const
         */
        ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);

        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
        kfree_skb(skb);
        return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);
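
/* Usage sketch for ip6_xmit() (illustrative only; real call sites live in
 * the transport protocols, e.g. tcp_v6_send_synack() in
 * net/ipv6/tcp_ipv6.c).  The caller routes the flow, attaches the dst to
 * the skb, and then hands the packet over; names such as 'remote' and
 * 'local' below are placeholders:
 *
 *      struct flowi6 fl6 = { .flowi6_proto = IPPROTO_TCP };
 *      struct dst_entry *dst;
 *
 *      fl6.daddr = *remote;
 *      fl6.saddr = *local;
 *      dst = ip6_dst_lookup_flow(net, sk, &fl6, NULL);
 *      if (IS_ERR(dst))
 *              return PTR_ERR(dst);
 *      skb_dst_set(skb, dst);
 *      err = ip6_xmit(sk, skb, &fl6, sk->sk_mark, opt, np->tclass);
 */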

static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
        struct ip6_ra_chain *ra;
        struct sock *last = NULL;

        read_lock(&ip6_ra_lock);
        for (ra = ip6_ra_chain; ra; ra = ra->next) {
                struct sock *sk = ra->sk;
                if (sk && ra->sel == sel &&
                    (!sk->sk_bound_dev_if ||
                     sk->sk_bound_dev_if == skb->dev->ifindex)) {
                        if (last) {
                                struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
                                if (skb2)
                                        rawv6_rcv(last, skb2);
                        }
                        last = sk;
                }
        }

        if (last) {
                rawv6_rcv(last, skb);
                read_unlock(&ip6_ra_lock);
                return 1;
        }
        read_unlock(&ip6_ra_lock);
        return 0;
}

static int ip6_forward_proxy_check(struct sk_buff *skb)
{
        struct ipv6hdr *hdr = ipv6_hdr(skb);
        u8 nexthdr = hdr->nexthdr;
        __be16 frag_off;
        int offset;

        if (ipv6_ext_hdr(nexthdr)) {
                offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
                if (offset < 0)
                        return 0;
        } else
                offset = sizeof(struct ipv6hdr);

        if (nexthdr == IPPROTO_ICMPV6) {
                struct icmp6hdr *icmp6;

                if (!pskb_may_pull(skb, (skb_network_header(skb) +
                                         offset + 1 - skb->data)))
                        return 0;

                icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

                switch (icmp6->icmp6_type) {
                case NDISC_ROUTER_SOLICITATION:
                case NDISC_ROUTER_ADVERTISEMENT:
                case NDISC_NEIGHBOUR_SOLICITATION:
                case NDISC_NEIGHBOUR_ADVERTISEMENT:
                case NDISC_REDIRECT:
                        /* For reaction involving unicast neighbor discovery
                         * message destined to the proxied address, pass it to
                         * input function.
                         */
                        return 1;
                default:
                        break;
                }
        }

        /*
         * The proxying router can't forward traffic sent to a link-local
         * address, so signal the sender and discard the packet. This
         * behavior is clarified by the MIPv6 specification.
         */
        if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
                dst_link_failure(skb);
                return -1;
        }

        return 0;
}

static inline int ip6_forward_finish(struct net *net, struct sock *sk,
                                     struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);

        __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
        __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);

        skb->tstamp = 0;
        return dst_output(net, sk, skb);
}

static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
{
        if (skb->len <= mtu)
                return false;

        /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
        if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
                return true;

        if (skb->ignore_df)
                return false;

        if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
                return false;

        return true;
}

int ip6_forward(struct sk_buff *skb)
{
        struct inet6_dev *idev = __in6_dev_get_safely(skb->dev);
        struct dst_entry *dst = skb_dst(skb);
        struct ipv6hdr *hdr = ipv6_hdr(skb);
        struct inet6_skb_parm *opt = IP6CB(skb);
        struct net *net = dev_net(dst->dev);
        u32 mtu;

        if (net->ipv6.devconf_all->forwarding == 0)
                goto error;

        if (skb->pkt_type != PACKET_HOST)
                goto drop;

        if (unlikely(skb->sk))
                goto drop;

        if (skb_warn_if_lro(skb))
                goto drop;

        if (!net->ipv6.devconf_all->disable_policy &&
            (!idev || !idev->cnf.disable_policy) &&
            !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
                __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
                goto drop;
        }

        skb_forward_csum(skb);

        /*
         *      We DO NOT make any processing on
         *      RA packets, pushing them to user level AS IS
         *      without any WARRANTY that the application will be able
         *      to interpret them. The reason is that we
         *      cannot make anything clever here.
         *
         *      We are not an end-node, so if the packet contains
         *      AH/ESP, we cannot do anything.
         *      Defragmentation also would be a mistake; RA packets
         *      cannot be fragmented, because there is no warranty
         *      that different fragments will go along one path. --ANK
         */
        if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
                if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
                        return 0;
        }

        /*
         *      check and decrement hop limit
         */
        if (hdr->hop_limit <= 1) {
                /* Force OUTPUT device used as source address */
                skb->dev = dst->dev;
                icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
                __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);

                kfree_skb(skb);
                return -ETIMEDOUT;
        }

        /* XXX: idev->cnf.proxy_ndp? */
        if (net->ipv6.devconf_all->proxy_ndp &&
            pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
                int proxied = ip6_forward_proxy_check(skb);
                if (proxied > 0)
                        return ip6_input(skb);
                else if (proxied < 0) {
                        __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
                        goto drop;
                }
        }

        if (!xfrm6_route_forward(skb)) {
                __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
                goto drop;
        }
        dst = skb_dst(skb);

        /* IPv6 specs say nothing about it, but it is clear that we cannot
           send redirects to source routed frames.
           We don't send redirects to frames decapsulated from IPsec.
         */
        if (IP6CB(skb)->iif == dst->dev->ifindex &&
            opt->srcrt == 0 && !skb_sec_path(skb)) {
                struct in6_addr *target = NULL;
                struct inet_peer *peer;
                struct rt6_info *rt;

                /*
                 *      incoming and outgoing devices are the same
                 *      send a redirect.
                 */

                rt = (struct rt6_info *) dst;
                if (rt->rt6i_flags & RTF_GATEWAY)
                        target = &rt->rt6i_gateway;
                else
                        target = &hdr->daddr;

                peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);

                /* Limit redirects both by destination (here)
                   and by source (inside ndisc_send_redirect)
                 */
                if (inet_peer_xrlim_allow(peer, 1*HZ))
                        ndisc_send_redirect(skb, target);
                if (peer)
                        inet_putpeer(peer);
        } else {
                int addrtype = ipv6_addr_type(&hdr->saddr);

                /* This check is security critical. */
                if (addrtype == IPV6_ADDR_ANY ||
                    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
                        goto error;
                if (addrtype & IPV6_ADDR_LINKLOCAL) {
                        icmpv6_send(skb, ICMPV6_DEST_UNREACH,
                                    ICMPV6_NOT_NEIGHBOUR, 0);
                        goto error;
                }
        }

        mtu = ip6_dst_mtu_forward(dst);
        if (mtu < IPV6_MIN_MTU)
                mtu = IPV6_MIN_MTU;

        if (ip6_pkt_too_big(skb, mtu)) {
                /* Again, force OUTPUT device used as source address */
                skb->dev = dst->dev;
                icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
                __IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
                __IP6_INC_STATS(net, ip6_dst_idev(dst),
                                IPSTATS_MIB_FRAGFAILS);
                kfree_skb(skb);
                return -EMSGSIZE;
        }

        if (skb_cow(skb, dst->dev->hard_header_len)) {
                __IP6_INC_STATS(net, ip6_dst_idev(dst),
                                IPSTATS_MIB_OUTDISCARDS);
                goto drop;
        }

        hdr = ipv6_hdr(skb);

        /* Mangling hops number delayed to point after skb COW */

        hdr->hop_limit--;

        return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
                       net, NULL, skb, skb->dev, dst->dev,
                       ip6_forward_finish);

error:
        __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
drop:
        kfree_skb(skb);
        return -EINVAL;
}

static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
        to->pkt_type = from->pkt_type;
        to->priority = from->priority;
        to->protocol = from->protocol;
        skb_dst_drop(to);
        skb_dst_set(to, dst_clone(skb_dst(from)));
        to->dev = from->dev;
        to->mark = from->mark;

        skb_copy_hash(to, from);

#ifdef CONFIG_NET_SCHED
        to->tc_index = from->tc_index;
#endif
        nf_copy(to, from);
        skb_copy_secmark(to, from);
}

int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
                 int (*output)(struct net *, struct sock *, struct sk_buff *))
{
        struct sk_buff *frag;
        struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
        struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
                                inet6_sk(skb->sk) : NULL;
        struct ipv6hdr *tmp_hdr;
        struct frag_hdr *fh;
        unsigned int mtu, hlen, left, len, nexthdr_offset;
        int hroom, troom;
        __be32 frag_id;
        int ptr, offset = 0, err = 0;
        u8 *prevhdr, nexthdr = 0;

        err = ip6_find_1stfragopt(skb, &prevhdr);
        if (err < 0)
                goto fail;
        hlen = err;
        nexthdr = *prevhdr;
        nexthdr_offset = prevhdr - skb_network_header(skb);

        mtu = ip6_skb_dst_mtu(skb);

        /* We must not fragment if the socket is set to force MTU discovery
         * or if the skb is not generated by a local socket.
         */
        if (unlikely(!skb->ignore_df && skb->len > mtu))
                goto fail_toobig;

        if (IP6CB(skb)->frag_max_size) {
                if (IP6CB(skb)->frag_max_size > mtu)
                        goto fail_toobig;

                /* don't send fragments larger than what we received */
                mtu = IP6CB(skb)->frag_max_size;
                if (mtu < IPV6_MIN_MTU)
                        mtu = IPV6_MIN_MTU;
        }

        if (np && np->frag_size < mtu) {
                if (np->frag_size)
                        mtu = np->frag_size;
        }
        if (mtu < hlen + sizeof(struct frag_hdr) + 8)
                goto fail_toobig;
        mtu -= hlen + sizeof(struct frag_hdr);

        frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
                                    &ipv6_hdr(skb)->saddr);

        if (skb->ip_summed == CHECKSUM_PARTIAL &&
            (err = skb_checksum_help(skb)))
                goto fail;

        prevhdr = skb_network_header(skb) + nexthdr_offset;
        hroom = LL_RESERVED_SPACE(rt->dst.dev);
        if (skb_has_frag_list(skb)) {
                unsigned int first_len = skb_pagelen(skb);
                struct sk_buff *frag2;

                if (first_len - hlen > mtu ||
                    ((first_len - hlen) & 7) ||
                    skb_cloned(skb) ||
                    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
                        goto slow_path;

                skb_walk_frags(skb, frag) {
                        /* Correct geometry. */
                        if (frag->len > mtu ||
                            ((frag->len & 7) && frag->next) ||
                            skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
                                goto slow_path_clean;

                        /* Partially cloned skb? */
                        if (skb_shared(frag))
                                goto slow_path_clean;

                        BUG_ON(frag->sk);
                        if (skb->sk) {
                                frag->sk = skb->sk;
                                frag->destructor = sock_wfree;
                        }
                        skb->truesize -= frag->truesize;
                }

                err = 0;
                offset = 0;
                /* BUILD HEADER */

                *prevhdr = NEXTHDR_FRAGMENT;
                tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
                if (!tmp_hdr) {
                        err = -ENOMEM;
                        goto fail;
                }
                frag = skb_shinfo(skb)->frag_list;
                skb_frag_list_init(skb);

                __skb_pull(skb, hlen);
                fh = __skb_push(skb, sizeof(struct frag_hdr));
                __skb_push(skb, hlen);
                skb_reset_network_header(skb);
                memcpy(skb_network_header(skb), tmp_hdr, hlen);

                fh->nexthdr = nexthdr;
                fh->reserved = 0;
                fh->frag_off = htons(IP6_MF);
                fh->identification = frag_id;

                first_len = skb_pagelen(skb);
                skb->data_len = first_len - skb_headlen(skb);
                skb->len = first_len;
                ipv6_hdr(skb)->payload_len = htons(first_len -
                                                   sizeof(struct ipv6hdr));

                for (;;) {
                        /* Prepare header of the next frame,
                         * before previous one went down. */
                        if (frag) {
                                frag->ip_summed = CHECKSUM_NONE;
                                skb_reset_transport_header(frag);
                                fh = __skb_push(frag, sizeof(struct frag_hdr));
                                __skb_push(frag, hlen);
                                skb_reset_network_header(frag);
                                memcpy(skb_network_header(frag), tmp_hdr,
                                       hlen);
                                offset += skb->len - hlen - sizeof(struct frag_hdr);
                                fh->nexthdr = nexthdr;
                                fh->reserved = 0;
                                fh->frag_off = htons(offset);
                                if (frag->next)
                                        fh->frag_off |= htons(IP6_MF);
                                fh->identification = frag_id;
                                ipv6_hdr(frag)->payload_len =
                                                htons(frag->len -
                                                      sizeof(struct ipv6hdr));
                                ip6_copy_metadata(frag, skb);
                        }

                        err = output(net, sk, skb);
                        if (!err)
                                IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                                              IPSTATS_MIB_FRAGCREATES);

                        if (err || !frag)
                                break;

                        skb = frag;
                        frag = skb->next;
                        skb->next = NULL;
                }

                kfree(tmp_hdr);

                if (err == 0) {
                        IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                                      IPSTATS_MIB_FRAGOKS);
                        return 0;
                }

                kfree_skb_list(frag);

                IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                              IPSTATS_MIB_FRAGFAILS);
                return err;

slow_path_clean:
                skb_walk_frags(skb, frag2) {
                        if (frag2 == frag)
                                break;
                        frag2->sk = NULL;
                        frag2->destructor = NULL;
                        skb->truesize += frag2->truesize;
                }
        }

slow_path:
        left = skb->len - hlen;         /* Space per frame */
        ptr = hlen;                     /* Where to start from */

        /*
         *      Fragment the datagram.
         */

        troom = rt->dst.dev->needed_tailroom;

        /*
         *      Keep copying data until we run out.
         */
        while (left > 0) {
                u8 *fragnexthdr_offset;

                len = left;
                /* IF: it doesn't fit, use 'mtu' - the data space left */
                if (len > mtu)
                        len = mtu;
                /* IF: we are not sending up to and including the packet end
                   then align the next start on an eight byte boundary */
                if (len < left) {
                        len &= ~7;
                }

                /* Allocate buffer */
                frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
                                 hroom + troom, GFP_ATOMIC);
                if (!frag) {
                        err = -ENOMEM;
                        goto fail;
                }

                /*
                 *      Set up data on packet
                 */

                ip6_copy_metadata(frag, skb);
                skb_reserve(frag, hroom);
                skb_put(frag, len + hlen + sizeof(struct frag_hdr));
                skb_reset_network_header(frag);
                fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
                frag->transport_header = (frag->network_header + hlen +
                                          sizeof(struct frag_hdr));

                /*
                 *      Charge the memory for the fragment to any owner
                 *      it might possess
                 */
                if (skb->sk)
                        skb_set_owner_w(frag, skb->sk);

                /*
                 *      Copy the packet header into the new buffer.
                 */
                skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

                fragnexthdr_offset = skb_network_header(frag);
                fragnexthdr_offset += prevhdr - skb_network_header(skb);
                *fragnexthdr_offset = NEXTHDR_FRAGMENT;

                /*
                 *      Build fragment header.
                 */
                fh->nexthdr = nexthdr;
                fh->reserved = 0;
                fh->identification = frag_id;

                /*
                 *      Copy a block of the IP datagram.
                 */
                BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
                                     len));
                left -= len;

                fh->frag_off = htons(offset);
                if (left > 0)
                        fh->frag_off |= htons(IP6_MF);
                ipv6_hdr(frag)->payload_len = htons(frag->len -
                                                    sizeof(struct ipv6hdr));

                ptr += len;
                offset += len;

                /*
                 *      Put this fragment into the sending queue.
                 */
                err = output(net, sk, frag);
                if (err)
                        goto fail;

                IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                              IPSTATS_MIB_FRAGCREATES);
        }
        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                      IPSTATS_MIB_FRAGOKS);
        consume_skb(skb);
        return err;

fail_toobig:
        if (skb->sk && dst_allfrag(skb_dst(skb)))
                sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);

        icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
        err = -EMSGSIZE;

fail:
        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                      IPSTATS_MIB_FRAGFAILS);
        kfree_skb(skb);
        return err;
}
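
/* Fragment sizing, worked through: with a plain 40-byte IPv6 header
 * (hlen == 40) and a 1500-byte path MTU, ip6_fragment() computes
 * mtu -= hlen + sizeof(struct frag_hdr), i.e. 1500 - 48 = 1452 bytes of
 * payload budget per fragment.  Every fragment except the last is then
 * rounded down to a multiple of eight (len &= ~7), giving 1448 payload
 * bytes, so each full fragment on the wire is 40 + 8 + 1448 = 1496
 * bytes; only the final fragment may carry an unaligned remainder.
 */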

static inline int ip6_rt_check(const struct rt6key *rt_key,
                               const struct in6_addr *fl_addr,
                               const struct in6_addr *addr_cache)
{
        return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
                (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
}

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
                                          struct dst_entry *dst,
                                          const struct flowi6 *fl6)
{
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct rt6_info *rt;

        if (!dst)
                goto out;

        if (dst->ops->family != AF_INET6) {
                dst_release(dst);
                return NULL;
        }

        rt = (struct rt6_info *)dst;
        /* Yes, checking route validity in the non-connected
         * case is not very simple. Take into account
         * that we do not support routing by source, TOS,
         * and MSG_DONTROUTE            --ANK (980726)
         *
         * 1. ip6_rt_check(): If route was host route,
         *    check that cached destination is current.
         *    If it is network route, we still may
         *    check its validity using saved pointer
         *    to the last used address: daddr_cache.
         *    We do not want to save the whole address now
         *    (because the main consumer of this service
         *    is TCP, which does not have this problem),
         *    so the last trick works only on connected
         *    sockets.
         * 2. oif also should be the same.
         */
        if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
            ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
           (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
              (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
                dst_release(dst);
                dst = NULL;
        }

out:
        return dst;
}

static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
                               struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
        struct neighbour *n;
        struct rt6_info *rt;
#endif
        int err;
        int flags = 0;

        /* The correct way to handle this would be to do
         * ip6_route_get_saddr, and then ip6_route_output; however,
         * the route-specific preferred source forces the
         * ip6_route_output call _before_ ip6_route_get_saddr.
         *
         * In source specific routing (no src=any default route),
         * ip6_route_output will fail given src=any saddr, though, so
         * that's why we try it again later.
         */
        if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
                struct fib6_info *from;
                struct rt6_info *rt;
                bool had_dst = *dst != NULL;

                if (!had_dst)
                        *dst = ip6_route_output(net, sk, fl6);
                rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;

                rcu_read_lock();
                from = rt ? rcu_dereference(rt->from) : NULL;
                err = ip6_route_get_saddr(net, from, &fl6->daddr,
                                          sk ? inet6_sk(sk)->srcprefs : 0,
                                          &fl6->saddr);
                rcu_read_unlock();

                if (err)
                        goto out_err_release;

                /* If we had an erroneous initial result, pretend it
                 * never existed and let the SA-enabled version take
                 * over.
                 */
                if (!had_dst && (*dst)->error) {
                        dst_release(*dst);
                        *dst = NULL;
                }

                if (fl6->flowi6_oif)
                        flags |= RT6_LOOKUP_F_IFACE;
        }

        if (!*dst)
                *dst = ip6_route_output_flags(net, sk, fl6, flags);

        err = (*dst)->error;
        if (err)
                goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
        /*
         * Here if the dst entry we've looked up
         * has a neighbour entry that is in the INCOMPLETE
         * state and the src address from the flow is
         * marked as OPTIMISTIC, we release the found
         * dst entry and replace it instead with the
         * dst entry of the nexthop router
         */
        rt = (struct rt6_info *) *dst;
        rcu_read_lock_bh();
        n = __ipv6_neigh_lookup_noref(rt->dst.dev,
                                      rt6_nexthop(rt, &fl6->daddr));
        err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
        rcu_read_unlock_bh();

        if (err) {
                struct inet6_ifaddr *ifp;
                struct flowi6 fl_gw6;
                int redirect;

                ifp = ipv6_get_ifaddr(net, &fl6->saddr,
                                      (*dst)->dev, 1);

                redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
                if (ifp)
                        in6_ifa_put(ifp);

                if (redirect) {
                        /*
                         * We need to get the dst entry for the
                         * default router instead
                         */
                        dst_release(*dst);
                        memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
                        memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
                        *dst = ip6_route_output(net, sk, &fl_gw6);
                        err = (*dst)->error;
                        if (err)
                                goto out_err_release;
                }
        }
#endif
        if (ipv6_addr_v4mapped(&fl6->saddr) &&
            !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
                err = -EAFNOSUPPORT;
                goto out_err_release;
        }

        return 0;

out_err_release:
        dst_release(*dst);
        *dst = NULL;

        if (err == -ENETUNREACH)
                IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
        return err;
}

/**
 *      ip6_dst_lookup - perform route lookup on flow
 *      @net: network namespace to look the route up in
 *      @sk: socket which provides route info
 *      @dst: pointer to dst_entry * for result
 *      @fl6: flow to lookup
 *
 *      This function performs a route lookup on the given flow.
 *
 *      It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
                   struct flowi6 *fl6)
{
        *dst = NULL;
        return ip6_dst_lookup_tail(net, sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);
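
/* Minimal usage sketch (illustrative): ip6_dst_lookup() reports failure
 * through its return value rather than an error-encoded pointer, and it
 * resets *dst to NULL on error, so only a successful call needs a
 * matching dst_release():
 *
 *      struct dst_entry *dst;
 *      int err = ip6_dst_lookup(net, sk, &dst, &fl6);
 *
 *      if (err)
 *              return err;
 *      ...
 *      dst_release(dst);
 */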

/**
 *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 *      @net: network namespace to look the route up in
 *      @sk: socket which provides route info
 *      @fl6: flow to lookup
 *      @final_dst: final destination address for ipsec lookup
 *
 *      This function performs a route lookup on the given flow.
 *
 *      It returns a valid dst pointer on success, or a pointer encoded
 *      error code.
 */
struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
                                      const struct in6_addr *final_dst)
{
        struct dst_entry *dst = NULL;
        int err;

        err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
        if (err)
                return ERR_PTR(err);
        if (final_dst)
                fl6->daddr = *final_dst;

        return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
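
/* Unlike ip6_dst_lookup(), this variant hands back the dst directly and
 * encodes failures as an ERR_PTR, so callers test the result with
 * IS_ERR() (illustrative):
 *
 *      dst = ip6_dst_lookup_flow(net, sk, &fl6, NULL);
 *      if (IS_ERR(dst))
 *              return PTR_ERR(dst);
 */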

/**
 *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 *      @sk: socket which provides the dst cache and route info
 *      @fl6: flow to lookup
 *      @final_dst: final destination address for ipsec lookup
 *      @connected: whether @sk is connected or not
 *
 *      This function performs a route lookup on the given flow with the
 *      possibility of using the cached route in the socket if it is valid.
 *      It will take the socket dst lock when operating on the dst cache.
 *      As a result, this function can only be used in process context.
 *
 *      In addition, for a connected socket, cache the dst in the socket
 *      if the current cache is not valid.
 *
 *      It returns a valid dst pointer on success, or a pointer encoded
 *      error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
                                         const struct in6_addr *final_dst,
                                         bool connected)
{
        struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);

        dst = ip6_sk_dst_check(sk, dst, fl6);
        if (dst)
                return dst;

        dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
        if (connected && !IS_ERR(dst))
                ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);

        return dst;
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
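
/* Datagram sockets are the typical users here: a sendmsg() implementation
 * such as udpv6_sendmsg() passes connected == true so that a validated
 * route is cached on the socket and later sends can skip the full lookup.
 * A simplified sketch of such a caller:
 *
 *      dst = ip6_sk_dst_lookup_flow(sk, &fl6, final_p, connected);
 *      if (IS_ERR(dst)) {
 *              err = PTR_ERR(dst);
 *              dst = NULL;
 *              goto out;
 *      }
 */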

static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
                                               gfp_t gfp)
{
        return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
                                                gfp_t gfp)
{
        return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static void ip6_append_data_mtu(unsigned int *mtu,
                                int *maxfraglen,
                                unsigned int fragheaderlen,
                                struct sk_buff *skb,
                                struct rt6_info *rt,
                                unsigned int orig_mtu)
{
        if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
                if (!skb) {
                        /* first fragment, reserve header_len */
                        *mtu = orig_mtu - rt->dst.header_len;

                } else {
                        /*
                         * this fragment is not first, the headers
                         * space is regarded as data space.
                         */
                        *mtu = orig_mtu;
                }
                *maxfraglen = ((*mtu - fragheaderlen) & ~7)
                              + fragheaderlen - sizeof(struct frag_hdr);
        }
}

static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
                          struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
                          struct rt6_info *rt, struct flowi6 *fl6)
{
        struct ipv6_pinfo *np = inet6_sk(sk);
        unsigned int mtu;
        struct ipv6_txoptions *opt = ipc6->opt;

        /*
         * setup for corking
         */
        if (opt) {
                if (WARN_ON(v6_cork->opt))
                        return -EINVAL;

                v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
                if (unlikely(!v6_cork->opt))
                        return -ENOBUFS;

                v6_cork->opt->tot_len = sizeof(*opt);
                v6_cork->opt->opt_flen = opt->opt_flen;
                v6_cork->opt->opt_nflen = opt->opt_nflen;

                v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
                                                    sk->sk_allocation);
                if (opt->dst0opt && !v6_cork->opt->dst0opt)
                        return -ENOBUFS;

                v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
                                                    sk->sk_allocation);
                if (opt->dst1opt && !v6_cork->opt->dst1opt)
                        return -ENOBUFS;

                v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
                                                   sk->sk_allocation);
                if (opt->hopopt && !v6_cork->opt->hopopt)
                        return -ENOBUFS;

                v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
                                                    sk->sk_allocation);
                if (opt->srcrt && !v6_cork->opt->srcrt)
                        return -ENOBUFS;

                /* need source address above --miyazawa */
        }
        dst_hold(&rt->dst);
        cork->base.dst = &rt->dst;
        cork->fl.u.ip6 = *fl6;
        v6_cork->hop_limit = ipc6->hlimit;
        v6_cork->tclass = ipc6->tclass;
        if (rt->dst.flags & DST_XFRM_TUNNEL)
                mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
                      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
        else
                mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
                        READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
        if (np->frag_size < mtu) {
                if (np->frag_size)
                        mtu = np->frag_size;
        }
        cork->base.fragsize = mtu;
        cork->base.gso_size = ipc6->gso_size;
        cork->base.tx_flags = 0;
        sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);

        if (dst_allfrag(xfrm_dst_path(&rt->dst)))
                cork->base.flags |= IPCORK_ALLFRAG;
        cork->base.length = 0;

        cork->base.transmit_time = ipc6->sockc.transmit_time;

        return 0;
}

static int __ip6_append_data(struct sock *sk,
                             struct flowi6 *fl6,
                             struct sk_buff_head *queue,
                             struct inet_cork *cork,
                             struct inet6_cork *v6_cork,
                             struct page_frag *pfrag,
                             int getfrag(void *from, char *to, int offset,
                                         int len, int odd, struct sk_buff *skb),
                             void *from, int length, int transhdrlen,
                             unsigned int flags, struct ipcm6_cookie *ipc6)
{
        struct sk_buff *skb, *skb_prev = NULL;
        unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
        int exthdrlen = 0;
        int dst_exthdrlen = 0;
        int hh_len;
        int copy;
        int err;
        int offset = 0;
        u32 tskey = 0;
        struct rt6_info *rt = (struct rt6_info *)cork->dst;
        struct ipv6_txoptions *opt = v6_cork->opt;
        int csummode = CHECKSUM_NONE;
        unsigned int maxnonfragsize, headersize;
        unsigned int wmem_alloc_delta = 0;
        bool paged;

        skb = skb_peek_tail(queue);
        if (!skb) {
                exthdrlen = opt ? opt->opt_flen : 0;
                dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
        }

        paged = !!cork->gso_size;
        mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
        orig_mtu = mtu;

        if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
            sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
                tskey = sk->sk_tskey++;

        hh_len = LL_RESERVED_SPACE(rt->dst.dev);

        fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
                        (opt ? opt->opt_nflen : 0);

        headersize = sizeof(struct ipv6hdr) +
                     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
                     (dst_allfrag(&rt->dst) ?
                      sizeof(struct frag_hdr) : 0) +
                     rt->rt6i_nfheader_len;

        if (mtu <= fragheaderlen ||
            ((mtu - fragheaderlen) & ~7) + fragheaderlen <= sizeof(struct frag_hdr))
                goto emsgsize;

        maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
                     sizeof(struct frag_hdr);

        /* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
         * the first fragment
         */
        if (headersize + transhdrlen > mtu)
                goto emsgsize;

        if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
            (sk->sk_protocol == IPPROTO_UDP ||
             sk->sk_protocol == IPPROTO_RAW)) {
                ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
                                sizeof(struct ipv6hdr));
                goto emsgsize;
        }

        if (ip6_sk_ignore_df(sk))
                maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
        else
                maxnonfragsize = mtu;

        if (cork->length + length > maxnonfragsize - headersize) {
emsgsize:
                pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
                ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
                return -EMSGSIZE;
        }

        /* CHECKSUM_PARTIAL only with no extension headers and when
         * we are not going to fragment
         */
        if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
            headersize == sizeof(struct ipv6hdr) &&
            length <= mtu - headersize &&
            (!(flags & MSG_MORE) || cork->gso_size) &&
            rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
                csummode = CHECKSUM_PARTIAL;
        /*
         * Let's try using as much space as possible.
         * Use MTU if total length of the message fits into the MTU.
         * Otherwise, we need to reserve fragment header and
         * fragment alignment (= 8-15 octets, in total).
         *
         * Note that we may need to "move" the data from the tail
         * of the buffer to the new fragment when we split
         * the message.
         *
         * FIXME: It may be fragmented into multiple chunks
         *        at once if non-fragmentable extension headers
         *        are too large.
         * --yoshfuji
         */
1386
1387         cork->length += length;
1388         if (!skb)
1389                 goto alloc_new_skb;
1390
1391         while (length > 0) {
1392                 /* Check if the remaining data fits into current packet. */
1393                 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1394                 if (copy < length)
1395                         copy = maxfraglen - skb->len;
1396
1397                 if (copy <= 0) {
1398                         char *data;
1399                         unsigned int datalen;
1400                         unsigned int fraglen;
1401                         unsigned int fraggap;
1402                         unsigned int alloclen, alloc_extra;
1403                         unsigned int pagedlen;
1404 alloc_new_skb:
1405                         /* There's no room in the current skb */
1406                         if (skb)
1407                                 fraggap = skb->len - maxfraglen;
1408                         else
1409                                 fraggap = 0;
1410                         /* update mtu and maxfraglen if necessary */
1411                         if (!skb || !skb_prev)
1412                                 ip6_append_data_mtu(&mtu, &maxfraglen,
1413                                                     fragheaderlen, skb, rt,
1414                                                     orig_mtu);
1415
1416                         skb_prev = skb;
1417
1418                         /*
1419                          * If remaining data exceeds the mtu,
1420                          * we know we need more fragment(s).
1421                          */
1422                         datalen = length + fraggap;
1423
1424                         if (datalen > (cork->length <= mtu &&
                                            !(cork->flags & IPCORK_ALLFRAG) ?
                                            mtu : maxfraglen) - fragheaderlen)
1425                                 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1426                         fraglen = datalen + fragheaderlen;
1427                         pagedlen = 0;
1428
1429                         alloc_extra = hh_len;
1430                         alloc_extra += dst_exthdrlen;
1431                         alloc_extra += rt->dst.trailer_len;
1432
1433                         /* We just reserve space for the fragment header.
1434                          * Note: this may be an overallocation if the message
1435                          * (without MSG_MORE) fits into the MTU.
1436                          */
1437                         alloc_extra += sizeof(struct frag_hdr);
1438
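                             /* Pick the linear allocation size.  Roughly:
                              * fill to the MTU when more data will follow and
                              * the device cannot do scatter-gather; take the
                              * whole fragment linearly when it is small
                              * enough; otherwise keep the linear part down to
                              * MAX_HEADER and put the rest (pagedlen) into
                              * page fragments.
                              */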
1439                         if ((flags & MSG_MORE) &&
1440                             !(rt->dst.dev->features&NETIF_F_SG))
1441                                 alloclen = mtu;
1442                         else if (!paged &&
1443                                  (fraglen + alloc_extra < SKB_MAX_ALLOC ||
1444                                   !(rt->dst.dev->features & NETIF_F_SG)))
1445                                 alloclen = fraglen;
1446                         else {
1447                                 alloclen = min_t(int, fraglen, MAX_HEADER);
1448                                 pagedlen = fraglen - alloclen;
1449                         }
1450                         alloclen += alloc_extra;
1451
1452                         if (datalen != length + fraggap) {
1453                                 /*
1454                                  * this is not the last fragment; the trailer
1455                                  * space is regarded as data space.
1456                                  */
1457                                 datalen += rt->dst.trailer_len;
1458                         }
1459
1460                         fraglen = datalen + fragheaderlen;
1461
1462                         copy = datalen - transhdrlen - fraggap - pagedlen;
1463                         if (copy < 0) {
1464                                 err = -EINVAL;
1465                                 goto error;
1466                         }
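                             /* The first skb (the one carrying the transport
                              * header) is charged to the socket and may
                              * block; later fragments are allocated directly,
                              * with their memory tracked in wmem_alloc_delta
                              * and committed in one refcount_add() on return.
                              */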
1467                         if (transhdrlen) {
1468                                 skb = sock_alloc_send_skb(sk, alloclen,
1469                                                 (flags & MSG_DONTWAIT), &err);
1470                         } else {
1471                                 skb = NULL;
1472                                 if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
1473                                     2 * sk->sk_sndbuf)
1474                                         skb = alloc_skb(alloclen,
1475                                                         sk->sk_allocation);
1476                                 if (unlikely(!skb))
1477                                         err = -ENOBUFS;
1478                         }
1479                         if (!skb)
1480                                 goto error;
1481                         /*
1482                          *      Fill in the control structures
1483                          */
1484                         skb->protocol = htons(ETH_P_IPV6);
1485                         skb->ip_summed = csummode;
1486                         skb->csum = 0;
1487                         /* reserve room for the fragment and IPsec headers */
1488                         skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1489                                     dst_exthdrlen);
1490
1491                         /* Only the initial fragment is time-stamped */
1492                         skb_shinfo(skb)->tx_flags = cork->tx_flags;
1493                         cork->tx_flags = 0;
1494                         skb_shinfo(skb)->tskey = tskey;
1495                         tskey = 0;
1496
1497                         /*
1498                          *      Find where to start putting bytes
1499                          */
1500                         data = skb_put(skb, fraglen - pagedlen);
1501                         skb_set_network_header(skb, exthdrlen);
1502                         data += fragheaderlen;
1503                         skb->transport_header = (skb->network_header +
1504                                                  fragheaderlen);
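                             /* Pull the overflow (fraggap) out of the
                              * previous skb into this one, moving its
                              * checksum along and trimming the predecessor
                              * back to maxfraglen.
                              */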
1505                         if (fraggap) {
1506                                 skb->csum = skb_copy_and_csum_bits(
1507                                         skb_prev, maxfraglen,
1508                                         data + transhdrlen, fraggap, 0);
1509                                 skb_prev->csum = csum_sub(skb_prev->csum,
1510                                                           skb->csum);
1511                                 data += fraggap;
1512                                 pskb_trim_unique(skb_prev, maxfraglen);
1513                         }
1514                         if (copy > 0 &&
1515                             getfrag(from, data + transhdrlen, offset,
1516                                     copy, fraggap, skb) < 0) {
1517                                 err = -EFAULT;
1518                                 kfree_skb(skb);
1519                                 goto error;
1520                         }
1521
1522                         offset += copy;
1523                         length -= copy + transhdrlen;
1524                         transhdrlen = 0;
1525                         exthdrlen = 0;
1526                         dst_exthdrlen = 0;
1527
1528                         if ((flags & MSG_CONFIRM) && !skb_prev)
1529                                 skb_set_dst_pending_confirm(skb, 1);
1530
1531                         /*
1532                          * Put the packet on the pending queue
1533                          */
1534                         if (!skb->destructor) {
1535                                 skb->destructor = sock_wfree;
1536                                 skb->sk = sk;
1537                                 wmem_alloc_delta += skb->truesize;
1538                         }
1539                         __skb_queue_tail(queue, skb);
1540                         continue;
1541                 }
1542
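                     /* There is room in the current skb: copy into its
                      * tailroom when the device cannot do scatter-gather,
                      * otherwise append to (or coalesce with) page
                      * fragments below.
                      */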
1543                 if (copy > length)
1544                         copy = length;
1545
1546                 if (!(rt->dst.dev->features&NETIF_F_SG) &&
1547                     skb_tailroom(skb) >= copy) {
1548                         unsigned int off;
1549
1550                         off = skb->len;
1551                         if (getfrag(from, skb_put(skb, copy),
1552                                                 offset, copy, off, skb) < 0) {
1553                                 __skb_trim(skb, off);
1554                                 err = -EFAULT;
1555                                 goto error;
1556                         }
1557                 } else {
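                             /* Scatter-gather path: refill the per-socket
                              * page fragment and either extend the last frag
                              * (when contiguous) or start a new one, bounded
                              * by MAX_SKB_FRAGS.
                              */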
1558                         int i = skb_shinfo(skb)->nr_frags;
1559
1560                         err = -ENOMEM;
1561                         if (!sk_page_frag_refill(sk, pfrag))
1562                                 goto error;
1563
1564                         if (!skb_can_coalesce(skb, i, pfrag->page,
1565                                               pfrag->offset)) {
1566                                 err = -EMSGSIZE;
1567                                 if (i == MAX_SKB_FRAGS)
1568                                         goto error;
1569
1570                                 __skb_fill_page_desc(skb, i, pfrag->page,
1571                                                      pfrag->offset, 0);
1572                                 skb_shinfo(skb)->nr_frags = ++i;
1573                                 get_page(pfrag->page);
1574                         }
1575                         copy = min_t(int, copy, pfrag->size - pfrag->offset);
1576                         if (getfrag(from,
1577                                     page_address(pfrag->page) + pfrag->offset,
1578                                     offset, copy, skb->len, skb) < 0)
1579                                 goto error_efault;
1580
1581                         pfrag->offset += copy;
1582                         skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1583                         skb->len += copy;
1584                         skb->data_len += copy;
1585                         skb->truesize += copy;
1586                         wmem_alloc_delta += copy;
1587                 }
1588                 offset += copy;
1589                 length -= copy;
1590         }
1591
1592         if (wmem_alloc_delta)
1593                 refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1594         return 0;
1595
1596 error_efault:
1597         err = -EFAULT;
1598 error:
1599         cork->length -= length;
1600         IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1601         refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1602         return err;
1603 }
1604
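     /*
      * ip6_append_data() queues data on the socket write queue without
      * building the IPv6 header yet.  The first call on an empty queue sets
      * up the cork (options, route, flow) and includes the transport header;
      * later calls only append payload.  MSG_PROBE queues nothing.  The
      * packet is finalised by ip6_push_pending_frames() or discarded by
      * ip6_flush_pending_frames().
      */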
1605 int ip6_append_data(struct sock *sk,
1606                     int getfrag(void *from, char *to, int offset, int len,
1607                                 int odd, struct sk_buff *skb),
1608                     void *from, int length, int transhdrlen,
1609                     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1610                     struct rt6_info *rt, unsigned int flags)
1611 {
1612         struct inet_sock *inet = inet_sk(sk);
1613         struct ipv6_pinfo *np = inet6_sk(sk);
1614         int exthdrlen;
1615         int err;
1616
1617         if (flags&MSG_PROBE)
1618                 return 0;
1619         if (skb_queue_empty(&sk->sk_write_queue)) {
1620                 /*
1621                  * setup for corking
1622                  */
1623                 err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1624                                      ipc6, rt, fl6);
1625                 if (err)
1626                         return err;
1627
1628                 exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1629                 length += exthdrlen;
1630                 transhdrlen += exthdrlen;
1631         } else {
1632                 fl6 = &inet->cork.fl.u.ip6;
1633                 transhdrlen = 0;
1634         }
1635
1636         return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1637                                  &np->cork, sk_page_frag(sk), getfrag,
1638                                  from, length, transhdrlen, flags, ipc6);
1639 }
1640 EXPORT_SYMBOL_GPL(ip6_append_data);
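     /* Typical datagram-style usage (a sketch only, with locking and error
      * handling trimmed; see udpv6_sendmsg() for the real sequence):
      *
      *     lock_sock(sk);
      *     err = ip6_append_data(sk, getfrag, msg, len,
      *                           sizeof(struct udphdr), &ipc6, &fl6, rt,
      *                           msg->msg_flags);
      *     if (err)
      *             ip6_flush_pending_frames(sk);
      *     else if (!corkreq)
      *             err = ip6_push_pending_frames(sk);
      *     release_sock(sk);
      */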
1641
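     /* Free the cork's copy of the tx options and drop its cached route. */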
1642 static void ip6_cork_release(struct inet_cork_full *cork,
1643                              struct inet6_cork *v6_cork)
1644 {
1645         if (v6_cork->opt) {
1646                 kfree(v6_cork->opt->dst0opt);
1647                 kfree(v6_cork->opt->dst1opt);
1648                 kfree(v6_cork->opt->hopopt);
1649                 kfree(v6_cork->opt->srcrt);
1650                 kfree(v6_cork->opt);
1651                 v6_cork->opt = NULL;
1652         }
1653
1654         if (cork->base.dst) {
1655                 dst_release(cork->base.dst);
1656                 cork->base.dst = NULL;
1657                 cork->base.flags &= ~IPCORK_ALLFRAG;
1658         }
1659         memset(&cork->fl, 0, sizeof(cork->fl));
1660 }
1661
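     /*
      * __ip6_make_skb() collapses everything on the queue into the first
      * skb (chaining the rest via frag_list), pushes the extension headers
      * and the IPv6 header, and releases the cork.  The result is ready
      * for ip6_send_skb().
      */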
1662 struct sk_buff *__ip6_make_skb(struct sock *sk,
1663                                struct sk_buff_head *queue,
1664                                struct inet_cork_full *cork,
1665                                struct inet6_cork *v6_cork)
1666 {
1667         struct sk_buff *skb, *tmp_skb;
1668         struct sk_buff **tail_skb;
1669         struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1670         struct ipv6_pinfo *np = inet6_sk(sk);
1671         struct net *net = sock_net(sk);
1672         struct ipv6hdr *hdr;
1673         struct ipv6_txoptions *opt = v6_cork->opt;
1674         struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1675         struct flowi6 *fl6 = &cork->fl.u.ip6;
1676         unsigned char proto = fl6->flowi6_proto;
1677
1678         skb = __skb_dequeue(queue);
1679         if (!skb)
1680                 goto out;
1681         tail_skb = &(skb_shinfo(skb)->frag_list);
1682
1683         /* move skb->data from the extension header to the IP header */
1684         if (skb->data < skb_network_header(skb))
1685                 __skb_pull(skb, skb_network_offset(skb));
1686         while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1687                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1688                 *tail_skb = tmp_skb;
1689                 tail_skb = &(tmp_skb->next);
1690                 skb->len += tmp_skb->len;
1691                 skb->data_len += tmp_skb->len;
1692                 skb->truesize += tmp_skb->truesize;
1693                 tmp_skb->destructor = NULL;
1694                 tmp_skb->sk = NULL;
1695         }
1696
1697         /* Allow local fragmentation. */
1698         skb->ignore_df = ip6_sk_ignore_df(sk);
1699
1700         *final_dst = fl6->daddr;
1701         __skb_pull(skb, skb_network_header_len(skb));
1702         if (opt && opt->opt_flen)
1703                 ipv6_push_frag_opts(skb, opt, &proto);
1704         if (opt && opt->opt_nflen)
1705                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1706
1707         skb_push(skb, sizeof(struct ipv6hdr));
1708         skb_reset_network_header(skb);
1709         hdr = ipv6_hdr(skb);
1710
1711         ip6_flow_hdr(hdr, v6_cork->tclass,
1712                      ip6_make_flowlabel(net, skb, fl6->flowlabel,
1713                                         ip6_autoflowlabel(net, np), fl6));
1714         hdr->hop_limit = v6_cork->hop_limit;
1715         hdr->nexthdr = proto;
1716         hdr->saddr = fl6->saddr;
1717         hdr->daddr = *final_dst;
1718
1719         skb->priority = sk->sk_priority;
1720         skb->mark = sk->sk_mark;
1721
1722         skb->tstamp = cork->base.transmit_time;
1723
1724         skb_dst_set(skb, dst_clone(&rt->dst));
1725         IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1726         if (proto == IPPROTO_ICMPV6) {
1727                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1728
1729                 ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1730                 ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1731         }
1732
1733         ip6_cork_release(cork, v6_cork);
1734 out:
1735         return skb;
1736 }
1737
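     /*
      * ip6_send_skb() hands a finished packet to ip6_local_out(); positive
      * NET_XMIT_* codes are folded into errnos via net_xmit_errno(), and
      * real failures are counted as output discards.
      */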
1738 int ip6_send_skb(struct sk_buff *skb)
1739 {
1740         struct net *net = sock_net(skb->sk);
1741         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1742         int err;
1743
1744         err = ip6_local_out(net, skb->sk, skb);
1745         if (err) {
1746                 if (err > 0)
1747                         err = net_xmit_errno(err);
1748                 if (err)
1749                         IP6_INC_STATS(net, rt->rt6i_idev,
1750                                       IPSTATS_MIB_OUTDISCARDS);
1751         }
1752
1753         return err;
1754 }
1755
1756 int ip6_push_pending_frames(struct sock *sk)
1757 {
1758         struct sk_buff *skb;
1759
1760         skb = ip6_finish_skb(sk);
1761         if (!skb)
1762                 return 0;
1763
1764         return ip6_send_skb(skb);
1765 }
1766 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1767
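     /* Drop every queued skb, counting those that already have a route as
      * output discards, then release the cork.
      */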
1768 static void __ip6_flush_pending_frames(struct sock *sk,
1769                                        struct sk_buff_head *queue,
1770                                        struct inet_cork_full *cork,
1771                                        struct inet6_cork *v6_cork)
1772 {
1773         struct sk_buff *skb;
1774
1775         while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1776                 if (skb_dst(skb))
1777                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1778                                       IPSTATS_MIB_OUTDISCARDS);
1779                 kfree_skb(skb);
1780         }
1781
1782         ip6_cork_release(cork, v6_cork);
1783 }
1784
1785 void ip6_flush_pending_frames(struct sock *sk)
1786 {
1787         __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1788                                    &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1789 }
1790 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1791
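     /*
      * ip6_make_skb() is the uncorked counterpart of ip6_append_data(): it
      * builds the whole datagram in one call on a private queue and the
      * caller-supplied cork, returning the finished skb (or an ERR_PTR)
      * without touching sk->sk_write_queue.
      */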
1792 struct sk_buff *ip6_make_skb(struct sock *sk,
1793                              int getfrag(void *from, char *to, int offset,
1794                                          int len, int odd, struct sk_buff *skb),
1795                              void *from, int length, int transhdrlen,
1796                              struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1797                              struct rt6_info *rt, unsigned int flags,
1798                              struct inet_cork_full *cork)
1799 {
1800         struct inet6_cork v6_cork;
1801         struct sk_buff_head queue;
1802         int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1803         int err;
1804
1805         if (flags & MSG_PROBE)
1806                 return NULL;
1807
1808         __skb_queue_head_init(&queue);
1809
1810         cork->base.flags = 0;
1811         cork->base.addr = 0;
1812         cork->base.opt = NULL;
1813         cork->base.dst = NULL;
1814         v6_cork.opt = NULL;
1815         err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6);
1816         if (err) {
1817                 ip6_cork_release(cork, &v6_cork);
1818                 return ERR_PTR(err);
1819         }
1820         if (ipc6->dontfrag < 0)
1821                 ipc6->dontfrag = inet6_sk(sk)->dontfrag;
1822
1823         err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork,
1824                                 &current->task_frag, getfrag, from,
1825                                 length + exthdrlen, transhdrlen + exthdrlen,
1826                                 flags, ipc6);
1827         if (err) {
1828                 __ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
1829                 return ERR_PTR(err);
1830         }
1831
1832         return __ip6_make_skb(sk, &queue, cork, &v6_cork);
1833 }