GNU Linux-libre 4.19.304-gnu1
[releases.git] / net / ipv6 / ip6_output.c
1 /*
2  *      IPv6 output functions
3  *      Linux INET6 implementation
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      Based on linux/net/ipv4/ip_output.c
9  *
10  *      This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  *
15  *      Changes:
16  *      A.N.Kuznetsov   :       arithmetics in fragmentation.
17  *                              extension headers are implemented.
18  *                              route changes now work.
19  *                              ip6_forward does not confuse sniffers.
20  *                              etc.
21  *
22  *      H. von Brand    :       Added missing #include <linux/string.h>
23  *      Imran Patel     :       frag id should be in NBO
24  *      Kazunori MIYAZAWA @USAGI
25  *                      :       add ip6_append_data and related functions
26  *                              for datagram xmit
27  */
28
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
41
42 #include <linux/bpf-cgroup.h>
43 #include <linux/netfilter.h>
44 #include <linux/netfilter_ipv6.h>
45
46 #include <net/sock.h>
47 #include <net/snmp.h>
48
49 #include <net/ipv6.h>
50 #include <net/ndisc.h>
51 #include <net/protocol.h>
52 #include <net/ip6_route.h>
53 #include <net/addrconf.h>
54 #include <net/rawv6.h>
55 #include <net/icmp.h>
56 #include <net/xfrm.h>
57 #include <net/checksum.h>
58 #include <linux/mroute6.h>
59 #include <net/l3mdev.h>
60 #include <net/lwtunnel.h>
61
/* Final transmit step after netfilter POST_ROUTING: resolve the route's
 * nexthop neighbour and hand the skb to neigh_output() on the egress
 * device.
 *
 * Multicast handling first: when the destination is multicast and the
 * sending socket asked for loopback (sk_mc_loop()), a clone is looped
 * back through dev_loopback_xmit() — either because a multicast-router
 * socket exists and the packet was not already forwarded, or because a
 * local listener has joined the group.  Packets with hop_limit 0 in that
 * case, and node-local-scoped packets on non-loopback devices, are
 * discarded here instead of being transmitted.
 *
 * Returns the transmit status, 0 when the skb was consumed locally, or
 * -EINVAL when no neighbour entry could be created.
 */
static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct neighbour *neigh;
	struct in6_addr *nexthop;
	int ret;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
		    ((mroute6_is_socket(net, skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					net, sk, newskb, NULL, newskb->dev,
					dev_loopback_xmit);

			/* hop_limit 0: loop back only, never onto the wire */
			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(net, idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);

		/* Node-local scope must never leave the host. */
		if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
		    IPV6_ADDR_SCOPE_NODELOCAL &&
		    !(dev->flags & IFF_LOOPBACK)) {
			kfree_skb(skb);
			return 0;
		}
	}

	/* A lightweight tunnel may take over transmission entirely. */
	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
		int res = lwtunnel_xmit(skb);

		if (res != LWTUNNEL_XMIT_CONTINUE)
			return res;
	}

	/* Neighbour lookup is lockless under rcu_read_lock_bh(). */
	rcu_read_lock_bh();
	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
	if (unlikely(!neigh))
		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
	if (!IS_ERR(neigh)) {
		sock_confirm_neigh(skb, neigh);
		ret = neigh_output(neigh, skb);
		rcu_read_unlock_bh();
		return ret;
	}
	rcu_read_unlock_bh();

	IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}
130
/* Software-segment a GSO skb whose segments exceed the egress MTU, then
 * push each resulting segment through ip6_fragment() (if still too big)
 * or directly to ip6_finish_output2().  The original @skb is consumed in
 * all cases.  Returns 0 on success or the first error seen while
 * emitting segments (remaining segments are still attempted).
 */
static int
ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
				    struct sk_buff *skb, unsigned int mtu)
{
	struct sk_buff *segs, *nskb;
	netdev_features_t features;
	int ret = 0;

	/* Please see corresponding comment in ip_finish_output_gso
	 * describing the cases where GSO segment length exceeds the
	 * egress MTU.
	 */
	features = netif_skb_features(skb);
	/* Mask out GSO features so skb_gso_segment() really segments. */
	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
	if (IS_ERR_OR_NULL(segs)) {
		kfree_skb(skb);
		return -ENOMEM;
	}

	consume_skb(skb);

	skb_list_walk_safe(segs, segs, nskb) {
		int err;

		skb_mark_not_on_list(segs);
		/* Last GSO segment can be smaller than gso_size (and MTU).
		 * Adding a fragment header would produce an "atomic fragment",
		 * which is considered harmful (RFC-8021). Avoid that.
		 */
		err = segs->len > mtu ?
			ip6_fragment(net, sk, segs, ip6_finish_output2) :
			ip6_finish_output2(net, sk, segs);
		/* Remember only the first failure. */
		if (err && ret == 0)
			ret = err;
	}

	return ret;
}
169
/* okfn for the POST_ROUTING hook: run the cgroup egress BPF program,
 * re-enter dst_output() once if SNAT attached a new xfrm policy to the
 * dst (marked with IP6SKB_REROUTED so ip6_output() skips the hook), then
 * segment, fragment or transmit depending on size versus MTU.
 */
static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	unsigned int mtu;
	int ret;

	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
	if (ret) {
		kfree_skb(skb);
		return ret;
	}

#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
	if (skb_dst(skb)->xfrm) {
		IP6CB(skb)->flags |= IP6SKB_REROUTED;
		return dst_output(net, sk, skb);
	}
#endif

	mtu = ip6_skb_dst_mtu(skb);
	/* GSO packets whose segments would not fit the MTU must be
	 * segmented in software before fragmentation.
	 */
	if (skb_is_gso(skb) && !skb_gso_validate_network_len(skb, mtu))
		return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);

	/* Fragment when over the MTU, when the route demands fragmentation
	 * (dst_allfrag), or when conntrack defrag recorded a smaller
	 * maximum incoming fragment size we must not exceed.
	 */
	if ((skb->len > mtu && !skb_is_gso(skb)) ||
	    dst_allfrag(skb_dst(skb)) ||
	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
		return ip6_fragment(net, sk, skb, ip6_finish_output2);
	else
		return ip6_finish_output2(net, sk, skb);
}
200
/* dst_output() entry point for IPv6.  Tags the skb with the IPv6
 * ethertype and egress device, drops it when IPv6 is administratively
 * disabled on the interface, then traverses NF_INET_POST_ROUTING
 * (skipped when the packet already went through the hook and was
 * rerouted — see IP6SKB_REROUTED in ip6_finish_output()).
 */
int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;
	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
		return 0;
	}

	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
			    net, sk, skb, NULL, dev,
			    ip6_finish_output,
			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}
220
221 bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
222 {
223         if (!np->autoflowlabel_set)
224                 return ip6_default_np_autolabel(net);
225         else
226                 return np->autoflowlabel;
227 }
228
/*
 * xmit an sk_buff (used by TCP, SCTP and DCCP)
 * Note : socket lock is not held for SYNACK packets, but might be modified
 * by calls to skb_set_owner_w() and ipv6_local_error(),
 * which are using proper atomic operations or spinlocks.
 *
 * Prepends the IPv6 header (and any extension headers from @opt) to the
 * transport payload already in @skb, then sends the packet through
 * NF_INET_LOCAL_OUT towards dst_output().  @fl6 supplies the addresses,
 * flow label and protocol; @tclass the traffic class; @mark the skb mark.
 * Returns 0 on success, -ENOBUFS if headroom could not be grown, or
 * -EMSGSIZE if the packet exceeds the path MTU and must not be sent.
 */
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     __u32 mark, struct ipv6_txoptions *opt, int tclass)
{
	struct net *net = sock_net(sk);
	const struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	unsigned int head_room;
	struct ipv6hdr *hdr;
	u8  proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int hlimit = -1;
	u32 mtu;

	/* Room needed in front of the payload: IPv6 header, link-layer
	 * header, and any extension headers carried in @opt.
	 */
	head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
	if (opt)
		head_room += opt->opt_nflen + opt->opt_flen;

	if (unlikely(skb_headroom(skb) < head_room)) {
		struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
		if (!skb2) {
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
			kfree_skb(skb);
			return -ENOBUFS;
		}
		/* Keep socket memory accounting attached to the copy. */
		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);
		consume_skb(skb);
		skb = skb2;
	}

	if (opt) {
		seg_len += opt->opt_nflen + opt->opt_flen;

		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);

		/* May rewrite first_hop when a routing header is present. */
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
					     &fl6->saddr);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
				ip6_autoflowlabel(net, np), fl6));

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	hdr->saddr = fl6->saddr;
	hdr->daddr = *first_hop;

	skb->protocol = htons(ETH_P_IPV6);
	skb->priority = sk->sk_priority;
	skb->mark = mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_OUT, skb->len);

		/* if egress device is enslaved to an L3 master device pass the
		 * skb to its handler for processing
		 */
		skb = l3mdev_ip6_out((struct sock *)sk, skb);
		if (unlikely(!skb))
			return 0;

		/* hooks should never assume socket lock is held.
		 * we promote our socket to non const
		 */
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
			       net, (struct sock *)sk, skb, NULL, dst->dev,
			       dst_output);
	}

	/* Too big and DF semantics apply: report EMSGSIZE to the socket. */
	skb->dev = dst->dev;
	/* ipv6_local_error() does not require socket lock,
	 * we promote our socket to non const
	 */
	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);

	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);
335
/* Deliver a Router Alert packet to every raw socket registered on the
 * global ip6_ra_chain whose RA selector matches @sel and whose device
 * binding (if any) matches the ingress interface.  Every match except
 * the last receives a clone; the last consumes @skb itself, saving one
 * clone.  Returns 1 when the skb was delivered (and thus consumed by
 * this function), 0 when no socket matched.
 */
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			/* Previous match gets a clone; keep the original
			 * for the last matching socket.
			 */
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}
364
/* Classify a to-be-forwarded packet whose destination is an NDP-proxied
 * address.  Returns 1 when it is a unicast neighbour-discovery ICMPv6
 * message that must be passed to local input on behalf of the proxied
 * host, -1 when it targets a link-local destination and must be
 * discarded (after signalling link failure), and 0 when normal
 * forwarding may proceed.
 */
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	__be16 frag_off;
	int offset;

	/* Walk past any extension headers to the transport protocol. */
	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		/* Ensure at least the ICMPv6 type octet is in linear data. */
		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* For reaction involving unicast neighbor discovery
			 * message destined to the proxied address, pass it to
			 * input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}
416
417 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
418                                      struct sk_buff *skb)
419 {
420         struct dst_entry *dst = skb_dst(skb);
421
422         __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
423         __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
424
425         skb->tstamp = 0;
426         return dst_output(net, sk, skb);
427 }
428
429 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
430 {
431         if (skb->len <= mtu)
432                 return false;
433
434         /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
435         if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
436                 return true;
437
438         if (skb->ignore_df)
439                 return false;
440
441         if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
442                 return false;
443
444         return true;
445 }
446
/* Forward an IPv6 packet received on another interface.
 *
 * Performs all router-side checks in order: forwarding enabled,
 * packet really addressed to us at L2 (PACKET_HOST), not locally
 * owned, not LRO-merged, XFRM forward policy, Router Alert delivery,
 * hop limit, NDP proxying, XFRM routing, ICMP redirect generation,
 * source-address sanity, path MTU, and finally hop-limit decrement
 * before traversing NF_INET_FORWARD towards ip6_forward_finish().
 *
 * Consumes @skb on every path.  Returns 0 on success/local delivery,
 * -ETIMEDOUT for hop-limit expiry, -EMSGSIZE for too-big packets, and
 * -EINVAL for everything that is silently dropped.
 */
int ip6_forward(struct sk_buff *skb)
{
	struct inet6_dev *idev = __in6_dev_get_safely(skb->dev);
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	u32 mtu;

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	/* Locally owned skbs must never be forwarded. */
	if (unlikely(skb->sk))
		goto drop;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!net->ipv6.devconf_all->disable_policy &&
	    (!idev || !idev->cnf.disable_policy) &&
	    !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We DO NOT make any processing on
	 *	RA packets, pushing them to user level AS IS
	 *	without any WARRANTY that application will be able
	 *	to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not end-node, so that if packet contains
	 *	AH/ESP, we cannot make anything.
	 *	Defragmentation also would be mistake, RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	/* xfrm6_route_forward() may have replaced the dst. */
	dst = skb_dst(skb);

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	   We don't send redirects to frames decapsulated from IPsec.
	 */
	if (IP6CB(skb)->iif == dst->dev->ifindex &&
	    opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct inet_peer *peer;
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if (rt->rt6i_flags & RTF_GATEWAY)
			target = &rt->rt6i_gateway;
		else
			target = &hdr->daddr;

		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(peer, 1*HZ))
			ndisc_send_redirect(skb, target);
		if (peer)
			inet_putpeer(peer);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	mtu = ip6_dst_mtu_forward(dst);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (ip6_pkt_too_big(skb, mtu)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	/* Make the header writable before mangling hop_limit. */
	if (skb_cow(skb, dst->dev->hard_header_len)) {
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	/* skb_cow() may have reallocated; refetch the header pointer. */
	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
		       net, NULL, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}
607
/* Copy delivery metadata from the original packet @from to a freshly
 * built fragment @to so the fragment is routed, scheduled, classified
 * and accounted identically: packet type, priority, protocol, route
 * (a new reference to @from's dst), device, mark, flow hash, traffic
 * control index, netfilter state and security mark.
 */
static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	/* Drop any stale route before taking a reference to @from's. */
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

	skb_copy_hash(to, from);

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
	skb_copy_secmark(to, from);
}
626
627 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
628                  int (*output)(struct net *, struct sock *, struct sk_buff *))
629 {
630         struct sk_buff *frag;
631         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
632         struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
633                                 inet6_sk(skb->sk) : NULL;
634         struct ipv6hdr *tmp_hdr;
635         struct frag_hdr *fh;
636         unsigned int mtu, hlen, left, len, nexthdr_offset;
637         int hroom, troom;
638         __be32 frag_id;
639         int ptr, offset = 0, err = 0;
640         u8 *prevhdr, nexthdr = 0;
641
642         err = ip6_find_1stfragopt(skb, &prevhdr);
643         if (err < 0)
644                 goto fail;
645         hlen = err;
646         nexthdr = *prevhdr;
647         nexthdr_offset = prevhdr - skb_network_header(skb);
648
649         mtu = ip6_skb_dst_mtu(skb);
650
651         /* We must not fragment if the socket is set to force MTU discovery
652          * or if the skb it not generated by a local socket.
653          */
654         if (unlikely(!skb->ignore_df && skb->len > mtu))
655                 goto fail_toobig;
656
657         if (IP6CB(skb)->frag_max_size) {
658                 if (IP6CB(skb)->frag_max_size > mtu)
659                         goto fail_toobig;
660
661                 /* don't send fragments larger than what we received */
662                 mtu = IP6CB(skb)->frag_max_size;
663                 if (mtu < IPV6_MIN_MTU)
664                         mtu = IPV6_MIN_MTU;
665         }
666
667         if (np && np->frag_size < mtu) {
668                 if (np->frag_size)
669                         mtu = np->frag_size;
670         }
671         if (mtu < hlen + sizeof(struct frag_hdr) + 8)
672                 goto fail_toobig;
673         mtu -= hlen + sizeof(struct frag_hdr);
674
675         frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
676                                     &ipv6_hdr(skb)->saddr);
677
678         if (skb->ip_summed == CHECKSUM_PARTIAL &&
679             (err = skb_checksum_help(skb)))
680                 goto fail;
681
682         prevhdr = skb_network_header(skb) + nexthdr_offset;
683         hroom = LL_RESERVED_SPACE(rt->dst.dev);
684         if (skb_has_frag_list(skb)) {
685                 unsigned int first_len = skb_pagelen(skb);
686                 struct sk_buff *frag2;
687
688                 if (first_len - hlen > mtu ||
689                     ((first_len - hlen) & 7) ||
690                     skb_cloned(skb) ||
691                     skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
692                         goto slow_path;
693
694                 skb_walk_frags(skb, frag) {
695                         /* Correct geometry. */
696                         if (frag->len > mtu ||
697                             ((frag->len & 7) && frag->next) ||
698                             skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
699                                 goto slow_path_clean;
700
701                         /* Partially cloned skb? */
702                         if (skb_shared(frag))
703                                 goto slow_path_clean;
704
705                         BUG_ON(frag->sk);
706                         if (skb->sk) {
707                                 frag->sk = skb->sk;
708                                 frag->destructor = sock_wfree;
709                         }
710                         skb->truesize -= frag->truesize;
711                 }
712
713                 err = 0;
714                 offset = 0;
715                 /* BUILD HEADER */
716
717                 *prevhdr = NEXTHDR_FRAGMENT;
718                 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
719                 if (!tmp_hdr) {
720                         err = -ENOMEM;
721                         goto fail;
722                 }
723                 frag = skb_shinfo(skb)->frag_list;
724                 skb_frag_list_init(skb);
725
726                 __skb_pull(skb, hlen);
727                 fh = __skb_push(skb, sizeof(struct frag_hdr));
728                 __skb_push(skb, hlen);
729                 skb_reset_network_header(skb);
730                 memcpy(skb_network_header(skb), tmp_hdr, hlen);
731
732                 fh->nexthdr = nexthdr;
733                 fh->reserved = 0;
734                 fh->frag_off = htons(IP6_MF);
735                 fh->identification = frag_id;
736
737                 first_len = skb_pagelen(skb);
738                 skb->data_len = first_len - skb_headlen(skb);
739                 skb->len = first_len;
740                 ipv6_hdr(skb)->payload_len = htons(first_len -
741                                                    sizeof(struct ipv6hdr));
742
743                 /* We prevent @rt from being freed. */
744                 rcu_read_lock();
745
746                 for (;;) {
747                         /* Prepare header of the next frame,
748                          * before previous one went down. */
749                         if (frag) {
750                                 frag->ip_summed = CHECKSUM_NONE;
751                                 skb_reset_transport_header(frag);
752                                 fh = __skb_push(frag, sizeof(struct frag_hdr));
753                                 __skb_push(frag, hlen);
754                                 skb_reset_network_header(frag);
755                                 memcpy(skb_network_header(frag), tmp_hdr,
756                                        hlen);
757                                 offset += skb->len - hlen - sizeof(struct frag_hdr);
758                                 fh->nexthdr = nexthdr;
759                                 fh->reserved = 0;
760                                 fh->frag_off = htons(offset);
761                                 if (frag->next)
762                                         fh->frag_off |= htons(IP6_MF);
763                                 fh->identification = frag_id;
764                                 ipv6_hdr(frag)->payload_len =
765                                                 htons(frag->len -
766                                                       sizeof(struct ipv6hdr));
767                                 ip6_copy_metadata(frag, skb);
768                         }
769
770                         err = output(net, sk, skb);
771                         if (!err)
772                                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
773                                               IPSTATS_MIB_FRAGCREATES);
774
775                         if (err || !frag)
776                                 break;
777
778                         skb = frag;
779                         frag = skb->next;
780                         skb->next = NULL;
781                 }
782
783                 kfree(tmp_hdr);
784
785                 if (err == 0) {
786                         IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
787                                       IPSTATS_MIB_FRAGOKS);
788                         rcu_read_unlock();
789                         return 0;
790                 }
791
792                 kfree_skb_list(frag);
793
794                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
795                               IPSTATS_MIB_FRAGFAILS);
796                 rcu_read_unlock();
797                 return err;
798
799 slow_path_clean:
800                 skb_walk_frags(skb, frag2) {
801                         if (frag2 == frag)
802                                 break;
803                         frag2->sk = NULL;
804                         frag2->destructor = NULL;
805                         skb->truesize += frag2->truesize;
806                 }
807         }
808
809 slow_path:
810         left = skb->len - hlen;         /* Space per frame */
811         ptr = hlen;                     /* Where to start from */
812
813         /*
814          *      Fragment the datagram.
815          */
816
817         troom = rt->dst.dev->needed_tailroom;
818
819         /*
820          *      Keep copying data until we run out.
821          */
822         while (left > 0)        {
823                 u8 *fragnexthdr_offset;
824
825                 len = left;
826                 /* IF: it doesn't fit, use 'mtu' - the data space left */
827                 if (len > mtu)
828                         len = mtu;
829                 /* IF: we are not sending up to and including the packet end
830                    then align the next start on an eight byte boundary */
831                 if (len < left) {
832                         len &= ~7;
833                 }
834
835                 /* Allocate buffer */
836                 frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
837                                  hroom + troom, GFP_ATOMIC);
838                 if (!frag) {
839                         err = -ENOMEM;
840                         goto fail;
841                 }
842
843                 /*
844                  *      Set up data on packet
845                  */
846
847                 ip6_copy_metadata(frag, skb);
848                 skb_reserve(frag, hroom);
849                 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
850                 skb_reset_network_header(frag);
851                 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
852                 frag->transport_header = (frag->network_header + hlen +
853                                           sizeof(struct frag_hdr));
854
855                 /*
856                  *      Charge the memory for the fragment to any owner
857                  *      it might possess
858                  */
859                 if (skb->sk)
860                         skb_set_owner_w(frag, skb->sk);
861
862                 /*
863                  *      Copy the packet header into the new buffer.
864                  */
865                 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
866
867                 fragnexthdr_offset = skb_network_header(frag);
868                 fragnexthdr_offset += prevhdr - skb_network_header(skb);
869                 *fragnexthdr_offset = NEXTHDR_FRAGMENT;
870
871                 /*
872                  *      Build fragment header.
873                  */
874                 fh->nexthdr = nexthdr;
875                 fh->reserved = 0;
876                 fh->identification = frag_id;
877
878                 /*
879                  *      Copy a block of the IP datagram.
880                  */
881                 BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
882                                      len));
883                 left -= len;
884
885                 fh->frag_off = htons(offset);
886                 if (left > 0)
887                         fh->frag_off |= htons(IP6_MF);
888                 ipv6_hdr(frag)->payload_len = htons(frag->len -
889                                                     sizeof(struct ipv6hdr));
890
891                 ptr += len;
892                 offset += len;
893
894                 /*
895                  *      Put this fragment into the sending queue.
896                  */
897                 err = output(net, sk, frag);
898                 if (err)
899                         goto fail;
900
901                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
902                               IPSTATS_MIB_FRAGCREATES);
903         }
904         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
905                       IPSTATS_MIB_FRAGOKS);
906         consume_skb(skb);
907         return err;
908
909 fail_toobig:
910         if (skb->sk && dst_allfrag(skb_dst(skb)))
911                 sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
912
913         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
914         err = -EMSGSIZE;
915
916 fail:
917         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
918                       IPSTATS_MIB_FRAGFAILS);
919         kfree_skb(skb);
920         return err;
921 }
922
923 static inline int ip6_rt_check(const struct rt6key *rt_key,
924                                const struct in6_addr *fl_addr,
925                                const struct in6_addr *addr_cache)
926 {
927         return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
928                 (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
929 }
930
931 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
932                                           struct dst_entry *dst,
933                                           const struct flowi6 *fl6)
934 {
935         struct ipv6_pinfo *np = inet6_sk(sk);
936         struct rt6_info *rt;
937
938         if (!dst)
939                 goto out;
940
941         if (dst->ops->family != AF_INET6) {
942                 dst_release(dst);
943                 return NULL;
944         }
945
946         rt = (struct rt6_info *)dst;
947         /* Yes, checking route validity in not connected
948          * case is not very simple. Take into account,
949          * that we do not support routing by source, TOS,
950          * and MSG_DONTROUTE            --ANK (980726)
951          *
952          * 1. ip6_rt_check(): If route was host route,
953          *    check that cached destination is current.
954          *    If it is network route, we still may
955          *    check its validity using saved pointer
956          *    to the last used address: daddr_cache.
957          *    We do not want to save whole address now,
958          *    (because main consumer of this service
959          *    is tcp, which has not this problem),
960          *    so that the last trick works only on connected
961          *    sockets.
962          * 2. oif also should be the same.
963          */
964         if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
965 #ifdef CONFIG_IPV6_SUBTREES
966             ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
967 #endif
968            (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
969               (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
970                 dst_release(dst);
971                 dst = NULL;
972         }
973
974 out:
975         return dst;
976 }
977
/* Core of ip6_dst_lookup()/ip6_dst_lookup_flow(): resolve a route for
 * @fl6 into *dst, selecting a source address first when the flow leaves
 * it unspecified.  On entry *dst may already hold a candidate route.
 * Returns 0 with a referenced route in *dst, or a negative errno with
 * *dst reset to NULL.
 */
static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
	struct rt6_info *rt;
#endif
	int err;
	int flags = 0;

	/* The correct way to handle this would be to do
	 * ip6_route_get_saddr, and then ip6_route_output; however,
	 * the route-specific preferred source forces the
	 * ip6_route_output call _before_ ip6_route_get_saddr.
	 *
	 * In source specific routing (no src=any default route),
	 * ip6_route_output will fail given src=any saddr, though, so
	 * that's why we try it again later.
	 */
	if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
		struct fib6_info *from;
		struct rt6_info *rt;
		bool had_dst = *dst != NULL;

		if (!had_dst)
			*dst = ip6_route_output(net, sk, fl6);
		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;

		/* rt->from is RCU-protected; hold the read lock across
		 * the source-address selection.
		 */
		rcu_read_lock();
		from = rt ? rcu_dereference(rt->from) : NULL;
		err = ip6_route_get_saddr(net, from, &fl6->daddr,
					  sk ? inet6_sk(sk)->srcprefs : 0,
					  &fl6->saddr);
		rcu_read_unlock();

		if (err)
			goto out_err_release;

		/* If we had an erroneous initial result, pretend it
		 * never existed and let the SA-enabled version take
		 * over.
		 */
		if (!had_dst && (*dst)->error) {
			dst_release(*dst);
			*dst = NULL;
		}

		if (fl6->flowi6_oif)
			flags |= RT6_LOOKUP_F_IFACE;
	}

	/* Redo the lookup with the (now possibly filled-in) saddr. */
	if (!*dst)
		*dst = ip6_route_output_flags(net, sk, fl6, flags);

	err = (*dst)->error;
	if (err)
		goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	rt = (struct rt6_info *) *dst;
	rcu_read_lock_bh();
	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
				      rt6_nexthop(rt, &fl6->daddr));
	/* Neighbour exists but is not yet reachable/valid. */
	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
	rcu_read_unlock_bh();

	if (err) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			/* daddr = :: routes via the default router. */
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			err = (*dst)->error;
			if (err)
				goto out_err_release;
		}
	}
#endif
	/* A v4-mapped source is only usable with a v4-mapped (or
	 * unspecified) destination.
	 */
	if (ipv6_addr_v4mapped(&fl6->saddr) &&
	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
		err = -EAFNOSUPPORT;
		goto out_err_release;
	}

	return 0;

out_err_release:
	dst_release(*dst);
	*dst = NULL;

	if (err == -ENETUNREACH)
		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	return err;
}
1095
/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@net: Network namespace to perform lookup in
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl6: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
		   struct flowi6 *fl6)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(net, sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1113
/**
 *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 *	@net: Network namespace to perform lookup in
 *	@sk: socket which provides route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
				      const struct in6_addr *final_dst)
{
	struct dst_entry *dst = NULL;
	int err;

	err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	/* The on-the-wire destination may differ from the address used
	 * for the IPsec policy/route lookup.
	 */
	if (final_dst)
		fl6->daddr = *final_dst;

	/* Let xfrm replace the route with an IPsec bundle if a policy
	 * matches; may also return a pointer-encoded error.
	 */
	return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1140
1141 /**
1142  *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1143  *      @sk: socket which provides the dst cache and route info
1144  *      @fl6: flow to lookup
1145  *      @final_dst: final destination address for ipsec lookup
1146  *      @connected: whether @sk is connected or not
1147  *
1148  *      This function performs a route lookup on the given flow with the
1149  *      possibility of using the cached route in the socket if it is valid.
1150  *      It will take the socket dst lock when operating on the dst cache.
1151  *      As a result, this function can only be used in process context.
1152  *
1153  *      In addition, for a connected socket, cache the dst in the socket
1154  *      if the current cache is not valid.
1155  *
1156  *      It returns a valid dst pointer on success, or a pointer encoded
1157  *      error code.
1158  */
1159 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1160                                          const struct in6_addr *final_dst,
1161                                          bool connected)
1162 {
1163         struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1164
1165         dst = ip6_sk_dst_check(sk, dst, fl6);
1166         if (dst)
1167                 return dst;
1168
1169         dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
1170         if (connected && !IS_ERR(dst))
1171                 ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);
1172
1173         return dst;
1174 }
1175 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1176
1177 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1178                                                gfp_t gfp)
1179 {
1180         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1181 }
1182
1183 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1184                                                 gfp_t gfp)
1185 {
1186         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1187 }
1188
1189 static void ip6_append_data_mtu(unsigned int *mtu,
1190                                 int *maxfraglen,
1191                                 unsigned int fragheaderlen,
1192                                 struct sk_buff *skb,
1193                                 struct rt6_info *rt,
1194                                 unsigned int orig_mtu)
1195 {
1196         if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1197                 if (!skb) {
1198                         /* first fragment, reserve header_len */
1199                         *mtu = orig_mtu - rt->dst.header_len;
1200
1201                 } else {
1202                         /*
1203                          * this fragment is not first, the headers
1204                          * space is regarded as data space.
1205                          */
1206                         *mtu = orig_mtu;
1207                 }
1208                 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1209                               + fragheaderlen - sizeof(struct frag_hdr);
1210         }
1211 }
1212
/* Initialize cork state for a pending (corked) transmission: deep-copy
 * the supplied tx options into @v6_cork, take a reference on @rt for the
 * cork, and record per-send parameters (flow, hop limit, traffic class,
 * fragment size, gso size, timestamping flags).  Returns 0 or a negative
 * errno.  NOTE(review): on a partial option-duplication failure the
 * already-copied headers stay in v6_cork->opt — presumably released by
 * the caller's cork teardown; confirm against ip6_cork_release().
 */
static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
			  struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
			  struct rt6_info *rt, struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	unsigned int mtu;
	struct ipv6_txoptions *opt = ipc6->opt;

	/*
	 * setup for corking
	 */
	if (opt) {
		/* Options must only be installed once per cork cycle. */
		if (WARN_ON(v6_cork->opt))
			return -EINVAL;

		v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
		if (unlikely(!v6_cork->opt))
			return -ENOBUFS;

		v6_cork->opt->tot_len = sizeof(*opt);
		v6_cork->opt->opt_flen = opt->opt_flen;
		v6_cork->opt->opt_nflen = opt->opt_nflen;

		/* Deep-copy each extension header present in @opt. */
		v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
						    sk->sk_allocation);
		if (opt->dst0opt && !v6_cork->opt->dst0opt)
			return -ENOBUFS;

		v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
						    sk->sk_allocation);
		if (opt->dst1opt && !v6_cork->opt->dst1opt)
			return -ENOBUFS;

		v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
						   sk->sk_allocation);
		if (opt->hopopt && !v6_cork->opt->hopopt)
			return -ENOBUFS;

		v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
						    sk->sk_allocation);
		if (opt->srcrt && !v6_cork->opt->srcrt)
			return -ENOBUFS;

		/* need source address above miyazawa*/
	}
	/* The cork holds its own reference on the route. */
	dst_hold(&rt->dst);
	cork->base.dst = &rt->dst;
	cork->fl.u.ip6 = *fl6;
	v6_cork->hop_limit = ipc6->hlimit;
	v6_cork->tclass = ipc6->tclass;
	/* Pick the mtu: device mtu when probing PMTU, otherwise the route
	 * (or, outside xfrm tunnel mode, the underlying path) mtu.
	 */
	if (rt->dst.flags & DST_XFRM_TUNNEL)
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
	else
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
			READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
	/* A user-configured frag_size may only lower the mtu. */
	if (np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	cork->base.fragsize = mtu;
	cork->base.gso_size = ipc6->gso_size;
	cork->base.tx_flags = 0;
	sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);

	if (dst_allfrag(xfrm_dst_path(&rt->dst)))
		cork->base.flags |= IPCORK_ALLFRAG;
	cork->base.length = 0;

	cork->base.transmit_time = ipc6->sockc.transmit_time;

	return 0;
}
1286
1287 static int __ip6_append_data(struct sock *sk,
1288                              struct flowi6 *fl6,
1289                              struct sk_buff_head *queue,
1290                              struct inet_cork *cork,
1291                              struct inet6_cork *v6_cork,
1292                              struct page_frag *pfrag,
1293                              int getfrag(void *from, char *to, int offset,
1294                                          int len, int odd, struct sk_buff *skb),
1295                              void *from, int length, int transhdrlen,
1296                              unsigned int flags, struct ipcm6_cookie *ipc6)
1297 {
1298         struct sk_buff *skb, *skb_prev = NULL;
1299         unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1300         int exthdrlen = 0;
1301         int dst_exthdrlen = 0;
1302         int hh_len;
1303         int copy;
1304         int err;
1305         int offset = 0;
1306         u32 tskey = 0;
1307         struct rt6_info *rt = (struct rt6_info *)cork->dst;
1308         struct ipv6_txoptions *opt = v6_cork->opt;
1309         int csummode = CHECKSUM_NONE;
1310         unsigned int maxnonfragsize, headersize;
1311         unsigned int wmem_alloc_delta = 0;
1312         bool paged;
1313
1314         skb = skb_peek_tail(queue);
1315         if (!skb) {
1316                 exthdrlen = opt ? opt->opt_flen : 0;
1317                 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1318         }
1319
1320         paged = !!cork->gso_size;
1321         mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
1322         orig_mtu = mtu;
1323
1324         if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
1325             sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1326                 tskey = sk->sk_tskey++;
1327
1328         hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1329
1330         fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1331                         (opt ? opt->opt_nflen : 0);
1332
1333         headersize = sizeof(struct ipv6hdr) +
1334                      (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1335                      (dst_allfrag(&rt->dst) ?
1336                       sizeof(struct frag_hdr) : 0) +
1337                      rt->rt6i_nfheader_len;
1338
1339         if (mtu <= fragheaderlen ||
1340             ((mtu - fragheaderlen) & ~7) + fragheaderlen <= sizeof(struct frag_hdr))
1341                 goto emsgsize;
1342
1343         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1344                      sizeof(struct frag_hdr);
1345
1346         /* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
1347          * the first fragment
1348          */
1349         if (headersize + transhdrlen > mtu)
1350                 goto emsgsize;
1351
1352         if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1353             (sk->sk_protocol == IPPROTO_UDP ||
1354              sk->sk_protocol == IPPROTO_RAW)) {
1355                 ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1356                                 sizeof(struct ipv6hdr));
1357                 goto emsgsize;
1358         }
1359
1360         if (ip6_sk_ignore_df(sk))
1361                 maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1362         else
1363                 maxnonfragsize = mtu;
1364
1365         if (cork->length + length > maxnonfragsize - headersize) {
1366 emsgsize:
1367                 pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1368                 ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
1369                 return -EMSGSIZE;
1370         }
1371
1372         /* CHECKSUM_PARTIAL only with no extension headers and when
1373          * we are not going to fragment
1374          */
1375         if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1376             headersize == sizeof(struct ipv6hdr) &&
1377             length <= mtu - headersize &&
1378             (!(flags & MSG_MORE) || cork->gso_size) &&
1379             rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1380                 csummode = CHECKSUM_PARTIAL;
1381
1382         /*
1383          * Let's try using as much space as possible.
1384          * Use MTU if total length of the message fits into the MTU.
1385          * Otherwise, we need to reserve fragment header and
1386          * fragment alignment (= 8-15 octects, in total).
1387          *
1388          * Note that we may need to "move" the data from the tail of
1389          * of the buffer to the new fragment when we split
1390          * the message.
1391          *
1392          * FIXME: It may be fragmented into multiple chunks
1393          *        at once if non-fragmentable extension headers
1394          *        are too large.
1395          * --yoshfuji
1396          */
1397
1398         cork->length += length;
1399         if (!skb)
1400                 goto alloc_new_skb;
1401
1402         while (length > 0) {
1403                 /* Check if the remaining data fits into current packet. */
1404                 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1405                 if (copy < length)
1406                         copy = maxfraglen - skb->len;
1407
1408                 if (copy <= 0) {
1409                         char *data;
1410                         unsigned int datalen;
1411                         unsigned int fraglen;
1412                         unsigned int fraggap;
1413                         unsigned int alloclen, alloc_extra;
1414                         unsigned int pagedlen;
1415 alloc_new_skb:
1416                         /* There's no room in the current skb */
1417                         if (skb)
1418                                 fraggap = skb->len - maxfraglen;
1419                         else
1420                                 fraggap = 0;
1421                         /* update mtu and maxfraglen if necessary */
1422                         if (!skb || !skb_prev)
1423                                 ip6_append_data_mtu(&mtu, &maxfraglen,
1424                                                     fragheaderlen, skb, rt,
1425                                                     orig_mtu);
1426
1427                         skb_prev = skb;
1428
1429                         /*
1430                          * If remaining data exceeds the mtu,
1431                          * we know we need more fragment(s).
1432                          */
1433                         datalen = length + fraggap;
1434
1435                         if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1436                                 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1437                         fraglen = datalen + fragheaderlen;
1438                         pagedlen = 0;
1439
1440                         alloc_extra = hh_len;
1441                         alloc_extra += dst_exthdrlen;
1442                         alloc_extra += rt->dst.trailer_len;
1443
1444                         /* We just reserve space for fragment header.
1445                          * Note: this may be overallocation if the message
1446                          * (without MSG_MORE) fits into the MTU.
1447                          */
1448                         alloc_extra += sizeof(struct frag_hdr);
1449
1450                         if ((flags & MSG_MORE) &&
1451                             !(rt->dst.dev->features&NETIF_F_SG))
1452                                 alloclen = mtu;
1453                         else if (!paged &&
1454                                  (fraglen + alloc_extra < SKB_MAX_ALLOC ||
1455                                   !(rt->dst.dev->features & NETIF_F_SG)))
1456                                 alloclen = fraglen;
1457                         else {
1458                                 alloclen = min_t(int, fraglen, MAX_HEADER);
1459                                 pagedlen = fraglen - alloclen;
1460                         }
1461                         alloclen += alloc_extra;
1462
1463                         if (datalen != length + fraggap) {
1464                                 /*
1465                                  * this is not the last fragment, the trailer
1466                                  * space is regarded as data space.
1467                                  */
1468                                 datalen += rt->dst.trailer_len;
1469                         }
1470
1471                         fraglen = datalen + fragheaderlen;
1472
1473                         copy = datalen - transhdrlen - fraggap - pagedlen;
1474                         if (copy < 0) {
1475                                 err = -EINVAL;
1476                                 goto error;
1477                         }
1478                         if (transhdrlen) {
1479                                 skb = sock_alloc_send_skb(sk, alloclen,
1480                                                 (flags & MSG_DONTWAIT), &err);
1481                         } else {
1482                                 skb = NULL;
1483                                 if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
1484                                     2 * sk->sk_sndbuf)
1485                                         skb = alloc_skb(alloclen,
1486                                                         sk->sk_allocation);
1487                                 if (unlikely(!skb))
1488                                         err = -ENOBUFS;
1489                         }
1490                         if (!skb)
1491                                 goto error;
1492                         /*
1493                          *      Fill in the control structures
1494                          */
1495                         skb->protocol = htons(ETH_P_IPV6);
1496                         skb->ip_summed = csummode;
1497                         skb->csum = 0;
1498                         /* reserve for fragmentation and ipsec header */
1499                         skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1500                                     dst_exthdrlen);
1501
1502                         /* Only the initial fragment is time stamped */
1503                         skb_shinfo(skb)->tx_flags = cork->tx_flags;
1504                         cork->tx_flags = 0;
1505                         skb_shinfo(skb)->tskey = tskey;
1506                         tskey = 0;
1507
1508                         /*
1509                          *      Find where to start putting bytes
1510                          */
1511                         data = skb_put(skb, fraglen - pagedlen);
1512                         skb_set_network_header(skb, exthdrlen);
1513                         data += fragheaderlen;
1514                         skb->transport_header = (skb->network_header +
1515                                                  fragheaderlen);
1516                         if (fraggap) {
1517                                 skb->csum = skb_copy_and_csum_bits(
1518                                         skb_prev, maxfraglen,
1519                                         data + transhdrlen, fraggap, 0);
1520                                 skb_prev->csum = csum_sub(skb_prev->csum,
1521                                                           skb->csum);
1522                                 data += fraggap;
1523                                 pskb_trim_unique(skb_prev, maxfraglen);
1524                         }
1525                         if (copy > 0 &&
1526                             getfrag(from, data + transhdrlen, offset,
1527                                     copy, fraggap, skb) < 0) {
1528                                 err = -EFAULT;
1529                                 kfree_skb(skb);
1530                                 goto error;
1531                         }
1532
1533                         offset += copy;
1534                         length -= copy + transhdrlen;
1535                         transhdrlen = 0;
1536                         exthdrlen = 0;
1537                         dst_exthdrlen = 0;
1538
1539                         if ((flags & MSG_CONFIRM) && !skb_prev)
1540                                 skb_set_dst_pending_confirm(skb, 1);
1541
1542                         /*
1543                          * Put the packet on the pending queue
1544                          */
1545                         if (!skb->destructor) {
1546                                 skb->destructor = sock_wfree;
1547                                 skb->sk = sk;
1548                                 wmem_alloc_delta += skb->truesize;
1549                         }
1550                         __skb_queue_tail(queue, skb);
1551                         continue;
1552                 }
1553
1554                 if (copy > length)
1555                         copy = length;
1556
1557                 if (!(rt->dst.dev->features&NETIF_F_SG) &&
1558                     skb_tailroom(skb) >= copy) {
1559                         unsigned int off;
1560
1561                         off = skb->len;
1562                         if (getfrag(from, skb_put(skb, copy),
1563                                                 offset, copy, off, skb) < 0) {
1564                                 __skb_trim(skb, off);
1565                                 err = -EFAULT;
1566                                 goto error;
1567                         }
1568                 } else {
1569                         int i = skb_shinfo(skb)->nr_frags;
1570
1571                         err = -ENOMEM;
1572                         if (!sk_page_frag_refill(sk, pfrag))
1573                                 goto error;
1574
1575                         if (!skb_can_coalesce(skb, i, pfrag->page,
1576                                               pfrag->offset)) {
1577                                 err = -EMSGSIZE;
1578                                 if (i == MAX_SKB_FRAGS)
1579                                         goto error;
1580
1581                                 __skb_fill_page_desc(skb, i, pfrag->page,
1582                                                      pfrag->offset, 0);
1583                                 skb_shinfo(skb)->nr_frags = ++i;
1584                                 get_page(pfrag->page);
1585                         }
1586                         copy = min_t(int, copy, pfrag->size - pfrag->offset);
1587                         if (getfrag(from,
1588                                     page_address(pfrag->page) + pfrag->offset,
1589                                     offset, copy, skb->len, skb) < 0)
1590                                 goto error_efault;
1591
1592                         pfrag->offset += copy;
1593                         skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1594                         skb->len += copy;
1595                         skb->data_len += copy;
1596                         skb->truesize += copy;
1597                         wmem_alloc_delta += copy;
1598                 }
1599                 offset += copy;
1600                 length -= copy;
1601         }
1602
1603         if (wmem_alloc_delta)
1604                 refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1605         return 0;
1606
1607 error_efault:
1608         err = -EFAULT;
1609 error:
1610         cork->length -= length;
1611         IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1612         refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1613         return err;
1614 }
1615
1616 int ip6_append_data(struct sock *sk,
1617                     int getfrag(void *from, char *to, int offset, int len,
1618                                 int odd, struct sk_buff *skb),
1619                     void *from, int length, int transhdrlen,
1620                     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1621                     struct rt6_info *rt, unsigned int flags)
1622 {
1623         struct inet_sock *inet = inet_sk(sk);
1624         struct ipv6_pinfo *np = inet6_sk(sk);
1625         int exthdrlen;
1626         int err;
1627
1628         if (flags&MSG_PROBE)
1629                 return 0;
1630         if (skb_queue_empty(&sk->sk_write_queue)) {
1631                 /*
1632                  * setup for corking
1633                  */
1634                 err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1635                                      ipc6, rt, fl6);
1636                 if (err)
1637                         return err;
1638
1639                 exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1640                 length += exthdrlen;
1641                 transhdrlen += exthdrlen;
1642         } else {
1643                 fl6 = &inet->cork.fl.u.ip6;
1644                 transhdrlen = 0;
1645         }
1646
1647         return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1648                                  &np->cork, sk_page_frag(sk), getfrag,
1649                                  from, length, transhdrlen, flags, ipc6);
1650 }
1651 EXPORT_SYMBOL_GPL(ip6_append_data);
1652
1653 static void ip6_cork_release(struct inet_cork_full *cork,
1654                              struct inet6_cork *v6_cork)
1655 {
1656         if (v6_cork->opt) {
1657                 kfree(v6_cork->opt->dst0opt);
1658                 kfree(v6_cork->opt->dst1opt);
1659                 kfree(v6_cork->opt->hopopt);
1660                 kfree(v6_cork->opt->srcrt);
1661                 kfree(v6_cork->opt);
1662                 v6_cork->opt = NULL;
1663         }
1664
1665         if (cork->base.dst) {
1666                 dst_release(cork->base.dst);
1667                 cork->base.dst = NULL;
1668                 cork->base.flags &= ~IPCORK_ALLFRAG;
1669         }
1670         memset(&cork->fl, 0, sizeof(cork->fl));
1671 }
1672
/*
 *	Assemble the fragments queued on @queue into one IPv6 packet.
 *
 *	Dequeues the head skb, chains all remaining queued skbs onto its
 *	frag_list, pushes any extension headers plus the IPv6 header from the
 *	cork state, updates output statistics, and releases the cork.
 *
 *	Returns the assembled skb (not yet transmitted), or NULL if the
 *	queue was empty.
 */
struct sk_buff *__ip6_make_skb(struct sock *sk,
			       struct sk_buff_head *queue,
			       struct inet_cork_full *cork,
			       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = v6_cork->opt;
	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
	struct flowi6 *fl6 = &cork->fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;

	skb = __skb_dequeue(queue);
	if (!skb)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	/* Chain the remaining fragments onto the head skb's frag_list,
	 * transferring their byte counts and dropping their socket
	 * accounting (the head skb now owns them).
	 */
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	skb->ignore_df = ip6_sk_ignore_df(sk);

	*final_dst = fl6->daddr;
	__skb_pull(skb, skb_network_header_len(skb));
	/* Push extension headers in front of the payload; nfrag opts may
	 * rewrite final_dst (e.g. routing header semantics).
	 */
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/* Fill the IPv6 header from the flow and cork state. */
	ip6_flow_hdr(hdr, v6_cork->tclass,
		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
					ip6_autoflowlabel(net, np), fl6));
	hdr->hop_limit = v6_cork->hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb->tstamp = cork->base.transmit_time;

	skb_dst_set(skb, dst_clone(&rt->dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
		u8 icmp6_type;

		/* Raw sockets without IPV6_HDRINCL carry the type in the
		 * flow; otherwise read it from the packet itself.
		 */
		if (sk->sk_socket->type == SOCK_RAW && !inet_sk(sk)->hdrincl)
			icmp6_type = fl6->fl6_icmp_type;
		else
			icmp6_type = icmp6_hdr(skb)->icmp6_type;
		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_type);
		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
	}

	ip6_cork_release(cork, v6_cork);
out:
	return skb;
}
1753
1754 int ip6_send_skb(struct sk_buff *skb)
1755 {
1756         struct net *net = sock_net(skb->sk);
1757         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1758         int err;
1759
1760         err = ip6_local_out(net, skb->sk, skb);
1761         if (err) {
1762                 if (err > 0)
1763                         err = net_xmit_errno(err);
1764                 if (err)
1765                         IP6_INC_STATS(net, rt->rt6i_idev,
1766                                       IPSTATS_MIB_OUTDISCARDS);
1767         }
1768
1769         return err;
1770 }
1771
/*
 *	Finalize the corked write queue into a single skb and transmit it.
 *	An empty queue is not an error: return 0 with nothing sent.
 */
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb = ip6_finish_skb(sk);

	return skb ? ip6_send_skb(skb) : 0;
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1783
1784 static void __ip6_flush_pending_frames(struct sock *sk,
1785                                        struct sk_buff_head *queue,
1786                                        struct inet_cork_full *cork,
1787                                        struct inet6_cork *v6_cork)
1788 {
1789         struct sk_buff *skb;
1790
1791         while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1792                 if (skb_dst(skb))
1793                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1794                                       IPSTATS_MIB_OUTDISCARDS);
1795                 kfree_skb(skb);
1796         }
1797
1798         ip6_cork_release(cork, v6_cork);
1799 }
1800
1801 void ip6_flush_pending_frames(struct sock *sk)
1802 {
1803         __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1804                                    &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1805 }
1806 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1807
/*
 *	Uncorked fast path: build one complete packet in a single call.
 *
 *	Sets up a caller-provided cork, appends all of @from onto a private
 *	queue via __ip6_append_data(), then assembles the final skb with
 *	__ip6_make_skb() (which also releases the cork).
 *
 *	Returns the assembled skb, an ERR_PTR() on failure, or NULL for
 *	MSG_PROBE.
 */
struct sk_buff *ip6_make_skb(struct sock *sk,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
			     struct rt6_info *rt, unsigned int flags,
			     struct inet_cork_full *cork)
{
	struct inet6_cork v6_cork;
	struct sk_buff_head queue;
	/* Fragmentable destination-option bytes to account for, as in
	 * ip6_append_data().
	 */
	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
	int err;

	if (flags & MSG_PROBE)
		return NULL;

	__skb_queue_head_init(&queue);

	/* Initialize the cork before ip6_setup_cork() so that
	 * ip6_cork_release() is safe on the error path below.
	 */
	cork->base.flags = 0;
	cork->base.addr = 0;
	cork->base.opt = NULL;
	cork->base.dst = NULL;
	v6_cork.opt = NULL;
	err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6);
	if (err) {
		ip6_cork_release(cork, &v6_cork);
		return ERR_PTR(err);
	}
	if (ipc6->dontfrag < 0)
		ipc6->dontfrag = inet6_sk(sk)->dontfrag;

	err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork,
				&current->task_frag, getfrag, from,
				length + exthdrlen, transhdrlen + exthdrlen,
				flags, ipc6);
	if (err) {
		/* Drop any partially queued fragments and the cork. */
		__ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
		return ERR_PTR(err);
	}

	return __ip6_make_skb(sk, &queue, cork, &v6_cork);
}