GNU Linux-libre 5.10.217-gnu1
net/ipv6/ip6_output.c
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  *      IPv6 output functions
4  *      Linux INET6 implementation
5  *
6  *      Authors:
7  *      Pedro Roque             <roque@di.fc.ul.pt>
8  *
9  *      Based on linux/net/ipv4/ip_output.c
10  *
11  *      Changes:
12  *      A.N.Kuznetsov   :       arithmetic in fragmentation.
13  *                              extension headers are implemented.
14  *                              route changes now work.
15  *                              ip6_forward does not confuse sniffers.
16  *                              etc.
17  *
18  *      H. von Brand    :       Added missing #include <linux/string.h>
19  *      Imran Patel     :       frag id should be in NBO
20  *      Kazunori MIYAZAWA @USAGI
21  *                      :       add ip6_append_data and related functions
22  *                              for datagram xmit
23  */
24
25 #include <linux/errno.h>
26 #include <linux/kernel.h>
27 #include <linux/string.h>
28 #include <linux/socket.h>
29 #include <linux/net.h>
30 #include <linux/netdevice.h>
31 #include <linux/if_arp.h>
32 #include <linux/in6.h>
33 #include <linux/tcp.h>
34 #include <linux/route.h>
35 #include <linux/module.h>
36 #include <linux/slab.h>
37
38 #include <linux/bpf-cgroup.h>
39 #include <linux/netfilter.h>
40 #include <linux/netfilter_ipv6.h>
41
42 #include <net/sock.h>
43 #include <net/snmp.h>
44
45 #include <net/ipv6.h>
46 #include <net/ndisc.h>
47 #include <net/protocol.h>
48 #include <net/ip6_route.h>
49 #include <net/addrconf.h>
50 #include <net/rawv6.h>
51 #include <net/icmp.h>
52 #include <net/xfrm.h>
53 #include <net/checksum.h>
54 #include <linux/mroute6.h>
55 #include <net/l3mdev.h>
56 #include <net/lwtunnel.h>
57 #include <net/ip_tunnels.h>
58
59 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
60 {
61         struct dst_entry *dst = skb_dst(skb);
62         struct net_device *dev = dst->dev;
63         unsigned int hh_len = LL_RESERVED_SPACE(dev);
64         int delta = hh_len - skb_headroom(skb);
65         const struct in6_addr *nexthop;
66         struct neighbour *neigh;
67         int ret;
68
69         /* Be paranoid, rather than too clever. */
70         if (unlikely(delta > 0) && dev->header_ops) {
71                 /* pskb_expand_head() might crash if skb is shared */
72                 if (skb_shared(skb)) {
73                         struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
74
75                         if (likely(nskb)) {
76                                 if (skb->sk)
77                                         skb_set_owner_w(nskb, skb->sk);
78                                 consume_skb(skb);
79                         } else {
80                                 kfree_skb(skb);
81                         }
82                         skb = nskb;
83                 }
84                 if (skb &&
85                     pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
86                         kfree_skb(skb);
87                         skb = NULL;
88                 }
89                 if (!skb) {
90                         IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
91                         return -ENOMEM;
92                 }
93         }
94
95         if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
96                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
97
98                 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
99                     ((mroute6_is_socket(net, skb) &&
100                      !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
101                      ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
102                                          &ipv6_hdr(skb)->saddr))) {
103                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
104
105                         /* Do not check for IFF_ALLMULTI; multicast routing
106                            is not supported in any case.
107                          */
108                         if (newskb)
109                                 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
110                                         net, sk, newskb, NULL, newskb->dev,
111                                         dev_loopback_xmit);
112
113                         if (ipv6_hdr(skb)->hop_limit == 0) {
114                                 IP6_INC_STATS(net, idev,
115                                               IPSTATS_MIB_OUTDISCARDS);
116                                 kfree_skb(skb);
117                                 return 0;
118                         }
119                 }
120
121                 IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
122
123                 if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
124                     IPV6_ADDR_SCOPE_NODELOCAL &&
125                     !(dev->flags & IFF_LOOPBACK)) {
126                         kfree_skb(skb);
127                         return 0;
128                 }
129         }
130
131         if (lwtunnel_xmit_redirect(dst->lwtstate)) {
132                 int res = lwtunnel_xmit(skb);
133
134                 if (res != LWTUNNEL_XMIT_CONTINUE)
135                         return res;
136         }
137
138         rcu_read_lock_bh();
139         nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
140         neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
141         if (unlikely(!neigh))
142                 neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
143         if (!IS_ERR(neigh)) {
144                 sock_confirm_neigh(skb, neigh);
145                 ret = neigh_output(neigh, skb, false);
146                 rcu_read_unlock_bh();
147                 return ret;
148         }
149         rcu_read_unlock_bh();
150
151         IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
152         kfree_skb(skb);
153         return -EINVAL;
154 }
155
156 static int
157 ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
158                                     struct sk_buff *skb, unsigned int mtu)
159 {
160         struct sk_buff *segs, *nskb;
161         netdev_features_t features;
162         int ret = 0;
163
164         /* Please see corresponding comment in ip_finish_output_gso
165          * describing the cases where GSO segment length exceeds the
166          * egress MTU.
167          */
168         features = netif_skb_features(skb);
169         segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
170         if (IS_ERR_OR_NULL(segs)) {
171                 kfree_skb(skb);
172                 return -ENOMEM;
173         }
174
175         consume_skb(skb);
176
177         skb_list_walk_safe(segs, segs, nskb) {
178                 int err;
179
180                 skb_mark_not_on_list(segs);
181                 /* Last GSO segment can be smaller than gso_size (and MTU).
182                  * Adding a fragment header would produce an "atomic fragment",
183                  * which is considered harmful (RFC-8021). Avoid that.
184                  */
185                 err = segs->len > mtu ?
186                         ip6_fragment(net, sk, segs, ip6_finish_output2) :
187                         ip6_finish_output2(net, sk, segs);
188                 if (err && ret == 0)
189                         ret = err;
190         }
191
192         return ret;
193 }
194
195 static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
196 {
197         unsigned int mtu;
198
199 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
200         /* Policy lookup after SNAT yielded a new policy */
201         if (skb_dst(skb)->xfrm) {
202                 IP6CB(skb)->flags |= IP6SKB_REROUTED;
203                 return dst_output(net, sk, skb);
204         }
205 #endif
206
207         mtu = ip6_skb_dst_mtu(skb);
208         if (skb_is_gso(skb) && !skb_gso_validate_network_len(skb, mtu))
209                 return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);
210
211         if ((skb->len > mtu && !skb_is_gso(skb)) ||
212             dst_allfrag(skb_dst(skb)) ||
213             (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
214                 return ip6_fragment(net, sk, skb, ip6_finish_output2);
215         else
216                 return ip6_finish_output2(net, sk, skb);
217 }
218
219 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
220 {
221         int ret;
222
223         ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
224         switch (ret) {
225         case NET_XMIT_SUCCESS:
226                 return __ip6_finish_output(net, sk, skb);
227         case NET_XMIT_CN:
228                 return __ip6_finish_output(net, sk, skb) ? : ret;
229         default:
230                 kfree_skb(skb);
231                 return ret;
232         }
233 }
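/*
 * Verdict mapping for the cgroup-BPF egress hook above: NET_XMIT_SUCCESS
 * continues down the output path, NET_XMIT_CN still transmits but reports
 * the congestion verdict to the caller (the "?:" keeps an error from
 * __ip6_finish_output() when one occurred), and any other verdict drops
 * the packet and returns the verdict unchanged.
 */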
234
235 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
236 {
237         struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;
238         struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
239
240         skb->protocol = htons(ETH_P_IPV6);
241         skb->dev = dev;
242
243         if (unlikely(idev->cnf.disable_ipv6)) {
244                 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
245                 kfree_skb(skb);
246                 return 0;
247         }
248
249         return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
250                             net, sk, skb, indev, dev,
251                             ip6_finish_output,
252                             !(IP6CB(skb)->flags & IP6SKB_REROUTED));
253 }
254
255 bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
256 {
257         if (!np->autoflowlabel_set)
258                 return ip6_default_np_autolabel(net);
259         else
260                 return np->autoflowlabel;
261 }
262
263 /*
264  * xmit an sk_buff (used by TCP, SCTP and DCCP)
265  * Note: the socket lock is not held for SYNACK packets, but the socket
266  * might be modified by calls to skb_set_owner_w() and ipv6_local_error(),
267  * which use proper atomic operations or spinlocks.
268  */
269 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
270              __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
271 {
272         struct net *net = sock_net(sk);
273         const struct ipv6_pinfo *np = inet6_sk(sk);
274         struct in6_addr *first_hop = &fl6->daddr;
275         struct dst_entry *dst = skb_dst(skb);
276         unsigned int head_room;
277         struct ipv6hdr *hdr;
278         u8  proto = fl6->flowi6_proto;
279         int seg_len = skb->len;
280         int hlimit = -1;
281         u32 mtu;
282
283         head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
284         if (opt)
285                 head_room += opt->opt_nflen + opt->opt_flen;
286
287         if (unlikely(skb_headroom(skb) < head_room)) {
288                 struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
289                 if (!skb2) {
290                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
291                                       IPSTATS_MIB_OUTDISCARDS);
292                         kfree_skb(skb);
293                         return -ENOBUFS;
294                 }
295                 if (skb->sk)
296                         skb_set_owner_w(skb2, skb->sk);
297                 consume_skb(skb);
298                 skb = skb2;
299         }
300
301         if (opt) {
302                 seg_len += opt->opt_nflen + opt->opt_flen;
303
304                 if (opt->opt_flen)
305                         ipv6_push_frag_opts(skb, opt, &proto);
306
307                 if (opt->opt_nflen)
308                         ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
309                                              &fl6->saddr);
310         }
311
312         skb_push(skb, sizeof(struct ipv6hdr));
313         skb_reset_network_header(skb);
314         hdr = ipv6_hdr(skb);
315
316         /*
317          *      Fill in the IPv6 header
318          */
319         if (np)
320                 hlimit = np->hop_limit;
321         if (hlimit < 0)
322                 hlimit = ip6_dst_hoplimit(dst);
323
324         ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
325                                 ip6_autoflowlabel(net, np), fl6));
326
327         hdr->payload_len = htons(seg_len);
328         hdr->nexthdr = proto;
329         hdr->hop_limit = hlimit;
330
331         hdr->saddr = fl6->saddr;
332         hdr->daddr = *first_hop;
333
334         skb->protocol = htons(ETH_P_IPV6);
335         skb->priority = priority;
336         skb->mark = mark;
337
338         mtu = dst_mtu(dst);
339         if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
340                 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
341                               IPSTATS_MIB_OUT, skb->len);
342
343                 /* if the egress device is enslaved to an L3 master device, pass the
344                  * skb to its handler for processing
345                  */
346                 skb = l3mdev_ip6_out((struct sock *)sk, skb);
347                 if (unlikely(!skb))
348                         return 0;
349
350                 /* hooks should never assume the socket lock is held;
351                  * we promote our socket to non-const
352                  */
353                 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
354                                net, (struct sock *)sk, skb, NULL, dst->dev,
355                                dst_output);
356         }
357
358         skb->dev = dst->dev;
359         /* ipv6_local_error() does not require the socket lock,
360          * so we promote our socket to non-const
361          */
362         ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
363
364         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
365         kfree_skb(skb);
366         return -EMSGSIZE;
367 }
368 EXPORT_SYMBOL(ip6_xmit);
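/*
 * A minimal, hypothetical sketch of how a transport protocol might call
 * ip6_xmit(); "my_sk", "my_skb", "dst" and "tclass" are illustrative
 * placeholders (tclass would typically come from inet6_sk(my_sk)->tclass)
 * and error handling is elided:
 *
 *	struct flowi6 fl6 = { .flowi6_proto = IPPROTO_TCP };
 *
 *	fl6.daddr = ...;			(peer address)
 *	fl6.saddr = ...;			(chosen at connect time)
 *	skb_dst_set(my_skb, dst_clone(dst));	(route attached beforehand)
 *	err = ip6_xmit(my_sk, my_skb, &fl6, my_sk->sk_mark, NULL,
 *		       tclass, my_sk->sk_priority);
 *
 * On -EMSGSIZE (the packet exceeds the path MTU and cannot be sent) the
 * skb has already been freed by ip6_xmit().
 */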
369
370 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
371 {
372         struct ip6_ra_chain *ra;
373         struct sock *last = NULL;
374
375         read_lock(&ip6_ra_lock);
376         for (ra = ip6_ra_chain; ra; ra = ra->next) {
377                 struct sock *sk = ra->sk;
378                 if (sk && ra->sel == sel &&
379                     (!sk->sk_bound_dev_if ||
380                      sk->sk_bound_dev_if == skb->dev->ifindex)) {
381                         struct ipv6_pinfo *np = inet6_sk(sk);
382
383                         if (np && np->rtalert_isolate &&
384                             !net_eq(sock_net(sk), dev_net(skb->dev))) {
385                                 continue;
386                         }
387                         if (last) {
388                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
389                                 if (skb2)
390                                         rawv6_rcv(last, skb2);
391                         }
392                         last = sk;
393                 }
394         }
395
396         if (last) {
397                 rawv6_rcv(last, skb);
398                 read_unlock(&ip6_ra_lock);
399                 return 1;
400         }
401         read_unlock(&ip6_ra_lock);
402         return 0;
403 }
404
405 static int ip6_forward_proxy_check(struct sk_buff *skb)
406 {
407         struct ipv6hdr *hdr = ipv6_hdr(skb);
408         u8 nexthdr = hdr->nexthdr;
409         __be16 frag_off;
410         int offset;
411
412         if (ipv6_ext_hdr(nexthdr)) {
413                 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
414                 if (offset < 0)
415                         return 0;
416         } else
417                 offset = sizeof(struct ipv6hdr);
418
419         if (nexthdr == IPPROTO_ICMPV6) {
420                 struct icmp6hdr *icmp6;
421
422                 if (!pskb_may_pull(skb, (skb_network_header(skb) +
423                                          offset + 1 - skb->data)))
424                         return 0;
425
426                 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
427
428                 switch (icmp6->icmp6_type) {
429                 case NDISC_ROUTER_SOLICITATION:
430                 case NDISC_ROUTER_ADVERTISEMENT:
431                 case NDISC_NEIGHBOUR_SOLICITATION:
432                 case NDISC_NEIGHBOUR_ADVERTISEMENT:
433                 case NDISC_REDIRECT:
434                         /* For unicast neighbor discovery messages
435                          * destined to the proxied address, pass them to
436                          * the input function.
437                          */
438                         return 1;
439                 default:
440                         break;
441                 }
442         }
443
444         /*
445          * The proxying router can't forward traffic sent to a link-local
446          * address, so signal the sender and discard the packet. This
447          * behavior is clarified by the MIPv6 specification.
448          */
449         if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
450                 dst_link_failure(skb);
451                 return -1;
452         }
453
454         return 0;
455 }
456
457 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
458                                      struct sk_buff *skb)
459 {
460         struct dst_entry *dst = skb_dst(skb);
461
462         __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
463         __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
464
465 #ifdef CONFIG_NET_SWITCHDEV
466         if (skb->offload_l3_fwd_mark) {
467                 consume_skb(skb);
468                 return 0;
469         }
470 #endif
471
472         skb->tstamp = 0;
473         return dst_output(net, sk, skb);
474 }
475
476 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
477 {
478         if (skb->len <= mtu)
479                 return false;
480
481         /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
482         if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
483                 return true;
484
485         if (skb->ignore_df)
486                 return false;
487
488         if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
489                 return false;
490
491         return true;
492 }
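/*
 * Worked example of the checks above, assuming mtu = 1280: a 1400-byte
 * packet that conntrack defrag reassembled from fragments of at most 1200
 * bytes arrives with frag_max_size = 1200 and ignore_df set; since
 * 1200 <= mtu it is not "too big" and may be re-fragmented on output. Had
 * any original fragment exceeded 1280 bytes, frag_max_size > mtu would
 * force a true return (and, in ip6_forward(), an ICMPV6_PKT_TOOBIG).
 */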
493
494 int ip6_forward(struct sk_buff *skb)
495 {
496         struct dst_entry *dst = skb_dst(skb);
497         struct ipv6hdr *hdr = ipv6_hdr(skb);
498         struct inet6_skb_parm *opt = IP6CB(skb);
499         struct net *net = dev_net(dst->dev);
500         struct inet6_dev *idev;
501         u32 mtu;
502
503         idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
504         if (net->ipv6.devconf_all->forwarding == 0)
505                 goto error;
506
507         if (skb->pkt_type != PACKET_HOST)
508                 goto drop;
509
510         if (unlikely(skb->sk))
511                 goto drop;
512
513         if (skb_warn_if_lro(skb))
514                 goto drop;
515
516         if (!net->ipv6.devconf_all->disable_policy &&
517             (!idev || !idev->cnf.disable_policy) &&
518             !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
519                 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
520                 goto drop;
521         }
522
523         skb_forward_csum(skb);
524
525         /*
526          *      We DO NOT do any processing on
527          *      RA packets: we push them to user level AS IS,
528          *      without any WARRANTY that the application will be able
529          *      to interpret them. The reason is that we
530          *      cannot do anything clever here.
531          *
532          *      We are not an end node, so if the packet contains
533          *      AH/ESP we cannot do anything.
534          *      Defragmentation would also be a mistake; RA packets
535          *      cannot be fragmented, because there is no warranty
536          *      that different fragments will go along one path. --ANK
537          */
538         if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
539                 if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
540                         return 0;
541         }
542
543         /*
544          *      check and decrement ttl
545          */
546         if (hdr->hop_limit <= 1) {
547                 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
548                 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
549
550                 kfree_skb(skb);
551                 return -ETIMEDOUT;
552         }
553
554         /* XXX: idev->cnf.proxy_ndp? */
555         if (net->ipv6.devconf_all->proxy_ndp &&
556             pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
557                 int proxied = ip6_forward_proxy_check(skb);
558                 if (proxied > 0)
559                         return ip6_input(skb);
560                 else if (proxied < 0) {
561                         __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
562                         goto drop;
563                 }
564         }
565
566         if (!xfrm6_route_forward(skb)) {
567                 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
568                 goto drop;
569         }
570         dst = skb_dst(skb);
571
572         /* IPv6 specs say nothing about it, but it is clear that we cannot
573            send redirects to source routed frames.
574            We don't send redirects to frames decapsulated from IPsec.
575          */
576         if (IP6CB(skb)->iif == dst->dev->ifindex &&
577             opt->srcrt == 0 && !skb_sec_path(skb)) {
578                 struct in6_addr *target = NULL;
579                 struct inet_peer *peer;
580                 struct rt6_info *rt;
581
582                 /*
583                  *      incoming and outgoing devices are the same
584                  *      send a redirect.
585                  */
586
587                 rt = (struct rt6_info *) dst;
588                 if (rt->rt6i_flags & RTF_GATEWAY)
589                         target = &rt->rt6i_gateway;
590                 else
591                         target = &hdr->daddr;
592
593                 peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
594
595                 /* Limit redirects both by destination (here)
596                    and by source (inside ndisc_send_redirect)
597                  */
598                 if (inet_peer_xrlim_allow(peer, 1*HZ))
599                         ndisc_send_redirect(skb, target);
600                 if (peer)
601                         inet_putpeer(peer);
602         } else {
603                 int addrtype = ipv6_addr_type(&hdr->saddr);
604
605                 /* This check is security critical. */
606                 if (addrtype == IPV6_ADDR_ANY ||
607                     addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
608                         goto error;
609                 if (addrtype & IPV6_ADDR_LINKLOCAL) {
610                         icmpv6_send(skb, ICMPV6_DEST_UNREACH,
611                                     ICMPV6_NOT_NEIGHBOUR, 0);
612                         goto error;
613                 }
614         }
615
616         mtu = ip6_dst_mtu_forward(dst);
617         if (mtu < IPV6_MIN_MTU)
618                 mtu = IPV6_MIN_MTU;
619
620         if (ip6_pkt_too_big(skb, mtu)) {
621                 /* Again, force the OUTPUT device so its address is used as the source address */
622                 skb->dev = dst->dev;
623                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
624                 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
625                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
626                                 IPSTATS_MIB_FRAGFAILS);
627                 kfree_skb(skb);
628                 return -EMSGSIZE;
629         }
630
631         if (skb_cow(skb, dst->dev->hard_header_len)) {
632                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
633                                 IPSTATS_MIB_OUTDISCARDS);
634                 goto drop;
635         }
636
637         hdr = ipv6_hdr(skb);
638
639         /* Decrementing the hop limit is delayed until after the skb COW */
640
641         hdr->hop_limit--;
642
643         return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
644                        net, NULL, skb, skb->dev, dst->dev,
645                        ip6_forward_finish);
646
647 error:
648         __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
649 drop:
650         kfree_skb(skb);
651         return -EINVAL;
652 }
653
654 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
655 {
656         to->pkt_type = from->pkt_type;
657         to->priority = from->priority;
658         to->protocol = from->protocol;
659         skb_dst_drop(to);
660         skb_dst_set(to, dst_clone(skb_dst(from)));
661         to->dev = from->dev;
662         to->mark = from->mark;
663
664         skb_copy_hash(to, from);
665
666 #ifdef CONFIG_NET_SCHED
667         to->tc_index = from->tc_index;
668 #endif
669         nf_copy(to, from);
670         skb_ext_copy(to, from);
671         skb_copy_secmark(to, from);
672 }
673
674 int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
675                       u8 nexthdr, __be32 frag_id,
676                       struct ip6_fraglist_iter *iter)
677 {
678         unsigned int first_len;
679         struct frag_hdr *fh;
680
681         /* BUILD HEADER */
682         *prevhdr = NEXTHDR_FRAGMENT;
683         iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
684         if (!iter->tmp_hdr)
685                 return -ENOMEM;
686
687         iter->frag = skb_shinfo(skb)->frag_list;
688         skb_frag_list_init(skb);
689
690         iter->offset = 0;
691         iter->hlen = hlen;
692         iter->frag_id = frag_id;
693         iter->nexthdr = nexthdr;
694
695         __skb_pull(skb, hlen);
696         fh = __skb_push(skb, sizeof(struct frag_hdr));
697         __skb_push(skb, hlen);
698         skb_reset_network_header(skb);
699         memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);
700
701         fh->nexthdr = nexthdr;
702         fh->reserved = 0;
703         fh->frag_off = htons(IP6_MF);
704         fh->identification = frag_id;
705
706         first_len = skb_pagelen(skb);
707         skb->data_len = first_len - skb_headlen(skb);
708         skb->len = first_len;
709         ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));
710
711         return 0;
712 }
713 EXPORT_SYMBOL(ip6_fraglist_init);
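/*
 * Note on the fast path: ip6_fraglist_init() saves a copy of the header
 * block in iter->tmp_hdr, detaches the skb's frag_list into iter->frag, and
 * rewrites the head skb in place so that it becomes the first fragment
 * (fragment header inserted, payload_len recomputed from the page data that
 * remains in the head).
 */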
714
715 void ip6_fraglist_prepare(struct sk_buff *skb,
716                           struct ip6_fraglist_iter *iter)
717 {
718         struct sk_buff *frag = iter->frag;
719         unsigned int hlen = iter->hlen;
720         struct frag_hdr *fh;
721
722         frag->ip_summed = CHECKSUM_NONE;
723         skb_reset_transport_header(frag);
724         fh = __skb_push(frag, sizeof(struct frag_hdr));
725         __skb_push(frag, hlen);
726         skb_reset_network_header(frag);
727         memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
728         iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
729         fh->nexthdr = iter->nexthdr;
730         fh->reserved = 0;
731         fh->frag_off = htons(iter->offset);
732         if (frag->next)
733                 fh->frag_off |= htons(IP6_MF);
734         fh->identification = iter->frag_id;
735         ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
736         ip6_copy_metadata(frag, skb);
737 }
738 EXPORT_SYMBOL(ip6_fraglist_prepare);
739
740 void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
741                    unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
742                    u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
743 {
744         state->prevhdr = prevhdr;
745         state->nexthdr = nexthdr;
746         state->frag_id = frag_id;
747
748         state->hlen = hlen;
749         state->mtu = mtu;
750
751         state->left = skb->len - hlen;  /* Space per frame */
752         state->ptr = hlen;              /* Where to start from */
753
754         state->hroom = hdr_room;
755         state->troom = needed_tailroom;
756
757         state->offset = 0;
758 }
759 EXPORT_SYMBOL(ip6_frag_init);
760
761 struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
762 {
763         u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
764         struct sk_buff *frag;
765         struct frag_hdr *fh;
766         unsigned int len;
767
768         len = state->left;
769         /* IF: it doesn't fit, use 'mtu' - the data space left */
770         if (len > state->mtu)
771                 len = state->mtu;
772         /* IF: we are not sending up to and including the packet end,
773            then align the next start on an eight-byte boundary */
774         if (len < state->left)
775                 len &= ~7;
776
777         /* Allocate buffer */
778         frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
779                          state->hroom + state->troom, GFP_ATOMIC);
780         if (!frag)
781                 return ERR_PTR(-ENOMEM);
782
783         /*
784          *      Set up data on packet
785          */
786
787         ip6_copy_metadata(frag, skb);
788         skb_reserve(frag, state->hroom);
789         skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
790         skb_reset_network_header(frag);
791         fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
792         frag->transport_header = (frag->network_header + state->hlen +
793                                   sizeof(struct frag_hdr));
794
795         /*
796          *      Charge the memory for the fragment to any owner
797          *      it might possess
798          */
799         if (skb->sk)
800                 skb_set_owner_w(frag, skb->sk);
801
802         /*
803          *      Copy the packet header into the new buffer.
804          */
805         skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);
806
807         fragnexthdr_offset = skb_network_header(frag);
808         fragnexthdr_offset += prevhdr - skb_network_header(skb);
809         *fragnexthdr_offset = NEXTHDR_FRAGMENT;
810
811         /*
812          *      Build fragment header.
813          */
814         fh->nexthdr = state->nexthdr;
815         fh->reserved = 0;
816         fh->identification = state->frag_id;
817
818         /*
819          *      Copy a block of the IP datagram.
820          */
821         BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
822                              len));
823         state->left -= len;
824
825         fh->frag_off = htons(state->offset);
826         if (state->left > 0)
827                 fh->frag_off |= htons(IP6_MF);
828         ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
829
830         state->ptr += len;
831         state->offset += len;
832
833         return frag;
834 }
835 EXPORT_SYMBOL(ip6_frag_next);
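/*
 * A minimal sketch of the slow-path pattern the three helpers above
 * implement together; ip6_fragment() below does exactly this, adding
 * statistics and timestamping:
 *
 *	struct ip6_frag_state state;
 *	struct sk_buff *frag;
 *
 *	ip6_frag_init(skb, hlen, mtu, tailroom, hroom, prevhdr, nexthdr,
 *		      frag_id, &state);
 *	while (state.left > 0) {
 *		frag = ip6_frag_next(skb, &state);
 *		if (IS_ERR(frag))
 *			return PTR_ERR(frag);
 *		err = output(net, sk, frag);
 *		...
 *	}
 */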
836
837 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
838                  int (*output)(struct net *, struct sock *, struct sk_buff *))
839 {
840         struct sk_buff *frag;
841         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
842         struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
843                                 inet6_sk(skb->sk) : NULL;
844         struct ip6_frag_state state;
845         unsigned int mtu, hlen, nexthdr_offset;
846         ktime_t tstamp = skb->tstamp;
847         int hroom, err = 0;
848         __be32 frag_id;
849         u8 *prevhdr, nexthdr = 0;
850
851         err = ip6_find_1stfragopt(skb, &prevhdr);
852         if (err < 0)
853                 goto fail;
854         hlen = err;
855         nexthdr = *prevhdr;
856         nexthdr_offset = prevhdr - skb_network_header(skb);
857
858         mtu = ip6_skb_dst_mtu(skb);
859
860         /* We must not fragment if the socket is set to force MTU discovery
861          * or if the skb is not generated by a local socket.
862          */
863         if (unlikely(!skb->ignore_df && skb->len > mtu))
864                 goto fail_toobig;
865
866         if (IP6CB(skb)->frag_max_size) {
867                 if (IP6CB(skb)->frag_max_size > mtu)
868                         goto fail_toobig;
869
870                 /* don't send fragments larger than what we received */
871                 mtu = IP6CB(skb)->frag_max_size;
872                 if (mtu < IPV6_MIN_MTU)
873                         mtu = IPV6_MIN_MTU;
874         }
875
876         if (np && np->frag_size < mtu) {
877                 if (np->frag_size)
878                         mtu = np->frag_size;
879         }
880         if (mtu < hlen + sizeof(struct frag_hdr) + 8)
881                 goto fail_toobig;
882         mtu -= hlen + sizeof(struct frag_hdr);
883
884         frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
885                                     &ipv6_hdr(skb)->saddr);
886
887         if (skb->ip_summed == CHECKSUM_PARTIAL &&
888             (err = skb_checksum_help(skb)))
889                 goto fail;
890
891         prevhdr = skb_network_header(skb) + nexthdr_offset;
892         hroom = LL_RESERVED_SPACE(rt->dst.dev);
893         if (skb_has_frag_list(skb)) {
894                 unsigned int first_len = skb_pagelen(skb);
895                 struct ip6_fraglist_iter iter;
896                 struct sk_buff *frag2;
897
898                 if (first_len - hlen > mtu ||
899                     ((first_len - hlen) & 7) ||
900                     skb_cloned(skb) ||
901                     skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
902                         goto slow_path;
903
904                 skb_walk_frags(skb, frag) {
905                         /* Correct geometry. */
906                         if (frag->len > mtu ||
907                             ((frag->len & 7) && frag->next) ||
908                             skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
909                                 goto slow_path_clean;
910
911                         /* Partially cloned skb? */
912                         if (skb_shared(frag))
913                                 goto slow_path_clean;
914
915                         BUG_ON(frag->sk);
916                         if (skb->sk) {
917                                 frag->sk = skb->sk;
918                                 frag->destructor = sock_wfree;
919                         }
920                         skb->truesize -= frag->truesize;
921                 }
922
923                 err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
924                                         &iter);
925                 if (err < 0)
926                         goto fail;
927
928                 /* We prevent @rt from being freed. */
929                 rcu_read_lock();
930
931                 for (;;) {
932                         /* Prepare the header of the next frame
933                          * before the previous one goes down. */
934                         if (iter.frag)
935                                 ip6_fraglist_prepare(skb, &iter);
936
937                         skb->tstamp = tstamp;
938                         err = output(net, sk, skb);
939                         if (!err)
940                                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
941                                               IPSTATS_MIB_FRAGCREATES);
942
943                         if (err || !iter.frag)
944                                 break;
945
946                         skb = ip6_fraglist_next(&iter);
947                 }
948
949                 kfree(iter.tmp_hdr);
950
951                 if (err == 0) {
952                         IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
953                                       IPSTATS_MIB_FRAGOKS);
954                         rcu_read_unlock();
955                         return 0;
956                 }
957
958                 kfree_skb_list(iter.frag);
959
960                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
961                               IPSTATS_MIB_FRAGFAILS);
962                 rcu_read_unlock();
963                 return err;
964
965 slow_path_clean:
966                 skb_walk_frags(skb, frag2) {
967                         if (frag2 == frag)
968                                 break;
969                         frag2->sk = NULL;
970                         frag2->destructor = NULL;
971                         skb->truesize += frag2->truesize;
972                 }
973         }
974
975 slow_path:
976         /*
977          *      Fragment the datagram.
978          */
979
980         ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
981                       LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
982                       &state);
983
984         /*
985          *      Keep copying data until we run out.
986          */
987
988         while (state.left > 0) {
989                 frag = ip6_frag_next(skb, &state);
990                 if (IS_ERR(frag)) {
991                         err = PTR_ERR(frag);
992                         goto fail;
993                 }
994
995                 /*
996                  *      Put this fragment into the sending queue.
997                  */
998                 frag->tstamp = tstamp;
999                 err = output(net, sk, frag);
1000                 if (err)
1001                         goto fail;
1002
1003                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1004                               IPSTATS_MIB_FRAGCREATES);
1005         }
1006         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1007                       IPSTATS_MIB_FRAGOKS);
1008         consume_skb(skb);
1009         return err;
1010
1011 fail_toobig:
1012         if (skb->sk && dst_allfrag(skb_dst(skb)))
1013                 sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
1014
1015         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
1016         err = -EMSGSIZE;
1017
1018 fail:
1019         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1020                       IPSTATS_MIB_FRAGFAILS);
1021         kfree_skb(skb);
1022         return err;
1023 }
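/*
 * Worked example of the MTU arithmetic above: with a plain IPv6 header
 * (hlen = 40) and mtu = 1500, "mtu -= hlen + sizeof(struct frag_hdr)"
 * leaves 1452 bytes of payload space per fragment; ip6_frag_next() then
 * rounds every non-final fragment down to an 8-byte boundary, so each
 * carries 1448 bytes of the original payload and only the last fragment
 * may be shorter.
 */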
1024
1025 static inline int ip6_rt_check(const struct rt6key *rt_key,
1026                                const struct in6_addr *fl_addr,
1027                                const struct in6_addr *addr_cache)
1028 {
1029         return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
1030                 (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
1031 }
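/*
 * In other words, ip6_rt_check() returns nonzero when the cached route can
 * no longer be trusted for fl_addr: the /128 host-route key does not match
 * the flow's address, and neither does the socket's cached last-used
 * address (addr_cache), if any.
 */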
1032
1033 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
1034                                           struct dst_entry *dst,
1035                                           const struct flowi6 *fl6)
1036 {
1037         struct ipv6_pinfo *np = inet6_sk(sk);
1038         struct rt6_info *rt;
1039
1040         if (!dst)
1041                 goto out;
1042
1043         if (dst->ops->family != AF_INET6) {
1044                 dst_release(dst);
1045                 return NULL;
1046         }
1047
1048         rt = (struct rt6_info *)dst;
1049         /* Yes, checking route validity in the not-connected
1050          * case is not very simple. Take into account
1051          * that we do not support routing by source, TOS,
1052          * or MSG_DONTROUTE            --ANK (980726)
1053          *
1054          * 1. ip6_rt_check(): If the route was a host route,
1055          *    check that the cached destination is current.
1056          *    If it is a network route, we may still
1057          *    check its validity using a saved pointer
1058          *    to the last used address: daddr_cache.
1059          *    We do not want to save the whole address now
1060          *    (because the main consumer of this service
1061          *    is TCP, which does not have this problem),
1062          *    so this last trick works only on connected
1063          *    sockets.
1064          * 2. The oif should also be the same.
1065          */
1066         if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
1067 #ifdef CONFIG_IPV6_SUBTREES
1068             ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
1069 #endif
1070            (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
1071               (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
1072                 dst_release(dst);
1073                 dst = NULL;
1074         }
1075
1076 out:
1077         return dst;
1078 }
1079
1080 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
1081                                struct dst_entry **dst, struct flowi6 *fl6)
1082 {
1083 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1084         struct neighbour *n;
1085         struct rt6_info *rt;
1086 #endif
1087         int err;
1088         int flags = 0;
1089
1090         /* The correct way to handle this would be to do
1091          * ip6_route_get_saddr, and then ip6_route_output; however,
1092          * the route-specific preferred source forces the
1093          * ip6_route_output call _before_ ip6_route_get_saddr.
1094          *
1095          * In source specific routing (no src=any default route),
1096          * ip6_route_output will fail given src=any saddr, though, so
1097          * that's why we try it again later.
1098          */
1099         if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
1100                 struct fib6_info *from;
1101                 struct rt6_info *rt;
1102                 bool had_dst = *dst != NULL;
1103
1104                 if (!had_dst)
1105                         *dst = ip6_route_output(net, sk, fl6);
1106                 rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
1107
1108                 rcu_read_lock();
1109                 from = rt ? rcu_dereference(rt->from) : NULL;
1110                 err = ip6_route_get_saddr(net, from, &fl6->daddr,
1111                                           sk ? inet6_sk(sk)->srcprefs : 0,
1112                                           &fl6->saddr);
1113                 rcu_read_unlock();
1114
1115                 if (err)
1116                         goto out_err_release;
1117
1118                 /* If we had an erroneous initial result, pretend it
1119                  * never existed and let the SA-enabled version take
1120                  * over.
1121                  */
1122                 if (!had_dst && (*dst)->error) {
1123                         dst_release(*dst);
1124                         *dst = NULL;
1125                 }
1126
1127                 if (fl6->flowi6_oif)
1128                         flags |= RT6_LOOKUP_F_IFACE;
1129         }
1130
1131         if (!*dst)
1132                 *dst = ip6_route_output_flags(net, sk, fl6, flags);
1133
1134         err = (*dst)->error;
1135         if (err)
1136                 goto out_err_release;
1137
1138 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1139         /*
1140          * Here if the dst entry we've looked up
1141          * has a neighbour entry that is in the INCOMPLETE
1142          * state and the src address from the flow is
1143          * marked as OPTIMISTIC, we release the found
1144          * dst entry and replace it with the
1145          * dst entry of the nexthop router.
1146          */
1147         rt = (struct rt6_info *) *dst;
1148         rcu_read_lock_bh();
1149         n = __ipv6_neigh_lookup_noref(rt->dst.dev,
1150                                       rt6_nexthop(rt, &fl6->daddr));
1151         err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
1152         rcu_read_unlock_bh();
1153
1154         if (err) {
1155                 struct inet6_ifaddr *ifp;
1156                 struct flowi6 fl_gw6;
1157                 int redirect;
1158
1159                 ifp = ipv6_get_ifaddr(net, &fl6->saddr,
1160                                       (*dst)->dev, 1);
1161
1162                 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
1163                 if (ifp)
1164                         in6_ifa_put(ifp);
1165
1166                 if (redirect) {
1167                         /*
1168                          * We need to get the dst entry for the
1169                          * default router instead
1170                          */
1171                         dst_release(*dst);
1172                         memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1173                         memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1174                         *dst = ip6_route_output(net, sk, &fl_gw6);
1175                         err = (*dst)->error;
1176                         if (err)
1177                                 goto out_err_release;
1178                 }
1179         }
1180 #endif
1181         if (ipv6_addr_v4mapped(&fl6->saddr) &&
1182             !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
1183                 err = -EAFNOSUPPORT;
1184                 goto out_err_release;
1185         }
1186
1187         return 0;
1188
1189 out_err_release:
1190         dst_release(*dst);
1191         *dst = NULL;
1192
1193         if (err == -ENETUNREACH)
1194                 IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1195         return err;
1196 }
1197
1198 /**
1199  *      ip6_dst_lookup - perform route lookup on flow
1200  *      @net: Network namespace to perform lookup in
1201  *      @sk: socket which provides route info
1202  *      @dst: pointer to dst_entry * for result
1203  *      @fl6: flow to lookup
1204  *
1205  *      This function performs a route lookup on the given flow.
1206  *
1207  *      It returns zero on success, or a standard errno code on error.
1208  */
1209 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1210                    struct flowi6 *fl6)
1211 {
1212         *dst = NULL;
1213         return ip6_dst_lookup_tail(net, sk, dst, fl6);
1214 }
1215 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
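/*
 * A minimal, hypothetical usage sketch (the flowi6 is assumed to be filled
 * in by the caller):
 *
 *	struct dst_entry *dst;
 *	int err = ip6_dst_lookup(net, sk, &dst, &fl6);
 *
 *	if (err)
 *		return err;	(dst is left NULL on failure)
 *	skb_dst_set(skb, dst);
 *
 * Note this variant performs no xfrm (IPsec) lookup; use
 * ip6_dst_lookup_flow() below when policy resolution is also needed.
 */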
1216
1217 /**
1218  *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1219  *      @net: Network namespace to perform lookup in
1220  *      @sk: socket which provides route info
1221  *      @fl6: flow to lookup
1222  *      @final_dst: final destination address for ipsec lookup
1223  *
1224  *      This function performs a route lookup on the given flow.
1225  *
1226  *      It returns a valid dst pointer on success, or a pointer encoded
1227  *      error code.
1228  */
1229 struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
1230                                       const struct in6_addr *final_dst)
1231 {
1232         struct dst_entry *dst = NULL;
1233         int err;
1234
1235         err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
1236         if (err)
1237                 return ERR_PTR(err);
1238         if (final_dst)
1239                 fl6->daddr = *final_dst;
1240
1241         return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
1242 }
1243 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
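/*
 * Unlike ip6_dst_lookup(), failure is reported through the returned pointer
 * itself, so a caller checks with IS_ERR(); a minimal sketch:
 *
 *	dst = ip6_dst_lookup_flow(net, sk, &fl6, final_p);
 *	if (IS_ERR(dst))
 *		return PTR_ERR(dst);
 *
 * A non-NULL final_p overwrites fl6.daddr before the xfrm route lookup, as
 * the code above shows.
 */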
1244
1245 /**
1246  *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1247  *      @sk: socket which provides the dst cache and route info
1248  *      @fl6: flow to lookup
1249  *      @final_dst: final destination address for ipsec lookup
1250  *      @connected: whether @sk is connected or not
1251  *
1252  *      This function performs a route lookup on the given flow with the
1253  *      possibility of using the cached route in the socket if it is valid.
1254  *      It will take the socket dst lock when operating on the dst cache.
1255  *      As a result, this function can only be used in process context.
1256  *
1257  *      In addition, for a connected socket, cache the dst in the socket
1258  *      if the current cache is not valid.
1259  *
1260  *      It returns a valid dst pointer on success, or a pointer encoded
1261  *      error code.
1262  */
1263 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1264                                          const struct in6_addr *final_dst,
1265                                          bool connected)
1266 {
1267         struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1268
1269         dst = ip6_sk_dst_check(sk, dst, fl6);
1270         if (dst)
1271                 return dst;
1272
1273         dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
1274         if (connected && !IS_ERR(dst))
1275                 ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);
1276
1277         return dst;
1278 }
1279 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1280
1281 /**
1282  *      ip6_dst_lookup_tunnel - perform route lookup on tunnel
1283  *      @skb: Packet for which lookup is done
1284  *      @dev: Tunnel device
1285  *      @net: Network namespace of tunnel device
1286  *      @sock: Socket which provides route info
1287  *      @saddr: Memory to store the src ip address
1288  *      @info: Tunnel information
1289  *      @protocol: IP protocol
1290  *      @use_cache: Flag to enable cache usage
1291  *      This function performs a route lookup on a tunnel
1292  *
1293  *      It returns a valid dst pointer and stores src address to be used in
1294  *      tunnel in param saddr on success, else a pointer encoded error code.
1295  */
1296
1297 struct dst_entry *ip6_dst_lookup_tunnel(struct sk_buff *skb,
1298                                         struct net_device *dev,
1299                                         struct net *net,
1300                                         struct socket *sock,
1301                                         struct in6_addr *saddr,
1302                                         const struct ip_tunnel_info *info,
1303                                         u8 protocol,
1304                                         bool use_cache)
1305 {
1306         struct dst_entry *dst = NULL;
1307 #ifdef CONFIG_DST_CACHE
1308         struct dst_cache *dst_cache;
1309 #endif
1310         struct flowi6 fl6;
1311         __u8 prio;
1312
1313 #ifdef CONFIG_DST_CACHE
1314         dst_cache = (struct dst_cache *)&info->dst_cache;
1315         if (use_cache) {
1316                 dst = dst_cache_get_ip6(dst_cache, saddr);
1317                 if (dst)
1318                         return dst;
1319         }
1320 #endif
1321         memset(&fl6, 0, sizeof(fl6));
1322         fl6.flowi6_mark = skb->mark;
1323         fl6.flowi6_proto = protocol;
1324         fl6.daddr = info->key.u.ipv6.dst;
1325         fl6.saddr = info->key.u.ipv6.src;
1326         prio = info->key.tos;
1327         fl6.flowlabel = ip6_make_flowinfo(prio, info->key.label);
1328
1329         dst = ipv6_stub->ipv6_dst_lookup_flow(net, sock->sk, &fl6,
1330                                               NULL);
1331         if (IS_ERR(dst)) {
1332                 netdev_dbg(dev, "no route to %pI6\n", &fl6.daddr);
1333                 return ERR_PTR(-ENETUNREACH);
1334         }
1335         if (dst->dev == dev) { /* is this necessary? */
1336                 netdev_dbg(dev, "circular route to %pI6\n", &fl6.daddr);
1337                 dst_release(dst);
1338                 return ERR_PTR(-ELOOP);
1339         }
1340 #ifdef CONFIG_DST_CACHE
1341         if (use_cache)
1342                 dst_cache_set_ip6(dst_cache, dst, &fl6.saddr);
1343 #endif
1344         *saddr = fl6.saddr;
1345         return dst;
1346 }
1347 EXPORT_SYMBOL_GPL(ip6_dst_lookup_tunnel);
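/*
 * A minimal, hypothetical sketch of a tunnel transmit path using the
 * helper above (names are illustrative, error handling elided):
 *
 *	struct in6_addr saddr;
 *	struct dst_entry *ndst;
 *
 *	ndst = ip6_dst_lookup_tunnel(skb, dev, tun_net, tun_sock, &saddr,
 *				     info, IPPROTO_UDP, use_cache);
 *	if (IS_ERR(ndst))
 *		goto tx_error;
 *	(transmit with "saddr" as the outer IPv6 source address)
 *
 * where use_cache would typically come from ip_tunnel_dst_cache_usable(),
 * i.e. caching only flows without per-packet key variation.
 */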
1348
1349 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1350                                                gfp_t gfp)
1351 {
1352         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1353 }
1354
1355 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1356                                                 gfp_t gfp)
1357 {
1358         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1359 }
1360
1361 static void ip6_append_data_mtu(unsigned int *mtu,
1362                                 int *maxfraglen,
1363                                 unsigned int fragheaderlen,
1364                                 struct sk_buff *skb,
1365                                 struct rt6_info *rt,
1366                                 unsigned int orig_mtu)
1367 {
1368         if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1369                 if (!skb) {
1370                         /* first fragment, reserve header_len */
1371                         *mtu = orig_mtu - rt->dst.header_len;
1372
1373                 } else {
1374                         /*
1375                          * this fragment is not the first; the header
1376                          * space is regarded as data space.
1377                          */
1378                         *mtu = orig_mtu;
1379                 }
1380                 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1381                               + fragheaderlen - sizeof(struct frag_hdr);
1382         }
1383 }
1384
1385 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1386                           struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1387                           struct rt6_info *rt, struct flowi6 *fl6)
1388 {
1389         struct ipv6_pinfo *np = inet6_sk(sk);
1390         unsigned int mtu;
1391         struct ipv6_txoptions *opt = ipc6->opt;
1392
1393         /*
1394          * setup for corking
1395          */
1396         if (opt) {
1397                 if (WARN_ON(v6_cork->opt))
1398                         return -EINVAL;
1399
1400                 v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
1401                 if (unlikely(!v6_cork->opt))
1402                         return -ENOBUFS;
1403
1404                 v6_cork->opt->tot_len = sizeof(*opt);
1405                 v6_cork->opt->opt_flen = opt->opt_flen;
1406                 v6_cork->opt->opt_nflen = opt->opt_nflen;
1407
1408                 v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1409                                                     sk->sk_allocation);
1410                 if (opt->dst0opt && !v6_cork->opt->dst0opt)
1411                         return -ENOBUFS;
1412
1413                 v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1414                                                     sk->sk_allocation);
1415                 if (opt->dst1opt && !v6_cork->opt->dst1opt)
1416                         return -ENOBUFS;
1417
1418                 v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
1419                                                    sk->sk_allocation);
1420                 if (opt->hopopt && !v6_cork->opt->hopopt)
1421                         return -ENOBUFS;
1422
1423                 v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1424                                                     sk->sk_allocation);
1425                 if (opt->srcrt && !v6_cork->opt->srcrt)
1426                         return -ENOBUFS;
1427
1428                 /* need source address above --miyazawa */
1429         }
1430         dst_hold(&rt->dst);
1431         cork->base.dst = &rt->dst;
1432         cork->fl.u.ip6 = *fl6;
1433         v6_cork->hop_limit = ipc6->hlimit;
1434         v6_cork->tclass = ipc6->tclass;
        if (rt->dst.flags & DST_XFRM_TUNNEL)
                mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
                      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
        else
                mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
                        READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
        if (np->frag_size < mtu) {
                if (np->frag_size)
                        mtu = np->frag_size;
        }
        cork->base.fragsize = mtu;
        cork->base.gso_size = ipc6->gso_size;
        cork->base.tx_flags = 0;
        cork->base.mark = ipc6->sockc.mark;
        sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);

        if (dst_allfrag(xfrm_dst_path(&rt->dst)))
                cork->base.flags |= IPCORK_ALLFRAG;
        cork->base.length = 0;

        cork->base.transmit_time = ipc6->sockc.transmit_time;

        return 0;
}

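/*
 * __ip6_append_data - append user data to the queue of pending skbs
 *
 * Either tack the data onto the tail skb of @queue or, when that packet
 * is full (or on the first call), allocate a new skb sized to the mtu.
 * Every non-final fragment is cut back to maxfraglen (an 8-byte aligned
 * length below the mtu, minus the fragment header) so that ip6_fragment()
 * can later split the chain without copying.
 */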
static int __ip6_append_data(struct sock *sk,
                             struct flowi6 *fl6,
                             struct sk_buff_head *queue,
                             struct inet_cork *cork,
                             struct inet6_cork *v6_cork,
                             struct page_frag *pfrag,
                             int getfrag(void *from, char *to, int offset,
                                         int len, int odd, struct sk_buff *skb),
                             void *from, int length, int transhdrlen,
                             unsigned int flags, struct ipcm6_cookie *ipc6)
{
        struct sk_buff *skb, *skb_prev = NULL;
        unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
        struct ubuf_info *uarg = NULL;
        int exthdrlen = 0;
        int dst_exthdrlen = 0;
        int hh_len;
        int copy;
        int err;
        int offset = 0;
        u32 tskey = 0;
        struct rt6_info *rt = (struct rt6_info *)cork->dst;
        struct ipv6_txoptions *opt = v6_cork->opt;
        int csummode = CHECKSUM_NONE;
        unsigned int maxnonfragsize, headersize;
        unsigned int wmem_alloc_delta = 0;
        bool paged, extra_uref = false;

        skb = skb_peek_tail(queue);
        if (!skb) {
                exthdrlen = opt ? opt->opt_flen : 0;
                dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
        }

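        /*
         * With GSO we build one oversized skb (up to IP6_MAX_MTU) and let
         * segmentation cut it up on transmit, so payload beyond the headers
         * goes into page frags rather than linear data.
         */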
        paged = !!cork->gso_size;
        mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
        orig_mtu = mtu;

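        /*
         * With SOF_TIMESTAMPING_OPT_ID, tag the skb with the socket's
         * running counter so tx timestamps can be matched to sends.
         */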
        if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
            sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
                tskey = sk->sk_tskey++;

        hh_len = LL_RESERVED_SPACE(rt->dst.dev);

        fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
                        (opt ? opt->opt_nflen : 0);

        headersize = sizeof(struct ipv6hdr) +
                     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
                     (dst_allfrag(&rt->dst) ?
                      sizeof(struct frag_hdr) : 0) +
                     rt->rt6i_nfheader_len;

        if (mtu <= fragheaderlen ||
            ((mtu - fragheaderlen) & ~7) + fragheaderlen <= sizeof(struct frag_hdr))
                goto emsgsize;

        maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
                     sizeof(struct frag_hdr);

        /* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
         * within the first fragment
         */
        if (headersize + transhdrlen > mtu)
                goto emsgsize;

        if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
            (sk->sk_protocol == IPPROTO_UDP ||
             sk->sk_protocol == IPPROTO_RAW)) {
                ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
                                sizeof(struct ipv6hdr));
                goto emsgsize;
        }

        if (ip6_sk_ignore_df(sk))
                maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
        else
                maxnonfragsize = mtu;

        if (cork->length + length > maxnonfragsize - headersize) {
emsgsize:
                pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
                ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
                return -EMSGSIZE;
        }

        /* CHECKSUM_PARTIAL only with no extension headers and when
         * we are not going to fragment
         */
        if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
            headersize == sizeof(struct ipv6hdr) &&
            length <= mtu - headersize &&
            (!(flags & MSG_MORE) || cork->gso_size) &&
            rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
                csummode = CHECKSUM_PARTIAL;

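        /*
         * MSG_ZEROCOPY: pin the user pages instead of copying.  If the
         * device cannot take scatter-gather with checksum offload, fall
         * back to copying, but keep the uarg so completion notifications
         * still fire.
         */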
        if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
                uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb));
                if (!uarg)
                        return -ENOBUFS;
                extra_uref = !skb_zcopy(skb);   /* only ref on new uarg */
                if (rt->dst.dev->features & NETIF_F_SG &&
                    csummode == CHECKSUM_PARTIAL) {
                        paged = true;
                } else {
                        uarg->zerocopy = 0;
                        skb_zcopy_set(skb, uarg, &extra_uref);
                }
        }

        /*
         * Let's try using as much space as possible.
         * Use MTU if total length of the message fits into the MTU.
         * Otherwise, we need to reserve fragment header and
         * fragment alignment (= 8-15 octets, in total).
         *
         * Note that we may need to "move" the data from the tail
         * of the buffer to the new fragment when we split
         * the message.
         *
         * FIXME: It may be fragmented into multiple chunks
         *        at once if non-fragmentable extension headers
         *        are too large.
         * --yoshfuji
         */

        cork->length += length;
        if (!skb)
                goto alloc_new_skb;

        while (length > 0) {
                /* Check if the remaining data fits into current packet. */
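                /* If everything queued so far still fits in one packet we
                 * may fill up to the full mtu; otherwise every fragment has
                 * to leave room for a fragment header (maxfraglen).
                 */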
                copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
                if (copy < length)
                        copy = maxfraglen - skb->len;

                if (copy <= 0) {
                        char *data;
                        unsigned int datalen;
                        unsigned int fraglen;
                        unsigned int fraggap;
                        unsigned int alloclen, alloc_extra;
                        unsigned int pagedlen;
alloc_new_skb:
                        /* There's no room in the current skb */
                        if (skb)
                                fraggap = skb->len - maxfraglen;
                        else
                                fraggap = 0;
                        /* update mtu and maxfraglen if necessary */
                        if (!skb || !skb_prev)
                                ip6_append_data_mtu(&mtu, &maxfraglen,
                                                    fragheaderlen, skb, rt,
                                                    orig_mtu);

                        skb_prev = skb;

                        /*
                         * If remaining data exceeds the mtu,
                         * we know we need more fragment(s).
                         */
                        datalen = length + fraggap;

                        if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
                                datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
                        fraglen = datalen + fragheaderlen;
                        pagedlen = 0;

                        alloc_extra = hh_len;
                        alloc_extra += dst_exthdrlen;
                        alloc_extra += rt->dst.trailer_len;

                        /* We just reserve space for fragment header.
                         * Note: this may be overallocation if the message
                         * (without MSG_MORE) fits into the MTU.
                         */
                        alloc_extra += sizeof(struct frag_hdr);

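                        /* Three allocation strategies: MSG_MORE without
                         * scatter-gather allocates a full mtu so later
                         * appends can fill the tail; small frames (or no
                         * SG) get the whole fragment linearly; otherwise
                         * only the headers are linear and the payload goes
                         * into page frags (pagedlen).
                         */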
                        if ((flags & MSG_MORE) &&
                            !(rt->dst.dev->features&NETIF_F_SG))
                                alloclen = mtu;
                        else if (!paged &&
                                 (fraglen + alloc_extra < SKB_MAX_ALLOC ||
                                  !(rt->dst.dev->features & NETIF_F_SG)))
                                alloclen = fraglen;
                        else {
                                alloclen = min_t(int, fraglen, MAX_HEADER);
                                pagedlen = fraglen - alloclen;
                        }
                        alloclen += alloc_extra;

                        if (datalen != length + fraggap) {
                                /*
                                 * this is not the last fragment, the trailer
                                 * space is regarded as data space.
                                 */
                                datalen += rt->dst.trailer_len;
                        }

                        fraglen = datalen + fragheaderlen;

                        copy = datalen - transhdrlen - fraggap - pagedlen;
                        if (copy < 0) {
                                err = -EINVAL;
                                goto error;
                        }
                        if (transhdrlen) {
                                skb = sock_alloc_send_skb(sk, alloclen,
                                                (flags & MSG_DONTWAIT), &err);
                        } else {
                                skb = NULL;
                                if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
                                    2 * sk->sk_sndbuf)
                                        skb = alloc_skb(alloclen,
                                                        sk->sk_allocation);
                                if (unlikely(!skb))
                                        err = -ENOBUFS;
                        }
                        if (!skb)
                                goto error;
                        /*
                         *      Fill in the control structures
                         */
                        skb->protocol = htons(ETH_P_IPV6);
                        skb->ip_summed = csummode;
                        skb->csum = 0;
                        /* reserve for fragmentation and ipsec header */
                        skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
                                    dst_exthdrlen);

                        /*
                         *      Find where to start putting bytes
                         */
                        data = skb_put(skb, fraglen - pagedlen);
                        skb_set_network_header(skb, exthdrlen);
                        data += fragheaderlen;
                        skb->transport_header = (skb->network_header +
                                                 fragheaderlen);
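                        /* If the previous skb overshot maxfraglen, move the
                         * overhang into this fragment and fix up the old
                         * skb's checksum and length.
                         */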
                        if (fraggap) {
                                skb->csum = skb_copy_and_csum_bits(
                                        skb_prev, maxfraglen,
                                        data + transhdrlen, fraggap);
                                skb_prev->csum = csum_sub(skb_prev->csum,
                                                          skb->csum);
                                data += fraggap;
                                pskb_trim_unique(skb_prev, maxfraglen);
                        }
                        if (copy > 0 &&
                            getfrag(from, data + transhdrlen, offset,
                                    copy, fraggap, skb) < 0) {
                                err = -EFAULT;
                                kfree_skb(skb);
                                goto error;
                        }

                        offset += copy;
                        length -= copy + transhdrlen;
                        transhdrlen = 0;
                        exthdrlen = 0;
                        dst_exthdrlen = 0;

                        /* Only the initial fragment is timestamped */
                        skb_shinfo(skb)->tx_flags = cork->tx_flags;
                        cork->tx_flags = 0;
                        skb_shinfo(skb)->tskey = tskey;
                        tskey = 0;
                        skb_zcopy_set(skb, uarg, &extra_uref);

                        if ((flags & MSG_CONFIRM) && !skb_prev)
                                skb_set_dst_pending_confirm(skb, 1);

                        /*
                         * Put the packet on the pending queue
                         */
                        if (!skb->destructor) {
                                skb->destructor = sock_wfree;
                                skb->sk = sk;
                                wmem_alloc_delta += skb->truesize;
                        }
                        __skb_queue_tail(queue, skb);
                        continue;
                }

                if (copy > length)
                        copy = length;

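                /* Room left in the current skb: copy into the linear tail
                 * when the device cannot do scatter-gather, into the
                 * socket's page frag otherwise, or hand the bytes to the
                 * zerocopy iterator.
                 */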
                if (!(rt->dst.dev->features&NETIF_F_SG) &&
                    skb_tailroom(skb) >= copy) {
                        unsigned int off;

                        off = skb->len;
                        if (getfrag(from, skb_put(skb, copy),
                                                offset, copy, off, skb) < 0) {
                                __skb_trim(skb, off);
                                err = -EFAULT;
                                goto error;
                        }
                } else if (!uarg || !uarg->zerocopy) {
                        int i = skb_shinfo(skb)->nr_frags;

                        err = -ENOMEM;
                        if (!sk_page_frag_refill(sk, pfrag))
                                goto error;

                        if (!skb_can_coalesce(skb, i, pfrag->page,
                                              pfrag->offset)) {
                                err = -EMSGSIZE;
                                if (i == MAX_SKB_FRAGS)
                                        goto error;

                                __skb_fill_page_desc(skb, i, pfrag->page,
                                                     pfrag->offset, 0);
                                skb_shinfo(skb)->nr_frags = ++i;
                                get_page(pfrag->page);
                        }
                        copy = min_t(int, copy, pfrag->size - pfrag->offset);
                        if (getfrag(from,
                                    page_address(pfrag->page) + pfrag->offset,
                                    offset, copy, skb->len, skb) < 0)
                                goto error_efault;

                        pfrag->offset += copy;
                        skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
                        skb->len += copy;
                        skb->data_len += copy;
                        skb->truesize += copy;
                        wmem_alloc_delta += copy;
                } else {
                        err = skb_zerocopy_iter_dgram(skb, from, copy);
                        if (err < 0)
                                goto error;
                }
                offset += copy;
                length -= copy;
        }

        if (wmem_alloc_delta)
                refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
        return 0;

error_efault:
        err = -EFAULT;
error:
        if (uarg)
                sock_zerocopy_put_abort(uarg, extra_uref);
        cork->length -= length;
        IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
        refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
        return err;
}

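/*
 * ip6_append_data - queue data on the socket's write queue
 *
 * On the first call (empty write queue) set up the cork from @ipc6 and
 * @rt; later calls reuse the corked flow and append only payload, so
 * the transport header length is counted just once.
 */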
int ip6_append_data(struct sock *sk,
                    int getfrag(void *from, char *to, int offset, int len,
                                int odd, struct sk_buff *skb),
                    void *from, int length, int transhdrlen,
                    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
                    struct rt6_info *rt, unsigned int flags)
{
        struct inet_sock *inet = inet_sk(sk);
        struct ipv6_pinfo *np = inet6_sk(sk);
        int exthdrlen;
        int err;

        if (flags&MSG_PROBE)
                return 0;
        if (skb_queue_empty(&sk->sk_write_queue)) {
                /*
                 * setup for corking
                 */
                err = ip6_setup_cork(sk, &inet->cork, &np->cork,
                                     ipc6, rt, fl6);
                if (err)
                        return err;

                exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
                length += exthdrlen;
                transhdrlen += exthdrlen;
        } else {
                fl6 = &inet->cork.fl.u.ip6;
                transhdrlen = 0;
        }

        return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
                                 &np->cork, sk_page_frag(sk), getfrag,
                                 from, length, transhdrlen, flags, ipc6);
}
EXPORT_SYMBOL_GPL(ip6_append_data);

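/*
 * ip6_cork_release - undo ip6_setup_cork
 *
 * Free the duplicated tx options, drop the route reference and clear
 * the cached flow so the cork can be reused.
 */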
static void ip6_cork_release(struct inet_cork_full *cork,
                             struct inet6_cork *v6_cork)
{
        if (v6_cork->opt) {
                kfree(v6_cork->opt->dst0opt);
                kfree(v6_cork->opt->dst1opt);
                kfree(v6_cork->opt->hopopt);
                kfree(v6_cork->opt->srcrt);
                kfree(v6_cork->opt);
                v6_cork->opt = NULL;
        }

        if (cork->base.dst) {
                dst_release(cork->base.dst);
                cork->base.dst = NULL;
                cork->base.flags &= ~IPCORK_ALLFRAG;
        }
        memset(&cork->fl, 0, sizeof(cork->fl));
}

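/*
 * __ip6_make_skb - turn the queue of pending skbs into one packet
 *
 * Chain the queued skbs onto the head skb's frag_list, push any
 * extension headers and the IPv6 header, and release the cork.  The
 * resulting skb is ready for ip6_send_skb().
 */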
struct sk_buff *__ip6_make_skb(struct sock *sk,
                               struct sk_buff_head *queue,
                               struct inet_cork_full *cork,
                               struct inet6_cork *v6_cork)
{
        struct sk_buff *skb, *tmp_skb;
        struct sk_buff **tail_skb;
        struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct net *net = sock_net(sk);
        struct ipv6hdr *hdr;
        struct ipv6_txoptions *opt = v6_cork->opt;
        struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
        struct flowi6 *fl6 = &cork->fl.u.ip6;
        unsigned char proto = fl6->flowi6_proto;

        skb = __skb_dequeue(queue);
        if (!skb)
                goto out;
        tail_skb = &(skb_shinfo(skb)->frag_list);

        /* move skb->data up to the ip header, past the ext header space */
        if (skb->data < skb_network_header(skb))
                __skb_pull(skb, skb_network_offset(skb));
        while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
                __skb_pull(tmp_skb, skb_network_header_len(skb));
                *tail_skb = tmp_skb;
                tail_skb = &(tmp_skb->next);
                skb->len += tmp_skb->len;
                skb->data_len += tmp_skb->len;
                skb->truesize += tmp_skb->truesize;
                tmp_skb->destructor = NULL;
                tmp_skb->sk = NULL;
        }

        /* Allow local fragmentation. */
        skb->ignore_df = ip6_sk_ignore_df(sk);

        *final_dst = fl6->daddr;
        __skb_pull(skb, skb_network_header_len(skb));
        if (opt && opt->opt_flen)
                ipv6_push_frag_opts(skb, opt, &proto);
        if (opt && opt->opt_nflen)
                ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);

        skb_push(skb, sizeof(struct ipv6hdr));
        skb_reset_network_header(skb);
        hdr = ipv6_hdr(skb);

        ip6_flow_hdr(hdr, v6_cork->tclass,
                     ip6_make_flowlabel(net, skb, fl6->flowlabel,
                                        ip6_autoflowlabel(net, np), fl6));
        hdr->hop_limit = v6_cork->hop_limit;
        hdr->nexthdr = proto;
        hdr->saddr = fl6->saddr;
        hdr->daddr = *final_dst;

        skb->priority = sk->sk_priority;
        skb->mark = cork->base.mark;

        skb->tstamp = cork->base.transmit_time;

        skb_dst_set(skb, dst_clone(&rt->dst));
        IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
        if (proto == IPPROTO_ICMPV6) {
                struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
                u8 icmp6_type;

                if (sk->sk_socket->type == SOCK_RAW && !inet_sk(sk)->hdrincl)
                        icmp6_type = fl6->fl6_icmp_type;
                else
                        icmp6_type = icmp6_hdr(skb)->icmp6_type;
                ICMP6MSGOUT_INC_STATS(net, idev, icmp6_type);
                ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
        }

        ip6_cork_release(cork, v6_cork);
out:
        return skb;
}

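/*
 * ip6_send_skb - hand a finished packet to the output path
 *
 * Map positive net_xmit codes to errnos and count any failure as an
 * output discard.
 */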
int ip6_send_skb(struct sk_buff *skb)
{
        struct net *net = sock_net(skb->sk);
        struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
        int err;

        err = ip6_local_out(net, skb->sk, skb);
        if (err) {
                if (err > 0)
                        err = net_xmit_errno(err);
                if (err)
                        IP6_INC_STATS(net, rt->rt6i_idev,
                                      IPSTATS_MIB_OUTDISCARDS);
        }

        return err;
}

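/*
 * ip6_push_pending_frames - build and send everything queued so far
 */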
int ip6_push_pending_frames(struct sock *sk)
{
        struct sk_buff *skb;

        skb = ip6_finish_skb(sk);
        if (!skb)
                return 0;

        return ip6_send_skb(skb);
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);

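/*
 * __ip6_flush_pending_frames - drop everything queued on @queue
 *
 * Each queued skb is counted as an output discard, then the cork is
 * released.  Used on error paths to throw away a partially built
 * message.
 */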
static void __ip6_flush_pending_frames(struct sock *sk,
                                       struct sk_buff_head *queue,
                                       struct inet_cork_full *cork,
                                       struct inet6_cork *v6_cork)
{
        struct sk_buff *skb;

        while ((skb = __skb_dequeue_tail(queue)) != NULL) {
                if (skb_dst(skb))
                        IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
                                      IPSTATS_MIB_OUTDISCARDS);
                kfree_skb(skb);
        }

        ip6_cork_release(cork, v6_cork);
}

void ip6_flush_pending_frames(struct sock *sk)
{
        __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
                                   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
}
EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);

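/*
 * ip6_make_skb - single-shot variant of append + make
 *
 * Uses a private queue and the caller's cork instead of the socket's
 * write queue, so the whole message is built in one call without
 * corking the socket.
 */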
struct sk_buff *ip6_make_skb(struct sock *sk,
                             int getfrag(void *from, char *to, int offset,
                                         int len, int odd, struct sk_buff *skb),
                             void *from, int length, int transhdrlen,
                             struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
                             struct rt6_info *rt, unsigned int flags,
                             struct inet_cork_full *cork)
{
        struct inet6_cork v6_cork;
        struct sk_buff_head queue;
        int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
        int err;

        if (flags & MSG_PROBE)
                return NULL;

        __skb_queue_head_init(&queue);

        cork->base.flags = 0;
        cork->base.addr = 0;
        cork->base.opt = NULL;
        cork->base.dst = NULL;
        v6_cork.opt = NULL;
        err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6);
        if (err) {
                ip6_cork_release(cork, &v6_cork);
                return ERR_PTR(err);
        }
        if (ipc6->dontfrag < 0)
                ipc6->dontfrag = inet6_sk(sk)->dontfrag;

        err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork,
                                &current->task_frag, getfrag, from,
                                length + exthdrlen, transhdrlen + exthdrlen,
                                flags, ipc6);
        if (err) {
                __ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
                return ERR_PTR(err);
        }

        return __ip6_make_skb(sk, &queue, cork, &v6_cork);
}