1 // SPDX-License-Identifier: GPL-2.0-or-later
3 * IPv6 output functions
4 * Linux INET6 implementation
7 * Pedro Roque <roque@di.fc.ul.pt>
9 * Based on linux/net/ipv4/ip_output.c
12 * A.N.Kuznetsov : arithmetics in fragmentation.
13 * extension headers are implemented.
14 * route changes now work.
15 * ip6_forward does not confuse sniffers.
18 * H. von Brand : Added missing #include <linux/string.h>
19 * Imran Patel : frag id should be in NBO
20 * Kazunori MIYAZAWA @USAGI
21 * : add ip6_append_data and related functions
25 #include <linux/errno.h>
26 #include <linux/kernel.h>
27 #include <linux/string.h>
28 #include <linux/socket.h>
29 #include <linux/net.h>
30 #include <linux/netdevice.h>
31 #include <linux/if_arp.h>
32 #include <linux/in6.h>
33 #include <linux/tcp.h>
34 #include <linux/route.h>
35 #include <linux/module.h>
36 #include <linux/slab.h>
38 #include <linux/bpf-cgroup.h>
39 #include <linux/netfilter.h>
40 #include <linux/netfilter_ipv6.h>
46 #include <net/ndisc.h>
47 #include <net/protocol.h>
48 #include <net/ip6_route.h>
49 #include <net/addrconf.h>
50 #include <net/rawv6.h>
53 #include <net/checksum.h>
54 #include <linux/mroute6.h>
55 #include <net/l3mdev.h>
56 #include <net/lwtunnel.h>
/*
 * ip6_finish_output2 - last IPv6 output step before the skb is handed to
 * the neighbour layer.
 *
 * Steps visible below:
 *  - compare the skb headroom with LL_RESERVED_SPACE(dev) and, when it is
 *    short and the device has header_ops, grow the head with
 *    pskb_expand_head() (a shared skb is cloned first);
 *  - for a multicast destination, possibly clone the skb and run the clone
 *    through the NF_INET_POST_ROUTING hook, count IPSTATS_MIB_OUTDISCARDS
 *    when the hop limit is 0, and account IPSTATS_MIB_OUTMCAST;
 *  - let a light-weight tunnel transmit the skb via lwtunnel_xmit() when
 *    lwtunnel_xmit_redirect() says so;
 *  - resolve the next hop with rt6_nexthop(), look up the neighbour entry
 *    (creating it with __neigh_create() if missing) and send the skb with
 *    neigh_output().
 *
 * IPSTATS_MIB_OUTNOROUTES is incremented on the final failure path.
 */
58 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
60 struct dst_entry *dst = skb_dst(skb);
61 struct net_device *dev = dst->dev;
62 unsigned int hh_len = LL_RESERVED_SPACE(dev);
63 int delta = hh_len - skb_headroom(skb);
64 const struct in6_addr *nexthop;
65 struct neighbour *neigh;
68 /* Be paranoid, rather than too clever. */
69 if (unlikely(delta > 0) && dev->header_ops) {
70 /* pskb_expand_head() might crash, if skb is shared */
71 if (skb_shared(skb)) {
72 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
76 skb_set_owner_w(nskb, skb->sk);
84 pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
89 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
94 if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
95 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
97 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
98 ((mroute6_is_socket(net, skb) &&
99 !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
100 ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
101 &ipv6_hdr(skb)->saddr))) {
102 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
104 /* Do not check for IFF_ALLMULTI; multicast routing
105 is not supported in any case.
108 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
109 net, sk, newskb, NULL, newskb->dev,
112 if (ipv6_hdr(skb)->hop_limit == 0) {
113 IP6_INC_STATS(net, idev,
114 IPSTATS_MIB_OUTDISCARDS);
120 IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
122 if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
123 IPV6_ADDR_SCOPE_NODELOCAL &&
124 !(dev->flags & IFF_LOOPBACK)) {
130 if (lwtunnel_xmit_redirect(dst->lwtstate)) {
131 int res = lwtunnel_xmit(skb);
133 if (res < 0 || res == LWTUNNEL_XMIT_DONE)
138 nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
139 neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
140 if (unlikely(!neigh))
141 neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
142 if (!IS_ERR(neigh)) {
143 sock_confirm_neigh(skb, neigh);
144 ret = neigh_output(neigh, skb, false);
145 rcu_read_unlock_bh();
148 rcu_read_unlock_bh();
150 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
/*
 * Slow path for GSO skbs that fail skb_gso_validate_network_len() in
 * __ip6_finish_output(): the skb is segmented with skb_gso_segment(),
 * with the NETIF_F_GSO_MASK feature bits cleared, and every resulting
 * segment is taken off the list and passed to ip6_fragment() with
 * ip6_finish_output2() as the output callback.
 */
156 ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
157 struct sk_buff *skb, unsigned int mtu)
159 struct sk_buff *segs, *nskb;
160 netdev_features_t features;
163 /* Please see corresponding comment in ip_finish_output_gso
164 * describing the cases where GSO segment length exceeds the
167 features = netif_skb_features(skb);
168 segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
169 if (IS_ERR_OR_NULL(segs)) {
176 skb_list_walk_safe(segs, segs, nskb) {
179 skb_mark_not_on_list(segs);
180 err = ip6_fragment(net, sk, segs, ip6_finish_output2);
/*
 * Choose how to finish output of @skb:
 *  - with CONFIG_NETFILTER and CONFIG_XFRM, a dst that carries an xfrm
 *    state gets the skb flagged IP6SKB_REROUTED and re-sent through
 *    dst_output();
 *  - a GSO skb that fails skb_gso_validate_network_len() against the dst
 *    MTU goes to ip6_finish_output_gso_slowpath_drop();
 *  - a non-GSO skb longer than the MTU, a dst with dst_allfrag(), or an
 *    skb longer than a recorded frag_max_size goes through ip6_fragment();
 *  - anything else goes straight to ip6_finish_output2().
 */
188 static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
192 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
193 /* Policy lookup after SNAT yielded a new policy */
194 if (skb_dst(skb)->xfrm) {
195 IP6CB(skb)->flags |= IP6SKB_REROUTED;
196 return dst_output(net, sk, skb);
200 mtu = ip6_skb_dst_mtu(skb);
201 if (skb_is_gso(skb) && !skb_gso_validate_network_len(skb, mtu))
202 return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);
204 if ((skb->len > mtu && !skb_is_gso(skb)) ||
205 dst_allfrag(skb_dst(skb)) ||
206 (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
207 return ip6_fragment(net, sk, skb, ip6_finish_output2);
209 return ip6_finish_output2(net, sk, skb);
/*
 * Run the cgroup BPF egress program on @skb and continue with
 * __ip6_finish_output() depending on its verdict. For NET_XMIT_SUCCESS the
 * result of __ip6_finish_output() is returned unchanged; on the other
 * visible path a zero result from __ip6_finish_output() is replaced by the
 * value returned by the BPF program.
 */
212 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
216 ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
218 case NET_XMIT_SUCCESS:
219 return __ip6_finish_output(net, sk, skb);
221 return __ip6_finish_output(net, sk, skb) ? : ret;
/*
 * ip6_output - output an IPv6 skb that already has a dst attached.
 *
 * Sets skb->protocol to ETH_P_IPV6 and counts IPSTATS_MIB_OUTDISCARDS when
 * the outgoing inet6 device has disable_ipv6 set (the rest of that branch
 * is not visible in this listing). The packet is then passed to the
 * NF_INET_POST_ROUTING hook, conditional on the skb not being flagged
 * IP6SKB_REROUTED.
 */
228 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
230 struct net_device *dev = skb_dst(skb)->dev;
231 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
233 skb->protocol = htons(ETH_P_IPV6);
236 if (unlikely(idev->cnf.disable_ipv6)) {
237 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
242 return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
243 net, sk, skb, NULL, dev,
245 !(IP6CB(skb)->flags & IP6SKB_REROUTED));
/*
 * Tell whether automatic flow labels are enabled for a socket: when
 * np->autoflowlabel_set is clear, fall back to the namespace default from
 * ip6_default_np_autolabel(); otherwise return np->autoflowlabel.
 */
248 bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
250 if (!np->autoflowlabel_set)
251 return ip6_default_np_autolabel(net);
253 return np->autoflowlabel;
257 * xmit an sk_buff (used by TCP, SCTP and DCCP)
258 * Note : socket lock is not held for SYNACK packets, but might be modified
259 * by calls to skb_set_owner_w() and ipv6_local_error(),
260 * which are using proper atomic operations or spinlocks.
/*
 * Visible flow of ip6_xmit():
 *  - compute the headroom needed for the IPv6 header, the link-layer
 *    reserve and any extension headers in @opt, reallocating the skb with
 *    skb_realloc_headroom() when it is too small (IPSTATS_MIB_OUTDISCARDS
 *    is counted on the failure path);
 *  - push the fragmentable and non-fragmentable options, then the IPv6
 *    header, and fill the header from @fl6, @tclass, the hop limit and the
 *    computed payload length;
 *  - if the packet fits the MTU, has ignore_df set or is GSO, account it
 *    in IPSTATS_MIB_OUT, give an L3 master device the chance to process it
 *    (l3mdev_ip6_out()) and send it through the NF_INET_LOCAL_OUT hook;
 *  - otherwise report EMSGSIZE with ipv6_local_error() and count
 *    IPSTATS_MIB_FRAGFAILS.
 */
262 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
263 __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
265 struct net *net = sock_net(sk);
266 const struct ipv6_pinfo *np = inet6_sk(sk);
267 struct in6_addr *first_hop = &fl6->daddr;
268 struct dst_entry *dst = skb_dst(skb);
269 unsigned int head_room;
271 u8 proto = fl6->flowi6_proto;
272 int seg_len = skb->len;
276 head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
278 head_room += opt->opt_nflen + opt->opt_flen;
280 if (unlikely(skb_headroom(skb) < head_room)) {
281 struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
283 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
284 IPSTATS_MIB_OUTDISCARDS);
289 skb_set_owner_w(skb2, skb->sk);
295 seg_len += opt->opt_nflen + opt->opt_flen;
298 ipv6_push_frag_opts(skb, opt, &proto);
301 ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
305 skb_push(skb, sizeof(struct ipv6hdr));
306 skb_reset_network_header(skb);
310 * Fill in the IPv6 header
313 hlimit = np->hop_limit;
315 hlimit = ip6_dst_hoplimit(dst);
317 ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
318 ip6_autoflowlabel(net, np), fl6));
320 hdr->payload_len = htons(seg_len);
321 hdr->nexthdr = proto;
322 hdr->hop_limit = hlimit;
324 hdr->saddr = fl6->saddr;
325 hdr->daddr = *first_hop;
327 skb->protocol = htons(ETH_P_IPV6);
328 skb->priority = priority;
332 if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
333 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
334 IPSTATS_MIB_OUT, skb->len);
336 /* if egress device is enslaved to an L3 master device pass the
337 * skb to its handler for processing
339 skb = l3mdev_ip6_out((struct sock *)sk, skb);
343 /* hooks should never assume socket lock is held.
344 * we promote our socket to non const
346 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
347 net, (struct sock *)sk, skb, NULL, dst->dev,
352 /* ipv6_local_error() does not require socket lock,
353 * we promote our socket to non const
355 ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
357 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
361 EXPORT_SYMBOL(ip6_xmit);
/*
 * Deliver a Router Alert packet to the sockets listed on ip6_ra_chain
 * (called from ip6_forward() with the alert value as @sel).
 *
 * The chain is walked under read_lock(&ip6_ra_lock). An entry matches when
 * it has a socket, its selector equals @sel and the socket is either not
 * bound to a device or bound to the device the skb arrived on. Sockets
 * with rtalert_isolate set are additionally compared against the network
 * namespace of skb->dev. Clones of the skb are passed to rawv6_rcv() inside
 * the loop, and the original skb is passed to rawv6_rcv() for the socket
 * remembered in 'last'.
 */
363 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
365 struct ip6_ra_chain *ra;
366 struct sock *last = NULL;
368 read_lock(&ip6_ra_lock);
369 for (ra = ip6_ra_chain; ra; ra = ra->next) {
370 struct sock *sk = ra->sk;
371 if (sk && ra->sel == sel &&
372 (!sk->sk_bound_dev_if ||
373 sk->sk_bound_dev_if == skb->dev->ifindex)) {
374 struct ipv6_pinfo *np = inet6_sk(sk);
376 if (np && np->rtalert_isolate &&
377 !net_eq(sock_net(sk), dev_net(skb->dev))) {
381 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
383 rawv6_rcv(last, skb2);
390 rawv6_rcv(last, skb);
391 read_unlock(&ip6_ra_lock);
394 read_unlock(&ip6_ra_lock);
/*
 * Inspect a packet whose destination is a proxied neighbour; ip6_forward()
 * calls this after a successful pneigh_lookup().
 *
 * Extension headers are skipped with ipv6_skip_exthdr() to find the
 * upper-layer protocol. For ICMPv6 the first byte of the ICMPv6 header is
 * pulled in and the NDISC router/neighbour solicitation and advertisement
 * types are singled out (see the in-line comment). A packet addressed to a
 * link-local destination triggers dst_link_failure().
 *
 * ip6_forward() treats a negative return value as "discard"; the return
 * statements themselves are on lines not visible in this listing.
 */
398 static int ip6_forward_proxy_check(struct sk_buff *skb)
400 struct ipv6hdr *hdr = ipv6_hdr(skb);
401 u8 nexthdr = hdr->nexthdr;
405 if (ipv6_ext_hdr(nexthdr)) {
406 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
410 offset = sizeof(struct ipv6hdr);
412 if (nexthdr == IPPROTO_ICMPV6) {
413 struct icmp6hdr *icmp6;
415 if (!pskb_may_pull(skb, (skb_network_header(skb) +
416 offset + 1 - skb->data)))
419 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
421 switch (icmp6->icmp6_type) {
422 case NDISC_ROUTER_SOLICITATION:
423 case NDISC_ROUTER_ADVERTISEMENT:
424 case NDISC_NEIGHBOUR_SOLICITATION:
425 case NDISC_NEIGHBOUR_ADVERTISEMENT:
427 /* For reaction involving unicast neighbor discovery
428 * message destined to the proxied address, pass it to
438 * The proxying router can't forward traffic sent to a link-local
439 * address, so signal the sender and discard the packet. This
440 * behavior is clarified by the MIPv6 specification.
442 if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
443 dst_link_failure(skb);
/*
 * Final step of forwarding: account the packet in
 * IPSTATS_MIB_OUTFORWDATAGRAMS and IPSTATS_MIB_OUTOCTETS, then pass it to
 * dst_output(). With CONFIG_NET_SWITCHDEV, an skb carrying
 * offload_l3_fwd_mark takes a separate branch whose body is not visible in
 * this listing.
 */
450 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
453 struct dst_entry *dst = skb_dst(skb);
455 __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
456 __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
458 #ifdef CONFIG_NET_SWITCHDEV
459 if (skb->offload_l3_fwd_mark) {
466 return dst_output(net, sk, skb);
/*
 * Decide whether @skb is too big for @mtu when forwarding (used by
 * ip6_forward() before sending ICMPV6_PKT_TOOBIG). The visible checks look
 * at the frag_max_size recorded in the control block and, for GSO skbs, at
 * skb_gso_validate_network_len(); any other checks and the return
 * statements are on lines not visible in this listing.
 */
469 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
474 /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
475 if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
481 if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
/*
 * ip6_forward - forward a received IPv6 packet towards its dst.
 *
 * Checks visible below, in order:
 *  - forwarding must be enabled in the namespace (devconf_all->forwarding);
 *  - skb->pkt_type must be PACKET_HOST, the skb must not belong to a socket
 *    and must not be an LRO skb;
 *  - the xfrm forward policy check (unless disable_policy is set);
 *  - Router Alert packets are offered to ip6_call_ra_chain();
 *  - a hop limit <= 1 triggers ICMPV6_TIME_EXCEED and INHDRERRORS;
 *  - with proxy_ndp and a matching proxy neighbour entry, the packet may
 *    be handed to ip6_input() or discarded per ip6_forward_proxy_check();
 *  - xfrm6_route_forward();
 *  - when the incoming interface equals the outgoing one (and the packet
 *    is neither source routed nor IPsec-decapsulated) a rate-limited
 *    redirect is sent with ndisc_send_redirect();
 *  - the source address type is sanity-checked (see "security critical");
 *  - the forwarding MTU is checked; ip6_pkt_too_big() triggers
 *    ICMPV6_PKT_TOOBIG plus INTOOBIGERRORS / FRAGFAILS accounting.
 *
 * After skb_cow() for the link-layer header the packet is passed to the
 * NF_INET_FORWARD hook.
 */
487 int ip6_forward(struct sk_buff *skb)
489 struct inet6_dev *idev = __in6_dev_get_safely(skb->dev);
490 struct dst_entry *dst = skb_dst(skb);
491 struct ipv6hdr *hdr = ipv6_hdr(skb);
492 struct inet6_skb_parm *opt = IP6CB(skb);
493 struct net *net = dev_net(dst->dev);
496 if (net->ipv6.devconf_all->forwarding == 0)
499 if (skb->pkt_type != PACKET_HOST)
502 if (unlikely(skb->sk))
505 if (skb_warn_if_lro(skb))
508 if (!net->ipv6.devconf_all->disable_policy &&
509 (!idev || !idev->cnf.disable_policy) &&
510 !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
511 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
515 skb_forward_csum(skb);
518 * We DO NOT make any processing on
519 * RA packets, pushing them to user level AS IS
520 * without ane WARRANTY that application will be able
521 * to interpret them. The reason is that we
522 * cannot make anything clever here.
524 * We are not end-node, so that if packet contains
525 * AH/ESP, we cannot make anything.
526 * Defragmentation also would be mistake, RA packets
527 * cannot be fragmented, because there is no warranty
528 * that different fragments will go along one path. --ANK
530 if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
531 if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
536 * check and decrement ttl
538 if (hdr->hop_limit <= 1) {
539 /* Force OUTPUT device used as source address */
541 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
542 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
548 /* XXX: idev->cnf.proxy_ndp? */
549 if (net->ipv6.devconf_all->proxy_ndp &&
550 pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
551 int proxied = ip6_forward_proxy_check(skb);
553 return ip6_input(skb);
554 else if (proxied < 0) {
555 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
560 if (!xfrm6_route_forward(skb)) {
561 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
566 /* IPv6 specs say nothing about it, but it is clear that we cannot
567 send redirects to source routed frames.
568 We don't send redirects to frames decapsulated from IPsec.
570 if (IP6CB(skb)->iif == dst->dev->ifindex &&
571 opt->srcrt == 0 && !skb_sec_path(skb)) {
572 struct in6_addr *target = NULL;
573 struct inet_peer *peer;
577 * incoming and outgoing devices are the same
581 rt = (struct rt6_info *) dst;
582 if (rt->rt6i_flags & RTF_GATEWAY)
583 target = &rt->rt6i_gateway;
585 target = &hdr->daddr;
587 peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
589 /* Limit redirects both by destination (here)
590 and by source (inside ndisc_send_redirect)
592 if (inet_peer_xrlim_allow(peer, 1*HZ))
593 ndisc_send_redirect(skb, target);
597 int addrtype = ipv6_addr_type(&hdr->saddr);
599 /* This check is security critical. */
600 if (addrtype == IPV6_ADDR_ANY ||
601 addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
603 if (addrtype & IPV6_ADDR_LINKLOCAL) {
604 icmpv6_send(skb, ICMPV6_DEST_UNREACH,
605 ICMPV6_NOT_NEIGHBOUR, 0);
610 mtu = ip6_dst_mtu_forward(dst);
611 if (mtu < IPV6_MIN_MTU)
614 if (ip6_pkt_too_big(skb, mtu)) {
615 /* Again, force OUTPUT device used as source address */
617 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
618 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
619 __IP6_INC_STATS(net, ip6_dst_idev(dst),
620 IPSTATS_MIB_FRAGFAILS);
625 if (skb_cow(skb, dst->dev->hard_header_len)) {
626 __IP6_INC_STATS(net, ip6_dst_idev(dst),
627 IPSTATS_MIB_OUTDISCARDS);
633 /* Mangling hops number delayed to point after skb COW */
637 return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
638 net, NULL, skb, skb->dev, dst->dev,
642 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
/*
 * Copy per-packet metadata from @from to @to: pkt_type, priority,
 * protocol, the dst (via dst_clone()), mark, hash, tc_index (only with
 * CONFIG_NET_SCHED), skb extensions and the security mark.
 */
648 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
650 to->pkt_type = from->pkt_type;
651 to->priority = from->priority;
652 to->protocol = from->protocol;
654 skb_dst_set(to, dst_clone(skb_dst(from)));
656 to->mark = from->mark;
658 skb_copy_hash(to, from);
660 #ifdef CONFIG_NET_SCHED
661 to->tc_index = from->tc_index;
664 skb_ext_copy(to, from);
665 skb_copy_secmark(to, from);
/*
 * Prepare the head skb of a frag_list for in-place fragmentation.
 *
 * Overwrites *@prevhdr with NEXTHDR_FRAGMENT, saves a GFP_ATOMIC copy of
 * the first @hlen header bytes in iter->tmp_hdr, moves the frag_list to
 * iter->frag and records @frag_id / @nexthdr in @iter. A fragment header
 * is then inserted between the saved headers and the payload, marked with
 * IP6_MF and @frag_id, and skb->len, skb->data_len and the IPv6
 * payload_len are recomputed from skb_pagelen() of the head.
 */
668 int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
669 u8 nexthdr, __be32 frag_id,
670 struct ip6_fraglist_iter *iter)
672 unsigned int first_len;
676 *prevhdr = NEXTHDR_FRAGMENT;
677 iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
681 iter->frag = skb_shinfo(skb)->frag_list;
682 skb_frag_list_init(skb);
686 iter->frag_id = frag_id;
687 iter->nexthdr = nexthdr;
689 __skb_pull(skb, hlen);
690 fh = __skb_push(skb, sizeof(struct frag_hdr));
691 __skb_push(skb, hlen);
692 skb_reset_network_header(skb);
693 memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);
695 fh->nexthdr = nexthdr;
697 fh->frag_off = htons(IP6_MF);
698 fh->identification = frag_id;
700 first_len = skb_pagelen(skb);
701 skb->data_len = first_len - skb_headlen(skb);
702 skb->len = first_len;
703 ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));
707 EXPORT_SYMBOL(ip6_fraglist_init);
/*
 * Turn the next frag_list member (iter->frag) into a stand-alone fragment.
 *
 * Pushes a fragment header and a copy of the saved headers (iter->tmp_hdr)
 * in front of the member's data, advances iter->offset by the payload
 * length of the fragment currently held in @skb, fills in nexthdr, the
 * fragment offset and the identification, fixes the IPv6 payload_len and
 * copies the metadata from @skb. IP6_MF is OR-ed into frag_off under a
 * condition that is not visible in this listing.
 */
709 void ip6_fraglist_prepare(struct sk_buff *skb,
710 struct ip6_fraglist_iter *iter)
712 struct sk_buff *frag = iter->frag;
713 unsigned int hlen = iter->hlen;
716 frag->ip_summed = CHECKSUM_NONE;
717 skb_reset_transport_header(frag);
718 fh = __skb_push(frag, sizeof(struct frag_hdr));
719 __skb_push(frag, hlen);
720 skb_reset_network_header(frag);
721 memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
722 iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
723 fh->nexthdr = iter->nexthdr;
725 fh->frag_off = htons(iter->offset);
727 fh->frag_off |= htons(IP6_MF);
728 fh->identification = iter->frag_id;
729 ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
730 ip6_copy_metadata(frag, skb);
732 EXPORT_SYMBOL(ip6_fraglist_prepare);
/*
 * Initialise @state for copying (slow-path) fragmentation of @skb. The
 * visible assignments record the next-header pointer and value, the
 * fragment id, the amount of payload left (skb->len - @hlen), the offset
 * at which copying starts (@hlen), and the head room (@hdr_room) and tail
 * room (@needed_tailroom) to reserve in every fragment.
 */
734 void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
735 unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
736 u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
738 state->prevhdr = prevhdr;
739 state->nexthdr = nexthdr;
740 state->frag_id = frag_id;
745 state->left = skb->len - hlen; /* Space per frame */
746 state->ptr = hlen; /* Where to start from */
748 state->hroom = hdr_room;
749 state->troom = needed_tailroom;
753 EXPORT_SYMBOL(ip6_frag_init);
/*
 * Build the next fragment of @skb according to @state (slow path).
 *
 * The chunk length is limited to state->mtu. A new skb is allocated with
 * room for the chunk, the unfragmentable headers, a fragment header and
 * the configured head/tail room. Metadata and the first state->hlen header
 * bytes are copied from @skb, the next-header byte that state->prevhdr
 * points at is rewritten to NEXTHDR_FRAGMENT in the copy, the fragment
 * header is filled in, and the payload chunk is copied with
 * skb_copy_bits(). state->offset is advanced by the chunk length.
 *
 * Returns ERR_PTR(-ENOMEM) when the allocation fails; ip6_fragment() uses
 * the return value as the new fragment otherwise.
 */
755 struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
757 u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
758 struct sk_buff *frag;
763 /* IF: it doesn't fit, use 'mtu' - the data space left */
764 if (len > state->mtu)
766 /* IF: we are not sending up to and including the packet end
767 then align the next start on an eight byte boundary */
768 if (len < state->left)
771 /* Allocate buffer */
772 frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
773 state->hroom + state->troom, GFP_ATOMIC);
775 return ERR_PTR(-ENOMEM);
778 * Set up data on packet
781 ip6_copy_metadata(frag, skb);
782 skb_reserve(frag, state->hroom);
783 skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
784 skb_reset_network_header(frag);
785 fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
786 frag->transport_header = (frag->network_header + state->hlen +
787 sizeof(struct frag_hdr));
790 * Charge the memory for the fragment to any owner
794 skb_set_owner_w(frag, skb->sk);
797 * Copy the packet header into the new buffer.
799 skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);
801 fragnexthdr_offset = skb_network_header(frag);
802 fragnexthdr_offset += prevhdr - skb_network_header(skb);
803 *fragnexthdr_offset = NEXTHDR_FRAGMENT;
806 * Build fragment header.
808 fh->nexthdr = state->nexthdr;
810 fh->identification = state->frag_id;
813 * Copy a block of the IP datagram.
815 BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
819 fh->frag_off = htons(state->offset);
821 fh->frag_off |= htons(IP6_MF);
822 ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
825 state->offset += len;
829 EXPORT_SYMBOL(ip6_frag_next);
/*
 * ip6_fragment - split @skb into IPv6 fragments and send each through
 * @output.
 *
 * After locating the first fragmentable header (ip6_find_1stfragopt()) and
 * deriving the usable MTU (dst MTU, frag_max_size, np->frag_size), one of
 * two paths is taken:
 *  - fast path: the skb already carries a frag_list whose geometry fits,
 *    so the list members are converted in place with ip6_fraglist_init()
 *    and ip6_fraglist_prepare();
 *  - slow path: the payload is copied into newly allocated fragments with
 *    ip6_frag_init() and ip6_frag_next().
 *
 * The original timestamp is propagated to every fragment.
 * IPSTATS_MIB_FRAGCREATES, FRAGOKS and FRAGFAILS are updated on the paths
 * shown below, and ICMPV6_PKT_TOOBIG is sent on the failure path near the
 * end.
 */
831 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
832 int (*output)(struct net *, struct sock *, struct sk_buff *))
834 struct sk_buff *frag;
835 struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
836 struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
837 inet6_sk(skb->sk) : NULL;
838 struct ip6_frag_state state;
839 unsigned int mtu, hlen, nexthdr_offset;
840 ktime_t tstamp = skb->tstamp;
843 u8 *prevhdr, nexthdr = 0;
845 err = ip6_find_1stfragopt(skb, &prevhdr);
850 nexthdr_offset = prevhdr - skb_network_header(skb);
852 mtu = ip6_skb_dst_mtu(skb);
854 /* We must not fragment if the socket is set to force MTU discovery
855 * or if the skb is not generated by a local socket.
857 if (unlikely(!skb->ignore_df && skb->len > mtu))
860 if (IP6CB(skb)->frag_max_size) {
861 if (IP6CB(skb)->frag_max_size > mtu)
864 /* don't send fragments larger than what we received */
865 mtu = IP6CB(skb)->frag_max_size;
866 if (mtu < IPV6_MIN_MTU)
870 if (np && np->frag_size < mtu) {
874 if (mtu < hlen + sizeof(struct frag_hdr) + 8)
876 mtu -= hlen + sizeof(struct frag_hdr);
878 frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
879 &ipv6_hdr(skb)->saddr);
881 if (skb->ip_summed == CHECKSUM_PARTIAL &&
882 (err = skb_checksum_help(skb)))
885 prevhdr = skb_network_header(skb) + nexthdr_offset;
886 hroom = LL_RESERVED_SPACE(rt->dst.dev);
887 if (skb_has_frag_list(skb)) {
888 unsigned int first_len = skb_pagelen(skb);
889 struct ip6_fraglist_iter iter;
890 struct sk_buff *frag2;
892 if (first_len - hlen > mtu ||
893 ((first_len - hlen) & 7) ||
895 skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
898 skb_walk_frags(skb, frag) {
899 /* Correct geometry. */
900 if (frag->len > mtu ||
901 ((frag->len & 7) && frag->next) ||
902 skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
903 goto slow_path_clean;
905 /* Partially cloned skb? */
906 if (skb_shared(frag))
907 goto slow_path_clean;
912 frag->destructor = sock_wfree;
914 skb->truesize -= frag->truesize;
917 err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
923 /* Prepare header of the next frame,
924 * before previous one went down. */
926 ip6_fraglist_prepare(skb, &iter);
928 skb->tstamp = tstamp;
929 err = output(net, sk, skb);
931 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
932 IPSTATS_MIB_FRAGCREATES);
934 if (err || !iter.frag)
937 skb = ip6_fraglist_next(&iter);
943 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
944 IPSTATS_MIB_FRAGOKS);
948 kfree_skb_list(iter.frag);
950 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
951 IPSTATS_MIB_FRAGFAILS);
955 skb_walk_frags(skb, frag2) {
959 frag2->destructor = NULL;
960 skb->truesize += frag2->truesize;
966 * Fragment the datagram.
969 ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
970 LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
974 * Keep copying data until we run out.
977 while (state.left > 0) {
978 frag = ip6_frag_next(skb, &state);
985 * Put this fragment into the sending queue.
987 frag->tstamp = tstamp;
988 err = output(net, sk, frag);
992 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
993 IPSTATS_MIB_FRAGCREATES);
995 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
996 IPSTATS_MIB_FRAGOKS);
1001 if (skb->sk && dst_allfrag(skb_dst(skb)))
1002 sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
1004 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
1008 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1009 IPSTATS_MIB_FRAGFAILS);
/*
 * Returns non-zero when a cached route cannot be confirmed for @fl_addr,
 * i.e. when both of the following hold:
 *  - @rt_key is not a /128 host key equal to @fl_addr, and
 *  - @addr_cache is NULL or differs from @fl_addr.
 */
1014 static inline int ip6_rt_check(const struct rt6key *rt_key,
1015 const struct in6_addr *fl_addr,
1016 const struct in6_addr *addr_cache)
1018 return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
1019 (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
/*
 * Validate a socket's cached @dst against the flow @fl6.
 *
 * The visible checks reject a dst whose family is not AF_INET6, whose
 * destination key (and, with CONFIG_IPV6_SUBTREES, source key) fails
 * ip6_rt_check(), or whose device differs from a requested flowi6_oif
 * unless FLOWI_FLAG_SKIP_NH_OIF is set. What is done with a rejected dst
 * and what is returned are on lines not visible in this listing.
 */
1022 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
1023 struct dst_entry *dst,
1024 const struct flowi6 *fl6)
1026 struct ipv6_pinfo *np = inet6_sk(sk);
1027 struct rt6_info *rt;
1032 if (dst->ops->family != AF_INET6) {
1037 rt = (struct rt6_info *)dst;
1038 /* Yes, checking route validity in not connected
1039 * case is not very simple. Take into account,
1040 * that we do not support routing by source, TOS,
1041 * and MSG_DONTROUTE --ANK (980726)
1043 * 1. ip6_rt_check(): If route was host route,
1044 * check that cached destination is current.
1045 * If it is network route, we still may
1046 * check its validity using saved pointer
1047 * to the last used address: daddr_cache.
1048 * We do not want to save whole address now,
1049 * (because main consumer of this service
1050 * is tcp, which has not this problem),
1051 * so that the last trick works only on connected
1053 * 2. oif also should be the same.
1055 if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
1056 #ifdef CONFIG_IPV6_SUBTREES
1057 ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
1059 (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
1060 (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
/*
 * Common route lookup helper behind ip6_dst_lookup() and
 * ip6_dst_lookup_flow().
 *
 * Visible behaviour:
 *  - when the flow has no source address and *@dst is unset or error-free,
 *    a first ip6_route_output() supplies the route used by
 *    ip6_route_get_saddr() to choose fl6->saddr;
 *  - a lookup through ip6_route_output_flags() follows, with
 *    RT6_LOOKUP_F_IFACE added when flowi6_oif is set;
 *  - with CONFIG_IPV6_OPTIMISTIC_DAD, if the next-hop neighbour exists but
 *    is not NUD_VALID and the source address is flagged IFA_F_OPTIMISTIC,
 *    the dst is replaced by a lookup with an all-zero destination (the
 *    default router, per the in-line comment);
 *  - a v4-mapped source combined with a destination that is neither
 *    v4-mapped nor unspecified yields -EAFNOSUPPORT;
 *  - -ENETUNREACH increments IPSTATS_MIB_OUTNOROUTES.
 */
1069 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
1070 struct dst_entry **dst, struct flowi6 *fl6)
1072 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1073 struct neighbour *n;
1074 struct rt6_info *rt;
1079 /* The correct way to handle this would be to do
1080 * ip6_route_get_saddr, and then ip6_route_output; however,
1081 * the route-specific preferred source forces the
1082 * ip6_route_output call _before_ ip6_route_get_saddr.
1084 * In source specific routing (no src=any default route),
1085 * ip6_route_output will fail given src=any saddr, though, so
1086 * that's why we try it again later.
1088 if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
1089 struct fib6_info *from;
1090 struct rt6_info *rt;
1091 bool had_dst = *dst != NULL;
1094 *dst = ip6_route_output(net, sk, fl6);
1095 rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
1098 from = rt ? rcu_dereference(rt->from) : NULL;
1099 err = ip6_route_get_saddr(net, from, &fl6->daddr,
1100 sk ? inet6_sk(sk)->srcprefs : 0,
1105 goto out_err_release;
1107 /* If we had an erroneous initial result, pretend it
1108 * never existed and let the SA-enabled version take
1111 if (!had_dst && (*dst)->error) {
1116 if (fl6->flowi6_oif)
1117 flags |= RT6_LOOKUP_F_IFACE;
1121 *dst = ip6_route_output_flags(net, sk, fl6, flags);
1123 err = (*dst)->error;
1125 goto out_err_release;
1127 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1129 * Here if the dst entry we've looked up
1130 * has a neighbour entry that is in the INCOMPLETE
1131 * state and the src address from the flow is
1132 * marked as OPTIMISTIC, we release the found
1133 * dst entry and replace it instead with the
1134 * dst entry of the nexthop router
1136 rt = (struct rt6_info *) *dst;
1138 n = __ipv6_neigh_lookup_noref(rt->dst.dev,
1139 rt6_nexthop(rt, &fl6->daddr));
1140 err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
1141 rcu_read_unlock_bh();
1144 struct inet6_ifaddr *ifp;
1145 struct flowi6 fl_gw6;
1148 ifp = ipv6_get_ifaddr(net, &fl6->saddr,
1151 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
1157 * We need to get the dst entry for the
1158 * default router instead
1161 memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1162 memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1163 *dst = ip6_route_output(net, sk, &fl_gw6);
1164 err = (*dst)->error;
1166 goto out_err_release;
1170 if (ipv6_addr_v4mapped(&fl6->saddr) &&
1171 !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
1172 err = -EAFNOSUPPORT;
1173 goto out_err_release;
1182 if (err == -ENETUNREACH)
1183 IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1188 * ip6_dst_lookup - perform route lookup on flow
1189 * @sk: socket which provides route info
1190 * @dst: pointer to dst_entry * for result
1191 * @fl6: flow to lookup
1193 * This function performs a route lookup on the given flow.
1195 * It returns zero on success, or a standard errno code on error.
/* Thin wrapper: the lookup itself is done by ip6_dst_lookup_tail(). */
1197 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1201 return ip6_dst_lookup_tail(net, sk, dst, fl6);
1203 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1206 * ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1207 * @sk: socket which provides route info
1208 * @fl6: flow to lookup
1209 * @final_dst: final destination address for ipsec lookup
1211 * This function performs a route lookup on the given flow.
1213 * It returns a valid dst pointer on success, or a pointer encoded
/*
 * Looks the route up with ip6_dst_lookup_tail(), starting from a NULL dst;
 * an error code is returned as ERR_PTR(). fl6->daddr is then overwritten
 * with *@final_dst (any guarding condition is not visible in this listing)
 * and the result is passed through xfrm_lookup_route().
 */
1216 struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
1217 const struct in6_addr *final_dst)
1219 struct dst_entry *dst = NULL;
1222 err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
1224 return ERR_PTR(err);
1226 fl6->daddr = *final_dst;
1228 return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
1230 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1233 * ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1234 * @sk: socket which provides the dst cache and route info
1235 * @fl6: flow to lookup
1236 * @final_dst: final destination address for ipsec lookup
1237 * @connected: whether @sk is connected or not
1239 * This function performs a route lookup on the given flow with the
1240 * possibility of using the cached route in the socket if it is valid.
1241 * It will take the socket dst lock when operating on the dst cache.
1242 * As a result, this function can only be used in process context.
1244 * In addition, for a connected socket, cache the dst in the socket
1245 * if the current cache is not valid.
1247 * It returns a valid dst pointer on success, or a pointer encoded
/*
 * Starts from the socket's cached dst: sk_dst_check() with the stored
 * dst_cookie, then ip6_sk_dst_check() against @fl6. A fresh
 * ip6_dst_lookup_flow() follows (the condition under which the cached dst
 * is used instead is not visible in this listing); for a connected socket
 * a reference to a non-error result is stored with
 * ip6_sk_dst_store_flow().
 */
1250 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1251 const struct in6_addr *final_dst,
1254 struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1256 dst = ip6_sk_dst_check(sk, dst, fl6);
1260 dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
1261 if (connected && !IS_ERR(dst))
1262 ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);
1266 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
/*
 * Duplicate an IPv6 options header with kmemdup(); the copied length is
 * (src->hdrlen + 1) * 8 bytes. A NULL @src yields NULL.
 */
1268 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1271 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
/*
 * Duplicate an IPv6 routing header with kmemdup(); the copied length is
 * (src->hdrlen + 1) * 8 bytes. A NULL @src yields NULL.
 */
1274 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1277 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
/*
 * Recompute *@mtu and *@maxfraglen while data is being appended.
 *
 * Only acts when the route is not an XFRM tunnel (DST_XFRM_TUNNEL clear).
 * Per the in-line comments, the first fragment reserves rt->dst.header_len
 * out of @orig_mtu, while later fragments treat that header space as data
 * space. *@maxfraglen is then derived from *@mtu by rounding the space
 * after @fragheaderlen down to a multiple of 8 and subtracting
 * sizeof(struct frag_hdr).
 */
1280 static void ip6_append_data_mtu(unsigned int *mtu,
1282 unsigned int fragheaderlen,
1283 struct sk_buff *skb,
1284 struct rt6_info *rt,
1285 unsigned int orig_mtu)
1287 if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1289 /* first fragment, reserve header_len */
1290 *mtu = orig_mtu - rt->dst.header_len;
1294 * this fragment is not first, the headers
1295 * space is regarded as data space.
1299 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1300 + fragheaderlen - sizeof(struct frag_hdr);
/*
 * Initialise the cork state for an IPv6 send.
 *
 * The tx options from @ipc6->opt are copied into a freshly allocated
 * v6_cork->opt: the lengths are copied and each extension header is
 * duplicated with ip6_opt_dup() / ip6_rthdr_dup(), with a failed
 * duplication detected after each call. The function warns if
 * v6_cork->opt is already set.
 *
 * @cork then records the dst, the flow, the GSO size, the mark, the tx
 * timestamp flags and the transmit time; @v6_cork records the hop limit
 * and traffic class. The fragment size is the device MTU when
 * np->pmtudisc >= IPV6_PMTUDISC_PROBE, otherwise the dst MTU (taken from
 * the xfrm path unless the dst is an XFRM tunnel); it may be lowered to
 * np->frag_size. IPCORK_ALLFRAG is set when the path dst has dst_allfrag().
 */
1304 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1305 struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1306 struct rt6_info *rt, struct flowi6 *fl6)
1308 struct ipv6_pinfo *np = inet6_sk(sk);
1310 struct ipv6_txoptions *opt = ipc6->opt;
1316 if (WARN_ON(v6_cork->opt))
1319 v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
1320 if (unlikely(!v6_cork->opt))
1323 v6_cork->opt->tot_len = sizeof(*opt);
1324 v6_cork->opt->opt_flen = opt->opt_flen;
1325 v6_cork->opt->opt_nflen = opt->opt_nflen;
1327 v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1329 if (opt->dst0opt && !v6_cork->opt->dst0opt)
1332 v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1334 if (opt->dst1opt && !v6_cork->opt->dst1opt)
1337 v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
1339 if (opt->hopopt && !v6_cork->opt->hopopt)
1342 v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1344 if (opt->srcrt && !v6_cork->opt->srcrt)
1347 /* need source address above miyazawa*/
1350 cork->base.dst = &rt->dst;
1351 cork->fl.u.ip6 = *fl6;
1352 v6_cork->hop_limit = ipc6->hlimit;
1353 v6_cork->tclass = ipc6->tclass;
1354 if (rt->dst.flags & DST_XFRM_TUNNEL)
1355 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1356 READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
1358 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1359 READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
1360 if (np->frag_size < mtu) {
1362 mtu = np->frag_size;
1364 cork->base.fragsize = mtu;
1365 cork->base.gso_size = ipc6->gso_size;
1366 cork->base.tx_flags = 0;
1367 cork->base.mark = ipc6->sockc.mark;
1368 sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);
1370 if (dst_allfrag(xfrm_dst_path(&rt->dst)))
1371 cork->base.flags |= IPCORK_ALLFRAG;
1372 cork->base.length = 0;
1374 cork->base.transmit_time = ipc6->sockc.transmit_time;
/*
 * __ip6_append_data - append payload to the tail of a pending skb queue.
 *
 * @sk:          socket the data belongs to; its protocol, timestamping
 *               flags and sk_wmem_alloc accounting are used below
 * @queue:       pending skbs; the tail skb is peeked first and every newly
 *               allocated skb is added with __skb_queue_tail()
 * @cork:        generic cork state (dst, fragsize, gso_size, length, flags,
 *               tx_flags are read; length is updated)
 * @v6_cork:     IPv6 cork state; the visible code reads only ->opt
 * @pfrag:       page fragment refilled with sk_page_frag_refill() and used
 *               as the target for paged copies
 * @getfrag:     copy callback invoked with @from; a negative return value
 *               is treated as failure
 * @from:        opaque source cookie for @getfrag and the zerocopy helper
 * @length:      bytes to append; reduced as data is placed into skbs
 * @transhdrlen: transport header length accounted for in the size checks
 *               and when positioning the payload in a new skb
 * @flags:       MSG_* flags; MSG_MORE, MSG_ZEROCOPY, MSG_DONTWAIT and
 *               MSG_CONFIRM are tested
 * @ipc6:        per-call control data; ->dontfrag is read
 *
 * fl6 is also used below (passed to ipv6_local_rxpmtu() and
 * ipv6_local_error()); callers in this file pass it as the second argument.
 *
 * NOTE(review): this listing does not show every line of the function
 * (closing braces, short statements and labels are absent), so the
 * comments here describe only what the visible statements do.
 */
1379 static int __ip6_append_data(struct sock *sk,
1381 struct sk_buff_head *queue,
1382 struct inet_cork *cork,
1383 struct inet6_cork *v6_cork,
1384 struct page_frag *pfrag,
1385 int getfrag(void *from, char *to, int offset,
1386 int len, int odd, struct sk_buff *skb),
1387 void *from, int length, int transhdrlen,
1388 unsigned int flags, struct ipcm6_cookie *ipc6)
1390 struct sk_buff *skb, *skb_prev = NULL;
1391 unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1392 struct ubuf_info *uarg = NULL;
1394 int dst_exthdrlen = 0;
1400 struct rt6_info *rt = (struct rt6_info *)cork->dst;
1401 struct ipv6_txoptions *opt = v6_cork->opt;
1402 int csummode = CHECKSUM_NONE;
1403 unsigned int maxnonfragsize, headersize;
1404 unsigned int wmem_alloc_delta = 0;
1405 bool paged, extra_uref = false;
/* Start from the last skb already on the queue, if there is one. */
1407 skb = skb_peek_tail(queue);
1409 exthdrlen = opt ? opt->opt_flen : 0;
1410 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
/*
 * A non-zero gso_size sets paged and lifts the working mtu to IP6_MAX_MTU;
 * otherwise the fragment size stored in the cork is used.
 */
1413 paged = !!cork->gso_size;
1414 mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
/*
 * Take the next per-socket timestamp key (post-increment) when a software
 * tx timestamp is requested and SOF_TIMESTAMPING_OPT_ID is set.
 */
1417 if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
1418 sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1419 tskey = sk->sk_tskey++;
1421 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
/* fragheaderlen: fixed IPv6 header + rt6i_nfheader_len + opt_nflen. */
1423 fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1424 (opt ? opt->opt_nflen : 0);
/*
 * headersize: fixed IPv6 header + all option bytes + rt6i_nfheader_len,
 * plus a fragment header when dst_allfrag() is true for the route.
 */
1426 headersize = sizeof(struct ipv6hdr) +
1427 (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1428 (dst_allfrag(&rt->dst) ?
1429 sizeof(struct frag_hdr) : 0) +
1430 rt->rt6i_nfheader_len;
/*
 * Sanity check ahead of the maxfraglen computation: mtu must exceed
 * fragheaderlen, and the size with the payload rounded down to a multiple
 * of 8 must exceed a fragment header. The statement taken when the check
 * fails is on a line not shown in this listing.
 */
1432 if (mtu <= fragheaderlen ||
1433 ((mtu - fragheaderlen) & ~7) + fragheaderlen <= sizeof(struct frag_hdr))
/*
 * maxfraglen: fragheaderlen plus the payload room rounded down to a
 * multiple of 8, less sizeof(struct frag_hdr).
 */
1436 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1437 sizeof(struct frag_hdr);
1439 /* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
1440 * the first fragment
1442 if (headersize + transhdrlen > mtu)
1445 if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1446 (sk->sk_protocol == IPPROTO_UDP ||
1447 sk->sk_protocol == IPPROTO_RAW)) {
1448 ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1449 sizeof(struct ipv6hdr));
1453 if (ip6_sk_ignore_df(sk))
1454 maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1456 maxnonfragsize = mtu;
1458 if (cork->length + length > maxnonfragsize - headersize) {
1460 pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1461 ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
1465 /* CHECKSUM_PARTIAL only with no extension headers and when
1466 * we are not going to fragment
1468 if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1469 headersize == sizeof(struct ipv6hdr) &&
1470 length <= mtu - headersize &&
1471 (!(flags & MSG_MORE) || cork->gso_size) &&
1472 rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1473 csummode = CHECKSUM_PARTIAL;
1475 if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
1476 uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb));
1479 extra_uref = !skb_zcopy(skb); /* only ref on new uarg */
1480 if (rt->dst.dev->features & NETIF_F_SG &&
1481 csummode == CHECKSUM_PARTIAL) {
1485 skb_zcopy_set(skb, uarg, &extra_uref);
1490 * Let's try using as much space as possible.
1491 * Use MTU if total length of the message fits into the MTU.
1492 * Otherwise, we need to reserve fragment header and
1493 * fragment alignment (= 8-15 octects, in total).
1495 * Note that we may need to "move" the data from the tail of
1496 * of the buffer to the new fragment when we split
1499 * FIXME: It may be fragmented into multiple chunks
1500 * at once if non-fragmentable extension headers
1505 cork->length += length;
1509 while (length > 0) {
1510 /* Check if the remaining data fits into current packet. */
1511 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1513 copy = maxfraglen - skb->len;
1517 unsigned int datalen;
1518 unsigned int fraglen;
1519 unsigned int fraggap;
1520 unsigned int alloclen, alloc_extra;
1521 unsigned int pagedlen;
1523 /* There's no room in the current skb */
1525 fraggap = skb->len - maxfraglen;
1528 /* update mtu and maxfraglen if necessary */
1529 if (!skb || !skb_prev)
1530 ip6_append_data_mtu(&mtu, &maxfraglen,
1531 fragheaderlen, skb, rt,
1537 * If remaining data exceeds the mtu,
1538 * we know we need more fragment(s).
1540 datalen = length + fraggap;
1542 if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1543 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1544 fraglen = datalen + fragheaderlen;
1547 alloc_extra = hh_len;
1548 alloc_extra += dst_exthdrlen;
1549 alloc_extra += rt->dst.trailer_len;
1551 /* We just reserve space for fragment header.
1552 * Note: this may be overallocation if the message
1553 * (without MSG_MORE) fits into the MTU.
1555 alloc_extra += sizeof(struct frag_hdr);
1557 if ((flags & MSG_MORE) &&
1558 !(rt->dst.dev->features&NETIF_F_SG))
1561 (fraglen + alloc_extra < SKB_MAX_ALLOC ||
1562 !(rt->dst.dev->features & NETIF_F_SG)))
1565 alloclen = min_t(int, fraglen, MAX_HEADER);
1566 pagedlen = fraglen - alloclen;
1568 alloclen += alloc_extra;
1570 if (datalen != length + fraggap) {
1572 * this is not the last fragment, the trailer
1573 * space is regarded as data space.
1575 datalen += rt->dst.trailer_len;
1578 fraglen = datalen + fragheaderlen;
1580 copy = datalen - transhdrlen - fraggap - pagedlen;
1586 skb = sock_alloc_send_skb(sk, alloclen,
1587 (flags & MSG_DONTWAIT), &err);
1590 if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
1592 skb = alloc_skb(alloclen,
1600 * Fill in the control structures
1602 skb->protocol = htons(ETH_P_IPV6);
1603 skb->ip_summed = csummode;
1605 /* reserve for fragmentation and ipsec header */
1606 skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1610 * Find where to start putting bytes
1612 data = skb_put(skb, fraglen - pagedlen);
1613 skb_set_network_header(skb, exthdrlen);
1614 data += fragheaderlen;
1615 skb->transport_header = (skb->network_header +
/*
 * Carry the fraggap bytes lying past maxfraglen in skb_prev over into this
 * skb, fix up both checksums and trim skb_prev back to maxfraglen.
 */
1618 skb->csum = skb_copy_and_csum_bits(
1619 skb_prev, maxfraglen,
1620 data + transhdrlen, fraggap, 0);
1621 skb_prev->csum = csum_sub(skb_prev->csum,
1624 pskb_trim_unique(skb_prev, maxfraglen);
1627 getfrag(from, data + transhdrlen, offset,
1628 copy, fraggap, skb) < 0) {
1635 length -= copy + transhdrlen;
1640 /* Only the initial fragment is time stamped */
1641 skb_shinfo(skb)->tx_flags = cork->tx_flags;
1643 skb_shinfo(skb)->tskey = tskey;
1645 skb_zcopy_set(skb, uarg, &extra_uref);
1647 if ((flags & MSG_CONFIRM) && !skb_prev)
1648 skb_set_dst_pending_confirm(skb, 1);
1651 * Put the packet on the pending queue
/*
 * An skb that has no destructor yet is given sock_wfree, and its truesize
 * is added to wmem_alloc_delta so that sk_wmem_alloc can be charged with a
 * single refcount_add() further down.
 */
1653 if (!skb->destructor) {
1654 skb->destructor = sock_wfree;
1656 wmem_alloc_delta += skb->truesize;
1658 __skb_queue_tail(queue, skb);
/*
 * Device without scatter-gather and enough tailroom: copy straight into
 * the linear area, trimming back to the old length if getfrag fails.
 */
1665 if (!(rt->dst.dev->features&NETIF_F_SG) &&
1666 skb_tailroom(skb) >= copy) {
1670 if (getfrag(from, skb_put(skb, copy),
1671 offset, copy, off, skb) < 0) {
1672 __skb_trim(skb, off);
1676 } else if (!uarg || !uarg->zerocopy) {
1677 int i = skb_shinfo(skb)->nr_frags;
1680 if (!sk_page_frag_refill(sk, pfrag))
1683 if (!skb_can_coalesce(skb, i, pfrag->page,
1686 if (i == MAX_SKB_FRAGS)
1689 __skb_fill_page_desc(skb, i, pfrag->page,
1691 skb_shinfo(skb)->nr_frags = ++i;
1692 get_page(pfrag->page);
1694 copy = min_t(int, copy, pfrag->size - pfrag->offset);
1696 page_address(pfrag->page) + pfrag->offset,
1697 offset, copy, skb->len, skb) < 0)
1700 pfrag->offset += copy;
1701 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1703 skb->data_len += copy;
1704 skb->truesize += copy;
1705 wmem_alloc_delta += copy;
1707 err = skb_zerocopy_iter_dgram(skb, from, copy);
/* Charge the accumulated truesize to the socket in one step. */
1715 if (wmem_alloc_delta)
1716 refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
/*
 * Failure unwind: abort the zerocopy reference, take the bytes that were
 * not placed back out of cork->length, count an output discard and still
 * charge the socket for whatever was queued.
 */
1723 sock_zerocopy_put_abort(uarg, extra_uref);
1724 cork->length -= length;
1725 IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1726 refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
/*
 * ip6_append_data - queue more data on the socket write queue.
 *
 * Front end to __ip6_append_data() that operates on sk->sk_write_queue
 * with the cork state kept in inet_sk(sk)->cork and inet6_sk(sk)->cork and
 * the page fragment returned by sk_page_frag(sk).
 *
 * When the write queue is empty the cork is initialised through
 * ip6_setup_cork(), and opt_flen of ipc6->opt (0 when there are no
 * options) is added to both length and transhdrlen.
 *
 * fl6 is pointed at the flow stored in the cork (inet->cork.fl.u.ip6) on a
 * path whose condition is not visible in this listing - presumably the
 * non-empty-queue case; TODO confirm.
 *
 * MSG_PROBE is tested up front; the statement it guards is not shown here.
 * The visible return path yields the value of __ip6_append_data().
 */
1730 int ip6_append_data(struct sock *sk,
1731 int getfrag(void *from, char *to, int offset, int len,
1732 int odd, struct sk_buff *skb),
1733 void *from, int length, int transhdrlen,
1734 struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1735 struct rt6_info *rt, unsigned int flags)
1737 struct inet_sock *inet = inet_sk(sk);
1738 struct ipv6_pinfo *np = inet6_sk(sk);
1742 if (flags&MSG_PROBE)
1744 if (skb_queue_empty(&sk->sk_write_queue)) {
1748 err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1753 exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1754 length += exthdrlen;
1755 transhdrlen += exthdrlen;
1757 fl6 = &inet->cork.fl.u.ip6;
1761 return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1762 &np->cork, sk_page_frag(sk), getfrag,
1763 from, length, transhdrlen, flags, ipc6);
1765 EXPORT_SYMBOL_GPL(ip6_append_data);
/*
 * ip6_cork_release - drop everything a cork holds on to.
 *
 * Frees the four option buffers hanging off v6_cork->opt (dst0opt,
 * dst1opt, hopopt, srcrt) and the options structure itself, then clears
 * the pointer. If cork->base.dst is set, releases that reference, clears
 * the pointer and removes IPCORK_ALLFRAG from cork->base.flags. Finally
 * zeroes the cached flow in cork->fl.
 *
 * NOTE(review): any NULL check guarding the v6_cork->opt dereferences is
 * not visible in this listing.
 */
1767 static void ip6_cork_release(struct inet_cork_full *cork,
1768 struct inet6_cork *v6_cork)
1771 kfree(v6_cork->opt->dst0opt);
1772 kfree(v6_cork->opt->dst1opt);
1773 kfree(v6_cork->opt->hopopt);
1774 kfree(v6_cork->opt->srcrt);
1775 kfree(v6_cork->opt);
1776 v6_cork->opt = NULL;
1779 if (cork->base.dst) {
1780 dst_release(cork->base.dst);
1781 cork->base.dst = NULL;
1782 cork->base.flags &= ~IPCORK_ALLFRAG;
1784 memset(&cork->fl, 0, sizeof(cork->fl));
/*
 * __ip6_make_skb - turn a queue of pending skbs into one IPv6 packet.
 *
 * @sk:      socket supplying priority, ignore_df policy and flow label
 *           settings
 * @queue:   pending skbs; the first one dequeued becomes the head skb
 * @cork:    cork holding the route (base.dst), the flow (fl.u.ip6), the
 *           mark and the transmit time
 * @v6_cork: IPv6 cork holding the tx options, traffic class and hop limit
 *
 * The remaining queued skbs are linked onto the frag_list of the head skb.
 * Extension headers are pushed from the options, then the fixed IPv6
 * header is pushed and filled in. The skb gets its own reference on the
 * route, output statistics are updated, and the cork is released.
 *
 * NOTE(review): the empty-queue handling and the return statement are on
 * lines not shown in this listing.
 */
1787 struct sk_buff *__ip6_make_skb(struct sock *sk,
1788 struct sk_buff_head *queue,
1789 struct inet_cork_full *cork,
1790 struct inet6_cork *v6_cork)
1792 struct sk_buff *skb, *tmp_skb;
1793 struct sk_buff **tail_skb;
1794 struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1795 struct ipv6_pinfo *np = inet6_sk(sk);
1796 struct net *net = sock_net(sk);
1797 struct ipv6hdr *hdr;
1798 struct ipv6_txoptions *opt = v6_cork->opt;
1799 struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1800 struct flowi6 *fl6 = &cork->fl.u.ip6;
1801 unsigned char proto = fl6->flowi6_proto;
1803 skb = __skb_dequeue(queue);
1806 tail_skb = &(skb_shinfo(skb)->frag_list);
1808 /* move skb->data to ip header from ext header */
1809 if (skb->data < skb_network_header(skb))
1810 __skb_pull(skb, skb_network_offset(skb));
/*
 * Chain every remaining queued skb onto the frag_list of the head skb,
 * after pulling the network header length off it, and fold its length and
 * truesize into the head. The chained skb loses its destructor.
 */
1811 while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1812 __skb_pull(tmp_skb, skb_network_header_len(skb));
1813 *tail_skb = tmp_skb;
1814 tail_skb = &(tmp_skb->next);
1815 skb->len += tmp_skb->len;
1816 skb->data_len += tmp_skb->len;
1817 skb->truesize += tmp_skb->truesize;
1818 tmp_skb->destructor = NULL;
1822 /* Allow local fragmentation. */
1823 skb->ignore_df = ip6_sk_ignore_df(sk);
/*
 * final_dst starts as the flow destination; it is passed by reference to
 * ipv6_push_nfrag_opts() and is what ends up in hdr->daddr below.
 */
1825 *final_dst = fl6->daddr;
1826 __skb_pull(skb, skb_network_header_len(skb));
1827 if (opt && opt->opt_flen)
1828 ipv6_push_frag_opts(skb, opt, &proto);
1829 if (opt && opt->opt_nflen)
1830 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
/* Prepend the fixed IPv6 header and fill it from the cork and the flow. */
1832 skb_push(skb, sizeof(struct ipv6hdr));
1833 skb_reset_network_header(skb);
1834 hdr = ipv6_hdr(skb);
1836 ip6_flow_hdr(hdr, v6_cork->tclass,
1837 ip6_make_flowlabel(net, skb, fl6->flowlabel,
1838 ip6_autoflowlabel(net, np), fl6));
1839 hdr->hop_limit = v6_cork->hop_limit;
1840 hdr->nexthdr = proto;
1841 hdr->saddr = fl6->saddr;
1842 hdr->daddr = *final_dst;
1844 skb->priority = sk->sk_priority;
1845 skb->mark = cork->base.mark;
1847 skb->tstamp = cork->base.transmit_time;
/* The skb takes its own route reference; the cork one is dropped below. */
1849 skb_dst_set(skb, dst_clone(&rt->dst));
1850 IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1851 if (proto == IPPROTO_ICMPV6) {
1852 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1854 ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1855 ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1858 ip6_cork_release(cork, v6_cork);
/*
 * ip6_send_skb - hand a finished skb to ip6_local_out().
 *
 * The namespace is taken from skb->sk and the route from skb_dst(skb),
 * both read before the skb is passed on. The result of ip6_local_out() is
 * translated with net_xmit_errno() and IPSTATS_MIB_OUTDISCARDS is bumped
 * for rt->rt6i_idev; the conditions guarding those two steps and the
 * return statement are on lines not shown in this listing.
 */
1863 int ip6_send_skb(struct sk_buff *skb)
1865 struct net *net = sock_net(skb->sk);
1866 struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1869 err = ip6_local_out(net, skb->sk, skb);
1872 err = net_xmit_errno(err);
1874 IP6_INC_STATS(net, rt->rt6i_idev,
1875 IPSTATS_MIB_OUTDISCARDS);
/*
 * ip6_push_pending_frames - build the pending packet and transmit it.
 *
 * Obtains the skb from ip6_finish_skb(sk) and returns the result of
 * ip6_send_skb() on it.
 *
 * NOTE(review): handling of a NULL skb from ip6_finish_skb() is not
 * visible in this listing.
 */
1881 int ip6_push_pending_frames(struct sock *sk)
1883 struct sk_buff *skb;
1885 skb = ip6_finish_skb(sk);
1889 return ip6_send_skb(skb);
1891 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
/*
 * __ip6_flush_pending_frames - discard a queue of pending skbs.
 *
 * Removes skbs from the tail of @queue until it is empty, counting
 * IPSTATS_MIB_OUTDISCARDS against the inet6 device of the skb route, and
 * then releases the cork state with ip6_cork_release().
 *
 * NOTE(review): the condition guarding the statistics update and the
 * statement that disposes of each skb are on lines not shown here.
 */
1893 static void __ip6_flush_pending_frames(struct sock *sk,
1894 struct sk_buff_head *queue,
1895 struct inet_cork_full *cork,
1896 struct inet6_cork *v6_cork)
1898 struct sk_buff *skb;
1900 while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1902 IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1903 IPSTATS_MIB_OUTDISCARDS);
1907 ip6_cork_release(cork, v6_cork);
/*
 * ip6_flush_pending_frames - discard everything pending on a socket.
 *
 * Calls __ip6_flush_pending_frames() on sk->sk_write_queue with the cork
 * state stored in inet_sk(sk)->cork and inet6_sk(sk)->cork.
 */
1910 void ip6_flush_pending_frames(struct sock *sk)
1912 __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1913 &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1915 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
/*
 * ip6_make_skb - build one IPv6 packet without using the socket write queue.
 *
 * @sk:          sending socket
 * @getfrag:     copy callback handed through to __ip6_append_data()
 * @from:        opaque source cookie for @getfrag
 * @length:      payload length; opt_flen of ipc6->opt is added on top
 * @transhdrlen: transport header length; opt_flen is added on top as well
 * @ipc6:        per-call control data; a negative ->dontfrag is replaced
 *               by the socket default inet6_sk(sk)->dontfrag
 * @fl6:         flow used for cork setup and appending
 * @rt:          route passed to ip6_setup_cork()
 * @flags:       MSG_* flags; MSG_PROBE is tested up front (the statement
 *               it guards is not shown in this listing)
 * @cork:        caller-provided cork; its base flags, addr, opt and dst
 *               are reset here before setup
 *
 * Works on a local skb queue and a local inet6_cork instead of the state
 * stored in the socket. If ip6_setup_cork() fails the cork is released and
 * ERR_PTR(err) is returned; if __ip6_append_data() fails the local queue
 * is flushed and ERR_PTR(err) is returned. Otherwise the result of
 * __ip6_make_skb() is returned.
 */
1917 struct sk_buff *ip6_make_skb(struct sock *sk,
1918 int getfrag(void *from, char *to, int offset,
1919 int len, int odd, struct sk_buff *skb),
1920 void *from, int length, int transhdrlen,
1921 struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1922 struct rt6_info *rt, unsigned int flags,
1923 struct inet_cork_full *cork)
1925 struct inet6_cork v6_cork;
1926 struct sk_buff_head queue;
1927 int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1930 if (flags & MSG_PROBE)
1933 __skb_queue_head_init(&queue);
/* Start from a clean generic cork before ip6_setup_cork() fills it in. */
1935 cork->base.flags = 0;
1936 cork->base.addr = 0;
1937 cork->base.opt = NULL;
1938 cork->base.dst = NULL;
1940 err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6);
1942 ip6_cork_release(cork, &v6_cork);
1943 return ERR_PTR(err);
/* A negative dontfrag means unset: fall back to the socket setting. */
1945 if (ipc6->dontfrag < 0)
1946 ipc6->dontfrag = inet6_sk(sk)->dontfrag;
/* Paged data is staged in the page fragment of the current task. */
1948 err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork,
1949 &current->task_frag, getfrag, from,
1950 length + exthdrlen, transhdrlen + exthdrlen,
1953 __ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
1954 return ERR_PTR(err);
1957 return __ip6_make_skb(sk, &queue, cork, &v6_cork);