1 // SPDX-License-Identifier: GPL-2.0-or-later
3 * IPv6 output functions
4 * Linux INET6 implementation
7 * Pedro Roque <roque@di.fc.ul.pt>
9 * Based on linux/net/ipv4/ip_output.c
12 * A.N.Kuznetsov : arithmetics in fragmentation.
13 * extension headers are implemented.
14 * route changes now work.
15 * ip6_forward does not confuse sniffers.
18 * H. von Brand : Added missing #include <linux/string.h>
19 * Imran Patel : frag id should be in NBO
20 * Kazunori MIYAZAWA @USAGI
21 * : add ip6_append_data and related functions
25 #include <linux/errno.h>
26 #include <linux/kernel.h>
27 #include <linux/string.h>
28 #include <linux/socket.h>
29 #include <linux/net.h>
30 #include <linux/netdevice.h>
31 #include <linux/if_arp.h>
32 #include <linux/in6.h>
33 #include <linux/tcp.h>
34 #include <linux/route.h>
35 #include <linux/module.h>
36 #include <linux/slab.h>
38 #include <linux/bpf-cgroup.h>
39 #include <linux/netfilter.h>
40 #include <linux/netfilter_ipv6.h>
46 #include <net/ndisc.h>
47 #include <net/protocol.h>
48 #include <net/ip6_route.h>
49 #include <net/addrconf.h>
50 #include <net/rawv6.h>
53 #include <net/checksum.h>
54 #include <linux/mroute6.h>
55 #include <net/l3mdev.h>
56 #include <net/lwtunnel.h>
57 #include <net/ip_tunnels.h>
/*
 * ip6_finish_output2 - last step of IPv6 output: resolve the neighbour for
 * the route's nexthop and hand the skb to neigh_output(). Also loops back /
 * clones multicast packets where required and honours lwtunnel redirects.
 * NOTE(review): this listing elides several lines (braces, labels, returns);
 * code below is kept byte-identical to the listing.
 */
59 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
61 struct dst_entry *dst = skb_dst(skb);
62 struct net_device *dev = dst->dev;
63 struct inet6_dev *idev = ip6_dst_idev(dst);
64 unsigned int hh_len = LL_RESERVED_SPACE(dev);
65 const struct in6_addr *daddr, *nexthop;
67 struct neighbour *neigh;
70 /* Be paranoid, rather than too clever. */
71 if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) {
/* Grow headroom for the device's link-layer header; a failed expansion
 * is counted as an output discard (error path elided in this listing). */
72 skb = skb_expand_head(skb, hh_len);
74 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
/* Multicast: decide whether a local copy must be looped back (socket has
 * mc_loop set, packet not already forwarded, or we are a group member). */
81 if (ipv6_addr_is_multicast(daddr)) {
82 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
83 ((mroute6_is_socket(net, skb) &&
84 !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
85 ipv6_chk_mcast_addr(dev, daddr, &hdr->saddr))) {
86 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
88 /* Do not check for IFF_ALLMULTI; multicast routing
89 is not supported in any case.
/* Re-enter POST_ROUTING for the looped-back clone. */
92 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
93 net, sk, newskb, NULL, newskb->dev,
/* hop_limit 0 on a multicast packet: discard instead of transmitting. */
96 if (hdr->hop_limit == 0) {
97 IP6_INC_STATS(net, idev,
98 IPSTATS_MIB_OUTDISCARDS);
104 IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
/* Node-local scoped multicast must never leave a non-loopback device. */
105 if (IPV6_ADDR_MC_SCOPE(daddr) <= IPV6_ADDR_SCOPE_NODELOCAL &&
106 !(dev->flags & IFF_LOOPBACK)) {
/* Lightweight tunnel may take over transmission entirely. */
112 if (lwtunnel_xmit_redirect(dst->lwtstate)) {
113 int res = lwtunnel_xmit(skb);
115 if (res != LWTUNNEL_XMIT_CONTINUE)
/* Resolve (or create) the neighbour entry for the nexthop and transmit. */
120 nexthop = rt6_nexthop((struct rt6_info *)dst, daddr);
121 neigh = __ipv6_neigh_lookup_noref(dev, nexthop);
122 if (unlikely(!neigh))
123 neigh = __neigh_create(&nd_tbl, nexthop, dev, false);
124 if (!IS_ERR(neigh)) {
125 sock_confirm_neigh(skb, neigh);
126 ret = neigh_output(neigh, skb, false);
127 rcu_read_unlock_bh();
/* Neighbour creation failed: no route to host. */
130 rcu_read_unlock_bh();
132 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES);
/*
 * ip6_finish_output_gso_slowpath_drop - software-segment a GSO skb whose
 * segments would exceed @mtu, then fragment and transmit each segment
 * individually via ip6_fragment()/ip6_finish_output2().
 * NOTE(review): listing elides some lines; code kept byte-identical.
 */
138 ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
139 struct sk_buff *skb, unsigned int mtu)
141 struct sk_buff *segs, *nskb;
142 netdev_features_t features;
145 /* Please see corresponding comment in ip_finish_output_gso
146 * describing the cases where GSO segment length exceeds the
/* Segment in software: strip the GSO feature bits so the stack does it. */
149 features = netif_skb_features(skb);
150 segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
151 if (IS_ERR_OR_NULL(segs)) {
/* Detach each segment from the list and fragment/transmit it. */
158 skb_list_walk_safe(segs, segs, nskb) {
161 skb_mark_not_on_list(segs);
162 err = ip6_fragment(net, sk, segs, ip6_finish_output2);
/*
 * __ip6_finish_output - decide between direct transmit, GSO slow path and
 * fragmentation, based on the dst MTU and IP6CB fragmentation constraints.
 * NOTE(review): listing elides some lines; code kept byte-identical.
 */
170 static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
174 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
175 /* Policy lookup after SNAT yielded a new policy */
176 if (skb_dst(skb)->xfrm) {
177 IP6CB(skb)->flags |= IP6SKB_REROUTED;
178 return dst_output(net, sk, skb);
182 mtu = ip6_skb_dst_mtu(skb);
/* GSO skb whose network-layer length exceeds the MTU: segment+fragment. */
183 if (skb_is_gso(skb) && !skb_gso_validate_network_len(skb, mtu))
184 return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);
/* Non-GSO oversize, allfrag dst, or a smaller received fragment size
 * recorded by conntrack defrag all force fragmentation. */
186 if ((skb->len > mtu && !skb_is_gso(skb)) ||
187 dst_allfrag(skb_dst(skb)) ||
188 (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
189 return ip6_fragment(net, sk, skb, ip6_finish_output2);
191 return ip6_finish_output2(net, sk, skb);
/*
 * ip6_finish_output - run the cgroup BPF egress program, then continue to
 * __ip6_finish_output() unless BPF dropped the packet.
 * NOTE(review): listing elides the switch scaffolding; code kept verbatim.
 */
194 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
198 ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
200 case NET_XMIT_SUCCESS:
201 return __ip6_finish_output(net, sk, skb);
/* Propagate the BPF verdict when the transmit itself succeeded. */
203 return __ip6_finish_output(net, sk, skb) ? : ret;
/*
 * ip6_output - standard dst_output() entry point for IPv6.
 * Discards packets if IPv6 is disabled on the egress device, otherwise
 * passes the skb through the NF_INET_POST_ROUTING hook (skipped for
 * rerouted packets) into ip6_finish_output().
 */
210 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
212 struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;
213 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
215 skb->protocol = htons(ETH_P_IPV6);
218 if (unlikely(idev->cnf.disable_ipv6)) {
219 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
/* Skip POST_ROUTING when this packet was already rerouted by xfrm. */
224 return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
225 net, sk, skb, indev, dev,
227 !(IP6CB(skb)->flags & IP6SKB_REROUTED));
229 EXPORT_SYMBOL(ip6_output);
/*
 * ip6_autoflowlabel - whether to auto-generate flow labels for this socket:
 * the per-socket setting if it was set explicitly, otherwise the netns
 * default from ip6_default_np_autolabel().
 */
231 bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
233 if (!np->autoflowlabel_set)
234 return ip6_default_np_autolabel(net);
236 return np->autoflowlabel;
240 * xmit an sk_buff (used by TCP, SCTP and DCCP)
241 * Note : socket lock is not held for SYNACK packets, but might be modified
242 * by calls to skb_set_owner_w() and ipv6_local_error(),
243 * which are using proper atomic operations or spinlocks.
/*
 * Pushes extension headers (if @opt) and the IPv6 header onto @skb, fills
 * in flow label/hop limit/addresses, then sends via NF_INET_LOCAL_OUT when
 * the packet fits the path MTU (or ignore_df/GSO applies); otherwise
 * reports EMSGSIZE to the socket and counts a fragmentation failure.
 * NOTE(review): listing elides some lines; code kept byte-identical.
 */
245 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
246 __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
248 struct net *net = sock_net(sk);
249 const struct ipv6_pinfo *np = inet6_sk(sk);
250 struct in6_addr *first_hop = &fl6->daddr;
251 struct dst_entry *dst = skb_dst(skb);
252 struct net_device *dev = dst->dev;
253 struct inet6_dev *idev = ip6_dst_idev(dst);
254 unsigned int head_room;
256 u8 proto = fl6->flowi6_proto;
257 int seg_len = skb->len;
/* Headroom: IPv6 header + link layer, plus any extension headers. */
261 head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dev);
263 head_room += opt->opt_nflen + opt->opt_flen;
265 if (unlikely(head_room > skb_headroom(skb))) {
266 skb = skb_expand_head(skb, head_room);
268 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
/* Extension headers count towards the payload length; push fragmentable
 * then non-fragmentable options (the latter may rewrite first_hop). */
274 seg_len += opt->opt_nflen + opt->opt_flen;
277 ipv6_push_frag_opts(skb, opt, &proto);
280 ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
284 skb_push(skb, sizeof(struct ipv6hdr));
285 skb_reset_network_header(skb);
289 * Fill in the IPv6 header
/* Hop limit: per-socket value if set, else the dst's default. */
292 hlimit = np->hop_limit;
294 hlimit = ip6_dst_hoplimit(dst);
296 ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
297 ip6_autoflowlabel(net, np), fl6));
299 hdr->payload_len = htons(seg_len);
300 hdr->nexthdr = proto;
301 hdr->hop_limit = hlimit;
303 hdr->saddr = fl6->saddr;
304 hdr->daddr = *first_hop;
306 skb->protocol = htons(ETH_P_IPV6);
307 skb->priority = priority;
311 if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
312 IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len);
314 /* if egress device is enslaved to an L3 master device pass the
315 * skb to its handler for processing
317 skb = l3mdev_ip6_out((struct sock *)sk, skb);
321 /* hooks should never assume socket lock is held.
322 * we promote our socket to non const
324 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
325 net, (struct sock *)sk, skb, NULL, dev,
/* Packet too big and DF honoured: tell the socket and drop. */
330 /* ipv6_local_error() does not require socket lock,
331 * we promote our socket to non const
333 ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
335 IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS);
339 EXPORT_SYMBOL(ip6_xmit);
/*
 * ip6_call_ra_chain - deliver a Router Alert packet to every raw socket
 * registered on ip6_ra_chain with a matching @sel value (and matching
 * bound device, if any). Clones for all but the last matching socket.
 * NOTE(review): listing elides some lines; code kept byte-identical.
 */
341 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
343 struct ip6_ra_chain *ra;
344 struct sock *last = NULL;
346 read_lock(&ip6_ra_lock);
347 for (ra = ip6_ra_chain; ra; ra = ra->next) {
348 struct sock *sk = ra->sk;
349 if (sk && ra->sel == sel &&
350 (!sk->sk_bound_dev_if ||
351 sk->sk_bound_dev_if == skb->dev->ifindex)) {
352 struct ipv6_pinfo *np = inet6_sk(sk);
/* Optionally isolate RA delivery to the skb's own netns. */
354 if (np && np->rtalert_isolate &&
355 !net_eq(sock_net(sk), dev_net(skb->dev))) {
/* A previous match exists: give it a clone, keep the original for last. */
359 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
361 rawv6_rcv(last, skb2);
/* Last matching socket consumes the original skb. */
368 rawv6_rcv(last, skb);
369 read_unlock(&ip6_ra_lock);
372 read_unlock(&ip6_ra_lock);
/*
 * ip6_forward_proxy_check - for a packet destined to a proxied (pneigh)
 * address, decide whether it should be handled locally (NDP messages) or
 * rejected (link-local destinations cannot be proxied).
 * NOTE(review): listing elides some lines; code kept byte-identical.
 */
376 static int ip6_forward_proxy_check(struct sk_buff *skb)
378 struct ipv6hdr *hdr = ipv6_hdr(skb);
379 u8 nexthdr = hdr->nexthdr;
/* Skip extension headers to find the real transport protocol. */
383 if (ipv6_ext_hdr(nexthdr)) {
384 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
388 offset = sizeof(struct ipv6hdr);
390 if (nexthdr == IPPROTO_ICMPV6) {
391 struct icmp6hdr *icmp6;
/* Make sure at least the ICMPv6 type byte is linear before reading it. */
393 if (!pskb_may_pull(skb, (skb_network_header(skb) +
394 offset + 1 - skb->data)))
397 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
399 switch (icmp6->icmp6_type) {
400 case NDISC_ROUTER_SOLICITATION:
401 case NDISC_ROUTER_ADVERTISEMENT:
402 case NDISC_NEIGHBOUR_SOLICITATION:
403 case NDISC_NEIGHBOUR_ADVERTISEMENT:
405 /* For reaction involving unicast neighbor discovery
406 * message destined to the proxied address, pass it to
416 * The proxying router can't forward traffic sent to a link-local
417 * address, so signal the sender and discard the packet. This
418 * behavior is clarified by the MIPv6 specification.
420 if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
421 dst_link_failure(skb);
/*
 * ip6_forward_finish - account the forwarded datagram (OUTFORWDATAGRAMS /
 * OUTOCTETS), clear any switchdev L3 offload mark, and hand the packet to
 * dst_output() for transmission.
 */
428 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
431 struct dst_entry *dst = skb_dst(skb);
433 __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
434 __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
436 #ifdef CONFIG_NET_SWITCHDEV
437 if (skb->offload_l3_fwd_mark) {
444 return dst_output(net, sk, skb);
/*
 * ip6_pkt_too_big - true if @skb cannot be forwarded through a path with
 * the given @mtu (taking the conntrack-defrag frag_max_size and GSO
 * segment sizes into account).
 */
447 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
452 /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
453 if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
/* GSO skb whose segments all fit the MTU is not too big. */
459 if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
/*
 * ip6_forward - main IPv6 forwarding path: validates the packet (hop
 * limit, policy, source address class, MTU), handles Router Alert and
 * proxy-NDP special cases, sends redirects when appropriate, decrements
 * the hop limit and passes the packet through NF_INET_FORWARD to
 * ip6_forward_finish().
 * NOTE(review): listing elides some lines (labels, drops, returns);
 * code below is kept byte-identical to the listing.
 */
465 int ip6_forward(struct sk_buff *skb)
467 struct dst_entry *dst = skb_dst(skb);
468 struct ipv6hdr *hdr = ipv6_hdr(skb);
469 struct inet6_skb_parm *opt = IP6CB(skb);
470 struct net *net = dev_net(dst->dev);
471 struct inet6_dev *idev;
474 idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
475 if (net->ipv6.devconf_all->forwarding == 0)
478 if (skb->pkt_type != PACKET_HOST)
481 if (unlikely(skb->sk))
484 if (skb_warn_if_lro(skb))
/* XFRM forward-policy check unless policy is disabled on input dev. */
487 if (!net->ipv6.devconf_all->disable_policy &&
488 (!idev || !idev->cnf.disable_policy) &&
489 !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
490 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
494 skb_forward_csum(skb);
497 * We DO NOT make any processing on
498 * RA packets, pushing them to user level AS IS
499 * without any WARRANTY that application will be able
500 * to interpret them. The reason is that we
501 * cannot make anything clever here.
503 * We are not end-node, so that if packet contains
504 * AH/ESP, we cannot make anything.
505 * Defragmentation also would be mistake, RA packets
506 * cannot be fragmented, because there is no warranty
507 * that different fragments will go along one path. --ANK
509 if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
510 if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
515 * check and decrement ttl
517 if (hdr->hop_limit <= 1) {
518 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
519 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
525 /* XXX: idev->cnf.proxy_ndp? */
526 if (net->ipv6.devconf_all->proxy_ndp &&
527 pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
528 int proxied = ip6_forward_proxy_check(skb);
530 /* It's tempting to decrease the hop limit
531 * here by 1, as we do at the end of the
534 * But that would be incorrect, as proxying is
535 * not forwarding. The ip6_input function
536 * will handle this packet locally, and it
537 * depends on the hop limit being unchanged.
539 * One example is the NDP hop limit, that
540 * always has to stay 255, but other would be
541 * similar checks around RA packets, where the
542 * user can even change the desired limit.
544 return ip6_input(skb);
545 } else if (proxied < 0) {
546 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
551 if (!xfrm6_route_forward(skb)) {
552 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
557 /* IPv6 specs say nothing about it, but it is clear that we cannot
558 send redirects to source routed frames.
559 We don't send redirects to frames decapsulated from IPsec.
561 if (IP6CB(skb)->iif == dst->dev->ifindex &&
562 opt->srcrt == 0 && !skb_sec_path(skb)) {
563 struct in6_addr *target = NULL;
564 struct inet_peer *peer;
568 * incoming and outgoing devices are the same
572 rt = (struct rt6_info *) dst;
573 if (rt->rt6i_flags & RTF_GATEWAY)
574 target = &rt->rt6i_gateway;
576 target = &hdr->daddr;
/* Rate-limit redirects per destination via the inetpeer cache. */
578 peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
580 /* Limit redirects both by destination (here)
581 and by source (inside ndisc_send_redirect)
583 if (inet_peer_xrlim_allow(peer, 1*HZ))
584 ndisc_send_redirect(skb, target);
588 int addrtype = ipv6_addr_type(&hdr->saddr);
590 /* This check is security critical. */
591 if (addrtype == IPV6_ADDR_ANY ||
592 addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
594 if (addrtype & IPV6_ADDR_LINKLOCAL) {
595 icmpv6_send(skb, ICMPV6_DEST_UNREACH,
596 ICMPV6_NOT_NEIGHBOUR, 0);
601 mtu = ip6_dst_mtu_maybe_forward(dst, true);
602 if (mtu < IPV6_MIN_MTU)
605 if (ip6_pkt_too_big(skb, mtu)) {
606 /* Again, force OUTPUT device used as source address */
608 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
609 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
610 __IP6_INC_STATS(net, ip6_dst_idev(dst),
611 IPSTATS_MIB_FRAGFAILS);
/* Private copy before we modify the header (hop limit decrement). */
616 if (skb_cow(skb, dst->dev->hard_header_len)) {
617 __IP6_INC_STATS(net, ip6_dst_idev(dst),
618 IPSTATS_MIB_OUTDISCARDS);
624 /* Mangling hops number delayed to point after skb COW */
628 return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
629 net, NULL, skb, skb->dev, dst->dev,
633 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
/*
 * ip6_copy_metadata - copy per-skb metadata (pkt_type, priority, protocol,
 * dst reference, mark, hash, tc index, extensions, secmark) from @from to
 * @to; used when building fragments so each one carries the parent's state.
 */
639 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
641 to->pkt_type = from->pkt_type;
642 to->priority = from->priority;
643 to->protocol = from->protocol;
645 skb_dst_set(to, dst_clone(skb_dst(from)));
647 to->mark = from->mark;
649 skb_copy_hash(to, from);
651 #ifdef CONFIG_NET_SCHED
652 to->tc_index = from->tc_index;
655 skb_ext_copy(to, from);
656 skb_copy_secmark(to, from);
/*
 * ip6_fraglist_init - set up fast-path fragmentation over an existing
 * frag_list: duplicates the network headers into iter->tmp_hdr, detaches
 * the frag list, and rewrites @skb as the first fragment (inserting a
 * fragment header with IP6_MF set).
 * NOTE(review): listing elides some lines; code kept byte-identical.
 */
659 int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
660 u8 nexthdr, __be32 frag_id,
661 struct ip6_fraglist_iter *iter)
663 unsigned int first_len;
/* The previous header now chains to a Fragment header. */
667 *prevhdr = NEXTHDR_FRAGMENT;
668 iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
672 iter->frag = skb_shinfo(skb)->frag_list;
673 skb_frag_list_init(skb);
677 iter->frag_id = frag_id;
678 iter->nexthdr = nexthdr;
/* Open a gap for the fragment header, then restore the copied headers. */
680 __skb_pull(skb, hlen);
681 fh = __skb_push(skb, sizeof(struct frag_hdr));
682 __skb_push(skb, hlen);
683 skb_reset_network_header(skb);
684 memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);
686 fh->nexthdr = nexthdr;
688 fh->frag_off = htons(IP6_MF);
689 fh->identification = frag_id;
/* Trim the head skb to its own (paged) data; fix payload_len to match. */
691 first_len = skb_pagelen(skb);
692 skb->data_len = first_len - skb_headlen(skb);
693 skb->len = first_len;
694 ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));
698 EXPORT_SYMBOL(ip6_fraglist_init);
/*
 * ip6_fraglist_prepare - turn the next frag-list member into a standalone
 * fragment: prepend fragment header + copied network headers, advance the
 * running fragment offset, and copy metadata from the previous skb.
 * NOTE(review): listing elides some lines; code kept byte-identical.
 */
700 void ip6_fraglist_prepare(struct sk_buff *skb,
701 struct ip6_fraglist_iter *iter)
703 struct sk_buff *frag = iter->frag;
704 unsigned int hlen = iter->hlen;
707 frag->ip_summed = CHECKSUM_NONE;
708 skb_reset_transport_header(frag);
709 fh = __skb_push(frag, sizeof(struct frag_hdr));
710 __skb_push(frag, hlen);
711 skb_reset_network_header(frag);
712 memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
/* Offset advances by the payload of the fragment just completed. */
713 iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
714 fh->nexthdr = iter->nexthdr;
716 fh->frag_off = htons(iter->offset);
/* IP6_MF set on every fragment except the last (condition elided). */
718 fh->frag_off |= htons(IP6_MF);
719 fh->identification = iter->frag_id;
720 ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
721 ip6_copy_metadata(frag, skb);
723 EXPORT_SYMBOL(ip6_fraglist_prepare);
/*
 * ip6_frag_init - initialize the slow-path fragmentation state: remaining
 * payload (left), read cursor (ptr), and head/tail room reservations for
 * each fragment to be allocated by ip6_frag_next().
 */
725 void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
726 unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
727 u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
729 state->prevhdr = prevhdr;
730 state->nexthdr = nexthdr;
731 state->frag_id = frag_id;
736 state->left = skb->len - hlen; /* Space per frame */
737 state->ptr = hlen; /* Where to start from */
739 state->hroom = hdr_room;
740 state->troom = needed_tailroom;
744 EXPORT_SYMBOL(ip6_frag_init);
/*
 * ip6_frag_next - allocate and build the next fragment in the slow path:
 * copies the network headers and a block of payload from @skb, patches the
 * previous-header byte to NEXTHDR_FRAGMENT, and fills in the fragment
 * header (offset, MF, id). Returns the new skb or ERR_PTR(-ENOMEM).
 * NOTE(review): listing elides some lines; code kept byte-identical.
 */
746 struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
748 u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
749 struct sk_buff *frag;
754 /* IF: it doesn't fit, use 'mtu' - the data space left */
755 if (len > state->mtu)
757 /* IF: we are not sending up to and including the packet end
758 then align the next start on an eight byte boundary */
759 if (len < state->left)
762 /* Allocate buffer */
763 frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
764 state->hroom + state->troom, GFP_ATOMIC);
766 return ERR_PTR(-ENOMEM);
769 * Set up data on packet
772 ip6_copy_metadata(frag, skb);
773 skb_reserve(frag, state->hroom);
774 skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
775 skb_reset_network_header(frag);
776 fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
777 frag->transport_header = (frag->network_header + state->hlen +
778 sizeof(struct frag_hdr));
781 * Charge the memory for the fragment to any owner
785 skb_set_owner_w(frag, skb->sk);
788 * Copy the packet header into the new buffer.
790 skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);
/* Patch the copied headers so the chain points at the Fragment header. */
792 fragnexthdr_offset = skb_network_header(frag);
793 fragnexthdr_offset += prevhdr - skb_network_header(skb);
794 *fragnexthdr_offset = NEXTHDR_FRAGMENT;
797 * Build fragment header.
799 fh->nexthdr = state->nexthdr;
801 fh->identification = state->frag_id;
804 * Copy a block of the IP datagram.
806 BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
810 fh->frag_off = htons(state->offset);
812 fh->frag_off |= htons(IP6_MF);
813 ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
816 state->offset += len;
820 EXPORT_SYMBOL(ip6_frag_next);
/*
 * ip6_fragment - fragment @skb and transmit each piece via @output.
 * Uses the fast path (ip6_fraglist_*) when the skb already carries a
 * well-formed frag_list, otherwise falls back to the slow copy path
 * (ip6_frag_init/ip6_frag_next). Sends PKT_TOOBIG when fragmentation is
 * not permitted. Updates FRAGCREATES/FRAGOKS/FRAGFAILS MIB counters.
 * NOTE(review): listing elides many lines (labels, checks, returns);
 * code below is kept byte-identical to the listing.
 */
822 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
823 int (*output)(struct net *, struct sock *, struct sk_buff *))
825 struct sk_buff *frag;
826 struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
827 struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
828 inet6_sk(skb->sk) : NULL;
829 struct ip6_frag_state state;
830 unsigned int mtu, hlen, nexthdr_offset;
831 ktime_t tstamp = skb->tstamp;
834 u8 *prevhdr, nexthdr = 0;
836 err = ip6_find_1stfragopt(skb, &prevhdr);
/* Remember prevhdr as an offset: skb data may be reallocated below. */
841 nexthdr_offset = prevhdr - skb_network_header(skb);
843 mtu = ip6_skb_dst_mtu(skb);
845 /* We must not fragment if the socket is set to force MTU discovery
846 * or if the skb it not generated by a local socket.
848 if (unlikely(!skb->ignore_df && skb->len > mtu))
851 if (IP6CB(skb)->frag_max_size) {
852 if (IP6CB(skb)->frag_max_size > mtu)
855 /* don't send fragments larger than what we received */
856 mtu = IP6CB(skb)->frag_max_size;
857 if (mtu < IPV6_MIN_MTU)
/* Honour a smaller per-socket IPV6_MTU (frag_size), if set. */
861 if (np && np->frag_size < mtu) {
865 if (mtu < hlen + sizeof(struct frag_hdr) + 8)
867 mtu -= hlen + sizeof(struct frag_hdr);
869 frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
870 &ipv6_hdr(skb)->saddr);
/* Checksum must be finalized before the payload is split up. */
872 if (skb->ip_summed == CHECKSUM_PARTIAL &&
873 (err = skb_checksum_help(skb)))
876 prevhdr = skb_network_header(skb) + nexthdr_offset;
877 hroom = LL_RESERVED_SPACE(rt->dst.dev);
878 if (skb_has_frag_list(skb)) {
879 unsigned int first_len = skb_pagelen(skb);
880 struct ip6_fraglist_iter iter;
881 struct sk_buff *frag2;
/* Fast path only if geometry is right: every piece fits the MTU,
 * non-final pieces are multiples of 8, and headroom suffices. */
883 if (first_len - hlen > mtu ||
884 ((first_len - hlen) & 7) ||
886 skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
889 skb_walk_frags(skb, frag) {
890 /* Correct geometry. */
891 if (frag->len > mtu ||
892 ((frag->len & 7) && frag->next) ||
893 skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
894 goto slow_path_clean;
896 /* Partially cloned skb? */
897 if (skb_shared(frag))
898 goto slow_path_clean;
/* Take over socket wmem accounting for the list members. */
903 frag->destructor = sock_wfree;
905 skb->truesize -= frag->truesize;
908 err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
913 /* We prevent @rt from being freed. */
917 /* Prepare header of the next frame,
918 * before previous one went down. */
920 ip6_fraglist_prepare(skb, &iter);
922 skb->tstamp = tstamp;
923 err = output(net, sk, skb);
925 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
926 IPSTATS_MIB_FRAGCREATES);
928 if (err || !iter.frag)
931 skb = ip6_fraglist_next(&iter);
937 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
938 IPSTATS_MIB_FRAGOKS);
/* Fast-path failure: drop the remaining, not-yet-sent fragments. */
943 kfree_skb_list(iter.frag);
945 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
946 IPSTATS_MIB_FRAGFAILS);
/* slow_path_clean: undo the wmem-accounting takeover done above. */
951 skb_walk_frags(skb, frag2) {
955 frag2->destructor = NULL;
956 skb->truesize += frag2->truesize;
962 * Fragment the datagram.
965 ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
966 LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
970 * Keep copying data until we run out.
973 while (state.left > 0) {
974 frag = ip6_frag_next(skb, &state);
981 * Put this fragment into the sending queue.
983 frag->tstamp = tstamp;
984 err = output(net, sk, frag);
988 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
989 IPSTATS_MIB_FRAGCREATES);
991 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
992 IPSTATS_MIB_FRAGOKS);
/* Fragmentation forbidden: disable GSO on allfrag sockets and report. */
997 if (skb->sk && dst_allfrag(skb_dst(skb)))
998 sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
1000 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
1004 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1005 IPSTATS_MIB_FRAGFAILS);
/*
 * ip6_rt_check - true (route invalid) when @fl_addr matches neither the
 * /128 host route key nor the cached destination address. Used to decide
 * whether a socket's cached route can still be used for this flow.
 */
1010 static inline int ip6_rt_check(const struct rt6key *rt_key,
1011 const struct in6_addr *fl_addr,
1012 const struct in6_addr *addr_cache)
1014 return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
1015 (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
/*
 * ip6_sk_dst_check - validate a socket's cached dst against the flow:
 * rejects non-IPv6 dsts and routes whose destination/source keys or
 * outgoing interface no longer match @fl6.
 * NOTE(review): listing elides some lines; code kept byte-identical.
 */
1018 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
1019 struct dst_entry *dst,
1020 const struct flowi6 *fl6)
1022 struct ipv6_pinfo *np = inet6_sk(sk);
1023 struct rt6_info *rt;
1028 if (dst->ops->family != AF_INET6) {
1033 rt = (struct rt6_info *)dst;
1034 /* Yes, checking route validity in not connected
1035 * case is not very simple. Take into account,
1036 * that we do not support routing by source, TOS,
1037 * and MSG_DONTROUTE --ANK (980726)
1039 * 1. ip6_rt_check(): If route was host route,
1040 * check that cached destination is current.
1041 * If it is network route, we still may
1042 * check its validity using saved pointer
1043 * to the last used address: daddr_cache.
1044 * We do not want to save whole address now,
1045 * (because main consumer of this service
1046 * is tcp, which has not this problem),
1047 * so that the last trick works only on connected
1049 * 2. oif also should be the same.
1051 if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
1052 #ifdef CONFIG_IPV6_SUBTREES
1053 ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
1055 (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
1056 (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
/*
 * ip6_dst_lookup_tail - core route lookup for a flow: picks a source
 * address when the flow has none, retries the lookup with the selected
 * saddr, and (with optimistic DAD) may re-route via the default router
 * when the chosen source address is still optimistic. Rejects v4-mapped
 * source with non-v4-mapped destination.
 * NOTE(review): listing elides many lines; code kept byte-identical.
 */
1065 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
1066 struct dst_entry **dst, struct flowi6 *fl6)
1068 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1069 struct neighbour *n;
1070 struct rt6_info *rt;
1075 /* The correct way to handle this would be to do
1076 * ip6_route_get_saddr, and then ip6_route_output; however,
1077 * the route-specific preferred source forces the
1078 * ip6_route_output call _before_ ip6_route_get_saddr.
1080 * In source specific routing (no src=any default route),
1081 * ip6_route_output will fail given src=any saddr, though, so
1082 * that's why we try it again later.
1084 if (ipv6_addr_any(&fl6->saddr)) {
1085 struct fib6_info *from;
1086 struct rt6_info *rt;
1088 *dst = ip6_route_output(net, sk, fl6);
1089 rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
1092 from = rt ? rcu_dereference(rt->from) : NULL;
1093 err = ip6_route_get_saddr(net, from, &fl6->daddr,
1094 sk ? inet6_sk(sk)->srcprefs : 0,
1099 goto out_err_release;
1101 /* If we had an erroneous initial result, pretend it
1102 * never existed and let the SA-enabled version take
1105 if ((*dst)->error) {
1110 if (fl6->flowi6_oif)
1111 flags |= RT6_LOOKUP_F_IFACE;
/* Second lookup, now with a concrete source address in the flow. */
1115 *dst = ip6_route_output_flags(net, sk, fl6, flags);
1117 err = (*dst)->error;
1119 goto out_err_release;
1121 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1123 * Here if the dst entry we've looked up
1124 * has a neighbour entry that is in the INCOMPLETE
1125 * state and the src address from the flow is
1126 * marked as OPTIMISTIC, we release the found
1127 * dst entry and replace it instead with the
1128 * dst entry of the nexthop router
1130 rt = (struct rt6_info *) *dst;
1132 n = __ipv6_neigh_lookup_noref(rt->dst.dev,
1133 rt6_nexthop(rt, &fl6->daddr));
1134 err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
1135 rcu_read_unlock_bh();
1138 struct inet6_ifaddr *ifp;
1139 struct flowi6 fl_gw6;
1142 ifp = ipv6_get_ifaddr(net, &fl6->saddr,
1145 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
1151 * We need to get the dst entry for the
1152 * default router instead
1155 memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1156 memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1157 *dst = ip6_route_output(net, sk, &fl_gw6);
1158 err = (*dst)->error;
1160 goto out_err_release;
/* Disallow mixing a v4-mapped source with a real IPv6 destination. */
1164 if (ipv6_addr_v4mapped(&fl6->saddr) &&
1165 !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
1166 err = -EAFNOSUPPORT;
1167 goto out_err_release;
1176 if (err == -ENETUNREACH)
1177 IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1182 * ip6_dst_lookup - perform route lookup on flow
1183 * @net: Network namespace to perform lookup in
1184 * @sk: socket which provides route info
1185 * @dst: pointer to dst_entry * for result
1186 * @fl6: flow to lookup
1188 * This function performs a route lookup on the given flow.
1190 * It returns zero on success, or a standard errno code on error.
/* Thin public wrapper around ip6_dst_lookup_tail(). */
1192 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1196 return ip6_dst_lookup_tail(net, sk, dst, fl6);
1198 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1201 * ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1202 * @net: Network namespace to perform lookup in
1203 * @sk: socket which provides route info
1204 * @fl6: flow to lookup
1205 * @final_dst: final destination address for ipsec lookup
1207 * This function performs a route lookup on the given flow.
1209 * It returns a valid dst pointer on success, or a pointer encoded
/* Looks up the route, overrides daddr with @final_dst (if given), then
 * runs the result through the xfrm (IPsec) lookup. */
1212 struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
1213 const struct in6_addr *final_dst)
1215 struct dst_entry *dst = NULL;
1218 err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
1220 return ERR_PTR(err);
1222 fl6->daddr = *final_dst;
1224 return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
1226 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1229 * ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1230 * @sk: socket which provides the dst cache and route info
1231 * @fl6: flow to lookup
1232 * @final_dst: final destination address for ipsec lookup
1233 * @connected: whether @sk is connected or not
1235 * This function performs a route lookup on the given flow with the
1236 * possibility of using the cached route in the socket if it is valid.
1237 * It will take the socket dst lock when operating on the dst cache.
1238 * As a result, this function can only be used in process context.
1240 * In addition, for a connected socket, cache the dst in the socket
1241 * if the current cache is not valid.
1243 * It returns a valid dst pointer on success, or a pointer encoded
1246 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1247 const struct in6_addr *final_dst,
/* Try the socket's cached dst first; fall back to a fresh lookup. */
1250 struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1252 dst = ip6_sk_dst_check(sk, dst, fl6);
1256 dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
/* Connected sockets cache the new dst for subsequent sends. */
1257 if (connected && !IS_ERR(dst))
1258 ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);
1262 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1265 * ip6_dst_lookup_tunnel - perform route lookup on tunnel
1266 * @skb: Packet for which lookup is done
1267 * @dev: Tunnel device
1268 * @net: Network namespace of tunnel device
1269 * @sock: Socket which provides route info
1270 * @saddr: Memory to store the src ip address
1271 * @info: Tunnel information
1272 * @protocol: IP protocol
1273 * @use_cache: Flag to enable cache usage
1274 * This function performs a route lookup on a tunnel
1276 * It returns a valid dst pointer and stores src address to be used in
1277 * tunnel in param saddr on success, else a pointer encoded error code.
1280 struct dst_entry *ip6_dst_lookup_tunnel(struct sk_buff *skb,
1281 struct net_device *dev,
1283 struct socket *sock,
1284 struct in6_addr *saddr,
1285 const struct ip_tunnel_info *info,
1289 struct dst_entry *dst = NULL;
1290 #ifdef CONFIG_DST_CACHE
1291 struct dst_cache *dst_cache;
/* Fast path: reuse the tunnel's per-cpu dst cache when enabled. */
1296 #ifdef CONFIG_DST_CACHE
1297 dst_cache = (struct dst_cache *)&info->dst_cache;
1299 dst = dst_cache_get_ip6(dst_cache, saddr);
/* Slow path: build the flow from the tunnel key and look it up. */
1304 memset(&fl6, 0, sizeof(fl6));
1305 fl6.flowi6_mark = skb->mark;
1306 fl6.flowi6_proto = protocol;
1307 fl6.daddr = info->key.u.ipv6.dst;
1308 fl6.saddr = info->key.u.ipv6.src;
1309 prio = info->key.tos;
1310 fl6.flowlabel = ip6_make_flowinfo(prio, info->key.label);
1312 dst = ipv6_stub->ipv6_dst_lookup_flow(net, sock->sk, &fl6,
1315 netdev_dbg(dev, "no route to %pI6\n", &fl6.daddr);
1316 return ERR_PTR(-ENETUNREACH);
1318 if (dst->dev == dev) { /* is this necessary? */
1319 netdev_dbg(dev, "circular route to %pI6\n", &fl6.daddr);
1321 return ERR_PTR(-ELOOP);
1323 #ifdef CONFIG_DST_CACHE
1325 dst_cache_set_ip6(dst_cache, dst, &fl6.saddr);
1330 EXPORT_SYMBOL_GPL(ip6_dst_lookup_tunnel);
/*
 * ip6_opt_dup - duplicate an IPv6 options extension header; its on-wire
 * size is (hdrlen + 1) * 8 bytes. Returns NULL if @src is NULL or the
 * allocation fails.
 */
1332 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1335 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
/*
 * ip6_rthdr_dup - duplicate a routing extension header; same size rule as
 * ip6_opt_dup ((hdrlen + 1) * 8 bytes). NULL-safe.
 */
1338 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1341 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
/*
 * ip6_append_data_mtu - recompute *mtu and *maxfraglen while appending
 * data over an xfrm tunnel dst: the first fragment reserves header_len,
 * later fragments treat that space as data.
 * NOTE(review): listing elides some lines; code kept byte-identical.
 */
1344 static void ip6_append_data_mtu(unsigned int *mtu,
1346 unsigned int fragheaderlen,
1347 struct sk_buff *skb,
1348 struct rt6_info *rt,
1349 unsigned int orig_mtu)
1351 if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1353 /* first fragment, reserve header_len */
1354 *mtu = orig_mtu - rt->dst.header_len;
1358 * this fragment is not first, the headers
1359 * space is regarded as data space.
/* Max fragmentable length: 8-byte aligned payload + headers minus the
 * fragment header itself. */
1363 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1364 + fragheaderlen - sizeof(struct frag_hdr);
/*
 * ip6_setup_cork - initialize cork state for ip6_append_data: deep-copies
 * the tx options (each extension header duplicated separately), records
 * the dst/flow/hop-limit/tclass, and computes the cork MTU from pmtudisc
 * mode and the (possibly xfrm-wrapped) path dst.
 * NOTE(review): listing elides some lines; code kept byte-identical.
 */
1368 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1369 struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1370 struct rt6_info *rt, struct flowi6 *fl6)
1372 struct ipv6_pinfo *np = inet6_sk(sk);
1374 struct ipv6_txoptions *opt = ipc6->opt;
/* Options must only be set up once per cork cycle. */
1380 if (WARN_ON(v6_cork->opt))
1383 v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
1384 if (unlikely(!v6_cork->opt))
1387 v6_cork->opt->tot_len = sizeof(*opt);
1388 v6_cork->opt->opt_flen = opt->opt_flen;
1389 v6_cork->opt->opt_nflen = opt->opt_nflen;
/* Duplicate every extension header; each dup checked individually. */
1391 v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1393 if (opt->dst0opt && !v6_cork->opt->dst0opt)
1396 v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1398 if (opt->dst1opt && !v6_cork->opt->dst1opt)
1401 v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
1403 if (opt->hopopt && !v6_cork->opt->hopopt)
1406 v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1408 if (opt->srcrt && !v6_cork->opt->srcrt)
1411 /* need source address above miyazawa*/
1414 cork->base.dst = &rt->dst;
1415 cork->fl.u.ip6 = *fl6;
1416 v6_cork->hop_limit = ipc6->hlimit;
1417 v6_cork->tclass = ipc6->tclass;
/* MTU: device MTU when probing PMTU, else the dst (or xfrm path) MTU. */
1418 if (rt->dst.flags & DST_XFRM_TUNNEL)
1419 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1420 READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
1422 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1423 READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
1424 if (np->frag_size < mtu) {
1426 mtu = np->frag_size;
1428 cork->base.fragsize = mtu;
1429 cork->base.gso_size = ipc6->gso_size;
1430 cork->base.tx_flags = 0;
1431 cork->base.mark = ipc6->sockc.mark;
1432 sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);
1434 if (dst_allfrag(xfrm_dst_path(&rt->dst)))
1435 cork->base.flags |= IPCORK_ALLFRAG;
1436 cork->base.length = 0;
1438 cork->base.transmit_time = ipc6->sockc.transmit_time;
/*
 * Core of ip6_append_data()/ip6_make_skb(): append 'length' bytes,
 * pulled in via getfrag(), onto 'queue', growing the tail skb and
 * allocating new fragment-sized skbs as needed.  Enforces the corked
 * MTU (or IP6_MAX_MTU when GSO is in use), the RFC 7112 rule that the
 * whole header chain fit in the first fragment, and MSG_ZEROCOPY
 * handling.  Queued truesize is accumulated in wmem_alloc_delta and
 * charged to sk_wmem_alloc once at the end.
 */
1443 static int __ip6_append_data(struct sock *sk,
1445 struct sk_buff_head *queue,
1446 struct inet_cork *cork,
1447 struct inet6_cork *v6_cork,
1448 struct page_frag *pfrag,
1449 int getfrag(void *from, char *to, int offset,
1450 int len, int odd, struct sk_buff *skb),
1451 void *from, int length, int transhdrlen,
1452 unsigned int flags, struct ipcm6_cookie *ipc6)
1454 struct sk_buff *skb, *skb_prev = NULL;
1455 unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1456 struct ubuf_info *uarg = NULL;
1458 int dst_exthdrlen = 0;
1464 struct rt6_info *rt = (struct rt6_info *)cork->dst;
1465 struct ipv6_txoptions *opt = v6_cork->opt;
1466 int csummode = CHECKSUM_NONE;
1467 unsigned int maxnonfragsize, headersize;
1468 unsigned int wmem_alloc_delta = 0;
1469 bool paged, extra_uref = false;
/* Continue filling the most recently queued skb, if any. */
1471 skb = skb_peek_tail(queue);
1473 exthdrlen = opt ? opt->opt_flen : 0;
1474 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
/* With GSO, segmentation happens later: build up to IP6_MAX_MTU and
 * spill payload onto pages ("paged") instead of fragmenting here.
 */
1477 paged = !!cork->gso_size;
1478 mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
/* OPT_ID timestamping: reserve a key from the socket's counter. */
1481 if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
1482 sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1483 tskey = atomic_inc_return(&sk->sk_tskey) - 1;
1485 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
/* fragheaderlen: IPv6 header + non-fragmentable extension headers.
 * headersize additionally counts fragmentable options and, on
 * allfrag paths, the frag_hdr itself.
 */
1487 fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1488 (opt ? opt->opt_nflen : 0);
1490 headersize = sizeof(struct ipv6hdr) +
1491 (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1492 (dst_allfrag(&rt->dst) ?
1493 sizeof(struct frag_hdr) : 0) +
1494 rt->rt6i_nfheader_len;
/* Reject MTUs so small that no 8-byte-aligned payload would fit. */
1496 if (mtu <= fragheaderlen ||
1497 ((mtu - fragheaderlen) & ~7) + fragheaderlen <= sizeof(struct frag_hdr))
1500 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1501 sizeof(struct frag_hdr);
1503 /* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
1504 * the first fragment
1506 if (headersize + transhdrlen > mtu)
/* IPV6_DONTFRAG on UDP/RAW: report the would-be PMTU to the app
 * instead of fragmenting.
 */
1509 if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1510 (sk->sk_protocol == IPPROTO_UDP ||
1511 sk->sk_protocol == IPPROTO_RAW)) {
1512 ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1513 sizeof(struct ipv6hdr));
/* Sockets ignoring DF may build up to the 64K IPv6 payload limit. */
1517 if (ip6_sk_ignore_df(sk))
1518 maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1520 maxnonfragsize = mtu;
1522 if (cork->length + length > maxnonfragsize - headersize) {
1524 pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1525 ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
1529 /* CHECKSUM_PARTIAL only with no extension headers and when
1530 * we are not going to fragment
1532 if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1533 headersize == sizeof(struct ipv6hdr) &&
1534 length <= mtu - headersize &&
1535 (!(flags & MSG_MORE) || cork->gso_size) &&
1536 rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1537 csummode = CHECKSUM_PARTIAL;
/* MSG_ZEROCOPY: pin user pages; only truly zero-copy when the device
 * does SG and hardware checksumming, otherwise uarg degrades to a
 * copy notification.
 */
1539 if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
1540 uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb));
1543 extra_uref = !skb_zcopy(skb); /* only ref on new uarg */
1544 if (rt->dst.dev->features & NETIF_F_SG &&
1545 csummode == CHECKSUM_PARTIAL) {
1549 skb_zcopy_set(skb, uarg, &extra_uref);
1554 * Let's try using as much space as possible.
1555 * Use MTU if total length of the message fits into the MTU.
1556 * Otherwise, we need to reserve fragment header and
1557 * fragment alignment (= 8-15 octects, in total).
1559 * Note that we may need to "move" the data from the tail
1560 * of the buffer to the new fragment when we split
1563 * FIXME: It may be fragmented into multiple chunks
1564 * at once if non-fragmentable extension headers
1569 cork->length += length;
/* Main copy loop: fill the tail skb, then allocate new skbs until
 * all of 'length' has been consumed.
 */
1573 while (length > 0) {
1574 /* Check if the remaining data fits into current packet. */
1575 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1577 copy = maxfraglen - skb->len;
1581 unsigned int datalen;
1582 unsigned int fraglen;
1583 unsigned int fraggap;
1584 unsigned int alloclen, alloc_extra;
1585 unsigned int pagedlen;
1587 /* There's no room in the current skb */
/* fraggap: bytes past the fragment boundary in the previous skb
 * that must migrate into the new fragment (8-byte alignment).
 */
1589 fraggap = skb->len - maxfraglen;
1592 /* update mtu and maxfraglen if necessary */
1593 if (!skb || !skb_prev)
1594 ip6_append_data_mtu(&mtu, &maxfraglen,
1595 fragheaderlen, skb, rt,
1601 * If remaining data exceeds the mtu,
1602 * we know we need more fragment(s).
1604 datalen = length + fraggap;
1606 if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1607 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1608 fraglen = datalen + fragheaderlen;
/* alloc_extra: headroom/tailroom beyond payload -- link-layer
 * header, IPsec/tunnel header space, dst trailer, and a frag_hdr
 * reservation (possibly an overallocation, see below).
 */
1611 alloc_extra = hh_len;
1612 alloc_extra += dst_exthdrlen;
1613 alloc_extra += rt->dst.trailer_len;
1615 /* We just reserve space for fragment header.
1616 * Note: this may be overallocation if the message
1617 * (without MSG_MORE) fits into the MTU.
1619 alloc_extra += sizeof(struct frag_hdr);
1621 if ((flags & MSG_MORE) &&
1622 !(rt->dst.dev->features&NETIF_F_SG))
1625 (fraglen + alloc_extra < SKB_MAX_ALLOC ||
1626 !(rt->dst.dev->features & NETIF_F_SG)))
/* Paged (GSO) path: linear part only up to MAX_HEADER, the rest
 * of the fragment goes onto page frags.
 */
1629 alloclen = min_t(int, fraglen, MAX_HEADER);
1630 pagedlen = fraglen - alloclen;
1632 alloclen += alloc_extra;
1634 if (datalen != length + fraggap) {
1636 * this is not the last fragment, the trailer
1637 * space is regarded as data space.
1639 datalen += rt->dst.trailer_len;
1642 fraglen = datalen + fragheaderlen;
1644 copy = datalen - transhdrlen - fraggap - pagedlen;
/* Allocate the new skb; when the send buffer is exhausted,
 * sock_alloc_send_skb() blocks (unless MSG_DONTWAIT).
 */
1650 skb = sock_alloc_send_skb(sk, alloclen,
1651 (flags & MSG_DONTWAIT), &err);
1654 if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
1656 skb = alloc_skb(alloclen,
1664 * Fill in the control structures
1666 skb->protocol = htons(ETH_P_IPV6);
1667 skb->ip_summed = csummode;
1669 /* reserve for fragmentation and ipsec header */
1670 skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1674 * Find where to start putting bytes
1676 data = skb_put(skb, fraglen - pagedlen);
1677 skb_set_network_header(skb, exthdrlen);
1678 data += fragheaderlen;
1679 skb->transport_header = (skb->network_header +
/* Move the fraggap bytes from the previous skb's tail into this
 * fragment, keeping checksums consistent, then trim the old skb
 * back to the fragment boundary.
 */
1682 skb->csum = skb_copy_and_csum_bits(
1683 skb_prev, maxfraglen,
1684 data + transhdrlen, fraggap);
1685 skb_prev->csum = csum_sub(skb_prev->csum,
1688 pskb_trim_unique(skb_prev, maxfraglen);
1691 getfrag(from, data + transhdrlen, offset,
1692 copy, fraggap, skb) < 0) {
1699 length -= copy + transhdrlen;
1704 /* Only the initial fragment is time stamped */
1705 skb_shinfo(skb)->tx_flags = cork->tx_flags;
1707 skb_shinfo(skb)->tskey = tskey;
1709 skb_zcopy_set(skb, uarg, &extra_uref);
1711 if ((flags & MSG_CONFIRM) && !skb_prev)
1712 skb_set_dst_pending_confirm(skb, 1);
1715 * Put the packet on the pending queue
1717 if (!skb->destructor) {
1718 skb->destructor = sock_wfree;
/* sk_wmem_alloc charge is deferred to one refcount_add below. */
1720 wmem_alloc_delta += skb->truesize;
1722 __skb_queue_tail(queue, skb);
/* Room left in the current skb: copy into its linear tail when the
 * device cannot do SG, otherwise append to page frags (coalescing
 * with the last frag when possible), or use the zerocopy iterator.
 */
1729 if (!(rt->dst.dev->features&NETIF_F_SG) &&
1730 skb_tailroom(skb) >= copy) {
1734 if (getfrag(from, skb_put(skb, copy),
1735 offset, copy, off, skb) < 0) {
1736 __skb_trim(skb, off);
1740 } else if (!uarg || !uarg->zerocopy) {
1741 int i = skb_shinfo(skb)->nr_frags;
1744 if (!sk_page_frag_refill(sk, pfrag))
1747 if (!skb_can_coalesce(skb, i, pfrag->page,
1750 if (i == MAX_SKB_FRAGS)
1753 __skb_fill_page_desc(skb, i, pfrag->page,
1755 skb_shinfo(skb)->nr_frags = ++i;
1756 get_page(pfrag->page);
1758 copy = min_t(int, copy, pfrag->size - pfrag->offset);
1760 page_address(pfrag->page) + pfrag->offset,
1761 offset, copy, skb->len, skb) < 0)
1764 pfrag->offset += copy;
1765 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1767 skb->data_len += copy;
1768 skb->truesize += copy;
1769 wmem_alloc_delta += copy;
1771 err = skb_zerocopy_iter_dgram(skb, from, copy);
/* Success: charge all queued truesize to the socket at once. */
1779 if (wmem_alloc_delta)
1780 refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
/* Error path: drop the zerocopy ref taken above, roll back the corked
 * length, count the discard, and still account what was queued.
 */
1786 net_zcopy_put_abort(uarg, extra_uref);
1787 cork->length -= length;
1788 IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1789 refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
/*
 * Public corked-send entry point: queue 'length' bytes on the socket's
 * write queue.  The first call (empty write queue) sets up the cork
 * from ipc6/rt/fl6 via ip6_setup_cork() and charges the fragmentable
 * extension-header length to both length and transhdrlen; later calls
 * reuse the flow cached in the cork.  MSG_PROBE performs no queuing.
 */
1793 int ip6_append_data(struct sock *sk,
1794 int getfrag(void *from, char *to, int offset, int len,
1795 int odd, struct sk_buff *skb),
1796 void *from, int length, int transhdrlen,
1797 struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1798 struct rt6_info *rt, unsigned int flags)
1800 struct inet_sock *inet = inet_sk(sk);
1801 struct ipv6_pinfo *np = inet6_sk(sk);
1805 if (flags&MSG_PROBE)
1807 if (skb_queue_empty(&sk->sk_write_queue)) {
1811 err = ip6_setup_cork(sk, &inet->cork, &np->cork,
/* First call: options go in front, so they count as both payload
 * and transport-header length for this append.
 */
1816 exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1817 length += exthdrlen;
1818 transhdrlen += exthdrlen;
/* Subsequent calls must use the flow stored at cork time. */
1820 fl6 = &inet->cork.fl.u.ip6;
1824 return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1825 &np->cork, sk_page_frag(sk), getfrag,
1826 from, length, transhdrlen, flags, ipc6);
1828 EXPORT_SYMBOL_GPL(ip6_append_data);
/* Release everything held by the cork: the deep-copied tx options
 * (each sub-header freed individually -- kfree(NULL) is a no-op),
 * the cached route reference, the ALLFRAG flag, and the flow key.
 */
1830 static void ip6_cork_release(struct inet_cork_full *cork,
1831 struct inet6_cork *v6_cork)
1834 kfree(v6_cork->opt->dst0opt);
1835 kfree(v6_cork->opt->dst1opt);
1836 kfree(v6_cork->opt->hopopt);
1837 kfree(v6_cork->opt->srcrt);
1838 kfree(v6_cork->opt);
1839 v6_cork->opt = NULL;
1842 if (cork->base.dst) {
1843 dst_release(cork->base.dst);
1844 cork->base.dst = NULL;
1845 cork->base.flags &= ~IPCORK_ALLFRAG;
1847 memset(&cork->fl, 0, sizeof(cork->fl));
/*
 * Coalesce the corked queue into a single skb ready for transmission:
 * chain all queued skbs onto the head skb's frag_list, push the
 * extension headers and the IPv6 header, fill in addresses/flow label/
 * hop limit, attach the route, bump the output statistics (with the
 * ICMPv6 message-type counters special-cased), and release the cork.
 */
1850 struct sk_buff *__ip6_make_skb(struct sock *sk,
1851 struct sk_buff_head *queue,
1852 struct inet_cork_full *cork,
1853 struct inet6_cork *v6_cork)
1855 struct sk_buff *skb, *tmp_skb;
1856 struct sk_buff **tail_skb;
1857 struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1858 struct ipv6_pinfo *np = inet6_sk(sk);
1859 struct net *net = sock_net(sk);
1860 struct ipv6hdr *hdr;
1861 struct ipv6_txoptions *opt = v6_cork->opt;
1862 struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1863 struct flowi6 *fl6 = &cork->fl.u.ip6;
1864 unsigned char proto = fl6->flowi6_proto;
1866 skb = __skb_dequeue(queue);
1869 tail_skb = &(skb_shinfo(skb)->frag_list);
1871 /* move skb->data to ip header from ext header */
1872 if (skb->data < skb_network_header(skb))
1873 __skb_pull(skb, skb_network_offset(skb));
/* Glue the remaining queued skbs onto the head skb's frag_list,
 * folding their length/truesize into the head and clearing their
 * destructors (accounting stays with the head skb).
 */
1874 while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1875 __skb_pull(tmp_skb, skb_network_header_len(skb));
1876 *tail_skb = tmp_skb;
1877 tail_skb = &(tmp_skb->next);
1878 skb->len += tmp_skb->len;
1879 skb->data_len += tmp_skb->len;
1880 skb->truesize += tmp_skb->truesize;
1881 tmp_skb->destructor = NULL;
1885 /* Allow local fragmentation. */
1886 skb->ignore_df = ip6_sk_ignore_df(sk);
/* final_dst may be rewritten by a routing header pushed below. */
1888 *final_dst = fl6->daddr;
1889 __skb_pull(skb, skb_network_header_len(skb));
1890 if (opt && opt->opt_flen)
1891 ipv6_push_frag_opts(skb, opt, &proto);
1892 if (opt && opt->opt_nflen)
1893 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1895 skb_push(skb, sizeof(struct ipv6hdr));
1896 skb_reset_network_header(skb);
1897 hdr = ipv6_hdr(skb);
1899 ip6_flow_hdr(hdr, v6_cork->tclass,
1900 ip6_make_flowlabel(net, skb, fl6->flowlabel,
1901 ip6_autoflowlabel(net, np), fl6));
1902 hdr->hop_limit = v6_cork->hop_limit;
1903 hdr->nexthdr = proto;
1904 hdr->saddr = fl6->saddr;
1905 hdr->daddr = *final_dst;
1907 skb->priority = sk->sk_priority;
1908 skb->mark = cork->base.mark;
1910 skb->tstamp = cork->base.transmit_time;
1912 skb_dst_set(skb, dst_clone(&rt->dst));
1913 IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
/* ICMPv6 also counts per-message-type statistics; for raw sockets
 * without IPV6_HDRINCL the type comes from the flow, otherwise from
 * the packet itself.
 */
1914 if (proto == IPPROTO_ICMPV6) {
1915 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1918 if (sk->sk_socket->type == SOCK_RAW && !inet_sk(sk)->hdrincl)
1919 icmp6_type = fl6->fl6_icmp_type;
1921 icmp6_type = icmp6_hdr(skb)->icmp6_type;
1922 ICMP6MSGOUT_INC_STATS(net, idev, icmp6_type);
1923 ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1926 ip6_cork_release(cork, v6_cork);
/* Transmit a fully built skb via ip6_local_out(), converting NET_XMIT
 * congestion codes with net_xmit_errno() and counting OUTDISCARDS on
 * failure.
 */
1931 int ip6_send_skb(struct sk_buff *skb)
1933 struct net *net = sock_net(skb->sk);
1934 struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1937 err = ip6_local_out(net, skb->sk, skb);
1940 err = net_xmit_errno(err);
1942 IP6_INC_STATS(net, rt->rt6i_idev,
1943 IPSTATS_MIB_OUTDISCARDS);
/* Finalise the socket's corked queue into one skb (ip6_finish_skb())
 * and send it.
 */
1949 int ip6_push_pending_frames(struct sock *sk)
1951 struct sk_buff *skb;
1953 skb = ip6_finish_skb(sk);
1957 return ip6_send_skb(skb);
1959 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
/* Abort a corked send: drop every queued skb (counting each as an
 * OUTDISCARD) and release the cork state.
 */
1961 static void __ip6_flush_pending_frames(struct sock *sk,
1962 struct sk_buff_head *queue,
1963 struct inet_cork_full *cork,
1964 struct inet6_cork *v6_cork)
1966 struct sk_buff *skb;
1968 while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1970 IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1971 IPSTATS_MIB_OUTDISCARDS);
1975 ip6_cork_release(cork, v6_cork);
/* Public wrapper: flush the socket's own write queue and cork. */
1978 void ip6_flush_pending_frames(struct sock *sk)
1980 __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1981 &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1983 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1985 struct sk_buff *ip6_make_skb(struct sock *sk,
1986 int getfrag(void *from, char *to, int offset,
1987 int len, int odd, struct sk_buff *skb),
1988 void *from, int length, int transhdrlen,
1989 struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1990 struct rt6_info *rt, unsigned int flags,
1991 struct inet_cork_full *cork)
1993 struct inet6_cork v6_cork;
1994 struct sk_buff_head queue;
1995 int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1998 if (flags & MSG_PROBE)
2001 __skb_queue_head_init(&queue);
2003 cork->base.flags = 0;
2004 cork->base.addr = 0;
2005 cork->base.opt = NULL;
2006 cork->base.dst = NULL;
2008 err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6);
2010 ip6_cork_release(cork, &v6_cork);
2011 return ERR_PTR(err);
2013 if (ipc6->dontfrag < 0)
2014 ipc6->dontfrag = inet6_sk(sk)->dontfrag;
2016 err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork,
2017 ¤t->task_frag, getfrag, from,
2018 length + exthdrlen, transhdrlen + exthdrlen,
2021 __ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
2022 return ERR_PTR(err);
2025 return __ip6_make_skb(sk, &queue, cork, &v6_cork);