GNU Linux-libre 4.14.259-gnu1
[releases.git] / drivers / net / virtio_net.c
1 /* A network driver using virtio.
2  *
3  * Copyright 2007 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation
4  *
5  * This program is free software; you can redistribute it and/or modify
6  * it under the terms of the GNU General Public License as published by
7  * the Free Software Foundation; either version 2 of the License, or
8  * (at your option) any later version.
9  *
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  * GNU General Public License for more details.
14  *
15  * You should have received a copy of the GNU General Public License
16  * along with this program; if not, see <http://www.gnu.org/licenses/>.
17  */
18 //#define DEBUG
19 #include <linux/netdevice.h>
20 #include <linux/etherdevice.h>
21 #include <linux/ethtool.h>
22 #include <linux/module.h>
23 #include <linux/virtio.h>
24 #include <linux/virtio_net.h>
25 #include <linux/bpf.h>
26 #include <linux/bpf_trace.h>
27 #include <linux/scatterlist.h>
28 #include <linux/if_vlan.h>
29 #include <linux/slab.h>
30 #include <linux/cpu.h>
31 #include <linux/average.h>
32 #include <net/route.h>
33
34 static int napi_weight = NAPI_POLL_WEIGHT;
35 module_param(napi_weight, int, 0444);
36
37 static bool csum = true, gso = true, napi_tx;
38 module_param(csum, bool, 0444);
39 module_param(gso, bool, 0444);
40 module_param(napi_tx, bool, 0644);
41
42 /* FIXME: MTU in config. */
43 #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
44 #define GOOD_COPY_LEN   128
45
46 #define VIRTNET_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)
47
48 /* Amount of XDP headroom to prepend to packets for use by xdp_adjust_head */
49 #define VIRTIO_XDP_HEADROOM 256
50
51 /* RX packet size EWMA. The average packet size is used to determine the packet
52  * buffer size when refilling RX rings. As the entire RX ring may be refilled
53  * at once, the weight is chosen so that the EWMA will be insensitive to short-
54  * term, transient changes in packet size.
55  */
56 DECLARE_EWMA(pkt_len, 0, 64)
57
58 #define VIRTNET_DRIVER_VERSION "1.0.0"
59
60 static const unsigned long guest_offloads[] = {
61         VIRTIO_NET_F_GUEST_TSO4,
62         VIRTIO_NET_F_GUEST_TSO6,
63         VIRTIO_NET_F_GUEST_ECN,
64         VIRTIO_NET_F_GUEST_UFO,
65         VIRTIO_NET_F_GUEST_CSUM
66 };
67
68 struct virtnet_stats {
69         struct u64_stats_sync tx_syncp;
70         struct u64_stats_sync rx_syncp;
71         u64 tx_bytes;
72         u64 tx_packets;
73
74         u64 rx_bytes;
75         u64 rx_packets;
76 };
77
78 /* Internal representation of a send virtqueue */
79 struct send_queue {
80         /* Virtqueue associated with this send _queue */
81         struct virtqueue *vq;
82
83         /* TX: fragments + linear part + virtio header */
84         struct scatterlist sg[MAX_SKB_FRAGS + 2];
85
86         /* Name of the send queue: output.$index */
87         char name[40];
88
89         struct napi_struct napi;
90 };
91
92 /* Internal representation of a receive virtqueue */
93 struct receive_queue {
94         /* Virtqueue associated with this receive_queue */
95         struct virtqueue *vq;
96
97         struct napi_struct napi;
98
99         struct bpf_prog __rcu *xdp_prog;
100
101         /* Chain pages by the private ptr. */
102         struct page *pages;
103
104         /* Average packet length for mergeable receive buffers. */
105         struct ewma_pkt_len mrg_avg_pkt_len;
106
107         /* Page frag for packet buffer allocation. */
108         struct page_frag alloc_frag;
109
110         /* RX: fragments + linear part + virtio header */
111         struct scatterlist sg[MAX_SKB_FRAGS + 2];
112
113         /* Min single buffer size for mergeable buffers case. */
114         unsigned int min_buf_len;
115
116         /* Name of this receive queue: input.$index */
117         char name[40];
118 };
119
120 /* Control VQ buffers: protected by the rtnl lock */
121 struct control_buf {
122         struct virtio_net_ctrl_hdr hdr;
123         virtio_net_ctrl_ack status;
124         struct virtio_net_ctrl_mq mq;
125         u8 promisc;
126         u8 allmulti;
127         __virtio16 vid;
128         u64 offloads;
129 };
130
131 struct virtnet_info {
132         struct virtio_device *vdev;
133         struct virtqueue *cvq;
134         struct net_device *dev;
135         struct send_queue *sq;
136         struct receive_queue *rq;
137         unsigned int status;
138
139         /* Max # of queue pairs supported by the device */
140         u16 max_queue_pairs;
141
142         /* # of queue pairs currently used by the driver */
143         u16 curr_queue_pairs;
144
145         /* # of XDP queue pairs currently used by the driver */
146         u16 xdp_queue_pairs;
147
148         /* I like... big packets and I cannot lie! */
149         bool big_packets;
150
151         /* Host will merge rx buffers for big packets (shake it! shake it!) */
152         bool mergeable_rx_bufs;
153
154         /* Has control virtqueue */
155         bool has_cvq;
156
157         /* Host can handle any s/g split between our header and packet data */
158         bool any_header_sg;
159
160         /* Packet virtio header size */
161         u8 hdr_len;
162
163         /* Active statistics */
164         struct virtnet_stats __percpu *stats;
165
166         /* Work struct for refilling if we run low on memory. */
167         struct delayed_work refill;
168
169         /* Work struct for config space updates */
170         struct work_struct config_work;
171
172         /* Does the affinity hint is set for virtqueues? */
173         bool affinity_hint_set;
174
175         /* CPU hotplug instances for online & dead */
176         struct hlist_node node;
177         struct hlist_node node_dead;
178
179         struct control_buf *ctrl;
180
181         /* Ethtool settings */
182         u8 duplex;
183         u32 speed;
184
185         unsigned long guest_offloads;
186 };
187
188 struct padded_vnet_hdr {
189         struct virtio_net_hdr_mrg_rxbuf hdr;
190         /*
191          * hdr is in a separate sg buffer, and data sg buffer shares same page
192          * with this header sg. This padding makes next sg 16 byte aligned
193          * after the header.
194          */
195         char padding[4];
196 };
197
198 /* Converting between virtqueue no. and kernel tx/rx queue no.
199  * 0:rx0 1:tx0 2:rx1 3:tx1 ... 2N:rxN 2N+1:txN 2N+2:cvq
200  */
201 static int vq2txq(struct virtqueue *vq)
202 {
203         return (vq->index - 1) / 2;
204 }
205
/* Map a kernel tx queue number to its virtqueue number (odd positions). */
static int txq2vq(int txq)
{
        return 2 * txq + 1;
}
210
211 static int vq2rxq(struct virtqueue *vq)
212 {
213         return vq->index / 2;
214 }
215
/* Map a kernel rx queue number to its virtqueue number (even positions). */
static int rxq2vq(int rxq)
{
        return 2 * rxq;
}
220
221 static inline struct virtio_net_hdr_mrg_rxbuf *skb_vnet_hdr(struct sk_buff *skb)
222 {
223         return (struct virtio_net_hdr_mrg_rxbuf *)skb->cb;
224 }
225
226 /*
227  * private is used to chain pages for big packets, put the whole
228  * most recent used list in the beginning for reuse
229  */
230 static void give_pages(struct receive_queue *rq, struct page *page)
231 {
232         struct page *end;
233
234         /* Find end of list, sew whole thing into vi->rq.pages. */
235         for (end = page; end->private; end = (struct page *)end->private);
236         end->private = (unsigned long)rq->pages;
237         rq->pages = page;
238 }
239
240 static struct page *get_a_page(struct receive_queue *rq, gfp_t gfp_mask)
241 {
242         struct page *p = rq->pages;
243
244         if (p) {
245                 rq->pages = (struct page *)p->private;
246                 /* clear private here, it is used to chain pages */
247                 p->private = 0;
248         } else
249                 p = alloc_page(gfp_mask);
250         return p;
251 }
252
/* Schedule NAPI for a virtqueue and disable the virtqueue's callbacks,
 * provided napi_schedule_prep() allows scheduling.
 */
static void virtqueue_napi_schedule(struct napi_struct *napi,
                                    struct virtqueue *vq)
{
        if (!napi_schedule_prep(napi))
                return;

        virtqueue_disable_cb(vq);
        __napi_schedule(napi);
}
261
/* Finish a NAPI poll that handled 'processed' packets. Callbacks are
 * prepared for re-enabling first; if buffers showed up in the meantime
 * (virtqueue_poll()), NAPI is scheduled again.
 */
static void virtqueue_napi_complete(struct napi_struct *napi,
                                    struct virtqueue *vq, int processed)
{
        int opaque = virtqueue_enable_cb_prepare(vq);

        if (!napi_complete_done(napi, processed)) {
                /* NAPI did not complete: keep callbacks off */
                virtqueue_disable_cb(vq);
                return;
        }

        if (unlikely(virtqueue_poll(vq, opaque)))
                virtqueue_napi_schedule(napi, vq);
}
275
276 static void skb_xmit_done(struct virtqueue *vq)
277 {
278         struct virtnet_info *vi = vq->vdev->priv;
279         struct napi_struct *napi = &vi->sq[vq2txq(vq)].napi;
280
281         /* Suppress further interrupts. */
282         virtqueue_disable_cb(vq);
283
284         if (napi->weight)
285                 virtqueue_napi_schedule(napi, vq);
286         else
287                 /* We were probably waiting for more output buffers. */
288                 netif_wake_subqueue(vi->dev, vq2txq(vq));
289 }
290
/* A mergeable buffer's context pointer packs two numbers: the headroom in
 * the bits from MRG_CTX_HEADER_SHIFT upwards and the truesize below that.
 */
#define MRG_CTX_HEADER_SHIFT 22
static void *mergeable_len_to_ctx(unsigned int truesize,
                                  unsigned int headroom)
{
        unsigned int packed = (headroom << MRG_CTX_HEADER_SHIFT) | truesize;

        return (void *)(unsigned long)packed;
}
297
298 static unsigned int mergeable_ctx_to_headroom(void *mrg_ctx)
299 {
300         return (unsigned long)mrg_ctx >> MRG_CTX_HEADER_SHIFT;
301 }
302
303 static unsigned int mergeable_ctx_to_truesize(void *mrg_ctx)
304 {
305         return (unsigned long)mrg_ctx & ((1 << MRG_CTX_HEADER_SHIFT) - 1);
306 }
307
308 /* Called from bottom half context */
309 static struct sk_buff *page_to_skb(struct virtnet_info *vi,
310                                    struct receive_queue *rq,
311                                    struct page *page, unsigned int offset,
312                                    unsigned int len, unsigned int truesize,
313                                    bool hdr_valid)
314 {
315         struct sk_buff *skb;
316         struct virtio_net_hdr_mrg_rxbuf *hdr;
317         unsigned int copy, hdr_len, hdr_padded_len;
318         char *p;
319
320         p = page_address(page) + offset;
321
322         /* copy small packet so we can reuse these pages for small data */
323         skb = napi_alloc_skb(&rq->napi, GOOD_COPY_LEN);
324         if (unlikely(!skb))
325                 return NULL;
326
327         hdr = skb_vnet_hdr(skb);
328
329         hdr_len = vi->hdr_len;
330         if (vi->mergeable_rx_bufs)
331                 hdr_padded_len = sizeof(*hdr);
332         else
333                 hdr_padded_len = sizeof(struct padded_vnet_hdr);
334
335         if (hdr_valid)
336                 memcpy(hdr, p, hdr_len);
337
338         len -= hdr_len;
339         offset += hdr_padded_len;
340         p += hdr_padded_len;
341
342         /* Copy all frame if it fits skb->head, otherwise
343          * we let virtio_net_hdr_to_skb() and GRO pull headers as needed.
344          */
345         if (len <= skb_tailroom(skb))
346                 copy = len;
347         else
348                 copy = ETH_HLEN;
349         skb_put_data(skb, p, copy);
350
351         len -= copy;
352         offset += copy;
353
354         if (vi->mergeable_rx_bufs) {
355                 if (len)
356                         skb_add_rx_frag(skb, 0, page, offset, len, truesize);
357                 else
358                         put_page(page);
359                 return skb;
360         }
361
362         /*
363          * Verify that we can indeed put this data into a skb.
364          * This is here to handle cases when the device erroneously
365          * tries to receive more than is possible. This is usually
366          * the case of a broken device.
367          */
368         if (unlikely(len > MAX_SKB_FRAGS * PAGE_SIZE)) {
369                 net_dbg_ratelimited("%s: too much data\n", skb->dev->name);
370                 dev_kfree_skb(skb);
371                 return NULL;
372         }
373         BUG_ON(offset >= PAGE_SIZE);
374         while (len) {
375                 unsigned int frag_size = min((unsigned)PAGE_SIZE - offset, len);
376                 skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page, offset,
377                                 frag_size, truesize);
378                 len -= frag_size;
379                 page = (struct page *)page->private;
380                 offset = 0;
381         }
382
383         if (page)
384                 give_pages(rq, page);
385
386         return skb;
387 }
388
389 static bool virtnet_xdp_xmit(struct virtnet_info *vi,
390                              struct receive_queue *rq,
391                              struct xdp_buff *xdp)
392 {
393         struct virtio_net_hdr_mrg_rxbuf *hdr;
394         unsigned int len;
395         struct send_queue *sq;
396         unsigned int qp;
397         void *xdp_sent;
398         int err;
399
400         qp = vi->curr_queue_pairs - vi->xdp_queue_pairs + smp_processor_id();
401         sq = &vi->sq[qp];
402
403         /* Free up any pending old buffers before queueing new ones. */
404         while ((xdp_sent = virtqueue_get_buf(sq->vq, &len)) != NULL) {
405                 struct page *sent_page = virt_to_head_page(xdp_sent);
406
407                 put_page(sent_page);
408         }
409
410         xdp->data -= vi->hdr_len;
411         /* Zero header and leave csum up to XDP layers */
412         hdr = xdp->data;
413         memset(hdr, 0, vi->hdr_len);
414
415         sg_init_one(sq->sg, xdp->data, xdp->data_end - xdp->data);
416
417         err = virtqueue_add_outbuf(sq->vq, sq->sg, 1, xdp->data, GFP_ATOMIC);
418         if (unlikely(err)) {
419                 struct page *page = virt_to_head_page(xdp->data);
420
421                 put_page(page);
422                 return false;
423         }
424
425         virtqueue_kick(sq->vq);
426         return true;
427 }
428
429 static unsigned int virtnet_get_headroom(struct virtnet_info *vi)
430 {
431         return vi->xdp_queue_pairs ? VIRTIO_XDP_HEADROOM : 0;
432 }
433
434 /* We copy the packet for XDP in the following cases:
435  *
436  * 1) Packet is scattered across multiple rx buffers.
437  * 2) Headroom space is insufficient.
438  *
439  * This is inefficient but it's a temporary condition that
440  * we hit right after XDP is enabled and until queue is refilled
441  * with large buffers with sufficient headroom - so it should affect
442  * at most queue size packets.
443  * Afterwards, the conditions to enable
444  * XDP should preclude the underlying device from sending packets
445  * across multiple buffers (num_buf > 1), and we make sure buffers
446  * have enough headroom.
447  */
448 static struct page *xdp_linearize_page(struct receive_queue *rq,
449                                        u16 *num_buf,
450                                        struct page *p,
451                                        int offset,
452                                        int page_off,
453                                        unsigned int *len)
454 {
455         struct page *page = alloc_page(GFP_ATOMIC);
456
457         if (!page)
458                 return NULL;
459
460         memcpy(page_address(page) + page_off, page_address(p) + offset, *len);
461         page_off += *len;
462
463         while (--*num_buf) {
464                 unsigned int buflen;
465                 void *buf;
466                 int off;
467
468                 buf = virtqueue_get_buf(rq->vq, &buflen);
469                 if (unlikely(!buf))
470                         goto err_buf;
471
472                 p = virt_to_head_page(buf);
473                 off = buf - page_address(p);
474
475                 /* guard against a misconfigured or uncooperative backend that
476                  * is sending packet larger than the MTU.
477                  */
478                 if ((page_off + buflen) > PAGE_SIZE) {
479                         put_page(p);
480                         goto err_buf;
481                 }
482
483                 memcpy(page_address(page) + page_off,
484                        page_address(p) + off, buflen);
485                 page_off += buflen;
486                 put_page(p);
487         }
488
489         /* Headroom does not contribute to packet length */
490         *len = page_off - VIRTIO_XDP_HEADROOM;
491         return page;
492 err_buf:
493         __free_pages(page, 0);
494         return NULL;
495 }
496
/* Build an skb from one "small" receive buffer, running the queue's XDP
 * program first if one is attached.
 *
 * @buf: start of the buffer as posted by add_recvbuf_small(); the virtio
 *       header sits VIRTNET_RX_PAD + xdp_headroom bytes into it
 * @ctx: the XDP headroom the buffer was posted with, stored as an integer
 * @len: bytes written by the device, virtio header included
 *
 * Returns the skb, or NULL when the packet was dropped, build_skb() failed,
 * or the packet was handed to virtnet_xdp_xmit() (XDP_TX).
 */
497 static struct sk_buff *receive_small(struct net_device *dev,
498                                      struct virtnet_info *vi,
499                                      struct receive_queue *rq,
500                                      void *buf, void *ctx,
501                                      unsigned int len)
502 {
503         struct sk_buff *skb;
504         struct bpf_prog *xdp_prog;
505         unsigned int xdp_headroom = (unsigned long)ctx;
506         unsigned int header_offset = VIRTNET_RX_PAD + xdp_headroom;
507         unsigned int headroom = vi->hdr_len + header_offset;
508         unsigned int buflen = SKB_DATA_ALIGN(GOOD_PACKET_LEN + headroom) +
509                               SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
510         struct page *page = virt_to_head_page(buf);
        /* Bytes by which the XDP program moved the start of the packet */
511         unsigned int delta = 0;
512         struct page *xdp_page;
        /* From here on len counts packet bytes only */
513         len -= vi->hdr_len;
514
515         rcu_read_lock();
516         xdp_prog = rcu_dereference(rq->xdp_prog);
517         if (xdp_prog) {
518                 struct virtio_net_hdr_mrg_rxbuf *hdr = buf + header_offset;
519                 struct xdp_buff xdp;
520                 void *orig_data;
521                 u32 act;
522
                /* Packets carrying a GSO type are dropped on the XDP path */
523                 if (unlikely(hdr->hdr.gso_type))
524                         goto err_xdp;
525
                /* The buffer was posted with less headroom than is needed
                 * now: copy header and packet into a fresh page at the
                 * proper offset and continue with that page.
                 */
526                 if (unlikely(xdp_headroom < virtnet_get_headroom(vi))) {
527                         int offset = buf - page_address(page) + header_offset;
528                         unsigned int tlen = len + vi->hdr_len;
529                         u16 num_buf = 1;
530
531                         xdp_headroom = virtnet_get_headroom(vi);
532                         header_offset = VIRTNET_RX_PAD + xdp_headroom;
533                         headroom = vi->hdr_len + header_offset;
534                         buflen = SKB_DATA_ALIGN(GOOD_PACKET_LEN + headroom) +
535                                  SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
536                         xdp_page = xdp_linearize_page(rq, &num_buf, page,
537                                                       offset, header_offset,
538                                                       &tlen);
539                         if (!xdp_page)
540                                 goto err_xdp;
541
                        /* Drop the original page, the copy replaces it */
542                         buf = page_address(xdp_page);
543                         put_page(page);
544                         page = xdp_page;
545                 }
546
547                 xdp.data_hard_start = buf + VIRTNET_RX_PAD + vi->hdr_len;
548                 xdp.data = xdp.data_hard_start + xdp_headroom;
549                 xdp.data_end = xdp.data + len;
550                 orig_data = xdp.data;
551                 act = bpf_prog_run_xdp(xdp_prog, &xdp);
552
553                 switch (act) {
554                 case XDP_PASS:
555                         /* Recalculate length in case bpf program changed it */
556                         delta = orig_data - xdp.data;
557                         break;
558                 case XDP_TX:
                        /* virtnet_xdp_xmit() releases the page itself when
                         * it fails, so no put_page() on this path.
                         */
559                         if (unlikely(!virtnet_xdp_xmit(vi, rq, &xdp)))
560                                 trace_xdp_exception(vi->dev, xdp_prog, act);
561                         rcu_read_unlock();
562                         goto xdp_xmit;
563                 default:
564                         bpf_warn_invalid_xdp_action(act);
                        /* fall through */
565                 case XDP_ABORTED:
566                         trace_xdp_exception(vi->dev, xdp_prog, act);
                        /* fall through */
567                 case XDP_DROP:
568                         goto err_xdp;
569                 }
570         }
571         rcu_read_unlock();
572
        /* Wrap the receive buffer itself in an skb, without copying */
573         skb = build_skb(buf, buflen);
574         if (!skb) {
575                 put_page(page);
576                 goto err;
577         }
578         skb_reserve(skb, headroom - delta);
579         skb_put(skb, len + delta);
580         if (!delta) {
581                 buf += header_offset;
582                 memcpy(skb_vnet_hdr(skb), buf, vi->hdr_len);
583         } /* keep zeroed vnet hdr since packet was changed by bpf */
584
        /* Shared exit: skb is NULL here when build_skb() failed */
585 err:
586         return skb;
587
        /* Drop path taken inside the RCU read section */
588 err_xdp:
589         rcu_read_unlock();
590         dev->stats.rx_dropped++;
591         put_page(page);
592 xdp_xmit:
593         return NULL;
594 }
595
596 static struct sk_buff *receive_big(struct net_device *dev,
597                                    struct virtnet_info *vi,
598                                    struct receive_queue *rq,
599                                    void *buf,
600                                    unsigned int len)
601 {
602         struct page *page = buf;
603         struct sk_buff *skb = page_to_skb(vi, rq, page, 0, len,
604                                           PAGE_SIZE, true);
605
606         if (unlikely(!skb))
607                 goto err;
608
609         return skb;
610
611 err:
612         dev->stats.rx_dropped++;
613         give_pages(rq, page);
614         return NULL;
615 }
616
/* Build an skb from a packet received into mergeable buffers, running the
 * queue's XDP program first if one is attached.
 *
 * @buf: first buffer of the packet; it starts with the virtio header, whose
 *       num_buffers field says how many buffers the packet occupies
 * @ctx: context of the first buffer, encoding headroom and truesize (see
 *       mergeable_ctx_to_headroom() / mergeable_ctx_to_truesize())
 * @len: bytes the device wrote into the first buffer
 *
 * Returns the head skb, or NULL when the packet was dropped or was handed
 * to virtnet_xdp_xmit() (XDP_TX). On the error paths the buffers that
 * belong to the packet and are still in the virtqueue are pulled out and
 * released.
 */
617 static struct sk_buff *receive_mergeable(struct net_device *dev,
618                                          struct virtnet_info *vi,
619                                          struct receive_queue *rq,
620                                          void *buf,
621                                          void *ctx,
622                                          unsigned int len)
623 {
624         struct virtio_net_hdr_mrg_rxbuf *hdr = buf;
625         u16 num_buf = virtio16_to_cpu(vi->vdev, hdr->num_buffers);
626         struct page *page = virt_to_head_page(buf);
627         int offset = buf - page_address(page);
628         struct sk_buff *head_skb, *curr_skb;
629         struct bpf_prog *xdp_prog;
630         unsigned int truesize;
631         unsigned int headroom = mergeable_ctx_to_headroom(ctx);
632
633         head_skb = NULL;
634
635         rcu_read_lock();
636         xdp_prog = rcu_dereference(rq->xdp_prog);
637         if (xdp_prog) {
638                 struct page *xdp_page;
639                 struct xdp_buff xdp;
640                 void *data;
641                 u32 act;
642
643                 /* Transient failure which in theory could occur if
644                  * in-flight packets from before XDP was enabled reach
645                  * the receive path after XDP is loaded.
646                  */
647                 if (unlikely(hdr->hdr.gso_type))
648                         goto err_xdp;
649
                /* NOTE(review): on this path len has not been compared
                 * with the buffer's truesize yet; that check only happens
                 * after the XDP block below.
                 */
650                 /* This happens when rx buffer size is underestimated */
651                 if (unlikely(num_buf > 1 ||
652                              headroom < virtnet_get_headroom(vi))) {
653                         /* linearize data for XDP */
654                         xdp_page = xdp_linearize_page(rq, &num_buf,
655                                                       page, offset,
656                                                       VIRTIO_XDP_HEADROOM,
657                                                       &len);
658                         if (!xdp_page)
659                                 goto err_xdp;
660                         offset = VIRTIO_XDP_HEADROOM;
661                 } else {
662                         xdp_page = page;
663                 }
664
665                 /* Allow consuming headroom but reserve enough space to push
666                  * the descriptor on if we get an XDP_TX return code.
667                  */
668                 data = page_address(xdp_page) + offset;
669                 xdp.data_hard_start = data - VIRTIO_XDP_HEADROOM + vi->hdr_len;
670                 xdp.data = data + vi->hdr_len;
671                 xdp.data_end = xdp.data + (len - vi->hdr_len);
672                 act = bpf_prog_run_xdp(xdp_prog, &xdp);
673
674                 switch (act) {
675                 case XDP_PASS:
676                         /* recalculate offset to account for any header
677                          * adjustments. Note other cases do not build an
678                          * skb and avoid using offset
679                          */
680                         offset = xdp.data -
681                                         page_address(xdp_page) - vi->hdr_len;
682
683                         /* We can only create skb based on xdp_page. */
684                         if (unlikely(xdp_page != page)) {
                                /* The packet was linearized: release the
                                 * original page and build the skb from the
                                 * copy, without copying a virtio header
                                 * into the skb (hdr_valid is false).
                                 */
685                                 rcu_read_unlock();
686                                 put_page(page);
687                                 head_skb = page_to_skb(vi, rq, xdp_page,
688                                                        offset, len,
689                                                        PAGE_SIZE, false);
690                                 ewma_pkt_len_add(&rq->mrg_avg_pkt_len, len);
691                                 return head_skb;
692                         }
693                         break;
694                 case XDP_TX:
                        /* virtnet_xdp_xmit() releases the transmitted page
                         * itself when it fails. If the packet was
                         * linearized, the original page is released here.
                         */
695                         if (unlikely(!virtnet_xdp_xmit(vi, rq, &xdp)))
696                                 trace_xdp_exception(vi->dev, xdp_prog, act);
697                         ewma_pkt_len_add(&rq->mrg_avg_pkt_len, len);
698                         if (unlikely(xdp_page != page))
699                                 put_page(page);
700                         rcu_read_unlock();
701                         goto xdp_xmit;
702                 default:
703                         bpf_warn_invalid_xdp_action(act);
                        /* fall through */
704                 case XDP_ABORTED:
705                         trace_xdp_exception(vi->dev, xdp_prog, act);
                        /* fall through */
706                 case XDP_DROP:
                        /* Free the linearized copy, if any; the original
                         * page is released under err_skb.
                         */
707                         if (unlikely(xdp_page != page))
708                                 __free_pages(xdp_page, 0);
709                         ewma_pkt_len_add(&rq->mrg_avg_pkt_len, len);
710                         goto err_xdp;
711                 }
712         }
713         rcu_read_unlock();
714
        /* The device must not report more bytes than the buffer holds */
715         truesize = mergeable_ctx_to_truesize(ctx);
716         if (unlikely(len > truesize)) {
717                 pr_debug("%s: rx error: len %u exceeds truesize %lu\n",
718                          dev->name, len, (unsigned long)ctx);
719                 dev->stats.rx_length_errors++;
720                 goto err_skb;
721         }
722
        /* The header is only copied into the skb when no XDP program ran */
723         head_skb = page_to_skb(vi, rq, page, offset, len, truesize, !xdp_prog);
724         curr_skb = head_skb;
725
726         if (unlikely(!curr_skb))
727                 goto err_skb;
        /* Collect the remaining buffers of the packet as page fragments */
728         while (--num_buf) {
729                 int num_skb_frags;
730
731                 buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx);
732                 if (unlikely(!buf)) {
733                         pr_debug("%s: rx error: %d buffers out of %d missing\n",
734                                  dev->name, num_buf,
735                                  virtio16_to_cpu(vi->vdev,
736                                                  hdr->num_buffers));
737                         dev->stats.rx_length_errors++;
738                         goto err_buf;
739                 }
740
741                 page = virt_to_head_page(buf);
742
743                 truesize = mergeable_ctx_to_truesize(ctx);
744                 if (unlikely(len > truesize)) {
745                         pr_debug("%s: rx error: len %u exceeds truesize %lu\n",
746                                  dev->name, len, (unsigned long)ctx);
747                         dev->stats.rx_length_errors++;
748                         goto err_skb;
749                 }
750
751                 num_skb_frags = skb_shinfo(curr_skb)->nr_frags;
                /* Fragment array of the current skb is full: continue in a
                 * new, empty skb hooked up through the head's frag_list
                 * (first extra skb) or the previous extra skb's next.
                 */
752                 if (unlikely(num_skb_frags == MAX_SKB_FRAGS)) {
753                         struct sk_buff *nskb = alloc_skb(0, GFP_ATOMIC);
754
755                         if (unlikely(!nskb))
756                                 goto err_skb;
757                         if (curr_skb == head_skb)
758                                 skb_shinfo(curr_skb)->frag_list = nskb;
759                         else
760                                 curr_skb->next = nskb;
761                         curr_skb = nskb;
762                         head_skb->truesize += nskb->truesize;
763                         num_skb_frags = 0;
764                 }
                /* Data added to a chained skb is accounted on the head too */
765                 if (curr_skb != head_skb) {
766                         head_skb->data_len += len;
767                         head_skb->len += len;
768                         head_skb->truesize += truesize;
769                 }
770                 offset = buf - page_address(page);
                /* Extend the previous fragment when skb_can_coalesce()
                 * allows it (the extra page reference is dropped then),
                 * otherwise add a new fragment.
                 */
771                 if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) {
772                         put_page(page);
773                         skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1,
774                                              len, truesize);
775                 } else {
776                         skb_add_rx_frag(curr_skb, num_skb_frags, page,
777                                         offset, len, truesize);
778                 }
779         }
780
781         ewma_pkt_len_add(&rq->mrg_avg_pkt_len, head_skb->len);
782         return head_skb;
783
        /* Error exit taken while still inside the RCU read section */
784 err_xdp:
785         rcu_read_unlock();
        /* Release the current page, then pull the buffers of this packet
         * that are still in the virtqueue and release them as well.
         */
786 err_skb:
787         put_page(page);
788         while (num_buf-- > 1) {
789                 buf = virtqueue_get_buf(rq->vq, &len);
790                 if (unlikely(!buf)) {
791                         pr_debug("%s: rx error: %d buffers missing\n",
792                                  dev->name, num_buf);
793                         dev->stats.rx_length_errors++;
794                         break;
795                 }
796                 page = virt_to_head_page(buf);
797                 put_page(page);
798         }
        /* Count the drop and free whatever skb was built so far */
799 err_buf:
800         dev->stats.rx_dropped++;
801         dev_kfree_skb(head_skb);
802 xdp_xmit:
803         return NULL;
804 }
805
806 static int receive_buf(struct virtnet_info *vi, struct receive_queue *rq,
807                        void *buf, unsigned int len, void **ctx)
808 {
809         struct net_device *dev = vi->dev;
810         struct sk_buff *skb;
811         struct virtio_net_hdr_mrg_rxbuf *hdr;
812         int ret;
813
814         if (unlikely(len < vi->hdr_len + ETH_HLEN)) {
815                 pr_debug("%s: short packet %i\n", dev->name, len);
816                 dev->stats.rx_length_errors++;
817                 if (vi->mergeable_rx_bufs) {
818                         put_page(virt_to_head_page(buf));
819                 } else if (vi->big_packets) {
820                         give_pages(rq, buf);
821                 } else {
822                         put_page(virt_to_head_page(buf));
823                 }
824                 return 0;
825         }
826
827         if (vi->mergeable_rx_bufs)
828                 skb = receive_mergeable(dev, vi, rq, buf, ctx, len);
829         else if (vi->big_packets)
830                 skb = receive_big(dev, vi, rq, buf, len);
831         else
832                 skb = receive_small(dev, vi, rq, buf, ctx, len);
833
834         if (unlikely(!skb))
835                 return 0;
836
837         hdr = skb_vnet_hdr(skb);
838
839         ret = skb->len;
840
841         if (hdr->hdr.flags & VIRTIO_NET_HDR_F_DATA_VALID)
842                 skb->ip_summed = CHECKSUM_UNNECESSARY;
843
844         if (virtio_net_hdr_to_skb(skb, &hdr->hdr,
845                                   virtio_is_little_endian(vi->vdev))) {
846                 net_warn_ratelimited("%s: bad gso: type: %u, size: %u\n",
847                                      dev->name, hdr->hdr.gso_type,
848                                      hdr->hdr.gso_size);
849                 goto frame_err;
850         }
851
852         skb->protocol = eth_type_trans(skb, dev);
853         pr_debug("Receiving skb proto 0x%04x len %i type %i\n",
854                  ntohs(skb->protocol), skb->len, skb->pkt_type);
855
856         napi_gro_receive(&rq->napi, skb);
857         return ret;
858
859 frame_err:
860         dev->stats.rx_frame_errors++;
861         dev_kfree_skb(skb);
862         return 0;
863 }
864
865 /* Unlike mergeable buffers, all buffers are allocated to the
866  * same size, except for the headroom. For this reason we do
867  * not need to use  mergeable_len_to_ctx here - it is enough
868  * to store the headroom as the context ignoring the truesize.
869  */
870 static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
871                              gfp_t gfp)
872 {
873         struct page_frag *alloc_frag = &rq->alloc_frag;
874         char *buf;
875         unsigned int xdp_headroom = virtnet_get_headroom(vi);
876         void *ctx = (void *)(unsigned long)xdp_headroom;
877         int len = vi->hdr_len + VIRTNET_RX_PAD + GOOD_PACKET_LEN + xdp_headroom;
878         int err;
879
880         len = SKB_DATA_ALIGN(len) +
881               SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
882         if (unlikely(!skb_page_frag_refill(len, alloc_frag, gfp)))
883                 return -ENOMEM;
884
885         buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
886         get_page(alloc_frag->page);
887         alloc_frag->offset += len;
888         sg_init_one(rq->sg, buf + VIRTNET_RX_PAD + xdp_headroom,
889                     vi->hdr_len + GOOD_PACKET_LEN);
890         err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
891         if (err < 0)
892                 put_page(virt_to_head_page(buf));
893         return err;
894 }
895
896 static int add_recvbuf_big(struct virtnet_info *vi, struct receive_queue *rq,
897                            gfp_t gfp)
898 {
899         struct page *first, *list = NULL;
900         char *p;
901         int i, err, offset;
902
903         sg_init_table(rq->sg, MAX_SKB_FRAGS + 2);
904
905         /* page in rq->sg[MAX_SKB_FRAGS + 1] is list tail */
906         for (i = MAX_SKB_FRAGS + 1; i > 1; --i) {
907                 first = get_a_page(rq, gfp);
908                 if (!first) {
909                         if (list)
910                                 give_pages(rq, list);
911                         return -ENOMEM;
912                 }
913                 sg_set_buf(&rq->sg[i], page_address(first), PAGE_SIZE);
914
915                 /* chain new page in list head to match sg */
916                 first->private = (unsigned long)list;
917                 list = first;
918         }
919
920         first = get_a_page(rq, gfp);
921         if (!first) {
922                 give_pages(rq, list);
923                 return -ENOMEM;
924         }
925         p = page_address(first);
926
927         /* rq->sg[0], rq->sg[1] share the same page */
928         /* a separated rq->sg[0] for header - required in case !any_header_sg */
929         sg_set_buf(&rq->sg[0], p, vi->hdr_len);
930
931         /* rq->sg[1] for data packet, from offset */
932         offset = sizeof(struct padded_vnet_hdr);
933         sg_set_buf(&rq->sg[1], p + offset, PAGE_SIZE - offset);
934
935         /* chain first in list head */
936         first->private = (unsigned long)list;
937         err = virtqueue_add_inbuf(rq->vq, rq->sg, MAX_SKB_FRAGS + 2,
938                                   first, gfp);
939         if (err < 0)
940                 give_pages(rq, first);
941
942         return err;
943 }
944
945 static unsigned int get_mergeable_buf_len(struct receive_queue *rq,
946                                           struct ewma_pkt_len *avg_pkt_len)
947 {
948         const size_t hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
949         unsigned int len;
950
951         len = hdr_len + clamp_t(unsigned int, ewma_pkt_len_read(avg_pkt_len),
952                                 rq->min_buf_len, PAGE_SIZE - hdr_len);
953         return ALIGN(len, L1_CACHE_BYTES);
954 }
955
956 static int add_recvbuf_mergeable(struct virtnet_info *vi,
957                                  struct receive_queue *rq, gfp_t gfp)
958 {
959         struct page_frag *alloc_frag = &rq->alloc_frag;
960         unsigned int headroom = virtnet_get_headroom(vi);
961         char *buf;
962         void *ctx;
963         int err;
964         unsigned int len, hole;
965
966         len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len);
967         if (unlikely(!skb_page_frag_refill(len + headroom, alloc_frag, gfp)))
968                 return -ENOMEM;
969
970         buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
971         buf += headroom; /* advance address leaving hole at front of pkt */
972         get_page(alloc_frag->page);
973         alloc_frag->offset += len + headroom;
974         hole = alloc_frag->size - alloc_frag->offset;
975         if (hole < len + headroom) {
976                 /* To avoid internal fragmentation, if there is very likely not
977                  * enough space for another buffer, add the remaining space to
978                  * the current buffer.
979                  */
980                 len += hole;
981                 alloc_frag->offset += hole;
982         }
983
984         sg_init_one(rq->sg, buf, len);
985         ctx = mergeable_len_to_ctx(len, headroom);
986         err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
987         if (err < 0)
988                 put_page(virt_to_head_page(buf));
989
990         return err;
991 }
992
993 /*
994  * Returns false if we couldn't fill entirely (OOM).
995  *
996  * Normally run in the receive path, but can also be run from ndo_open
997  * before we're receiving packets, or from refill_work which is
998  * careful to disable receiving (using napi_disable).
999  */
1000 static bool try_fill_recv(struct virtnet_info *vi, struct receive_queue *rq,
1001                           gfp_t gfp)
1002 {
1003         int err;
1004         bool oom;
1005
1006         gfp |= __GFP_COLD;
1007         do {
1008                 if (vi->mergeable_rx_bufs)
1009                         err = add_recvbuf_mergeable(vi, rq, gfp);
1010                 else if (vi->big_packets)
1011                         err = add_recvbuf_big(vi, rq, gfp);
1012                 else
1013                         err = add_recvbuf_small(vi, rq, gfp);
1014
1015                 oom = err == -ENOMEM;
1016                 if (err)
1017                         break;
1018         } while (rq->vq->num_free);
1019         virtqueue_kick(rq->vq);
1020         return !oom;
1021 }
1022
1023 static void skb_recv_done(struct virtqueue *rvq)
1024 {
1025         struct virtnet_info *vi = rvq->vdev->priv;
1026         struct receive_queue *rq = &vi->rq[vq2rxq(rvq)];
1027
1028         virtqueue_napi_schedule(&rq->napi, rvq);
1029 }
1030
/* Enable @napi and immediately schedule it once for @vq, so that buffers
 * completed while NAPI was disabled are not left unprocessed.
 */
static void virtnet_napi_enable(struct virtqueue *vq, struct napi_struct *napi)
{
        napi_enable(napi);

        /* If all buffers were filled by other side before we napi_enabled, we
         * won't get another interrupt, so process any outstanding packets now.
         * Call local_bh_enable after to trigger softIRQ processing.
         */
        local_bh_disable();
        virtqueue_napi_schedule(napi, vq);
        local_bh_enable();
}
1043
1044 static void virtnet_napi_tx_enable(struct virtnet_info *vi,
1045                                    struct virtqueue *vq,
1046                                    struct napi_struct *napi)
1047 {
1048         if (!napi->weight)
1049                 return;
1050
1051         /* Tx napi touches cachelines on the cpu handling tx interrupts. Only
1052          * enable the feature if this is likely affine with the transmit path.
1053          */
1054         if (!vi->affinity_hint_set) {
1055                 napi->weight = 0;
1056                 return;
1057         }
1058
1059         return virtnet_napi_enable(vq, napi);
1060 }
1061
1062 static void virtnet_napi_tx_disable(struct napi_struct *napi)
1063 {
1064         if (napi->weight)
1065                 napi_disable(napi);
1066 }
1067
1068 static void refill_work(struct work_struct *work)
1069 {
1070         struct virtnet_info *vi =
1071                 container_of(work, struct virtnet_info, refill.work);
1072         bool still_empty;
1073         int i;
1074
1075         for (i = 0; i < vi->curr_queue_pairs; i++) {
1076                 struct receive_queue *rq = &vi->rq[i];
1077
1078                 napi_disable(&rq->napi);
1079                 still_empty = !try_fill_recv(vi, rq, GFP_KERNEL);
1080                 virtnet_napi_enable(rq->vq, &rq->napi);
1081
1082                 /* In theory, this can happen: if we don't get any buffers in
1083                  * we will *never* try to fill again.
1084                  */
1085                 if (still_empty)
1086                         schedule_delayed_work(&vi->refill, HZ/2);
1087         }
1088 }
1089
1090 static int virtnet_receive(struct receive_queue *rq, int budget)
1091 {
1092         struct virtnet_info *vi = rq->vq->vdev->priv;
1093         unsigned int len, received = 0, bytes = 0;
1094         void *buf;
1095         struct virtnet_stats *stats = this_cpu_ptr(vi->stats);
1096
1097         if (!vi->big_packets || vi->mergeable_rx_bufs) {
1098                 void *ctx;
1099
1100                 while (received < budget &&
1101                        (buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx))) {
1102                         bytes += receive_buf(vi, rq, buf, len, ctx);
1103                         received++;
1104                 }
1105         } else {
1106                 while (received < budget &&
1107                        (buf = virtqueue_get_buf(rq->vq, &len)) != NULL) {
1108                         bytes += receive_buf(vi, rq, buf, len, NULL);
1109                         received++;
1110                 }
1111         }
1112
1113         if (rq->vq->num_free > virtqueue_get_vring_size(rq->vq) / 2) {
1114                 if (!try_fill_recv(vi, rq, GFP_ATOMIC))
1115                         schedule_delayed_work(&vi->refill, 0);
1116         }
1117
1118         u64_stats_update_begin(&stats->rx_syncp);
1119         stats->rx_bytes += bytes;
1120         stats->rx_packets += received;
1121         u64_stats_update_end(&stats->rx_syncp);
1122
1123         return received;
1124 }
1125
1126 static void free_old_xmit_skbs(struct send_queue *sq)
1127 {
1128         struct sk_buff *skb;
1129         unsigned int len;
1130         struct virtnet_info *vi = sq->vq->vdev->priv;
1131         struct virtnet_stats *stats = this_cpu_ptr(vi->stats);
1132         unsigned int packets = 0;
1133         unsigned int bytes = 0;
1134
1135         while ((skb = virtqueue_get_buf(sq->vq, &len)) != NULL) {
1136                 pr_debug("Sent skb %p\n", skb);
1137
1138                 bytes += skb->len;
1139                 packets++;
1140
1141                 dev_consume_skb_any(skb);
1142         }
1143
1144         /* Avoid overhead when no packets have been processed
1145          * happens when called speculatively from start_xmit.
1146          */
1147         if (!packets)
1148                 return;
1149
1150         u64_stats_update_begin(&stats->tx_syncp);
1151         stats->tx_bytes += bytes;
1152         stats->tx_packets += packets;
1153         u64_stats_update_end(&stats->tx_syncp);
1154 }
1155
1156 static bool is_xdp_raw_buffer_queue(struct virtnet_info *vi, int q)
1157 {
1158         if (q < (vi->curr_queue_pairs - vi->xdp_queue_pairs))
1159                 return false;
1160         else if (q < vi->curr_queue_pairs)
1161                 return true;
1162         else
1163                 return false;
1164 }
1165
1166 static void virtnet_poll_cleantx(struct receive_queue *rq)
1167 {
1168         struct virtnet_info *vi = rq->vq->vdev->priv;
1169         unsigned int index = vq2rxq(rq->vq);
1170         struct send_queue *sq = &vi->sq[index];
1171         struct netdev_queue *txq = netdev_get_tx_queue(vi->dev, index);
1172
1173         if (!sq->napi.weight || is_xdp_raw_buffer_queue(vi, index))
1174                 return;
1175
1176         if (__netif_tx_trylock(txq)) {
1177                 free_old_xmit_skbs(sq);
1178                 __netif_tx_unlock(txq);
1179         }
1180
1181         if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
1182                 netif_tx_wake_queue(txq);
1183 }
1184
1185 static int virtnet_poll(struct napi_struct *napi, int budget)
1186 {
1187         struct receive_queue *rq =
1188                 container_of(napi, struct receive_queue, napi);
1189         unsigned int received;
1190
1191         virtnet_poll_cleantx(rq);
1192
1193         received = virtnet_receive(rq, budget);
1194
1195         /* Out of packets? */
1196         if (received < budget)
1197                 virtqueue_napi_complete(napi, rq->vq, received);
1198
1199         return received;
1200 }
1201
1202 static int virtnet_open(struct net_device *dev)
1203 {
1204         struct virtnet_info *vi = netdev_priv(dev);
1205         int i;
1206
1207         for (i = 0; i < vi->max_queue_pairs; i++) {
1208                 if (i < vi->curr_queue_pairs)
1209                         /* Make sure we have some buffers: if oom use wq. */
1210                         if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL))
1211                                 schedule_delayed_work(&vi->refill, 0);
1212                 virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
1213                 virtnet_napi_tx_enable(vi, vi->sq[i].vq, &vi->sq[i].napi);
1214         }
1215
1216         return 0;
1217 }
1218
1219 static int virtnet_poll_tx(struct napi_struct *napi, int budget)
1220 {
1221         struct send_queue *sq = container_of(napi, struct send_queue, napi);
1222         struct virtnet_info *vi = sq->vq->vdev->priv;
1223         unsigned int index = vq2txq(sq->vq);
1224         struct netdev_queue *txq;
1225
1226         if (unlikely(is_xdp_raw_buffer_queue(vi, index))) {
1227                 /* We don't need to enable cb for XDP */
1228                 napi_complete_done(napi, 0);
1229                 return 0;
1230         }
1231
1232         txq = netdev_get_tx_queue(vi->dev, index);
1233         __netif_tx_lock(txq, raw_smp_processor_id());
1234         free_old_xmit_skbs(sq);
1235         __netif_tx_unlock(txq);
1236
1237         virtqueue_napi_complete(napi, sq->vq, 0);
1238
1239         if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
1240                 netif_tx_wake_queue(txq);
1241
1242         return 0;
1243 }
1244
/* Build the virtio-net header for @skb and queue header + data on @sq.
 *
 * The header is written either directly into the skb headroom ("can_push",
 * one contiguous scatterlist) or into the area returned by skb_vnet_hdr(),
 * in which case it gets its own leading scatterlist entry.
 *
 * Returns 0 on success, -EPROTO if virtio_net_hdr_from_skb() rejects the
 * skb, or a negative error from skb_to_sgvec() / virtqueue_add_outbuf().
 * The skb is not freed here on failure.
 */
static int xmit_skb(struct send_queue *sq, struct sk_buff *skb)
{
        struct virtio_net_hdr_mrg_rxbuf *hdr;
        const unsigned char *dest = ((struct ethhdr *)skb->data)->h_dest;
        struct virtnet_info *vi = sq->vq->vdev->priv;
        int num_sg;
        unsigned hdr_len = vi->hdr_len;
        bool can_push;

        pr_debug("%s: xmit %p %pM\n", vi->dev->name, skb, dest);

        /* Pushing into the headroom requires: device accepts any header
         * layout, skb->data is aligned for the header type, the header area
         * is not shared with a clone, and there is enough headroom.
         */
        can_push = vi->any_header_sg &&
                !((unsigned long)skb->data & (__alignof__(*hdr) - 1)) &&
                !skb_header_cloned(skb) && skb_headroom(skb) >= hdr_len;
        /* Even if we can, don't push here yet as this would skew
         * csum_start offset below. */
        if (can_push)
                hdr = (struct virtio_net_hdr_mrg_rxbuf *)(skb->data - hdr_len);
        else
                hdr = skb_vnet_hdr(skb);

        if (virtio_net_hdr_from_skb(skb, &hdr->hdr,
                                    virtio_is_little_endian(vi->vdev), false,
                                    0))
                return -EPROTO;

        /* num_buffers is zeroed on transmit when mergeable buffers are on. */
        if (vi->mergeable_rx_bufs)
                hdr->num_buffers = 0;

        /* One entry per fragment, plus linear data, plus (if not pushed) the
         * separate header entry.
         */
        sg_init_table(sq->sg, skb_shinfo(skb)->nr_frags + (can_push ? 1 : 2));
        if (can_push) {
                __skb_push(skb, hdr_len);
                num_sg = skb_to_sgvec(skb, sq->sg, 0, skb->len);
                if (unlikely(num_sg < 0))
                        return num_sg;
                /* Pull header back to avoid skew in tx bytes calculations. */
                __skb_pull(skb, hdr_len);
        } else {
                sg_set_buf(sq->sg, hdr, hdr_len);
                num_sg = skb_to_sgvec(skb, sq->sg + 1, 0, skb->len);
                if (unlikely(num_sg < 0))
                        return num_sg;
                /* Account for the header entry placed at sq->sg[0]. */
                num_sg++;
        }
        return virtqueue_add_outbuf(sq->vq, sq->sg, num_sg, skb, GFP_ATOMIC);
}
1291
/* Transmit entry point (presumably installed as ndo_start_xmit - confirm
 * against the netdev_ops table, which is outside this section).
 *
 * Reclaims completed skbs, queues @skb on the send queue selected by its
 * queue mapping, stops the subqueue when the ring is nearly full and kicks
 * the device unless more packets are announced via skb->xmit_more.
 *
 * Always returns NETDEV_TX_OK; on a queueing failure the skb is dropped,
 * freed and counted in tx_fifo_errors / tx_dropped.
 */
static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
{
        struct virtnet_info *vi = netdev_priv(dev);
        int qnum = skb_get_queue_mapping(skb);
        struct send_queue *sq = &vi->sq[qnum];
        int err;
        struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum);
        /* Sampled up front: skb must not be touched after it is queued. */
        bool kick = !skb->xmit_more;
        bool use_napi = sq->napi.weight;

        /* Free up any pending old buffers before queueing new ones. */
        free_old_xmit_skbs(sq);

        /* With TX NAPI, re-arm the (delayed) completion callback whenever
         * this call is going to kick the device.
         */
        if (use_napi && kick)
                virtqueue_enable_cb_delayed(sq->vq);

        /* timestamp packet in software */
        skb_tx_timestamp(skb);

        /* Try to transmit */
        err = xmit_skb(sq, skb);

        /* This should not happen! */
        if (unlikely(err)) {
                dev->stats.tx_fifo_errors++;
                if (net_ratelimit())
                        dev_warn(&dev->dev,
                                 "Unexpected TXQ (%d) queue failure: %d\n", qnum, err);
                dev->stats.tx_dropped++;
                dev_kfree_skb_any(skb);
                return NETDEV_TX_OK;
        }

        /* Don't wait up for transmitted skbs to be freed. */
        if (!use_napi) {
                skb_orphan(skb);
                nf_reset(skb);
        }

        /* If running out of space, stop queue to avoid getting packets that we
         * are then unable to transmit.
         * An alternative would be to force queuing layer to requeue the skb by
         * returning NETDEV_TX_BUSY. However, NETDEV_TX_BUSY should not be
         * returned in a normal path of operation: it means that driver is not
         * maintaining the TX queue stop/start state properly, and causes
         * the stack to do a non-trivial amount of useless work.
         * Since most packets only take 1 or 2 ring slots, stopping the queue
         * early means 16 slots are typically wasted.
         */
        if (sq->vq->num_free < 2+MAX_SKB_FRAGS) {
                netif_stop_subqueue(dev, qnum);
                if (!use_napi &&
                    unlikely(!virtqueue_enable_cb_delayed(sq->vq))) {
                        /* More just got used, free them then recheck. */
                        free_old_xmit_skbs(sq);
                        if (sq->vq->num_free >= 2+MAX_SKB_FRAGS) {
                                netif_start_subqueue(dev, qnum);
                                virtqueue_disable_cb(sq->vq);
                        }
                }
        }

        /* A stopped queue gets a kick even if xmit_more asked to defer it. */
        if (kick || netif_xmit_stopped(txq))
                virtqueue_kick(sq->vq);

        return NETDEV_TX_OK;
}
1359
1360 /*
1361  * Send command via the control virtqueue and check status.  Commands
1362  * supported by the hypervisor, as indicated by feature bits, should
1363  * never fail unless improperly formatted.
1364  */
1365 static bool virtnet_send_command(struct virtnet_info *vi, u8 class, u8 cmd,
1366                                  struct scatterlist *out)
1367 {
1368         struct scatterlist *sgs[4], hdr, stat;
1369         unsigned out_num = 0, tmp;
1370
1371         /* Caller should know better */
1372         BUG_ON(!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ));
1373
1374         vi->ctrl->status = ~0;
1375         vi->ctrl->hdr.class = class;
1376         vi->ctrl->hdr.cmd = cmd;
1377         /* Add header */
1378         sg_init_one(&hdr, &vi->ctrl->hdr, sizeof(vi->ctrl->hdr));
1379         sgs[out_num++] = &hdr;
1380
1381         if (out)
1382                 sgs[out_num++] = out;
1383
1384         /* Add return status. */
1385         sg_init_one(&stat, &vi->ctrl->status, sizeof(vi->ctrl->status));
1386         sgs[out_num] = &stat;
1387
1388         BUG_ON(out_num + 1 > ARRAY_SIZE(sgs));
1389         virtqueue_add_sgs(vi->cvq, sgs, out_num, 1, vi, GFP_ATOMIC);
1390
1391         if (unlikely(!virtqueue_kick(vi->cvq)))
1392                 return vi->ctrl->status == VIRTIO_NET_OK;
1393
1394         /* Spin for a response, the kick causes an ioport write, trapping
1395          * into the hypervisor, so the request should be handled immediately.
1396          */
1397         while (!virtqueue_get_buf(vi->cvq, &tmp) &&
1398                !virtqueue_is_broken(vi->cvq))
1399                 cpu_relax();
1400
1401         return vi->ctrl->status == VIRTIO_NET_OK;
1402 }
1403
1404 static int virtnet_set_mac_address(struct net_device *dev, void *p)
1405 {
1406         struct virtnet_info *vi = netdev_priv(dev);
1407         struct virtio_device *vdev = vi->vdev;
1408         int ret;
1409         struct sockaddr *addr;
1410         struct scatterlist sg;
1411
1412         addr = kmemdup(p, sizeof(*addr), GFP_KERNEL);
1413         if (!addr)
1414                 return -ENOMEM;
1415
1416         ret = eth_prepare_mac_addr_change(dev, addr);
1417         if (ret)
1418                 goto out;
1419
1420         if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_MAC_ADDR)) {
1421                 sg_init_one(&sg, addr->sa_data, dev->addr_len);
1422                 if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC,
1423                                           VIRTIO_NET_CTRL_MAC_ADDR_SET, &sg)) {
1424                         dev_warn(&vdev->dev,
1425                                  "Failed to set mac address by vq command.\n");
1426                         ret = -EINVAL;
1427                         goto out;
1428                 }
1429         } else if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC) &&
1430                    !virtio_has_feature(vdev, VIRTIO_F_VERSION_1)) {
1431                 unsigned int i;
1432
1433                 /* Naturally, this has an atomicity problem. */
1434                 for (i = 0; i < dev->addr_len; i++)
1435                         virtio_cwrite8(vdev,
1436                                        offsetof(struct virtio_net_config, mac) +
1437                                        i, addr->sa_data[i]);
1438         }
1439
1440         eth_commit_mac_addr_change(dev, p);
1441         ret = 0;
1442
1443 out:
1444         kfree(addr);
1445         return ret;
1446 }
1447
1448 static void virtnet_stats(struct net_device *dev,
1449                           struct rtnl_link_stats64 *tot)
1450 {
1451         struct virtnet_info *vi = netdev_priv(dev);
1452         int cpu;
1453         unsigned int start;
1454
1455         for_each_possible_cpu(cpu) {
1456                 struct virtnet_stats *stats = per_cpu_ptr(vi->stats, cpu);
1457                 u64 tpackets, tbytes, rpackets, rbytes;
1458
1459                 do {
1460                         start = u64_stats_fetch_begin_irq(&stats->tx_syncp);
1461                         tpackets = stats->tx_packets;
1462                         tbytes   = stats->tx_bytes;
1463                 } while (u64_stats_fetch_retry_irq(&stats->tx_syncp, start));
1464
1465                 do {
1466                         start = u64_stats_fetch_begin_irq(&stats->rx_syncp);
1467                         rpackets = stats->rx_packets;
1468                         rbytes   = stats->rx_bytes;
1469                 } while (u64_stats_fetch_retry_irq(&stats->rx_syncp, start));
1470
1471                 tot->rx_packets += rpackets;
1472                 tot->tx_packets += tpackets;
1473                 tot->rx_bytes   += rbytes;
1474                 tot->tx_bytes   += tbytes;
1475         }
1476
1477         tot->tx_dropped = dev->stats.tx_dropped;
1478         tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
1479         tot->rx_dropped = dev->stats.rx_dropped;
1480         tot->rx_length_errors = dev->stats.rx_length_errors;
1481         tot->rx_frame_errors = dev->stats.rx_frame_errors;
1482 }
1483
#ifdef CONFIG_NET_POLL_CONTROLLER
/* Netpoll hook: schedule RX NAPI on every currently active queue pair. */
static void virtnet_netpoll(struct net_device *dev)
{
        struct virtnet_info *vi = netdev_priv(dev);
        int i;

        for (i = 0; i < vi->curr_queue_pairs; i++) {
                struct receive_queue *rq = &vi->rq[i];

                napi_schedule(&rq->napi);
        }
}
#endif
1494
1495 static void virtnet_ack_link_announce(struct virtnet_info *vi)
1496 {
1497         rtnl_lock();
1498         if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_ANNOUNCE,
1499                                   VIRTIO_NET_CTRL_ANNOUNCE_ACK, NULL))
1500                 dev_warn(&vi->dev->dev, "Failed to ack link announce.\n");
1501         rtnl_unlock();
1502 }
1503
1504 static int _virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs)
1505 {
1506         struct scatterlist sg;
1507         struct net_device *dev = vi->dev;
1508
1509         if (!vi->has_cvq || !virtio_has_feature(vi->vdev, VIRTIO_NET_F_MQ))
1510                 return 0;
1511
1512         vi->ctrl->mq.virtqueue_pairs = cpu_to_virtio16(vi->vdev, queue_pairs);
1513         sg_init_one(&sg, &vi->ctrl->mq, sizeof(vi->ctrl->mq));
1514
1515         if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MQ,
1516                                   VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET, &sg)) {
1517                 dev_warn(&dev->dev, "Fail to set num of queue pairs to %d\n",
1518                          queue_pairs);
1519                 return -EINVAL;
1520         } else {
1521                 vi->curr_queue_pairs = queue_pairs;
1522                 /* virtnet_open() will refill when device is going to up. */
1523                 if (dev->flags & IFF_UP)
1524                         schedule_delayed_work(&vi->refill, 0);
1525         }
1526
1527         return 0;
1528 }
1529
1530 static int virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs)
1531 {
1532         int err;
1533
1534         rtnl_lock();
1535         err = _virtnet_set_queues(vi, queue_pairs);
1536         rtnl_unlock();
1537         return err;
1538 }
1539
1540 static int virtnet_close(struct net_device *dev)
1541 {
1542         struct virtnet_info *vi = netdev_priv(dev);
1543         int i;
1544
1545         /* Make sure refill_work doesn't re-enable napi! */
1546         cancel_delayed_work_sync(&vi->refill);
1547
1548         for (i = 0; i < vi->max_queue_pairs; i++) {
1549                 napi_disable(&vi->rq[i].napi);
1550                 virtnet_napi_tx_disable(&vi->sq[i].napi);
1551         }
1552
1553         return 0;
1554 }
1555
/* Push the receive filter configuration of @dev to the device.
 *
 * Three control commands are issued in order: promiscuous on/off, allmulti
 * on/off, and the MAC filter table (unicast list followed by multicast
 * list). Command failures are only logged; if the table buffer cannot be
 * allocated the MAC table command is skipped silently.
 */
static void virtnet_set_rx_mode(struct net_device *dev)
{
        struct virtnet_info *vi = netdev_priv(dev);
        struct scatterlist sg[2];
        struct virtio_net_ctrl_mac *mac_data;
        struct netdev_hw_addr *ha;
        int uc_count;
        int mc_count;
        void *buf;
        int i;

        /* We can't dynamically set ndo_set_rx_mode, so return gracefully */
        if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_RX))
                return;

        vi->ctrl->promisc = ((dev->flags & IFF_PROMISC) != 0);
        vi->ctrl->allmulti = ((dev->flags & IFF_ALLMULTI) != 0);

        sg_init_one(sg, &vi->ctrl->promisc, sizeof(vi->ctrl->promisc));

        if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX,
                                  VIRTIO_NET_CTRL_RX_PROMISC, sg))
                dev_warn(&dev->dev, "Failed to %sable promisc mode.\n",
                         vi->ctrl->promisc ? "en" : "dis");

        sg_init_one(sg, &vi->ctrl->allmulti, sizeof(vi->ctrl->allmulti));

        if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX,
                                  VIRTIO_NET_CTRL_RX_ALLMULTI, sg))
                dev_warn(&dev->dev, "Failed to %sable allmulti mode.\n",
                         vi->ctrl->allmulti ? "en" : "dis");

        uc_count = netdev_uc_count(dev);
        mc_count = netdev_mc_count(dev);
        /* MAC filter - use one buffer for both lists */
        /* Layout: [entries][uc macs...][entries][mc macs...], hence room for
         * two 'entries' fields plus ETH_ALEN bytes per address.
         */
        buf = kzalloc(((uc_count + mc_count) * ETH_ALEN) +
                      (2 * sizeof(mac_data->entries)), GFP_ATOMIC);
        mac_data = buf;
        if (!buf)
                return;

        sg_init_table(sg, 2);

        /* Store the unicast list and count in the front of the buffer */
        mac_data->entries = cpu_to_virtio32(vi->vdev, uc_count);
        i = 0;
        netdev_for_each_uc_addr(ha, dev)
                memcpy(&mac_data->macs[i++][0], ha->addr, ETH_ALEN);

        sg_set_buf(&sg[0], mac_data,
                   sizeof(mac_data->entries) + (uc_count * ETH_ALEN));

        /* multicast list and count fill the end */
        /* Re-aim mac_data at the byte right after the last unicast address;
         * 'buf' keeps the start of the allocation for kfree().
         */
        mac_data = (void *)&mac_data->macs[uc_count][0];

        mac_data->entries = cpu_to_virtio32(vi->vdev, mc_count);
        i = 0;
        netdev_for_each_mc_addr(ha, dev)
                memcpy(&mac_data->macs[i++][0], ha->addr, ETH_ALEN);

        sg_set_buf(&sg[1], mac_data,
                   sizeof(mac_data->entries) + (mc_count * ETH_ALEN));

        if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC,
                                  VIRTIO_NET_CTRL_MAC_TABLE_SET, sg))
                dev_warn(&dev->dev, "Failed to set MAC filter table.\n");

        kfree(buf);
}
1625
1626 static int virtnet_vlan_rx_add_vid(struct net_device *dev,
1627                                    __be16 proto, u16 vid)
1628 {
1629         struct virtnet_info *vi = netdev_priv(dev);
1630         struct scatterlist sg;
1631
1632         vi->ctrl->vid = cpu_to_virtio16(vi->vdev, vid);
1633         sg_init_one(&sg, &vi->ctrl->vid, sizeof(vi->ctrl->vid));
1634
1635         if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN,
1636                                   VIRTIO_NET_CTRL_VLAN_ADD, &sg))
1637                 dev_warn(&dev->dev, "Failed to add VLAN ID %d.\n", vid);
1638         return 0;
1639 }
1640
1641 static int virtnet_vlan_rx_kill_vid(struct net_device *dev,
1642                                     __be16 proto, u16 vid)
1643 {
1644         struct virtnet_info *vi = netdev_priv(dev);
1645         struct scatterlist sg;
1646
1647         vi->ctrl->vid = cpu_to_virtio16(vi->vdev, vid);
1648         sg_init_one(&sg, &vi->ctrl->vid, sizeof(vi->ctrl->vid));
1649
1650         if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN,
1651                                   VIRTIO_NET_CTRL_VLAN_DEL, &sg))
1652                 dev_warn(&dev->dev, "Failed to kill VLAN ID %d.\n", vid);
1653         return 0;
1654 }
1655
1656 static void virtnet_clean_affinity(struct virtnet_info *vi, long hcpu)
1657 {
1658         int i;
1659
1660         if (vi->affinity_hint_set) {
1661                 for (i = 0; i < vi->max_queue_pairs; i++) {
1662                         virtqueue_set_affinity(vi->rq[i].vq, -1);
1663                         virtqueue_set_affinity(vi->sq[i].vq, -1);
1664                 }
1665
1666                 vi->affinity_hint_set = false;
1667         }
1668 }
1669
1670 static void virtnet_set_affinity(struct virtnet_info *vi)
1671 {
1672         int i;
1673         int cpu;
1674
1675         /* In multiqueue mode, when the number of cpu is equal to the number of
1676          * queue pairs, we let the queue pairs to be private to one cpu by
1677          * setting the affinity hint to eliminate the contention.
1678          */
1679         if (vi->curr_queue_pairs == 1 ||
1680             vi->max_queue_pairs != num_online_cpus()) {
1681                 virtnet_clean_affinity(vi, -1);
1682                 return;
1683         }
1684
1685         i = 0;
1686         for_each_online_cpu(cpu) {
1687                 virtqueue_set_affinity(vi->rq[i].vq, cpu);
1688                 virtqueue_set_affinity(vi->sq[i].vq, cpu);
1689                 netif_set_xps_queue(vi->dev, cpumask_of(cpu), i);
1690                 i++;
1691         }
1692
1693         vi->affinity_hint_set = true;
1694 }
1695
1696 static int virtnet_cpu_online(unsigned int cpu, struct hlist_node *node)
1697 {
1698         struct virtnet_info *vi = hlist_entry_safe(node, struct virtnet_info,
1699                                                    node);
1700         virtnet_set_affinity(vi);
1701         return 0;
1702 }
1703
1704 static int virtnet_cpu_dead(unsigned int cpu, struct hlist_node *node)
1705 {
1706         struct virtnet_info *vi = hlist_entry_safe(node, struct virtnet_info,
1707                                                    node_dead);
1708         virtnet_set_affinity(vi);
1709         return 0;
1710 }
1711
1712 static int virtnet_cpu_down_prep(unsigned int cpu, struct hlist_node *node)
1713 {
1714         struct virtnet_info *vi = hlist_entry_safe(node, struct virtnet_info,
1715                                                    node);
1716
1717         virtnet_clean_affinity(vi, cpu);
1718         return 0;
1719 }
1720
1721 static enum cpuhp_state virtionet_online;
1722
1723 static int virtnet_cpu_notif_add(struct virtnet_info *vi)
1724 {
1725         int ret;
1726
1727         ret = cpuhp_state_add_instance_nocalls(virtionet_online, &vi->node);
1728         if (ret)
1729                 return ret;
1730         ret = cpuhp_state_add_instance_nocalls(CPUHP_VIRT_NET_DEAD,
1731                                                &vi->node_dead);
1732         if (!ret)
1733                 return ret;
1734         cpuhp_state_remove_instance_nocalls(virtionet_online, &vi->node);
1735         return ret;
1736 }
1737
1738 static void virtnet_cpu_notif_remove(struct virtnet_info *vi)
1739 {
1740         cpuhp_state_remove_instance_nocalls(virtionet_online, &vi->node);
1741         cpuhp_state_remove_instance_nocalls(CPUHP_VIRT_NET_DEAD,
1742                                             &vi->node_dead);
1743 }
1744
1745 static void virtnet_get_ringparam(struct net_device *dev,
1746                                 struct ethtool_ringparam *ring)
1747 {
1748         struct virtnet_info *vi = netdev_priv(dev);
1749
1750         ring->rx_max_pending = virtqueue_get_vring_size(vi->rq[0].vq);
1751         ring->tx_max_pending = virtqueue_get_vring_size(vi->sq[0].vq);
1752         ring->rx_pending = ring->rx_max_pending;
1753         ring->tx_pending = ring->tx_max_pending;
1754 }
1755
1756
1757 static void virtnet_get_drvinfo(struct net_device *dev,
1758                                 struct ethtool_drvinfo *info)
1759 {
1760         struct virtnet_info *vi = netdev_priv(dev);
1761         struct virtio_device *vdev = vi->vdev;
1762
1763         strlcpy(info->driver, KBUILD_MODNAME, sizeof(info->driver));
1764         strlcpy(info->version, VIRTNET_DRIVER_VERSION, sizeof(info->version));
1765         strlcpy(info->bus_info, virtio_bus_name(vdev), sizeof(info->bus_info));
1766
1767 }
1768
1769 /* TODO: Eliminate OOO packets during switching */
1770 static int virtnet_set_channels(struct net_device *dev,
1771                                 struct ethtool_channels *channels)
1772 {
1773         struct virtnet_info *vi = netdev_priv(dev);
1774         u16 queue_pairs = channels->combined_count;
1775         int err;
1776
1777         /* We don't support separate rx/tx channels.
1778          * We don't allow setting 'other' channels.
1779          */
1780         if (channels->rx_count || channels->tx_count || channels->other_count)
1781                 return -EINVAL;
1782
1783         if (queue_pairs > vi->max_queue_pairs || queue_pairs == 0)
1784                 return -EINVAL;
1785
1786         /* For now we don't support modifying channels while XDP is loaded
1787          * also when XDP is loaded all RX queues have XDP programs so we only
1788          * need to check a single RX queue.
1789          */
1790         if (vi->rq[0].xdp_prog)
1791                 return -EINVAL;
1792
1793         get_online_cpus();
1794         err = _virtnet_set_queues(vi, queue_pairs);
1795         if (err) {
1796                 put_online_cpus();
1797                 goto err;
1798         }
1799         virtnet_set_affinity(vi);
1800         put_online_cpus();
1801
1802         netif_set_real_num_tx_queues(dev, queue_pairs);
1803         netif_set_real_num_rx_queues(dev, queue_pairs);
1804  err:
1805         return err;
1806 }
1807
1808 static void virtnet_get_channels(struct net_device *dev,
1809                                  struct ethtool_channels *channels)
1810 {
1811         struct virtnet_info *vi = netdev_priv(dev);
1812
1813         channels->combined_count = vi->curr_queue_pairs;
1814         channels->max_combined = vi->max_queue_pairs;
1815         channels->max_other = 0;
1816         channels->rx_count = 0;
1817         channels->tx_count = 0;
1818         channels->other_count = 0;
1819 }
1820
1821 /* Check if the user is trying to change anything besides speed/duplex */
1822 static bool
1823 virtnet_validate_ethtool_cmd(const struct ethtool_link_ksettings *cmd)
1824 {
1825         struct ethtool_link_ksettings diff1 = *cmd;
1826         struct ethtool_link_ksettings diff2 = {};
1827
1828         /* cmd is always set so we need to clear it, validate the port type
1829          * and also without autonegotiation we can ignore advertising
1830          */
1831         diff1.base.speed = 0;
1832         diff2.base.port = PORT_OTHER;
1833         ethtool_link_ksettings_zero_link_mode(&diff1, advertising);
1834         diff1.base.duplex = 0;
1835         diff1.base.cmd = 0;
1836         diff1.base.link_mode_masks_nwords = 0;
1837
1838         return !memcmp(&diff1.base, &diff2.base, sizeof(diff1.base)) &&
1839                 bitmap_empty(diff1.link_modes.supported,
1840                              __ETHTOOL_LINK_MODE_MASK_NBITS) &&
1841                 bitmap_empty(diff1.link_modes.advertising,
1842                              __ETHTOOL_LINK_MODE_MASK_NBITS) &&
1843                 bitmap_empty(diff1.link_modes.lp_advertising,
1844                              __ETHTOOL_LINK_MODE_MASK_NBITS);
1845 }
1846
1847 static int virtnet_set_link_ksettings(struct net_device *dev,
1848                                       const struct ethtool_link_ksettings *cmd)
1849 {
1850         struct virtnet_info *vi = netdev_priv(dev);
1851         u32 speed;
1852
1853         speed = cmd->base.speed;
1854         /* don't allow custom speed and duplex */
1855         if (!ethtool_validate_speed(speed) ||
1856             !ethtool_validate_duplex(cmd->base.duplex) ||
1857             !virtnet_validate_ethtool_cmd(cmd))
1858                 return -EINVAL;
1859         vi->speed = speed;
1860         vi->duplex = cmd->base.duplex;
1861
1862         return 0;
1863 }
1864
1865 static int virtnet_get_link_ksettings(struct net_device *dev,
1866                                       struct ethtool_link_ksettings *cmd)
1867 {
1868         struct virtnet_info *vi = netdev_priv(dev);
1869
1870         cmd->base.speed = vi->speed;
1871         cmd->base.duplex = vi->duplex;
1872         cmd->base.port = PORT_OTHER;
1873
1874         return 0;
1875 }
1876
1877 static void virtnet_init_settings(struct net_device *dev)
1878 {
1879         struct virtnet_info *vi = netdev_priv(dev);
1880
1881         vi->speed = SPEED_UNKNOWN;
1882         vi->duplex = DUPLEX_UNKNOWN;
1883 }
1884
/* ethtool entry points for this driver.  Link state and timestamping info
 * use the generic ethtool_op_* helpers; the rest are implemented in this
 * file.
 */
static const struct ethtool_ops virtnet_ethtool_ops = {
        .get_drvinfo = virtnet_get_drvinfo,
        .get_link = ethtool_op_get_link,
        .get_ringparam = virtnet_get_ringparam,
        .set_channels = virtnet_set_channels,
        .get_channels = virtnet_get_channels,
        .get_ts_info = ethtool_op_get_ts_info,
        .get_link_ksettings = virtnet_get_link_ksettings,
        .set_link_ksettings = virtnet_set_link_ksettings,
};
1895
1896 static void virtnet_freeze_down(struct virtio_device *vdev)
1897 {
1898         struct virtnet_info *vi = vdev->priv;
1899         int i;
1900
1901         /* Make sure no work handler is accessing the device */
1902         flush_work(&vi->config_work);
1903
1904         netif_tx_lock_bh(vi->dev);
1905         netif_device_detach(vi->dev);
1906         netif_tx_unlock_bh(vi->dev);
1907         cancel_delayed_work_sync(&vi->refill);
1908
1909         if (netif_running(vi->dev)) {
1910                 for (i = 0; i < vi->max_queue_pairs; i++) {
1911                         napi_disable(&vi->rq[i].napi);
1912                         virtnet_napi_tx_disable(&vi->sq[i].napi);
1913                 }
1914         }
1915 }
1916
1917 static int init_vqs(struct virtnet_info *vi);
1918
1919 static int virtnet_restore_up(struct virtio_device *vdev)
1920 {
1921         struct virtnet_info *vi = vdev->priv;
1922         int err, i;
1923
1924         err = init_vqs(vi);
1925         if (err)
1926                 return err;
1927
1928         virtio_device_ready(vdev);
1929
1930         if (netif_running(vi->dev)) {
1931                 for (i = 0; i < vi->curr_queue_pairs; i++)
1932                         if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL))
1933                                 schedule_delayed_work(&vi->refill, 0);
1934
1935                 for (i = 0; i < vi->max_queue_pairs; i++) {
1936                         virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
1937                         virtnet_napi_tx_enable(vi, vi->sq[i].vq,
1938                                                &vi->sq[i].napi);
1939                 }
1940         }
1941
1942         netif_tx_lock_bh(vi->dev);
1943         netif_device_attach(vi->dev);
1944         netif_tx_unlock_bh(vi->dev);
1945         return err;
1946 }
1947
1948 static int virtnet_set_guest_offloads(struct virtnet_info *vi, u64 offloads)
1949 {
1950         struct scatterlist sg;
1951         vi->ctrl->offloads = cpu_to_virtio64(vi->vdev, offloads);
1952
1953         sg_init_one(&sg, &vi->ctrl->offloads, sizeof(vi->ctrl->offloads));
1954
1955         if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_GUEST_OFFLOADS,
1956                                   VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET, &sg)) {
1957                 dev_warn(&vi->dev->dev, "Fail to set guest offload. \n");
1958                 return -EINVAL;
1959         }
1960
1961         return 0;
1962 }
1963
1964 static int virtnet_clear_guest_offloads(struct virtnet_info *vi)
1965 {
1966         u64 offloads = 0;
1967
1968         if (!vi->guest_offloads)
1969                 return 0;
1970
1971         return virtnet_set_guest_offloads(vi, offloads);
1972 }
1973
1974 static int virtnet_restore_guest_offloads(struct virtnet_info *vi)
1975 {
1976         u64 offloads = vi->guest_offloads;
1977
1978         if (!vi->guest_offloads)
1979                 return 0;
1980
1981         return virtnet_set_guest_offloads(vi, offloads);
1982 }
1983
/* Attach @prog as the XDP program of every receive queue, or detach the
 * current program when @prog is NULL.
 *
 * Returns 0 on success.  Fails with:
 *   -EOPNOTSUPP  the device has guest TSO4/TSO6/ECN/UFO/CSUM but lacks
 *                VIRTIO_NET_F_CTRL_GUEST_OFFLOADS;
 *   -EINVAL      mergeable rx buffers are in use without any_header_sg,
 *                or the MTU exceeds PAGE_SIZE minus the padded header;
 *   -ENOMEM      the current non-XDP queue pairs plus nr_cpu_ids XDP
 *                queue pairs would exceed max_queue_pairs;
 * or with the error returned by bpf_prog_add() / _virtnet_set_queues().
 * The first three set an explanatory message in @extack.
 */
static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog,
                           struct netlink_ext_ack *extack)
{
        unsigned long int max_sz = PAGE_SIZE - sizeof(struct padded_vnet_hdr);
        struct virtnet_info *vi = netdev_priv(dev);
        struct bpf_prog *old_prog;
        u16 xdp_qp = 0, curr_qp;
        int i, err;

        if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS)
            && (virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO4) ||
                virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO6) ||
                virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_ECN) ||
                virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_UFO) ||
                virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_CSUM))) {
                NL_SET_ERR_MSG_MOD(extack, "Can't set XDP while host is implementing LRO/CSUM, disable LRO/CSUM first");
                return -EOPNOTSUPP;
        }

        if (vi->mergeable_rx_bufs && !vi->any_header_sg) {
                NL_SET_ERR_MSG_MOD(extack, "XDP expects header/data in single page, any_header_sg required");
                return -EINVAL;
        }

        if (dev->mtu > max_sz) {
                NL_SET_ERR_MSG_MOD(extack, "MTU too large to enable XDP");
                netdev_warn(dev, "XDP requires MTU less than %lu\n", max_sz);
                return -EINVAL;
        }

        /* Queue pairs in use excluding those currently reserved for XDP;
         * attaching a program reserves nr_cpu_ids additional pairs.
         */
        curr_qp = vi->curr_queue_pairs - vi->xdp_queue_pairs;
        if (prog)
                xdp_qp = nr_cpu_ids;

        /* XDP requires extra queues for XDP_TX */
        if (curr_qp + xdp_qp > vi->max_queue_pairs) {
                NL_SET_ERR_MSG_MOD(extack, "Too few free TX rings available");
                netdev_warn(dev, "request %i queues but max is %i\n",
                            curr_qp + xdp_qp, vi->max_queue_pairs);
                return -ENOMEM;
        }

        /* Take max_queue_pairs - 1 additional references on the program;
         * presumably the caller already holds one, giving one reference
         * per receive queue -- TODO confirm against the caller.
         */
        if (prog) {
                prog = bpf_prog_add(prog, vi->max_queue_pairs - 1);
                if (IS_ERR(prog))
                        return PTR_ERR(prog);
        }

        /* Make sure NAPI is not using any XDP TX queues for RX. */
        if (netif_running(dev)) {
                for (i = 0; i < vi->max_queue_pairs; i++) {
                        napi_disable(&vi->rq[i].napi);
                        virtnet_napi_tx_disable(&vi->sq[i].napi);
                }
        }

        err = _virtnet_set_queues(vi, curr_qp + xdp_qp);
        if (err)
                goto err;
        netif_set_real_num_rx_queues(dev, curr_qp + xdp_qp);
        vi->xdp_queue_pairs = xdp_qp;

        /* Swap the program on every receive queue, drop the reference held
         * on the old one, and re-enable NAPI as each queue is done.
         */
        for (i = 0; i < vi->max_queue_pairs; i++) {
                old_prog = rtnl_dereference(vi->rq[i].xdp_prog);
                rcu_assign_pointer(vi->rq[i].xdp_prog, prog);
                /* Guest offloads are toggled once, on the first queue:
                 * cleared when no program was attached before, restored
                 * when no program is being attached now.
                 */
                if (i == 0) {
                        if (!old_prog)
                                virtnet_clear_guest_offloads(vi);
                        if (!prog)
                                virtnet_restore_guest_offloads(vi);
                }
                if (old_prog)
                        bpf_prog_put(old_prog);
                if (netif_running(dev)) {
                        virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
                        virtnet_napi_tx_enable(vi, vi->sq[i].vq,
                                               &vi->sq[i].napi);
                }
        }

        return 0;

err:
        /* Changing the queue count failed: re-enable the NAPI contexts
         * disabled above and give back the extra program references.
         */
        if (netif_running(dev)) {
                for (i = 0; i < vi->max_queue_pairs; i++) {
                        virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
                        virtnet_napi_tx_enable(vi, vi->sq[i].vq,
                                               &vi->sq[i].napi);
                }
        }
        if (prog)
                bpf_prog_sub(prog, vi->max_queue_pairs - 1);
        return err;
}
2078
2079 static u32 virtnet_xdp_query(struct net_device *dev)
2080 {
2081         struct virtnet_info *vi = netdev_priv(dev);
2082         const struct bpf_prog *xdp_prog;
2083         int i;
2084
2085         for (i = 0; i < vi->max_queue_pairs; i++) {
2086                 xdp_prog = rtnl_dereference(vi->rq[i].xdp_prog);
2087                 if (xdp_prog)
2088                         return xdp_prog->aux->id;
2089         }
2090         return 0;
2091 }
2092
2093 static int virtnet_xdp(struct net_device *dev, struct netdev_xdp *xdp)
2094 {
2095         switch (xdp->command) {
2096         case XDP_SETUP_PROG:
2097                 return virtnet_xdp_set(dev, xdp->prog, xdp->extack);
2098         case XDP_QUERY_PROG:
2099                 xdp->prog_id = virtnet_xdp_query(dev);
2100                 xdp->prog_attached = !!xdp->prog_id;
2101                 return 0;
2102         default:
2103                 return -EINVAL;
2104         }
2105 }
2106
/* net_device callbacks for this driver.  The netpoll hook is only present
 * when CONFIG_NET_POLL_CONTROLLER is enabled.
 */
static const struct net_device_ops virtnet_netdev = {
        .ndo_open            = virtnet_open,
        .ndo_stop            = virtnet_close,
        .ndo_start_xmit      = start_xmit,
        .ndo_validate_addr   = eth_validate_addr,
        .ndo_set_mac_address = virtnet_set_mac_address,
        .ndo_set_rx_mode     = virtnet_set_rx_mode,
        .ndo_get_stats64     = virtnet_stats,
        .ndo_vlan_rx_add_vid = virtnet_vlan_rx_add_vid,
        .ndo_vlan_rx_kill_vid = virtnet_vlan_rx_kill_vid,
#ifdef CONFIG_NET_POLL_CONTROLLER
        .ndo_poll_controller = virtnet_netpoll,
#endif
        .ndo_xdp                = virtnet_xdp,
        .ndo_features_check     = passthru_features_check,
};
2123
2124 static void virtnet_config_changed_work(struct work_struct *work)
2125 {
2126         struct virtnet_info *vi =
2127                 container_of(work, struct virtnet_info, config_work);
2128         u16 v;
2129
2130         if (virtio_cread_feature(vi->vdev, VIRTIO_NET_F_STATUS,
2131                                  struct virtio_net_config, status, &v) < 0)
2132                 return;
2133
2134         if (v & VIRTIO_NET_S_ANNOUNCE) {
2135                 netdev_notify_peers(vi->dev);
2136                 virtnet_ack_link_announce(vi);
2137         }
2138
2139         /* Ignore unknown (future) status bits */
2140         v &= VIRTIO_NET_S_LINK_UP;
2141
2142         if (vi->status == v)
2143                 return;
2144
2145         vi->status = v;
2146
2147         if (vi->status & VIRTIO_NET_S_LINK_UP) {
2148                 netif_carrier_on(vi->dev);
2149                 netif_tx_wake_all_queues(vi->dev);
2150         } else {
2151                 netif_carrier_off(vi->dev);
2152                 netif_tx_stop_all_queues(vi->dev);
2153         }
2154 }
2155
2156 static void virtnet_config_changed(struct virtio_device *vdev)
2157 {
2158         struct virtnet_info *vi = vdev->priv;
2159
2160         schedule_work(&vi->config_work);
2161 }
2162
2163 static void virtnet_free_queues(struct virtnet_info *vi)
2164 {
2165         int i;
2166
2167         for (i = 0; i < vi->max_queue_pairs; i++) {
2168                 napi_hash_del(&vi->rq[i].napi);
2169                 netif_napi_del(&vi->rq[i].napi);
2170                 netif_napi_del(&vi->sq[i].napi);
2171         }
2172
2173         /* We called napi_hash_del() before netif_napi_del(),
2174          * we need to respect an RCU grace period before freeing vi->rq
2175          */
2176         synchronize_net();
2177
2178         kfree(vi->rq);
2179         kfree(vi->sq);
2180         kfree(vi->ctrl);
2181 }
2182
2183 static void _free_receive_bufs(struct virtnet_info *vi)
2184 {
2185         struct bpf_prog *old_prog;
2186         int i;
2187
2188         for (i = 0; i < vi->max_queue_pairs; i++) {
2189                 while (vi->rq[i].pages)
2190                         __free_pages(get_a_page(&vi->rq[i], GFP_KERNEL), 0);
2191
2192                 old_prog = rtnl_dereference(vi->rq[i].xdp_prog);
2193                 RCU_INIT_POINTER(vi->rq[i].xdp_prog, NULL);
2194                 if (old_prog)
2195                         bpf_prog_put(old_prog);
2196         }
2197 }
2198
/* Run _free_receive_bufs() with the RTNL lock held. */
static void free_receive_bufs(struct virtnet_info *vi)
{
        rtnl_lock();

        _free_receive_bufs(vi);

        rtnl_unlock();
}
2205
2206 static void free_receive_page_frags(struct virtnet_info *vi)
2207 {
2208         int i;
2209         for (i = 0; i < vi->max_queue_pairs; i++)
2210                 if (vi->rq[i].alloc_frag.page)
2211                         put_page(vi->rq[i].alloc_frag.page);
2212 }
2213
2214 static void free_unused_bufs(struct virtnet_info *vi)
2215 {
2216         void *buf;
2217         int i;
2218
2219         for (i = 0; i < vi->max_queue_pairs; i++) {
2220                 struct virtqueue *vq = vi->sq[i].vq;
2221                 while ((buf = virtqueue_detach_unused_buf(vq)) != NULL) {
2222                         if (!is_xdp_raw_buffer_queue(vi, i))
2223                                 dev_kfree_skb(buf);
2224                         else
2225                                 put_page(virt_to_head_page(buf));
2226                 }
2227         }
2228
2229         for (i = 0; i < vi->max_queue_pairs; i++) {
2230                 struct virtqueue *vq = vi->rq[i].vq;
2231
2232                 while ((buf = virtqueue_detach_unused_buf(vq)) != NULL) {
2233                         if (vi->mergeable_rx_bufs) {
2234                                 put_page(virt_to_head_page(buf));
2235                         } else if (vi->big_packets) {
2236                                 give_pages(&vi->rq[i], buf);
2237                         } else {
2238                                 put_page(virt_to_head_page(buf));
2239                         }
2240                 }
2241         }
2242 }
2243
2244 static void virtnet_del_vqs(struct virtnet_info *vi)
2245 {
2246         struct virtio_device *vdev = vi->vdev;
2247
2248         virtnet_clean_affinity(vi, -1);
2249
2250         vdev->config->del_vqs(vdev);
2251
2252         virtnet_free_queues(vi);
2253 }
2254
2255 /* How large should a single buffer be so a queue full of these can fit at
2256  * least one full packet?
2257  * Logic below assumes the mergeable buffer header is used.
2258  */
2259 static unsigned int mergeable_min_buf_len(struct virtnet_info *vi, struct virtqueue *vq)
2260 {
2261         const unsigned int hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
2262         unsigned int rq_size = virtqueue_get_vring_size(vq);
2263         unsigned int packet_len = vi->big_packets ? IP_MAX_MTU : vi->dev->max_mtu;
2264         unsigned int buf_len = hdr_len + ETH_HLEN + VLAN_HLEN + packet_len;
2265         unsigned int min_buf_len = DIV_ROUND_UP(buf_len, rq_size);
2266
2267         return max(max(min_buf_len, hdr_len) - hdr_len,
2268                    (unsigned int)GOOD_PACKET_LEN);
2269 }
2270
2271 static int virtnet_find_vqs(struct virtnet_info *vi)
2272 {
2273         vq_callback_t **callbacks;
2274         struct virtqueue **vqs;
2275         int ret = -ENOMEM;
2276         int i, total_vqs;
2277         const char **names;
2278         bool *ctx;
2279
2280         /* We expect 1 RX virtqueue followed by 1 TX virtqueue, followed by
2281          * possible N-1 RX/TX queue pairs used in multiqueue mode, followed by
2282          * possible control vq.
2283          */
2284         total_vqs = vi->max_queue_pairs * 2 +
2285                     virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ);
2286
2287         /* Allocate space for find_vqs parameters */
2288         vqs = kzalloc(total_vqs * sizeof(*vqs), GFP_KERNEL);
2289         if (!vqs)
2290                 goto err_vq;
2291         callbacks = kmalloc(total_vqs * sizeof(*callbacks), GFP_KERNEL);
2292         if (!callbacks)
2293                 goto err_callback;
2294         names = kmalloc(total_vqs * sizeof(*names), GFP_KERNEL);
2295         if (!names)
2296                 goto err_names;
2297         if (!vi->big_packets || vi->mergeable_rx_bufs) {
2298                 ctx = kzalloc(total_vqs * sizeof(*ctx), GFP_KERNEL);
2299                 if (!ctx)
2300                         goto err_ctx;
2301         } else {
2302                 ctx = NULL;
2303         }
2304
2305         /* Parameters for control virtqueue, if any */
2306         if (vi->has_cvq) {
2307                 callbacks[total_vqs - 1] = NULL;
2308                 names[total_vqs - 1] = "control";
2309         }
2310
2311         /* Allocate/initialize parameters for send/receive virtqueues */
2312         for (i = 0; i < vi->max_queue_pairs; i++) {
2313                 callbacks[rxq2vq(i)] = skb_recv_done;
2314                 callbacks[txq2vq(i)] = skb_xmit_done;
2315                 sprintf(vi->rq[i].name, "input.%d", i);
2316                 sprintf(vi->sq[i].name, "output.%d", i);
2317                 names[rxq2vq(i)] = vi->rq[i].name;
2318                 names[txq2vq(i)] = vi->sq[i].name;
2319                 if (ctx)
2320                         ctx[rxq2vq(i)] = true;
2321         }
2322
2323         ret = vi->vdev->config->find_vqs(vi->vdev, total_vqs, vqs, callbacks,
2324                                          names, ctx, NULL);
2325         if (ret)
2326                 goto err_find;
2327
2328         if (vi->has_cvq) {
2329                 vi->cvq = vqs[total_vqs - 1];
2330                 if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VLAN))
2331                         vi->dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER;
2332         }
2333
2334         for (i = 0; i < vi->max_queue_pairs; i++) {
2335                 vi->rq[i].vq = vqs[rxq2vq(i)];
2336                 vi->rq[i].min_buf_len = mergeable_min_buf_len(vi, vi->rq[i].vq);
2337                 vi->sq[i].vq = vqs[txq2vq(i)];
2338         }
2339
2340         kfree(names);
2341         kfree(callbacks);
2342         kfree(vqs);
2343         kfree(ctx);
2344
2345         return 0;
2346
2347 err_find:
2348         kfree(ctx);
2349 err_ctx:
2350         kfree(names);
2351 err_names:
2352         kfree(callbacks);
2353 err_callback:
2354         kfree(vqs);
2355 err_vq:
2356         return ret;
2357 }
2358
2359 static int virtnet_alloc_queues(struct virtnet_info *vi)
2360 {
2361         int i;
2362
2363         vi->ctrl = kzalloc(sizeof(*vi->ctrl), GFP_KERNEL);
2364         if (!vi->ctrl)
2365                 goto err_ctrl;
2366         vi->sq = kzalloc(sizeof(*vi->sq) * vi->max_queue_pairs, GFP_KERNEL);
2367         if (!vi->sq)
2368                 goto err_sq;
2369         vi->rq = kzalloc(sizeof(*vi->rq) * vi->max_queue_pairs, GFP_KERNEL);
2370         if (!vi->rq)
2371                 goto err_rq;
2372
2373         INIT_DELAYED_WORK(&vi->refill, refill_work);
2374         for (i = 0; i < vi->max_queue_pairs; i++) {
2375                 vi->rq[i].pages = NULL;
2376                 netif_napi_add(vi->dev, &vi->rq[i].napi, virtnet_poll,
2377                                napi_weight);
2378                 netif_tx_napi_add(vi->dev, &vi->sq[i].napi, virtnet_poll_tx,
2379                                   napi_tx ? napi_weight : 0);
2380
2381                 sg_init_table(vi->rq[i].sg, ARRAY_SIZE(vi->rq[i].sg));
2382                 ewma_pkt_len_init(&vi->rq[i].mrg_avg_pkt_len);
2383                 sg_init_table(vi->sq[i].sg, ARRAY_SIZE(vi->sq[i].sg));
2384         }
2385
2386         return 0;
2387
2388 err_rq:
2389         kfree(vi->sq);
2390 err_sq:
2391         kfree(vi->ctrl);
2392 err_ctrl:
2393         return -ENOMEM;
2394 }
2395
/* Allocate the driver's queue structures, discover the virtqueues and set
 * the initial queue affinity.
 *
 * Returns 0 on success or the error from the failing step; the queue
 * structures are freed again if virtqueue discovery fails.
 */
static int init_vqs(struct virtnet_info *vi)
{
        int err;

        /* Allocate send & receive queues */
        err = virtnet_alloc_queues(vi);
        if (err)
                return err;

        err = virtnet_find_vqs(vi);
        if (err) {
                virtnet_free_queues(vi);
                return err;
        }

        get_online_cpus();
        virtnet_set_affinity(vi);
        put_online_cpus();

        return 0;
}
2420
2421 #ifdef CONFIG_SYSFS
2422 static ssize_t mergeable_rx_buffer_size_show(struct netdev_rx_queue *queue,
2423                 char *buf)
2424 {
2425         struct virtnet_info *vi = netdev_priv(queue->dev);
2426         unsigned int queue_index = get_netdev_rx_queue_index(queue);
2427         struct ewma_pkt_len *avg;
2428
2429         BUG_ON(queue_index >= vi->max_queue_pairs);
2430         avg = &vi->rq[queue_index].mrg_avg_pkt_len;
2431         return sprintf(buf, "%u\n",
2432                        get_mergeable_buf_len(&vi->rq[queue_index], avg));
2433 }
2434
/* Read-only rx-queue attribute backed by mergeable_rx_buffer_size_show(). */
static struct rx_queue_attribute mergeable_rx_buffer_size_attribute =
        __ATTR_RO(mergeable_rx_buffer_size);

/* NULL-terminated attribute list for the group below. */
static struct attribute *virtio_net_mrg_rx_attrs[] = {
        &mergeable_rx_buffer_size_attribute.attr,
        NULL
};

/* Attribute group named "virtio_net" holding the attribute above. */
static const struct attribute_group virtio_net_mrg_rx_group = {
        .name = "virtio_net",
        .attrs = virtio_net_mrg_rx_attrs
};
2447 #endif
2448
2449 static bool virtnet_fail_on_feature(struct virtio_device *vdev,
2450                                     unsigned int fbit,
2451                                     const char *fname, const char *dname)
2452 {
2453         if (!virtio_has_feature(vdev, fbit))
2454                 return false;
2455
2456         dev_err(&vdev->dev, "device advertises feature %s but not %s",
2457                 fname, dname);
2458
2459         return true;
2460 }
2461
2462 #define VIRTNET_FAIL_ON(vdev, fbit, dbit)                       \
2463         virtnet_fail_on_feature(vdev, fbit, #fbit, dbit)
2464
2465 static bool virtnet_validate_features(struct virtio_device *vdev)
2466 {
2467         if (!virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ) &&
2468             (VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_CTRL_RX,
2469                              "VIRTIO_NET_F_CTRL_VQ") ||
2470              VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_CTRL_VLAN,
2471                              "VIRTIO_NET_F_CTRL_VQ") ||
2472              VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_GUEST_ANNOUNCE,
2473                              "VIRTIO_NET_F_CTRL_VQ") ||
2474              VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_MQ, "VIRTIO_NET_F_CTRL_VQ") ||
2475              VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_CTRL_MAC_ADDR,
2476                              "VIRTIO_NET_F_CTRL_VQ"))) {
2477                 return false;
2478         }
2479
2480         return true;
2481 }
2482
2483 #define MIN_MTU ETH_MIN_MTU
2484 #define MAX_MTU ETH_MAX_MTU
2485
2486 static int virtnet_validate(struct virtio_device *vdev)
2487 {
2488         if (!vdev->config->get) {
2489                 dev_err(&vdev->dev, "%s failure: config access disabled\n",
2490                         __func__);
2491                 return -EINVAL;
2492         }
2493
2494         if (!virtnet_validate_features(vdev))
2495                 return -EINVAL;
2496
2497         if (virtio_has_feature(vdev, VIRTIO_NET_F_MTU)) {
2498                 int mtu = virtio_cread16(vdev,
2499                                          offsetof(struct virtio_net_config,
2500                                                   mtu));
2501                 if (mtu < MIN_MTU)
2502                         __virtio_clear_bit(vdev, VIRTIO_NET_F_MTU);
2503         }
2504
2505         return 0;
2506 }
2507
2508 static int virtnet_probe(struct virtio_device *vdev)
2509 {
2510         int i, err;
2511         struct net_device *dev;
2512         struct virtnet_info *vi;
2513         u16 max_queue_pairs;
2514         int mtu;
2515
2516         /* Find if host supports multiqueue virtio_net device */
2517         err = virtio_cread_feature(vdev, VIRTIO_NET_F_MQ,
2518                                    struct virtio_net_config,
2519                                    max_virtqueue_pairs, &max_queue_pairs);
2520
2521         /* We need at least 2 queue's */
2522         if (err || max_queue_pairs < VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN ||
2523             max_queue_pairs > VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX ||
2524             !virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ))
2525                 max_queue_pairs = 1;
2526
2527         /* Allocate ourselves a network device with room for our info */
2528         dev = alloc_etherdev_mq(sizeof(struct virtnet_info), max_queue_pairs);
2529         if (!dev)
2530                 return -ENOMEM;
2531
2532         /* Set up network device as normal. */
2533         dev->priv_flags |= IFF_UNICAST_FLT | IFF_LIVE_ADDR_CHANGE;
2534         dev->netdev_ops = &virtnet_netdev;
2535         dev->features = NETIF_F_HIGHDMA;
2536
2537         dev->ethtool_ops = &virtnet_ethtool_ops;
2538         SET_NETDEV_DEV(dev, &vdev->dev);
2539
2540         /* Do we support "hardware" checksums? */
2541         if (virtio_has_feature(vdev, VIRTIO_NET_F_CSUM)) {
2542                 /* This opens up the world of extra features. */
2543                 dev->hw_features |= NETIF_F_HW_CSUM | NETIF_F_SG;
2544                 if (csum)
2545                         dev->features |= NETIF_F_HW_CSUM | NETIF_F_SG;
2546
2547                 if (virtio_has_feature(vdev, VIRTIO_NET_F_GSO)) {
2548                         dev->hw_features |= NETIF_F_TSO
2549                                 | NETIF_F_TSO_ECN | NETIF_F_TSO6;
2550                 }
2551                 /* Individual feature bits: what can host handle? */
2552                 if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_TSO4))
2553                         dev->hw_features |= NETIF_F_TSO;
2554                 if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_TSO6))
2555                         dev->hw_features |= NETIF_F_TSO6;
2556                 if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_ECN))
2557                         dev->hw_features |= NETIF_F_TSO_ECN;
2558
2559                 dev->features |= NETIF_F_GSO_ROBUST;
2560
2561                 if (gso)
2562                         dev->features |= dev->hw_features & NETIF_F_ALL_TSO;
2563                 /* (!csum && gso) case will be fixed by register_netdev() */
2564         }
2565         if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_CSUM))
2566                 dev->features |= NETIF_F_RXCSUM;
2567
2568         dev->vlan_features = dev->features;
2569
2570         /* MTU range: 68 - 65535 */
2571         dev->min_mtu = MIN_MTU;
2572         dev->max_mtu = MAX_MTU;
2573
2574         /* Configuration may specify what MAC to use.  Otherwise random. */
2575         if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC))
2576                 virtio_cread_bytes(vdev,
2577                                    offsetof(struct virtio_net_config, mac),
2578                                    dev->dev_addr, dev->addr_len);
2579         else
2580                 eth_hw_addr_random(dev);
2581
2582         /* Set up our device-specific information */
2583         vi = netdev_priv(dev);
2584         vi->dev = dev;
2585         vi->vdev = vdev;
2586         vdev->priv = vi;
2587         vi->stats = alloc_percpu(struct virtnet_stats);
2588         err = -ENOMEM;
2589         if (vi->stats == NULL)
2590                 goto free;
2591
2592         for_each_possible_cpu(i) {
2593                 struct virtnet_stats *virtnet_stats;
2594                 virtnet_stats = per_cpu_ptr(vi->stats, i);
2595                 u64_stats_init(&virtnet_stats->tx_syncp);
2596                 u64_stats_init(&virtnet_stats->rx_syncp);
2597         }
2598
2599         INIT_WORK(&vi->config_work, virtnet_config_changed_work);
2600
2601         /* If we can receive ANY GSO packets, we must allocate large ones. */
2602         if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4) ||
2603             virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO6) ||
2604             virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_ECN) ||
2605             virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_UFO))
2606                 vi->big_packets = true;
2607
2608         if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF))
2609                 vi->mergeable_rx_bufs = true;
2610
2611         if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF) ||
2612             virtio_has_feature(vdev, VIRTIO_F_VERSION_1))
2613                 vi->hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
2614         else
2615                 vi->hdr_len = sizeof(struct virtio_net_hdr);
2616
2617         if (virtio_has_feature(vdev, VIRTIO_F_ANY_LAYOUT) ||
2618             virtio_has_feature(vdev, VIRTIO_F_VERSION_1))
2619                 vi->any_header_sg = true;
2620
2621         if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ))
2622                 vi->has_cvq = true;
2623
2624         if (virtio_has_feature(vdev, VIRTIO_NET_F_MTU)) {
2625                 mtu = virtio_cread16(vdev,
2626                                      offsetof(struct virtio_net_config,
2627                                               mtu));
2628                 if (mtu < dev->min_mtu) {
2629                         /* Should never trigger: MTU was previously validated
2630                          * in virtnet_validate.
2631                          */
2632                         dev_err(&vdev->dev, "device MTU appears to have changed "
2633                                 "it is now %d < %d", mtu, dev->min_mtu);
2634                         goto free_stats;
2635                 }
2636
2637                 dev->mtu = mtu;
2638                 dev->max_mtu = mtu;
2639
2640                 /* TODO: size buffers correctly in this case. */
2641                 if (dev->mtu > ETH_DATA_LEN)
2642                         vi->big_packets = true;
2643         }
2644
2645         if (vi->any_header_sg)
2646                 dev->needed_headroom = vi->hdr_len;
2647
2648         /* Enable multiqueue by default */
2649         if (num_online_cpus() >= max_queue_pairs)
2650                 vi->curr_queue_pairs = max_queue_pairs;
2651         else
2652                 vi->curr_queue_pairs = num_online_cpus();
2653         vi->max_queue_pairs = max_queue_pairs;
2654
2655         /* Allocate/initialize the rx/tx queues, and invoke find_vqs */
2656         err = init_vqs(vi);
2657         if (err)
2658                 goto free_stats;
2659
2660 #ifdef CONFIG_SYSFS
2661         if (vi->mergeable_rx_bufs)
2662                 dev->sysfs_rx_queue_group = &virtio_net_mrg_rx_group;
2663 #endif
2664         netif_set_real_num_tx_queues(dev, vi->curr_queue_pairs);
2665         netif_set_real_num_rx_queues(dev, vi->curr_queue_pairs);
2666
2667         virtnet_init_settings(dev);
2668
2669         err = register_netdev(dev);
2670         if (err) {
2671                 pr_debug("virtio_net: registering device failed\n");
2672                 goto free_vqs;
2673         }
2674
2675         virtio_device_ready(vdev);
2676
2677         err = virtnet_cpu_notif_add(vi);
2678         if (err) {
2679                 pr_debug("virtio_net: registering cpu notifier failed\n");
2680                 goto free_unregister_netdev;
2681         }
2682
2683         virtnet_set_queues(vi, vi->curr_queue_pairs);
2684
2685         /* Assume link up if device can't report link status,
2686            otherwise get link status from config. */
2687         netif_carrier_off(dev);
2688         if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_STATUS)) {
2689                 schedule_work(&vi->config_work);
2690         } else {
2691                 vi->status = VIRTIO_NET_S_LINK_UP;
2692                 netif_carrier_on(dev);
2693         }
2694
2695         for (i = 0; i < ARRAY_SIZE(guest_offloads); i++)
2696                 if (virtio_has_feature(vi->vdev, guest_offloads[i]))
2697                         set_bit(guest_offloads[i], &vi->guest_offloads);
2698
2699         pr_debug("virtnet: registered device %s with %d RX and TX vq's\n",
2700                  dev->name, max_queue_pairs);
2701
2702         return 0;
2703
2704 free_unregister_netdev:
2705         vi->vdev->config->reset(vdev);
2706
2707         unregister_netdev(dev);
2708 free_vqs:
2709         cancel_delayed_work_sync(&vi->refill);
2710         free_receive_page_frags(vi);
2711         virtnet_del_vqs(vi);
2712 free_stats:
2713         free_percpu(vi->stats);
2714 free:
2715         free_netdev(dev);
2716         return err;
2717 }
2718
2719 static void remove_vq_common(struct virtnet_info *vi)
2720 {
2721         vi->vdev->config->reset(vi->vdev);
2722
2723         /* Free unused buffers in both send and recv, if any. */
2724         free_unused_bufs(vi);
2725
2726         free_receive_bufs(vi);
2727
2728         free_receive_page_frags(vi);
2729
2730         virtnet_del_vqs(vi);
2731 }
2732
2733 static void virtnet_remove(struct virtio_device *vdev)
2734 {
2735         struct virtnet_info *vi = vdev->priv;
2736
2737         virtnet_cpu_notif_remove(vi);
2738
2739         /* Make sure no work handler is accessing the device. */
2740         flush_work(&vi->config_work);
2741
2742         unregister_netdev(vi->dev);
2743
2744         remove_vq_common(vi);
2745
2746         free_percpu(vi->stats);
2747         free_netdev(vi->dev);
2748 }
2749
2750 static __maybe_unused int virtnet_freeze(struct virtio_device *vdev)
2751 {
2752         struct virtnet_info *vi = vdev->priv;
2753
2754         virtnet_cpu_notif_remove(vi);
2755         virtnet_freeze_down(vdev);
2756         remove_vq_common(vi);
2757
2758         return 0;
2759 }
2760
2761 static __maybe_unused int virtnet_restore(struct virtio_device *vdev)
2762 {
2763         struct virtnet_info *vi = vdev->priv;
2764         int err;
2765
2766         err = virtnet_restore_up(vdev);
2767         if (err)
2768                 return err;
2769         virtnet_set_queues(vi, vi->curr_queue_pairs);
2770
2771         err = virtnet_cpu_notif_add(vi);
2772         if (err) {
2773                 virtnet_freeze_down(vdev);
2774                 remove_vq_common(vi);
2775                 return err;
2776         }
2777
2778         return 0;
2779 }
2780
2781 static struct virtio_device_id id_table[] = {
2782         { VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID },
2783         { 0 },
2784 };
2785
2786 #define VIRTNET_FEATURES \
2787         VIRTIO_NET_F_CSUM, VIRTIO_NET_F_GUEST_CSUM, \
2788         VIRTIO_NET_F_MAC, \
2789         VIRTIO_NET_F_HOST_TSO4, VIRTIO_NET_F_HOST_UFO, VIRTIO_NET_F_HOST_TSO6, \
2790         VIRTIO_NET_F_HOST_ECN, VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6, \
2791         VIRTIO_NET_F_GUEST_ECN, VIRTIO_NET_F_GUEST_UFO, \
2792         VIRTIO_NET_F_MRG_RXBUF, VIRTIO_NET_F_STATUS, VIRTIO_NET_F_CTRL_VQ, \
2793         VIRTIO_NET_F_CTRL_RX, VIRTIO_NET_F_CTRL_VLAN, \
2794         VIRTIO_NET_F_GUEST_ANNOUNCE, VIRTIO_NET_F_MQ, \
2795         VIRTIO_NET_F_CTRL_MAC_ADDR, \
2796         VIRTIO_NET_F_MTU, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS
2797
2798 static unsigned int features[] = {
2799         VIRTNET_FEATURES,
2800 };
2801
2802 static unsigned int features_legacy[] = {
2803         VIRTNET_FEATURES,
2804         VIRTIO_NET_F_GSO,
2805         VIRTIO_F_ANY_LAYOUT,
2806 };
2807
2808 static struct virtio_driver virtio_net_driver = {
2809         .feature_table = features,
2810         .feature_table_size = ARRAY_SIZE(features),
2811         .feature_table_legacy = features_legacy,
2812         .feature_table_size_legacy = ARRAY_SIZE(features_legacy),
2813         .driver.name =  KBUILD_MODNAME,
2814         .driver.owner = THIS_MODULE,
2815         .id_table =     id_table,
2816         .validate =     virtnet_validate,
2817         .probe =        virtnet_probe,
2818         .remove =       virtnet_remove,
2819         .config_changed = virtnet_config_changed,
2820 #ifdef CONFIG_PM_SLEEP
2821         .freeze =       virtnet_freeze,
2822         .restore =      virtnet_restore,
2823 #endif
2824 };
2825
2826 static __init int virtio_net_driver_init(void)
2827 {
2828         int ret;
2829
2830         ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "virtio/net:online",
2831                                       virtnet_cpu_online,
2832                                       virtnet_cpu_down_prep);
2833         if (ret < 0)
2834                 goto out;
2835         virtionet_online = ret;
2836         ret = cpuhp_setup_state_multi(CPUHP_VIRT_NET_DEAD, "virtio/net:dead",
2837                                       NULL, virtnet_cpu_dead);
2838         if (ret)
2839                 goto err_dead;
2840
2841         ret = register_virtio_driver(&virtio_net_driver);
2842         if (ret)
2843                 goto err_virtio;
2844         return 0;
2845 err_virtio:
2846         cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD);
2847 err_dead:
2848         cpuhp_remove_multi_state(virtionet_online);
2849 out:
2850         return ret;
2851 }
2852 module_init(virtio_net_driver_init);
2853
2854 static __exit void virtio_net_driver_exit(void)
2855 {
2856         unregister_virtio_driver(&virtio_net_driver);
2857         cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD);
2858         cpuhp_remove_multi_state(virtionet_online);
2859 }
2860 module_exit(virtio_net_driver_exit);
2861
2862 MODULE_DEVICE_TABLE(virtio, id_table);
2863 MODULE_DESCRIPTION("Virtio network driver");
2864 MODULE_LICENSE("GPL");