GNU Linux-libre 4.14.332-gnu1
[releases.git] / drivers / misc / vmw_balloon.c
1 /*
2  * VMware Balloon driver.
3  *
4  * Copyright (C) 2000-2014, VMware, Inc. All Rights Reserved.
5  *
6  * This program is free software; you can redistribute it and/or modify it
7  * under the terms of the GNU General Public License as published by the
8  * Free Software Foundation; version 2 of the License and no later version.
9  *
10  * This program is distributed in the hope that it will be useful, but
11  * WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
13  * NON INFRINGEMENT.  See the GNU General Public License for more
14  * details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
19  *
20  * Maintained by:       Xavier Deguillard <xdeguillard@vmware.com>
21  *                      Philip Moltmann <moltmann@vmware.com>
22  */
23
24 /*
25  * This is VMware physical memory management driver for Linux. The driver
26  * acts like a "balloon" that can be inflated to reclaim physical pages by
27  * reserving them in the guest and invalidating them in the monitor,
28  * freeing up the underlying machine pages so they can be allocated to
29  * other guests.  The balloon can also be deflated to allow the guest to
30  * use more physical memory. Higher level policies can control the sizes
31  * of balloons in VMs in order to manage physical memory resources.
32  */
33
34 //#define DEBUG
35 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
36
37 #include <linux/types.h>
38 #include <linux/kernel.h>
39 #include <linux/mm.h>
40 #include <linux/vmalloc.h>
41 #include <linux/sched.h>
42 #include <linux/module.h>
43 #include <linux/workqueue.h>
44 #include <linux/debugfs.h>
45 #include <linux/seq_file.h>
46 #include <linux/vmw_vmci_defs.h>
47 #include <linux/vmw_vmci_api.h>
48 #include <linux/io.h>
49 #include <asm/hypervisor.h>
50
51 MODULE_AUTHOR("VMware, Inc.");
52 MODULE_DESCRIPTION("VMware Memory Control (Balloon) Driver");
53 MODULE_VERSION("1.5.0.0-k");
54 MODULE_ALIAS("dmi:*:svnVMware*:*");
55 MODULE_ALIAS("vmware_vmmemctl");
56 MODULE_LICENSE("GPL");
57
58 /*
59  * Various constants controlling rate of inflating/deflating balloon,
60  * measured in pages.
61  */
62
63 /*
64  * Rates of memory allocation when guest experiences memory pressure
65  * (driver performs sleeping allocations).
66  */
67 #define VMW_BALLOON_RATE_ALLOC_MIN      512U
68 #define VMW_BALLOON_RATE_ALLOC_MAX      2048U
69 #define VMW_BALLOON_RATE_ALLOC_INC      16U
70
71 /*
72  * When guest is under memory pressure, use a reduced page allocation
73  * rate for next several cycles.
74  */
75 #define VMW_BALLOON_SLOW_CYCLES         4
76
77 /*
78  * Use __GFP_HIGHMEM to allow pages from HIGHMEM zone. We don't
79  * allow wait (__GFP_RECLAIM) for NOSLEEP page allocations. Use
80  * __GFP_NOWARN, to suppress page allocation failure warnings.
81  */
82 #define VMW_PAGE_ALLOC_NOSLEEP          (__GFP_HIGHMEM|__GFP_NOWARN)
83
84 /*
85  * Use GFP_HIGHUSER when executing in a separate kernel thread
86  * context and allocation can sleep.  This is less stressful to
87  * the guest memory system, since it allows the thread to block
88  * while memory is reclaimed, and won't take pages from emergency
89  * low-memory pools.
90  */
91 #define VMW_PAGE_ALLOC_CANSLEEP         (GFP_HIGHUSER)
92
93 /* Maximum number of refused pages we accumulate during inflation cycle */
94 #define VMW_BALLOON_MAX_REFUSED         16
95
96 /*
97  * Hypervisor communication port definitions.
98  */
99 #define VMW_BALLOON_HV_PORT             0x5670
100 #define VMW_BALLOON_HV_MAGIC            0x456c6d6f
101 #define VMW_BALLOON_GUEST_ID            1       /* Linux */
102
/*
 * Capability bits exchanged with the host by the START command. Each
 * capability occupies a single bit; bit 0 is reserved and is not
 * associated with any capability.
 */
enum vmwballoon_capabilities {
        VMW_BALLOON_BASIC_CMDS                  = 0x02, /* bit 1 */
        VMW_BALLOON_BATCHED_CMDS                = 0x04, /* bit 2 */
        VMW_BALLOON_BATCHED_2M_CMDS             = 0x08, /* bit 3 */
        VMW_BALLOON_SIGNALLED_WAKEUP_CMD        = 0x10, /* bit 4 */
};
112
113 #define VMW_BALLOON_CAPABILITIES        (VMW_BALLOON_BASIC_CMDS \
114                                         | VMW_BALLOON_BATCHED_CMDS \
115                                         | VMW_BALLOON_BATCHED_2M_CMDS \
116                                         | VMW_BALLOON_SIGNALLED_WAKEUP_CMD)
117
118 #define VMW_BALLOON_2M_SHIFT            (9)
119 #define VMW_BALLOON_NUM_PAGE_SIZES      (2)
120
121 /*
122  * Backdoor commands availability:
123  *
124  * START, GET_TARGET and GUEST_ID are always available,
125  *
126  * VMW_BALLOON_BASIC_CMDS:
127  *      LOCK and UNLOCK commands,
128  * VMW_BALLOON_BATCHED_CMDS:
129  *      BATCHED_LOCK and BATCHED_UNLOCK commands.
130  * VMW_BALLOON_BATCHED_2M_CMDS:
131  *      BATCHED_2M_LOCK and BATCHED_2M_UNLOCK commands,
132  * VMW_BALLOON_SIGNALLED_WAKEUP_CMD:
133  *      VMW_BALLOON_CMD_VMCI_DOORBELL_SET command.
134  */
/*
 * Command numbers passed to the hypervisor by VMWARE_BALLOON_CMD(), which
 * pastes its first argument onto the VMW_BALLOON_CMD_ prefix. The value 5
 * is not assigned to any command here.
 */
#define VMW_BALLOON_CMD_START                   0
#define VMW_BALLOON_CMD_GET_TARGET              1
#define VMW_BALLOON_CMD_LOCK                    2
#define VMW_BALLOON_CMD_UNLOCK                  3
#define VMW_BALLOON_CMD_GUEST_ID                4
#define VMW_BALLOON_CMD_BATCHED_LOCK            6
#define VMW_BALLOON_CMD_BATCHED_UNLOCK          7
#define VMW_BALLOON_CMD_BATCHED_2M_LOCK         8
#define VMW_BALLOON_CMD_BATCHED_2M_UNLOCK       9
#define VMW_BALLOON_CMD_VMCI_DOORBELL_SET       10
145
146
/*
 * Status codes compared against the value returned by VMWARE_BALLOON_CMD()
 * and against the per-page status stored in batch page entries.
 *
 * VMW_BALLOON_FAILURE is parenthesized so the negative literal always
 * expands as a single operand, whatever expression it is used in.
 */
#define VMW_BALLOON_SUCCESS                     0
#define VMW_BALLOON_FAILURE                     (-1)
#define VMW_BALLOON_ERROR_CMD_INVALID           1
#define VMW_BALLOON_ERROR_PPN_INVALID           2
#define VMW_BALLOON_ERROR_PPN_LOCKED            3
#define VMW_BALLOON_ERROR_PPN_UNLOCKED          4
#define VMW_BALLOON_ERROR_PPN_PINNED            5
#define VMW_BALLOON_ERROR_PPN_NOTNEEDED         6
#define VMW_BALLOON_ERROR_RESET                 7
#define VMW_BALLOON_ERROR_BUSY                  8

/* START succeeded and the host also reported its capabilities. */
#define VMW_BALLOON_SUCCESS_WITH_CAPABILITIES   (0x03000000)
160
161 /* Batch page description */
162
163 /*
164  * Layout of a page in the batch page:
165  *
166  * +-------------+----------+--------+
167  * |             |          |        |
168  * | Page number | Reserved | Status |
169  * |             |          |        |
170  * +-------------+----------+--------+
171  * 64  PAGE_SHIFT          6         0
172  *
173  * The reserved field should be set to 0.
174  */
/* Number of u64 entries that fit in one batch page. */
#define VMW_BALLOON_BATCH_MAX_PAGES     (PAGE_SIZE / sizeof(u64))
/* Low bits of a batch entry that carry the per-page status. */
#define VMW_BALLOON_BATCH_STATUS_MASK   ((1UL << 5) - 1)
/*
 * Bits of a batch entry that carry the page's physical address. Built from
 * a 64-bit constant: batch entries are u64, and an "unsigned long" mask
 * would be zero-extended where long is 32 bits wide, clearing the upper
 * half of the address.
 */
#define VMW_BALLOON_BATCH_PAGE_MASK     (~((1ULL << PAGE_SHIFT) - 1))
178
/*
 * One batch page: an array of 64-bit entries, each combining a page's
 * physical address with its status bits. Entries are accessed through the
 * vmballoon_batch_*() helpers.
 */
struct vmballoon_batch_page {
        u64 pages[VMW_BALLOON_BATCH_MAX_PAGES];
};
182
183 static u64 vmballoon_batch_get_pa(struct vmballoon_batch_page *batch, int idx)
184 {
185         return batch->pages[idx] & VMW_BALLOON_BATCH_PAGE_MASK;
186 }
187
188 static int vmballoon_batch_get_status(struct vmballoon_batch_page *batch,
189                                 int idx)
190 {
191         return (int)(batch->pages[idx] & VMW_BALLOON_BATCH_STATUS_MASK);
192 }
193
194 static void vmballoon_batch_set_pa(struct vmballoon_batch_page *batch, int idx,
195                                 u64 pa)
196 {
197         batch->pages[idx] = pa;
198 }
199
200
/*
 * Issue balloon command VMW_BALLOON_CMD_<cmd> to the hypervisor with an
 * "inl" on the balloon port and evaluate to the returned status.
 *
 * Register use, as given by the asm constraints:
 *   in:  a = VMW_BALLOON_HV_MAGIC, c = command number,
 *        d = VMW_BALLOON_HV_PORT,  b = arg1, S = arg2
 *   out: a = status, b = result, c/d/S = scratch
 *
 * @result must be an lvalue; it receives the value left in the "b"
 * register, except for START, where it receives the value left in "c".
 * The "memory" clobber keeps the compiler from caching memory contents
 * across the call. The trailing "& -1UL" operations mask with an all-ones
 * value and so leave the values unchanged.
 */
#define VMWARE_BALLOON_CMD(cmd, arg1, arg2, result)             \
({                                                              \
        unsigned long __status, __dummy1, __dummy2, __dummy3;   \
        __asm__ __volatile__ ("inl %%dx" :                      \
                "=a"(__status),                                 \
                "=c"(__dummy1),                                 \
                "=d"(__dummy2),                                 \
                "=b"(result),                                   \
                "=S" (__dummy3) :                               \
                "0"(VMW_BALLOON_HV_MAGIC),                      \
                "1"(VMW_BALLOON_CMD_##cmd),                     \
                "2"(VMW_BALLOON_HV_PORT),                       \
                "3"(arg1),                                      \
                "4" (arg2) :                                    \
                "memory");                                      \
        if (VMW_BALLOON_CMD_##cmd == VMW_BALLOON_CMD_START)     \
                result = __dummy1;                              \
        result &= -1UL;                                         \
        __status & -1UL;                                        \
})
221
#ifdef CONFIG_DEBUG_FS
/*
 * Event counters, only compiled in when CONFIG_DEBUG_FS is set. Arrays of
 * VMW_BALLOON_NUM_PAGE_SIZES elements are indexed by the is_2m_pages flag
 * (false: 4k pages, true: 2m pages).
 */
struct vmballoon_stats {
        unsigned int timer;
        unsigned int doorbell;

        /* allocation statistics */
        unsigned int alloc[VMW_BALLOON_NUM_PAGE_SIZES];
        unsigned int alloc_fail[VMW_BALLOON_NUM_PAGE_SIZES];
        unsigned int sleep_alloc;
        unsigned int sleep_alloc_fail;
        unsigned int refused_alloc[VMW_BALLOON_NUM_PAGE_SIZES];
        unsigned int refused_free[VMW_BALLOON_NUM_PAGE_SIZES];
        unsigned int free[VMW_BALLOON_NUM_PAGE_SIZES];

        /* monitor operations */
        unsigned int lock[VMW_BALLOON_NUM_PAGE_SIZES];
        unsigned int lock_fail[VMW_BALLOON_NUM_PAGE_SIZES];
        unsigned int unlock[VMW_BALLOON_NUM_PAGE_SIZES];
        unsigned int unlock_fail[VMW_BALLOON_NUM_PAGE_SIZES];
        unsigned int target;
        unsigned int target_fail;
        unsigned int start;
        unsigned int start_fail;
        unsigned int guest_type;
        unsigned int guest_type_fail;
        unsigned int doorbell_set;
        unsigned int doorbell_unset;
};

/* Bump a statistics counter; expands to nothing without CONFIG_DEBUG_FS. */
#define STATS_INC(stat) (stat)++
#else
#define STATS_INC(stat)
#endif
255
256 struct vmballoon;
257
/*
 * Operations used to hand pages to, and take pages back from, the
 * hypervisor. The inflate path calls them through b->ops.
 *
 * add_page: stage page @p (at position @idx) for the next lock/unlock.
 * lock:     report @num_pages staged pages to the host as ballooned.
 * unlock:   ask the host to release @num_pages staged pages.
 *
 * NOTE(review): presumably lock/unlock return 0 on success and a negative
 * errno otherwise, as the vmballoon_*lock*_page() functions in this file
 * do; confirm against the ops tables.
 */
struct vmballoon_ops {
        void (*add_page)(struct vmballoon *b, int idx, struct page *p);
        int (*lock)(struct vmballoon *b, unsigned int num_pages,
                        bool is_2m_pages, unsigned int *target);
        int (*unlock)(struct vmballoon *b, unsigned int num_pages,
                        bool is_2m_pages, unsigned int *target);
};
265
/* Per-page-size bookkeeping; struct vmballoon holds one per page size. */
struct vmballoon_page_size {
        /* list of reserved physical pages */
        struct list_head pages;

        /* transient list of non-balloonable pages */
        struct list_head refused_pages;
        /* number of entries on refused_pages; capped at VMW_BALLOON_MAX_REFUSED */
        unsigned int n_refused_pages;
};
274
/* Complete state of the balloon. */
struct vmballoon {
        /* page lists, indexed by is_2m_pages (false: 4k, true: 2m) */
        struct vmballoon_page_size page_sizes[VMW_BALLOON_NUM_PAGE_SIZES];

        /* supported page sizes. 1 == 4k pages only, 2 == 4k and 2m pages */
        unsigned supported_page_sizes;

        /* balloon size in pages */
        unsigned int size;
        unsigned int target;

        /* reset flag */
        bool reset_required;

        /* adjustment rates (pages per second) */
        unsigned int rate_alloc;

        /* slowdown page allocations for next few cycles */
        unsigned int slow_allocation_cycles;

        /* capability bits recorded by vmballoon_send_start() */
        unsigned long capabilities;

        /* page whose frame number is passed to the batched lock/unlock commands */
        struct vmballoon_batch_page *batch_page;
        unsigned int batch_max_pages;
        /* page staged by vmballoon_add_page() for non-batched lock/unlock */
        struct page *page;

        const struct vmballoon_ops *ops;

#ifdef CONFIG_DEBUG_FS
        /* statistics */
        struct vmballoon_stats stats;

        /* debugfs file exporting statistics */
        struct dentry *dbg_entry;
#endif

        /* refreshed by si_meminfo() in vmballoon_send_get_target() */
        struct sysinfo sysinfo;

        struct delayed_work dwork;

        struct vmci_handle vmci_doorbell;
};
316
317 static struct vmballoon balloon;
318
319 /*
320  * Send "start" command to the host, communicating supported version
321  * of the protocol.
322  */
323 static bool vmballoon_send_start(struct vmballoon *b, unsigned long req_caps)
324 {
325         unsigned long status, capabilities, dummy = 0;
326         bool success;
327
328         STATS_INC(b->stats.start);
329
330         status = VMWARE_BALLOON_CMD(START, req_caps, dummy, capabilities);
331
332         switch (status) {
333         case VMW_BALLOON_SUCCESS_WITH_CAPABILITIES:
334                 b->capabilities = capabilities;
335                 success = true;
336                 break;
337         case VMW_BALLOON_SUCCESS:
338                 b->capabilities = VMW_BALLOON_BASIC_CMDS;
339                 success = true;
340                 break;
341         default:
342                 success = false;
343         }
344
345         /*
346          * 2MB pages are only supported with batching. If batching is for some
347          * reason disabled, do not use 2MB pages, since otherwise the legacy
348          * mechanism is used with 2MB pages, causing a failure.
349          */
350         if ((b->capabilities & VMW_BALLOON_BATCHED_2M_CMDS) &&
351             (b->capabilities & VMW_BALLOON_BATCHED_CMDS))
352                 b->supported_page_sizes = 2;
353         else
354                 b->supported_page_sizes = 1;
355
356         if (!success) {
357                 pr_debug("%s - failed, hv returns %ld\n", __func__, status);
358                 STATS_INC(b->stats.start_fail);
359         }
360         return success;
361 }
362
363 static bool vmballoon_check_status(struct vmballoon *b, unsigned long status)
364 {
365         switch (status) {
366         case VMW_BALLOON_SUCCESS:
367                 return true;
368
369         case VMW_BALLOON_ERROR_RESET:
370                 b->reset_required = true;
371                 /* fall through */
372
373         default:
374                 return false;
375         }
376 }
377
378 /*
379  * Communicate guest type to the host so that it can adjust ballooning
380  * algorithm to the one most appropriate for the guest. This command
381  * is normally issued after sending "start" command and is part of
382  * standard reset sequence.
383  */
384 static bool vmballoon_send_guest_id(struct vmballoon *b)
385 {
386         unsigned long status, dummy = 0;
387
388         status = VMWARE_BALLOON_CMD(GUEST_ID, VMW_BALLOON_GUEST_ID, dummy,
389                                 dummy);
390
391         STATS_INC(b->stats.guest_type);
392
393         if (vmballoon_check_status(b, status))
394                 return true;
395
396         pr_debug("%s - failed, hv returns %ld\n", __func__, status);
397         STATS_INC(b->stats.guest_type_fail);
398         return false;
399 }
400
401 static u16 vmballoon_page_size(bool is_2m_page)
402 {
403         if (is_2m_page)
404                 return 1 << VMW_BALLOON_2M_SHIFT;
405
406         return 1;
407 }
408
409 /*
410  * Retrieve desired balloon size from the host.
411  */
412 static bool vmballoon_send_get_target(struct vmballoon *b, u32 *new_target)
413 {
414         unsigned long status;
415         unsigned long target;
416         unsigned long limit;
417         unsigned long dummy = 0;
418         u32 limit32;
419
420         /*
421          * si_meminfo() is cheap. Moreover, we want to provide dynamic
422          * max balloon size later. So let us call si_meminfo() every
423          * iteration.
424          */
425         si_meminfo(&b->sysinfo);
426         limit = b->sysinfo.totalram;
427
428         /* Ensure limit fits in 32-bits */
429         limit32 = (u32)limit;
430         if (limit != limit32)
431                 return false;
432
433         /* update stats */
434         STATS_INC(b->stats.target);
435
436         status = VMWARE_BALLOON_CMD(GET_TARGET, limit, dummy, target);
437         if (vmballoon_check_status(b, status)) {
438                 *new_target = target;
439                 return true;
440         }
441
442         pr_debug("%s - failed, hv returns %ld\n", __func__, status);
443         STATS_INC(b->stats.target_fail);
444         return false;
445 }
446
447 /*
448  * Notify the host about allocated page so that host can use it without
449  * fear that guest will need it. Host may reject some pages, we need to
450  * check the return value and maybe submit a different page.
451  */
452 static int vmballoon_send_lock_page(struct vmballoon *b, unsigned long pfn,
453                                 unsigned int *hv_status, unsigned int *target)
454 {
455         unsigned long status, dummy = 0;
456         u32 pfn32;
457
458         pfn32 = (u32)pfn;
459         if (pfn32 != pfn)
460                 return -EINVAL;
461
462         STATS_INC(b->stats.lock[false]);
463
464         *hv_status = status = VMWARE_BALLOON_CMD(LOCK, pfn, dummy, *target);
465         if (vmballoon_check_status(b, status))
466                 return 0;
467
468         pr_debug("%s - ppn %lx, hv returns %ld\n", __func__, pfn, status);
469         STATS_INC(b->stats.lock_fail[false]);
470         return -EIO;
471 }
472
473 static int vmballoon_send_batched_lock(struct vmballoon *b,
474                 unsigned int num_pages, bool is_2m_pages, unsigned int *target)
475 {
476         unsigned long status;
477         unsigned long pfn = PHYS_PFN(virt_to_phys(b->batch_page));
478
479         STATS_INC(b->stats.lock[is_2m_pages]);
480
481         if (is_2m_pages)
482                 status = VMWARE_BALLOON_CMD(BATCHED_2M_LOCK, pfn, num_pages,
483                                 *target);
484         else
485                 status = VMWARE_BALLOON_CMD(BATCHED_LOCK, pfn, num_pages,
486                                 *target);
487
488         if (vmballoon_check_status(b, status))
489                 return 0;
490
491         pr_debug("%s - batch ppn %lx, hv returns %ld\n", __func__, pfn, status);
492         STATS_INC(b->stats.lock_fail[is_2m_pages]);
493         return 1;
494 }
495
496 /*
497  * Notify the host that guest intends to release given page back into
498  * the pool of available (to the guest) pages.
499  */
500 static bool vmballoon_send_unlock_page(struct vmballoon *b, unsigned long pfn,
501                                                         unsigned int *target)
502 {
503         unsigned long status, dummy = 0;
504         u32 pfn32;
505
506         pfn32 = (u32)pfn;
507         if (pfn32 != pfn)
508                 return false;
509
510         STATS_INC(b->stats.unlock[false]);
511
512         status = VMWARE_BALLOON_CMD(UNLOCK, pfn, dummy, *target);
513         if (vmballoon_check_status(b, status))
514                 return true;
515
516         pr_debug("%s - ppn %lx, hv returns %ld\n", __func__, pfn, status);
517         STATS_INC(b->stats.unlock_fail[false]);
518         return false;
519 }
520
521 static bool vmballoon_send_batched_unlock(struct vmballoon *b,
522                 unsigned int num_pages, bool is_2m_pages, unsigned int *target)
523 {
524         unsigned long status;
525         unsigned long pfn = PHYS_PFN(virt_to_phys(b->batch_page));
526
527         STATS_INC(b->stats.unlock[is_2m_pages]);
528
529         if (is_2m_pages)
530                 status = VMWARE_BALLOON_CMD(BATCHED_2M_UNLOCK, pfn, num_pages,
531                                 *target);
532         else
533                 status = VMWARE_BALLOON_CMD(BATCHED_UNLOCK, pfn, num_pages,
534                                 *target);
535
536         if (vmballoon_check_status(b, status))
537                 return true;
538
539         pr_debug("%s - batch ppn %lx, hv returns %ld\n", __func__, pfn, status);
540         STATS_INC(b->stats.unlock_fail[is_2m_pages]);
541         return false;
542 }
543
544 static struct page *vmballoon_alloc_page(gfp_t flags, bool is_2m_page)
545 {
546         if (is_2m_page)
547                 return alloc_pages(flags, VMW_BALLOON_2M_SHIFT);
548
549         return alloc_page(flags);
550 }
551
552 static void vmballoon_free_page(struct page *page, bool is_2m_page)
553 {
554         if (is_2m_page)
555                 __free_pages(page, VMW_BALLOON_2M_SHIFT);
556         else
557                 __free_page(page);
558 }
559
560 /*
561  * Quickly release all pages allocated for the balloon. This function is
562  * called when host decides to "reset" balloon for one reason or another.
563  * Unlike normal "deflate" we do not (shall not) notify host of the pages
564  * being released.
565  */
566 static void vmballoon_pop(struct vmballoon *b)
567 {
568         struct page *page, *next;
569         unsigned is_2m_pages;
570
571         for (is_2m_pages = 0; is_2m_pages < VMW_BALLOON_NUM_PAGE_SIZES;
572                         is_2m_pages++) {
573                 struct vmballoon_page_size *page_size =
574                                 &b->page_sizes[is_2m_pages];
575                 u16 size_per_page = vmballoon_page_size(is_2m_pages);
576
577                 list_for_each_entry_safe(page, next, &page_size->pages, lru) {
578                         list_del(&page->lru);
579                         vmballoon_free_page(page, is_2m_pages);
580                         STATS_INC(b->stats.free[is_2m_pages]);
581                         b->size -= size_per_page;
582                         cond_resched();
583                 }
584         }
585
586         /* Clearing the batch_page unconditionally has no adverse effect */
587         free_page((unsigned long)b->batch_page);
588         b->batch_page = NULL;
589 }
590
591 /*
592  * Notify the host of a ballooned page. If host rejects the page put it on the
593  * refuse list, those refused page are then released at the end of the
594  * inflation cycle.
595  */
596 static int vmballoon_lock_page(struct vmballoon *b, unsigned int num_pages,
597                                 bool is_2m_pages, unsigned int *target)
598 {
599         int locked, hv_status;
600         struct page *page = b->page;
601         struct vmballoon_page_size *page_size = &b->page_sizes[false];
602
603         /* is_2m_pages can never happen as 2m pages support implies batching */
604
605         locked = vmballoon_send_lock_page(b, page_to_pfn(page), &hv_status,
606                                                                 target);
607         if (locked) {
608                 STATS_INC(b->stats.refused_alloc[false]);
609
610                 if (locked == -EIO &&
611                     (hv_status == VMW_BALLOON_ERROR_RESET ||
612                      hv_status == VMW_BALLOON_ERROR_PPN_NOTNEEDED)) {
613                         vmballoon_free_page(page, false);
614                         return -EIO;
615                 }
616
617                 /*
618                  * Place page on the list of non-balloonable pages
619                  * and retry allocation, unless we already accumulated
620                  * too many of them, in which case take a breather.
621                  */
622                 if (page_size->n_refused_pages < VMW_BALLOON_MAX_REFUSED) {
623                         page_size->n_refused_pages++;
624                         list_add(&page->lru, &page_size->refused_pages);
625                 } else {
626                         vmballoon_free_page(page, false);
627                 }
628                 return locked;
629         }
630
631         /* track allocated page */
632         list_add(&page->lru, &page_size->pages);
633
634         /* update balloon size */
635         b->size++;
636
637         return 0;
638 }
639
640 static int vmballoon_lock_batched_page(struct vmballoon *b,
641                 unsigned int num_pages, bool is_2m_pages, unsigned int *target)
642 {
643         int locked, i;
644         u16 size_per_page = vmballoon_page_size(is_2m_pages);
645
646         locked = vmballoon_send_batched_lock(b, num_pages, is_2m_pages,
647                         target);
648         if (locked > 0) {
649                 for (i = 0; i < num_pages; i++) {
650                         u64 pa = vmballoon_batch_get_pa(b->batch_page, i);
651                         struct page *p = pfn_to_page(pa >> PAGE_SHIFT);
652
653                         vmballoon_free_page(p, is_2m_pages);
654                 }
655
656                 return -EIO;
657         }
658
659         for (i = 0; i < num_pages; i++) {
660                 u64 pa = vmballoon_batch_get_pa(b->batch_page, i);
661                 struct page *p = pfn_to_page(pa >> PAGE_SHIFT);
662                 struct vmballoon_page_size *page_size =
663                                 &b->page_sizes[is_2m_pages];
664
665                 locked = vmballoon_batch_get_status(b->batch_page, i);
666
667                 switch (locked) {
668                 case VMW_BALLOON_SUCCESS:
669                         list_add(&p->lru, &page_size->pages);
670                         b->size += size_per_page;
671                         break;
672                 case VMW_BALLOON_ERROR_PPN_PINNED:
673                 case VMW_BALLOON_ERROR_PPN_INVALID:
674                         if (page_size->n_refused_pages
675                                         < VMW_BALLOON_MAX_REFUSED) {
676                                 list_add(&p->lru, &page_size->refused_pages);
677                                 page_size->n_refused_pages++;
678                                 break;
679                         }
680                         /* Fallthrough */
681                 case VMW_BALLOON_ERROR_RESET:
682                 case VMW_BALLOON_ERROR_PPN_NOTNEEDED:
683                         vmballoon_free_page(p, is_2m_pages);
684                         break;
685                 default:
686                         /* This should never happen */
687                         WARN_ON_ONCE(true);
688                 }
689         }
690
691         return 0;
692 }
693
694 /*
695  * Release the page allocated for the balloon. Note that we first notify
696  * the host so it can make sure the page will be available for the guest
697  * to use, if needed.
698  */
699 static int vmballoon_unlock_page(struct vmballoon *b, unsigned int num_pages,
700                 bool is_2m_pages, unsigned int *target)
701 {
702         struct page *page = b->page;
703         struct vmballoon_page_size *page_size = &b->page_sizes[false];
704
705         /* is_2m_pages can never happen as 2m pages support implies batching */
706
707         if (!vmballoon_send_unlock_page(b, page_to_pfn(page), target)) {
708                 list_add(&page->lru, &page_size->pages);
709                 return -EIO;
710         }
711
712         /* deallocate page */
713         vmballoon_free_page(page, false);
714         STATS_INC(b->stats.free[false]);
715
716         /* update balloon size */
717         b->size--;
718
719         return 0;
720 }
721
722 static int vmballoon_unlock_batched_page(struct vmballoon *b,
723                                 unsigned int num_pages, bool is_2m_pages,
724                                 unsigned int *target)
725 {
726         int locked, i, ret = 0;
727         bool hv_success;
728         u16 size_per_page = vmballoon_page_size(is_2m_pages);
729
730         hv_success = vmballoon_send_batched_unlock(b, num_pages, is_2m_pages,
731                         target);
732         if (!hv_success)
733                 ret = -EIO;
734
735         for (i = 0; i < num_pages; i++) {
736                 u64 pa = vmballoon_batch_get_pa(b->batch_page, i);
737                 struct page *p = pfn_to_page(pa >> PAGE_SHIFT);
738                 struct vmballoon_page_size *page_size =
739                                 &b->page_sizes[is_2m_pages];
740
741                 locked = vmballoon_batch_get_status(b->batch_page, i);
742                 if (!hv_success || locked != VMW_BALLOON_SUCCESS) {
743                         /*
744                          * That page wasn't successfully unlocked by the
745                          * hypervisor, re-add it to the list of pages owned by
746                          * the balloon driver.
747                          */
748                         list_add(&p->lru, &page_size->pages);
749                 } else {
750                         /* deallocate page */
751                         vmballoon_free_page(p, is_2m_pages);
752                         STATS_INC(b->stats.free[is_2m_pages]);
753
754                         /* update balloon size */
755                         b->size -= size_per_page;
756                 }
757         }
758
759         return ret;
760 }
761
762 /*
763  * Release pages that were allocated while attempting to inflate the
764  * balloon but were refused by the host for one reason or another.
765  */
766 static void vmballoon_release_refused_pages(struct vmballoon *b,
767                 bool is_2m_pages)
768 {
769         struct page *page, *next;
770         struct vmballoon_page_size *page_size =
771                         &b->page_sizes[is_2m_pages];
772
773         list_for_each_entry_safe(page, next, &page_size->refused_pages, lru) {
774                 list_del(&page->lru);
775                 vmballoon_free_page(page, is_2m_pages);
776                 STATS_INC(b->stats.refused_free[is_2m_pages]);
777         }
778
779         page_size->n_refused_pages = 0;
780 }
781
/*
 * Stage page @p in b->page, where vmballoon_lock_page() and
 * vmballoon_unlock_page() pick it up. @idx is unused because this
 * non-batched path holds a single page at a time.
 */
static void vmballoon_add_page(struct vmballoon *b, int idx, struct page *p)
{
        b->page = p;
}
786
787 static void vmballoon_add_batched_page(struct vmballoon *b, int idx,
788                                 struct page *p)
789 {
790         vmballoon_batch_set_pa(b->batch_page, idx,
791                         (u64)page_to_pfn(p) << PAGE_SHIFT);
792 }
793
794 /*
795  * Inflate the balloon towards its target size. Note that we try to limit
796  * the rate of allocation to make sure we are not choking the rest of the
797  * system.
798  */
799 static void vmballoon_inflate(struct vmballoon *b)
800 {
801         unsigned rate;
802         unsigned int allocations = 0;
803         unsigned int num_pages = 0;
804         int error = 0;
805         gfp_t flags = VMW_PAGE_ALLOC_NOSLEEP;
806         bool is_2m_pages;
807
808         pr_debug("%s - size: %d, target %d\n", __func__, b->size, b->target);
809
810         /*
811          * First try NOSLEEP page allocations to inflate balloon.
812          *
813          * If we do not throttle nosleep allocations, we can drain all
814          * free pages in the guest quickly (if the balloon target is high).
815          * As a side-effect, draining free pages helps to inform (force)
816          * the guest to start swapping if balloon target is not met yet,
817          * which is a desired behavior. However, balloon driver can consume
818          * all available CPU cycles if too many pages are allocated in a
819          * second. Therefore, we throttle nosleep allocations even when
820          * the guest is not under memory pressure. OTOH, if we have already
821          * predicted that the guest is under memory pressure, then we
822          * slowdown page allocations considerably.
823          */
824
825         /*
826          * Start with no sleep allocation rate which may be higher
827          * than sleeping allocation rate.
828          */
829         if (b->slow_allocation_cycles) {
830                 rate = b->rate_alloc;
831                 is_2m_pages = false;
832         } else {
833                 rate = UINT_MAX;
834                 is_2m_pages =
835                         b->supported_page_sizes == VMW_BALLOON_NUM_PAGE_SIZES;
836         }
837
838         pr_debug("%s - goal: %d, no-sleep rate: %u, sleep rate: %d\n",
839                  __func__, b->target - b->size, rate, b->rate_alloc);
840
841         while (!b->reset_required &&
842                 b->size + num_pages * vmballoon_page_size(is_2m_pages)
843                 < b->target) {
844                 struct page *page;
845
846                 if (flags == VMW_PAGE_ALLOC_NOSLEEP)
847                         STATS_INC(b->stats.alloc[is_2m_pages]);
848                 else
849                         STATS_INC(b->stats.sleep_alloc);
850
851                 page = vmballoon_alloc_page(flags, is_2m_pages);
852                 if (!page) {
853                         STATS_INC(b->stats.alloc_fail[is_2m_pages]);
854
855                         if (is_2m_pages) {
856                                 b->ops->lock(b, num_pages, true, &b->target);
857
858                                 /*
859                                  * ignore errors from locking as we now switch
860                                  * to 4k pages and we might get different
861                                  * errors.
862                                  */
863
864                                 num_pages = 0;
865                                 is_2m_pages = false;
866                                 continue;
867                         }
868
869                         if (flags == VMW_PAGE_ALLOC_CANSLEEP) {
870                                 /*
871                                  * CANSLEEP page allocation failed, so guest
872                                  * is under severe memory pressure. Quickly
873                                  * decrease allocation rate.
874                                  */
875                                 b->rate_alloc = max(b->rate_alloc / 2,
876                                                     VMW_BALLOON_RATE_ALLOC_MIN);
877                                 STATS_INC(b->stats.sleep_alloc_fail);
878                                 break;
879                         }
880
881                         /*
882                          * NOSLEEP page allocation failed, so the guest is
883                          * under memory pressure. Let us slow down page
884                          * allocations for next few cycles so that the guest
885                          * gets out of memory pressure. Also, if we already
886                          * allocated b->rate_alloc pages, let's pause,
887                          * otherwise switch to sleeping allocations.
888                          */
889                         b->slow_allocation_cycles = VMW_BALLOON_SLOW_CYCLES;
890
891                         if (allocations >= b->rate_alloc)
892                                 break;
893
894                         flags = VMW_PAGE_ALLOC_CANSLEEP;
895                         /* Lower rate for sleeping allocations. */
896                         rate = b->rate_alloc;
897                         continue;
898                 }
899
900                 b->ops->add_page(b, num_pages++, page);
901                 if (num_pages == b->batch_max_pages) {
902                         error = b->ops->lock(b, num_pages, is_2m_pages,
903                                         &b->target);
904                         num_pages = 0;
905                         if (error)
906                                 break;
907                 }
908
909                 cond_resched();
910
911                 if (allocations >= rate) {
912                         /* We allocated enough pages, let's take a break. */
913                         break;
914                 }
915         }
916
917         if (num_pages > 0)
918                 b->ops->lock(b, num_pages, is_2m_pages, &b->target);
919
920         /*
921          * We reached our goal without failures so try increasing
922          * allocation rate.
923          */
924         if (error == 0 && allocations >= b->rate_alloc) {
925                 unsigned int mult = allocations / b->rate_alloc;
926
927                 b->rate_alloc =
928                         min(b->rate_alloc + mult * VMW_BALLOON_RATE_ALLOC_INC,
929                             VMW_BALLOON_RATE_ALLOC_MAX);
930         }
931
932         vmballoon_release_refused_pages(b, true);
933         vmballoon_release_refused_pages(b, false);
934 }
935
936 /*
937  * Decrease the size of the balloon allowing guest to use more memory.
938  */
939 static void vmballoon_deflate(struct vmballoon *b)
940 {
941         unsigned is_2m_pages;
942
943         pr_debug("%s - size: %d, target %d\n", __func__, b->size, b->target);
944
945         /* free pages to reach target */
946         for (is_2m_pages = 0; is_2m_pages < b->supported_page_sizes;
947                         is_2m_pages++) {
948                 struct page *page, *next;
949                 unsigned int num_pages = 0;
950                 struct vmballoon_page_size *page_size =
951                                 &b->page_sizes[is_2m_pages];
952
953                 list_for_each_entry_safe(page, next, &page_size->pages, lru) {
954                         if (b->reset_required ||
955                                 (b->target > 0 &&
956                                         b->size - num_pages
957                                         * vmballoon_page_size(is_2m_pages)
958                                 < b->target + vmballoon_page_size(true)))
959                                 break;
960
961                         list_del(&page->lru);
962                         b->ops->add_page(b, num_pages++, page);
963
964                         if (num_pages == b->batch_max_pages) {
965                                 int error;
966
967                                 error = b->ops->unlock(b, num_pages,
968                                                 is_2m_pages, &b->target);
969                                 num_pages = 0;
970                                 if (error)
971                                         return;
972                         }
973
974                         cond_resched();
975                 }
976
977                 if (num_pages > 0)
978                         b->ops->unlock(b, num_pages, is_2m_pages, &b->target);
979         }
980 }
981
982 static const struct vmballoon_ops vmballoon_basic_ops = {
983         .add_page = vmballoon_add_page,
984         .lock = vmballoon_lock_page,
985         .unlock = vmballoon_unlock_page
986 };
987
988 static const struct vmballoon_ops vmballoon_batched_ops = {
989         .add_page = vmballoon_add_batched_page,
990         .lock = vmballoon_lock_batched_page,
991         .unlock = vmballoon_unlock_batched_page
992 };
993
994 static bool vmballoon_init_batching(struct vmballoon *b)
995 {
996         struct page *page;
997
998         page = alloc_page(GFP_KERNEL | __GFP_ZERO);
999         if (!page)
1000                 return false;
1001
1002         b->batch_page = page_address(page);
1003         return true;
1004 }
1005
1006 /*
1007  * Receive notification and resize balloon
1008  */
1009 static void vmballoon_doorbell(void *client_data)
1010 {
1011         struct vmballoon *b = client_data;
1012
1013         STATS_INC(b->stats.doorbell);
1014
1015         mod_delayed_work(system_freezable_wq, &b->dwork, 0);
1016 }
1017
1018 /*
1019  * Clean up vmci doorbell
1020  */
1021 static void vmballoon_vmci_cleanup(struct vmballoon *b)
1022 {
1023         int error;
1024
1025         VMWARE_BALLOON_CMD(VMCI_DOORBELL_SET, VMCI_INVALID_ID,
1026                         VMCI_INVALID_ID, error);
1027         STATS_INC(b->stats.doorbell_unset);
1028
1029         if (!vmci_handle_is_invalid(b->vmci_doorbell)) {
1030                 vmci_doorbell_destroy(b->vmci_doorbell);
1031                 b->vmci_doorbell = VMCI_INVALID_HANDLE;
1032         }
1033 }
1034
1035 /*
1036  * Initialize vmci doorbell, to get notified as soon as balloon changes
1037  */
1038 static int vmballoon_vmci_init(struct vmballoon *b)
1039 {
1040         unsigned long error, dummy;
1041
1042         if ((b->capabilities & VMW_BALLOON_SIGNALLED_WAKEUP_CMD) == 0)
1043                 return 0;
1044
1045         error = vmci_doorbell_create(&b->vmci_doorbell, VMCI_FLAG_DELAYED_CB,
1046                                      VMCI_PRIVILEGE_FLAG_RESTRICTED,
1047                                      vmballoon_doorbell, b);
1048
1049         if (error != VMCI_SUCCESS)
1050                 goto fail;
1051
1052         error = VMWARE_BALLOON_CMD(VMCI_DOORBELL_SET, b->vmci_doorbell.context,
1053                                    b->vmci_doorbell.resource, dummy);
1054
1055         STATS_INC(b->stats.doorbell_set);
1056
1057         if (error != VMW_BALLOON_SUCCESS)
1058                 goto fail;
1059
1060         return 0;
1061 fail:
1062         vmballoon_vmci_cleanup(b);
1063         return -EIO;
1064 }
1065
1066 /*
1067  * Perform standard reset sequence by popping the balloon (in case it
1068  * is not  empty) and then restarting protocol. This operation normally
1069  * happens when host responds with VMW_BALLOON_ERROR_RESET to a command.
1070  */
1071 static void vmballoon_reset(struct vmballoon *b)
1072 {
1073         int error;
1074
1075         vmballoon_vmci_cleanup(b);
1076
1077         /* free all pages, skipping monitor unlock */
1078         vmballoon_pop(b);
1079
1080         if (!vmballoon_send_start(b, VMW_BALLOON_CAPABILITIES))
1081                 return;
1082
1083         if ((b->capabilities & VMW_BALLOON_BATCHED_CMDS) != 0) {
1084                 b->ops = &vmballoon_batched_ops;
1085                 b->batch_max_pages = VMW_BALLOON_BATCH_MAX_PAGES;
1086                 if (!vmballoon_init_batching(b)) {
1087                         /*
1088                          * We failed to initialize batching, inform the monitor
1089                          * about it by sending a null capability.
1090                          *
1091                          * The guest will retry in one second.
1092                          */
1093                         vmballoon_send_start(b, 0);
1094                         return;
1095                 }
1096         } else if ((b->capabilities & VMW_BALLOON_BASIC_CMDS) != 0) {
1097                 b->ops = &vmballoon_basic_ops;
1098                 b->batch_max_pages = 1;
1099         }
1100
1101         b->reset_required = false;
1102
1103         error = vmballoon_vmci_init(b);
1104         if (error)
1105                 pr_err("failed to initialize vmci doorbell\n");
1106
1107         if (!vmballoon_send_guest_id(b))
1108                 pr_err("failed to send guest ID to the host\n");
1109 }
1110
1111 /*
1112  * Balloon work function: reset protocol, if needed, get the new size and
1113  * adjust balloon as needed. Repeat in 1 sec.
1114  */
1115 static void vmballoon_work(struct work_struct *work)
1116 {
1117         struct delayed_work *dwork = to_delayed_work(work);
1118         struct vmballoon *b = container_of(dwork, struct vmballoon, dwork);
1119         unsigned int target;
1120
1121         STATS_INC(b->stats.timer);
1122
1123         if (b->reset_required)
1124                 vmballoon_reset(b);
1125
1126         if (b->slow_allocation_cycles > 0)
1127                 b->slow_allocation_cycles--;
1128
1129         if (!b->reset_required && vmballoon_send_get_target(b, &target)) {
1130                 /* update target, adjust size */
1131                 b->target = target;
1132
1133                 if (b->size < target)
1134                         vmballoon_inflate(b);
1135                 else if (target == 0 ||
1136                                 b->size > target + vmballoon_page_size(true))
1137                         vmballoon_deflate(b);
1138         }
1139
1140         /*
1141          * We are using a freezable workqueue so that balloon operations are
1142          * stopped while the system transitions to/from sleep/hibernation.
1143          */
1144         queue_delayed_work(system_freezable_wq,
1145                            dwork, round_jiffies_relative(HZ));
1146 }
1147
1148 /*
1149  * DEBUGFS Interface
1150  */
1151 #ifdef CONFIG_DEBUG_FS
1152
1153 static int vmballoon_debug_show(struct seq_file *f, void *offset)
1154 {
1155         struct vmballoon *b = f->private;
1156         struct vmballoon_stats *stats = &b->stats;
1157
1158         /* format capabilities info */
1159         seq_printf(f,
1160                    "balloon capabilities:   %#4x\n"
1161                    "used capabilities:      %#4lx\n"
1162                    "is resetting:           %c\n",
1163                    VMW_BALLOON_CAPABILITIES, b->capabilities,
1164                    b->reset_required ? 'y' : 'n');
1165
1166         /* format size info */
1167         seq_printf(f,
1168                    "target:             %8d pages\n"
1169                    "current:            %8d pages\n",
1170                    b->target, b->size);
1171
1172         /* format rate info */
1173         seq_printf(f,
1174                    "rateSleepAlloc:     %8d pages/sec\n",
1175                    b->rate_alloc);
1176
1177         seq_printf(f,
1178                    "\n"
1179                    "timer:              %8u\n"
1180                    "doorbell:           %8u\n"
1181                    "start:              %8u (%4u failed)\n"
1182                    "guestType:          %8u (%4u failed)\n"
1183                    "2m-lock:            %8u (%4u failed)\n"
1184                    "lock:               %8u (%4u failed)\n"
1185                    "2m-unlock:          %8u (%4u failed)\n"
1186                    "unlock:             %8u (%4u failed)\n"
1187                    "target:             %8u (%4u failed)\n"
1188                    "prim2mAlloc:        %8u (%4u failed)\n"
1189                    "primNoSleepAlloc:   %8u (%4u failed)\n"
1190                    "primCanSleepAlloc:  %8u (%4u failed)\n"
1191                    "prim2mFree:         %8u\n"
1192                    "primFree:           %8u\n"
1193                    "err2mAlloc:         %8u\n"
1194                    "errAlloc:           %8u\n"
1195                    "err2mFree:          %8u\n"
1196                    "errFree:            %8u\n"
1197                    "doorbellSet:        %8u\n"
1198                    "doorbellUnset:      %8u\n",
1199                    stats->timer,
1200                    stats->doorbell,
1201                    stats->start, stats->start_fail,
1202                    stats->guest_type, stats->guest_type_fail,
1203                    stats->lock[true],  stats->lock_fail[true],
1204                    stats->lock[false],  stats->lock_fail[false],
1205                    stats->unlock[true], stats->unlock_fail[true],
1206                    stats->unlock[false], stats->unlock_fail[false],
1207                    stats->target, stats->target_fail,
1208                    stats->alloc[true], stats->alloc_fail[true],
1209                    stats->alloc[false], stats->alloc_fail[false],
1210                    stats->sleep_alloc, stats->sleep_alloc_fail,
1211                    stats->free[true],
1212                    stats->free[false],
1213                    stats->refused_alloc[true], stats->refused_alloc[false],
1214                    stats->refused_free[true], stats->refused_free[false],
1215                    stats->doorbell_set, stats->doorbell_unset);
1216
1217         return 0;
1218 }
1219
1220 static int vmballoon_debug_open(struct inode *inode, struct file *file)
1221 {
1222         return single_open(file, vmballoon_debug_show, inode->i_private);
1223 }
1224
1225 static const struct file_operations vmballoon_debug_fops = {
1226         .owner          = THIS_MODULE,
1227         .open           = vmballoon_debug_open,
1228         .read           = seq_read,
1229         .llseek         = seq_lseek,
1230         .release        = single_release,
1231 };
1232
1233 static int __init vmballoon_debugfs_init(struct vmballoon *b)
1234 {
1235         int error;
1236
1237         b->dbg_entry = debugfs_create_file("vmmemctl", S_IRUGO, NULL, b,
1238                                            &vmballoon_debug_fops);
1239         if (IS_ERR(b->dbg_entry)) {
1240                 error = PTR_ERR(b->dbg_entry);
1241                 pr_err("failed to create debugfs entry, error: %d\n", error);
1242                 return error;
1243         }
1244
1245         return 0;
1246 }
1247
1248 static void __exit vmballoon_debugfs_exit(struct vmballoon *b)
1249 {
1250         debugfs_remove(b->dbg_entry);
1251 }
1252
1253 #else
1254
/* Without CONFIG_DEBUG_FS there is nothing to create; always succeeds. */
static inline int vmballoon_debugfs_init(struct vmballoon *b)
{
	return 0;
}
1259
/* Without CONFIG_DEBUG_FS there is nothing to remove. */
static inline void vmballoon_debugfs_exit(struct vmballoon *b)
{
}
1263
1264 #endif  /* CONFIG_DEBUG_FS */
1265
1266 static int __init vmballoon_init(void)
1267 {
1268         int error;
1269         unsigned is_2m_pages;
1270         /*
1271          * Check if we are running on VMware's hypervisor and bail out
1272          * if we are not.
1273          */
1274         if (x86_hyper_type != X86_HYPER_VMWARE)
1275                 return -ENODEV;
1276
1277         for (is_2m_pages = 0; is_2m_pages < VMW_BALLOON_NUM_PAGE_SIZES;
1278                         is_2m_pages++) {
1279                 INIT_LIST_HEAD(&balloon.page_sizes[is_2m_pages].pages);
1280                 INIT_LIST_HEAD(&balloon.page_sizes[is_2m_pages].refused_pages);
1281         }
1282
1283         /* initialize rates */
1284         balloon.rate_alloc = VMW_BALLOON_RATE_ALLOC_MAX;
1285
1286         INIT_DELAYED_WORK(&balloon.dwork, vmballoon_work);
1287
1288         error = vmballoon_debugfs_init(&balloon);
1289         if (error)
1290                 return error;
1291
1292         balloon.vmci_doorbell = VMCI_INVALID_HANDLE;
1293         balloon.batch_page = NULL;
1294         balloon.page = NULL;
1295         balloon.reset_required = true;
1296
1297         queue_delayed_work(system_freezable_wq, &balloon.dwork, 0);
1298
1299         return 0;
1300 }
1301
1302 /*
1303  * Using late_initcall() instead of module_init() allows the balloon to use the
1304  * VMCI doorbell even when the balloon is built into the kernel. Otherwise the
1305  * VMCI is probed only after the balloon is initialized. If the balloon is used
1306  * as a module, late_initcall() is equivalent to module_init().
1307  */
1308 late_initcall(vmballoon_init);
1309
1310 static void __exit vmballoon_exit(void)
1311 {
1312         vmballoon_vmci_cleanup(&balloon);
1313         cancel_delayed_work_sync(&balloon.dwork);
1314
1315         vmballoon_debugfs_exit(&balloon);
1316
1317         /*
1318          * Deallocate all reserved memory, and reset connection with monitor.
1319          * Reset connection before deallocating memory to avoid potential for
1320          * additional spurious resets from guest touching deallocated pages.
1321          */
1322         vmballoon_send_start(&balloon, 0);
1323         vmballoon_pop(&balloon);
1324 }
1325 module_exit(vmballoon_exit);