arch/x86/kvm/vmx.c

   1 /*
   2  * Kernel-based Virtual Machine driver for Linux
   3  *
   4  * This module enables machines with Intel VT-x extensions to run virtual
   5  * machines without emulation or binary translation.
   6  *
   7  * Copyright (C) 2006 Qumranet, Inc.
   8  * Copyright 2010 Red Hat, Inc. and/or its affiliates.
   9  *
  10  * Authors:
  11  *   Avi Kivity   <avi@qumranet.com>
  12  *   Yaniv Kamay  <yaniv@qumranet.com>
  13  *
  14  * This work is licensed under the terms of the GNU GPL, version 2.  See
  15  * the COPYING file in the top-level directory.
  16  *
  17  */
  18
  19 #include "irq.h"
  20 #include "mmu.h"
  21 #include "cpuid.h"
  22 #include "lapic.h"
  23
  24 #include <linux/kvm_host.h>
  25 #include <linux/module.h>
  26 #include <linux/kernel.h>
  27 #include <linux/mm.h>
  28 #include <linux/highmem.h>
  29 #include <linux/sched.h>
  30 #include <linux/sched/smt.h>
  31 #include <linux/moduleparam.h>
  32 #include <linux/mod_devicetable.h>
  33 #include <linux/trace_events.h>
  34 #include <linux/slab.h>
  35 #include <linux/tboot.h>
  36 #include <linux/hrtimer.h>
  37 #include <linux/frame.h>
  38 #include <linux/nospec.h>
  39 #include "kvm_cache_regs.h"
  40 #include "x86.h"
  41
  42 #include <asm/asm.h>
  43 #include <asm/cpu.h>
  44 #include <asm/cpu_device_id.h>
  45 #include <asm/io.h>
  46 #include <asm/desc.h>
  47 #include <asm/vmx.h>
  48 #include <asm/virtext.h>
  49 #include <asm/mce.h>
  50 #include <asm/fpu/internal.h>
  51 #include <asm/perf_event.h>
  52 #include <asm/debugreg.h>
  53 #include <asm/kexec.h>
  54 #include <asm/apic.h>
  55 #include <asm/irq_remapping.h>
  56 #include <asm/mmu_context.h>
  57 #include <asm/spec-ctrl.h>
  58 #include <asm/mshyperv.h>
  59
  60 #include "trace.h"
  61 #include "pmu.h"
  62 #include "vmx_evmcs.h"
  63
  64 #define __ex(x) __kvm_handle_fault_on_reboot(x)
  65 #define __ex_clear(x, reg) \
  66         ____kvm_handle_fault_on_reboot(x, "xor " reg " , " reg)
  67
  68 MODULE_AUTHOR("Qumranet");
  69 MODULE_LICENSE("GPL");
  70
  71 static const struct x86_cpu_id vmx_cpu_id[] = {
  72         X86_FEATURE_MATCH(X86_FEATURE_VMX),
  73         {}
  74 };
  75 MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
  76
  77 static bool __read_mostly enable_vpid = 1;
  78 module_param_named(vpid, enable_vpid, bool, 0444);
  79
  80 static bool __read_mostly enable_vnmi = 1;
  81 module_param_named(vnmi, enable_vnmi, bool, S_IRUGO);
  82
  83 static bool __read_mostly flexpriority_enabled = 1;
  84 module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);
  85
  86 static bool __read_mostly enable_ept = 1;
  87 module_param_named(ept, enable_ept, bool, S_IRUGO);
  88
  89 static bool __read_mostly enable_unrestricted_guest = 1;
  90 module_param_named(unrestricted_guest,
  91                         enable_unrestricted_guest, bool, S_IRUGO);
  92
  93 static bool __read_mostly enable_ept_ad_bits = 1;
  94 module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO);
  95
  96 static bool __read_mostly emulate_invalid_guest_state = true;
  97 module_param(emulate_invalid_guest_state, bool, S_IRUGO);
  98
  99 static bool __read_mostly fasteoi = 1;
 100 module_param(fasteoi, bool, S_IRUGO);
 101
 102 static bool __read_mostly enable_apicv = 1;
 103 module_param(enable_apicv, bool, S_IRUGO);
 104
 105 static bool __read_mostly enable_shadow_vmcs = 1;
 106 module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);
 107 /*
 108  * If nested=1, nested virtualization is supported, i.e., guests may use
 109  * VMX and be a hypervisor for its own guests. If nested=0, guests may not
 110  * use VMX instructions.
 111  */
 112 static bool __read_mostly nested = 0;
 113 module_param(nested, bool, S_IRUGO);
 114
 115 static u64 __read_mostly host_xss;
 116
 117 static bool __read_mostly enable_pml = 1;
 118 module_param_named(pml, enable_pml, bool, S_IRUGO);
 119
 120 #define MSR_TYPE_R      1
 121 #define MSR_TYPE_W      2
 122 #define MSR_TYPE_RW     3
 123
 124 #define MSR_BITMAP_MODE_X2APIC          1
 125 #define MSR_BITMAP_MODE_X2APIC_APICV    2
 126
 127 #define KVM_VMX_TSC_MULTIPLIER_MAX     0xffffffffffffffffULL
 128
 129 /* Guest_tsc -> host_tsc conversion requires 64-bit division.  */
 130 static int __read_mostly cpu_preemption_timer_multi;
 131 static bool __read_mostly enable_preemption_timer = 1;
 132 #ifdef CONFIG_X86_64
 133 module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
 134 #endif
 135
 136 #define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD)
 137 #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE
 138 #define KVM_VM_CR0_ALWAYS_ON                            \
 139         (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST |      \
 140          X86_CR0_WP | X86_CR0_PG | X86_CR0_PE)
 141 #define KVM_CR4_GUEST_OWNED_BITS                                      \
 142         (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR      \
 143          | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_TSD)
 144
 145 #define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE
 146 #define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
 147 #define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)
 148
 149 #define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM))
 150
 151 #define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5
 152
 153 /*
 154  * Hyper-V requires all of these, so mark them as supported even though
 155  * they are just treated the same as all-context.
 156  */
 157 #define VMX_VPID_EXTENT_SUPPORTED_MASK          \
 158         (VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT |  \
 159         VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT |    \
 160         VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT |    \
 161         VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT)
 162
 163 /*
 164  * These 2 parameters are used to config the controls for Pause-Loop Exiting:
 165  * ple_gap:    upper bound on the amount of time between two successive
 166  *             executions of PAUSE in a loop. Also indicate if ple enabled.
 167  *             According to test, this time is usually smaller than 128 cycles.
 168  * ple_window: upper bound on the amount of time a guest is allowed to execute
 169  *             in a PAUSE loop. Tests indicate that most spinlocks are held for
 170  *             less than 2^12 cycles
 171  * Time is measured based on a counter that runs at the same rate as the TSC,
 172  * refer SDM volume 3b section 21.6.13 & 22.1.3.
 173  */
 174 static unsigned int ple_gap = KVM_DEFAULT_PLE_GAP;
 175 module_param(ple_gap, uint, 0444);
 176
 177 static unsigned int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
 178 module_param(ple_window, uint, 0444);
 179
 180 /* Default doubles per-vcpu window every exit. */
 181 static unsigned int ple_window_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
 182 module_param(ple_window_grow, uint, 0444);
 183
 184 /* Default resets per-vcpu window every exit to ple_window. */
 185 static unsigned int ple_window_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
 186 module_param(ple_window_shrink, uint, 0444);
 187
 188 /* Default is to compute the maximum so we can never overflow. */
 189 static unsigned int ple_window_max        = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
 190 module_param(ple_window_max, uint, 0444);
 191
 192 extern const ulong vmx_return;
 193
 194 static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush);
 195 static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond);
 196 static DEFINE_MUTEX(vmx_l1d_flush_mutex);
 197
 198 /* Storage for pre module init parameter parsing */
 199 static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush_param = VMENTER_L1D_FLUSH_AUTO;
 200
 201 static const struct {
 202         const char *option;
 203         bool for_parse;
 204 } vmentry_l1d_param[] = {
 205         [VMENTER_L1D_FLUSH_AUTO]         = {"auto", true},
 206         [VMENTER_L1D_FLUSH_NEVER]        = {"never", true},
 207         [VMENTER_L1D_FLUSH_COND]         = {"cond", true},
 208         [VMENTER_L1D_FLUSH_ALWAYS]       = {"always", true},
 209         [VMENTER_L1D_FLUSH_EPT_DISABLED] = {"EPT disabled", false},
 210         [VMENTER_L1D_FLUSH_NOT_REQUIRED] = {"not required", false},
 211 };
 212
 213 #define L1D_CACHE_ORDER 4
 214 static void *vmx_l1d_flush_pages;
 215
 216 /* Control for disabling CPU Fill buffer clear */
 217 static bool __read_mostly vmx_fb_clear_ctrl_available;
 218
 219 static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
 220 {
 221         struct page *page;
 222         unsigned int i;
 223
 224         if (!enable_ept) {
 225                 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED;
 226                 return 0;
 227         }
 228
 229         if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) {
 230                 u64 msr;
 231
 232                 rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr);
 233                 if (msr & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) {
 234                         l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
 235                         return 0;
 236                 }
 237         }
 238
 239         /* If set to auto use the default l1tf mitigation method */
 240         if (l1tf == VMENTER_L1D_FLUSH_AUTO) {
 241                 switch (l1tf_mitigation) {
 242                 case L1TF_MITIGATION_OFF:
 243                         l1tf = VMENTER_L1D_FLUSH_NEVER;
 244                         break;
 245                 case L1TF_MITIGATION_FLUSH_NOWARN:
 246                 case L1TF_MITIGATION_FLUSH:
 247                 case L1TF_MITIGATION_FLUSH_NOSMT:
 248                         l1tf = VMENTER_L1D_FLUSH_COND;
 249                         break;
 250                 case L1TF_MITIGATION_FULL:
 251                 case L1TF_MITIGATION_FULL_FORCE:
 252                         l1tf = VMENTER_L1D_FLUSH_ALWAYS;
 253                         break;
 254                 }
 255         } else if (l1tf_mitigation == L1TF_MITIGATION_FULL_FORCE) {
 256                 l1tf = VMENTER_L1D_FLUSH_ALWAYS;
 257         }
 258
 259         if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages &&
 260             !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) {
 261                 page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER);
 262                 if (!page)
 263                         return -ENOMEM;
 264                 vmx_l1d_flush_pages = page_address(page);
 265
 266                 /*
 267                  * Initialize each page with a different pattern in
 268                  * order to protect against KSM in the nested
 269                  * virtualization case.
 270                  */
 271                 for (i = 0; i < 1u << L1D_CACHE_ORDER; ++i) {
 272                         memset(vmx_l1d_flush_pages + i * PAGE_SIZE, i + 1,
 273                                PAGE_SIZE);
 274                 }
 275         }
 276
 277         l1tf_vmx_mitigation = l1tf;
 278
 279         if (l1tf != VMENTER_L1D_FLUSH_NEVER)
 280                 static_branch_enable(&vmx_l1d_should_flush);
 281         else
 282                 static_branch_disable(&vmx_l1d_should_flush);
 283
 284         if (l1tf == VMENTER_L1D_FLUSH_COND)
 285                 static_branch_enable(&vmx_l1d_flush_cond);
 286         else
 287                 static_branch_disable(&vmx_l1d_flush_cond);
 288         return 0;
 289 }
 290
 291 static int vmentry_l1d_flush_parse(const char *s)
 292 {
 293         unsigned int i;
 294
 295         if (s) {
 296                 for (i = 0; i < ARRAY_SIZE(vmentry_l1d_param); i++) {
 297                         if (vmentry_l1d_param[i].for_parse &&
 298                             sysfs_streq(s, vmentry_l1d_param[i].option))
 299                                 return i;
 300                 }
 301         }
 302         return -EINVAL;
 303 }
 304
 305 static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp)
 306 {
 307         int l1tf, ret;
 308
 309         l1tf = vmentry_l1d_flush_parse(s);
 310         if (l1tf < 0)
 311                 return l1tf;
 312
 313         if (!boot_cpu_has(X86_BUG_L1TF))
 314                 return 0;
 315
 316         /*
 317          * Has vmx_init() run already? If not then this is the pre init
 318          * parameter parsing. In that case just store the value and let
 319          * vmx_init() do the proper setup after enable_ept has been
 320          * established.
 321          */
 322         if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) {
 323                 vmentry_l1d_flush_param = l1tf;
 324                 return 0;
 325         }
 326
 327         mutex_lock(&vmx_l1d_flush_mutex);
 328         ret = vmx_setup_l1d_flush(l1tf);
 329         mutex_unlock(&vmx_l1d_flush_mutex);
 330         return ret;
 331 }
 332
 333 static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp)
 334 {
 335         if (WARN_ON_ONCE(l1tf_vmx_mitigation >= ARRAY_SIZE(vmentry_l1d_param)))
 336                 return sprintf(s, "???\n");
 337
 338         return sprintf(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option);
 339 }
 340
 341 static const struct kernel_param_ops vmentry_l1d_flush_ops = {
 342         .set = vmentry_l1d_flush_set,
 343         .get = vmentry_l1d_flush_get,
 344 };
 345 module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644);
 346
 347 enum ept_pointers_status {
 348         EPT_POINTERS_CHECK = 0,
 349         EPT_POINTERS_MATCH = 1,
 350         EPT_POINTERS_MISMATCH = 2
 351 };
 352
 353 struct kvm_vmx {
 354         struct kvm kvm;
 355
 356         unsigned int tss_addr;
 357         bool ept_identity_pagetable_done;
 358         gpa_t ept_identity_map_addr;
 359
 360         enum ept_pointers_status ept_pointers_match;
 361         spinlock_t ept_pointer_lock;
 362 };
 363
 364 #define NR_AUTOLOAD_MSRS 8
 365
 366 struct vmcs_hdr {
 367         u32 revision_id:31;
 368         u32 shadow_vmcs:1;
 369 };
 370
 371 struct vmcs {
 372         struct vmcs_hdr hdr;
 373         u32 abort;
 374         char data[0];
 375 };
 376
 377 /*
 378  * vmcs_host_state tracks registers that are loaded from the VMCS on VMEXIT
 379  * and whose values change infrequently, but are not constant.  I.e. this is
 380  * used as a write-through cache of the corresponding VMCS fields.
 381  */
 382 struct vmcs_host_state {
 383         unsigned long cr3;      /* May not match real cr3 */
 384         unsigned long cr4;      /* May not match real cr4 */
 385         unsigned long gs_base;
 386         unsigned long fs_base;
 387
 388         u16           fs_sel, gs_sel, ldt_sel;
 389 #ifdef CONFIG_X86_64
 390         u16           ds_sel, es_sel;
 391 #endif
 392 };
 393
 394 /*
 395  * Track a VMCS that may be loaded on a certain CPU. If it is (cpu!=-1), also
 396  * remember whether it was VMLAUNCHed, and maintain a linked list of all VMCSs
 397  * loaded on this CPU (so we can clear them if the CPU goes down).
 398  */
 399 struct loaded_vmcs {
 400         struct vmcs *vmcs;
 401         struct vmcs *shadow_vmcs;
 402         int cpu;
 403         bool launched;
 404         bool nmi_known_unmasked;
 405         bool hv_timer_armed;
 406         /* Support for vnmi-less CPUs */
 407         int soft_vnmi_blocked;
 408         ktime_t entry_time;
 409         s64 vnmi_blocked_time;
 410         unsigned long *msr_bitmap;
 411         struct list_head loaded_vmcss_on_cpu_link;
 412         struct vmcs_host_state host_state;
 413 };
 414
 415 struct shared_msr_entry {
 416         unsigned index;
 417         u64 data;
 418         u64 mask;
 419 };
 420
 421 /*
 422  * struct vmcs12 describes the state that our guest hypervisor (L1) keeps for a
 423  * single nested guest (L2), hence the name vmcs12. Any VMX implementation has
 424  * a VMCS structure, and vmcs12 is our emulated VMX's VMCS. This structure is
 425  * stored in guest memory specified by VMPTRLD, but is opaque to the guest,
 426  * which must access it using VMREAD/VMWRITE/VMCLEAR instructions.
 427  * More than one of these structures may exist, if L1 runs multiple L2 guests.
 428  * nested_vmx_run() will use the data here to build the vmcs02: a VMCS for the
 429  * underlying hardware which will be used to run L2.
 430  * This structure is packed to ensure that its layout is identical across
 431  * machines (necessary for live migration).
 432  *
 433  * IMPORTANT: Changing the layout of existing fields in this structure
 434  * will break save/restore compatibility with older kvm releases. When
 435  * adding new fields, either use space in the reserved padding* arrays
 436  * or add the new fields to the end of the structure.
 437  */
 438 typedef u64 natural_width;
 439 struct __packed vmcs12 {
 440         /* According to the Intel spec, a VMCS region must start with the
 441          * following two fields. Then follow implementation-specific data.
 442          */
 443         struct vmcs_hdr hdr;
 444         u32 abort;
 445
 446         u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */
 447         u32 padding[7]; /* room for future expansion */
 448
 449         u64 io_bitmap_a;
 450         u64 io_bitmap_b;
 451         u64 msr_bitmap;
 452         u64 vm_exit_msr_store_addr;
 453         u64 vm_exit_msr_load_addr;
 454         u64 vm_entry_msr_load_addr;
 455         u64 tsc_offset;
 456         u64 virtual_apic_page_addr;
 457         u64 apic_access_addr;
 458         u64 posted_intr_desc_addr;
 459         u64 ept_pointer;
 460         u64 eoi_exit_bitmap0;
 461         u64 eoi_exit_bitmap1;
 462         u64 eoi_exit_bitmap2;
 463         u64 eoi_exit_bitmap3;
 464         u64 xss_exit_bitmap;
 465         u64 guest_physical_address;
 466         u64 vmcs_link_pointer;
 467         u64 guest_ia32_debugctl;
 468         u64 guest_ia32_pat;
 469         u64 guest_ia32_efer;
 470         u64 guest_ia32_perf_global_ctrl;
 471         u64 guest_pdptr0;
 472         u64 guest_pdptr1;
 473         u64 guest_pdptr2;
 474         u64 guest_pdptr3;
 475         u64 guest_bndcfgs;
 476         u64 host_ia32_pat;
 477         u64 host_ia32_efer;
 478         u64 host_ia32_perf_global_ctrl;
 479         u64 vmread_bitmap;
 480         u64 vmwrite_bitmap;
 481         u64 vm_function_control;
 482         u64 eptp_list_address;
 483         u64 pml_address;
 484         u64 padding64[3]; /* room for future expansion */
 485         /*
 486          * To allow migration of L1 (complete with its L2 guests) between
 487          * machines of different natural widths (32 or 64 bit), we cannot have
 488          * unsigned long fields with no explict size. We use u64 (aliased
 489          * natural_width) instead. Luckily, x86 is little-endian.
 490          */
 491         natural_width cr0_guest_host_mask;
 492         natural_width cr4_guest_host_mask;
 493         natural_width cr0_read_shadow;
 494         natural_width cr4_read_shadow;
 495         natural_width cr3_target_value0;
 496         natural_width cr3_target_value1;
 497         natural_width cr3_target_value2;
 498         natural_width cr3_target_value3;
 499         natural_width exit_qualification;
 500         natural_width guest_linear_address;
 501         natural_width guest_cr0;
 502         natural_width guest_cr3;
 503         natural_width guest_cr4;
 504         natural_width guest_es_base;
 505         natural_width guest_cs_base;
 506         natural_width guest_ss_base;
 507         natural_width guest_ds_base;
 508         natural_width guest_fs_base;
 509         natural_width guest_gs_base;
 510         natural_width guest_ldtr_base;
 511         natural_width guest_tr_base;
 512         natural_width guest_gdtr_base;
 513         natural_width guest_idtr_base;
 514         natural_width guest_dr7;
 515         natural_width guest_rsp;
 516         natural_width guest_rip;
 517         natural_width guest_rflags;
 518         natural_width guest_pending_dbg_exceptions;
 519         natural_width guest_sysenter_esp;
 520         natural_width guest_sysenter_eip;
 521         natural_width host_cr0;
 522         natural_width host_cr3;
 523         natural_width host_cr4;
 524         natural_width host_fs_base;
 525         natural_width host_gs_base;
 526         natural_width host_tr_base;
 527         natural_width host_gdtr_base;
 528         natural_width host_idtr_base;
 529         natural_width host_ia32_sysenter_esp;
 530         natural_width host_ia32_sysenter_eip;
 531         natural_width host_rsp;
 532         natural_width host_rip;
 533         natural_width paddingl[8]; /* room for future expansion */
 534         u32 pin_based_vm_exec_control;
 535         u32 cpu_based_vm_exec_control;
 536         u32 exception_bitmap;
 537         u32 page_fault_error_code_mask;
 538         u32 page_fault_error_code_match;
 539         u32 cr3_target_count;
 540         u32 vm_exit_controls;
 541         u32 vm_exit_msr_store_count;
 542         u32 vm_exit_msr_load_count;
 543         u32 vm_entry_controls;
 544         u32 vm_entry_msr_load_count;
 545         u32 vm_entry_intr_info_field;
 546         u32 vm_entry_exception_error_code;
 547         u32 vm_entry_instruction_len;
 548         u32 tpr_threshold;
 549         u32 secondary_vm_exec_control;
 550         u32 vm_instruction_error;
 551         u32 vm_exit_reason;
 552         u32 vm_exit_intr_info;
 553         u32 vm_exit_intr_error_code;
 554         u32 idt_vectoring_info_field;
 555         u32 idt_vectoring_error_code;
 556         u32 vm_exit_instruction_len;
 557         u32 vmx_instruction_info;
 558         u32 guest_es_limit;
 559         u32 guest_cs_limit;
 560         u32 guest_ss_limit;
 561         u32 guest_ds_limit;
 562         u32 guest_fs_limit;
 563         u32 guest_gs_limit;
 564         u32 guest_ldtr_limit;
 565         u32 guest_tr_limit;
 566         u32 guest_gdtr_limit;
 567         u32 guest_idtr_limit;
 568         u32 guest_es_ar_bytes;
 569         u32 guest_cs_ar_bytes;
 570         u32 guest_ss_ar_bytes;
 571         u32 guest_ds_ar_bytes;
 572         u32 guest_fs_ar_bytes;
 573         u32 guest_gs_ar_bytes;
 574         u32 guest_ldtr_ar_bytes;
 575         u32 guest_tr_ar_bytes;
 576         u32 guest_interruptibility_info;
 577         u32 guest_activity_state;
 578         u32 guest_sysenter_cs;
 579         u32 host_ia32_sysenter_cs;
 580         u32 vmx_preemption_timer_value;
 581         u32 padding32[7]; /* room for future expansion */
 582         u16 virtual_processor_id;
 583         u16 posted_intr_nv;
 584         u16 guest_es_selector;
 585         u16 guest_cs_selector;
 586         u16 guest_ss_selector;
 587         u16 guest_ds_selector;
 588         u16 guest_fs_selector;
 589         u16 guest_gs_selector;
 590         u16 guest_ldtr_selector;
 591         u16 guest_tr_selector;
 592         u16 guest_intr_status;
 593         u16 host_es_selector;
 594         u16 host_cs_selector;
 595         u16 host_ss_selector;
 596         u16 host_ds_selector;
 597         u16 host_fs_selector;
 598         u16 host_gs_selector;
 599         u16 host_tr_selector;
 600         u16 guest_pml_index;
 601 };
 602
 603 /*
 604  * For save/restore compatibility, the vmcs12 field offsets must not change.
 605  */
 606 #define CHECK_OFFSET(field, loc)                                \
 607         BUILD_BUG_ON_MSG(offsetof(struct vmcs12, field) != (loc),       \
 608                 "Offset of " #field " in struct vmcs12 has changed.")
 609
 610 static inline void vmx_check_vmcs12_offsets(void) {
 611         CHECK_OFFSET(hdr, 0);
 612         CHECK_OFFSET(abort, 4);
 613         CHECK_OFFSET(launch_state, 8);
 614         CHECK_OFFSET(io_bitmap_a, 40);
 615         CHECK_OFFSET(io_bitmap_b, 48);
 616         CHECK_OFFSET(msr_bitmap, 56);
 617         CHECK_OFFSET(vm_exit_msr_store_addr, 64);
 618         CHECK_OFFSET(vm_exit_msr_load_addr, 72);
 619         CHECK_OFFSET(vm_entry_msr_load_addr, 80);
 620         CHECK_OFFSET(tsc_offset, 88);
 621         CHECK_OFFSET(virtual_apic_page_addr, 96);
 622         CHECK_OFFSET(apic_access_addr, 104);
 623         CHECK_OFFSET(posted_intr_desc_addr, 112);
 624         CHECK_OFFSET(ept_pointer, 120);
 625         CHECK_OFFSET(eoi_exit_bitmap0, 128);
 626         CHECK_OFFSET(eoi_exit_bitmap1, 136);
 627         CHECK_OFFSET(eoi_exit_bitmap2, 144);
 628         CHECK_OFFSET(eoi_exit_bitmap3, 152);
 629         CHECK_OFFSET(xss_exit_bitmap, 160);
 630         CHECK_OFFSET(guest_physical_address, 168);
 631         CHECK_OFFSET(vmcs_link_pointer, 176);
 632         CHECK_OFFSET(guest_ia32_debugctl, 184);
 633         CHECK_OFFSET(guest_ia32_pat, 192);
 634         CHECK_OFFSET(guest_ia32_efer, 200);
 635         CHECK_OFFSET(guest_ia32_perf_global_ctrl, 208);
 636         CHECK_OFFSET(guest_pdptr0, 216);
 637         CHECK_OFFSET(guest_pdptr1, 224);
 638         CHECK_OFFSET(guest_pdptr2, 232);
 639         CHECK_OFFSET(guest_pdptr3, 240);
 640         CHECK_OFFSET(guest_bndcfgs, 248);
 641         CHECK_OFFSET(host_ia32_pat, 256);
 642         CHECK_OFFSET(host_ia32_efer, 264);
 643         CHECK_OFFSET(host_ia32_perf_global_ctrl, 272);
 644         CHECK_OFFSET(vmread_bitmap, 280);
 645         CHECK_OFFSET(vmwrite_bitmap, 288);
 646         CHECK_OFFSET(vm_function_control, 296);
 647         CHECK_OFFSET(eptp_list_address, 304);
 648         CHECK_OFFSET(pml_address, 312);
 649         CHECK_OFFSET(cr0_guest_host_mask, 344);
 650         CHECK_OFFSET(cr4_guest_host_mask, 352);
 651         CHECK_OFFSET(cr0_read_shadow, 360);
 652         CHECK_OFFSET(cr4_read_shadow, 368);
 653         CHECK_OFFSET(cr3_target_value0, 376);
 654         CHECK_OFFSET(cr3_target_value1, 384);
 655         CHECK_OFFSET(cr3_target_value2, 392);
 656         CHECK_OFFSET(cr3_target_value3, 400);
 657         CHECK_OFFSET(exit_qualification, 408);
 658         CHECK_OFFSET(guest_linear_address, 416);
 659         CHECK_OFFSET(guest_cr0, 424);
 660         CHECK_OFFSET(guest_cr3, 432);
 661         CHECK_OFFSET(guest_cr4, 440);
 662         CHECK_OFFSET(guest_es_base, 448);
 663         CHECK_OFFSET(guest_cs_base, 456);
 664         CHECK_OFFSET(guest_ss_base, 464);
 665         CHECK_OFFSET(guest_ds_base, 472);
 666         CHECK_OFFSET(guest_fs_base, 480);
 667         CHECK_OFFSET(guest_gs_base, 488);
 668         CHECK_OFFSET(guest_ldtr_base, 496);
 669         CHECK_OFFSET(guest_tr_base, 504);
 670         CHECK_OFFSET(guest_gdtr_base, 512);
 671         CHECK_OFFSET(guest_idtr_base, 520);
 672         CHECK_OFFSET(guest_dr7, 528);
 673         CHECK_OFFSET(guest_rsp, 536);
 674         CHECK_OFFSET(guest_rip, 544);
 675         CHECK_OFFSET(guest_rflags, 552);
 676         CHECK_OFFSET(guest_pending_dbg_exceptions, 560);
 677         CHECK_OFFSET(guest_sysenter_esp, 568);
 678         CHECK_OFFSET(guest_sysenter_eip, 576);
 679         CHECK_OFFSET(host_cr0, 584);
 680         CHECK_OFFSET(host_cr3, 592);
 681         CHECK_OFFSET(host_cr4, 600);
 682         CHECK_OFFSET(host_fs_base, 608);
 683         CHECK_OFFSET(host_gs_base, 616);
 684         CHECK_OFFSET(host_tr_base, 624);
 685         CHECK_OFFSET(host_gdtr_base, 632);
 686         CHECK_OFFSET(host_idtr_base, 640);
 687         CHECK_OFFSET(host_ia32_sysenter_esp, 648);
 688         CHECK_OFFSET(host_ia32_sysenter_eip, 656);
 689         CHECK_OFFSET(host_rsp, 664);
 690         CHECK_OFFSET(host_rip, 672);
 691         CHECK_OFFSET(pin_based_vm_exec_control, 744);
 692         CHECK_OFFSET(cpu_based_vm_exec_control, 748);
 693         CHECK_OFFSET(exception_bitmap, 752);
 694         CHECK_OFFSET(page_fault_error_code_mask, 756);
 695         CHECK_OFFSET(page_fault_error_code_match, 760);
 696         CHECK_OFFSET(cr3_target_count, 764);
 697         CHECK_OFFSET(vm_exit_controls, 768);
 698         CHECK_OFFSET(vm_exit_msr_store_count, 772);
 699         CHECK_OFFSET(vm_exit_msr_load_count, 776);
 700         CHECK_OFFSET(vm_entry_controls, 780);
 701         CHECK_OFFSET(vm_entry_msr_load_count, 784);
 702         CHECK_OFFSET(vm_entry_intr_info_field, 788);
 703         CHECK_OFFSET(vm_entry_exception_error_code, 792);
 704         CHECK_OFFSET(vm_entry_instruction_len, 796);
 705         CHECK_OFFSET(tpr_threshold, 800);
 706         CHECK_OFFSET(secondary_vm_exec_control, 804);
 707         CHECK_OFFSET(vm_instruction_error, 808);
 708         CHECK_OFFSET(vm_exit_reason, 812);
 709         CHECK_OFFSET(vm_exit_intr_info, 816);
 710         CHECK_OFFSET(vm_exit_intr_error_code, 820);
 711         CHECK_OFFSET(idt_vectoring_info_field, 824);
 712         CHECK_OFFSET(idt_vectoring_error_code, 828);
 713         CHECK_OFFSET(vm_exit_instruction_len, 832);
 714         CHECK_OFFSET(vmx_instruction_info, 836);
 715         CHECK_OFFSET(guest_es_limit, 840);
 716         CHECK_OFFSET(guest_cs_limit, 844);
 717         CHECK_OFFSET(guest_ss_limit, 848);
 718         CHECK_OFFSET(guest_ds_limit, 852);
 719         CHECK_OFFSET(guest_fs_limit, 856);
 720         CHECK_OFFSET(guest_gs_limit, 860);
 721         CHECK_OFFSET(guest_ldtr_limit, 864);
 722         CHECK_OFFSET(guest_tr_limit, 868);
 723         CHECK_OFFSET(guest_gdtr_limit, 872);
 724         CHECK_OFFSET(guest_idtr_limit, 876);
 725         CHECK_OFFSET(guest_es_ar_bytes, 880);
 726         CHECK_OFFSET(guest_cs_ar_bytes, 884);
 727         CHECK_OFFSET(guest_ss_ar_bytes, 888);
 728         CHECK_OFFSET(guest_ds_ar_bytes, 892);
 729         CHECK_OFFSET(guest_fs_ar_bytes, 896);
 730         CHECK_OFFSET(guest_gs_ar_bytes, 900);
 731         CHECK_OFFSET(guest_ldtr_ar_bytes, 904);
 732         CHECK_OFFSET(guest_tr_ar_bytes, 908);
 733         CHECK_OFFSET(guest_interruptibility_info, 912);
 734         CHECK_OFFSET(guest_activity_state, 916);
 735         CHECK_OFFSET(guest_sysenter_cs, 920);
 736         CHECK_OFFSET(host_ia32_sysenter_cs, 924);
 737         CHECK_OFFSET(vmx_preemption_timer_value, 928);
 738         CHECK_OFFSET(virtual_processor_id, 960);
 739         CHECK_OFFSET(posted_intr_nv, 962);
 740         CHECK_OFFSET(guest_es_selector, 964);
 741         CHECK_OFFSET(guest_cs_selector, 966);
 742         CHECK_OFFSET(guest_ss_selector, 968);
 743         CHECK_OFFSET(guest_ds_selector, 970);
 744         CHECK_OFFSET(guest_fs_selector, 972);
 745         CHECK_OFFSET(guest_gs_selector, 974);
 746         CHECK_OFFSET(guest_ldtr_selector, 976);
 747         CHECK_OFFSET(guest_tr_selector, 978);
 748         CHECK_OFFSET(guest_intr_status, 980);
 749         CHECK_OFFSET(host_es_selector, 982);
 750         CHECK_OFFSET(host_cs_selector, 984);
 751         CHECK_OFFSET(host_ss_selector, 986);
 752         CHECK_OFFSET(host_ds_selector, 988);
 753         CHECK_OFFSET(host_fs_selector, 990);
 754         CHECK_OFFSET(host_gs_selector, 992);
 755         CHECK_OFFSET(host_tr_selector, 994);
 756         CHECK_OFFSET(guest_pml_index, 996);
 757 }
 758
 759 /*
 760  * VMCS12_REVISION is an arbitrary id that should be changed if the content or
 761  * layout of struct vmcs12 is changed. MSR_IA32_VMX_BASIC returns this id, and
 762  * VMPTRLD verifies that the VMCS region that L1 is loading contains this id.
 763  *
 764  * IMPORTANT: Changing this value will break save/restore compatibility with
 765  * older kvm releases.
 766  */
 767 #define VMCS12_REVISION 0x11e57ed0
 768
 769 /*
 770  * VMCS12_SIZE is the number of bytes L1 should allocate for the VMXON region
 771  * and any VMCS region. Although only sizeof(struct vmcs12) are used by the
 772  * current implementation, 4K are reserved to avoid future complications.
 773  */
 774 #define VMCS12_SIZE 0x1000
 775
 776 /*
 777  * VMCS12_MAX_FIELD_INDEX is the highest index value used in any
 778  * supported VMCS12 field encoding.
 779  */
 780 #define VMCS12_MAX_FIELD_INDEX 0x17
 781
 782 struct nested_vmx_msrs {
 783         /*
 784          * We only store the "true" versions of the VMX capability MSRs. We
 785          * generate the "non-true" versions by setting the must-be-1 bits
 786          * according to the SDM.
 787          */
 788         u32 procbased_ctls_low;
 789         u32 procbased_ctls_high;
 790         u32 secondary_ctls_low;
 791         u32 secondary_ctls_high;
 792         u32 pinbased_ctls_low;
 793         u32 pinbased_ctls_high;
 794         u32 exit_ctls_low;
 795         u32 exit_ctls_high;
 796         u32 entry_ctls_low;
 797         u32 entry_ctls_high;
 798         u32 misc_low;
 799         u32 misc_high;
 800         u32 ept_caps;
 801         u32 vpid_caps;
 802         u64 basic;
 803         u64 cr0_fixed0;
 804         u64 cr0_fixed1;
 805         u64 cr4_fixed0;
 806         u64 cr4_fixed1;
 807         u64 vmcs_enum;
 808         u64 vmfunc_controls;
 809 };
 810
 811 /*
 812  * The nested_vmx structure is part of vcpu_vmx, and holds information we need
 813  * for correct emulation of VMX (i.e., nested VMX) on this vcpu.
 814  */
 815 struct nested_vmx {
 816         /* Has the level1 guest done vmxon? */
 817         bool vmxon;
 818         gpa_t vmxon_ptr;
 819         bool pml_full;
 820
 821         /* The guest-physical address of the current VMCS L1 keeps for L2 */
 822         gpa_t current_vmptr;
 823         /*
 824          * Cache of the guest's VMCS, existing outside of guest memory.
 825          * Loaded from guest memory during VMPTRLD. Flushed to guest
 826          * memory during VMCLEAR and VMPTRLD.
 827          */
 828         struct vmcs12 *cached_vmcs12;
 829         /*
 830          * Cache of the guest's shadow VMCS, existing outside of guest
 831          * memory. Loaded from guest memory during VM entry. Flushed
 832          * to guest memory during VM exit.
 833          */
 834         struct vmcs12 *cached_shadow_vmcs12;
 835         /*
 836          * Indicates if the shadow vmcs must be updated with the
 837          * data hold by vmcs12
 838          */
 839         bool sync_shadow_vmcs;
 840         bool dirty_vmcs12;
 841
 842         bool change_vmcs01_virtual_apic_mode;
 843
 844         /* L2 must run next, and mustn't decide to exit to L1. */
 845         bool nested_run_pending;
 846
 847         struct loaded_vmcs vmcs02;
 848
 849         /*
 850          * Guest pages referred to in the vmcs02 with host-physical
 851          * pointers, so we must keep them pinned while L2 runs.
 852          */
 853         struct page *apic_access_page;
 854         struct page *virtual_apic_page;
 855         struct page *pi_desc_page;
 856         struct pi_desc *pi_desc;
 857         bool pi_pending;
 858         u16 posted_intr_nv;
 859
 860         struct hrtimer preemption_timer;
 861         bool preemption_timer_expired;
 862
 863         /* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */
 864         u64 vmcs01_debugctl;
 865         u64 vmcs01_guest_bndcfgs;
 866
 867         u16 vpid02;
 868         u16 last_vpid;
 869
 870         struct nested_vmx_msrs msrs;
 871
 872         /* SMM related state */
 873         struct {
 874                 /* in VMX operation on SMM entry? */
 875                 bool vmxon;
 876                 /* in guest mode on SMM entry? */
 877                 bool guest_mode;
 878         } smm;
 879 };
 880
 881 #define POSTED_INTR_ON  0
 882 #define POSTED_INTR_SN  1
 883
 884 /* Posted-Interrupt Descriptor */
 885 struct pi_desc {
 886         u32 pir[8];     /* Posted interrupt requested */
 887         union {
 888                 struct {
 889                                 /* bit 256 - Outstanding Notification */
 890                         u16     on      : 1,
 891                                 /* bit 257 - Suppress Notification */
 892                                 sn      : 1,
 893                                 /* bit 271:258 - Reserved */
 894                                 rsvd_1  : 14;
 895                                 /* bit 279:272 - Notification Vector */
 896                         u8      nv;
 897                                 /* bit 287:280 - Reserved */
 898                         u8      rsvd_2;
 899                                 /* bit 319:288 - Notification Destination */
 900                         u32     ndst;
 901                 };
 902                 u64 control;
 903         };
 904         u32 rsvd[6];
 905 } __aligned(64);
 906
 907 static bool pi_test_and_set_on(struct pi_desc *pi_desc)
 908 {
 909         return test_and_set_bit(POSTED_INTR_ON,
 910                         (unsigned long *)&pi_desc->control);
 911 }
 912
 913 static bool pi_test_and_clear_on(struct pi_desc *pi_desc)
 914 {
 915         return test_and_clear_bit(POSTED_INTR_ON,
 916                         (unsigned long *)&pi_desc->control);
 917 }
 918
 919 static int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
 920 {
 921         return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
 922 }
 923
 924 static inline void pi_clear_sn(struct pi_desc *pi_desc)
 925 {
 926         return clear_bit(POSTED_INTR_SN,
 927                         (unsigned long *)&pi_desc->control);
 928 }
 929
 930 static inline void pi_set_sn(struct pi_desc *pi_desc)
 931 {
 932         return set_bit(POSTED_INTR_SN,
 933                         (unsigned long *)&pi_desc->control);
 934 }
 935
 936 static inline void pi_clear_on(struct pi_desc *pi_desc)
 937 {
 938         clear_bit(POSTED_INTR_ON,
 939                   (unsigned long *)&pi_desc->control);
 940 }
 941
 942 static inline int pi_test_on(struct pi_desc *pi_desc)
 943 {
 944         return test_bit(POSTED_INTR_ON,
 945                         (unsigned long *)&pi_desc->control);
 946 }
 947
 948 static inline int pi_test_sn(struct pi_desc *pi_desc)
 949 {
 950         return test_bit(POSTED_INTR_SN,
 951                         (unsigned long *)&pi_desc->control);
 952 }
 953
 954 struct vmx_msrs {
 955         unsigned int            nr;
 956         struct vmx_msr_entry    val[NR_AUTOLOAD_MSRS];
 957 };
 958
 959 struct vcpu_vmx {
 960         struct kvm_vcpu       vcpu;
 961         unsigned long         host_rsp;
 962         u8                    fail;
 963         u8                    msr_bitmap_mode;
 964         u32                   exit_intr_info;
 965         u32                   idt_vectoring_info;
 966         ulong                 rflags;
 967         struct shared_msr_entry *guest_msrs;
 968         int                   nmsrs;
 969         int                   save_nmsrs;
 970         bool                  guest_msrs_dirty;
 971         unsigned long         host_idt_base;
 972 #ifdef CONFIG_X86_64
 973         u64                   msr_host_kernel_gs_base;
 974         u64                   msr_guest_kernel_gs_base;
 975 #endif
 976
 977         u64                   spec_ctrl;
 978
 979         u32 vm_entry_controls_shadow;
 980         u32 vm_exit_controls_shadow;
 981         u32 secondary_exec_control;
 982
 983         /*
 984          * loaded_vmcs points to the VMCS currently used in this vcpu. For a
 985          * non-nested (L1) guest, it always points to vmcs01. For a nested
 986          * guest (L2), it points to a different VMCS.  loaded_cpu_state points
 987          * to the VMCS whose state is loaded into the CPU registers that only
 988          * need to be switched when transitioning to/from the kernel; a NULL
 989          * value indicates that host state is loaded.
 990          */
 991         struct loaded_vmcs    vmcs01;
 992         struct loaded_vmcs   *loaded_vmcs;
 993         struct loaded_vmcs   *loaded_cpu_state;
 994         bool                  __launched; /* temporary, used in vmx_vcpu_run */
 995         struct msr_autoload {
 996                 struct vmx_msrs guest;
 997                 struct vmx_msrs host;
 998         } msr_autoload;
 999
1000         struct {
1001                 int vm86_active;
1002                 ulong save_rflags;
1003                 struct kvm_segment segs[8];
1004         } rmode;
1005         struct {
1006                 u32 bitmask; /* 4 bits per segment (1 bit per field) */
1007                 struct kvm_save_segment {
1008                         u16 selector;
1009                         unsigned long base;
1010                         u32 limit;
1011                         u32 ar;
1012                 } seg[8];
1013         } segment_cache;
1014         int vpid;
1015         bool emulation_required;
1016
1017         u32 exit_reason;
1018
1019         /* Posted interrupt descriptor */
1020         struct pi_desc pi_desc;
1021
1022         /* Support for a guest hypervisor (nested VMX) */
1023         struct nested_vmx nested;
1024
1025         /* Dynamic PLE window. */
1026         int ple_window;
1027         bool ple_window_dirty;
1028
1029         bool req_immediate_exit;
1030
1031         /* Support for PML */
1032 #define PML_ENTITY_NUM          512
1033         struct page *pml_pg;
1034
1035         /* apic deadline value in host tsc */
1036         u64 hv_deadline_tsc;
1037
1038         u64 current_tsc_ratio;
1039
1040         u32 host_pkru;
1041
1042         unsigned long host_debugctlmsr;
1043
1044         /*
1045          * Only bits masked by msr_ia32_feature_control_valid_bits can be set in
1046          * msr_ia32_feature_control. FEATURE_CONTROL_LOCKED is always included
1047          * in msr_ia32_feature_control_valid_bits.
1048          */
1049         u64 msr_ia32_feature_control;
1050         u64 msr_ia32_feature_control_valid_bits;
1051         u64 ept_pointer;
1052         u64 msr_ia32_mcu_opt_ctrl;
1053         bool disable_fb_clear;
1054 };
1055
1056 enum segment_cache_field {
1057         SEG_FIELD_SEL = 0,
1058         SEG_FIELD_BASE = 1,
1059         SEG_FIELD_LIMIT = 2,
1060         SEG_FIELD_AR = 3,
1061
1062         SEG_FIELD_NR = 4
1063 };
1064
1065 static inline struct kvm_vmx *to_kvm_vmx(struct kvm *kvm)
1066 {
1067         return container_of(kvm, struct kvm_vmx, kvm);
1068 }
1069
1070 static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
1071 {
1072         return container_of(vcpu, struct vcpu_vmx, vcpu);
1073 }
1074
1075 static struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu)
1076 {
1077         return &(to_vmx(vcpu)->pi_desc);
1078 }
1079
1080 #define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n)))))
1081 #define VMCS12_OFFSET(x) offsetof(struct vmcs12, x)
1082 #define FIELD(number, name)     [ROL16(number, 6)] = VMCS12_OFFSET(name)
1083 #define FIELD64(number, name)                                           \
1084         FIELD(number, name),                                            \
1085         [ROL16(number##_HIGH, 6)] = VMCS12_OFFSET(name) + sizeof(u32)
1086
1087
1088 static u16 shadow_read_only_fields[] = {
1089 #define SHADOW_FIELD_RO(x) x,
1090 #include "vmx_shadow_fields.h"
1091 };
1092 static int max_shadow_read_only_fields =
1093         ARRAY_SIZE(shadow_read_only_fields);
1094
1095 static u16 shadow_read_write_fields[] = {
1096 #define SHADOW_FIELD_RW(x) x,
1097 #include "vmx_shadow_fields.h"
1098 };
1099 static int max_shadow_read_write_fields =
1100         ARRAY_SIZE(shadow_read_write_fields);
1101
1102 static const unsigned short vmcs_field_to_offset_table[] = {
1103         FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id),
1104         FIELD(POSTED_INTR_NV, posted_intr_nv),
1105         FIELD(GUEST_ES_SELECTOR, guest_es_selector),
1106         FIELD(GUEST_CS_SELECTOR, guest_cs_selector),
1107         FIELD(GUEST_SS_SELECTOR, guest_ss_selector),
1108         FIELD(GUEST_DS_SELECTOR, guest_ds_selector),
1109         FIELD(GUEST_FS_SELECTOR, guest_fs_selector),
1110         FIELD(GUEST_GS_SELECTOR, guest_gs_selector),
1111         FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector),
1112         FIELD(GUEST_TR_SELECTOR, guest_tr_selector),
1113         FIELD(GUEST_INTR_STATUS, guest_intr_status),
1114         FIELD(GUEST_PML_INDEX, guest_pml_index),
1115         FIELD(HOST_ES_SELECTOR, host_es_selector),
1116         FIELD(HOST_CS_SELECTOR, host_cs_selector),
1117         FIELD(HOST_SS_SELECTOR, host_ss_selector),
1118         FIELD(HOST_DS_SELECTOR, host_ds_selector),
1119         FIELD(HOST_FS_SELECTOR, host_fs_selector),
1120         FIELD(HOST_GS_SELECTOR, host_gs_selector),
1121         FIELD(HOST_TR_SELECTOR, host_tr_selector),
1122         FIELD64(IO_BITMAP_A, io_bitmap_a),
1123         FIELD64(IO_BITMAP_B, io_bitmap_b),
1124         FIELD64(MSR_BITMAP, msr_bitmap),
1125         FIELD64(VM_EXIT_MSR_STORE_ADDR, vm_exit_msr_store_addr),
1126         FIELD64(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr),
1127         FIELD64(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr),
1128         FIELD64(PML_ADDRESS, pml_address),
1129         FIELD64(TSC_OFFSET, tsc_offset),
1130         FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr),
1131         FIELD64(APIC_ACCESS_ADDR, apic_access_addr),
1132         FIELD64(POSTED_INTR_DESC_ADDR, posted_intr_desc_addr),
1133         FIELD64(VM_FUNCTION_CONTROL, vm_function_control),
1134         FIELD64(EPT_POINTER, ept_pointer),
1135         FIELD64(EOI_EXIT_BITMAP0, eoi_exit_bitmap0),
1136         FIELD64(EOI_EXIT_BITMAP1, eoi_exit_bitmap1),
1137         FIELD64(EOI_EXIT_BITMAP2, eoi_exit_bitmap2),
1138         FIELD64(EOI_EXIT_BITMAP3, eoi_exit_bitmap3),
1139         FIELD64(EPTP_LIST_ADDRESS, eptp_list_address),
1140         FIELD64(VMREAD_BITMAP, vmread_bitmap),
1141         FIELD64(VMWRITE_BITMAP, vmwrite_bitmap),
1142         FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap),
1143         FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address),
1144         FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer),
1145         FIELD64(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl),
1146         FIELD64(GUEST_IA32_PAT, guest_ia32_pat),
1147         FIELD64(GUEST_IA32_EFER, guest_ia32_efer),
1148         FIELD64(GUEST_IA32_PERF_GLOBAL_CTRL, guest_ia32_perf_global_ctrl),
1149         FIELD64(GUEST_PDPTR0, guest_pdptr0),
1150         FIELD64(GUEST_PDPTR1, guest_pdptr1),
1151         FIELD64(GUEST_PDPTR2, guest_pdptr2),
1152         FIELD64(GUEST_PDPTR3, guest_pdptr3),
1153         FIELD64(GUEST_BNDCFGS, guest_bndcfgs),
1154         FIELD64(HOST_IA32_PAT, host_ia32_pat),
1155         FIELD64(HOST_IA32_EFER, host_ia32_efer),
1156         FIELD64(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl),
1157         FIELD(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control),
1158         FIELD(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control),
1159         FIELD(EXCEPTION_BITMAP, exception_bitmap),
1160         FIELD(PAGE_FAULT_ERROR_CODE_MASK, page_fault_error_code_mask),
1161         FIELD(PAGE_FAULT_ERROR_CODE_MATCH, page_fault_error_code_match),
1162         FIELD(CR3_TARGET_COUNT, cr3_target_count),
1163         FIELD(VM_EXIT_CONTROLS, vm_exit_controls),
1164         FIELD(VM_EXIT_MSR_STORE_COUNT, vm_exit_msr_store_count),
1165         FIELD(VM_EXIT_MSR_LOAD_COUNT, vm_exit_msr_load_count),
1166         FIELD(VM_ENTRY_CONTROLS, vm_entry_controls),
1167         FIELD(VM_ENTRY_MSR_LOAD_COUNT, vm_entry_msr_load_count),
1168         FIELD(VM_ENTRY_INTR_INFO_FIELD, vm_entry_intr_info_field),
1169         FIELD(VM_ENTRY_EXCEPTION_ERROR_CODE, vm_entry_exception_error_code),
1170         FIELD(VM_ENTRY_INSTRUCTION_LEN, vm_entry_instruction_len),
1171         FIELD(TPR_THRESHOLD, tpr_threshold),
1172         FIELD(SECONDARY_VM_EXEC_CONTROL, secondary_vm_exec_control),
1173         FIELD(VM_INSTRUCTION_ERROR, vm_instruction_error),
1174         FIELD(VM_EXIT_REASON, vm_exit_reason),
1175         FIELD(VM_EXIT_INTR_INFO, vm_exit_intr_info),
1176         FIELD(VM_EXIT_INTR_ERROR_CODE, vm_exit_intr_error_code),
1177         FIELD(IDT_VECTORING_INFO_FIELD, idt_vectoring_info_field),
1178         FIELD(IDT_VECTORING_ERROR_CODE, idt_vectoring_error_code),
1179         FIELD(VM_EXIT_INSTRUCTION_LEN, vm_exit_instruction_len),
1180         FIELD(VMX_INSTRUCTION_INFO, vmx_instruction_info),
1181         FIELD(GUEST_ES_LIMIT, guest_es_limit),
1182         FIELD(GUEST_CS_LIMIT, guest_cs_limit),
1183         FIELD(GUEST_SS_LIMIT, guest_ss_limit),
1184         FIELD(GUEST_DS_LIMIT, guest_ds_limit),
1185         FIELD(GUEST_FS_LIMIT, guest_fs_limit),
1186         FIELD(GUEST_GS_LIMIT, guest_gs_limit),
1187         FIELD(GUEST_LDTR_LIMIT, guest_ldtr_limit),
1188         FIELD(GUEST_TR_LIMIT, guest_tr_limit),
1189         FIELD(GUEST_GDTR_LIMIT, guest_gdtr_limit),
1190         FIELD(GUEST_IDTR_LIMIT, guest_idtr_limit),
1191         FIELD(GUEST_ES_AR_BYTES, guest_es_ar_bytes),
1192         FIELD(GUEST_CS_AR_BYTES, guest_cs_ar_bytes),
1193         FIELD(GUEST_SS_AR_BYTES, guest_ss_ar_bytes),
1194         FIELD(GUEST_DS_AR_BYTES, guest_ds_ar_bytes),
1195         FIELD(GUEST_FS_AR_BYTES, guest_fs_ar_bytes),
1196         FIELD(GUEST_GS_AR_BYTES, guest_gs_ar_bytes),
1197         FIELD(GUEST_LDTR_AR_BYTES, guest_ldtr_ar_bytes),
1198         FIELD(GUEST_TR_AR_BYTES, guest_tr_ar_bytes),
1199         FIELD(GUEST_INTERRUPTIBILITY_INFO, guest_interruptibility_info),
1200         FIELD(GUEST_ACTIVITY_STATE, guest_activity_state),
1201         FIELD(GUEST_SYSENTER_CS, guest_sysenter_cs),
1202         FIELD(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs),
1203         FIELD(VMX_PREEMPTION_TIMER_VALUE, vmx_preemption_timer_value),
1204         FIELD(CR0_GUEST_HOST_MASK, cr0_guest_host_mask),
1205         FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask),
1206         FIELD(CR0_READ_SHADOW, cr0_read_shadow),
1207         FIELD(CR4_READ_SHADOW, cr4_read_shadow),
1208         FIELD(CR3_TARGET_VALUE0, cr3_target_value0),
1209         FIELD(CR3_TARGET_VALUE1, cr3_target_value1),
1210         FIELD(CR3_TARGET_VALUE2, cr3_target_value2),
1211         FIELD(CR3_TARGET_VALUE3, cr3_target_value3),
1212         FIELD(EXIT_QUALIFICATION, exit_qualification),
1213         FIELD(GUEST_LINEAR_ADDRESS, guest_linear_address),
1214         FIELD(GUEST_CR0, guest_cr0),
1215         FIELD(GUEST_CR3, guest_cr3),
1216         FIELD(GUEST_CR4, guest_cr4),
1217         FIELD(GUEST_ES_BASE, guest_es_base),
1218         FIELD(GUEST_CS_BASE, guest_cs_base),
1219         FIELD(GUEST_SS_BASE, guest_ss_base),
1220         FIELD(GUEST_DS_BASE, guest_ds_base),
1221         FIELD(GUEST_FS_BASE, guest_fs_base),
1222         FIELD(GUEST_GS_BASE, guest_gs_base),
1223         FIELD(GUEST_LDTR_BASE, guest_ldtr_base),
1224         FIELD(GUEST_TR_BASE, guest_tr_base),
1225         FIELD(GUEST_GDTR_BASE, guest_gdtr_base),
1226         FIELD(GUEST_IDTR_BASE, guest_idtr_base),
1227         FIELD(GUEST_DR7, guest_dr7),
1228         FIELD(GUEST_RSP, guest_rsp),
1229         FIELD(GUEST_RIP, guest_rip),
1230         FIELD(GUEST_RFLAGS, guest_rflags),
1231         FIELD(GUEST_PENDING_DBG_EXCEPTIONS, guest_pending_dbg_exceptions),
1232         FIELD(GUEST_SYSENTER_ESP, guest_sysenter_esp),
1233         FIELD(GUEST_SYSENTER_EIP, guest_sysenter_eip),
1234         FIELD(HOST_CR0, host_cr0),
1235         FIELD(HOST_CR3, host_cr3),
1236         FIELD(HOST_CR4, host_cr4),
1237         FIELD(HOST_FS_BASE, host_fs_base),
1238         FIELD(HOST_GS_BASE, host_gs_base),
1239         FIELD(HOST_TR_BASE, host_tr_base),
1240         FIELD(HOST_GDTR_BASE, host_gdtr_base),
1241         FIELD(HOST_IDTR_BASE, host_idtr_base),
1242         FIELD(HOST_IA32_SYSENTER_ESP, host_ia32_sysenter_esp),
1243         FIELD(HOST_IA32_SYSENTER_EIP, host_ia32_sysenter_eip),
1244         FIELD(HOST_RSP, host_rsp),
1245         FIELD(HOST_RIP, host_rip),
1246 };
1247
1248 static inline short vmcs_field_to_offset(unsigned long field)
1249 {
1250         const size_t size = ARRAY_SIZE(vmcs_field_to_offset_table);
1251         unsigned short offset;
1252         unsigned index;
1253
1254         if (field >> 15)
1255                 return -ENOENT;
1256
1257         index = ROL16(field, 6);
1258         if (index >= size)
1259                 return -ENOENT;
1260
1261         index = array_index_nospec(index, size);
1262         offset = vmcs_field_to_offset_table[index];
1263         if (offset == 0)
1264                 return -ENOENT;
1265         return offset;
1266 }
1267
1268 static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
1269 {
1270         return to_vmx(vcpu)->nested.cached_vmcs12;
1271 }
1272
1273 static inline struct vmcs12 *get_shadow_vmcs12(struct kvm_vcpu *vcpu)
1274 {
1275         return to_vmx(vcpu)->nested.cached_shadow_vmcs12;
1276 }
1277
1278 static bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu);
1279 static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu);
1280 static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa);
1281 static bool vmx_xsaves_supported(void);
1282 static void vmx_set_segment(struct kvm_vcpu *vcpu,
1283                             struct kvm_segment *var, int seg);
1284 static void vmx_get_segment(struct kvm_vcpu *vcpu,
1285                             struct kvm_segment *var, int seg);
1286 static bool guest_state_valid(struct kvm_vcpu *vcpu);
1287 static u32 vmx_segment_access_rights(struct kvm_segment *var);
1288 static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx);
1289 static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu);
1290 static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked);
1291 static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
1292                                             u16 error_code);
1293 static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu);
1294 static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
1295                                                           u32 msr, int type);
1296
1297 static DEFINE_PER_CPU(struct vmcs *, vmxarea);
1298 static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
1299 /*
1300  * We maintain a per-CPU linked-list of VMCS loaded on that CPU. This is needed
1301  * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it.
1302  */
1303 static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
1304
1305 /*
1306  * We maintian a per-CPU linked-list of vCPU, so in wakeup_handler() we
1307  * can find which vCPU should be waken up.
1308  */
1309 static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu);
1310 static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock);
1311
1312 enum {
1313         VMX_VMREAD_BITMAP,
1314         VMX_VMWRITE_BITMAP,
1315         VMX_BITMAP_NR
1316 };
1317
1318 static unsigned long *vmx_bitmap[VMX_BITMAP_NR];
1319
1320 #define vmx_vmread_bitmap                    (vmx_bitmap[VMX_VMREAD_BITMAP])
1321 #define vmx_vmwrite_bitmap                   (vmx_bitmap[VMX_VMWRITE_BITMAP])
1322
1323 static bool cpu_has_load_ia32_efer;
1324 static bool cpu_has_load_perf_global_ctrl;
1325
1326 static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
1327 static DEFINE_SPINLOCK(vmx_vpid_lock);
1328
1329 static struct vmcs_config {
1330         int size;
1331         int order;
1332         u32 basic_cap;
1333         u32 revision_id;
1334         u32 pin_based_exec_ctrl;
1335         u32 cpu_based_exec_ctrl;
1336         u32 cpu_based_2nd_exec_ctrl;
1337         u32 vmexit_ctrl;
1338         u32 vmentry_ctrl;
1339         struct nested_vmx_msrs nested;
1340 } vmcs_config;
1341
1342 static struct vmx_capability {
1343         u32 ept;
1344         u32 vpid;
1345 } vmx_capability;
1346
1347 #define VMX_SEGMENT_FIELD(seg)                                  \
1348         [VCPU_SREG_##seg] = {                                   \
1349                 .selector = GUEST_##seg##_SELECTOR,             \
1350                 .base = GUEST_##seg##_BASE,                     \
1351                 .limit = GUEST_##seg##_LIMIT,                   \
1352                 .ar_bytes = GUEST_##seg##_AR_BYTES,             \
1353         }
1354
1355 static const struct kvm_vmx_segment_field {
1356         unsigned selector;
1357         unsigned base;
1358         unsigned limit;
1359         unsigned ar_bytes;
1360 } kvm_vmx_segment_fields[] = {
1361         VMX_SEGMENT_FIELD(CS),
1362         VMX_SEGMENT_FIELD(DS),
1363         VMX_SEGMENT_FIELD(ES),
1364         VMX_SEGMENT_FIELD(FS),
1365         VMX_SEGMENT_FIELD(GS),
1366         VMX_SEGMENT_FIELD(SS),
1367         VMX_SEGMENT_FIELD(TR),
1368         VMX_SEGMENT_FIELD(LDTR),
1369 };
1370
1371 static u64 host_efer;
1372
1373 static void ept_save_pdptrs(struct kvm_vcpu *vcpu);
1374
1375 /*
1376  * Keep MSR_STAR at the end, as setup_msrs() will try to optimize it
1377  * away by decrementing the array size.
1378  */
1379 static const u32 vmx_msr_index[] = {
1380 #ifdef CONFIG_X86_64
1381         MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
1382 #endif
1383         MSR_EFER, MSR_TSC_AUX, MSR_STAR,
1384 };
1385
1386 DEFINE_STATIC_KEY_FALSE(enable_evmcs);
1387
1388 #define current_evmcs ((struct hv_enlightened_vmcs *)this_cpu_read(current_vmcs))
1389
1390 #define KVM_EVMCS_VERSION 1
1391
1392 #if IS_ENABLED(CONFIG_HYPERV)
1393 static bool __read_mostly enlightened_vmcs = true;
1394 module_param(enlightened_vmcs, bool, 0444);
1395
1396 static inline void evmcs_write64(unsigned long field, u64 value)
1397 {
1398         u16 clean_field;
1399         int offset = get_evmcs_offset(field, &clean_field);
1400
1401         if (offset < 0)
1402                 return;
1403
1404         *(u64 *)((char *)current_evmcs + offset) = value;
1405
1406         current_evmcs->hv_clean_fields &= ~clean_field;
1407 }
1408
1409 static inline void evmcs_write32(unsigned long field, u32 value)
1410 {
1411         u16 clean_field;
1412         int offset = get_evmcs_offset(field, &clean_field);
1413
1414         if (offset < 0)
1415                 return;
1416
1417         *(u32 *)((char *)current_evmcs + offset) = value;
1418         current_evmcs->hv_clean_fields &= ~clean_field;
1419 }
1420
1421 static inline void evmcs_write16(unsigned long field, u16 value)
1422 {
1423         u16 clean_field;
1424         int offset = get_evmcs_offset(field, &clean_field);
1425
1426         if (offset < 0)
1427                 return;
1428
1429         *(u16 *)((char *)current_evmcs + offset) = value;
1430         current_evmcs->hv_clean_fields &= ~clean_field;
1431 }
1432
1433 static inline u64 evmcs_read64(unsigned long field)
1434 {
1435         int offset = get_evmcs_offset(field, NULL);
1436
1437         if (offset < 0)
1438                 return 0;
1439
1440         return *(u64 *)((char *)current_evmcs + offset);
1441 }
1442
1443 static inline u32 evmcs_read32(unsigned long field)
1444 {
1445         int offset = get_evmcs_offset(field, NULL);
1446
1447         if (offset < 0)
1448                 return 0;
1449
1450         return *(u32 *)((char *)current_evmcs + offset);
1451 }
1452
1453 static inline u16 evmcs_read16(unsigned long field)
1454 {
1455         int offset = get_evmcs_offset(field, NULL);
1456
1457         if (offset < 0)
1458                 return 0;
1459
1460         return *(u16 *)((char *)current_evmcs + offset);
1461 }
1462
1463 static inline void evmcs_touch_msr_bitmap(void)
1464 {
1465         if (unlikely(!current_evmcs))
1466                 return;
1467
1468         if (current_evmcs->hv_enlightenments_control.msr_bitmap)
1469                 current_evmcs->hv_clean_fields &=
1470                         ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP;
1471 }
1472
1473 static void evmcs_load(u64 phys_addr)
1474 {
1475         struct hv_vp_assist_page *vp_ap =
1476                 hv_get_vp_assist_page(smp_processor_id());
1477
1478         vp_ap->current_nested_vmcs = phys_addr;
1479         vp_ap->enlighten_vmentry = 1;
1480 }
1481
1482 static void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf)
1483 {
1484         /*
1485          * Enlightened VMCSv1 doesn't support these:
1486          *
1487          *      POSTED_INTR_NV                  = 0x00000002,
1488          *      GUEST_INTR_STATUS               = 0x00000810,
1489          *      APIC_ACCESS_ADDR                = 0x00002014,
1490          *      POSTED_INTR_DESC_ADDR           = 0x00002016,
1491          *      EOI_EXIT_BITMAP0                = 0x0000201c,
1492          *      EOI_EXIT_BITMAP1                = 0x0000201e,
1493          *      EOI_EXIT_BITMAP2                = 0x00002020,
1494          *      EOI_EXIT_BITMAP3                = 0x00002022,
1495          */
1496         vmcs_conf->pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
1497         vmcs_conf->cpu_based_2nd_exec_ctrl &=
1498                 ~SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
1499         vmcs_conf->cpu_based_2nd_exec_ctrl &=
1500                 ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
1501         vmcs_conf->cpu_based_2nd_exec_ctrl &=
1502                 ~SECONDARY_EXEC_APIC_REGISTER_VIRT;
1503
1504         /*
1505          *      GUEST_PML_INDEX                 = 0x00000812,
1506          *      PML_ADDRESS                     = 0x0000200e,
1507          */
1508         vmcs_conf->cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_ENABLE_PML;
1509
1510         /*      VM_FUNCTION_CONTROL             = 0x00002018, */
1511         vmcs_conf->cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_ENABLE_VMFUNC;
1512
1513         /*
1514          *      EPTP_LIST_ADDRESS               = 0x00002024,
1515          *      VMREAD_BITMAP                   = 0x00002026,
1516          *      VMWRITE_BITMAP                  = 0x00002028,
1517          */
1518         vmcs_conf->cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_SHADOW_VMCS;
1519
1520         /*
1521          *      TSC_MULTIPLIER                  = 0x00002032,
1522          */
1523         vmcs_conf->cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_TSC_SCALING;
1524
1525         /*
1526          *      PLE_GAP                         = 0x00004020,
1527          *      PLE_WINDOW                      = 0x00004022,
1528          */
1529         vmcs_conf->cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
1530
1531         /*
1532          *      VMX_PREEMPTION_TIMER_VALUE      = 0x0000482E,
1533          */
1534         vmcs_conf->pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
1535
1536         /*
1537          *      GUEST_IA32_PERF_GLOBAL_CTRL     = 0x00002808,
1538          *      HOST_IA32_PERF_GLOBAL_CTRL      = 0x00002c04,
1539          */
1540         vmcs_conf->vmexit_ctrl &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
1541         vmcs_conf->vmentry_ctrl &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
1542
1543         /*
1544          * Currently unsupported in KVM:
1545          *      GUEST_IA32_RTIT_CTL             = 0x00002814,
1546          */
1547 }
1548
1549 /* check_ept_pointer() should be under protection of ept_pointer_lock. */
1550 static void check_ept_pointer_match(struct kvm *kvm)
1551 {
1552         struct kvm_vcpu *vcpu;
1553         u64 tmp_eptp = INVALID_PAGE;
1554         int i;
1555
1556         kvm_for_each_vcpu(i, vcpu, kvm) {
1557                 if (!VALID_PAGE(tmp_eptp)) {
1558                         tmp_eptp = to_vmx(vcpu)->ept_pointer;
1559                 } else if (tmp_eptp != to_vmx(vcpu)->ept_pointer) {
1560                         to_kvm_vmx(kvm)->ept_pointers_match
1561                                 = EPT_POINTERS_MISMATCH;
1562                         return;
1563                 }
1564         }
1565
1566         to_kvm_vmx(kvm)->ept_pointers_match = EPT_POINTERS_MATCH;
1567 }
1568
1569 static int vmx_hv_remote_flush_tlb(struct kvm *kvm)
1570 {
1571         int ret;
1572
1573         spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock);
1574
1575         if (to_kvm_vmx(kvm)->ept_pointers_match == EPT_POINTERS_CHECK)
1576                 check_ept_pointer_match(kvm);
1577
1578         if (to_kvm_vmx(kvm)->ept_pointers_match != EPT_POINTERS_MATCH) {
1579                 ret = -ENOTSUPP;
1580                 goto out;
1581         }
1582
1583         /*
1584          * FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE hypercall needs the address of the
1585          * base of EPT PML4 table, strip off EPT configuration information.
1586          */
1587         ret = hyperv_flush_guest_mapping(
1588                         to_vmx(kvm_get_vcpu(kvm, 0))->ept_pointer & PAGE_MASK);
1589
1590 out:
1591         spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock);
1592         return ret;
1593 }
1594 #else /* !IS_ENABLED(CONFIG_HYPERV) */
1595 static inline void evmcs_write64(unsigned long field, u64 value) {}
1596 static inline void evmcs_write32(unsigned long field, u32 value) {}
1597 static inline void evmcs_write16(unsigned long field, u16 value) {}
1598 static inline u64 evmcs_read64(unsigned long field) { return 0; }
1599 static inline u32 evmcs_read32(unsigned long field) { return 0; }
1600 static inline u16 evmcs_read16(unsigned long field) { return 0; }
1601 static inline void evmcs_load(u64 phys_addr) {}
1602 static inline void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) {}
1603 static inline void evmcs_touch_msr_bitmap(void) {}
1604 #endif /* IS_ENABLED(CONFIG_HYPERV) */
1605
1606 static inline bool is_exception_n(u32 intr_info, u8 vector)
1607 {
1608         return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
1609                              INTR_INFO_VALID_MASK)) ==
1610                 (INTR_TYPE_HARD_EXCEPTION | vector | INTR_INFO_VALID_MASK);
1611 }
1612
1613 static inline bool is_debug(u32 intr_info)
1614 {
1615         return is_exception_n(intr_info, DB_VECTOR);
1616 }
1617
1618 static inline bool is_breakpoint(u32 intr_info)
1619 {
1620         return is_exception_n(intr_info, BP_VECTOR);
1621 }
1622
1623 static inline bool is_page_fault(u32 intr_info)
1624 {
1625         return is_exception_n(intr_info, PF_VECTOR);
1626 }
1627
1628 static inline bool is_no_device(u32 intr_info)
1629 {
1630         return is_exception_n(intr_info, NM_VECTOR);
1631 }
1632
1633 static inline bool is_invalid_opcode(u32 intr_info)
1634 {
1635         return is_exception_n(intr_info, UD_VECTOR);
1636 }
1637
1638 static inline bool is_gp_fault(u32 intr_info)
1639 {
1640         return is_exception_n(intr_info, GP_VECTOR);
1641 }
1642
1643 static inline bool is_external_interrupt(u32 intr_info)
1644 {
1645         return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
1646                 == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
1647 }
1648
1649 static inline bool is_machine_check(u32 intr_info)
1650 {
1651         return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
1652                              INTR_INFO_VALID_MASK)) ==
1653                 (INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK);
1654 }
1655
1656 /* Undocumented: icebp/int1 */
1657 static inline bool is_icebp(u32 intr_info)
1658 {
1659         return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
1660                 == (INTR_TYPE_PRIV_SW_EXCEPTION | INTR_INFO_VALID_MASK);
1661 }
1662
1663 static inline bool cpu_has_vmx_msr_bitmap(void)
1664 {
1665         return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS;
1666 }
1667
1668 static inline bool cpu_has_vmx_tpr_shadow(void)
1669 {
1670         return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW;
1671 }
1672
1673 static inline bool cpu_need_tpr_shadow(struct kvm_vcpu *vcpu)
1674 {
1675         return cpu_has_vmx_tpr_shadow() && lapic_in_kernel(vcpu);
1676 }
1677
1678 static inline bool cpu_has_secondary_exec_ctrls(void)
1679 {
1680         return vmcs_config.cpu_based_exec_ctrl &
1681                 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
1682 }
1683
1684 static inline bool cpu_has_vmx_virtualize_apic_accesses(void)
1685 {
1686         return vmcs_config.cpu_based_2nd_exec_ctrl &
1687                 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
1688 }
1689
1690 static inline bool cpu_has_vmx_virtualize_x2apic_mode(void)
1691 {
1692         return vmcs_config.cpu_based_2nd_exec_ctrl &
1693                 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
1694 }
1695
1696 static inline bool cpu_has_vmx_apic_register_virt(void)
1697 {
1698         return vmcs_config.cpu_based_2nd_exec_ctrl &
1699                 SECONDARY_EXEC_APIC_REGISTER_VIRT;
1700 }
1701
1702 static inline bool cpu_has_vmx_virtual_intr_delivery(void)
1703 {
1704         return vmcs_config.cpu_based_2nd_exec_ctrl &
1705                 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
1706 }
1707
1708 static inline bool cpu_has_vmx_encls_vmexit(void)
1709 {
1710         return vmcs_config.cpu_based_2nd_exec_ctrl &
1711                 SECONDARY_EXEC_ENCLS_EXITING;
1712 }
1713
1714 /*
1715  * Comment's format: document - errata name - stepping - processor name.
1716  * Refer from
1717  * https://www.virtualbox.org/svn/vbox/trunk/src/VBox/VMM/VMMR0/HMR0.cpp
1718  */
1719 static u32 vmx_preemption_cpu_tfms[] = {
1720 /* 323344.pdf - BA86   - D0 - Xeon 7500 Series */
1721 0x000206E6,
1722 /* 323056.pdf - AAX65  - C2 - Xeon L3406 */
1723 /* 322814.pdf - AAT59  - C2 - i7-600, i5-500, i5-400 and i3-300 Mobile */
1724 /* 322911.pdf - AAU65  - C2 - i5-600, i3-500 Desktop and Pentium G6950 */
1725 0x00020652,
1726 /* 322911.pdf - AAU65  - K0 - i5-600, i3-500 Desktop and Pentium G6950 */
1727 0x00020655,
1728 /* 322373.pdf - AAO95  - B1 - Xeon 3400 Series */
1729 /* 322166.pdf - AAN92  - B1 - i7-800 and i5-700 Desktop */
1730 /*
1731  * 320767.pdf - AAP86  - B1 -
1732  * i7-900 Mobile Extreme, i7-800 and i7-700 Mobile
1733  */
1734 0x000106E5,
1735 /* 321333.pdf - AAM126 - C0 - Xeon 3500 */
1736 0x000106A0,
1737 /* 321333.pdf - AAM126 - C1 - Xeon 3500 */
1738 0x000106A1,
1739 /* 320836.pdf - AAJ124 - C0 - i7-900 Desktop Extreme and i7-900 Desktop */
1740 0x000106A4,
1741  /* 321333.pdf - AAM126 - D0 - Xeon 3500 */
1742  /* 321324.pdf - AAK139 - D0 - Xeon 5500 */
1743  /* 320836.pdf - AAJ124 - D0 - i7-900 Extreme and i7-900 Desktop */
1744 0x000106A5,
1745 };
1746
1747 static inline bool cpu_has_broken_vmx_preemption_timer(void)
1748 {
1749         u32 eax = cpuid_eax(0x00000001), i;
1750
1751         /* Clear the reserved bits */
1752         eax &= ~(0x3U << 14 | 0xfU << 28);
1753         for (i = 0; i < ARRAY_SIZE(vmx_preemption_cpu_tfms); i++)
1754                 if (eax == vmx_preemption_cpu_tfms[i])
1755                         return true;
1756
1757         return false;
1758 }
1759
1760 static inline bool cpu_has_vmx_preemption_timer(void)
1761 {
1762         return vmcs_config.pin_based_exec_ctrl &
1763                 PIN_BASED_VMX_PREEMPTION_TIMER;
1764 }
1765
1766 static inline bool cpu_has_vmx_posted_intr(void)
1767 {
1768         return IS_ENABLED(CONFIG_X86_LOCAL_APIC) &&
1769                 vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR;
1770 }
1771
1772 static inline bool cpu_has_vmx_apicv(void)
1773 {
1774         return cpu_has_vmx_apic_register_virt() &&
1775                 cpu_has_vmx_virtual_intr_delivery() &&
1776                 cpu_has_vmx_posted_intr();
1777 }
1778
1779 static inline bool cpu_has_vmx_flexpriority(void)
1780 {
1781         return cpu_has_vmx_tpr_shadow() &&
1782                 cpu_has_vmx_virtualize_apic_accesses();
1783 }
1784
1785 static inline bool cpu_has_vmx_ept_execute_only(void)
1786 {
1787         return vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT;
1788 }
1789
1790 static inline bool cpu_has_vmx_ept_2m_page(void)
1791 {
1792         return vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT;
1793 }
1794
1795 static inline bool cpu_has_vmx_ept_1g_page(void)
1796 {
1797         return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT;
1798 }
1799
1800 static inline bool cpu_has_vmx_ept_4levels(void)
1801 {
1802         return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT;
1803 }
1804
1805 static inline bool cpu_has_vmx_ept_mt_wb(void)
1806 {
1807         return vmx_capability.ept & VMX_EPTP_WB_BIT;
1808 }
1809
1810 static inline bool cpu_has_vmx_ept_5levels(void)
1811 {
1812         return vmx_capability.ept & VMX_EPT_PAGE_WALK_5_BIT;
1813 }
1814
1815 static inline bool cpu_has_vmx_ept_ad_bits(void)
1816 {
1817         return vmx_capability.ept & VMX_EPT_AD_BIT;
1818 }
1819
1820 static inline bool cpu_has_vmx_invept_context(void)
1821 {
1822         return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT;
1823 }
1824
1825 static inline bool cpu_has_vmx_invept_global(void)
1826 {
1827         return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT;
1828 }
1829
1830 static inline bool cpu_has_vmx_invvpid_individual_addr(void)
1831 {
1832         return vmx_capability.vpid & VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT;
1833 }
1834
1835 static inline bool cpu_has_vmx_invvpid_single(void)
1836 {
1837         return vmx_capability.vpid & VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT;
1838 }
1839
1840 static inline bool cpu_has_vmx_invvpid_global(void)
1841 {
1842         return vmx_capability.vpid & VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT;
1843 }
1844
1845 static inline bool cpu_has_vmx_invvpid(void)
1846 {
1847         return vmx_capability.vpid & VMX_VPID_INVVPID_BIT;
1848 }
1849
1850 static inline bool cpu_has_vmx_ept(void)
1851 {
1852         return vmcs_config.cpu_based_2nd_exec_ctrl &
1853                 SECONDARY_EXEC_ENABLE_EPT;
1854 }
1855
1856 static inline bool cpu_has_vmx_unrestricted_guest(void)
1857 {
1858         return vmcs_config.cpu_based_2nd_exec_ctrl &
1859                 SECONDARY_EXEC_UNRESTRICTED_GUEST;
1860 }
1861
1862 static inline bool cpu_has_vmx_ple(void)
1863 {
1864         return vmcs_config.cpu_based_2nd_exec_ctrl &
1865                 SECONDARY_EXEC_PAUSE_LOOP_EXITING;
1866 }
1867
1868 static inline bool cpu_has_vmx_basic_inout(void)
1869 {
1870         return  (((u64)vmcs_config.basic_cap << 32) & VMX_BASIC_INOUT);
1871 }
1872
1873 static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu)
1874 {
1875         return flexpriority_enabled && lapic_in_kernel(vcpu);
1876 }
1877
1878 static inline bool cpu_has_vmx_vpid(void)
1879 {
1880         return vmcs_config.cpu_based_2nd_exec_ctrl &
1881                 SECONDARY_EXEC_ENABLE_VPID;
1882 }
1883
1884 static inline bool cpu_has_vmx_rdtscp(void)
1885 {
1886         return vmcs_config.cpu_based_2nd_exec_ctrl &
1887                 SECONDARY_EXEC_RDTSCP;
1888 }
1889
1890 static inline bool cpu_has_vmx_invpcid(void)
1891 {
1892         return vmcs_config.cpu_based_2nd_exec_ctrl &
1893                 SECONDARY_EXEC_ENABLE_INVPCID;
1894 }
1895
1896 static inline bool cpu_has_virtual_nmis(void)
1897 {
1898         return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
1899 }
1900
1901 static inline bool cpu_has_vmx_wbinvd_exit(void)
1902 {
1903         return vmcs_config.cpu_based_2nd_exec_ctrl &
1904                 SECONDARY_EXEC_WBINVD_EXITING;
1905 }
1906
1907 static inline bool cpu_has_vmx_shadow_vmcs(void)
1908 {
1909         u64 vmx_msr;
1910         rdmsrl(MSR_IA32_VMX_MISC, vmx_msr);
1911         /* check if the cpu supports writing r/o exit information fields */
1912         if (!(vmx_msr & MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS))
1913                 return false;
1914
1915         return vmcs_config.cpu_based_2nd_exec_ctrl &
1916                 SECONDARY_EXEC_SHADOW_VMCS;
1917 }
1918
1919 static inline bool cpu_has_vmx_pml(void)
1920 {
1921         return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_PML;
1922 }
1923
1924 static inline bool cpu_has_vmx_tsc_scaling(void)
1925 {
1926         return vmcs_config.cpu_based_2nd_exec_ctrl &
1927                 SECONDARY_EXEC_TSC_SCALING;
1928 }
1929
1930 static inline bool cpu_has_vmx_vmfunc(void)
1931 {
1932         return vmcs_config.cpu_based_2nd_exec_ctrl &
1933                 SECONDARY_EXEC_ENABLE_VMFUNC;
1934 }
1935
1936 static bool vmx_umip_emulated(void)
1937 {
1938         return vmcs_config.cpu_based_2nd_exec_ctrl &
1939                 SECONDARY_EXEC_DESC;
1940 }
1941
1942 static inline bool report_flexpriority(void)
1943 {
1944         return flexpriority_enabled;
1945 }
1946
1947 static inline unsigned nested_cpu_vmx_misc_cr3_count(struct kvm_vcpu *vcpu)
1948 {
1949         return vmx_misc_cr3_count(to_vmx(vcpu)->nested.msrs.misc_low);
1950 }
1951
1952 /*
1953  * Do the virtual VMX capability MSRs specify that L1 can use VMWRITE
1954  * to modify any valid field of the VMCS, or are the VM-exit
1955  * information fields read-only?
1956  */
1957 static inline bool nested_cpu_has_vmwrite_any_field(struct kvm_vcpu *vcpu)
1958 {
1959         return to_vmx(vcpu)->nested.msrs.misc_low &
1960                 MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS;
1961 }
1962
1963 static inline bool nested_cpu_has_zero_length_injection(struct kvm_vcpu *vcpu)
1964 {
1965         return to_vmx(vcpu)->nested.msrs.misc_low & VMX_MISC_ZERO_LEN_INS;
1966 }
1967
1968 static inline bool nested_cpu_supports_monitor_trap_flag(struct kvm_vcpu *vcpu)
1969 {
1970         return to_vmx(vcpu)->nested.msrs.procbased_ctls_high &
1971                         CPU_BASED_MONITOR_TRAP_FLAG;
1972 }
1973
1974 static inline bool nested_cpu_has_vmx_shadow_vmcs(struct kvm_vcpu *vcpu)
1975 {
1976         return to_vmx(vcpu)->nested.msrs.secondary_ctls_high &
1977                 SECONDARY_EXEC_SHADOW_VMCS;
1978 }
1979
1980 static inline bool nested_cpu_has(struct vmcs12 *vmcs12, u32 bit)
1981 {
1982         return vmcs12->cpu_based_vm_exec_control & bit;
1983 }
1984
1985 static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit)
1986 {
1987         return (vmcs12->cpu_based_vm_exec_control &
1988                         CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
1989                 (vmcs12->secondary_vm_exec_control & bit);
1990 }
1991
1992 static inline bool nested_cpu_has_preemption_timer(struct vmcs12 *vmcs12)
1993 {
1994         return vmcs12->pin_based_vm_exec_control &
1995                 PIN_BASED_VMX_PREEMPTION_TIMER;
1996 }
1997
1998 static inline bool nested_cpu_has_nmi_exiting(struct vmcs12 *vmcs12)
1999 {
2000         return vmcs12->pin_based_vm_exec_control & PIN_BASED_NMI_EXITING;
2001 }
2002
2003 static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12)
2004 {
2005         return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS;
2006 }
2007
2008 static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12)
2009 {
2010         return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT);
2011 }
2012
2013 static inline bool nested_cpu_has_xsaves(struct vmcs12 *vmcs12)
2014 {
2015         return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
2016 }
2017
2018 static inline bool nested_cpu_has_pml(struct vmcs12 *vmcs12)
2019 {
2020         return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_PML);
2021 }
2022
2023 static inline bool nested_cpu_has_virt_x2apic_mode(struct vmcs12 *vmcs12)
2024 {
2025         return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
2026 }
2027
2028 static inline bool nested_cpu_has_vpid(struct vmcs12 *vmcs12)
2029 {
2030         return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VPID);
2031 }
2032
2033 static inline bool nested_cpu_has_apic_reg_virt(struct vmcs12 *vmcs12)
2034 {
2035         return nested_cpu_has2(vmcs12, SECONDARY_EXEC_APIC_REGISTER_VIRT);
2036 }
2037
2038 static inline bool nested_cpu_has_vid(struct vmcs12 *vmcs12)
2039 {
2040         return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
2041 }
2042
2043 static inline bool nested_cpu_has_posted_intr(struct vmcs12 *vmcs12)
2044 {
2045         return vmcs12->pin_based_vm_exec_control & PIN_BASED_POSTED_INTR;
2046 }
2047
2048 static inline bool nested_cpu_has_vmfunc(struct vmcs12 *vmcs12)
2049 {
2050         return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VMFUNC);
2051 }
2052
2053 static inline bool nested_cpu_has_eptp_switching(struct vmcs12 *vmcs12)
2054 {
2055         return nested_cpu_has_vmfunc(vmcs12) &&
2056                 (vmcs12->vm_function_control &
2057                  VMX_VMFUNC_EPTP_SWITCHING);
2058 }
2059
2060 static inline bool nested_cpu_has_shadow_vmcs(struct vmcs12 *vmcs12)
2061 {
2062         return nested_cpu_has2(vmcs12, SECONDARY_EXEC_SHADOW_VMCS);
2063 }
2064
2065 static inline bool is_nmi(u32 intr_info)
2066 {
2067         return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
2068                 == (INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK);
2069 }
2070
2071 static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
2072                               u32 exit_intr_info,
2073                               unsigned long exit_qualification);
2074 static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu,
2075                         struct vmcs12 *vmcs12,
2076                         u32 reason, unsigned long qualification);
2077
2078 static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
2079 {
2080         int i;
2081
2082         for (i = 0; i < vmx->nmsrs; ++i)
2083                 if (vmx_msr_index[vmx->guest_msrs[i].index] == msr)
2084                         return i;
2085         return -1;
2086 }
2087
2088 static inline void __invvpid(unsigned long ext, u16 vpid, gva_t gva)
2089 {
2090     struct {
2091         u64 vpid : 16;
2092         u64 rsvd : 48;
2093         u64 gva;
2094     } operand = { vpid, 0, gva };
2095     bool error;
2096
2097     asm volatile (__ex(ASM_VMX_INVVPID) CC_SET(na)
2098                   : CC_OUT(na) (error) : "a"(&operand), "c"(ext)
2099                   : "memory");
2100     BUG_ON(error);
2101 }
2102
2103 static inline void __invept(unsigned long ext, u64 eptp, gpa_t gpa)
2104 {
2105         struct {
2106                 u64 eptp, gpa;
2107         } operand = {eptp, gpa};
2108         bool error;
2109
2110         asm volatile (__ex(ASM_VMX_INVEPT) CC_SET(na)
2111                       : CC_OUT(na) (error) : "a" (&operand), "c" (ext)
2112                       : "memory");
2113         BUG_ON(error);
2114 }
2115
2116 static void vmx_setup_fb_clear_ctrl(void)
2117 {
2118         u64 msr;
2119
2120         if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES) &&
2121             !boot_cpu_has_bug(X86_BUG_MDS) &&
2122             !boot_cpu_has_bug(X86_BUG_TAA)) {
2123                 rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr);
2124                 if (msr & ARCH_CAP_FB_CLEAR_CTRL)
2125                         vmx_fb_clear_ctrl_available = true;
2126         }
2127 }
2128
2129 static __always_inline void vmx_disable_fb_clear(struct vcpu_vmx *vmx)
2130 {
2131         u64 msr;
2132
2133         if (!vmx->disable_fb_clear)
2134                 return;
2135
2136         msr = __rdmsr(MSR_IA32_MCU_OPT_CTRL);
2137         msr |= FB_CLEAR_DIS;
2138         native_wrmsrl(MSR_IA32_MCU_OPT_CTRL, msr);
2139         /* Cache the MSR value to avoid reading it later */
2140         vmx->msr_ia32_mcu_opt_ctrl = msr;
2141 }
2142
2143 static __always_inline void vmx_enable_fb_clear(struct vcpu_vmx *vmx)
2144 {
2145         if (!vmx->disable_fb_clear)
2146                 return;
2147
2148         vmx->msr_ia32_mcu_opt_ctrl &= ~FB_CLEAR_DIS;
2149         native_wrmsrl(MSR_IA32_MCU_OPT_CTRL, vmx->msr_ia32_mcu_opt_ctrl);
2150 }
2151
2152 static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx)
2153 {
2154         vmx->disable_fb_clear = vmx_fb_clear_ctrl_available;
2155
2156         /*
2157          * If guest will not execute VERW, there is no need to set FB_CLEAR_DIS
2158          * at VMEntry. Skip the MSR read/write when a guest has no use case to
2159          * execute VERW.
2160          */
2161         if ((vcpu->arch.arch_capabilities & ARCH_CAP_FB_CLEAR) ||
2162            ((vcpu->arch.arch_capabilities & ARCH_CAP_MDS_NO) &&
2163             (vcpu->arch.arch_capabilities & ARCH_CAP_TAA_NO) &&
2164             (vcpu->arch.arch_capabilities & ARCH_CAP_PSDP_NO) &&
2165             (vcpu->arch.arch_capabilities & ARCH_CAP_FBSDP_NO) &&
2166             (vcpu->arch.arch_capabilities & ARCH_CAP_SBDR_SSDP_NO)))
2167                 vmx->disable_fb_clear = false;
2168 }
2169
2170 static struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
2171 {
2172         int i;
2173
2174         i = __find_msr_index(vmx, msr);
2175         if (i >= 0)
2176                 return &vmx->guest_msrs[i];
2177         return NULL;
2178 }
2179
2180 static void vmcs_clear(struct vmcs *vmcs)
2181 {
2182         u64 phys_addr = __pa(vmcs);
2183         bool error;
2184
2185         asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) CC_SET(na)
2186                       : CC_OUT(na) (error) : "a"(&phys_addr), "m"(phys_addr)
2187                       : "memory");
2188         if (unlikely(error))
2189                 printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n",
2190                        vmcs, phys_addr);
2191 }
2192
2193 static inline void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs)
2194 {
2195         vmcs_clear(loaded_vmcs->vmcs);
2196         if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched)
2197                 vmcs_clear(loaded_vmcs->shadow_vmcs);
2198         loaded_vmcs->cpu = -1;
2199         loaded_vmcs->launched = 0;
2200 }
2201
2202 static void vmcs_load(struct vmcs *vmcs)
2203 {
2204         u64 phys_addr = __pa(vmcs);
2205         bool error;
2206
2207         if (static_branch_unlikely(&enable_evmcs))
2208                 return evmcs_load(phys_addr);
2209
2210         asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) CC_SET(na)
2211                       : CC_OUT(na) (error) : "a"(&phys_addr), "m"(phys_addr)
2212                       : "memory");
2213         if (unlikely(error))
2214                 printk(KERN_ERR "kvm: vmptrld %p/%llx failed\n",
2215                        vmcs, phys_addr);
2216 }
2217
2218 #ifdef CONFIG_KEXEC_CORE
2219 static void crash_vmclear_local_loaded_vmcss(void)
2220 {
2221         int cpu = raw_smp_processor_id();
2222         struct loaded_vmcs *v;
2223
2224         list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
2225                             loaded_vmcss_on_cpu_link)
2226                 vmcs_clear(v->vmcs);
2227 }
2228 #endif /* CONFIG_KEXEC_CORE */
2229
2230 static void __loaded_vmcs_clear(void *arg)
2231 {
2232         struct loaded_vmcs *loaded_vmcs = arg;
2233         int cpu = raw_smp_processor_id();
2234
2235         if (loaded_vmcs->cpu != cpu)
2236                 return; /* vcpu migration can race with cpu offline */
2237         if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
2238                 per_cpu(current_vmcs, cpu) = NULL;
2239
2240         vmcs_clear(loaded_vmcs->vmcs);
2241         if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched)
2242                 vmcs_clear(loaded_vmcs->shadow_vmcs);
2243
2244         list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
2245
2246         /*
2247          * Ensure all writes to loaded_vmcs, including deleting it from its
2248          * current percpu list, complete before setting loaded_vmcs->vcpu to
2249          * -1, otherwise a different cpu can see vcpu == -1 first and add
2250          * loaded_vmcs to its percpu list before it's deleted from this cpu's
2251          * list. Pairs with the smp_rmb() in vmx_vcpu_load_vmcs().
2252          */
2253         smp_wmb();
2254
2255         loaded_vmcs->cpu = -1;
2256         loaded_vmcs->launched = 0;
2257 }
2258
2259 static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
2260 {
2261         int cpu = loaded_vmcs->cpu;
2262
2263         if (cpu != -1)
2264                 smp_call_function_single(cpu,
2265                          __loaded_vmcs_clear, loaded_vmcs, 1);
2266 }
2267
2268 static inline bool vpid_sync_vcpu_addr(int vpid, gva_t addr)
2269 {
2270         if (vpid == 0)
2271                 return true;
2272
2273         if (cpu_has_vmx_invvpid_individual_addr()) {
2274                 __invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR, vpid, addr);
2275                 return true;
2276         }
2277
2278         return false;
2279 }
2280
2281 static inline void vpid_sync_vcpu_single(int vpid)
2282 {
2283         if (vpid == 0)
2284                 return;
2285
2286         if (cpu_has_vmx_invvpid_single())
2287                 __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vpid, 0);
2288 }
2289
2290 static inline void vpid_sync_vcpu_global(void)
2291 {
2292         if (cpu_has_vmx_invvpid_global())
2293                 __invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0);
2294 }
2295
2296 static inline void vpid_sync_context(int vpid)
2297 {
2298         if (cpu_has_vmx_invvpid_single())
2299                 vpid_sync_vcpu_single(vpid);
2300         else
2301                 vpid_sync_vcpu_global();
2302 }
2303
2304 static inline void ept_sync_global(void)
2305 {
2306         __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0);
2307 }
2308
2309 static inline void ept_sync_context(u64 eptp)
2310 {
2311         if (cpu_has_vmx_invept_context())
2312                 __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0);
2313         else
2314                 ept_sync_global();
2315 }
2316
2317 static __always_inline void vmcs_check16(unsigned long field)
2318 {
2319         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000,
2320                          "16-bit accessor invalid for 64-bit field");
2321         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001,
2322                          "16-bit accessor invalid for 64-bit high field");
2323         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000,
2324                          "16-bit accessor invalid for 32-bit high field");
2325         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000,
2326                          "16-bit accessor invalid for natural width field");
2327 }
2328
2329 static __always_inline void vmcs_check32(unsigned long field)
2330 {
2331         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0,
2332                          "32-bit accessor invalid for 16-bit field");
2333         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000,
2334                          "32-bit accessor invalid for natural width field");
2335 }
2336
2337 static __always_inline void vmcs_check64(unsigned long field)
2338 {
2339         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0,
2340                          "64-bit accessor invalid for 16-bit field");
2341         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001,
2342                          "64-bit accessor invalid for 64-bit high field");
2343         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000,
2344                          "64-bit accessor invalid for 32-bit field");
2345         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000,
2346                          "64-bit accessor invalid for natural width field");
2347 }
2348
2349 static __always_inline void vmcs_checkl(unsigned long field)
2350 {
2351         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0,
2352                          "Natural width accessor invalid for 16-bit field");
2353         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000,
2354                          "Natural width accessor invalid for 64-bit field");
2355         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001,
2356                          "Natural width accessor invalid for 64-bit high field");
2357         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000,
2358                          "Natural width accessor invalid for 32-bit field");
2359 }
2360
2361 static __always_inline unsigned long __vmcs_readl(unsigned long field)
2362 {
2363         unsigned long value;
2364
2365         asm volatile (__ex_clear(ASM_VMX_VMREAD_RDX_RAX, "%0")
2366                       : "=a"(value) : "d"(field) : "cc");
2367         return value;
2368 }
2369
2370 static __always_inline u16 vmcs_read16(unsigned long field)
2371 {
2372         vmcs_check16(field);
2373         if (static_branch_unlikely(&enable_evmcs))
2374                 return evmcs_read16(field);
2375         return __vmcs_readl(field);
2376 }
2377
2378 static __always_inline u32 vmcs_read32(unsigned long field)
2379 {
2380         vmcs_check32(field);
2381         if (static_branch_unlikely(&enable_evmcs))
2382                 return evmcs_read32(field);
2383         return __vmcs_readl(field);
2384 }
2385
2386 static __always_inline u64 vmcs_read64(unsigned long field)
2387 {
2388         vmcs_check64(field);
2389         if (static_branch_unlikely(&enable_evmcs))
2390                 return evmcs_read64(field);
2391 #ifdef CONFIG_X86_64
2392         return __vmcs_readl(field);
2393 #else
2394         return __vmcs_readl(field) | ((u64)__vmcs_readl(field+1) << 32);
2395 #endif
2396 }
2397
2398 static __always_inline unsigned long vmcs_readl(unsigned long field)
2399 {
2400         vmcs_checkl(field);
2401         if (static_branch_unlikely(&enable_evmcs))
2402                 return evmcs_read64(field);
2403         return __vmcs_readl(field);
2404 }
2405
2406 static noinline void vmwrite_error(unsigned long field, unsigned long value)
2407 {
2408         printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n",
2409                field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
2410         dump_stack();
2411 }
2412
2413 static __always_inline void __vmcs_writel(unsigned long field, unsigned long value)
2414 {
2415         bool error;
2416
2417         asm volatile (__ex(ASM_VMX_VMWRITE_RAX_RDX) CC_SET(na)
2418                       : CC_OUT(na) (error) : "a"(value), "d"(field));
2419         if (unlikely(error))
2420                 vmwrite_error(field, value);
2421 }
2422
2423 static __always_inline void vmcs_write16(unsigned long field, u16 value)
2424 {
2425         vmcs_check16(field);
2426         if (static_branch_unlikely(&enable_evmcs))
2427                 return evmcs_write16(field, value);
2428
2429         __vmcs_writel(field, value);
2430 }
2431
2432 static __always_inline void vmcs_write32(unsigned long field, u32 value)
2433 {
2434         vmcs_check32(field);
2435         if (static_branch_unlikely(&enable_evmcs))
2436                 return evmcs_write32(field, value);
2437
2438         __vmcs_writel(field, value);
2439 }
2440
2441 static __always_inline void vmcs_write64(unsigned long field, u64 value)
2442 {
2443         vmcs_check64(field);
2444         if (static_branch_unlikely(&enable_evmcs))
2445                 return evmcs_write64(field, value);
2446
2447         __vmcs_writel(field, value);
2448 #ifndef CONFIG_X86_64
2449         asm volatile ("");
2450         __vmcs_writel(field+1, value >> 32);
2451 #endif
2452 }
2453
2454 static __always_inline void vmcs_writel(unsigned long field, unsigned long value)
2455 {
2456         vmcs_checkl(field);
2457         if (static_branch_unlikely(&enable_evmcs))
2458                 return evmcs_write64(field, value);
2459
2460         __vmcs_writel(field, value);
2461 }
2462
2463 static __always_inline void vmcs_clear_bits(unsigned long field, u32 mask)
2464 {
2465         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000,
2466                          "vmcs_clear_bits does not support 64-bit fields");
2467         if (static_branch_unlikely(&enable_evmcs))
2468                 return evmcs_write32(field, evmcs_read32(field) & ~mask);
2469
2470         __vmcs_writel(field, __vmcs_readl(field) & ~mask);
2471 }
2472
2473 static __always_inline void vmcs_set_bits(unsigned long field, u32 mask)
2474 {
2475         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000,
2476                          "vmcs_set_bits does not support 64-bit fields");
2477         if (static_branch_unlikely(&enable_evmcs))
2478                 return evmcs_write32(field, evmcs_read32(field) | mask);
2479
2480         __vmcs_writel(field, __vmcs_readl(field) | mask);
2481 }
2482
2483 static inline void vm_entry_controls_reset_shadow(struct vcpu_vmx *vmx)
2484 {
2485         vmx->vm_entry_controls_shadow = vmcs_read32(VM_ENTRY_CONTROLS);
2486 }
2487
2488 static inline void vm_entry_controls_init(struct vcpu_vmx *vmx, u32 val)
2489 {
2490         vmcs_write32(VM_ENTRY_CONTROLS, val);
2491         vmx->vm_entry_controls_shadow = val;
2492 }
2493
2494 static inline void vm_entry_controls_set(struct vcpu_vmx *vmx, u32 val)
2495 {
2496         if (vmx->vm_entry_controls_shadow != val)
2497                 vm_entry_controls_init(vmx, val);
2498 }
2499
2500 static inline u32 vm_entry_controls_get(struct vcpu_vmx *vmx)
2501 {
2502         return vmx->vm_entry_controls_shadow;
2503 }
2504
2505
2506 static inline void vm_entry_controls_setbit(struct vcpu_vmx *vmx, u32 val)
2507 {
2508         vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) | val);
2509 }
2510
2511 static inline void vm_entry_controls_clearbit(struct vcpu_vmx *vmx, u32 val)
2512 {
2513         vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) & ~val);
2514 }
2515
2516 static inline void vm_exit_controls_reset_shadow(struct vcpu_vmx *vmx)
2517 {
2518         vmx->vm_exit_controls_shadow = vmcs_read32(VM_EXIT_CONTROLS);
2519 }
2520
2521 static inline void vm_exit_controls_init(struct vcpu_vmx *vmx, u32 val)
2522 {
2523         vmcs_write32(VM_EXIT_CONTROLS, val);
2524         vmx->vm_exit_controls_shadow = val;
2525 }
2526
2527 static inline void vm_exit_controls_set(struct vcpu_vmx *vmx, u32 val)
2528 {
2529         if (vmx->vm_exit_controls_shadow != val)
2530                 vm_exit_controls_init(vmx, val);
2531 }
2532
2533 static inline u32 vm_exit_controls_get(struct vcpu_vmx *vmx)
2534 {
2535         return vmx->vm_exit_controls_shadow;
2536 }
2537
2538
2539 static inline void vm_exit_controls_setbit(struct vcpu_vmx *vmx, u32 val)
2540 {
2541         vm_exit_controls_set(vmx, vm_exit_controls_get(vmx) | val);
2542 }
2543
2544 static inline void vm_exit_controls_clearbit(struct vcpu_vmx *vmx, u32 val)
2545 {
2546         vm_exit_controls_set(vmx, vm_exit_controls_get(vmx) & ~val);
2547 }
2548
2549 static void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
2550 {
2551         vmx->segment_cache.bitmask = 0;
2552 }
2553
2554 static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg,
2555                                        unsigned field)
2556 {
2557         bool ret;
2558         u32 mask = 1 << (seg * SEG_FIELD_NR + field);
2559
2560         if (!(vmx->vcpu.arch.regs_avail & (1 << VCPU_EXREG_SEGMENTS))) {
2561                 vmx->vcpu.arch.regs_avail |= (1 << VCPU_EXREG_SEGMENTS);
2562                 vmx->segment_cache.bitmask = 0;
2563         }
2564         ret = vmx->segment_cache.bitmask & mask;
2565         vmx->segment_cache.bitmask |= mask;
2566         return ret;
2567 }
2568
2569 static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg)
2570 {
2571         u16 *p = &vmx->segment_cache.seg[seg].selector;
2572
2573         if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL))
2574                 *p = vmcs_read16(kvm_vmx_segment_fields[seg].selector);
2575         return *p;
2576 }
2577
2578 static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg)
2579 {
2580         ulong *p = &vmx->segment_cache.seg[seg].base;
2581
2582         if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE))
2583                 *p = vmcs_readl(kvm_vmx_segment_fields[seg].base);
2584         return *p;
2585 }
2586
2587 static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg)
2588 {
2589         u32 *p = &vmx->segment_cache.seg[seg].limit;
2590
2591         if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT))
2592                 *p = vmcs_read32(kvm_vmx_segment_fields[seg].limit);
2593         return *p;
2594 }
2595
2596 static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg)
2597 {
2598         u32 *p = &vmx->segment_cache.seg[seg].ar;
2599
2600         if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR))
2601                 *p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes);
2602         return *p;
2603 }
2604
2605 static void update_exception_bitmap(struct kvm_vcpu *vcpu)
2606 {
2607         u32 eb;
2608
2609         eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
2610              (1u << DB_VECTOR) | (1u << AC_VECTOR);
2611         /*
2612          * Guest access to VMware backdoor ports could legitimately
2613          * trigger #GP because of TSS I/O permission bitmap.
2614          * We intercept those #GP and allow access to them anyway
2615          * as VMware does.
2616          */
2617         if (enable_vmware_backdoor)
2618                 eb |= (1u << GP_VECTOR);
2619         if ((vcpu->guest_debug &
2620              (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
2621             (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
2622                 eb |= 1u << BP_VECTOR;
2623         if (to_vmx(vcpu)->rmode.vm86_active)
2624                 eb = ~0;
2625         if (enable_ept)
2626                 eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
2627
2628         /* When we are running a nested L2 guest and L1 specified for it a
2629          * certain exception bitmap, we must trap the same exceptions and pass
2630          * them to L1. When running L2, we will only handle the exceptions
2631          * specified above if L1 did not want them.
2632          */
2633         if (is_guest_mode(vcpu))
2634                 eb |= get_vmcs12(vcpu)->exception_bitmap;
2635
2636         vmcs_write32(EXCEPTION_BITMAP, eb);
2637 }
2638
2639 /*
2640  * Check if MSR is intercepted for currently loaded MSR bitmap.
2641  */
2642 static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
2643 {
2644         unsigned long *msr_bitmap;
2645         int f = sizeof(unsigned long);
2646
2647         if (!cpu_has_vmx_msr_bitmap())
2648                 return true;
2649
2650         msr_bitmap = to_vmx(vcpu)->loaded_vmcs->msr_bitmap;
2651
2652         if (msr <= 0x1fff) {
2653                 return !!test_bit(msr, msr_bitmap + 0x800 / f);
2654         } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
2655                 msr &= 0x1fff;
2656                 return !!test_bit(msr, msr_bitmap + 0xc00 / f);
2657         }
2658
2659         return true;
2660 }
2661
2662 /*
2663  * Check if MSR is intercepted for L01 MSR bitmap.
2664  */
2665 static bool msr_write_intercepted_l01(struct kvm_vcpu *vcpu, u32 msr)
2666 {
2667         unsigned long *msr_bitmap;
2668         int f = sizeof(unsigned long);
2669
2670         if (!cpu_has_vmx_msr_bitmap())
2671                 return true;
2672
2673         msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap;
2674
2675         if (msr <= 0x1fff) {
2676                 return !!test_bit(msr, msr_bitmap + 0x800 / f);
2677         } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
2678                 msr &= 0x1fff;
2679                 return !!test_bit(msr, msr_bitmap + 0xc00 / f);
2680         }
2681
2682         return true;
2683 }
2684
2685 static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
2686                 unsigned long entry, unsigned long exit)
2687 {
2688         vm_entry_controls_clearbit(vmx, entry);
2689         vm_exit_controls_clearbit(vmx, exit);
2690 }
2691
2692 static int find_msr(struct vmx_msrs *m, unsigned int msr)
2693 {
2694         unsigned int i;
2695
2696         for (i = 0; i < m->nr; ++i) {
2697                 if (m->val[i].index == msr)
2698                         return i;
2699         }
2700         return -ENOENT;
2701 }
2702
2703 static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
2704 {
2705         int i;
2706         struct msr_autoload *m = &vmx->msr_autoload;
2707
2708         switch (msr) {
2709         case MSR_EFER:
2710                 if (cpu_has_load_ia32_efer) {
2711                         clear_atomic_switch_msr_special(vmx,
2712                                         VM_ENTRY_LOAD_IA32_EFER,
2713                                         VM_EXIT_LOAD_IA32_EFER);
2714                         return;
2715                 }
2716                 break;
2717         case MSR_CORE_PERF_GLOBAL_CTRL:
2718                 if (cpu_has_load_perf_global_ctrl) {
2719                         clear_atomic_switch_msr_special(vmx,
2720                                         VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
2721                                         VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
2722                         return;
2723                 }
2724                 break;
2725         }
2726         i = find_msr(&m->guest, msr);
2727         if (i < 0)
2728                 goto skip_guest;
2729         --m->guest.nr;
2730         m->guest.val[i] = m->guest.val[m->guest.nr];
2731         vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
2732
2733 skip_guest:
2734         i = find_msr(&m->host, msr);
2735         if (i < 0)
2736                 return;
2737
2738         --m->host.nr;
2739         m->host.val[i] = m->host.val[m->host.nr];
2740         vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
2741 }
2742
2743 static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
2744                 unsigned long entry, unsigned long exit,
2745                 unsigned long guest_val_vmcs, unsigned long host_val_vmcs,
2746                 u64 guest_val, u64 host_val)
2747 {
2748         vmcs_write64(guest_val_vmcs, guest_val);
2749         vmcs_write64(host_val_vmcs, host_val);
2750         vm_entry_controls_setbit(vmx, entry);
2751         vm_exit_controls_setbit(vmx, exit);
2752 }
2753
2754 static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
2755                                   u64 guest_val, u64 host_val, bool entry_only)
2756 {
2757         int i, j = 0;
2758         struct msr_autoload *m = &vmx->msr_autoload;
2759
2760         switch (msr) {
2761         case MSR_EFER:
2762                 if (cpu_has_load_ia32_efer) {
2763                         add_atomic_switch_msr_special(vmx,
2764                                         VM_ENTRY_LOAD_IA32_EFER,
2765                                         VM_EXIT_LOAD_IA32_EFER,
2766                                         GUEST_IA32_EFER,
2767                                         HOST_IA32_EFER,
2768                                         guest_val, host_val);
2769                         return;
2770                 }
2771                 break;
2772         case MSR_CORE_PERF_GLOBAL_CTRL:
2773                 if (cpu_has_load_perf_global_ctrl) {
2774                         add_atomic_switch_msr_special(vmx,
2775                                         VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
2776                                         VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL,
2777                                         GUEST_IA32_PERF_GLOBAL_CTRL,
2778                                         HOST_IA32_PERF_GLOBAL_CTRL,
2779                                         guest_val, host_val);
2780                         return;
2781                 }
2782                 break;
2783         case MSR_IA32_PEBS_ENABLE:
2784                 /* PEBS needs a quiescent period after being disabled (to write
2785                  * a record).  Disabling PEBS through VMX MSR swapping doesn't
2786                  * provide that period, so a CPU could write host's record into
2787                  * guest's memory.
2788                  */
2789                 wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
2790         }
2791
2792         i = find_msr(&m->guest, msr);
2793         if (!entry_only)
2794                 j = find_msr(&m->host, msr);
2795
2796         if ((i < 0 && m->guest.nr == NR_AUTOLOAD_MSRS) ||
2797                 (j < 0 &&  m->host.nr == NR_AUTOLOAD_MSRS)) {
2798                 printk_once(KERN_WARNING "Not enough msr switch entries. "
2799                                 "Can't add msr %x\n", msr);
2800                 return;
2801         }
2802         if (i < 0) {
2803                 i = m->guest.nr++;
2804                 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
2805         }
2806         m->guest.val[i].index = msr;
2807         m->guest.val[i].value = guest_val;
2808
2809         if (entry_only)
2810                 return;
2811
2812         if (j < 0) {
2813                 j = m->host.nr++;
2814                 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
2815         }
2816         m->host.val[j].index = msr;
2817         m->host.val[j].value = host_val;
2818 }
2819
2820 static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
2821 {
2822         u64 guest_efer = vmx->vcpu.arch.efer;
2823         u64 ignore_bits = 0;
2824
2825         /* Shadow paging assumes NX to be available.  */
2826         if (!enable_ept)
2827                 guest_efer |= EFER_NX;
2828
2829         /*
2830          * LMA and LME handled by hardware; SCE meaningless outside long mode.
2831          */
2832         ignore_bits |= EFER_SCE;
2833 #ifdef CONFIG_X86_64
2834         ignore_bits |= EFER_LMA | EFER_LME;
2835         /* SCE is meaningful only in long mode on Intel */
2836         if (guest_efer & EFER_LMA)
2837                 ignore_bits &= ~(u64)EFER_SCE;
2838 #endif
2839
2840         clear_atomic_switch_msr(vmx, MSR_EFER);
2841
2842         /*
2843          * On EPT, we can't emulate NX, so we must switch EFER atomically.
2844          * On CPUs that support "load IA32_EFER", always switch EFER
2845          * atomically, since it's faster than switching it manually.
2846          */
2847         if (cpu_has_load_ia32_efer ||
2848             (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) {
2849                 if (!(guest_efer & EFER_LMA))
2850                         guest_efer &= ~EFER_LME;
2851                 if (guest_efer != host_efer)
2852                         add_atomic_switch_msr(vmx, MSR_EFER,
2853                                               guest_efer, host_efer, false);
2854                 return false;
2855         } else {
2856                 guest_efer &= ~ignore_bits;
2857                 guest_efer |= host_efer & ignore_bits;
2858
2859                 vmx->guest_msrs[efer_offset].data = guest_efer;
2860                 vmx->guest_msrs[efer_offset].mask = ~ignore_bits;
2861
2862                 return true;
2863         }
2864 }
2865
2866 #ifdef CONFIG_X86_32
2867 /*
2868  * On 32-bit kernels, VM exits still load the FS and GS bases from the
2869  * VMCS rather than the segment table.  KVM uses this helper to figure
2870  * out the current bases to poke them into the VMCS before entry.
2871  */
2872 static unsigned long segment_base(u16 selector)
2873 {
2874         struct desc_struct *table;
2875         unsigned long v;
2876
2877         if (!(selector & ~SEGMENT_RPL_MASK))
2878                 return 0;
2879
2880         table = get_current_gdt_ro();
2881
2882         if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) {
2883                 u16 ldt_selector = kvm_read_ldt();
2884
2885                 if (!(ldt_selector & ~SEGMENT_RPL_MASK))
2886                         return 0;
2887
2888                 table = (struct desc_struct *)segment_base(ldt_selector);
2889         }
2890         v = get_desc_base(&table[selector >> 3]);
2891         return v;
2892 }
2893 #endif
2894
2895 static void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
2896 {
2897         struct vcpu_vmx *vmx = to_vmx(vcpu);
2898         struct vmcs_host_state *host_state;
2899 #ifdef CONFIG_X86_64
2900         int cpu = raw_smp_processor_id();
2901 #endif
2902         unsigned long fs_base, gs_base;
2903         u16 fs_sel, gs_sel;
2904         int i;
2905
2906         vmx->req_immediate_exit = false;
2907
2908         /*
2909          * Note that guest MSRs to be saved/restored can also be changed
2910          * when guest state is loaded. This happens when guest transitions
2911          * to/from long-mode by setting MSR_EFER.LMA.
2912          */
2913         if (!vmx->loaded_cpu_state || vmx->guest_msrs_dirty) {
2914                 vmx->guest_msrs_dirty = false;
2915                 for (i = 0; i < vmx->save_nmsrs; ++i)
2916                         kvm_set_shared_msr(vmx->guest_msrs[i].index,
2917                                            vmx->guest_msrs[i].data,
2918                                            vmx->guest_msrs[i].mask);
2919
2920         }
2921
2922         if (vmx->loaded_cpu_state)
2923                 return;
2924
2925         vmx->loaded_cpu_state = vmx->loaded_vmcs;
2926         host_state = &vmx->loaded_cpu_state->host_state;
2927
2928         /*
2929          * Set host fs and gs selectors.  Unfortunately, 22.2.3 does not
2930          * allow segment selectors with cpl > 0 or ti == 1.
2931          */
2932         host_state->ldt_sel = kvm_read_ldt();
2933
2934 #ifdef CONFIG_X86_64
2935         savesegment(ds, host_state->ds_sel);
2936         savesegment(es, host_state->es_sel);
2937
2938         gs_base = cpu_kernelmode_gs_base(cpu);
2939         if (likely(is_64bit_mm(current->mm))) {
2940                 save_fsgs_for_kvm();
2941                 fs_sel = current->thread.fsindex;
2942                 gs_sel = current->thread.gsindex;
2943                 fs_base = current->thread.fsbase;
2944                 vmx->msr_host_kernel_gs_base = current->thread.gsbase;
2945         } else {
2946                 savesegment(fs, fs_sel);
2947                 savesegment(gs, gs_sel);
2948                 fs_base = read_msr(MSR_FS_BASE);
2949                 vmx->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
2950         }
2951
2952         wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
2953 #else
2954         savesegment(fs, fs_sel);
2955         savesegment(gs, gs_sel);
2956         fs_base = segment_base(fs_sel);
2957         gs_base = segment_base(gs_sel);
2958 #endif
2959
2960         if (unlikely(fs_sel != host_state->fs_sel)) {
2961                 if (!(fs_sel & 7))
2962                         vmcs_write16(HOST_FS_SELECTOR, fs_sel);
2963                 else
2964                         vmcs_write16(HOST_FS_SELECTOR, 0);
2965                 host_state->fs_sel = fs_sel;
2966         }
2967         if (unlikely(gs_sel != host_state->gs_sel)) {
2968                 if (!(gs_sel & 7))
2969                         vmcs_write16(HOST_GS_SELECTOR, gs_sel);
2970                 else
2971                         vmcs_write16(HOST_GS_SELECTOR, 0);
2972                 host_state->gs_sel = gs_sel;
2973         }
2974         if (unlikely(fs_base != host_state->fs_base)) {
2975                 vmcs_writel(HOST_FS_BASE, fs_base);
2976                 host_state->fs_base = fs_base;
2977         }
2978         if (unlikely(gs_base != host_state->gs_base)) {
2979                 vmcs_writel(HOST_GS_BASE, gs_base);
2980                 host_state->gs_base = gs_base;
2981         }
2982 }
2983
2984 static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
2985 {
2986         struct vmcs_host_state *host_state;
2987
2988         if (!vmx->loaded_cpu_state)
2989                 return;
2990
2991         WARN_ON_ONCE(vmx->loaded_cpu_state != vmx->loaded_vmcs);
2992         host_state = &vmx->loaded_cpu_state->host_state;
2993
2994         ++vmx->vcpu.stat.host_state_reload;
2995         vmx->loaded_cpu_state = NULL;
2996
2997 #ifdef CONFIG_X86_64
2998         rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
2999 #endif
3000         if (host_state->ldt_sel || (host_state->gs_sel & 7)) {
3001                 kvm_load_ldt(host_state->ldt_sel);
3002 #ifdef CONFIG_X86_64
3003                 load_gs_index(host_state->gs_sel);
3004 #else
3005                 loadsegment(gs, host_state->gs_sel);
3006 #endif
3007         }
3008         if (host_state->fs_sel & 7)
3009                 loadsegment(fs, host_state->fs_sel);
3010 #ifdef CONFIG_X86_64
3011         if (unlikely(host_state->ds_sel | host_state->es_sel)) {
3012                 loadsegment(ds, host_state->ds_sel);
3013                 loadsegment(es, host_state->es_sel);
3014         }
3015 #endif
3016         invalidate_tss_limit();
3017 #ifdef CONFIG_X86_64
3018         wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
3019 #endif
3020         load_fixmap_gdt(raw_smp_processor_id());
3021 }
3022
3023 #ifdef CONFIG_X86_64
3024 static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx)
3025 {
3026         preempt_disable();
3027         if (vmx->loaded_cpu_state)
3028                 rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
3029         preempt_enable();
3030         return vmx->msr_guest_kernel_gs_base;
3031 }
3032
3033 static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data)
3034 {
3035         preempt_disable();
3036         if (vmx->loaded_cpu_state)
3037                 wrmsrl(MSR_KERNEL_GS_BASE, data);
3038         preempt_enable();
3039         vmx->msr_guest_kernel_gs_base = data;
3040 }
3041 #endif
3042
3043 static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
3044 {
3045         struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
3046         struct pi_desc old, new;
3047         unsigned int dest;
3048
3049         /*
3050          * In case of hot-plug or hot-unplug, we may have to undo
3051          * vmx_vcpu_pi_put even if there is no assigned device.  And we
3052          * always keep PI.NDST up to date for simplicity: it makes the
3053          * code easier, and CPU migration is not a fast path.
3054          */
3055         if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu)
3056                 return;
3057
3058         /*
3059          * First handle the simple case where no cmpxchg is necessary; just
3060          * allow posting non-urgent interrupts.
3061          *
3062          * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change
3063          * PI.NDST: pi_post_block will do it for us and the wakeup_handler
3064          * expects the VCPU to be on the blocked_vcpu_list that matches
3065          * PI.NDST.
3066          */
3067         if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR ||
3068             vcpu->cpu == cpu) {
3069                 pi_clear_sn(pi_desc);
3070                 return;
3071         }
3072
3073         /* The full case.  */
3074         do {
3075                 old.control = new.control = pi_desc->control;
3076
3077                 dest = cpu_physical_id(cpu);
3078
3079                 if (x2apic_enabled())
3080                         new.ndst = dest;
3081                 else
3082                         new.ndst = (dest << 8) & 0xFF00;
3083
3084                 new.sn = 0;
3085         } while (cmpxchg64(&pi_desc->control, old.control,
3086                            new.control) != old.control);
3087 }
3088
3089 static void decache_tsc_multiplier(struct vcpu_vmx *vmx)
3090 {
3091         vmx->current_tsc_ratio = vmx->vcpu.arch.tsc_scaling_ratio;
3092         vmcs_write64(TSC_MULTIPLIER, vmx->current_tsc_ratio);
3093 }
3094
3095 /*
3096  * Switches to specified vcpu, until a matching vcpu_put(), but assumes
3097  * vcpu mutex is already taken.
3098  */
3099 static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
3100 {
3101         struct vcpu_vmx *vmx = to_vmx(vcpu);
3102         bool already_loaded = vmx->loaded_vmcs->cpu == cpu;
3103
3104         if (!already_loaded) {
3105                 loaded_vmcs_clear(vmx->loaded_vmcs);
3106                 local_irq_disable();
3107
3108                 /*
3109                  * Ensure loaded_vmcs->cpu is read before adding loaded_vmcs to
3110                  * this cpu's percpu list, otherwise it may not yet be deleted
3111                  * from its previous cpu's percpu list.  Pairs with the
3112                  * smb_wmb() in __loaded_vmcs_clear().
3113                  */
3114                 smp_rmb();
3115
3116                 list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
3117                          &per_cpu(loaded_vmcss_on_cpu, cpu));
3118                 local_irq_enable();
3119         }
3120
3121         if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
3122                 per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
3123                 vmcs_load(vmx->loaded_vmcs->vmcs);
3124                 indirect_branch_prediction_barrier();
3125         }
3126
3127         if (!already_loaded) {
3128                 void *gdt = get_current_gdt_ro();
3129                 unsigned long sysenter_esp;
3130
3131                 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
3132
3133                 /*
3134                  * Linux uses per-cpu TSS and GDT, so set these when switching
3135                  * processors.  See 22.2.4.
3136                  */
3137                 vmcs_writel(HOST_TR_BASE,
3138                             (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss);
3139                 vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt);   /* 22.2.4 */
3140
3141                 /*
3142                  * VM exits change the host TR limit to 0x67 after a VM
3143                  * exit.  This is okay, since 0x67 covers everything except
3144                  * the IO bitmap and have have code to handle the IO bitmap
3145                  * being lost after a VM exit.
3146                  */
3147                 BUILD_BUG_ON(IO_BITMAP_OFFSET - 1 != 0x67);
3148
3149                 rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
3150                 vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
3151
3152                 vmx->loaded_vmcs->cpu = cpu;
3153         }
3154
3155         /* Setup TSC multiplier */
3156         if (kvm_has_tsc_control &&
3157             vmx->current_tsc_ratio != vcpu->arch.tsc_scaling_ratio)
3158                 decache_tsc_multiplier(vmx);
3159
3160         vmx_vcpu_pi_load(vcpu, cpu);
3161         vmx->host_pkru = read_pkru();
3162         vmx->host_debugctlmsr = get_debugctlmsr();
3163 }
3164
3165 static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
3166 {
3167         struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
3168
3169         if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
3170                 !irq_remapping_cap(IRQ_POSTING_CAP)  ||
3171                 !kvm_vcpu_apicv_active(vcpu))
3172                 return;
3173
3174         /* Set SN when the vCPU is preempted */
3175         if (vcpu->preempted)
3176                 pi_set_sn(pi_desc);
3177 }
3178
3179 static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
3180 {
3181         vmx_vcpu_pi_put(vcpu);
3182
3183         vmx_prepare_switch_to_host(to_vmx(vcpu));
3184 }
3185
3186 static bool emulation_required(struct kvm_vcpu *vcpu)
3187 {
3188         return emulate_invalid_guest_state && !guest_state_valid(vcpu);
3189 }
3190
3191 static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu);
3192
3193 /*
3194  * Return the cr0 value that a nested guest would read. This is a combination
3195  * of the real cr0 used to run the guest (guest_cr0), and the bits shadowed by
3196  * its hypervisor (cr0_read_shadow).
3197  */
3198 static inline unsigned long nested_read_cr0(struct vmcs12 *fields)
3199 {
3200         return (fields->guest_cr0 & ~fields->cr0_guest_host_mask) |
3201                 (fields->cr0_read_shadow & fields->cr0_guest_host_mask);
3202 }
3203 static inline unsigned long nested_read_cr4(struct vmcs12 *fields)
3204 {
3205         return (fields->guest_cr4 & ~fields->cr4_guest_host_mask) |
3206                 (fields->cr4_read_shadow & fields->cr4_guest_host_mask);
3207 }
3208
3209 static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
3210 {
3211         unsigned long rflags, save_rflags;
3212
3213         if (!test_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail)) {
3214                 __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail);
3215                 rflags = vmcs_readl(GUEST_RFLAGS);
3216                 if (to_vmx(vcpu)->rmode.vm86_active) {
3217                         rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
3218                         save_rflags = to_vmx(vcpu)->rmode.save_rflags;
3219                         rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
3220                 }
3221                 to_vmx(vcpu)->rflags = rflags;
3222         }
3223         return to_vmx(vcpu)->rflags;
3224 }
3225
3226 static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
3227 {
3228         unsigned long old_rflags = vmx_get_rflags(vcpu);
3229
3230         __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail);
3231         to_vmx(vcpu)->rflags = rflags;
3232         if (to_vmx(vcpu)->rmode.vm86_active) {
3233                 to_vmx(vcpu)->rmode.save_rflags = rflags;
3234                 rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
3235         }
3236         vmcs_writel(GUEST_RFLAGS, rflags);
3237
3238         if ((old_rflags ^ to_vmx(vcpu)->rflags) & X86_EFLAGS_VM)
3239                 to_vmx(vcpu)->emulation_required = emulation_required(vcpu);
3240 }
3241
3242 static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
3243 {
3244         u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
3245         int ret = 0;
3246
3247         if (interruptibility & GUEST_INTR_STATE_STI)
3248                 ret |= KVM_X86_SHADOW_INT_STI;
3249         if (interruptibility & GUEST_INTR_STATE_MOV_SS)
3250                 ret |= KVM_X86_SHADOW_INT_MOV_SS;
3251
3252         return ret;
3253 }
3254
3255 static void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
3256 {
3257         u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
3258         u32 interruptibility = interruptibility_old;
3259
3260         interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);
3261
3262         if (mask & KVM_X86_SHADOW_INT_MOV_SS)
3263                 interruptibility |= GUEST_INTR_STATE_MOV_SS;
3264         else if (mask & KVM_X86_SHADOW_INT_STI)
3265                 interruptibility |= GUEST_INTR_STATE_STI;
3266
3267         if ((interruptibility != interruptibility_old))
3268                 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility);
3269 }
3270
3271 static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
3272 {
3273         unsigned long rip;
3274
3275         rip = kvm_rip_read(vcpu);
3276         rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
3277         kvm_rip_write(vcpu, rip);
3278
3279         /* skipping an emulated instruction also counts */
3280         vmx_set_interrupt_shadow(vcpu, 0);
3281 }
3282
3283 static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu,
3284                                                unsigned long exit_qual)
3285 {
3286         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3287         unsigned int nr = vcpu->arch.exception.nr;
3288         u32 intr_info = nr | INTR_INFO_VALID_MASK;
3289
3290         if (vcpu->arch.exception.has_error_code) {
3291                 vmcs12->vm_exit_intr_error_code = vcpu->arch.exception.error_code;
3292                 intr_info |= INTR_INFO_DELIVER_CODE_MASK;
3293         }
3294
3295         if (kvm_exception_is_soft(nr))
3296                 intr_info |= INTR_TYPE_SOFT_EXCEPTION;
3297         else
3298                 intr_info |= INTR_TYPE_HARD_EXCEPTION;
3299
3300         if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) &&
3301             vmx_get_nmi_mask(vcpu))
3302                 intr_info |= INTR_INFO_UNBLOCK_NMI;
3303
3304         nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual);
3305 }
3306
3307 /*
3308  * KVM wants to inject page-faults which it got to the guest. This function
3309  * checks whether in a nested guest, we need to inject them to L1 or L2.
3310  */
3311 static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit_qual)
3312 {
3313         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3314         unsigned int nr = vcpu->arch.exception.nr;
3315
3316         if (nr == PF_VECTOR) {
3317                 if (vcpu->arch.exception.nested_apf) {
3318                         *exit_qual = vcpu->arch.apf.nested_apf_token;
3319                         return 1;
3320                 }
3321                 /*
3322                  * FIXME: we must not write CR2 when L1 intercepts an L2 #PF exception.
3323                  * The fix is to add the ancillary datum (CR2 or DR6) to structs
3324                  * kvm_queued_exception and kvm_vcpu_events, so that CR2 and DR6
3325                  * can be written only when inject_pending_event runs.  This should be
3326                  * conditional on a new capability---if the capability is disabled,
3327                  * kvm_multiple_exception would write the ancillary information to
3328                  * CR2 or DR6, for backwards ABI-compatibility.
3329                  */
3330                 if (nested_vmx_is_page_fault_vmexit(vmcs12,
3331                                                     vcpu->arch.exception.error_code)) {
3332                         *exit_qual = vcpu->arch.cr2;
3333                         return 1;
3334                 }
3335         } else {
3336                 if (vmcs12->exception_bitmap & (1u << nr)) {
3337                         if (nr == DB_VECTOR) {
3338                                 *exit_qual = vcpu->arch.dr6;
3339                                 *exit_qual &= ~(DR6_FIXED_1 | DR6_BT);
3340                                 *exit_qual ^= DR6_RTM;
3341                         } else {
3342                                 *exit_qual = 0;
3343                         }
3344                         return 1;
3345                 }
3346         }
3347
3348         return 0;
3349 }
3350
3351 static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
3352 {
3353         /*
3354          * Ensure that we clear the HLT state in the VMCS.  We don't need to
3355          * explicitly skip the instruction because if the HLT state is set,
3356          * then the instruction is already executing and RIP has already been
3357          * advanced.
3358          */
3359         if (kvm_hlt_in_guest(vcpu->kvm) &&
3360                         vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT)
3361                 vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
3362 }
3363
3364 static void vmx_queue_exception(struct kvm_vcpu *vcpu)
3365 {
3366         struct vcpu_vmx *vmx = to_vmx(vcpu);
3367         unsigned nr = vcpu->arch.exception.nr;
3368         bool has_error_code = vcpu->arch.exception.has_error_code;
3369         u32 error_code = vcpu->arch.exception.error_code;
3370         u32 intr_info = nr | INTR_INFO_VALID_MASK;
3371
3372         if (has_error_code) {
3373                 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
3374                 intr_info |= INTR_INFO_DELIVER_CODE_MASK;
3375         }
3376
3377         if (vmx->rmode.vm86_active) {
3378                 int inc_eip = 0;
3379                 if (kvm_exception_is_soft(nr))
3380                         inc_eip = vcpu->arch.event_exit_inst_len;
3381                 if (kvm_inject_realmode_interrupt(vcpu, nr, inc_eip) != EMULATE_DONE)
3382                         kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
3383                 return;
3384         }
3385
3386         WARN_ON_ONCE(vmx->emulation_required);
3387
3388         if (kvm_exception_is_soft(nr)) {
3389                 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
3390                              vmx->vcpu.arch.event_exit_inst_len);
3391                 intr_info |= INTR_TYPE_SOFT_EXCEPTION;
3392         } else
3393                 intr_info |= INTR_TYPE_HARD_EXCEPTION;
3394
3395         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
3396
3397         vmx_clear_hlt(vcpu);
3398 }
3399
3400 static bool vmx_rdtscp_supported(void)
3401 {
3402         return cpu_has_vmx_rdtscp();
3403 }
3404
3405 static bool vmx_invpcid_supported(void)
3406 {
3407         return cpu_has_vmx_invpcid();
3408 }
3409
3410 /*
3411  * Swap MSR entry in host/guest MSR entry array.
3412  */
3413 static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
3414 {
3415         struct shared_msr_entry tmp;
3416
3417         tmp = vmx->guest_msrs[to];
3418         vmx->guest_msrs[to] = vmx->guest_msrs[from];
3419         vmx->guest_msrs[from] = tmp;
3420 }
3421
3422 /*
3423  * Set up the vmcs to automatically save and restore system
3424  * msrs.  Don't touch the 64-bit msrs if the guest is in legacy
3425  * mode, as fiddling with msrs is very expensive.
3426  */
3427 static void setup_msrs(struct vcpu_vmx *vmx)
3428 {
3429         int save_nmsrs, index;
3430
3431         save_nmsrs = 0;
3432 #ifdef CONFIG_X86_64
3433         if (is_long_mode(&vmx->vcpu)) {
3434                 index = __find_msr_index(vmx, MSR_SYSCALL_MASK);
3435                 if (index >= 0)
3436                         move_msr_up(vmx, index, save_nmsrs++);
3437                 index = __find_msr_index(vmx, MSR_LSTAR);
3438                 if (index >= 0)
3439                         move_msr_up(vmx, index, save_nmsrs++);
3440                 index = __find_msr_index(vmx, MSR_CSTAR);
3441                 if (index >= 0)
3442                         move_msr_up(vmx, index, save_nmsrs++);
3443                 /*
3444                  * MSR_STAR is only needed on long mode guests, and only
3445                  * if efer.sce is enabled.
3446                  */
3447                 index = __find_msr_index(vmx, MSR_STAR);
3448                 if ((index >= 0) && (vmx->vcpu.arch.efer & EFER_SCE))
3449                         move_msr_up(vmx, index, save_nmsrs++);
3450         }
3451 #endif
3452         index = __find_msr_index(vmx, MSR_EFER);
3453         if (index >= 0 && update_transition_efer(vmx, index))
3454                 move_msr_up(vmx, index, save_nmsrs++);
3455         index = __find_msr_index(vmx, MSR_TSC_AUX);
3456         if (index >= 0 && guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP))
3457                 move_msr_up(vmx, index, save_nmsrs++);
3458
3459         vmx->save_nmsrs = save_nmsrs;
3460         vmx->guest_msrs_dirty = true;
3461
3462         if (cpu_has_vmx_msr_bitmap())
3463                 vmx_update_msr_bitmap(&vmx->vcpu);
3464 }
3465
3466 static u64 vmx_read_l1_tsc_offset(struct kvm_vcpu *vcpu)
3467 {
3468         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3469
3470         if (is_guest_mode(vcpu) &&
3471             (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING))
3472                 return vcpu->arch.tsc_offset - vmcs12->tsc_offset;
3473
3474         return vcpu->arch.tsc_offset;
3475 }
3476
3477 static u64 vmx_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
3478 {
3479         u64 active_offset = offset;
3480         if (is_guest_mode(vcpu)) {
3481                 /*
3482                  * We're here if L1 chose not to trap WRMSR to TSC. According
3483                  * to the spec, this should set L1's TSC; The offset that L1
3484                  * set for L2 remains unchanged, and still needs to be added
3485                  * to the newly set TSC to get L2's TSC.
3486                  */
3487                 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3488                 if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETING))
3489                         active_offset += vmcs12->tsc_offset;
3490         } else {
3491                 trace_kvm_write_tsc_offset(vcpu->vcpu_id,
3492                                            vmcs_read64(TSC_OFFSET), offset);
3493         }
3494
3495         vmcs_write64(TSC_OFFSET, active_offset);
3496         return active_offset;
3497 }
3498
3499 /*
3500  * nested_vmx_allowed() checks whether a guest should be allowed to use VMX
3501  * instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for
3502  * all guests if the "nested" module option is off, and can also be disabled
3503  * for a single guest by disabling its VMX cpuid bit.
3504  */
3505 static inline bool nested_vmx_allowed(struct kvm_vcpu *vcpu)
3506 {
3507         return nested && guest_cpuid_has(vcpu, X86_FEATURE_VMX);
3508 }
3509
3510 /*
3511  * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be
3512  * returned for the various VMX controls MSRs when nested VMX is enabled.
3513  * The same values should also be used to verify that vmcs12 control fields are
3514  * valid during nested entry from L1 to L2.
3515  * Each of these control msrs has a low and high 32-bit half: A low bit is on
3516  * if the corresponding bit in the (32-bit) control field *must* be on, and a
3517  * bit in the high half is on if the corresponding bit in the control field
3518  * may be on. See also vmx_control_verify().
3519  */
3520 static void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, bool apicv)
3521 {
3522         if (!nested) {
3523                 memset(msrs, 0, sizeof(*msrs));
3524                 return;
3525         }
3526
3527         /*
3528          * Note that as a general rule, the high half of the MSRs (bits in
3529          * the control fields which may be 1) should be initialized by the
3530          * intersection of the underlying hardware's MSR (i.e., features which
3531          * can be supported) and the list of features we want to expose -
3532          * because they are known to be properly supported in our code.
3533          * Also, usually, the low half of the MSRs (bits which must be 1) can
3534          * be set to 0, meaning that L1 may turn off any of these bits. The
3535          * reason is that if one of these bits is necessary, it will appear
3536          * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control
3537          * fields of vmcs01 and vmcs02, will turn these bits off - and
3538          * nested_vmx_exit_reflected() will not pass related exits to L1.
3539          * These rules have exceptions below.
3540          */
3541
3542         /* pin-based controls */
3543         rdmsr(MSR_IA32_VMX_PINBASED_CTLS,
3544                 msrs->pinbased_ctls_low,
3545                 msrs->pinbased_ctls_high);
3546         msrs->pinbased_ctls_low |=
3547                 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
3548         msrs->pinbased_ctls_high &=
3549                 PIN_BASED_EXT_INTR_MASK |
3550                 PIN_BASED_NMI_EXITING |
3551                 PIN_BASED_VIRTUAL_NMIS |
3552                 (apicv ? PIN_BASED_POSTED_INTR : 0);
3553         msrs->pinbased_ctls_high |=
3554                 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
3555                 PIN_BASED_VMX_PREEMPTION_TIMER;
3556
3557         /* exit controls */
3558         rdmsr(MSR_IA32_VMX_EXIT_CTLS,
3559                 msrs->exit_ctls_low,
3560                 msrs->exit_ctls_high);
3561         msrs->exit_ctls_low =
3562                 VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
3563
3564         msrs->exit_ctls_high &=
3565 #ifdef CONFIG_X86_64
3566                 VM_EXIT_HOST_ADDR_SPACE_SIZE |
3567 #endif
3568                 VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
3569         msrs->exit_ctls_high |=
3570                 VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
3571                 VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
3572                 VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT;
3573
3574         /* We support free control of debug control saving. */
3575         msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS;
3576
3577         /* entry controls */
3578         rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
3579                 msrs->entry_ctls_low,
3580                 msrs->entry_ctls_high);
3581         msrs->entry_ctls_low =
3582                 VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
3583         msrs->entry_ctls_high &=
3584 #ifdef CONFIG_X86_64
3585                 VM_ENTRY_IA32E_MODE |
3586 #endif
3587                 VM_ENTRY_LOAD_IA32_PAT;
3588         msrs->entry_ctls_high |=
3589                 (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER);
3590
3591         /* We support free control of debug control loading. */
3592         msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS;
3593
3594         /* cpu-based controls */
3595         rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
3596                 msrs->procbased_ctls_low,
3597                 msrs->procbased_ctls_high);
3598         msrs->procbased_ctls_low =
3599                 CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
3600         msrs->procbased_ctls_high &=
3601                 CPU_BASED_VIRTUAL_INTR_PENDING |
3602                 CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING |
3603                 CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
3604                 CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING |
3605                 CPU_BASED_CR3_STORE_EXITING |
3606 #ifdef CONFIG_X86_64
3607                 CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING |
3608 #endif
3609                 CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
3610                 CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG |
3611                 CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING |
3612                 CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING |
3613                 CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
3614         /*
3615          * We can allow some features even when not supported by the
3616          * hardware. For example, L1 can specify an MSR bitmap - and we
3617          * can use it to avoid exits to L1 - even when L0 runs L2
3618          * without MSR bitmaps.
3619          */
3620         msrs->procbased_ctls_high |=
3621                 CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
3622                 CPU_BASED_USE_MSR_BITMAPS;
3623
3624         /* We support free control of CR3 access interception. */
3625         msrs->procbased_ctls_low &=
3626                 ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);
3627
3628         /*
3629          * secondary cpu-based controls.  Do not include those that
3630          * depend on CPUID bits, they are added later by vmx_cpuid_update.
3631          */
3632         if (msrs->procbased_ctls_high & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)
3633                 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
3634                       msrs->secondary_ctls_low,
3635                       msrs->secondary_ctls_high);
3636
3637         msrs->secondary_ctls_low = 0;
3638         msrs->secondary_ctls_high &=
3639                 SECONDARY_EXEC_DESC |
3640                 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
3641                 SECONDARY_EXEC_APIC_REGISTER_VIRT |
3642                 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
3643                 SECONDARY_EXEC_WBINVD_EXITING;
3644
3645         /*
3646          * We can emulate "VMCS shadowing," even if the hardware
3647          * doesn't support it.
3648          */
3649         msrs->secondary_ctls_high |=
3650                 SECONDARY_EXEC_SHADOW_VMCS;
3651
3652         if (enable_ept) {
3653                 /* nested EPT: emulate EPT also to L1 */
3654                 msrs->secondary_ctls_high |=
3655                         SECONDARY_EXEC_ENABLE_EPT;
3656                 msrs->ept_caps = VMX_EPT_PAGE_WALK_4_BIT |
3657                          VMX_EPTP_WB_BIT | VMX_EPT_INVEPT_BIT;
3658                 if (cpu_has_vmx_ept_execute_only())
3659                         msrs->ept_caps |=
3660                                 VMX_EPT_EXECUTE_ONLY_BIT;
3661                 msrs->ept_caps &= vmx_capability.ept;
3662                 msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT |
3663                         VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT |
3664                         VMX_EPT_1GB_PAGE_BIT;
3665                 if (enable_ept_ad_bits) {
3666                         msrs->secondary_ctls_high |=
3667                                 SECONDARY_EXEC_ENABLE_PML;
3668                         msrs->ept_caps |= VMX_EPT_AD_BIT;
3669                 }
3670         }
3671
3672         if (cpu_has_vmx_vmfunc()) {
3673                 msrs->secondary_ctls_high |=
3674                         SECONDARY_EXEC_ENABLE_VMFUNC;
3675                 /*
3676                  * Advertise EPTP switching unconditionally
3677                  * since we emulate it
3678                  */
3679                 if (enable_ept)
3680                         msrs->vmfunc_controls =
3681                                 VMX_VMFUNC_EPTP_SWITCHING;
3682         }
3683
3684         /*
3685          * Old versions of KVM use the single-context version without
3686          * checking for support, so declare that it is supported even
3687          * though it is treated as global context.  The alternative is
3688          * not failing the single-context invvpid, and it is worse.
3689          */
3690         if (enable_vpid) {
3691                 msrs->secondary_ctls_high |=
3692                         SECONDARY_EXEC_ENABLE_VPID;
3693                 msrs->vpid_caps = VMX_VPID_INVVPID_BIT |
3694                         VMX_VPID_EXTENT_SUPPORTED_MASK;
3695         }
3696
3697         if (enable_unrestricted_guest)
3698                 msrs->secondary_ctls_high |=
3699                         SECONDARY_EXEC_UNRESTRICTED_GUEST;
3700
3701         if (flexpriority_enabled)
3702                 msrs->secondary_ctls_high |=
3703                         SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
3704
3705         /* miscellaneous data */
3706         rdmsr(MSR_IA32_VMX_MISC,
3707                 msrs->misc_low,
3708                 msrs->misc_high);
3709         msrs->misc_low &= VMX_MISC_SAVE_EFER_LMA;
3710         msrs->misc_low |=
3711                 MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS |
3712                 VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
3713                 VMX_MISC_ACTIVITY_HLT;
3714         msrs->misc_high = 0;
3715
3716         /*
3717          * This MSR reports some information about VMX support. We
3718          * should return information about the VMX we emulate for the
3719          * guest, and the VMCS structure we give it - not about the
3720          * VMX support of the underlying hardware.
3721          */
3722         msrs->basic =
3723                 VMCS12_REVISION |
3724                 VMX_BASIC_TRUE_CTLS |
3725                 ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
3726                 (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);
3727
3728         if (cpu_has_vmx_basic_inout())
3729                 msrs->basic |= VMX_BASIC_INOUT;
3730
3731         /*
3732          * These MSRs specify bits which the guest must keep fixed on
3733          * while L1 is in VMXON mode (in L1's root mode, or running an L2).
3734          * We picked the standard core2 setting.
3735          */
3736 #define VMXON_CR0_ALWAYSON     (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE)
3737 #define VMXON_CR4_ALWAYSON     X86_CR4_VMXE
3738         msrs->cr0_fixed0 = VMXON_CR0_ALWAYSON;
3739         msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON;
3740
3741         /* These MSRs specify bits which the guest must keep fixed off. */
3742         rdmsrl(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1);
3743         rdmsrl(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1);
3744
3745         /* highest index: VMX_PREEMPTION_TIMER_VALUE */
3746         msrs->vmcs_enum = VMCS12_MAX_FIELD_INDEX << 1;
3747 }
3748
3749 /*
3750  * if fixed0[i] == 1: val[i] must be 1
3751  * if fixed1[i] == 0: val[i] must be 0
3752  */
3753 static inline bool fixed_bits_valid(u64 val, u64 fixed0, u64 fixed1)
3754 {
3755         return ((val & fixed1) | fixed0) == val;
3756 }
3757
3758 static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
3759 {
3760         return fixed_bits_valid(control, low, high);
3761 }
3762
3763 static inline u64 vmx_control_msr(u32 low, u32 high)
3764 {
3765         return low | ((u64)high << 32);
3766 }
3767
3768 static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask)
3769 {
3770         superset &= mask;
3771         subset &= mask;
3772
3773         return (superset | subset) == superset;
3774 }
3775
3776 static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data)
3777 {
3778         const u64 feature_and_reserved =
3779                 /* feature (except bit 48; see below) */
3780                 BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) |
3781                 /* reserved */
3782                 BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56);
3783         u64 vmx_basic = vmx->nested.msrs.basic;
3784
3785         if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved))
3786                 return -EINVAL;
3787
3788         /*
3789          * KVM does not emulate a version of VMX that constrains physical
3790          * addresses of VMX structures (e.g. VMCS) to 32-bits.
3791          */
3792         if (data & BIT_ULL(48))
3793                 return -EINVAL;
3794
3795         if (vmx_basic_vmcs_revision_id(vmx_basic) !=
3796             vmx_basic_vmcs_revision_id(data))
3797                 return -EINVAL;
3798
3799         if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data))
3800                 return -EINVAL;
3801
3802         vmx->nested.msrs.basic = data;
3803         return 0;
3804 }
3805
3806 static int
3807 vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
3808 {
3809         u64 supported;
3810         u32 *lowp, *highp;
3811
3812         switch (msr_index) {
3813         case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
3814                 lowp = &vmx->nested.msrs.pinbased_ctls_low;
3815                 highp = &vmx->nested.msrs.pinbased_ctls_high;
3816                 break;
3817         case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
3818                 lowp = &vmx->nested.msrs.procbased_ctls_low;
3819                 highp = &vmx->nested.msrs.procbased_ctls_high;
3820                 break;
3821         case MSR_IA32_VMX_TRUE_EXIT_CTLS:
3822                 lowp = &vmx->nested.msrs.exit_ctls_low;
3823                 highp = &vmx->nested.msrs.exit_ctls_high;
3824                 break;
3825         case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
3826                 lowp = &vmx->nested.msrs.entry_ctls_low;
3827                 highp = &vmx->nested.msrs.entry_ctls_high;
3828                 break;
3829         case MSR_IA32_VMX_PROCBASED_CTLS2:
3830                 lowp = &vmx->nested.msrs.secondary_ctls_low;
3831                 highp = &vmx->nested.msrs.secondary_ctls_high;
3832                 break;
3833         default:
3834                 BUG();
3835         }
3836
3837         supported = vmx_control_msr(*lowp, *highp);
3838
3839         /* Check must-be-1 bits are still 1. */
3840         if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0)))
3841                 return -EINVAL;
3842
3843         /* Check must-be-0 bits are still 0. */
3844         if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32)))
3845                 return -EINVAL;
3846
3847         *lowp = data;
3848         *highp = data >> 32;
3849         return 0;
3850 }
3851
3852 static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data)
3853 {
3854         const u64 feature_and_reserved_bits =
3855                 /* feature */
3856                 BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | BIT_ULL(15) |
3857                 BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) |
3858                 /* reserved */
3859                 GENMASK_ULL(13, 9) | BIT_ULL(31);
3860         u64 vmx_misc;
3861
3862         vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
3863                                    vmx->nested.msrs.misc_high);
3864
3865         if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits))
3866                 return -EINVAL;
3867
3868         if ((vmx->nested.msrs.pinbased_ctls_high &
3869              PIN_BASED_VMX_PREEMPTION_TIMER) &&
3870             vmx_misc_preemption_timer_rate(data) !=
3871             vmx_misc_preemption_timer_rate(vmx_misc))
3872                 return -EINVAL;
3873
3874         if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc))
3875                 return -EINVAL;
3876
3877         if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc))
3878                 return -EINVAL;
3879
3880         if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc))
3881                 return -EINVAL;
3882
3883         vmx->nested.msrs.misc_low = data;
3884         vmx->nested.msrs.misc_high = data >> 32;
3885
3886         /*
3887          * If L1 has read-only VM-exit information fields, use the
3888          * less permissive vmx_vmwrite_bitmap to specify write
3889          * permissions for the shadow VMCS.
3890          */
3891         if (enable_shadow_vmcs && !nested_cpu_has_vmwrite_any_field(&vmx->vcpu))
3892                 vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
3893
3894         return 0;
3895 }
3896
3897 static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data)
3898 {
3899         u64 vmx_ept_vpid_cap;
3900
3901         vmx_ept_vpid_cap = vmx_control_msr(vmx->nested.msrs.ept_caps,
3902                                            vmx->nested.msrs.vpid_caps);
3903
3904         /* Every bit is either reserved or a feature bit. */
3905         if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL))
3906                 return -EINVAL;
3907
3908         vmx->nested.msrs.ept_caps = data;
3909         vmx->nested.msrs.vpid_caps = data >> 32;
3910         return 0;
3911 }
3912
3913 static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
3914 {
3915         u64 *msr;
3916
3917         switch (msr_index) {
3918         case MSR_IA32_VMX_CR0_FIXED0:
3919                 msr = &vmx->nested.msrs.cr0_fixed0;
3920                 break;
3921         case MSR_IA32_VMX_CR4_FIXED0:
3922                 msr = &vmx->nested.msrs.cr4_fixed0;
3923                 break;
3924         default:
3925                 BUG();
3926         }
3927
3928         /*
3929          * 1 bits (which indicates bits which "must-be-1" during VMX operation)
3930          * must be 1 in the restored value.
3931          */
3932         if (!is_bitwise_subset(data, *msr, -1ULL))
3933                 return -EINVAL;
3934
3935         *msr = data;
3936         return 0;
3937 }
3938
3939 /*
3940  * Called when userspace is restoring VMX MSRs.
3941  *
3942  * Returns 0 on success, non-0 otherwise.
3943  */
3944 static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
3945 {
3946         struct vcpu_vmx *vmx = to_vmx(vcpu);
3947
3948         /*
3949          * Don't allow changes to the VMX capability MSRs while the vCPU
3950          * is in VMX operation.
3951          */
3952         if (vmx->nested.vmxon)
3953                 return -EBUSY;
3954
3955         switch (msr_index) {
3956         case MSR_IA32_VMX_BASIC:
3957                 return vmx_restore_vmx_basic(vmx, data);
3958         case MSR_IA32_VMX_PINBASED_CTLS:
3959         case MSR_IA32_VMX_PROCBASED_CTLS:
3960         case MSR_IA32_VMX_EXIT_CTLS:
3961         case MSR_IA32_VMX_ENTRY_CTLS:
3962                 /*
3963                  * The "non-true" VMX capability MSRs are generated from the
3964                  * "true" MSRs, so we do not support restoring them directly.
3965                  *
3966                  * If userspace wants to emulate VMX_BASIC[55]=0, userspace
3967                  * should restore the "true" MSRs with the must-be-1 bits
3968                  * set according to the SDM Vol 3. A.2 "RESERVED CONTROLS AND
3969                  * DEFAULT SETTINGS".
3970                  */
3971                 return -EINVAL;
3972         case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
3973         case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
3974         case MSR_IA32_VMX_TRUE_EXIT_CTLS:
3975         case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
3976         case MSR_IA32_VMX_PROCBASED_CTLS2:
3977                 return vmx_restore_control_msr(vmx, msr_index, data);
3978         case MSR_IA32_VMX_MISC:
3979                 return vmx_restore_vmx_misc(vmx, data);
3980         case MSR_IA32_VMX_CR0_FIXED0:
3981         case MSR_IA32_VMX_CR4_FIXED0:
3982                 return vmx_restore_fixed0_msr(vmx, msr_index, data);
3983         case MSR_IA32_VMX_CR0_FIXED1:
3984         case MSR_IA32_VMX_CR4_FIXED1:
3985                 /*
3986                  * These MSRs are generated based on the vCPU's CPUID, so we
3987                  * do not support restoring them directly.
3988                  */
3989                 return -EINVAL;
3990         case MSR_IA32_VMX_EPT_VPID_CAP:
3991                 return vmx_restore_vmx_ept_vpid_cap(vmx, data);
3992         case MSR_IA32_VMX_VMCS_ENUM:
3993                 vmx->nested.msrs.vmcs_enum = data;
3994                 return 0;
3995         default:
3996                 /*
3997                  * The rest of the VMX capability MSRs do not support restore.
3998                  */
3999                 return -EINVAL;
4000         }
4001 }
4002
4003 /* Returns 0 on success, non-0 otherwise. */
4004 static int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata)
4005 {
4006         switch (msr_index) {
4007         case MSR_IA32_VMX_BASIC:
4008                 *pdata = msrs->basic;
4009                 break;
4010         case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
4011         case MSR_IA32_VMX_PINBASED_CTLS:
4012                 *pdata = vmx_control_msr(
4013                         msrs->pinbased_ctls_low,
4014                         msrs->pinbased_ctls_high);
4015                 if (msr_index == MSR_IA32_VMX_PINBASED_CTLS)
4016                         *pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
4017                 break;
4018         case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
4019         case MSR_IA32_VMX_PROCBASED_CTLS:
4020                 *pdata = vmx_control_msr(
4021                         msrs->procbased_ctls_low,
4022                         msrs->procbased_ctls_high);
4023                 if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS)
4024                         *pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
4025                 break;
4026         case MSR_IA32_VMX_TRUE_EXIT_CTLS:
4027         case MSR_IA32_VMX_EXIT_CTLS:
4028                 *pdata = vmx_control_msr(
4029                         msrs->exit_ctls_low,
4030                         msrs->exit_ctls_high);
4031                 if (msr_index == MSR_IA32_VMX_EXIT_CTLS)
4032                         *pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
4033                 break;
4034         case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
4035         case MSR_IA32_VMX_ENTRY_CTLS:
4036                 *pdata = vmx_control_msr(
4037                         msrs->entry_ctls_low,
4038                         msrs->entry_ctls_high);
4039                 if (msr_index == MSR_IA32_VMX_ENTRY_CTLS)
4040                         *pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
4041                 break;
4042         case MSR_IA32_VMX_MISC:
4043                 *pdata = vmx_control_msr(
4044                         msrs->misc_low,
4045                         msrs->misc_high);
4046                 break;
4047         case MSR_IA32_VMX_CR0_FIXED0:
4048                 *pdata = msrs->cr0_fixed0;
4049                 break;
4050         case MSR_IA32_VMX_CR0_FIXED1:
4051                 *pdata = msrs->cr0_fixed1;
4052                 break;
4053         case MSR_IA32_VMX_CR4_FIXED0:
4054                 *pdata = msrs->cr4_fixed0;
4055                 break;
4056         case MSR_IA32_VMX_CR4_FIXED1:
4057                 *pdata = msrs->cr4_fixed1;
4058                 break;
4059         case MSR_IA32_VMX_VMCS_ENUM:
4060                 *pdata = msrs->vmcs_enum;
4061                 break;
4062         case MSR_IA32_VMX_PROCBASED_CTLS2:
4063                 *pdata = vmx_control_msr(
4064                         msrs->secondary_ctls_low,
4065                         msrs->secondary_ctls_high);
4066                 break;
4067         case MSR_IA32_VMX_EPT_VPID_CAP:
4068                 *pdata = msrs->ept_caps |
4069                         ((u64)msrs->vpid_caps << 32);
4070                 break;
4071         case MSR_IA32_VMX_VMFUNC:
4072                 *pdata = msrs->vmfunc_controls;
4073                 break;
4074         default:
4075                 return 1;
4076         }
4077
4078         return 0;
4079 }
4080
4081 static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu,
4082                                                  uint64_t val)
4083 {
4084         uint64_t valid_bits = to_vmx(vcpu)->msr_ia32_feature_control_valid_bits;
4085
4086         return !(val & ~valid_bits);
4087 }
4088
4089 static int vmx_get_msr_feature(struct kvm_msr_entry *msr)
4090 {
4091         switch (msr->index) {
4092         case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
4093                 if (!nested)
4094                         return 1;
4095                 return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data);
4096         default:
4097                 return 1;
4098         }
4099
4100         return 0;
4101 }
4102
4103 /*
4104  * Reads an msr value (of 'msr_index') into 'pdata'.
4105  * Returns 0 on success, non-0 otherwise.
4106  * Assumes vcpu_load() was already called.
4107  */
4108 static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
4109 {
4110         struct vcpu_vmx *vmx = to_vmx(vcpu);
4111         struct shared_msr_entry *msr;
4112
4113         switch (msr_info->index) {
4114 #ifdef CONFIG_X86_64
4115         case MSR_FS_BASE:
4116                 msr_info->data = vmcs_readl(GUEST_FS_BASE);
4117                 break;
4118         case MSR_GS_BASE:
4119                 msr_info->data = vmcs_readl(GUEST_GS_BASE);
4120                 break;
4121         case MSR_KERNEL_GS_BASE:
4122                 msr_info->data = vmx_read_guest_kernel_gs_base(vmx);
4123                 break;
4124 #endif
4125         case MSR_EFER:
4126                 return kvm_get_msr_common(vcpu, msr_info);
4127         case MSR_IA32_SPEC_CTRL:
4128                 if (!msr_info->host_initiated &&
4129                     !guest_has_spec_ctrl_msr(vcpu))
4130                         return 1;
4131
4132                 msr_info->data = to_vmx(vcpu)->spec_ctrl;
4133                 break;
4134         case MSR_IA32_SYSENTER_CS:
4135                 msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
4136                 break;
4137         case MSR_IA32_SYSENTER_EIP:
4138                 msr_info->data = vmcs_readl(GUEST_SYSENTER_EIP);
4139                 break;
4140         case MSR_IA32_SYSENTER_ESP:
4141                 msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP);
4142                 break;
4143         case MSR_IA32_BNDCFGS:
4144                 if (!kvm_mpx_supported() ||
4145                     (!msr_info->host_initiated &&
4146                      !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
4147                         return 1;
4148                 msr_info->data = vmcs_read64(GUEST_BNDCFGS);
4149                 break;
4150         case MSR_IA32_MCG_EXT_CTL:
4151                 if (!msr_info->host_initiated &&
4152                     !(vmx->msr_ia32_feature_control &
4153                       FEATURE_CONTROL_LMCE))
4154                         return 1;
4155                 msr_info->data = vcpu->arch.mcg_ext_ctl;
4156                 break;
4157         case MSR_IA32_FEATURE_CONTROL:
4158                 msr_info->data = vmx->msr_ia32_feature_control;
4159                 break;
4160         case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
4161                 if (!nested_vmx_allowed(vcpu))
4162                         return 1;
4163                 return vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index,
4164                                        &msr_info->data);
4165         case MSR_IA32_XSS:
4166                 if (!vmx_xsaves_supported() ||
4167                     (!msr_info->host_initiated &&
4168                      !(guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
4169                        guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))))
4170                         return 1;
4171                 msr_info->data = vcpu->arch.ia32_xss;
4172                 break;
4173         case MSR_TSC_AUX:
4174                 if (!msr_info->host_initiated &&
4175                     !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
4176                         return 1;
4177                 /* Otherwise falls through */
4178         default:
4179                 msr = find_msr_entry(vmx, msr_info->index);
4180                 if (msr) {
4181                         msr_info->data = msr->data;
4182                         break;
4183                 }
4184                 return kvm_get_msr_common(vcpu, msr_info);
4185         }
4186
4187         return 0;
4188 }
4189
4190 static void vmx_leave_nested(struct kvm_vcpu *vcpu);
4191
4192 /*
4193  * Writes msr value into into the appropriate "register".
4194  * Returns 0 on success, non-0 otherwise.
4195  * Assumes vcpu_load() was already called.
4196  */
4197 static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
4198 {
4199         struct vcpu_vmx *vmx = to_vmx(vcpu);
4200         struct shared_msr_entry *msr;
4201         int ret = 0;
4202         u32 msr_index = msr_info->index;
4203         u64 data = msr_info->data;
4204
4205         switch (msr_index) {
4206         case MSR_EFER:
4207                 ret = kvm_set_msr_common(vcpu, msr_info);
4208                 break;
4209 #ifdef CONFIG_X86_64
4210         case MSR_FS_BASE:
4211                 vmx_segment_cache_clear(vmx);
4212                 vmcs_writel(GUEST_FS_BASE, data);
4213                 break;
4214         case MSR_GS_BASE:
4215                 vmx_segment_cache_clear(vmx);
4216                 vmcs_writel(GUEST_GS_BASE, data);
4217                 break;
4218         case MSR_KERNEL_GS_BASE:
4219                 vmx_write_guest_kernel_gs_base(vmx, data);
4220                 break;
4221 #endif
4222         case MSR_IA32_SYSENTER_CS:
4223                 vmcs_write32(GUEST_SYSENTER_CS, data);
4224                 break;
4225         case MSR_IA32_SYSENTER_EIP:
4226                 vmcs_writel(GUEST_SYSENTER_EIP, data);
4227                 break;
4228         case MSR_IA32_SYSENTER_ESP:
4229                 vmcs_writel(GUEST_SYSENTER_ESP, data);
4230                 break;
4231         case MSR_IA32_BNDCFGS:
4232                 if (!kvm_mpx_supported() ||
4233                     (!msr_info->host_initiated &&
4234                      !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
4235                         return 1;
4236                 if (is_noncanonical_address(data & PAGE_MASK, vcpu) ||
4237                     (data & MSR_IA32_BNDCFGS_RSVD))
4238                         return 1;
4239                 vmcs_write64(GUEST_BNDCFGS, data);
4240                 break;
4241         case MSR_IA32_SPEC_CTRL:
4242                 if (!msr_info->host_initiated &&
4243                     !guest_has_spec_ctrl_msr(vcpu))
4244                         return 1;
4245
4246                 /* The STIBP bit doesn't fault even if it's not advertised */
4247                 if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD))
4248                         return 1;
4249
4250                 vmx->spec_ctrl = data;
4251
4252                 if (!data)
4253                         break;
4254
4255                 /*
4256                  * For non-nested:
4257                  * When it's written (to non-zero) for the first time, pass
4258                  * it through.
4259                  *
4260                  * For nested:
4261                  * The handling of the MSR bitmap for L2 guests is done in
4262                  * nested_vmx_merge_msr_bitmap. We should not touch the
4263                  * vmcs02.msr_bitmap here since it gets completely overwritten
4264                  * in the merging. We update the vmcs01 here for L1 as well
4265                  * since it will end up touching the MSR anyway now.
4266                  */
4267                 vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap,
4268                                               MSR_IA32_SPEC_CTRL,
4269                                               MSR_TYPE_RW);
4270                 break;
4271         case MSR_IA32_PRED_CMD:
4272                 if (!msr_info->host_initiated &&
4273                     !guest_has_pred_cmd_msr(vcpu))
4274                         return 1;
4275
4276                 if (data & ~PRED_CMD_IBPB)
4277                         return 1;
4278
4279                 if (!data)
4280                         break;
4281
4282                 wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
4283
4284                 /*
4285                  * For non-nested:
4286                  * When it's written (to non-zero) for the first time, pass
4287                  * it through.
4288                  *
4289                  * For nested:
4290                  * The handling of the MSR bitmap for L2 guests is done in
4291                  * nested_vmx_merge_msr_bitmap. We should not touch the
4292                  * vmcs02.msr_bitmap here since it gets completely overwritten
4293                  * in the merging.
4294                  */
4295                 vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, MSR_IA32_PRED_CMD,
4296                                               MSR_TYPE_W);
4297                 break;
4298         case MSR_IA32_CR_PAT:
4299                 if (!kvm_pat_valid(data))
4300                         return 1;
4301
4302                 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
4303                         vmcs_write64(GUEST_IA32_PAT, data);
4304                         vcpu->arch.pat = data;
4305                         break;
4306                 }
4307                 ret = kvm_set_msr_common(vcpu, msr_info);
4308                 break;
4309         case MSR_IA32_TSC_ADJUST:
4310                 ret = kvm_set_msr_common(vcpu, msr_info);
4311                 break;
4312         case MSR_IA32_MCG_EXT_CTL:
4313                 if ((!msr_info->host_initiated &&
4314                      !(to_vmx(vcpu)->msr_ia32_feature_control &
4315                        FEATURE_CONTROL_LMCE)) ||
4316                     (data & ~MCG_EXT_CTL_LMCE_EN))
4317                         return 1;
4318                 vcpu->arch.mcg_ext_ctl = data;
4319                 break;
4320         case MSR_IA32_FEATURE_CONTROL:
4321                 if (!vmx_feature_control_msr_valid(vcpu, data) ||
4322                     (to_vmx(vcpu)->msr_ia32_feature_control &
4323                      FEATURE_CONTROL_LOCKED && !msr_info->host_initiated))
4324                         return 1;
4325                 vmx->msr_ia32_feature_control = data;
4326                 if (msr_info->host_initiated && data == 0)
4327                         vmx_leave_nested(vcpu);
4328                 break;
4329         case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
4330                 if (!msr_info->host_initiated)
4331                         return 1; /* they are read-only */
4332                 if (!nested_vmx_allowed(vcpu))
4333                         return 1;
4334                 return vmx_set_vmx_msr(vcpu, msr_index, data);
4335         case MSR_IA32_XSS:
4336                 if (!vmx_xsaves_supported() ||
4337                     (!msr_info->host_initiated &&
4338                      !(guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
4339                        guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))))
4340                         return 1;
4341                 /*
4342                  * The only supported bit as of Skylake is bit 8, but
4343                  * it is not supported on KVM.
4344                  */
4345                 if (data != 0)
4346                         return 1;
4347                 vcpu->arch.ia32_xss = data;
4348                 if (vcpu->arch.ia32_xss != host_xss)
4349                         add_atomic_switch_msr(vmx, MSR_IA32_XSS,
4350                                 vcpu->arch.ia32_xss, host_xss, false);
4351                 else
4352                         clear_atomic_switch_msr(vmx, MSR_IA32_XSS);
4353                 break;
4354         case MSR_TSC_AUX:
4355                 if (!msr_info->host_initiated &&
4356                     !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
4357                         return 1;
4358                 /* Check reserved bit, higher 32 bits should be zero */
4359                 if ((data >> 32) != 0)
4360                         return 1;
4361                 /* Otherwise falls through */
4362         default:
4363                 msr = find_msr_entry(vmx, msr_index);
4364                 if (msr) {
4365                         u64 old_msr_data = msr->data;
4366                         msr->data = data;
4367                         if (msr - vmx->guest_msrs < vmx->save_nmsrs) {
4368                                 preempt_disable();
4369                                 ret = kvm_set_shared_msr(msr->index, msr->data,
4370                                                          msr->mask);
4371                                 preempt_enable();
4372                                 if (ret)
4373                                         msr->data = old_msr_data;
4374                         }
4375                         break;
4376                 }
4377                         ret = kvm_set_msr_common(vcpu, msr_info);
4378         }
4379
4380         /* FB_CLEAR may have changed, also update the FB_CLEAR_DIS behavior */
4381         if (msr_index == MSR_IA32_ARCH_CAPABILITIES)
4382                 vmx_update_fb_clear_dis(vcpu, vmx);
4383
4384         return ret;
4385 }
4386
4387 static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
4388 {
4389         __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
4390         switch (reg) {
4391         case VCPU_REGS_RSP:
4392                 vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
4393                 break;
4394         case VCPU_REGS_RIP:
4395                 vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP);
4396                 break;
4397         case VCPU_EXREG_PDPTR:
4398                 if (enable_ept)
4399                         ept_save_pdptrs(vcpu);
4400                 break;
4401         default:
4402                 break;
4403         }
4404 }
4405
4406 static __init int cpu_has_kvm_support(void)
4407 {
4408         return cpu_has_vmx();
4409 }
4410
4411 static __init int vmx_disabled_by_bios(void)
4412 {
4413         u64 msr;
4414
4415         rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
4416         if (msr & FEATURE_CONTROL_LOCKED) {
4417                 /* launched w/ TXT and VMX disabled */
4418                 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
4419                         && tboot_enabled())
4420                         return 1;
4421                 /* launched w/o TXT and VMX only enabled w/ TXT */
4422                 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
4423                         && (msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
4424                         && !tboot_enabled()) {
4425                         printk(KERN_WARNING "kvm: disable TXT in the BIOS or "
4426                                 "activate TXT before enabling KVM\n");
4427                         return 1;
4428                 }
4429                 /* launched w/o TXT and VMX disabled */
4430                 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
4431                         && !tboot_enabled())
4432                         return 1;
4433         }
4434
4435         return 0;
4436 }
4437
4438 static void kvm_cpu_vmxon(u64 addr)
4439 {
4440         cr4_set_bits(X86_CR4_VMXE);
4441         intel_pt_handle_vmx(1);
4442
4443         asm volatile (ASM_VMX_VMXON_RAX
4444                         : : "a"(&addr), "m"(addr)
4445                         : "memory", "cc");
4446 }
4447
4448 static int hardware_enable(void)
4449 {
4450         int cpu = raw_smp_processor_id();
4451         u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
4452         u64 old, test_bits;
4453
4454         if (cr4_read_shadow() & X86_CR4_VMXE)
4455                 return -EBUSY;
4456
4457         /*
4458          * This can happen if we hot-added a CPU but failed to allocate
4459          * VP assist page for it.
4460          */
4461         if (static_branch_unlikely(&enable_evmcs) &&
4462             !hv_get_vp_assist_page(cpu))
4463                 return -EFAULT;
4464
4465         rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
4466
4467         test_bits = FEATURE_CONTROL_LOCKED;
4468         test_bits |= FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
4469         if (tboot_enabled())
4470                 test_bits |= FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX;
4471
4472         if ((old & test_bits) != test_bits) {
4473                 /* enable and lock */
4474                 wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits);
4475         }
4476         kvm_cpu_vmxon(phys_addr);
4477         if (enable_ept)
4478                 ept_sync_global();
4479
4480         return 0;
4481 }
4482
4483 static void vmclear_local_loaded_vmcss(void)
4484 {
4485         int cpu = raw_smp_processor_id();
4486         struct loaded_vmcs *v, *n;
4487
4488         list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu),
4489                                  loaded_vmcss_on_cpu_link)
4490                 __loaded_vmcs_clear(v);
4491 }
4492
4493
4494 /* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot()
4495  * tricks.
4496  */
4497 static void kvm_cpu_vmxoff(void)
4498 {
4499         asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc");
4500
4501         intel_pt_handle_vmx(0);
4502         cr4_clear_bits(X86_CR4_VMXE);
4503 }
4504
4505 static void hardware_disable(void)
4506 {
4507         vmclear_local_loaded_vmcss();
4508         kvm_cpu_vmxoff();
4509 }
4510
4511 static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
4512                                       u32 msr, u32 *result)
4513 {
4514         u32 vmx_msr_low, vmx_msr_high;
4515         u32 ctl = ctl_min | ctl_opt;
4516
4517         rdmsr(msr, vmx_msr_low, vmx_msr_high);
4518
4519         ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
4520         ctl |= vmx_msr_low;  /* bit == 1 in low word  ==> must be one  */
4521
4522         /* Ensure minimum (required) set of control bits are supported. */
4523         if (ctl_min & ~ctl)
4524                 return -EIO;
4525
4526         *result = ctl;
4527         return 0;
4528 }
4529
4530 static __init bool allow_1_setting(u32 msr, u32 ctl)
4531 {
4532         u32 vmx_msr_low, vmx_msr_high;
4533
4534         rdmsr(msr, vmx_msr_low, vmx_msr_high);
4535         return vmx_msr_high & ctl;
4536 }
4537
4538 static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
4539 {
4540         u32 vmx_msr_low, vmx_msr_high;
4541         u32 min, opt, min2, opt2;
4542         u32 _pin_based_exec_control = 0;
4543         u32 _cpu_based_exec_control = 0;
4544         u32 _cpu_based_2nd_exec_control = 0;
4545         u32 _vmexit_control = 0;
4546         u32 _vmentry_control = 0;
4547
4548         memset(vmcs_conf, 0, sizeof(*vmcs_conf));
4549         min = CPU_BASED_HLT_EXITING |
4550 #ifdef CONFIG_X86_64
4551               CPU_BASED_CR8_LOAD_EXITING |
4552               CPU_BASED_CR8_STORE_EXITING |
4553 #endif
4554               CPU_BASED_CR3_LOAD_EXITING |
4555               CPU_BASED_CR3_STORE_EXITING |
4556               CPU_BASED_UNCOND_IO_EXITING |
4557               CPU_BASED_MOV_DR_EXITING |
4558               CPU_BASED_USE_TSC_OFFSETING |
4559               CPU_BASED_MWAIT_EXITING |
4560               CPU_BASED_MONITOR_EXITING |
4561               CPU_BASED_INVLPG_EXITING |
4562               CPU_BASED_RDPMC_EXITING;
4563
4564         opt = CPU_BASED_TPR_SHADOW |
4565               CPU_BASED_USE_MSR_BITMAPS |
4566               CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
4567         if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
4568                                 &_cpu_based_exec_control) < 0)
4569                 return -EIO;
4570 #ifdef CONFIG_X86_64
4571         if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
4572                 _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING &
4573                                            ~CPU_BASED_CR8_STORE_EXITING;
4574 #endif
4575         if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
4576                 min2 = 0;
4577                 opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
4578                         SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
4579                         SECONDARY_EXEC_WBINVD_EXITING |
4580                         SECONDARY_EXEC_ENABLE_VPID |
4581                         SECONDARY_EXEC_ENABLE_EPT |
4582                         SECONDARY_EXEC_UNRESTRICTED_GUEST |
4583                         SECONDARY_EXEC_PAUSE_LOOP_EXITING |
4584                         SECONDARY_EXEC_DESC |
4585                         SECONDARY_EXEC_RDTSCP |
4586                         SECONDARY_EXEC_ENABLE_INVPCID |
4587                         SECONDARY_EXEC_APIC_REGISTER_VIRT |
4588                         SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
4589                         SECONDARY_EXEC_SHADOW_VMCS |
4590                         SECONDARY_EXEC_XSAVES |
4591                         SECONDARY_EXEC_RDSEED_EXITING |
4592                         SECONDARY_EXEC_RDRAND_EXITING |
4593                         SECONDARY_EXEC_ENABLE_PML |
4594                         SECONDARY_EXEC_TSC_SCALING |
4595                         SECONDARY_EXEC_ENABLE_VMFUNC |
4596                         SECONDARY_EXEC_ENCLS_EXITING;
4597                 if (adjust_vmx_controls(min2, opt2,
4598                                         MSR_IA32_VMX_PROCBASED_CTLS2,
4599                                         &_cpu_based_2nd_exec_control) < 0)
4600                         return -EIO;
4601         }
4602 #ifndef CONFIG_X86_64
4603         if (!(_cpu_based_2nd_exec_control &
4604                                 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
4605                 _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
4606 #endif
4607
4608         if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
4609                 _cpu_based_2nd_exec_control &= ~(
4610                                 SECONDARY_EXEC_APIC_REGISTER_VIRT |
4611                                 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
4612                                 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
4613
4614         rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP,
4615                 &vmx_capability.ept, &vmx_capability.vpid);
4616
4617         if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
4618                 /* CR3 accesses and invlpg don't need to cause VM Exits when EPT
4619                    enabled */
4620                 _cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING |
4621                                              CPU_BASED_CR3_STORE_EXITING |
4622                                              CPU_BASED_INVLPG_EXITING);
4623         } else if (vmx_capability.ept) {
4624                 vmx_capability.ept = 0;
4625                 pr_warn_once("EPT CAP should not exist if not support "
4626                                 "1-setting enable EPT VM-execution control\n");
4627         }
4628         if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) &&
4629                 vmx_capability.vpid) {
4630                 vmx_capability.vpid = 0;
4631                 pr_warn_once("VPID CAP should not exist if not support "
4632                                 "1-setting enable VPID VM-execution control\n");
4633         }
4634
4635         min = VM_EXIT_SAVE_DEBUG_CONTROLS | VM_EXIT_ACK_INTR_ON_EXIT;
4636 #ifdef CONFIG_X86_64
4637         min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
4638 #endif
4639         opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT |
4640                 VM_EXIT_CLEAR_BNDCFGS;
4641         if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
4642                                 &_vmexit_control) < 0)
4643                 return -EIO;
4644
4645         min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
4646         opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR |
4647                  PIN_BASED_VMX_PREEMPTION_TIMER;
4648         if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
4649                                 &_pin_based_exec_control) < 0)
4650                 return -EIO;
4651
4652         if (cpu_has_broken_vmx_preemption_timer())
4653                 _pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
4654         if (!(_cpu_based_2nd_exec_control &
4655                 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY))
4656                 _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR;
4657
4658         min = VM_ENTRY_LOAD_DEBUG_CONTROLS;
4659         opt = VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS;
4660         if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
4661                                 &_vmentry_control) < 0)
4662                 return -EIO;
4663
4664         rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);
4665
4666         /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
4667         if ((vmx_msr_high & 0x1fff) > PAGE_SIZE)
4668                 return -EIO;
4669
4670 #ifdef CONFIG_X86_64
4671         /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
4672         if (vmx_msr_high & (1u<<16))
4673                 return -EIO;
4674 #endif
4675
4676         /* Require Write-Back (WB) memory type for VMCS accesses. */
4677         if (((vmx_msr_high >> 18) & 15) != 6)
4678                 return -EIO;
4679
4680         vmcs_conf->size = vmx_msr_high & 0x1fff;
4681         vmcs_conf->order = get_order(vmcs_conf->size);
4682         vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff;
4683
4684         vmcs_conf->revision_id = vmx_msr_low;
4685
4686         vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
4687         vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
4688         vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
4689         vmcs_conf->vmexit_ctrl         = _vmexit_control;
4690         vmcs_conf->vmentry_ctrl        = _vmentry_control;
4691
4692         if (static_branch_unlikely(&enable_evmcs))
4693                 evmcs_sanitize_exec_ctrls(vmcs_conf);
4694
4695         cpu_has_load_ia32_efer =
4696                 allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS,
4697                                 VM_ENTRY_LOAD_IA32_EFER)
4698                 && allow_1_setting(MSR_IA32_VMX_EXIT_CTLS,
4699                                    VM_EXIT_LOAD_IA32_EFER);
4700
4701         cpu_has_load_perf_global_ctrl =
4702                 allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS,
4703                                 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
4704                 && allow_1_setting(MSR_IA32_VMX_EXIT_CTLS,
4705                                    VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
4706
4707         /*
4708          * Some cpus support VM_ENTRY_(LOAD|SAVE)_IA32_PERF_GLOBAL_CTRL
4709          * but due to errata below it can't be used. Workaround is to use
4710          * msr load mechanism to switch IA32_PERF_GLOBAL_CTRL.
4711          *
4712          * VM Exit May Incorrectly Clear IA32_PERF_GLOBAL_CTRL [34:32]
4713          *
4714          * AAK155             (model 26)
4715          * AAP115             (model 30)
4716          * AAT100             (model 37)
4717          * BC86,AAY89,BD102   (model 44)
4718          * BA97               (model 46)
4719          *
4720          */
4721         if (cpu_has_load_perf_global_ctrl && boot_cpu_data.x86 == 0x6) {
4722                 switch (boot_cpu_data.x86_model) {
4723                 case 26:
4724                 case 30:
4725                 case 37:
4726                 case 44:
4727                 case 46:
4728                         cpu_has_load_perf_global_ctrl = false;
4729                         printk_once(KERN_WARNING"kvm: VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL "
4730                                         "does not work properly. Using workaround\n");
4731                         break;
4732                 default:
4733                         break;
4734                 }
4735         }
4736
4737         if (boot_cpu_has(X86_FEATURE_XSAVES))
4738                 rdmsrl(MSR_IA32_XSS, host_xss);
4739
4740         return 0;
4741 }
4742
4743 static struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu)
4744 {
4745         int node = cpu_to_node(cpu);
4746         struct page *pages;
4747         struct vmcs *vmcs;
4748
4749         pages = __alloc_pages_node(node, GFP_KERNEL, vmcs_config.order);
4750         if (!pages)
4751                 return NULL;
4752         vmcs = page_address(pages);
4753         memset(vmcs, 0, vmcs_config.size);
4754
4755         /* KVM supports Enlightened VMCS v1 only */
4756         if (static_branch_unlikely(&enable_evmcs))
4757                 vmcs->hdr.revision_id = KVM_EVMCS_VERSION;
4758         else
4759                 vmcs->hdr.revision_id = vmcs_config.revision_id;
4760
4761         if (shadow)
4762                 vmcs->hdr.shadow_vmcs = 1;
4763         return vmcs;
4764 }
4765
4766 static void free_vmcs(struct vmcs *vmcs)
4767 {
4768         free_pages((unsigned long)vmcs, vmcs_config.order);
4769 }
4770
4771 /*
4772  * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded
4773  */
4774 static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
4775 {
4776         if (!loaded_vmcs->vmcs)
4777                 return;
4778         loaded_vmcs_clear(loaded_vmcs);
4779         free_vmcs(loaded_vmcs->vmcs);
4780         loaded_vmcs->vmcs = NULL;
4781         if (loaded_vmcs->msr_bitmap)
4782                 free_page((unsigned long)loaded_vmcs->msr_bitmap);
4783         WARN_ON(loaded_vmcs->shadow_vmcs != NULL);
4784 }
4785
4786 static struct vmcs *alloc_vmcs(bool shadow)
4787 {
4788         return alloc_vmcs_cpu(shadow, raw_smp_processor_id());
4789 }
4790
4791 static int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
4792 {
4793         loaded_vmcs->vmcs = alloc_vmcs(false);
4794         if (!loaded_vmcs->vmcs)
4795                 return -ENOMEM;
4796
4797         loaded_vmcs->shadow_vmcs = NULL;
4798         loaded_vmcs_init(loaded_vmcs);
4799
4800         if (cpu_has_vmx_msr_bitmap()) {
4801                 loaded_vmcs->msr_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
4802                 if (!loaded_vmcs->msr_bitmap)
4803                         goto out_vmcs;
4804                 memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE);
4805
4806                 if (IS_ENABLED(CONFIG_HYPERV) &&
4807                     static_branch_unlikely(&enable_evmcs) &&
4808                     (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) {
4809                         struct hv_enlightened_vmcs *evmcs =
4810                                 (struct hv_enlightened_vmcs *)loaded_vmcs->vmcs;
4811
4812                         evmcs->hv_enlightenments_control.msr_bitmap = 1;
4813                 }
4814         }
4815
4816         memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state));
4817
4818         return 0;
4819
4820 out_vmcs:
4821         free_loaded_vmcs(loaded_vmcs);
4822         return -ENOMEM;
4823 }
4824
4825 static void free_kvm_area(void)
4826 {
4827         int cpu;
4828
4829         for_each_possible_cpu(cpu) {
4830                 free_vmcs(per_cpu(vmxarea, cpu));
4831                 per_cpu(vmxarea, cpu) = NULL;
4832         }
4833 }
4834
4835 enum vmcs_field_width {
4836         VMCS_FIELD_WIDTH_U16 = 0,
4837         VMCS_FIELD_WIDTH_U64 = 1,
4838         VMCS_FIELD_WIDTH_U32 = 2,
4839         VMCS_FIELD_WIDTH_NATURAL_WIDTH = 3
4840 };
4841
4842 static inline int vmcs_field_width(unsigned long field)
4843 {
4844         if (0x1 & field)        /* the *_HIGH fields are all 32 bit */
4845                 return VMCS_FIELD_WIDTH_U32;
4846         return (field >> 13) & 0x3 ;
4847 }
4848
4849 static inline int vmcs_field_readonly(unsigned long field)
4850 {
4851         return (((field >> 10) & 0x3) == 1);
4852 }
4853
4854 static void init_vmcs_shadow_fields(void)
4855 {
4856         int i, j;
4857
4858         for (i = j = 0; i < max_shadow_read_only_fields; i++) {
4859                 u16 field = shadow_read_only_fields[i];
4860                 if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
4861                     (i + 1 == max_shadow_read_only_fields ||
4862                      shadow_read_only_fields[i + 1] != field + 1))
4863                         pr_err("Missing field from shadow_read_only_field %x\n",
4864                                field + 1);
4865
4866                 clear_bit(field, vmx_vmread_bitmap);
4867 #ifdef CONFIG_X86_64
4868                 if (field & 1)
4869                         continue;
4870 #endif
4871                 if (j < i)
4872                         shadow_read_only_fields[j] = field;
4873                 j++;
4874         }
4875         max_shadow_read_only_fields = j;
4876
4877         for (i = j = 0; i < max_shadow_read_write_fields; i++) {
4878                 u16 field = shadow_read_write_fields[i];
4879                 if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
4880                     (i + 1 == max_shadow_read_write_fields ||
4881                      shadow_read_write_fields[i + 1] != field + 1))
4882                         pr_err("Missing field from shadow_read_write_field %x\n",
4883                                field + 1);
4884
4885                 /*
4886                  * PML and the preemption timer can be emulated, but the
4887                  * processor cannot vmwrite to fields that don't exist
4888                  * on bare metal.
4889                  */
4890                 switch (field) {
4891                 case GUEST_PML_INDEX:
4892                         if (!cpu_has_vmx_pml())
4893                                 continue;
4894                         break;
4895                 case VMX_PREEMPTION_TIMER_VALUE:
4896                         if (!cpu_has_vmx_preemption_timer())
4897                                 continue;
4898                         break;
4899                 case GUEST_INTR_STATUS:
4900                         if (!cpu_has_vmx_apicv())
4901                                 continue;
4902                         break;
4903                 default:
4904                         break;
4905                 }
4906
4907                 clear_bit(field, vmx_vmwrite_bitmap);
4908                 clear_bit(field, vmx_vmread_bitmap);
4909 #ifdef CONFIG_X86_64
4910                 if (field & 1)
4911                         continue;
4912 #endif
4913                 if (j < i)
4914                         shadow_read_write_fields[j] = field;
4915                 j++;
4916         }
4917         max_shadow_read_write_fields = j;
4918 }
4919
4920 static __init int alloc_kvm_area(void)
4921 {
4922         int cpu;
4923
4924         for_each_possible_cpu(cpu) {
4925                 struct vmcs *vmcs;
4926
4927                 vmcs = alloc_vmcs_cpu(false, cpu);
4928                 if (!vmcs) {
4929                         free_kvm_area();
4930                         return -ENOMEM;
4931                 }
4932
4933                 /*
4934                  * When eVMCS is enabled, alloc_vmcs_cpu() sets
4935                  * vmcs->revision_id to KVM_EVMCS_VERSION instead of
4936                  * revision_id reported by MSR_IA32_VMX_BASIC.
4937                  *
4938                  * However, even though not explictly documented by
4939                  * TLFS, VMXArea passed as VMXON argument should
4940                  * still be marked with revision_id reported by
4941                  * physical CPU.
4942                  */
4943                 if (static_branch_unlikely(&enable_evmcs))
4944                         vmcs->hdr.revision_id = vmcs_config.revision_id;
4945
4946                 per_cpu(vmxarea, cpu) = vmcs;
4947         }
4948         return 0;
4949 }
4950
4951 static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg,
4952                 struct kvm_segment *save)
4953 {
4954         if (!emulate_invalid_guest_state) {
4955                 /*
4956                  * CS and SS RPL should be equal during guest entry according
4957                  * to VMX spec, but in reality it is not always so. Since vcpu
4958                  * is in the middle of the transition from real mode to
4959                  * protected mode it is safe to assume that RPL 0 is a good
4960                  * default value.
4961                  */
4962                 if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS)
4963                         save->selector &= ~SEGMENT_RPL_MASK;
4964                 save->dpl = save->selector & SEGMENT_RPL_MASK;
4965                 save->s = 1;
4966         }
4967         vmx_set_segment(vcpu, save, seg);
4968 }
4969
4970 static void enter_pmode(struct kvm_vcpu *vcpu)
4971 {
4972         unsigned long flags;
4973         struct vcpu_vmx *vmx = to_vmx(vcpu);
4974
4975         /*
4976          * Update real mode segment cache. It may be not up-to-date if sement
4977          * register was written while vcpu was in a guest mode.
4978          */
4979         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
4980         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
4981         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
4982         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
4983         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
4984         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
4985
4986         vmx->rmode.vm86_active = 0;
4987
4988         vmx_segment_cache_clear(vmx);
4989
4990         vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
4991
4992         flags = vmcs_readl(GUEST_RFLAGS);
4993         flags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
4994         flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
4995         vmcs_writel(GUEST_RFLAGS, flags);
4996
4997         vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
4998                         (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME));
4999
5000         update_exception_bitmap(vcpu);
5001
5002         fix_pmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
5003         fix_pmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
5004         fix_pmode_seg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
5005         fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
5006         fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
5007         fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
5008 }
5009
5010 static void fix_rmode_seg(int seg, struct kvm_segment *save)
5011 {
5012         const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
5013         struct kvm_segment var = *save;
5014
5015         var.dpl = 0x3;
5016         if (seg == VCPU_SREG_CS)
5017                 var.type = 0x3;
5018
5019         if (!emulate_invalid_guest_state) {
5020                 var.selector = var.base >> 4;
5021                 var.base = var.base & 0xffff0;
5022                 var.limit = 0xffff;
5023                 var.g = 0;
5024                 var.db = 0;
5025                 var.present = 1;
5026                 var.s = 1;
5027                 var.l = 0;
5028                 var.unusable = 0;
5029                 var.type = 0x3;
5030                 var.avl = 0;
5031                 if (save->base & 0xf)
5032                         printk_once(KERN_WARNING "kvm: segment base is not "
5033                                         "paragraph aligned when entering "
5034                                         "protected mode (seg=%d)", seg);
5035         }
5036
5037         vmcs_write16(sf->selector, var.selector);
5038         vmcs_writel(sf->base, var.base);
5039         vmcs_write32(sf->limit, var.limit);
5040         vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var));
5041 }
5042
5043 static void enter_rmode(struct kvm_vcpu *vcpu)
5044 {
5045         unsigned long flags;
5046         struct vcpu_vmx *vmx = to_vmx(vcpu);
5047         struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm);
5048
5049         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
5050         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
5051         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
5052         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
5053         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
5054         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
5055         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
5056
5057         vmx->rmode.vm86_active = 1;
5058
5059         /*
5060          * Very old userspace does not call KVM_SET_TSS_ADDR before entering
5061          * vcpu. Warn the user that an update is overdue.
5062          */
5063         if (!kvm_vmx->tss_addr)
5064                 printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR need to be "
5065                              "called before entering vcpu\n");
5066
5067         vmx_segment_cache_clear(vmx);
5068
5069         vmcs_writel(GUEST_TR_BASE, kvm_vmx->tss_addr);
5070         vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
5071         vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
5072
5073         flags = vmcs_readl(GUEST_RFLAGS);
5074         vmx->rmode.save_rflags = flags;
5075
5076         flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
5077
5078         vmcs_writel(GUEST_RFLAGS, flags);
5079         vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
5080         update_exception_bitmap(vcpu);
5081
5082         fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
5083         fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
5084         fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
5085         fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
5086         fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
5087         fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
5088
5089         kvm_mmu_reset_context(vcpu);
5090 }
5091
5092 static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
5093 {
5094         struct vcpu_vmx *vmx = to_vmx(vcpu);
5095         struct shared_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
5096
5097         if (!msr)
5098                 return;
5099
5100         vcpu->arch.efer = efer;
5101         if (efer & EFER_LMA) {
5102                 vm_entry_controls_setbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
5103                 msr->data = efer;
5104         } else {
5105                 vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
5106
5107                 msr->data = efer & ~EFER_LME;
5108         }
5109         setup_msrs(vmx);
5110 }
5111
5112 #ifdef CONFIG_X86_64
5113
5114 static void enter_lmode(struct kvm_vcpu *vcpu)
5115 {
5116         u32 guest_tr_ar;
5117
5118         vmx_segment_cache_clear(to_vmx(vcpu));
5119
5120         guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
5121         if ((guest_tr_ar & VMX_AR_TYPE_MASK) != VMX_AR_TYPE_BUSY_64_TSS) {
5122                 pr_debug_ratelimited("%s: tss fixup for long mode. \n",
5123                                      __func__);
5124                 vmcs_write32(GUEST_TR_AR_BYTES,
5125                              (guest_tr_ar & ~VMX_AR_TYPE_MASK)
5126                              | VMX_AR_TYPE_BUSY_64_TSS);
5127         }
5128         vmx_set_efer(vcpu, vcpu->arch.efer | EFER_LMA);
5129 }
5130
5131 static void exit_lmode(struct kvm_vcpu *vcpu)
5132 {
5133         vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
5134         vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA);
5135 }
5136
5137 #endif
5138
5139 static inline void __vmx_flush_tlb(struct kvm_vcpu *vcpu, int vpid,
5140                                 bool invalidate_gpa)
5141 {
5142         if (enable_ept && (invalidate_gpa || !enable_vpid)) {
5143                 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
5144                         return;
5145                 ept_sync_context(construct_eptp(vcpu, vcpu->arch.mmu.root_hpa));
5146         } else {
5147                 vpid_sync_context(vpid);
5148         }
5149 }
5150
5151 static void vmx_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
5152 {
5153         __vmx_flush_tlb(vcpu, to_vmx(vcpu)->vpid, invalidate_gpa);
5154 }
5155
5156 static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
5157 {
5158         int vpid = to_vmx(vcpu)->vpid;
5159
5160         if (!vpid_sync_vcpu_addr(vpid, addr))
5161                 vpid_sync_context(vpid);
5162
5163         /*
5164          * If VPIDs are not supported or enabled, then the above is a no-op.
5165          * But we don't really need a TLB flush in that case anyway, because
5166          * each VM entry/exit includes an implicit flush when VPID is 0.
5167          */
5168 }
5169
5170 static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
5171 {
5172         ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
5173
5174         vcpu->arch.cr0 &= ~cr0_guest_owned_bits;
5175         vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits;
5176 }
5177
5178 static void vmx_decache_cr3(struct kvm_vcpu *vcpu)
5179 {
5180         if (enable_unrestricted_guest || (enable_ept && is_paging(vcpu)))
5181                 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
5182         __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
5183 }
5184
5185 static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
5186 {
5187         ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;
5188
5189         vcpu->arch.cr4 &= ~cr4_guest_owned_bits;
5190         vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & cr4_guest_owned_bits;
5191 }
5192
5193 static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
5194 {
5195         struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
5196
5197         if (!test_bit(VCPU_EXREG_PDPTR,
5198                       (unsigned long *)&vcpu->arch.regs_dirty))
5199                 return;
5200
5201         if (is_pae_paging(vcpu)) {
5202                 vmcs_write64(GUEST_PDPTR0, mmu->pdptrs[0]);
5203                 vmcs_write64(GUEST_PDPTR1, mmu->pdptrs[1]);
5204                 vmcs_write64(GUEST_PDPTR2, mmu->pdptrs[2]);
5205                 vmcs_write64(GUEST_PDPTR3, mmu->pdptrs[3]);
5206         }
5207 }
5208
5209 static void ept_save_pdptrs(struct kvm_vcpu *vcpu)
5210 {
5211         struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
5212
5213         if (is_pae_paging(vcpu)) {
5214                 mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0);
5215                 mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1);
5216                 mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
5217                 mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
5218         }
5219
5220         __set_bit(VCPU_EXREG_PDPTR,
5221                   (unsigned long *)&vcpu->arch.regs_avail);
5222         __set_bit(VCPU_EXREG_PDPTR,
5223                   (unsigned long *)&vcpu->arch.regs_dirty);
5224 }
5225
5226 static bool nested_guest_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val)
5227 {
5228         u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr0_fixed0;
5229         u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr0_fixed1;
5230         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
5231
5232         if (to_vmx(vcpu)->nested.msrs.secondary_ctls_high &
5233                 SECONDARY_EXEC_UNRESTRICTED_GUEST &&
5234             nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST))
5235                 fixed0 &= ~(X86_CR0_PE | X86_CR0_PG);
5236
5237         return fixed_bits_valid(val, fixed0, fixed1);
5238 }
5239
5240 static bool nested_host_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val)
5241 {
5242         u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr0_fixed0;
5243         u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr0_fixed1;
5244
5245         return fixed_bits_valid(val, fixed0, fixed1);
5246 }
5247
5248 static bool nested_cr4_valid(struct kvm_vcpu *vcpu, unsigned long val)
5249 {
5250         u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr4_fixed0;
5251         u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr4_fixed1;
5252
5253         return fixed_bits_valid(val, fixed0, fixed1);
5254 }
5255
5256 /* No difference in the restrictions on guest and host CR4 in VMX operation. */
5257 #define nested_guest_cr4_valid  nested_cr4_valid
5258 #define nested_host_cr4_valid   nested_cr4_valid
5259
5260 static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
5261
5262 static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
5263                                         unsigned long cr0,
5264                                         struct kvm_vcpu *vcpu)
5265 {
5266         if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
5267                 vmx_decache_cr3(vcpu);
5268         if (!(cr0 & X86_CR0_PG)) {
5269                 /* From paging/starting to nonpaging */
5270                 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
5271                              vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) |
5272                              (CPU_BASED_CR3_LOAD_EXITING |
5273                               CPU_BASED_CR3_STORE_EXITING));
5274                 vcpu->arch.cr0 = cr0;
5275                 vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
5276         } else if (!is_paging(vcpu)) {
5277                 /* From nonpaging to paging */
5278                 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
5279                              vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) &
5280                              ~(CPU_BASED_CR3_LOAD_EXITING |
5281                                CPU_BASED_CR3_STORE_EXITING));
5282                 vcpu->arch.cr0 = cr0;
5283                 vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
5284         }
5285
5286         if (!(cr0 & X86_CR0_WP))
5287                 *hw_cr0 &= ~X86_CR0_WP;
5288 }
5289
5290 static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
5291 {
5292         struct vcpu_vmx *vmx = to_vmx(vcpu);
5293         unsigned long hw_cr0;
5294
5295         hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK);
5296         if (enable_unrestricted_guest)
5297                 hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
5298         else {
5299                 hw_cr0 |= KVM_VM_CR0_ALWAYS_ON;
5300
5301                 if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE))
5302                         enter_pmode(vcpu);
5303
5304                 if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE))
5305                         enter_rmode(vcpu);
5306         }
5307
5308 #ifdef CONFIG_X86_64
5309         if (vcpu->arch.efer & EFER_LME) {
5310                 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG))
5311                         enter_lmode(vcpu);
5312                 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG))
5313                         exit_lmode(vcpu);
5314         }
5315 #endif
5316
5317         if (enable_ept && !enable_unrestricted_guest)
5318                 ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu);
5319
5320         vmcs_writel(CR0_READ_SHADOW, cr0);
5321         vmcs_writel(GUEST_CR0, hw_cr0);
5322         vcpu->arch.cr0 = cr0;
5323
5324         /* depends on vcpu->arch.cr0 to be set to a new value */
5325         vmx->emulation_required = emulation_required(vcpu);
5326 }
5327
5328 static int get_ept_level(struct kvm_vcpu *vcpu)
5329 {
5330         /* Nested EPT currently only supports 4-level walks. */
5331         if (is_guest_mode(vcpu) && nested_cpu_has_ept(get_vmcs12(vcpu)))
5332                 return 4;
5333         if (cpu_has_vmx_ept_5levels() && (cpuid_maxphyaddr(vcpu) > 48))
5334                 return 5;
5335         return 4;
5336 }
5337
5338 static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa)
5339 {
5340         u64 eptp = VMX_EPTP_MT_WB;
5341
5342         eptp |= (get_ept_level(vcpu) == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4;
5343
5344         if (enable_ept_ad_bits &&
5345             (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu)))
5346                 eptp |= VMX_EPTP_AD_ENABLE_BIT;
5347         eptp |= (root_hpa & PAGE_MASK);
5348
5349         return eptp;
5350 }
5351
5352 static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
5353 {
5354         struct kvm *kvm = vcpu->kvm;
5355         unsigned long guest_cr3;
5356         u64 eptp;
5357
5358         guest_cr3 = cr3;
5359         if (enable_ept) {
5360                 eptp = construct_eptp(vcpu, cr3);
5361                 vmcs_write64(EPT_POINTER, eptp);
5362
5363                 if (kvm_x86_ops->tlb_remote_flush) {
5364                         spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock);
5365                         to_vmx(vcpu)->ept_pointer = eptp;
5366                         to_kvm_vmx(kvm)->ept_pointers_match
5367                                 = EPT_POINTERS_CHECK;
5368                         spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock);
5369                 }
5370
5371                 if (enable_unrestricted_guest || is_paging(vcpu) ||
5372                     is_guest_mode(vcpu))
5373                         guest_cr3 = kvm_read_cr3(vcpu);
5374                 else
5375                         guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr;
5376                 ept_load_pdptrs(vcpu);
5377         }
5378
5379         vmcs_writel(GUEST_CR3, guest_cr3);
5380 }
5381
5382 static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
5383 {
5384         /*
5385          * Pass through host's Machine Check Enable value to hw_cr4, which
5386          * is in force while we are in guest mode.  Do not let guests control
5387          * this bit, even if host CR4.MCE == 0.
5388          */
5389         unsigned long hw_cr4;
5390
5391         hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE);
5392         if (enable_unrestricted_guest)
5393                 hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST;
5394         else if (to_vmx(vcpu)->rmode.vm86_active)
5395                 hw_cr4 |= KVM_RMODE_VM_CR4_ALWAYS_ON;
5396         else
5397                 hw_cr4 |= KVM_PMODE_VM_CR4_ALWAYS_ON;
5398
5399         if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated()) {
5400                 if (cr4 & X86_CR4_UMIP) {
5401                         vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
5402                                 SECONDARY_EXEC_DESC);
5403                         hw_cr4 &= ~X86_CR4_UMIP;
5404                 } else if (!is_guest_mode(vcpu) ||
5405                         !nested_cpu_has2(get_vmcs12(vcpu), SECONDARY_EXEC_DESC))
5406                         vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
5407                                         SECONDARY_EXEC_DESC);
5408         }
5409
5410         if (cr4 & X86_CR4_VMXE) {
5411                 /*
5412                  * To use VMXON (and later other VMX instructions), a guest
5413                  * must first be able to turn on cr4.VMXE (see handle_vmon()).
5414                  * So basically the check on whether to allow nested VMX
5415                  * is here.  We operate under the default treatment of SMM,
5416                  * so VMX cannot be enabled under SMM.
5417                  */
5418                 if (!nested_vmx_allowed(vcpu) || is_smm(vcpu))
5419                         return 1;
5420         }
5421
5422         if (to_vmx(vcpu)->nested.vmxon && !nested_cr4_valid(vcpu, cr4))
5423                 return 1;
5424
5425         vcpu->arch.cr4 = cr4;
5426
5427         if (!enable_unrestricted_guest) {
5428                 if (enable_ept) {
5429                         if (!is_paging(vcpu)) {
5430                                 hw_cr4 &= ~X86_CR4_PAE;
5431                                 hw_cr4 |= X86_CR4_PSE;
5432                         } else if (!(cr4 & X86_CR4_PAE)) {
5433                                 hw_cr4 &= ~X86_CR4_PAE;
5434                         }
5435                 }
5436
5437                 /*
5438                  * SMEP/SMAP/PKU is disabled if CPU is in non-paging mode in
5439                  * hardware.  To emulate this behavior, SMEP/SMAP/PKU needs
5440                  * to be manually disabled when guest switches to non-paging
5441                  * mode.
5442                  *
5443                  * If !enable_unrestricted_guest, the CPU is always running
5444                  * with CR0.PG=1 and CR4 needs to be modified.
5445                  * If enable_unrestricted_guest, the CPU automatically
5446                  * disables SMEP/SMAP/PKU when the guest sets CR0.PG=0.
5447                  */
5448                 if (!is_paging(vcpu))
5449                         hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
5450         }
5451
5452         vmcs_writel(CR4_READ_SHADOW, cr4);
5453         vmcs_writel(GUEST_CR4, hw_cr4);
5454         return 0;
5455 }
5456
5457 static void vmx_get_segment(struct kvm_vcpu *vcpu,
5458                             struct kvm_segment *var, int seg)
5459 {
5460         struct vcpu_vmx *vmx = to_vmx(vcpu);
5461         u32 ar;
5462
5463         if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
5464                 *var = vmx->rmode.segs[seg];
5465                 if (seg == VCPU_SREG_TR
5466                     || var->selector == vmx_read_guest_seg_selector(vmx, seg))
5467                         return;
5468                 var->base = vmx_read_guest_seg_base(vmx, seg);
5469                 var->selector = vmx_read_guest_seg_selector(vmx, seg);
5470                 return;
5471         }
5472         var->base = vmx_read_guest_seg_base(vmx, seg);
5473         var->limit = vmx_read_guest_seg_limit(vmx, seg);
5474         var->selector = vmx_read_guest_seg_selector(vmx, seg);
5475         ar = vmx_read_guest_seg_ar(vmx, seg);
5476         var->unusable = (ar >> 16) & 1;
5477         var->type = ar & 15;
5478         var->s = (ar >> 4) & 1;
5479         var->dpl = (ar >> 5) & 3;
5480         /*
5481          * Some userspaces do not preserve unusable property. Since usable
5482          * segment has to be present according to VMX spec we can use present
5483          * property to amend userspace bug by making unusable segment always
5484          * nonpresent. vmx_segment_access_rights() already marks nonpresent
5485          * segment as unusable.
5486          */
5487         var->present = !var->unusable;
5488         var->avl = (ar >> 12) & 1;
5489         var->l = (ar >> 13) & 1;
5490         var->db = (ar >> 14) & 1;
5491         var->g = (ar >> 15) & 1;
5492 }
5493
5494 static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
5495 {
5496         struct kvm_segment s;
5497
5498         if (to_vmx(vcpu)->rmode.vm86_active) {
5499                 vmx_get_segment(vcpu, &s, seg);
5500                 return s.base;
5501         }
5502         return vmx_read_guest_seg_base(to_vmx(vcpu), seg);
5503 }
5504
5505 static int vmx_get_cpl(struct kvm_vcpu *vcpu)
5506 {
5507         struct vcpu_vmx *vmx = to_vmx(vcpu);
5508
5509         if (unlikely(vmx->rmode.vm86_active))
5510                 return 0;
5511         else {
5512                 int ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS);
5513                 return VMX_AR_DPL(ar);
5514         }
5515 }
5516
5517 static u32 vmx_segment_access_rights(struct kvm_segment *var)
5518 {
5519         u32 ar;
5520
5521         if (var->unusable || !var->present)
5522                 ar = 1 << 16;
5523         else {
5524                 ar = var->type & 15;
5525                 ar |= (var->s & 1) << 4;
5526                 ar |= (var->dpl & 3) << 5;
5527                 ar |= (var->present & 1) << 7;
5528                 ar |= (var->avl & 1) << 12;
5529                 ar |= (var->l & 1) << 13;
5530                 ar |= (var->db & 1) << 14;
5531                 ar |= (var->g & 1) << 15;
5532         }
5533
5534         return ar;
5535 }
5536
5537 static void vmx_set_segment(struct kvm_vcpu *vcpu,
5538                             struct kvm_segment *var, int seg)
5539 {
5540         struct vcpu_vmx *vmx = to_vmx(vcpu);
5541         const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
5542
5543         vmx_segment_cache_clear(vmx);
5544
5545         if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
5546                 vmx->rmode.segs[seg] = *var;
5547                 if (seg == VCPU_SREG_TR)
5548                         vmcs_write16(sf->selector, var->selector);
5549                 else if (var->s)
5550                         fix_rmode_seg(seg, &vmx->rmode.segs[seg]);
5551                 goto out;
5552         }
5553
5554         vmcs_writel(sf->base, var->base);
5555         vmcs_write32(sf->limit, var->limit);
5556         vmcs_write16(sf->selector, var->selector);
5557
5558         /*
5559          *   Fix the "Accessed" bit in AR field of segment registers for older
5560          * qemu binaries.
5561          *   IA32 arch specifies that at the time of processor reset the
5562          * "Accessed" bit in the AR field of segment registers is 1. And qemu
5563          * is setting it to 0 in the userland code. This causes invalid guest
5564          * state vmexit when "unrestricted guest" mode is turned on.
5565          *    Fix for this setup issue in cpu_reset is being pushed in the qemu
5566          * tree. Newer qemu binaries with that qemu fix would not need this
5567          * kvm hack.
5568          */
5569         if (enable_unrestricted_guest && (seg != VCPU_SREG_LDTR))
5570                 var->type |= 0x1; /* Accessed */
5571
5572         vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var));
5573
5574 out:
5575         vmx->emulation_required = emulation_required(vcpu);
5576 }
5577
5578 static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
5579 {
5580         u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS);
5581
5582         *db = (ar >> 14) & 1;
5583         *l = (ar >> 13) & 1;
5584 }
5585
5586 static void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
5587 {
5588         dt->size = vmcs_read32(GUEST_IDTR_LIMIT);
5589         dt->address = vmcs_readl(GUEST_IDTR_BASE);
5590 }
5591
5592 static void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
5593 {
5594         vmcs_write32(GUEST_IDTR_LIMIT, dt->size);
5595         vmcs_writel(GUEST_IDTR_BASE, dt->address);
5596 }
5597
5598 static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
5599 {
5600         dt->size = vmcs_read32(GUEST_GDTR_LIMIT);
5601         dt->address = vmcs_readl(GUEST_GDTR_BASE);
5602 }
5603
5604 static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
5605 {
5606         vmcs_write32(GUEST_GDTR_LIMIT, dt->size);
5607         vmcs_writel(GUEST_GDTR_BASE, dt->address);
5608 }
5609
5610 static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg)
5611 {
5612         struct kvm_segment var;
5613         u32 ar;
5614
5615         vmx_get_segment(vcpu, &var, seg);
5616         var.dpl = 0x3;
5617         if (seg == VCPU_SREG_CS)
5618                 var.type = 0x3;
5619         ar = vmx_segment_access_rights(&var);
5620
5621         if (var.base != (var.selector << 4))
5622                 return false;
5623         if (var.limit != 0xffff)
5624                 return false;
5625         if (ar != 0xf3)
5626                 return false;
5627
5628         return true;
5629 }
5630
5631 static bool code_segment_valid(struct kvm_vcpu *vcpu)
5632 {
5633         struct kvm_segment cs;
5634         unsigned int cs_rpl;
5635
5636         vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
5637         cs_rpl = cs.selector & SEGMENT_RPL_MASK;
5638
5639         if (cs.unusable)
5640                 return false;
5641         if (~cs.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_ACCESSES_MASK))
5642                 return false;
5643         if (!cs.s)
5644                 return false;
5645         if (cs.type & VMX_AR_TYPE_WRITEABLE_MASK) {
5646                 if (cs.dpl > cs_rpl)
5647                         return false;
5648         } else {
5649                 if (cs.dpl != cs_rpl)
5650                         return false;
5651         }
5652         if (!cs.present)
5653                 return false;
5654
5655         /* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */
5656         return true;
5657 }
5658
5659 static bool stack_segment_valid(struct kvm_vcpu *vcpu)
5660 {
5661         struct kvm_segment ss;
5662         unsigned int ss_rpl;
5663
5664         vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
5665         ss_rpl = ss.selector & SEGMENT_RPL_MASK;
5666
5667         if (ss.unusable)
5668                 return true;
5669         if (ss.type != 3 && ss.type != 7)
5670                 return false;
5671         if (!ss.s)
5672                 return false;
5673         if (ss.dpl != ss_rpl) /* DPL != RPL */
5674                 return false;
5675         if (!ss.present)
5676                 return false;
5677
5678         return true;
5679 }
5680
5681 static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg)
5682 {
5683         struct kvm_segment var;
5684         unsigned int rpl;
5685
5686         vmx_get_segment(vcpu, &var, seg);
5687         rpl = var.selector & SEGMENT_RPL_MASK;
5688
5689         if (var.unusable)
5690                 return true;
5691         if (!var.s)
5692                 return false;
5693         if (!var.present)
5694                 return false;
5695         if (~var.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_WRITEABLE_MASK)) {
5696                 if (var.dpl < rpl) /* DPL < RPL */
5697                         return false;
5698         }
5699
5700         /* TODO: Add other members to kvm_segment_field to allow checking for other access
5701          * rights flags
5702          */
5703         return true;
5704 }
5705
5706 static bool tr_valid(struct kvm_vcpu *vcpu)
5707 {
5708         struct kvm_segment tr;
5709
5710         vmx_get_segment(vcpu, &tr, VCPU_SREG_TR);
5711
5712         if (tr.unusable)
5713                 return false;
5714         if (tr.selector & SEGMENT_TI_MASK)      /* TI = 1 */
5715                 return false;
5716         if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */
5717                 return false;
5718         if (!tr.present)
5719                 return false;
5720
5721         return true;
5722 }
5723
5724 static bool ldtr_valid(struct kvm_vcpu *vcpu)
5725 {
5726         struct kvm_segment ldtr;
5727
5728         vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR);
5729
5730         if (ldtr.unusable)
5731                 return true;
5732         if (ldtr.selector & SEGMENT_TI_MASK)    /* TI = 1 */
5733                 return false;
5734         if (ldtr.type != 2)
5735                 return false;
5736         if (!ldtr.present)
5737                 return false;
5738
5739         return true;
5740 }
5741
5742 static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
5743 {
5744         struct kvm_segment cs, ss;
5745
5746         vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
5747         vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
5748
5749         return ((cs.selector & SEGMENT_RPL_MASK) ==
5750                  (ss.selector & SEGMENT_RPL_MASK));
5751 }
5752
5753 static bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu,
5754                                         unsigned int port, int size);
5755 static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
5756                                        struct vmcs12 *vmcs12)
5757 {
5758         unsigned long exit_qualification;
5759         unsigned short port;
5760         int size;
5761
5762         if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
5763                 return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING);
5764
5765         exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5766
5767         port = exit_qualification >> 16;
5768         size = (exit_qualification & 7) + 1;
5769
5770         return nested_vmx_check_io_bitmaps(vcpu, port, size);
5771 }
5772
5773 /*
5774  * Check if guest state is valid. Returns true if valid, false if
5775  * not.
5776  * We assume that registers are always usable
5777  */
5778 static bool guest_state_valid(struct kvm_vcpu *vcpu)
5779 {
5780         if (enable_unrestricted_guest)
5781                 return true;
5782
5783         /* real mode guest state checks */
5784         if (!is_protmode(vcpu) || (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) {
5785                 if (!rmode_segment_valid(vcpu, VCPU_SREG_CS))
5786                         return false;
5787                 if (!rmode_segment_valid(vcpu, VCPU_SREG_SS))
5788                         return false;
5789                 if (!rmode_segment_valid(vcpu, VCPU_SREG_DS))
5790                         return false;
5791                 if (!rmode_segment_valid(vcpu, VCPU_SREG_ES))
5792                         return false;
5793                 if (!rmode_segment_valid(vcpu, VCPU_SREG_FS))
5794                         return false;
5795                 if (!rmode_segment_valid(vcpu, VCPU_SREG_GS))
5796                         return false;
5797         } else {
5798         /* protected mode guest state checks */
5799                 if (!cs_ss_rpl_check(vcpu))
5800                         return false;
5801                 if (!code_segment_valid(vcpu))
5802                         return false;
5803                 if (!stack_segment_valid(vcpu))
5804                         return false;
5805                 if (!data_segment_valid(vcpu, VCPU_SREG_DS))
5806                         return false;
5807                 if (!data_segment_valid(vcpu, VCPU_SREG_ES))
5808                         return false;
5809                 if (!data_segment_valid(vcpu, VCPU_SREG_FS))
5810                         return false;
5811                 if (!data_segment_valid(vcpu, VCPU_SREG_GS))
5812                         return false;
5813                 if (!tr_valid(vcpu))
5814                         return false;
5815                 if (!ldtr_valid(vcpu))
5816                         return false;
5817         }
5818         /* TODO:
5819          * - Add checks on RIP
5820          * - Add checks on RFLAGS
5821          */
5822
5823         return true;
5824 }
5825
5826 static bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa)
5827 {
5828         return PAGE_ALIGNED(gpa) && !(gpa >> cpuid_maxphyaddr(vcpu));
5829 }
5830
5831 static int init_rmode_tss(struct kvm *kvm)
5832 {
5833         gfn_t fn;
5834         u16 data = 0;
5835         int idx, r;
5836
5837         idx = srcu_read_lock(&kvm->srcu);
5838         fn = to_kvm_vmx(kvm)->tss_addr >> PAGE_SHIFT;
5839         r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
5840         if (r < 0)
5841                 goto out;
5842         data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
5843         r = kvm_write_guest_page(kvm, fn++, &data,
5844                         TSS_IOPB_BASE_OFFSET, sizeof(u16));
5845         if (r < 0)
5846                 goto out;
5847         r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE);
5848         if (r < 0)
5849                 goto out;
5850         r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
5851         if (r < 0)
5852                 goto out;
5853         data = ~0;
5854         r = kvm_write_guest_page(kvm, fn, &data,
5855                                  RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1,
5856                                  sizeof(u8));
5857 out:
5858         srcu_read_unlock(&kvm->srcu, idx);
5859         return r;
5860 }
5861
5862 static int init_rmode_identity_map(struct kvm *kvm)
5863 {
5864         struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
5865         int i, idx, r = 0;
5866         kvm_pfn_t identity_map_pfn;
5867         u32 tmp;
5868
5869         /* Protect kvm_vmx->ept_identity_pagetable_done. */
5870         mutex_lock(&kvm->slots_lock);
5871
5872         if (likely(kvm_vmx->ept_identity_pagetable_done))
5873                 goto out2;
5874
5875         if (!kvm_vmx->ept_identity_map_addr)
5876                 kvm_vmx->ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR;
5877         identity_map_pfn = kvm_vmx->ept_identity_map_addr >> PAGE_SHIFT;
5878
5879         r = __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
5880                                     kvm_vmx->ept_identity_map_addr, PAGE_SIZE);
5881         if (r < 0)
5882                 goto out2;
5883
5884         idx = srcu_read_lock(&kvm->srcu);
5885         r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE);
5886         if (r < 0)
5887                 goto out;
5888         /* Set up identity-mapping pagetable for EPT in real mode */
5889         for (i = 0; i < PT32_ENT_PER_PAGE; i++) {
5890                 tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
5891                         _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
5892                 r = kvm_write_guest_page(kvm, identity_map_pfn,
5893                                 &tmp, i * sizeof(tmp), sizeof(tmp));
5894                 if (r < 0)
5895                         goto out;
5896         }
5897         kvm_vmx->ept_identity_pagetable_done = true;
5898
5899 out:
5900         srcu_read_unlock(&kvm->srcu, idx);
5901
5902 out2:
5903         mutex_unlock(&kvm->slots_lock);
5904         return r;
5905 }
5906
5907 static void seg_setup(int seg)
5908 {
5909         const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
5910         unsigned int ar;
5911
5912         vmcs_write16(sf->selector, 0);
5913         vmcs_writel(sf->base, 0);
5914         vmcs_write32(sf->limit, 0xffff);
5915         ar = 0x93;
5916         if (seg == VCPU_SREG_CS)
5917                 ar |= 0x08; /* code segment */
5918
5919         vmcs_write32(sf->ar_bytes, ar);
5920 }
5921
5922 static int alloc_apic_access_page(struct kvm *kvm)
5923 {
5924         struct page *page;
5925         int r = 0;
5926
5927         mutex_lock(&kvm->slots_lock);
5928         if (kvm->arch.apic_access_page_done)
5929                 goto out;
5930         r = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
5931                                     APIC_DEFAULT_PHYS_BASE, PAGE_SIZE);
5932         if (r)
5933                 goto out;
5934
5935         page = gfn_to_page(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
5936         if (is_error_page(page)) {
5937                 r = -EFAULT;
5938                 goto out;
5939         }
5940
5941         /*
5942          * Do not pin the page in memory, so that memory hot-unplug
5943          * is able to migrate it.
5944          */
5945         put_page(page);
5946         kvm->arch.apic_access_page_done = true;
5947 out:
5948         mutex_unlock(&kvm->slots_lock);
5949         return r;
5950 }
5951
5952 static int allocate_vpid(void)
5953 {
5954         int vpid;
5955
5956         if (!enable_vpid)
5957                 return 0;
5958         spin_lock(&vmx_vpid_lock);
5959         vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS);
5960         if (vpid < VMX_NR_VPIDS)
5961                 __set_bit(vpid, vmx_vpid_bitmap);
5962         else
5963                 vpid = 0;
5964         spin_unlock(&vmx_vpid_lock);
5965         return vpid;
5966 }
5967
5968 static void free_vpid(int vpid)
5969 {
5970         if (!enable_vpid || vpid == 0)
5971                 return;
5972         spin_lock(&vmx_vpid_lock);
5973         __clear_bit(vpid, vmx_vpid_bitmap);
5974         spin_unlock(&vmx_vpid_lock);
5975 }
5976
5977 static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
5978                                                           u32 msr, int type)
5979 {
5980         int f = sizeof(unsigned long);
5981
5982         if (!cpu_has_vmx_msr_bitmap())
5983                 return;
5984
5985         if (static_branch_unlikely(&enable_evmcs))
5986                 evmcs_touch_msr_bitmap();
5987
5988         /*
5989          * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
5990          * have the write-low and read-high bitmap offsets the wrong way round.
5991          * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
5992          */
5993         if (msr <= 0x1fff) {
5994                 if (type & MSR_TYPE_R)
5995                         /* read-low */
5996                         __clear_bit(msr, msr_bitmap + 0x000 / f);
5997
5998                 if (type & MSR_TYPE_W)
5999                         /* write-low */
6000                         __clear_bit(msr, msr_bitmap + 0x800 / f);
6001
6002         } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
6003                 msr &= 0x1fff;
6004                 if (type & MSR_TYPE_R)
6005                         /* read-high */
6006                         __clear_bit(msr, msr_bitmap + 0x400 / f);
6007
6008                 if (type & MSR_TYPE_W)
6009                         /* write-high */
6010                         __clear_bit(msr, msr_bitmap + 0xc00 / f);
6011
6012         }
6013 }
6014
6015 static __always_inline void vmx_enable_intercept_for_msr(unsigned long *msr_bitmap,
6016                                                          u32 msr, int type)
6017 {
6018         int f = sizeof(unsigned long);
6019
6020         if (!cpu_has_vmx_msr_bitmap())
6021                 return;
6022
6023         if (static_branch_unlikely(&enable_evmcs))
6024                 evmcs_touch_msr_bitmap();
6025
6026         /*
6027          * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
6028          * have the write-low and read-high bitmap offsets the wrong way round.
6029          * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
6030          */
6031         if (msr <= 0x1fff) {
6032                 if (type & MSR_TYPE_R)
6033                         /* read-low */
6034                         __set_bit(msr, msr_bitmap + 0x000 / f);
6035
6036                 if (type & MSR_TYPE_W)
6037                         /* write-low */
6038                         __set_bit(msr, msr_bitmap + 0x800 / f);
6039
6040         } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
6041                 msr &= 0x1fff;
6042                 if (type & MSR_TYPE_R)
6043                         /* read-high */
6044                         __set_bit(msr, msr_bitmap + 0x400 / f);
6045
6046                 if (type & MSR_TYPE_W)
6047                         /* write-high */
6048                         __set_bit(msr, msr_bitmap + 0xc00 / f);
6049
6050         }
6051 }
6052
6053 static __always_inline void vmx_set_intercept_for_msr(unsigned long *msr_bitmap,
6054                                                       u32 msr, int type, bool value)
6055 {
6056         if (value)
6057                 vmx_enable_intercept_for_msr(msr_bitmap, msr, type);
6058         else
6059                 vmx_disable_intercept_for_msr(msr_bitmap, msr, type);
6060 }
6061
6062 /*
6063  * If a msr is allowed by L0, we should check whether it is allowed by L1.
6064  * The corresponding bit will be cleared unless both of L0 and L1 allow it.
6065  */
6066 static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1,
6067                                                unsigned long *msr_bitmap_nested,
6068                                                u32 msr, int type)
6069 {
6070         int f = sizeof(unsigned long);
6071
6072         /*
6073          * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
6074          * have the write-low and read-high bitmap offsets the wrong way round.
6075          * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
6076          */
6077         if (msr <= 0x1fff) {
6078                 if (type & MSR_TYPE_R &&
6079                    !test_bit(msr, msr_bitmap_l1 + 0x000 / f))
6080                         /* read-low */
6081                         __clear_bit(msr, msr_bitmap_nested + 0x000 / f);
6082
6083                 if (type & MSR_TYPE_W &&
6084                    !test_bit(msr, msr_bitmap_l1 + 0x800 / f))
6085                         /* write-low */
6086                         __clear_bit(msr, msr_bitmap_nested + 0x800 / f);
6087
6088         } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
6089                 msr &= 0x1fff;
6090                 if (type & MSR_TYPE_R &&
6091                    !test_bit(msr, msr_bitmap_l1 + 0x400 / f))
6092                         /* read-high */
6093                         __clear_bit(msr, msr_bitmap_nested + 0x400 / f);
6094
6095                 if (type & MSR_TYPE_W &&
6096                    !test_bit(msr, msr_bitmap_l1 + 0xc00 / f))
6097                         /* write-high */
6098                         __clear_bit(msr, msr_bitmap_nested + 0xc00 / f);
6099
6100         }
6101 }
6102
6103 static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu)
6104 {
6105         u8 mode = 0;
6106
6107         if (cpu_has_secondary_exec_ctrls() &&
6108             (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) &
6109              SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
6110                 mode |= MSR_BITMAP_MODE_X2APIC;
6111                 if (enable_apicv && kvm_vcpu_apicv_active(vcpu))
6112                         mode |= MSR_BITMAP_MODE_X2APIC_APICV;
6113         }
6114
6115         return mode;
6116 }
6117
6118 #define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4))
6119
6120 static void vmx_update_msr_bitmap_x2apic(unsigned long *msr_bitmap,
6121                                          u8 mode)
6122 {
6123         int msr;
6124
6125         for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
6126                 unsigned word = msr / BITS_PER_LONG;
6127                 msr_bitmap[word] = (mode & MSR_BITMAP_MODE_X2APIC_APICV) ? 0 : ~0;
6128                 msr_bitmap[word + (0x800 / sizeof(long))] = ~0;
6129         }
6130
6131         if (mode & MSR_BITMAP_MODE_X2APIC) {
6132                 /*
6133                  * TPR reads and writes can be virtualized even if virtual interrupt
6134                  * delivery is not in use.
6135                  */
6136                 vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW);
6137                 if (mode & MSR_BITMAP_MODE_X2APIC_APICV) {
6138                         vmx_enable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_R);
6139                         vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_EOI), MSR_TYPE_W);
6140                         vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W);
6141                 }
6142         }
6143 }
6144
6145 static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu)
6146 {
6147         struct vcpu_vmx *vmx = to_vmx(vcpu);
6148         unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
6149         u8 mode = vmx_msr_bitmap_mode(vcpu);
6150         u8 changed = mode ^ vmx->msr_bitmap_mode;
6151
6152         if (!changed)
6153                 return;
6154
6155         if (changed & (MSR_BITMAP_MODE_X2APIC | MSR_BITMAP_MODE_X2APIC_APICV))
6156                 vmx_update_msr_bitmap_x2apic(msr_bitmap, mode);
6157
6158         vmx->msr_bitmap_mode = mode;
6159 }
6160
6161 static bool vmx_get_enable_apicv(struct kvm_vcpu *vcpu)
6162 {
6163         return enable_apicv;
6164 }
6165
6166 static void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu)
6167 {
6168         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
6169         gfn_t gfn;
6170
6171         /*
6172          * Don't need to mark the APIC access page dirty; it is never
6173          * written to by the CPU during APIC virtualization.
6174          */
6175
6176         if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
6177                 gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT;
6178                 kvm_vcpu_mark_page_dirty(vcpu, gfn);
6179         }
6180
6181         if (nested_cpu_has_posted_intr(vmcs12)) {
6182                 gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT;
6183                 kvm_vcpu_mark_page_dirty(vcpu, gfn);
6184         }
6185 }
6186
6187
6188 static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
6189 {
6190         struct vcpu_vmx *vmx = to_vmx(vcpu);
6191         int max_irr;
6192         void *vapic_page;
6193         u16 status;
6194
6195         if (!vmx->nested.pi_desc || !vmx->nested.pi_pending)
6196                 return;
6197
6198         vmx->nested.pi_pending = false;
6199         if (!pi_test_and_clear_on(vmx->nested.pi_desc))
6200                 return;
6201
6202         max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256);
6203         if (max_irr != 256) {
6204                 vapic_page = kmap(vmx->nested.virtual_apic_page);
6205                 __kvm_apic_update_irr(vmx->nested.pi_desc->pir,
6206                         vapic_page, &max_irr);
6207                 kunmap(vmx->nested.virtual_apic_page);
6208
6209                 status = vmcs_read16(GUEST_INTR_STATUS);
6210                 if ((u8)max_irr > ((u8)status & 0xff)) {
6211                         status &= ~0xff;
6212                         status |= (u8)max_irr;
6213                         vmcs_write16(GUEST_INTR_STATUS, status);
6214                 }
6215         }
6216
6217         nested_mark_vmcs12_pages_dirty(vcpu);
6218 }
6219
6220 static u8 vmx_get_rvi(void)
6221 {
6222         return vmcs_read16(GUEST_INTR_STATUS) & 0xff;
6223 }
6224
6225 static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
6226 {
6227         struct vcpu_vmx *vmx = to_vmx(vcpu);
6228         void *vapic_page;
6229         u32 vppr;
6230         int rvi;
6231
6232         if (WARN_ON_ONCE(!is_guest_mode(vcpu)) ||
6233                 !nested_cpu_has_vid(get_vmcs12(vcpu)) ||
6234                 WARN_ON_ONCE(!vmx->nested.virtual_apic_page))
6235                 return false;
6236
6237         rvi = vmx_get_rvi();
6238
6239         vapic_page = kmap(vmx->nested.virtual_apic_page);
6240         vppr = *((u32 *)(vapic_page + APIC_PROCPRI));
6241         kunmap(vmx->nested.virtual_apic_page);
6242
6243         return ((rvi & 0xf0) > (vppr & 0xf0));
6244 }
6245
6246 static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
6247                                                      bool nested)
6248 {
6249 #ifdef CONFIG_SMP
6250         int pi_vec = nested ? POSTED_INTR_NESTED_VECTOR : POSTED_INTR_VECTOR;
6251
6252         if (vcpu->mode == IN_GUEST_MODE) {
6253                 /*
6254                  * The vector of interrupt to be delivered to vcpu had
6255                  * been set in PIR before this function.
6256                  *
6257                  * Following cases will be reached in this block, and
6258                  * we always send a notification event in all cases as
6259                  * explained below.
6260                  *
6261                  * Case 1: vcpu keeps in non-root mode. Sending a
6262                  * notification event posts the interrupt to vcpu.
6263                  *
6264                  * Case 2: vcpu exits to root mode and is still
6265                  * runnable. PIR will be synced to vIRR before the
6266                  * next vcpu entry. Sending a notification event in
6267                  * this case has no effect, as vcpu is not in root
6268                  * mode.
6269                  *
6270                  * Case 3: vcpu exits to root mode and is blocked.
6271                  * vcpu_block() has already synced PIR to vIRR and
6272                  * never blocks vcpu if vIRR is not cleared. Therefore,
6273                  * a blocked vcpu here does not wait for any requested
6274                  * interrupts in PIR, and sending a notification event
6275                  * which has no effect is safe here.
6276                  */
6277
6278                 apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec);
6279                 return true;
6280         }
6281 #endif
6282         return false;
6283 }
6284
6285 static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
6286                                                 int vector)
6287 {
6288         struct vcpu_vmx *vmx = to_vmx(vcpu);
6289
6290         if (is_guest_mode(vcpu) &&
6291             vector == vmx->nested.posted_intr_nv) {
6292                 /*
6293                  * If a posted intr is not recognized by hardware,
6294                  * we will accomplish it in the next vmentry.
6295                  */
6296                 vmx->nested.pi_pending = true;
6297                 kvm_make_request(KVM_REQ_EVENT, vcpu);
6298                 /* the PIR and ON have been set by L1. */
6299                 if (!kvm_vcpu_trigger_posted_interrupt(vcpu, true))
6300                         kvm_vcpu_kick(vcpu);
6301                 return 0;
6302         }
6303         return -1;
6304 }
6305 /*
6306  * Send interrupt to vcpu via posted interrupt way.
6307  * 1. If target vcpu is running(non-root mode), send posted interrupt
6308  * notification to vcpu and hardware will sync PIR to vIRR atomically.
6309  * 2. If target vcpu isn't running(root mode), kick it to pick up the
6310  * interrupt from PIR in next vmentry.
6311  */
6312 static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
6313 {
6314         struct vcpu_vmx *vmx = to_vmx(vcpu);
6315         int r;
6316
6317         r = vmx_deliver_nested_posted_interrupt(vcpu, vector);
6318         if (!r)
6319                 return 0;
6320
6321         if (!vcpu->arch.apicv_active)
6322                 return -1;
6323
6324         if (pi_test_and_set_pir(vector, &vmx->pi_desc))
6325                 return 0;
6326
6327         /* If a previous notification has sent the IPI, nothing to do.  */
6328         if (pi_test_and_set_on(&vmx->pi_desc))
6329                 return 0;
6330
6331         if (!kvm_vcpu_trigger_posted_interrupt(vcpu, false))
6332                 kvm_vcpu_kick(vcpu);
6333
6334         return 0;
6335 }
6336
6337 /*
6338  * Set up the vmcs's constant host-state fields, i.e., host-state fields that
6339  * will not change in the lifetime of the guest.
6340  * Note that host-state that does change is set elsewhere. E.g., host-state
6341  * that is set differently for each CPU is set in vmx_vcpu_load(), not here.
6342  */
6343 static void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
6344 {
6345         u32 low32, high32;
6346         unsigned long tmpl;
6347         struct desc_ptr dt;
6348         unsigned long cr0, cr3, cr4;
6349
6350         cr0 = read_cr0();
6351         WARN_ON(cr0 & X86_CR0_TS);
6352         vmcs_writel(HOST_CR0, cr0);  /* 22.2.3 */
6353
6354         /*
6355          * Save the most likely value for this task's CR3 in the VMCS.
6356          * We can't use __get_current_cr3_fast() because we're not atomic.
6357          */
6358         cr3 = __read_cr3();
6359         vmcs_writel(HOST_CR3, cr3);             /* 22.2.3  FIXME: shadow tables */
6360         vmx->loaded_vmcs->host_state.cr3 = cr3;
6361
6362         /* Save the most likely value for this task's CR4 in the VMCS. */
6363         cr4 = cr4_read_shadow();
6364         vmcs_writel(HOST_CR4, cr4);                     /* 22.2.3, 22.2.5 */
6365         vmx->loaded_vmcs->host_state.cr4 = cr4;
6366
6367         vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS);  /* 22.2.4 */
6368 #ifdef CONFIG_X86_64
6369         /*
6370          * Load null selectors, so we can avoid reloading them in
6371          * vmx_prepare_switch_to_host(), in case userspace uses
6372          * the null selectors too (the expected case).
6373          */
6374         vmcs_write16(HOST_DS_SELECTOR, 0);
6375         vmcs_write16(HOST_ES_SELECTOR, 0);
6376 #else
6377         vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
6378         vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
6379 #endif
6380         vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
6381         vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8);  /* 22.2.4 */
6382
6383         store_idt(&dt);
6384         vmcs_writel(HOST_IDTR_BASE, dt.address);   /* 22.2.4 */
6385         vmx->host_idt_base = dt.address;
6386
6387         vmcs_writel(HOST_RIP, vmx_return); /* 22.2.5 */
6388
6389         rdmsr(MSR_IA32_SYSENTER_CS, low32, high32);
6390         vmcs_write32(HOST_IA32_SYSENTER_CS, low32);
6391         rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl);
6392         vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl);   /* 22.2.3 */
6393
6394         if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
6395                 rdmsr(MSR_IA32_CR_PAT, low32, high32);
6396                 vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32));
6397         }
6398 }
6399
6400 static void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
6401 {
6402         BUILD_BUG_ON(KVM_CR4_GUEST_OWNED_BITS & ~KVM_POSSIBLE_CR4_GUEST_BITS);
6403
6404         vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS;
6405         if (enable_ept)
6406                 vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE;
6407         if (is_guest_mode(&vmx->vcpu))
6408                 vmx->vcpu.arch.cr4_guest_owned_bits &=
6409                         ~get_vmcs12(&vmx->vcpu)->cr4_guest_host_mask;
6410         vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits);
6411 }
6412
6413 static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
6414 {
6415         u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl;
6416
6417         if (!kvm_vcpu_apicv_active(&vmx->vcpu))
6418                 pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
6419
6420         if (!enable_vnmi)
6421                 pin_based_exec_ctrl &= ~PIN_BASED_VIRTUAL_NMIS;
6422
6423         /* Enable the preemption timer dynamically */
6424         pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
6425         return pin_based_exec_ctrl;
6426 }
6427
6428 static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
6429 {
6430         struct vcpu_vmx *vmx = to_vmx(vcpu);
6431
6432         vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx));
6433         if (cpu_has_secondary_exec_ctrls()) {
6434                 if (kvm_vcpu_apicv_active(vcpu))
6435                         vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
6436                                       SECONDARY_EXEC_APIC_REGISTER_VIRT |
6437                                       SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
6438                 else
6439                         vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
6440                                         SECONDARY_EXEC_APIC_REGISTER_VIRT |
6441                                         SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
6442         }
6443
6444         if (cpu_has_vmx_msr_bitmap())
6445                 vmx_update_msr_bitmap(vcpu);
6446 }
6447
6448 static u32 vmx_exec_control(struct vcpu_vmx *vmx)
6449 {
6450         u32 exec_control = vmcs_config.cpu_based_exec_ctrl;
6451
6452         if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)
6453                 exec_control &= ~CPU_BASED_MOV_DR_EXITING;
6454
6455         if (!cpu_need_tpr_shadow(&vmx->vcpu)) {
6456                 exec_control &= ~CPU_BASED_TPR_SHADOW;
6457 #ifdef CONFIG_X86_64
6458                 exec_control |= CPU_BASED_CR8_STORE_EXITING |
6459                                 CPU_BASED_CR8_LOAD_EXITING;
6460 #endif
6461         }
6462         if (!enable_ept)
6463                 exec_control |= CPU_BASED_CR3_STORE_EXITING |
6464                                 CPU_BASED_CR3_LOAD_EXITING  |
6465                                 CPU_BASED_INVLPG_EXITING;
6466         if (kvm_mwait_in_guest(vmx->vcpu.kvm))
6467                 exec_control &= ~(CPU_BASED_MWAIT_EXITING |
6468                                 CPU_BASED_MONITOR_EXITING);
6469         if (kvm_hlt_in_guest(vmx->vcpu.kvm))
6470                 exec_control &= ~CPU_BASED_HLT_EXITING;
6471         return exec_control;
6472 }
6473
6474 static bool vmx_rdrand_supported(void)
6475 {
6476         return vmcs_config.cpu_based_2nd_exec_ctrl &
6477                 SECONDARY_EXEC_RDRAND_EXITING;
6478 }
6479
6480 static bool vmx_rdseed_supported(void)
6481 {
6482         return vmcs_config.cpu_based_2nd_exec_ctrl &
6483                 SECONDARY_EXEC_RDSEED_EXITING;
6484 }
6485
6486 static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
6487 {
6488         struct kvm_vcpu *vcpu = &vmx->vcpu;
6489
6490         u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
6491
6492         if (!cpu_need_virtualize_apic_accesses(vcpu))
6493                 exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
6494         if (vmx->vpid == 0)
6495                 exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
6496         if (!enable_ept) {
6497                 exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
6498                 enable_unrestricted_guest = 0;
6499         }
6500         if (!enable_unrestricted_guest)
6501                 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
6502         if (kvm_pause_in_guest(vmx->vcpu.kvm))
6503                 exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
6504         if (!kvm_vcpu_apicv_active(vcpu))
6505                 exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT |
6506                                   SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
6507         exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
6508
6509         /* SECONDARY_EXEC_DESC is enabled/disabled on writes to CR4.UMIP,
6510          * in vmx_set_cr4.  */
6511         exec_control &= ~SECONDARY_EXEC_DESC;
6512
6513         /* SECONDARY_EXEC_SHADOW_VMCS is enabled when L1 executes VMPTRLD
6514            (handle_vmptrld).
6515            We can NOT enable shadow_vmcs here because we don't have yet
6516            a current VMCS12
6517         */
6518         exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
6519
6520         if (!enable_pml)
6521                 exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
6522
6523         if (vmx_xsaves_supported()) {
6524                 /* Exposing XSAVES only when XSAVE is exposed */
6525                 bool xsaves_enabled =
6526                         guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
6527                         guest_cpuid_has(vcpu, X86_FEATURE_XSAVES);
6528
6529                 if (!xsaves_enabled)
6530                         exec_control &= ~SECONDARY_EXEC_XSAVES;
6531
6532                 if (nested) {
6533                         if (xsaves_enabled)
6534                                 vmx->nested.msrs.secondary_ctls_high |=
6535                                         SECONDARY_EXEC_XSAVES;
6536                         else
6537                                 vmx->nested.msrs.secondary_ctls_high &=
6538                                         ~SECONDARY_EXEC_XSAVES;
6539                 }
6540         }
6541
6542         if (vmx_rdtscp_supported()) {
6543                 bool rdtscp_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP);
6544                 if (!rdtscp_enabled)
6545                         exec_control &= ~SECONDARY_EXEC_RDTSCP;
6546
6547                 if (nested) {
6548                         if (rdtscp_enabled)
6549                                 vmx->nested.msrs.secondary_ctls_high |=
6550                                         SECONDARY_EXEC_RDTSCP;
6551                         else
6552                                 vmx->nested.msrs.secondary_ctls_high &=
6553                                         ~SECONDARY_EXEC_RDTSCP;
6554                 }
6555         }
6556
6557         if (vmx_invpcid_supported()) {
6558                 /* Exposing INVPCID only when PCID is exposed */
6559                 bool invpcid_enabled =
6560                         guest_cpuid_has(vcpu, X86_FEATURE_INVPCID) &&
6561                         guest_cpuid_has(vcpu, X86_FEATURE_PCID);
6562
6563                 if (!invpcid_enabled) {
6564                         exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID;
6565                         guest_cpuid_clear(vcpu, X86_FEATURE_INVPCID);
6566                 }
6567
6568                 if (nested) {
6569                         if (invpcid_enabled)
6570                                 vmx->nested.msrs.secondary_ctls_high |=
6571                                         SECONDARY_EXEC_ENABLE_INVPCID;
6572                         else
6573                                 vmx->nested.msrs.secondary_ctls_high &=
6574                                         ~SECONDARY_EXEC_ENABLE_INVPCID;
6575                 }
6576         }
6577
6578         if (vmx_rdrand_supported()) {
6579                 bool rdrand_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDRAND);
6580                 if (rdrand_enabled)
6581                         exec_control &= ~SECONDARY_EXEC_RDRAND_EXITING;
6582
6583                 if (nested) {
6584                         if (rdrand_enabled)
6585                                 vmx->nested.msrs.secondary_ctls_high |=
6586                                         SECONDARY_EXEC_RDRAND_EXITING;
6587                         else
6588                                 vmx->nested.msrs.secondary_ctls_high &=
6589                                         ~SECONDARY_EXEC_RDRAND_EXITING;
6590                 }
6591         }
6592
6593         if (vmx_rdseed_supported()) {
6594                 bool rdseed_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDSEED);
6595                 if (rdseed_enabled)
6596                         exec_control &= ~SECONDARY_EXEC_RDSEED_EXITING;
6597
6598                 if (nested) {
6599                         if (rdseed_enabled)
6600                                 vmx->nested.msrs.secondary_ctls_high |=
6601                                         SECONDARY_EXEC_RDSEED_EXITING;
6602                         else
6603                                 vmx->nested.msrs.secondary_ctls_high &=
6604                                         ~SECONDARY_EXEC_RDSEED_EXITING;
6605                 }
6606         }
6607
6608         vmx->secondary_exec_control = exec_control;
6609 }
6610
6611 static void ept_set_mmio_spte_mask(void)
6612 {
6613         /*
6614          * EPT Misconfigurations can be generated if the value of bits 2:0
6615          * of an EPT paging-structure entry is 110b (write/execute).
6616          */
6617         kvm_mmu_set_mmio_spte_mask(VMX_EPT_RWX_MASK,
6618                                    VMX_EPT_MISCONFIG_WX_VALUE);
6619 }
6620
6621 #define VMX_XSS_EXIT_BITMAP 0
6622 /*
6623  * Sets up the vmcs for emulated real mode.
6624  */
6625 static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
6626 {
6627         int i;
6628
6629         if (enable_shadow_vmcs) {
6630                 /*
6631                  * At vCPU creation, "VMWRITE to any supported field
6632                  * in the VMCS" is supported, so use the more
6633                  * permissive vmx_vmread_bitmap to specify both read
6634                  * and write permissions for the shadow VMCS.
6635                  */
6636                 vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap));
6637                 vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmread_bitmap));
6638         }
6639         if (cpu_has_vmx_msr_bitmap())
6640                 vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap));
6641
6642         vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
6643
6644         /* Control */
6645         vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx));
6646         vmx->hv_deadline_tsc = -1;
6647
6648         vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx));
6649
6650         if (cpu_has_secondary_exec_ctrls()) {
6651                 vmx_compute_secondary_exec_control(vmx);
6652                 vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
6653                              vmx->secondary_exec_control);
6654         }
6655
6656         if (kvm_vcpu_apicv_active(&vmx->vcpu)) {
6657                 vmcs_write64(EOI_EXIT_BITMAP0, 0);
6658                 vmcs_write64(EOI_EXIT_BITMAP1, 0);
6659                 vmcs_write64(EOI_EXIT_BITMAP2, 0);
6660                 vmcs_write64(EOI_EXIT_BITMAP3, 0);
6661
6662                 vmcs_write16(GUEST_INTR_STATUS, 0);
6663
6664                 vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR);
6665                 vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc)));
6666         }
6667
6668         if (!kvm_pause_in_guest(vmx->vcpu.kvm)) {
6669                 vmcs_write32(PLE_GAP, ple_gap);
6670                 vmx->ple_window = ple_window;
6671                 vmx->ple_window_dirty = true;
6672         }
6673
6674         vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
6675         vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
6676         vmcs_write32(CR3_TARGET_COUNT, 0);           /* 22.2.1 */
6677
6678         vmcs_write16(HOST_FS_SELECTOR, 0);            /* 22.2.4 */
6679         vmcs_write16(HOST_GS_SELECTOR, 0);            /* 22.2.4 */
6680         vmx_set_constant_host_state(vmx);
6681         vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
6682         vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
6683
6684         if (cpu_has_vmx_vmfunc())
6685                 vmcs_write64(VM_FUNCTION_CONTROL, 0);
6686
6687         vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
6688         vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
6689         vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
6690         vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
6691         vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
6692
6693         if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
6694                 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
6695
6696         for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) {
6697                 u32 index = vmx_msr_index[i];
6698                 u32 data_low, data_high;
6699                 int j = vmx->nmsrs;
6700
6701                 if (rdmsr_safe(index, &data_low, &data_high) < 0)
6702                         continue;
6703                 if (wrmsr_safe(index, data_low, data_high) < 0)
6704                         continue;
6705                 vmx->guest_msrs[j].index = i;
6706                 vmx->guest_msrs[j].data = 0;
6707                 vmx->guest_msrs[j].mask = -1ull;
6708                 ++vmx->nmsrs;
6709         }
6710
6711         vm_exit_controls_init(vmx, vmcs_config.vmexit_ctrl);
6712
6713         /* 22.2.1, 20.8.1 */
6714         vm_entry_controls_init(vmx, vmcs_config.vmentry_ctrl);
6715
6716         vmx->vcpu.arch.cr0_guest_owned_bits = X86_CR0_TS;
6717         vmcs_writel(CR0_GUEST_HOST_MASK, ~X86_CR0_TS);
6718
6719         set_cr4_guest_host_mask(vmx);
6720
6721         if (vmx_xsaves_supported())
6722                 vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP);
6723
6724         if (enable_pml) {
6725                 ASSERT(vmx->pml_pg);
6726                 vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
6727                 vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
6728         }
6729
6730         if (cpu_has_vmx_encls_vmexit())
6731                 vmcs_write64(ENCLS_EXITING_BITMAP, -1ull);
6732 }
6733
6734 static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
6735 {
6736         struct vcpu_vmx *vmx = to_vmx(vcpu);
6737         struct msr_data apic_base_msr;
6738         u64 cr0;
6739
6740         vmx->rmode.vm86_active = 0;
6741         vmx->spec_ctrl = 0;
6742
6743         vcpu->arch.microcode_version = 0x100000000ULL;
6744         vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
6745         kvm_set_cr8(vcpu, 0);
6746
6747         if (!init_event) {
6748                 apic_base_msr.data = APIC_DEFAULT_PHYS_BASE |
6749                                      MSR_IA32_APICBASE_ENABLE;
6750                 if (kvm_vcpu_is_reset_bsp(vcpu))
6751                         apic_base_msr.data |= MSR_IA32_APICBASE_BSP;
6752                 apic_base_msr.host_initiated = true;
6753                 kvm_set_apic_base(vcpu, &apic_base_msr);
6754         }
6755
6756         vmx_segment_cache_clear(vmx);
6757
6758         seg_setup(VCPU_SREG_CS);
6759         vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
6760         vmcs_writel(GUEST_CS_BASE, 0xffff0000ul);
6761
6762         seg_setup(VCPU_SREG_DS);
6763         seg_setup(VCPU_SREG_ES);
6764         seg_setup(VCPU_SREG_FS);
6765         seg_setup(VCPU_SREG_GS);
6766         seg_setup(VCPU_SREG_SS);
6767
6768         vmcs_write16(GUEST_TR_SELECTOR, 0);
6769         vmcs_writel(GUEST_TR_BASE, 0);
6770         vmcs_write32(GUEST_TR_LIMIT, 0xffff);
6771         vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
6772
6773         vmcs_write16(GUEST_LDTR_SELECTOR, 0);
6774         vmcs_writel(GUEST_LDTR_BASE, 0);
6775         vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
6776         vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
6777
6778         if (!init_event) {
6779                 vmcs_write32(GUEST_SYSENTER_CS, 0);
6780                 vmcs_writel(GUEST_SYSENTER_ESP, 0);
6781                 vmcs_writel(GUEST_SYSENTER_EIP, 0);
6782                 vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
6783         }
6784
6785         kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
6786         kvm_rip_write(vcpu, 0xfff0);
6787
6788         vmcs_writel(GUEST_GDTR_BASE, 0);
6789         vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
6790
6791         vmcs_writel(GUEST_IDTR_BASE, 0);
6792         vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
6793
6794         vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
6795         vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
6796         vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0);
6797         if (kvm_mpx_supported())
6798                 vmcs_write64(GUEST_BNDCFGS, 0);
6799
6800         setup_msrs(vmx);
6801
6802         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);  /* 22.2.1 */
6803
6804         if (cpu_has_vmx_tpr_shadow() && !init_event) {
6805                 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
6806                 if (cpu_need_tpr_shadow(vcpu))
6807                         vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
6808                                      __pa(vcpu->arch.apic->regs));
6809                 vmcs_write32(TPR_THRESHOLD, 0);
6810         }
6811
6812         kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
6813
6814         if (vmx->vpid != 0)
6815                 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
6816
6817         cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
6818         vmx->vcpu.arch.cr0 = cr0;
6819         vmx_set_cr0(vcpu, cr0); /* enter rmode */
6820         vmx_set_cr4(vcpu, 0);
6821         vmx_set_efer(vcpu, 0);
6822
6823         update_exception_bitmap(vcpu);
6824
6825         vpid_sync_context(vmx->vpid);
6826         if (init_event)
6827                 vmx_clear_hlt(vcpu);
6828
6829         vmx_update_fb_clear_dis(vcpu, vmx);
6830 }
6831
6832 /*
6833  * In nested virtualization, check if L1 asked to exit on external interrupts.
6834  * For most existing hypervisors, this will always return true.
6835  */
6836 static bool nested_exit_on_intr(struct kvm_vcpu *vcpu)
6837 {
6838         return get_vmcs12(vcpu)->pin_based_vm_exec_control &
6839                 PIN_BASED_EXT_INTR_MASK;
6840 }
6841
6842 /*
6843  * In nested virtualization, check if L1 has set
6844  * VM_EXIT_ACK_INTR_ON_EXIT
6845  */
6846 static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu)
6847 {
6848         return get_vmcs12(vcpu)->vm_exit_controls &
6849                 VM_EXIT_ACK_INTR_ON_EXIT;
6850 }
6851
6852 static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu)
6853 {
6854         return nested_cpu_has_nmi_exiting(get_vmcs12(vcpu));
6855 }
6856
6857 static void enable_irq_window(struct kvm_vcpu *vcpu)
6858 {
6859         vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL,
6860                       CPU_BASED_VIRTUAL_INTR_PENDING);
6861 }
6862
6863 static void enable_nmi_window(struct kvm_vcpu *vcpu)
6864 {
6865         if (!enable_vnmi ||
6866             vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
6867                 enable_irq_window(vcpu);
6868                 return;
6869         }
6870
6871         vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL,
6872                       CPU_BASED_VIRTUAL_NMI_PENDING);
6873 }
6874
6875 static void vmx_inject_irq(struct kvm_vcpu *vcpu)
6876 {
6877         struct vcpu_vmx *vmx = to_vmx(vcpu);
6878         uint32_t intr;
6879         int irq = vcpu->arch.interrupt.nr;
6880
6881         trace_kvm_inj_virq(irq);
6882
6883         ++vcpu->stat.irq_injections;
6884         if (vmx->rmode.vm86_active) {
6885                 int inc_eip = 0;
6886                 if (vcpu->arch.interrupt.soft)
6887                         inc_eip = vcpu->arch.event_exit_inst_len;
6888                 if (kvm_inject_realmode_interrupt(vcpu, irq, inc_eip) != EMULATE_DONE)
6889                         kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
6890                 return;
6891         }
6892         intr = irq | INTR_INFO_VALID_MASK;
6893         if (vcpu->arch.interrupt.soft) {
6894                 intr |= INTR_TYPE_SOFT_INTR;
6895                 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
6896                              vmx->vcpu.arch.event_exit_inst_len);
6897         } else
6898                 intr |= INTR_TYPE_EXT_INTR;
6899         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
6900
6901         vmx_clear_hlt(vcpu);
6902 }
6903
6904 static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
6905 {
6906         struct vcpu_vmx *vmx = to_vmx(vcpu);
6907
6908         if (!enable_vnmi) {
6909                 /*
6910                  * Tracking the NMI-blocked state in software is built upon
6911                  * finding the next open IRQ window. This, in turn, depends on
6912                  * well-behaving guests: They have to keep IRQs disabled at
6913                  * least as long as the NMI handler runs. Otherwise we may
6914                  * cause NMI nesting, maybe breaking the guest. But as this is
6915                  * highly unlikely, we can live with the residual risk.
6916                  */
6917                 vmx->loaded_vmcs->soft_vnmi_blocked = 1;
6918                 vmx->loaded_vmcs->vnmi_blocked_time = 0;
6919         }
6920
6921         ++vcpu->stat.nmi_injections;
6922         vmx->loaded_vmcs->nmi_known_unmasked = false;
6923
6924         if (vmx->rmode.vm86_active) {
6925                 if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0) != EMULATE_DONE)
6926                         kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
6927                 return;
6928         }
6929
6930         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
6931                         INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
6932
6933         vmx_clear_hlt(vcpu);
6934 }
6935
6936 static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
6937 {
6938         struct vcpu_vmx *vmx = to_vmx(vcpu);
6939         bool masked;
6940
6941         if (!enable_vnmi)
6942                 return vmx->loaded_vmcs->soft_vnmi_blocked;
6943         if (vmx->loaded_vmcs->nmi_known_unmasked)
6944                 return false;
6945         masked = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI;
6946         vmx->loaded_vmcs->nmi_known_unmasked = !masked;
6947         return masked;
6948 }
6949
6950 static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
6951 {
6952         struct vcpu_vmx *vmx = to_vmx(vcpu);
6953
6954         if (!enable_vnmi) {
6955                 if (vmx->loaded_vmcs->soft_vnmi_blocked != masked) {
6956                         vmx->loaded_vmcs->soft_vnmi_blocked = masked;
6957                         vmx->loaded_vmcs->vnmi_blocked_time = 0;
6958                 }
6959         } else {
6960                 vmx->loaded_vmcs->nmi_known_unmasked = !masked;
6961                 if (masked)
6962                         vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
6963                                       GUEST_INTR_STATE_NMI);
6964                 else
6965                         vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
6966                                         GUEST_INTR_STATE_NMI);
6967         }
6968 }
6969
6970 static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
6971 {
6972         if (to_vmx(vcpu)->nested.nested_run_pending)
6973                 return 0;
6974
6975         if (!enable_vnmi &&
6976             to_vmx(vcpu)->loaded_vmcs->soft_vnmi_blocked)
6977                 return 0;
6978
6979         return  !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
6980                   (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI
6981                    | GUEST_INTR_STATE_NMI));
6982 }
6983
6984 static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
6985 {
6986         if (to_vmx(vcpu)->nested.nested_run_pending)
6987                 return false;
6988
6989         if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
6990                 return true;
6991
6992         return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
6993                 !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
6994                         (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
6995 }
6996
6997 static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
6998 {
6999         int ret;
7000
7001         if (enable_unrestricted_guest)
7002                 return 0;
7003
7004         ret = x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr,
7005                                     PAGE_SIZE * 3);
7006         if (ret)
7007                 return ret;
7008         to_kvm_vmx(kvm)->tss_addr = addr;
7009         return init_rmode_tss(kvm);
7010 }
7011
7012 static int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
7013 {
7014         to_kvm_vmx(kvm)->ept_identity_map_addr = ident_addr;
7015         return 0;
7016 }
7017
7018 static bool rmode_exception(struct kvm_vcpu *vcpu, int vec)
7019 {
7020         switch (vec) {
7021         case BP_VECTOR:
7022                 /*
7023                  * Update instruction length as we may reinject the exception
7024                  * from user space while in guest debugging mode.
7025                  */
7026                 to_vmx(vcpu)->vcpu.arch.event_exit_inst_len =
7027                         vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
7028                 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
7029                         return false;
7030                 /* fall through */
7031         case DB_VECTOR:
7032                 if (vcpu->guest_debug &
7033                         (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
7034                         return false;
7035                 /* fall through */
7036         case DE_VECTOR:
7037         case OF_VECTOR:
7038         case BR_VECTOR:
7039         case UD_VECTOR:
7040         case DF_VECTOR:
7041         case SS_VECTOR:
7042         case GP_VECTOR:
7043         case MF_VECTOR:
7044                 return true;
7045         break;
7046         }
7047         return false;
7048 }
7049
7050 static int handle_rmode_exception(struct kvm_vcpu *vcpu,
7051                                   int vec, u32 err_code)
7052 {
7053         /*
7054          * Instruction with address size override prefix opcode 0x67
7055          * Cause the #SS fault with 0 error code in VM86 mode.
7056          */
7057         if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) {
7058                 if (kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE) {
7059                         if (vcpu->arch.halt_request) {
7060                                 vcpu->arch.halt_request = 0;
7061                                 return kvm_vcpu_halt(vcpu);
7062                         }
7063                         return 1;
7064                 }
7065                 return 0;
7066         }
7067
7068         /*
7069          * Forward all other exceptions that are valid in real mode.
7070          * FIXME: Breaks guest debugging in real mode, needs to be fixed with
7071          *        the required debugging infrastructure rework.
7072          */
7073         kvm_queue_exception(vcpu, vec);
7074         return 1;
7075 }
7076
7077 /*
7078  * Trigger machine check on the host. We assume all the MSRs are already set up
7079  * by the CPU and that we still run on the same CPU as the MCE occurred on.
7080  * We pass a fake environment to the machine check handler because we want
7081  * the guest to be always treated like user space, no matter what context
7082  * it used internally.
7083  */
7084 static void kvm_machine_check(void)
7085 {
7086 #if defined(CONFIG_X86_MCE)
7087         struct pt_regs regs = {
7088                 .cs = 3, /* Fake ring 3 no matter what the guest ran on */
7089                 .flags = X86_EFLAGS_IF,
7090         };
7091
7092         do_machine_check(&regs, 0);
7093 #endif
7094 }
7095
7096 static int handle_machine_check(struct kvm_vcpu *vcpu)
7097 {
7098         /* already handled by vcpu_run */
7099         return 1;
7100 }
7101
7102 static int handle_exception(struct kvm_vcpu *vcpu)
7103 {
7104         struct vcpu_vmx *vmx = to_vmx(vcpu);
7105         struct kvm_run *kvm_run = vcpu->run;
7106         u32 intr_info, ex_no, error_code;
7107         unsigned long cr2, rip, dr6;
7108         u32 vect_info;
7109         enum emulation_result er;
7110
7111         vect_info = vmx->idt_vectoring_info;
7112         intr_info = vmx->exit_intr_info;
7113
7114         if (is_machine_check(intr_info))
7115                 return handle_machine_check(vcpu);
7116
7117         if (is_nmi(intr_info))
7118                 return 1;  /* already handled by vmx_vcpu_run() */
7119
7120         if (is_invalid_opcode(intr_info))
7121                 return handle_ud(vcpu);
7122
7123         error_code = 0;
7124         if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
7125                 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
7126
7127         if (!vmx->rmode.vm86_active && is_gp_fault(intr_info)) {
7128                 WARN_ON_ONCE(!enable_vmware_backdoor);
7129                 er = kvm_emulate_instruction(vcpu,
7130                         EMULTYPE_VMWARE | EMULTYPE_NO_UD_ON_FAIL);
7131                 if (er == EMULATE_USER_EXIT)
7132                         return 0;
7133                 else if (er != EMULATE_DONE)
7134                         kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
7135                 return 1;
7136         }
7137
7138         /*
7139          * The #PF with PFEC.RSVD = 1 indicates the guest is accessing
7140          * MMIO, it is better to report an internal error.
7141          * See the comments in vmx_handle_exit.
7142          */
7143         if ((vect_info & VECTORING_INFO_VALID_MASK) &&
7144             !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) {
7145                 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
7146                 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
7147                 vcpu->run->internal.ndata = 3;
7148                 vcpu->run->internal.data[0] = vect_info;
7149                 vcpu->run->internal.data[1] = intr_info;
7150                 vcpu->run->internal.data[2] = error_code;
7151                 return 0;
7152         }
7153
7154         if (is_page_fault(intr_info)) {
7155                 cr2 = vmcs_readl(EXIT_QUALIFICATION);
7156                 /* EPT won't cause page fault directly */
7157                 WARN_ON_ONCE(!vcpu->arch.apf.host_apf_reason && enable_ept);
7158                 return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0);
7159         }
7160
7161         ex_no = intr_info & INTR_INFO_VECTOR_MASK;
7162
7163         if (vmx->rmode.vm86_active && rmode_exception(vcpu, ex_no))
7164                 return handle_rmode_exception(vcpu, ex_no, error_code);
7165
7166         switch (ex_no) {
7167         case AC_VECTOR:
7168                 kvm_queue_exception_e(vcpu, AC_VECTOR, error_code);
7169                 return 1;
7170         case DB_VECTOR:
7171                 dr6 = vmcs_readl(EXIT_QUALIFICATION);
7172                 if (!(vcpu->guest_debug &
7173                       (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
7174                         vcpu->arch.dr6 &= ~15;
7175                         vcpu->arch.dr6 |= dr6 | DR6_RTM;
7176                         if (is_icebp(intr_info))
7177                                 skip_emulated_instruction(vcpu);
7178
7179                         kvm_queue_exception(vcpu, DB_VECTOR);
7180                         return 1;
7181                 }
7182                 kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1;
7183                 kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7);
7184                 /* fall through */
7185         case BP_VECTOR:
7186                 /*
7187                  * Update instruction length as we may reinject #BP from
7188                  * user space while in guest debugging mode. Reading it for
7189                  * #DB as well causes no harm, it is not used in that case.
7190                  */
7191                 vmx->vcpu.arch.event_exit_inst_len =
7192                         vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
7193                 kvm_run->exit_reason = KVM_EXIT_DEBUG;
7194                 rip = kvm_rip_read(vcpu);
7195                 kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip;
7196                 kvm_run->debug.arch.exception = ex_no;
7197                 break;
7198         default:
7199                 kvm_run->exit_reason = KVM_EXIT_EXCEPTION;
7200                 kvm_run->ex.exception = ex_no;
7201                 kvm_run->ex.error_code = error_code;
7202                 break;
7203         }
7204         return 0;
7205 }
7206
7207 static int handle_external_interrupt(struct kvm_vcpu *vcpu)
7208 {
7209         ++vcpu->stat.irq_exits;
7210         return 1;
7211 }
7212
7213 static int handle_triple_fault(struct kvm_vcpu *vcpu)
7214 {
7215         vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
7216         vcpu->mmio_needed = 0;
7217         return 0;
7218 }
7219
7220 static int handle_io(struct kvm_vcpu *vcpu)
7221 {
7222         unsigned long exit_qualification;
7223         int size, in, string;
7224         unsigned port;
7225
7226         exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
7227         string = (exit_qualification & 16) != 0;
7228
7229         ++vcpu->stat.io_exits;
7230
7231         if (string)
7232                 return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE;
7233
7234         port = exit_qualification >> 16;
7235         size = (exit_qualification & 7) + 1;
7236         in = (exit_qualification & 8) != 0;
7237
7238         return kvm_fast_pio(vcpu, size, port, in);
7239 }
7240
7241 static void
7242 vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
7243 {
7244         /*
7245          * Patch in the VMCALL instruction:
7246          */
7247         hypercall[0] = 0x0f;
7248         hypercall[1] = 0x01;
7249         hypercall[2] = 0xc1;
7250 }
7251
7252 /* called to set cr0 as appropriate for a mov-to-cr0 exit. */
7253 static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
7254 {
7255         if (is_guest_mode(vcpu)) {
7256                 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
7257                 unsigned long orig_val = val;
7258
7259                 /*
7260                  * We get here when L2 changed cr0 in a way that did not change
7261                  * any of L1's shadowed bits (see nested_vmx_exit_handled_cr),
7262                  * but did change L0 shadowed bits. So we first calculate the
7263                  * effective cr0 value that L1 would like to write into the
7264                  * hardware. It consists of the L2-owned bits from the new
7265                  * value combined with the L1-owned bits from L1's guest_cr0.
7266                  */
7267                 val = (val & ~vmcs12->cr0_guest_host_mask) |
7268                         (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask);
7269
7270                 if (!nested_guest_cr0_valid(vcpu, val))
7271                         return 1;
7272
7273                 if (kvm_set_cr0(vcpu, val))
7274                         return 1;
7275                 vmcs_writel(CR0_READ_SHADOW, orig_val);
7276                 return 0;
7277         } else {
7278                 if (to_vmx(vcpu)->nested.vmxon &&
7279                     !nested_host_cr0_valid(vcpu, val))
7280                         return 1;
7281
7282                 return kvm_set_cr0(vcpu, val);
7283         }
7284 }
7285
7286 static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val)
7287 {
7288         if (is_guest_mode(vcpu)) {
7289                 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
7290                 unsigned long orig_val = val;
7291
7292                 /* analogously to handle_set_cr0 */
7293                 val = (val & ~vmcs12->cr4_guest_host_mask) |
7294                         (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask);
7295                 if (kvm_set_cr4(vcpu, val))
7296                         return 1;
7297                 vmcs_writel(CR4_READ_SHADOW, orig_val);
7298                 return 0;
7299         } else
7300                 return kvm_set_cr4(vcpu, val);
7301 }
7302
7303 static int handle_desc(struct kvm_vcpu *vcpu)
7304 {
7305         WARN_ON(!(vcpu->arch.cr4 & X86_CR4_UMIP));
7306         return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE;
7307 }
7308
7309 static int handle_cr(struct kvm_vcpu *vcpu)
7310 {
7311         unsigned long exit_qualification, val;
7312         int cr;
7313         int reg;
7314         int err;
7315         int ret;
7316
7317         exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
7318         cr = exit_qualification & 15;
7319         reg = (exit_qualification >> 8) & 15;
7320         switch ((exit_qualification >> 4) & 3) {
7321         case 0: /* mov to cr */
7322                 val = kvm_register_readl(vcpu, reg);
7323                 trace_kvm_cr_write(cr, val);
7324                 switch (cr) {
7325                 case 0:
7326                         err = handle_set_cr0(vcpu, val);
7327                         return kvm_complete_insn_gp(vcpu, err);
7328                 case 3:
7329                         WARN_ON_ONCE(enable_unrestricted_guest);
7330                         err = kvm_set_cr3(vcpu, val);
7331                         return kvm_complete_insn_gp(vcpu, err);
7332                 case 4:
7333                         err = handle_set_cr4(vcpu, val);
7334                         return kvm_complete_insn_gp(vcpu, err);
7335                 case 8: {
7336                                 u8 cr8_prev = kvm_get_cr8(vcpu);
7337                                 u8 cr8 = (u8)val;
7338                                 err = kvm_set_cr8(vcpu, cr8);
7339                                 ret = kvm_complete_insn_gp(vcpu, err);
7340                                 if (lapic_in_kernel(vcpu))
7341                                         return ret;
7342                                 if (cr8_prev <= cr8)
7343                                         return ret;
7344                                 /*
7345                                  * TODO: we might be squashing a
7346                                  * KVM_GUESTDBG_SINGLESTEP-triggered
7347                                  * KVM_EXIT_DEBUG here.
7348                                  */
7349                                 vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
7350                                 return 0;
7351                         }
7352                 }
7353                 break;
7354         case 2: /* clts */
7355                 WARN_ONCE(1, "Guest should always own CR0.TS");
7356                 vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
7357                 trace_kvm_cr_write(0, kvm_read_cr0(vcpu));
7358                 return kvm_skip_emulated_instruction(vcpu);
7359         case 1: /*mov from cr*/
7360                 switch (cr) {
7361                 case 3:
7362                         WARN_ON_ONCE(enable_unrestricted_guest);
7363                         val = kvm_read_cr3(vcpu);
7364                         kvm_register_write(vcpu, reg, val);
7365                         trace_kvm_cr_read(cr, val);
7366                         return kvm_skip_emulated_instruction(vcpu);
7367                 case 8:
7368                         val = kvm_get_cr8(vcpu);
7369                         kvm_register_write(vcpu, reg, val);
7370                         trace_kvm_cr_read(cr, val);
7371                         return kvm_skip_emulated_instruction(vcpu);
7372                 }
7373                 break;
7374         case 3: /* lmsw */
7375                 val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
7376                 trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val);
7377                 kvm_lmsw(vcpu, val);
7378
7379                 return kvm_skip_emulated_instruction(vcpu);
7380         default:
7381                 break;
7382         }
7383         vcpu->run->exit_reason = 0;
7384         vcpu_unimpl(vcpu, "unhandled control register: op %d cr %d\n",
7385                (int)(exit_qualification >> 4) & 3, cr);
7386         return 0;
7387 }
7388
7389 static int handle_dr(struct kvm_vcpu *vcpu)
7390 {
7391         unsigned long exit_qualification;
7392         int dr, dr7, reg;
7393
7394         exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
7395         dr = exit_qualification & DEBUG_REG_ACCESS_NUM;
7396
7397         /* First, if DR does not exist, trigger UD */
7398         if (!kvm_require_dr(vcpu, dr))
7399                 return 1;
7400
7401         /* Do not handle if the CPL > 0, will trigger GP on re-entry */
7402         if (!kvm_require_cpl(vcpu, 0))
7403                 return 1;
7404         dr7 = vmcs_readl(GUEST_DR7);
7405         if (dr7 & DR7_GD) {
7406                 /*
7407                  * As the vm-exit takes precedence over the debug trap, we
7408                  * need to emulate the latter, either for the host or the
7409                  * guest debugging itself.
7410                  */
7411                 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
7412                         vcpu->run->debug.arch.dr6 = vcpu->arch.dr6;
7413                         vcpu->run->debug.arch.dr7 = dr7;
7414                         vcpu->run->debug.arch.pc = kvm_get_linear_rip(vcpu);
7415                         vcpu->run->debug.arch.exception = DB_VECTOR;
7416                         vcpu->run->exit_reason = KVM_EXIT_DEBUG;
7417                         return 0;
7418                 } else {
7419                         vcpu->arch.dr6 &= ~15;
7420                         vcpu->arch.dr6 |= DR6_BD | DR6_RTM;
7421                         kvm_queue_exception(vcpu, DB_VECTOR);
7422                         return 1;
7423                 }
7424         }
7425
7426         if (vcpu->guest_debug == 0) {
7427                 vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
7428                                 CPU_BASED_MOV_DR_EXITING);
7429
7430                 /*
7431                  * No more DR vmexits; force a reload of the debug registers
7432                  * and reenter on this instruction.  The next vmexit will
7433                  * retrieve the full state of the debug registers.
7434                  */
7435                 vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
7436                 return 1;
7437         }
7438
7439         reg = DEBUG_REG_ACCESS_REG(exit_qualification);
7440         if (exit_qualification & TYPE_MOV_FROM_DR) {
7441                 unsigned long val;
7442
7443                 if (kvm_get_dr(vcpu, dr, &val))
7444                         return 1;
7445                 kvm_register_write(vcpu, reg, val);
7446         } else
7447                 if (kvm_set_dr(vcpu, dr, kvm_register_readl(vcpu, reg)))
7448                         return 1;
7449
7450         return kvm_skip_emulated_instruction(vcpu);
7451 }
7452
7453 static u64 vmx_get_dr6(struct kvm_vcpu *vcpu)
7454 {
7455         return vcpu->arch.dr6;
7456 }
7457
7458 static void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val)
7459 {
7460 }
7461
7462 static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
7463 {
7464         get_debugreg(vcpu->arch.db[0], 0);
7465         get_debugreg(vcpu->arch.db[1], 1);
7466         get_debugreg(vcpu->arch.db[2], 2);
7467         get_debugreg(vcpu->arch.db[3], 3);
7468         get_debugreg(vcpu->arch.dr6, 6);
7469         vcpu->arch.dr7 = vmcs_readl(GUEST_DR7);
7470
7471         vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
7472         vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL, CPU_BASED_MOV_DR_EXITING);
7473 }
7474
7475 static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
7476 {
7477         vmcs_writel(GUEST_DR7, val);
7478 }
7479
7480 static int handle_cpuid(struct kvm_vcpu *vcpu)
7481 {
7482         return kvm_emulate_cpuid(vcpu);
7483 }
7484
7485 static int handle_rdmsr(struct kvm_vcpu *vcpu)
7486 {
7487         u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
7488         struct msr_data msr_info;
7489
7490         msr_info.index = ecx;
7491         msr_info.host_initiated = false;
7492         if (vmx_get_msr(vcpu, &msr_info)) {
7493                 trace_kvm_msr_read_ex(ecx);
7494                 kvm_inject_gp(vcpu, 0);
7495                 return 1;
7496         }
7497
7498         trace_kvm_msr_read(ecx, msr_info.data);
7499
7500         /* FIXME: handling of bits 32:63 of rax, rdx */
7501         vcpu->arch.regs[VCPU_REGS_RAX] = msr_info.data & -1u;
7502         vcpu->arch.regs[VCPU_REGS_RDX] = (msr_info.data >> 32) & -1u;
7503         return kvm_skip_emulated_instruction(vcpu);
7504 }
7505
7506 static int handle_wrmsr(struct kvm_vcpu *vcpu)
7507 {
7508         struct msr_data msr;
7509         u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
7510         u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u)
7511                 | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32);
7512
7513         msr.data = data;
7514         msr.index = ecx;
7515         msr.host_initiated = false;
7516         if (kvm_set_msr(vcpu, &msr) != 0) {
7517                 trace_kvm_msr_write_ex(ecx, data);
7518                 kvm_inject_gp(vcpu, 0);
7519                 return 1;
7520         }
7521
7522         trace_kvm_msr_write(ecx, data);
7523         return kvm_skip_emulated_instruction(vcpu);
7524 }
7525
7526 static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
7527 {
7528         kvm_apic_update_ppr(vcpu);
7529         return 1;
7530 }
7531
7532 static int handle_interrupt_window(struct kvm_vcpu *vcpu)
7533 {
7534         vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
7535                         CPU_BASED_VIRTUAL_INTR_PENDING);
7536
7537         kvm_make_request(KVM_REQ_EVENT, vcpu);
7538
7539         ++vcpu->stat.irq_window_exits;
7540         return 1;
7541 }
7542
7543 static int handle_halt(struct kvm_vcpu *vcpu)
7544 {
7545         return kvm_emulate_halt(vcpu);
7546 }
7547
7548 static int handle_vmcall(struct kvm_vcpu *vcpu)
7549 {
7550         return kvm_emulate_hypercall(vcpu);
7551 }
7552
7553 static int handle_invd(struct kvm_vcpu *vcpu)
7554 {
7555         return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE;
7556 }
7557
7558 static int handle_invlpg(struct kvm_vcpu *vcpu)
7559 {
7560         unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
7561
7562         kvm_mmu_invlpg(vcpu, exit_qualification);
7563         return kvm_skip_emulated_instruction(vcpu);
7564 }
7565
7566 static int handle_rdpmc(struct kvm_vcpu *vcpu)
7567 {
7568         int err;
7569
7570         err = kvm_rdpmc(vcpu);
7571         return kvm_complete_insn_gp(vcpu, err);
7572 }
7573
7574 static int handle_wbinvd(struct kvm_vcpu *vcpu)
7575 {
7576         return kvm_emulate_wbinvd(vcpu);
7577 }
7578
7579 static int handle_xsetbv(struct kvm_vcpu *vcpu)
7580 {
7581         u64 new_bv = kvm_read_edx_eax(vcpu);
7582         u32 index = kvm_register_read(vcpu, VCPU_REGS_RCX);
7583
7584         if (kvm_set_xcr(vcpu, index, new_bv) == 0)
7585                 return kvm_skip_emulated_instruction(vcpu);
7586         return 1;
7587 }
7588
7589 static int handle_xsaves(struct kvm_vcpu *vcpu)
7590 {
7591         kvm_skip_emulated_instruction(vcpu);
7592         WARN(1, "this should never happen\n");
7593         return 1;
7594 }
7595
7596 static int handle_xrstors(struct kvm_vcpu *vcpu)
7597 {
7598         kvm_skip_emulated_instruction(vcpu);
7599         WARN(1, "this should never happen\n");
7600         return 1;
7601 }
7602
7603 static int handle_apic_access(struct kvm_vcpu *vcpu)
7604 {
7605         if (likely(fasteoi)) {
7606                 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
7607                 int access_type, offset;
7608
7609                 access_type = exit_qualification & APIC_ACCESS_TYPE;
7610                 offset = exit_qualification & APIC_ACCESS_OFFSET;
7611                 /*
7612                  * Sane guest uses MOV to write EOI, with written value
7613                  * not cared. So make a short-circuit here by avoiding
7614                  * heavy instruction emulation.
7615                  */
7616                 if ((access_type == TYPE_LINEAR_APIC_INST_WRITE) &&
7617                     (offset == APIC_EOI)) {
7618                         kvm_lapic_set_eoi(vcpu);
7619                         return kvm_skip_emulated_instruction(vcpu);
7620                 }
7621         }
7622         return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE;
7623 }
7624
7625 static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu)
7626 {
7627         unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
7628         int vector = exit_qualification & 0xff;
7629
7630         /* EOI-induced VM exit is trap-like and thus no need to adjust IP */
7631         kvm_apic_set_eoi_accelerated(vcpu, vector);
7632         return 1;
7633 }
7634
7635 static int handle_apic_write(struct kvm_vcpu *vcpu)
7636 {
7637         unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
7638         u32 offset = exit_qualification & 0xfff;
7639
7640         /* APIC-write VM exit is trap-like and thus no need to adjust IP */
7641         kvm_apic_write_nodecode(vcpu, offset);
7642         return 1;
7643 }
7644
7645 static int handle_task_switch(struct kvm_vcpu *vcpu)
7646 {
7647         struct vcpu_vmx *vmx = to_vmx(vcpu);
7648         unsigned long exit_qualification;
7649         bool has_error_code = false;
7650         u32 error_code = 0;
7651         u16 tss_selector;
7652         int reason, type, idt_v, idt_index;
7653
7654         idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
7655         idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK);
7656         type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK);
7657
7658         exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
7659
7660         reason = (u32)exit_qualification >> 30;
7661         if (reason == TASK_SWITCH_GATE && idt_v) {
7662                 switch (type) {
7663                 case INTR_TYPE_NMI_INTR:
7664                         vcpu->arch.nmi_injected = false;
7665                         vmx_set_nmi_mask(vcpu, true);
7666                         break;
7667                 case INTR_TYPE_EXT_INTR:
7668                 case INTR_TYPE_SOFT_INTR:
7669                         kvm_clear_interrupt_queue(vcpu);
7670                         break;
7671                 case INTR_TYPE_HARD_EXCEPTION:
7672                         if (vmx->idt_vectoring_info &
7673                             VECTORING_INFO_DELIVER_CODE_MASK) {
7674                                 has_error_code = true;
7675                                 error_code =
7676                                         vmcs_read32(IDT_VECTORING_ERROR_CODE);
7677                         }
7678                         /* fall through */
7679                 case INTR_TYPE_SOFT_EXCEPTION:
7680                         kvm_clear_exception_queue(vcpu);
7681                         break;
7682                 default:
7683                         break;
7684                 }
7685         }
7686         tss_selector = exit_qualification;
7687
7688         if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION &&
7689                        type != INTR_TYPE_EXT_INTR &&
7690                        type != INTR_TYPE_NMI_INTR))
7691                 skip_emulated_instruction(vcpu);
7692
7693         if (kvm_task_switch(vcpu, tss_selector,
7694                             type == INTR_TYPE_SOFT_INTR ? idt_index : -1, reason,
7695                             has_error_code, error_code) == EMULATE_FAIL) {
7696                 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
7697                 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
7698                 vcpu->run->internal.ndata = 0;
7699                 return 0;
7700         }
7701
7702         /*
7703          * TODO: What about debug traps on tss switch?
7704          *       Are we supposed to inject them and update dr6?
7705          */
7706
7707         return 1;
7708 }
7709
7710 static int handle_ept_violation(struct kvm_vcpu *vcpu)
7711 {
7712         unsigned long exit_qualification;
7713         gpa_t gpa;
7714         u64 error_code;
7715
7716         exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
7717
7718         /*
7719          * EPT violation happened while executing iret from NMI,
7720          * "blocked by NMI" bit has to be set before next VM entry.
7721          * There are errata that may cause this bit to not be set:
7722          * AAK134, BY25.
7723          */
7724         if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
7725                         enable_vnmi &&
7726                         (exit_qualification & INTR_INFO_UNBLOCK_NMI))
7727                 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI);
7728
7729         gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
7730         trace_kvm_page_fault(gpa, exit_qualification);
7731
7732         /* Is it a read fault? */
7733         error_code = (exit_qualification & EPT_VIOLATION_ACC_READ)
7734                      ? PFERR_USER_MASK : 0;
7735         /* Is it a write fault? */
7736         error_code |= (exit_qualification & EPT_VIOLATION_ACC_WRITE)
7737                       ? PFERR_WRITE_MASK : 0;
7738         /* Is it a fetch fault? */
7739         error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR)
7740                       ? PFERR_FETCH_MASK : 0;
7741         /* ept page table entry is present? */
7742         error_code |= (exit_qualification &
7743                        (EPT_VIOLATION_READABLE | EPT_VIOLATION_WRITABLE |
7744                         EPT_VIOLATION_EXECUTABLE))
7745                       ? PFERR_PRESENT_MASK : 0;
7746
7747         error_code |= (exit_qualification & 0x100) != 0 ?
7748                PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;
7749
7750         vcpu->arch.exit_qualification = exit_qualification;
7751         return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
7752 }
7753
7754 static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
7755 {
7756         gpa_t gpa;
7757
7758         /*
7759          * A nested guest cannot optimize MMIO vmexits, because we have an
7760          * nGPA here instead of the required GPA.
7761          */
7762         gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
7763         if (!is_guest_mode(vcpu) &&
7764             !kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
7765                 trace_kvm_fast_mmio(gpa);
7766                 /*
7767                  * Doing kvm_skip_emulated_instruction() depends on undefined
7768                  * behavior: Intel's manual doesn't mandate
7769                  * VM_EXIT_INSTRUCTION_LEN to be set in VMCS when EPT MISCONFIG
7770                  * occurs and while on real hardware it was observed to be set,
7771                  * other hypervisors (namely Hyper-V) don't set it, we end up
7772                  * advancing IP with some random value. Disable fast mmio when
7773                  * running nested and keep it for real hardware in hope that
7774                  * VM_EXIT_INSTRUCTION_LEN will always be set correctly.
7775                  */
7776                 if (!static_cpu_has(X86_FEATURE_HYPERVISOR))
7777                         return kvm_skip_emulated_instruction(vcpu);
7778                 else
7779                         return kvm_emulate_instruction(vcpu, EMULTYPE_SKIP) ==
7780                                                                 EMULATE_DONE;
7781         }
7782
7783         return kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0);
7784 }
7785
7786 static int handle_nmi_window(struct kvm_vcpu *vcpu)
7787 {
7788         WARN_ON_ONCE(!enable_vnmi);
7789         vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
7790                         CPU_BASED_VIRTUAL_NMI_PENDING);
7791         ++vcpu->stat.nmi_window_exits;
7792         kvm_make_request(KVM_REQ_EVENT, vcpu);
7793
7794         return 1;
7795 }
7796
7797 static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
7798 {
7799         struct vcpu_vmx *vmx = to_vmx(vcpu);
7800         enum emulation_result err = EMULATE_DONE;
7801         int ret = 1;
7802         u32 cpu_exec_ctrl;
7803         bool intr_window_requested;
7804         unsigned count = 130;
7805
7806         /*
7807          * We should never reach the point where we are emulating L2
7808          * due to invalid guest state as that means we incorrectly
7809          * allowed a nested VMEntry with an invalid vmcs12.
7810          */
7811         WARN_ON_ONCE(vmx->emulation_required && vmx->nested.nested_run_pending);
7812
7813         cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
7814         intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING;
7815
7816         while (vmx->emulation_required && count-- != 0) {
7817                 if (intr_window_requested && vmx_interrupt_allowed(vcpu))
7818                         return handle_interrupt_window(&vmx->vcpu);
7819
7820                 if (kvm_test_request(KVM_REQ_EVENT, vcpu))
7821                         return 1;
7822
7823                 err = kvm_emulate_instruction(vcpu, 0);
7824
7825                 if (err == EMULATE_USER_EXIT) {
7826                         ++vcpu->stat.mmio_exits;
7827                         ret = 0;
7828                         goto out;
7829                 }
7830
7831                 if (err != EMULATE_DONE)
7832                         goto emulation_error;
7833
7834                 if (vmx->emulation_required && !vmx->rmode.vm86_active &&
7835                     vcpu->arch.exception.pending)
7836                         goto emulation_error;
7837
7838                 if (vcpu->arch.halt_request) {
7839                         vcpu->arch.halt_request = 0;
7840                         ret = kvm_vcpu_halt(vcpu);
7841                         goto out;
7842                 }
7843
7844                 if (signal_pending(current))
7845                         goto out;
7846                 if (need_resched())
7847                         schedule();
7848         }
7849
7850 out:
7851         return ret;
7852
7853 emulation_error:
7854         vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
7855         vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
7856         vcpu->run->internal.ndata = 0;
7857         return 0;
7858 }
7859
7860 static void grow_ple_window(struct kvm_vcpu *vcpu)
7861 {
7862         struct vcpu_vmx *vmx = to_vmx(vcpu);
7863         int old = vmx->ple_window;
7864
7865         vmx->ple_window = __grow_ple_window(old, ple_window,
7866                                             ple_window_grow,
7867                                             ple_window_max);
7868
7869         if (vmx->ple_window != old)
7870                 vmx->ple_window_dirty = true;
7871
7872         trace_kvm_ple_window_grow(vcpu->vcpu_id, vmx->ple_window, old);
7873 }
7874
7875 static void shrink_ple_window(struct kvm_vcpu *vcpu)
7876 {
7877         struct vcpu_vmx *vmx = to_vmx(vcpu);
7878         int old = vmx->ple_window;
7879
7880         vmx->ple_window = __shrink_ple_window(old, ple_window,
7881                                               ple_window_shrink,
7882                                               ple_window);
7883
7884         if (vmx->ple_window != old)
7885                 vmx->ple_window_dirty = true;
7886
7887         trace_kvm_ple_window_shrink(vcpu->vcpu_id, vmx->ple_window, old);
7888 }
7889
7890 /*
7891  * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR.
7892  */
7893 static void wakeup_handler(void)
7894 {
7895         struct kvm_vcpu *vcpu;
7896         int cpu = smp_processor_id();
7897
7898         spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
7899         list_for_each_entry(vcpu, &per_cpu(blocked_vcpu_on_cpu, cpu),
7900                         blocked_vcpu_list) {
7901                 struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
7902
7903                 if (pi_test_on(pi_desc) == 1)
7904                         kvm_vcpu_kick(vcpu);
7905         }
7906         spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
7907 }
7908
7909 static void vmx_enable_tdp(void)
7910 {
7911         kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK,
7912                 enable_ept_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull,
7913                 enable_ept_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull,
7914                 0ull, VMX_EPT_EXECUTABLE_MASK,
7915                 cpu_has_vmx_ept_execute_only() ? 0ull : VMX_EPT_READABLE_MASK,
7916                 VMX_EPT_RWX_MASK, 0ull);
7917
7918         ept_set_mmio_spte_mask();
7919         kvm_enable_tdp();
7920 }
7921
7922 static __init int hardware_setup(void)
7923 {
7924         unsigned long host_bndcfgs;
7925         int r = -ENOMEM, i;
7926
7927         rdmsrl_safe(MSR_EFER, &host_efer);
7928
7929         for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i)
7930                 kvm_define_shared_msr(i, vmx_msr_index[i]);
7931
7932         for (i = 0; i < VMX_BITMAP_NR; i++) {
7933                 vmx_bitmap[i] = (unsigned long *)__get_free_page(GFP_KERNEL);
7934                 if (!vmx_bitmap[i])
7935                         goto out;
7936         }
7937
7938         memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
7939         memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);
7940
7941         if (setup_vmcs_config(&vmcs_config) < 0) {
7942                 r = -EIO;
7943                 goto out;
7944         }
7945
7946         if (boot_cpu_has(X86_FEATURE_NX))
7947                 kvm_enable_efer_bits(EFER_NX);
7948
7949         if (boot_cpu_has(X86_FEATURE_MPX)) {
7950                 rdmsrl(MSR_IA32_BNDCFGS, host_bndcfgs);
7951                 WARN_ONCE(host_bndcfgs, "KVM: BNDCFGS in host will be lost");
7952         }
7953
7954         if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() ||
7955                 !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global()))
7956                 enable_vpid = 0;
7957
7958         if (!cpu_has_vmx_ept() ||
7959             !cpu_has_vmx_ept_4levels() ||
7960             !cpu_has_vmx_ept_mt_wb() ||
7961             !cpu_has_vmx_invept_global())
7962                 enable_ept = 0;
7963
7964         if (!cpu_has_vmx_ept_ad_bits() || !enable_ept)
7965                 enable_ept_ad_bits = 0;
7966
7967         if (!cpu_has_vmx_unrestricted_guest() || !enable_ept)
7968                 enable_unrestricted_guest = 0;
7969
7970         if (!cpu_has_vmx_flexpriority())
7971                 flexpriority_enabled = 0;
7972
7973         if (!cpu_has_virtual_nmis())
7974                 enable_vnmi = 0;
7975
7976         /*
7977          * set_apic_access_page_addr() is used to reload apic access
7978          * page upon invalidation.  No need to do anything if not
7979          * using the APIC_ACCESS_ADDR VMCS field.
7980          */
7981         if (!flexpriority_enabled)
7982                 kvm_x86_ops->set_apic_access_page_addr = NULL;
7983
7984         if (!cpu_has_vmx_tpr_shadow())
7985                 kvm_x86_ops->update_cr8_intercept = NULL;
7986
7987         if (enable_ept && !cpu_has_vmx_ept_2m_page())
7988                 kvm_disable_largepages();
7989
7990 #if IS_ENABLED(CONFIG_HYPERV)
7991         if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH
7992             && enable_ept)
7993                 kvm_x86_ops->tlb_remote_flush = vmx_hv_remote_flush_tlb;
7994 #endif
7995
7996         if (!cpu_has_vmx_ple()) {
7997                 ple_gap = 0;
7998                 ple_window = 0;
7999                 ple_window_grow = 0;
8000                 ple_window_max = 0;
8001                 ple_window_shrink = 0;
8002         }
8003
8004         if (!cpu_has_vmx_apicv()) {
8005                 enable_apicv = 0;
8006                 kvm_x86_ops->sync_pir_to_irr = NULL;
8007         }
8008
8009         if (cpu_has_vmx_tsc_scaling()) {
8010                 kvm_has_tsc_control = true;
8011                 kvm_max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX;
8012                 kvm_tsc_scaling_ratio_frac_bits = 48;
8013         }
8014
8015         set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
8016
8017         if (enable_ept)
8018                 vmx_enable_tdp();
8019         else
8020                 kvm_disable_tdp();
8021
8022         if (!nested) {
8023                 kvm_x86_ops->get_nested_state = NULL;
8024                 kvm_x86_ops->set_nested_state = NULL;
8025         }
8026
8027         /*
8028          * Only enable PML when hardware supports PML feature, and both EPT
8029          * and EPT A/D bit features are enabled -- PML depends on them to work.
8030          */
8031         if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml())
8032                 enable_pml = 0;
8033
8034         if (!enable_pml) {
8035                 kvm_x86_ops->slot_enable_log_dirty = NULL;
8036                 kvm_x86_ops->slot_disable_log_dirty = NULL;
8037                 kvm_x86_ops->flush_log_dirty = NULL;
8038                 kvm_x86_ops->enable_log_dirty_pt_masked = NULL;
8039         }
8040
8041         if (!cpu_has_vmx_preemption_timer())
8042                 kvm_x86_ops->request_immediate_exit = __kvm_request_immediate_exit;
8043
8044         if (cpu_has_vmx_preemption_timer() && enable_preemption_timer) {
8045                 u64 vmx_msr;
8046
8047                 rdmsrl(MSR_IA32_VMX_MISC, vmx_msr);
8048                 cpu_preemption_timer_multi =
8049                          vmx_msr & VMX_MISC_PREEMPTION_TIMER_RATE_MASK;
8050         } else {
8051                 kvm_x86_ops->set_hv_timer = NULL;
8052                 kvm_x86_ops->cancel_hv_timer = NULL;
8053         }
8054
8055         if (!cpu_has_vmx_shadow_vmcs())
8056                 enable_shadow_vmcs = 0;
8057         if (enable_shadow_vmcs)
8058                 init_vmcs_shadow_fields();
8059
8060         kvm_set_posted_intr_wakeup_handler(wakeup_handler);
8061         nested_vmx_setup_ctls_msrs(&vmcs_config.nested, enable_apicv);
8062
8063         kvm_mce_cap_supported |= MCG_LMCE_P;
8064
8065         r = alloc_kvm_area();
8066         if (r)
8067                 goto out;
8068         return 0;
8069
8070 out:
8071         for (i = 0; i < VMX_BITMAP_NR; i++)
8072                 free_page((unsigned long)vmx_bitmap[i]);
8073
8074         return r;
8075 }
8076
8077 static __exit void hardware_unsetup(void)
8078 {
8079         int i;
8080
8081         for (i = 0; i < VMX_BITMAP_NR; i++)
8082                 free_page((unsigned long)vmx_bitmap[i]);
8083
8084         free_kvm_area();
8085 }
8086
8087 /*
8088  * Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE
8089  * exiting, so only get here on cpu with PAUSE-Loop-Exiting.
8090  */
8091 static int handle_pause(struct kvm_vcpu *vcpu)
8092 {
8093         if (!kvm_pause_in_guest(vcpu->kvm))
8094                 grow_ple_window(vcpu);
8095
8096         /*
8097          * Intel sdm vol3 ch-25.1.3 says: The "PAUSE-loop exiting"
8098          * VM-execution control is ignored if CPL > 0. OTOH, KVM
8099          * never set PAUSE_EXITING and just set PLE if supported,
8100          * so the vcpu must be CPL=0 if it gets a PAUSE exit.
8101          */
8102         kvm_vcpu_on_spin(vcpu, true);
8103         return kvm_skip_emulated_instruction(vcpu);
8104 }
8105
8106 static int handle_nop(struct kvm_vcpu *vcpu)
8107 {
8108         return kvm_skip_emulated_instruction(vcpu);
8109 }
8110
8111 static int handle_mwait(struct kvm_vcpu *vcpu)
8112 {
8113         printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n");
8114         return handle_nop(vcpu);
8115 }
8116
8117 static int handle_invalid_op(struct kvm_vcpu *vcpu)
8118 {
8119         kvm_queue_exception(vcpu, UD_VECTOR);
8120         return 1;
8121 }
8122
8123 static int handle_monitor_trap(struct kvm_vcpu *vcpu)
8124 {
8125         return 1;
8126 }
8127
8128 static int handle_monitor(struct kvm_vcpu *vcpu)
8129 {
8130         printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n");
8131         return handle_nop(vcpu);
8132 }
8133
8134 /*
8135  * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
8136  * set the success or error code of an emulated VMX instruction, as specified
8137  * by Vol 2B, VMX Instruction Reference, "Conventions".
8138  */
8139 static void nested_vmx_succeed(struct kvm_vcpu *vcpu)
8140 {
8141         vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
8142                         & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
8143                             X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
8144 }
8145
8146 static void nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
8147 {
8148         vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
8149                         & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
8150                             X86_EFLAGS_SF | X86_EFLAGS_OF))
8151                         | X86_EFLAGS_CF);
8152 }
8153
8154 static void nested_vmx_failValid(struct kvm_vcpu *vcpu,
8155                                         u32 vm_instruction_error)
8156 {
8157         if (to_vmx(vcpu)->nested.current_vmptr == -1ull) {
8158                 /*
8159                  * failValid writes the error number to the current VMCS, which
8160                  * can't be done there isn't a current VMCS.
8161                  */
8162                 nested_vmx_failInvalid(vcpu);
8163                 return;
8164         }
8165         vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
8166                         & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
8167                             X86_EFLAGS_SF | X86_EFLAGS_OF))
8168                         | X86_EFLAGS_ZF);
8169         get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
8170         /*
8171          * We don't need to force a shadow sync because
8172          * VM_INSTRUCTION_ERROR is not shadowed
8173          */
8174 }
8175
8176 static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
8177 {
8178         /* TODO: not to reset guest simply here. */
8179         kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
8180         pr_debug_ratelimited("kvm: nested vmx abort, indicator %d\n", indicator);
8181 }
8182
8183 static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer)
8184 {
8185         struct vcpu_vmx *vmx =
8186                 container_of(timer, struct vcpu_vmx, nested.preemption_timer);
8187
8188         vmx->nested.preemption_timer_expired = true;
8189         kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu);
8190         kvm_vcpu_kick(&vmx->vcpu);
8191
8192         return HRTIMER_NORESTART;
8193 }
8194
8195 /*
8196  * Decode the memory-address operand of a vmx instruction, as recorded on an
8197  * exit caused by such an instruction (run by a guest hypervisor).
8198  * On success, returns 0. When the operand is invalid, returns 1 and throws
8199  * #UD or #GP.
8200  */
8201 static int get_vmx_mem_address(struct kvm_vcpu *vcpu,
8202                                  unsigned long exit_qualification,
8203                                  u32 vmx_instruction_info, bool wr, gva_t *ret)
8204 {
8205         gva_t off;
8206         bool exn;
8207         struct kvm_segment s;
8208
8209         /*
8210          * According to Vol. 3B, "Information for VM Exits Due to Instruction
8211          * Execution", on an exit, vmx_instruction_info holds most of the
8212          * addressing components of the operand. Only the displacement part
8213          * is put in exit_qualification (see 3B, "Basic VM-Exit Information").
8214          * For how an actual address is calculated from all these components,
8215          * refer to Vol. 1, "Operand Addressing".
8216          */
8217         int  scaling = vmx_instruction_info & 3;
8218         int  addr_size = (vmx_instruction_info >> 7) & 7;
8219         bool is_reg = vmx_instruction_info & (1u << 10);
8220         int  seg_reg = (vmx_instruction_info >> 15) & 7;
8221         int  index_reg = (vmx_instruction_info >> 18) & 0xf;
8222         bool index_is_valid = !(vmx_instruction_info & (1u << 22));
8223         int  base_reg       = (vmx_instruction_info >> 23) & 0xf;
8224         bool base_is_valid  = !(vmx_instruction_info & (1u << 27));
8225
8226         if (is_reg) {
8227                 kvm_queue_exception(vcpu, UD_VECTOR);
8228                 return 1;
8229         }
8230
8231         /* Addr = segment_base + offset */
8232         /* offset = base + [index * scale] + displacement */
8233         off = exit_qualification; /* holds the displacement */
8234         if (addr_size == 1)
8235                 off = (gva_t)sign_extend64(off, 31);
8236         else if (addr_size == 0)
8237                 off = (gva_t)sign_extend64(off, 15);
8238         if (base_is_valid)
8239                 off += kvm_register_read(vcpu, base_reg);
8240         if (index_is_valid)
8241                 off += kvm_register_read(vcpu, index_reg)<<scaling;
8242         vmx_get_segment(vcpu, &s, seg_reg);
8243
8244         /*
8245          * The effective address, i.e. @off, of a memory operand is truncated
8246          * based on the address size of the instruction.  Note that this is
8247          * the *effective address*, i.e. the address prior to accounting for
8248          * the segment's base.
8249          */
8250         if (addr_size == 1) /* 32 bit */
8251                 off &= 0xffffffff;
8252         else if (addr_size == 0) /* 16 bit */
8253                 off &= 0xffff;
8254
8255         /* Checks for #GP/#SS exceptions. */
8256         exn = false;
8257         if (is_long_mode(vcpu)) {
8258                 /*
8259                  * The virtual/linear address is never truncated in 64-bit
8260                  * mode, e.g. a 32-bit address size can yield a 64-bit virtual
8261                  * address when using FS/GS with a non-zero base.
8262                  */
8263                 *ret = s.base + off;
8264
8265                 /* Long mode: #GP(0)/#SS(0) if the memory address is in a
8266                  * non-canonical form. This is the only check on the memory
8267                  * destination for long mode!
8268                  */
8269                 exn = is_noncanonical_address(*ret, vcpu);
8270         } else if (is_protmode(vcpu)) {
8271                 /*
8272                  * When not in long mode, the virtual/linear address is
8273                  * unconditionally truncated to 32 bits regardless of the
8274                  * address size.
8275                  */
8276                 *ret = (s.base + off) & 0xffffffff;
8277
8278                 /* Protected mode: apply checks for segment validity in the
8279                  * following order:
8280                  * - segment type check (#GP(0) may be thrown)
8281                  * - usability check (#GP(0)/#SS(0))
8282                  * - limit check (#GP(0)/#SS(0))
8283                  */
8284                 if (wr)
8285                         /* #GP(0) if the destination operand is located in a
8286                          * read-only data segment or any code segment.
8287                          */
8288                         exn = ((s.type & 0xa) == 0 || (s.type & 8));
8289                 else
8290                         /* #GP(0) if the source operand is located in an
8291                          * execute-only code segment
8292                          */
8293                         exn = ((s.type & 0xa) == 8);
8294                 if (exn) {
8295                         kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
8296                         return 1;
8297                 }
8298                 /* Protected mode: #GP(0)/#SS(0) if the segment is unusable.
8299                  */
8300                 exn = (s.unusable != 0);
8301
8302                 /*
8303                  * Protected mode: #GP(0)/#SS(0) if the memory operand is
8304                  * outside the segment limit.  All CPUs that support VMX ignore
8305                  * limit checks for flat segments, i.e. segments with base==0,
8306                  * limit==0xffffffff and of type expand-up data or code.
8307                  */
8308                 if (!(s.base == 0 && s.limit == 0xffffffff &&
8309                      ((s.type & 8) || !(s.type & 4))))
8310                         exn = exn || (off + sizeof(u64) > s.limit);
8311         }
8312         if (exn) {
8313                 kvm_queue_exception_e(vcpu,
8314                                       seg_reg == VCPU_SREG_SS ?
8315                                                 SS_VECTOR : GP_VECTOR,
8316                                       0);
8317                 return 1;
8318         }
8319
8320         return 0;
8321 }
8322
8323 static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer)
8324 {
8325         gva_t gva;
8326         struct x86_exception e;
8327
8328         if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
8329                         vmcs_read32(VMX_INSTRUCTION_INFO), false, &gva))
8330                 return 1;
8331
8332         if (kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e)) {
8333                 kvm_inject_page_fault(vcpu, &e);
8334                 return 1;
8335         }
8336
8337         return 0;
8338 }
8339
8340 /*
8341  * Allocate a shadow VMCS and associate it with the currently loaded
8342  * VMCS, unless such a shadow VMCS already exists. The newly allocated
8343  * VMCS is also VMCLEARed, so that it is ready for use.
8344  */
8345 static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu)
8346 {
8347         struct vcpu_vmx *vmx = to_vmx(vcpu);
8348         struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs;
8349
8350         /*
8351          * We should allocate a shadow vmcs for vmcs01 only when L1
8352          * executes VMXON and free it when L1 executes VMXOFF.
8353          * As it is invalid to execute VMXON twice, we shouldn't reach
8354          * here when vmcs01 already have an allocated shadow vmcs.
8355          */
8356         WARN_ON(loaded_vmcs == &vmx->vmcs01 && loaded_vmcs->shadow_vmcs);
8357
8358         if (!loaded_vmcs->shadow_vmcs) {
8359                 loaded_vmcs->shadow_vmcs = alloc_vmcs(true);
8360                 if (loaded_vmcs->shadow_vmcs)
8361                         vmcs_clear(loaded_vmcs->shadow_vmcs);
8362         }
8363         return loaded_vmcs->shadow_vmcs;
8364 }
8365
8366 static int enter_vmx_operation(struct kvm_vcpu *vcpu)
8367 {
8368         struct vcpu_vmx *vmx = to_vmx(vcpu);
8369         int r;
8370
8371         r = alloc_loaded_vmcs(&vmx->nested.vmcs02);
8372         if (r < 0)
8373                 goto out_vmcs02;
8374
8375         vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL);
8376         if (!vmx->nested.cached_vmcs12)
8377                 goto out_cached_vmcs12;
8378
8379         vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL);
8380         if (!vmx->nested.cached_shadow_vmcs12)
8381                 goto out_cached_shadow_vmcs12;
8382
8383         if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu))
8384                 goto out_shadow_vmcs;
8385
8386         hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
8387                      HRTIMER_MODE_REL_PINNED);
8388         vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;
8389
8390         vmx->nested.vpid02 = allocate_vpid();
8391
8392         vmx->nested.vmxon = true;
8393         return 0;
8394
8395 out_shadow_vmcs:
8396         kfree(vmx->nested.cached_shadow_vmcs12);
8397
8398 out_cached_shadow_vmcs12:
8399         kfree(vmx->nested.cached_vmcs12);
8400
8401 out_cached_vmcs12:
8402         free_loaded_vmcs(&vmx->nested.vmcs02);
8403
8404 out_vmcs02:
8405         return -ENOMEM;
8406 }
8407
8408 /*
8409  * Emulate the VMXON instruction.
8410  * Currently, we just remember that VMX is active, and do not save or even
8411  * inspect the argument to VMXON (the so-called "VMXON pointer") because we
8412  * do not currently need to store anything in that guest-allocated memory
8413  * region. Consequently, VMCLEAR and VMPTRLD also do not verify that the their
8414  * argument is different from the VMXON pointer (which the spec says they do).
8415  */
8416 static int handle_vmon(struct kvm_vcpu *vcpu)
8417 {
8418         int ret;
8419         gpa_t vmptr;
8420         struct page *page;
8421         struct vcpu_vmx *vmx = to_vmx(vcpu);
8422         const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED
8423                 | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
8424
8425         /*
8426          * The Intel VMX Instruction Reference lists a bunch of bits that are
8427          * prerequisite to running VMXON, most notably cr4.VMXE must be set to
8428          * 1 (see vmx_set_cr4() for when we allow the guest to set this).
8429          * Otherwise, we should fail with #UD.  But most faulting conditions
8430          * have already been checked by hardware, prior to the VM-exit for
8431          * VMXON.  We do test guest cr4.VMXE because processor CR4 always has
8432          * that bit set to 1 in non-root mode.
8433          */
8434         if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE)) {
8435                 kvm_queue_exception(vcpu, UD_VECTOR);
8436                 return 1;
8437         }
8438
8439         /* CPL=0 must be checked manually. */
8440         if (vmx_get_cpl(vcpu)) {
8441                 kvm_inject_gp(vcpu, 0);
8442                 return 1;
8443         }
8444
8445         if (vmx->nested.vmxon) {
8446                 nested_vmx_failValid(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
8447                 return kvm_skip_emulated_instruction(vcpu);
8448         }
8449
8450         if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES)
8451                         != VMXON_NEEDED_FEATURES) {
8452                 kvm_inject_gp(vcpu, 0);
8453                 return 1;
8454         }
8455
8456         if (nested_vmx_get_vmptr(vcpu, &vmptr))
8457                 return 1;
8458
8459         /*
8460          * SDM 3: 24.11.5
8461          * The first 4 bytes of VMXON region contain the supported
8462          * VMCS revision identifier
8463          *
8464          * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case;
8465          * which replaces physical address width with 32
8466          */
8467         if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) {
8468                 nested_vmx_failInvalid(vcpu);
8469                 return kvm_skip_emulated_instruction(vcpu);
8470         }
8471
8472         page = kvm_vcpu_gpa_to_page(vcpu, vmptr);
8473         if (is_error_page(page)) {
8474                 nested_vmx_failInvalid(vcpu);
8475                 return kvm_skip_emulated_instruction(vcpu);
8476         }
8477         if (*(u32 *)kmap(page) != VMCS12_REVISION) {
8478                 kunmap(page);
8479                 kvm_release_page_clean(page);
8480                 nested_vmx_failInvalid(vcpu);
8481                 return kvm_skip_emulated_instruction(vcpu);
8482         }
8483         kunmap(page);
8484         kvm_release_page_clean(page);
8485
8486         vmx->nested.vmxon_ptr = vmptr;
8487         ret = enter_vmx_operation(vcpu);
8488         if (ret)
8489                 return ret;
8490
8491         nested_vmx_succeed(vcpu);
8492         return kvm_skip_emulated_instruction(vcpu);
8493 }
8494
8495 /*
8496  * Intel's VMX Instruction Reference specifies a common set of prerequisites
8497  * for running VMX instructions (except VMXON, whose prerequisites are
8498  * slightly different). It also specifies what exception to inject otherwise.
8499  * Note that many of these exceptions have priority over VM exits, so they
8500  * don't have to be checked again here.
8501  */
8502 static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
8503 {
8504         if (!to_vmx(vcpu)->nested.vmxon) {
8505                 kvm_queue_exception(vcpu, UD_VECTOR);
8506                 return 0;
8507         }
8508
8509         if (vmx_get_cpl(vcpu)) {
8510                 kvm_inject_gp(vcpu, 0);
8511                 return 0;
8512         }
8513
8514         return 1;
8515 }
8516
8517 static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx)
8518 {
8519         vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_SHADOW_VMCS);
8520         vmcs_write64(VMCS_LINK_POINTER, -1ull);
8521         vmx->nested.sync_shadow_vmcs = false;
8522 }
8523
8524 static inline void nested_release_vmcs12(struct vcpu_vmx *vmx)
8525 {
8526         if (vmx->nested.current_vmptr == -1ull)
8527                 return;
8528
8529         if (enable_shadow_vmcs) {
8530                 /* copy to memory all shadowed fields in case
8531                    they were modified */
8532                 copy_shadow_to_vmcs12(vmx);
8533                 vmx_disable_shadow_vmcs(vmx);
8534         }
8535         vmx->nested.posted_intr_nv = -1;
8536
8537         /* Flush VMCS12 to guest memory */
8538         kvm_vcpu_write_guest_page(&vmx->vcpu,
8539                                   vmx->nested.current_vmptr >> PAGE_SHIFT,
8540                                   vmx->nested.cached_vmcs12, 0, VMCS12_SIZE);
8541
8542         vmx->nested.current_vmptr = -1ull;
8543 }
8544
8545 /*
8546  * Free whatever needs to be freed from vmx->nested when L1 goes down, or
8547  * just stops using VMX.
8548  */
8549 static void free_nested(struct vcpu_vmx *vmx)
8550 {
8551         if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon)
8552                 return;
8553
8554         kvm_clear_request(KVM_REQ_GET_VMCS12_PAGES, &vmx->vcpu);
8555
8556         hrtimer_cancel(&vmx->nested.preemption_timer);
8557         vmx->nested.vmxon = false;
8558         vmx->nested.smm.vmxon = false;
8559         free_vpid(vmx->nested.vpid02);
8560         vmx->nested.posted_intr_nv = -1;
8561         vmx->nested.current_vmptr = -1ull;
8562         if (enable_shadow_vmcs) {
8563                 vmx_disable_shadow_vmcs(vmx);
8564                 vmcs_clear(vmx->vmcs01.shadow_vmcs);
8565                 free_vmcs(vmx->vmcs01.shadow_vmcs);
8566                 vmx->vmcs01.shadow_vmcs = NULL;
8567         }
8568         kfree(vmx->nested.cached_vmcs12);
8569         kfree(vmx->nested.cached_shadow_vmcs12);
8570         /* Unpin physical memory we referred to in the vmcs02 */
8571         if (vmx->nested.apic_access_page) {
8572                 kvm_release_page_dirty(vmx->nested.apic_access_page);
8573                 vmx->nested.apic_access_page = NULL;
8574         }
8575         if (vmx->nested.virtual_apic_page) {
8576                 kvm_release_page_dirty(vmx->nested.virtual_apic_page);
8577                 vmx->nested.virtual_apic_page = NULL;
8578         }
8579         if (vmx->nested.pi_desc_page) {
8580                 kunmap(vmx->nested.pi_desc_page);
8581                 kvm_release_page_dirty(vmx->nested.pi_desc_page);
8582                 vmx->nested.pi_desc_page = NULL;
8583                 vmx->nested.pi_desc = NULL;
8584         }
8585
8586         free_loaded_vmcs(&vmx->nested.vmcs02);
8587 }
8588
8589 /* Emulate the VMXOFF instruction */
8590 static int handle_vmoff(struct kvm_vcpu *vcpu)
8591 {
8592         if (!nested_vmx_check_permission(vcpu))
8593                 return 1;
8594         free_nested(to_vmx(vcpu));
8595         nested_vmx_succeed(vcpu);
8596         return kvm_skip_emulated_instruction(vcpu);
8597 }
8598
8599 /* Emulate the VMCLEAR instruction */
8600 static int handle_vmclear(struct kvm_vcpu *vcpu)
8601 {
8602         struct vcpu_vmx *vmx = to_vmx(vcpu);
8603         u32 zero = 0;
8604         gpa_t vmptr;
8605
8606         if (!nested_vmx_check_permission(vcpu))
8607                 return 1;
8608
8609         if (nested_vmx_get_vmptr(vcpu, &vmptr))
8610                 return 1;
8611
8612         if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) {
8613                 nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS);
8614                 return kvm_skip_emulated_instruction(vcpu);
8615         }
8616
8617         if (vmptr == vmx->nested.vmxon_ptr) {
8618                 nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_VMXON_POINTER);
8619                 return kvm_skip_emulated_instruction(vcpu);
8620         }
8621
8622         if (vmptr == vmx->nested.current_vmptr)
8623                 nested_release_vmcs12(vmx);
8624
8625         kvm_vcpu_write_guest(vcpu,
8626                         vmptr + offsetof(struct vmcs12, launch_state),
8627                         &zero, sizeof(zero));
8628
8629         nested_vmx_succeed(vcpu);
8630         return kvm_skip_emulated_instruction(vcpu);
8631 }
8632
8633 static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch);
8634
8635 /* Emulate the VMLAUNCH instruction */
8636 static int handle_vmlaunch(struct kvm_vcpu *vcpu)
8637 {
8638         return nested_vmx_run(vcpu, true);
8639 }
8640
8641 /* Emulate the VMRESUME instruction */
8642 static int handle_vmresume(struct kvm_vcpu *vcpu)
8643 {
8644
8645         return nested_vmx_run(vcpu, false);
8646 }
8647
8648 /*
8649  * Read a vmcs12 field. Since these can have varying lengths and we return
8650  * one type, we chose the biggest type (u64) and zero-extend the return value
8651  * to that size. Note that the caller, handle_vmread, might need to use only
8652  * some of the bits we return here (e.g., on 32-bit guests, only 32 bits of
8653  * 64-bit fields are to be returned).
8654  */
8655 static inline int vmcs12_read_any(struct vmcs12 *vmcs12,
8656                                   unsigned long field, u64 *ret)
8657 {
8658         short offset = vmcs_field_to_offset(field);
8659         char *p;
8660
8661         if (offset < 0)
8662                 return offset;
8663
8664         p = (char *)vmcs12 + offset;
8665
8666         switch (vmcs_field_width(field)) {
8667         case VMCS_FIELD_WIDTH_NATURAL_WIDTH:
8668                 *ret = *((natural_width *)p);
8669                 return 0;
8670         case VMCS_FIELD_WIDTH_U16:
8671                 *ret = *((u16 *)p);
8672                 return 0;
8673         case VMCS_FIELD_WIDTH_U32:
8674                 *ret = *((u32 *)p);
8675                 return 0;
8676         case VMCS_FIELD_WIDTH_U64:
8677                 *ret = *((u64 *)p);
8678                 return 0;
8679         default:
8680                 WARN_ON(1);
8681                 return -ENOENT;
8682         }
8683 }
8684
8685
8686 static inline int vmcs12_write_any(struct vmcs12 *vmcs12,
8687                                    unsigned long field, u64 field_value){
8688         short offset = vmcs_field_to_offset(field);
8689         char *p = (char *)vmcs12 + offset;
8690         if (offset < 0)
8691                 return offset;
8692
8693         switch (vmcs_field_width(field)) {
8694         case VMCS_FIELD_WIDTH_U16:
8695                 *(u16 *)p = field_value;
8696                 return 0;
8697         case VMCS_FIELD_WIDTH_U32:
8698                 *(u32 *)p = field_value;
8699                 return 0;
8700         case VMCS_FIELD_WIDTH_U64:
8701                 *(u64 *)p = field_value;
8702                 return 0;
8703         case VMCS_FIELD_WIDTH_NATURAL_WIDTH:
8704                 *(natural_width *)p = field_value;
8705                 return 0;
8706         default:
8707                 WARN_ON(1);
8708                 return -ENOENT;
8709         }
8710
8711 }
8712
8713 /*
8714  * Copy the writable VMCS shadow fields back to the VMCS12, in case
8715  * they have been modified by the L1 guest. Note that the "read-only"
8716  * VM-exit information fields are actually writable if the vCPU is
8717  * configured to support "VMWRITE to any supported field in the VMCS."
8718  */
8719 static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
8720 {
8721         const u16 *fields[] = {
8722                 shadow_read_write_fields,
8723                 shadow_read_only_fields
8724         };
8725         const int max_fields[] = {
8726                 max_shadow_read_write_fields,
8727                 max_shadow_read_only_fields
8728         };
8729         int i, q;
8730         unsigned long field;
8731         u64 field_value;
8732         struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
8733
8734         if (WARN_ON(!shadow_vmcs))
8735                 return;
8736
8737         preempt_disable();
8738
8739         vmcs_load(shadow_vmcs);
8740
8741         for (q = 0; q < ARRAY_SIZE(fields); q++) {
8742                 for (i = 0; i < max_fields[q]; i++) {
8743                         field = fields[q][i];
8744                         field_value = __vmcs_readl(field);
8745                         vmcs12_write_any(get_vmcs12(&vmx->vcpu), field, field_value);
8746                 }
8747                 /*
8748                  * Skip the VM-exit information fields if they are read-only.
8749                  */
8750                 if (!nested_cpu_has_vmwrite_any_field(&vmx->vcpu))
8751                         break;
8752         }
8753
8754         vmcs_clear(shadow_vmcs);
8755         vmcs_load(vmx->loaded_vmcs->vmcs);
8756
8757         preempt_enable();
8758 }
8759
8760 static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
8761 {
8762         const u16 *fields[] = {
8763                 shadow_read_write_fields,
8764                 shadow_read_only_fields
8765         };
8766         const int max_fields[] = {
8767                 max_shadow_read_write_fields,
8768                 max_shadow_read_only_fields
8769         };
8770         int i, q;
8771         unsigned long field;
8772         u64 field_value = 0;
8773         struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
8774
8775         if (WARN_ON(!shadow_vmcs))
8776                 return;
8777
8778         vmcs_load(shadow_vmcs);
8779
8780         for (q = 0; q < ARRAY_SIZE(fields); q++) {
8781                 for (i = 0; i < max_fields[q]; i++) {
8782                         field = fields[q][i];
8783                         vmcs12_read_any(get_vmcs12(&vmx->vcpu), field, &field_value);
8784                         __vmcs_writel(field, field_value);
8785                 }
8786         }
8787
8788         vmcs_clear(shadow_vmcs);
8789         vmcs_load(vmx->loaded_vmcs->vmcs);
8790 }
8791
8792 /*
8793  * VMX instructions which assume a current vmcs12 (i.e., that VMPTRLD was
8794  * used before) all generate the same failure when it is missing.
8795  */
8796 static int nested_vmx_check_vmcs12(struct kvm_vcpu *vcpu)
8797 {
8798         struct vcpu_vmx *vmx = to_vmx(vcpu);
8799         if (vmx->nested.current_vmptr == -1ull) {
8800                 nested_vmx_failInvalid(vcpu);
8801                 return 0;
8802         }
8803         return 1;
8804 }
8805
8806 static int handle_vmread(struct kvm_vcpu *vcpu)
8807 {
8808         unsigned long field;
8809         u64 field_value;
8810         unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
8811         u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
8812         gva_t gva = 0;
8813         struct vmcs12 *vmcs12;
8814         struct x86_exception e;
8815
8816         if (!nested_vmx_check_permission(vcpu))
8817                 return 1;
8818
8819         if (!nested_vmx_check_vmcs12(vcpu))
8820                 return kvm_skip_emulated_instruction(vcpu);
8821
8822         if (!is_guest_mode(vcpu))
8823                 vmcs12 = get_vmcs12(vcpu);
8824         else {
8825                 /*
8826                  * When vmcs->vmcs_link_pointer is -1ull, any VMREAD
8827                  * to shadowed-field sets the ALU flags for VMfailInvalid.
8828                  */
8829                 if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull) {
8830                         nested_vmx_failInvalid(vcpu);
8831                         return kvm_skip_emulated_instruction(vcpu);
8832                 }
8833                 vmcs12 = get_shadow_vmcs12(vcpu);
8834         }
8835
8836         /* Decode instruction info and find the field to read */
8837         field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
8838         /* Read the field, zero-extended to a u64 field_value */
8839         if (vmcs12_read_any(vmcs12, field, &field_value) < 0) {
8840                 nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
8841                 return kvm_skip_emulated_instruction(vcpu);
8842         }
8843         /*
8844          * Now copy part of this value to register or memory, as requested.
8845          * Note that the number of bits actually copied is 32 or 64 depending
8846          * on the guest's mode (32 or 64 bit), not on the given field's length.
8847          */
8848         if (vmx_instruction_info & (1u << 10)) {
8849                 kvm_register_writel(vcpu, (((vmx_instruction_info) >> 3) & 0xf),
8850                         field_value);
8851         } else {
8852                 if (get_vmx_mem_address(vcpu, exit_qualification,
8853                                 vmx_instruction_info, true, &gva))
8854                         return 1;
8855                 /* _system ok, nested_vmx_check_permission has verified cpl=0 */
8856                 if (kvm_write_guest_virt_system(vcpu, gva, &field_value,
8857                                                 (is_long_mode(vcpu) ? 8 : 4),
8858                                                 &e)) {
8859                         kvm_inject_page_fault(vcpu, &e);
8860                         return 1;
8861                 }
8862         }
8863
8864         nested_vmx_succeed(vcpu);
8865         return kvm_skip_emulated_instruction(vcpu);
8866 }
8867
8868
8869 static int handle_vmwrite(struct kvm_vcpu *vcpu)
8870 {
8871         unsigned long field;
8872         gva_t gva;
8873         struct vcpu_vmx *vmx = to_vmx(vcpu);
8874         unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
8875         u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
8876
8877         /* The value to write might be 32 or 64 bits, depending on L1's long
8878          * mode, and eventually we need to write that into a field of several
8879          * possible lengths. The code below first zero-extends the value to 64
8880          * bit (field_value), and then copies only the appropriate number of
8881          * bits into the vmcs12 field.
8882          */
8883         u64 field_value = 0;
8884         struct x86_exception e;
8885         struct vmcs12 *vmcs12;
8886
8887         if (!nested_vmx_check_permission(vcpu))
8888                 return 1;
8889
8890         if (!nested_vmx_check_vmcs12(vcpu))
8891                 return kvm_skip_emulated_instruction(vcpu);
8892
8893         if (vmx_instruction_info & (1u << 10))
8894                 field_value = kvm_register_readl(vcpu,
8895                         (((vmx_instruction_info) >> 3) & 0xf));
8896         else {
8897                 if (get_vmx_mem_address(vcpu, exit_qualification,
8898                                 vmx_instruction_info, false, &gva))
8899                         return 1;
8900                 if (kvm_read_guest_virt(vcpu, gva, &field_value,
8901                                         (is_64_bit_mode(vcpu) ? 8 : 4), &e)) {
8902                         kvm_inject_page_fault(vcpu, &e);
8903                         return 1;
8904                 }
8905         }
8906
8907
8908         field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
8909         /*
8910          * If the vCPU supports "VMWRITE to any supported field in the
8911          * VMCS," then the "read-only" fields are actually read/write.
8912          */
8913         if (vmcs_field_readonly(field) &&
8914             !nested_cpu_has_vmwrite_any_field(vcpu)) {
8915                 nested_vmx_failValid(vcpu,
8916                         VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT);
8917                 return kvm_skip_emulated_instruction(vcpu);
8918         }
8919
8920         if (!is_guest_mode(vcpu))
8921                 vmcs12 = get_vmcs12(vcpu);
8922         else {
8923                 /*
8924                  * When vmcs->vmcs_link_pointer is -1ull, any VMWRITE
8925                  * to shadowed-field sets the ALU flags for VMfailInvalid.
8926                  */
8927                 if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull) {
8928                         nested_vmx_failInvalid(vcpu);
8929                         return kvm_skip_emulated_instruction(vcpu);
8930                 }
8931                 vmcs12 = get_shadow_vmcs12(vcpu);
8932
8933         }
8934
8935         if (vmcs12_write_any(vmcs12, field, field_value) < 0) {
8936                 nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
8937                 return kvm_skip_emulated_instruction(vcpu);
8938         }
8939
8940         /*
8941          * Do not track vmcs12 dirty-state if in guest-mode
8942          * as we actually dirty shadow vmcs12 instead of vmcs12.
8943          */
8944         if (!is_guest_mode(vcpu)) {
8945                 switch (field) {
8946 #define SHADOW_FIELD_RW(x) case x:
8947 #include "vmx_shadow_fields.h"
8948                         /*
8949                          * The fields that can be updated by L1 without a vmexit are
8950                          * always updated in the vmcs02, the others go down the slow
8951                          * path of prepare_vmcs02.
8952                          */
8953                         break;
8954                 default:
8955                         vmx->nested.dirty_vmcs12 = true;
8956                         break;
8957                 }
8958         }
8959
8960         nested_vmx_succeed(vcpu);
8961         return kvm_skip_emulated_instruction(vcpu);
8962 }
8963
8964 static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr)
8965 {
8966         vmx->nested.current_vmptr = vmptr;
8967         if (enable_shadow_vmcs) {
8968                 vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
8969                               SECONDARY_EXEC_SHADOW_VMCS);
8970                 vmcs_write64(VMCS_LINK_POINTER,
8971                              __pa(vmx->vmcs01.shadow_vmcs));
8972                 vmx->nested.sync_shadow_vmcs = true;
8973         }
8974         vmx->nested.dirty_vmcs12 = true;
8975 }
8976
8977 /* Emulate the VMPTRLD instruction */
8978 static int handle_vmptrld(struct kvm_vcpu *vcpu)
8979 {
8980         struct vcpu_vmx *vmx = to_vmx(vcpu);
8981         gpa_t vmptr;
8982
8983         if (!nested_vmx_check_permission(vcpu))
8984                 return 1;
8985
8986         if (nested_vmx_get_vmptr(vcpu, &vmptr))
8987                 return 1;
8988
8989         if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) {
8990                 nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS);
8991                 return kvm_skip_emulated_instruction(vcpu);
8992         }
8993
8994         if (vmptr == vmx->nested.vmxon_ptr) {
8995                 nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_VMXON_POINTER);
8996                 return kvm_skip_emulated_instruction(vcpu);
8997         }
8998
8999         if (vmx->nested.current_vmptr != vmptr) {
9000                 struct vmcs12 *new_vmcs12;
9001                 struct page *page;
9002                 page = kvm_vcpu_gpa_to_page(vcpu, vmptr);
9003                 if (is_error_page(page)) {
9004                         nested_vmx_failInvalid(vcpu);
9005                         return kvm_skip_emulated_instruction(vcpu);
9006                 }
9007                 new_vmcs12 = kmap(page);
9008                 if (new_vmcs12->hdr.revision_id != VMCS12_REVISION ||
9009                     (new_vmcs12->hdr.shadow_vmcs &&
9010                      !nested_cpu_has_vmx_shadow_vmcs(vcpu))) {
9011                         kunmap(page);
9012                         kvm_release_page_clean(page);
9013                         nested_vmx_failValid(vcpu,
9014                                 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
9015                         return kvm_skip_emulated_instruction(vcpu);
9016                 }
9017
9018                 nested_release_vmcs12(vmx);
9019                 /*
9020                  * Load VMCS12 from guest memory since it is not already
9021                  * cached.
9022                  */
9023                 memcpy(vmx->nested.cached_vmcs12, new_vmcs12, VMCS12_SIZE);
9024                 kunmap(page);
9025                 kvm_release_page_clean(page);
9026
9027                 set_current_vmptr(vmx, vmptr);
9028         }
9029
9030         nested_vmx_succeed(vcpu);
9031         return kvm_skip_emulated_instruction(vcpu);
9032 }
9033
9034 /* Emulate the VMPTRST instruction */
9035 static int handle_vmptrst(struct kvm_vcpu *vcpu)
9036 {
9037         unsigned long exit_qual = vmcs_readl(EXIT_QUALIFICATION);
9038         u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
9039         gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr;
9040         struct x86_exception e;
9041         gva_t gva;
9042
9043         if (!nested_vmx_check_permission(vcpu))
9044                 return 1;
9045
9046         if (get_vmx_mem_address(vcpu, exit_qual, instr_info, true, &gva))
9047                 return 1;
9048         /* *_system ok, nested_vmx_check_permission has verified cpl=0 */
9049         if (kvm_write_guest_virt_system(vcpu, gva, (void *)&current_vmptr,
9050                                         sizeof(gpa_t), &e)) {
9051                 kvm_inject_page_fault(vcpu, &e);
9052                 return 1;
9053         }
9054         nested_vmx_succeed(vcpu);
9055         return kvm_skip_emulated_instruction(vcpu);
9056 }
9057
9058 /* Emulate the INVEPT instruction */
9059 static int handle_invept(struct kvm_vcpu *vcpu)
9060 {
9061         struct vcpu_vmx *vmx = to_vmx(vcpu);
9062         u32 vmx_instruction_info, types;
9063         unsigned long type;
9064         gva_t gva;
9065         struct x86_exception e;
9066         struct {
9067                 u64 eptp, gpa;
9068         } operand;
9069
9070         if (!(vmx->nested.msrs.secondary_ctls_high &
9071               SECONDARY_EXEC_ENABLE_EPT) ||
9072             !(vmx->nested.msrs.ept_caps & VMX_EPT_INVEPT_BIT)) {
9073                 kvm_queue_exception(vcpu, UD_VECTOR);
9074                 return 1;
9075         }
9076
9077         if (!nested_vmx_check_permission(vcpu))
9078                 return 1;
9079
9080         vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
9081         type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
9082
9083         types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;
9084
9085         if (type >= 32 || !(types & (1 << type))) {
9086                 nested_vmx_failValid(vcpu,
9087                                 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
9088                 return kvm_skip_emulated_instruction(vcpu);
9089         }
9090
9091         /* According to the Intel VMX instruction reference, the memory
9092          * operand is read even if it isn't needed (e.g., for type==global)
9093          */
9094         if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
9095                         vmx_instruction_info, false, &gva))
9096                 return 1;
9097         if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
9098                 kvm_inject_page_fault(vcpu, &e);
9099                 return 1;
9100         }
9101
9102         switch (type) {
9103         case VMX_EPT_EXTENT_GLOBAL:
9104         /*
9105          * TODO: track mappings and invalidate
9106          * single context requests appropriately
9107          */
9108         case VMX_EPT_EXTENT_CONTEXT:
9109                 kvm_mmu_sync_roots(vcpu);
9110                 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
9111                 nested_vmx_succeed(vcpu);
9112                 break;
9113         default:
9114                 BUG_ON(1);
9115                 break;
9116         }
9117
9118         return kvm_skip_emulated_instruction(vcpu);
9119 }
9120
9121 static int handle_invvpid(struct kvm_vcpu *vcpu)
9122 {
9123         struct vcpu_vmx *vmx = to_vmx(vcpu);
9124         u32 vmx_instruction_info;
9125         unsigned long type, types;
9126         gva_t gva;
9127         struct x86_exception e;
9128         struct {
9129                 u64 vpid;
9130                 u64 gla;
9131         } operand;
9132
9133         if (!(vmx->nested.msrs.secondary_ctls_high &
9134               SECONDARY_EXEC_ENABLE_VPID) ||
9135                         !(vmx->nested.msrs.vpid_caps & VMX_VPID_INVVPID_BIT)) {
9136                 kvm_queue_exception(vcpu, UD_VECTOR);
9137                 return 1;
9138         }
9139
9140         if (!nested_vmx_check_permission(vcpu))
9141                 return 1;
9142
9143         vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
9144         type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
9145
9146         types = (vmx->nested.msrs.vpid_caps &
9147                         VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8;
9148
9149         if (type >= 32 || !(types & (1 << type))) {
9150                 nested_vmx_failValid(vcpu,
9151                         VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
9152                 return kvm_skip_emulated_instruction(vcpu);
9153         }
9154
9155         /* according to the intel vmx instruction reference, the memory
9156          * operand is read even if it isn't needed (e.g., for type==global)
9157          */
9158         if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
9159                         vmx_instruction_info, false, &gva))
9160                 return 1;
9161         if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
9162                 kvm_inject_page_fault(vcpu, &e);
9163                 return 1;
9164         }
9165         if (operand.vpid >> 16) {
9166                 nested_vmx_failValid(vcpu,
9167                         VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
9168                 return kvm_skip_emulated_instruction(vcpu);
9169         }
9170
9171         switch (type) {
9172         case VMX_VPID_EXTENT_INDIVIDUAL_ADDR:
9173                 if (!operand.vpid ||
9174                     is_noncanonical_address(operand.gla, vcpu)) {
9175                         nested_vmx_failValid(vcpu,
9176                                 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
9177                         return kvm_skip_emulated_instruction(vcpu);
9178                 }
9179                 if (cpu_has_vmx_invvpid_individual_addr() &&
9180                     vmx->nested.vpid02) {
9181                         __invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR,
9182                                 vmx->nested.vpid02, operand.gla);
9183                 } else
9184                         __vmx_flush_tlb(vcpu, vmx->nested.vpid02, true);
9185                 break;
9186         case VMX_VPID_EXTENT_SINGLE_CONTEXT:
9187         case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL:
9188                 if (!operand.vpid) {
9189                         nested_vmx_failValid(vcpu,
9190                                 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
9191                         return kvm_skip_emulated_instruction(vcpu);
9192                 }
9193                 __vmx_flush_tlb(vcpu, vmx->nested.vpid02, true);
9194                 break;
9195         case VMX_VPID_EXTENT_ALL_CONTEXT:
9196                 __vmx_flush_tlb(vcpu, vmx->nested.vpid02, true);
9197                 break;
9198         default:
9199                 WARN_ON_ONCE(1);
9200                 return kvm_skip_emulated_instruction(vcpu);
9201         }
9202
9203         nested_vmx_succeed(vcpu);
9204
9205         return kvm_skip_emulated_instruction(vcpu);
9206 }
9207
9208 static int handle_invpcid(struct kvm_vcpu *vcpu)
9209 {
9210         u32 vmx_instruction_info;
9211         unsigned long type;
9212         bool pcid_enabled;
9213         gva_t gva;
9214         struct x86_exception e;
9215         unsigned i;
9216         unsigned long roots_to_free = 0;
9217         struct {
9218                 u64 pcid;
9219                 u64 gla;
9220         } operand;
9221
9222         if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) {
9223                 kvm_queue_exception(vcpu, UD_VECTOR);
9224                 return 1;
9225         }
9226
9227         vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
9228         type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
9229
9230         if (type > 3) {
9231                 kvm_inject_gp(vcpu, 0);
9232                 return 1;
9233         }
9234
9235         /* According to the Intel instruction reference, the memory operand
9236          * is read even if it isn't needed (e.g., for type==all)
9237          */
9238         if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
9239                                 vmx_instruction_info, false, &gva))
9240                 return 1;
9241
9242         if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
9243                 kvm_inject_page_fault(vcpu, &e);
9244                 return 1;
9245         }
9246
9247         if (operand.pcid >> 12 != 0) {
9248                 kvm_inject_gp(vcpu, 0);
9249                 return 1;
9250         }
9251
9252         pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);
9253
9254         switch (type) {
9255         case INVPCID_TYPE_INDIV_ADDR:
9256                 if ((!pcid_enabled && (operand.pcid != 0)) ||
9257                     is_noncanonical_address(operand.gla, vcpu)) {
9258                         kvm_inject_gp(vcpu, 0);
9259                         return 1;
9260                 }
9261                 kvm_mmu_invpcid_gva(vcpu, operand.gla, operand.pcid);
9262                 return kvm_skip_emulated_instruction(vcpu);
9263
9264         case INVPCID_TYPE_SINGLE_CTXT:
9265                 if (!pcid_enabled && (operand.pcid != 0)) {
9266                         kvm_inject_gp(vcpu, 0);
9267                         return 1;
9268                 }
9269
9270                 if (kvm_get_active_pcid(vcpu) == operand.pcid) {
9271                         kvm_mmu_sync_roots(vcpu);
9272                         kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
9273                 }
9274
9275                 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
9276                         if (kvm_get_pcid(vcpu, vcpu->arch.mmu.prev_roots[i].cr3)
9277                             == operand.pcid)
9278                                 roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
9279
9280                 kvm_mmu_free_roots(vcpu, roots_to_free);
9281                 /*
9282                  * If neither the current cr3 nor any of the prev_roots use the
9283                  * given PCID, then nothing needs to be done here because a
9284                  * resync will happen anyway before switching to any other CR3.
9285                  */
9286
9287                 return kvm_skip_emulated_instruction(vcpu);
9288
9289         case INVPCID_TYPE_ALL_NON_GLOBAL:
9290                 /*
9291                  * Currently, KVM doesn't mark global entries in the shadow
9292                  * page tables, so a non-global flush just degenerates to a
9293                  * global flush. If needed, we could optimize this later by
9294                  * keeping track of global entries in shadow page tables.
9295                  */
9296
9297                 /* fall-through */
9298         case INVPCID_TYPE_ALL_INCL_GLOBAL:
9299                 kvm_mmu_unload(vcpu);
9300                 return kvm_skip_emulated_instruction(vcpu);
9301
9302         default:
9303                 BUG(); /* We have already checked above that type <= 3 */
9304         }
9305 }
9306
9307 static int handle_pml_full(struct kvm_vcpu *vcpu)
9308 {
9309         unsigned long exit_qualification;
9310
9311         trace_kvm_pml_full(vcpu->vcpu_id);
9312
9313         exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
9314
9315         /*
9316          * PML buffer FULL happened while executing iret from NMI,
9317          * "blocked by NMI" bit has to be set before next VM entry.
9318          */
9319         if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
9320                         enable_vnmi &&
9321                         (exit_qualification & INTR_INFO_UNBLOCK_NMI))
9322                 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
9323                                 GUEST_INTR_STATE_NMI);
9324
9325         /*
9326          * PML buffer already flushed at beginning of VMEXIT. Nothing to do
9327          * here.., and there's no userspace involvement needed for PML.
9328          */
9329         return 1;
9330 }
9331
9332 static int handle_preemption_timer(struct kvm_vcpu *vcpu)
9333 {
9334         if (!to_vmx(vcpu)->req_immediate_exit)
9335                 kvm_lapic_expired_hv_timer(vcpu);
9336         return 1;
9337 }
9338
9339 static bool valid_ept_address(struct kvm_vcpu *vcpu, u64 address)
9340 {
9341         struct vcpu_vmx *vmx = to_vmx(vcpu);
9342         int maxphyaddr = cpuid_maxphyaddr(vcpu);
9343
9344         /* Check for memory type validity */
9345         switch (address & VMX_EPTP_MT_MASK) {
9346         case VMX_EPTP_MT_UC:
9347                 if (!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT))
9348                         return false;
9349                 break;
9350         case VMX_EPTP_MT_WB:
9351                 if (!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT))
9352                         return false;
9353                 break;
9354         default:
9355                 return false;
9356         }
9357
9358         /* only 4 levels page-walk length are valid */
9359         if ((address & VMX_EPTP_PWL_MASK) != VMX_EPTP_PWL_4)
9360                 return false;
9361
9362         /* Reserved bits should not be set */
9363         if (address >> maxphyaddr || ((address >> 7) & 0x1f))
9364                 return false;
9365
9366         /* AD, if set, should be supported */
9367         if (address & VMX_EPTP_AD_ENABLE_BIT) {
9368                 if (!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT))
9369                         return false;
9370         }
9371
9372         return true;
9373 }
9374
9375 static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu,
9376                                      struct vmcs12 *vmcs12)
9377 {
9378         u32 index = vcpu->arch.regs[VCPU_REGS_RCX];
9379         u64 address;
9380         bool accessed_dirty;
9381         struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
9382
9383         if (!nested_cpu_has_eptp_switching(vmcs12) ||
9384             !nested_cpu_has_ept(vmcs12))
9385                 return 1;
9386
9387         if (index >= VMFUNC_EPTP_ENTRIES)
9388                 return 1;
9389
9390
9391         if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT,
9392                                      &address, index * 8, 8))
9393                 return 1;
9394
9395         accessed_dirty = !!(address & VMX_EPTP_AD_ENABLE_BIT);
9396
9397         /*
9398          * If the (L2) guest does a vmfunc to the currently
9399          * active ept pointer, we don't have to do anything else
9400          */
9401         if (vmcs12->ept_pointer != address) {
9402                 if (!valid_ept_address(vcpu, address))
9403                         return 1;
9404
9405                 kvm_mmu_unload(vcpu);
9406                 mmu->ept_ad = accessed_dirty;
9407                 mmu->base_role.ad_disabled = !accessed_dirty;
9408                 vmcs12->ept_pointer = address;
9409                 /*
9410                  * TODO: Check what's the correct approach in case
9411                  * mmu reload fails. Currently, we just let the next
9412                  * reload potentially fail
9413                  */
9414                 kvm_mmu_reload(vcpu);
9415         }
9416
9417         return 0;
9418 }
9419
9420 static int handle_vmfunc(struct kvm_vcpu *vcpu)
9421 {
9422         struct vcpu_vmx *vmx = to_vmx(vcpu);
9423         struct vmcs12 *vmcs12;
9424         u32 function = vcpu->arch.regs[VCPU_REGS_RAX];
9425
9426         /*
9427          * VMFUNC is only supported for nested guests, but we always enable the
9428          * secondary control for simplicity; for non-nested mode, fake that we
9429          * didn't by injecting #UD.
9430          */
9431         if (!is_guest_mode(vcpu)) {
9432                 kvm_queue_exception(vcpu, UD_VECTOR);
9433                 return 1;
9434         }
9435
9436         vmcs12 = get_vmcs12(vcpu);
9437         if ((vmcs12->vm_function_control & (1 << function)) == 0)
9438                 goto fail;
9439
9440         switch (function) {
9441         case 0:
9442                 if (nested_vmx_eptp_switching(vcpu, vmcs12))
9443                         goto fail;
9444                 break;
9445         default:
9446                 goto fail;
9447         }
9448         return kvm_skip_emulated_instruction(vcpu);
9449
9450 fail:
9451         nested_vmx_vmexit(vcpu, vmx->exit_reason,
9452                           vmcs_read32(VM_EXIT_INTR_INFO),
9453                           vmcs_readl(EXIT_QUALIFICATION));
9454         return 1;
9455 }
9456
9457 static int handle_encls(struct kvm_vcpu *vcpu)
9458 {
9459         /*
9460          * SGX virtualization is not yet supported.  There is no software
9461          * enable bit for SGX, so we have to trap ENCLS and inject a #UD
9462          * to prevent the guest from executing ENCLS.
9463          */
9464         kvm_queue_exception(vcpu, UD_VECTOR);
9465         return 1;
9466 }
9467
9468 /*
9469  * The exit handlers return 1 if the exit was handled fully and guest execution
9470  * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
9471  * to be done to userspace and return 0.
9472  */
9473 static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
9474         [EXIT_REASON_EXCEPTION_NMI]           = handle_exception,
9475         [EXIT_REASON_EXTERNAL_INTERRUPT]      = handle_external_interrupt,
9476         [EXIT_REASON_TRIPLE_FAULT]            = handle_triple_fault,
9477         [EXIT_REASON_NMI_WINDOW]              = handle_nmi_window,
9478         [EXIT_REASON_IO_INSTRUCTION]          = handle_io,
9479         [EXIT_REASON_CR_ACCESS]               = handle_cr,
9480         [EXIT_REASON_DR_ACCESS]               = handle_dr,
9481         [EXIT_REASON_CPUID]                   = handle_cpuid,
9482         [EXIT_REASON_MSR_READ]                = handle_rdmsr,
9483         [EXIT_REASON_MSR_WRITE]               = handle_wrmsr,
9484         [EXIT_REASON_PENDING_INTERRUPT]       = handle_interrupt_window,
9485         [EXIT_REASON_HLT]                     = handle_halt,
9486         [EXIT_REASON_INVD]                    = handle_invd,
9487         [EXIT_REASON_INVLPG]                  = handle_invlpg,
9488         [EXIT_REASON_RDPMC]                   = handle_rdpmc,
9489         [EXIT_REASON_VMCALL]                  = handle_vmcall,
9490         [EXIT_REASON_VMCLEAR]                 = handle_vmclear,
9491         [EXIT_REASON_VMLAUNCH]                = handle_vmlaunch,
9492         [EXIT_REASON_VMPTRLD]                 = handle_vmptrld,
9493         [EXIT_REASON_VMPTRST]                 = handle_vmptrst,
9494         [EXIT_REASON_VMREAD]                  = handle_vmread,
9495         [EXIT_REASON_VMRESUME]                = handle_vmresume,
9496         [EXIT_REASON_VMWRITE]                 = handle_vmwrite,
9497         [EXIT_REASON_VMOFF]                   = handle_vmoff,
9498         [EXIT_REASON_VMON]                    = handle_vmon,
9499         [EXIT_REASON_TPR_BELOW_THRESHOLD]     = handle_tpr_below_threshold,
9500         [EXIT_REASON_APIC_ACCESS]             = handle_apic_access,
9501         [EXIT_REASON_APIC_WRITE]              = handle_apic_write,
9502         [EXIT_REASON_EOI_INDUCED]             = handle_apic_eoi_induced,
9503         [EXIT_REASON_WBINVD]                  = handle_wbinvd,
9504         [EXIT_REASON_XSETBV]                  = handle_xsetbv,
9505         [EXIT_REASON_TASK_SWITCH]             = handle_task_switch,
9506         [EXIT_REASON_MCE_DURING_VMENTRY]      = handle_machine_check,
9507         [EXIT_REASON_GDTR_IDTR]               = handle_desc,
9508         [EXIT_REASON_LDTR_TR]                 = handle_desc,
9509         [EXIT_REASON_EPT_VIOLATION]           = handle_ept_violation,
9510         [EXIT_REASON_EPT_MISCONFIG]           = handle_ept_misconfig,
9511         [EXIT_REASON_PAUSE_INSTRUCTION]       = handle_pause,
9512         [EXIT_REASON_MWAIT_INSTRUCTION]       = handle_mwait,
9513         [EXIT_REASON_MONITOR_TRAP_FLAG]       = handle_monitor_trap,
9514         [EXIT_REASON_MONITOR_INSTRUCTION]     = handle_monitor,
9515         [EXIT_REASON_INVEPT]                  = handle_invept,
9516         [EXIT_REASON_INVVPID]                 = handle_invvpid,
9517         [EXIT_REASON_RDRAND]                  = handle_invalid_op,
9518         [EXIT_REASON_RDSEED]                  = handle_invalid_op,
9519         [EXIT_REASON_XSAVES]                  = handle_xsaves,
9520         [EXIT_REASON_XRSTORS]                 = handle_xrstors,
9521         [EXIT_REASON_PML_FULL]                = handle_pml_full,
9522         [EXIT_REASON_INVPCID]                 = handle_invpcid,
9523         [EXIT_REASON_VMFUNC]                  = handle_vmfunc,
9524         [EXIT_REASON_PREEMPTION_TIMER]        = handle_preemption_timer,
9525         [EXIT_REASON_ENCLS]                   = handle_encls,
9526 };
9527
9528 static const int kvm_vmx_max_exit_handlers =
9529         ARRAY_SIZE(kvm_vmx_exit_handlers);
9530
9531 /*
9532  * Return true if an IO instruction with the specified port and size should cause
9533  * a VM-exit into L1.
9534  */
9535 bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port,
9536                                  int size)
9537 {
9538         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
9539         gpa_t bitmap, last_bitmap;
9540         u8 b;
9541
9542         last_bitmap = (gpa_t)-1;
9543         b = -1;
9544
9545         while (size > 0) {
9546                 if (port < 0x8000)
9547                         bitmap = vmcs12->io_bitmap_a;
9548                 else if (port < 0x10000)
9549                         bitmap = vmcs12->io_bitmap_b;
9550                 else
9551                         return true;
9552                 bitmap += (port & 0x7fff) / 8;
9553
9554                 if (last_bitmap != bitmap)
9555                         if (kvm_vcpu_read_guest(vcpu, bitmap, &b, 1))
9556                                 return true;
9557                 if (b & (1 << (port & 7)))
9558                         return true;
9559
9560                 port++;
9561                 size--;
9562                 last_bitmap = bitmap;
9563         }
9564
9565         return false;
9566 }
9567
9568 /*
9569  * Return 1 if we should exit from L2 to L1 to handle an MSR access access,
9570  * rather than handle it ourselves in L0. I.e., check whether L1 expressed
9571  * disinterest in the current event (read or write a specific MSR) by using an
9572  * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps.
9573  */
9574 static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
9575         struct vmcs12 *vmcs12, u32 exit_reason)
9576 {
9577         u32 msr_index = vcpu->arch.regs[VCPU_REGS_RCX];
9578         gpa_t bitmap;
9579
9580         if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
9581                 return true;
9582
9583         /*
9584          * The MSR_BITMAP page is divided into four 1024-byte bitmaps,
9585          * for the four combinations of read/write and low/high MSR numbers.
9586          * First we need to figure out which of the four to use:
9587          */
9588         bitmap = vmcs12->msr_bitmap;
9589         if (exit_reason == EXIT_REASON_MSR_WRITE)
9590                 bitmap += 2048;
9591         if (msr_index >= 0xc0000000) {
9592                 msr_index -= 0xc0000000;
9593                 bitmap += 1024;
9594         }
9595
9596         /* Then read the msr_index'th bit from this bitmap: */
9597         if (msr_index < 1024*8) {
9598                 unsigned char b;
9599                 if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1))
9600                         return true;
9601                 return 1 & (b >> (msr_index & 7));
9602         } else
9603                 return true; /* let L1 handle the wrong parameter */
9604 }
9605
9606 /*
9607  * Return 1 if we should exit from L2 to L1 to handle a CR access exit,
9608  * rather than handle it ourselves in L0. I.e., check if L1 wanted to
9609  * intercept (via guest_host_mask etc.) the current event.
9610  */
9611 static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
9612         struct vmcs12 *vmcs12)
9613 {
9614         unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
9615         int cr = exit_qualification & 15;
9616         int reg;
9617         unsigned long val;
9618
9619         switch ((exit_qualification >> 4) & 3) {
9620         case 0: /* mov to cr */
9621                 reg = (exit_qualification >> 8) & 15;
9622                 val = kvm_register_readl(vcpu, reg);
9623                 switch (cr) {
9624                 case 0:
9625                         if (vmcs12->cr0_guest_host_mask &
9626                             (val ^ vmcs12->cr0_read_shadow))
9627                                 return true;
9628                         break;
9629                 case 3:
9630                         if ((vmcs12->cr3_target_count >= 1 &&
9631                                         vmcs12->cr3_target_value0 == val) ||
9632                                 (vmcs12->cr3_target_count >= 2 &&
9633                                         vmcs12->cr3_target_value1 == val) ||
9634                                 (vmcs12->cr3_target_count >= 3 &&
9635                                         vmcs12->cr3_target_value2 == val) ||
9636                                 (vmcs12->cr3_target_count >= 4 &&
9637                                         vmcs12->cr3_target_value3 == val))
9638                                 return false;
9639                         if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING))
9640                                 return true;
9641                         break;
9642                 case 4:
9643                         if (vmcs12->cr4_guest_host_mask &
9644                             (vmcs12->cr4_read_shadow ^ val))
9645                                 return true;
9646                         break;
9647                 case 8:
9648                         if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING))
9649                                 return true;
9650                         break;
9651                 }
9652                 break;
9653         case 2: /* clts */
9654                 if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) &&
9655                     (vmcs12->cr0_read_shadow & X86_CR0_TS))
9656                         return true;
9657                 break;
9658         case 1: /* mov from cr */
9659                 switch (cr) {
9660                 case 3:
9661                         if (vmcs12->cpu_based_vm_exec_control &
9662                             CPU_BASED_CR3_STORE_EXITING)
9663                                 return true;
9664                         break;
9665                 case 8:
9666                         if (vmcs12->cpu_based_vm_exec_control &
9667                             CPU_BASED_CR8_STORE_EXITING)
9668                                 return true;
9669                         break;
9670                 }
9671                 break;
9672         case 3: /* lmsw */
9673                 /*
9674                  * lmsw can change bits 1..3 of cr0, and only set bit 0 of
9675                  * cr0. Other attempted changes are ignored, with no exit.
9676                  */
9677                 val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
9678                 if (vmcs12->cr0_guest_host_mask & 0xe &
9679                     (val ^ vmcs12->cr0_read_shadow))
9680                         return true;
9681                 if ((vmcs12->cr0_guest_host_mask & 0x1) &&
9682                     !(vmcs12->cr0_read_shadow & 0x1) &&
9683                     (val & 0x1))
9684                         return true;
9685                 break;
9686         }
9687         return false;
9688 }
9689
9690 static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu,
9691         struct vmcs12 *vmcs12, gpa_t bitmap)
9692 {
9693         u32 vmx_instruction_info;
9694         unsigned long field;
9695         u8 b;
9696
9697         if (!nested_cpu_has_shadow_vmcs(vmcs12))
9698                 return true;
9699
9700         /* Decode instruction info and find the field to access */
9701         vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
9702         field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
9703
9704         /* Out-of-range fields always cause a VM exit from L2 to L1 */
9705         if (field >> 15)
9706                 return true;
9707
9708         if (kvm_vcpu_read_guest(vcpu, bitmap + field/8, &b, 1))
9709                 return true;
9710
9711         return 1 & (b >> (field & 7));
9712 }
9713
9714 /*
9715  * Return 1 if we should exit from L2 to L1 to handle an exit, or 0 if we
9716  * should handle it ourselves in L0 (and then continue L2). Only call this
9717  * when in is_guest_mode (L2).
9718  */
9719 static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
9720 {
9721         u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
9722         struct vcpu_vmx *vmx = to_vmx(vcpu);
9723         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
9724
9725         if (vmx->nested.nested_run_pending)
9726                 return false;
9727
9728         if (unlikely(vmx->fail)) {
9729                 pr_info_ratelimited("%s failed vm entry %x\n", __func__,
9730                                     vmcs_read32(VM_INSTRUCTION_ERROR));
9731                 return true;
9732         }
9733
9734         /*
9735          * The host physical addresses of some pages of guest memory
9736          * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC
9737          * Page). The CPU may write to these pages via their host
9738          * physical address while L2 is running, bypassing any
9739          * address-translation-based dirty tracking (e.g. EPT write
9740          * protection).
9741          *
9742          * Mark them dirty on every exit from L2 to prevent them from
9743          * getting out of sync with dirty tracking.
9744          */
9745         nested_mark_vmcs12_pages_dirty(vcpu);
9746
9747         trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason,
9748                                 vmcs_readl(EXIT_QUALIFICATION),
9749                                 vmx->idt_vectoring_info,
9750                                 intr_info,
9751                                 vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
9752                                 KVM_ISA_VMX);
9753
9754         switch ((u16)exit_reason) {
9755         case EXIT_REASON_EXCEPTION_NMI:
9756                 if (is_nmi(intr_info))
9757                         return false;
9758                 else if (is_page_fault(intr_info))
9759                         return !vmx->vcpu.arch.apf.host_apf_reason && enable_ept;
9760                 else if (is_no_device(intr_info) &&
9761                          !(vmcs12->guest_cr0 & X86_CR0_TS))
9762                         return false;
9763                 else if (is_debug(intr_info) &&
9764                          vcpu->guest_debug &
9765                          (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
9766                         return false;
9767                 else if (is_breakpoint(intr_info) &&
9768                          vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
9769                         return false;
9770                 return vmcs12->exception_bitmap &
9771                                 (1u << (intr_info & INTR_INFO_VECTOR_MASK));
9772         case EXIT_REASON_EXTERNAL_INTERRUPT:
9773                 return false;
9774         case EXIT_REASON_TRIPLE_FAULT:
9775                 return true;
9776         case EXIT_REASON_PENDING_INTERRUPT:
9777                 return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_INTR_PENDING);
9778         case EXIT_REASON_NMI_WINDOW:
9779                 return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING);
9780         case EXIT_REASON_TASK_SWITCH:
9781                 return true;
9782         case EXIT_REASON_CPUID:
9783                 return true;
9784         case EXIT_REASON_HLT:
9785                 return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING);
9786         case EXIT_REASON_INVD:
9787                 return true;
9788         case EXIT_REASON_INVLPG:
9789                 return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
9790         case EXIT_REASON_RDPMC:
9791                 return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING);
9792         case EXIT_REASON_RDRAND:
9793                 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING);
9794         case EXIT_REASON_RDSEED:
9795                 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING);
9796         case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP:
9797                 return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING);
9798         case EXIT_REASON_VMREAD:
9799                 return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12,
9800                         vmcs12->vmread_bitmap);
9801         case EXIT_REASON_VMWRITE:
9802                 return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12,
9803                         vmcs12->vmwrite_bitmap);
9804         case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR:
9805         case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD:
9806         case EXIT_REASON_VMPTRST: case EXIT_REASON_VMRESUME:
9807         case EXIT_REASON_VMOFF: case EXIT_REASON_VMON:
9808         case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID:
9809                 /*
9810                  * VMX instructions trap unconditionally. This allows L1 to
9811                  * emulate them for its L2 guest, i.e., allows 3-level nesting!
9812                  */
9813                 return true;
9814         case EXIT_REASON_CR_ACCESS:
9815                 return nested_vmx_exit_handled_cr(vcpu, vmcs12);
9816         case EXIT_REASON_DR_ACCESS:
9817                 return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING);
9818         case EXIT_REASON_IO_INSTRUCTION:
9819                 return nested_vmx_exit_handled_io(vcpu, vmcs12);
9820         case EXIT_REASON_GDTR_IDTR: case EXIT_REASON_LDTR_TR:
9821                 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC);
9822         case EXIT_REASON_MSR_READ:
9823         case EXIT_REASON_MSR_WRITE:
9824                 return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason);
9825         case EXIT_REASON_INVALID_STATE:
9826                 return true;
9827         case EXIT_REASON_MWAIT_INSTRUCTION:
9828                 return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING);
9829         case EXIT_REASON_MONITOR_TRAP_FLAG:
9830                 return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_TRAP_FLAG);
9831         case EXIT_REASON_MONITOR_INSTRUCTION:
9832                 return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING);
9833         case EXIT_REASON_PAUSE_INSTRUCTION:
9834                 return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) ||
9835                         nested_cpu_has2(vmcs12,
9836                                 SECONDARY_EXEC_PAUSE_LOOP_EXITING);
9837         case EXIT_REASON_MCE_DURING_VMENTRY:
9838                 return false;
9839         case EXIT_REASON_TPR_BELOW_THRESHOLD:
9840                 return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW);
9841         case EXIT_REASON_APIC_ACCESS:
9842         case EXIT_REASON_APIC_WRITE:
9843         case EXIT_REASON_EOI_INDUCED:
9844                 /*
9845                  * The controls for "virtualize APIC accesses," "APIC-
9846                  * register virtualization," and "virtual-interrupt
9847                  * delivery" only come from vmcs12.
9848                  */
9849                 return true;
9850         case EXIT_REASON_EPT_VIOLATION:
9851                 /*
9852                  * L0 always deals with the EPT violation. If nested EPT is
9853                  * used, and the nested mmu code discovers that the address is
9854                  * missing in the guest EPT table (EPT12), the EPT violation
9855                  * will be injected with nested_ept_inject_page_fault()
9856                  */
9857                 return false;
9858         case EXIT_REASON_EPT_MISCONFIG:
9859                 /*
9860                  * L2 never uses directly L1's EPT, but rather L0's own EPT
9861                  * table (shadow on EPT) or a merged EPT table that L0 built
9862                  * (EPT on EPT). So any problems with the structure of the
9863                  * table is L0's fault.
9864                  */
9865                 return false;
9866         case EXIT_REASON_INVPCID:
9867                 return
9868                         nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) &&
9869                         nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
9870         case EXIT_REASON_WBINVD:
9871                 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING);
9872         case EXIT_REASON_XSETBV:
9873                 return true;
9874         case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS:
9875                 /*
9876                  * This should never happen, since it is not possible to
9877                  * set XSS to a non-zero value---neither in L1 nor in L2.
9878                  * If if it were, XSS would have to be checked against
9879                  * the XSS exit bitmap in vmcs12.
9880                  */
9881                 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
9882         case EXIT_REASON_PREEMPTION_TIMER:
9883                 return false;
9884         case EXIT_REASON_PML_FULL:
9885                 /* We emulate PML support to L1. */
9886                 return false;
9887         case EXIT_REASON_VMFUNC:
9888                 /* VM functions are emulated through L2->L0 vmexits. */
9889                 return false;
9890         case EXIT_REASON_ENCLS:
9891                 /* SGX is never exposed to L1 */
9892                 return false;
9893         default:
9894                 return true;
9895         }
9896 }
9897
9898 static int nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason)
9899 {
9900         u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
9901
9902         /*
9903          * At this point, the exit interruption info in exit_intr_info
9904          * is only valid for EXCEPTION_NMI exits.  For EXTERNAL_INTERRUPT
9905          * we need to query the in-kernel LAPIC.
9906          */
9907         WARN_ON(exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT);
9908         if ((exit_intr_info &
9909              (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) ==
9910             (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) {
9911                 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
9912                 vmcs12->vm_exit_intr_error_code =
9913                         vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
9914         }
9915
9916         nested_vmx_vmexit(vcpu, exit_reason, exit_intr_info,
9917                           vmcs_readl(EXIT_QUALIFICATION));
9918         return 1;
9919 }
9920
9921 static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
9922 {
9923         *info1 = vmcs_readl(EXIT_QUALIFICATION);
9924         *info2 = vmcs_read32(VM_EXIT_INTR_INFO);
9925 }
9926
9927 static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx)
9928 {
9929         if (vmx->pml_pg) {
9930                 __free_page(vmx->pml_pg);
9931                 vmx->pml_pg = NULL;
9932         }
9933 }
9934
9935 static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu)
9936 {
9937         struct vcpu_vmx *vmx = to_vmx(vcpu);
9938         u64 *pml_buf;
9939         u16 pml_idx;
9940
9941         pml_idx = vmcs_read16(GUEST_PML_INDEX);
9942
9943         /* Do nothing if PML buffer is empty */
9944         if (pml_idx == (PML_ENTITY_NUM - 1))
9945                 return;
9946
9947         /* PML index always points to next available PML buffer entity */
9948         if (pml_idx >= PML_ENTITY_NUM)
9949                 pml_idx = 0;
9950         else
9951                 pml_idx++;
9952
9953         pml_buf = page_address(vmx->pml_pg);
9954         for (; pml_idx < PML_ENTITY_NUM; pml_idx++) {
9955                 u64 gpa;
9956
9957                 gpa = pml_buf[pml_idx];
9958                 WARN_ON(gpa & (PAGE_SIZE - 1));
9959                 kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
9960         }
9961
9962         /* reset PML index */
9963         vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
9964 }
9965
9966 /*
9967  * Flush all vcpus' PML buffer and update logged GPAs to dirty_bitmap.
9968  * Called before reporting dirty_bitmap to userspace.
9969  */
9970 static void kvm_flush_pml_buffers(struct kvm *kvm)
9971 {
9972         int i;
9973         struct kvm_vcpu *vcpu;
9974         /*
9975          * We only need to kick vcpu out of guest mode here, as PML buffer
9976          * is flushed at beginning of all VMEXITs, and it's obvious that only
9977          * vcpus running in guest are possible to have unflushed GPAs in PML
9978          * buffer.
9979          */
9980         kvm_for_each_vcpu(i, vcpu, kvm)
9981                 kvm_vcpu_kick(vcpu);
9982 }
9983
9984 static void vmx_dump_sel(char *name, uint32_t sel)
9985 {
9986         pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n",
9987                name, vmcs_read16(sel),
9988                vmcs_read32(sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR),
9989                vmcs_read32(sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR),
9990                vmcs_readl(sel + GUEST_ES_BASE - GUEST_ES_SELECTOR));
9991 }
9992
9993 static void vmx_dump_dtsel(char *name, uint32_t limit)
9994 {
9995         pr_err("%s                           limit=0x%08x, base=0x%016lx\n",
9996                name, vmcs_read32(limit),
9997                vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT));
9998 }
9999
10000 static void dump_vmcs(void)
10001 {
10002         u32 vmentry_ctl = vmcs_read32(VM_ENTRY_CONTROLS);
10003         u32 vmexit_ctl = vmcs_read32(VM_EXIT_CONTROLS);
10004         u32 cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
10005         u32 pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
10006         u32 secondary_exec_control = 0;
10007         unsigned long cr4 = vmcs_readl(GUEST_CR4);
10008         u64 efer = vmcs_read64(GUEST_IA32_EFER);
10009         int i, n;
10010
10011         if (cpu_has_secondary_exec_ctrls())
10012                 secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
10013
10014         pr_err("*** Guest State ***\n");
10015         pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
10016                vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW),
10017                vmcs_readl(CR0_GUEST_HOST_MASK));
10018         pr_err("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
10019                cr4, vmcs_readl(CR4_READ_SHADOW), vmcs_readl(CR4_GUEST_HOST_MASK));
10020         pr_err("CR3 = 0x%016lx\n", vmcs_readl(GUEST_CR3));
10021         if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT) &&
10022             (cr4 & X86_CR4_PAE) && !(efer & EFER_LMA))
10023         {
10024                 pr_err("PDPTR0 = 0x%016llx  PDPTR1 = 0x%016llx\n",
10025                        vmcs_read64(GUEST_PDPTR0), vmcs_read64(GUEST_PDPTR1));
10026                 pr_err("PDPTR2 = 0x%016llx  PDPTR3 = 0x%016llx\n",
10027                        vmcs_read64(GUEST_PDPTR2), vmcs_read64(GUEST_PDPTR3));
10028         }
10029         pr_err("RSP = 0x%016lx  RIP = 0x%016lx\n",
10030                vmcs_readl(GUEST_RSP), vmcs_readl(GUEST_RIP));
10031         pr_err("RFLAGS=0x%08lx         DR7 = 0x%016lx\n",
10032                vmcs_readl(GUEST_RFLAGS), vmcs_readl(GUEST_DR7));
10033         pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
10034                vmcs_readl(GUEST_SYSENTER_ESP),
10035                vmcs_read32(GUEST_SYSENTER_CS), vmcs_readl(GUEST_SYSENTER_EIP));
10036         vmx_dump_sel("CS:  ", GUEST_CS_SELECTOR);
10037         vmx_dump_sel("DS:  ", GUEST_DS_SELECTOR);
10038         vmx_dump_sel("SS:  ", GUEST_SS_SELECTOR);
10039         vmx_dump_sel("ES:  ", GUEST_ES_SELECTOR);
10040         vmx_dump_sel("FS:  ", GUEST_FS_SELECTOR);
10041         vmx_dump_sel("GS:  ", GUEST_GS_SELECTOR);
10042         vmx_dump_dtsel("GDTR:", GUEST_GDTR_LIMIT);
10043         vmx_dump_sel("LDTR:", GUEST_LDTR_SELECTOR);
10044         vmx_dump_dtsel("IDTR:", GUEST_IDTR_LIMIT);
10045         vmx_dump_sel("TR:  ", GUEST_TR_SELECTOR);
10046         if ((vmexit_ctl & (VM_EXIT_SAVE_IA32_PAT | VM_EXIT_SAVE_IA32_EFER)) ||
10047             (vmentry_ctl & (VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_IA32_EFER)))
10048                 pr_err("EFER =     0x%016llx  PAT = 0x%016llx\n",
10049                        efer, vmcs_read64(GUEST_IA32_PAT));
10050         pr_err("DebugCtl = 0x%016llx  DebugExceptions = 0x%016lx\n",
10051                vmcs_read64(GUEST_IA32_DEBUGCTL),
10052                vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS));
10053         if (cpu_has_load_perf_global_ctrl &&
10054             vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
10055                 pr_err("PerfGlobCtl = 0x%016llx\n",
10056                        vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL));
10057         if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS)
10058                 pr_err("BndCfgS = 0x%016llx\n", vmcs_read64(GUEST_BNDCFGS));
10059         pr_err("Interruptibility = %08x  ActivityState = %08x\n",
10060                vmcs_read32(GUEST_INTERRUPTIBILITY_INFO),
10061                vmcs_read32(GUEST_ACTIVITY_STATE));
10062         if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
10063                 pr_err("InterruptStatus = %04x\n",
10064                        vmcs_read16(GUEST_INTR_STATUS));
10065
10066         pr_err("*** Host State ***\n");
10067         pr_err("RIP = 0x%016lx  RSP = 0x%016lx\n",
10068                vmcs_readl(HOST_RIP), vmcs_readl(HOST_RSP));
10069         pr_err("CS=%04x SS=%04x DS=%04x ES=%04x FS=%04x GS=%04x TR=%04x\n",
10070                vmcs_read16(HOST_CS_SELECTOR), vmcs_read16(HOST_SS_SELECTOR),
10071                vmcs_read16(HOST_DS_SELECTOR), vmcs_read16(HOST_ES_SELECTOR),
10072                vmcs_read16(HOST_FS_SELECTOR), vmcs_read16(HOST_GS_SELECTOR),
10073                vmcs_read16(HOST_TR_SELECTOR));
10074         pr_err("FSBase=%016lx GSBase=%016lx TRBase=%016lx\n",
10075                vmcs_readl(HOST_FS_BASE), vmcs_readl(HOST_GS_BASE),
10076                vmcs_readl(HOST_TR_BASE));
10077         pr_err("GDTBase=%016lx IDTBase=%016lx\n",
10078                vmcs_readl(HOST_GDTR_BASE), vmcs_readl(HOST_IDTR_BASE));
10079         pr_err("CR0=%016lx CR3=%016lx CR4=%016lx\n",
10080                vmcs_readl(HOST_CR0), vmcs_readl(HOST_CR3),
10081                vmcs_readl(HOST_CR4));
10082         pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
10083                vmcs_readl(HOST_IA32_SYSENTER_ESP),
10084                vmcs_read32(HOST_IA32_SYSENTER_CS),
10085                vmcs_readl(HOST_IA32_SYSENTER_EIP));
10086         if (vmexit_ctl & (VM_EXIT_LOAD_IA32_PAT | VM_EXIT_LOAD_IA32_EFER))
10087                 pr_err("EFER = 0x%016llx  PAT = 0x%016llx\n",
10088                        vmcs_read64(HOST_IA32_EFER),
10089                        vmcs_read64(HOST_IA32_PAT));
10090         if (cpu_has_load_perf_global_ctrl &&
10091             vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
10092                 pr_err("PerfGlobCtl = 0x%016llx\n",
10093                        vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL));
10094
10095         pr_err("*** Control State ***\n");
10096         pr_err("PinBased=%08x CPUBased=%08x SecondaryExec=%08x\n",
10097                pin_based_exec_ctrl, cpu_based_exec_ctrl, secondary_exec_control);
10098         pr_err("EntryControls=%08x ExitControls=%08x\n", vmentry_ctl, vmexit_ctl);
10099         pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n",
10100                vmcs_read32(EXCEPTION_BITMAP),
10101                vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK),
10102                vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH));
10103         pr_err("VMEntry: intr_info=%08x errcode=%08x ilen=%08x\n",
10104                vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
10105                vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE),
10106                vmcs_read32(VM_ENTRY_INSTRUCTION_LEN));
10107         pr_err("VMExit: intr_info=%08x errcode=%08x ilen=%08x\n",
10108                vmcs_read32(VM_EXIT_INTR_INFO),
10109                vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
10110                vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
10111         pr_err("        reason=%08x qualification=%016lx\n",
10112                vmcs_read32(VM_EXIT_REASON), vmcs_readl(EXIT_QUALIFICATION));
10113         pr_err("IDTVectoring: info=%08x errcode=%08x\n",
10114                vmcs_read32(IDT_VECTORING_INFO_FIELD),
10115                vmcs_read32(IDT_VECTORING_ERROR_CODE));
10116         pr_err("TSC Offset = 0x%016llx\n", vmcs_read64(TSC_OFFSET));
10117         if (secondary_exec_control & SECONDARY_EXEC_TSC_SCALING)
10118                 pr_err("TSC Multiplier = 0x%016llx\n",
10119                        vmcs_read64(TSC_MULTIPLIER));
10120         if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW)
10121                 pr_err("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD));
10122         if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR)
10123                 pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV));
10124         if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT))
10125                 pr_err("EPT pointer = 0x%016llx\n", vmcs_read64(EPT_POINTER));
10126         n = vmcs_read32(CR3_TARGET_COUNT);
10127         for (i = 0; i + 1 < n; i += 4)
10128                 pr_err("CR3 target%u=%016lx target%u=%016lx\n",
10129                        i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2),
10130                        i + 1, vmcs_readl(CR3_TARGET_VALUE0 + i * 2 + 2));
10131         if (i < n)
10132                 pr_err("CR3 target%u=%016lx\n",
10133                        i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2));
10134         if (secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING)
10135                 pr_err("PLE Gap=%08x Window=%08x\n",
10136                        vmcs_read32(PLE_GAP), vmcs_read32(PLE_WINDOW));
10137         if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID)
10138                 pr_err("Virtual processor ID = 0x%04x\n",
10139                        vmcs_read16(VIRTUAL_PROCESSOR_ID));
10140 }
10141
10142 /*
10143  * The guest has exited.  See if we can fix it or if we need userspace
10144  * assistance.
10145  */
10146 static int vmx_handle_exit(struct kvm_vcpu *vcpu)
10147 {
10148         struct vcpu_vmx *vmx = to_vmx(vcpu);
10149         u32 exit_reason = vmx->exit_reason;
10150         u32 vectoring_info = vmx->idt_vectoring_info;
10151
10152         trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX);
10153
10154         /*
10155          * Flush logged GPAs PML buffer, this will make dirty_bitmap more
10156          * updated. Another good is, in kvm_vm_ioctl_get_dirty_log, before
10157          * querying dirty_bitmap, we only need to kick all vcpus out of guest
10158          * mode as if vcpus is in root mode, the PML buffer must has been
10159          * flushed already.
10160          */
10161         if (enable_pml)
10162                 vmx_flush_pml_buffer(vcpu);
10163
10164         /* If guest state is invalid, start emulating */
10165         if (vmx->emulation_required)
10166                 return handle_invalid_guest_state(vcpu);
10167
10168         if (is_guest_mode(vcpu) && nested_vmx_exit_reflected(vcpu, exit_reason))
10169                 return nested_vmx_reflect_vmexit(vcpu, exit_reason);
10170
10171         if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
10172                 dump_vmcs();
10173                 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
10174                 vcpu->run->fail_entry.hardware_entry_failure_reason
10175                         = exit_reason;
10176                 return 0;
10177         }
10178
10179         if (unlikely(vmx->fail)) {
10180                 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
10181                 vcpu->run->fail_entry.hardware_entry_failure_reason
10182                         = vmcs_read32(VM_INSTRUCTION_ERROR);
10183                 return 0;
10184         }
10185
10186         /*
10187          * Note:
10188          * Do not try to fix EXIT_REASON_EPT_MISCONFIG if it caused by
10189          * delivery event since it indicates guest is accessing MMIO.
10190          * The vm-exit can be triggered again after return to guest that
10191          * will cause infinite loop.
10192          */
10193         if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
10194                         (exit_reason != EXIT_REASON_EXCEPTION_NMI &&
10195                         exit_reason != EXIT_REASON_EPT_VIOLATION &&
10196                         exit_reason != EXIT_REASON_PML_FULL &&
10197                         exit_reason != EXIT_REASON_APIC_ACCESS &&
10198                         exit_reason != EXIT_REASON_TASK_SWITCH)) {
10199                 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
10200                 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV;
10201                 vcpu->run->internal.ndata = 3;
10202                 vcpu->run->internal.data[0] = vectoring_info;
10203                 vcpu->run->internal.data[1] = exit_reason;
10204                 vcpu->run->internal.data[2] = vcpu->arch.exit_qualification;
10205                 if (exit_reason == EXIT_REASON_EPT_MISCONFIG) {
10206                         vcpu->run->internal.ndata++;
10207                         vcpu->run->internal.data[3] =
10208                                 vmcs_read64(GUEST_PHYSICAL_ADDRESS);
10209                 }
10210                 return 0;
10211         }
10212
10213         if (unlikely(!enable_vnmi &&
10214                      vmx->loaded_vmcs->soft_vnmi_blocked)) {
10215                 if (vmx_interrupt_allowed(vcpu)) {
10216                         vmx->loaded_vmcs->soft_vnmi_blocked = 0;
10217                 } else if (vmx->loaded_vmcs->vnmi_blocked_time > 1000000000LL &&
10218                            vcpu->arch.nmi_pending) {
10219                         /*
10220                          * This CPU don't support us in finding the end of an
10221                          * NMI-blocked window if the guest runs with IRQs
10222                          * disabled. So we pull the trigger after 1 s of
10223                          * futile waiting, but inform the user about this.
10224                          */
10225                         printk(KERN_WARNING "%s: Breaking out of NMI-blocked "
10226                                "state on VCPU %d after 1 s timeout\n",
10227                                __func__, vcpu->vcpu_id);
10228                         vmx->loaded_vmcs->soft_vnmi_blocked = 0;
10229                 }
10230         }
10231
10232         if (exit_reason < kvm_vmx_max_exit_handlers
10233             && kvm_vmx_exit_handlers[exit_reason])
10234                 return kvm_vmx_exit_handlers[exit_reason](vcpu);
10235         else {
10236                 vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n",
10237                                 exit_reason);
10238                 kvm_queue_exception(vcpu, UD_VECTOR);
10239                 return 1;
10240         }
10241 }
10242
10243 /*
10244  * Software based L1D cache flush which is used when microcode providing
10245  * the cache control MSR is not loaded.
10246  *
10247  * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but to
10248  * flush it is required to read in 64 KiB because the replacement algorithm
10249  * is not exactly LRU. This could be sized at runtime via topology
10250  * information but as all relevant affected CPUs have 32KiB L1D cache size
10251  * there is no point in doing so.
10252  */
10253 static void vmx_l1d_flush(struct kvm_vcpu *vcpu)
10254 {
10255         int size = PAGE_SIZE << L1D_CACHE_ORDER;
10256
10257         /*
10258          * This code is only executed when the the flush mode is 'cond' or
10259          * 'always'
10260          */
10261         if (static_branch_likely(&vmx_l1d_flush_cond)) {
10262                 bool flush_l1d;
10263
10264                 /*
10265                  * Clear the per-vcpu flush bit, it gets set again
10266                  * either from vcpu_run() or from one of the unsafe
10267                  * VMEXIT handlers.
10268                  */
10269                 flush_l1d = vcpu->arch.l1tf_flush_l1d;
10270                 vcpu->arch.l1tf_flush_l1d = false;
10271
10272                 /*
10273                  * Clear the per-cpu flush bit, it gets set again from
10274                  * the interrupt handlers.
10275                  */
10276                 flush_l1d |= kvm_get_cpu_l1tf_flush_l1d();
10277                 kvm_clear_cpu_l1tf_flush_l1d();
10278
10279                 if (!flush_l1d)
10280                         return;
10281         }
10282
10283         vcpu->stat.l1d_flush++;
10284
10285         if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) {
10286                 wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
10287                 return;
10288         }
10289
10290         asm volatile(
10291                 /* First ensure the pages are in the TLB */
10292                 "xorl   %%eax, %%eax\n"
10293                 ".Lpopulate_tlb:\n\t"
10294                 "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
10295                 "addl   $4096, %%eax\n\t"
10296                 "cmpl   %%eax, %[size]\n\t"
10297                 "jne    .Lpopulate_tlb\n\t"
10298                 "xorl   %%eax, %%eax\n\t"
10299                 "cpuid\n\t"
10300                 /* Now fill the cache */
10301                 "xorl   %%eax, %%eax\n"
10302                 ".Lfill_cache:\n"
10303                 "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
10304                 "addl   $64, %%eax\n\t"
10305                 "cmpl   %%eax, %[size]\n\t"
10306                 "jne    .Lfill_cache\n\t"
10307                 "lfence\n"
10308                 :: [flush_pages] "r" (vmx_l1d_flush_pages),
10309                     [size] "r" (size)
10310                 : "eax", "ebx", "ecx", "edx");
10311 }
10312
10313 static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
10314 {
10315         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
10316
10317         if (is_guest_mode(vcpu) &&
10318                 nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
10319                 return;
10320
10321         if (irr == -1 || tpr < irr) {
10322                 vmcs_write32(TPR_THRESHOLD, 0);
10323                 return;
10324         }
10325
10326         vmcs_write32(TPR_THRESHOLD, irr);
10327 }
10328
10329 static void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
10330 {
10331         u32 sec_exec_control;
10332
10333         if (!lapic_in_kernel(vcpu))
10334                 return;
10335
10336         if (!flexpriority_enabled &&
10337             !cpu_has_vmx_virtualize_x2apic_mode())
10338                 return;
10339
10340         /* Postpone execution until vmcs01 is the current VMCS. */
10341         if (is_guest_mode(vcpu)) {
10342                 to_vmx(vcpu)->nested.change_vmcs01_virtual_apic_mode = true;
10343                 return;
10344         }
10345
10346         sec_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
10347         sec_exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
10348                               SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
10349
10350         switch (kvm_get_apic_mode(vcpu)) {
10351         case LAPIC_MODE_INVALID:
10352                 WARN_ONCE(true, "Invalid local APIC state");
10353         case LAPIC_MODE_DISABLED:
10354                 break;
10355         case LAPIC_MODE_XAPIC:
10356                 if (flexpriority_enabled) {
10357                         sec_exec_control |=
10358                                 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
10359                         vmx_flush_tlb(vcpu, true);
10360                 }
10361                 break;
10362         case LAPIC_MODE_X2APIC:
10363                 if (cpu_has_vmx_virtualize_x2apic_mode())
10364                         sec_exec_control |=
10365                                 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
10366                 break;
10367         }
10368         vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control);
10369
10370         vmx_update_msr_bitmap(vcpu);
10371 }
10372
10373 static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa)
10374 {
10375         if (!is_guest_mode(vcpu)) {
10376                 vmcs_write64(APIC_ACCESS_ADDR, hpa);
10377                 vmx_flush_tlb(vcpu, true);
10378         }
10379 }
10380
10381 static void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
10382 {
10383         u16 status;
10384         u8 old;
10385
10386         if (max_isr == -1)
10387                 max_isr = 0;
10388
10389         status = vmcs_read16(GUEST_INTR_STATUS);
10390         old = status >> 8;
10391         if (max_isr != old) {
10392                 status &= 0xff;
10393                 status |= max_isr << 8;
10394                 vmcs_write16(GUEST_INTR_STATUS, status);
10395         }
10396 }
10397
10398 static void vmx_set_rvi(int vector)
10399 {
10400         u16 status;
10401         u8 old;
10402
10403         if (vector == -1)
10404                 vector = 0;
10405
10406         status = vmcs_read16(GUEST_INTR_STATUS);
10407         old = (u8)status & 0xff;
10408         if ((u8)vector != old) {
10409                 status &= ~0xff;
10410                 status |= (u8)vector;
10411                 vmcs_write16(GUEST_INTR_STATUS, status);
10412         }
10413 }
10414
10415 static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
10416 {
10417         /*
10418          * When running L2, updating RVI is only relevant when
10419          * vmcs12 virtual-interrupt-delivery enabled.
10420          * However, it can be enabled only when L1 also
10421          * intercepts external-interrupts and in that case
10422          * we should not update vmcs02 RVI but instead intercept
10423          * interrupt. Therefore, do nothing when running L2.
10424          */
10425         if (!is_guest_mode(vcpu))
10426                 vmx_set_rvi(max_irr);
10427 }
10428
10429 static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
10430 {
10431         struct vcpu_vmx *vmx = to_vmx(vcpu);
10432         int max_irr;
10433         bool max_irr_updated;
10434
10435         WARN_ON(!vcpu->arch.apicv_active);
10436         if (pi_test_on(&vmx->pi_desc)) {
10437                 pi_clear_on(&vmx->pi_desc);
10438                 /*
10439                  * IOMMU can write to PIR.ON, so the barrier matters even on UP.
10440                  * But on x86 this is just a compiler barrier anyway.
10441                  */
10442                 smp_mb__after_atomic();
10443                 max_irr_updated =
10444                         kvm_apic_update_irr(vcpu, vmx->pi_desc.pir, &max_irr);
10445
10446                 /*
10447                  * If we are running L2 and L1 has a new pending interrupt
10448                  * which can be injected, we should re-evaluate
10449                  * what should be done with this new L1 interrupt.
10450                  * If L1 intercepts external-interrupts, we should
10451                  * exit from L2 to L1. Otherwise, interrupt should be
10452                  * delivered directly to L2.
10453                  */
10454                 if (is_guest_mode(vcpu) && max_irr_updated) {
10455                         if (nested_exit_on_intr(vcpu))
10456                                 kvm_vcpu_exiting_guest_mode(vcpu);
10457                         else
10458                                 kvm_make_request(KVM_REQ_EVENT, vcpu);
10459                 }
10460         } else {
10461                 max_irr = kvm_lapic_find_highest_irr(vcpu);
10462         }
10463         vmx_hwapic_irr_update(vcpu, max_irr);
10464         return max_irr;
10465 }
10466
10467 static u8 vmx_has_apicv_interrupt(struct kvm_vcpu *vcpu)
10468 {
10469         u8 rvi = vmx_get_rvi();
10470         u8 vppr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_PROCPRI);
10471
10472         return ((rvi & 0xf0) > (vppr & 0xf0));
10473 }
10474
10475 static bool vmx_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu)
10476 {
10477         return pi_test_on(vcpu_to_pi_desc(vcpu));
10478 }
10479
10480 static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
10481 {
10482         if (!kvm_vcpu_apicv_active(vcpu))
10483                 return;
10484
10485         vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]);
10486         vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]);
10487         vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]);
10488         vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]);
10489 }
10490
10491 static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu)
10492 {
10493         struct vcpu_vmx *vmx = to_vmx(vcpu);
10494
10495         pi_clear_on(&vmx->pi_desc);
10496         memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir));
10497 }
10498
10499 static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
10500 {
10501         if (vmx->exit_reason != EXIT_REASON_EXCEPTION_NMI)
10502                 return;
10503
10504         vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
10505
10506         /* if exit due to PF check for async PF */
10507         if (is_page_fault(vmx->exit_intr_info))
10508                 vmx->vcpu.arch.apf.host_apf_reason = kvm_read_and_reset_pf_reason();
10509
10510         /* Handle machine checks before interrupts are enabled */
10511         if (is_machine_check(vmx->exit_intr_info))
10512                 kvm_machine_check();
10513
10514         /* We need to handle NMIs before interrupts are enabled */
10515         if (is_nmi(vmx->exit_intr_info)) {
10516                 kvm_before_interrupt(&vmx->vcpu);
10517                 asm("int $2");
10518                 kvm_after_interrupt(&vmx->vcpu);
10519         }
10520 }
10521
10522 static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
10523 {
10524         u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
10525
10526         if ((exit_intr_info & (INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK))
10527                         == (INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR)) {
10528                 unsigned int vector;
10529                 unsigned long entry;
10530                 gate_desc *desc;
10531                 struct vcpu_vmx *vmx = to_vmx(vcpu);
10532 #ifdef CONFIG_X86_64
10533                 unsigned long tmp;
10534 #endif
10535
10536                 vector =  exit_intr_info & INTR_INFO_VECTOR_MASK;
10537                 desc = (gate_desc *)vmx->host_idt_base + vector;
10538                 entry = gate_offset(desc);
10539                 asm volatile(
10540 #ifdef CONFIG_X86_64
10541                         "mov %%" _ASM_SP ", %[sp]\n\t"
10542                         "and $0xfffffffffffffff0, %%" _ASM_SP "\n\t"
10543                         "push $%c[ss]\n\t"
10544                         "push %[sp]\n\t"
10545 #endif
10546                         "pushf\n\t"
10547                         __ASM_SIZE(push) " $%c[cs]\n\t"
10548                         CALL_NOSPEC
10549                         :
10550 #ifdef CONFIG_X86_64
10551                         [sp]"=&r"(tmp),
10552 #endif
10553                         ASM_CALL_CONSTRAINT
10554                         :
10555                         THUNK_TARGET(entry),
10556                         [ss]"i"(__KERNEL_DS),
10557                         [cs]"i"(__KERNEL_CS)
10558                         );
10559         }
10560 }
10561 STACK_FRAME_NON_STANDARD(vmx_handle_external_intr);
10562
10563 static bool vmx_has_emulated_msr(int index)
10564 {
10565         switch (index) {
10566         case MSR_IA32_SMBASE:
10567                 /*
10568                  * We cannot do SMM unless we can run the guest in big
10569                  * real mode.
10570                  */
10571                 return enable_unrestricted_guest || emulate_invalid_guest_state;
10572         case MSR_AMD64_VIRT_SPEC_CTRL:
10573                 /* This is AMD only.  */
10574                 return false;
10575         default:
10576                 return true;
10577         }
10578 }
10579
10580 static bool vmx_mpx_supported(void)
10581 {
10582         return (vmcs_config.vmexit_ctrl & VM_EXIT_CLEAR_BNDCFGS) &&
10583                 (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_BNDCFGS);
10584 }
10585
10586 static bool vmx_xsaves_supported(void)
10587 {
10588         return vmcs_config.cpu_based_2nd_exec_ctrl &
10589                 SECONDARY_EXEC_XSAVES;
10590 }
10591
10592 static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
10593 {
10594         u32 exit_intr_info;
10595         bool unblock_nmi;
10596         u8 vector;
10597         bool idtv_info_valid;
10598
10599         idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK;
10600
10601         if (enable_vnmi) {
10602                 if (vmx->loaded_vmcs->nmi_known_unmasked)
10603                         return;
10604                 /*
10605                  * Can't use vmx->exit_intr_info since we're not sure what
10606                  * the exit reason is.
10607                  */
10608                 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
10609                 unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
10610                 vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
10611                 /*
10612                  * SDM 3: 27.7.1.2 (September 2008)
10613                  * Re-set bit "block by NMI" before VM entry if vmexit caused by
10614                  * a guest IRET fault.
10615                  * SDM 3: 23.2.2 (September 2008)
10616                  * Bit 12 is undefined in any of the following cases:
10617                  *  If the VM exit sets the valid bit in the IDT-vectoring
10618                  *   information field.
10619                  *  If the VM exit is due to a double fault.
10620                  */
10621                 if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi &&
10622                     vector != DF_VECTOR && !idtv_info_valid)
10623                         vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
10624                                       GUEST_INTR_STATE_NMI);
10625                 else
10626                         vmx->loaded_vmcs->nmi_known_unmasked =
10627                                 !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO)
10628                                   & GUEST_INTR_STATE_NMI);
10629         } else if (unlikely(vmx->loaded_vmcs->soft_vnmi_blocked))
10630                 vmx->loaded_vmcs->vnmi_blocked_time +=
10631                         ktime_to_ns(ktime_sub(ktime_get(),
10632                                               vmx->loaded_vmcs->entry_time));
10633 }
10634
10635 static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu,
10636                                       u32 idt_vectoring_info,
10637                                       int instr_len_field,
10638                                       int error_code_field)
10639 {
10640         u8 vector;
10641         int type;
10642         bool idtv_info_valid;
10643
10644         idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
10645
10646         vcpu->arch.nmi_injected = false;
10647         kvm_clear_exception_queue(vcpu);
10648         kvm_clear_interrupt_queue(vcpu);
10649
10650         if (!idtv_info_valid)
10651                 return;
10652
10653         kvm_make_request(KVM_REQ_EVENT, vcpu);
10654
10655         vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
10656         type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
10657
10658         switch (type) {
10659         case INTR_TYPE_NMI_INTR:
10660                 vcpu->arch.nmi_injected = true;
10661                 /*
10662                  * SDM 3: 27.7.1.2 (September 2008)
10663                  * Clear bit "block by NMI" before VM entry if a NMI
10664                  * delivery faulted.
10665                  */
10666                 vmx_set_nmi_mask(vcpu, false);
10667                 break;
10668         case INTR_TYPE_SOFT_EXCEPTION:
10669                 vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
10670                 /* fall through */
10671         case INTR_TYPE_HARD_EXCEPTION:
10672                 if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
10673                         u32 err = vmcs_read32(error_code_field);
10674                         kvm_requeue_exception_e(vcpu, vector, err);
10675                 } else
10676                         kvm_requeue_exception(vcpu, vector);
10677                 break;
10678         case INTR_TYPE_SOFT_INTR:
10679                 vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
10680                 /* fall through */
10681         case INTR_TYPE_EXT_INTR:
10682                 kvm_queue_interrupt(vcpu, vector, type == INTR_TYPE_SOFT_INTR);
10683                 break;
10684         default:
10685                 break;
10686         }
10687 }
10688
10689 static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
10690 {
10691         __vmx_complete_interrupts(&vmx->vcpu, vmx->idt_vectoring_info,
10692                                   VM_EXIT_INSTRUCTION_LEN,
10693                                   IDT_VECTORING_ERROR_CODE);
10694 }
10695
10696 static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
10697 {
10698         __vmx_complete_interrupts(vcpu,
10699                                   vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
10700                                   VM_ENTRY_INSTRUCTION_LEN,
10701                                   VM_ENTRY_EXCEPTION_ERROR_CODE);
10702
10703         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
10704 }
10705
10706 static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
10707 {
10708         int i, nr_msrs;
10709         struct perf_guest_switch_msr *msrs;
10710
10711         msrs = perf_guest_get_msrs(&nr_msrs);
10712
10713         if (!msrs)
10714                 return;
10715
10716         for (i = 0; i < nr_msrs; i++)
10717                 if (msrs[i].host == msrs[i].guest)
10718                         clear_atomic_switch_msr(vmx, msrs[i].msr);
10719                 else
10720                         add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest,
10721                                         msrs[i].host, false);
10722 }
10723
10724 static void vmx_arm_hv_timer(struct vcpu_vmx *vmx, u32 val)
10725 {
10726         vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, val);
10727         if (!vmx->loaded_vmcs->hv_timer_armed)
10728                 vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL,
10729                               PIN_BASED_VMX_PREEMPTION_TIMER);
10730         vmx->loaded_vmcs->hv_timer_armed = true;
10731 }
10732
10733 static void vmx_update_hv_timer(struct kvm_vcpu *vcpu)
10734 {
10735         struct vcpu_vmx *vmx = to_vmx(vcpu);
10736         u64 tscl;
10737         u32 delta_tsc;
10738
10739         if (vmx->req_immediate_exit) {
10740                 vmx_arm_hv_timer(vmx, 0);
10741                 return;
10742         }
10743
10744         if (vmx->hv_deadline_tsc != -1) {
10745                 tscl = rdtsc();
10746                 if (vmx->hv_deadline_tsc > tscl)
10747                         /* set_hv_timer ensures the delta fits in 32-bits */
10748                         delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >>
10749                                 cpu_preemption_timer_multi);
10750                 else
10751                         delta_tsc = 0;
10752
10753                 vmx_arm_hv_timer(vmx, delta_tsc);
10754                 return;
10755         }
10756
10757         if (vmx->loaded_vmcs->hv_timer_armed)
10758                 vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL,
10759                                 PIN_BASED_VMX_PREEMPTION_TIMER);
10760         vmx->loaded_vmcs->hv_timer_armed = false;
10761 }
10762
10763 u64 __always_inline vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx)
10764 {
10765         u64 guestval, hostval = this_cpu_read(x86_spec_ctrl_current);
10766
10767         if (!cpu_feature_enabled(X86_FEATURE_MSR_SPEC_CTRL))
10768                 return 0;
10769
10770         guestval = __rdmsr(MSR_IA32_SPEC_CTRL);
10771
10772         /*
10773          *
10774          * For legacy IBRS, the IBRS bit always needs to be written after
10775          * transitioning from a less privileged predictor mode, regardless of
10776          * whether the guest/host values differ.
10777          */
10778         if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS) ||
10779             guestval != hostval)
10780                 native_wrmsrl(MSR_IA32_SPEC_CTRL, hostval);
10781
10782         barrier_nospec();
10783
10784         return guestval;
10785 }
10786
10787 static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
10788 {
10789         struct vcpu_vmx *vmx = to_vmx(vcpu);
10790         unsigned long cr3, cr4, evmcs_rsp;
10791         u64 spec_ctrl;
10792
10793         /* Record the guest's net vcpu time for enforced NMI injections. */
10794         if (unlikely(!enable_vnmi &&
10795                      vmx->loaded_vmcs->soft_vnmi_blocked))
10796                 vmx->loaded_vmcs->entry_time = ktime_get();
10797
10798         /* Don't enter VMX if guest state is invalid, let the exit handler
10799            start emulation until we arrive back to a valid state */
10800         if (vmx->emulation_required)
10801                 return;
10802
10803         if (vmx->ple_window_dirty) {
10804                 vmx->ple_window_dirty = false;
10805                 vmcs_write32(PLE_WINDOW, vmx->ple_window);
10806         }
10807
10808         if (vmx->nested.sync_shadow_vmcs) {
10809                 copy_vmcs12_to_shadow(vmx);
10810                 vmx->nested.sync_shadow_vmcs = false;
10811         }
10812
10813         if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
10814                 vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
10815         if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
10816                 vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
10817
10818         cr3 = __get_current_cr3_fast();
10819         if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
10820                 vmcs_writel(HOST_CR3, cr3);
10821                 vmx->loaded_vmcs->host_state.cr3 = cr3;
10822         }
10823
10824         cr4 = cr4_read_shadow();
10825         if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
10826                 vmcs_writel(HOST_CR4, cr4);
10827                 vmx->loaded_vmcs->host_state.cr4 = cr4;
10828         }
10829
10830         /* When single-stepping over STI and MOV SS, we must clear the
10831          * corresponding interruptibility bits in the guest state. Otherwise
10832          * vmentry fails as it then expects bit 14 (BS) in pending debug
10833          * exceptions being set, but that's not correct for the guest debugging
10834          * case. */
10835         if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
10836                 vmx_set_interrupt_shadow(vcpu, 0);
10837
10838         kvm_load_guest_xcr0(vcpu);
10839
10840         if (static_cpu_has(X86_FEATURE_PKU) &&
10841             kvm_read_cr4_bits(vcpu, X86_CR4_PKE) &&
10842             vcpu->arch.pkru != vmx->host_pkru)
10843                 __write_pkru(vcpu->arch.pkru);
10844
10845         atomic_switch_perf_msrs(vmx);
10846
10847         vmx_update_hv_timer(vcpu);
10848
10849         /*
10850          * If this vCPU has touched SPEC_CTRL, restore the guest's value if
10851          * it's non-zero. Since vmentry is serialising on affected CPUs, there
10852          * is no need to worry about the conditional branch over the wrmsr
10853          * being speculatively taken.
10854          */
10855         x86_spec_ctrl_set_guest(vmx->spec_ctrl, 0);
10856
10857         vmx->__launched = vmx->loaded_vmcs->launched;
10858
10859         evmcs_rsp = static_branch_unlikely(&enable_evmcs) ?
10860                 (unsigned long)&current_evmcs->host_rsp : 0;
10861
10862         /* L1D Flush includes CPU buffer clear to mitigate MDS */
10863         if (static_branch_unlikely(&vmx_l1d_should_flush))
10864                 vmx_l1d_flush(vcpu);
10865         else if (static_branch_unlikely(&mds_user_clear))
10866                 mds_clear_cpu_buffers();
10867         else if (static_branch_unlikely(&mmio_stale_data_clear) &&
10868                  kvm_arch_has_assigned_device(vcpu->kvm))
10869                 mds_clear_cpu_buffers();
10870
10871         vmx_disable_fb_clear(vmx);
10872
10873         asm volatile (
10874                 /* Store host registers */
10875                 "push %%" _ASM_DX "; push %%" _ASM_BP ";"
10876                 "push %%" _ASM_CX " \n\t" /* placeholder for guest rcx */
10877                 "push %%" _ASM_CX " \n\t"
10878                 "cmp %%" _ASM_SP ", %c[host_rsp](%%" _ASM_CX ") \n\t"
10879                 "je 1f \n\t"
10880                 "mov %%" _ASM_SP ", %c[host_rsp](%%" _ASM_CX ") \n\t"
10881                 /* Avoid VMWRITE when Enlightened VMCS is in use */
10882                 "test %%" _ASM_SI ", %%" _ASM_SI " \n\t"
10883                 "jz 2f \n\t"
10884                 "mov %%" _ASM_SP ", (%%" _ASM_SI ") \n\t"
10885                 "jmp 1f \n\t"
10886                 "2: \n\t"
10887                 __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t"
10888                 "1: \n\t"
10889                 /* Reload cr2 if changed */
10890                 "mov %c[cr2](%%" _ASM_CX "), %%" _ASM_AX " \n\t"
10891                 "mov %%cr2, %%" _ASM_DX " \n\t"
10892                 "cmp %%" _ASM_AX ", %%" _ASM_DX " \n\t"
10893                 "je 3f \n\t"
10894                 "mov %%" _ASM_AX", %%cr2 \n\t"
10895                 "3: \n\t"
10896                 /* Check if vmlaunch of vmresume is needed */
10897                 "cmpb $0, %c[launched](%%" _ASM_CX ") \n\t"
10898                 /* Load guest registers.  Don't clobber flags. */
10899                 "mov %c[rax](%%" _ASM_CX "), %%" _ASM_AX " \n\t"
10900                 "mov %c[rbx](%%" _ASM_CX "), %%" _ASM_BX " \n\t"
10901                 "mov %c[rdx](%%" _ASM_CX "), %%" _ASM_DX " \n\t"
10902                 "mov %c[rsi](%%" _ASM_CX "), %%" _ASM_SI " \n\t"
10903                 "mov %c[rdi](%%" _ASM_CX "), %%" _ASM_DI " \n\t"
10904                 "mov %c[rbp](%%" _ASM_CX "), %%" _ASM_BP " \n\t"
10905 #ifdef CONFIG_X86_64
10906                 "mov %c[r8](%%" _ASM_CX "),  %%r8  \n\t"
10907                 "mov %c[r9](%%" _ASM_CX "),  %%r9  \n\t"
10908                 "mov %c[r10](%%" _ASM_CX "), %%r10 \n\t"
10909                 "mov %c[r11](%%" _ASM_CX "), %%r11 \n\t"
10910                 "mov %c[r12](%%" _ASM_CX "), %%r12 \n\t"
10911                 "mov %c[r13](%%" _ASM_CX "), %%r13 \n\t"
10912                 "mov %c[r14](%%" _ASM_CX "), %%r14 \n\t"
10913                 "mov %c[r15](%%" _ASM_CX "), %%r15 \n\t"
10914 #endif
10915                 /* Load guest RCX.  This kills the vmx_vcpu pointer! */
10916                 "mov %c[rcx](%%" _ASM_CX "), %%" _ASM_CX " \n\t"
10917
10918                 /* Enter guest mode */
10919                 "jne 1f \n\t"
10920                 __ex(ASM_VMX_VMLAUNCH) "\n\t"
10921                 "jmp 2f \n\t"
10922                 "1: " __ex(ASM_VMX_VMRESUME) "\n\t"
10923                 "2: "
10924
10925                 /* Save guest's RCX to the stack placeholder (see above) */
10926                 "mov %%" _ASM_CX ", %c[wordsize](%%" _ASM_SP ") \n\t"
10927
10928                 /* Load host's RCX, i.e. the vmx_vcpu pointer */
10929                 "pop %%" _ASM_CX " \n\t"
10930
10931                 /* Set vmx->fail based on EFLAGS.{CF,ZF} */
10932                 "setbe %c[fail](%%" _ASM_CX ")\n\t"
10933
10934                 /* Save all guest registers, including RCX from the stack */
10935                 "mov %%" _ASM_AX ", %c[rax](%%" _ASM_CX ") \n\t"
10936                 "mov %%" _ASM_BX ", %c[rbx](%%" _ASM_CX ") \n\t"
10937                 __ASM_SIZE(pop) " %c[rcx](%%" _ASM_CX ") \n\t"
10938                 "mov %%" _ASM_DX ", %c[rdx](%%" _ASM_CX ") \n\t"
10939                 "mov %%" _ASM_SI ", %c[rsi](%%" _ASM_CX ") \n\t"
10940                 "mov %%" _ASM_DI ", %c[rdi](%%" _ASM_CX ") \n\t"
10941                 "mov %%" _ASM_BP ", %c[rbp](%%" _ASM_CX ") \n\t"
10942 #ifdef CONFIG_X86_64
10943                 "mov %%r8,  %c[r8](%%" _ASM_CX ") \n\t"
10944                 "mov %%r9,  %c[r9](%%" _ASM_CX ") \n\t"
10945                 "mov %%r10, %c[r10](%%" _ASM_CX ") \n\t"
10946                 "mov %%r11, %c[r11](%%" _ASM_CX ") \n\t"
10947                 "mov %%r12, %c[r12](%%" _ASM_CX ") \n\t"
10948                 "mov %%r13, %c[r13](%%" _ASM_CX ") \n\t"
10949                 "mov %%r14, %c[r14](%%" _ASM_CX ") \n\t"
10950                 "mov %%r15, %c[r15](%%" _ASM_CX ") \n\t"
10951
10952                 /*
10953                  * Clear all general purpose registers (except RSP, which is loaded by
10954                  * the CPU during VM-Exit) to prevent speculative use of the guest's
10955                  * values, even those that are saved/loaded via the stack.  In theory,
10956                  * an L1 cache miss when restoring registers could lead to speculative
10957                  * execution with the guest's values.  Zeroing XORs are dirt cheap,
10958                  * i.e. the extra paranoia is essentially free.
10959                  */
10960                 "xor %%r8d,  %%r8d \n\t"
10961                 "xor %%r9d,  %%r9d \n\t"
10962                 "xor %%r10d, %%r10d \n\t"
10963                 "xor %%r11d, %%r11d \n\t"
10964                 "xor %%r12d, %%r12d \n\t"
10965                 "xor %%r13d, %%r13d \n\t"
10966                 "xor %%r14d, %%r14d \n\t"
10967                 "xor %%r15d, %%r15d \n\t"
10968 #endif
10969                 "mov %%cr2, %%" _ASM_AX "   \n\t"
10970                 "mov %%" _ASM_AX ", %c[cr2](%%" _ASM_CX ") \n\t"
10971
10972                 "xor %%eax, %%eax \n\t"
10973                 "xor %%ebx, %%ebx \n\t"
10974                 "xor %%ecx, %%ecx \n\t"
10975                 "xor %%edx, %%edx \n\t"
10976                 "xor %%esi, %%esi \n\t"
10977                 "xor %%edi, %%edi \n\t"
10978                 "xor %%ebp, %%ebp \n\t"
10979                 "pop  %%" _ASM_BP "; pop  %%" _ASM_DX " \n\t"
10980                 ".pushsection .rodata \n\t"
10981                 ".global vmx_return \n\t"
10982                 "vmx_return: " _ASM_PTR " 2b \n\t"
10983                 ".popsection"
10984               : "=c"((int){0}), "=d"((int){0}), "=S"((int){0})
10985               : "c"(vmx), "d"((unsigned long)HOST_RSP), "S"(evmcs_rsp),
10986                 [launched]"i"(offsetof(struct vcpu_vmx, __launched)),
10987                 [fail]"i"(offsetof(struct vcpu_vmx, fail)),
10988                 [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)),
10989                 [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])),
10990                 [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])),
10991                 [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])),
10992                 [rdx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDX])),
10993                 [rsi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RSI])),
10994                 [rdi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDI])),
10995                 [rbp]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBP])),
10996 #ifdef CONFIG_X86_64
10997                 [r8]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R8])),
10998                 [r9]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R9])),
10999                 [r10]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R10])),
11000                 [r11]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R11])),
11001                 [r12]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R12])),
11002                 [r13]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R13])),
11003                 [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])),
11004                 [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])),
11005 #endif
11006                 [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)),
11007                 [wordsize]"i"(sizeof(ulong))
11008               : "cc", "memory"
11009 #ifdef CONFIG_X86_64
11010                 , "rax", "rbx", "rdi"
11011                 , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
11012 #else
11013                 , "eax", "ebx", "edi"
11014 #endif
11015               );
11016
11017         /*
11018          * IMPORTANT: RSB filling and SPEC_CTRL handling must be done before
11019          * the first unbalanced RET after vmexit!
11020          *
11021          * For retpoline or IBRS, RSB filling is needed to prevent poisoned RSB
11022          * entries and (in some cases) RSB underflow.
11023          *
11024          * eIBRS has its own protection against poisoned RSB, so it doesn't
11025          * need the RSB filling sequence.  But it does need to be enabled, and a
11026          * single call to retire, before the first unbalanced RET.
11027          *
11028          * So no RETs before vmx_spec_ctrl_restore_host() below.
11029          */
11030         vmexit_fill_RSB();
11031
11032         /* Save this for below */
11033         spec_ctrl = vmx_spec_ctrl_restore_host(vmx);
11034
11035         vmx_enable_fb_clear(vmx);
11036
11037         /*
11038          * We do not use IBRS in the kernel. If this vCPU has used the
11039          * SPEC_CTRL MSR it may have left it on; save the value and
11040          * turn it off. This is much more efficient than blindly adding
11041          * it to the atomic save/restore list. Especially as the former
11042          * (Saving guest MSRs on vmexit) doesn't even exist in KVM.
11043          *
11044          * For non-nested case:
11045          * If the L01 MSR bitmap does not intercept the MSR, then we need to
11046          * save it.
11047          *
11048          * For nested case:
11049          * If the L02 MSR bitmap does not intercept the MSR, then we need to
11050          * save it.
11051          */
11052         if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
11053                 vmx->spec_ctrl = spec_ctrl;
11054
11055         /* All fields are clean at this point */
11056         if (static_branch_unlikely(&enable_evmcs))
11057                 current_evmcs->hv_clean_fields |=
11058                         HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
11059
11060         /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
11061         if (vmx->host_debugctlmsr)
11062                 update_debugctlmsr(vmx->host_debugctlmsr);
11063
11064 #ifndef CONFIG_X86_64
11065         /*
11066          * The sysexit path does not restore ds/es, so we must set them to
11067          * a reasonable value ourselves.
11068          *
11069          * We can't defer this to vmx_prepare_switch_to_host() since that
11070          * function may be executed in interrupt context, which saves and
11071          * restore segments around it, nullifying its effect.
11072          */
11073         loadsegment(ds, __USER_DS);
11074         loadsegment(es, __USER_DS);
11075 #endif
11076
11077         vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)
11078                                   | (1 << VCPU_EXREG_RFLAGS)
11079                                   | (1 << VCPU_EXREG_PDPTR)
11080                                   | (1 << VCPU_EXREG_SEGMENTS)
11081                                   | (1 << VCPU_EXREG_CR3));
11082         vcpu->arch.regs_dirty = 0;
11083
11084         /*
11085          * eager fpu is enabled if PKEY is supported and CR4 is switched
11086          * back on host, so it is safe to read guest PKRU from current
11087          * XSAVE.
11088          */
11089         if (static_cpu_has(X86_FEATURE_PKU) &&
11090             kvm_read_cr4_bits(vcpu, X86_CR4_PKE)) {
11091                 vcpu->arch.pkru = __read_pkru();
11092                 if (vcpu->arch.pkru != vmx->host_pkru)
11093                         __write_pkru(vmx->host_pkru);
11094         }
11095
11096         kvm_put_guest_xcr0(vcpu);
11097
11098         vmx->nested.nested_run_pending = 0;
11099         vmx->idt_vectoring_info = 0;
11100
11101         vmx->exit_reason = vmx->fail ? 0xdead : vmcs_read32(VM_EXIT_REASON);
11102         if ((u16)vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY)
11103                 kvm_machine_check();
11104
11105         if (vmx->fail || (vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY))
11106                 return;
11107
11108         vmx->loaded_vmcs->launched = 1;
11109         vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
11110
11111         vmx_complete_atomic_exit(vmx);
11112         vmx_recover_nmi_blocking(vmx);
11113         vmx_complete_interrupts(vmx);
11114 }
11115 STACK_FRAME_NON_STANDARD(vmx_vcpu_run);
11116
11117 static struct kvm *vmx_vm_alloc(void)
11118 {
11119         struct kvm_vmx *kvm_vmx = vzalloc(sizeof(struct kvm_vmx));
11120
11121         if (!kvm_vmx)
11122                 return NULL;
11123
11124         return &kvm_vmx->kvm;
11125 }
11126
11127 static void vmx_vm_free(struct kvm *kvm)
11128 {
11129         vfree(to_kvm_vmx(kvm));
11130 }
11131
11132 static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
11133 {
11134         struct vcpu_vmx *vmx = to_vmx(vcpu);
11135         int cpu;
11136
11137         if (vmx->loaded_vmcs == vmcs)
11138                 return;
11139
11140         cpu = get_cpu();
11141         vmx_vcpu_put(vcpu);
11142         vmx->loaded_vmcs = vmcs;
11143         vmx_vcpu_load(vcpu, cpu);
11144         put_cpu();
11145 }
11146
11147 /*
11148  * Ensure that the current vmcs of the logical processor is the
11149  * vmcs01 of the vcpu before calling free_nested().
11150  */
11151 static void vmx_free_vcpu_nested(struct kvm_vcpu *vcpu)
11152 {
11153        struct vcpu_vmx *vmx = to_vmx(vcpu);
11154
11155        vcpu_load(vcpu);
11156        vmx_switch_vmcs(vcpu, &vmx->vmcs01);
11157        free_nested(vmx);
11158        vcpu_put(vcpu);
11159 }
11160
11161 static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
11162 {
11163         struct vcpu_vmx *vmx = to_vmx(vcpu);
11164
11165         if (enable_pml)
11166                 vmx_destroy_pml_buffer(vmx);
11167         free_vpid(vmx->vpid);
11168         leave_guest_mode(vcpu);
11169         vmx_free_vcpu_nested(vcpu);
11170         free_loaded_vmcs(vmx->loaded_vmcs);
11171         kfree(vmx->guest_msrs);
11172         kvm_vcpu_uninit(vcpu);
11173         kmem_cache_free(kvm_vcpu_cache, vmx);
11174 }
11175
11176 static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
11177 {
11178         int err;
11179         struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
11180         unsigned long *msr_bitmap;
11181         int cpu;
11182
11183         if (!vmx)
11184                 return ERR_PTR(-ENOMEM);
11185
11186         vmx->vpid = allocate_vpid();
11187
11188         err = kvm_vcpu_init(&vmx->vcpu, kvm, id);
11189         if (err)
11190                 goto free_vcpu;
11191
11192         err = -ENOMEM;
11193
11194         /*
11195          * If PML is turned on, failure on enabling PML just results in failure
11196          * of creating the vcpu, therefore we can simplify PML logic (by
11197          * avoiding dealing with cases, such as enabling PML partially on vcpus
11198          * for the guest, etc.
11199          */
11200         if (enable_pml) {
11201                 vmx->pml_pg = alloc_page(GFP_KERNEL | __GFP_ZERO);
11202                 if (!vmx->pml_pg)
11203                         goto uninit_vcpu;
11204         }
11205
11206         vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
11207         BUILD_BUG_ON(ARRAY_SIZE(vmx_msr_index) * sizeof(vmx->guest_msrs[0])
11208                      > PAGE_SIZE);
11209
11210         if (!vmx->guest_msrs)
11211                 goto free_pml;
11212
11213         err = alloc_loaded_vmcs(&vmx->vmcs01);
11214         if (err < 0)
11215                 goto free_msrs;
11216
11217         msr_bitmap = vmx->vmcs01.msr_bitmap;
11218         vmx_disable_intercept_for_msr(msr_bitmap, MSR_FS_BASE, MSR_TYPE_RW);
11219         vmx_disable_intercept_for_msr(msr_bitmap, MSR_GS_BASE, MSR_TYPE_RW);
11220         vmx_disable_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
11221         vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
11222         vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
11223         vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
11224         vmx->msr_bitmap_mode = 0;
11225
11226         vmx->loaded_vmcs = &vmx->vmcs01;
11227         cpu = get_cpu();
11228         vmx_vcpu_load(&vmx->vcpu, cpu);
11229         vmx->vcpu.cpu = cpu;
11230         vmx_vcpu_setup(vmx);
11231         vmx_vcpu_put(&vmx->vcpu);
11232         put_cpu();
11233         if (cpu_need_virtualize_apic_accesses(&vmx->vcpu)) {
11234                 err = alloc_apic_access_page(kvm);
11235                 if (err)
11236                         goto free_vmcs;
11237         }
11238
11239         if (enable_ept && !enable_unrestricted_guest) {
11240                 err = init_rmode_identity_map(kvm);
11241                 if (err)
11242                         goto free_vmcs;
11243         }
11244
11245         if (nested)
11246                 nested_vmx_setup_ctls_msrs(&vmx->nested.msrs,
11247                                            kvm_vcpu_apicv_active(&vmx->vcpu));
11248
11249         vmx->nested.posted_intr_nv = -1;
11250         vmx->nested.current_vmptr = -1ull;
11251
11252         vmx->msr_ia32_feature_control_valid_bits = FEATURE_CONTROL_LOCKED;
11253
11254         /*
11255          * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR
11256          * or POSTED_INTR_WAKEUP_VECTOR.
11257          */
11258         vmx->pi_desc.nv = POSTED_INTR_VECTOR;
11259         vmx->pi_desc.sn = 1;
11260
11261         return &vmx->vcpu;
11262
11263 free_vmcs:
11264         free_loaded_vmcs(vmx->loaded_vmcs);
11265 free_msrs:
11266         kfree(vmx->guest_msrs);
11267 free_pml:
11268         vmx_destroy_pml_buffer(vmx);
11269 uninit_vcpu:
11270         kvm_vcpu_uninit(&vmx->vcpu);
11271 free_vcpu:
11272         free_vpid(vmx->vpid);
11273         kmem_cache_free(kvm_vcpu_cache, vmx);
11274         return ERR_PTR(err);
11275 }
11276
11277 #define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
11278 #define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
11279
11280 static int vmx_vm_init(struct kvm *kvm)
11281 {
11282         spin_lock_init(&to_kvm_vmx(kvm)->ept_pointer_lock);
11283
11284         if (!ple_gap)
11285                 kvm->arch.pause_in_guest = true;
11286
11287         if (boot_cpu_has(X86_BUG_L1TF) && enable_ept) {
11288                 switch (l1tf_mitigation) {
11289                 case L1TF_MITIGATION_OFF:
11290                 case L1TF_MITIGATION_FLUSH_NOWARN:
11291                         /* 'I explicitly don't care' is set */
11292                         break;
11293                 case L1TF_MITIGATION_FLUSH:
11294                 case L1TF_MITIGATION_FLUSH_NOSMT:
11295                 case L1TF_MITIGATION_FULL:
11296                         /*
11297                          * Warn upon starting the first VM in a potentially
11298                          * insecure environment.
11299                          */
11300                         if (sched_smt_active())
11301                                 pr_warn_once(L1TF_MSG_SMT);
11302                         if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER)
11303                                 pr_warn_once(L1TF_MSG_L1D);
11304                         break;
11305                 case L1TF_MITIGATION_FULL_FORCE:
11306                         /* Flush is enforced */
11307                         break;
11308                 }
11309         }
11310         return 0;
11311 }
11312
11313 static void __init vmx_check_processor_compat(void *rtn)
11314 {
11315         struct vmcs_config vmcs_conf;
11316
11317         *(int *)rtn = 0;
11318         if (setup_vmcs_config(&vmcs_conf) < 0)
11319                 *(int *)rtn = -EIO;
11320         nested_vmx_setup_ctls_msrs(&vmcs_conf.nested, enable_apicv);
11321         if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) {
11322                 printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n",
11323                                 smp_processor_id());
11324                 *(int *)rtn = -EIO;
11325         }
11326 }
11327
11328 static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
11329 {
11330         u8 cache;
11331         u64 ipat = 0;
11332
11333         /* For VT-d and EPT combination
11334          * 1. MMIO: always map as UC
11335          * 2. EPT with VT-d:
11336          *   a. VT-d without snooping control feature: can't guarantee the
11337          *      result, try to trust guest.
11338          *   b. VT-d with snooping control feature: snooping control feature of
11339          *      VT-d engine can guarantee the cache correctness. Just set it
11340          *      to WB to keep consistent with host. So the same as item 3.
11341          * 3. EPT without VT-d: always map as WB and set IPAT=1 to keep
11342          *    consistent with host MTRR
11343          */
11344         if (is_mmio) {
11345                 cache = MTRR_TYPE_UNCACHABLE;
11346                 goto exit;
11347         }
11348
11349         if (!kvm_arch_has_noncoherent_dma(vcpu->kvm)) {
11350                 ipat = VMX_EPT_IPAT_BIT;
11351                 cache = MTRR_TYPE_WRBACK;
11352                 goto exit;
11353         }
11354
11355         if (kvm_read_cr0(vcpu) & X86_CR0_CD) {
11356                 ipat = VMX_EPT_IPAT_BIT;
11357                 if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
11358                         cache = MTRR_TYPE_WRBACK;
11359                 else
11360                         cache = MTRR_TYPE_UNCACHABLE;
11361                 goto exit;
11362         }
11363
11364         cache = kvm_mtrr_get_guest_memory_type(vcpu, gfn);
11365
11366 exit:
11367         return (cache << VMX_EPT_MT_EPTE_SHIFT) | ipat;
11368 }
11369
11370 static int vmx_get_lpage_level(void)
11371 {
11372         if (enable_ept && !cpu_has_vmx_ept_1g_page())
11373                 return PT_DIRECTORY_LEVEL;
11374         else
11375                 /* For shadow and EPT supported 1GB page */
11376                 return PT_PDPE_LEVEL;
11377 }
11378
11379 static void vmcs_set_secondary_exec_control(u32 new_ctl)
11380 {
11381         /*
11382          * These bits in the secondary execution controls field
11383          * are dynamic, the others are mostly based on the hypervisor
11384          * architecture and the guest's CPUID.  Do not touch the
11385          * dynamic bits.
11386          */
11387         u32 mask =
11388                 SECONDARY_EXEC_SHADOW_VMCS |
11389                 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
11390                 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
11391                 SECONDARY_EXEC_DESC;
11392
11393         u32 cur_ctl = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
11394
11395         vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
11396                      (new_ctl & ~mask) | (cur_ctl & mask));
11397 }
11398
11399 /*
11400  * Generate MSR_IA32_VMX_CR{0,4}_FIXED1 according to CPUID. Only set bits
11401  * (indicating "allowed-1") if they are supported in the guest's CPUID.
11402  */
11403 static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu)
11404 {
11405         struct vcpu_vmx *vmx = to_vmx(vcpu);
11406         struct kvm_cpuid_entry2 *entry;
11407
11408         vmx->nested.msrs.cr0_fixed1 = 0xffffffff;
11409         vmx->nested.msrs.cr4_fixed1 = X86_CR4_PCE;
11410
11411 #define cr4_fixed1_update(_cr4_mask, _reg, _cpuid_mask) do {            \
11412         if (entry && (entry->_reg & (_cpuid_mask)))                     \
11413                 vmx->nested.msrs.cr4_fixed1 |= (_cr4_mask);     \
11414 } while (0)
11415
11416         entry = kvm_find_cpuid_entry(vcpu, 0x1, 0);
11417         cr4_fixed1_update(X86_CR4_VME,        edx, bit(X86_FEATURE_VME));
11418         cr4_fixed1_update(X86_CR4_PVI,        edx, bit(X86_FEATURE_VME));
11419         cr4_fixed1_update(X86_CR4_TSD,        edx, bit(X86_FEATURE_TSC));
11420         cr4_fixed1_update(X86_CR4_DE,         edx, bit(X86_FEATURE_DE));
11421         cr4_fixed1_update(X86_CR4_PSE,        edx, bit(X86_FEATURE_PSE));
11422         cr4_fixed1_update(X86_CR4_PAE,        edx, bit(X86_FEATURE_PAE));
11423         cr4_fixed1_update(X86_CR4_MCE,        edx, bit(X86_FEATURE_MCE));
11424         cr4_fixed1_update(X86_CR4_PGE,        edx, bit(X86_FEATURE_PGE));
11425         cr4_fixed1_update(X86_CR4_OSFXSR,     edx, bit(X86_FEATURE_FXSR));
11426         cr4_fixed1_update(X86_CR4_OSXMMEXCPT, edx, bit(X86_FEATURE_XMM));
11427         cr4_fixed1_update(X86_CR4_VMXE,       ecx, bit(X86_FEATURE_VMX));
11428         cr4_fixed1_update(X86_CR4_SMXE,       ecx, bit(X86_FEATURE_SMX));
11429         cr4_fixed1_update(X86_CR4_PCIDE,      ecx, bit(X86_FEATURE_PCID));
11430         cr4_fixed1_update(X86_CR4_OSXSAVE,    ecx, bit(X86_FEATURE_XSAVE));
11431
11432         entry = kvm_find_cpuid_entry(vcpu, 0x7, 0);
11433         cr4_fixed1_update(X86_CR4_FSGSBASE,   ebx, bit(X86_FEATURE_FSGSBASE));
11434         cr4_fixed1_update(X86_CR4_SMEP,       ebx, bit(X86_FEATURE_SMEP));
11435         cr4_fixed1_update(X86_CR4_SMAP,       ebx, bit(X86_FEATURE_SMAP));
11436         cr4_fixed1_update(X86_CR4_PKE,        ecx, bit(X86_FEATURE_PKU));
11437         cr4_fixed1_update(X86_CR4_UMIP,       ecx, bit(X86_FEATURE_UMIP));
11438
11439 #undef cr4_fixed1_update
11440 }
11441
11442 static void nested_vmx_entry_exit_ctls_update(struct kvm_vcpu *vcpu)
11443 {
11444         struct vcpu_vmx *vmx = to_vmx(vcpu);
11445
11446         if (kvm_mpx_supported()) {
11447                 bool mpx_enabled = guest_cpuid_has(vcpu, X86_FEATURE_MPX);
11448
11449                 if (mpx_enabled) {
11450                         vmx->nested.msrs.entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;
11451                         vmx->nested.msrs.exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
11452                 } else {
11453                         vmx->nested.msrs.entry_ctls_high &= ~VM_ENTRY_LOAD_BNDCFGS;
11454                         vmx->nested.msrs.exit_ctls_high &= ~VM_EXIT_CLEAR_BNDCFGS;
11455                 }
11456         }
11457 }
11458
11459 static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
11460 {
11461         struct vcpu_vmx *vmx = to_vmx(vcpu);
11462
11463         if (cpu_has_secondary_exec_ctrls()) {
11464                 vmx_compute_secondary_exec_control(vmx);
11465                 vmcs_set_secondary_exec_control(vmx->secondary_exec_control);
11466         }
11467
11468         if (nested_vmx_allowed(vcpu))
11469                 to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
11470                         FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
11471         else
11472                 to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
11473                         ~FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
11474
11475         if (nested_vmx_allowed(vcpu)) {
11476                 nested_vmx_cr_fixed1_bits_update(vcpu);
11477                 nested_vmx_entry_exit_ctls_update(vcpu);
11478         }
11479 }
11480
11481 static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
11482 {
11483         if (func == 1 && nested)
11484                 entry->ecx |= bit(X86_FEATURE_VMX);
11485 }
11486
11487 static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
11488                 struct x86_exception *fault)
11489 {
11490         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
11491         struct vcpu_vmx *vmx = to_vmx(vcpu);
11492         u32 exit_reason;
11493         unsigned long exit_qualification = vcpu->arch.exit_qualification;
11494
11495         if (vmx->nested.pml_full) {
11496                 exit_reason = EXIT_REASON_PML_FULL;
11497                 vmx->nested.pml_full = false;
11498                 exit_qualification &= INTR_INFO_UNBLOCK_NMI;
11499         } else if (fault->error_code & PFERR_RSVD_MASK)
11500                 exit_reason = EXIT_REASON_EPT_MISCONFIG;
11501         else
11502                 exit_reason = EXIT_REASON_EPT_VIOLATION;
11503
11504         nested_vmx_vmexit(vcpu, exit_reason, 0, exit_qualification);
11505         vmcs12->guest_physical_address = fault->address;
11506 }
11507
11508 static bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu)
11509 {
11510         return nested_ept_get_cr3(vcpu) & VMX_EPTP_AD_ENABLE_BIT;
11511 }
11512
11513 /* Callbacks for nested_ept_init_mmu_context: */
11514
11515 static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu)
11516 {
11517         /* return the page table to be shadowed - in our case, EPT12 */
11518         return get_vmcs12(vcpu)->ept_pointer;
11519 }
11520
11521 static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
11522 {
11523         WARN_ON(mmu_is_nested(vcpu));
11524         if (!valid_ept_address(vcpu, nested_ept_get_cr3(vcpu)))
11525                 return 1;
11526
11527         kvm_init_shadow_ept_mmu(vcpu,
11528                         to_vmx(vcpu)->nested.msrs.ept_caps &
11529                         VMX_EPT_EXECUTE_ONLY_BIT,
11530                         nested_ept_ad_enabled(vcpu),
11531                         nested_ept_get_cr3(vcpu));
11532         vcpu->arch.mmu.set_cr3           = vmx_set_cr3;
11533         vcpu->arch.mmu.get_cr3           = nested_ept_get_cr3;
11534         vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault;
11535
11536         vcpu->arch.walk_mmu              = &vcpu->arch.nested_mmu;
11537         return 0;
11538 }
11539
11540 static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
11541 {
11542         vcpu->arch.walk_mmu = &vcpu->arch.mmu;
11543 }
11544
11545 static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
11546                                             u16 error_code)
11547 {
11548         bool inequality, bit;
11549
11550         bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0;
11551         inequality =
11552                 (error_code & vmcs12->page_fault_error_code_mask) !=
11553                  vmcs12->page_fault_error_code_match;
11554         return inequality ^ bit;
11555 }
11556
11557 static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
11558                 struct x86_exception *fault)
11559 {
11560         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
11561
11562         WARN_ON(!is_guest_mode(vcpu));
11563
11564         if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code) &&
11565                 !to_vmx(vcpu)->nested.nested_run_pending) {
11566                 vmcs12->vm_exit_intr_error_code = fault->error_code;
11567                 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
11568                                   PF_VECTOR | INTR_TYPE_HARD_EXCEPTION |
11569                                   INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK,
11570                                   fault->address);
11571         } else {
11572                 kvm_inject_page_fault(vcpu, fault);
11573         }
11574 }
11575
11576 static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
11577                                                  struct vmcs12 *vmcs12);
11578
11579 static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
11580 {
11581         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
11582         struct vcpu_vmx *vmx = to_vmx(vcpu);
11583         struct page *page;
11584         u64 hpa;
11585
11586         if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
11587                 /*
11588                  * Translate L1 physical address to host physical
11589                  * address for vmcs02. Keep the page pinned, so this
11590                  * physical address remains valid. We keep a reference
11591                  * to it so we can release it later.
11592                  */
11593                 if (vmx->nested.apic_access_page) { /* shouldn't happen */
11594                         kvm_release_page_dirty(vmx->nested.apic_access_page);
11595                         vmx->nested.apic_access_page = NULL;
11596                 }
11597                 page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->apic_access_addr);
11598                 /*
11599                  * If translation failed, no matter: This feature asks
11600                  * to exit when accessing the given address, and if it
11601                  * can never be accessed, this feature won't do
11602                  * anything anyway.
11603                  */
11604                 if (!is_error_page(page)) {
11605                         vmx->nested.apic_access_page = page;
11606                         hpa = page_to_phys(vmx->nested.apic_access_page);
11607                         vmcs_write64(APIC_ACCESS_ADDR, hpa);
11608                 } else {
11609                         vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
11610                                         SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
11611                 }
11612         }
11613
11614         if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
11615                 if (vmx->nested.virtual_apic_page) { /* shouldn't happen */
11616                         kvm_release_page_dirty(vmx->nested.virtual_apic_page);
11617                         vmx->nested.virtual_apic_page = NULL;
11618                 }
11619                 page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->virtual_apic_page_addr);
11620
11621                 /*
11622                  * If translation failed, VM entry will fail because
11623                  * prepare_vmcs02 set VIRTUAL_APIC_PAGE_ADDR to -1ull.
11624                  * Failing the vm entry is _not_ what the processor
11625                  * does but it's basically the only possibility we
11626                  * have.  We could still enter the guest if CR8 load
11627                  * exits are enabled, CR8 store exits are enabled, and
11628                  * virtualize APIC access is disabled; in this case
11629                  * the processor would never use the TPR shadow and we
11630                  * could simply clear the bit from the execution
11631                  * control.  But such a configuration is useless, so
11632                  * let's keep the code simple.
11633                  */
11634                 if (!is_error_page(page)) {
11635                         vmx->nested.virtual_apic_page = page;
11636                         hpa = page_to_phys(vmx->nested.virtual_apic_page);
11637                         vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, hpa);
11638                 }
11639         }
11640
11641         if (nested_cpu_has_posted_intr(vmcs12)) {
11642                 if (vmx->nested.pi_desc_page) { /* shouldn't happen */
11643                         kunmap(vmx->nested.pi_desc_page);
11644                         kvm_release_page_dirty(vmx->nested.pi_desc_page);
11645                         vmx->nested.pi_desc_page = NULL;
11646                         vmx->nested.pi_desc = NULL;
11647                         vmcs_write64(POSTED_INTR_DESC_ADDR, -1ull);
11648                 }
11649                 page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->posted_intr_desc_addr);
11650                 if (is_error_page(page))
11651                         return;
11652                 vmx->nested.pi_desc_page = page;
11653                 vmx->nested.pi_desc = kmap(vmx->nested.pi_desc_page);
11654                 vmx->nested.pi_desc =
11655                         (struct pi_desc *)((void *)vmx->nested.pi_desc +
11656                         (unsigned long)(vmcs12->posted_intr_desc_addr &
11657                         (PAGE_SIZE - 1)));
11658                 vmcs_write64(POSTED_INTR_DESC_ADDR,
11659                         page_to_phys(vmx->nested.pi_desc_page) +
11660                         (unsigned long)(vmcs12->posted_intr_desc_addr &
11661                         (PAGE_SIZE - 1)));
11662         }
11663         if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12))
11664                 vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL,
11665                               CPU_BASED_USE_MSR_BITMAPS);
11666         else
11667                 vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
11668                                 CPU_BASED_USE_MSR_BITMAPS);
11669 }
11670
11671 static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu)
11672 {
11673         u64 preemption_timeout = get_vmcs12(vcpu)->vmx_preemption_timer_value;
11674         struct vcpu_vmx *vmx = to_vmx(vcpu);
11675
11676         /*
11677          * A timer value of zero is architecturally guaranteed to cause
11678          * a VMExit prior to executing any instructions in the guest.
11679          */
11680         if (preemption_timeout == 0) {
11681                 vmx_preemption_timer_fn(&vmx->nested.preemption_timer);
11682                 return;
11683         }
11684
11685         if (vcpu->arch.virtual_tsc_khz == 0)
11686                 return;
11687
11688         preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
11689         preemption_timeout *= 1000000;
11690         do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz);
11691         hrtimer_start(&vmx->nested.preemption_timer,
11692                       ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL);
11693 }
11694
11695 static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu,
11696                                                struct vmcs12 *vmcs12)
11697 {
11698         if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
11699                 return 0;
11700
11701         if (!page_address_valid(vcpu, vmcs12->io_bitmap_a) ||
11702             !page_address_valid(vcpu, vmcs12->io_bitmap_b))
11703                 return -EINVAL;
11704
11705         return 0;
11706 }
11707
11708 static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,
11709                                                 struct vmcs12 *vmcs12)
11710 {
11711         if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
11712                 return 0;
11713
11714         if (!page_address_valid(vcpu, vmcs12->msr_bitmap))
11715                 return -EINVAL;
11716
11717         return 0;
11718 }
11719
11720 static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu,
11721                                                 struct vmcs12 *vmcs12)
11722 {
11723         if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
11724                 return 0;
11725
11726         if (!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr))
11727                 return -EINVAL;
11728
11729         return 0;
11730 }
11731
11732 static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap) {
11733         int msr;
11734
11735         for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
11736                 unsigned word = msr / BITS_PER_LONG;
11737
11738                 msr_bitmap[word] = ~0;
11739                 msr_bitmap[word + (0x800 / sizeof(long))] = ~0;
11740         }
11741 }
11742
11743 /*
11744  * Merge L0's and L1's MSR bitmap, return false to indicate that
11745  * we do not use the hardware.
11746  */
11747 static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
11748                                                  struct vmcs12 *vmcs12)
11749 {
11750         int msr;
11751         struct page *page;
11752         unsigned long *msr_bitmap_l1;
11753         unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap;
11754         /*
11755          * pred_cmd & spec_ctrl are trying to verify two things:
11756          *
11757          * 1. L0 gave a permission to L1 to actually passthrough the MSR. This
11758          *    ensures that we do not accidentally generate an L02 MSR bitmap
11759          *    from the L12 MSR bitmap that is too permissive.
11760          * 2. That L1 or L2s have actually used the MSR. This avoids
11761          *    unnecessarily merging of the bitmap if the MSR is unused. This
11762          *    works properly because we only update the L01 MSR bitmap lazily.
11763          *    So even if L0 should pass L1 these MSRs, the L01 bitmap is only
11764          *    updated to reflect this when L1 (or its L2s) actually write to
11765          *    the MSR.
11766          */
11767         bool pred_cmd = !msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD);
11768         bool spec_ctrl = !msr_write_intercepted_l01(vcpu, MSR_IA32_SPEC_CTRL);
11769
11770         /* Nothing to do if the MSR bitmap is not in use.  */
11771         if (!cpu_has_vmx_msr_bitmap() ||
11772             !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
11773                 return false;
11774
11775         if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
11776             !pred_cmd && !spec_ctrl)
11777                 return false;
11778
11779         page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->msr_bitmap);
11780         if (is_error_page(page))
11781                 return false;
11782
11783         msr_bitmap_l1 = (unsigned long *)kmap(page);
11784
11785         /*
11786          * To keep the control flow simple, pay eight 8-byte writes (sixteen
11787          * 4-byte writes on 32-bit systems) up front to enable intercepts for
11788          * the x2APIC MSR range and selectively disable them below.
11789          */
11790         enable_x2apic_msr_intercepts(msr_bitmap_l0);
11791
11792         if (nested_cpu_has_virt_x2apic_mode(vmcs12)) {
11793                 if (nested_cpu_has_apic_reg_virt(vmcs12)) {
11794                         /*
11795                          * L0 need not intercept reads for MSRs between 0x800
11796                          * and 0x8ff, it just lets the processor take the value
11797                          * from the virtual-APIC page; take those 256 bits
11798                          * directly from the L1 bitmap.
11799                          */
11800                         for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
11801                                 unsigned word = msr / BITS_PER_LONG;
11802
11803                                 msr_bitmap_l0[word] = msr_bitmap_l1[word];
11804                         }
11805                 }
11806
11807                 nested_vmx_disable_intercept_for_msr(
11808                         msr_bitmap_l1, msr_bitmap_l0,
11809                         X2APIC_MSR(APIC_TASKPRI),
11810                         MSR_TYPE_R | MSR_TYPE_W);
11811
11812                 if (nested_cpu_has_vid(vmcs12)) {
11813                         nested_vmx_disable_intercept_for_msr(
11814                                 msr_bitmap_l1, msr_bitmap_l0,
11815                                 X2APIC_MSR(APIC_EOI),
11816                                 MSR_TYPE_W);
11817                         nested_vmx_disable_intercept_for_msr(
11818                                 msr_bitmap_l1, msr_bitmap_l0,
11819                                 X2APIC_MSR(APIC_SELF_IPI),
11820                                 MSR_TYPE_W);
11821                 }
11822         }
11823
11824         if (spec_ctrl)
11825                 nested_vmx_disable_intercept_for_msr(
11826                                         msr_bitmap_l1, msr_bitmap_l0,
11827                                         MSR_IA32_SPEC_CTRL,
11828                                         MSR_TYPE_R | MSR_TYPE_W);
11829
11830         if (pred_cmd)
11831                 nested_vmx_disable_intercept_for_msr(
11832                                         msr_bitmap_l1, msr_bitmap_l0,
11833                                         MSR_IA32_PRED_CMD,
11834                                         MSR_TYPE_W);
11835
11836         kunmap(page);
11837         kvm_release_page_clean(page);
11838
11839         return true;
11840 }
11841
11842 static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu,
11843                                        struct vmcs12 *vmcs12)
11844 {
11845         struct vmcs12 *shadow;
11846         struct page *page;
11847
11848         if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
11849             vmcs12->vmcs_link_pointer == -1ull)
11850                 return;
11851
11852         shadow = get_shadow_vmcs12(vcpu);
11853         page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->vmcs_link_pointer);
11854
11855         memcpy(shadow, kmap(page), VMCS12_SIZE);
11856
11857         kunmap(page);
11858         kvm_release_page_clean(page);
11859 }
11860
11861 static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu,
11862                                               struct vmcs12 *vmcs12)
11863 {
11864         struct vcpu_vmx *vmx = to_vmx(vcpu);
11865
11866         if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
11867             vmcs12->vmcs_link_pointer == -1ull)
11868                 return;
11869
11870         kvm_write_guest(vmx->vcpu.kvm, vmcs12->vmcs_link_pointer,
11871                         get_shadow_vmcs12(vcpu), VMCS12_SIZE);
11872 }
11873
11874 static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu,
11875                                           struct vmcs12 *vmcs12)
11876 {
11877         if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
11878             !page_address_valid(vcpu, vmcs12->apic_access_addr))
11879                 return -EINVAL;
11880         else
11881                 return 0;
11882 }
11883
11884 static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
11885                                            struct vmcs12 *vmcs12)
11886 {
11887         if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
11888             !nested_cpu_has_apic_reg_virt(vmcs12) &&
11889             !nested_cpu_has_vid(vmcs12) &&
11890             !nested_cpu_has_posted_intr(vmcs12))
11891                 return 0;
11892
11893         /*
11894          * If virtualize x2apic mode is enabled,
11895          * virtualize apic access must be disabled.
11896          */
11897         if (nested_cpu_has_virt_x2apic_mode(vmcs12) &&
11898             nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
11899                 return -EINVAL;
11900
11901         /*
11902          * If virtual interrupt delivery is enabled,
11903          * we must exit on external interrupts.
11904          */
11905         if (nested_cpu_has_vid(vmcs12) &&
11906            !nested_exit_on_intr(vcpu))
11907                 return -EINVAL;
11908
11909         /*
11910          * bits 15:8 should be zero in posted_intr_nv,
11911          * the descriptor address has been already checked
11912          * in nested_get_vmcs12_pages.
11913          *
11914          * bits 5:0 of posted_intr_desc_addr should be zero.
11915          */
11916         if (nested_cpu_has_posted_intr(vmcs12) &&
11917            (!nested_cpu_has_vid(vmcs12) ||
11918             !nested_exit_intr_ack_set(vcpu) ||
11919             (vmcs12->posted_intr_nv & 0xff00) ||
11920             (vmcs12->posted_intr_desc_addr & 0x3f) ||
11921             (vmcs12->posted_intr_desc_addr >> cpuid_maxphyaddr(vcpu))))
11922                 return -EINVAL;
11923
11924         /* tpr shadow is needed by all apicv features. */
11925         if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
11926                 return -EINVAL;
11927
11928         return 0;
11929 }
11930
11931 static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
11932                                        unsigned long count_field,
11933                                        unsigned long addr_field)
11934 {
11935         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
11936         int maxphyaddr;
11937         u64 count, addr;
11938
11939         if (vmcs12_read_any(vmcs12, count_field, &count) ||
11940             vmcs12_read_any(vmcs12, addr_field, &addr)) {
11941                 WARN_ON(1);
11942                 return -EINVAL;
11943         }
11944         if (count == 0)
11945                 return 0;
11946         maxphyaddr = cpuid_maxphyaddr(vcpu);
11947         if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr ||
11948             (addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr) {
11949                 pr_debug_ratelimited(
11950                         "nVMX: invalid MSR switch (0x%lx, %d, %llu, 0x%08llx)",
11951                         addr_field, maxphyaddr, count, addr);
11952                 return -EINVAL;
11953         }
11954         return 0;
11955 }
11956
11957 static int nested_vmx_check_msr_switch_controls(struct kvm_vcpu *vcpu,
11958                                                 struct vmcs12 *vmcs12)
11959 {
11960         if (vmcs12->vm_exit_msr_load_count == 0 &&
11961             vmcs12->vm_exit_msr_store_count == 0 &&
11962             vmcs12->vm_entry_msr_load_count == 0)
11963                 return 0; /* Fast path */
11964         if (nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_LOAD_COUNT,
11965                                         VM_EXIT_MSR_LOAD_ADDR) ||
11966             nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_STORE_COUNT,
11967                                         VM_EXIT_MSR_STORE_ADDR) ||
11968             nested_vmx_check_msr_switch(vcpu, VM_ENTRY_MSR_LOAD_COUNT,
11969                                         VM_ENTRY_MSR_LOAD_ADDR))
11970                 return -EINVAL;
11971         return 0;
11972 }
11973
11974 static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu,
11975                                          struct vmcs12 *vmcs12)
11976 {
11977         u64 address = vmcs12->pml_address;
11978         int maxphyaddr = cpuid_maxphyaddr(vcpu);
11979
11980         if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_PML)) {
11981                 if (!nested_cpu_has_ept(vmcs12) ||
11982                     !IS_ALIGNED(address, 4096)  ||
11983                     address >> maxphyaddr)
11984                         return -EINVAL;
11985         }
11986
11987         return 0;
11988 }
11989
11990 static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu,
11991                                                  struct vmcs12 *vmcs12)
11992 {
11993         if (!nested_cpu_has_shadow_vmcs(vmcs12))
11994                 return 0;
11995
11996         if (!page_address_valid(vcpu, vmcs12->vmread_bitmap) ||
11997             !page_address_valid(vcpu, vmcs12->vmwrite_bitmap))
11998                 return -EINVAL;
11999
12000         return 0;
12001 }
12002
12003 static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu,
12004                                        struct vmx_msr_entry *e)
12005 {
12006         /* x2APIC MSR accesses are not allowed */
12007         if (vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8)
12008                 return -EINVAL;
12009         if (e->index == MSR_IA32_UCODE_WRITE || /* SDM Table 35-2 */
12010             e->index == MSR_IA32_UCODE_REV)
12011                 return -EINVAL;
12012         if (e->reserved != 0)
12013                 return -EINVAL;
12014         return 0;
12015 }
12016
12017 static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu,
12018                                      struct vmx_msr_entry *e)
12019 {
12020         if (e->index == MSR_FS_BASE ||
12021             e->index == MSR_GS_BASE ||
12022             e->index == MSR_IA32_SMM_MONITOR_CTL || /* SMM is not supported */
12023             nested_vmx_msr_check_common(vcpu, e))
12024                 return -EINVAL;
12025         return 0;
12026 }
12027
12028 static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu,
12029                                       struct vmx_msr_entry *e)
12030 {
12031         if (e->index == MSR_IA32_SMBASE || /* SMM is not supported */
12032             nested_vmx_msr_check_common(vcpu, e))
12033                 return -EINVAL;
12034         return 0;
12035 }
12036
12037 /*
12038  * Load guest's/host's msr at nested entry/exit.
12039  * return 0 for success, entry index for failure.
12040  */
12041 static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
12042 {
12043         u32 i;
12044         struct vmx_msr_entry e;
12045         struct msr_data msr;
12046
12047         msr.host_initiated = false;
12048         for (i = 0; i < count; i++) {
12049                 if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e),
12050                                         &e, sizeof(e))) {
12051                         pr_debug_ratelimited(
12052                                 "%s cannot read MSR entry (%u, 0x%08llx)\n",
12053                                 __func__, i, gpa + i * sizeof(e));
12054                         goto fail;
12055                 }
12056                 if (nested_vmx_load_msr_check(vcpu, &e)) {
12057                         pr_debug_ratelimited(
12058                                 "%s check failed (%u, 0x%x, 0x%x)\n",
12059                                 __func__, i, e.index, e.reserved);
12060                         goto fail;
12061                 }
12062                 msr.index = e.index;
12063                 msr.data = e.value;
12064                 if (kvm_set_msr(vcpu, &msr)) {
12065                         pr_debug_ratelimited(
12066                                 "%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
12067                                 __func__, i, e.index, e.value);
12068                         goto fail;
12069                 }
12070         }
12071         return 0;
12072 fail:
12073         return i + 1;
12074 }
12075
12076 static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
12077 {
12078         u32 i;
12079         struct vmx_msr_entry e;
12080
12081         for (i = 0; i < count; i++) {
12082                 struct msr_data msr_info;
12083                 if (kvm_vcpu_read_guest(vcpu,
12084                                         gpa + i * sizeof(e),
12085                                         &e, 2 * sizeof(u32))) {
12086                         pr_debug_ratelimited(
12087                                 "%s cannot read MSR entry (%u, 0x%08llx)\n",
12088                                 __func__, i, gpa + i * sizeof(e));
12089                         return -EINVAL;
12090                 }
12091                 if (nested_vmx_store_msr_check(vcpu, &e)) {
12092                         pr_debug_ratelimited(
12093                                 "%s check failed (%u, 0x%x, 0x%x)\n",
12094                                 __func__, i, e.index, e.reserved);
12095                         return -EINVAL;
12096                 }
12097                 msr_info.host_initiated = false;
12098                 msr_info.index = e.index;
12099                 if (kvm_get_msr(vcpu, &msr_info)) {
12100                         pr_debug_ratelimited(
12101                                 "%s cannot read MSR (%u, 0x%x)\n",
12102                                 __func__, i, e.index);
12103                         return -EINVAL;
12104                 }
12105                 if (kvm_vcpu_write_guest(vcpu,
12106                                          gpa + i * sizeof(e) +
12107                                              offsetof(struct vmx_msr_entry, value),
12108                                          &msr_info.data, sizeof(msr_info.data))) {
12109                         pr_debug_ratelimited(
12110                                 "%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
12111                                 __func__, i, e.index, msr_info.data);
12112                         return -EINVAL;
12113                 }
12114         }
12115         return 0;
12116 }
12117
12118 static bool nested_cr3_valid(struct kvm_vcpu *vcpu, unsigned long val)
12119 {
12120         unsigned long invalid_mask;
12121
12122         invalid_mask = (~0ULL) << cpuid_maxphyaddr(vcpu);
12123         return (val & invalid_mask) == 0;
12124 }
12125
12126 /*
12127  * Load guest's/host's cr3 at nested entry/exit. nested_ept is true if we are
12128  * emulating VM entry into a guest with EPT enabled.
12129  * Returns 0 on success, 1 on failure. Invalid state exit qualification code
12130  * is assigned to entry_failure_code on failure.
12131  */
12132 static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_ept,
12133                                u32 *entry_failure_code)
12134 {
12135         if (cr3 != kvm_read_cr3(vcpu) || (!nested_ept && pdptrs_changed(vcpu))) {
12136                 if (!nested_cr3_valid(vcpu, cr3)) {
12137                         *entry_failure_code = ENTRY_FAIL_DEFAULT;
12138                         return 1;
12139                 }
12140
12141                 /*
12142                  * If PAE paging and EPT are both on, CR3 is not used by the CPU and
12143                  * must not be dereferenced.
12144                  */
12145                 if (is_pae_paging(vcpu) && !nested_ept) {
12146                         if (!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) {
12147                                 *entry_failure_code = ENTRY_FAIL_PDPTE;
12148                                 return 1;
12149                         }
12150                 }
12151         }
12152
12153         if (!nested_ept)
12154                 kvm_mmu_new_cr3(vcpu, cr3, false);
12155
12156         vcpu->arch.cr3 = cr3;
12157         __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
12158
12159         kvm_init_mmu(vcpu, false);
12160
12161         return 0;
12162 }
12163
12164 static void prepare_vmcs02_full(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
12165 {
12166         struct vcpu_vmx *vmx = to_vmx(vcpu);
12167
12168         vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
12169         vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector);
12170         vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector);
12171         vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector);
12172         vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector);
12173         vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector);
12174         vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector);
12175         vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit);
12176         vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit);
12177         vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit);
12178         vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit);
12179         vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit);
12180         vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit);
12181         vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit);
12182         vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit);
12183         vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit);
12184         vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes);
12185         vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes);
12186         vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes);
12187         vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes);
12188         vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes);
12189         vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes);
12190         vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes);
12191         vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base);
12192         vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base);
12193         vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base);
12194         vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base);
12195         vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base);
12196         vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base);
12197         vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
12198         vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);
12199
12200         vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
12201         vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
12202                 vmcs12->guest_pending_dbg_exceptions);
12203         vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
12204         vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip);
12205
12206         if (nested_cpu_has_xsaves(vmcs12))
12207                 vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap);
12208         vmcs_write64(VMCS_LINK_POINTER, -1ull);
12209
12210         if (cpu_has_vmx_posted_intr())
12211                 vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR);
12212
12213         /*
12214          * Whether page-faults are trapped is determined by a combination of
12215          * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF.
12216          * If enable_ept, L0 doesn't care about page faults and we should
12217          * set all of these to L1's desires. However, if !enable_ept, L0 does
12218          * care about (at least some) page faults, and because it is not easy
12219          * (if at all possible?) to merge L0 and L1's desires, we simply ask
12220          * to exit on each and every L2 page fault. This is done by setting
12221          * MASK=MATCH=0 and (see below) EB.PF=1.
12222          * Note that below we don't need special code to set EB.PF beyond the
12223          * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept,
12224          * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when
12225          * !enable_ept, EB.PF is 1, so the "or" will always be 1.
12226          */
12227         vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK,
12228                 enable_ept ? vmcs12->page_fault_error_code_mask : 0);
12229         vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH,
12230                 enable_ept ? vmcs12->page_fault_error_code_match : 0);
12231
12232         /* All VMFUNCs are currently emulated through L0 vmexits.  */
12233         if (cpu_has_vmx_vmfunc())
12234                 vmcs_write64(VM_FUNCTION_CONTROL, 0);
12235
12236         if (cpu_has_vmx_apicv()) {
12237                 vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0);
12238                 vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1);
12239                 vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2);
12240                 vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3);
12241         }
12242
12243         /*
12244          * Set host-state according to L0's settings (vmcs12 is irrelevant here)
12245          * Some constant fields are set here by vmx_set_constant_host_state().
12246          * Other fields are different per CPU, and will be set later when
12247          * vmx_vcpu_load() is called, and when vmx_prepare_switch_to_guest()
12248          * is called.
12249          */
12250         vmx_set_constant_host_state(vmx);
12251
12252         /*
12253          * Set the MSR load/store lists to match L0's settings.
12254          */
12255         vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
12256         vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
12257         vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
12258         vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
12259         vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
12260
12261         set_cr4_guest_host_mask(vmx);
12262
12263         if (kvm_mpx_supported() && vmx->nested.nested_run_pending &&
12264             (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
12265                 vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
12266
12267         if (enable_vpid) {
12268                 if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02)
12269                         vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02);
12270                 else
12271                         vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
12272         }
12273
12274         /*
12275          * L1 may access the L2's PDPTR, so save them to construct vmcs12
12276          */
12277         if (enable_ept) {
12278                 vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
12279                 vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
12280                 vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
12281                 vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
12282         }
12283
12284         if (cpu_has_vmx_msr_bitmap())
12285                 vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap));
12286 }
12287
12288 /*
12289  * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
12290  * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
12291  * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2
12292  * guest in a way that will both be appropriate to L1's requests, and our
12293  * needs. In addition to modifying the active vmcs (which is vmcs02), this
12294  * function also has additional necessary side-effects, like setting various
12295  * vcpu->arch fields.
12296  * Returns 0 on success, 1 on failure. Invalid state exit qualification code
12297  * is assigned to entry_failure_code on failure.
12298  */
12299 static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
12300                           u32 *entry_failure_code)
12301 {
12302         struct vcpu_vmx *vmx = to_vmx(vcpu);
12303         u32 exec_control, vmcs12_exec_ctrl;
12304
12305         if (vmx->nested.dirty_vmcs12) {
12306                 prepare_vmcs02_full(vcpu, vmcs12);
12307                 vmx->nested.dirty_vmcs12 = false;
12308         }
12309
12310         /*
12311          * First, the fields that are shadowed.  This must be kept in sync
12312          * with vmx_shadow_fields.h.
12313          */
12314
12315         vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
12316         vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit);
12317         vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes);
12318         vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base);
12319         vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base);
12320
12321         if (vmx->nested.nested_run_pending &&
12322             (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
12323                 kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
12324                 vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
12325         } else {
12326                 kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
12327                 vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl);
12328         }
12329         if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending ||
12330             !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
12331                 vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs);
12332         if (vmx->nested.nested_run_pending) {
12333                 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
12334                              vmcs12->vm_entry_intr_info_field);
12335                 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
12336                              vmcs12->vm_entry_exception_error_code);
12337                 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
12338                              vmcs12->vm_entry_instruction_len);
12339                 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
12340                              vmcs12->guest_interruptibility_info);
12341                 vmx->loaded_vmcs->nmi_known_unmasked =
12342                         !(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI);
12343         } else {
12344                 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
12345         }
12346         vmx_set_rflags(vcpu, vmcs12->guest_rflags);
12347
12348         exec_control = vmcs12->pin_based_vm_exec_control;
12349
12350         /* Preemption timer setting is computed directly in vmx_vcpu_run.  */
12351         exec_control |= vmcs_config.pin_based_exec_ctrl;
12352         exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
12353         vmx->loaded_vmcs->hv_timer_armed = false;
12354
12355         /* Posted interrupts setting is only taken from vmcs12.  */
12356         if (nested_cpu_has_posted_intr(vmcs12)) {
12357                 vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
12358                 vmx->nested.pi_pending = false;
12359         } else {
12360                 exec_control &= ~PIN_BASED_POSTED_INTR;
12361         }
12362
12363         vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control);
12364
12365         vmx->nested.preemption_timer_expired = false;
12366         if (nested_cpu_has_preemption_timer(vmcs12))
12367                 vmx_start_preemption_timer(vcpu);
12368
12369         if (cpu_has_secondary_exec_ctrls()) {
12370                 exec_control = vmx->secondary_exec_control;
12371
12372                 /* Take the following fields only from vmcs12 */
12373                 exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
12374                                   SECONDARY_EXEC_ENABLE_INVPCID |
12375                                   SECONDARY_EXEC_RDTSCP |
12376                                   SECONDARY_EXEC_XSAVES |
12377                                   SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
12378                                   SECONDARY_EXEC_APIC_REGISTER_VIRT |
12379                                   SECONDARY_EXEC_ENABLE_VMFUNC);
12380                 if (nested_cpu_has(vmcs12,
12381                                    CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) {
12382                         vmcs12_exec_ctrl = vmcs12->secondary_vm_exec_control &
12383                                 ~SECONDARY_EXEC_ENABLE_PML;
12384                         exec_control |= vmcs12_exec_ctrl;
12385                 }
12386
12387                 /* VMCS shadowing for L2 is emulated for now */
12388                 exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
12389
12390                 if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
12391                         vmcs_write16(GUEST_INTR_STATUS,
12392                                 vmcs12->guest_intr_status);
12393
12394                 /*
12395                  * Write an illegal value to APIC_ACCESS_ADDR. Later,
12396                  * nested_get_vmcs12_pages will either fix it up or
12397                  * remove the VM execution control.
12398                  */
12399                 if (exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)
12400                         vmcs_write64(APIC_ACCESS_ADDR, -1ull);
12401
12402                 if (exec_control & SECONDARY_EXEC_ENCLS_EXITING)
12403                         vmcs_write64(ENCLS_EXITING_BITMAP, -1ull);
12404
12405                 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
12406         }
12407
12408         /*
12409          * HOST_RSP is normally set correctly in vmx_vcpu_run() just before
12410          * entry, but only if the current (host) sp changed from the value
12411          * we wrote last (vmx->host_rsp). This cache is no longer relevant
12412          * if we switch vmcs, and rather than hold a separate cache per vmcs,
12413          * here we just force the write to happen on entry.
12414          */
12415         vmx->host_rsp = 0;
12416
12417         exec_control = vmx_exec_control(vmx); /* L0's desires */
12418         exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
12419         exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
12420         exec_control &= ~CPU_BASED_TPR_SHADOW;
12421         exec_control |= vmcs12->cpu_based_vm_exec_control;
12422
12423         /*
12424          * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR. Later, if
12425          * nested_get_vmcs12_pages can't fix it up, the illegal value
12426          * will result in a VM entry failure.
12427          */
12428         if (exec_control & CPU_BASED_TPR_SHADOW) {
12429                 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull);
12430                 vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
12431         } else {
12432 #ifdef CONFIG_X86_64
12433                 exec_control |= CPU_BASED_CR8_LOAD_EXITING |
12434                                 CPU_BASED_CR8_STORE_EXITING;
12435 #endif
12436         }
12437
12438         /*
12439          * A vmexit (to either L1 hypervisor or L0 userspace) is always needed
12440          * for I/O port accesses.
12441          */
12442         exec_control &= ~CPU_BASED_USE_IO_BITMAPS;
12443         exec_control |= CPU_BASED_UNCOND_IO_EXITING;
12444
12445         vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
12446
12447         /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
12448          * bitwise-or of what L1 wants to trap for L2, and what we want to
12449          * trap. Note that CR0.TS also needs updating - we do this later.
12450          */
12451         update_exception_bitmap(vcpu);
12452         vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
12453         vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
12454
12455         /* L2->L1 exit controls are emulated - the hardware exit is to L0 so
12456          * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
12457          * bits are further modified by vmx_set_efer() below.
12458          */
12459         vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
12460
12461         /* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are
12462          * emulated by vmx_set_efer(), below.
12463          */
12464         vm_entry_controls_init(vmx,
12465                 (vmcs12->vm_entry_controls & ~VM_ENTRY_LOAD_IA32_EFER &
12466                         ~VM_ENTRY_IA32E_MODE) |
12467                 (vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE));
12468
12469         if (vmx->nested.nested_run_pending &&
12470             (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) {
12471                 vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
12472                 vcpu->arch.pat = vmcs12->guest_ia32_pat;
12473         } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
12474                 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
12475         }
12476
12477         vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
12478
12479         if (kvm_has_tsc_control)
12480                 decache_tsc_multiplier(vmx);
12481
12482         if (enable_vpid) {
12483                 /*
12484                  * There is no direct mapping between vpid02 and vpid12, the
12485                  * vpid02 is per-vCPU for L0 and reused while the value of
12486                  * vpid12 is changed w/ one invvpid during nested vmentry.
12487                  * The vpid12 is allocated by L1 for L2, so it will not
12488                  * influence global bitmap(for vpid01 and vpid02 allocation)
12489                  * even if spawn a lot of nested vCPUs.
12490                  */
12491                 if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) {
12492                         if (vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
12493                                 vmx->nested.last_vpid = vmcs12->virtual_processor_id;
12494                                 __vmx_flush_tlb(vcpu, vmx->nested.vpid02, true);
12495                         }
12496                 } else {
12497                         vmx_flush_tlb(vcpu, true);
12498                 }
12499         }
12500
12501         if (enable_pml) {
12502                 /*
12503                  * Conceptually we want to copy the PML address and index from
12504                  * vmcs01 here, and then back to vmcs01 on nested vmexit. But,
12505                  * since we always flush the log on each vmexit, this happens
12506                  * to be equivalent to simply resetting the fields in vmcs02.
12507                  */
12508                 ASSERT(vmx->pml_pg);
12509                 vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
12510                 vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
12511         }
12512
12513         if (nested_cpu_has_ept(vmcs12)) {
12514                 if (nested_ept_init_mmu_context(vcpu)) {
12515                         *entry_failure_code = ENTRY_FAIL_DEFAULT;
12516                         return 1;
12517                 }
12518         } else if (nested_cpu_has2(vmcs12,
12519                                    SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
12520                 vmx_flush_tlb(vcpu, true);
12521         }
12522
12523         /*
12524          * This sets GUEST_CR0 to vmcs12->guest_cr0, possibly modifying those
12525          * bits which we consider mandatory enabled.
12526          * The CR0_READ_SHADOW is what L2 should have expected to read given
12527          * the specifications by L1; It's not enough to take
12528          * vmcs12->cr0_read_shadow because on our cr0_guest_host_mask we we
12529          * have more bits than L1 expected.
12530          */
12531         vmx_set_cr0(vcpu, vmcs12->guest_cr0);
12532         vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));
12533
12534         vmx_set_cr4(vcpu, vmcs12->guest_cr4);
12535         vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12));
12536
12537         if (vmx->nested.nested_run_pending &&
12538             (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER))
12539                 vcpu->arch.efer = vmcs12->guest_ia32_efer;
12540         else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
12541                 vcpu->arch.efer |= (EFER_LMA | EFER_LME);
12542         else
12543                 vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
12544         /* Note: modifies VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */
12545         vmx_set_efer(vcpu, vcpu->arch.efer);
12546
12547         /*
12548          * Guest state is invalid and unrestricted guest is disabled,
12549          * which means L1 attempted VMEntry to L2 with invalid state.
12550          * Fail the VMEntry.
12551          */
12552         if (vmx->emulation_required) {
12553                 *entry_failure_code = ENTRY_FAIL_DEFAULT;
12554                 return 1;
12555         }
12556
12557         /* Shadow page tables on either EPT or shadow page tables. */
12558         if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12),
12559                                 entry_failure_code))
12560                 return 1;
12561
12562         if (!enable_ept)
12563                 vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested;
12564
12565         kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp);
12566         kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip);
12567         return 0;
12568 }
12569
12570 static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12)
12571 {
12572         if (!nested_cpu_has_nmi_exiting(vmcs12) &&
12573             nested_cpu_has_virtual_nmis(vmcs12))
12574                 return -EINVAL;
12575
12576         if (!nested_cpu_has_virtual_nmis(vmcs12) &&
12577             nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING))
12578                 return -EINVAL;
12579
12580         return 0;
12581 }
12582
12583 static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
12584 {
12585         struct vcpu_vmx *vmx = to_vmx(vcpu);
12586
12587         if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE &&
12588             vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT)
12589                 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
12590
12591         if (nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id)
12592                 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
12593
12594         if (nested_vmx_check_io_bitmap_controls(vcpu, vmcs12))
12595                 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
12596
12597         if (nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12))
12598                 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
12599
12600         if (nested_vmx_check_apic_access_controls(vcpu, vmcs12))
12601                 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
12602
12603         if (nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12))
12604                 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
12605
12606         if (nested_vmx_check_apicv_controls(vcpu, vmcs12))
12607                 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
12608
12609         if (nested_vmx_check_msr_switch_controls(vcpu, vmcs12))
12610                 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
12611
12612         if (nested_vmx_check_pml_controls(vcpu, vmcs12))
12613                 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
12614
12615         if (nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12))
12616                 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
12617
12618         if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control,
12619                                 vmx->nested.msrs.procbased_ctls_low,
12620                                 vmx->nested.msrs.procbased_ctls_high) ||
12621             (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
12622              !vmx_control_verify(vmcs12->secondary_vm_exec_control,
12623                                  vmx->nested.msrs.secondary_ctls_low,
12624                                  vmx->nested.msrs.secondary_ctls_high)) ||
12625             !vmx_control_verify(vmcs12->pin_based_vm_exec_control,
12626                                 vmx->nested.msrs.pinbased_ctls_low,
12627                                 vmx->nested.msrs.pinbased_ctls_high) ||
12628             !vmx_control_verify(vmcs12->vm_exit_controls,
12629                                 vmx->nested.msrs.exit_ctls_low,
12630                                 vmx->nested.msrs.exit_ctls_high) ||
12631             !vmx_control_verify(vmcs12->vm_entry_controls,
12632                                 vmx->nested.msrs.entry_ctls_low,
12633                                 vmx->nested.msrs.entry_ctls_high))
12634                 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
12635
12636         if (nested_vmx_check_nmi_controls(vmcs12))
12637                 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
12638
12639         if (nested_cpu_has_vmfunc(vmcs12)) {
12640                 if (vmcs12->vm_function_control &
12641                     ~vmx->nested.msrs.vmfunc_controls)
12642                         return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
12643
12644                 if (nested_cpu_has_eptp_switching(vmcs12)) {
12645                         if (!nested_cpu_has_ept(vmcs12) ||
12646                             !page_address_valid(vcpu, vmcs12->eptp_list_address))
12647                                 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
12648                 }
12649         }
12650
12651         if (vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu))
12652                 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
12653
12654         if (!nested_host_cr0_valid(vcpu, vmcs12->host_cr0) ||
12655             !nested_host_cr4_valid(vcpu, vmcs12->host_cr4) ||
12656             !nested_cr3_valid(vcpu, vmcs12->host_cr3))
12657                 return VMXERR_ENTRY_INVALID_HOST_STATE_FIELD;
12658
12659         /*
12660          * From the Intel SDM, volume 3:
12661          * Fields relevant to VM-entry event injection must be set properly.
12662          * These fields are the VM-entry interruption-information field, the
12663          * VM-entry exception error code, and the VM-entry instruction length.
12664          */
12665         if (vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) {
12666                 u32 intr_info = vmcs12->vm_entry_intr_info_field;
12667                 u8 vector = intr_info & INTR_INFO_VECTOR_MASK;
12668                 u32 intr_type = intr_info & INTR_INFO_INTR_TYPE_MASK;
12669                 bool has_error_code = intr_info & INTR_INFO_DELIVER_CODE_MASK;
12670                 bool should_have_error_code;
12671                 bool urg = nested_cpu_has2(vmcs12,
12672                                            SECONDARY_EXEC_UNRESTRICTED_GUEST);
12673                 bool prot_mode = !urg || vmcs12->guest_cr0 & X86_CR0_PE;
12674
12675                 /* VM-entry interruption-info field: interruption type */
12676                 if (intr_type == INTR_TYPE_RESERVED ||
12677                     (intr_type == INTR_TYPE_OTHER_EVENT &&
12678                      !nested_cpu_supports_monitor_trap_flag(vcpu)))
12679                         return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
12680
12681                 /* VM-entry interruption-info field: vector */
12682                 if ((intr_type == INTR_TYPE_NMI_INTR && vector != NMI_VECTOR) ||
12683                     (intr_type == INTR_TYPE_HARD_EXCEPTION && vector > 31) ||
12684                     (intr_type == INTR_TYPE_OTHER_EVENT && vector != 0))
12685                         return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
12686
12687                 /* VM-entry interruption-info field: deliver error code */
12688                 should_have_error_code =
12689                         intr_type == INTR_TYPE_HARD_EXCEPTION && prot_mode &&
12690                         x86_exception_has_error_code(vector);
12691                 if (has_error_code != should_have_error_code)
12692                         return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
12693
12694                 /* VM-entry exception error code */
12695                 if (has_error_code &&
12696                     vmcs12->vm_entry_exception_error_code & GENMASK(31, 16))
12697                         return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
12698
12699                 /* VM-entry interruption-info field: reserved bits */
12700                 if (intr_info & INTR_INFO_RESVD_BITS_MASK)
12701                         return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
12702
12703                 /* VM-entry instruction length */
12704                 switch (intr_type) {
12705                 case INTR_TYPE_SOFT_EXCEPTION:
12706                 case INTR_TYPE_SOFT_INTR:
12707                 case INTR_TYPE_PRIV_SW_EXCEPTION:
12708                         if ((vmcs12->vm_entry_instruction_len > 15) ||
12709                             (vmcs12->vm_entry_instruction_len == 0 &&
12710                              !nested_cpu_has_zero_length_injection(vcpu)))
12711                                 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
12712                 }
12713         }
12714
12715         return 0;
12716 }
12717
12718 static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu,
12719                                           struct vmcs12 *vmcs12)
12720 {
12721         int r;
12722         struct page *page;
12723         struct vmcs12 *shadow;
12724
12725         if (vmcs12->vmcs_link_pointer == -1ull)
12726                 return 0;
12727
12728         if (!page_address_valid(vcpu, vmcs12->vmcs_link_pointer))
12729                 return -EINVAL;
12730
12731         page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->vmcs_link_pointer);
12732         if (is_error_page(page))
12733                 return -EINVAL;
12734
12735         r = 0;
12736         shadow = kmap(page);
12737         if (shadow->hdr.revision_id != VMCS12_REVISION ||
12738             shadow->hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12))
12739                 r = -EINVAL;
12740         kunmap(page);
12741         kvm_release_page_clean(page);
12742         return r;
12743 }
12744
12745 static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
12746                                   u32 *exit_qual)
12747 {
12748         bool ia32e;
12749
12750         *exit_qual = ENTRY_FAIL_DEFAULT;
12751
12752         if (!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0) ||
12753             !nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4))
12754                 return 1;
12755
12756         if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) {
12757                 *exit_qual = ENTRY_FAIL_VMCS_LINK_PTR;
12758                 return 1;
12759         }
12760
12761         /*
12762          * If the load IA32_EFER VM-entry control is 1, the following checks
12763          * are performed on the field for the IA32_EFER MSR:
12764          * - Bits reserved in the IA32_EFER MSR must be 0.
12765          * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of
12766          *   the IA-32e mode guest VM-exit control. It must also be identical
12767          *   to bit 8 (LME) if bit 31 in the CR0 field (corresponding to
12768          *   CR0.PG) is 1.
12769          */
12770         if (to_vmx(vcpu)->nested.nested_run_pending &&
12771             (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) {
12772                 ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0;
12773                 if (!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer) ||
12774                     ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA) ||
12775                     ((vmcs12->guest_cr0 & X86_CR0_PG) &&
12776                      ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME)))
12777                         return 1;
12778         }
12779
12780         /*
12781          * If the load IA32_EFER VM-exit control is 1, bits reserved in the
12782          * IA32_EFER MSR must be 0 in the field for that register. In addition,
12783          * the values of the LMA and LME bits in the field must each be that of
12784          * the host address-space size VM-exit control.
12785          */
12786         if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) {
12787                 ia32e = (vmcs12->vm_exit_controls &
12788                          VM_EXIT_HOST_ADDR_SPACE_SIZE) != 0;
12789                 if (!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer) ||
12790                     ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA) ||
12791                     ia32e != !!(vmcs12->host_ia32_efer & EFER_LME))
12792                         return 1;
12793         }
12794
12795         if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) &&
12796                 (is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu) ||
12797                 (vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD)))
12798                         return 1;
12799
12800         return 0;
12801 }
12802
12803 /*
12804  * If exit_qual is NULL, this is being called from state restore (either RSM
12805  * or KVM_SET_NESTED_STATE).  Otherwise it's called from vmlaunch/vmresume.
12806  */
12807 static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, u32 *exit_qual)
12808 {
12809         struct vcpu_vmx *vmx = to_vmx(vcpu);
12810         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
12811         bool from_vmentry = !!exit_qual;
12812         u32 dummy_exit_qual;
12813         bool evaluate_pending_interrupts;
12814         int r = 0;
12815
12816         evaluate_pending_interrupts = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) &
12817                 (CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_VIRTUAL_NMI_PENDING);
12818         if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu))
12819                 evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu);
12820
12821         enter_guest_mode(vcpu);
12822
12823         if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
12824                 vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
12825         if (kvm_mpx_supported() &&
12826                 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
12827                 vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
12828
12829         vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);
12830         vmx_segment_cache_clear(vmx);
12831
12832         if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
12833                 vcpu->arch.tsc_offset += vmcs12->tsc_offset;
12834
12835         r = EXIT_REASON_INVALID_STATE;
12836         if (prepare_vmcs02(vcpu, vmcs12, from_vmentry ? exit_qual : &dummy_exit_qual))
12837                 goto fail;
12838
12839         if (from_vmentry) {
12840                 nested_get_vmcs12_pages(vcpu);
12841
12842                 r = EXIT_REASON_MSR_LOAD_FAIL;
12843                 *exit_qual = nested_vmx_load_msr(vcpu,
12844                                                  vmcs12->vm_entry_msr_load_addr,
12845                                                  vmcs12->vm_entry_msr_load_count);
12846                 if (*exit_qual)
12847                         goto fail;
12848         } else {
12849                 /*
12850                  * The MMU is not initialized to point at the right entities yet and
12851                  * "get pages" would need to read data from the guest (i.e. we will
12852                  * need to perform gpa to hpa translation). Request a call
12853                  * to nested_get_vmcs12_pages before the next VM-entry.  The MSRs
12854                  * have already been set at vmentry time and should not be reset.
12855                  */
12856                 kvm_make_request(KVM_REQ_GET_VMCS12_PAGES, vcpu);
12857         }
12858
12859         /*
12860          * If L1 had a pending IRQ/NMI until it executed
12861          * VMLAUNCH/VMRESUME which wasn't delivered because it was
12862          * disallowed (e.g. interrupts disabled), L0 needs to
12863          * evaluate if this pending event should cause an exit from L2
12864          * to L1 or delivered directly to L2 (e.g. In case L1 don't
12865          * intercept EXTERNAL_INTERRUPT).
12866          *
12867          * Usually this would be handled by the processor noticing an
12868          * IRQ/NMI window request, or checking RVI during evaluation of
12869          * pending virtual interrupts.  However, this setting was done
12870          * on VMCS01 and now VMCS02 is active instead. Thus, we force L0
12871          * to perform pending event evaluation by requesting a KVM_REQ_EVENT.
12872          */
12873         if (unlikely(evaluate_pending_interrupts))
12874                 kvm_make_request(KVM_REQ_EVENT, vcpu);
12875
12876         /*
12877          * Note no nested_vmx_succeed or nested_vmx_fail here. At this point
12878          * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet
12879          * returned as far as L1 is concerned. It will only return (and set
12880          * the success flag) when L2 exits (see nested_vmx_vmexit()).
12881          */
12882         return 0;
12883
12884 fail:
12885         if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
12886                 vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
12887         leave_guest_mode(vcpu);
12888         vmx_switch_vmcs(vcpu, &vmx->vmcs01);
12889         return r;
12890 }
12891
12892 /*
12893  * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1
12894  * for running an L2 nested guest.
12895  */
12896 static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
12897 {
12898         struct vmcs12 *vmcs12;
12899         struct vcpu_vmx *vmx = to_vmx(vcpu);
12900         u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu);
12901         u32 exit_qual;
12902         int ret;
12903
12904         if (!nested_vmx_check_permission(vcpu))
12905                 return 1;
12906
12907         if (!nested_vmx_check_vmcs12(vcpu))
12908                 goto out;
12909
12910         vmcs12 = get_vmcs12(vcpu);
12911
12912         /*
12913          * Can't VMLAUNCH or VMRESUME a shadow VMCS. Despite the fact
12914          * that there *is* a valid VMCS pointer, RFLAGS.CF is set
12915          * rather than RFLAGS.ZF, and no error number is stored to the
12916          * VM-instruction error field.
12917          */
12918         if (vmcs12->hdr.shadow_vmcs) {
12919                 nested_vmx_failInvalid(vcpu);
12920                 goto out;
12921         }
12922
12923         if (enable_shadow_vmcs)
12924                 copy_shadow_to_vmcs12(vmx);
12925
12926         /*
12927          * The nested entry process starts with enforcing various prerequisites
12928          * on vmcs12 as required by the Intel SDM, and act appropriately when
12929          * they fail: As the SDM explains, some conditions should cause the
12930          * instruction to fail, while others will cause the instruction to seem
12931          * to succeed, but return an EXIT_REASON_INVALID_STATE.
12932          * To speed up the normal (success) code path, we should avoid checking
12933          * for misconfigurations which will anyway be caught by the processor
12934          * when using the merged vmcs02.
12935          */
12936         if (interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS) {
12937                 nested_vmx_failValid(vcpu,
12938                                      VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS);
12939                 goto out;
12940         }
12941
12942         if (vmcs12->launch_state == launch) {
12943                 nested_vmx_failValid(vcpu,
12944                         launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
12945                                : VMXERR_VMRESUME_NONLAUNCHED_VMCS);
12946                 goto out;
12947         }
12948
12949         ret = check_vmentry_prereqs(vcpu, vmcs12);
12950         if (ret) {
12951                 nested_vmx_failValid(vcpu, ret);
12952                 goto out;
12953         }
12954
12955         /*
12956          * After this point, the trap flag no longer triggers a singlestep trap
12957          * on the vm entry instructions; don't call kvm_skip_emulated_instruction.
12958          * This is not 100% correct; for performance reasons, we delegate most
12959          * of the checks on host state to the processor.  If those fail,
12960          * the singlestep trap is missed.
12961          */
12962         skip_emulated_instruction(vcpu);
12963
12964         ret = check_vmentry_postreqs(vcpu, vmcs12, &exit_qual);
12965         if (ret) {
12966                 nested_vmx_entry_failure(vcpu, vmcs12,
12967                                          EXIT_REASON_INVALID_STATE, exit_qual);
12968                 return 1;
12969         }
12970
12971         /*
12972          * We're finally done with prerequisite checking, and can start with
12973          * the nested entry.
12974          */
12975
12976         vmx->nested.nested_run_pending = 1;
12977         ret = enter_vmx_non_root_mode(vcpu, &exit_qual);
12978         if (ret) {
12979                 nested_vmx_entry_failure(vcpu, vmcs12, ret, exit_qual);
12980                 vmx->nested.nested_run_pending = 0;
12981                 return 1;
12982         }
12983
12984         /* Hide L1D cache contents from the nested guest.  */
12985         vmx->vcpu.arch.l1tf_flush_l1d = true;
12986
12987         /*
12988          * Must happen outside of enter_vmx_non_root_mode() as it will
12989          * also be used as part of restoring nVMX state for
12990          * snapshot restore (migration).
12991          *
12992          * In this flow, it is assumed that vmcs12 cache was
12993          * trasferred as part of captured nVMX state and should
12994          * therefore not be read from guest memory (which may not
12995          * exist on destination host yet).
12996          */
12997         nested_cache_shadow_vmcs12(vcpu, vmcs12);
12998
12999         /*
13000          * If we're entering a halted L2 vcpu and the L2 vcpu won't be
13001          * awakened by event injection or by an NMI-window VM-exit or
13002          * by an interrupt-window VM-exit, halt the vcpu.
13003          */
13004         if ((vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) &&
13005             !(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) &&
13006             !(vmcs12->cpu_based_vm_exec_control & CPU_BASED_VIRTUAL_NMI_PENDING) &&
13007             !((vmcs12->cpu_based_vm_exec_control & CPU_BASED_VIRTUAL_INTR_PENDING) &&
13008               (vmcs12->guest_rflags & X86_EFLAGS_IF))) {
13009                 vmx->nested.nested_run_pending = 0;
13010                 return kvm_vcpu_halt(vcpu);
13011         }
13012         return 1;
13013
13014 out:
13015         return kvm_skip_emulated_instruction(vcpu);
13016 }
13017
13018 /*
13019  * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date
13020  * because L2 may have changed some cr0 bits directly (CRO_GUEST_HOST_MASK).
13021  * This function returns the new value we should put in vmcs12.guest_cr0.
13022  * It's not enough to just return the vmcs02 GUEST_CR0. Rather,
13023  *  1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now
13024  *     available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0
13025  *     didn't trap the bit, because if L1 did, so would L0).
13026  *  2. Bits that L1 asked to trap (and therefore L0 also did) could not have
13027  *     been modified by L2, and L1 knows it. So just leave the old value of
13028  *     the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0
13029  *     isn't relevant, because if L0 traps this bit it can set it to anything.
13030  *  3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have
13031  *     changed these bits, and therefore they need to be updated, but L0
13032  *     didn't necessarily allow them to be changed in GUEST_CR0 - and rather
13033  *     put them in vmcs02 CR0_READ_SHADOW. So take these bits from there.
13034  */
13035 static inline unsigned long
13036 vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
13037 {
13038         return
13039         /*1*/   (vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) |
13040         /*2*/   (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) |
13041         /*3*/   (vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask |
13042                         vcpu->arch.cr0_guest_owned_bits));
13043 }
13044
13045 static inline unsigned long
13046 vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
13047 {
13048         return
13049         /*1*/   (vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) |
13050         /*2*/   (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) |
13051         /*3*/   (vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask |
13052                         vcpu->arch.cr4_guest_owned_bits));
13053 }
13054
13055 static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
13056                                        struct vmcs12 *vmcs12)
13057 {
13058         u32 idt_vectoring;
13059         unsigned int nr;
13060
13061         if (vcpu->arch.exception.injected) {
13062                 nr = vcpu->arch.exception.nr;
13063                 idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
13064
13065                 if (kvm_exception_is_soft(nr)) {
13066                         vmcs12->vm_exit_instruction_len =
13067                                 vcpu->arch.event_exit_inst_len;
13068                         idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION;
13069                 } else
13070                         idt_vectoring |= INTR_TYPE_HARD_EXCEPTION;
13071
13072                 if (vcpu->arch.exception.has_error_code) {
13073                         idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK;
13074                         vmcs12->idt_vectoring_error_code =
13075                                 vcpu->arch.exception.error_code;
13076                 }
13077
13078                 vmcs12->idt_vectoring_info_field = idt_vectoring;
13079         } else if (vcpu->arch.nmi_injected) {
13080                 vmcs12->idt_vectoring_info_field =
13081                         INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR;
13082         } else if (vcpu->arch.interrupt.injected) {
13083                 nr = vcpu->arch.interrupt.nr;
13084                 idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
13085
13086                 if (vcpu->arch.interrupt.soft) {
13087                         idt_vectoring |= INTR_TYPE_SOFT_INTR;
13088                         vmcs12->vm_entry_instruction_len =
13089                                 vcpu->arch.event_exit_inst_len;
13090                 } else
13091                         idt_vectoring |= INTR_TYPE_EXT_INTR;
13092
13093                 vmcs12->idt_vectoring_info_field = idt_vectoring;
13094         }
13095 }
13096
13097 static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
13098 {
13099         struct vcpu_vmx *vmx = to_vmx(vcpu);
13100         unsigned long exit_qual;
13101         bool block_nested_events =
13102             vmx->nested.nested_run_pending || kvm_event_needs_reinjection(vcpu);
13103
13104         if (vcpu->arch.exception.pending &&
13105                 nested_vmx_check_exception(vcpu, &exit_qual)) {
13106                 if (block_nested_events)
13107                         return -EBUSY;
13108                 nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
13109                 return 0;
13110         }
13111
13112         if (nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) &&
13113             vmx->nested.preemption_timer_expired) {
13114                 if (block_nested_events)
13115                         return -EBUSY;
13116                 nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0);
13117                 return 0;
13118         }
13119
13120         if (vcpu->arch.nmi_pending && nested_exit_on_nmi(vcpu)) {
13121                 if (block_nested_events)
13122                         return -EBUSY;
13123                 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
13124                                   NMI_VECTOR | INTR_TYPE_NMI_INTR |
13125                                   INTR_INFO_VALID_MASK, 0);
13126                 /*
13127                  * The NMI-triggered VM exit counts as injection:
13128                  * clear this one and block further NMIs.
13129                  */
13130                 vcpu->arch.nmi_pending = 0;
13131                 vmx_set_nmi_mask(vcpu, true);
13132                 return 0;
13133         }
13134
13135         if (kvm_cpu_has_interrupt(vcpu) && nested_exit_on_intr(vcpu)) {
13136                 if (block_nested_events)
13137                         return -EBUSY;
13138                 nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0);
13139                 return 0;
13140         }
13141
13142         vmx_complete_nested_posted_interrupt(vcpu);
13143         return 0;
13144 }
13145
13146 static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu)
13147 {
13148         to_vmx(vcpu)->req_immediate_exit = true;
13149 }
13150
13151 static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu)
13152 {
13153         ktime_t remaining =
13154                 hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer);
13155         u64 value;
13156
13157         if (ktime_to_ns(remaining) <= 0)
13158                 return 0;
13159
13160         value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz;
13161         do_div(value, 1000000);
13162         return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
13163 }
13164
13165 /*
13166  * Update the guest state fields of vmcs12 to reflect changes that
13167  * occurred while L2 was running. (The "IA-32e mode guest" bit of the
13168  * VM-entry controls is also updated, since this is really a guest
13169  * state bit.)
13170  */
13171 static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
13172 {
13173         vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12);
13174         vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12);
13175
13176         vmcs12->guest_rsp = kvm_register_read(vcpu, VCPU_REGS_RSP);
13177         vmcs12->guest_rip = kvm_register_read(vcpu, VCPU_REGS_RIP);
13178         vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS);
13179
13180         vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
13181         vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
13182         vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
13183         vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
13184         vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
13185         vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
13186         vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
13187         vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
13188         vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT);
13189         vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT);
13190         vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT);
13191         vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT);
13192         vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT);
13193         vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT);
13194         vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT);
13195         vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT);
13196         vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
13197         vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
13198         vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
13199         vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
13200         vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
13201         vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
13202         vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
13203         vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
13204         vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES);
13205         vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES);
13206         vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE);
13207         vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE);
13208         vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE);
13209         vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE);
13210         vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE);
13211         vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE);
13212         vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE);
13213         vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
13214         vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
13215         vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
13216
13217         vmcs12->guest_interruptibility_info =
13218                 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
13219         vmcs12->guest_pending_dbg_exceptions =
13220                 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
13221         if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
13222                 vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT;
13223         else
13224                 vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE;
13225
13226         if (nested_cpu_has_preemption_timer(vmcs12)) {
13227                 if (vmcs12->vm_exit_controls &
13228                     VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
13229                         vmcs12->vmx_preemption_timer_value =
13230                                 vmx_get_preemption_timer_value(vcpu);
13231                 hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer);
13232         }
13233
13234         /*
13235          * In some cases (usually, nested EPT), L2 is allowed to change its
13236          * own CR3 without exiting. If it has changed it, we must keep it.
13237          * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined
13238          * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12.
13239          *
13240          * Additionally, restore L2's PDPTR to vmcs12.
13241          */
13242         if (enable_ept) {
13243                 vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3);
13244                 vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0);
13245                 vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1);
13246                 vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2);
13247                 vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
13248         }
13249
13250         vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);
13251
13252         if (nested_cpu_has_vid(vmcs12))
13253                 vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS);
13254
13255         vmcs12->vm_entry_controls =
13256                 (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
13257                 (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE);
13258
13259         if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) {
13260                 kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7);
13261                 vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
13262         }
13263
13264         /* TODO: These cannot have changed unless we have MSR bitmaps and
13265          * the relevant bit asks not to trap the change */
13266         if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT)
13267                 vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT);
13268         if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER)
13269                 vmcs12->guest_ia32_efer = vcpu->arch.efer;
13270         vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
13271         vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
13272         vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
13273         if (kvm_mpx_supported())
13274                 vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
13275 }
13276
13277 /*
13278  * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits
13279  * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12),
13280  * and this function updates it to reflect the changes to the guest state while
13281  * L2 was running (and perhaps made some exits which were handled directly by L0
13282  * without going back to L1), and to reflect the exit reason.
13283  * Note that we do not have to copy here all VMCS fields, just those that
13284  * could have changed by the L2 guest or the exit - i.e., the guest-state and
13285  * exit-information fields only. Other fields are modified by L1 with VMWRITE,
13286  * which already writes to vmcs12 directly.
13287  */
13288 static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
13289                            u32 exit_reason, u32 exit_intr_info,
13290                            unsigned long exit_qualification)
13291 {
13292         /* update guest state fields: */
13293         sync_vmcs12(vcpu, vmcs12);
13294
13295         /* update exit information fields: */
13296
13297         vmcs12->vm_exit_reason = exit_reason;
13298         vmcs12->exit_qualification = exit_qualification;
13299         vmcs12->vm_exit_intr_info = exit_intr_info;
13300
13301         vmcs12->idt_vectoring_info_field = 0;
13302         vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
13303         vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
13304
13305         if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) {
13306                 vmcs12->launch_state = 1;
13307
13308                 /* vm_entry_intr_info_field is cleared on exit. Emulate this
13309                  * instead of reading the real value. */
13310                 vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK;
13311
13312                 /*
13313                  * Transfer the event that L0 or L1 may wanted to inject into
13314                  * L2 to IDT_VECTORING_INFO_FIELD.
13315                  */
13316                 vmcs12_save_pending_event(vcpu, vmcs12);
13317         }
13318 }
13319
13320 /*
13321  * A part of what we need to when the nested L2 guest exits and we want to
13322  * run its L1 parent, is to reset L1's guest state to the host state specified
13323  * in vmcs12.
13324  * This function is to be called not only on normal nested exit, but also on
13325  * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry
13326  * Failures During or After Loading Guest State").
13327  * This function should be called when the active VMCS is L1's (vmcs01).
13328  */
13329 static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
13330                                    struct vmcs12 *vmcs12)
13331 {
13332         struct kvm_segment seg;
13333         u32 entry_failure_code;
13334
13335         if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
13336                 vcpu->arch.efer = vmcs12->host_ia32_efer;
13337         else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
13338                 vcpu->arch.efer |= (EFER_LMA | EFER_LME);
13339         else
13340                 vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
13341         vmx_set_efer(vcpu, vcpu->arch.efer);
13342
13343         kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->host_rsp);
13344         kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->host_rip);
13345         vmx_set_rflags(vcpu, X86_EFLAGS_FIXED);
13346         /*
13347          * Note that calling vmx_set_cr0 is important, even if cr0 hasn't
13348          * actually changed, because vmx_set_cr0 refers to efer set above.
13349          *
13350          * CR0_GUEST_HOST_MASK is already set in the original vmcs01
13351          * (KVM doesn't change it);
13352          */
13353         vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
13354         vmx_set_cr0(vcpu, vmcs12->host_cr0);
13355
13356         /* Same as above - no reason to call set_cr4_guest_host_mask().  */
13357         vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
13358         vmx_set_cr4(vcpu, vmcs12->host_cr4);
13359
13360         nested_ept_uninit_mmu_context(vcpu);
13361
13362         /*
13363          * Only PDPTE load can fail as the value of cr3 was checked on entry and
13364          * couldn't have changed.
13365          */
13366         if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &entry_failure_code))
13367                 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL);
13368
13369         if (!enable_ept)
13370                 vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
13371
13372         /*
13373          * If vmcs01 don't use VPID, CPU flushes TLB on every
13374          * VMEntry/VMExit. Thus, no need to flush TLB.
13375          *
13376          * If vmcs12 uses VPID, TLB entries populated by L2 are
13377          * tagged with vmx->nested.vpid02 while L1 entries are tagged
13378          * with vmx->vpid. Thus, no need to flush TLB.
13379          *
13380          * Therefore, flush TLB only in case vmcs01 uses VPID and
13381          * vmcs12 don't use VPID as in this case L1 & L2 TLB entries
13382          * are both tagged with vmx->vpid.
13383          */
13384         if (enable_vpid &&
13385             !(nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02)) {
13386                 vmx_flush_tlb(vcpu, true);
13387         }
13388
13389         vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs);
13390         vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp);
13391         vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip);
13392         vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base);
13393         vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base);
13394         vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF);
13395         vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF);
13396
13397         /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1.  */
13398         if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS)
13399                 vmcs_write64(GUEST_BNDCFGS, 0);
13400
13401         if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) {
13402                 vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat);
13403                 vcpu->arch.pat = vmcs12->host_ia32_pat;
13404         }
13405         if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
13406                 vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL,
13407                         vmcs12->host_ia32_perf_global_ctrl);
13408
13409         /* Set L1 segment info according to Intel SDM
13410             27.5.2 Loading Host Segment and Descriptor-Table Registers */
13411         seg = (struct kvm_segment) {
13412                 .base = 0,
13413                 .limit = 0xFFFFFFFF,
13414                 .selector = vmcs12->host_cs_selector,
13415                 .type = 11,
13416                 .present = 1,
13417                 .s = 1,
13418                 .g = 1
13419         };
13420         if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
13421                 seg.l = 1;
13422         else
13423                 seg.db = 1;
13424         vmx_set_segment(vcpu, &seg, VCPU_SREG_CS);
13425         seg = (struct kvm_segment) {
13426                 .base = 0,
13427                 .limit = 0xFFFFFFFF,
13428                 .type = 3,
13429                 .present = 1,
13430                 .s = 1,
13431                 .db = 1,
13432                 .g = 1
13433         };
13434         seg.selector = vmcs12->host_ds_selector;
13435         vmx_set_segment(vcpu, &seg, VCPU_SREG_DS);
13436         seg.selector = vmcs12->host_es_selector;
13437         vmx_set_segment(vcpu, &seg, VCPU_SREG_ES);
13438         seg.selector = vmcs12->host_ss_selector;
13439         vmx_set_segment(vcpu, &seg, VCPU_SREG_SS);
13440         seg.selector = vmcs12->host_fs_selector;
13441         seg.base = vmcs12->host_fs_base;
13442         vmx_set_segment(vcpu, &seg, VCPU_SREG_FS);
13443         seg.selector = vmcs12->host_gs_selector;
13444         seg.base = vmcs12->host_gs_base;
13445         vmx_set_segment(vcpu, &seg, VCPU_SREG_GS);
13446         seg = (struct kvm_segment) {
13447                 .base = vmcs12->host_tr_base,
13448                 .limit = 0x67,
13449                 .selector = vmcs12->host_tr_selector,
13450                 .type = 11,
13451                 .present = 1
13452         };
13453         vmx_set_segment(vcpu, &seg, VCPU_SREG_TR);
13454
13455         kvm_set_dr(vcpu, 7, 0x400);
13456         vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
13457
13458         if (cpu_has_vmx_msr_bitmap())
13459                 vmx_update_msr_bitmap(vcpu);
13460
13461         if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
13462                                 vmcs12->vm_exit_msr_load_count))
13463                 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
13464 }
13465
13466 static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx)
13467 {
13468         struct shared_msr_entry *efer_msr;
13469         unsigned int i;
13470
13471         if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER)
13472                 return vmcs_read64(GUEST_IA32_EFER);
13473
13474         if (cpu_has_load_ia32_efer)
13475                 return host_efer;
13476
13477         for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) {
13478                 if (vmx->msr_autoload.guest.val[i].index == MSR_EFER)
13479                         return vmx->msr_autoload.guest.val[i].value;
13480         }
13481
13482         efer_msr = find_msr_entry(vmx, MSR_EFER);
13483         if (efer_msr)
13484                 return efer_msr->data;
13485
13486         return host_efer;
13487 }
13488
13489 static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
13490 {
13491         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
13492         struct vcpu_vmx *vmx = to_vmx(vcpu);
13493         struct vmx_msr_entry g, h;
13494         struct msr_data msr;
13495         gpa_t gpa;
13496         u32 i, j;
13497
13498         vcpu->arch.pat = vmcs_read64(GUEST_IA32_PAT);
13499
13500         if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) {
13501                 /*
13502                  * L1's host DR7 is lost if KVM_GUESTDBG_USE_HW_BP is set
13503                  * as vmcs01.GUEST_DR7 contains a userspace defined value
13504                  * and vcpu->arch.dr7 is not squirreled away before the
13505                  * nested VMENTER (not worth adding a variable in nested_vmx).
13506                  */
13507                 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
13508                         kvm_set_dr(vcpu, 7, DR7_FIXED_1);
13509                 else
13510                         WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7)));
13511         }
13512
13513         /*
13514          * Note that calling vmx_set_{efer,cr0,cr4} is important as they
13515          * handle a variety of side effects to KVM's software model.
13516          */
13517         vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx));
13518
13519         vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
13520         vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW));
13521
13522         vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
13523         vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW));
13524
13525         nested_ept_uninit_mmu_context(vcpu);
13526         vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
13527         __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
13528
13529         /*
13530          * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs
13531          * from vmcs01 (if necessary).  The PDPTRs are not loaded on
13532          * VMFail, like everything else we just need to ensure our
13533          * software model is up-to-date.
13534          */
13535         ept_save_pdptrs(vcpu);
13536
13537         kvm_mmu_reset_context(vcpu);
13538
13539         if (cpu_has_vmx_msr_bitmap())
13540                 vmx_update_msr_bitmap(vcpu);
13541
13542         /*
13543          * This nasty bit of open coding is a compromise between blindly
13544          * loading L1's MSRs using the exit load lists (incorrect emulation
13545          * of VMFail), leaving the nested VM's MSRs in the software model
13546          * (incorrect behavior) and snapshotting the modified MSRs (too
13547          * expensive since the lists are unbound by hardware).  For each
13548          * MSR that was (prematurely) loaded from the nested VMEntry load
13549          * list, reload it from the exit load list if it exists and differs
13550          * from the guest value.  The intent is to stuff host state as
13551          * silently as possible, not to fully process the exit load list.
13552          */
13553         msr.host_initiated = false;
13554         for (i = 0; i < vmcs12->vm_entry_msr_load_count; i++) {
13555                 gpa = vmcs12->vm_entry_msr_load_addr + (i * sizeof(g));
13556                 if (kvm_vcpu_read_guest(vcpu, gpa, &g, sizeof(g))) {
13557                         pr_debug_ratelimited(
13558                                 "%s read MSR index failed (%u, 0x%08llx)\n",
13559                                 __func__, i, gpa);
13560                         goto vmabort;
13561                 }
13562
13563                 for (j = 0; j < vmcs12->vm_exit_msr_load_count; j++) {
13564                         gpa = vmcs12->vm_exit_msr_load_addr + (j * sizeof(h));
13565                         if (kvm_vcpu_read_guest(vcpu, gpa, &h, sizeof(h))) {
13566                                 pr_debug_ratelimited(
13567                                         "%s read MSR failed (%u, 0x%08llx)\n",
13568                                         __func__, j, gpa);
13569                                 goto vmabort;
13570                         }
13571                         if (h.index != g.index)
13572                                 continue;
13573                         if (h.value == g.value)
13574                                 break;
13575
13576                         if (nested_vmx_load_msr_check(vcpu, &h)) {
13577                                 pr_debug_ratelimited(
13578                                         "%s check failed (%u, 0x%x, 0x%x)\n",
13579                                         __func__, j, h.index, h.reserved);
13580                                 goto vmabort;
13581                         }
13582
13583                         msr.index = h.index;
13584                         msr.data = h.value;
13585                         if (kvm_set_msr(vcpu, &msr)) {
13586                                 pr_debug_ratelimited(
13587                                         "%s WRMSR failed (%u, 0x%x, 0x%llx)\n",
13588                                         __func__, j, h.index, h.value);
13589                                 goto vmabort;
13590                         }
13591                 }
13592         }
13593
13594         return;
13595
13596 vmabort:
13597         nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
13598 }
13599
13600 /*
13601  * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1
13602  * and modify vmcs12 to make it see what it would expect to see there if
13603  * L2 was its real guest. Must only be called when in L2 (is_guest_mode())
13604  */
13605 static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
13606                               u32 exit_intr_info,
13607                               unsigned long exit_qualification)
13608 {
13609         struct vcpu_vmx *vmx = to_vmx(vcpu);
13610         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
13611
13612         /* trying to cancel vmlaunch/vmresume is a bug */
13613         WARN_ON_ONCE(vmx->nested.nested_run_pending);
13614
13615         /*
13616          * The only expected VM-instruction error is "VM entry with
13617          * invalid control field(s)." Anything else indicates a
13618          * problem with L0.
13619          */
13620         WARN_ON_ONCE(vmx->fail && (vmcs_read32(VM_INSTRUCTION_ERROR) !=
13621                                    VMXERR_ENTRY_INVALID_CONTROL_FIELD));
13622
13623         leave_guest_mode(vcpu);
13624
13625         if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
13626                 vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
13627
13628         if (likely(!vmx->fail)) {
13629                 if (exit_reason == -1)
13630                         sync_vmcs12(vcpu, vmcs12);
13631                 else
13632                         prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info,
13633                                        exit_qualification);
13634
13635                 /*
13636                  * Must happen outside of sync_vmcs12() as it will
13637                  * also be used to capture vmcs12 cache as part of
13638                  * capturing nVMX state for snapshot (migration).
13639                  *
13640                  * Otherwise, this flush will dirty guest memory at a
13641                  * point it is already assumed by user-space to be
13642                  * immutable.
13643                  */
13644                 nested_flush_cached_shadow_vmcs12(vcpu, vmcs12);
13645
13646                 if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr,
13647                                          vmcs12->vm_exit_msr_store_count))
13648                         nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL);
13649         }
13650
13651         /*
13652          * Drop events/exceptions that were queued for re-injection to L2
13653          * (picked up via vmx_complete_interrupts()), as well as exceptions
13654          * that were pending for L2.  Note, this must NOT be hoisted above
13655          * prepare_vmcs12(), events/exceptions queued for re-injection need to
13656          * be captured in vmcs12 (see vmcs12_save_pending_event()).
13657          */
13658         vcpu->arch.nmi_injected = false;
13659         kvm_clear_exception_queue(vcpu);
13660         kvm_clear_interrupt_queue(vcpu);
13661
13662         vmx_switch_vmcs(vcpu, &vmx->vmcs01);
13663         vm_entry_controls_reset_shadow(vmx);
13664         vm_exit_controls_reset_shadow(vmx);
13665         vmx_segment_cache_clear(vmx);
13666
13667         /* Update any VMCS fields that might have changed while L2 ran */
13668         vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
13669         vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
13670         vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
13671
13672         if (kvm_has_tsc_control)
13673                 decache_tsc_multiplier(vmx);
13674
13675         if (vmx->nested.change_vmcs01_virtual_apic_mode) {
13676                 vmx->nested.change_vmcs01_virtual_apic_mode = false;
13677                 vmx_set_virtual_apic_mode(vcpu);
13678         } else if (!nested_cpu_has_ept(vmcs12) &&
13679                    nested_cpu_has2(vmcs12,
13680                                    SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
13681                 vmx_flush_tlb(vcpu, true);
13682         }
13683
13684         /* This is needed for same reason as it was needed in prepare_vmcs02 */
13685         vmx->host_rsp = 0;
13686
13687         /* Unpin physical memory we referred to in vmcs02 */
13688         if (vmx->nested.apic_access_page) {
13689                 kvm_release_page_dirty(vmx->nested.apic_access_page);
13690                 vmx->nested.apic_access_page = NULL;
13691         }
13692         if (vmx->nested.virtual_apic_page) {
13693                 kvm_release_page_dirty(vmx->nested.virtual_apic_page);
13694                 vmx->nested.virtual_apic_page = NULL;
13695         }
13696         if (vmx->nested.pi_desc_page) {
13697                 kunmap(vmx->nested.pi_desc_page);
13698                 kvm_release_page_dirty(vmx->nested.pi_desc_page);
13699                 vmx->nested.pi_desc_page = NULL;
13700                 vmx->nested.pi_desc = NULL;
13701         }
13702
13703         /*
13704          * We are now running in L2, mmu_notifier will force to reload the
13705          * page's hpa for L2 vmcs. Need to reload it for L1 before entering L1.
13706          */
13707         kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
13708
13709         if (enable_shadow_vmcs && exit_reason != -1)
13710                 vmx->nested.sync_shadow_vmcs = true;
13711
13712         /* in case we halted in L2 */
13713         vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
13714
13715         if (likely(!vmx->fail)) {
13716                 if (exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT &&
13717                     nested_exit_intr_ack_set(vcpu)) {
13718                         int irq = kvm_cpu_get_interrupt(vcpu);
13719                         WARN_ON(irq < 0);
13720                         vmcs12->vm_exit_intr_info = irq |
13721                                 INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR;
13722                 }
13723
13724                 if (exit_reason != -1)
13725                         trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason,
13726                                                        vmcs12->exit_qualification,
13727                                                        vmcs12->idt_vectoring_info_field,
13728                                                        vmcs12->vm_exit_intr_info,
13729                                                        vmcs12->vm_exit_intr_error_code,
13730                                                        KVM_ISA_VMX);
13731
13732                 load_vmcs12_host_state(vcpu, vmcs12);
13733
13734                 return;
13735         }
13736
13737         /*
13738          * After an early L2 VM-entry failure, we're now back
13739          * in L1 which thinks it just finished a VMLAUNCH or
13740          * VMRESUME instruction, so we need to set the failure
13741          * flag and the VM-instruction error field of the VMCS
13742          * accordingly.
13743          */
13744         nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
13745
13746         /*
13747          * Restore L1's host state to KVM's software model.  We're here
13748          * because a consistency check was caught by hardware, which
13749          * means some amount of guest state has been propagated to KVM's
13750          * model and needs to be unwound to the host's state.
13751          */
13752         nested_vmx_restore_host_state(vcpu);
13753
13754         /*
13755          * The emulated instruction was already skipped in
13756          * nested_vmx_run, but the updated RIP was never
13757          * written back to the vmcs01.
13758          */
13759         skip_emulated_instruction(vcpu);
13760         vmx->fail = 0;
13761 }
13762
13763 /*
13764  * Forcibly leave nested mode in order to be able to reset the VCPU later on.
13765  */
13766 static void vmx_leave_nested(struct kvm_vcpu *vcpu)
13767 {
13768         if (is_guest_mode(vcpu)) {
13769                 to_vmx(vcpu)->nested.nested_run_pending = 0;
13770                 nested_vmx_vmexit(vcpu, -1, 0, 0);
13771         }
13772         free_nested(to_vmx(vcpu));
13773 }
13774
13775 /*
13776  * L1's failure to enter L2 is a subset of a normal exit, as explained in
13777  * 23.7 "VM-entry failures during or after loading guest state" (this also
13778  * lists the acceptable exit-reason and exit-qualification parameters).
13779  * It should only be called before L2 actually succeeded to run, and when
13780  * vmcs01 is current (it doesn't leave_guest_mode() or switch vmcss).
13781  */
13782 static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu,
13783                         struct vmcs12 *vmcs12,
13784                         u32 reason, unsigned long qualification)
13785 {
13786         load_vmcs12_host_state(vcpu, vmcs12);
13787         vmcs12->vm_exit_reason = reason | VMX_EXIT_REASONS_FAILED_VMENTRY;
13788         vmcs12->exit_qualification = qualification;
13789         nested_vmx_succeed(vcpu);
13790         if (enable_shadow_vmcs)
13791                 to_vmx(vcpu)->nested.sync_shadow_vmcs = true;
13792 }
13793
13794 static int vmx_check_intercept_io(struct kvm_vcpu *vcpu,
13795                                   struct x86_instruction_info *info)
13796 {
13797         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
13798         unsigned short port;
13799         bool intercept;
13800         int size;
13801
13802         if (info->intercept == x86_intercept_in ||
13803             info->intercept == x86_intercept_ins) {
13804                 port = info->src_val;
13805                 size = info->dst_bytes;
13806         } else {
13807                 port = info->dst_val;
13808                 size = info->src_bytes;
13809         }
13810
13811         /*
13812          * If the 'use IO bitmaps' VM-execution control is 0, IO instruction
13813          * VM-exits depend on the 'unconditional IO exiting' VM-execution
13814          * control.
13815          *
13816          * Otherwise, IO instruction VM-exits are controlled by the IO bitmaps.
13817          */
13818         if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
13819                 intercept = nested_cpu_has(vmcs12,
13820                                            CPU_BASED_UNCOND_IO_EXITING);
13821         else
13822                 intercept = nested_vmx_check_io_bitmaps(vcpu, port, size);
13823
13824         /* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED.  */
13825         return intercept ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE;
13826 }
13827
13828 static int vmx_check_intercept(struct kvm_vcpu *vcpu,
13829                                struct x86_instruction_info *info,
13830                                enum x86_intercept_stage stage)
13831 {
13832         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
13833         struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
13834
13835         switch (info->intercept) {
13836         /*
13837          * RDPID causes #UD if disabled through secondary execution controls.
13838          * Because it is marked as EmulateOnUD, we need to intercept it here.
13839          */
13840         case x86_intercept_rdtscp:
13841                 if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDTSCP)) {
13842                         ctxt->exception.vector = UD_VECTOR;
13843                         ctxt->exception.error_code_valid = false;
13844                         return X86EMUL_PROPAGATE_FAULT;
13845                 }
13846                 break;
13847
13848         case x86_intercept_in:
13849         case x86_intercept_ins:
13850         case x86_intercept_out:
13851         case x86_intercept_outs:
13852                 return vmx_check_intercept_io(vcpu, info);
13853
13854         case x86_intercept_lgdt:
13855         case x86_intercept_lidt:
13856         case x86_intercept_lldt:
13857         case x86_intercept_ltr:
13858         case x86_intercept_sgdt:
13859         case x86_intercept_sidt:
13860         case x86_intercept_sldt:
13861         case x86_intercept_str:
13862                 if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC))
13863                         return X86EMUL_CONTINUE;
13864
13865                 /* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED.  */
13866                 break;
13867
13868         /* TODO: check more intercepts... */
13869         default:
13870                 break;
13871         }
13872
13873         return X86EMUL_UNHANDLEABLE;
13874 }
13875
13876 #ifdef CONFIG_X86_64
13877 /* (a << shift) / divisor, return 1 if overflow otherwise 0 */
13878 static inline int u64_shl_div_u64(u64 a, unsigned int shift,
13879                                   u64 divisor, u64 *result)
13880 {
13881         u64 low = a << shift, high = a >> (64 - shift);
13882
13883         /* To avoid the overflow on divq */
13884         if (high >= divisor)
13885                 return 1;
13886
13887         /* Low hold the result, high hold rem which is discarded */
13888         asm("divq %2\n\t" : "=a" (low), "=d" (high) :
13889             "rm" (divisor), "0" (low), "1" (high));
13890         *result = low;
13891
13892         return 0;
13893 }
13894
13895 static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc)
13896 {
13897         struct vcpu_vmx *vmx;
13898         u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles;
13899
13900         if (kvm_mwait_in_guest(vcpu->kvm))
13901                 return -EOPNOTSUPP;
13902
13903         vmx = to_vmx(vcpu);
13904         tscl = rdtsc();
13905         guest_tscl = kvm_read_l1_tsc(vcpu, tscl);
13906         delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl;
13907         lapic_timer_advance_cycles = nsec_to_cycles(vcpu, lapic_timer_advance_ns);
13908
13909         if (delta_tsc > lapic_timer_advance_cycles)
13910                 delta_tsc -= lapic_timer_advance_cycles;
13911         else
13912                 delta_tsc = 0;
13913
13914         /* Convert to host delta tsc if tsc scaling is enabled */
13915         if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio &&
13916                         u64_shl_div_u64(delta_tsc,
13917                                 kvm_tsc_scaling_ratio_frac_bits,
13918                                 vcpu->arch.tsc_scaling_ratio,
13919                                 &delta_tsc))
13920                 return -ERANGE;
13921
13922         /*
13923          * If the delta tsc can't fit in the 32 bit after the multi shift,
13924          * we can't use the preemption timer.
13925          * It's possible that it fits on later vmentries, but checking
13926          * on every vmentry is costly so we just use an hrtimer.
13927          */
13928         if (delta_tsc >> (cpu_preemption_timer_multi + 32))
13929                 return -ERANGE;
13930
13931         vmx->hv_deadline_tsc = tscl + delta_tsc;
13932         return delta_tsc == 0;
13933 }
13934
13935 static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu)
13936 {
13937         to_vmx(vcpu)->hv_deadline_tsc = -1;
13938 }
13939 #endif
13940
13941 static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu)
13942 {
13943         if (!kvm_pause_in_guest(vcpu->kvm))
13944                 shrink_ple_window(vcpu);
13945 }
13946
13947 static void vmx_slot_enable_log_dirty(struct kvm *kvm,
13948                                      struct kvm_memory_slot *slot)
13949 {
13950         kvm_mmu_slot_leaf_clear_dirty(kvm, slot);
13951         kvm_mmu_slot_largepage_remove_write_access(kvm, slot);
13952 }
13953
13954 static void vmx_slot_disable_log_dirty(struct kvm *kvm,
13955                                        struct kvm_memory_slot *slot)
13956 {
13957         kvm_mmu_slot_set_dirty(kvm, slot);
13958 }
13959
13960 static void vmx_flush_log_dirty(struct kvm *kvm)
13961 {
13962         kvm_flush_pml_buffers(kvm);
13963 }
13964
13965 static int vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa)
13966 {
13967         struct vmcs12 *vmcs12;
13968         struct vcpu_vmx *vmx = to_vmx(vcpu);
13969         struct page *page = NULL;
13970         u64 *pml_address;
13971
13972         if (is_guest_mode(vcpu)) {
13973                 WARN_ON_ONCE(vmx->nested.pml_full);
13974
13975                 /*
13976                  * Check if PML is enabled for the nested guest.
13977                  * Whether eptp bit 6 is set is already checked
13978                  * as part of A/D emulation.
13979                  */
13980                 vmcs12 = get_vmcs12(vcpu);
13981                 if (!nested_cpu_has_pml(vmcs12))
13982                         return 0;
13983
13984                 if (vmcs12->guest_pml_index >= PML_ENTITY_NUM) {
13985                         vmx->nested.pml_full = true;
13986                         return 1;
13987                 }
13988
13989                 gpa &= ~0xFFFull;
13990
13991                 page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->pml_address);
13992                 if (is_error_page(page))
13993                         return 0;
13994
13995                 pml_address = kmap(page);
13996                 pml_address[vmcs12->guest_pml_index--] = gpa;
13997                 kunmap(page);
13998                 kvm_release_page_clean(page);
13999         }
14000
14001         return 0;
14002 }
14003
14004 static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm,
14005                                            struct kvm_memory_slot *memslot,
14006                                            gfn_t offset, unsigned long mask)
14007 {
14008         kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask);
14009 }
14010
14011 static void __pi_post_block(struct kvm_vcpu *vcpu)
14012 {
14013         struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
14014         struct pi_desc old, new;
14015         unsigned int dest;
14016
14017         do {
14018                 old.control = new.control = pi_desc->control;
14019                 WARN(old.nv != POSTED_INTR_WAKEUP_VECTOR,
14020                      "Wakeup handler not enabled while the VCPU is blocked\n");
14021
14022                 dest = cpu_physical_id(vcpu->cpu);
14023
14024                 if (x2apic_enabled())
14025                         new.ndst = dest;
14026                 else
14027                         new.ndst = (dest << 8) & 0xFF00;
14028
14029                 /* set 'NV' to 'notification vector' */
14030                 new.nv = POSTED_INTR_VECTOR;
14031         } while (cmpxchg64(&pi_desc->control, old.control,
14032                            new.control) != old.control);
14033
14034         if (!WARN_ON_ONCE(vcpu->pre_pcpu == -1)) {
14035                 spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
14036                 list_del(&vcpu->blocked_vcpu_list);
14037                 spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
14038                 vcpu->pre_pcpu = -1;
14039         }
14040 }
14041
14042 /*
14043  * This routine does the following things for vCPU which is going
14044  * to be blocked if VT-d PI is enabled.
14045  * - Store the vCPU to the wakeup list, so when interrupts happen
14046  *   we can find the right vCPU to wake up.
14047  * - Change the Posted-interrupt descriptor as below:
14048  *      'NDST' <-- vcpu->pre_pcpu
14049  *      'NV' <-- POSTED_INTR_WAKEUP_VECTOR
14050  * - If 'ON' is set during this process, which means at least one
14051  *   interrupt is posted for this vCPU, we cannot block it, in
14052  *   this case, return 1, otherwise, return 0.
14053  *
14054  */
14055 static int pi_pre_block(struct kvm_vcpu *vcpu)
14056 {
14057         unsigned int dest;
14058         struct pi_desc old, new;
14059         struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
14060
14061         if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
14062                 !irq_remapping_cap(IRQ_POSTING_CAP)  ||
14063                 !kvm_vcpu_apicv_active(vcpu))
14064                 return 0;
14065
14066         WARN_ON(irqs_disabled());
14067         local_irq_disable();
14068         if (!WARN_ON_ONCE(vcpu->pre_pcpu != -1)) {
14069                 vcpu->pre_pcpu = vcpu->cpu;
14070                 spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
14071                 list_add_tail(&vcpu->blocked_vcpu_list,
14072                               &per_cpu(blocked_vcpu_on_cpu,
14073                                        vcpu->pre_pcpu));
14074                 spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
14075         }
14076
14077         do {
14078                 old.control = new.control = pi_desc->control;
14079
14080                 WARN((pi_desc->sn == 1),
14081                      "Warning: SN field of posted-interrupts "
14082                      "is set before blocking\n");
14083
14084                 /*
14085                  * Since vCPU can be preempted during this process,
14086                  * vcpu->cpu could be different with pre_pcpu, we
14087                  * need to set pre_pcpu as the destination of wakeup
14088                  * notification event, then we can find the right vCPU
14089                  * to wakeup in wakeup handler if interrupts happen
14090                  * when the vCPU is in blocked state.
14091                  */
14092                 dest = cpu_physical_id(vcpu->pre_pcpu);
14093
14094                 if (x2apic_enabled())
14095                         new.ndst = dest;
14096                 else
14097                         new.ndst = (dest << 8) & 0xFF00;
14098
14099                 /* set 'NV' to 'wakeup vector' */
14100                 new.nv = POSTED_INTR_WAKEUP_VECTOR;
14101         } while (cmpxchg64(&pi_desc->control, old.control,
14102                            new.control) != old.control);
14103
14104         /* We should not block the vCPU if an interrupt is posted for it.  */
14105         if (pi_test_on(pi_desc) == 1)
14106                 __pi_post_block(vcpu);
14107
14108         local_irq_enable();
14109         return (vcpu->pre_pcpu == -1);
14110 }
14111
14112 static int vmx_pre_block(struct kvm_vcpu *vcpu)
14113 {
14114         if (pi_pre_block(vcpu))
14115                 return 1;
14116
14117         if (kvm_lapic_hv_timer_in_use(vcpu))
14118                 kvm_lapic_switch_to_sw_timer(vcpu);
14119
14120         return 0;
14121 }
14122
14123 static void pi_post_block(struct kvm_vcpu *vcpu)
14124 {
14125         if (vcpu->pre_pcpu == -1)
14126                 return;
14127
14128         WARN_ON(irqs_disabled());
14129         local_irq_disable();
14130         __pi_post_block(vcpu);
14131         local_irq_enable();
14132 }
14133
14134 static void vmx_post_block(struct kvm_vcpu *vcpu)
14135 {
14136         if (kvm_x86_ops->set_hv_timer)
14137                 kvm_lapic_switch_to_hv_timer(vcpu);
14138
14139         pi_post_block(vcpu);
14140 }
14141
14142 /*
14143  * vmx_update_pi_irte - set IRTE for Posted-Interrupts
14144  *
14145  * @kvm: kvm
14146  * @host_irq: host irq of the interrupt
14147  * @guest_irq: gsi of the interrupt
14148  * @set: set or unset PI
14149  * returns 0 on success, < 0 on failure
14150  */
14151 static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
14152                               uint32_t guest_irq, bool set)
14153 {
14154         struct kvm_kernel_irq_routing_entry *e;
14155         struct kvm_irq_routing_table *irq_rt;
14156         struct kvm_lapic_irq irq;
14157         struct kvm_vcpu *vcpu;
14158         struct vcpu_data vcpu_info;
14159         int idx, ret = 0;
14160
14161         if (!kvm_arch_has_assigned_device(kvm) ||
14162                 !irq_remapping_cap(IRQ_POSTING_CAP) ||
14163                 !kvm_vcpu_apicv_active(kvm->vcpus[0]))
14164                 return 0;
14165
14166         idx = srcu_read_lock(&kvm->irq_srcu);
14167         irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
14168         if (guest_irq >= irq_rt->nr_rt_entries ||
14169             hlist_empty(&irq_rt->map[guest_irq])) {
14170                 pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n",
14171                              guest_irq, irq_rt->nr_rt_entries);
14172                 goto out;
14173         }
14174
14175         hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
14176                 if (e->type != KVM_IRQ_ROUTING_MSI)
14177                         continue;
14178                 /*
14179                  * VT-d PI cannot support posting multicast/broadcast
14180                  * interrupts to a vCPU, we still use interrupt remapping
14181                  * for these kind of interrupts.
14182                  *
14183                  * For lowest-priority interrupts, we only support
14184                  * those with single CPU as the destination, e.g. user
14185                  * configures the interrupts via /proc/irq or uses
14186                  * irqbalance to make the interrupts single-CPU.
14187                  *
14188                  * We will support full lowest-priority interrupt later.
14189                  */
14190
14191                 kvm_set_msi_irq(kvm, e, &irq);
14192                 if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) {
14193                         /*
14194                          * Make sure the IRTE is in remapped mode if
14195                          * we don't handle it in posted mode.
14196                          */
14197                         ret = irq_set_vcpu_affinity(host_irq, NULL);
14198                         if (ret < 0) {
14199                                 printk(KERN_INFO
14200                                    "failed to back to remapped mode, irq: %u\n",
14201                                    host_irq);
14202                                 goto out;
14203                         }
14204
14205                         continue;
14206                 }
14207
14208                 vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu));
14209                 vcpu_info.vector = irq.vector;
14210
14211                 trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, e->gsi,
14212                                 vcpu_info.vector, vcpu_info.pi_desc_addr, set);
14213
14214                 if (set)
14215                         ret = irq_set_vcpu_affinity(host_irq, &vcpu_info);
14216                 else
14217                         ret = irq_set_vcpu_affinity(host_irq, NULL);
14218
14219                 if (ret < 0) {
14220                         printk(KERN_INFO "%s: failed to update PI IRTE\n",
14221                                         __func__);
14222                         goto out;
14223                 }
14224         }
14225
14226         ret = 0;
14227 out:
14228         srcu_read_unlock(&kvm->irq_srcu, idx);
14229         return ret;
14230 }
14231
14232 static void vmx_setup_mce(struct kvm_vcpu *vcpu)
14233 {
14234         if (vcpu->arch.mcg_cap & MCG_LMCE_P)
14235                 to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
14236                         FEATURE_CONTROL_LMCE;
14237         else
14238                 to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
14239                         ~FEATURE_CONTROL_LMCE;
14240 }
14241
14242 static int vmx_smi_allowed(struct kvm_vcpu *vcpu)
14243 {
14244         /* we need a nested vmexit to enter SMM, postpone if run is pending */
14245         if (to_vmx(vcpu)->nested.nested_run_pending)
14246                 return 0;
14247         return 1;
14248 }
14249
14250 static int vmx_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
14251 {
14252         struct vcpu_vmx *vmx = to_vmx(vcpu);
14253
14254         vmx->nested.smm.guest_mode = is_guest_mode(vcpu);
14255         if (vmx->nested.smm.guest_mode)
14256                 nested_vmx_vmexit(vcpu, -1, 0, 0);
14257
14258         vmx->nested.smm.vmxon = vmx->nested.vmxon;
14259         vmx->nested.vmxon = false;
14260         vmx_clear_hlt(vcpu);
14261         return 0;
14262 }
14263
14264 static int vmx_pre_leave_smm(struct kvm_vcpu *vcpu, u64 smbase)
14265 {
14266         struct vcpu_vmx *vmx = to_vmx(vcpu);
14267         int ret;
14268
14269         if (vmx->nested.smm.vmxon) {
14270                 vmx->nested.vmxon = true;
14271                 vmx->nested.smm.vmxon = false;
14272         }
14273
14274         if (vmx->nested.smm.guest_mode) {
14275                 vcpu->arch.hflags &= ~HF_SMM_MASK;
14276                 ret = enter_vmx_non_root_mode(vcpu, NULL);
14277                 vcpu->arch.hflags |= HF_SMM_MASK;
14278                 if (ret)
14279                         return ret;
14280
14281                 vmx->nested.smm.guest_mode = false;
14282         }
14283         return 0;
14284 }
14285
14286 static int enable_smi_window(struct kvm_vcpu *vcpu)
14287 {
14288         return 0;
14289 }
14290
14291 static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
14292                                 struct kvm_nested_state __user *user_kvm_nested_state,
14293                                 u32 user_data_size)
14294 {
14295         struct vcpu_vmx *vmx;
14296         struct vmcs12 *vmcs12;
14297         struct kvm_nested_state kvm_state = {
14298                 .flags = 0,
14299                 .format = 0,
14300                 .size = sizeof(kvm_state),
14301                 .vmx.vmxon_pa = -1ull,
14302                 .vmx.vmcs_pa = -1ull,
14303         };
14304
14305         if (!vcpu)
14306                 return kvm_state.size + 2 * VMCS12_SIZE;
14307
14308         vmx = to_vmx(vcpu);
14309         vmcs12 = get_vmcs12(vcpu);
14310         if (nested_vmx_allowed(vcpu) &&
14311             (vmx->nested.vmxon || vmx->nested.smm.vmxon)) {
14312                 kvm_state.vmx.vmxon_pa = vmx->nested.vmxon_ptr;
14313                 kvm_state.vmx.vmcs_pa = vmx->nested.current_vmptr;
14314
14315                 if (vmx->nested.current_vmptr != -1ull) {
14316                         kvm_state.size += VMCS12_SIZE;
14317
14318                         if (is_guest_mode(vcpu) &&
14319                             nested_cpu_has_shadow_vmcs(vmcs12) &&
14320                             vmcs12->vmcs_link_pointer != -1ull)
14321                                 kvm_state.size += VMCS12_SIZE;
14322                 }
14323
14324                 if (vmx->nested.smm.vmxon)
14325                         kvm_state.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON;
14326
14327                 if (vmx->nested.smm.guest_mode)
14328                         kvm_state.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE;
14329
14330                 if (is_guest_mode(vcpu)) {
14331                         kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE;
14332
14333                         if (vmx->nested.nested_run_pending)
14334                                 kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING;
14335                 }
14336         }
14337
14338         if (user_data_size < kvm_state.size)
14339                 goto out;
14340
14341         if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state)))
14342                 return -EFAULT;
14343
14344         if (vmx->nested.current_vmptr == -1ull)
14345                 goto out;
14346
14347         /*
14348          * When running L2, the authoritative vmcs12 state is in the
14349          * vmcs02. When running L1, the authoritative vmcs12 state is
14350          * in the shadow vmcs linked to vmcs01, unless
14351          * sync_shadow_vmcs is set, in which case, the authoritative
14352          * vmcs12 state is in the vmcs12 already.
14353          */
14354         if (is_guest_mode(vcpu))
14355                 sync_vmcs12(vcpu, vmcs12);
14356         else if (enable_shadow_vmcs && !vmx->nested.sync_shadow_vmcs)
14357                 copy_shadow_to_vmcs12(vmx);
14358
14359         /*
14360          * Copy over the full allocated size of vmcs12 rather than just the size
14361          * of the struct.
14362          */
14363         if (copy_to_user(user_kvm_nested_state->data, vmcs12, VMCS12_SIZE))
14364                 return -EFAULT;
14365
14366         if (nested_cpu_has_shadow_vmcs(vmcs12) &&
14367             vmcs12->vmcs_link_pointer != -1ull) {
14368                 if (copy_to_user(user_kvm_nested_state->data + VMCS12_SIZE,
14369                                  get_shadow_vmcs12(vcpu), VMCS12_SIZE))
14370                         return -EFAULT;
14371         }
14372
14373 out:
14374         return kvm_state.size;
14375 }
14376
14377 static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
14378                                 struct kvm_nested_state __user *user_kvm_nested_state,
14379                                 struct kvm_nested_state *kvm_state)
14380 {
14381         struct vcpu_vmx *vmx = to_vmx(vcpu);
14382         struct vmcs12 *vmcs12;
14383         u32 exit_qual;
14384         int ret;
14385
14386         if (kvm_state->format != 0)
14387                 return -EINVAL;
14388
14389         if (!nested_vmx_allowed(vcpu))
14390                 return kvm_state->vmx.vmxon_pa == -1ull ? 0 : -EINVAL;
14391
14392         if (kvm_state->vmx.vmxon_pa == -1ull) {
14393                 if (kvm_state->vmx.smm.flags)
14394                         return -EINVAL;
14395
14396                 if (kvm_state->vmx.vmcs_pa != -1ull)
14397                         return -EINVAL;
14398
14399                 vmx_leave_nested(vcpu);
14400                 return 0;
14401         }
14402
14403         if (!page_address_valid(vcpu, kvm_state->vmx.vmxon_pa))
14404                 return -EINVAL;
14405
14406         if ((kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
14407             (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
14408                 return -EINVAL;
14409
14410         if (kvm_state->vmx.smm.flags &
14411             ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON))
14412                 return -EINVAL;
14413
14414         /*
14415          * SMM temporarily disables VMX, so we cannot be in guest mode,
14416          * nor can VMLAUNCH/VMRESUME be pending.  Outside SMM, SMM flags
14417          * must be zero.
14418          */
14419         if (is_smm(vcpu) ? kvm_state->flags : kvm_state->vmx.smm.flags)
14420                 return -EINVAL;
14421
14422         if ((kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
14423             !(kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON))
14424                 return -EINVAL;
14425
14426         vmx_leave_nested(vcpu);
14427         if (kvm_state->vmx.vmxon_pa == -1ull)
14428                 return 0;
14429
14430         vmx->nested.vmxon_ptr = kvm_state->vmx.vmxon_pa;
14431         ret = enter_vmx_operation(vcpu);
14432         if (ret)
14433                 return ret;
14434
14435         /* Empty 'VMXON' state is permitted */
14436         if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12))
14437                 return 0;
14438
14439         if (kvm_state->vmx.vmcs_pa == kvm_state->vmx.vmxon_pa ||
14440             !page_address_valid(vcpu, kvm_state->vmx.vmcs_pa))
14441                 return -EINVAL;
14442
14443         set_current_vmptr(vmx, kvm_state->vmx.vmcs_pa);
14444
14445         if (kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) {
14446                 vmx->nested.smm.vmxon = true;
14447                 vmx->nested.vmxon = false;
14448
14449                 if (kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE)
14450                         vmx->nested.smm.guest_mode = true;
14451         }
14452
14453         vmcs12 = get_vmcs12(vcpu);
14454         if (copy_from_user(vmcs12, user_kvm_nested_state->data, sizeof(*vmcs12)))
14455                 return -EFAULT;
14456
14457         if (vmcs12->hdr.revision_id != VMCS12_REVISION)
14458                 return -EINVAL;
14459
14460         if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
14461                 return 0;
14462
14463         vmx->nested.nested_run_pending =
14464                 !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);
14465
14466         if (nested_cpu_has_shadow_vmcs(vmcs12) &&
14467             vmcs12->vmcs_link_pointer != -1ull) {
14468                 struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu);
14469                 if (kvm_state->size < sizeof(*kvm_state) + 2 * sizeof(*vmcs12))
14470                         return -EINVAL;
14471
14472                 if (copy_from_user(shadow_vmcs12,
14473                                    user_kvm_nested_state->data + VMCS12_SIZE,
14474                                    sizeof(*vmcs12)))
14475                         return -EFAULT;
14476
14477                 if (shadow_vmcs12->hdr.revision_id != VMCS12_REVISION ||
14478                     !shadow_vmcs12->hdr.shadow_vmcs)
14479                         return -EINVAL;
14480         }
14481
14482         if (check_vmentry_prereqs(vcpu, vmcs12) ||
14483             check_vmentry_postreqs(vcpu, vmcs12, &exit_qual))
14484                 return -EINVAL;
14485
14486         vmx->nested.dirty_vmcs12 = true;
14487         ret = enter_vmx_non_root_mode(vcpu, NULL);
14488         if (ret)
14489                 return -EINVAL;
14490
14491         return 0;
14492 }
14493
14494 static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
14495         .cpu_has_kvm_support = cpu_has_kvm_support,
14496         .disabled_by_bios = vmx_disabled_by_bios,
14497         .hardware_setup = hardware_setup,
14498         .hardware_unsetup = hardware_unsetup,
14499         .check_processor_compatibility = vmx_check_processor_compat,
14500         .hardware_enable = hardware_enable,
14501         .hardware_disable = hardware_disable,
14502         .cpu_has_accelerated_tpr = report_flexpriority,
14503         .has_emulated_msr = vmx_has_emulated_msr,
14504
14505         .vm_init = vmx_vm_init,
14506         .vm_alloc = vmx_vm_alloc,
14507         .vm_free = vmx_vm_free,
14508
14509         .vcpu_create = vmx_create_vcpu,
14510         .vcpu_free = vmx_free_vcpu,
14511         .vcpu_reset = vmx_vcpu_reset,
14512
14513         .prepare_guest_switch = vmx_prepare_switch_to_guest,
14514         .vcpu_load = vmx_vcpu_load,
14515         .vcpu_put = vmx_vcpu_put,
14516
14517         .update_bp_intercept = update_exception_bitmap,
14518         .get_msr_feature = vmx_get_msr_feature,
14519         .get_msr = vmx_get_msr,
14520         .set_msr = vmx_set_msr,
14521         .get_segment_base = vmx_get_segment_base,
14522         .get_segment = vmx_get_segment,
14523         .set_segment = vmx_set_segment,
14524         .get_cpl = vmx_get_cpl,
14525         .get_cs_db_l_bits = vmx_get_cs_db_l_bits,
14526         .decache_cr0_guest_bits = vmx_decache_cr0_guest_bits,
14527         .decache_cr3 = vmx_decache_cr3,
14528         .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits,
14529         .set_cr0 = vmx_set_cr0,
14530         .set_cr3 = vmx_set_cr3,
14531         .set_cr4 = vmx_set_cr4,
14532         .set_efer = vmx_set_efer,
14533         .get_idt = vmx_get_idt,
14534         .set_idt = vmx_set_idt,
14535         .get_gdt = vmx_get_gdt,
14536         .set_gdt = vmx_set_gdt,
14537         .get_dr6 = vmx_get_dr6,
14538         .set_dr6 = vmx_set_dr6,
14539         .set_dr7 = vmx_set_dr7,
14540         .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs,
14541         .cache_reg = vmx_cache_reg,
14542         .get_rflags = vmx_get_rflags,
14543         .set_rflags = vmx_set_rflags,
14544
14545         .tlb_flush = vmx_flush_tlb,
14546         .tlb_flush_gva = vmx_flush_tlb_gva,
14547
14548         .run = vmx_vcpu_run,
14549         .handle_exit = vmx_handle_exit,
14550         .skip_emulated_instruction = skip_emulated_instruction,
14551         .set_interrupt_shadow = vmx_set_interrupt_shadow,
14552         .get_interrupt_shadow = vmx_get_interrupt_shadow,
14553         .patch_hypercall = vmx_patch_hypercall,
14554         .set_irq = vmx_inject_irq,
14555         .set_nmi = vmx_inject_nmi,
14556         .queue_exception = vmx_queue_exception,
14557         .cancel_injection = vmx_cancel_injection,
14558         .interrupt_allowed = vmx_interrupt_allowed,
14559         .nmi_allowed = vmx_nmi_allowed,
14560         .get_nmi_mask = vmx_get_nmi_mask,
14561         .set_nmi_mask = vmx_set_nmi_mask,
14562         .enable_nmi_window = enable_nmi_window,
14563         .enable_irq_window = enable_irq_window,
14564         .update_cr8_intercept = update_cr8_intercept,
14565         .set_virtual_apic_mode = vmx_set_virtual_apic_mode,
14566         .set_apic_access_page_addr = vmx_set_apic_access_page_addr,
14567         .get_enable_apicv = vmx_get_enable_apicv,
14568         .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
14569         .load_eoi_exitmap = vmx_load_eoi_exitmap,
14570         .apicv_post_state_restore = vmx_apicv_post_state_restore,
14571         .hwapic_irr_update = vmx_hwapic_irr_update,
14572         .hwapic_isr_update = vmx_hwapic_isr_update,
14573         .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt,
14574         .sync_pir_to_irr = vmx_sync_pir_to_irr,
14575         .deliver_posted_interrupt = vmx_deliver_posted_interrupt,
14576         .dy_apicv_has_pending_interrupt = vmx_dy_apicv_has_pending_interrupt,
14577
14578         .set_tss_addr = vmx_set_tss_addr,
14579         .set_identity_map_addr = vmx_set_identity_map_addr,
14580         .get_tdp_level = get_ept_level,
14581         .get_mt_mask = vmx_get_mt_mask,
14582
14583         .get_exit_info = vmx_get_exit_info,
14584
14585         .get_lpage_level = vmx_get_lpage_level,
14586
14587         .cpuid_update = vmx_cpuid_update,
14588
14589         .rdtscp_supported = vmx_rdtscp_supported,
14590         .invpcid_supported = vmx_invpcid_supported,
14591
14592         .set_supported_cpuid = vmx_set_supported_cpuid,
14593
14594         .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
14595
14596         .read_l1_tsc_offset = vmx_read_l1_tsc_offset,
14597         .write_l1_tsc_offset = vmx_write_l1_tsc_offset,
14598
14599         .set_tdp_cr3 = vmx_set_cr3,
14600
14601         .check_intercept = vmx_check_intercept,
14602         .handle_external_intr = vmx_handle_external_intr,
14603         .mpx_supported = vmx_mpx_supported,
14604         .xsaves_supported = vmx_xsaves_supported,
14605         .umip_emulated = vmx_umip_emulated,
14606
14607         .check_nested_events = vmx_check_nested_events,
14608         .request_immediate_exit = vmx_request_immediate_exit,
14609
14610         .sched_in = vmx_sched_in,
14611
14612         .slot_enable_log_dirty = vmx_slot_enable_log_dirty,
14613         .slot_disable_log_dirty = vmx_slot_disable_log_dirty,
14614         .flush_log_dirty = vmx_flush_log_dirty,
14615         .enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked,
14616         .write_log_dirty = vmx_write_pml_buffer,
14617
14618         .pre_block = vmx_pre_block,
14619         .post_block = vmx_post_block,
14620
14621         .pmu_ops = &intel_pmu_ops,
14622
14623         .update_pi_irte = vmx_update_pi_irte,
14624
14625 #ifdef CONFIG_X86_64
14626         .set_hv_timer = vmx_set_hv_timer,
14627         .cancel_hv_timer = vmx_cancel_hv_timer,
14628 #endif
14629
14630         .setup_mce = vmx_setup_mce,
14631
14632         .get_nested_state = vmx_get_nested_state,
14633         .set_nested_state = vmx_set_nested_state,
14634         .get_vmcs12_pages = nested_get_vmcs12_pages,
14635
14636         .smi_allowed = vmx_smi_allowed,
14637         .pre_enter_smm = vmx_pre_enter_smm,
14638         .pre_leave_smm = vmx_pre_leave_smm,
14639         .enable_smi_window = enable_smi_window,
14640 };
14641
14642 static void vmx_cleanup_l1d_flush(void)
14643 {
14644         if (vmx_l1d_flush_pages) {
14645                 free_pages((unsigned long)vmx_l1d_flush_pages, L1D_CACHE_ORDER);
14646                 vmx_l1d_flush_pages = NULL;
14647         }
14648         /* Restore state so sysfs ignores VMX */
14649         l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
14650 }
14651
14652 static void vmx_exit(void)
14653 {
14654 #ifdef CONFIG_KEXEC_CORE
14655         RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL);
14656         synchronize_rcu();
14657 #endif
14658
14659         kvm_exit();
14660
14661 #if IS_ENABLED(CONFIG_HYPERV)
14662         if (static_branch_unlikely(&enable_evmcs)) {
14663                 int cpu;
14664                 struct hv_vp_assist_page *vp_ap;
14665                 /*
14666                  * Reset everything to support using non-enlightened VMCS
14667                  * access later (e.g. when we reload the module with
14668                  * enlightened_vmcs=0)
14669                  */
14670                 for_each_online_cpu(cpu) {
14671                         vp_ap = hv_get_vp_assist_page(cpu);
14672
14673                         if (!vp_ap)
14674                                 continue;
14675
14676                         vp_ap->current_nested_vmcs = 0;
14677                         vp_ap->enlighten_vmentry = 0;
14678                 }
14679
14680                 static_branch_disable(&enable_evmcs);
14681         }
14682 #endif
14683         vmx_cleanup_l1d_flush();
14684 }
14685 module_exit(vmx_exit);
14686
14687 static int __init vmx_init(void)
14688 {
14689         int r, cpu;
14690
14691 #if IS_ENABLED(CONFIG_HYPERV)
14692         /*
14693          * Enlightened VMCS usage should be recommended and the host needs
14694          * to support eVMCS v1 or above. We can also disable eVMCS support
14695          * with module parameter.
14696          */
14697         if (enlightened_vmcs &&
14698             ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED &&
14699             (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >=
14700             KVM_EVMCS_VERSION) {
14701                 int cpu;
14702
14703                 /* Check that we have assist pages on all online CPUs */
14704                 for_each_online_cpu(cpu) {
14705                         if (!hv_get_vp_assist_page(cpu)) {
14706                                 enlightened_vmcs = false;
14707                                 break;
14708                         }
14709                 }
14710
14711                 if (enlightened_vmcs) {
14712                         pr_info("KVM: vmx: using Hyper-V Enlightened VMCS\n");
14713                         static_branch_enable(&enable_evmcs);
14714                 }
14715         } else {
14716                 enlightened_vmcs = false;
14717         }
14718 #endif
14719
14720         r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
14721                      __alignof__(struct vcpu_vmx), THIS_MODULE);
14722         if (r)
14723                 return r;
14724
14725         /*
14726          * Must be called after kvm_init() so enable_ept is properly set
14727          * up. Hand the parameter mitigation value in which was stored in
14728          * the pre module init parser. If no parameter was given, it will
14729          * contain 'auto' which will be turned into the default 'cond'
14730          * mitigation mode.
14731          */
14732         if (boot_cpu_has(X86_BUG_L1TF)) {
14733                 r = vmx_setup_l1d_flush(vmentry_l1d_flush_param);
14734                 if (r) {
14735                         vmx_exit();
14736                         return r;
14737                 }
14738         }
14739
14740         vmx_setup_fb_clear_ctrl();
14741
14742         for_each_possible_cpu(cpu) {
14743                 INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
14744
14745                 INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu));
14746                 spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
14747         }
14748
14749 #ifdef CONFIG_KEXEC_CORE
14750         rcu_assign_pointer(crash_vmclear_loaded_vmcss,
14751                            crash_vmclear_local_loaded_vmcss);
14752 #endif
14753         vmx_check_vmcs12_offsets();
14754
14755         return 0;
14756 }
14757 module_init(vmx_init);