From fa44730105aae8cbde220440899b426064c1df7f Mon Sep 17 00:00:00 2001
From: Colin Percival
Date: Sun, 2 Oct 2022 10:42:38 -0700
Subject: [PATCH] pvh/arch-x86_64: Initialize vCPU regs for PVH

Set the initial values of the KVM vCPU registers as specified in the
PVH boot ABI: https://xenbits.xen.org/docs/unstable/misc/pvh.html

Signed-off-by: Colin Percival
Co-authored-by: Alejandro Jimenez
---
 src/arch/src/x86_64/gdt.rs        |  32 +++++-
 src/arch/src/x86_64/layout.rs     |   3 +
 src/arch/src/x86_64/regs.rs       | 177 ++++++++++++++++++++++--------
 src/vmm/src/builder.rs            |   8 +-
 src/vmm/src/vstate/vcpu/mod.rs    |  10 +-
 src/vmm/src/vstate/vcpu/x86_64.rs |  27 +++--
 6 files changed, 192 insertions(+), 65 deletions(-)

diff --git a/src/arch/src/x86_64/gdt.rs b/src/arch/src/x86_64/gdt.rs
index c7fcbf31bf02..03a67247d511 100644
--- a/src/arch/src/x86_64/gdt.rs
+++ b/src/arch/src/x86_64/gdt.rs
@@ -1,3 +1,5 @@
+// Copyright © 2020, Oracle and/or its affiliates.
+//
 // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
 // SPDX-License-Identifier: Apache-2.0
 //
@@ -24,8 +26,34 @@ fn get_base(entry: u64) -> u64 {
         | (((entry) & 0x0000_0000_FFFF_0000) >> 16)
 }
 
+// Extract the segment limit from the GDT segment descriptor.
+//
+// In a segment descriptor, the limit field is 20 bits, so it can directly describe
+// a range from 0 to 0xFFFFF (1 MB). When the G flag is set (4-KByte page granularity) it
+// scales the value in the limit field by a factor of 2^12 (4 KBytes), making the effective
+// limit range from 0xFFF (4 KBytes) to 0xFFFF_FFFF (4 GBytes).
+//
+// However, the limit field in the VMCS definition is a 32-bit field, and the limit value is not
+// automatically scaled using the G flag. This means that for a desired range of 4GB for a
+// given segment, its limit must be specified as 0xFFFF_FFFF. Therefore the method of obtaining
+// the limit from the GDT entry is not sufficient, since it only provides 20 bits when 32 bits
+// are necessary. Fortunately, we can check if the G flag is set when extracting the limit since
+// the full GDT entry is passed as an argument, and perform the scaling of the limit value to
+// return the full 32-bit value.
+//
+// The scaling mentioned above is required when using PVH boot, since the guest boots in protected
+// (32-bit) mode and must be able to access the entire 32-bit address space. It does not cause
+// issues for the case of direct boot to 64-bit (long) mode, since in 64-bit mode the processor does
+// not perform runtime limit checking on code or data segments.
 fn get_limit(entry: u64) -> u32 {
-    ((((entry) & 0x000F_0000_0000_0000) >> 32) | ((entry) & 0x0000_0000_0000_FFFF)) as u32
+    let limit: u32 =
+        ((((entry) & 0x000F_0000_0000_0000) >> 32) | ((entry) & 0x0000_0000_0000_FFFF)) as u32;
+
+    // Perform manual limit scaling if G flag is set
+    match get_g(entry) {
+        0 => limit,
+        _ => (limit << 12) | 0xFFF, // G flag is either 0 or 1
+    }
 }
 
 fn get_g(entry: u64) -> u8 {
@@ -109,7 +137,7 @@ mod tests {
         assert_eq!(0xB, seg.type_);
         // base and limit
         assert_eq!(0x10_0000, seg.base);
-        assert_eq!(0xfffff, seg.limit);
+        assert_eq!(0xffff_ffff, seg.limit);
         assert_eq!(0x0, seg.unusable);
     }
 }
diff --git a/src/arch/src/x86_64/layout.rs b/src/arch/src/x86_64/layout.rs
index 38776c1e7ed1..936458d54d4c 100644
--- a/src/arch/src/x86_64/layout.rs
+++ b/src/arch/src/x86_64/layout.rs
@@ -27,5 +27,8 @@ pub const IRQ_MAX: u32 = 23;
 /// Address for the TSS setup.
 pub const KVM_TSS_ADDRESS: u64 = 0xfffb_d000;
 
+/// Address of the hvm_start_info struct used in PVH boot
+pub const PVH_INFO_START: u64 = 0x6000;
+
 /// The 'zero page', a.k.a linux kernel bootparams.
 pub const ZERO_PAGE_START: u64 = 0x7000;
diff --git a/src/arch/src/x86_64/regs.rs b/src/arch/src/x86_64/regs.rs
index a1371546469b..4a05a8551490 100644
--- a/src/arch/src/x86_64/regs.rs
+++ b/src/arch/src/x86_64/regs.rs
@@ -1,3 +1,4 @@
+// Copyright © 2020, Oracle and/or its affiliates.
 // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
 // SPDX-License-Identifier: Apache-2.0
 //
@@ -11,6 +12,7 @@ use kvm_bindings::{kvm_fpu, kvm_regs, kvm_sregs};
 use kvm_ioctls::VcpuFd;
 use vm_memory::{Address, Bytes, GuestAddress, GuestMemory, GuestMemoryMmap};
 
+use super::super::{BootProtocol, EntryPoint};
 use super::gdt::{gdt_entry, kvm_segment_from_gdt};
 
 // Initial pagetables.
@@ -100,20 +102,33 @@ impl fmt::Display for SetupRegistersError {
 /// # Errors
 ///
 /// When [`kvm_ioctls::ioctls::vcpu::VcpuFd::set_regs`] errors.
-pub fn setup_regs(vcpu: &VcpuFd, boot_ip: u64) -> std::result::Result<(), SetupRegistersError> {
-    let regs: kvm_regs = kvm_regs {
-        rflags: 0x0000_0000_0000_0002u64,
-        rip: boot_ip,
-        // Frame pointer. It gets a snapshot of the stack pointer (rsp) so that when adjustments are
-        // made to rsp (i.e. reserving space for local variables or pushing values on to the stack),
-        // local variables and function parameters are still accessible from a constant offset from
-        // rbp.
-        rsp: super::layout::BOOT_STACK_POINTER,
-        // Starting stack pointer.
-        rbp: super::layout::BOOT_STACK_POINTER,
-        // Must point to zero page address per Linux ABI. This is x86_64 specific.
-        rsi: super::layout::ZERO_PAGE_START,
-        ..Default::default()
+pub fn setup_regs(
+    vcpu: &VcpuFd,
+    entry_point: EntryPoint,
+) -> std::result::Result<(), SetupRegistersError> {
+    let regs: kvm_regs = match entry_point.protocol {
+        BootProtocol::PvhBoot => kvm_regs {
+            // Configure regs as required by PVH boot protocol.
+            rflags: 0x0000_0000_0000_0002u64,
+            rbx: super::layout::PVH_INFO_START,
+            rip: entry_point.entry_addr.raw_value(),
+            ..Default::default()
+        },
+        BootProtocol::LinuxBoot => kvm_regs {
+            // Configure regs as required by Linux 64-bit boot protocol.
+            rflags: 0x0000_0000_0000_0002u64,
+            rip: entry_point.entry_addr.raw_value(),
+            // Frame pointer. It gets a snapshot of the stack pointer (rsp) so that when adjustments
+            // are made to rsp (i.e. reserving space for local variables or pushing
+            // values on to the stack), local variables and function parameters are
+            // still accessible from a constant offset from rbp.
+            rsp: super::layout::BOOT_STACK_POINTER,
+            // Starting stack pointer.
+            rbp: super::layout::BOOT_STACK_POINTER,
+            // Must point to zero page address per Linux ABI. This is x86_64 specific.
+            rsi: super::layout::ZERO_PAGE_START,
+            ..Default::default()
+        },
     };
 
     vcpu.set_regs(&regs).map_err(SetupRegistersError)
@@ -142,6 +157,7 @@ pub enum SetupSpecialRegistersError {
 ///
 /// * `mem` - The memory that will be passed to the guest.
 /// * `vcpu` - Structure for the VCPU that holds the VCPU's fd.
+/// * `boot_prot` - The boot protocol being used.
 ///
 /// # Errors
 ///
@@ -153,14 +169,18 @@ pub enum SetupSpecialRegistersError {
 pub fn setup_sregs(
     mem: &GuestMemoryMmap,
     vcpu: &VcpuFd,
+    boot_prot: BootProtocol,
 ) -> std::result::Result<(), SetupSpecialRegistersError> {
     let mut sregs: kvm_sregs = vcpu
         .get_sregs()
         .map_err(SetupSpecialRegistersError::GetSpecialRegisters)?;
 
-    configure_segments_and_sregs(mem, &mut sregs)
+    configure_segments_and_sregs(mem, &mut sregs, boot_prot)
         .map_err(SetupSpecialRegistersError::ConfigureSegmentsAndSpecialRegisters)?;
-    setup_page_tables(mem, &mut sregs).map_err(SetupSpecialRegistersError::SetupPageTables)?; // TODO(dgreid) - Can this be done once per system instead?
+    if let BootProtocol::LinuxBoot = boot_prot {
+        setup_page_tables(mem, &mut sregs).map_err(SetupSpecialRegistersError::SetupPageTables)?;
+        // TODO(dgreid) - Can this be done once per system instead?
+    }
 
     vcpu.set_sregs(&sregs)
         .map_err(SetupSpecialRegistersError::SetSpecialRegisters)
@@ -175,6 +195,7 @@ const EFER_LMA: u64 = 0x400;
 const EFER_LME: u64 = 0x100;
 
 const X86_CR0_PE: u64 = 0x1;
+const X86_CR0_ET: u64 = 0x10;
 const X86_CR0_PG: u64 = 0x8000_0000;
 const X86_CR4_PAE: u64 = 0x20;
 
@@ -198,13 +219,31 @@ fn write_idt_value(val: u64, guest_mem: &GuestMemoryMmap) -> Result<()> {
         .map_err(|_| Error::WriteIDT)
 }
 
-fn configure_segments_and_sregs(mem: &GuestMemoryMmap, sregs: &mut kvm_sregs) -> Result<()> {
-    let gdt_table: [u64; BOOT_GDT_MAX] = [
-        gdt_entry(0, 0, 0),            // NULL
-        gdt_entry(0xa09b, 0, 0xfffff), // CODE
-        gdt_entry(0xc093, 0, 0xfffff), // DATA
-        gdt_entry(0x808b, 0, 0xfffff), // TSS
-    ];
+fn configure_segments_and_sregs(
+    mem: &GuestMemoryMmap,
+    sregs: &mut kvm_sregs,
+    boot_prot: BootProtocol,
+) -> Result<()> {
+    let gdt_table: [u64; BOOT_GDT_MAX] = match boot_prot {
+        BootProtocol::PvhBoot => {
+            // Configure GDT entries as specified by PVH boot protocol
+            [
+                gdt_entry(0, 0, 0),                // NULL
+                gdt_entry(0xc09b, 0, 0xffff_ffff), // CODE
+                gdt_entry(0xc093, 0, 0xffff_ffff), // DATA
+                gdt_entry(0x008b, 0, 0x67),        // TSS
+            ]
+        }
+        BootProtocol::LinuxBoot => {
+            // Configure GDT entries as specified by Linux 64-bit boot protocol
+            [
+                gdt_entry(0, 0, 0),            // NULL
+                gdt_entry(0xa09b, 0, 0xfffff), // CODE
+                gdt_entry(0xc093, 0, 0xfffff), // DATA
+                gdt_entry(0x808b, 0, 0xfffff), // TSS
+            ]
+        }
+    };
 
     let code_seg = kvm_segment_from_gdt(gdt_table[1], 1);
     let data_seg = kvm_segment_from_gdt(gdt_table[2], 2);
@@ -227,9 +266,17 @@ fn configure_segments_and_sregs(mem: &GuestMemoryMmap, sregs: &mut kvm_sregs) ->
     sregs.ss = data_seg;
     sregs.tr = tss_seg;
 
-    // 64-bit protected mode
-    sregs.cr0 |= X86_CR0_PE;
-    sregs.efer |= EFER_LME | EFER_LMA;
+    match boot_prot {
+        BootProtocol::PvhBoot => {
+            sregs.cr0 = X86_CR0_PE | X86_CR0_ET;
+            sregs.cr4 = 0;
+        }
+        BootProtocol::LinuxBoot => {
+            // 64-bit protected mode
+            sregs.cr0 |= X86_CR0_PE;
+            sregs.efer |= EFER_LME | EFER_LMA;
+        }
+    }
 
     Ok(())
 }
@@ -287,24 +334,45 @@ mod tests {
         gm.read_obj(read_addr).unwrap()
     }
 
-    fn validate_segments_and_sregs(gm: &GuestMemoryMmap, sregs: &kvm_sregs) {
+    fn validate_segments_and_sregs(
+        gm: &GuestMemoryMmap,
+        sregs: &kvm_sregs,
+        boot_prot: BootProtocol,
+    ) {
+        if let BootProtocol::LinuxBoot = boot_prot {
+            assert_eq!(0xaf_9b00_0000_ffff, read_u64(gm, BOOT_GDT_OFFSET + 8));
+            assert_eq!(0xcf_9300_0000_ffff, read_u64(gm, BOOT_GDT_OFFSET + 16));
+            assert_eq!(0x8f_8b00_0000_ffff, read_u64(gm, BOOT_GDT_OFFSET + 24));
+
+            assert_eq!(0xffff_ffff, sregs.tr.limit);
+
+            assert!(sregs.cr0 & X86_CR0_PE != 0);
+            assert!(sregs.efer & EFER_LME != 0 && sregs.efer & EFER_LMA != 0);
+        } else {
+            // Validate values that are specific to PVH boot protocol
+            assert_eq!(0xcf_9b00_0000_ffff, read_u64(gm, BOOT_GDT_OFFSET + 8));
+            assert_eq!(0xcf_9300_0000_ffff, read_u64(gm, BOOT_GDT_OFFSET + 16));
+            assert_eq!(0x00_8b00_0000_0067, read_u64(gm, BOOT_GDT_OFFSET + 24));
+
+            assert_eq!(0x67, sregs.tr.limit);
+            assert_eq!(0, sregs.tr.g);
+
+            assert!(sregs.cr0 & X86_CR0_PE != 0 && sregs.cr0 & X86_CR0_ET != 0);
+            assert_eq!(0, sregs.cr4);
+        }
+
+        // Common settings for both PVH and Linux boot protocol
         assert_eq!(0x0, read_u64(gm, BOOT_GDT_OFFSET));
-        assert_eq!(0xaf_9b00_0000_ffff, read_u64(gm, BOOT_GDT_OFFSET + 8));
-        assert_eq!(0xcf_9300_0000_ffff, read_u64(gm, BOOT_GDT_OFFSET + 16));
-        assert_eq!(0x8f_8b00_0000_ffff, read_u64(gm, BOOT_GDT_OFFSET + 24));
         assert_eq!(0x0, read_u64(gm, BOOT_IDT_OFFSET));
 
         assert_eq!(0, sregs.cs.base);
-        assert_eq!(0xfffff, sregs.ds.limit);
+        assert_eq!(0xffff_ffff, sregs.ds.limit);
         assert_eq!(0x10, sregs.es.selector);
         assert_eq!(1, sregs.fs.present);
         assert_eq!(1, sregs.gs.g);
         assert_eq!(0, sregs.ss.avl);
         assert_eq!(0, sregs.tr.base);
-        assert_eq!(0xfffff, sregs.tr.limit);
         assert_eq!(0, sregs.tr.avl);
-        assert!(sregs.cr0 & X86_CR0_PE != 0);
-        assert!(sregs.efer & EFER_LME != 0 && sregs.efer & EFER_LMA != 0);
     }
 
     fn validate_page_tables(gm: &GuestMemoryMmap, sregs: &kvm_sregs) {
@@ -356,7 +424,12 @@ mod tests {
             ..Default::default()
         };
 
-        setup_regs(&vcpu, expected_regs.rip).unwrap();
+        let entry_point: EntryPoint = EntryPoint {
+            entry_addr: GuestAddress(expected_regs.rip),
+            protocol: BootProtocol::LinuxBoot,
+        };
+
+        setup_regs(&vcpu, entry_point).unwrap();
 
         let actual_regs: kvm_regs = vcpu.get_regs().unwrap();
         assert_eq!(actual_regs, expected_regs);
@@ -369,16 +442,22 @@ mod tests {
         let vcpu = vm.create_vcpu(0).unwrap();
         let gm = create_guest_mem(None);
 
-        assert!(vcpu.set_sregs(&Default::default()).is_ok());
-        setup_sregs(&gm, &vcpu).unwrap();
-
-        let mut sregs: kvm_sregs = vcpu.get_sregs().unwrap();
-        // for AMD KVM_GET_SREGS returns g = 0 for each kvm_segment.
-        // We set it to 1, otherwise the test will fail.
-        sregs.gs.g = 1;
-
-        validate_segments_and_sregs(&gm, &sregs);
-        validate_page_tables(&gm, &sregs);
+        [BootProtocol::LinuxBoot, BootProtocol::PvhBoot]
+            .iter()
+            .for_each(|boot_prot| {
+                assert!(vcpu.set_sregs(&Default::default()).is_ok());
+                setup_sregs(&gm, &vcpu, *boot_prot).unwrap();
+
+                let mut sregs: kvm_sregs = vcpu.get_sregs().unwrap();
+                // for AMD KVM_GET_SREGS returns g = 0 for each kvm_segment.
+                // We set it to 1, otherwise the test will fail.
+                sregs.gs.g = 1;
+
+                validate_segments_and_sregs(&gm, &sregs, *boot_prot);
+                if let BootProtocol::LinuxBoot = *boot_prot {
+                    validate_page_tables(&gm, &sregs);
+                }
+            });
     }
 
     #[test]
@@ -423,9 +502,13 @@ mod tests {
     fn test_configure_segments_and_sregs() {
         let mut sregs: kvm_sregs = Default::default();
         let gm = create_guest_mem(None);
-        configure_segments_and_sregs(&gm, &mut sregs).unwrap();
+        configure_segments_and_sregs(&gm, &mut sregs, BootProtocol::LinuxBoot).unwrap();
+
+        validate_segments_and_sregs(&gm, &sregs, BootProtocol::LinuxBoot);
+
+        configure_segments_and_sregs(&gm, &mut sregs, BootProtocol::PvhBoot).unwrap();
 
-        validate_segments_and_sregs(&gm, &sregs);
+        validate_segments_and_sregs(&gm, &sregs, BootProtocol::PvhBoot);
     }
 
     #[test]
diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs
index f2812b6a50b1..2b8a592cb67b 100644
--- a/src/vmm/src/builder.rs
+++ b/src/vmm/src/builder.rs
@@ -385,7 +385,7 @@ pub fn build_microvm_for_boot(
         &vmm,
         vcpus.as_mut(),
         vcpu_config,
-        entry_point.entry_addr,
+        entry_point,
         &initrd,
         boot_cmdline,
     )?;
@@ -849,7 +849,7 @@ pub fn configure_system_for_boot(
     vmm: &Vmm,
     vcpus: &mut [Vcpu],
     vcpu_config: VcpuConfig,
-    entry_addr: GuestAddress,
+    entry_point: EntryPoint,
     initrd: &Option<InitrdConfig>,
     boot_cmdline: LoaderKernelCmdline,
 ) -> std::result::Result<(), StartMicrovmError> {
@@ -860,7 +860,7 @@ pub fn configure_system_for_boot(
         vcpu.kvm_vcpu
             .configure(
                 vmm.guest_memory(),
-                entry_addr,
+                entry_point,
                 &vcpu_config,
                 vmm.vm.supported_cpuid().clone(),
             )
@@ -893,7 +893,7 @@ pub fn configure_system_for_boot(
     {
         for vcpu in vcpus.iter_mut() {
             vcpu.kvm_vcpu
-                .configure(vmm.guest_memory(), entry_addr)
+                .configure(vmm.guest_memory(), entry_point.entry_addr)
                 .map_err(Error::VcpuConfigure)
                 .map_err(Internal)?;
         }
diff --git a/src/vmm/src/vstate/vcpu/mod.rs b/src/vmm/src/vstate/vcpu/mod.rs
index ae835719acf8..919692feaebc 100644
--- a/src/vmm/src/vstate/vcpu/mod.rs
+++ b/src/vmm/src/vstate/vcpu/mod.rs
@@ -666,6 +666,7 @@ mod tests {
     use std::fmt;
     use std::sync::{Arc, Barrier, Mutex};
 
+    use arch::{BootProtocol, EntryPoint};
     use linux_loader::loader::KernelLoader;
     use utils::errno;
     use utils::signal::validate_signal_num;
@@ -911,11 +912,14 @@ mod tests {
         let vcpu_exit_evt = vcpu.exit_evt.try_clone().unwrap();
 
         // Needs a kernel since we'll actually run this vcpu.
-        let entry_addr = load_good_kernel(&vm_mem);
+        let entry_point = EntryPoint {
+            entry_addr: load_good_kernel(&vm_mem),
+            protocol: BootProtocol::LinuxBoot,
+        };
 
         #[cfg(target_arch = "aarch64")]
         vcpu.kvm_vcpu
-            .configure(&vm_mem, entry_addr)
+            .configure(&vm_mem, entry_point)
             .expect("failed to configure vcpu");
         #[cfg(target_arch = "x86_64")]
         {
@@ -927,7 +931,7 @@ mod tests {
             vcpu.kvm_vcpu
                 .configure(
                     &vm_mem,
-                    entry_addr,
+                    entry_point,
                     &vcpu_config,
                     _vm.supported_cpuid().clone(),
                 )
diff --git a/src/vmm/src/vstate/vcpu/x86_64.rs b/src/vmm/src/vstate/vcpu/x86_64.rs
index a20443537051..892522c985d4 100644
--- a/src/vmm/src/vstate/vcpu/x86_64.rs
+++ b/src/vmm/src/vstate/vcpu/x86_64.rs
@@ -12,6 +12,7 @@ use std::{fmt, result};
 use arch::x86_64::interrupts;
 use arch::x86_64::msr::SetMSRsError;
 use arch::x86_64::regs::{SetupFpuError, SetupRegistersError, SetupSpecialRegistersError};
+use arch::EntryPoint;
 use cpuid::{c3, filter_cpuid, msrs_to_save_by_cpuid, t2, t2s, VmSpec};
 use kvm_bindings::{
     kvm_debugregs, kvm_lapic_state, kvm_mp_state, kvm_regs, kvm_sregs, kvm_vcpu_events, kvm_xcrs,
@@ -21,7 +22,7 @@ use kvm_ioctls::{VcpuExit, VcpuFd};
 use logger::{error, warn, IncMetric, METRICS};
 use versionize::{VersionMap, Versionize, VersionizeError, VersionizeResult};
 use versionize_derive::Versionize;
-use vm_memory::{Address, GuestAddress, GuestMemoryMmap};
+use vm_memory::GuestMemoryMmap;
 
 use crate::vmm_config::machine_config::CpuFeaturesTemplate;
 use crate::vstate::vcpu::{VcpuConfig, VcpuEmulation};
@@ -244,13 +245,14 @@ impl KvmVcpu {
     /// # Arguments
     ///
     /// * `guest_mem` - The guest memory used by this microvm.
-    /// * `kernel_start_addr` - Offset from `guest_mem` at which the kernel starts.
+    /// * `kernel_entry_point` - Specifies the boot protocol and offset from `guest_mem` at which
+    ///   the kernel starts.
     /// * `vcpu_config` - The vCPU configuration.
     /// * `cpuid` - The capabilities exposed by this vCPU.
     pub fn configure(
         &mut self,
         guest_mem: &GuestMemoryMmap,
-        kernel_start_addr: GuestAddress,
+        kernel_entry_point: EntryPoint,
         vcpu_config: &VcpuConfig,
         mut cpuid: CpuId,
     ) -> std::result::Result<(), KvmVcpuConfigureError> {
@@ -315,9 +317,9 @@ impl KvmVcpu {
         // MSRs defined by the template`
         arch::x86_64::msr::set_msrs(&self.fd, &msr_boot_entries)?;
 
-        arch::x86_64::regs::setup_regs(&self.fd, kernel_start_addr.raw_value())?;
+        arch::x86_64::regs::setup_regs(&self.fd, kernel_entry_point)?;
         arch::x86_64::regs::setup_fpu(&self.fd)?;
-        arch::x86_64::regs::setup_sregs(guest_mem, &self.fd)?;
+        arch::x86_64::regs::setup_sregs(guest_mem, &self.fd, kernel_entry_point.protocol)?;
         arch::x86_64::interrupts::set_lint(&self.fd)?;
         Ok(())
     }
@@ -619,8 +621,10 @@ mod tests {
 
     use std::os::unix::io::AsRawFd;
 
+    use arch::BootProtocol;
     use cpuid::common::{get_vendor_id_from_host, VENDOR_ID_INTEL};
     use kvm_ioctls::Cap;
+    use vm_memory::GuestAddress;
 
     use super::*;
     use crate::vstate::vm::tests::setup_vm;
@@ -662,10 +666,15 @@ mod tests {
             cpu_template: CpuFeaturesTemplate::None,
         };
 
+        let entry_point = EntryPoint {
+            entry_addr: GuestAddress(0),
+            protocol: BootProtocol::LinuxBoot,
+        };
+
         assert!(vcpu
             .configure(
                 &vm_mem,
-                GuestAddress(0),
+                entry_point,
                 &vcpu_config,
                 vm.supported_cpuid().clone()
             )
@@ -675,7 +684,7 @@ mod tests {
         vcpu_config.cpu_template = CpuFeaturesTemplate::T2;
         let t2_res = vcpu.configure(
             &vm_mem,
-            GuestAddress(arch::get_kernel_start()),
+            entry_point,
             &vcpu_config,
             vm.supported_cpuid().clone(),
         );
@@ -684,7 +693,7 @@ mod tests {
         vcpu_config.cpu_template = CpuFeaturesTemplate::C3;
         let c3_res = vcpu.configure(
             &vm_mem,
-            GuestAddress(0),
+            entry_point,
             &vcpu_config,
             vm.supported_cpuid().clone(),
        );
@@ -693,7 +702,7 @@ mod tests {
         vcpu_config.cpu_template = CpuFeaturesTemplate::T2S;
         let t2s_res = vcpu.configure(
             &vm_mem,
-            GuestAddress(0),
+            entry_point,
             &vcpu_config,
             vm.supported_cpuid().clone(),
        );
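
Note: the self-contained sketch below (not part of the patch) illustrates the limit scaling that the gdt.rs comment above describes, using the same descriptor values the patch's tests assert on (0xc09b/0xffff_ffff for the PVH code segment, 0xa09b/0xfffff for the Linux code segment, 0x008b/0x67 for the PVH TSS). The gdt_entry() and get_g() bodies here are local re-implementations written only for illustration; the patch shows their call sites and expected outputs, not their source, so treat the helper names and bodies (including scaled_limit) as assumptions rather than the crate's exact code.

// Illustrative only: reproduces the flat-descriptor packing implied by the
// GDT constants asserted in the tests above.
fn gdt_entry(flags: u16, base: u32, limit: u32) -> u64 {
    ((u64::from(base) & 0xff00_0000u64) << (56 - 24))
        | ((u64::from(flags) & 0x0000_f0ffu64) << 40)
        | ((u64::from(limit) & 0x000f_0000u64) << (48 - 16))
        | ((u64::from(base) & 0x00ff_ffffu64) << 16)
        | (u64::from(limit) & 0x0000_ffffu64)
}

// The G (granularity) flag is bit 55 of the descriptor.
fn get_g(entry: u64) -> u8 {
    ((entry & 0x0080_0000_0000_0000) >> 55) as u8
}

// 20-bit limit, scaled by 4 KiB when G is set, as the gdt.rs comment describes.
fn scaled_limit(entry: u64) -> u32 {
    let limit = (((entry & 0x000F_0000_0000_0000) >> 32) | (entry & 0x0000_0000_0000_FFFF)) as u32;
    match get_g(entry) {
        0 => limit,
        _ => (limit << 12) | 0xFFF,
    }
}

fn main() {
    // PVH flat 32-bit code segment: G = 1, raw 20-bit limit 0xFFFFF.
    let pvh_code = gdt_entry(0xc09b, 0, 0xffff_ffff);
    assert_eq!(0xcf_9b00_0000_ffff, pvh_code);
    assert_eq!(0xffff_ffff, scaled_limit(pvh_code)); // effective limit covers 4 GiB

    // Linux 64-bit code segment: also G = 1, though long mode ignores limit checks.
    let linux_code = gdt_entry(0xa09b, 0, 0xfffff);
    assert_eq!(0xaf_9b00_0000_ffff, linux_code);
    assert_eq!(0xffff_ffff, scaled_limit(linux_code));

    // PVH TSS descriptor: byte granularity (G = 0), so the limit stays 0x67.
    let pvh_tss = gdt_entry(0x008b, 0, 0x67);
    assert_eq!(0x00_8b00_0000_0067, pvh_tss);
    assert_eq!(0x67, scaled_limit(pvh_tss));
}

Running the sketch confirms why the patch scales the limit manually: a G=1 descriptor built with a 20-bit limit expands to the 0xFFFF_FFFF limit that a protected-mode PVH guest needs in the VMCS, while the byte-granular TSS descriptor keeps its small limit unchanged.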