Skip to content

Commit

Permalink
pvh/arch-x86_64: Initialize vCPU regs for PVH
Browse files Browse the repository at this point in the history
Set the initial values of the KVM vCPU registers as specified in
the PVH boot ABI:

https://xenbits.xen.org/docs/unstable/misc/pvh.html

Signed-off-by: Colin Percival <[email protected]>
Co-authored-by: Alejandro Jimenez <[email protected]>
  • Loading branch information
cperciva and aljimenezb committed Dec 29, 2022
1 parent a9ebc4a commit fa44730
Show file tree
Hide file tree
Showing 6 changed files with 192 additions and 65 deletions.
32 changes: 30 additions & 2 deletions src/arch/src/x86_64/gdt.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
// Copyright © 2020, Oracle and/or its affiliates.
//
// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0
//
Expand All @@ -24,8 +26,34 @@ fn get_base(entry: u64) -> u64 {
| (((entry) & 0x0000_0000_FFFF_0000) >> 16)
}

// Extract the segment limit from the GDT segment descriptor.
//
// In a segment descriptor, the limit field is 20 bits, so it can directly describe
// a range from 0 to 0xFFFFF (1 MB). When G flag is set (4-KByte page granularity) it
// scales the value in the limit field by a factor of 2^12 (4 Kbytes), making the effective
// limit range from 0xFFF (4 KBytes) to 0xFFFF_FFFF (4 GBytes).
//
// However, the limit field in the VMCS definition is a 32 bit field, and the limit value is not
// automatically scaled using the G flag. This means that for a desired range of 4GB for a
// given segment, its limit must be specified as 0xFFFF_FFFF. Therefore the method of obtaining
// the limit from the GDT entry is not sufficient, since it only provides 20 bits when 32 bits
// are necessary. Fortunately, we can check if the G flag is set when extracting the limit since
// the full GDT entry is passed as an argument, and perform the scaling of the limit value to
// return the full 32 bit value.
//
// The scaling mentioned above is required when using PVH boot, since the guest boots in protected
// (32-bit) mode and must be able to access the entire 32-bit address space. It does not cause
// issues for the case of direct boot to 64-bit (long) mode, since in 64-bit mode the processor does
// not perform runtime limit checking on code or data segments.
fn get_limit(entry: u64) -> u32 {
((((entry) & 0x000F_0000_0000_0000) >> 32) | ((entry) & 0x0000_0000_0000_FFFF)) as u32
let limit: u32 =
((((entry) & 0x000F_0000_0000_0000) >> 32) | ((entry) & 0x0000_0000_0000_FFFF)) as u32;

// Perform manual limit scaling if G flag is set
match get_g(entry) {
0 => limit,
_ => (limit << 12) | 0xFFF, // G flag is either 0 or 1
}
}

fn get_g(entry: u64) -> u8 {
Expand Down Expand Up @@ -109,7 +137,7 @@ mod tests {
assert_eq!(0xB, seg.type_);
// base and limit
assert_eq!(0x10_0000, seg.base);
assert_eq!(0xfffff, seg.limit);
assert_eq!(0xffff_ffff, seg.limit);
assert_eq!(0x0, seg.unusable);
}
}
3 changes: 3 additions & 0 deletions src/arch/src/x86_64/layout.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,5 +27,8 @@ pub const IRQ_MAX: u32 = 23;
/// Address for the TSS setup.
pub const KVM_TSS_ADDRESS: u64 = 0xfffb_d000;

/// Address of the hvm_start_info struct used in PVH boot
pub const PVH_INFO_START: u64 = 0x6000;

/// The 'zero page', a.k.a linux kernel bootparams.
pub const ZERO_PAGE_START: u64 = 0x7000;
177 changes: 130 additions & 47 deletions src/arch/src/x86_64/regs.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
// Copyright © 2020, Oracle and/or its affiliates.
// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0
//
Expand All @@ -11,6 +12,7 @@ use kvm_bindings::{kvm_fpu, kvm_regs, kvm_sregs};
use kvm_ioctls::VcpuFd;
use vm_memory::{Address, Bytes, GuestAddress, GuestMemory, GuestMemoryMmap};

use super::super::{BootProtocol, EntryPoint};
use super::gdt::{gdt_entry, kvm_segment_from_gdt};

// Initial pagetables.
Expand Down Expand Up @@ -100,20 +102,33 @@ impl fmt::Display for SetupRegistersError {
/// # Errors
///
/// When [`kvm_ioctls::ioctls::vcpu::VcpuFd::set_regs`] errors.
pub fn setup_regs(vcpu: &VcpuFd, boot_ip: u64) -> std::result::Result<(), SetupRegistersError> {
let regs: kvm_regs = kvm_regs {
rflags: 0x0000_0000_0000_0002u64,
rip: boot_ip,
// Frame pointer. It gets a snapshot of the stack pointer (rsp) so that when adjustments are
// made to rsp (i.e. reserving space for local variables or pushing values on to the stack),
// local variables and function parameters are still accessible from a constant offset from
// rbp.
rsp: super::layout::BOOT_STACK_POINTER,
// Starting stack pointer.
rbp: super::layout::BOOT_STACK_POINTER,
// Must point to zero page address per Linux ABI. This is x86_64 specific.
rsi: super::layout::ZERO_PAGE_START,
..Default::default()
pub fn setup_regs(
vcpu: &VcpuFd,
entry_point: EntryPoint,
) -> std::result::Result<(), SetupRegistersError> {
let regs: kvm_regs = match entry_point.protocol {
BootProtocol::PvhBoot => kvm_regs {
// Configure regs as required by PVH boot protocol.
rflags: 0x0000_0000_0000_0002u64,
rbx: super::layout::PVH_INFO_START,
rip: entry_point.entry_addr.raw_value(),
..Default::default()
},
BootProtocol::LinuxBoot => kvm_regs {
// Configure regs as required by Linux 64-bit boot protocol.
rflags: 0x0000_0000_0000_0002u64,
rip: entry_point.entry_addr.raw_value(),
// Starting stack pointer.
rsp: super::layout::BOOT_STACK_POINTER,
// Frame pointer. It gets a snapshot of the stack pointer (rsp) so that when adjustments
// are made to rsp (i.e. reserving space for local variables or pushing
// values on to the stack), local variables and function parameters are
// still accessible from a constant offset from rbp.
rbp: super::layout::BOOT_STACK_POINTER,
// Must point to zero page address per Linux ABI. This is x86_64 specific.
rsi: super::layout::ZERO_PAGE_START,
..Default::default()
},
};

vcpu.set_regs(&regs).map_err(SetupRegistersError)
Expand Down Expand Up @@ -142,6 +157,7 @@ pub enum SetupSpecialRegistersError {
///
/// * `mem` - The memory that will be passed to the guest.
/// * `vcpu` - Structure for the VCPU that holds the VCPU's fd.
/// * `boot_prot` - The boot protocol being used.
///
/// # Errors
///
Expand All @@ -153,14 +169,18 @@ pub enum SetupSpecialRegistersError {
pub fn setup_sregs(
mem: &GuestMemoryMmap,
vcpu: &VcpuFd,
boot_prot: BootProtocol,
) -> std::result::Result<(), SetupSpecialRegistersError> {
let mut sregs: kvm_sregs = vcpu
.get_sregs()
.map_err(SetupSpecialRegistersError::GetSpecialRegisters)?;

configure_segments_and_sregs(mem, &mut sregs)
configure_segments_and_sregs(mem, &mut sregs, boot_prot)
.map_err(SetupSpecialRegistersError::ConfigureSegmentsAndSpecialRegisters)?;
setup_page_tables(mem, &mut sregs).map_err(SetupSpecialRegistersError::SetupPageTables)?; // TODO(dgreid) - Can this be done once per system instead?
if let BootProtocol::LinuxBoot = boot_prot {
setup_page_tables(mem, &mut sregs).map_err(SetupSpecialRegistersError::SetupPageTables)?;
// TODO(dgreid) - Can this be done once per system instead?
}

vcpu.set_sregs(&sregs)
.map_err(SetupSpecialRegistersError::SetSpecialRegisters)
Expand All @@ -175,6 +195,7 @@ const EFER_LMA: u64 = 0x400;
const EFER_LME: u64 = 0x100;

const X86_CR0_PE: u64 = 0x1;
const X86_CR0_ET: u64 = 0x10;
const X86_CR0_PG: u64 = 0x8000_0000;
const X86_CR4_PAE: u64 = 0x20;

Expand All @@ -198,13 +219,31 @@ fn write_idt_value(val: u64, guest_mem: &GuestMemoryMmap) -> Result<()> {
.map_err(|_| Error::WriteIDT)
}

fn configure_segments_and_sregs(mem: &GuestMemoryMmap, sregs: &mut kvm_sregs) -> Result<()> {
let gdt_table: [u64; BOOT_GDT_MAX] = [
gdt_entry(0, 0, 0), // NULL
gdt_entry(0xa09b, 0, 0xfffff), // CODE
gdt_entry(0xc093, 0, 0xfffff), // DATA
gdt_entry(0x808b, 0, 0xfffff), // TSS
];
fn configure_segments_and_sregs(
mem: &GuestMemoryMmap,
sregs: &mut kvm_sregs,
boot_prot: BootProtocol,
) -> Result<()> {
let gdt_table: [u64; BOOT_GDT_MAX] = match boot_prot {
BootProtocol::PvhBoot => {
// Configure GDT entries as specified by PVH boot protocol
[
gdt_entry(0, 0, 0), // NULL
gdt_entry(0xc09b, 0, 0xffff_ffff), // CODE
gdt_entry(0xc093, 0, 0xffff_ffff), // DATA
gdt_entry(0x008b, 0, 0x67), // TSS
]
}
BootProtocol::LinuxBoot => {
// Configure GDT entries as specified by Linux 64bit boot protocol
[
gdt_entry(0, 0, 0), // NULL
gdt_entry(0xa09b, 0, 0xfffff), // CODE
gdt_entry(0xc093, 0, 0xfffff), // DATA
gdt_entry(0x808b, 0, 0xfffff), // TSS
]
}
};

let code_seg = kvm_segment_from_gdt(gdt_table[1], 1);
let data_seg = kvm_segment_from_gdt(gdt_table[2], 2);
Expand All @@ -227,9 +266,17 @@ fn configure_segments_and_sregs(mem: &GuestMemoryMmap, sregs: &mut kvm_sregs) ->
sregs.ss = data_seg;
sregs.tr = tss_seg;

// 64-bit protected mode
sregs.cr0 |= X86_CR0_PE;
sregs.efer |= EFER_LME | EFER_LMA;
match boot_prot {
BootProtocol::PvhBoot => {
sregs.cr0 = X86_CR0_PE | X86_CR0_ET;
sregs.cr4 = 0;
}
BootProtocol::LinuxBoot => {
// 64-bit protected mode
sregs.cr0 |= X86_CR0_PE;
sregs.efer |= EFER_LME | EFER_LMA;
}
}

Ok(())
}
Expand Down Expand Up @@ -287,24 +334,45 @@ mod tests {
gm.read_obj(read_addr).unwrap()
}

fn validate_segments_and_sregs(gm: &GuestMemoryMmap, sregs: &kvm_sregs) {
fn validate_segments_and_sregs(
gm: &GuestMemoryMmap,
sregs: &kvm_sregs,
boot_prot: BootProtocol,
) {
if let BootProtocol::LinuxBoot = boot_prot {
assert_eq!(0xaf_9b00_0000_ffff, read_u64(gm, BOOT_GDT_OFFSET + 8));
assert_eq!(0xcf_9300_0000_ffff, read_u64(gm, BOOT_GDT_OFFSET + 16));
assert_eq!(0x8f_8b00_0000_ffff, read_u64(gm, BOOT_GDT_OFFSET + 24));

assert_eq!(0xffff_ffff, sregs.tr.limit);

assert!(sregs.cr0 & X86_CR0_PE != 0);
assert!(sregs.efer & EFER_LME != 0 && sregs.efer & EFER_LMA != 0);
} else {
// Validate values that are specific to PVH boot protocol
assert_eq!(0xcf_9b00_0000_ffff, read_u64(gm, BOOT_GDT_OFFSET + 8));
assert_eq!(0xcf_9300_0000_ffff, read_u64(gm, BOOT_GDT_OFFSET + 16));
assert_eq!(0x00_8b00_0000_0067, read_u64(gm, BOOT_GDT_OFFSET + 24));

assert_eq!(0x67, sregs.tr.limit);
assert_eq!(0, sregs.tr.g);

assert!(sregs.cr0 & X86_CR0_PE != 0 && sregs.cr0 & X86_CR0_ET != 0);
assert_eq!(0, sregs.cr4);
}

// Common settings for both PVH and Linux boot protocol
assert_eq!(0x0, read_u64(gm, BOOT_GDT_OFFSET));
assert_eq!(0xaf_9b00_0000_ffff, read_u64(gm, BOOT_GDT_OFFSET + 8));
assert_eq!(0xcf_9300_0000_ffff, read_u64(gm, BOOT_GDT_OFFSET + 16));
assert_eq!(0x8f_8b00_0000_ffff, read_u64(gm, BOOT_GDT_OFFSET + 24));
assert_eq!(0x0, read_u64(gm, BOOT_IDT_OFFSET));

assert_eq!(0, sregs.cs.base);
assert_eq!(0xfffff, sregs.ds.limit);
assert_eq!(0xffff_ffff, sregs.ds.limit);
assert_eq!(0x10, sregs.es.selector);
assert_eq!(1, sregs.fs.present);
assert_eq!(1, sregs.gs.g);
assert_eq!(0, sregs.ss.avl);
assert_eq!(0, sregs.tr.base);
assert_eq!(0xfffff, sregs.tr.limit);
assert_eq!(0, sregs.tr.avl);
assert!(sregs.cr0 & X86_CR0_PE != 0);
assert!(sregs.efer & EFER_LME != 0 && sregs.efer & EFER_LMA != 0);
}

fn validate_page_tables(gm: &GuestMemoryMmap, sregs: &kvm_sregs) {
Expand Down Expand Up @@ -356,7 +424,12 @@ mod tests {
..Default::default()
};

setup_regs(&vcpu, expected_regs.rip).unwrap();
let entry_point: EntryPoint = EntryPoint {
entry_addr: GuestAddress(expected_regs.rip),
protocol: BootProtocol::LinuxBoot,
};

setup_regs(&vcpu, entry_point).unwrap();

let actual_regs: kvm_regs = vcpu.get_regs().unwrap();
assert_eq!(actual_regs, expected_regs);
Expand All @@ -369,16 +442,22 @@ mod tests {
let vcpu = vm.create_vcpu(0).unwrap();
let gm = create_guest_mem(None);

assert!(vcpu.set_sregs(&Default::default()).is_ok());
setup_sregs(&gm, &vcpu).unwrap();

let mut sregs: kvm_sregs = vcpu.get_sregs().unwrap();
// for AMD KVM_GET_SREGS returns g = 0 for each kvm_segment.
// We set it to 1, otherwise the test will fail.
sregs.gs.g = 1;

validate_segments_and_sregs(&gm, &sregs);
validate_page_tables(&gm, &sregs);
[BootProtocol::LinuxBoot, BootProtocol::PvhBoot]
.iter()
.for_each(|boot_prot| {
assert!(vcpu.set_sregs(&Default::default()).is_ok());
setup_sregs(&gm, &vcpu, *boot_prot).unwrap();

let mut sregs: kvm_sregs = vcpu.get_sregs().unwrap();
// On AMD, KVM_GET_SREGS returns g = 0 for each kvm_segment.
// We set it to 1 here; otherwise the test would fail.
sregs.gs.g = 1;

validate_segments_and_sregs(&gm, &sregs, *boot_prot);
if let BootProtocol::LinuxBoot = *boot_prot {
validate_page_tables(&gm, &sregs);
}
});
}

#[test]
Expand Down Expand Up @@ -423,9 +502,13 @@ mod tests {
fn test_configure_segments_and_sregs() {
let mut sregs: kvm_sregs = Default::default();
let gm = create_guest_mem(None);
configure_segments_and_sregs(&gm, &mut sregs).unwrap();
configure_segments_and_sregs(&gm, &mut sregs, BootProtocol::LinuxBoot).unwrap();

validate_segments_and_sregs(&gm, &sregs, BootProtocol::LinuxBoot);

configure_segments_and_sregs(&gm, &mut sregs, BootProtocol::PvhBoot).unwrap();

validate_segments_and_sregs(&gm, &sregs);
validate_segments_and_sregs(&gm, &sregs, BootProtocol::PvhBoot);
}

#[test]
Expand Down
8 changes: 4 additions & 4 deletions src/vmm/src/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -385,7 +385,7 @@ pub fn build_microvm_for_boot(
&vmm,
vcpus.as_mut(),
vcpu_config,
entry_point.entry_addr,
entry_point,
&initrd,
boot_cmdline,
)?;
Expand Down Expand Up @@ -849,7 +849,7 @@ pub fn configure_system_for_boot(
vmm: &Vmm,
vcpus: &mut [Vcpu],
vcpu_config: VcpuConfig,
entry_addr: GuestAddress,
entry_point: EntryPoint,
initrd: &Option<InitrdConfig>,
boot_cmdline: LoaderKernelCmdline,
) -> std::result::Result<(), StartMicrovmError> {
Expand All @@ -860,7 +860,7 @@ pub fn configure_system_for_boot(
vcpu.kvm_vcpu
.configure(
vmm.guest_memory(),
entry_addr,
entry_point,
&vcpu_config,
vmm.vm.supported_cpuid().clone(),
)
Expand Down Expand Up @@ -893,7 +893,7 @@ pub fn configure_system_for_boot(
{
for vcpu in vcpus.iter_mut() {
vcpu.kvm_vcpu
.configure(vmm.guest_memory(), entry_addr)
.configure(vmm.guest_memory(), entry_point.entry_addr)
.map_err(Error::VcpuConfigure)
.map_err(Internal)?;
}
Expand Down
Loading

0 comments on commit fa44730

Please sign in to comment.