Skip to content

Commit

Permalink
fix(snapshot/x86_64): make sure TSC_DEADLINE MSR is non-zero
Browse files Browse the repository at this point in the history
On x86_64, we observed that when restoring from a snapshot,
one of the vCPUs had MSR_IA32_TSC_DEADLINE cleared and never
received TSC interrupts until the MSR is updated externally
(eg by setting the system time).

We believe this happens because the TSC interrupt is lost
during snapshot taking process: the MSR is cleared, but the
interrupt is not delivered to the guest, so the guest
does not rearm the timer.

A visible effect of that is failure to connect to a restored VM
via SSH.

This commit introduces a workaround. If when taking a snapshot,
we see a zero MSR_IA32_TSC_DEADLINE, we replace its value with
the maximum MSR_IA32_TSC_DEADLINE value across all vCPUs.
This makes sure the vCPU will continue to receive TSC interrupts.

Signed-off-by: Nikita Kalyazin <[email protected]>
  • Loading branch information
kalyazin committed May 19, 2024
1 parent 3853362 commit 79452f4
Show file tree
Hide file tree
Showing 4 changed files with 118 additions and 1 deletion.
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,12 @@ and this project adheres to
UFFD support not being forward-compatible with new ioctl options introduced in
Linux 6.6. See also
https://github.com/bytecodealliance/userfaultfd-rs/issues/61.
- [#4618](https://github.com/firecracker-microvm/firecracker/pull/4618): On
x86_64, when taking a snapshot, if a vCPU has MSR_IA32_TSC_DEADLINE set to 0,
Firecracker will replace it with the maximum MSR_IA32_TSC_DEADLINE value
across all vCPUs. This is to guarantee that the vCPU will continue receiving
TSC interrupts after restoring from the snapshot even if an interrupt is lost
when taking a snapshot.

## \[1.7.0\]

Expand Down
5 changes: 5 additions & 0 deletions docs/snapshotting/snapshot-support.md
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,11 @@ The snapshot functionality is still in developer preview due to the following:
the data store is not persisted across snapshots.
- Configuration information for metrics and logs are not saved to the snapshot.
These need to be reconfigured on the restored microVM.
- On x86_64, if a vCPU has MSR_IA32_TSC_DEADLINE set to 0 when a snapshot is
taken, Firecracker replaces it with the maximum MSR_IA32_TSC_DEADLINE value
across all vCPUs. This is to guarantee that the vCPU will continue receiving
TSC interrupts after restoring from the snapshot even if an interrupt is lost
when taking a snapshot.

## Snapshot versioning

Expand Down
9 changes: 8 additions & 1 deletion src/vmm/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,8 @@ use utils::epoll::EventSet;
use utils::eventfd::EventFd;
use utils::terminal::Terminal;
use utils::u64_to_usize;
#[cfg(target_arch = "x86_64")]
use vstate::vcpu::fix_zero_tsc_deadline_msr;
use vstate::vcpu::{self, KvmVcpuConfigureError, StartThreadedError, VcpuSendEventError};

use crate::arch::DeviceType;
Expand Down Expand Up @@ -559,7 +561,9 @@ impl Vmm {
.collect::<Result<Vec<VcpuResponse>, RecvTimeoutError>>()
.map_err(|_| MicrovmStateError::UnexpectedVcpuResponse)?;

let vcpu_states = vcpu_responses
// This is because we only need to modify the states on x86_64, but not on aarch64.
#[allow(unused_mut)]
let mut vcpu_states = vcpu_responses
.into_iter()
.map(|response| match response {
VcpuResponse::SavedState(state) => Ok(*state),
Expand All @@ -569,6 +573,9 @@ impl Vmm {
})
.collect::<Result<Vec<VcpuState>, MicrovmStateError>>()?;

#[cfg(target_arch = "x86_64")]
fix_zero_tsc_deadline_msr(&mut vcpu_states);

Ok(vcpu_states)
}

Expand Down
99 changes: 99 additions & 0 deletions src/vmm/src/vstate/vcpu/x86_64.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ use serde::{Deserialize, Serialize};
use crate::arch::x86_64::interrupts;
use crate::arch::x86_64::msr::{create_boot_msr_entries, MsrError};
use crate::arch::x86_64::regs::{SetupFpuError, SetupRegistersError, SetupSpecialRegistersError};
use crate::arch_gen::x86::msr_index::MSR_IA32_TSC_DEADLINE;
use crate::cpu_config::x86_64::{cpuid, CpuConfiguration};
use crate::logger::{IncMetric, METRICS};
use crate::vstate::memory::{Address, GuestAddress, GuestMemoryMmap};
Expand Down Expand Up @@ -143,6 +144,40 @@ pub(super) struct Peripherals {
pub mmio_bus: Option<crate::devices::Bus>,
}

/// If any of per-vCPU IA32_TSC_DEADLINE MSRs are zero, update them
/// with the maximum value observed across all vCPUs.
///
/// We observed that sometimes when taking a snapshot,
/// the IA32_TSC_DEADLINE MSR is cleared, but the interrupt is not
/// delivered to the guest, leading to a situation where one
/// of the vCPUs never receives TSC interrupts after restoring,
/// until the MSR is updated externally, eg by setting the system time.
pub fn fix_zero_tsc_deadline_msr(vcpu_states: &mut [VcpuState]) {
let max_tsc_deadline_val = vcpu_states
.iter()
.flat_map(|state| state.saved_msrs.iter().flat_map(|msrs| msrs.as_slice()))
.filter(|msr| msr.index == MSR_IA32_TSC_DEADLINE)
.map(|msr| msr.data)
.max()
.unwrap_or(0);

for state in vcpu_states.iter_mut() {
for msr in state
.saved_msrs
.iter_mut()
.flat_map(|msrs| msrs.as_mut_slice())
{
if msr.index == MSR_IA32_TSC_DEADLINE && msr.data == 0 {
warn!(
"MSR_IA32_TSC_DEADLINE is 0, replacing with {:x}.",
max_tsc_deadline_val
);
msr.data = max_tsc_deadline_val;
}
}
}
}

impl KvmVcpu {
/// Constructs a new kvm vcpu with arch specific functionality.
///
Expand Down Expand Up @@ -594,10 +629,12 @@ mod tests {

use std::os::unix::io::AsRawFd;

use kvm_bindings::kvm_msr_entry;
use kvm_ioctls::Cap;

use super::*;
use crate::arch::x86_64::cpu_model::CpuModel;
use crate::arch_gen::x86::msr_index::MSR_TSC_AUX;
use crate::cpu_config::templates::{
CpuConfiguration, CpuTemplateType, CustomCpuTemplate, GetCpuTemplate, GuestConfigError,
StaticCpuTemplate,
Expand Down Expand Up @@ -949,4 +986,66 @@ mod tests {
}
}
}

macro_rules! vcpu_state_with_msr {
($index:expr, $data:expr) => {
VcpuState {
saved_msrs: vec![Msrs::from_entries(&[kvm_msr_entry {
index: $index,
data: $data,
..Default::default()
}])
.unwrap()],
..Default::default()
}
};
}

macro_rules! assert_vcpu_state_msr {
($vcpu_state:expr, $msr_index:expr, $expected_value:expr) => {
assert_eq!(
$vcpu_state
.saved_msrs
.iter()
.flat_map(|msrs_grp| msrs_grp.as_slice())
.filter(|msr| msr.index == $msr_index)
.map(|msr| msr.data)
.next()
.unwrap_or(0),
$expected_value
);
};
}

#[test]
fn test_fix_zero_tsc_deadline_msr_chooses_max() {
let mut vcpu_states = vec![
vcpu_state_with_msr!(MSR_IA32_TSC_DEADLINE, 0),
vcpu_state_with_msr!(MSR_IA32_TSC_DEADLINE, 1),
vcpu_state_with_msr!(MSR_IA32_TSC_DEADLINE, 2),
];

fix_zero_tsc_deadline_msr(&mut vcpu_states);

// We expect for vCPU 0 MSR to be updated with vCPU 2 MSR value
assert_vcpu_state_msr!(vcpu_states[0], MSR_IA32_TSC_DEADLINE, 2);
assert_vcpu_state_msr!(vcpu_states[1], MSR_IA32_TSC_DEADLINE, 1);
assert_vcpu_state_msr!(vcpu_states[2], MSR_IA32_TSC_DEADLINE, 2);
}

#[test]
fn test_fix_zero_tsc_deadline_msr_does_not_fix_other_msrs() {
let mut vcpu_states = vec![
vcpu_state_with_msr!(MSR_TSC_AUX, 0),
vcpu_state_with_msr!(MSR_TSC_AUX, 1),
vcpu_state_with_msr!(MSR_TSC_AUX, 2),
];

fix_zero_tsc_deadline_msr(&mut vcpu_states);

// We expect for all MSRs to remain the same
assert_vcpu_state_msr!(vcpu_states[0], MSR_TSC_AUX, 0);
assert_vcpu_state_msr!(vcpu_states[1], MSR_TSC_AUX, 1);
assert_vcpu_state_msr!(vcpu_states[2], MSR_TSC_AUX, 2);
}
}

0 comments on commit 79452f4

Please sign in to comment.