Skip to content

Commit

Permalink
KVM: x86/mmu: Use atomic ops to set SPTEs in TDP MMU map
Browse files Browse the repository at this point in the history
To prepare for handling page faults in parallel, change the TDP MMU
page fault handler to use atomic operations to set SPTEs so that changes
are not lost if multiple threads attempt to modify the same SPTE.

Reviewed-by: Peter Feiner <[email protected]>
Signed-off-by: Ben Gardon <[email protected]>

Message-Id: <[email protected]>
[Document new locking rules. - Paolo]
Signed-off-by: Paolo Bonzini <[email protected]>
  • Loading branch information
Ben Gardon authored and bonzini committed Feb 4, 2021
1 parent a9442f5 commit 9a77daa
Show file tree
Hide file tree
Showing 3 changed files with 130 additions and 34 deletions.
9 changes: 8 additions & 1 deletion Documentation/virt/kvm/locking.rst
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,14 @@ The acquisition orders for mutexes are as follows:
- kvm->slots_lock is taken outside kvm->irq_lock, though acquiring
them together is quite rare.

On x86, vcpu->mutex is taken outside kvm->arch.hyperv.hv_lock.
On x86:

- vcpu->mutex is taken outside kvm->arch.hyperv.hv_lock

- kvm->mmu_lock is an rwlock. kvm->arch.tdp_mmu_pages_lock is
taken inside kvm->mmu_lock, and cannot be taken without already
holding kvm->mmu_lock (typically with ``read_lock``, otherwise
there's no need to take kvm->arch.tdp_mmu_pages_lock at all).

Everything else is a leaf: no other lock is taken inside the critical
sections.
Expand Down
13 changes: 13 additions & 0 deletions arch/x86/include/asm/kvm_host.h
Original file line number Diff line number Diff line change
Expand Up @@ -1039,6 +1039,19 @@ struct kvm_arch {
* tdp_mmu_page set and a root_count of 0.
*/
struct list_head tdp_mmu_pages;

/*
* Protects accesses to the following fields when the MMU lock
* is held in read mode:
* - tdp_mmu_pages (above)
* - the link field of struct kvm_mmu_page used by the TDP MMU
* - lpage_disallowed_mmu_pages
* - the lpage_disallowed_link field of struct kvm_mmu_page used
* by the TDP MMU
* It is acceptable, but not necessary, to acquire this lock when
* the thread holds the MMU lock in write mode.
*/
spinlock_t tdp_mmu_pages_lock;
};

struct kvm_vm_stat {
Expand Down
142 changes: 109 additions & 33 deletions arch/x86/kvm/mmu/tdp_mmu.c
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#include "tdp_mmu.h"
#include "spte.h"

#include <asm/cmpxchg.h>
#include <trace/events/kvm.h>

#ifdef CONFIG_X86_64
Expand All @@ -33,6 +34,7 @@ void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
kvm->arch.tdp_mmu_enabled = true;

INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
}

Expand Down Expand Up @@ -225,7 +227,8 @@ static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
}

static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
u64 old_spte, u64 new_spte, int level);
u64 old_spte, u64 new_spte, int level,
bool shared);

static int kvm_mmu_page_as_id(struct kvm_mmu_page *sp)
{
Expand Down Expand Up @@ -267,61 +270,91 @@ static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
*
* @kvm: kvm instance
* @sp: the new page
* @shared: This operation may not be running under the exclusive use of
* the MMU lock and the operation must synchronize with other
* threads that might be adding or removing pages.
* @account_nx: This page replaces a NX large page and should be marked for
* eventual reclaim.
*/
static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp,
bool account_nx)
bool shared, bool account_nx)
{
lockdep_assert_held_write(&kvm->mmu_lock);
if (shared)
spin_lock(&kvm->arch.tdp_mmu_pages_lock);
else
lockdep_assert_held_write(&kvm->mmu_lock);

list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
if (account_nx)
account_huge_nx_page(kvm, sp);

if (shared)
spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
}

/**
* tdp_mmu_unlink_page - Remove page from the list of pages used by the TDP MMU
*
* @kvm: kvm instance
* @sp: the page to be removed
* @shared: This operation may not be running under the exclusive use of
* the MMU lock and the operation must synchronize with other
* threads that might be adding or removing pages.
*/
static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp)
static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp,
bool shared)
{
lockdep_assert_held_write(&kvm->mmu_lock);
if (shared)
spin_lock(&kvm->arch.tdp_mmu_pages_lock);
else
lockdep_assert_held_write(&kvm->mmu_lock);

list_del(&sp->link);
if (sp->lpage_disallowed)
unaccount_huge_nx_page(kvm, sp);

if (shared)
spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
}

/**
* handle_removed_tdp_mmu_page - handle a pt removed from the TDP structure
*
* @kvm: kvm instance
* @pt: the page removed from the paging structure
* @shared: This operation may not be running under the exclusive use
* of the MMU lock and the operation must synchronize with other
* threads that might be modifying SPTEs.
*
* Given a page table that has been removed from the TDP paging structure,
* iterates through the page table to clear SPTEs and free child page tables.
*/
static void handle_removed_tdp_mmu_page(struct kvm *kvm, u64 *pt)
static void handle_removed_tdp_mmu_page(struct kvm *kvm, u64 *pt,
bool shared)
{
struct kvm_mmu_page *sp = sptep_to_sp(pt);
int level = sp->role.level;
gfn_t gfn = sp->gfn;
u64 old_child_spte;
u64 *sptep;
int i;

trace_kvm_mmu_prepare_zap_page(sp);

tdp_mmu_unlink_page(kvm, sp);
tdp_mmu_unlink_page(kvm, sp, shared);

for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
old_child_spte = READ_ONCE(*(pt + i));
WRITE_ONCE(*(pt + i), 0);
sptep = pt + i;

if (shared) {
old_child_spte = xchg(sptep, 0);
} else {
old_child_spte = READ_ONCE(*sptep);
WRITE_ONCE(*sptep, 0);
}
handle_changed_spte(kvm, kvm_mmu_page_as_id(sp),
gfn + (i * KVM_PAGES_PER_HPAGE(level - 1)),
old_child_spte, 0, level - 1);
old_child_spte, 0, level - 1, shared);
}

kvm_flush_remote_tlbs_with_address(kvm, gfn,
Expand All @@ -338,12 +371,16 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, u64 *pt)
* @old_spte: The value of the SPTE before the change
* @new_spte: The value of the SPTE after the change
* @level: the level of the PT the SPTE is part of in the paging structure
* @shared: This operation may not be running under the exclusive use of
* the MMU lock and the operation must synchronize with other
* threads that might be modifying SPTEs.
*
* Handle bookkeeping that might result from the modification of a SPTE.
* This function must be called for all TDP SPTE modifications.
*/
static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
u64 old_spte, u64 new_spte, int level)
u64 old_spte, u64 new_spte, int level,
bool shared)
{
bool was_present = is_shadow_present_pte(old_spte);
bool is_present = is_shadow_present_pte(new_spte);
Expand Down Expand Up @@ -415,18 +452,51 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
*/
if (was_present && !was_leaf && (pfn_changed || !is_present))
handle_removed_tdp_mmu_page(kvm,
spte_to_child_pt(old_spte, level));
spte_to_child_pt(old_spte, level), shared);
}

static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
u64 old_spte, u64 new_spte, int level)
u64 old_spte, u64 new_spte, int level,
bool shared)
{
__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level);
__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
shared);
handle_changed_spte_acc_track(old_spte, new_spte, level);
handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
new_spte, level);
}

/*
 * tdp_mmu_set_spte_atomic - Atomically install a new value in a TDP MMU SPTE
 *			     and run the bookkeeping for the change
 *
 * @kvm: kvm instance
 * @iter: a tdp_iter instance positioned on the SPTE to be updated;
 *	  iter->old_spte is the value the SPTE is expected to hold
 * @new_spte: the value to store in the SPTE
 *
 * The MMU lock must be held for read. The SPTE is updated with a
 * compare-and-exchange against iter->old_spte, so the update is refused if
 * the SPTE no longer holds that value.
 *
 * Returns: true if the SPTE was updated, false otherwise. When false is
 *	    returned the SPTE is left untouched and no bookkeeping is done.
 */
static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
					   struct tdp_iter *iter,
					   u64 new_spte)
{
	u64 *root_sptep = tdp_iter_root_pt(iter);
	struct kvm_mmu_page *root_sp = sptep_to_sp(root_sptep);
	int as_id = kvm_mmu_page_as_id(root_sp);
	u64 seen_spte;

	lockdep_assert_held_read(&kvm->mmu_lock);

	/* Only succeeds if the SPTE still matches what the iterator saw. */
	seen_spte = cmpxchg64(rcu_dereference(iter->sptep), iter->old_spte,
			      new_spte);
	if (seen_spte != iter->old_spte)
		return false;

	/* Pass shared == true down to the bookkeeping helpers. */
	handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte,
			    iter->level, true);

	return true;
}


/*
* __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
* @kvm: kvm instance
Expand Down Expand Up @@ -456,7 +526,7 @@ static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte);

__handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte,
iter->level);
iter->level, false);
if (record_acc_track)
handle_changed_spte_acc_track(iter->old_spte, new_spte,
iter->level);
Expand Down Expand Up @@ -630,23 +700,18 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
int ret = 0;
int make_spte_ret = 0;

if (unlikely(is_noslot_pfn(pfn))) {
if (unlikely(is_noslot_pfn(pfn)))
new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
new_spte);
} else {
else
make_spte_ret = make_spte(vcpu, ACC_ALL, iter->level, iter->gfn,
pfn, iter->old_spte, prefault, true,
map_writable, !shadow_accessed_mask,
&new_spte);
trace_kvm_mmu_set_spte(iter->level, iter->gfn,
rcu_dereference(iter->sptep));
}

if (new_spte == iter->old_spte)
ret = RET_PF_SPURIOUS;
else
tdp_mmu_set_spte(vcpu->kvm, iter, new_spte);
else if (!tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
return RET_PF_RETRY;

/*
* If the page fault was caused by a write but the page is write
Expand All @@ -660,8 +725,13 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
}

/* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
if (unlikely(is_mmio_spte(new_spte)))
if (unlikely(is_mmio_spte(new_spte))) {
trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
new_spte);
ret = RET_PF_EMULATE;
} else
trace_kvm_mmu_set_spte(iter->level, iter->gfn,
rcu_dereference(iter->sptep));

trace_kvm_mmu_set_spte(iter->level, iter->gfn,
rcu_dereference(iter->sptep));
Expand Down Expand Up @@ -720,7 +790,8 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
*/
if (is_shadow_present_pte(iter.old_spte) &&
is_large_pte(iter.old_spte)) {
tdp_mmu_set_spte(vcpu->kvm, &iter, 0);
if (!tdp_mmu_set_spte_atomic(vcpu->kvm, &iter, 0))
break;

kvm_flush_remote_tlbs_with_address(vcpu->kvm, iter.gfn,
KVM_PAGES_PER_HPAGE(iter.level));
Expand All @@ -737,19 +808,24 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level);
child_pt = sp->spt;

tdp_mmu_link_page(vcpu->kvm, sp,
huge_page_disallowed &&
req_level >= iter.level);

new_spte = make_nonleaf_spte(child_pt,
!shadow_accessed_mask);

trace_kvm_mmu_get_page(sp, true);
tdp_mmu_set_spte(vcpu->kvm, &iter, new_spte);
if (tdp_mmu_set_spte_atomic(vcpu->kvm, &iter,
new_spte)) {
tdp_mmu_link_page(vcpu->kvm, sp, true,
huge_page_disallowed &&
req_level >= iter.level);

trace_kvm_mmu_get_page(sp, true);
} else {
tdp_mmu_free_sp(sp);
break;
}
}
}

if (WARN_ON(iter.level != level)) {
if (iter.level != level) {
rcu_read_unlock();
return RET_PF_RETRY;
}
Expand Down

0 comments on commit 9a77daa

Please sign in to comment.