Skip to content

Commit

Permalink
mm/device-public-memory: device memory cache coherent with CPU
Browse files Browse the repository at this point in the history
Platforms with an advanced system bus (like CAPI or CCIX) allow device memory
to be accessible from the CPU in a cache coherent fashion.  Add a new type of
ZONE_DEVICE to represent such memory.  The use cases are the same as for
the un-addressable device memory but without all the corner cases.

Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Jérôme Glisse <[email protected]>
Cc: Aneesh Kumar <[email protected]>
Cc: Paul E. McKenney <[email protected]>
Cc: Benjamin Herrenschmidt <[email protected]>
Cc: Dan Williams <[email protected]>
Cc: Ross Zwisler <[email protected]>
Cc: Balbir Singh <[email protected]>
Cc: David Nellans <[email protected]>
Cc: Evgeny Baskakov <[email protected]>
Cc: Johannes Weiner <[email protected]>
Cc: John Hubbard <[email protected]>
Cc: Kirill A. Shutemov <[email protected]>
Cc: Mark Hairgrove <[email protected]>
Cc: Michal Hocko <[email protected]>
Cc: Sherry Cheung <[email protected]>
Cc: Subhash Gutti <[email protected]>
Cc: Vladimir Davydov <[email protected]>
Cc: Bob Liu <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
  • Loading branch information
Jérôme Glisse authored and torvalds committed Sep 9, 2017
1 parent 8315ada commit df6ad69
Show file tree
Hide file tree
Showing 14 changed files with 159 additions and 47 deletions.
2 changes: 1 addition & 1 deletion fs/proc/task_mmu.c
Original file line number Diff line number Diff line change
Expand Up @@ -1267,7 +1267,7 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
if (pm->show_pfn)
frame = pte_pfn(pte);
flags |= PM_PRESENT;
page = vm_normal_page(vma, addr, pte);
page = _vm_normal_page(vma, addr, pte, true);
if (pte_soft_dirty(pte))
flags |= PM_SOFT_DIRTY;
} else if (is_swap_pte(pte)) {
Expand Down
4 changes: 2 additions & 2 deletions include/linux/hmm.h
Original file line number Diff line number Diff line change
Expand Up @@ -327,7 +327,7 @@ int hmm_vma_fault(struct vm_area_struct *vma,
#endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */


#if IS_ENABLED(CONFIG_DEVICE_PRIVATE)
#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC)
struct hmm_devmem;

struct page *hmm_vma_alloc_locked_page(struct vm_area_struct *vma,
Expand Down Expand Up @@ -494,7 +494,7 @@ struct hmm_device {
*/
struct hmm_device *hmm_device_new(void *drvdata);
void hmm_device_put(struct hmm_device *hmm_device);
#endif /* IS_ENABLED(CONFIG_DEVICE_PRIVATE) */
#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */


/* Below are for HMM internal use only! Not to be used by device driver! */
Expand Down
1 change: 1 addition & 0 deletions include/linux/ioport.h
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,7 @@ enum {
IORES_DESC_PERSISTENT_MEMORY = 4,
IORES_DESC_PERSISTENT_MEMORY_LEGACY = 5,
IORES_DESC_DEVICE_PRIVATE_MEMORY = 6,
IORES_DESC_DEVICE_PUBLIC_MEMORY = 7,
};

/* helpers to define resources */
Expand Down
21 changes: 21 additions & 0 deletions include/linux/memremap.h
Original file line number Diff line number Diff line change
Expand Up @@ -57,10 +57,18 @@ static inline struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start)
*
* A more complete discussion of unaddressable memory may be found in
* include/linux/hmm.h and Documentation/vm/hmm.txt.
*
* MEMORY_DEVICE_PUBLIC:
* Device memory that is cache coherent from device and CPU point of view. This
* is used on platforms that have an advanced system bus (like CAPI or CCIX). A
* driver can hotplug the device memory using ZONE_DEVICE and with that memory
* type. Any page of a process can be migrated to such memory. However no one
* should be allowed to pin such memory so that it can always be evicted.
*/
enum memory_type {
MEMORY_DEVICE_HOST = 0,
MEMORY_DEVICE_PRIVATE,
MEMORY_DEVICE_PUBLIC,
};

/*
Expand Down Expand Up @@ -92,6 +100,8 @@ enum memory_type {
* The page_free() callback is called once the page refcount reaches 1
* (ZONE_DEVICE pages never reach 0 refcount unless there is a refcount bug.
* This allows the device driver to implement its own memory management.)
*
* For MEMORY_DEVICE_PUBLIC only the page_free() callback matters.
*/
typedef int (*dev_page_fault_t)(struct vm_area_struct *vma,
unsigned long addr,
Expand Down Expand Up @@ -134,6 +144,12 @@ static inline bool is_device_private_page(const struct page *page)
return is_zone_device_page(page) &&
page->pgmap->type == MEMORY_DEVICE_PRIVATE;
}

static inline bool is_device_public_page(const struct page *page)
{
return is_zone_device_page(page) &&
page->pgmap->type == MEMORY_DEVICE_PUBLIC;
}
#else
static inline void *devm_memremap_pages(struct device *dev,
struct resource *res, struct percpu_ref *ref,
Expand All @@ -157,6 +173,11 @@ static inline bool is_device_private_page(const struct page *page)
{
return false;
}

/*
 * Stub for configurations where the real ZONE_DEVICE helpers above are
 * compiled out: no device public page can exist, so always report false.
 */
static inline bool is_device_public_page(const struct page *page)
{
return false;
}
#endif

/**
Expand Down
20 changes: 12 additions & 8 deletions include/linux/mm.h
Original file line number Diff line number Diff line change
Expand Up @@ -800,15 +800,16 @@ static inline bool is_zone_device_page(const struct page *page)
}
#endif

#ifdef CONFIG_DEVICE_PRIVATE
void put_zone_device_private_page(struct page *page);
#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC)
void put_zone_device_private_or_public_page(struct page *page);
#else
static inline void put_zone_device_private_page(struct page *page)
static inline void put_zone_device_private_or_public_page(struct page *page)
{
}
#endif
#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */

static inline bool is_device_private_page(const struct page *page);
static inline bool is_device_public_page(const struct page *page);

DECLARE_STATIC_KEY_FALSE(device_private_key);

Expand All @@ -834,8 +835,9 @@ static inline void put_page(struct page *page)
* include/linux/memremap.h and HMM for details.
*/
if (static_branch_unlikely(&device_private_key) &&
unlikely(is_device_private_page(page))) {
put_zone_device_private_page(page);
unlikely(is_device_private_page(page) ||
is_device_public_page(page))) {
put_zone_device_private_or_public_page(page);
return;
}

Expand Down Expand Up @@ -1224,8 +1226,10 @@ struct zap_details {
pgoff_t last_index; /* Highest page->index to unmap */
};

struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
pte_t pte);
struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
pte_t pte, bool with_public_device);
#define vm_normal_page(vma, addr, pte) _vm_normal_page(vma, addr, pte, false)

struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
pmd_t pmd);

Expand Down
8 changes: 4 additions & 4 deletions kernel/memremap.c
Original file line number Diff line number Diff line change
Expand Up @@ -501,8 +501,8 @@ struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start)
#endif /* CONFIG_ZONE_DEVICE */


#ifdef CONFIG_DEVICE_PRIVATE
void put_zone_device_private_page(struct page *page)
#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC)
void put_zone_device_private_or_public_page(struct page *page)
{
int count = page_ref_dec_return(page);

Expand All @@ -522,5 +522,5 @@ void put_zone_device_private_page(struct page *page)
} else if (!count)
__put_page(page);
}
EXPORT_SYMBOL(put_zone_device_private_page);
#endif /* CONFIG_DEVICE_PRIVATE */
EXPORT_SYMBOL(put_zone_device_private_or_public_page);
#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */
11 changes: 11 additions & 0 deletions mm/Kconfig
Original file line number Diff line number Diff line change
Expand Up @@ -720,12 +720,23 @@ config HMM_MIRROR
config DEVICE_PRIVATE
bool "Unaddressable device memory (GPU memory, ...)"
depends on ARCH_HAS_HMM
select HMM

help
Allows creation of struct pages to represent unaddressable device
memory; i.e., memory that is only accessible from the device (or
group of devices). You likely also want to select HMM_MIRROR.

config DEVICE_PUBLIC
bool "Addressable device memory (like GPU memory)"
depends on ARCH_HAS_HMM
select HMM

help
Allows creation of struct pages to represent addressable device
memory; i.e., memory that is accessible from both the device and
the CPU.

config FRAME_VECTOR
bool

Expand Down
7 changes: 7 additions & 0 deletions mm/gup.c
Original file line number Diff line number Diff line change
Expand Up @@ -456,6 +456,13 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address,
if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(*pte)))
goto unmap;
*page = pte_page(*pte);

/*
* This should never happen (a device public page in the gate
* area).
*/
if (is_device_public_page(*page))
goto unmap;
}
get_page(*page);
out:
Expand Down
4 changes: 2 additions & 2 deletions mm/hmm.c
Original file line number Diff line number Diff line change
Expand Up @@ -737,7 +737,7 @@ EXPORT_SYMBOL(hmm_vma_fault);
#endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */


#if IS_ENABLED(CONFIG_DEVICE_PRIVATE)
#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC)
struct page *hmm_vma_alloc_locked_page(struct vm_area_struct *vma,
unsigned long addr)
{
Expand Down Expand Up @@ -1177,4 +1177,4 @@ static int __init hmm_init(void)
}

device_initcall(hmm_init);
#endif /* IS_ENABLED(CONFIG_DEVICE_PRIVATE) */
#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */
2 changes: 1 addition & 1 deletion mm/madvise.c
Original file line number Diff line number Diff line change
Expand Up @@ -355,7 +355,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
continue;
}

page = vm_normal_page(vma, addr, ptent);
page = _vm_normal_page(vma, addr, ptent, true);
if (!page)
continue;

Expand Down
12 changes: 7 additions & 5 deletions mm/memcontrol.c
Original file line number Diff line number Diff line change
Expand Up @@ -4623,10 +4623,11 @@ static int mem_cgroup_move_account(struct page *page,
* 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
* target for charge migration. if @target is not NULL, the entry is stored
* in target->ent.
* 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PRIVATE
* (so ZONE_DEVICE page and thus not on the lru). For now we such page is
* charge like a regular page would be as for all intent and purposes it is
* just special memory taking the place of a regular page.
* 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PUBLIC
* or MEMORY_DEVICE_PRIVATE (so a ZONE_DEVICE page and thus not on the lru).
* For now such a page is charged like a regular page would be, as for all
* intents and purposes it is just special memory taking the place of a
* regular page.
*
* See Documentations/vm/hmm.txt and include/linux/hmm.h
*
Expand Down Expand Up @@ -4657,7 +4658,8 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
*/
if (page->mem_cgroup == mc.from) {
ret = MC_TARGET_PAGE;
if (is_device_private_page(page))
if (is_device_private_page(page) ||
is_device_public_page(page))
ret = MC_TARGET_DEVICE;
if (target)
target->page = page;
Expand Down
46 changes: 41 additions & 5 deletions mm/memory.c
Original file line number Diff line number Diff line change
Expand Up @@ -818,8 +818,8 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
#else
# define HAVE_PTE_SPECIAL 0
#endif
struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
pte_t pte)
struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
pte_t pte, bool with_public_device)
{
unsigned long pfn = pte_pfn(pte);

Expand All @@ -830,8 +830,31 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
return vma->vm_ops->find_special_page(vma, addr);
if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
return NULL;
if (!is_zero_pfn(pfn))
print_bad_pte(vma, addr, pte, NULL);
if (is_zero_pfn(pfn))
return NULL;

/*
* Device public pages are special pages (they are ZONE_DEVICE
* pages but different from persistent memory). They behave
* almost like normal pages. The difference is that they are
* not on the lru and thus should never be involved with any-
* thing that involves lru manipulation (mlock, numa balancing,
* ...).
*
* This is why we still want to return NULL for such pages from
* vm_normal_page() so that we do not have to special case all
* call sites of vm_normal_page().
*/
if (likely(pfn < highest_memmap_pfn)) {
struct page *page = pfn_to_page(pfn);

if (is_device_public_page(page)) {
if (with_public_device)
return page;
return NULL;
}
}
print_bad_pte(vma, addr, pte, NULL);
return NULL;
}

Expand Down Expand Up @@ -1012,6 +1035,19 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
get_page(page);
page_dup_rmap(page, false);
rss[mm_counter(page)]++;
} else if (pte_devmap(pte)) {
page = pte_page(pte);

/*
* Cache coherent device memory behaves like a regular page and
* not like a persistent memory page. For more information see
* MEMORY_DEVICE_PUBLIC in include/linux/memremap.h.
*/
if (is_device_public_page(page)) {
get_page(page);
page_dup_rmap(page, false);
rss[mm_counter(page)]++;
}
}

out_set_pte:
Expand Down Expand Up @@ -1267,7 +1303,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
if (pte_present(ptent)) {
struct page *page;

page = vm_normal_page(vma, addr, ptent);
page = _vm_normal_page(vma, addr, ptent, true);
if (unlikely(details) && page) {
/*
* unmap_shared_mapping_pages() wants to
Expand Down
Loading

0 comments on commit df6ad69

Please sign in to comment.