Skip to content

Commit

Permalink
mm: Cleanup faultaround and finish_fault() codepaths
Browse files Browse the repository at this point in the history
alloc_set_pte() has two users with different requirements: in the
faultaround code, it is called from an atomic context and the PTE page table
has to be preallocated. finish_fault() can sleep and allocate the page table
as needed.

PTL locking rules are also strange, hard to follow and overkill for
finish_fault().

Let's untangle the mess. alloc_set_pte() has gone now. All locking is
explicit.

The price is some code duplication to handle huge pages in the faultaround
path, but that should be fine given the overall improvement in readability.

Link: https://lore.kernel.org/r/20201229132819.najtavneutnf7ajp@box
Signed-off-by: Kirill A. Shutemov <[email protected]>
[will: s/from from/from/ in comment; spotted by willy]
Signed-off-by: Will Deacon <[email protected]>
  • Loading branch information
kiryl authored and willdeacon committed Jan 20, 2021
1 parent 19c329f commit f9ce0be
Show file tree
Hide file tree
Showing 5 changed files with 213 additions and 192 deletions.
6 changes: 4 additions & 2 deletions fs/xfs/xfs_file.c
Original file line number Diff line number Diff line change
Expand Up @@ -1319,17 +1319,19 @@ xfs_filemap_pfn_mkwrite(
return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
}

static void
static vm_fault_t
xfs_filemap_map_pages(
struct vm_fault *vmf,
pgoff_t start_pgoff,
pgoff_t end_pgoff)
{
struct inode *inode = file_inode(vmf->vma->vm_file);
vm_fault_t ret;

xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
filemap_map_pages(vmf, start_pgoff, end_pgoff);
ret = filemap_map_pages(vmf, start_pgoff, end_pgoff);
xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
return ret;
}

static const struct vm_operations_struct xfs_file_vm_ops = {
Expand Down
12 changes: 7 additions & 5 deletions include/linux/mm.h
Original file line number Diff line number Diff line change
Expand Up @@ -542,8 +542,8 @@ struct vm_fault {
* is not NULL, otherwise pmd.
*/
pgtable_t prealloc_pte; /* Pre-allocated pte page table.
* vm_ops->map_pages() calls
* alloc_set_pte() from atomic context.
* vm_ops->map_pages() sets up a page
* table from atomic context.
* do_fault_around() pre-allocates
* page table to avoid allocation from
* atomic context.
Expand Down Expand Up @@ -578,7 +578,7 @@ struct vm_operations_struct {
vm_fault_t (*fault)(struct vm_fault *vmf);
vm_fault_t (*huge_fault)(struct vm_fault *vmf,
enum page_entry_size pe_size);
void (*map_pages)(struct vm_fault *vmf,
vm_fault_t (*map_pages)(struct vm_fault *vmf,
pgoff_t start_pgoff, pgoff_t end_pgoff);
unsigned long (*pagesize)(struct vm_area_struct * area);

Expand Down Expand Up @@ -988,7 +988,9 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
return pte;
}

/*
 * Helpers for installing a faulted page. do_set_pmd() reports its outcome as
 * a vm_fault_t (0 when the page was mapped); do_set_pte() returns nothing.
 */
vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page);
void do_set_pte(struct vm_fault *vmf, struct page *page);

vm_fault_t finish_fault(struct vm_fault *vmf);
vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf);
#endif
Expand Down Expand Up @@ -2622,7 +2624,7 @@ extern void truncate_inode_pages_final(struct address_space *);

/*
 * generic vm_area_ops exported for stackable file systems
 *
 * filemap_map_pages() returns a vm_fault_t, like the other handlers here.
 */
extern vm_fault_t filemap_fault(struct vm_fault *vmf);
extern vm_fault_t filemap_map_pages(struct vm_fault *vmf,
		pgoff_t start_pgoff, pgoff_t end_pgoff);
extern vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf);

Expand Down
11 changes: 11 additions & 0 deletions include/linux/pgtable.h
Original file line number Diff line number Diff line change
Expand Up @@ -1314,6 +1314,17 @@ static inline int pmd_trans_unstable(pmd_t *pmd)
#endif
}

/*
 * Report whether the pmd is a devmap entry or is otherwise unstable.
 *
 * The devmap test deliberately comes first. For a pmd with _PAGE_DEVMAP set,
 * calling pmd_trans_unstable() first would trip the bad-pmd check inside
 * pmd_none_or_trans_huge_or_clear_bad(): the result would still correctly
 * be 1, but only after pmd_clear_bad() had spammed dmesg with its output.
 */
static inline int pmd_devmap_trans_unstable(pmd_t *pmd)
{
	if (pmd_devmap(*pmd))
		return 1;

	return pmd_trans_unstable(pmd) != 0;
}

#ifndef CONFIG_NUMA_BALANCING
/*
* Technically a PTE can be PROTNONE even when not doing NUMA balancing but
Expand Down
177 changes: 134 additions & 43 deletions mm/filemap.c
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
#include <linux/psi.h>
#include <linux/ramfs.h>
#include <linux/page_idle.h>
#include <asm/pgalloc.h>
#include "internal.h"

#define CREATE_TRACE_POINTS
Expand Down Expand Up @@ -2911,74 +2912,164 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
}
EXPORT_SYMBOL(filemap_fault);

void filemap_map_pages(struct vm_fault *vmf,
pgoff_t start_pgoff, pgoff_t end_pgoff)
static bool filemap_map_pmd(struct vm_fault *vmf, struct page *page)
{
struct file *file = vmf->vma->vm_file;
struct mm_struct *mm = vmf->vma->vm_mm;

/* Huge page is mapped? No need to proceed. */
if (pmd_trans_huge(*vmf->pmd)) {
unlock_page(page);
put_page(page);
return true;
}

if (pmd_none(*vmf->pmd) && PageTransHuge(page)) {
vm_fault_t ret = do_set_pmd(vmf, page);
if (!ret) {
/* The page is mapped successfully, reference consumed. */
unlock_page(page);
return true;
}
}

if (pmd_none(*vmf->pmd)) {
vmf->ptl = pmd_lock(mm, vmf->pmd);
if (likely(pmd_none(*vmf->pmd))) {
mm_inc_nr_ptes(mm);
pmd_populate(mm, vmf->pmd, vmf->prealloc_pte);
vmf->prealloc_pte = NULL;
}
spin_unlock(vmf->ptl);
}

/* See comment in handle_pte_fault() */
if (pmd_devmap_trans_unstable(vmf->pmd)) {
unlock_page(page);
put_page(page);
return true;
}

return false;
}

/*
 * Starting with @page, walk the page cache entries up to @end_pgoff and
 * return the first one that is suitable for mapping.
 *
 * A returned page is locked and carries a speculative reference taken here.
 * Candidates that fail a check after the reference was taken have that
 * reference (and the lock, if it was acquired) released before moving on.
 * Returns NULL once the entries are exhausted.
 */
static struct page *next_uptodate_page(struct page *page,
				       struct address_space *mapping,
				       struct xa_state *xas, pgoff_t end_pgoff)
{
	unsigned long nr_pages;

	for (; page; page = xas_next_entry(xas, end_pgoff)) {
		/* Entries that can be rejected without taking a reference. */
		if (xas_retry(xas, page) || xa_is_value(page) ||
		    PageLocked(page))
			continue;
		if (!page_cache_get_speculative(page))
			continue;

		/*
		 * Has the page moved or been split? If not, and it looks
		 * mappable, try to take the page lock without sleeping.
		 */
		if (likely(page == xas_reload(xas)) &&
		    PageUptodate(page) && !PageReadahead(page) &&
		    !PageHWPoison(page) && trylock_page(page)) {
			/* Re-validate now that the page lock is held. */
			if (page->mapping == mapping && PageUptodate(page)) {
				nr_pages = DIV_ROUND_UP(i_size_read(mapping->host),
							PAGE_SIZE);
				/* Only pages inside i_size are handed out. */
				if (xas->xa_index < nr_pages)
					return page;
			}
			unlock_page(page);
		}
		put_page(page);
	}

	return NULL;
}

/*
 * Find the first mappable page: look up the first entry with xas_find() and
 * let next_uptodate_page() filter from there up to @end_pgoff.
 */
static inline struct page *first_map_page(struct address_space *mapping,
					  struct xa_state *xas,
					  pgoff_t end_pgoff)
{
	struct page *candidate = xas_find(xas, end_pgoff);

	return next_uptodate_page(candidate, mapping, xas, end_pgoff);
}

/*
 * Find the next mappable page: advance with xas_next_entry() and let
 * next_uptodate_page() filter from there up to @end_pgoff.
 */
static inline struct page *next_map_page(struct address_space *mapping,
					 struct xa_state *xas,
					 pgoff_t end_pgoff)
{
	struct page *candidate = xas_next_entry(xas, end_pgoff);

	return next_uptodate_page(candidate, mapping, xas, end_pgoff);
}

/*
 * Fault-around: map page cache pages in the start_pgoff..end_pgoff range
 * that are already suitable for mapping, without sleeping.
 *
 * The page cache walk runs under rcu_read_lock(). The PMD level is handled
 * first by filemap_map_pmd(); if that leaves PTEs to map, they are filled in
 * under the page table lock taken by pte_offset_map_lock().
 *
 * vmf->address is used as a cursor while mapping and is restored to the
 * original faulting address before returning. The mmap_miss counter is
 * decremented (not below zero) for each candidate page that passes the
 * HWPoison check and is written back to the file's readahead state at the
 * end.
 *
 * Returns VM_FAULT_NOPAGE when filemap_map_pmd() reported that nothing is
 * left to do, or when a PTE was installed at the original faulting address;
 * otherwise 0.
 */
vm_fault_t filemap_map_pages(struct vm_fault *vmf,
			     pgoff_t start_pgoff, pgoff_t end_pgoff)
{
	struct vm_area_struct *vma = vmf->vma;
	struct file *file = vma->vm_file;
	struct address_space *mapping = file->f_mapping;
	pgoff_t last_pgoff = start_pgoff;
	unsigned long address = vmf->address;
	XA_STATE(xas, &mapping->i_pages, start_pgoff);
	struct page *head, *page;
	unsigned int mmap_miss = READ_ONCE(file->f_ra.mmap_miss);
	vm_fault_t ret = 0;

	rcu_read_lock();
	head = first_map_page(mapping, &xas, end_pgoff);
	if (!head)
		goto out;

	/* On "true" the helper has already unlocked and released head. */
	if (filemap_map_pmd(vmf, head)) {
		ret = VM_FAULT_NOPAGE;
		goto out;
	}

	/* Point the cursor at the first page of the range and lock its PTE. */
	vmf->address = vma->vm_start + ((start_pgoff - vma->vm_pgoff) << PAGE_SHIFT);
	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl);
	do {
		page = find_subpage(head, xas.xa_index);
		if (PageHWPoison(page))
			goto unlock;

		if (mmap_miss > 0)
			mmap_miss--;

		/* Advance address and PTE pointer by the same page count. */
		vmf->address += (xas.xa_index - last_pgoff) << PAGE_SHIFT;
		vmf->pte += xas.xa_index - last_pgoff;
		last_pgoff = xas.xa_index;

		/* Something is already mapped here; leave it alone. */
		if (!pte_none(*vmf->pte))
			goto unlock;

		do_set_pte(vmf, page);
		/* no need to invalidate: a not-present page won't be cached */
		update_mmu_cache(vma, vmf->address, vmf->pte);
		/*
		 * Success: only the page lock is released; the reference is
		 * left in place (presumably now owned by the mapping).
		 */
		unlock_page(head);

		/* The fault is handled */
		if (vmf->address == address)
			ret = VM_FAULT_NOPAGE;
		continue;
unlock:
		/* Not mapped: give back both the lock and the reference. */
		unlock_page(head);
		put_page(head);
	} while ((head = next_map_page(mapping, &xas, end_pgoff)) != NULL);
	pte_unmap_unlock(vmf->pte, vmf->ptl);
out:
	rcu_read_unlock();
	vmf->address = address;
	WRITE_ONCE(file->f_ra.mmap_miss, mmap_miss);
	return ret;
}
EXPORT_SYMBOL(filemap_map_pages);

Expand Down
Loading

0 comments on commit f9ce0be

Please sign in to comment.