Newer
Older

Greg Kroah-Hartman
committed
// SPDX-License-Identifier: GPL-2.0
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/mm.h>
#include <linux/sched.h>

Ingo Molnar
committed
#include <linux/sched/mm.h>

Ingo Molnar
committed
#include <linux/sched/coredump.h>
#include <linux/mmu_notifier.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/mm_inline.h>
#include <linux/kthread.h>
#include <linux/khugepaged.h>
#include <linux/freezer.h>
#include <linux/mman.h>
#include <linux/hashtable.h>
#include <linux/userfaultfd_k.h>
#include <linux/page_idle.h>
#include <linux/page_table_check.h>
#include <linux/swapops.h>
#include <linux/shmem_fs.h>
#include <asm/tlb.h>
#include <asm/pgalloc.h>
#include "internal.h"
enum scan_result {
SCAN_FAIL,
SCAN_SUCCEED,
SCAN_PMD_NULL,
SCAN_PMD_MAPPED,
SCAN_EXCEED_SWAP_PTE,
SCAN_EXCEED_SHARED_PTE,
SCAN_PTE_MAPPED_HUGEPAGE,
SCAN_LACK_REFERENCED_PAGE,
SCAN_PAGE_NULL,
SCAN_SCAN_ABORT,
SCAN_PAGE_COUNT,
SCAN_PAGE_LRU,
SCAN_PAGE_LOCK,
SCAN_PAGE_ANON,
SCAN_PAGE_COMPOUND,
SCAN_ANY_PROCESS,
SCAN_VMA_NULL,
SCAN_VMA_CHECK,
SCAN_ADDRESS_RANGE,
SCAN_DEL_PAGE_LRU,
SCAN_ALLOC_HUGE_PAGE_FAIL,
SCAN_CGROUP_CHARGE_FAIL,
SCAN_TRUNCATED,
};
#define CREATE_TRACE_POINTS
#include <trace/events/huge_memory.h>

Vijay Balakrishna
committed
static struct task_struct *khugepaged_thread __read_mostly;
static DEFINE_MUTEX(khugepaged_mutex);
/* default scan 8*512 pte (or vmas) every 30 second */
static unsigned int khugepaged_pages_to_scan __read_mostly;
static unsigned int khugepaged_pages_collapsed;
static unsigned int khugepaged_full_scans;
static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000;
/* during fragmentation poll the hugepage allocator once every minute */
static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000;
static unsigned long khugepaged_sleep_expire;
static DEFINE_SPINLOCK(khugepaged_mm_lock);
static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
/*
* default collapse hugepages if there is at least one pte mapped like
* it would have happened if the vma was large enough during page
* fault.
*
* Note that these are only respected if collapse was initiated by khugepaged.
*/
static unsigned int khugepaged_max_ptes_none __read_mostly;
static unsigned int khugepaged_max_ptes_swap __read_mostly;
static unsigned int khugepaged_max_ptes_shared __read_mostly;
#define MM_SLOTS_HASH_BITS 10
static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
static struct kmem_cache *mm_slot_cache __read_mostly;
#define MAX_PTE_MAPPED_THP 8
bool is_khugepaged;
/* Num pages scanned per node */
u32 node_load[MAX_NUMNODES];
/* nodemask for allocation fallback */
nodemask_t alloc_nmask;
* struct khugepaged_mm_slot - khugepaged information per mm that is being scanned
* @slot: hash lookup from mm to mm_slot
* @nr_pte_mapped_thp: number of pte mapped THP
* @pte_mapped_thp: address array corresponding pte mapped THP
struct khugepaged_mm_slot {
struct mm_slot slot;
/* pte-mapped THP in this mm */
int nr_pte_mapped_thp;
unsigned long pte_mapped_thp[MAX_PTE_MAPPED_THP];
};
/**
* struct khugepaged_scan - cursor for scanning
* @mm_head: the head of the mm list to scan
* @mm_slot: the current mm_slot we are scanning
* @address: the next address inside that to be scanned
*
* There is only the one khugepaged_scan instance of this cursor structure.
*/
struct khugepaged_scan {
struct list_head mm_head;
struct khugepaged_mm_slot *mm_slot;
unsigned long address;
};
static struct khugepaged_scan khugepaged_scan = {
.mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
};
#ifdef CONFIG_SYSFS
static ssize_t scan_sleep_millisecs_show(struct kobject *kobj,
struct kobj_attribute *attr,
char *buf)
{
return sysfs_emit(buf, "%u\n", khugepaged_scan_sleep_millisecs);
}
static ssize_t scan_sleep_millisecs_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count)
{
err = kstrtouint(buf, 10, &msecs);
if (err)
return -EINVAL;
khugepaged_scan_sleep_millisecs = msecs;
khugepaged_sleep_expire = 0;
wake_up_interruptible(&khugepaged_wait);
return count;
}
static struct kobj_attribute scan_sleep_millisecs_attr =
static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj,
struct kobj_attribute *attr,
char *buf)
{
return sysfs_emit(buf, "%u\n", khugepaged_alloc_sleep_millisecs);
}
static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count)
{
err = kstrtouint(buf, 10, &msecs);
if (err)
return -EINVAL;
khugepaged_alloc_sleep_millisecs = msecs;
khugepaged_sleep_expire = 0;
wake_up_interruptible(&khugepaged_wait);
return count;
}
static struct kobj_attribute alloc_sleep_millisecs_attr =
__ATTR_RW(alloc_sleep_millisecs);
static ssize_t pages_to_scan_show(struct kobject *kobj,
struct kobj_attribute *attr,
char *buf)
{
return sysfs_emit(buf, "%u\n", khugepaged_pages_to_scan);
}
static ssize_t pages_to_scan_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count)
{
err = kstrtouint(buf, 10, &pages);
if (err || !pages)
return -EINVAL;
khugepaged_pages_to_scan = pages;
return count;
}
static struct kobj_attribute pages_to_scan_attr =
static ssize_t pages_collapsed_show(struct kobject *kobj,
struct kobj_attribute *attr,
char *buf)
{
return sysfs_emit(buf, "%u\n", khugepaged_pages_collapsed);
}
static struct kobj_attribute pages_collapsed_attr =
__ATTR_RO(pages_collapsed);
static ssize_t full_scans_show(struct kobject *kobj,
struct kobj_attribute *attr,
char *buf)
{
return sysfs_emit(buf, "%u\n", khugepaged_full_scans);
}
static struct kobj_attribute full_scans_attr =
__ATTR_RO(full_scans);
static ssize_t defrag_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
return single_hugepage_flag_show(kobj, attr, buf,
TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
static ssize_t defrag_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count)
{
return single_hugepage_flag_store(kobj, attr, buf, count,
TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
}
static struct kobj_attribute khugepaged_defrag_attr =
/*
* max_ptes_none controls if khugepaged should collapse hugepages over
* any unmapped ptes in turn potentially increasing the memory
* footprint of the vmas. When max_ptes_none is 0 khugepaged will not
* reduce the available free memory in the system as it
* runs. Increasing max_ptes_none will instead potentially reduce the
* free memory in the system during the khugepaged scan.
*/
static ssize_t max_ptes_none_show(struct kobject *kobj,
struct kobj_attribute *attr,
char *buf)
return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_none);
static ssize_t max_ptes_none_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count)
{
int err;
unsigned long max_ptes_none;
err = kstrtoul(buf, 10, &max_ptes_none);
if (err || max_ptes_none > HPAGE_PMD_NR - 1)
return -EINVAL;
khugepaged_max_ptes_none = max_ptes_none;
return count;
}
static struct kobj_attribute khugepaged_max_ptes_none_attr =
static ssize_t max_ptes_swap_show(struct kobject *kobj,
struct kobj_attribute *attr,
char *buf)
return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_swap);
static ssize_t max_ptes_swap_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count)
{
int err;
unsigned long max_ptes_swap;
err = kstrtoul(buf, 10, &max_ptes_swap);
if (err || max_ptes_swap > HPAGE_PMD_NR - 1)
return -EINVAL;
khugepaged_max_ptes_swap = max_ptes_swap;
return count;
}
static struct kobj_attribute khugepaged_max_ptes_swap_attr =
static ssize_t max_ptes_shared_show(struct kobject *kobj,
struct kobj_attribute *attr,
char *buf)
return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_shared);
static ssize_t max_ptes_shared_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count)
{
int err;
unsigned long max_ptes_shared;
err = kstrtoul(buf, 10, &max_ptes_shared);
if (err || max_ptes_shared > HPAGE_PMD_NR - 1)
return -EINVAL;
khugepaged_max_ptes_shared = max_ptes_shared;
return count;
}
static struct kobj_attribute khugepaged_max_ptes_shared_attr =
static struct attribute *khugepaged_attr[] = {
&khugepaged_defrag_attr.attr,
&khugepaged_max_ptes_none_attr.attr,
&khugepaged_max_ptes_swap_attr.attr,
&khugepaged_max_ptes_shared_attr.attr,
&pages_to_scan_attr.attr,
&pages_collapsed_attr.attr,
&full_scans_attr.attr,
&scan_sleep_millisecs_attr.attr,
&alloc_sleep_millisecs_attr.attr,
NULL,
};
struct attribute_group khugepaged_attr_group = {
.attrs = khugepaged_attr,
.name = "khugepaged",
};
#endif /* CONFIG_SYSFS */
int hugepage_madvise(struct vm_area_struct *vma,
unsigned long *vm_flags, int advice)
{
switch (advice) {
case MADV_HUGEPAGE:
#ifdef CONFIG_S390
/*
* qemu blindly sets MADV_HUGEPAGE on all allocations, but s390
* can't handle this properly after s390_enable_sie, so we simply
* ignore the madvise to prevent qemu from causing a SIGSEGV.
*/
if (mm_has_pgste(vma->vm_mm))
return 0;
#endif
*vm_flags &= ~VM_NOHUGEPAGE;
*vm_flags |= VM_HUGEPAGE;
/*
* If the vma become good for khugepaged to scan,
* register it here without waiting a page fault that
* may not happen any time soon.
*/
khugepaged_enter_vma(vma, *vm_flags);
break;
case MADV_NOHUGEPAGE:
*vm_flags &= ~VM_HUGEPAGE;
*vm_flags |= VM_NOHUGEPAGE;
/*
* Setting VM_NOHUGEPAGE will prevent khugepaged from scanning
* this vma even if we leave the mm registered in khugepaged if
* it got registered before VM_NOHUGEPAGE was set.
*/
break;
}
return 0;
}
int __init khugepaged_init(void)
{
mm_slot_cache = kmem_cache_create("khugepaged_mm_slot",
sizeof(struct khugepaged_mm_slot),
__alignof__(struct khugepaged_mm_slot),
0, NULL);
if (!mm_slot_cache)
return -ENOMEM;
khugepaged_pages_to_scan = HPAGE_PMD_NR * 8;
khugepaged_max_ptes_none = HPAGE_PMD_NR - 1;
khugepaged_max_ptes_swap = HPAGE_PMD_NR / 8;
khugepaged_max_ptes_shared = HPAGE_PMD_NR / 2;
return 0;
}
void __init khugepaged_destroy(void)
{
kmem_cache_destroy(mm_slot_cache);
}
static inline int hpage_collapse_test_exit(struct mm_struct *mm)
return atomic_read(&mm->mm_users) == 0;
void __khugepaged_enter(struct mm_struct *mm)
struct khugepaged_mm_slot *mm_slot;
struct mm_slot *slot;
mm_slot = mm_slot_alloc(mm_slot_cache);
/* __khugepaged_exit() must not run from under us */
VM_BUG_ON_MM(hpage_collapse_test_exit(mm), mm);
if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) {
mm_slot_free(mm_slot_cache, mm_slot);
}
spin_lock(&khugepaged_mm_lock);
mm_slot_insert(mm_slots_hash, mm, slot);
/*
* Insert just behind the scanning cursor, to let the area settle
* down a little.
*/
wakeup = list_empty(&khugepaged_scan.mm_head);
list_add_tail(&slot->mm_node, &khugepaged_scan.mm_head);
spin_unlock(&khugepaged_mm_lock);
if (wakeup)
wake_up_interruptible(&khugepaged_wait);
}
void khugepaged_enter_vma(struct vm_area_struct *vma,
unsigned long vm_flags)
if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) &&
if (hugepage_vma_check(vma, vm_flags, false, false, true))
__khugepaged_enter(vma->vm_mm);
}
}
void __khugepaged_exit(struct mm_struct *mm)
{
struct khugepaged_mm_slot *mm_slot;
struct mm_slot *slot;
int free = 0;
spin_lock(&khugepaged_mm_lock);
slot = mm_slot_lookup(mm_slots_hash, mm);
mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot);
if (mm_slot && khugepaged_scan.mm_slot != mm_slot) {
hash_del(&slot->hash);
list_del(&slot->mm_node);
free = 1;
}
spin_unlock(&khugepaged_mm_lock);
if (free) {
clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
mm_slot_free(mm_slot_cache, mm_slot);
mmdrop(mm);
} else if (mm_slot) {
/*
* This is required to serialize against
* hpage_collapse_test_exit() (which is guaranteed to run
* under mmap sem read mode). Stop here (after we return all
* pagetables will be destroyed) until khugepaged has finished
* working on the pagetables under the mmap_lock.
mmap_write_lock(mm);
mmap_write_unlock(mm);
}
}
static void release_pte_page(struct page *page)
{
mod_node_page_state(page_pgdat(page),
NR_ISOLATED_ANON + page_is_file_lru(page),
-compound_nr(page));
unlock_page(page);
putback_lru_page(page);
}
static void release_pte_pages(pte_t *pte, pte_t *_pte,
struct list_head *compound_pagelist)
struct page *page, *tmp;
while (--_pte >= pte) {
pte_t pteval = *_pte;
page = pte_page(pteval);
if (!pte_none(pteval) && !is_zero_pfn(pte_pfn(pteval)) &&
!PageCompound(page))
release_pte_page(page);
}
list_for_each_entry_safe(page, tmp, compound_pagelist, lru) {
list_del(&page->lru);
release_pte_page(page);
static bool is_refcount_suitable(struct page *page)
{
int expected_refcount;
expected_refcount = total_mapcount(page);
if (PageSwapCache(page))
expected_refcount += compound_nr(page);
return page_count(page) == expected_refcount;
}
static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
unsigned long address,
pte_t *pte,
struct collapse_control *cc,
struct list_head *compound_pagelist)
{
struct page *page = NULL;
pte_t *_pte;
int none_or_zero = 0, shared = 0, result = SCAN_FAIL, referenced = 0;
bool writable = false;
for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
_pte++, address += PAGE_SIZE) {
pte_t pteval = *_pte;
if (pte_none(pteval) || (pte_present(pteval) &&
is_zero_pfn(pte_pfn(pteval)))) {
++none_or_zero;
if (!userfaultfd_armed(vma) &&
(!cc->is_khugepaged ||
none_or_zero <= khugepaged_max_ptes_none)) {
continue;
} else {
result = SCAN_EXCEED_NONE_PTE;
count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
goto out;
}
}
if (!pte_present(pteval)) {
result = SCAN_PTE_NON_PRESENT;
goto out;
}
page = vm_normal_page(vma, address, pteval);
if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
result = SCAN_PAGE_NULL;
goto out;
}
VM_BUG_ON_PAGE(!PageAnon(page), page);
if (page_mapcount(page) > 1) {
++shared;
if (cc->is_khugepaged &&
shared > khugepaged_max_ptes_shared) {
result = SCAN_EXCEED_SHARED_PTE;
count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
goto out;
}
if (PageCompound(page)) {
struct page *p;
page = compound_head(page);
/*
* Check if we have dealt with the compound page
* already
*/
list_for_each_entry(p, compound_pagelist, lru) {
if (page == p)
goto next;
}
}
/*
* We can do it before isolate_lru_page because the
* page can't be freed from under us. NOTE: PG_lock
* is needed to serialize against split_huge_page
* when invoked from the VM.
*/
if (!trylock_page(page)) {
result = SCAN_PAGE_LOCK;
goto out;
}
/*
* Check if the page has any GUP (or other external) pins.
*
* The page table that maps the page has been already unlinked
* from the page table tree and this process cannot get
*
* New pins can come later if the page is shared across fork,
* but not from this process. The other process cannot write to
* the page, only trigger CoW.
if (!is_refcount_suitable(page)) {
unlock_page(page);
result = SCAN_PAGE_COUNT;
goto out;
}
/*
* Isolate the page to avoid collapsing an hugepage
* currently in use by the VM.
*/
if (isolate_lru_page(page)) {
unlock_page(page);
result = SCAN_DEL_PAGE_LRU;
goto out;
}
mod_node_page_state(page_pgdat(page),
NR_ISOLATED_ANON + page_is_file_lru(page),
compound_nr(page));
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(PageLRU(page), page);
if (PageCompound(page))
list_add_tail(&page->lru, compound_pagelist);
next:
/*
* If collapse was initiated by khugepaged, check that there is
* enough young pte to justify collapsing the page
*/
if (cc->is_khugepaged &&
(pte_young(pteval) || page_is_young(page) ||
PageReferenced(page) || mmu_notifier_test_young(vma->vm_mm,
address)))
referenced++;
if (pte_write(pteval))
writable = true;
if (unlikely(!writable)) {
result = SCAN_PAGE_RO;
} else if (unlikely(cc->is_khugepaged && !referenced)) {
result = SCAN_LACK_REFERENCED_PAGE;
} else {
result = SCAN_SUCCEED;
trace_mm_collapse_huge_page_isolate(page, none_or_zero,
referenced, writable, result);
return result;
release_pte_pages(pte, _pte, compound_pagelist);
trace_mm_collapse_huge_page_isolate(page, none_or_zero,
referenced, writable, result);
return result;
}
static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
struct vm_area_struct *vma,
unsigned long address,
spinlock_t *ptl,
struct list_head *compound_pagelist)
struct page *src_page, *tmp;
for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
_pte++, page++, address += PAGE_SIZE) {
pte_t pteval = *_pte;
if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
clear_user_highpage(page, address);
add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);
if (is_zero_pfn(pte_pfn(pteval))) {
/*
* ptl mostly unnecessary.
*/
spin_lock(ptl);
ptep_clear(vma->vm_mm, address, _pte);
spin_unlock(ptl);
}
} else {
src_page = pte_page(pteval);
copy_user_highpage(page, src_page, address, vma);
if (!PageCompound(src_page))
release_pte_page(src_page);
/*
* ptl mostly unnecessary, but preempt has to
* be disabled to update the per-cpu stats
* inside page_remove_rmap().
*/
spin_lock(ptl);
ptep_clear(vma->vm_mm, address, _pte);
page_remove_rmap(src_page, vma, false);
spin_unlock(ptl);
free_page_and_swap_cache(src_page);
}
}
list_for_each_entry_safe(src_page, tmp, compound_pagelist, lru) {
list_del(&src_page->lru);
mod_node_page_state(page_pgdat(src_page),
NR_ISOLATED_ANON + page_is_file_lru(src_page),
-compound_nr(src_page));
unlock_page(src_page);
free_swap_cache(src_page);
putback_lru_page(src_page);
}
static void khugepaged_alloc_sleep(void)
{
DEFINE_WAIT(wait);
add_wait_queue(&khugepaged_wait, &wait);
__set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
schedule_timeout(msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
remove_wait_queue(&khugepaged_wait, &wait);
}
struct collapse_control khugepaged_collapse_control = {
.is_khugepaged = true,
static bool hpage_collapse_scan_abort(int nid, struct collapse_control *cc)
{
int i;
/*
* If node_reclaim_mode is disabled, then no extra effort is made to
* allocate memory locally.
*/
if (!node_reclaim_enabled())
return false;
/* If there is a count for this node already, it must be acceptable */
return false;
for (i = 0; i < MAX_NUMNODES; i++) {
if (node_distance(nid, i) > node_reclaim_distance)
return true;
}
return false;
}
#define khugepaged_defrag() \
(transparent_hugepage_flags & \
(1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG))
/* Defrag for khugepaged will enter direct reclaim/compaction if necessary */
static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void)
{
return khugepaged_defrag() ? GFP_TRANSHUGE : GFP_TRANSHUGE_LIGHT;
}
#ifdef CONFIG_NUMA
static int hpage_collapse_find_target_node(struct collapse_control *cc)
{
int nid, target_node = 0, max_value = 0;
/* find first node with max normal pages hit */
for (nid = 0; nid < MAX_NUMNODES; nid++)
if (cc->node_load[nid] > max_value) {
max_value = cc->node_load[nid];
target_node = nid;
}
for_each_online_node(nid) {
if (max_value == cc->node_load[nid])
node_set(nid, cc->alloc_nmask);
}
return target_node;
}
#else
static int hpage_collapse_find_target_node(struct collapse_control *cc)
return 0;
#endif
static bool hpage_collapse_alloc_page(struct page **hpage, gfp_t gfp, int node,
nodemask_t *nmask)
*hpage = __alloc_pages(gfp, HPAGE_PMD_ORDER, node, nmask);
if (unlikely(!*hpage)) {
count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
return false;
}
prep_transhuge_page(*hpage);
count_vm_event(THP_COLLAPSE_ALLOC);
return true;
}
/*
* If mmap_lock temporarily dropped, revalidate vma
* before taking mmap_lock.
* Returns enum scan_result value.
static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
bool expect_anon,
struct vm_area_struct **vmap,
struct collapse_control *cc)
{
struct vm_area_struct *vma;
if (unlikely(hpage_collapse_test_exit(mm)))
return SCAN_ANY_PROCESS;
*vmap = vma = find_vma(mm, address);
if (!vma)
return SCAN_VMA_NULL;
if (!transhuge_vma_suitable(vma, address))
return SCAN_ADDRESS_RANGE;
if (!hugepage_vma_check(vma, vma->vm_flags, false, false,
cc->is_khugepaged))
return SCAN_VMA_CHECK;
/*
* Anon VMA expected, the address may be unmapped then
* remapped to file after khugepaged reaquired the mmap_lock.
*
* hugepage_vma_check may return true for qualified file
* vmas.
*/
if (expect_anon && (!(*vmap)->anon_vma || !vma_is_anonymous(*vmap)))
return SCAN_PAGE_ANON;
return SCAN_SUCCEED;
static int find_pmd_or_thp_or_none(struct mm_struct *mm,
unsigned long address,
pmd_t **pmd)
{
pmd_t pmde;
*pmd = mm_find_pmd(mm, address);
if (!*pmd)
return SCAN_PMD_NULL;
pmde = pmd_read_atomic(*pmd);
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* See comments in pmd_none_or_trans_huge_or_clear_bad() */
barrier();
#endif
if (pmd_none(pmde))
return SCAN_PMD_NONE;
if (pmd_trans_huge(pmde))
return SCAN_PMD_MAPPED;
if (pmd_bad(pmde))
return SCAN_PMD_NULL;
return SCAN_SUCCEED;
}
static int check_pmd_still_valid(struct mm_struct *mm,
unsigned long address,
pmd_t *pmd)
{
pmd_t *new_pmd;
int result = find_pmd_or_thp_or_none(mm, address, &new_pmd);
if (result != SCAN_SUCCEED)
return result;
if (new_pmd != pmd)
return SCAN_FAIL;
return SCAN_SUCCEED;
}
/*
* Bring missing pages in from swap, to complete THP collapse.
* Only done if hpage_collapse_scan_pmd believes it is worthwhile.
* Called and returns without pte mapped or spinlocks held.
* Note that if false is returned, mmap_lock will be released.
static int __collapse_huge_page_swapin(struct mm_struct *mm,
struct vm_area_struct *vma,
unsigned long haddr, pmd_t *pmd,
int referenced)
int swapped_in = 0;
vm_fault_t ret = 0;
unsigned long address, end = haddr + (HPAGE_PMD_NR * PAGE_SIZE);
for (address = haddr; address < end; address += PAGE_SIZE) {
struct vm_fault vmf = {
.vma = vma,
.address = address,
.pgoff = linear_page_index(vma, haddr),
.flags = FAULT_FLAG_ALLOW_RETRY,
.pmd = pmd,
};
vmf.pte = pte_offset_map(pmd, address);
if (!is_swap_pte(vmf.orig_pte)) {
pte_unmap(vmf.pte);
/*
* do_swap_page returns VM_FAULT_RETRY with released mmap_lock.
* Note we treat VM_FAULT_RETRY as VM_FAULT_ERROR here because
* we do not retry here and swap entry will remain in pagetable
* resulting in later failure.
*/
if (ret & VM_FAULT_RETRY) {
trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
/* Likely, but not guaranteed, that page lock failed */
return SCAN_PAGE_LOCK;
}
if (ret & VM_FAULT_ERROR) {
mmap_read_unlock(mm);
trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
return SCAN_FAIL;
swapped_in++;
/* Drain LRU add pagevec to remove extra pin on the swapped in pages */
if (swapped_in)
lru_add_drain();
trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 1);
return SCAN_SUCCEED;
static int alloc_charge_hpage(struct page **hpage, struct mm_struct *mm,
struct collapse_control *cc)
{
gfp_t gfp = (cc->is_khugepaged ? alloc_hugepage_khugepaged_gfpmask() :
GFP_TRANSHUGE);
int node = hpage_collapse_find_target_node(cc);
if (!hpage_collapse_alloc_page(hpage, gfp, node, &cc->alloc_nmask))
return SCAN_ALLOC_HUGE_PAGE_FAIL;
if (unlikely(mem_cgroup_charge(page_folio(*hpage), mm, gfp)))
return SCAN_CGROUP_CHARGE_FAIL;
count_memcg_page_event(*hpage, THP_COLLAPSE_ALLOC);
return SCAN_SUCCEED;
}
static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
int referenced, int unmapped,
struct collapse_control *cc)
LIST_HEAD(compound_pagelist);
pmd_t *pmd, _pmd;
pte_t *pte;
pgtable_t pgtable;
struct page *hpage;
spinlock_t *pmd_ptl, *pte_ptl;
int result = SCAN_FAIL;
struct vm_area_struct *vma;
struct mmu_notifier_range range;
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
/*
* Before allocating the hugepage, release the mmap_lock read lock.
* The allocation can take potentially a long time if it involves
* sync compaction, and we do not need to hold the mmap_lock during
* that. We will recheck the vma after taking it again in write mode.
*/
mmap_read_unlock(mm);
result = alloc_charge_hpage(&hpage, mm, cc);
if (result != SCAN_SUCCEED)
mmap_read_lock(mm);
result = hugepage_vma_revalidate(mm, address, true, &vma, cc);
if (result != SCAN_SUCCEED) {
mmap_read_unlock(mm);
goto out_nolock;
}
result = find_pmd_or_thp_or_none(mm, address, &pmd);