Commit f47edb59 authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge branch 'akpm' (patches from Andrew)

Mergr misc fixes from Andrew Morton:
 "11 fixes"

Mostly VM fixes, one psi polling fix, and one parisc build fix.

* emailed patches from Andrew Morton <akpm@linux-foundation.org>:
  mm/kasan: fix false positive invalid-free reports with CONFIG_KASAN_SW_TAGS=y
  mm/zsmalloc.c: fix race condition in zs_destroy_pool
  mm/zsmalloc.c: migration can leave pages in ZS_EMPTY indefinitely
  mm, page_owner: handle THP splits correctly
  userfaultfd_release: always remove uffd flags and clear vm_userfaultfd_ctx
  psi: get poll_work to run when calling poll syscall next time
  mm: memcontrol: flush percpu vmevents before releasing memcg
  mm: memcontrol: flush percpu vmstats before releasing memcg
  parisc: fix compilation errrors
  mm, page_alloc: move_freepages should not examine struct page of reserved memory
  mm/z3fold.c: fix race between migration and destruction
parents e67095fd 00fb24a4
......@@ -2,6 +2,7 @@
#ifndef _PARISC_PGTABLE_H
#define _PARISC_PGTABLE_H
#include <asm/page.h>
#include <asm-generic/4level-fixup.h>
#include <asm/fixmap.h>
......@@ -98,8 +99,6 @@ static inline void purge_tlb_entries(struct mm_struct *mm, unsigned long addr)
#endif /* !__ASSEMBLY__ */
#include <asm/page.h>
#define pte_ERROR(e) \
printk("%s:%d: bad pte %08lx.\n", __FILE__, __LINE__, pte_val(e))
#define pmd_ERROR(e) \
......
......@@ -880,6 +880,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
/* len == 0 means wake all */
struct userfaultfd_wake_range range = { .len = 0, };
unsigned long new_flags;
bool still_valid;
WRITE_ONCE(ctx->released, true);
......@@ -895,8 +896,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
* taking the mmap_sem for writing.
*/
down_write(&mm->mmap_sem);
if (!mmget_still_valid(mm))
goto skip_mm;
still_valid = mmget_still_valid(mm);
prev = NULL;
for (vma = mm->mmap; vma; vma = vma->vm_next) {
cond_resched();
......@@ -907,19 +907,20 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
continue;
}
new_flags = vma->vm_flags & ~(VM_UFFD_MISSING | VM_UFFD_WP);
prev = vma_merge(mm, prev, vma->vm_start, vma->vm_end,
new_flags, vma->anon_vma,
vma->vm_file, vma->vm_pgoff,
vma_policy(vma),
NULL_VM_UFFD_CTX);
if (prev)
vma = prev;
else
prev = vma;
if (still_valid) {
prev = vma_merge(mm, prev, vma->vm_start, vma->vm_end,
new_flags, vma->anon_vma,
vma->vm_file, vma->vm_pgoff,
vma_policy(vma),
NULL_VM_UFFD_CTX);
if (prev)
vma = prev;
else
prev = vma;
}
vma->vm_flags = new_flags;
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
}
skip_mm:
up_write(&mm->mmap_sem);
mmput(mm);
wakeup:
......
......@@ -1131,7 +1131,15 @@ static void psi_trigger_destroy(struct kref *ref)
* deadlock while waiting for psi_poll_work to acquire trigger_lock
*/
if (kworker_to_destroy) {
/*
* After the RCU grace period has expired, the worker
* can no longer be found through group->poll_kworker.
* But it might have been already scheduled before
* that - deschedule it cleanly before destroying it.
*/
kthread_cancel_delayed_work_sync(&group->poll_work);
atomic_set(&group->poll_scheduled, 0);
kthread_destroy_worker(kworker_to_destroy);
}
kfree(t);
......
......@@ -32,6 +32,7 @@
#include <linux/shmem_fs.h>
#include <linux/oom.h>
#include <linux/numa.h>
#include <linux/page_owner.h>
#include <asm/tlb.h>
#include <asm/pgalloc.h>
......@@ -2516,6 +2517,9 @@ static void __split_huge_page(struct page *page, struct list_head *list,
}
ClearPageCompound(head);
split_page_owner(head, HPAGE_PMD_ORDER);
/* See comment in __split_huge_page_tail() */
if (PageAnon(head)) {
/* Additional pin to swap cache */
......
......@@ -407,8 +407,14 @@ static inline bool shadow_invalid(u8 tag, s8 shadow_byte)
if (IS_ENABLED(CONFIG_KASAN_GENERIC))
return shadow_byte < 0 ||
shadow_byte >= KASAN_SHADOW_SCALE_SIZE;
else
return tag != (u8)shadow_byte;
/* else CONFIG_KASAN_SW_TAGS: */
if ((u8)shadow_byte == KASAN_TAG_INVALID)
return true;
if ((tag != KASAN_TAG_KERNEL) && (tag != (u8)shadow_byte))
return true;
return false;
}
static bool __kasan_slab_free(struct kmem_cache *cache, void *object,
......
......@@ -3260,6 +3260,60 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
}
}
static void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg)
{
unsigned long stat[MEMCG_NR_STAT];
struct mem_cgroup *mi;
int node, cpu, i;
for (i = 0; i < MEMCG_NR_STAT; i++)
stat[i] = 0;
for_each_online_cpu(cpu)
for (i = 0; i < MEMCG_NR_STAT; i++)
stat[i] += raw_cpu_read(memcg->vmstats_percpu->stat[i]);
for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
for (i = 0; i < MEMCG_NR_STAT; i++)
atomic_long_add(stat[i], &mi->vmstats[i]);
for_each_node(node) {
struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
struct mem_cgroup_per_node *pi;
for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
stat[i] = 0;
for_each_online_cpu(cpu)
for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
stat[i] += raw_cpu_read(
pn->lruvec_stat_cpu->count[i]);
for (pi = pn; pi; pi = parent_nodeinfo(pi, node))
for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
atomic_long_add(stat[i], &pi->lruvec_stat[i]);
}
}
static void memcg_flush_percpu_vmevents(struct mem_cgroup *memcg)
{
unsigned long events[NR_VM_EVENT_ITEMS];
struct mem_cgroup *mi;
int cpu, i;
for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
events[i] = 0;
for_each_online_cpu(cpu)
for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
events[i] += raw_cpu_read(
memcg->vmstats_percpu->events[i]);
for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
atomic_long_add(events[i], &mi->vmevents[i]);
}
#ifdef CONFIG_MEMCG_KMEM
static int memcg_online_kmem(struct mem_cgroup *memcg)
{
......@@ -4682,6 +4736,12 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
{
int node;
/*
* Flush percpu vmstats and vmevents to guarantee the value correctness
* on parent's and all ancestor levels.
*/
memcg_flush_percpu_vmstats(memcg);
memcg_flush_percpu_vmevents(memcg);
for_each_node(node)
free_mem_cgroup_per_node_info(memcg, node);
free_percpu(memcg->vmstats_percpu);
......
......@@ -2238,27 +2238,12 @@ static int move_freepages(struct zone *zone,
unsigned int order;
int pages_moved = 0;
#ifndef CONFIG_HOLES_IN_ZONE
/*
* page_zone is not safe to call in this context when
* CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant
* anyway as we check zone boundaries in move_freepages_block().
* Remove at a later date when no bug reports exist related to
* grouping pages by mobility
*/
VM_BUG_ON(pfn_valid(page_to_pfn(start_page)) &&
pfn_valid(page_to_pfn(end_page)) &&
page_zone(start_page) != page_zone(end_page));
#endif
for (page = start_page; page <= end_page;) {
if (!pfn_valid_within(page_to_pfn(page))) {
page++;
continue;
}
/* Make sure we are not inadvertently changing nodes */
VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
if (!PageBuddy(page)) {
/*
* We assume that pages that could be isolated for
......@@ -2273,6 +2258,10 @@ static int move_freepages(struct zone *zone,
continue;
}
/* Make sure we are not inadvertently changing nodes */
VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
VM_BUG_ON_PAGE(page_zone(page) != zone, page);
order = page_order(page);
move_to_free_area(page, &zone->free_area[order], migratetype);
page += 1 << order;
......
......@@ -41,6 +41,7 @@
#include <linux/workqueue.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/wait.h>
#include <linux/zpool.h>
#include <linux/magic.h>
......@@ -145,6 +146,8 @@ struct z3fold_header {
* @release_wq: workqueue for safe page release
* @work: work_struct for safe page release
* @inode: inode for z3fold pseudo filesystem
* @destroying: bool to stop migration once we start destruction
* @isolated: int to count the number of pages currently in isolation
*
* This structure is allocated at pool creation time and maintains metadata
* pertaining to a particular z3fold pool.
......@@ -163,8 +166,11 @@ struct z3fold_pool {
const struct zpool_ops *zpool_ops;
struct workqueue_struct *compact_wq;
struct workqueue_struct *release_wq;
struct wait_queue_head isolate_wait;
struct work_struct work;
struct inode *inode;
bool destroying;
int isolated;
};
/*
......@@ -769,6 +775,7 @@ static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp,
goto out_c;
spin_lock_init(&pool->lock);
spin_lock_init(&pool->stale_lock);
init_waitqueue_head(&pool->isolate_wait);
pool->unbuddied = __alloc_percpu(sizeof(struct list_head)*NCHUNKS, 2);
if (!pool->unbuddied)
goto out_pool;
......@@ -808,6 +815,15 @@ static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp,
return NULL;
}
static bool pool_isolated_are_drained(struct z3fold_pool *pool)
{
bool ret;
spin_lock(&pool->lock);
ret = pool->isolated == 0;
spin_unlock(&pool->lock);
return ret;
}
/**
* z3fold_destroy_pool() - destroys an existing z3fold pool
* @pool: the z3fold pool to be destroyed
......@@ -817,6 +833,22 @@ static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp,
static void z3fold_destroy_pool(struct z3fold_pool *pool)
{
kmem_cache_destroy(pool->c_handle);
/*
* We set pool-> destroying under lock to ensure that
* z3fold_page_isolate() sees any changes to destroying. This way we
* avoid the need for any memory barriers.
*/
spin_lock(&pool->lock);
pool->destroying = true;
spin_unlock(&pool->lock);
/*
* We need to ensure that no pages are being migrated while we destroy
* these workqueues, as migration can queue work on either of the
* workqueues.
*/
wait_event(pool->isolate_wait, !pool_isolated_are_drained(pool));
/*
* We need to destroy pool->compact_wq before pool->release_wq,
......@@ -1307,6 +1339,28 @@ static u64 z3fold_get_pool_size(struct z3fold_pool *pool)
return atomic64_read(&pool->pages_nr);
}
/*
* z3fold_dec_isolated() expects to be called while pool->lock is held.
*/
static void z3fold_dec_isolated(struct z3fold_pool *pool)
{
assert_spin_locked(&pool->lock);
VM_BUG_ON(pool->isolated <= 0);
pool->isolated--;
/*
* If we have no more isolated pages, we have to see if
* z3fold_destroy_pool() is waiting for a signal.
*/
if (pool->isolated == 0 && waitqueue_active(&pool->isolate_wait))
wake_up_all(&pool->isolate_wait);
}
static void z3fold_inc_isolated(struct z3fold_pool *pool)
{
pool->isolated++;
}
static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode)
{
struct z3fold_header *zhdr;
......@@ -1333,6 +1387,33 @@ static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode)
spin_lock(&pool->lock);
if (!list_empty(&page->lru))
list_del(&page->lru);
/*
* We need to check for destruction while holding pool->lock, as
* otherwise destruction could see 0 isolated pages, and
* proceed.
*/
if (unlikely(pool->destroying)) {
spin_unlock(&pool->lock);
/*
* If this page isn't stale, somebody else holds a
* reference to it. Let't drop our refcount so that they
* can call the release logic.
*/
if (unlikely(kref_put(&zhdr->refcount,
release_z3fold_page_locked))) {
/*
* If we get here we have kref problems, so we
* should freak out.
*/
WARN(1, "Z3fold is experiencing kref problems\n");
return false;
}
z3fold_page_unlock(zhdr);
return false;
}
z3fold_inc_isolated(pool);
spin_unlock(&pool->lock);
z3fold_page_unlock(zhdr);
return true;
......@@ -1401,6 +1482,10 @@ static int z3fold_page_migrate(struct address_space *mapping, struct page *newpa
queue_work_on(new_zhdr->cpu, pool->compact_wq, &new_zhdr->work);
spin_lock(&pool->lock);
z3fold_dec_isolated(pool);
spin_unlock(&pool->lock);
page_mapcount_reset(page);
put_page(page);
return 0;
......@@ -1420,10 +1505,14 @@ static void z3fold_page_putback(struct page *page)
INIT_LIST_HEAD(&page->lru);
if (kref_put(&zhdr->refcount, release_z3fold_page_locked)) {
atomic64_dec(&pool->pages_nr);
spin_lock(&pool->lock);
z3fold_dec_isolated(pool);
spin_unlock(&pool->lock);
return;
}
spin_lock(&pool->lock);
list_add(&page->lru, &pool->lru);
z3fold_dec_isolated(pool);
spin_unlock(&pool->lock);
z3fold_page_unlock(zhdr);
}
......
......@@ -54,6 +54,7 @@
#include <linux/mount.h>
#include <linux/pseudo_fs.h>
#include <linux/migrate.h>
#include <linux/wait.h>
#include <linux/pagemap.h>
#include <linux/fs.h>
......@@ -268,6 +269,10 @@ struct zs_pool {
#ifdef CONFIG_COMPACTION
struct inode *inode;
struct work_struct free_work;
/* A wait queue for when migration races with async_free_zspage() */
struct wait_queue_head migration_wait;
atomic_long_t isolated_pages;
bool destroying;
#endif
};
......@@ -1862,6 +1867,31 @@ static void dec_zspage_isolation(struct zspage *zspage)
zspage->isolated--;
}
static void putback_zspage_deferred(struct zs_pool *pool,
struct size_class *class,
struct zspage *zspage)
{
enum fullness_group fg;
fg = putback_zspage(class, zspage);
if (fg == ZS_EMPTY)
schedule_work(&pool->free_work);
}
static inline void zs_pool_dec_isolated(struct zs_pool *pool)
{
VM_BUG_ON(atomic_long_read(&pool->isolated_pages) <= 0);
atomic_long_dec(&pool->isolated_pages);
/*
* There's no possibility of racing, since wait_for_isolated_drain()
* checks the isolated count under &class->lock after enqueuing
* on migration_wait.
*/
if (atomic_long_read(&pool->isolated_pages) == 0 && pool->destroying)
wake_up_all(&pool->migration_wait);
}
static void replace_sub_page(struct size_class *class, struct zspage *zspage,
struct page *newpage, struct page *oldpage)
{
......@@ -1931,6 +1961,7 @@ static bool zs_page_isolate(struct page *page, isolate_mode_t mode)
*/
if (!list_empty(&zspage->list) && !is_zspage_isolated(zspage)) {
get_zspage_mapping(zspage, &class_idx, &fullness);
atomic_long_inc(&pool->isolated_pages);
remove_zspage(class, zspage, fullness);
}
......@@ -2030,8 +2061,16 @@ static int zs_page_migrate(struct address_space *mapping, struct page *newpage,
* Page migration is done so let's putback isolated zspage to
* the list if @page is final isolated subpage in the zspage.
*/
if (!is_zspage_isolated(zspage))
putback_zspage(class, zspage);
if (!is_zspage_isolated(zspage)) {
/*
* We cannot race with zs_destroy_pool() here because we wait
* for isolation to hit zero before we start destroying.
* Also, we ensure that everyone can see pool->destroying before
* we start waiting.
*/
putback_zspage_deferred(pool, class, zspage);
zs_pool_dec_isolated(pool);
}
reset_page(page);
put_page(page);
......@@ -2077,13 +2116,12 @@ static void zs_page_putback(struct page *page)
spin_lock(&class->lock);
dec_zspage_isolation(zspage);
if (!is_zspage_isolated(zspage)) {
fg = putback_zspage(class, zspage);
/*
* Due to page_lock, we cannot free zspage immediately
* so let's defer.
*/
if (fg == ZS_EMPTY)
schedule_work(&pool->free_work);
putback_zspage_deferred(pool, class, zspage);
zs_pool_dec_isolated(pool);
}
spin_unlock(&class->lock);
}
......@@ -2107,8 +2145,36 @@ static int zs_register_migration(struct zs_pool *pool)
return 0;
}
static bool pool_isolated_are_drained(struct zs_pool *pool)
{
return atomic_long_read(&pool->isolated_pages) == 0;
}
/* Function for resolving migration */
static void wait_for_isolated_drain(struct zs_pool *pool)
{
/*
* We're in the process of destroying the pool, so there are no
* active allocations. zs_page_isolate() fails for completely free
* zspages, so we need only wait for the zs_pool's isolated
* count to hit zero.
*/
wait_event(pool->migration_wait,
pool_isolated_are_drained(pool));
}
static void zs_unregister_migration(struct zs_pool *pool)
{
pool->destroying = true;
/*
* We need a memory barrier here to ensure global visibility of
* pool->destroying. Thus pool->isolated pages will either be 0 in which
* case we don't care, or it will be > 0 and pool->destroying will
* ensure that we wake up once isolation hits 0.
*/
smp_mb();
wait_for_isolated_drain(pool); /* This can block */
flush_work(&pool->free_work);
iput(pool->inode);
}
......@@ -2346,6 +2412,8 @@ struct zs_pool *zs_create_pool(const char *name)
if (!pool->name)
goto err;
init_waitqueue_head(&pool->migration_wait);
if (create_cache(pool))
goto err;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment