Commit ce612879 authored by Michal Hocko, committed by Linus Torvalds

mm: move pcp and lru-pcp draining into single wq

We currently have two specific WQ_MEM_RECLAIM workqueues in the mm code:
vmstat_wq for updating pcp stats and lru_add_drain_wq dedicated to
draining per-cpu lru caches.  This is more than necessary because both
can run on a single workqueue: neither blocks on locks requiring a
memory allocation nor performs any allocations itself.  Merging them
saves one rescuer thread.
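
For illustration only (this sketch is not part of the patch, and the
names example_wq and example_init are hypothetical): a WQ_MEM_RECLAIM
workqueue gets a dedicated rescuer kthread at allocation time, which is
why merging two such workqueues into one saves a rescuer.

#include <linux/init.h>
#include <linux/workqueue.h>

/* Hypothetical sketch: one shared reclaim-safe workqueue instead of two. */
static struct workqueue_struct *example_wq;

static int __init example_init(void)
{
	/* WQ_MEM_RECLAIM guarantees a rescuer thread, so queued items can
	 * still run even when new workers cannot be forked under pressure. */
	example_wq = alloc_workqueue("example_wq",
				     WQ_MEM_RECLAIM | WQ_FREEZABLE, 0);
	if (!example_wq)
		return -ENOMEM;
	return 0;
}
early_initcall(example_init);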

On the other hand, drain_all_pages() queues work on the system wq, which
has no rescuer, so its forward progress depends on memory allocation (it
can hang when all workers are stuck allocating and new ones cannot be
created).
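
Sketched with the hypothetical example_wq from above:
schedule_work_on() targets the system wq, which has no rescuer, so a
flush_work() against it may wait forever once every worker is blocked in
reclaim; queueing on a WQ_MEM_RECLAIM queue avoids that dependency.

#include <linux/percpu.h>
#include <linux/workqueue.h>

static DEFINE_PER_CPU(struct work_struct, example_work);

static void example_work_fn(struct work_struct *w)
{
	/* the actual per-cpu drain would happen here */
}

static void example_drain_cpu(int cpu)
{
	struct work_struct *work = per_cpu_ptr(&example_work, cpu);

	INIT_WORK(work, example_work_fn);
	/* schedule_work_on(cpu, work) would use the rescuer-less system
	 * wq; queueing on example_wq keeps the flush below safe. */
	queue_work_on(cpu, example_wq, work);
	flush_work(work);
}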

Initially we thought this would be more of a theoretical problem but
Hugh Dickins has reported:

: 4.11-rc has been giving me hangs after hours of swapping load.  At
: first they looked like memory leaks ("fork: Cannot allocate memory");
: but for no good reason I happened to do "cat /proc/sys/vm/stat_refresh"
: before looking at /proc/meminfo one time, and the stat_refresh stuck
: in D state, waiting for completion of flush_work like many kworkers.
: kthreadd waiting for completion of flush_work in drain_all_pages().

This worker should be using WQ_MEM_RECLAIM as well in order to guarantee
forward progress.  We can reuse the same one as for lru draining and
vmstat.

Signed-off-by: Michal Hocko <>
Suggested-by: Tetsuo Handa <>
Acked-by: Vlastimil Babka <>
Acked-by: Mel Gorman <>
Tested-by: Yang Li <>
Tested-by: Hugh Dickins <>
Signed-off-by: Andrew Morton <>
Signed-off-by: Linus Torvalds <>
parent cdcf4330
mm/internal.h
@@ -481,6 +481,13 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
 enum ttu_flags;
 struct tlbflush_unmap_batch;
 
+/*
+ * only for MM internal work items which do not depend on
+ * any allocations or locks which might depend on allocations
+ */
+extern struct workqueue_struct *mm_percpu_wq;
+
 #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
 void try_to_unmap_flush(void);
 void try_to_unmap_flush_dirty(void);
mm/page_alloc.c
@@ -2373,6 +2373,13 @@ void drain_all_pages(struct zone *zone)
 	static cpumask_t cpus_with_pcps;
 
+	/*
+	 * Make sure nobody triggers this path before mm_percpu_wq is fully
+	 * initialized.
+	 */
+	if (WARN_ON_ONCE(!mm_percpu_wq))
+		return;
+
 	/* Workqueues cannot recurse */
 	if (current->flags & PF_WQ_WORKER)
 		return;
@@ -2422,7 +2429,7 @@ void drain_all_pages(struct zone *zone)
 	for_each_cpu(cpu, &cpus_with_pcps) {
 		struct work_struct *work = per_cpu_ptr(&pcpu_drain, cpu);
 		INIT_WORK(work, drain_local_pages_wq);
-		schedule_work_on(cpu, work);
+		queue_work_on(cpu, mm_percpu_wq, work);
 	}
 	for_each_cpu(cpu, &cpus_with_pcps)
 		flush_work(per_cpu_ptr(&pcpu_drain, cpu));
mm/swap.c
@@ -670,30 +670,19 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy)
 static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
 
-/*
- * lru_add_drain_wq is used to do lru_add_drain_all() from a WQ_MEM_RECLAIM
- * workqueue, aiding in getting memory freed.
- */
-static struct workqueue_struct *lru_add_drain_wq;
-
-static int __init lru_init(void)
-{
-	lru_add_drain_wq = alloc_workqueue("lru-add-drain", WQ_MEM_RECLAIM, 0);
-
-	if (WARN(!lru_add_drain_wq,
-		"Failed to create workqueue lru_add_drain_wq"))
-		return -ENOMEM;
-
-	return 0;
-}
-early_initcall(lru_init);
-
 void lru_add_drain_all(void)
 {
 	static DEFINE_MUTEX(lock);
 	static struct cpumask has_work;
 	int cpu;
 
+	/*
+	 * Make sure nobody triggers this path before mm_percpu_wq is fully
+	 * initialized.
+	 */
+	if (WARN_ON(!mm_percpu_wq))
+		return;
+
@@ -707,7 +696,7 @@ void lru_add_drain_all(void)
 		    pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
 		    need_activate_page_drain(cpu)) {
 			INIT_WORK(work, lru_add_drain_per_cpu);
-			queue_work_on(cpu, lru_add_drain_wq, work);
+			queue_work_on(cpu, mm_percpu_wq, work);
 			cpumask_set_cpu(cpu, &has_work);
 		}
mm/vmstat.c
@@ -1552,7 +1552,6 @@ static const struct file_operations proc_vmstat_file_operations = {
 #endif /* CONFIG_PROC_FS */
 
 #ifdef CONFIG_SMP
-static struct workqueue_struct *vmstat_wq;
 static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
 int sysctl_stat_interval __read_mostly = HZ;
@@ -1623,7 +1622,7 @@ static void vmstat_update(struct work_struct *w)
 		 * to occur in the future. Keep on running the
 		 * update worker thread.
 		 */
-		queue_delayed_work_on(smp_processor_id(), vmstat_wq,
+		queue_delayed_work_on(smp_processor_id(), mm_percpu_wq,
 			round_jiffies_relative(sysctl_stat_interval));
@@ -1702,7 +1701,7 @@ static void vmstat_shepherd(struct work_struct *w)
 		struct delayed_work *dw = &per_cpu(vmstat_work, cpu);
 
 		if (!delayed_work_pending(dw) && need_update(cpu))
-			queue_delayed_work_on(cpu, vmstat_wq, dw, 0);
+			queue_delayed_work_on(cpu, mm_percpu_wq, dw, 0);
@@ -1718,7 +1717,6 @@ static void __init start_shepherd_timer(void)
 		INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu),
 			vmstat_update);
 
-	vmstat_wq = alloc_workqueue("vmstat", WQ_FREEZABLE|WQ_MEM_RECLAIM, 0);
@@ -1764,11 +1762,16 @@ static int vmstat_cpu_dead(unsigned int cpu)
 #endif
 
+struct workqueue_struct *mm_percpu_wq;
+
 void __init init_mm_internals(void)
 {
-	int ret;
+	int ret __maybe_unused;
+
+	mm_percpu_wq = alloc_workqueue("mm_percpu_wq",
+				       WQ_FREEZABLE|WQ_MEM_RECLAIM, 0);
 
 #ifdef CONFIG_SMP
 	ret = cpuhp_setup_state_nocalls(CPUHP_MM_VMSTAT_DEAD, "mm/vmstat:dead",
 					NULL, vmstat_cpu_dead);
 	if (ret < 0)