/*
 * kernel/workqueue.c - generic async execution with shared worker pool
 *
 * Copyright (C) 2002		Ingo Molnar
 *
 *   Derived from the taskqueue/keventd code by:
 *     David Woodhouse <dwmw2@infradead.org>
 *     Andrew Morton
 *     Kai Petzke <wpp@marie.physik.tu-berlin.de>
 *     Theodore Ts'o <tytso@mit.edu>
 *
 * Made to use alloc_percpu by Christoph Lameter.
 *
 * Copyright (C) 2010		SUSE Linux Products GmbH
 * Copyright (C) 2010		Tejun Heo <tj@kernel.org>
 *
 * This is the generic async execution mechanism.  Work items are
 * executed in process context.  The worker pool is shared and
 * automatically managed.  There are two worker pools for each CPU (one for
 * normal work items and the other for high priority ones) and some extra
 * pools for workqueues which are not bound to any specific CPU - the
 * number of these backing pools is dynamic.
 *
 * Please read Documentation/core-api/workqueue.rst for details.
 */

27
#include <linux/export.h>
Linus Torvalds's avatar
Linus Torvalds committed
28 29 30 31 32 33 34 35 36 37
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/init.h>
#include <linux/signal.h>
#include <linux/completion.h>
#include <linux/workqueue.h>
#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/kthread.h>
38
#include <linux/hardirq.h>
39
#include <linux/mempolicy.h>
40
#include <linux/freezer.h>
41
#include <linux/debug_locks.h>
42
#include <linux/lockdep.h>
Tejun Heo's avatar
Tejun Heo committed
43
#include <linux/idr.h>
44
#include <linux/jhash.h>
45
#include <linux/hashtable.h>
46
#include <linux/rculist.h>
47
#include <linux/nodemask.h>
48
#include <linux/moduleparam.h>
49
#include <linux/uaccess.h>
50
#include <linux/sched/isolation.h>
51
#include <linux/nmi.h>
52

53
#include "workqueue_internal.h"
Linus Torvalds's avatar
Linus Torvalds committed
54

Tejun Heo's avatar
Tejun Heo committed
55
enum {
	/*
	 * worker_pool flags
	 *
	 * A bound pool is either associated or disassociated with its CPU.
	 * While associated (!DISASSOCIATED), all workers are bound to the
	 * CPU and none has %WORKER_UNBOUND set and concurrency management
	 * is in effect.
	 *
	 * While DISASSOCIATED, the cpu may be offline and all workers have
	 * %WORKER_UNBOUND set and concurrency management disabled, and may
	 * be executing on any CPU.  The pool behaves as an unbound one.
	 *
	 * Note that DISASSOCIATED should be flipped only while holding
	 * attach_mutex to avoid changing binding state while
	 * worker_attach_to_pool() is in progress.
	 */
	POOL_MANAGER_ACTIVE	= 1 << 0,	/* being managed */
	POOL_DISASSOCIATED	= 1 << 2,	/* cpu can't serve workers */

	/* worker flags */
	WORKER_DIE		= 1 << 1,	/* die die die */
	WORKER_IDLE		= 1 << 2,	/* is idle */
	WORKER_PREP		= 1 << 3,	/* preparing to run works */
	WORKER_CPU_INTENSIVE	= 1 << 6,	/* cpu intensive */
	WORKER_UNBOUND		= 1 << 7,	/* worker is unbound */
	WORKER_REBOUND		= 1 << 8,	/* worker was rebound */

	WORKER_NOT_RUNNING	= WORKER_PREP | WORKER_CPU_INTENSIVE |
				  WORKER_UNBOUND | WORKER_REBOUND,

	NR_STD_WORKER_POOLS	= 2,		/* # standard pools per cpu */

	UNBOUND_POOL_HASH_ORDER	= 6,		/* hashed by pool->attrs */
	BUSY_WORKER_HASH_ORDER	= 6,		/* 64 pointers */

	MAX_IDLE_WORKERS_RATIO	= 4,		/* 1/4 of busy can be idle */
	IDLE_WORKER_TIMEOUT	= 300 * HZ,	/* keep idle ones for 5 mins */

	MAYDAY_INITIAL_TIMEOUT  = HZ / 100 >= 2 ? HZ / 100 : 2,
						/* call for help after 10ms
						   (min two ticks) */
	MAYDAY_INTERVAL		= HZ / 10,	/* and then every 100ms */
	CREATE_COOLDOWN		= HZ,		/* time to breathe after a failure */

	/*
	 * Rescue workers are used only in emergencies and shared by
	 * all cpus.  Give MIN_NICE.
	 */
	RESCUER_NICE_LEVEL	= MIN_NICE,
	HIGHPRI_NICE_LEVEL	= MIN_NICE,

	WQ_NAME_LEN		= 24,
};
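
/*
 * A minimal illustrative sketch (the helper below is hypothetical, not a
 * kernel API): a worker contributes to its pool's nr_running count only
 * while none of the WORKER_NOT_RUNNING bits are set, which is what keeps
 * PREP, CPU_INTENSIVE, UNBOUND and REBOUND workers out of the
 * concurrency-management accounting.
 */
static inline bool example_worker_counts_as_running(struct worker *worker)
{
	/* any NOT_RUNNING bit excludes the worker from concurrency mgmt */
	return !(worker->flags & WORKER_NOT_RUNNING);
}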

/*
 * Structure fields follow one of the following exclusion rules.
 *
 * I: Modifiable by initialization/destruction paths and read-only for
 *    everyone else.
 *
 * P: Preemption protected.  Disabling preemption is enough and should
 *    only be modified and accessed from the local cpu.
 *
 * L: pool->lock protected.  Access with pool->lock held.
 *
 * X: During normal operation, modification requires pool->lock and should
 *    be done only from local cpu.  Either disabling preemption on local
 *    cpu or grabbing pool->lock is enough for read access.  If
 *    POOL_DISASSOCIATED is set, it's identical to L.
 *
 * A: pool->attach_mutex protected.
 *
 * PL: wq_pool_mutex protected.
 *
 * PR: wq_pool_mutex protected for writes.  Sched-RCU protected for reads.
 *
 * PW: wq_pool_mutex and wq->mutex protected for writes.  Either for reads.
 *
 * PWR: wq_pool_mutex and wq->mutex protected for writes.  Either or
 *      sched-RCU for reads.
 *
 * WQ: wq->mutex protected.
 *
 * WR: wq->mutex protected for writes.  Sched-RCU protected for reads.
 *
 * MD: wq_mayday_lock protected.
 */

/* struct worker is defined in workqueue_internal.h */

struct worker_pool {
	spinlock_t		lock;		/* the pool lock */
	int			cpu;		/* I: the associated cpu */
	int			node;		/* I: the associated node ID */
	int			id;		/* I: pool ID */
	unsigned int		flags;		/* X: flags */

	unsigned long		watchdog_ts;	/* L: watchdog timestamp */

	struct list_head	worklist;	/* L: list of pending works */
	int			nr_workers;	/* L: total number of workers */

	/* nr_idle includes the ones off idle_list for rebinding */
	int			nr_idle;	/* L: currently idle ones */

	struct list_head	idle_list;	/* X: list of idle workers */
	struct timer_list	idle_timer;	/* L: worker idle timeout */
	struct timer_list	mayday_timer;	/* L: SOS timer for workers */

	/* a worker is either on busy_hash or idle_list, or the manager */
	DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER);
						/* L: hash of busy workers */

	/* see manage_workers() for details on the two manager mutexes */
	struct worker		*manager;	/* L: purely informational */
	struct mutex		attach_mutex;	/* attach/detach exclusion */
	struct list_head	workers;	/* A: attached workers */
	struct completion	*detach_completion; /* all workers detached */

	struct ida		worker_ida;	/* worker IDs for task name */

	struct workqueue_attrs	*attrs;		/* I: worker attributes */
	struct hlist_node	hash_node;	/* PL: unbound_pool_hash node */
	int			refcnt;		/* PL: refcnt for unbound pools */

	/*
	 * The current concurrency level.  As it's likely to be accessed
	 * from other CPUs during try_to_wake_up(), put it in a separate
	 * cacheline.
	 */
	atomic_t		nr_running ____cacheline_aligned_in_smp;

	/*
	 * Destruction of pool is sched-RCU protected to allow dereferences
	 * from get_work_pool().
	 */
	struct rcu_head		rcu;
} ____cacheline_aligned_in_smp;

/*
 * The per-pool workqueue.  While queued, the lower WORK_STRUCT_FLAG_BITS
 * of work_struct->data are used for flags and the remaining high bits
 * point to the pwq; thus, pwqs need to be aligned to at least
 * 1 << WORK_STRUCT_FLAG_BITS.
 */
struct pool_workqueue {
	struct worker_pool	*pool;		/* I: the associated pool */
	struct workqueue_struct *wq;		/* I: the owning workqueue */
	int			work_color;	/* L: current color */
	int			flush_color;	/* L: flushing color */
	int			refcnt;		/* L: reference count */
	int			nr_in_flight[WORK_NR_COLORS];
						/* L: nr of in_flight works */
	int			nr_active;	/* L: nr of active works */
	int			max_active;	/* L: max active works */
	struct list_head	delayed_works;	/* L: delayed works */
	struct list_head	pwqs_node;	/* WR: node on wq->pwqs */
	struct list_head	mayday_node;	/* MD: node on wq->maydays */

	/*
	 * Release of unbound pwq is punted to system_wq.  See put_pwq()
	 * and pwq_unbound_release_workfn() for details.  pool_workqueue
	 * itself is also sched-RCU protected so that the first pwq can be
	 * determined without grabbing wq->mutex.
	 */
	struct work_struct	unbound_release_work;
	struct rcu_head		rcu;
} __aligned(1 << WORK_STRUCT_FLAG_BITS);
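
/*
 * A minimal sketch of what this alignment buys (the helper below is
 * hypothetical and purely illustrative): because the low
 * WORK_STRUCT_FLAG_BITS of every pwq address are guaranteed to be zero,
 * a pwq pointer and the work flag bits can share the single
 * work_struct->data word.
 */
static inline unsigned long example_pack_pwq(struct pool_workqueue *pwq,
					     unsigned long flags)
{
	/* same packing that set_work_pwq()/get_work_pwq() rely on below */
	return (unsigned long)pwq | (flags & WORK_STRUCT_FLAG_MASK);
}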

/*
 * Structure used to wait for workqueue flush.
 */
struct wq_flusher {
	struct list_head	list;		/* WQ: list of flushers */
	int			flush_color;	/* WQ: flush color waiting for */
	struct completion	done;		/* flush completion */
};

struct wq_device;

/*
 * The externally visible workqueue.  It relays the issued work items to
 * the appropriate worker_pool through its pool_workqueues.
 */
struct workqueue_struct {
	struct list_head	pwqs;		/* WR: all pwqs of this wq */
	struct list_head	list;		/* PR: list of all workqueues */

	struct mutex		mutex;		/* protects this wq */
	int			work_color;	/* WQ: current work color */
	int			flush_color;	/* WQ: current flush color */
	atomic_t		nr_pwqs_to_flush; /* flush in progress */
	struct wq_flusher	*first_flusher;	/* WQ: first flusher */
	struct list_head	flusher_queue;	/* WQ: flush waiters */
	struct list_head	flusher_overflow; /* WQ: flush overflow list */

	struct list_head	maydays;	/* MD: pwqs requesting rescue */
	struct worker		*rescuer;	/* I: rescue worker */

	int			nr_drainers;	/* WQ: drain in progress */
	int			saved_max_active; /* WQ: saved pwq max_active */

	struct workqueue_attrs	*unbound_attrs;	/* PW: only for unbound wqs */
	struct pool_workqueue	*dfl_pwq;	/* PW: only for unbound wqs */

#ifdef CONFIG_SYSFS
	struct wq_device	*wq_dev;	/* I: for sysfs interface */
#endif
#ifdef CONFIG_LOCKDEP
	struct lockdep_map	lockdep_map;
#endif
	char			name[WQ_NAME_LEN]; /* I: workqueue name */

	/*
	 * Destruction of workqueue_struct is sched-RCU protected to allow
	 * walking the workqueues list without grabbing wq_pool_mutex.
	 * This is used to dump all workqueues from sysrq.
	 */
	struct rcu_head		rcu;

	/* hot fields used during command issue, aligned to cacheline */
	unsigned int		flags ____cacheline_aligned; /* WQ: WQ_* flags */
	struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwqs */
	struct pool_workqueue __rcu *numa_pwq_tbl[]; /* PWR: unbound pwqs indexed by node */
};
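
/*
 * A minimal usage sketch of the externally visible side (illustrative
 * only; example_dev, example_work_fn and example_submit are made-up
 * names): work items are declared with the standard workqueue API and
 * handed to a wq such as system_wq, which routes them to a backing
 * worker_pool via the pool_workqueues above.
 */
struct example_dev {
	struct work_struct	work;
};

static inline void example_work_fn(struct work_struct *work)
{
	struct example_dev *dev = container_of(work, struct example_dev, work);

	/* runs in process context on a shared kworker */
	(void)dev;
}

static inline void example_submit(struct example_dev *dev)
{
	INIT_WORK(&dev->work, example_work_fn);	/* normally done once at init */
	queue_work(system_wq, &dev->work);	/* equivalent to schedule_work() */
}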

static struct kmem_cache *pwq_cache;

static cpumask_var_t *wq_numa_possible_cpumask;
					/* possible CPUs of each node */

static bool wq_disable_numa;
module_param_named(disable_numa, wq_disable_numa, bool, 0444);

/* see the comment above the definition of WQ_POWER_EFFICIENT */
static bool wq_power_efficient = IS_ENABLED(CONFIG_WQ_POWER_EFFICIENT_DEFAULT);
module_param_named(power_efficient, wq_power_efficient, bool, 0444);

static bool wq_online;			/* can kworkers be created yet? */

static bool wq_numa_enabled;		/* unbound NUMA affinity enabled */

/* buf for wq_update_unbound_numa_attrs(), protected by CPU hotplug exclusion */
static struct workqueue_attrs *wq_update_unbound_numa_attrs_buf;

static DEFINE_MUTEX(wq_pool_mutex);	/* protects pools and workqueues list */
static DEFINE_SPINLOCK(wq_mayday_lock);	/* protects wq->maydays list */
static DECLARE_WAIT_QUEUE_HEAD(wq_manager_wait); /* wait for manager to go away */

static LIST_HEAD(workqueues);		/* PR: list of all workqueues */
static bool workqueue_freezing;		/* PL: have wqs started freezing? */

/* PL: allowable cpus for unbound wqs and work items */
static cpumask_var_t wq_unbound_cpumask;

/* CPU where unbound work was last round robin scheduled from this CPU */
static DEFINE_PER_CPU(int, wq_rr_cpu_last);

/*
 * Local execution of unbound work items is no longer guaranteed.  The
 * following always forces round-robin CPU selection on unbound work items
 * to uncover usages which depend on it.
 */
#ifdef CONFIG_DEBUG_WQ_FORCE_RR_CPU
static bool wq_debug_force_rr_cpu = true;
#else
static bool wq_debug_force_rr_cpu = false;
#endif
module_param_named(debug_force_rr_cpu, wq_debug_force_rr_cpu, bool, 0644);

/* the per-cpu worker pools */
static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], cpu_worker_pools);

static DEFINE_IDR(worker_pool_idr);	/* PR: idr of all pools */

/* PL: hash of all unbound pools keyed by pool->attrs */
static DEFINE_HASHTABLE(unbound_pool_hash, UNBOUND_POOL_HASH_ORDER);

/* I: attributes used when instantiating standard unbound pools on demand */
static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS];

/* I: attributes used when instantiating ordered pools on demand */
static struct workqueue_attrs *ordered_wq_attrs[NR_STD_WORKER_POOLS];

struct workqueue_struct *system_wq __read_mostly;
EXPORT_SYMBOL(system_wq);
struct workqueue_struct *system_highpri_wq __read_mostly;
EXPORT_SYMBOL_GPL(system_highpri_wq);
struct workqueue_struct *system_long_wq __read_mostly;
EXPORT_SYMBOL_GPL(system_long_wq);
struct workqueue_struct *system_unbound_wq __read_mostly;
EXPORT_SYMBOL_GPL(system_unbound_wq);
struct workqueue_struct *system_freezable_wq __read_mostly;
EXPORT_SYMBOL_GPL(system_freezable_wq);
struct workqueue_struct *system_power_efficient_wq __read_mostly;
EXPORT_SYMBOL_GPL(system_power_efficient_wq);
struct workqueue_struct *system_freezable_power_efficient_wq __read_mostly;
EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq);

static int worker_thread(void *__worker);
static void workqueue_sysfs_unregister(struct workqueue_struct *wq);

#define CREATE_TRACE_POINTS
#include <trace/events/workqueue.h>

#define assert_rcu_or_pool_mutex()					\
	RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() &&			\
			 !lockdep_is_held(&wq_pool_mutex),		\
			 "sched RCU or wq_pool_mutex should be held")

#define assert_rcu_or_wq_mutex(wq)					\
	RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() &&			\
			 !lockdep_is_held(&wq->mutex),			\
			 "sched RCU or wq->mutex should be held")

#define assert_rcu_or_wq_mutex_or_pool_mutex(wq)			\
	RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() &&			\
			 !lockdep_is_held(&wq->mutex) &&		\
			 !lockdep_is_held(&wq_pool_mutex),		\
			 "sched RCU, wq->mutex or wq_pool_mutex should be held")

#define for_each_cpu_worker_pool(pool, cpu)				\
	for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0];		\
	     (pool) < &per_cpu(cpu_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \
	     (pool)++)
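
/*
 * A minimal usage sketch (hypothetical helper, illustration only):
 * walking both standard pools - normal and highpri - of one CPU.
 */
static inline int example_count_workers_on(int cpu)
{
	struct worker_pool *pool;
	int nr = 0;

	for_each_cpu_worker_pool(pool, cpu)
		nr += pool->nr_workers;	/* racy read, fine for a sketch */

	return nr;
}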

/**
 * for_each_pool - iterate through all worker_pools in the system
 * @pool: iteration cursor
 * @pi: integer used for iteration
 *
 * This must be called either with wq_pool_mutex held or sched RCU read
 * locked.  If the pool needs to be used beyond the locking in effect, the
 * caller is responsible for guaranteeing that the pool stays online.
 *
 * The if/else clause exists only for the lockdep assertion and can be
 * ignored.
 */
#define for_each_pool(pool, pi)						\
	idr_for_each_entry(&worker_pool_idr, pool, pi)			\
		if (({ assert_rcu_or_pool_mutex(); false; })) { }	\
		else
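
/*
 * A minimal usage sketch (hypothetical function, illustration only):
 * iterating every pool in the system under wq_pool_mutex, as the comment
 * above requires.
 */
static inline void example_dump_pool_ids(void)
{
	struct worker_pool *pool;
	int pi;

	mutex_lock(&wq_pool_mutex);
	for_each_pool(pool, pi)
		pr_info("pool %d on cpu %d\n", pool->id, pool->cpu);
	mutex_unlock(&wq_pool_mutex);
}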

/**
 * for_each_pool_worker - iterate through all workers of a worker_pool
 * @worker: iteration cursor
 * @pool: worker_pool to iterate workers of
 *
 * This must be called with @pool->attach_mutex held.
 *
 * The if/else clause exists only for the lockdep assertion and can be
 * ignored.
 */
#define for_each_pool_worker(worker, pool)				\
	list_for_each_entry((worker), &(pool)->workers, node)		\
		if (({ lockdep_assert_held(&pool->attach_mutex); false; })) { } \
		else

/**
 * for_each_pwq - iterate through all pool_workqueues of the specified workqueue
 * @pwq: iteration cursor
 * @wq: the target workqueue
 *
 * This must be called either with wq->mutex held or sched RCU read locked.
 * If the pwq needs to be used beyond the locking in effect, the caller is
 * responsible for guaranteeing that the pwq stays online.
 *
 * The if/else clause exists only for the lockdep assertion and can be
 * ignored.
 */
#define for_each_pwq(pwq, wq)						\
	list_for_each_entry_rcu((pwq), &(wq)->pwqs, pwqs_node)		\
		if (({ assert_rcu_or_wq_mutex(wq); false; })) { }	\
		else
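
/*
 * A minimal usage sketch (hypothetical function, illustration only):
 * walking a workqueue's pwqs under sched-RCU, the lighter of the two
 * locking options documented above.
 */
static inline int example_total_active(struct workqueue_struct *wq)
{
	struct pool_workqueue *pwq;
	int nr = 0;

	rcu_read_lock_sched();
	for_each_pwq(pwq, wq)
		nr += pwq->nr_active;	/* racy read, fine for a sketch */
	rcu_read_unlock_sched();

	return nr;
}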

#ifdef CONFIG_DEBUG_OBJECTS_WORK

static struct debug_obj_descr work_debug_descr;

static void *work_debug_hint(void *addr)
{
	return ((struct work_struct *) addr)->func;
}

static bool work_is_static_object(void *addr)
{
	struct work_struct *work = addr;

	return test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work));
}

/*
 * fixup_init is called when:
 * - an active object is initialized
 */
static bool work_fixup_init(void *addr, enum debug_obj_state state)
{
	struct work_struct *work = addr;

	switch (state) {
	case ODEBUG_STATE_ACTIVE:
		cancel_work_sync(work);
		debug_object_init(work, &work_debug_descr);
		return true;
	default:
		return false;
	}
}

/*
 * fixup_free is called when:
 * - an active object is freed
 */
static bool work_fixup_free(void *addr, enum debug_obj_state state)
{
	struct work_struct *work = addr;

	switch (state) {
	case ODEBUG_STATE_ACTIVE:
		cancel_work_sync(work);
		debug_object_free(work, &work_debug_descr);
		return true;
	default:
		return false;
	}
}

static struct debug_obj_descr work_debug_descr = {
	.name		= "work_struct",
	.debug_hint	= work_debug_hint,
	.is_static_object = work_is_static_object,
	.fixup_init	= work_fixup_init,
	.fixup_free	= work_fixup_free,
};

static inline void debug_work_activate(struct work_struct *work)
{
	debug_object_activate(work, &work_debug_descr);
}

static inline void debug_work_deactivate(struct work_struct *work)
{
	debug_object_deactivate(work, &work_debug_descr);
}

void __init_work(struct work_struct *work, int onstack)
{
	if (onstack)
		debug_object_init_on_stack(work, &work_debug_descr);
	else
		debug_object_init(work, &work_debug_descr);
}
EXPORT_SYMBOL_GPL(__init_work);

void destroy_work_on_stack(struct work_struct *work)
{
	debug_object_free(work, &work_debug_descr);
}
EXPORT_SYMBOL_GPL(destroy_work_on_stack);

void destroy_delayed_work_on_stack(struct delayed_work *work)
{
	destroy_timer_on_stack(&work->timer);
	debug_object_free(&work->work, &work_debug_descr);
}
EXPORT_SYMBOL_GPL(destroy_delayed_work_on_stack);

#else
static inline void debug_work_activate(struct work_struct *work) { }
static inline void debug_work_deactivate(struct work_struct *work) { }
#endif
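
/*
 * A minimal usage sketch (hypothetical function, illustration only) of
 * the on-stack helpers above: an on-stack work item must be initialized
 * with the _ONSTACK variant and torn down with destroy_work_on_stack()
 * once it can no longer be referenced.
 */
static inline void example_run_on_stack(void (*fn)(struct work_struct *))
{
	struct work_struct work;

	INIT_WORK_ONSTACK(&work, fn);
	schedule_work(&work);
	flush_work(&work);	/* must finish before the stack frame dies */
	destroy_work_on_stack(&work);
}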

/**
 * worker_pool_assign_id - allocate ID and assign it to @pool
 * @pool: the pool pointer of interest
 *
 * Returns 0 if ID in [0, WORK_OFFQ_POOL_NONE) is allocated and assigned
 * successfully, -errno on failure.
 */
static int worker_pool_assign_id(struct worker_pool *pool)
{
	int ret;

	lockdep_assert_held(&wq_pool_mutex);

	ret = idr_alloc(&worker_pool_idr, pool, 0, WORK_OFFQ_POOL_NONE,
			GFP_KERNEL);
	if (ret >= 0) {
		pool->id = ret;
		return 0;
	}
	return ret;
}

/**
 * unbound_pwq_by_node - return the unbound pool_workqueue for the given node
 * @wq: the target workqueue
 * @node: the node ID
 *
 * This must be called with any of wq_pool_mutex, wq->mutex or sched RCU
 * read locked.
 * If the pwq needs to be used beyond the locking in effect, the caller is
 * responsible for guaranteeing that the pwq stays online.
 *
 * Return: The unbound pool_workqueue for @node.
 */
static struct pool_workqueue *unbound_pwq_by_node(struct workqueue_struct *wq,
						  int node)
{
	assert_rcu_or_wq_mutex_or_pool_mutex(wq);

	/*
	 * XXX: @node can be NUMA_NO_NODE if CPU goes offline while a
	 * delayed item is pending.  The plan is to keep CPU -> NODE
	 * mapping valid and stable across CPU on/offlines.  Once that
	 * happens, this workaround can be removed.
	 */
	if (unlikely(node == NUMA_NO_NODE))
		return wq->dfl_pwq;

	return rcu_dereference_raw(wq->numa_pwq_tbl[node]);
}

static unsigned int work_color_to_flags(int color)
{
	return color << WORK_STRUCT_COLOR_SHIFT;
}

static int get_work_color(struct work_struct *work)
{
	return (*work_data_bits(work) >> WORK_STRUCT_COLOR_SHIFT) &
		((1 << WORK_STRUCT_COLOR_BITS) - 1);
}

static int work_next_color(int color)
{
	return (color + 1) % WORK_NR_COLORS;
}

/*
 * While queued, %WORK_STRUCT_PWQ is set and the non-flag bits of a work's
 * data contain the pointer to the queued pwq.  Once execution starts, the
 * flag is cleared and the high bits contain OFFQ flags and pool ID.
 *
 * set_work_pwq(), set_work_pool_and_clear_pending(), mark_work_canceling()
 * and clear_work_data() can be used to set the pwq, pool or clear
 * work->data.  These functions should only be called while the work is
 * owned - ie. while the PENDING bit is set.
 *
 * get_work_pool() and get_work_pwq() can be used to obtain the pool or pwq
 * corresponding to a work.  Pool is available once the work has been
 * queued anywhere after initialization until it is sync canceled.  pwq is
 * available only while the work item is queued.
 *
 * %WORK_OFFQ_CANCELING is used to mark a work item which is being
 * canceled.  While being canceled, a work item may have its PENDING set
 * but stay off timer and worklist for arbitrarily long and nobody should
 * try to steal the PENDING bit.
 */
static inline void set_work_data(struct work_struct *work, unsigned long data,
				 unsigned long flags)
{
	WARN_ON_ONCE(!work_pending(work));
	atomic_long_set(&work->data, data | flags | work_static(work));
}

static void set_work_pwq(struct work_struct *work, struct pool_workqueue *pwq,
			 unsigned long extra_flags)
{
	set_work_data(work, (unsigned long)pwq,
		      WORK_STRUCT_PENDING | WORK_STRUCT_PWQ | extra_flags);
}

static void set_work_pool_and_keep_pending(struct work_struct *work,
					   int pool_id)
{
	set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT,
		      WORK_STRUCT_PENDING);
}

static void set_work_pool_and_clear_pending(struct work_struct *work,
					    int pool_id)
{
	/*
	 * The following wmb is paired with the implied mb in
	 * test_and_set_bit(PENDING) and ensures all updates to @work made
	 * here are visible to and precede any updates by the next PENDING
	 * owner.
	 */
	smp_wmb();
	set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT, 0);
	/*
	 * The following mb guarantees that previous clear of a PENDING bit
	 * will not be reordered with any speculative LOADS or STORES from
	 * work->current_func, which is executed afterwards.  This possible
	 * reordering can lead to a missed execution on attempt to queue
	 * the same @work.  E.g. consider this case:
	 *
	 *   CPU#0                         CPU#1
	 *   ----------------------------  --------------------------------
	 *
	 * 1  STORE event_indicated
	 * 2  queue_work_on() {
	 * 3    test_and_set_bit(PENDING)
	 * 4 }                             set_..._and_clear_pending() {
	 * 5                                 set_work_data() # clear bit
	 * 6                                 smp_mb()
	 * 7                               work->current_func() {
	 * 8				      LOAD event_indicated
	 *				   }
	 *
	 * Without an explicit full barrier speculative LOAD on line 8 can
	 * be executed before CPU#0 does STORE on line 1.  If that happens,
	 * CPU#0 observes the PENDING bit is still set and new execution of
	 * a @work is not queued in the hope that CPU#1 will eventually
	 * finish the queued @work.  Meanwhile CPU#1 does not see
	 * event_indicated is set, because speculative LOAD was executed
	 * before actual STORE.
	 */
	smp_mb();
}

static void clear_work_data(struct work_struct *work)
{
	smp_wmb();	/* see set_work_pool_and_clear_pending() */
	set_work_data(work, WORK_STRUCT_NO_POOL, 0);
}

static struct pool_workqueue *get_work_pwq(struct work_struct *work)
{
	unsigned long data = atomic_long_read(&work->data);

	if (data & WORK_STRUCT_PWQ)
		return (void *)(data & WORK_STRUCT_WQ_DATA_MASK);
	else
		return NULL;
}

/**
 * get_work_pool - return the worker_pool a given work was associated with
 * @work: the work item of interest
 *
 * Pools are created and destroyed under wq_pool_mutex, and read access is
 * allowed under sched-RCU read lock.  As such, this function should be
 * called under wq_pool_mutex or with preemption disabled.
 *
 * All fields of the returned pool are accessible as long as the above
 * mentioned locking is in effect.  If the returned pool needs to be used
 * beyond the critical section, the caller is responsible for ensuring the
 * returned pool is and stays online.
 *
 * Return: The worker_pool @work was last associated with.  %NULL if none.
 */
static struct worker_pool *get_work_pool(struct work_struct *work)
{
	unsigned long data = atomic_long_read(&work->data);
	int pool_id;

	assert_rcu_or_pool_mutex();

	if (data & WORK_STRUCT_PWQ)
		return ((struct pool_workqueue *)
			(data & WORK_STRUCT_WQ_DATA_MASK))->pool;

	pool_id = data >> WORK_OFFQ_POOL_SHIFT;
	if (pool_id == WORK_OFFQ_POOL_NONE)
		return NULL;

	return idr_find(&worker_pool_idr, pool_id);
}

/**
 * get_work_pool_id - return the worker pool ID a given work is associated with
 * @work: the work item of interest
 *
 * Return: The worker_pool ID @work was last associated with.
 * %WORK_OFFQ_POOL_NONE if none.
 */
static int get_work_pool_id(struct work_struct *work)
{
	unsigned long data = atomic_long_read(&work->data);

	if (data & WORK_STRUCT_PWQ)
		return ((struct pool_workqueue *)
			(data & WORK_STRUCT_WQ_DATA_MASK))->pool->id;

	return data >> WORK_OFFQ_POOL_SHIFT;
}

static void mark_work_canceling(struct work_struct *work)
{
	unsigned long pool_id = get_work_pool_id(work);

	pool_id <<= WORK_OFFQ_POOL_SHIFT;
	set_work_data(work, pool_id | WORK_OFFQ_CANCELING, WORK_STRUCT_PENDING);
}

static bool work_is_canceling(struct work_struct *work)
{
	unsigned long data = atomic_long_read(&work->data);

	return !(data & WORK_STRUCT_PWQ) && (data & WORK_OFFQ_CANCELING);
}

/*
 * Policy functions.  These define the policies on how the global worker
 * pools are managed.  Unless noted otherwise, these functions assume that
 * they're being called with pool->lock held.
 */

static bool __need_more_worker(struct worker_pool *pool)
{
	return !atomic_read(&pool->nr_running);
}

/*
 * Need to wake up a worker?  Called from anything but currently
 * running workers.
 *
 * Note that, because unbound workers never contribute to nr_running, this
 * function will always return %true for unbound pools as long as the
 * worklist isn't empty.
 */
static bool need_more_worker(struct worker_pool *pool)
{
	return !list_empty(&pool->worklist) && __need_more_worker(pool);
}

/* Can I start working?  Called from busy but !running workers. */
static bool may_start_working(struct worker_pool *pool)
{
	return pool->nr_idle;
}

/* Do I need to keep working?  Called from currently running workers. */
static bool keep_working(struct worker_pool *pool)
{
	return !list_empty(&pool->worklist) &&
		atomic_read(&pool->nr_running) <= 1;
}

/* Do we need a new worker?  Called from manager. */
static bool need_to_create_worker(struct worker_pool *pool)
{
	return need_more_worker(pool) && !may_start_working(pool);
}

/* Do we have too many workers and should some go away? */
static bool too_many_workers(struct worker_pool *pool)
{
	bool managing = pool->flags & POOL_MANAGER_ACTIVE;
	int nr_idle = pool->nr_idle + managing; /* manager is considered idle */
	int nr_busy = pool->nr_workers - nr_idle;

	return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy;
}
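
/*
 * A worked example of the ratio above: with MAX_IDLE_WORKERS_RATIO == 4
 * and 8 busy workers, up to 3 idle workers are tolerated; a 4th idle
 * worker makes (4 - 2) * 4 >= 8 true and too_many_workers() starts
 * reporting that the pool should be trimmed.
 */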

/*
 * Wake up functions.
 */

/* Return the first idle worker.  Safe with preemption disabled */
static struct worker *first_idle_worker(struct worker_pool *pool)
{
	if (unlikely(list_empty(&pool->idle_list)))
		return NULL;

	return list_first_entry(&pool->idle_list, struct worker, entry);
}

/**
 * wake_up_worker - wake up an idle worker
 * @pool: worker pool to wake worker from
 *
 * Wake up the first idle worker of @pool.
 *
 * CONTEXT:
 * spin_lock_irq(pool->lock).
 */
static void wake_up_worker(struct worker_pool *pool)
{
	struct worker *worker = first_idle_worker(pool);

	if (likely(worker))
		wake_up_process(worker->task);
}