/*
 * kernel/workqueue.c - generic async execution with shared worker pool
 *
 * Copyright (C) 2002		Ingo Molnar
 *
 *   Derived from the taskqueue/keventd code by:
 *     David Woodhouse <dwmw2@infradead.org>
 *     Andrew Morton
 *     Kai Petzke <wpp@marie.physik.tu-berlin.de>
 *     Theodore Ts'o <tytso@mit.edu>
 *
 * Made to use alloc_percpu by Christoph Lameter.
 *
 * Copyright (C) 2010		SUSE Linux Products GmbH
 * Copyright (C) 2010		Tejun Heo <tj@kernel.org>
 *
 * This is the generic async execution mechanism.  Work items are
 * executed in process context.  The worker pool is shared and
 * automatically managed.  There are two worker pools for each CPU (one for
 * normal work items and the other for high priority ones) and some extra
 * pools for workqueues which are not bound to any specific CPU - the
 * number of these backing pools is dynamic.
 *
 * Please read Documentation/workqueue.txt for details.
 */

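/*
 * Usage sketch of the API implemented below (illustrative only; my_work_fn,
 * my_work and my_wq are hypothetical caller-side names, not part of this
 * file):
 *
 *	static void my_work_fn(struct work_struct *work)
 *	{
 *		pr_info("my_work_fn: running in process context\n");
 *	}
 *	static DECLARE_WORK(my_work, my_work_fn);
 *
 *	my_wq = alloc_workqueue("my_wq", WQ_UNBOUND, 0);
 *	queue_work(my_wq, &my_work);	   on a dedicated workqueue
 *	schedule_work(&my_work);	   or simply on system_wq
 */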
#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/init.h>
#include <linux/signal.h>
#include <linux/completion.h>
#include <linux/workqueue.h>
#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/kthread.h>
#include <linux/hardirq.h>
#include <linux/mempolicy.h>
#include <linux/freezer.h>
#include <linux/kallsyms.h>
#include <linux/debug_locks.h>
#include <linux/lockdep.h>
#include <linux/idr.h>
#include <linux/jhash.h>
#include <linux/hashtable.h>
#include <linux/rculist.h>
#include <linux/nodemask.h>
#include <linux/moduleparam.h>
#include <linux/uaccess.h>

#include "workqueue_internal.h"

enum {
	/*
	 * worker_pool flags
	 *
	 * A bound pool is either associated or disassociated with its CPU.
	 * While associated (!DISASSOCIATED), all workers are bound to the
	 * CPU and none has %WORKER_UNBOUND set and concurrency management
	 * is in effect.
	 *
	 * While DISASSOCIATED, the cpu may be offline and all workers have
	 * %WORKER_UNBOUND set and concurrency management disabled, and may
	 * be executing on any CPU.  The pool behaves as an unbound one.
	 *
	 * Note that DISASSOCIATED should be flipped only while holding
	 * attach_mutex to avoid changing binding state while
	 * worker_attach_to_pool() is in progress.
	 */
	POOL_DISASSOCIATED	= 1 << 2,	/* cpu can't serve workers */

	/* worker flags */
	WORKER_DIE		= 1 << 1,	/* die die die */
	WORKER_IDLE		= 1 << 2,	/* is idle */
	WORKER_PREP		= 1 << 3,	/* preparing to run works */
	WORKER_CPU_INTENSIVE	= 1 << 6,	/* cpu intensive */
	WORKER_UNBOUND		= 1 << 7,	/* worker is unbound */
	WORKER_REBOUND		= 1 << 8,	/* worker was rebound */

	WORKER_NOT_RUNNING	= WORKER_PREP | WORKER_CPU_INTENSIVE |
				  WORKER_UNBOUND | WORKER_REBOUND,

	NR_STD_WORKER_POOLS	= 2,		/* # standard pools per cpu */

	UNBOUND_POOL_HASH_ORDER	= 6,		/* hashed by pool->attrs */
	BUSY_WORKER_HASH_ORDER	= 6,		/* 64 pointers */

	MAX_IDLE_WORKERS_RATIO	= 4,		/* 1/4 of busy can be idle */
	IDLE_WORKER_TIMEOUT	= 300 * HZ,	/* keep idle ones for 5 mins */

	MAYDAY_INITIAL_TIMEOUT  = HZ / 100 >= 2 ? HZ / 100 : 2,
						/* call for help after 10ms
						   (min two ticks) */
	MAYDAY_INTERVAL		= HZ / 10,	/* and then every 100ms */
	CREATE_COOLDOWN		= HZ,		/* time to breathe after fail */

	/*
	 * Rescue workers are used only in emergencies and shared by
	 * all cpus.  Give them MIN_NICE.
	 */
	RESCUER_NICE_LEVEL	= MIN_NICE,
	HIGHPRI_NICE_LEVEL	= MIN_NICE,

	WQ_NAME_LEN		= 24,
};

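/*
 * Timing sketch for the constants above (HZ-dependent, illustrative):
 * with HZ=1000, MAYDAY_INITIAL_TIMEOUT is 1000/100 = 10 ticks = 10ms;
 * with HZ=100, 100/100 = 1 tick would fall below the two-tick minimum,
 * so the ?: picks 2 ticks = 20ms.  MAYDAY_INTERVAL is HZ/10 = 100ms and
 * IDLE_WORKER_TIMEOUT is 300*HZ = 300 seconds regardless of HZ.
 */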
/*
 * Structure fields follow one of the following exclusion rules.
 *
 * I: Modifiable by initialization/destruction paths and read-only for
 *    everyone else.
 *
 * P: Preemption protected.  Disabling preemption is enough; the field
 *    should only be modified and accessed from the local cpu.
 *
 * L: pool->lock protected.  Access with pool->lock held.
 *
 * X: During normal operation, modification requires pool->lock and should
 *    be done only from local cpu.  Either disabling preemption on local
 *    cpu or grabbing pool->lock is enough for read access.  If
 *    POOL_DISASSOCIATED is set, it's identical to L.
 *
 * A: pool->attach_mutex protected.
 *
 * PL: wq_pool_mutex protected.
 *
 * PR: wq_pool_mutex protected for writes.  Sched-RCU protected for reads.
 *
 * PW: wq_pool_mutex and wq->mutex protected for writes.  Either for reads.
 *
 * PWR: wq_pool_mutex and wq->mutex protected for writes.  Either or
 *      sched-RCU for reads.
 *
 * WQ: wq->mutex protected.
 *
 * WR: wq->mutex protected for writes.  Sched-RCU protected for reads.
 *
 * MD: wq_mayday_lock protected.
 */

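/*
 * Illustrative sketch of the "L:" rule above (not a real caller in this
 * file): fields such as pool->worklist may only be touched with
 * pool->lock held, e.g.:
 *
 *	struct work_struct *work;
 *
 *	spin_lock_irq(&pool->lock);
 *	if (!list_empty(&pool->worklist))
 *		work = list_first_entry(&pool->worklist,
 *					struct work_struct, entry);
 *	spin_unlock_irq(&pool->lock);
 */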
/* struct worker is defined in workqueue_internal.h */

struct worker_pool {
	spinlock_t		lock;		/* the pool lock */
	int			cpu;		/* I: the associated cpu */
	int			node;		/* I: the associated node ID */
	int			id;		/* I: pool ID */
	unsigned int		flags;		/* X: flags */

	unsigned long		watchdog_ts;	/* L: watchdog timestamp */

	struct list_head	worklist;	/* L: list of pending works */
	int			nr_workers;	/* L: total number of workers */

	/* nr_idle includes the ones off idle_list for rebinding */
	int			nr_idle;	/* L: currently idle ones */

	struct list_head	idle_list;	/* X: list of idle workers */
	struct timer_list	idle_timer;	/* L: worker idle timeout */
	struct timer_list	mayday_timer;	/* L: SOS timer for workers */

	/* a worker is either on busy_hash or idle_list, or the manager */
	DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER);
						/* L: hash of busy workers */

	/* see manage_workers() for details on the two manager mutexes */
	struct mutex		manager_arb;	/* manager arbitration */
	struct worker		*manager;	/* L: purely informational */
	struct mutex		attach_mutex;	/* attach/detach exclusion */
	struct list_head	workers;	/* A: attached workers */
	struct completion	*detach_completion; /* all workers detached */

	struct ida		worker_ida;	/* worker IDs for task name */

	struct workqueue_attrs	*attrs;		/* I: worker attributes */
	struct hlist_node	hash_node;	/* PL: unbound_pool_hash node */
	int			refcnt;		/* PL: refcnt for unbound pools */

	/*
	 * The current concurrency level.  As it's likely to be accessed
	 * from other CPUs during try_to_wake_up(), put it in a separate
	 * cacheline.
	 */
	atomic_t		nr_running ____cacheline_aligned_in_smp;

	/*
	 * Destruction of pool is sched-RCU protected to allow dereferences
	 * from get_work_pool().
	 */
	struct rcu_head		rcu;
} ____cacheline_aligned_in_smp;

/*
 * The per-pool workqueue.  While queued, the lower WORK_STRUCT_FLAG_BITS
 * of work_struct->data are used for flags and the remaining high bits
 * point to the pwq; thus, pwqs need to be aligned on a boundary of
 * 1 << WORK_STRUCT_FLAG_BITS.
 */
struct pool_workqueue {
	struct worker_pool	*pool;		/* I: the associated pool */
	struct workqueue_struct *wq;		/* I: the owning workqueue */
	int			work_color;	/* L: current color */
	int			flush_color;	/* L: flushing color */
	int			refcnt;		/* L: reference count */
	int			nr_in_flight[WORK_NR_COLORS];
						/* L: nr of in_flight works */
	int			nr_active;	/* L: nr of active works */
	int			max_active;	/* L: max active works */
	struct list_head	delayed_works;	/* L: delayed works */
	struct list_head	pwqs_node;	/* WR: node on wq->pwqs */
	struct list_head	mayday_node;	/* MD: node on wq->maydays */

	/*
	 * Release of unbound pwq is punted to system_wq.  See put_pwq()
	 * and pwq_unbound_release_workfn() for details.  pool_workqueue
	 * itself is also sched-RCU protected so that the first pwq can be
	 * determined without grabbing wq->mutex.
	 */
	struct work_struct	unbound_release_work;
	struct rcu_head		rcu;
} __aligned(1 << WORK_STRUCT_FLAG_BITS);

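/*
 * Pointer-packing sketch for the alignment requirement above
 * (illustrative; the actual helpers are set_work_pwq()/get_work_pwq()
 * below): because a pwq is aligned to 1 << WORK_STRUCT_FLAG_BITS, its
 * address has that many zero low bits, so flag bits and the pointer can
 * share work->data:
 *
 *	data = (unsigned long)pwq | WORK_STRUCT_PENDING | WORK_STRUCT_PWQ;
 *	pwq  = (void *)(data & WORK_STRUCT_WQ_DATA_MASK);
 */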
/*
 * Structure used to wait for workqueue flush.
 */
struct wq_flusher {
	struct list_head	list;		/* WQ: list of flushers */
	int			flush_color;	/* WQ: flush color waiting for */
	struct completion	done;		/* flush completion */
};

struct wq_device;

/*
 * The externally visible workqueue.  It relays the issued work items to
 * the appropriate worker_pool through its pool_workqueues.
 */
struct workqueue_struct {
	struct list_head	pwqs;		/* WR: all pwqs of this wq */
	struct list_head	list;		/* PR: list of all workqueues */

	struct mutex		mutex;		/* protects this wq */
	int			work_color;	/* WQ: current work color */
	int			flush_color;	/* WQ: current flush color */
	atomic_t		nr_pwqs_to_flush; /* flush in progress */
	struct wq_flusher	*first_flusher;	/* WQ: first flusher */
	struct list_head	flusher_queue;	/* WQ: flush waiters */
	struct list_head	flusher_overflow; /* WQ: flush overflow list */

	struct list_head	maydays;	/* MD: pwqs requesting rescue */
	struct worker		*rescuer;	/* I: rescue worker */

	int			nr_drainers;	/* WQ: drain in progress */
	int			saved_max_active; /* WQ: saved pwq max_active */

	struct workqueue_attrs	*unbound_attrs;	/* PW: only for unbound wqs */
	struct pool_workqueue	*dfl_pwq;	/* PW: only for unbound wqs */

#ifdef CONFIG_SYSFS
	struct wq_device	*wq_dev;	/* I: for sysfs interface */
#endif
#ifdef CONFIG_LOCKDEP
	struct lockdep_map	lockdep_map;
#endif
	char			name[WQ_NAME_LEN]; /* I: workqueue name */

	/*
	 * Destruction of workqueue_struct is sched-RCU protected to allow
	 * walking the workqueues list without grabbing wq_pool_mutex.
	 * This is used to dump all workqueues from sysrq.
	 */
	struct rcu_head		rcu;

	/* hot fields used during command issue, aligned to cacheline */
	unsigned int		flags ____cacheline_aligned; /* WQ: WQ_* flags */
	struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwqs */
	struct pool_workqueue __rcu *numa_pwq_tbl[]; /* PWR: unbound pwqs indexed by node */
};

static struct kmem_cache *pwq_cache;

static cpumask_var_t *wq_numa_possible_cpumask;
					/* possible CPUs of each node */

static bool wq_disable_numa;
module_param_named(disable_numa, wq_disable_numa, bool, 0444);

/* see the comment above the definition of WQ_POWER_EFFICIENT */
static bool wq_power_efficient = IS_ENABLED(CONFIG_WQ_POWER_EFFICIENT_DEFAULT);
module_param_named(power_efficient, wq_power_efficient, bool, 0444);

static bool wq_online;			/* can kworkers be created yet? */

static bool wq_numa_enabled;		/* unbound NUMA affinity enabled */

/* buf for wq_update_unbound_numa_attrs(), protected by CPU hotplug exclusion */
static struct workqueue_attrs *wq_update_unbound_numa_attrs_buf;

static DEFINE_MUTEX(wq_pool_mutex);	/* protects pools and workqueues list */
static DEFINE_SPINLOCK(wq_mayday_lock);	/* protects wq->maydays list */

static LIST_HEAD(workqueues);		/* PR: list of all workqueues */
static bool workqueue_freezing;		/* PL: have wqs started freezing? */

/* PL: allowable cpus for unbound wqs and work items */
static cpumask_var_t wq_unbound_cpumask;

/* CPU where unbound work was last round robin scheduled from this CPU */
static DEFINE_PER_CPU(int, wq_rr_cpu_last);

/*
 * Local execution of unbound work items is no longer guaranteed.  The
 * following always forces round-robin CPU selection on unbound work items
 * to uncover usages which depend on it.
 */
#ifdef CONFIG_DEBUG_WQ_FORCE_RR_CPU
static bool wq_debug_force_rr_cpu = true;
#else
static bool wq_debug_force_rr_cpu = false;
#endif
module_param_named(debug_force_rr_cpu, wq_debug_force_rr_cpu, bool, 0644);

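/*
 * Since this file is always built in, the module parameters above are
 * consumed from the kernel command line with a "workqueue." prefix, e.g.
 * (illustrative):
 *
 *	workqueue.disable_numa=1 workqueue.power_efficient=1
 *	workqueue.debug_force_rr_cpu=0
 */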
/* the per-cpu worker pools */
static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], cpu_worker_pools);

static DEFINE_IDR(worker_pool_idr);	/* PR: idr of all pools */

/* PL: hash of all unbound pools keyed by pool->attrs */
static DEFINE_HASHTABLE(unbound_pool_hash, UNBOUND_POOL_HASH_ORDER);

/* I: attributes used when instantiating standard unbound pools on demand */
static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS];

/* I: attributes used when instantiating ordered pools on demand */
static struct workqueue_attrs *ordered_wq_attrs[NR_STD_WORKER_POOLS];

struct workqueue_struct *system_wq __read_mostly;
EXPORT_SYMBOL(system_wq);
struct workqueue_struct *system_highpri_wq __read_mostly;
EXPORT_SYMBOL_GPL(system_highpri_wq);
struct workqueue_struct *system_long_wq __read_mostly;
EXPORT_SYMBOL_GPL(system_long_wq);
struct workqueue_struct *system_unbound_wq __read_mostly;
EXPORT_SYMBOL_GPL(system_unbound_wq);
struct workqueue_struct *system_freezable_wq __read_mostly;
EXPORT_SYMBOL_GPL(system_freezable_wq);
struct workqueue_struct *system_power_efficient_wq __read_mostly;
EXPORT_SYMBOL_GPL(system_power_efficient_wq);
struct workqueue_struct *system_freezable_power_efficient_wq __read_mostly;
EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq);

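/*
 * Usage sketch for the system workqueues exported above (illustrative
 * only; the work items named here are hypothetical caller-side variables):
 *
 *	schedule_work(&some_work);			    - on system_wq
 *	queue_work(system_long_wq, &long_running_work);    - may take a while
 *	queue_work(system_unbound_wq, &numa_unaware_work); - no CPU locality
 */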
static int worker_thread(void *__worker);
static void workqueue_sysfs_unregister(struct workqueue_struct *wq);

#define CREATE_TRACE_POINTS
#include <trace/events/workqueue.h>

#define assert_rcu_or_pool_mutex()					\
	RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() &&			\
			 !lockdep_is_held(&wq_pool_mutex),		\
			 "sched RCU or wq_pool_mutex should be held")

#define assert_rcu_or_wq_mutex(wq)					\
	RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() &&			\
			 !lockdep_is_held(&wq->mutex),			\
			 "sched RCU or wq->mutex should be held")

#define assert_rcu_or_wq_mutex_or_pool_mutex(wq)			\
	RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() &&			\
			 !lockdep_is_held(&wq->mutex) &&		\
			 !lockdep_is_held(&wq_pool_mutex),		\
			 "sched RCU, wq->mutex or wq_pool_mutex should be held")

#define for_each_cpu_worker_pool(pool, cpu)				\
	for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0];		\
	     (pool) < &per_cpu(cpu_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \
	     (pool)++)

/**
 * for_each_pool - iterate through all worker_pools in the system
 * @pool: iteration cursor
 * @pi: integer used for iteration
 *
 * This must be called either with wq_pool_mutex held or sched RCU read
 * locked.  If the pool needs to be used beyond the locking in effect, the
 * caller is responsible for guaranteeing that the pool stays online.
 *
 * The if/else clause exists only for the lockdep assertion and can be
 * ignored.
 */
#define for_each_pool(pool, pi)						\
	idr_for_each_entry(&worker_pool_idr, pool, pi)			\
		if (({ assert_rcu_or_pool_mutex(); false; })) { }	\
		else

/**
 * for_each_pool_worker - iterate through all workers of a worker_pool
 * @worker: iteration cursor
 * @pool: worker_pool to iterate workers of
 *
 * This must be called with @pool->attach_mutex held.
 *
 * The if/else clause exists only for the lockdep assertion and can be
 * ignored.
 */
#define for_each_pool_worker(worker, pool)				\
	list_for_each_entry((worker), &(pool)->workers, node)		\
		if (({ lockdep_assert_held(&pool->attach_mutex); false; })) { } \
		else

/**
 * for_each_pwq - iterate through all pool_workqueues of the specified workqueue
 * @pwq: iteration cursor
 * @wq: the target workqueue
 *
 * This must be called either with wq->mutex held or sched RCU read locked.
 * If the pwq needs to be used beyond the locking in effect, the caller is
 * responsible for guaranteeing that the pwq stays online.
 *
 * The if/else clause exists only for the lockdep assertion and can be
 * ignored.
 */
#define for_each_pwq(pwq, wq)						\
	list_for_each_entry_rcu((pwq), &(wq)->pwqs, pwqs_node)		\
		if (({ assert_rcu_or_wq_mutex(wq); false; })) { }	\
		else

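/*
 * Iteration sketch for for_each_pwq() (illustrative only; "wq" stands for
 * any workqueue the caller holds a reference to):
 *
 *	struct pool_workqueue *pwq;
 *
 *	rcu_read_lock_sched();
 *	for_each_pwq(pwq, wq)
 *		pr_info("pwq for pool %d\n", pwq->pool->id);
 *	rcu_read_unlock_sched();
 */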
#ifdef CONFIG_DEBUG_OBJECTS_WORK

static struct debug_obj_descr work_debug_descr;

static void *work_debug_hint(void *addr)
{
	return ((struct work_struct *) addr)->func;
}

static bool work_is_static_object(void *addr)
{
	struct work_struct *work = addr;

	return test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work));
}

/*
 * fixup_init is called when:
 * - an active object is initialized
 */
static bool work_fixup_init(void *addr, enum debug_obj_state state)
{
	struct work_struct *work = addr;

	switch (state) {
	case ODEBUG_STATE_ACTIVE:
		cancel_work_sync(work);
		debug_object_init(work, &work_debug_descr);
		return true;
	default:
		return false;
	}
}

/*
 * fixup_free is called when:
 * - an active object is freed
 */
static bool work_fixup_free(void *addr, enum debug_obj_state state)
{
	struct work_struct *work = addr;

	switch (state) {
	case ODEBUG_STATE_ACTIVE:
		cancel_work_sync(work);
		debug_object_free(work, &work_debug_descr);
		return true;
	default:
		return false;
	}
}

static struct debug_obj_descr work_debug_descr = {
	.name		= "work_struct",
	.debug_hint	= work_debug_hint,
	.is_static_object = work_is_static_object,
	.fixup_init	= work_fixup_init,
	.fixup_free	= work_fixup_free,
};

static inline void debug_work_activate(struct work_struct *work)
{
	debug_object_activate(work, &work_debug_descr);
}

static inline void debug_work_deactivate(struct work_struct *work)
{
	debug_object_deactivate(work, &work_debug_descr);
}

void __init_work(struct work_struct *work, int onstack)
{
	if (onstack)
		debug_object_init_on_stack(work, &work_debug_descr);
	else
		debug_object_init(work, &work_debug_descr);
}
EXPORT_SYMBOL_GPL(__init_work);

void destroy_work_on_stack(struct work_struct *work)
{
	debug_object_free(work, &work_debug_descr);
}
EXPORT_SYMBOL_GPL(destroy_work_on_stack);

void destroy_delayed_work_on_stack(struct delayed_work *work)
{
	destroy_timer_on_stack(&work->timer);
	debug_object_free(&work->work, &work_debug_descr);
}
EXPORT_SYMBOL_GPL(destroy_delayed_work_on_stack);

#else
static inline void debug_work_activate(struct work_struct *work) { }
static inline void debug_work_deactivate(struct work_struct *work) { }
#endif

/**
 * worker_pool_assign_id - allocate ID and assign it to @pool
 * @pool: the pool pointer of interest
 *
 * Returns 0 if ID in [0, WORK_OFFQ_POOL_NONE) is allocated and assigned
 * successfully, -errno on failure.
 */
static int worker_pool_assign_id(struct worker_pool *pool)
{
	int ret;

	lockdep_assert_held(&wq_pool_mutex);

	ret = idr_alloc(&worker_pool_idr, pool, 0, WORK_OFFQ_POOL_NONE,
			GFP_KERNEL);
	if (ret >= 0) {
		pool->id = ret;
		return 0;
	}
	return ret;
}

/**
 * unbound_pwq_by_node - return the unbound pool_workqueue for the given node
 * @wq: the target workqueue
 * @node: the node ID
 *
 * This must be called with any of wq_pool_mutex, wq->mutex or sched RCU
 * read locked.  If the pwq needs to be used beyond the locking in effect,
 * the caller is responsible for guaranteeing that the pwq stays online.
 *
 * Return: The unbound pool_workqueue for @node.
 */
static struct pool_workqueue *unbound_pwq_by_node(struct workqueue_struct *wq,
						  int node)
{
	assert_rcu_or_wq_mutex_or_pool_mutex(wq);

	/*
	 * XXX: @node can be NUMA_NO_NODE if CPU goes offline while a
	 * delayed item is pending.  The plan is to keep CPU -> NODE
	 * mapping valid and stable across CPU on/offlines.  Once that
	 * happens, this workaround can be removed.
	 */
	if (unlikely(node == NUMA_NO_NODE))
		return wq->dfl_pwq;

	return rcu_dereference_raw(wq->numa_pwq_tbl[node]);
}

static unsigned int work_color_to_flags(int color)
{
	return color << WORK_STRUCT_COLOR_SHIFT;
}

static int get_work_color(struct work_struct *work)
{
	return (*work_data_bits(work) >> WORK_STRUCT_COLOR_SHIFT) &
		((1 << WORK_STRUCT_COLOR_BITS) - 1);
}

static int work_next_color(int color)
{
	return (color + 1) % WORK_NR_COLORS;
}

/*
 * While queued, %WORK_STRUCT_PWQ is set and the non-flag bits of a work's
 * data contain the pointer to the queued pwq.  Once execution starts, the
 * flag is cleared and the high bits contain OFFQ flags and pool ID.
 *
 * set_work_pwq(), set_work_pool_and_clear_pending(), mark_work_canceling()
 * and clear_work_data() can be used to set the pwq, pool or clear
 * work->data.  These functions should only be called while the work is
 * owned - ie. while the PENDING bit is set.
 *
 * get_work_pool() and get_work_pwq() can be used to obtain the pool or pwq
 * corresponding to a work.  Pool is available once the work has been
 * queued anywhere after initialization until it is sync canceled.  pwq is
 * available only while the work item is queued.
 *
 * %WORK_OFFQ_CANCELING is used to mark a work item which is being
 * canceled.  While being canceled, a work item may have its PENDING set
 * but stay off timer and worklist for arbitrarily long and nobody should
 * try to steal the PENDING bit.
 */
static inline void set_work_data(struct work_struct *work, unsigned long data,
				 unsigned long flags)
{
	WARN_ON_ONCE(!work_pending(work));
	atomic_long_set(&work->data, data | flags | work_static(work));
}

static void set_work_pwq(struct work_struct *work, struct pool_workqueue *pwq,
			 unsigned long extra_flags)
{
	set_work_data(work, (unsigned long)pwq,
		      WORK_STRUCT_PENDING | WORK_STRUCT_PWQ | extra_flags);
}

static void set_work_pool_and_keep_pending(struct work_struct *work,
					   int pool_id)
{
	set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT,
		      WORK_STRUCT_PENDING);
}

static void set_work_pool_and_clear_pending(struct work_struct *work,
					    int pool_id)
{
	/*
	 * The following wmb is paired with the implied mb in
	 * test_and_set_bit(PENDING) and ensures all updates to @work made
	 * here are visible to and precede any updates by the next PENDING
	 * owner.
	 */
	smp_wmb();
	set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT, 0);
	/*
	 * The following mb guarantees that previous clear of a PENDING bit
	 * will not be reordered with any speculative LOADS or STORES from
	 * work->current_func, which is executed afterwards.  This possible
	 * reordering can lead to a missed execution on attempt to queue
	 * the same @work.  E.g. consider this case:
	 *
	 *   CPU#0                         CPU#1
	 *   ----------------------------  --------------------------------
	 *
	 * 1  STORE event_indicated
	 * 2  queue_work_on() {
	 * 3    test_and_set_bit(PENDING)
	 * 4 }                             set_..._and_clear_pending() {
	 * 5                                 set_work_data() # clear bit
	 * 6                                 smp_mb()
	 * 7                               work->current_func() {
	 * 8				      LOAD event_indicated
	 *				   }
	 *
	 * Without an explicit full barrier speculative LOAD on line 8 can
	 * be executed before CPU#0 does STORE on line 1.  If that happens,
	 * CPU#0 observes the PENDING bit is still set and new execution of
	 * a @work is not queued, in the hope that CPU#1 will eventually
	 * finish the queued @work.  Meanwhile CPU#1 does not see
	 * event_indicated is set, because speculative LOAD was executed
	 * before actual STORE.
	 */
	smp_mb();
}

static void clear_work_data(struct work_struct *work)
{
	smp_wmb();	/* see set_work_pool_and_clear_pending() */
	set_work_data(work, WORK_STRUCT_NO_POOL, 0);
}

static struct pool_workqueue *get_work_pwq(struct work_struct *work)
{
	unsigned long data = atomic_long_read(&work->data);

	if (data & WORK_STRUCT_PWQ)
		return (void *)(data & WORK_STRUCT_WQ_DATA_MASK);
	else
		return NULL;
}

/**
 * get_work_pool - return the worker_pool a given work was associated with
 * @work: the work item of interest
 *
 * Pools are created and destroyed under wq_pool_mutex, and read access is
 * allowed under the sched-RCU read lock.  As such, this function should be
 * called under wq_pool_mutex or with preemption disabled.
 *
 * All fields of the returned pool are accessible as long as the above
 * mentioned locking is in effect.  If the returned pool needs to be used
 * beyond the critical section, the caller is responsible for ensuring the
 * returned pool is and stays online.
 *
 * Return: The worker_pool @work was last associated with.  %NULL if none.
 */
static struct worker_pool *get_work_pool(struct work_struct *work)
{
	unsigned long data = atomic_long_read(&work->data);
	int pool_id;

	assert_rcu_or_pool_mutex();

	if (data & WORK_STRUCT_PWQ)
		return ((struct pool_workqueue *)
			(data & WORK_STRUCT_WQ_DATA_MASK))->pool;

	pool_id = data >> WORK_OFFQ_POOL_SHIFT;
	if (pool_id == WORK_OFFQ_POOL_NONE)
		return NULL;

	return idr_find(&worker_pool_idr, pool_id);
}

/**
 * get_work_pool_id - return the worker pool ID a given work is associated with
 * @work: the work item of interest
 *
 * Return: The worker_pool ID @work was last associated with.
 * %WORK_OFFQ_POOL_NONE if none.
 */
static int get_work_pool_id(struct work_struct *work)
{
	unsigned long data = atomic_long_read(&work->data);

	if (data & WORK_STRUCT_PWQ)
		return ((struct pool_workqueue *)
			(data & WORK_STRUCT_WQ_DATA_MASK))->pool->id;

	return data >> WORK_OFFQ_POOL_SHIFT;
}

static void mark_work_canceling(struct work_struct *work)
{
	unsigned long pool_id = get_work_pool_id(work);

	pool_id <<= WORK_OFFQ_POOL_SHIFT;
	set_work_data(work, pool_id | WORK_OFFQ_CANCELING, WORK_STRUCT_PENDING);
}

static bool work_is_canceling(struct work_struct *work)
{
	unsigned long data = atomic_long_read(&work->data);

	return !(data & WORK_STRUCT_PWQ) && (data & WORK_OFFQ_CANCELING);
}

/*
 * Policy functions.  These define the policies on how the global worker
 * pools are managed.  Unless noted otherwise, these functions assume that
 * they're being called with pool->lock held.
 */

static bool __need_more_worker(struct worker_pool *pool)
{
	return !atomic_read(&pool->nr_running);
}

/*
 * Need to wake up a worker?  Called from anything but currently
 * running workers.
 *
 * Note that, because unbound workers never contribute to nr_running, this
 * function will always return %true for unbound pools as long as the
 * worklist isn't empty.
 */
static bool need_more_worker(struct worker_pool *pool)
{
	return !list_empty(&pool->worklist) && __need_more_worker(pool);
}

/* Can I start working?  Called from busy but !running workers. */
static bool may_start_working(struct worker_pool *pool)
{
	return pool->nr_idle;
}

/* Do I need to keep working?  Called from currently running workers. */
static bool keep_working(struct worker_pool *pool)
{
	return !list_empty(&pool->worklist) &&
		atomic_read(&pool->nr_running) <= 1;
}

/* Do we need a new worker?  Called from manager. */
static bool need_to_create_worker(struct worker_pool *pool)
{
	return need_more_worker(pool) && !may_start_working(pool);
}

/* Do we have too many workers and should some go away? */
static bool too_many_workers(struct worker_pool *pool)
{
	bool managing = mutex_is_locked(&pool->manager_arb);
	int nr_idle = pool->nr_idle + managing; /* manager is considered idle */
	int nr_busy = pool->nr_workers - nr_idle;

	return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy;
}

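/*
 * Worked example for too_many_workers() (illustrative numbers): with
 * MAX_IDLE_WORKERS_RATIO == 4 and, say, 20 busy workers, the condition
 * (nr_idle - 2) * 4 >= 20 requires 7 or more idle workers (beyond the 2
 * that are always tolerated) before the pool is considered over-staffed.
 */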
/*
 * Wake up functions.
 */

/* Return the first idle worker.  Safe with preemption disabled */
static struct worker *first_idle_worker(struct worker_pool *pool)
{
	if (unlikely(list_empty(&pool->idle_list)))
		return NULL;

	return list_first_entry(&pool->idle_list, struct worker, entry);
}

/**
 * wake_up_worker - wake up an idle worker
 * @pool: worker pool to wake worker from
 *
 * Wake up the first idle worker of @pool.
 *
 * CONTEXT:
 * spin_lock_irq(pool->lock).
 */
static void wake_up_worker(struct worker_pool *pool)
{