#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/init.h>
#include <linux/bootmem.h>
#include <linux/percpu.h>
#include <linux/kexec.h>
#include <linux/crash_dump.h>
#include <linux/smp.h>
#include <linux/topology.h>
#include <linux/pfn.h>
#include <asm/sections.h>
#include <asm/processor.h>
#include <asm/desc.h>
#include <asm/setup.h>
#include <asm/mpspec.h>
#include <asm/apicdef.h>
#include <asm/highmem.h>
#include <asm/proto.h>
#include <asm/cpumask.h>
#include <asm/cpu.h>
#include <asm/stackprotector.h>

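/* Logical CPU number; this is what raw_smp_processor_id() reads on x86. */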
DEFINE_PER_CPU_READ_MOSTLY(int, cpu_number);
EXPORT_PER_CPU_SYMBOL(cpu_number);

#ifdef CONFIG_X86_64
#define BOOT_PERCPU_OFFSET ((unsigned long)__per_cpu_load)
#else
#define BOOT_PERCPU_OFFSET 0
#endif

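/*
 * Per-CPU copy of this CPU's percpu offset (mirrors __per_cpu_offset[]).
 * The x86 percpu accessors add it to a percpu symbol's address to form
 * the pointer for the local CPU, e.g. in arch_raw_cpu_ptr().
 */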
DEFINE_PER_CPU_READ_MOSTLY(unsigned long, this_cpu_off) = BOOT_PERCPU_OFFSET;
EXPORT_PER_CPU_SYMBOL(this_cpu_off);

unsigned long __per_cpu_offset[NR_CPUS] __ro_after_init = {
	[0 ... NR_CPUS-1] = BOOT_PERCPU_OFFSET,
};
EXPORT_SYMBOL(__per_cpu_offset);

/*
 * On x86_64 symbols referenced from code should be reachable using
 * 32bit relocations.  Reserve space for static percpu variables in
 * modules so that they are always served from the first chunk which
 * is located at the percpu segment base.  On x86_32, anything can
 * address anywhere.  No need to reserve space in the first chunk.
 */
#ifdef CONFIG_X86_64
#define PERCPU_FIRST_CHUNK_RESERVE	PERCPU_MODULE_RESERVE
#else
#define PERCPU_FIRST_CHUNK_RESERVE	0
#endif
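/*
 * For example, a module that declares
 *	static DEFINE_PER_CPU(unsigned long, foo);
 * (the name is purely illustrative) has foo's storage handed out from
 * this reserved part of the first chunk, so it stays within 32bit
 * relocation range of module code on x86_64.
 */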

#ifdef CONFIG_X86_32
/**
 * pcpu_need_numa - determine whether percpu allocation needs to consider NUMA
 *
 * If NUMA is not configured or there is only one NUMA node available,
 * there is no reason to consider NUMA.  This function determines
 * whether percpu allocation should consider NUMA or not.
 *
 * RETURNS:
 * true if NUMA should be considered; otherwise, false.
 */
static bool __init pcpu_need_numa(void)
{
#ifdef CONFIG_NEED_MULTIPLE_NODES
	pg_data_t *last = NULL;
	unsigned int cpu;

	for_each_possible_cpu(cpu) {
		int node = early_cpu_to_node(cpu);

		if (node_online(node) && NODE_DATA(node) &&
		    last && last != NODE_DATA(node))
			return true;

		last = NODE_DATA(node);
	}
#endif
	return false;
}
#endif

/**
 * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu
 * @cpu: cpu to allocate for
 * @size: allocation size in bytes
 * @align: alignment
 *
 * Allocate @size bytes aligned at @align for cpu @cpu.  This wrapper
 * does the right thing for NUMA regardless of the current
 * configuration.
 *
 * RETURNS:
 * Pointer to the allocated area on success, NULL on failure.
 */
static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
					unsigned long align)
{
	const unsigned long goal = __pa(MAX_DMA_ADDRESS);
#ifdef CONFIG_NEED_MULTIPLE_NODES
	int node = early_cpu_to_node(cpu);
	void *ptr;

	if (!node_online(node) || !NODE_DATA(node)) {
		ptr = __alloc_bootmem_nopanic(size, align, goal);
		pr_info("cpu %d has no node %d or node-local memory\n",
			cpu, node);
		pr_debug("per cpu data for cpu%d %lu bytes at %016lx\n",
			 cpu, size, __pa(ptr));
	} else {
		ptr = __alloc_bootmem_node_nopanic(NODE_DATA(node),
						   size, align, goal);
		pr_debug("per cpu data for cpu%d %lu bytes on node%d at %016lx\n",
			 cpu, size, node, __pa(ptr));
	}
	return ptr;
#else
	return __alloc_bootmem_nopanic(size, align, goal);
#endif
}

/*
 * Helpers for first chunk memory allocation
 */
static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align)
{
	return pcpu_alloc_bootmem(cpu, size, align);
}

static void __init pcpu_fc_free(void *ptr, size_t size)
{
	free_bootmem(__pa(ptr), size);
}

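/*
 * CPU distance callback for pcpu_embed_first_chunk(): CPUs on the same
 * NUMA node are LOCAL_DISTANCE apart, all others REMOTE_DISTANCE.
 */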
static int __init pcpu_cpu_distance(unsigned int from, unsigned int to)
{
#ifdef CONFIG_NEED_MULTIPLE_NODES
	if (early_cpu_to_node(from) == early_cpu_to_node(to))
		return LOCAL_DISTANCE;
	else
		return REMOTE_DISTANCE;
#else
	return LOCAL_DISTANCE;
#endif
}

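/* Page table population callback for pcpu_page_first_chunk(). */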
static void __init pcpup_populate_pte(unsigned long addr)
{
	populate_extra_pte(addr);
}

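/*
 * On 32bit, percpu data is reached through the %fs segment.  Install a
 * writable data segment in the GDT whose base is this CPU's percpu
 * offset so that %fs-relative accesses land in the right area.  On
 * 64bit the GS base MSR is used instead, so there is nothing to do.
 */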
static inline void setup_percpu_segment(int cpu)
{
#ifdef CONFIG_X86_32
	struct desc_struct gdt;

	pack_descriptor(&gdt, per_cpu_offset(cpu), 0xFFFFF,
			0x2 | DESCTYPE_S, 0x8);
	gdt.s = 1;
	write_gdt_entry(get_cpu_gdt_rw(cpu),
			GDT_ENTRY_PERCPU, &gdt, DESCTYPE_S);
#endif
}

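/*
 * Set up the real percpu areas: allocate the first chunk, switch each
 * CPU's offset away from the boot-time BOOT_PERCPU_OFFSET default and
 * migrate the early per-cpu maps into the new areas.
 */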
void __init setup_per_cpu_areas(void)
{
	unsigned int cpu;
	unsigned long delta;
	int rc;

	pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n",
		NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids);

	/*
	 * Allocate percpu area.  Embedding allocator is our favorite;
	 * however, on NUMA configurations, it can result in very
	 * sparse unit mapping and vmalloc area isn't spacious enough
	 * on 32bit.  Use page in that case.
	 */
#ifdef CONFIG_X86_32
	if (pcpu_chosen_fc == PCPU_FC_AUTO && pcpu_need_numa())
		pcpu_chosen_fc = PCPU_FC_PAGE;
#endif
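	/*
	 * pcpu_chosen_fc starts out as PCPU_FC_AUTO and may have been
	 * overridden on the command line with percpu_alloc=.
	 */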
	rc = -EINVAL;
	if (pcpu_chosen_fc != PCPU_FC_PAGE) {
		const size_t dyn_size = PERCPU_MODULE_RESERVE +
			PERCPU_DYNAMIC_RESERVE - PERCPU_FIRST_CHUNK_RESERVE;
		size_t atom_size;

		/*
		 * On 64bit, use PMD_SIZE for atom_size so that embedded
		 * percpu areas are aligned to PMD.  This, in the future,
		 * can also allow using PMD mappings in vmalloc area.  Use
		 * PAGE_SIZE on 32bit as vmalloc space is highly contended
		 * and large vmalloc area allocs can easily fail.
		 */
#ifdef CONFIG_X86_64
		atom_size = PMD_SIZE;
#else
		atom_size = PAGE_SIZE;
#endif
		rc = pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
					    dyn_size, atom_size,
					    pcpu_cpu_distance,
					    pcpu_fc_alloc, pcpu_fc_free);
		if (rc < 0)
			pr_warning("%s allocator failed (%d), falling back to page size\n",
				   pcpu_fc_names[pcpu_chosen_fc], rc);
	}
	if (rc < 0)
		rc = pcpu_page_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
					   pcpu_fc_alloc, pcpu_fc_free,
					   pcpup_populate_pte);
	if (rc < 0)
		panic("cannot initialize percpu area (err=%d)", rc);

	/* alrighty, percpu areas up and running */
	delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
	for_each_possible_cpu(cpu) {
		per_cpu_offset(cpu) = delta + pcpu_unit_offsets[cpu];
		per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu);
		per_cpu(cpu_number, cpu) = cpu;
		setup_percpu_segment(cpu);
		setup_stack_canary_segment(cpu);
		/*
		 * Copy data used in early init routines from the
		 * initial arrays to the per cpu data areas.  These
		 * arrays then become expendable and the *_early_ptr's
		 * are zeroed indicating that the static arrays are
		 * gone.
		 */
#ifdef CONFIG_X86_LOCAL_APIC
		per_cpu(x86_cpu_to_apicid, cpu) =
			early_per_cpu_map(x86_cpu_to_apicid, cpu);
		per_cpu(x86_bios_cpu_apicid, cpu) =
			early_per_cpu_map(x86_bios_cpu_apicid, cpu);
		per_cpu(x86_cpu_to_acpiid, cpu) =
			early_per_cpu_map(x86_cpu_to_acpiid, cpu);
#endif
#ifdef CONFIG_X86_32
		per_cpu(x86_cpu_to_logical_apicid, cpu) =
			early_per_cpu_map(x86_cpu_to_logical_apicid, cpu);
#endif
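		/*
		 * irq_stack_ptr points at the top of this CPU's IRQ stack;
		 * the stack grows down from there.
		 */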
#ifdef CONFIG_X86_64
		per_cpu(irq_stack_ptr, cpu) =
			per_cpu(irq_stack_union.irq_stack, cpu) +
			IRQ_STACK_SIZE;
#endif
#ifdef CONFIG_NUMA
		per_cpu(x86_cpu_to_node_map, cpu) =
			early_per_cpu_map(x86_cpu_to_node_map, cpu);
		/*
		 * Ensure that the boot cpu numa_node is correct when the boot
		 * cpu is on a node that doesn't have memory installed.
		 * Also cpu_up() will call cpu_to_node() for APs when
		 * MEMORY_HOTPLUG is defined, before per_cpu(numa_node) is set
		 * up later with c_init aka intel_init/amd_init.
		 * So set them all (boot cpu and all APs).
		 */
		set_cpu_numa_node(cpu, early_cpu_to_node(cpu));
#endif
		/*
		 * Up to this point, the boot CPU has been using .init.data
		 * area.  Reload any changed state for the boot CPU.
		 */
		if (!cpu)
			switch_to_new_gdt(cpu);
	}

	/* indicate the early static arrays will soon be gone */
#ifdef CONFIG_X86_LOCAL_APIC
	early_per_cpu_ptr(x86_cpu_to_apicid) = NULL;
	early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL;
	early_per_cpu_ptr(x86_cpu_to_acpiid) = NULL;
#endif
#ifdef CONFIG_X86_32
	early_per_cpu_ptr(x86_cpu_to_logical_apicid) = NULL;
#endif
#ifdef CONFIG_NUMA
	early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;
#endif

	/* Setup node to cpumask map */
	setup_node_to_cpumask_map();

	/* Setup cpu initialized, callin, callout masks */
	setup_cpu_local_masks();

#ifdef CONFIG_X86_32
	/*
	 * Sync back kernel address range again.  We already did this in
	 * setup_arch(), but percpu data also needs to be available in
	 * the smpboot asm.  We can't reliably pick up percpu mappings
	 * using vmalloc_fault(), because exception dispatch needs
	 * percpu data.
	 */
	clone_pgd_range(initial_page_table + KERNEL_PGD_BOUNDARY,
			swapper_pg_dir     + KERNEL_PGD_BOUNDARY,
			KERNEL_PGD_PTRS);

	/*
	 * sync back low identity map too.  It is used for example
	 * in the 32-bit EFI stub.
	 */
	clone_pgd_range(initial_page_table,
			swapper_pg_dir     + KERNEL_PGD_BOUNDARY,
			min(KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY));
#endif
}