diff --git a/Documentation/cgroups/cpusets.txt b/Documentation/cgroups/cpusets.txt index 3c94ff3f9693c4fa876ddf47948a02258e2f2470..f2235a16252951c32b2a2ad151d92ac814b18e02 100644 --- a/Documentation/cgroups/cpusets.txt +++ b/Documentation/cgroups/cpusets.txt @@ -445,7 +445,7 @@ across partially overlapping sets of CPUs would risk unstable dynamics that would be beyond our understanding. So if each of two partially overlapping cpusets enables the flag 'cpuset.sched_load_balance', then we form a single sched domain that is a superset of both. We won't move -a task to a CPU outside it cpuset, but the scheduler load balancing +a task to a CPU outside its cpuset, but the scheduler load balancing code might waste some compute cycles considering that possibility. This mismatch is why there is not a simple one-to-one relation @@ -552,8 +552,8 @@ otherwise initial value -1 that indicates the cpuset has no request. 1 : search siblings (hyperthreads in a core). 2 : search cores in a package. 3 : search cpus in a node [= system wide on non-NUMA system] - ( 4 : search nodes in a chunk of node [on NUMA system] ) - ( 5 : search system wide [on NUMA system] ) + 4 : search nodes in a chunk of node [on NUMA system] + 5 : search system wide [on NUMA system] The system default is architecture dependent. The system default can be changed using the relax_domain_level= boot parameter. diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt index 46b2b508031794a4d274c8d4470aaf983e9d9066..a22df3ad35fff8a43d78517f790a9822aa5d8d55 100644 --- a/Documentation/cgroups/memory.txt +++ b/Documentation/cgroups/memory.txt @@ -326,7 +326,7 @@ per cgroup, instead of globally. * tcp memory pressure: sockets memory pressure for the tcp protocol. -2.7.3 Common use cases +2.7.2 Common use cases Because the "kmem" counter is fed to the main user counter, kernel memory can never be limited completely independently of user memory. Say "U" is the user @@ -354,19 +354,19 @@ set: 3. User Interface -0. Configuration +3.0. Configuration a. Enable CONFIG_CGROUPS b. Enable CONFIG_MEMCG c. Enable CONFIG_MEMCG_SWAP (to use swap extension) d. Enable CONFIG_MEMCG_KMEM (to use kmem extension) -1. Prepare the cgroups (see cgroups.txt, Why are cgroups needed?) +3.1. Prepare the cgroups (see cgroups.txt, Why are cgroups needed?) # mount -t tmpfs none /sys/fs/cgroup # mkdir /sys/fs/cgroup/memory # mount -t cgroup none /sys/fs/cgroup/memory -o memory -2. Make the new group and move bash into it +3.2. Make the new group and move bash into it # mkdir /sys/fs/cgroup/memory/0 # echo $$ > /sys/fs/cgroup/memory/0/tasks diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 43ecdcd39df23bf0189666907d91ba43024c78e7..4a337daf0c09812a284aff91007fff40048f2a87 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -829,6 +829,15 @@ bytes respectively. Such letter suffixes can also be entirely omitted. CONFIG_DEBUG_PAGEALLOC, hence this option will not help tracking down these problems. + debug_pagealloc= + [KNL] When CONFIG_DEBUG_PAGEALLOC is set, this + parameter enables the feature at boot time. In + default, it is disabled. We can avoid allocating huge + chunk of memory for debug pagealloc if we don't enable + it at boot time and the system will work mostly same + with the kernel built without CONFIG_DEBUG_PAGEALLOC. + on: enable the feature + debugpat [X86] Enable PAT debugging decnet.addr= [HW,NET] @@ -1228,9 +1237,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted. multiple times interleaved with hugepages= to reserve huge pages of different sizes. Valid pages sizes on x86-64 are 2M (when the CPU supports "pse") and 1G - (when the CPU supports the "pdpe1gb" cpuinfo flag) - Note that 1GB pages can only be allocated at boot time - using hugepages= and not freed afterwards. + (when the CPU supports the "pdpe1gb" cpuinfo flag). hvc_iucv= [S390] Number of z/VM IUCV hypervisor console (HVC) terminal devices. Valid values: 0..8 @@ -2506,6 +2513,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted. OSS [HW,OSS] See Documentation/sound/oss/oss-parameters.txt + page_owner= [KNL] Boot-time page_owner enabling option. + Storage of the information about who allocated + each page is disabled in default. With this switch, + we can turn it on. + on: enable the feature + panic= [KNL] Kernel behaviour on panic: delay <timeout> timeout > 0: seconds before rebooting timeout = 0: wait forever diff --git a/Documentation/local_ops.txt b/Documentation/local_ops.txt index 300da4bdfdbd703c4809bc4b3ec5433dfe634d1e..407576a233177c3c336827b952872c082207d9e4 100644 --- a/Documentation/local_ops.txt +++ b/Documentation/local_ops.txt @@ -8,6 +8,11 @@ to implement them for any given architecture and shows how they can be used properly. It also stresses on the precautions that must be taken when reading those local variables across CPUs when the order of memory writes matters. +Note that local_t based operations are not recommended for general kernel use. +Please use the this_cpu operations instead unless there is really a special purpose. +Most uses of local_t in the kernel have been replaced by this_cpu operations. +this_cpu operations combine the relocation with the local_t like semantics in +a single instruction and yield more compact and faster executing code. * Purpose of local atomic operations @@ -87,10 +92,10 @@ the per cpu variable. For instance : local_inc(&get_cpu_var(counters)); put_cpu_var(counters); -If you are already in a preemption-safe context, you can directly use -__get_cpu_var() instead. +If you are already in a preemption-safe context, you can use +this_cpu_ptr() instead. - local_inc(&__get_cpu_var(counters)); + local_inc(this_cpu_ptr(&counters)); @@ -134,7 +139,7 @@ static void test_each(void *info) { /* Increment the counter from a non preemptible context */ printk("Increment on cpu %d\n", smp_processor_id()); - local_inc(&__get_cpu_var(counters)); + local_inc(this_cpu_ptr(&counters)); /* This is what incrementing the variable would look like within a * preemptible context (it disables preemption) : diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index b5d0c8501a189c8e2473bea02502cd6a5932eecd..75511efefc64b94c3110303c8fac4e78bb5356c8 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -116,10 +116,12 @@ set during run time. auto_msgmni: -Enables/Disables automatic recomputing of msgmni upon memory add/remove -or upon ipc namespace creation/removal (see the msgmni description -above). Echoing "1" into this file enables msgmni automatic recomputing. -Echoing "0" turns it off. auto_msgmni default value is 1. +This variable has no effect and may be removed in future kernel +releases. Reading it always returns 0. +Up to Linux 3.17, it enabled/disabled automatic recomputing of msgmni +upon memory add/remove or upon ipc namespace creation/removal. +Echoing "1" into this file enabled msgmni automatic recomputing. +Echoing "0" turned it off. auto_msgmni default value was 1. ============================================================== diff --git a/Documentation/vm/page_owner.txt b/Documentation/vm/page_owner.txt new file mode 100644 index 0000000000000000000000000000000000000000..8f3ce9b3aa11b27c52bc20b395765b5df0833d71 --- /dev/null +++ b/Documentation/vm/page_owner.txt @@ -0,0 +1,81 @@ +page owner: Tracking about who allocated each page +----------------------------------------------------------- + +* Introduction + +page owner is for the tracking about who allocated each page. +It can be used to debug memory leak or to find a memory hogger. +When allocation happens, information about allocation such as call stack +and order of pages is stored into certain storage for each page. +When we need to know about status of all pages, we can get and analyze +this information. + +Although we already have tracepoint for tracing page allocation/free, +using it for analyzing who allocate each page is rather complex. We need +to enlarge the trace buffer for preventing overlapping until userspace +program launched. And, launched program continually dump out the trace +buffer for later analysis and it would change system behviour with more +possibility rather than just keeping it in memory, so bad for debugging. + +page owner can also be used for various purposes. For example, accurate +fragmentation statistics can be obtained through gfp flag information of +each page. It is already implemented and activated if page owner is +enabled. Other usages are more than welcome. + +page owner is disabled in default. So, if you'd like to use it, you need +to add "page_owner=on" into your boot cmdline. If the kernel is built +with page owner and page owner is disabled in runtime due to no enabling +boot option, runtime overhead is marginal. If disabled in runtime, it +doesn't require memory to store owner information, so there is no runtime +memory overhead. And, page owner inserts just two unlikely branches into +the page allocator hotpath and if it returns false then allocation is +done like as the kernel without page owner. These two unlikely branches +would not affect to allocation performance. Following is the kernel's +code size change due to this facility. + +- Without page owner + text data bss dec hex filename + 40662 1493 644 42799 a72f mm/page_alloc.o + +- With page owner + text data bss dec hex filename + 40892 1493 644 43029 a815 mm/page_alloc.o + 1427 24 8 1459 5b3 mm/page_ext.o + 2722 50 0 2772 ad4 mm/page_owner.o + +Although, roughly, 4 KB code is added in total, page_alloc.o increase by +230 bytes and only half of it is in hotpath. Building the kernel with +page owner and turning it on if needed would be great option to debug +kernel memory problem. + +There is one notice that is caused by implementation detail. page owner +stores information into the memory from struct page extension. This memory +is initialized some time later than that page allocator starts in sparse +memory system, so, until initialization, many pages can be allocated and +they would have no owner information. To fix it up, these early allocated +pages are investigated and marked as allocated in initialization phase. +Although it doesn't mean that they have the right owner information, +at least, we can tell whether the page is allocated or not, +more accurately. On 2GB memory x86-64 VM box, 13343 early allocated pages +are catched and marked, although they are mostly allocated from struct +page extension feature. Anyway, after that, no page is left in +un-tracking state. + +* Usage + +1) Build user-space helper + cd tools/vm + make page_owner_sort + +2) Enable page owner + Add "page_owner=on" to boot cmdline. + +3) Do the job what you want to debug + +4) Analyze information from page owner + cat /sys/kernel/debug/page_owner > page_owner_full.txt + grep -v ^PFN page_owner_full.txt > page_owner.txt + ./page_owner_sort page_owner.txt sorted_page_owner.txt + + See the result about who allocated each page + in the sorted_page_owner.txt. diff --git a/MAINTAINERS b/MAINTAINERS index 326dc2d1652d210368e76e9f00808cb5c0af4f0e..1f0ef48830f91fb8aa3b2cf6ebeaa6ec61e14102 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4045,7 +4045,7 @@ F: drivers/tty/serial/ucc_uart.c FREESCALE SOC SOUND DRIVERS M: Timur Tabi <timur@tabi.org> M: Nicolin Chen <nicoleotsuka@gmail.com> -M: Xiubo Li <Li.Xiubo@freescale.com> +M: Xiubo Li <Xiubo.Lee@gmail.com> L: alsa-devel@alsa-project.org (moderated for non-subscribers) L: linuxppc-dev@lists.ozlabs.org S: Maintained diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 0bee1fe209b10923a65197e63903e67ef197c29e..97d07ed60a0b7b01cad35891c7695a2d3d4a1640 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -5,6 +5,7 @@ config ARM select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_HAVE_CUSTOM_GPIO_H + select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_USE_BUILTIN_BSWAP diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 6b1ebd964c108429f7feff86f74b17b124229f88..688db03ef5b8f606b1db8de5db7221e4643b3b3e 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -2,6 +2,7 @@ config ARM64 def_bool y select ARCH_BINFMT_ELF_RANDOMIZE_PIE select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE + select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_SG_CHAIN select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_USE_CMPXCHG_LOCKREF diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig index a7736fa0580cc2e000cf1613d7b21dbe7aba4dcc..0bce820428fca2e32d5ae5e04714b372fdeb4adc 100644 --- a/arch/microblaze/Kconfig +++ b/arch/microblaze/Kconfig @@ -1,5 +1,6 @@ config MICROBLAZE def_bool y + select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_WANT_IPC_PARSE_VERSION select ARCH_WANT_OPTIONAL_GPIOLIB diff --git a/arch/parisc/lib/fixup.S b/arch/parisc/lib/fixup.S index f8c45cc2947ded5f5178b7a24cecc16422aa7a12..536ef66bb94b5aa8c46800a968514184b5e2d85d 100644 --- a/arch/parisc/lib/fixup.S +++ b/arch/parisc/lib/fixup.S @@ -38,14 +38,14 @@ LDREGX \t2(\t1),\t2 addil LT%exception_data,%r27 LDREG RT%exception_data(%r1),\t1 - /* t1 = &__get_cpu_var(exception_data) */ + /* t1 = this_cpu_ptr(&exception_data) */ add,l \t1,\t2,\t1 /* t1 = t1->fault_ip */ LDREG EXCDATA_IP(\t1), \t1 .endm #else .macro get_fault_ip t1 t2 - /* t1 = &__get_cpu_var(exception_data) */ + /* t1 = this_cpu_ptr(&exception_data) */ addil LT%exception_data,%r27 LDREG RT%exception_data(%r1),\t2 /* t1 = t2->fault_ip */ diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index af696874248bb12e51298e8d59706d79295ec4aa..a2a168e2dfe757c02a6935a4a58dfc83af7f86bd 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -129,6 +129,7 @@ config PPC select HAVE_BPF_JIT if PPC64 select HAVE_ARCH_JUMP_LABEL select ARCH_HAVE_NMI_SAFE_CMPXCHG + select ARCH_HAS_GCOV_PROFILE_ALL select GENERIC_SMP_IDLE_THREAD select GENERIC_CMOS_UPDATE select GENERIC_TIME_VSYSCALL_OLD diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index e56a307bc676b392ec9d41a9d41fbd16c9925ca3..2c2022d1605905c0bea4415e44ad2fde47c88df2 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -1514,7 +1514,7 @@ static void kernel_unmap_linear_page(unsigned long vaddr, unsigned long lmi) mmu_kernel_ssize, 0); } -void kernel_map_pages(struct page *page, int numpages, int enable) +void __kernel_map_pages(struct page *page, int numpages, int enable) { unsigned long flags, vaddr, lmi; int i; diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c index d545b12315947c76ff25d5db989a1aec7c60c166..50fad3801f3040d1ee479e3af963576dbaeb0ce9 100644 --- a/arch/powerpc/mm/pgtable_32.c +++ b/arch/powerpc/mm/pgtable_32.c @@ -429,7 +429,7 @@ static int change_page_attr(struct page *page, int numpages, pgprot_t prot) } -void kernel_map_pages(struct page *page, int numpages, int enable) +void __kernel_map_pages(struct page *page, int numpages, int enable) { if (PageHighMem(page)) return; diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index f2cf1f90295b6b38e855b398a3e0a2180e82fe4e..68b68d755fdfe46c0a9aecad0cebef4e1e079487 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -65,6 +65,7 @@ config S390 def_bool y select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS + select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAVE_NMI_SAFE_CMPXCHG select ARCH_INLINE_READ_LOCK select ARCH_INLINE_READ_LOCK_BH diff --git a/arch/s390/mm/pageattr.c b/arch/s390/mm/pageattr.c index 3fef3b299665797bbd486baec5239d5f88705428..426c9d462d1ce3fe58e021802ffedd3fc413cd35 100644 --- a/arch/s390/mm/pageattr.c +++ b/arch/s390/mm/pageattr.c @@ -120,7 +120,7 @@ static void ipte_range(pte_t *pte, unsigned long address, int nr) } } -void kernel_map_pages(struct page *page, int numpages, int enable) +void __kernel_map_pages(struct page *page, int numpages, int enable) { unsigned long address; int nr, i, j; diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index a1403470f80e69b5e05de75ae3a8f0058e9414ab..c6b6ee5f38b2dd43f612999ea2516254bf049868 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -16,6 +16,7 @@ config SUPERH select HAVE_DEBUG_BUGVERBOSE select ARCH_HAVE_CUSTOM_GPIO_H select ARCH_HAVE_NMI_SAFE_CMPXCHG if (GUSA_RB || CPU_SH4A) + select ARCH_HAS_GCOV_PROFILE_ALL select PERF_USE_VMALLOC select HAVE_DEBUG_KMEMLEAK select HAVE_KERNEL_GZIP diff --git a/arch/sparc/include/uapi/asm/unistd.h b/arch/sparc/include/uapi/asm/unistd.h index 46d83842eddc6ceae0496b6397a0a04e77ce0e10..6f35f4df17f281f5aa52c31381a6b781629bbbb0 100644 --- a/arch/sparc/include/uapi/asm/unistd.h +++ b/arch/sparc/include/uapi/asm/unistd.h @@ -415,8 +415,9 @@ #define __NR_getrandom 347 #define __NR_memfd_create 348 #define __NR_bpf 349 +#define __NR_execveat 350 -#define NR_syscalls 350 +#define NR_syscalls 351 /* Bitmask values returned from kern_features system call. */ #define KERN_FEATURE_MIXED_MODE_STACK 0x00000001 diff --git a/arch/sparc/kernel/syscalls.S b/arch/sparc/kernel/syscalls.S index 33a17e7b3ccd0fad8477d4c5b051cca1b24f248a..bb0008927598b1f7bbeeccdf30c6fa154219a4ce 100644 --- a/arch/sparc/kernel/syscalls.S +++ b/arch/sparc/kernel/syscalls.S @@ -6,6 +6,11 @@ sys64_execve: jmpl %g1, %g0 flushw +sys64_execveat: + set sys_execveat, %g1 + jmpl %g1, %g0 + flushw + #ifdef CONFIG_COMPAT sunos_execv: mov %g0, %o2 @@ -13,6 +18,11 @@ sys32_execve: set compat_sys_execve, %g1 jmpl %g1, %g0 flushw + +sys32_execveat: + set compat_sys_execveat, %g1 + jmpl %g1, %g0 + flushw #endif .align 32 diff --git a/arch/sparc/kernel/systbls_32.S b/arch/sparc/kernel/systbls_32.S index ad0cdf497b785de84fe3f8f5e1e092a4f2de7429..e31a9056a3039ad5e5b2b7da5abe7f18509d1107 100644 --- a/arch/sparc/kernel/systbls_32.S +++ b/arch/sparc/kernel/systbls_32.S @@ -87,3 +87,4 @@ sys_call_table: /*335*/ .long sys_syncfs, sys_sendmmsg, sys_setns, sys_process_vm_readv, sys_process_vm_writev /*340*/ .long sys_ni_syscall, sys_kcmp, sys_finit_module, sys_sched_setattr, sys_sched_getattr /*345*/ .long sys_renameat2, sys_seccomp, sys_getrandom, sys_memfd_create, sys_bpf +/*350*/ .long sys_execveat diff --git a/arch/sparc/kernel/systbls_64.S b/arch/sparc/kernel/systbls_64.S index 580cde9370c9317a2cc8b6522579d5c38fa48ee3..d72f76ae70eba5a7418630c7e1890a6e239675cd 100644 --- a/arch/sparc/kernel/systbls_64.S +++ b/arch/sparc/kernel/systbls_64.S @@ -88,6 +88,7 @@ sys_call_table32: .word sys_syncfs, compat_sys_sendmmsg, sys_setns, compat_sys_process_vm_readv, compat_sys_process_vm_writev /*340*/ .word sys_kern_features, sys_kcmp, sys_finit_module, sys_sched_setattr, sys_sched_getattr .word sys32_renameat2, sys_seccomp, sys_getrandom, sys_memfd_create, sys_bpf +/*350*/ .word sys32_execveat #endif /* CONFIG_COMPAT */ @@ -167,3 +168,4 @@ sys_call_table: .word sys_syncfs, sys_sendmmsg, sys_setns, sys_process_vm_readv, sys_process_vm_writev /*340*/ .word sys_kern_features, sys_kcmp, sys_finit_module, sys_sched_setattr, sys_sched_getattr .word sys_renameat2, sys_seccomp, sys_getrandom, sys_memfd_create, sys_bpf +/*350*/ .word sys64_execveat diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index 2d91c62f7f5f156524b7bef4ff7737701c08db85..3ea267c53320d49683ab39c9e0d95189e4adb56e 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -1621,7 +1621,7 @@ static void __init kernel_physical_mapping_init(void) } #ifdef CONFIG_DEBUG_PAGEALLOC -void kernel_map_pages(struct page *page, int numpages, int enable) +void __kernel_map_pages(struct page *page, int numpages, int enable) { unsigned long phys_start = page_to_pfn(page) << PAGE_SHIFT; unsigned long phys_end = phys_start + (numpages * PAGE_SIZE); diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index bea3a0159496c3592eaabffa3706bf503c390294..d69f1cd87fd996bcf7052d7abdf2ad43325402b0 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -24,6 +24,7 @@ config X86 select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS select ARCH_HAS_FAST_MULTIPLIER + select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_MIGHT_HAVE_PC_SERIO select HAVE_AOUT if X86_32 diff --git a/arch/x86/ia32/audit.c b/arch/x86/ia32/audit.c index 5d7b381da692e082ae4edb75a71e8a4c009bdb34..2eccc8932ae6f9229296d74c09503ebc2714432f 100644 --- a/arch/x86/ia32/audit.c +++ b/arch/x86/ia32/audit.c @@ -35,6 +35,7 @@ int ia32_classify_syscall(unsigned syscall) case __NR_socketcall: return 4; case __NR_execve: + case __NR_execveat: return 5; default: return 1; diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index ffe71228fc10c3080795c576b1ffae6b62160d7f..82e8a1d446583efaa5a3c0426887d3a47abc5ddd 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -480,6 +480,7 @@ GLOBAL(\label) PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn PTREGSCALL stub32_sigreturn, sys32_sigreturn PTREGSCALL stub32_execve, compat_sys_execve + PTREGSCALL stub32_execveat, compat_sys_execveat PTREGSCALL stub32_fork, sys_fork PTREGSCALL stub32_vfork, sys_vfork diff --git a/arch/x86/kernel/audit_64.c b/arch/x86/kernel/audit_64.c index 06d3e5a14d9dd70e2f709cef205c269f0a514203..f3672508b2491b385dddbe83c864c455eccba4b0 100644 --- a/arch/x86/kernel/audit_64.c +++ b/arch/x86/kernel/audit_64.c @@ -50,6 +50,7 @@ int audit_classify_syscall(int abi, unsigned syscall) case __NR_openat: return 3; case __NR_execve: + case __NR_execveat: return 5; default: return 0; diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index c0226ab541061870bb590a9c817d0770e618697c..90878aa38dbd7fac20bb42386248f1d860f26ff1 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -652,6 +652,20 @@ ENTRY(stub_execve) CFI_ENDPROC END(stub_execve) +ENTRY(stub_execveat) + CFI_STARTPROC + addq $8, %rsp + PARTIAL_FRAME 0 + SAVE_REST + FIXUP_TOP_OF_STACK %r11 + call sys_execveat + RESTORE_TOP_OF_STACK %r11 + movq %rax,RAX(%rsp) + RESTORE_REST + jmp int_ret_from_sys_call + CFI_ENDPROC +END(stub_execveat) + /* * sigreturn is special because it needs to restore all registers on return. * This cannot be done with SYSRET, so use the IRET return path instead. @@ -697,6 +711,20 @@ ENTRY(stub_x32_execve) CFI_ENDPROC END(stub_x32_execve) +ENTRY(stub_x32_execveat) + CFI_STARTPROC + addq $8, %rsp + PARTIAL_FRAME 0 + SAVE_REST + FIXUP_TOP_OF_STACK %r11 + call compat_sys_execveat + RESTORE_TOP_OF_STACK %r11 + movq %rax,RAX(%rsp) + RESTORE_REST + jmp int_ret_from_sys_call + CFI_ENDPROC +END(stub_x32_execveat) + #endif /* diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index a3a5d46605d299068fc00d27fb52db5c8d61678b..dfaf2e0f5f8fc504908266cd1767e16fb64afaff 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -1817,7 +1817,7 @@ static int __set_pages_np(struct page *page, int numpages) return __change_page_attr_set_clr(&cpa, 0); } -void kernel_map_pages(struct page *page, int numpages, int enable) +void __kernel_map_pages(struct page *page, int numpages, int enable) { if (PageHighMem(page)) return; diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl index 9fe1b5d002f0ad5a30039ba672c85dbbf1c3ca38..b3560ece1c9fbae0ff05c14824b76a07efdb0776 100644 --- a/arch/x86/syscalls/syscall_32.tbl +++ b/arch/x86/syscalls/syscall_32.tbl @@ -364,3 +364,4 @@ 355 i386 getrandom sys_getrandom 356 i386 memfd_create sys_memfd_create 357 i386 bpf sys_bpf +358 i386 execveat sys_execveat stub32_execveat diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl index 281150b539a2fb8eb802816612e768cf25dc9990..8d656fbb57aab2606132858d48b2db7e99e8bf7c 100644 --- a/arch/x86/syscalls/syscall_64.tbl +++ b/arch/x86/syscalls/syscall_64.tbl @@ -328,6 +328,7 @@ 319 common memfd_create sys_memfd_create 320 common kexec_file_load sys_kexec_file_load 321 common bpf sys_bpf +322 64 execveat stub_execveat # # x32-specific system call numbers start at 512 to avoid cache impact @@ -366,3 +367,4 @@ 542 x32 getsockopt compat_sys_getsockopt 543 x32 io_setup compat_sys_io_setup 544 x32 io_submit compat_sys_io_submit +545 x32 execveat stub_x32_execveat diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c index f2f0723070caea848a88fe1f96863943fc3df121..20c3649d06915cce37ea62b5ee8c9a6d704c0a62 100644 --- a/arch/x86/um/sys_call_table_64.c +++ b/arch/x86/um/sys_call_table_64.c @@ -31,6 +31,7 @@ #define stub_fork sys_fork #define stub_vfork sys_vfork #define stub_execve sys_execve +#define stub_execveat sys_execveat #define stub_rt_sigreturn sys_rt_sigreturn #define __SYSCALL_COMMON(nr, sym, compat) __SYSCALL_64(nr, sym, compat) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 7c5d87191b286a1275f43ef380d7cc0d9db6b43b..85be040a21c80505c09183c1578eb9c15f524244 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -228,8 +228,8 @@ memory_block_action(unsigned long phys_index, unsigned long action, int online_t struct page *first_page; int ret; - first_page = pfn_to_page(phys_index << PFN_SECTION_SHIFT); - start_pfn = page_to_pfn(first_page); + start_pfn = phys_index << PFN_SECTION_SHIFT; + first_page = pfn_to_page(start_pfn); switch (action) { case MEM_ONLINE: diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 3920ee45aa5942dd816a775180eeb16f662e804c..bd8bda386e02180c989f2701c7ef056fbe51b24d 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -44,15 +44,14 @@ static const char *default_compressor = "lzo"; static unsigned int num_devices = 1; #define ZRAM_ATTR_RO(name) \ -static ssize_t zram_attr_##name##_show(struct device *d, \ +static ssize_t name##_show(struct device *d, \ struct device_attribute *attr, char *b) \ { \ struct zram *zram = dev_to_zram(d); \ return scnprintf(b, PAGE_SIZE, "%llu\n", \ (u64)atomic64_read(&zram->stats.name)); \ } \ -static struct device_attribute dev_attr_##name = \ - __ATTR(name, S_IRUGO, zram_attr_##name##_show, NULL); +static DEVICE_ATTR_RO(name); static inline int init_done(struct zram *zram) { @@ -287,19 +286,18 @@ static inline int is_partial_io(struct bio_vec *bvec) /* * Check if request is within bounds and aligned on zram logical blocks. */ -static inline int valid_io_request(struct zram *zram, struct bio *bio) +static inline int valid_io_request(struct zram *zram, + sector_t start, unsigned int size) { - u64 start, end, bound; + u64 end, bound; /* unaligned request */ - if (unlikely(bio->bi_iter.bi_sector & - (ZRAM_SECTOR_PER_LOGICAL_BLOCK - 1))) + if (unlikely(start & (ZRAM_SECTOR_PER_LOGICAL_BLOCK - 1))) return 0; - if (unlikely(bio->bi_iter.bi_size & (ZRAM_LOGICAL_BLOCK_SIZE - 1))) + if (unlikely(size & (ZRAM_LOGICAL_BLOCK_SIZE - 1))) return 0; - start = bio->bi_iter.bi_sector; - end = start + (bio->bi_iter.bi_size >> SECTOR_SHIFT); + end = start + (size >> SECTOR_SHIFT); bound = zram->disksize >> SECTOR_SHIFT; /* out of range range */ if (unlikely(start >= bound || end > bound || start > end)) @@ -453,7 +451,7 @@ static int zram_decompress_page(struct zram *zram, char *mem, u32 index) } static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, - u32 index, int offset, struct bio *bio) + u32 index, int offset) { int ret; struct page *page; @@ -645,14 +643,13 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, } static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, - int offset, struct bio *bio) + int offset, int rw) { int ret; - int rw = bio_data_dir(bio); if (rw == READ) { atomic64_inc(&zram->stats.num_reads); - ret = zram_bvec_read(zram, bvec, index, offset, bio); + ret = zram_bvec_read(zram, bvec, index, offset); } else { atomic64_inc(&zram->stats.num_writes); ret = zram_bvec_write(zram, bvec, index, offset); @@ -853,7 +850,7 @@ static ssize_t reset_store(struct device *dev, static void __zram_make_request(struct zram *zram, struct bio *bio) { - int offset; + int offset, rw; u32 index; struct bio_vec bvec; struct bvec_iter iter; @@ -868,6 +865,7 @@ static void __zram_make_request(struct zram *zram, struct bio *bio) return; } + rw = bio_data_dir(bio); bio_for_each_segment(bvec, bio, iter) { int max_transfer_size = PAGE_SIZE - offset; @@ -882,15 +880,15 @@ static void __zram_make_request(struct zram *zram, struct bio *bio) bv.bv_len = max_transfer_size; bv.bv_offset = bvec.bv_offset; - if (zram_bvec_rw(zram, &bv, index, offset, bio) < 0) + if (zram_bvec_rw(zram, &bv, index, offset, rw) < 0) goto out; bv.bv_len = bvec.bv_len - max_transfer_size; bv.bv_offset += max_transfer_size; - if (zram_bvec_rw(zram, &bv, index + 1, 0, bio) < 0) + if (zram_bvec_rw(zram, &bv, index + 1, 0, rw) < 0) goto out; } else - if (zram_bvec_rw(zram, &bvec, index, offset, bio) < 0) + if (zram_bvec_rw(zram, &bvec, index, offset, rw) < 0) goto out; update_position(&index, &offset, &bvec); @@ -915,7 +913,8 @@ static void zram_make_request(struct request_queue *queue, struct bio *bio) if (unlikely(!init_done(zram))) goto error; - if (!valid_io_request(zram, bio)) { + if (!valid_io_request(zram, bio->bi_iter.bi_sector, + bio->bi_iter.bi_size)) { atomic64_inc(&zram->stats.invalid_io); goto error; } @@ -945,25 +944,64 @@ static void zram_slot_free_notify(struct block_device *bdev, atomic64_inc(&zram->stats.notify_free); } +static int zram_rw_page(struct block_device *bdev, sector_t sector, + struct page *page, int rw) +{ + int offset, err; + u32 index; + struct zram *zram; + struct bio_vec bv; + + zram = bdev->bd_disk->private_data; + if (!valid_io_request(zram, sector, PAGE_SIZE)) { + atomic64_inc(&zram->stats.invalid_io); + return -EINVAL; + } + + down_read(&zram->init_lock); + if (unlikely(!init_done(zram))) { + err = -EIO; + goto out_unlock; + } + + index = sector >> SECTORS_PER_PAGE_SHIFT; + offset = sector & (SECTORS_PER_PAGE - 1) << SECTOR_SHIFT; + + bv.bv_page = page; + bv.bv_len = PAGE_SIZE; + bv.bv_offset = 0; + + err = zram_bvec_rw(zram, &bv, index, offset, rw); +out_unlock: + up_read(&zram->init_lock); + /* + * If I/O fails, just return error(ie, non-zero) without + * calling page_endio. + * It causes resubmit the I/O with bio request by upper functions + * of rw_page(e.g., swap_readpage, __swap_writepage) and + * bio->bi_end_io does things to handle the error + * (e.g., SetPageError, set_page_dirty and extra works). + */ + if (err == 0) + page_endio(page, rw, 0); + return err; +} + static const struct block_device_operations zram_devops = { .swap_slot_free_notify = zram_slot_free_notify, + .rw_page = zram_rw_page, .owner = THIS_MODULE }; -static DEVICE_ATTR(disksize, S_IRUGO | S_IWUSR, - disksize_show, disksize_store); -static DEVICE_ATTR(initstate, S_IRUGO, initstate_show, NULL); -static DEVICE_ATTR(reset, S_IWUSR, NULL, reset_store); -static DEVICE_ATTR(orig_data_size, S_IRUGO, orig_data_size_show, NULL); -static DEVICE_ATTR(mem_used_total, S_IRUGO, mem_used_total_show, NULL); -static DEVICE_ATTR(mem_limit, S_IRUGO | S_IWUSR, mem_limit_show, - mem_limit_store); -static DEVICE_ATTR(mem_used_max, S_IRUGO | S_IWUSR, mem_used_max_show, - mem_used_max_store); -static DEVICE_ATTR(max_comp_streams, S_IRUGO | S_IWUSR, - max_comp_streams_show, max_comp_streams_store); -static DEVICE_ATTR(comp_algorithm, S_IRUGO | S_IWUSR, - comp_algorithm_show, comp_algorithm_store); +static DEVICE_ATTR_RW(disksize); +static DEVICE_ATTR_RO(initstate); +static DEVICE_ATTR_WO(reset); +static DEVICE_ATTR_RO(orig_data_size); +static DEVICE_ATTR_RO(mem_used_total); +static DEVICE_ATTR_RW(mem_limit); +static DEVICE_ATTR_RW(mem_used_max); +static DEVICE_ATTR_RW(max_comp_streams); +static DEVICE_ATTR_RW(comp_algorithm); ZRAM_ATTR_RO(num_reads); ZRAM_ATTR_RO(num_writes); diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index c6ee271317f5b344619bdc7c01bc16a858f18a9f..b05a816b09acbf41ba5067274f36c601bd4dbb00 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -66,8 +66,8 @@ static const size_t max_zpage_size = PAGE_SIZE / 4 * 3; /* Flags for zram pages (table[page_no].value) */ enum zram_pageflags { /* Page consists entirely of zeros */ - ZRAM_ZERO = ZRAM_FLAG_SHIFT + 1, - ZRAM_ACCESS, /* page in now accessed */ + ZRAM_ZERO = ZRAM_FLAG_SHIFT, + ZRAM_ACCESS, /* page is now accessed */ __NR_ZRAM_PAGEFLAGS, }; diff --git a/drivers/iommu/amd_iommu_v2.c b/drivers/iommu/amd_iommu_v2.c index a2d87a60c27f1c8d2c91264690b036b4c7174f5b..bea878f8e7d3723284671437d892d5846a830b25 100644 --- a/drivers/iommu/amd_iommu_v2.c +++ b/drivers/iommu/amd_iommu_v2.c @@ -509,45 +509,67 @@ static void finish_pri_tag(struct device_state *dev_state, spin_unlock_irqrestore(&pasid_state->lock, flags); } +static void handle_fault_error(struct fault *fault) +{ + int status; + + if (!fault->dev_state->inv_ppr_cb) { + set_pri_tag_status(fault->state, fault->tag, PPR_INVALID); + return; + } + + status = fault->dev_state->inv_ppr_cb(fault->dev_state->pdev, + fault->pasid, + fault->address, + fault->flags); + switch (status) { + case AMD_IOMMU_INV_PRI_RSP_SUCCESS: + set_pri_tag_status(fault->state, fault->tag, PPR_SUCCESS); + break; + case AMD_IOMMU_INV_PRI_RSP_INVALID: + set_pri_tag_status(fault->state, fault->tag, PPR_INVALID); + break; + case AMD_IOMMU_INV_PRI_RSP_FAIL: + set_pri_tag_status(fault->state, fault->tag, PPR_FAILURE); + break; + default: + BUG(); + } +} + static void do_fault(struct work_struct *work) { struct fault *fault = container_of(work, struct fault, work); - int npages, write; - struct page *page; + struct mm_struct *mm; + struct vm_area_struct *vma; + u64 address; + int ret, write; write = !!(fault->flags & PPR_FAULT_WRITE); - down_read(&fault->state->mm->mmap_sem); - npages = get_user_pages(NULL, fault->state->mm, - fault->address, 1, write, 0, &page, NULL); - up_read(&fault->state->mm->mmap_sem); - - if (npages == 1) { - put_page(page); - } else if (fault->dev_state->inv_ppr_cb) { - int status; - - status = fault->dev_state->inv_ppr_cb(fault->dev_state->pdev, - fault->pasid, - fault->address, - fault->flags); - switch (status) { - case AMD_IOMMU_INV_PRI_RSP_SUCCESS: - set_pri_tag_status(fault->state, fault->tag, PPR_SUCCESS); - break; - case AMD_IOMMU_INV_PRI_RSP_INVALID: - set_pri_tag_status(fault->state, fault->tag, PPR_INVALID); - break; - case AMD_IOMMU_INV_PRI_RSP_FAIL: - set_pri_tag_status(fault->state, fault->tag, PPR_FAILURE); - break; - default: - BUG(); - } - } else { - set_pri_tag_status(fault->state, fault->tag, PPR_INVALID); + mm = fault->state->mm; + address = fault->address; + + down_read(&mm->mmap_sem); + vma = find_extend_vma(mm, address); + if (!vma || address < vma->vm_start) { + /* failed to get a vma in the right range */ + up_read(&mm->mmap_sem); + handle_fault_error(fault); + goto out; + } + + ret = handle_mm_fault(mm, vma, address, write); + if (ret & VM_FAULT_ERROR) { + /* failed to service fault */ + up_read(&mm->mmap_sem); + handle_fault_error(fault); + goto out; } + up_read(&mm->mmap_sem); + +out: finish_pri_tag(fault->dev_state, fault->state, fault->tag); put_pasid_state(fault->state); diff --git a/drivers/rtc/rtc-snvs.c b/drivers/rtc/rtc-snvs.c index 2cd8ffe5c6981f986a086989322bae38d611e053..942b267c62713dfa4fd25a497ede6f9b3a45b193 100644 --- a/drivers/rtc/rtc-snvs.c +++ b/drivers/rtc/rtc-snvs.c @@ -344,13 +344,20 @@ static int snvs_rtc_resume(struct device *dev) return 0; } -#endif static const struct dev_pm_ops snvs_rtc_pm_ops = { .suspend_noirq = snvs_rtc_suspend, .resume_noirq = snvs_rtc_resume, }; +#define SNVS_RTC_PM_OPS (&snvs_rtc_pm_ops) + +#else + +#define SNVS_RTC_PM_OPS NULL + +#endif + static const struct of_device_id snvs_dt_ids[] = { { .compatible = "fsl,sec-v4.0-mon-rtc-lp", }, { /* sentinel */ } @@ -361,7 +368,7 @@ static struct platform_driver snvs_rtc_driver = { .driver = { .name = "snvs_rtc", .owner = THIS_MODULE, - .pm = &snvs_rtc_pm_ops, + .pm = SNVS_RTC_PM_OPS, .of_match_table = snvs_dt_ids, }, .probe = snvs_rtc_probe, diff --git a/drivers/staging/android/ashmem.c b/drivers/staging/android/ashmem.c index ad4f5790a76f5005fbb9d55fcf67cb85bf887770..46f8ef42559e6bf381dc82428e6c6794ca2578e5 100644 --- a/drivers/staging/android/ashmem.c +++ b/drivers/staging/android/ashmem.c @@ -418,7 +418,7 @@ static int ashmem_mmap(struct file *file, struct vm_area_struct *vma) } /* - * ashmem_shrink - our cache shrinker, called from mm/vmscan.c :: shrink_slab + * ashmem_shrink - our cache shrinker, called from mm/vmscan.c * * 'nr_to_scan' is the number of objects to scan for freeing. * @@ -785,7 +785,6 @@ static long ashmem_ioctl(struct file *file, unsigned int cmd, unsigned long arg) .nr_to_scan = LONG_MAX, }; ret = ashmem_shrink_count(&ashmem_shrinker, &sc); - nodes_setall(sc.nodes_to_scan); ashmem_shrink_scan(&ashmem_shrinker, &sc); } break; diff --git a/fs/affs/affs.h b/fs/affs/affs.h index 9bca88159725e0b258a808786b3ff145ee1b9c16..ff44ff3ff015f6bda981be87dde47d79bf2a7fcc 100644 --- a/fs/affs/affs.h +++ b/fs/affs/affs.h @@ -135,8 +135,10 @@ extern void affs_fix_checksum(struct super_block *sb, struct buffer_head *bh); extern void secs_to_datestamp(time_t secs, struct affs_date *ds); extern umode_t prot_to_mode(u32 prot); extern void mode_to_prot(struct inode *inode); +__printf(3, 4) extern void affs_error(struct super_block *sb, const char *function, const char *fmt, ...); +__printf(3, 4) extern void affs_warning(struct super_block *sb, const char *function, const char *fmt, ...); extern bool affs_nofilenametruncate(const struct dentry *dentry); diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c index 937ce8754b2416b1945b35006b685aa6d66e2718..c852f2fa17102a24935f27adb68198fe90a82f92 100644 --- a/fs/affs/amigaffs.c +++ b/fs/affs/amigaffs.c @@ -10,8 +10,6 @@ #include "affs.h" -static char ErrorBuffer[256]; - /* * Functions for accessing Amiga-FFS structures. */ @@ -444,30 +442,30 @@ mode_to_prot(struct inode *inode) void affs_error(struct super_block *sb, const char *function, const char *fmt, ...) { - va_list args; - - va_start(args,fmt); - vsnprintf(ErrorBuffer,sizeof(ErrorBuffer),fmt,args); - va_end(args); + struct va_format vaf; + va_list args; - pr_crit("error (device %s): %s(): %s\n", sb->s_id, - function,ErrorBuffer); + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + pr_crit("error (device %s): %s(): %pV\n", sb->s_id, function, &vaf); if (!(sb->s_flags & MS_RDONLY)) pr_warn("Remounting filesystem read-only\n"); sb->s_flags |= MS_RDONLY; + va_end(args); } void affs_warning(struct super_block *sb, const char *function, const char *fmt, ...) { - va_list args; + struct va_format vaf; + va_list args; - va_start(args,fmt); - vsnprintf(ErrorBuffer,sizeof(ErrorBuffer),fmt,args); + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + pr_warn("(device %s): %s(): %pV\n", sb->s_id, function, &vaf); va_end(args); - - pr_warn("(device %s): %s(): %s\n", sb->s_id, - function,ErrorBuffer); } bool diff --git a/fs/affs/file.c b/fs/affs/file.c index 1ed590aafecf744884c3e55af7cd9627d1c729f8..8faa6593ca6defe4ab77c169f3fe9673b84a9ec3 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -12,35 +12,10 @@ * affs regular file handling primitives */ +#include <linux/aio.h> #include "affs.h" -#if PAGE_SIZE < 4096 -#error PAGE_SIZE must be at least 4096 -#endif - -static int affs_grow_extcache(struct inode *inode, u32 lc_idx); -static struct buffer_head *affs_alloc_extblock(struct inode *inode, struct buffer_head *bh, u32 ext); -static inline struct buffer_head *affs_get_extblock(struct inode *inode, u32 ext); static struct buffer_head *affs_get_extblock_slow(struct inode *inode, u32 ext); -static int affs_file_open(struct inode *inode, struct file *filp); -static int affs_file_release(struct inode *inode, struct file *filp); - -const struct file_operations affs_file_operations = { - .llseek = generic_file_llseek, - .read = new_sync_read, - .read_iter = generic_file_read_iter, - .write = new_sync_write, - .write_iter = generic_file_write_iter, - .mmap = generic_file_mmap, - .open = affs_file_open, - .release = affs_file_release, - .fsync = affs_file_fsync, - .splice_read = generic_file_splice_read, -}; - -const struct inode_operations affs_file_inode_operations = { - .setattr = affs_notify_change, -}; static int affs_file_open(struct inode *inode, struct file *filp) @@ -355,7 +330,8 @@ affs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh_resul /* store new block */ if (bh_result->b_blocknr) - affs_warning(sb, "get_block", "block already set (%x)", bh_result->b_blocknr); + affs_warning(sb, "get_block", "block already set (%lx)", + (unsigned long)bh_result->b_blocknr); AFFS_BLOCK(sb, ext_bh, block) = cpu_to_be32(blocknr); AFFS_HEAD(ext_bh)->block_count = cpu_to_be32(block + 1); affs_adjust_checksum(ext_bh, blocknr - bh_result->b_blocknr + 1); @@ -377,7 +353,8 @@ affs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh_resul return 0; err_big: - affs_error(inode->i_sb,"get_block","strange block request %d", block); + affs_error(inode->i_sb, "get_block", "strange block request %d", + (int)block); return -EIO; err_ext: // unlock cache @@ -412,6 +389,22 @@ static void affs_write_failed(struct address_space *mapping, loff_t to) } } +static ssize_t +affs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, + loff_t offset) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + size_t count = iov_iter_count(iter); + ssize_t ret; + + ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, affs_get_block); + if (ret < 0 && (rw & WRITE)) + affs_write_failed(mapping, offset + count); + return ret; +} + static int affs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) @@ -438,6 +431,7 @@ const struct address_space_operations affs_aops = { .writepage = affs_writepage, .write_begin = affs_write_begin, .write_end = generic_write_end, + .direct_IO = affs_direct_IO, .bmap = _affs_bmap }; @@ -867,8 +861,9 @@ affs_truncate(struct inode *inode) // lock cache ext_bh = affs_get_extblock(inode, ext); if (IS_ERR(ext_bh)) { - affs_warning(sb, "truncate", "unexpected read error for ext block %u (%d)", - ext, PTR_ERR(ext_bh)); + affs_warning(sb, "truncate", + "unexpected read error for ext block %u (%ld)", + (unsigned int)ext, PTR_ERR(ext_bh)); return; } if (AFFS_I(inode)->i_lc) { @@ -914,8 +909,9 @@ affs_truncate(struct inode *inode) struct buffer_head *bh = affs_bread_ino(inode, last_blk, 0); u32 tmp; if (IS_ERR(bh)) { - affs_warning(sb, "truncate", "unexpected read error for last block %u (%d)", - ext, PTR_ERR(bh)); + affs_warning(sb, "truncate", + "unexpected read error for last block %u (%ld)", + (unsigned int)ext, PTR_ERR(bh)); return; } tmp = be32_to_cpu(AFFS_DATA_HEAD(bh)->next); @@ -961,3 +957,19 @@ int affs_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync) mutex_unlock(&inode->i_mutex); return ret; } +const struct file_operations affs_file_operations = { + .llseek = generic_file_llseek, + .read = new_sync_read, + .read_iter = generic_file_read_iter, + .write = new_sync_write, + .write_iter = generic_file_write_iter, + .mmap = generic_file_mmap, + .open = affs_file_open, + .release = affs_file_release, + .fsync = affs_file_fsync, + .splice_read = generic_file_splice_read, +}; + +const struct inode_operations affs_file_inode_operations = { + .setattr = affs_notify_change, +}; diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index b94d1cc9cd3037db7608b5b25023866900e353d5..edf47774b03d56f7ab0671a4b6d57b7f4193a550 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -269,10 +269,6 @@ befs_readdir(struct file *file, struct dir_context *ctx) } ctx->pos++; goto more; - - befs_debug(sb, "<--- %s pos %lld", __func__, ctx->pos); - - return 0; } static struct inode * diff --git a/fs/binfmt_em86.c b/fs/binfmt_em86.c index f37b08cea1f760a061d7a5d49bf8dcaeda385ab8..490538536cb44b20dde69074da93d2866e9ccd3e 100644 --- a/fs/binfmt_em86.c +++ b/fs/binfmt_em86.c @@ -42,6 +42,10 @@ static int load_em86(struct linux_binprm *bprm) return -ENOEXEC; } + /* Need to be able to load the file after exec */ + if (bprm->interp_flags & BINPRM_FLAGS_PATH_INACCESSIBLE) + return -ENOENT; + allow_write_access(bprm->file); fput(bprm->file); bprm->file = NULL; diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 70789e198dea9249e1ad3388a25ba209769acb12..c04ef1d4f18a573726f83d7f5a1401f06ec9652b 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -144,6 +144,10 @@ static int load_misc_binary(struct linux_binprm *bprm) if (!fmt) goto ret; + /* Need to be able to load the file after exec */ + if (bprm->interp_flags & BINPRM_FLAGS_PATH_INACCESSIBLE) + return -ENOENT; + if (!(fmt->flags & MISC_FMT_PRESERVE_ARGV0)) { retval = remove_arg_zero(bprm); if (retval) diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c index 5027a3e149222bd5945c87d9521e6e7a6108dc77..afdf4e3cafc2aa5f1c1ff1dc0e8cfb256c6a3b89 100644 --- a/fs/binfmt_script.c +++ b/fs/binfmt_script.c @@ -24,6 +24,16 @@ static int load_script(struct linux_binprm *bprm) if ((bprm->buf[0] != '#') || (bprm->buf[1] != '!')) return -ENOEXEC; + + /* + * If the script filename will be inaccessible after exec, typically + * because it is a "/dev/fd/<fd>/.." path against an O_CLOEXEC fd, give + * up now (on the assumption that the interpreter will want to load + * this file). + */ + if (bprm->interp_flags & BINPRM_FLAGS_PATH_INACCESSIBLE) + return -ENOENT; + /* * This section does the #! interpretation. * Sorta complicated, but hopefully it will work. -TYT diff --git a/fs/drop_caches.c b/fs/drop_caches.c index 1de7294aad208db5fb5b749ee7fc4c5a7d048f4e..2bc2c87f35e7a251628da1390f288b0cf8f2e721 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -40,13 +40,14 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused) static void drop_slab(void) { int nr_objects; - struct shrink_control shrink = { - .gfp_mask = GFP_KERNEL, - }; - nodes_setall(shrink.nodes_to_scan); do { - nr_objects = shrink_slab(&shrink, 1000, 1000); + int nid; + + nr_objects = 0; + for_each_online_node(nid) + nr_objects += shrink_node_slabs(GFP_KERNEL, nid, + 1000, 1000); } while (nr_objects > 10); } diff --git a/fs/exec.c b/fs/exec.c index 01aebe300200e105faf1a8dfd3db9c30d69baf77..ad8798e26be9f6e40d15607d0971bc2143611fcc 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -748,18 +748,25 @@ EXPORT_SYMBOL(setup_arg_pages); #endif /* CONFIG_MMU */ -static struct file *do_open_exec(struct filename *name) +static struct file *do_open_execat(int fd, struct filename *name, int flags) { struct file *file; int err; - static const struct open_flags open_exec_flags = { + struct open_flags open_exec_flags = { .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC, .acc_mode = MAY_EXEC | MAY_OPEN, .intent = LOOKUP_OPEN, .lookup_flags = LOOKUP_FOLLOW, }; - file = do_filp_open(AT_FDCWD, name, &open_exec_flags); + if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0) + return ERR_PTR(-EINVAL); + if (flags & AT_SYMLINK_NOFOLLOW) + open_exec_flags.lookup_flags &= ~LOOKUP_FOLLOW; + if (flags & AT_EMPTY_PATH) + open_exec_flags.lookup_flags |= LOOKUP_EMPTY; + + file = do_filp_open(fd, name, &open_exec_flags); if (IS_ERR(file)) goto out; @@ -770,12 +777,13 @@ static struct file *do_open_exec(struct filename *name) if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) goto exit; - fsnotify_open(file); - err = deny_write_access(file); if (err) goto exit; + if (name->name[0] != '\0') + fsnotify_open(file); + out: return file; @@ -787,7 +795,7 @@ static struct file *do_open_exec(struct filename *name) struct file *open_exec(const char *name) { struct filename tmp = { .name = name }; - return do_open_exec(&tmp); + return do_open_execat(AT_FDCWD, &tmp, 0); } EXPORT_SYMBOL(open_exec); @@ -1428,10 +1436,12 @@ static int exec_binprm(struct linux_binprm *bprm) /* * sys_execve() executes a new program. */ -static int do_execve_common(struct filename *filename, - struct user_arg_ptr argv, - struct user_arg_ptr envp) +static int do_execveat_common(int fd, struct filename *filename, + struct user_arg_ptr argv, + struct user_arg_ptr envp, + int flags) { + char *pathbuf = NULL; struct linux_binprm *bprm; struct file *file; struct files_struct *displaced; @@ -1472,7 +1482,7 @@ static int do_execve_common(struct filename *filename, check_unsafe_exec(bprm); current->in_execve = 1; - file = do_open_exec(filename); + file = do_open_execat(fd, filename, flags); retval = PTR_ERR(file); if (IS_ERR(file)) goto out_unmark; @@ -1480,7 +1490,28 @@ static int do_execve_common(struct filename *filename, sched_exec(); bprm->file = file; - bprm->filename = bprm->interp = filename->name; + if (fd == AT_FDCWD || filename->name[0] == '/') { + bprm->filename = filename->name; + } else { + if (filename->name[0] == '\0') + pathbuf = kasprintf(GFP_TEMPORARY, "/dev/fd/%d", fd); + else + pathbuf = kasprintf(GFP_TEMPORARY, "/dev/fd/%d/%s", + fd, filename->name); + if (!pathbuf) { + retval = -ENOMEM; + goto out_unmark; + } + /* + * Record that a name derived from an O_CLOEXEC fd will be + * inaccessible after exec. Relies on having exclusive access to + * current->files (due to unshare_files above). + */ + if (close_on_exec(fd, rcu_dereference_raw(current->files->fdt))) + bprm->interp_flags |= BINPRM_FLAGS_PATH_INACCESSIBLE; + bprm->filename = pathbuf; + } + bprm->interp = bprm->filename; retval = bprm_mm_init(bprm); if (retval) @@ -1521,6 +1552,7 @@ static int do_execve_common(struct filename *filename, acct_update_integrals(current); task_numa_free(current); free_bprm(bprm); + kfree(pathbuf); putname(filename); if (displaced) put_files_struct(displaced); @@ -1538,6 +1570,7 @@ static int do_execve_common(struct filename *filename, out_free: free_bprm(bprm); + kfree(pathbuf); out_files: if (displaced) @@ -1553,7 +1586,18 @@ int do_execve(struct filename *filename, { struct user_arg_ptr argv = { .ptr.native = __argv }; struct user_arg_ptr envp = { .ptr.native = __envp }; - return do_execve_common(filename, argv, envp); + return do_execveat_common(AT_FDCWD, filename, argv, envp, 0); +} + +int do_execveat(int fd, struct filename *filename, + const char __user *const __user *__argv, + const char __user *const __user *__envp, + int flags) +{ + struct user_arg_ptr argv = { .ptr.native = __argv }; + struct user_arg_ptr envp = { .ptr.native = __envp }; + + return do_execveat_common(fd, filename, argv, envp, flags); } #ifdef CONFIG_COMPAT @@ -1569,7 +1613,23 @@ static int compat_do_execve(struct filename *filename, .is_compat = true, .ptr.compat = __envp, }; - return do_execve_common(filename, argv, envp); + return do_execveat_common(AT_FDCWD, filename, argv, envp, 0); +} + +static int compat_do_execveat(int fd, struct filename *filename, + const compat_uptr_t __user *__argv, + const compat_uptr_t __user *__envp, + int flags) +{ + struct user_arg_ptr argv = { + .is_compat = true, + .ptr.compat = __argv, + }; + struct user_arg_ptr envp = { + .is_compat = true, + .ptr.compat = __envp, + }; + return do_execveat_common(fd, filename, argv, envp, flags); } #endif @@ -1609,6 +1669,20 @@ SYSCALL_DEFINE3(execve, { return do_execve(getname(filename), argv, envp); } + +SYSCALL_DEFINE5(execveat, + int, fd, const char __user *, filename, + const char __user *const __user *, argv, + const char __user *const __user *, envp, + int, flags) +{ + int lookup_flags = (flags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0; + + return do_execveat(fd, + getname_flags(filename, lookup_flags, NULL), + argv, envp, flags); +} + #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE3(execve, const char __user *, filename, const compat_uptr_t __user *, argv, @@ -1616,4 +1690,17 @@ COMPAT_SYSCALL_DEFINE3(execve, const char __user *, filename, { return compat_do_execve(getname(filename), argv, envp); } + +COMPAT_SYSCALL_DEFINE5(execveat, int, fd, + const char __user *, filename, + const compat_uptr_t __user *, argv, + const compat_uptr_t __user *, envp, + int, flags) +{ + int lookup_flags = (flags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0; + + return compat_do_execveat(fd, + getname_flags(filename, lookup_flags, NULL), + argv, envp, flags); +} #endif diff --git a/fs/fat/fat.h b/fs/fat/fat.h index e0c4ba39a377b407bb19c50f830443ab4f7f3188..64e295e8ff38cb5a212bcc69880dfb71b3bd6602 100644 --- a/fs/fat/fat.h +++ b/fs/fat/fat.h @@ -370,6 +370,7 @@ extern int fat_file_fsync(struct file *file, loff_t start, loff_t end, int datasync); /* fat/inode.c */ +extern int fat_block_truncate_page(struct inode *inode, loff_t from); extern void fat_attach(struct inode *inode, loff_t i_pos); extern void fat_detach(struct inode *inode); extern struct inode *fat_iget(struct super_block *sb, loff_t i_pos); diff --git a/fs/fat/file.c b/fs/fat/file.c index 85f79a89e7474658c8c552cfe027e72ff048d542..8429c68e30578150b7dd85667a4b04f8e0d2ed4b 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -443,6 +443,9 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr) } if (attr->ia_valid & ATTR_SIZE) { + error = fat_block_truncate_page(inode, attr->ia_size); + if (error) + goto out; down_write(&MSDOS_I(inode)->truncate_lock); truncate_setsize(inode, attr->ia_size); fat_truncate_blocks(inode, attr->ia_size); diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 756aead10d9618593e3267e697df4915d528fbc9..7b41a2dcdd7662d8d7be045d94523b97299dd879 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -294,6 +294,18 @@ static sector_t _fat_bmap(struct address_space *mapping, sector_t block) return blocknr; } +/* + * fat_block_truncate_page() zeroes out a mapping from file offset `from' + * up to the end of the block which corresponds to `from'. + * This is required during truncate to physically zeroout the tail end + * of that block so it doesn't yield old data if the file is later grown. + * Also, avoid causing failure from fsx for cases of "data past EOF" + */ +int fat_block_truncate_page(struct inode *inode, loff_t from) +{ + return block_truncate_page(inode->i_mapping, from, fat_get_block); +} + static const struct address_space_operations fat_aops = { .readpage = fat_readpage, .readpages = fat_readpages, diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 1e2872b25343f106df7a96d0b667e194dd070c54..5eba47f593f8f888855c6b83428455f8b46af670 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -412,10 +412,10 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset) pgoff = offset >> PAGE_SHIFT; i_size_write(inode, offset); - mutex_lock(&mapping->i_mmap_mutex); + i_mmap_lock_write(mapping); if (!RB_EMPTY_ROOT(&mapping->i_mmap)) hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff); - mutex_unlock(&mapping->i_mmap_mutex); + i_mmap_unlock_write(mapping); truncate_hugepages(inode, offset); return 0; } @@ -472,12 +472,12 @@ static struct inode *hugetlbfs_get_root(struct super_block *sb, } /* - * Hugetlbfs is not reclaimable; therefore its i_mmap_mutex will never + * Hugetlbfs is not reclaimable; therefore its i_mmap_rwsem will never * be taken from reclaim -- unlike regular filesystems. This needs an * annotation because huge_pmd_share() does an allocation under - * i_mmap_mutex. + * i_mmap_rwsem. */ -static struct lock_class_key hugetlbfs_i_mmap_mutex_key; +static struct lock_class_key hugetlbfs_i_mmap_rwsem_key; static struct inode *hugetlbfs_get_inode(struct super_block *sb, struct inode *dir, @@ -495,8 +495,8 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, struct hugetlbfs_inode_info *info; inode->i_ino = get_next_ino(); inode_init_owner(inode, dir, mode); - lockdep_set_class(&inode->i_mapping->i_mmap_mutex, - &hugetlbfs_i_mmap_mutex_key); + lockdep_set_class(&inode->i_mapping->i_mmap_rwsem, + &hugetlbfs_i_mmap_rwsem_key); inode->i_mapping->a_ops = &hugetlbfs_aops; inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; diff --git a/fs/inode.c b/fs/inode.c index 2ed95f7caa4f874ea97dbc7ebc698a4d0fc04674..ad60555b4768fa012dfeeba0661de6f7fc9addec 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -346,7 +346,7 @@ void address_space_init_once(struct address_space *mapping) memset(mapping, 0, sizeof(*mapping)); INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC); spin_lock_init(&mapping->tree_lock); - mutex_init(&mapping->i_mmap_mutex); + init_rwsem(&mapping->i_mmap_rwsem); INIT_LIST_HEAD(&mapping->private_list); spin_lock_init(&mapping->private_lock); mapping->i_mmap = RB_ROOT; diff --git a/fs/namei.c b/fs/namei.c index db5fe86319e69d51e59ff6dd0053d199b406bdc9..ca814165d84cc25214740eb72b81c4d3d6c07b46 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -130,7 +130,7 @@ void final_putname(struct filename *name) #define EMBEDDED_NAME_MAX (PATH_MAX - sizeof(struct filename)) -static struct filename * +struct filename * getname_flags(const char __user *filename, int flags, int *empty) { struct filename *result, *err; diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index caaaf9dfe3534be9e29826e46058151b5536c48e..44523f4a608414187d24aa86204bf6d4b92c77f9 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -69,8 +69,8 @@ static void dnotify_recalc_inode_mask(struct fsnotify_mark *fsn_mark) if (old_mask == new_mask) return; - if (fsn_mark->i.inode) - fsnotify_recalc_inode_mask(fsn_mark->i.inode); + if (fsn_mark->inode) + fsnotify_recalc_inode_mask(fsn_mark->inode); } /* diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c index 6ffd220eb14d4f453400e85a98aa567ba2d11204..58b7cdb63da9f2e6e9fc4fa095ceb9e762b4ffb9 100644 --- a/fs/notify/fdinfo.c +++ b/fs/notify/fdinfo.c @@ -80,7 +80,7 @@ static void inotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark) return; inode_mark = container_of(mark, struct inotify_inode_mark, fsn_mark); - inode = igrab(mark->i.inode); + inode = igrab(mark->inode); if (inode) { seq_printf(m, "inotify wd:%x ino:%lx sdev:%x mask:%x ignored_mask:%x ", inode_mark->wd, inode->i_ino, inode->i_sb->s_dev, @@ -112,7 +112,7 @@ static void fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark) mflags |= FAN_MARK_IGNORED_SURV_MODIFY; if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) { - inode = igrab(mark->i.inode); + inode = igrab(mark->inode); if (!inode) return; seq_printf(m, "fanotify ino:%lx sdev:%x mflags:%x mask:%x ignored_mask:%x ", @@ -122,7 +122,7 @@ static void fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark) seq_putc(m, '\n'); iput(inode); } else if (mark->flags & FSNOTIFY_MARK_FLAG_VFSMOUNT) { - struct mount *mnt = real_mount(mark->m.mnt); + struct mount *mnt = real_mount(mark->mnt); seq_printf(m, "fanotify mnt_id:%x mflags:%x mask:%x ignored_mask:%x\n", mnt->mnt_id, mflags, mark->mask, mark->ignored_mask); diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 41e39102743a55aa71f41cff96405218c5a4c802..dd3fb0b17be7cc5d914275c6e83da49e7b8e3f48 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -242,13 +242,13 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, if (inode_node) { inode_mark = hlist_entry(srcu_dereference(inode_node, &fsnotify_mark_srcu), - struct fsnotify_mark, i.i_list); + struct fsnotify_mark, obj_list); inode_group = inode_mark->group; } if (vfsmount_node) { vfsmount_mark = hlist_entry(srcu_dereference(vfsmount_node, &fsnotify_mark_srcu), - struct fsnotify_mark, m.m_list); + struct fsnotify_mark, obj_list); vfsmount_group = vfsmount_mark->group; } diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h index 3b68b0ae0a97cb6beb6b642f3098de5c8aaec4d0..13a00be516d250bc1328697060c8f73ddb688827 100644 --- a/fs/notify/fsnotify.h +++ b/fs/notify/fsnotify.h @@ -12,12 +12,19 @@ extern void fsnotify_flush_notify(struct fsnotify_group *group); /* protects reads of inode and vfsmount marks list */ extern struct srcu_struct fsnotify_mark_srcu; +/* Calculate mask of events for a list of marks */ +extern u32 fsnotify_recalc_mask(struct hlist_head *head); + /* compare two groups for sorting of marks lists */ extern int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b); extern void fsnotify_set_inode_mark_mask_locked(struct fsnotify_mark *fsn_mark, __u32 mask); +/* Add mark to a proper place in mark list */ +extern int fsnotify_add_mark_list(struct hlist_head *head, + struct fsnotify_mark *mark, + int allow_dups); /* add a mark to an inode */ extern int fsnotify_add_inode_mark(struct fsnotify_mark *mark, struct fsnotify_group *group, struct inode *inode, @@ -31,6 +38,11 @@ extern int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark, extern void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark); /* inode specific destruction of a mark */ extern void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark); +/* Destroy all marks in the given list */ +extern void fsnotify_destroy_marks(struct list_head *to_free); +/* Find mark belonging to given group in the list of marks */ +extern struct fsnotify_mark *fsnotify_find_mark(struct hlist_head *head, + struct fsnotify_group *group); /* run the list of all marks associated with inode and flag them to be freed */ extern void fsnotify_clear_marks_by_inode(struct inode *inode); /* run the list of all marks associated with vfsmount and flag them to be freed */ diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c index dfbf5447eea4cea8fdf664ff5b0232f8a61e68b0..3daf513ee99e6ccf21ce01bfed71f85a21b5717e 100644 --- a/fs/notify/inode_mark.c +++ b/fs/notify/inode_mark.c @@ -30,21 +30,6 @@ #include "../internal.h" -/* - * Recalculate the mask of events relevant to a given inode locked. - */ -static void fsnotify_recalc_inode_mask_locked(struct inode *inode) -{ - struct fsnotify_mark *mark; - __u32 new_mask = 0; - - assert_spin_locked(&inode->i_lock); - - hlist_for_each_entry(mark, &inode->i_fsnotify_marks, i.i_list) - new_mask |= mark->mask; - inode->i_fsnotify_mask = new_mask; -} - /* * Recalculate the inode->i_fsnotify_mask, or the mask of all FS_* event types * any notifier is interested in hearing for this inode. @@ -52,7 +37,7 @@ static void fsnotify_recalc_inode_mask_locked(struct inode *inode) void fsnotify_recalc_inode_mask(struct inode *inode) { spin_lock(&inode->i_lock); - fsnotify_recalc_inode_mask_locked(inode); + inode->i_fsnotify_mask = fsnotify_recalc_mask(&inode->i_fsnotify_marks); spin_unlock(&inode->i_lock); __fsnotify_update_child_dentry_flags(inode); @@ -60,23 +45,22 @@ void fsnotify_recalc_inode_mask(struct inode *inode) void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark) { - struct inode *inode = mark->i.inode; + struct inode *inode = mark->inode; BUG_ON(!mutex_is_locked(&mark->group->mark_mutex)); assert_spin_locked(&mark->lock); spin_lock(&inode->i_lock); - hlist_del_init_rcu(&mark->i.i_list); - mark->i.inode = NULL; + hlist_del_init_rcu(&mark->obj_list); + mark->inode = NULL; /* * this mark is now off the inode->i_fsnotify_marks list and we * hold the inode->i_lock, so this is the perfect time to update the * inode->i_fsnotify_mask */ - fsnotify_recalc_inode_mask_locked(inode); - + inode->i_fsnotify_mask = fsnotify_recalc_mask(&inode->i_fsnotify_marks); spin_unlock(&inode->i_lock); } @@ -85,30 +69,19 @@ void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark) */ void fsnotify_clear_marks_by_inode(struct inode *inode) { - struct fsnotify_mark *mark, *lmark; + struct fsnotify_mark *mark; struct hlist_node *n; LIST_HEAD(free_list); spin_lock(&inode->i_lock); - hlist_for_each_entry_safe(mark, n, &inode->i_fsnotify_marks, i.i_list) { - list_add(&mark->i.free_i_list, &free_list); - hlist_del_init_rcu(&mark->i.i_list); + hlist_for_each_entry_safe(mark, n, &inode->i_fsnotify_marks, obj_list) { + list_add(&mark->free_list, &free_list); + hlist_del_init_rcu(&mark->obj_list); fsnotify_get_mark(mark); } spin_unlock(&inode->i_lock); - list_for_each_entry_safe(mark, lmark, &free_list, i.free_i_list) { - struct fsnotify_group *group; - - spin_lock(&mark->lock); - fsnotify_get_group(mark->group); - group = mark->group; - spin_unlock(&mark->lock); - - fsnotify_destroy_mark(mark, group); - fsnotify_put_mark(mark); - fsnotify_put_group(group); - } + fsnotify_destroy_marks(&free_list); } /* @@ -119,27 +92,6 @@ void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group) fsnotify_clear_marks_by_group_flags(group, FSNOTIFY_MARK_FLAG_INODE); } -/* - * given a group and inode, find the mark associated with that combination. - * if found take a reference to that mark and return it, else return NULL - */ -static struct fsnotify_mark *fsnotify_find_inode_mark_locked( - struct fsnotify_group *group, - struct inode *inode) -{ - struct fsnotify_mark *mark; - - assert_spin_locked(&inode->i_lock); - - hlist_for_each_entry(mark, &inode->i_fsnotify_marks, i.i_list) { - if (mark->group == group) { - fsnotify_get_mark(mark); - return mark; - } - } - return NULL; -} - /* * given a group and inode, find the mark associated with that combination. * if found take a reference to that mark and return it, else return NULL @@ -150,7 +102,7 @@ struct fsnotify_mark *fsnotify_find_inode_mark(struct fsnotify_group *group, struct fsnotify_mark *mark; spin_lock(&inode->i_lock); - mark = fsnotify_find_inode_mark_locked(group, inode); + mark = fsnotify_find_mark(&inode->i_fsnotify_marks, group); spin_unlock(&inode->i_lock); return mark; @@ -168,10 +120,10 @@ void fsnotify_set_inode_mark_mask_locked(struct fsnotify_mark *mark, assert_spin_locked(&mark->lock); if (mask && - mark->i.inode && + mark->inode && !(mark->flags & FSNOTIFY_MARK_FLAG_OBJECT_PINNED)) { mark->flags |= FSNOTIFY_MARK_FLAG_OBJECT_PINNED; - inode = igrab(mark->i.inode); + inode = igrab(mark->inode); /* * we shouldn't be able to get here if the inode wasn't * already safely held in memory. But bug in case it @@ -192,9 +144,7 @@ int fsnotify_add_inode_mark(struct fsnotify_mark *mark, struct fsnotify_group *group, struct inode *inode, int allow_dups) { - struct fsnotify_mark *lmark, *last = NULL; - int ret = 0; - int cmp; + int ret; mark->flags |= FSNOTIFY_MARK_FLAG_INODE; @@ -202,37 +152,10 @@ int fsnotify_add_inode_mark(struct fsnotify_mark *mark, assert_spin_locked(&mark->lock); spin_lock(&inode->i_lock); - - mark->i.inode = inode; - - /* is mark the first mark? */ - if (hlist_empty(&inode->i_fsnotify_marks)) { - hlist_add_head_rcu(&mark->i.i_list, &inode->i_fsnotify_marks); - goto out; - } - - /* should mark be in the middle of the current list? */ - hlist_for_each_entry(lmark, &inode->i_fsnotify_marks, i.i_list) { - last = lmark; - - if ((lmark->group == group) && !allow_dups) { - ret = -EEXIST; - goto out; - } - - cmp = fsnotify_compare_groups(lmark->group, mark->group); - if (cmp < 0) - continue; - - hlist_add_before_rcu(&mark->i.i_list, &lmark->i.i_list); - goto out; - } - - BUG_ON(last == NULL); - /* mark should be the last entry. last is the current last entry */ - hlist_add_behind_rcu(&mark->i.i_list, &last->i.i_list); -out: - fsnotify_recalc_inode_mask_locked(inode); + mark->inode = inode; + ret = fsnotify_add_mark_list(&inode->i_fsnotify_marks, mark, + allow_dups); + inode->i_fsnotify_mask = fsnotify_recalc_mask(&inode->i_fsnotify_marks); spin_unlock(&inode->i_lock); return ret; diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index 7d888d77d59afc834d74dd8db55dfe20962502e0..2cd900c2c73758c31ae941e10fee8e44494ca370 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -156,7 +156,7 @@ static int idr_callback(int id, void *p, void *data) */ if (fsn_mark) printk(KERN_WARNING "fsn_mark->group=%p inode=%p wd=%d\n", - fsn_mark->group, fsn_mark->i.inode, i_mark->wd); + fsn_mark->group, fsn_mark->inode, i_mark->wd); return 0; } diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 283aa312d7458a36bce37090fa10fdaf3dae48f1..4506486974336d6451abc0658de78c0c06bdec9a 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -433,7 +433,7 @@ static void inotify_remove_from_idr(struct fsnotify_group *group, if (wd == -1) { WARN_ONCE(1, "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p" " i_mark->inode=%p\n", __func__, i_mark, i_mark->wd, - i_mark->fsn_mark.group, i_mark->fsn_mark.i.inode); + i_mark->fsn_mark.group, i_mark->fsn_mark.inode); goto out; } @@ -442,7 +442,7 @@ static void inotify_remove_from_idr(struct fsnotify_group *group, if (unlikely(!found_i_mark)) { WARN_ONCE(1, "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p" " i_mark->inode=%p\n", __func__, i_mark, i_mark->wd, - i_mark->fsn_mark.group, i_mark->fsn_mark.i.inode); + i_mark->fsn_mark.group, i_mark->fsn_mark.inode); goto out; } @@ -456,9 +456,9 @@ static void inotify_remove_from_idr(struct fsnotify_group *group, "mark->inode=%p found_i_mark=%p found_i_mark->wd=%d " "found_i_mark->group=%p found_i_mark->inode=%p\n", __func__, i_mark, i_mark->wd, i_mark->fsn_mark.group, - i_mark->fsn_mark.i.inode, found_i_mark, found_i_mark->wd, + i_mark->fsn_mark.inode, found_i_mark, found_i_mark->wd, found_i_mark->fsn_mark.group, - found_i_mark->fsn_mark.i.inode); + found_i_mark->fsn_mark.inode); goto out; } @@ -470,7 +470,7 @@ static void inotify_remove_from_idr(struct fsnotify_group *group, if (unlikely(atomic_read(&i_mark->fsn_mark.refcnt) < 3)) { printk(KERN_ERR "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p" " i_mark->inode=%p\n", __func__, i_mark, i_mark->wd, - i_mark->fsn_mark.group, i_mark->fsn_mark.i.inode); + i_mark->fsn_mark.group, i_mark->fsn_mark.inode); /* we can't really recover with bad ref cnting.. */ BUG(); } diff --git a/fs/notify/mark.c b/fs/notify/mark.c index 34c38fabf514f1a892e10e2e24ff4a3d252ec000..92e48c70f0f05542a75804fe7aac752672abd214 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -110,6 +110,17 @@ void fsnotify_put_mark(struct fsnotify_mark *mark) } } +/* Calculate mask of events for a list of marks */ +u32 fsnotify_recalc_mask(struct hlist_head *head) +{ + u32 new_mask = 0; + struct fsnotify_mark *mark; + + hlist_for_each_entry(mark, head, obj_list) + new_mask |= mark->mask; + return new_mask; +} + /* * Any time a mark is getting freed we end up here. * The caller had better be holding a reference to this mark so we don't actually @@ -133,7 +144,7 @@ void fsnotify_destroy_mark_locked(struct fsnotify_mark *mark, mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE; if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) { - inode = mark->i.inode; + inode = mark->inode; fsnotify_destroy_inode_mark(mark); } else if (mark->flags & FSNOTIFY_MARK_FLAG_VFSMOUNT) fsnotify_destroy_vfsmount_mark(mark); @@ -150,7 +161,7 @@ void fsnotify_destroy_mark_locked(struct fsnotify_mark *mark, mutex_unlock(&group->mark_mutex); spin_lock(&destroy_lock); - list_add(&mark->destroy_list, &destroy_list); + list_add(&mark->g_list, &destroy_list); spin_unlock(&destroy_lock); wake_up(&destroy_waitq); /* @@ -192,6 +203,27 @@ void fsnotify_destroy_mark(struct fsnotify_mark *mark, mutex_unlock(&group->mark_mutex); } +/* + * Destroy all marks in the given list. The marks must be already detached from + * the original inode / vfsmount. + */ +void fsnotify_destroy_marks(struct list_head *to_free) +{ + struct fsnotify_mark *mark, *lmark; + struct fsnotify_group *group; + + list_for_each_entry_safe(mark, lmark, to_free, free_list) { + spin_lock(&mark->lock); + fsnotify_get_group(mark->group); + group = mark->group; + spin_unlock(&mark->lock); + + fsnotify_destroy_mark(mark, group); + fsnotify_put_mark(mark); + fsnotify_put_group(group); + } +} + void fsnotify_set_mark_mask_locked(struct fsnotify_mark *mark, __u32 mask) { assert_spin_locked(&mark->lock); @@ -245,6 +277,39 @@ int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b) return -1; } +/* Add mark into proper place in given list of marks */ +int fsnotify_add_mark_list(struct hlist_head *head, struct fsnotify_mark *mark, + int allow_dups) +{ + struct fsnotify_mark *lmark, *last = NULL; + int cmp; + + /* is mark the first mark? */ + if (hlist_empty(head)) { + hlist_add_head_rcu(&mark->obj_list, head); + return 0; + } + + /* should mark be in the middle of the current list? */ + hlist_for_each_entry(lmark, head, obj_list) { + last = lmark; + + if ((lmark->group == mark->group) && !allow_dups) + return -EEXIST; + + cmp = fsnotify_compare_groups(lmark->group, mark->group); + if (cmp >= 0) { + hlist_add_before_rcu(&mark->obj_list, &lmark->obj_list); + return 0; + } + } + + BUG_ON(last == NULL); + /* mark should be the last entry. last is the current last entry */ + hlist_add_behind_rcu(&mark->obj_list, &last->obj_list); + return 0; +} + /* * Attach an initialized mark to a given group and fs object. * These marks may be used for the fsnotify backend to determine which @@ -305,7 +370,7 @@ int fsnotify_add_mark_locked(struct fsnotify_mark *mark, spin_unlock(&mark->lock); spin_lock(&destroy_lock); - list_add(&mark->destroy_list, &destroy_list); + list_add(&mark->g_list, &destroy_list); spin_unlock(&destroy_lock); wake_up(&destroy_waitq); @@ -322,6 +387,24 @@ int fsnotify_add_mark(struct fsnotify_mark *mark, struct fsnotify_group *group, return ret; } +/* + * Given a list of marks, find the mark associated with given group. If found + * take a reference to that mark and return it, else return NULL. + */ +struct fsnotify_mark *fsnotify_find_mark(struct hlist_head *head, + struct fsnotify_group *group) +{ + struct fsnotify_mark *mark; + + hlist_for_each_entry(mark, head, obj_list) { + if (mark->group == group) { + fsnotify_get_mark(mark); + return mark; + } + } + return NULL; +} + /* * clear any marks in a group in which mark->flags & flags is true */ @@ -352,8 +435,8 @@ void fsnotify_clear_marks_by_group(struct fsnotify_group *group) void fsnotify_duplicate_mark(struct fsnotify_mark *new, struct fsnotify_mark *old) { assert_spin_locked(&old->lock); - new->i.inode = old->i.inode; - new->m.mnt = old->m.mnt; + new->inode = old->inode; + new->mnt = old->mnt; if (old->group) fsnotify_get_group(old->group); new->group = old->group; @@ -386,8 +469,8 @@ static int fsnotify_mark_destroy(void *ignored) synchronize_srcu(&fsnotify_mark_srcu); - list_for_each_entry_safe(mark, next, &private_destroy_list, destroy_list) { - list_del_init(&mark->destroy_list); + list_for_each_entry_safe(mark, next, &private_destroy_list, g_list) { + list_del_init(&mark->g_list); fsnotify_put_mark(mark); } diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c index faefa72a11ebaacff32569535e69a9005a0d4f0f..326b148e623cdf26d30935a985ab398033cb0950 100644 --- a/fs/notify/vfsmount_mark.c +++ b/fs/notify/vfsmount_mark.c @@ -32,31 +32,20 @@ void fsnotify_clear_marks_by_mount(struct vfsmount *mnt) { - struct fsnotify_mark *mark, *lmark; + struct fsnotify_mark *mark; struct hlist_node *n; struct mount *m = real_mount(mnt); LIST_HEAD(free_list); spin_lock(&mnt->mnt_root->d_lock); - hlist_for_each_entry_safe(mark, n, &m->mnt_fsnotify_marks, m.m_list) { - list_add(&mark->m.free_m_list, &free_list); - hlist_del_init_rcu(&mark->m.m_list); + hlist_for_each_entry_safe(mark, n, &m->mnt_fsnotify_marks, obj_list) { + list_add(&mark->free_list, &free_list); + hlist_del_init_rcu(&mark->obj_list); fsnotify_get_mark(mark); } spin_unlock(&mnt->mnt_root->d_lock); - list_for_each_entry_safe(mark, lmark, &free_list, m.free_m_list) { - struct fsnotify_group *group; - - spin_lock(&mark->lock); - fsnotify_get_group(mark->group); - group = mark->group; - spin_unlock(&mark->lock); - - fsnotify_destroy_mark(mark, group); - fsnotify_put_mark(mark); - fsnotify_put_group(group); - } + fsnotify_destroy_marks(&free_list); } void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group) @@ -64,67 +53,36 @@ void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group) fsnotify_clear_marks_by_group_flags(group, FSNOTIFY_MARK_FLAG_VFSMOUNT); } -/* - * Recalculate the mask of events relevant to a given vfsmount locked. - */ -static void fsnotify_recalc_vfsmount_mask_locked(struct vfsmount *mnt) -{ - struct mount *m = real_mount(mnt); - struct fsnotify_mark *mark; - __u32 new_mask = 0; - - assert_spin_locked(&mnt->mnt_root->d_lock); - - hlist_for_each_entry(mark, &m->mnt_fsnotify_marks, m.m_list) - new_mask |= mark->mask; - m->mnt_fsnotify_mask = new_mask; -} - /* * Recalculate the mnt->mnt_fsnotify_mask, or the mask of all FS_* event types * any notifier is interested in hearing for this mount point */ void fsnotify_recalc_vfsmount_mask(struct vfsmount *mnt) { + struct mount *m = real_mount(mnt); + spin_lock(&mnt->mnt_root->d_lock); - fsnotify_recalc_vfsmount_mask_locked(mnt); + m->mnt_fsnotify_mask = fsnotify_recalc_mask(&m->mnt_fsnotify_marks); spin_unlock(&mnt->mnt_root->d_lock); } void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark) { - struct vfsmount *mnt = mark->m.mnt; + struct vfsmount *mnt = mark->mnt; + struct mount *m = real_mount(mnt); BUG_ON(!mutex_is_locked(&mark->group->mark_mutex)); assert_spin_locked(&mark->lock); spin_lock(&mnt->mnt_root->d_lock); - hlist_del_init_rcu(&mark->m.m_list); - mark->m.mnt = NULL; - - fsnotify_recalc_vfsmount_mask_locked(mnt); + hlist_del_init_rcu(&mark->obj_list); + mark->mnt = NULL; + m->mnt_fsnotify_mask = fsnotify_recalc_mask(&m->mnt_fsnotify_marks); spin_unlock(&mnt->mnt_root->d_lock); } -static struct fsnotify_mark *fsnotify_find_vfsmount_mark_locked(struct fsnotify_group *group, - struct vfsmount *mnt) -{ - struct mount *m = real_mount(mnt); - struct fsnotify_mark *mark; - - assert_spin_locked(&mnt->mnt_root->d_lock); - - hlist_for_each_entry(mark, &m->mnt_fsnotify_marks, m.m_list) { - if (mark->group == group) { - fsnotify_get_mark(mark); - return mark; - } - } - return NULL; -} - /* * given a group and vfsmount, find the mark associated with that combination. * if found take a reference to that mark and return it, else return NULL @@ -132,10 +90,11 @@ static struct fsnotify_mark *fsnotify_find_vfsmount_mark_locked(struct fsnotify_ struct fsnotify_mark *fsnotify_find_vfsmount_mark(struct fsnotify_group *group, struct vfsmount *mnt) { + struct mount *m = real_mount(mnt); struct fsnotify_mark *mark; spin_lock(&mnt->mnt_root->d_lock); - mark = fsnotify_find_vfsmount_mark_locked(group, mnt); + mark = fsnotify_find_mark(&m->mnt_fsnotify_marks, group); spin_unlock(&mnt->mnt_root->d_lock); return mark; @@ -151,9 +110,7 @@ int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark, int allow_dups) { struct mount *m = real_mount(mnt); - struct fsnotify_mark *lmark, *last = NULL; - int ret = 0; - int cmp; + int ret; mark->flags |= FSNOTIFY_MARK_FLAG_VFSMOUNT; @@ -161,37 +118,9 @@ int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark, assert_spin_locked(&mark->lock); spin_lock(&mnt->mnt_root->d_lock); - - mark->m.mnt = mnt; - - /* is mark the first mark? */ - if (hlist_empty(&m->mnt_fsnotify_marks)) { - hlist_add_head_rcu(&mark->m.m_list, &m->mnt_fsnotify_marks); - goto out; - } - - /* should mark be in the middle of the current list? */ - hlist_for_each_entry(lmark, &m->mnt_fsnotify_marks, m.m_list) { - last = lmark; - - if ((lmark->group == group) && !allow_dups) { - ret = -EEXIST; - goto out; - } - - cmp = fsnotify_compare_groups(lmark->group, mark->group); - if (cmp < 0) - continue; - - hlist_add_before_rcu(&mark->m.m_list, &lmark->m.m_list); - goto out; - } - - BUG_ON(last == NULL); - /* mark should be the last entry. last is the current last entry */ - hlist_add_behind_rcu(&mark->m.m_list, &last->m.m_list); -out: - fsnotify_recalc_vfsmount_mask_locked(mnt); + mark->mnt = mnt; + ret = fsnotify_add_mark_list(&m->mnt_fsnotify_marks, mark, allow_dups); + m->mnt_fsnotify_mask = fsnotify_recalc_mask(&m->mnt_fsnotify_marks); spin_unlock(&mnt->mnt_root->d_lock); return ret; diff --git a/fs/open.c b/fs/open.c index b1bf3d542d5d0a5bc0394ea65d1b504c7d60cdeb..d45bd905d4182d1c8d1ba660e4600affb85af9ce 100644 --- a/fs/open.c +++ b/fs/open.c @@ -295,6 +295,17 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len) sb_start_write(inode->i_sb); ret = file->f_op->fallocate(file, mode, offset, len); + + /* + * Create inotify and fanotify events. + * + * To keep the logic simple always create events if fallocate succeeds. + * This implies that events are even created if the file size remains + * unchanged, e.g. when using flag FALLOC_FL_KEEP_SIZE. + */ + if (ret == 0) + fsnotify_modify(file); + sb_end_write(inode->i_sb); return ret; } diff --git a/fs/seq_file.c b/fs/seq_file.c index 353948ba1c5b2d2ed08ead3448fb9218cbb89f96..dbf3a59c86bbbb081464830585f216623ab9f112 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -25,7 +25,11 @@ static void *seq_buf_alloc(unsigned long size) { void *buf; - buf = kmalloc(size, GFP_KERNEL | __GFP_NOWARN); + /* + * __GFP_NORETRY to avoid oom-killings with high-order allocations - + * it's better to fall back to vmalloc() than to kill things. + */ + buf = kmalloc(size, GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN); if (!buf && size > PAGE_SIZE) buf = vmalloc(size); return buf; diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index 61f29e5ea840bb2ba14420ecc29e63b996684281..576e4639ca609e8bfb8686474406073937e912e9 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -53,6 +53,10 @@ struct linux_binprm { #define BINPRM_FLAGS_EXECFD_BIT 1 #define BINPRM_FLAGS_EXECFD (1 << BINPRM_FLAGS_EXECFD_BIT) +/* filename of the binary will be inaccessible after exec */ +#define BINPRM_FLAGS_PATH_INACCESSIBLE_BIT 2 +#define BINPRM_FLAGS_PATH_INACCESSIBLE (1 << BINPRM_FLAGS_PATH_INACCESSIBLE_BIT) + /* Function parameter for binfmt->coredump */ struct coredump_params { const siginfo_t *siginfo; diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index e1c8d080c4271d655f75a771ec71ab6ee81c66e3..34e020c2364489adaba0d5cef9d146d608950cb0 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -45,6 +45,7 @@ * bitmap_set(dst, pos, nbits) Set specified bit area * bitmap_clear(dst, pos, nbits) Clear specified bit area * bitmap_find_next_zero_area(buf, len, pos, n, mask) Find bit free area + * bitmap_find_next_zero_area_off(buf, len, pos, n, mask) as above * bitmap_shift_right(dst, src, n, nbits) *dst = *src >> n * bitmap_shift_left(dst, src, n, nbits) *dst = *src << n * bitmap_remap(dst, src, old, new, nbits) *dst = map(old, new)(src) @@ -114,11 +115,36 @@ extern int __bitmap_weight(const unsigned long *bitmap, unsigned int nbits); extern void bitmap_set(unsigned long *map, unsigned int start, int len); extern void bitmap_clear(unsigned long *map, unsigned int start, int len); -extern unsigned long bitmap_find_next_zero_area(unsigned long *map, - unsigned long size, - unsigned long start, - unsigned int nr, - unsigned long align_mask); + +extern unsigned long bitmap_find_next_zero_area_off(unsigned long *map, + unsigned long size, + unsigned long start, + unsigned int nr, + unsigned long align_mask, + unsigned long align_offset); + +/** + * bitmap_find_next_zero_area - find a contiguous aligned zero area + * @map: The address to base the search on + * @size: The bitmap size in bits + * @start: The bitnumber to start searching at + * @nr: The number of zeroed bits we're looking for + * @align_mask: Alignment mask for zero area + * + * The @align_mask should be one less than a power of 2; the effect is that + * the bit offset of all zero areas this function finds is multiples of that + * power of 2. A @align_mask of 0 means no alignment is required. + */ +static inline unsigned long +bitmap_find_next_zero_area(unsigned long *map, + unsigned long size, + unsigned long start, + unsigned int nr, + unsigned long align_mask) +{ + return bitmap_find_next_zero_area_off(map, size, start, nr, + align_mask, 0); +} extern int bitmap_scnprintf(char *buf, unsigned int len, const unsigned long *src, int nbits); diff --git a/include/linux/compat.h b/include/linux/compat.h index e6494261eaff306b050c37c9f9be7d6becd01a0d..7450ca2ac1fc6a5d585084dadde7851ea0c35464 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -357,6 +357,9 @@ asmlinkage long compat_sys_lseek(unsigned int, compat_off_t, unsigned int); asmlinkage long compat_sys_execve(const char __user *filename, const compat_uptr_t __user *argv, const compat_uptr_t __user *envp); +asmlinkage long compat_sys_execveat(int dfd, const char __user *filename, + const compat_uptr_t __user *argv, + const compat_uptr_t __user *envp, int flags); asmlinkage long compat_sys_select(int n, compat_ulong_t __user *inp, compat_ulong_t __user *outp, compat_ulong_t __user *exp, diff --git a/include/linux/fault-inject.h b/include/linux/fault-inject.h index c6f996f2abb6c552345884760bfc5d5967d40bdb..798fad9e420d656985d423ec5732605ad835f6d2 100644 --- a/include/linux/fault-inject.h +++ b/include/linux/fault-inject.h @@ -5,6 +5,7 @@ #include <linux/types.h> #include <linux/debugfs.h> +#include <linux/ratelimit.h> #include <linux/atomic.h> /* @@ -25,14 +26,18 @@ struct fault_attr { unsigned long reject_end; unsigned long count; + struct ratelimit_state ratelimit_state; + struct dentry *dname; }; -#define FAULT_ATTR_INITIALIZER { \ - .interval = 1, \ - .times = ATOMIC_INIT(1), \ - .require_end = ULONG_MAX, \ - .stacktrace_depth = 32, \ - .verbose = 2, \ +#define FAULT_ATTR_INITIALIZER { \ + .interval = 1, \ + .times = ATOMIC_INIT(1), \ + .require_end = ULONG_MAX, \ + .stacktrace_depth = 32, \ + .ratelimit_state = RATELIMIT_STATE_INIT_DISABLED, \ + .verbose = 2, \ + .dname = NULL, \ } #define DECLARE_FAULT_ATTR(name) struct fault_attr name = FAULT_ATTR_INITIALIZER diff --git a/include/linux/fs.h b/include/linux/fs.h index bb29b02d9bb6646c5d96c8d4d5c084a83f7efefe..4193a0bd99b09bba2846f71bdbfa34f10c82b092 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -18,6 +18,7 @@ #include <linux/pid.h> #include <linux/bug.h> #include <linux/mutex.h> +#include <linux/rwsem.h> #include <linux/capability.h> #include <linux/semaphore.h> #include <linux/fiemap.h> @@ -401,7 +402,7 @@ struct address_space { atomic_t i_mmap_writable;/* count VM_SHARED mappings */ struct rb_root i_mmap; /* tree of private and shared mappings */ struct list_head i_mmap_nonlinear;/*list VM_NONLINEAR mappings */ - struct mutex i_mmap_mutex; /* protect tree, count, list */ + struct rw_semaphore i_mmap_rwsem; /* protect tree, count, list */ /* Protected by tree_lock together with the radix tree */ unsigned long nrpages; /* number of total pages */ unsigned long nrshadows; /* number of shadow entries */ @@ -467,6 +468,26 @@ struct block_device { int mapping_tagged(struct address_space *mapping, int tag); +static inline void i_mmap_lock_write(struct address_space *mapping) +{ + down_write(&mapping->i_mmap_rwsem); +} + +static inline void i_mmap_unlock_write(struct address_space *mapping) +{ + up_write(&mapping->i_mmap_rwsem); +} + +static inline void i_mmap_lock_read(struct address_space *mapping) +{ + down_read(&mapping->i_mmap_rwsem); +} + +static inline void i_mmap_unlock_read(struct address_space *mapping) +{ + up_read(&mapping->i_mmap_rwsem); +} + /* * Might pages of this file be mapped into userspace? */ @@ -2075,6 +2096,7 @@ extern int vfs_open(const struct path *, struct file *, const struct cred *); extern struct file * dentry_open(const struct path *, int, const struct cred *); extern int filp_close(struct file *, fl_owner_t id); +extern struct filename *getname_flags(const char __user *, int, int *); extern struct filename *getname(const char __user *); extern struct filename *getname_kernel(const char *); diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index ca060d7c4fa63e3395c4754ed8e8f5ab32ceff30..0f313f93c586f8e60e2c39cf5d1b99ae48b1f6cd 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -196,24 +196,6 @@ struct fsnotify_group { #define FSNOTIFY_EVENT_PATH 1 #define FSNOTIFY_EVENT_INODE 2 -/* - * Inode specific fields in an fsnotify_mark - */ -struct fsnotify_inode_mark { - struct inode *inode; /* inode this mark is associated with */ - struct hlist_node i_list; /* list of marks by inode->i_fsnotify_marks */ - struct list_head free_i_list; /* tmp list used when freeing this mark */ -}; - -/* - * Mount point specific fields in an fsnotify_mark - */ -struct fsnotify_vfsmount_mark { - struct vfsmount *mnt; /* vfsmount this mark is associated with */ - struct hlist_node m_list; /* list of marks by inode->i_fsnotify_marks */ - struct list_head free_m_list; /* tmp list used when freeing this mark */ -}; - /* * a mark is simply an object attached to an in core inode which allows an * fsnotify listener to indicate they are either no longer interested in events @@ -230,11 +212,17 @@ struct fsnotify_mark { * in kernel that found and may be using this mark. */ atomic_t refcnt; /* active things looking at this mark */ struct fsnotify_group *group; /* group this mark is for */ - struct list_head g_list; /* list of marks by group->i_fsnotify_marks */ + struct list_head g_list; /* list of marks by group->i_fsnotify_marks + * Also reused for queueing mark into + * destroy_list when it's waiting for + * the end of SRCU period before it can + * be freed */ spinlock_t lock; /* protect group and inode */ + struct hlist_node obj_list; /* list of marks for inode / vfsmount */ + struct list_head free_list; /* tmp list used when freeing this mark */ union { - struct fsnotify_inode_mark i; - struct fsnotify_vfsmount_mark m; + struct inode *inode; /* inode this mark is associated with */ + struct vfsmount *mnt; /* vfsmount this mark is associated with */ }; __u32 ignored_mask; /* events types to ignore */ #define FSNOTIFY_MARK_FLAG_INODE 0x01 @@ -243,7 +231,6 @@ struct fsnotify_mark { #define FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY 0x08 #define FSNOTIFY_MARK_FLAG_ALIVE 0x10 unsigned int flags; /* vfsmount or inode mark? */ - struct list_head destroy_list; void (*free_mark)(struct fsnotify_mark *mark); /* called on final put+free */ }; diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 07d2699cdb51a36851b832a9eb67222e8aa6b305..b840e3b2770dd726ef3b4d4e4b07f2f41061f8e7 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -110,11 +110,8 @@ struct vm_area_struct; #define GFP_TEMPORARY (__GFP_WAIT | __GFP_IO | __GFP_FS | \ __GFP_RECLAIMABLE) #define GFP_USER (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL) -#define GFP_HIGHUSER (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL | \ - __GFP_HIGHMEM) -#define GFP_HIGHUSER_MOVABLE (__GFP_WAIT | __GFP_IO | __GFP_FS | \ - __GFP_HARDWALL | __GFP_HIGHMEM | \ - __GFP_MOVABLE) +#define GFP_HIGHUSER (GFP_USER | __GFP_HIGHMEM) +#define GFP_HIGHUSER_MOVABLE (GFP_HIGHUSER | __GFP_MOVABLE) #define GFP_IOFS (__GFP_IO | __GFP_FS) #define GFP_TRANSHUGE (GFP_HIGHUSER_MOVABLE | __GFP_COMP | \ __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN | \ diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index 35e7eca4e33b1fbbde3029b5f3cb2be350d0dafc..e365d5ec69cba1bfedb7e35707c8943091711bdf 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -7,15 +7,6 @@ #include <linux/notifier.h> #include <linux/nsproxy.h> -/* - * ipc namespace events - */ -#define IPCNS_MEMCHANGED 0x00000001 /* Notify lowmem size changed */ -#define IPCNS_CREATED 0x00000002 /* Notify new ipc namespace created */ -#define IPCNS_REMOVED 0x00000003 /* Notify ipc namespace removed */ - -#define IPCNS_CALLBACK_PRI 0 - struct user_namespace; struct ipc_ids { @@ -38,7 +29,6 @@ struct ipc_namespace { unsigned int msg_ctlmni; atomic_t msg_bytes; atomic_t msg_hdrs; - int auto_msgmni; size_t shm_ctlmax; size_t shm_ctlall; @@ -77,18 +67,8 @@ extern atomic_t nr_ipc_ns; extern spinlock_t mq_lock; #ifdef CONFIG_SYSVIPC -extern int register_ipcns_notifier(struct ipc_namespace *); -extern int cond_register_ipcns_notifier(struct ipc_namespace *); -extern void unregister_ipcns_notifier(struct ipc_namespace *); -extern int ipcns_notify(unsigned long); extern void shm_destroy_orphaned(struct ipc_namespace *ns); #else /* CONFIG_SYSVIPC */ -static inline int register_ipcns_notifier(struct ipc_namespace *ns) -{ return 0; } -static inline int cond_register_ipcns_notifier(struct ipc_namespace *ns) -{ return 0; } -static inline void unregister_ipcns_notifier(struct ipc_namespace *ns) { } -static inline int ipcns_notify(unsigned long l) { return 0; } static inline void shm_destroy_orphaned(struct ipc_namespace *ns) {} #endif /* CONFIG_SYSVIPC */ diff --git a/include/linux/kmemleak.h b/include/linux/kmemleak.h index 057e959710143e3f21bbe6e1523e10ad9a2067af..e705467ddb478d1d0a56f53269f5bebf19822b32 100644 --- a/include/linux/kmemleak.h +++ b/include/linux/kmemleak.h @@ -21,6 +21,8 @@ #ifndef __KMEMLEAK_H #define __KMEMLEAK_H +#include <linux/slab.h> + #ifdef CONFIG_DEBUG_KMEMLEAK extern void kmemleak_init(void) __ref; diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 6ea9f919e8881dcef92fe52bd664f375707bb402..7c95af8d552cffd0dd260b66ca162efd8ef608cf 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -400,8 +400,8 @@ int memcg_cache_id(struct mem_cgroup *memcg); void memcg_update_array_size(int num_groups); -struct kmem_cache * -__memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp); +struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep); +void __memcg_kmem_put_cache(struct kmem_cache *cachep); int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order); void __memcg_uncharge_slab(struct kmem_cache *cachep, int order); @@ -492,7 +492,13 @@ memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) if (unlikely(fatal_signal_pending(current))) return cachep; - return __memcg_kmem_get_cache(cachep, gfp); + return __memcg_kmem_get_cache(cachep); +} + +static __always_inline void memcg_kmem_put_cache(struct kmem_cache *cachep) +{ + if (memcg_kmem_enabled()) + __memcg_kmem_put_cache(cachep); } #else #define for_each_memcg_cache_index(_idx) \ @@ -528,6 +534,10 @@ memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) { return cachep; } + +static inline void memcg_kmem_put_cache(struct kmem_cache *cachep) +{ +} #endif /* CONFIG_MEMCG_KMEM */ #endif /* _LINUX_MEMCONTROL_H */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 3b337efbe53382bfb87e9b7c0a14c36d08528b57..c0a67b894c4ce5338e845774b3c4e3ad8c325f5f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -19,6 +19,7 @@ #include <linux/bit_spinlock.h> #include <linux/shrinker.h> #include <linux/resource.h> +#include <linux/page_ext.h> struct mempolicy; struct anon_vma; @@ -2060,7 +2061,22 @@ static inline void vm_stat_account(struct mm_struct *mm, #endif /* CONFIG_PROC_FS */ #ifdef CONFIG_DEBUG_PAGEALLOC -extern void kernel_map_pages(struct page *page, int numpages, int enable); +extern bool _debug_pagealloc_enabled; +extern void __kernel_map_pages(struct page *page, int numpages, int enable); + +static inline bool debug_pagealloc_enabled(void) +{ + return _debug_pagealloc_enabled; +} + +static inline void +kernel_map_pages(struct page *page, int numpages, int enable) +{ + if (!debug_pagealloc_enabled()) + return; + + __kernel_map_pages(page, numpages, enable); +} #ifdef CONFIG_HIBERNATION extern bool kernel_page_present(struct page *page); #endif /* CONFIG_HIBERNATION */ @@ -2094,9 +2110,9 @@ int drop_caches_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); #endif -unsigned long shrink_slab(struct shrink_control *shrink, - unsigned long nr_pages_scanned, - unsigned long lru_pages); +unsigned long shrink_node_slabs(gfp_t gfp_mask, int nid, + unsigned long nr_scanned, + unsigned long nr_eligible); #ifndef CONFIG_MMU #define randomize_va_space 0 @@ -2155,20 +2171,36 @@ extern void copy_user_huge_page(struct page *dst, struct page *src, unsigned int pages_per_huge_page); #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ +extern struct page_ext_operations debug_guardpage_ops; +extern struct page_ext_operations page_poisoning_ops; + #ifdef CONFIG_DEBUG_PAGEALLOC extern unsigned int _debug_guardpage_minorder; +extern bool _debug_guardpage_enabled; static inline unsigned int debug_guardpage_minorder(void) { return _debug_guardpage_minorder; } +static inline bool debug_guardpage_enabled(void) +{ + return _debug_guardpage_enabled; +} + static inline bool page_is_guard(struct page *page) { - return test_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); + struct page_ext *page_ext; + + if (!debug_guardpage_enabled()) + return false; + + page_ext = lookup_page_ext(page); + return test_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags); } #else static inline unsigned int debug_guardpage_minorder(void) { return 0; } +static inline bool debug_guardpage_enabled(void) { return false; } static inline bool page_is_guard(struct page *page) { return false; } #endif /* CONFIG_DEBUG_PAGEALLOC */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index bf9f57529dcf18fa007e402d81afdc46916ddce3..6d34aa266a8ce9bd8d2ad53b4fef7a204d5d01b5 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -10,7 +10,6 @@ #include <linux/rwsem.h> #include <linux/completion.h> #include <linux/cpumask.h> -#include <linux/page-debug-flags.h> #include <linux/uprobes.h> #include <linux/page-flags-layout.h> #include <asm/page.h> @@ -186,9 +185,6 @@ struct page { void *virtual; /* Kernel virtual address (NULL if not kmapped, ie. highmem) */ #endif /* WANT_PAGE_VIRTUAL */ -#ifdef CONFIG_WANT_PAGE_DEBUG_FLAGS - unsigned long debug_flags; /* Use atomic bitops on this */ -#endif #ifdef CONFIG_KMEMCHECK /* @@ -534,4 +530,12 @@ enum tlb_flush_reason { NR_TLB_FLUSH_REASONS, }; + /* + * A swap entry has to fit into a "unsigned long", as the entry is hidden + * in the "index" field of the swapper address space. + */ +typedef struct { + unsigned long val; +} swp_entry_t; + #endif /* _LINUX_MM_TYPES_H */ diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 88787bb4b3b93fe08d322c2a54a183486427a2af..ab8564b03468ed7e493b17a89d14f9ab79d3f59e 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -154,7 +154,7 @@ struct mmu_notifier_ops { * Therefore notifier chains can only be traversed when either * * 1. mmap_sem is held. - * 2. One of the reverse map locks is held (i_mmap_mutex or anon_vma->rwsem). + * 2. One of the reverse map locks is held (i_mmap_rwsem or anon_vma->rwsem). * 3. No other concurrent thread can access the list (release) */ struct mmu_notifier { diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 3879d7664dfcc19cacec76100981ea79b7a75196..2f0856d14b212d80786e09da61fcf6fcf21b09d8 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -722,6 +722,9 @@ typedef struct pglist_data { int nr_zones; #ifdef CONFIG_FLAT_NODE_MEM_MAP /* means !SPARSEMEM */ struct page *node_mem_map; +#ifdef CONFIG_PAGE_EXTENSION + struct page_ext *node_page_ext; +#endif #endif #ifndef CONFIG_NO_BOOTMEM struct bootmem_data *bdata; @@ -1075,6 +1078,7 @@ static inline unsigned long early_pfn_to_nid(unsigned long pfn) #define SECTION_ALIGN_DOWN(pfn) ((pfn) & PAGE_SECTION_MASK) struct page; +struct page_ext; struct mem_section { /* * This is, logically, a pointer to an array of struct @@ -1092,6 +1096,14 @@ struct mem_section { /* See declaration of similar field in struct zone */ unsigned long *pageblock_flags; +#ifdef CONFIG_PAGE_EXTENSION + /* + * If !SPARSEMEM, pgdat doesn't have page_ext pointer. We use + * section. (see page_ext.h about this.) + */ + struct page_ext *page_ext; + unsigned long pad; +#endif /* * WARNING: mem_section must be a power-of-2 in size for the * calculation and use of SECTION_ROOT_MASK to make sense. diff --git a/include/linux/oom.h b/include/linux/oom.h index e8d6e10587233466c666d6be2e3fb21603a82552..853698c721f7d1547df181a4fbfc904a67758f72 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -92,6 +92,17 @@ static inline bool oom_gfp_allowed(gfp_t gfp_mask) extern struct task_struct *find_lock_task_mm(struct task_struct *p); +static inline bool task_will_free_mem(struct task_struct *task) +{ + /* + * A coredumping process may sleep for an extended period in exit_mm(), + * so the oom killer cannot assume that the process will promptly exit + * and release memory. + */ + return (task->flags & PF_EXITING) && + !(task->signal->flags & SIGNAL_GROUP_COREDUMP); +} + /* sysctls */ extern int sysctl_oom_dump_tasks; extern int sysctl_oom_kill_allocating_task; diff --git a/include/linux/page-debug-flags.h b/include/linux/page-debug-flags.h deleted file mode 100644 index 22691f614043df3d3cdf77c3b12dd97a551bf813..0000000000000000000000000000000000000000 --- a/include/linux/page-debug-flags.h +++ /dev/null @@ -1,32 +0,0 @@ -#ifndef LINUX_PAGE_DEBUG_FLAGS_H -#define LINUX_PAGE_DEBUG_FLAGS_H - -/* - * page->debug_flags bits: - * - * PAGE_DEBUG_FLAG_POISON is set for poisoned pages. This is used to - * implement generic debug pagealloc feature. The pages are filled with - * poison patterns and set this flag after free_pages(). The poisoned - * pages are verified whether the patterns are not corrupted and clear - * the flag before alloc_pages(). - */ - -enum page_debug_flags { - PAGE_DEBUG_FLAG_POISON, /* Page is poisoned */ - PAGE_DEBUG_FLAG_GUARD, -}; - -/* - * Ensure that CONFIG_WANT_PAGE_DEBUG_FLAGS reliably - * gets turned off when no debug features are enabling it! - */ - -#ifdef CONFIG_WANT_PAGE_DEBUG_FLAGS -#if !defined(CONFIG_PAGE_POISONING) && \ - !defined(CONFIG_PAGE_GUARD) \ -/* && !defined(CONFIG_PAGE_DEBUG_SOMETHING_ELSE) && ... */ -#error WANT_PAGE_DEBUG_FLAGS is turned on with no debug features! -#endif -#endif /* CONFIG_WANT_PAGE_DEBUG_FLAGS */ - -#endif /* LINUX_PAGE_DEBUG_FLAGS_H */ diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h new file mode 100644 index 0000000000000000000000000000000000000000..d2a2c84c72d04aa7a7b993be3c40df100015790e --- /dev/null +++ b/include/linux/page_ext.h @@ -0,0 +1,84 @@ +#ifndef __LINUX_PAGE_EXT_H +#define __LINUX_PAGE_EXT_H + +#include <linux/types.h> +#include <linux/stacktrace.h> + +struct pglist_data; +struct page_ext_operations { + bool (*need)(void); + void (*init)(void); +}; + +#ifdef CONFIG_PAGE_EXTENSION + +/* + * page_ext->flags bits: + * + * PAGE_EXT_DEBUG_POISON is set for poisoned pages. This is used to + * implement generic debug pagealloc feature. The pages are filled with + * poison patterns and set this flag after free_pages(). The poisoned + * pages are verified whether the patterns are not corrupted and clear + * the flag before alloc_pages(). + */ + +enum page_ext_flags { + PAGE_EXT_DEBUG_POISON, /* Page is poisoned */ + PAGE_EXT_DEBUG_GUARD, + PAGE_EXT_OWNER, +}; + +/* + * Page Extension can be considered as an extended mem_map. + * A page_ext page is associated with every page descriptor. The + * page_ext helps us add more information about the page. + * All page_ext are allocated at boot or memory hotplug event, + * then the page_ext for pfn always exists. + */ +struct page_ext { + unsigned long flags; +#ifdef CONFIG_PAGE_OWNER + unsigned int order; + gfp_t gfp_mask; + struct stack_trace trace; + unsigned long trace_entries[8]; +#endif +}; + +extern void pgdat_page_ext_init(struct pglist_data *pgdat); + +#ifdef CONFIG_SPARSEMEM +static inline void page_ext_init_flatmem(void) +{ +} +extern void page_ext_init(void); +#else +extern void page_ext_init_flatmem(void); +static inline void page_ext_init(void) +{ +} +#endif + +struct page_ext *lookup_page_ext(struct page *page); + +#else /* !CONFIG_PAGE_EXTENSION */ +struct page_ext; + +static inline void pgdat_page_ext_init(struct pglist_data *pgdat) +{ +} + +static inline struct page_ext *lookup_page_ext(struct page *page) +{ + return NULL; +} + +static inline void page_ext_init(void) +{ +} + +static inline void page_ext_init_flatmem(void) +{ +} +#endif /* CONFIG_PAGE_EXTENSION */ +#endif /* __LINUX_PAGE_EXT_H */ diff --git a/include/linux/page_owner.h b/include/linux/page_owner.h new file mode 100644 index 0000000000000000000000000000000000000000..b48c3471c254471ed940921ab7bb4832d6f92023 --- /dev/null +++ b/include/linux/page_owner.h @@ -0,0 +1,38 @@ +#ifndef __LINUX_PAGE_OWNER_H +#define __LINUX_PAGE_OWNER_H + +#ifdef CONFIG_PAGE_OWNER +extern bool page_owner_inited; +extern struct page_ext_operations page_owner_ops; + +extern void __reset_page_owner(struct page *page, unsigned int order); +extern void __set_page_owner(struct page *page, + unsigned int order, gfp_t gfp_mask); + +static inline void reset_page_owner(struct page *page, unsigned int order) +{ + if (likely(!page_owner_inited)) + return; + + __reset_page_owner(page, order); +} + +static inline void set_page_owner(struct page *page, + unsigned int order, gfp_t gfp_mask) +{ + if (likely(!page_owner_inited)) + return; + + __set_page_owner(page, order, gfp_mask); +} +#else +static inline void reset_page_owner(struct page *page, unsigned int order) +{ +} +static inline void set_page_owner(struct page *page, + unsigned int order, gfp_t gfp_mask) +{ +} + +#endif /* CONFIG_PAGE_OWNER */ +#endif /* __LINUX_PAGE_OWNER_H */ diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h index 420032d41d27aeb269210ce27baafa5537373dda..57f3a1c550dc66e59803a2118f377a318ca7e51e 100644 --- a/include/linux/percpu-defs.h +++ b/include/linux/percpu-defs.h @@ -254,8 +254,6 @@ do { \ #endif /* CONFIG_SMP */ #define per_cpu(var, cpu) (*per_cpu_ptr(&(var), cpu)) -#define __raw_get_cpu_var(var) (*raw_cpu_ptr(&(var))) -#define __get_cpu_var(var) (*this_cpu_ptr(&(var))) /* * Must be an lvalue. Since @var must be a simple identifier, diff --git a/include/linux/ratelimit.h b/include/linux/ratelimit.h index 0a260d8a18bf92dc3b8d4b582d3506e169d9dac5..18102529254eeb8fe9fe112c9b455ca7c4e8650f 100644 --- a/include/linux/ratelimit.h +++ b/include/linux/ratelimit.h @@ -17,14 +17,20 @@ struct ratelimit_state { unsigned long begin; }; -#define DEFINE_RATELIMIT_STATE(name, interval_init, burst_init) \ - \ - struct ratelimit_state name = { \ +#define RATELIMIT_STATE_INIT(name, interval_init, burst_init) { \ .lock = __RAW_SPIN_LOCK_UNLOCKED(name.lock), \ .interval = interval_init, \ .burst = burst_init, \ } +#define RATELIMIT_STATE_INIT_DISABLED \ + RATELIMIT_STATE_INIT(ratelimit_state, 0, DEFAULT_RATELIMIT_BURST) + +#define DEFINE_RATELIMIT_STATE(name, interval_init, burst_init) \ + \ + struct ratelimit_state name = \ + RATELIMIT_STATE_INIT(name, interval_init, burst_init) \ + static inline void ratelimit_state_init(struct ratelimit_state *rs, int interval, int burst) { diff --git a/include/linux/sched.h b/include/linux/sched.h index 55f5ee7cc3d3aa1f0e608adce1126c331c9b0656..8db31ef98d2f4b5ad9f68f8296fbdbb475f96c46 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1364,6 +1364,10 @@ struct task_struct { unsigned sched_reset_on_fork:1; unsigned sched_contributes_to_load:1; +#ifdef CONFIG_MEMCG_KMEM + unsigned memcg_kmem_skip_account:1; +#endif + unsigned long atomic_flags; /* Flags needing atomic access. */ pid_t pid; @@ -1679,8 +1683,7 @@ struct task_struct { /* bitmask and counter of trace recursion */ unsigned long trace_recursion; #endif /* CONFIG_TRACING */ -#ifdef CONFIG_MEMCG /* memcg uses this to do batch job */ - unsigned int memcg_kmem_skip_account; +#ifdef CONFIG_MEMCG struct memcg_oom_info { struct mem_cgroup *memcg; gfp_t gfp_mask; @@ -2482,6 +2485,10 @@ extern void do_group_exit(int); extern int do_execve(struct filename *, const char __user * const __user *, const char __user * const __user *); +extern int do_execveat(int, struct filename *, + const char __user * const __user *, + const char __user * const __user *, + int); extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *); struct task_struct *fork_idle(int); extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags); diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h index 68c097077ef04af4533dd3ccc0272db8564bf269..f4aee75f00b19449e63c2a48b864e36f11b10ce7 100644 --- a/include/linux/shrinker.h +++ b/include/linux/shrinker.h @@ -18,8 +18,6 @@ struct shrink_control { */ unsigned long nr_to_scan; - /* shrink from these nodes */ - nodemask_t nodes_to_scan; /* current node being shrunk (for NUMA aware shrinkers) */ int nid; }; diff --git a/include/linux/slab.h b/include/linux/slab.h index 8a2457d42fc8f864536f559a07bef830cd6881ae..9a139b637069730e85fe6903253baf16e211e7b0 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -493,7 +493,6 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) * @memcg: pointer to the memcg this cache belongs to * @list: list_head for the list of all caches in this memcg * @root_cache: pointer to the global, root cache, this cache was derived from - * @nr_pages: number of pages that belongs to this cache. */ struct memcg_cache_params { bool is_root_cache; @@ -506,7 +505,6 @@ struct memcg_cache_params { struct mem_cgroup *memcg; struct list_head list; struct kmem_cache *root_cache; - atomic_t nr_pages; }; }; }; diff --git a/include/linux/stacktrace.h b/include/linux/stacktrace.h index 115b570e3bff3d47df43794b5781cfeaa3aa6245..669045ab73f30d7abbf72da9faa1a3724fe69b25 100644 --- a/include/linux/stacktrace.h +++ b/include/linux/stacktrace.h @@ -1,6 +1,8 @@ #ifndef __LINUX_STACKTRACE_H #define __LINUX_STACKTRACE_H +#include <linux/types.h> + struct task_struct; struct pt_regs; @@ -20,6 +22,8 @@ extern void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace); extern void print_stack_trace(struct stack_trace *trace, int spaces); +extern int snprint_stack_trace(char *buf, size_t size, + struct stack_trace *trace, int spaces); #ifdef CONFIG_USER_STACKTRACE_SUPPORT extern void save_stack_trace_user(struct stack_trace *trace); @@ -32,6 +36,7 @@ extern void save_stack_trace_user(struct stack_trace *trace); # define save_stack_trace_tsk(tsk, trace) do { } while (0) # define save_stack_trace_user(trace) do { } while (0) # define print_stack_trace(trace, spaces) do { } while (0) +# define snprint_stack_trace(buf, size, trace, spaces) do { } while (0) #endif #endif diff --git a/include/linux/swap.h b/include/linux/swap.h index 37a585beef5cf86e27fe54bc9a444ce7874ac1d3..34e8b60ab9736b89aa97346d7ccea203f212c84a 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -102,14 +102,6 @@ union swap_header { } info; }; - /* A swap entry has to fit into a "unsigned long", as - * the entry is hidden in the "index" field of the - * swapper address space. - */ -typedef struct { - unsigned long val; -} swp_entry_t; - /* * current->reclaim_state points to one of these when a task is running * memory reclaim diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index c9afdc7a7f84db0eec47414cca25598abed52b89..85893d744901b0629edb48e811434193a13892d6 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -877,4 +877,9 @@ asmlinkage long sys_seccomp(unsigned int op, unsigned int flags, asmlinkage long sys_getrandom(char __user *buf, size_t count, unsigned int flags); asmlinkage long sys_bpf(int cmd, union bpf_attr *attr, unsigned int size); + +asmlinkage long sys_execveat(int dfd, const char __user *filename, + const char __user *const __user *argv, + const char __user *const __user *envp, int flags); + #endif diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 730334cdf037c8ea14dc83377eb61073bb9678ed..9246d32dc9734374d893a5357e60fdbdd0c296e3 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -90,6 +90,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, #ifdef CONFIG_DEBUG_VM_VMACACHE VMACACHE_FIND_CALLS, VMACACHE_FIND_HITS, + VMACACHE_FULL_FLUSHES, #endif NR_VM_EVENT_ITEMS }; diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index 22749c13411775a77cabae4617f228228aadd47f..e016bd9b1a049686da0ee465fd561a8062156ff6 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -707,9 +707,11 @@ __SYSCALL(__NR_getrandom, sys_getrandom) __SYSCALL(__NR_memfd_create, sys_memfd_create) #define __NR_bpf 280 __SYSCALL(__NR_bpf, sys_bpf) +#define __NR_execveat 281 +__SC_COMP(__NR_execveat, sys_execveat, compat_sys_execveat) #undef __NR_syscalls -#define __NR_syscalls 281 +#define __NR_syscalls 282 /* * All syscalls below here should go away really, diff --git a/include/uapi/linux/msg.h b/include/uapi/linux/msg.h index a70375526578039691cf324a68108b60e4876619..f51c8001dbe5ed9733e2d1afd945f343d39f0663 100644 --- a/include/uapi/linux/msg.h +++ b/include/uapi/linux/msg.h @@ -51,16 +51,28 @@ struct msginfo { }; /* - * Scaling factor to compute msgmni: - * the memory dedicated to msg queues (msgmni * msgmnb) should occupy - * at most 1/MSG_MEM_SCALE of the lowmem (see the formula in ipc/msg.c): - * up to 8MB : msgmni = 16 (MSGMNI) - * 4 GB : msgmni = 8K - * more than 16 GB : msgmni = 32K (IPCMNI) + * MSGMNI, MSGMAX and MSGMNB are default values which can be + * modified by sysctl. + * + * MSGMNI is the upper limit for the number of messages queues per + * namespace. + * It has been chosen to be as large possible without facilitating + * scenarios where userspace causes overflows when adjusting the limits via + * operations of the form retrieve current limit; add X; update limit". + * + * MSGMNB is the default size of a new message queue. Non-root tasks can + * decrease the size with msgctl(IPC_SET), root tasks + * (actually: CAP_SYS_RESOURCE) can both increase and decrease the queue + * size. The optimal value is application dependent. + * 16384 is used because it was always used (since 0.99.10) + * + * MAXMAX is the maximum size of an individual message, it's a global + * (per-namespace) limit that applies for all message queues. + * It's set to 1/2 of MSGMNB, to ensure that at least two messages fit into + * the queue. This is also an arbitrary choice (since 2.6.0). */ -#define MSG_MEM_SCALE 32 -#define MSGMNI 16 /* <= IPCMNI */ /* max # of msg queue identifiers */ +#define MSGMNI 32000 /* <= IPCMNI */ /* max # of msg queue identifiers */ #define MSGMAX 8192 /* <= INT_MAX */ /* max size of message (bytes) */ #define MSGMNB 16384 /* <= INT_MAX */ /* default max size of a message queue */ diff --git a/include/uapi/linux/sem.h b/include/uapi/linux/sem.h index 541fce03b50cfb360004008f60b6d1a9d876ccd0..dd73b908b2f3225d7a7a0611c97cca8eb36295ee 100644 --- a/include/uapi/linux/sem.h +++ b/include/uapi/linux/sem.h @@ -63,10 +63,22 @@ struct seminfo { int semaem; }; -#define SEMMNI 128 /* <= IPCMNI max # of semaphore identifiers */ -#define SEMMSL 250 /* <= 8 000 max num of semaphores per id */ +/* + * SEMMNI, SEMMSL and SEMMNS are default values which can be + * modified by sysctl. + * The values has been chosen to be larger than necessary for any + * known configuration. + * + * SEMOPM should not be increased beyond 1000, otherwise there is the + * risk that semop()/semtimedop() fails due to kernel memory fragmentation when + * allocating the sop array. + */ + + +#define SEMMNI 32000 /* <= IPCMNI max # of semaphore identifiers */ +#define SEMMSL 32000 /* <= INT_MAX max num of semaphores per id */ #define SEMMNS (SEMMNI*SEMMSL) /* <= INT_MAX max # of semaphores in system */ -#define SEMOPM 32 /* <= 1 000 max num of ops per semop call */ +#define SEMOPM 500 /* <= 1 000 max num of ops per semop call */ #define SEMVMX 32767 /* <= 32767 semaphore maximum value */ #define SEMAEM SEMVMX /* adjust on exit max value */ diff --git a/init/main.c b/init/main.c index ca380ec685de493f9d4f5d659d10fd549c899820..ed7e7ad5fee04ef0fe9e9a9ce53b3e42a6ddc552 100644 --- a/init/main.c +++ b/init/main.c @@ -51,6 +51,7 @@ #include <linux/mempolicy.h> #include <linux/key.h> #include <linux/buffer_head.h> +#include <linux/page_ext.h> #include <linux/debug_locks.h> #include <linux/debugobjects.h> #include <linux/lockdep.h> @@ -484,6 +485,11 @@ void __init __weak thread_info_cache_init(void) */ static void __init mm_init(void) { + /* + * page_ext requires contiguous pages, + * bigger than MAX_ORDER unless SPARSEMEM. + */ + page_ext_init_flatmem(); mem_init(); kmem_cache_init(); percpu_init_late(); @@ -621,6 +627,7 @@ asmlinkage __visible void __init start_kernel(void) initrd_start = 0; } #endif + page_ext_init(); debug_objects_mem_init(); kmemleak_init(); setup_per_cpu_pageset(); diff --git a/ipc/Makefile b/ipc/Makefile index 9075e172e52cacbcc4e5c0ba3d3ccb03d2a6d561..86c7300ecdf5d566111459fe07e7d171e08a192d 100644 --- a/ipc/Makefile +++ b/ipc/Makefile @@ -3,7 +3,7 @@ # obj-$(CONFIG_SYSVIPC_COMPAT) += compat.o -obj-$(CONFIG_SYSVIPC) += util.o msgutil.o msg.o sem.o shm.o ipcns_notifier.o syscall.o +obj-$(CONFIG_SYSVIPC) += util.o msgutil.o msg.o sem.o shm.o syscall.o obj-$(CONFIG_SYSVIPC_SYSCTL) += ipc_sysctl.o obj_mq-$(CONFIG_COMPAT) += compat_mq.o obj-$(CONFIG_POSIX_MQUEUE) += mqueue.o msgutil.o $(obj_mq-y) diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c index e8075b24749775f254c4d821ea1fb2c4a8a4c5f1..8ad93c29f511033dc4a6df1b2a26398ea78c9104 100644 --- a/ipc/ipc_sysctl.c +++ b/ipc/ipc_sysctl.c @@ -62,29 +62,6 @@ static int proc_ipc_dointvec_minmax_orphans(struct ctl_table *table, int write, return err; } -static int proc_ipc_callback_dointvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - struct ctl_table ipc_table; - size_t lenp_bef = *lenp; - int rc; - - memcpy(&ipc_table, table, sizeof(ipc_table)); - ipc_table.data = get_ipc(table); - - rc = proc_dointvec_minmax(&ipc_table, write, buffer, lenp, ppos); - - if (write && !rc && lenp_bef == *lenp) - /* - * Tunable has successfully been changed by hand. Disable its - * automatic adjustment. This simply requires unregistering - * the notifiers that trigger recalculation. - */ - unregister_ipcns_notifier(current->nsproxy->ipc_ns); - - return rc; -} - static int proc_ipc_doulongvec_minmax(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -96,54 +73,19 @@ static int proc_ipc_doulongvec_minmax(struct ctl_table *table, int write, lenp, ppos); } -/* - * Routine that is called when the file "auto_msgmni" has successfully been - * written. - * Two values are allowed: - * 0: unregister msgmni's callback routine from the ipc namespace notifier - * chain. This means that msgmni won't be recomputed anymore upon memory - * add/remove or ipc namespace creation/removal. - * 1: register back the callback routine. - */ -static void ipc_auto_callback(int val) -{ - if (!val) - unregister_ipcns_notifier(current->nsproxy->ipc_ns); - else { - /* - * Re-enable automatic recomputing only if not already - * enabled. - */ - recompute_msgmni(current->nsproxy->ipc_ns); - cond_register_ipcns_notifier(current->nsproxy->ipc_ns); - } -} - -static int proc_ipcauto_dointvec_minmax(struct ctl_table *table, int write, +static int proc_ipc_auto_msgmni(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table ipc_table; - int oldval; - int rc; + int dummy = 0; memcpy(&ipc_table, table, sizeof(ipc_table)); - ipc_table.data = get_ipc(table); - oldval = *((int *)(ipc_table.data)); + ipc_table.data = &dummy; - rc = proc_dointvec_minmax(&ipc_table, write, buffer, lenp, ppos); + if (write) + pr_info_once("writing to auto_msgmni has no effect"); - if (write && !rc) { - int newval = *((int *)(ipc_table.data)); - /* - * The file "auto_msgmni" has correctly been set. - * React by (un)registering the corresponding tunable, if the - * value has changed. - */ - if (newval != oldval) - ipc_auto_callback(newval); - } - - return rc; + return proc_dointvec_minmax(&ipc_table, write, buffer, lenp, ppos); } #else @@ -151,8 +93,7 @@ static int proc_ipcauto_dointvec_minmax(struct ctl_table *table, int write, #define proc_ipc_dointvec NULL #define proc_ipc_dointvec_minmax NULL #define proc_ipc_dointvec_minmax_orphans NULL -#define proc_ipc_callback_dointvec_minmax NULL -#define proc_ipcauto_dointvec_minmax NULL +#define proc_ipc_auto_msgmni NULL #endif static int zero; @@ -204,10 +145,19 @@ static struct ctl_table ipc_kern_table[] = { .data = &init_ipc_ns.msg_ctlmni, .maxlen = sizeof(init_ipc_ns.msg_ctlmni), .mode = 0644, - .proc_handler = proc_ipc_callback_dointvec_minmax, + .proc_handler = proc_ipc_dointvec_minmax, .extra1 = &zero, .extra2 = &int_max, }, + { + .procname = "auto_msgmni", + .data = NULL, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_ipc_auto_msgmni, + .extra1 = &zero, + .extra2 = &one, + }, { .procname = "msgmnb", .data = &init_ipc_ns.msg_ctlmnb, @@ -224,15 +174,6 @@ static struct ctl_table ipc_kern_table[] = { .mode = 0644, .proc_handler = proc_ipc_dointvec, }, - { - .procname = "auto_msgmni", - .data = &init_ipc_ns.auto_msgmni, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_ipcauto_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, - }, #ifdef CONFIG_CHECKPOINT_RESTORE { .procname = "sem_next_id", diff --git a/ipc/ipcns_notifier.c b/ipc/ipcns_notifier.c deleted file mode 100644 index b9b31a4f77e114cff79eef3cd25e075a3dc8d871..0000000000000000000000000000000000000000 --- a/ipc/ipcns_notifier.c +++ /dev/null @@ -1,92 +0,0 @@ -/* - * linux/ipc/ipcns_notifier.c - * Copyright (C) 2007 BULL SA. Nadia Derbey - * - * Notification mechanism for ipc namespaces: - * The callback routine registered in the memory chain invokes the ipcns - * notifier chain with the IPCNS_MEMCHANGED event. - * Each callback routine registered in the ipcns namespace recomputes msgmni - * for the owning namespace. - */ - -#include <linux/msg.h> -#include <linux/rcupdate.h> -#include <linux/notifier.h> -#include <linux/nsproxy.h> -#include <linux/ipc_namespace.h> - -#include "util.h" - - - -static BLOCKING_NOTIFIER_HEAD(ipcns_chain); - - -static int ipcns_callback(struct notifier_block *self, - unsigned long action, void *arg) -{ - struct ipc_namespace *ns; - - switch (action) { - case IPCNS_MEMCHANGED: /* amount of lowmem has changed */ - case IPCNS_CREATED: - case IPCNS_REMOVED: - /* - * It's time to recompute msgmni - */ - ns = container_of(self, struct ipc_namespace, ipcns_nb); - /* - * No need to get a reference on the ns: the 1st job of - * free_ipc_ns() is to unregister the callback routine. - * blocking_notifier_chain_unregister takes the wr lock to do - * it. - * When this callback routine is called the rd lock is held by - * blocking_notifier_call_chain. - * So the ipc ns cannot be freed while we are here. - */ - recompute_msgmni(ns); - break; - default: - break; - } - - return NOTIFY_OK; -} - -int register_ipcns_notifier(struct ipc_namespace *ns) -{ - int rc; - - memset(&ns->ipcns_nb, 0, sizeof(ns->ipcns_nb)); - ns->ipcns_nb.notifier_call = ipcns_callback; - ns->ipcns_nb.priority = IPCNS_CALLBACK_PRI; - rc = blocking_notifier_chain_register(&ipcns_chain, &ns->ipcns_nb); - if (!rc) - ns->auto_msgmni = 1; - return rc; -} - -int cond_register_ipcns_notifier(struct ipc_namespace *ns) -{ - int rc; - - memset(&ns->ipcns_nb, 0, sizeof(ns->ipcns_nb)); - ns->ipcns_nb.notifier_call = ipcns_callback; - ns->ipcns_nb.priority = IPCNS_CALLBACK_PRI; - rc = blocking_notifier_chain_cond_register(&ipcns_chain, - &ns->ipcns_nb); - if (!rc) - ns->auto_msgmni = 1; - return rc; -} - -void unregister_ipcns_notifier(struct ipc_namespace *ns) -{ - blocking_notifier_chain_unregister(&ipcns_chain, &ns->ipcns_nb); - ns->auto_msgmni = 0; -} - -int ipcns_notify(unsigned long val) -{ - return blocking_notifier_call_chain(&ipcns_chain, val, NULL); -} diff --git a/ipc/msg.c b/ipc/msg.c index c5d8e3749985be98b17c7bb1183dafb41332ea36..a7261d5cbc89be93c1f1bd41b39ebe9683e82a19 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -989,43 +989,12 @@ SYSCALL_DEFINE5(msgrcv, int, msqid, struct msgbuf __user *, msgp, size_t, msgsz, return do_msgrcv(msqid, msgp, msgsz, msgtyp, msgflg, do_msg_fill); } -/* - * Scale msgmni with the available lowmem size: the memory dedicated to msg - * queues should occupy at most 1/MSG_MEM_SCALE of lowmem. - * Also take into account the number of nsproxies created so far. - * This should be done staying within the (MSGMNI , IPCMNI/nr_ipc_ns) range. - */ -void recompute_msgmni(struct ipc_namespace *ns) -{ - struct sysinfo i; - unsigned long allowed; - int nb_ns; - - si_meminfo(&i); - allowed = (((i.totalram - i.totalhigh) / MSG_MEM_SCALE) * i.mem_unit) - / MSGMNB; - nb_ns = atomic_read(&nr_ipc_ns); - allowed /= nb_ns; - - if (allowed < MSGMNI) { - ns->msg_ctlmni = MSGMNI; - return; - } - - if (allowed > IPCMNI / nb_ns) { - ns->msg_ctlmni = IPCMNI / nb_ns; - return; - } - - ns->msg_ctlmni = allowed; -} void msg_init_ns(struct ipc_namespace *ns) { ns->msg_ctlmax = MSGMAX; ns->msg_ctlmnb = MSGMNB; - - recompute_msgmni(ns); + ns->msg_ctlmni = MSGMNI; atomic_set(&ns->msg_bytes, 0); atomic_set(&ns->msg_hdrs, 0); @@ -1069,9 +1038,6 @@ void __init msg_init(void) { msg_init_ns(&init_ipc_ns); - printk(KERN_INFO "msgmni has been set to %d\n", - init_ipc_ns.msg_ctlmni); - ipc_init_proc_interface("sysvipc/msg", " key msqid perms cbytes qnum lspid lrpid uid gid cuid cgid stime rtime ctime\n", IPC_MSG_IDS, sysvipc_msg_proc_show); diff --git a/ipc/namespace.c b/ipc/namespace.c index b54468e48e3214899b992b38b130072d1a580d4b..1a3ffd40356e37d0270c18fde4765c50dd6872a8 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -45,14 +45,6 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns, msg_init_ns(ns); shm_init_ns(ns); - /* - * msgmni has already been computed for the new ipc ns. - * Thus, do the ipcns creation notification before registering that - * new ipcns in the chain. - */ - ipcns_notify(IPCNS_CREATED); - register_ipcns_notifier(ns); - ns->user_ns = get_user_ns(user_ns); return ns; @@ -99,25 +91,11 @@ void free_ipcs(struct ipc_namespace *ns, struct ipc_ids *ids, static void free_ipc_ns(struct ipc_namespace *ns) { - /* - * Unregistering the hotplug notifier at the beginning guarantees - * that the ipc namespace won't be freed while we are inside the - * callback routine. Since the blocking_notifier_chain_XXX routines - * hold a rw lock on the notifier list, unregister_ipcns_notifier() - * won't take the rw lock before blocking_notifier_call_chain() has - * released the rd lock. - */ - unregister_ipcns_notifier(ns); sem_exit_ns(ns); msg_exit_ns(ns); shm_exit_ns(ns); atomic_dec(&nr_ipc_ns); - /* - * Do the ipcns removal notification after decrementing nr_ipc_ns in - * order to have a correct value when recomputing msgmni. - */ - ipcns_notify(IPCNS_REMOVED); put_user_ns(ns->user_ns); proc_free_inum(ns->proc_inum); kfree(ns); diff --git a/ipc/sem.c b/ipc/sem.c index 53c3310f41c6867fd4b5f3f0493a150184b8c617..6115146563f94e5bfab0973be730b3388fb3fc59 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -326,10 +326,17 @@ static inline int sem_lock(struct sem_array *sma, struct sembuf *sops, /* Then check that the global lock is free */ if (!spin_is_locked(&sma->sem_perm.lock)) { - /* spin_is_locked() is not a memory barrier */ - smp_mb(); + /* + * The ipc object lock check must be visible on all + * cores before rechecking the complex count. Otherwise + * we can race with another thread that does: + * complex_count++; + * spin_unlock(sem_perm.lock); + */ + smp_rmb(); - /* Now repeat the test of complex_count: + /* + * Now repeat the test of complex_count: * It can't change anymore until we drop sem->lock. * Thus: if is now 0, then it will stay 0. */ diff --git a/ipc/shm.c b/ipc/shm.c index 01454796ba3c59e20b4db8e71fdf7a17c74f9ad7..19633b4a2350edf689af1a1cd2d8fcc1b0d7b6c2 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -219,7 +219,8 @@ static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp) if (!is_file_hugepages(shm_file)) shmem_lock(shm_file, 0, shp->mlock_user); else if (shp->mlock_user) - user_shm_unlock(file_inode(shm_file)->i_size, shp->mlock_user); + user_shm_unlock(i_size_read(file_inode(shm_file)), + shp->mlock_user); fput(shm_file); ipc_rcu_putref(shp, shm_rcu_free); } @@ -1229,6 +1230,7 @@ SYSCALL_DEFINE1(shmdt, char __user *, shmaddr) int retval = -EINVAL; #ifdef CONFIG_MMU loff_t size = 0; + struct file *file; struct vm_area_struct *next; #endif @@ -1245,7 +1247,8 @@ SYSCALL_DEFINE1(shmdt, char __user *, shmaddr) * started at address shmaddr. It records it's size and then unmaps * it. * - Then it unmaps all shm vmas that started at shmaddr and that - * are within the initially determined size. + * are within the initially determined size and that are from the + * same shm segment from which we determined the size. * Errors from do_munmap are ignored: the function only fails if * it's called with invalid parameters or if it's called to unmap * a part of a vma. Both calls in this function are for full vmas, @@ -1271,8 +1274,14 @@ SYSCALL_DEFINE1(shmdt, char __user *, shmaddr) if ((vma->vm_ops == &shm_vm_ops) && (vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) { - - size = file_inode(vma->vm_file)->i_size; + /* + * Record the file of the shm segment being + * unmapped. With mremap(), someone could place + * page from another segment but with equal offsets + * in the range we are unmapping. + */ + file = vma->vm_file; + size = i_size_read(file_inode(vma->vm_file)); do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start); /* * We discovered the size of the shm segment, so @@ -1298,8 +1307,8 @@ SYSCALL_DEFINE1(shmdt, char __user *, shmaddr) /* finding a matching vma now does not alter retval */ if ((vma->vm_ops == &shm_vm_ops) && - (vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) - + ((vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) && + (vma->vm_file == file)) do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start); vma = next; } diff --git a/ipc/util.c b/ipc/util.c index 88adc329888c5df2d0e93f23fe306995f498d68b..106bed0378ab3575d6aadac8196b32971b5a2b19 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -71,44 +71,6 @@ struct ipc_proc_iface { int (*show)(struct seq_file *, void *); }; -static void ipc_memory_notifier(struct work_struct *work) -{ - ipcns_notify(IPCNS_MEMCHANGED); -} - -static int ipc_memory_callback(struct notifier_block *self, - unsigned long action, void *arg) -{ - static DECLARE_WORK(ipc_memory_wq, ipc_memory_notifier); - - switch (action) { - case MEM_ONLINE: /* memory successfully brought online */ - case MEM_OFFLINE: /* or offline: it's time to recompute msgmni */ - /* - * This is done by invoking the ipcns notifier chain with the - * IPC_MEMCHANGED event. - * In order not to keep the lock on the hotplug memory chain - * for too long, queue a work item that will, when waken up, - * activate the ipcns notification chain. - */ - schedule_work(&ipc_memory_wq); - break; - case MEM_GOING_ONLINE: - case MEM_GOING_OFFLINE: - case MEM_CANCEL_ONLINE: - case MEM_CANCEL_OFFLINE: - default: - break; - } - - return NOTIFY_OK; -} - -static struct notifier_block ipc_memory_nb = { - .notifier_call = ipc_memory_callback, - .priority = IPC_CALLBACK_PRI, -}; - /** * ipc_init - initialise ipc subsystem * @@ -124,8 +86,6 @@ static int __init ipc_init(void) sem_init(); msg_init(); shm_init(); - register_hotmemory_notifier(&ipc_memory_nb); - register_ipcns_notifier(&init_ipc_ns); return 0; } device_initcall(ipc_init); diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 80f29e0155705159fc83a80c1138e45b590661e2..2e0c97427b339c2f794568b8bb0ac383abf90421 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -174,9 +174,9 @@ static void insert_hash(struct audit_chunk *chunk) struct fsnotify_mark *entry = &chunk->mark; struct list_head *list; - if (!entry->i.inode) + if (!entry->inode) return; - list = chunk_hash(entry->i.inode); + list = chunk_hash(entry->inode); list_add_rcu(&chunk->hash, list); } @@ -188,7 +188,7 @@ struct audit_chunk *audit_tree_lookup(const struct inode *inode) list_for_each_entry_rcu(p, list, hash) { /* mark.inode may have gone NULL, but who cares? */ - if (p->mark.i.inode == inode) { + if (p->mark.inode == inode) { atomic_long_inc(&p->refs); return p; } @@ -231,7 +231,7 @@ static void untag_chunk(struct node *p) new = alloc_chunk(size); spin_lock(&entry->lock); - if (chunk->dead || !entry->i.inode) { + if (chunk->dead || !entry->inode) { spin_unlock(&entry->lock); if (new) free_chunk(new); @@ -258,7 +258,7 @@ static void untag_chunk(struct node *p) goto Fallback; fsnotify_duplicate_mark(&new->mark, entry); - if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, NULL, 1)) { + if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.inode, NULL, 1)) { fsnotify_put_mark(&new->mark); goto Fallback; } @@ -386,7 +386,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) chunk_entry = &chunk->mark; spin_lock(&old_entry->lock); - if (!old_entry->i.inode) { + if (!old_entry->inode) { /* old_entry is being shot, lets just lie */ spin_unlock(&old_entry->lock); fsnotify_put_mark(old_entry); @@ -395,7 +395,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) } fsnotify_duplicate_mark(chunk_entry, old_entry); - if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->i.inode, NULL, 1)) { + if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->inode, NULL, 1)) { spin_unlock(&old_entry->lock); fsnotify_put_mark(chunk_entry); fsnotify_put_mark(old_entry); @@ -611,7 +611,7 @@ void audit_trim_trees(void) list_for_each_entry(node, &tree->chunks, list) { struct audit_chunk *chunk = find_chunk(node); /* this could be NULL if the watch is dying else where... */ - struct inode *inode = chunk->mark.i.inode; + struct inode *inode = chunk->mark.inode; node->index |= 1U<<31; if (iterate_mounts(compare_root, inode, root_mnt)) node->index &= ~(1U<<31); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index ed8f2cde34c57acce554ee08d79625c3b260ada8..995a95f61a198b82694e814a28a0d13d295ea8f7 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -724,14 +724,14 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register) int more = 0; again: - mutex_lock(&mapping->i_mmap_mutex); + i_mmap_lock_read(mapping); vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { if (!valid_vma(vma, is_register)) continue; if (!prev && !more) { /* - * Needs GFP_NOWAIT to avoid i_mmap_mutex recursion through + * Needs GFP_NOWAIT to avoid i_mmap_rwsem recursion through * reclaim. This is optimistic, no harm done if it fails. */ prev = kmalloc(sizeof(struct map_info), @@ -755,7 +755,7 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register) info->mm = vma->vm_mm; info->vaddr = offset_to_vaddr(vma, offset); } - mutex_unlock(&mapping->i_mmap_mutex); + i_mmap_unlock_read(mapping); if (!more) goto out; diff --git a/kernel/fork.c b/kernel/fork.c index 9ca84189cfc20ad34f6b745c10d753a461b89ee3..4dc2ddade9f1f288aaa08d8a665f1fded2ba88cc 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -433,7 +433,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) get_file(file); if (tmp->vm_flags & VM_DENYWRITE) atomic_dec(&inode->i_writecount); - mutex_lock(&mapping->i_mmap_mutex); + i_mmap_lock_write(mapping); if (tmp->vm_flags & VM_SHARED) atomic_inc(&mapping->i_mmap_writable); flush_dcache_mmap_lock(mapping); @@ -445,7 +445,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) vma_interval_tree_insert_after(tmp, mpnt, &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); - mutex_unlock(&mapping->i_mmap_mutex); + i_mmap_unlock_write(mapping); } /* diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index 3b7408759bdfdfc4a5fe469df649d631efdae40d..c92e44855ddda18843a2b7c9102a323811d27917 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig @@ -32,10 +32,13 @@ config GCOV_KERNEL Note that the debugfs filesystem has to be mounted to access profiling data. +config ARCH_HAS_GCOV_PROFILE_ALL + def_bool n + config GCOV_PROFILE_ALL bool "Profile entire Kernel" depends on GCOV_KERNEL - depends on SUPERH || S390 || X86 || PPC || MICROBLAZE || ARM || ARM64 + depends on ARCH_HAS_GCOV_PROFILE_ALL default n ---help--- This options activates profiling for the entire kernel. diff --git a/kernel/kexec.c b/kernel/kexec.c index 2abf9f6e9a61866302c858e408c8dd7b5ec93dd5..9a8a01abbaed16a235c6e98b12f43996e6900cc2 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -600,7 +600,7 @@ kimage_file_alloc_init(struct kimage **rimage, int kernel_fd, if (!kexec_on_panic) { image->swap_page = kimage_alloc_control_pages(image, 0); if (!image->swap_page) { - pr_err(KERN_ERR "Could not allocate swap buffer\n"); + pr_err("Could not allocate swap buffer\n"); goto out_free_control_pages; } } diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c index 00fe55cc5a829a76b5b6faee8c2052b9f64e65b7..b6e4c16377c708027c1c8c52914106fb71e3ca93 100644 --- a/kernel/stacktrace.c +++ b/kernel/stacktrace.c @@ -25,6 +25,38 @@ void print_stack_trace(struct stack_trace *trace, int spaces) } EXPORT_SYMBOL_GPL(print_stack_trace); +int snprint_stack_trace(char *buf, size_t size, + struct stack_trace *trace, int spaces) +{ + int i; + unsigned long ip; + int generated; + int total = 0; + + if (WARN_ON(!trace->entries)) + return 0; + + for (i = 0; i < trace->nr_entries; i++) { + ip = trace->entries[i]; + generated = snprintf(buf, size, "%*c[<%p>] %pS\n", + 1 + spaces, ' ', (void *) ip, (void *) ip); + + total += generated; + + /* Assume that generated isn't a negative number */ + if (generated >= size) { + buf += size; + size = 0; + } else { + buf += generated; + size -= generated; + } + } + + return total; +} +EXPORT_SYMBOL_GPL(snprint_stack_trace); + /* * Architectures that do not implement save_stack_trace_tsk or * save_stack_trace_regs get this weak alias and a once-per-bootup warning diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 61eea02b53f5afbcd916b21684d5e57269e1ecad..5adcb0ae3a58aa6d77dc0daa558e8fc8776c6777 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -226,3 +226,6 @@ cond_syscall(sys_seccomp); /* access BPF programs and maps */ cond_syscall(sys_bpf); + +/* execveat */ +cond_syscall(sys_execveat); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index d780351835e9cb9e0b9bcf26f0e85f43d11f15c4..5f2ce616c0462db9b9055528110268385b2b653e 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -227,6 +227,22 @@ config UNUSED_SYMBOLS you really need it, and what the merge plan to the mainline kernel for your module is. +config PAGE_OWNER + bool "Track page owner" + depends on DEBUG_KERNEL && STACKTRACE_SUPPORT + select DEBUG_FS + select STACKTRACE + select PAGE_EXTENSION + help + This keeps track of what call chain is the owner of a page, may + help to find bare alloc_page(s) leaks. Even if you include this + feature on your build, it is disabled in default. You should pass + "page_owner=on" to boot parameter in order to enable it. Eats + a fair amount of memory if enabled. See tools/vm/page_owner_sort.c + for user-space helper. + + If unsure, say N. + config DEBUG_FS bool "Debug Filesystem" help diff --git a/lib/audit.c b/lib/audit.c index 1d726a22565bdc41a36a19b1a5c26a441549f9b3..b8fb5ee81e26efc3a9a1457b6f4423046750d869 100644 --- a/lib/audit.c +++ b/lib/audit.c @@ -53,6 +53,9 @@ int audit_classify_syscall(int abi, unsigned syscall) #ifdef __NR_socketcall case __NR_socketcall: return 4; +#endif +#ifdef __NR_execveat + case __NR_execveat: #endif case __NR_execve: return 5; diff --git a/lib/bitmap.c b/lib/bitmap.c index b499ab6ada29a0d4db2e4692083c2d21441a42ee..969ae8fbc85b50dc50d151192acb32a400524a5c 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -326,30 +326,32 @@ void bitmap_clear(unsigned long *map, unsigned int start, int len) } EXPORT_SYMBOL(bitmap_clear); -/* - * bitmap_find_next_zero_area - find a contiguous aligned zero area +/** + * bitmap_find_next_zero_area_off - find a contiguous aligned zero area * @map: The address to base the search on * @size: The bitmap size in bits * @start: The bitnumber to start searching at * @nr: The number of zeroed bits we're looking for * @align_mask: Alignment mask for zero area + * @align_offset: Alignment offset for zero area. * * The @align_mask should be one less than a power of 2; the effect is that - * the bit offset of all zero areas this function finds is multiples of that - * power of 2. A @align_mask of 0 means no alignment is required. + * the bit offset of all zero areas this function finds plus @align_offset + * is multiple of that power of 2. */ -unsigned long bitmap_find_next_zero_area(unsigned long *map, - unsigned long size, - unsigned long start, - unsigned int nr, - unsigned long align_mask) +unsigned long bitmap_find_next_zero_area_off(unsigned long *map, + unsigned long size, + unsigned long start, + unsigned int nr, + unsigned long align_mask, + unsigned long align_offset) { unsigned long index, end, i; again: index = find_next_zero_bit(map, size, start); /* Align allocation */ - index = __ALIGN_MASK(index, align_mask); + index = __ALIGN_MASK(index + align_offset, align_mask) - align_offset; end = index + nr; if (end > size) @@ -361,7 +363,7 @@ unsigned long bitmap_find_next_zero_area(unsigned long *map, } return index; } -EXPORT_SYMBOL(bitmap_find_next_zero_area); +EXPORT_SYMBOL(bitmap_find_next_zero_area_off); /* * Bitmap printing & parsing functions: first version by Nadia Yvette Chambers, diff --git a/lib/decompress.c b/lib/decompress.c index 37f3c786348f21a233c7f8dc486d8ef3124e8dba..528ff932d8e445ae9e19940d4f13cd20c0273bc7 100644 --- a/lib/decompress.c +++ b/lib/decompress.c @@ -44,8 +44,8 @@ struct compress_format { }; static const struct compress_format compressed_formats[] __initconst = { - { {037, 0213}, "gzip", gunzip }, - { {037, 0236}, "gzip", gunzip }, + { {0x1f, 0x8b}, "gzip", gunzip }, + { {0x1f, 0x9e}, "gzip", gunzip }, { {0x42, 0x5a}, "bzip2", bunzip2 }, { {0x5d, 0x00}, "lzma", unlzma }, { {0xfd, 0x37}, "xz", unxz }, diff --git a/lib/decompress_bunzip2.c b/lib/decompress_bunzip2.c index 8290e0bef7eabd9f4284b9e2558f8a4e9afab6b0..6dd0335ea61b296b5dfd77818e58e294fb2e805c 100644 --- a/lib/decompress_bunzip2.c +++ b/lib/decompress_bunzip2.c @@ -184,7 +184,7 @@ static int INIT get_next_block(struct bunzip_data *bd) if (get_bits(bd, 1)) return RETVAL_OBSOLETE_INPUT; origPtr = get_bits(bd, 24); - if (origPtr > dbufSize) + if (origPtr >= dbufSize) return RETVAL_DATA_ERROR; /* mapping table: if some byte values are never used (encoding things like ascii text), the compression code removes the gaps to have fewer diff --git a/lib/fault-inject.c b/lib/fault-inject.c index d7d501ea856d743bef7acb9451d949bde1ed496d..f1cdeb024d172a488e1ed44f70ce1e090ee5e541 100644 --- a/lib/fault-inject.c +++ b/lib/fault-inject.c @@ -40,10 +40,16 @@ EXPORT_SYMBOL_GPL(setup_fault_attr); static void fail_dump(struct fault_attr *attr) { - if (attr->verbose > 0) - printk(KERN_NOTICE "FAULT_INJECTION: forcing a failure\n"); - if (attr->verbose > 1) - dump_stack(); + if (attr->verbose > 0 && __ratelimit(&attr->ratelimit_state)) { + printk(KERN_NOTICE "FAULT_INJECTION: forcing a failure.\n" + "name %pd, interval %lu, probability %lu, " + "space %d, times %d\n", attr->dname, + attr->probability, attr->interval, + atomic_read(&attr->space), + atomic_read(&attr->times)); + if (attr->verbose > 1) + dump_stack(); + } } #define atomic_dec_not_zero(v) atomic_add_unless((v), -1, 0) @@ -202,6 +208,12 @@ struct dentry *fault_create_debugfs_attr(const char *name, goto fail; if (!debugfs_create_ul("verbose", mode, dir, &attr->verbose)) goto fail; + if (!debugfs_create_u32("verbose_ratelimit_interval_ms", mode, dir, + &attr->ratelimit_state.interval)) + goto fail; + if (!debugfs_create_u32("verbose_ratelimit_burst", mode, dir, + &attr->ratelimit_state.burst)) + goto fail; if (!debugfs_create_bool("task-filter", mode, dir, &attr->task_filter)) goto fail; @@ -222,6 +234,7 @@ struct dentry *fault_create_debugfs_attr(const char *name, #endif /* CONFIG_FAULT_INJECTION_STACKTRACE_FILTER */ + attr->dname = dget(dir); return dir; fail: debugfs_remove_recursive(dir); diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index 4b2443254de260d2e05cdecb8d3eea903d91734f..56badfc4810a8a4e70597ce82a0532929e6e6dda 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -1,8 +1,18 @@ +config PAGE_EXTENSION + bool "Extend memmap on extra space for more information on page" + ---help--- + Extend memmap on extra space for more information on page. This + could be used for debugging features that need to insert extra + field for every page. This extension enables us to save memory + by not allocating this extra memory according to boottime + configuration. + config DEBUG_PAGEALLOC bool "Debug page memory allocations" depends on DEBUG_KERNEL depends on !HIBERNATION || ARCH_SUPPORTS_DEBUG_PAGEALLOC && !PPC && !SPARC depends on !KMEMCHECK + select PAGE_EXTENSION select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC select PAGE_GUARD if ARCH_SUPPORTS_DEBUG_PAGEALLOC ---help--- diff --git a/mm/Makefile b/mm/Makefile index b3c6ce932c64f172b436cd2ec13380180e5891ef..4bf586e663783a9a0e5992a01aacf044f199b2c9 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -63,6 +63,7 @@ obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o +obj-$(CONFIG_PAGE_OWNER) += page_owner.o obj-$(CONFIG_CLEANCACHE) += cleancache.o obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o obj-$(CONFIG_ZPOOL) += zpool.o @@ -71,3 +72,4 @@ obj-$(CONFIG_ZSMALLOC) += zsmalloc.o obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o obj-$(CONFIG_CMA) += cma.o obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o +obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o diff --git a/mm/cma.c b/mm/cma.c index 8e9ec13d31db2cce25384a4dc1a296e2a6fd2ca8..f8917629cbdd43da2bb4d0392f5f9198701e9cd7 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -33,6 +33,7 @@ #include <linux/log2.h> #include <linux/cma.h> #include <linux/highmem.h> +#include <linux/io.h> struct cma { unsigned long base_pfn; @@ -63,6 +64,17 @@ static unsigned long cma_bitmap_aligned_mask(struct cma *cma, int align_order) return (1UL << (align_order - cma->order_per_bit)) - 1; } +static unsigned long cma_bitmap_aligned_offset(struct cma *cma, int align_order) +{ + unsigned int alignment; + + if (align_order <= cma->order_per_bit) + return 0; + alignment = 1UL << (align_order - cma->order_per_bit); + return ALIGN(cma->base_pfn, alignment) - + (cma->base_pfn >> cma->order_per_bit); +} + static unsigned long cma_bitmap_maxno(struct cma *cma) { return cma->count >> cma->order_per_bit; @@ -313,6 +325,11 @@ int __init cma_declare_contiguous(phys_addr_t base, } } + /* + * kmemleak scans/reads tracked objects for pointers to other + * objects but this address isn't mapped and accessible + */ + kmemleak_ignore(phys_to_virt(addr)); base = addr; } @@ -340,7 +357,7 @@ int __init cma_declare_contiguous(phys_addr_t base, */ struct page *cma_alloc(struct cma *cma, int count, unsigned int align) { - unsigned long mask, pfn, start = 0; + unsigned long mask, offset, pfn, start = 0; unsigned long bitmap_maxno, bitmap_no, bitmap_count; struct page *page = NULL; int ret; @@ -355,13 +372,15 @@ struct page *cma_alloc(struct cma *cma, int count, unsigned int align) return NULL; mask = cma_bitmap_aligned_mask(cma, align); + offset = cma_bitmap_aligned_offset(cma, align); bitmap_maxno = cma_bitmap_maxno(cma); bitmap_count = cma_bitmap_pages_to_bits(cma, count); for (;;) { mutex_lock(&cma->lock); - bitmap_no = bitmap_find_next_zero_area(cma->bitmap, - bitmap_maxno, start, bitmap_count, mask); + bitmap_no = bitmap_find_next_zero_area_off(cma->bitmap, + bitmap_maxno, start, bitmap_count, mask, + offset); if (bitmap_no >= bitmap_maxno) { mutex_unlock(&cma->lock); break; diff --git a/mm/debug-pagealloc.c b/mm/debug-pagealloc.c index 789ff70c8a4ab7f84bb81410d3e3982a7cb3f987..5bf5906ce13b7316ccb3ff3d101ff6cff00c6818 100644 --- a/mm/debug-pagealloc.c +++ b/mm/debug-pagealloc.c @@ -2,23 +2,55 @@ #include <linux/string.h> #include <linux/mm.h> #include <linux/highmem.h> -#include <linux/page-debug-flags.h> +#include <linux/page_ext.h> #include <linux/poison.h> #include <linux/ratelimit.h> +static bool page_poisoning_enabled __read_mostly; + +static bool need_page_poisoning(void) +{ + if (!debug_pagealloc_enabled()) + return false; + + return true; +} + +static void init_page_poisoning(void) +{ + if (!debug_pagealloc_enabled()) + return; + + page_poisoning_enabled = true; +} + +struct page_ext_operations page_poisoning_ops = { + .need = need_page_poisoning, + .init = init_page_poisoning, +}; + static inline void set_page_poison(struct page *page) { - __set_bit(PAGE_DEBUG_FLAG_POISON, &page->debug_flags); + struct page_ext *page_ext; + + page_ext = lookup_page_ext(page); + __set_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags); } static inline void clear_page_poison(struct page *page) { - __clear_bit(PAGE_DEBUG_FLAG_POISON, &page->debug_flags); + struct page_ext *page_ext; + + page_ext = lookup_page_ext(page); + __clear_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags); } static inline bool page_poison(struct page *page) { - return test_bit(PAGE_DEBUG_FLAG_POISON, &page->debug_flags); + struct page_ext *page_ext; + + page_ext = lookup_page_ext(page); + return test_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags); } static void poison_page(struct page *page) @@ -93,8 +125,11 @@ static void unpoison_pages(struct page *page, int n) unpoison_page(page + i); } -void kernel_map_pages(struct page *page, int numpages, int enable) +void __kernel_map_pages(struct page *page, int numpages, int enable) { + if (!page_poisoning_enabled) + return; + if (enable) unpoison_pages(page, numpages); else diff --git a/mm/fadvise.c b/mm/fadvise.c index 3bcfd81db45ea1776b169cdc91430c7556dd14f8..2ad7adf4f0a459bcd3be1860d30a6382c8083a40 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -117,7 +117,11 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice) __filemap_fdatawrite_range(mapping, offset, endbyte, WB_SYNC_NONE); - /* First and last FULL page! */ + /* + * First and last FULL page! Partial pages are deliberately + * preserved on the expectation that it is better to preserve + * needed memory than to discard unneeded memory. + */ start_index = (offset+(PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT; end_index = (endbyte >> PAGE_CACHE_SHIFT); diff --git a/mm/filemap.c b/mm/filemap.c index 14b4642279f1260e6bbd6a6b73b0518ee377acd1..e8905bc3cbd7c546dd9c1e586b61abac9d38cc8f 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -62,16 +62,16 @@ /* * Lock ordering: * - * ->i_mmap_mutex (truncate_pagecache) + * ->i_mmap_rwsem (truncate_pagecache) * ->private_lock (__free_pte->__set_page_dirty_buffers) * ->swap_lock (exclusive_swap_page, others) * ->mapping->tree_lock * * ->i_mutex - * ->i_mmap_mutex (truncate->unmap_mapping_range) + * ->i_mmap_rwsem (truncate->unmap_mapping_range) * * ->mmap_sem - * ->i_mmap_mutex + * ->i_mmap_rwsem * ->page_table_lock or pte_lock (various, mainly in memory.c) * ->mapping->tree_lock (arch-dependent flush_dcache_mmap_lock) * @@ -85,7 +85,7 @@ * sb_lock (fs/fs-writeback.c) * ->mapping->tree_lock (__sync_single_inode) * - * ->i_mmap_mutex + * ->i_mmap_rwsem * ->anon_vma.lock (vma_adjust) * * ->anon_vma.lock @@ -105,7 +105,7 @@ * ->inode->i_lock (zap_pte_range->set_page_dirty) * ->private_lock (zap_pte_range->__set_page_dirty_buffers) * - * ->i_mmap_mutex + * ->i_mmap_rwsem * ->tasklist_lock (memory_failure, collect_procs_ao) */ diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index d8d9fe3f685c00c7e7f1430ca924850d89713b07..0d105aeff82fe258f06eaacbb8205ade160436cb 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -155,22 +155,14 @@ xip_file_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos) EXPORT_SYMBOL_GPL(xip_file_read); /* - * __xip_unmap is invoked from xip_unmap and - * xip_write + * __xip_unmap is invoked from xip_unmap and xip_write * * This function walks all vmas of the address_space and unmaps the * __xip_sparse_page when found at pgoff. */ -static void -__xip_unmap (struct address_space * mapping, - unsigned long pgoff) +static void __xip_unmap(struct address_space * mapping, unsigned long pgoff) { struct vm_area_struct *vma; - struct mm_struct *mm; - unsigned long address; - pte_t *pte; - pte_t pteval; - spinlock_t *ptl; struct page *page; unsigned count; int locked = 0; @@ -182,11 +174,14 @@ __xip_unmap (struct address_space * mapping, return; retry: - mutex_lock(&mapping->i_mmap_mutex); + i_mmap_lock_read(mapping); vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { - mm = vma->vm_mm; - address = vma->vm_start + + pte_t *pte, pteval; + spinlock_t *ptl; + struct mm_struct *mm = vma->vm_mm; + unsigned long address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); + BUG_ON(address < vma->vm_start || address >= vma->vm_end); pte = page_check_address(page, mm, address, &ptl, 1); if (pte) { @@ -202,7 +197,7 @@ __xip_unmap (struct address_space * mapping, page_cache_release(page); } } - mutex_unlock(&mapping->i_mmap_mutex); + i_mmap_unlock_read(mapping); if (locked) { mutex_unlock(&xip_sparse_mutex); diff --git a/mm/fremap.c b/mm/fremap.c index 72b8fa36143328f728a44de1b5cd53b629653a0a..11ef7ec40d13b0a9baf6e1bb08cbf4c670cafa96 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -238,13 +238,13 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, } goto out_freed; } - mutex_lock(&mapping->i_mmap_mutex); + i_mmap_lock_write(mapping); flush_dcache_mmap_lock(mapping); vma->vm_flags |= VM_NONLINEAR; vma_interval_tree_remove(vma, &mapping->i_mmap); vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); flush_dcache_mmap_unlock(mapping); - mutex_unlock(&mapping->i_mmap_mutex); + i_mmap_unlock_write(mapping); } if (vma->vm_flags & VM_LOCKED) { diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 919b86a2164d2acacd3c17bf8ce3ddb1bfa1afe4..47f6070d7c46fa5d631c5ceeb64b9a31ab2553cf 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1457,7 +1457,7 @@ int __weak alloc_bootmem_huge_page(struct hstate *h) return 0; found: - BUG_ON((unsigned long)virt_to_phys(m) & (huge_page_size(h) - 1)); + BUG_ON(!IS_ALIGNED(virt_to_phys(m), huge_page_size(h))); /* Put them into a private list first because mem_map is not up yet */ list_add(&m->list, &huge_boot_pages); m->hstate = h; @@ -2083,7 +2083,7 @@ static void hugetlb_register_node(struct node *node) * devices of nodes that have memory. All on-line nodes should have * registered their associated device by this time. */ -static void hugetlb_register_all_nodes(void) +static void __init hugetlb_register_all_nodes(void) { int nid; @@ -2726,9 +2726,9 @@ void __unmap_hugepage_range_final(struct mmu_gather *tlb, * on its way out. We're lucky that the flag has such an appropriate * name, and can in fact be safely cleared here. We could clear it * before the __unmap_hugepage_range above, but all that's necessary - * is to clear it before releasing the i_mmap_mutex. This works + * is to clear it before releasing the i_mmap_rwsem. This works * because in the context this is called, the VMA is about to be - * destroyed and the i_mmap_mutex is held. + * destroyed and the i_mmap_rwsem is held. */ vma->vm_flags &= ~VM_MAYSHARE; } @@ -2774,7 +2774,7 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, * this mapping should be shared between all the VMAs, * __unmap_hugepage_range() is called as the lock is already held */ - mutex_lock(&mapping->i_mmap_mutex); + i_mmap_lock_write(mapping); vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) { /* Do not unmap the current VMA */ if (iter_vma == vma) @@ -2791,7 +2791,7 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, unmap_hugepage_range(iter_vma, address, address + huge_page_size(h), page); } - mutex_unlock(&mapping->i_mmap_mutex); + i_mmap_unlock_write(mapping); } /* @@ -3348,7 +3348,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, flush_cache_range(vma, address, end); mmu_notifier_invalidate_range_start(mm, start, end); - mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex); + i_mmap_lock_write(vma->vm_file->f_mapping); for (; address < end; address += huge_page_size(h)) { spinlock_t *ptl; ptep = huge_pte_offset(mm, address); @@ -3370,13 +3370,13 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, spin_unlock(ptl); } /* - * Must flush TLB before releasing i_mmap_mutex: x86's huge_pmd_unshare + * Must flush TLB before releasing i_mmap_rwsem: x86's huge_pmd_unshare * may have cleared our pud entry and done put_page on the page table: - * once we release i_mmap_mutex, another task can do the final put_page + * once we release i_mmap_rwsem, another task can do the final put_page * and that page table be reused and filled with junk. */ flush_tlb_range(vma, start, end); - mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); + i_mmap_unlock_write(vma->vm_file->f_mapping); mmu_notifier_invalidate_range_end(mm, start, end); return pages << h->order; @@ -3525,7 +3525,7 @@ static int vma_shareable(struct vm_area_struct *vma, unsigned long addr) * and returns the corresponding pte. While this is not necessary for the * !shared pmd case because we can allocate the pmd later as well, it makes the * code much cleaner. pmd allocation is essential for the shared case because - * pud has to be populated inside the same i_mmap_mutex section - otherwise + * pud has to be populated inside the same i_mmap_rwsem section - otherwise * racing tasks could either miss the sharing (see huge_pte_offset) or select a * bad pmd for sharing. */ @@ -3544,7 +3544,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) if (!vma_shareable(vma, addr)) return (pte_t *)pmd_alloc(mm, pud, addr); - mutex_lock(&mapping->i_mmap_mutex); + i_mmap_lock_write(mapping); vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) { if (svma == vma) continue; @@ -3572,7 +3572,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) spin_unlock(ptl); out: pte = (pte_t *)pmd_alloc(mm, pud, addr); - mutex_unlock(&mapping->i_mmap_mutex); + i_mmap_unlock_write(mapping); return pte; } diff --git a/mm/memblock.c b/mm/memblock.c index 6ecb0d937fb5f9bde4015d20b620484f5c965f84..252b77bdf65ef9ca65334e418e3ea231a3afb90b 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -715,16 +715,13 @@ int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) } /** - * memblock_mark_hotplug - Mark hotpluggable memory with flag MEMBLOCK_HOTPLUG. - * @base: the base phys addr of the region - * @size: the size of the region * - * This function isolates region [@base, @base + @size), and mark it with flag - * MEMBLOCK_HOTPLUG. + * This function isolates region [@base, @base + @size), and sets/clears flag * * Return 0 on succees, -errno on failure. */ -int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size) +static int __init_memblock memblock_setclr_flag(phys_addr_t base, + phys_addr_t size, int set, int flag) { struct memblock_type *type = &memblock.memory; int i, ret, start_rgn, end_rgn; @@ -734,37 +731,37 @@ int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size) return ret; for (i = start_rgn; i < end_rgn; i++) - memblock_set_region_flags(&type->regions[i], MEMBLOCK_HOTPLUG); + if (set) + memblock_set_region_flags(&type->regions[i], flag); + else + memblock_clear_region_flags(&type->regions[i], flag); memblock_merge_regions(type); return 0; } /** - * memblock_clear_hotplug - Clear flag MEMBLOCK_HOTPLUG for a specified region. + * memblock_mark_hotplug - Mark hotpluggable memory with flag MEMBLOCK_HOTPLUG. * @base: the base phys addr of the region * @size: the size of the region * - * This function isolates region [@base, @base + @size), and clear flag - * MEMBLOCK_HOTPLUG for the isolated regions. + * Return 0 on succees, -errno on failure. + */ +int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size) +{ + return memblock_setclr_flag(base, size, 1, MEMBLOCK_HOTPLUG); +} + +/** + * memblock_clear_hotplug - Clear flag MEMBLOCK_HOTPLUG for a specified region. + * @base: the base phys addr of the region + * @size: the size of the region * * Return 0 on succees, -errno on failure. */ int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size) { - struct memblock_type *type = &memblock.memory; - int i, ret, start_rgn, end_rgn; - - ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn); - if (ret) - return ret; - - for (i = start_rgn; i < end_rgn; i++) - memblock_clear_region_flags(&type->regions[i], - MEMBLOCK_HOTPLUG); - - memblock_merge_regions(type); - return 0; + return memblock_setclr_flag(base, size, 0, MEMBLOCK_HOTPLUG); } /** diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 85df503ec02347f14b9703c7318d8e67accdd8fb..ef91e856c7e456a0674e7b76e15cbd771a6c9acb 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -296,7 +296,6 @@ struct mem_cgroup { * Should the accounting and control be hierarchical, per subtree? */ bool use_hierarchy; - unsigned long kmem_account_flags; /* See KMEM_ACCOUNTED_*, below */ bool oom_lock; atomic_t under_oom; @@ -366,22 +365,11 @@ struct mem_cgroup { /* WARNING: nodeinfo must be the last member here */ }; -/* internal only representation about the status of kmem accounting. */ -enum { - KMEM_ACCOUNTED_ACTIVE, /* accounted by this cgroup itself */ -}; - #ifdef CONFIG_MEMCG_KMEM -static inline void memcg_kmem_set_active(struct mem_cgroup *memcg) -{ - set_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags); -} - static bool memcg_kmem_is_active(struct mem_cgroup *memcg) { - return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags); + return memcg->kmemcg_id >= 0; } - #endif /* Stuffs for move charges at task migration. */ @@ -1571,7 +1559,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, * select it. The goal is to allow it to allocate so that it may * quickly exit and free its memory. */ - if (fatal_signal_pending(current) || current->flags & PF_EXITING) { + if (fatal_signal_pending(current) || task_will_free_mem(current)) { set_thread_flag(TIF_MEMDIE); return; } @@ -1628,6 +1616,8 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, NULL, "Memory cgroup out of memory"); } +#if MAX_NUMNODES > 1 + /** * test_mem_cgroup_node_reclaimable * @memcg: the target memcg @@ -1650,7 +1640,6 @@ static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg, return false; } -#if MAX_NUMNODES > 1 /* * Always updating the nodemask is not very good - even if we have an empty @@ -2646,7 +2635,6 @@ static void memcg_register_cache(struct mem_cgroup *memcg, if (!cachep) return; - css_get(&memcg->css); list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches); /* @@ -2680,40 +2668,6 @@ static void memcg_unregister_cache(struct kmem_cache *cachep) list_del(&cachep->memcg_params->list); kmem_cache_destroy(cachep); - - /* drop the reference taken in memcg_register_cache */ - css_put(&memcg->css); -} - -/* - * During the creation a new cache, we need to disable our accounting mechanism - * altogether. This is true even if we are not creating, but rather just - * enqueing new caches to be created. - * - * This is because that process will trigger allocations; some visible, like - * explicit kmallocs to auxiliary data structures, name strings and internal - * cache structures; some well concealed, like INIT_WORK() that can allocate - * objects during debug. - * - * If any allocation happens during memcg_kmem_get_cache, we will recurse back - * to it. This may not be a bounded recursion: since the first cache creation - * failed to complete (waiting on the allocation), we'll just try to create the - * cache again, failing at the same point. - * - * memcg_kmem_get_cache is prepared to abort after seeing a positive count of - * memcg_kmem_skip_account. So we enclose anything that might allocate memory - * inside the following two functions. - */ -static inline void memcg_stop_kmem_account(void) -{ - VM_BUG_ON(!current->mm); - current->memcg_kmem_skip_account++; -} - -static inline void memcg_resume_kmem_account(void) -{ - VM_BUG_ON(!current->mm); - current->memcg_kmem_skip_account--; } int __memcg_cleanup_cache_params(struct kmem_cache *s) @@ -2747,9 +2701,7 @@ static void memcg_unregister_all_caches(struct mem_cgroup *memcg) mutex_lock(&memcg_slab_mutex); list_for_each_entry_safe(params, tmp, &memcg->memcg_slab_caches, list) { cachep = memcg_params_to_cache(params); - kmem_cache_shrink(cachep); - if (atomic_read(&cachep->memcg_params->nr_pages) == 0) - memcg_unregister_cache(cachep); + memcg_unregister_cache(cachep); } mutex_unlock(&memcg_slab_mutex); } @@ -2784,10 +2736,10 @@ static void __memcg_schedule_register_cache(struct mem_cgroup *memcg, struct memcg_register_cache_work *cw; cw = kmalloc(sizeof(*cw), GFP_NOWAIT); - if (cw == NULL) { - css_put(&memcg->css); + if (!cw) return; - } + + css_get(&memcg->css); cw->memcg = memcg; cw->cachep = cachep; @@ -2810,20 +2762,16 @@ static void memcg_schedule_register_cache(struct mem_cgroup *memcg, * this point we can't allow ourselves back into memcg_kmem_get_cache, * the safest choice is to do it like this, wrapping the whole function. */ - memcg_stop_kmem_account(); + current->memcg_kmem_skip_account = 1; __memcg_schedule_register_cache(memcg, cachep); - memcg_resume_kmem_account(); + current->memcg_kmem_skip_account = 0; } int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order) { unsigned int nr_pages = 1 << order; - int res; - res = memcg_charge_kmem(cachep->memcg_params->memcg, gfp, nr_pages); - if (!res) - atomic_add(nr_pages, &cachep->memcg_params->nr_pages); - return res; + return memcg_charge_kmem(cachep->memcg_params->memcg, gfp, nr_pages); } void __memcg_uncharge_slab(struct kmem_cache *cachep, int order) @@ -2831,7 +2779,6 @@ void __memcg_uncharge_slab(struct kmem_cache *cachep, int order) unsigned int nr_pages = 1 << order; memcg_uncharge_kmem(cachep->memcg_params->memcg, nr_pages); - atomic_sub(nr_pages, &cachep->memcg_params->nr_pages); } /* @@ -2847,8 +2794,7 @@ void __memcg_uncharge_slab(struct kmem_cache *cachep, int order) * Can't be called in interrupt context or from kernel threads. * This function needs to be called with rcu_read_lock() held. */ -struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, - gfp_t gfp) +struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep) { struct mem_cgroup *memcg; struct kmem_cache *memcg_cachep; @@ -2856,25 +2802,16 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, VM_BUG_ON(!cachep->memcg_params); VM_BUG_ON(!cachep->memcg_params->is_root_cache); - if (!current->mm || current->memcg_kmem_skip_account) + if (current->memcg_kmem_skip_account) return cachep; - rcu_read_lock(); - memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner)); - + memcg = get_mem_cgroup_from_mm(current->mm); if (!memcg_kmem_is_active(memcg)) goto out; memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg)); - if (likely(memcg_cachep)) { - cachep = memcg_cachep; - goto out; - } - - /* The corresponding put will be done in the workqueue. */ - if (!css_tryget_online(&memcg->css)) - goto out; - rcu_read_unlock(); + if (likely(memcg_cachep)) + return memcg_cachep; /* * If we are in a safe context (can wait, and not in interrupt @@ -2889,12 +2826,17 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, * defer everything. */ memcg_schedule_register_cache(memcg, cachep); - return cachep; out: - rcu_read_unlock(); + css_put(&memcg->css); return cachep; } +void __memcg_kmem_put_cache(struct kmem_cache *cachep) +{ + if (!is_root_cache(cachep)) + css_put(&cachep->memcg_params->memcg->css); +} + /* * We need to verify if the allocation against current->mm->owner's memcg is * possible for the given order. But the page is not allocated yet, so we'll @@ -2917,34 +2859,6 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) *_memcg = NULL; - /* - * Disabling accounting is only relevant for some specific memcg - * internal allocations. Therefore we would initially not have such - * check here, since direct calls to the page allocator that are - * accounted to kmemcg (alloc_kmem_pages and friends) only happen - * outside memcg core. We are mostly concerned with cache allocations, - * and by having this test at memcg_kmem_get_cache, we are already able - * to relay the allocation to the root cache and bypass the memcg cache - * altogether. - * - * There is one exception, though: the SLUB allocator does not create - * large order caches, but rather service large kmallocs directly from - * the page allocator. Therefore, the following sequence when backed by - * the SLUB allocator: - * - * memcg_stop_kmem_account(); - * kmalloc(<large_number>) - * memcg_resume_kmem_account(); - * - * would effectively ignore the fact that we should skip accounting, - * since it will drive us directly to this function without passing - * through the cache selector memcg_kmem_get_cache. Such large - * allocations are extremely rare but can happen, for instance, for the - * cache arrays. We bring this test here. - */ - if (!current->mm || current->memcg_kmem_skip_account) - return true; - memcg = get_mem_cgroup_from_mm(current->mm); if (!memcg_kmem_is_active(memcg)) { @@ -2985,10 +2899,6 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order) memcg_uncharge_kmem(memcg, 1 << order); page->mem_cgroup = NULL; } -#else -static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg) -{ -} #endif /* CONFIG_MEMCG_KMEM */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -3538,12 +3448,6 @@ static int memcg_activate_kmem(struct mem_cgroup *memcg, if (memcg_kmem_is_active(memcg)) return 0; - /* - * We are going to allocate memory for data shared by all memory - * cgroups so let's stop accounting here. - */ - memcg_stop_kmem_account(); - /* * For simplicity, we won't allow this to be disabled. It also can't * be changed if the cgroup has children already, or if tasks had @@ -3570,25 +3474,22 @@ static int memcg_activate_kmem(struct mem_cgroup *memcg, goto out; } - memcg->kmemcg_id = memcg_id; - INIT_LIST_HEAD(&memcg->memcg_slab_caches); - /* - * We couldn't have accounted to this cgroup, because it hasn't got the - * active bit set yet, so this should succeed. + * We couldn't have accounted to this cgroup, because it hasn't got + * activated yet, so this should succeed. */ err = page_counter_limit(&memcg->kmem, nr_pages); VM_BUG_ON(err); static_key_slow_inc(&memcg_kmem_enabled_key); /* - * Setting the active bit after enabling static branching will + * A memory cgroup is considered kmem-active as soon as it gets + * kmemcg_id. Setting the id after enabling static branching will * guarantee no one starts accounting before all call sites are * patched. */ - memcg_kmem_set_active(memcg); + memcg->kmemcg_id = memcg_id; out: - memcg_resume_kmem_account(); return err; } @@ -3791,11 +3692,6 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v) } #endif /* CONFIG_NUMA */ -static inline void mem_cgroup_lru_names_not_uptodate(void) -{ - BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); -} - static int memcg_stat_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); @@ -3803,6 +3699,8 @@ static int memcg_stat_show(struct seq_file *m, void *v) struct mem_cgroup *mi; unsigned int i; + BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); + for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) continue; @@ -4259,7 +4157,6 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) { int ret; - memcg->kmemcg_id = -1; ret = memcg_propagate_kmem(memcg); if (ret) return ret; @@ -4269,6 +4166,7 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) static void memcg_destroy_kmem(struct mem_cgroup *memcg) { + memcg_unregister_all_caches(memcg); mem_cgroup_sockets_destroy(memcg); } #else @@ -4724,17 +4622,6 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) free_percpu(memcg->stat); - /* - * We need to make sure that (at least for now), the jump label - * destruction code runs outside of the cgroup lock. This is because - * get_online_cpus(), which is called from the static_branch update, - * can't be called inside the cgroup_lock. cpusets are the ones - * enforcing this dependency, so if they ever change, we might as well. - * - * schedule_work() will guarantee this happens. Be careful if you need - * to move this code around, and make sure it is outside - * the cgroup_lock. - */ disarm_static_keys(memcg); kfree(memcg); } @@ -4804,6 +4691,10 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) vmpressure_init(&memcg->vmpressure); INIT_LIST_HEAD(&memcg->event_list); spin_lock_init(&memcg->event_list_lock); +#ifdef CONFIG_MEMCG_KMEM + memcg->kmemcg_id = -1; + INIT_LIST_HEAD(&memcg->memcg_slab_caches); +#endif return &memcg->css; @@ -4885,7 +4776,6 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) } spin_unlock(&memcg->event_list_lock); - memcg_unregister_all_caches(memcg); vmpressure_cleanup(&memcg->vmpressure); } diff --git a/mm/memory-failure.c b/mm/memory-failure.c index e5ee0ca7ae854610fc2335d46dd46b99eadf707f..feb803bf344364132e92bc10d575ba26bdd0b45d 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -239,19 +239,14 @@ void shake_page(struct page *p, int access) } /* - * Only call shrink_slab here (which would also shrink other caches) if - * access is not potentially fatal. + * Only call shrink_node_slabs here (which would also shrink + * other caches) if access is not potentially fatal. */ if (access) { int nr; int nid = page_to_nid(p); do { - struct shrink_control shrink = { - .gfp_mask = GFP_KERNEL, - }; - node_set(nid, shrink.nodes_to_scan); - - nr = shrink_slab(&shrink, 1000, 1000); + nr = shrink_node_slabs(GFP_KERNEL, nid, 1000, 1000); if (page_count(p) == 1) break; } while (nr > 10); @@ -466,7 +461,7 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, struct task_struct *tsk; struct address_space *mapping = page->mapping; - mutex_lock(&mapping->i_mmap_mutex); + i_mmap_lock_read(mapping); read_lock(&tasklist_lock); for_each_process(tsk) { pgoff_t pgoff = page_to_pgoff(page); @@ -488,7 +483,7 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, } } read_unlock(&tasklist_lock); - mutex_unlock(&mapping->i_mmap_mutex); + i_mmap_unlock_read(mapping); } /* diff --git a/mm/memory.c b/mm/memory.c index 4b5a282e110739d957cd810c02179d63970ce911..fbf74112de5bfa39496ae491bd39ea9ef38ae00d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1326,9 +1326,9 @@ static void unmap_single_vma(struct mmu_gather *tlb, * safe to do nothing in this case. */ if (vma->vm_file) { - mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex); + i_mmap_lock_write(vma->vm_file->f_mapping); __unmap_hugepage_range_final(tlb, vma, start, end, NULL); - mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); + i_mmap_unlock_write(vma->vm_file->f_mapping); } } else unmap_page_range(tlb, vma, start, end, details); @@ -2377,12 +2377,12 @@ void unmap_mapping_range(struct address_space *mapping, details.last_index = ULONG_MAX; - mutex_lock(&mapping->i_mmap_mutex); + i_mmap_lock_read(mapping); if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap))) unmap_mapping_range_tree(&mapping->i_mmap, &details); if (unlikely(!list_empty(&mapping->i_mmap_nonlinear))) unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details); - mutex_unlock(&mapping->i_mmap_mutex); + i_mmap_unlock_read(mapping); } EXPORT_SYMBOL(unmap_mapping_range); @@ -3365,6 +3365,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, return ret; } +EXPORT_SYMBOL_GPL(handle_mm_fault); #ifndef __PAGETABLE_PUD_FOLDED /* diff --git a/mm/migrate.c b/mm/migrate.c index 01439953abf548690ac17f1b994511b587e9326a..253474c22239a33b4f678d3f8a4dfd2a171252fb 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -746,7 +746,7 @@ static int fallback_migrate_page(struct address_space *mapping, * MIGRATEPAGE_SUCCESS - success */ static int move_to_new_page(struct page *newpage, struct page *page, - int remap_swapcache, enum migrate_mode mode) + int page_was_mapped, enum migrate_mode mode) { struct address_space *mapping; int rc; @@ -784,7 +784,7 @@ static int move_to_new_page(struct page *newpage, struct page *page, newpage->mapping = NULL; } else { mem_cgroup_migrate(page, newpage, false); - if (remap_swapcache) + if (page_was_mapped) remove_migration_ptes(page, newpage); page->mapping = NULL; } @@ -798,7 +798,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, int force, enum migrate_mode mode) { int rc = -EAGAIN; - int remap_swapcache = 1; + int page_was_mapped = 0; struct anon_vma *anon_vma = NULL; if (!trylock_page(page)) { @@ -870,7 +870,6 @@ static int __unmap_and_move(struct page *page, struct page *newpage, * migrated but are not remapped when migration * completes */ - remap_swapcache = 0; } else { goto out_unlock; } @@ -910,13 +909,17 @@ static int __unmap_and_move(struct page *page, struct page *newpage, } /* Establish migration ptes or remove ptes */ - try_to_unmap(page, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); + if (page_mapped(page)) { + try_to_unmap(page, + TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); + page_was_mapped = 1; + } skip_unmap: if (!page_mapped(page)) - rc = move_to_new_page(newpage, page, remap_swapcache, mode); + rc = move_to_new_page(newpage, page, page_was_mapped, mode); - if (rc && remap_swapcache) + if (rc && page_was_mapped) remove_migration_ptes(page, page); /* Drop an anon_vma reference if we took one */ @@ -1017,6 +1020,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, { int rc = 0; int *result = NULL; + int page_was_mapped = 0; struct page *new_hpage; struct anon_vma *anon_vma = NULL; @@ -1047,12 +1051,16 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, if (PageAnon(hpage)) anon_vma = page_get_anon_vma(hpage); - try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); + if (page_mapped(hpage)) { + try_to_unmap(hpage, + TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); + page_was_mapped = 1; + } if (!page_mapped(hpage)) - rc = move_to_new_page(new_hpage, hpage, 1, mode); + rc = move_to_new_page(new_hpage, hpage, page_was_mapped, mode); - if (rc != MIGRATEPAGE_SUCCESS) + if (rc != MIGRATEPAGE_SUCCESS && page_was_mapped) remove_migration_ptes(hpage, hpage); if (anon_vma) diff --git a/mm/mincore.c b/mm/mincore.c index 725c809610483c6822d3ffa158820d3ff64da1a6..c8c528b3664195a5373dbe5d1608e7b958a59f75 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -137,8 +137,11 @@ static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd, } else { /* pte is a swap entry */ swp_entry_t entry = pte_to_swp_entry(pte); - if (is_migration_entry(entry)) { - /* migration entries are always uptodate */ + if (non_swap_entry(entry)) { + /* + * migration or hwpoison entries are always + * uptodate + */ *vec = 1; } else { #ifdef CONFIG_SWAP diff --git a/mm/mmap.c b/mm/mmap.c index b6c0a77fc1c8260c700fa001a5e2b912f4b80db5..7b36aa7cc89a43c7c5909b7799b77d13106a2929 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -232,7 +232,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) } /* - * Requires inode->i_mapping->i_mmap_mutex + * Requires inode->i_mapping->i_mmap_rwsem */ static void __remove_shared_vm_struct(struct vm_area_struct *vma, struct file *file, struct address_space *mapping) @@ -260,9 +260,9 @@ void unlink_file_vma(struct vm_area_struct *vma) if (file) { struct address_space *mapping = file->f_mapping; - mutex_lock(&mapping->i_mmap_mutex); + i_mmap_lock_write(mapping); __remove_shared_vm_struct(vma, file, mapping); - mutex_unlock(&mapping->i_mmap_mutex); + i_mmap_unlock_write(mapping); } } @@ -674,14 +674,14 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, if (vma->vm_file) { mapping = vma->vm_file->f_mapping; - mutex_lock(&mapping->i_mmap_mutex); + i_mmap_lock_write(mapping); } __vma_link(mm, vma, prev, rb_link, rb_parent); __vma_link_file(vma); if (mapping) - mutex_unlock(&mapping->i_mmap_mutex); + i_mmap_unlock_write(mapping); mm->map_count++; validate_mm(mm); @@ -796,7 +796,7 @@ again: remove_next = 1 + (end > next->vm_end); next->vm_end); } - mutex_lock(&mapping->i_mmap_mutex); + i_mmap_lock_write(mapping); if (insert) { /* * Put into interval tree now, so instantiated pages @@ -883,7 +883,7 @@ again: remove_next = 1 + (end > next->vm_end); anon_vma_unlock_write(anon_vma); } if (mapping) - mutex_unlock(&mapping->i_mmap_mutex); + i_mmap_unlock_write(mapping); if (root) { uprobe_mmap(vma); @@ -2362,6 +2362,8 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr) } #endif +EXPORT_SYMBOL_GPL(find_extend_vma); + /* * Ok - we have the memory areas we should free on the vma list, * so release them, and do the vma updates. @@ -2791,7 +2793,7 @@ void exit_mmap(struct mm_struct *mm) /* Insert vm structure into process list sorted by address * and into the inode's i_mmap tree. If vm_file is non-NULL - * then i_mmap_mutex is taken here. + * then i_mmap_rwsem is taken here. */ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) { @@ -3086,7 +3088,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) */ if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags)) BUG(); - mutex_lock_nest_lock(&mapping->i_mmap_mutex, &mm->mmap_sem); + down_write_nest_lock(&mapping->i_mmap_rwsem, &mm->mmap_sem); } } @@ -3113,7 +3115,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) * vma in this mm is backed by the same anon_vma or address_space. * * We can take all the locks in random order because the VM code - * taking i_mmap_mutex or anon_vma->rwsem outside the mmap_sem never + * taking i_mmap_rwsem or anon_vma->rwsem outside the mmap_sem never * takes more than one of them in a row. Secondly we're protected * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex. * @@ -3182,7 +3184,7 @@ static void vm_unlock_mapping(struct address_space *mapping) * AS_MM_ALL_LOCKS can't change to 0 from under us * because we hold the mm_all_locks_mutex. */ - mutex_unlock(&mapping->i_mmap_mutex); + i_mmap_unlock_write(mapping); if (!test_and_clear_bit(AS_MM_ALL_LOCKS, &mapping->flags)) BUG(); diff --git a/mm/mremap.c b/mm/mremap.c index b147f66f4c40f8a639def879df125f48e8c5595c..84aa36f9f30886358d048aa3240eae36b9e72d0e 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -99,7 +99,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, spinlock_t *old_ptl, *new_ptl; /* - * When need_rmap_locks is true, we take the i_mmap_mutex and anon_vma + * When need_rmap_locks is true, we take the i_mmap_rwsem and anon_vma * locks to ensure that rmap will always observe either the old or the * new ptes. This is the easiest way to avoid races with * truncate_pagecache(), page migration, etc... @@ -119,7 +119,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, if (need_rmap_locks) { if (vma->vm_file) { mapping = vma->vm_file->f_mapping; - mutex_lock(&mapping->i_mmap_mutex); + i_mmap_lock_write(mapping); } if (vma->anon_vma) { anon_vma = vma->anon_vma; @@ -156,7 +156,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, if (anon_vma) anon_vma_unlock_write(anon_vma); if (mapping) - mutex_unlock(&mapping->i_mmap_mutex); + i_mmap_unlock_write(mapping); } #define LATENCY_LIMIT (64 * PAGE_SIZE) diff --git a/mm/nommu.c b/mm/nommu.c index bd1808e194a7f36a2e1cc418c51ea85dffc713fc..b51eadf6d9528fa69ea80ad6d1c994763656e8da 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -722,11 +722,11 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) if (vma->vm_file) { mapping = vma->vm_file->f_mapping; - mutex_lock(&mapping->i_mmap_mutex); + i_mmap_lock_write(mapping); flush_dcache_mmap_lock(mapping); vma_interval_tree_insert(vma, &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); - mutex_unlock(&mapping->i_mmap_mutex); + i_mmap_unlock_write(mapping); } /* add the VMA to the tree */ @@ -795,11 +795,11 @@ static void delete_vma_from_mm(struct vm_area_struct *vma) if (vma->vm_file) { mapping = vma->vm_file->f_mapping; - mutex_lock(&mapping->i_mmap_mutex); + i_mmap_lock_write(mapping); flush_dcache_mmap_lock(mapping); vma_interval_tree_remove(vma, &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); - mutex_unlock(&mapping->i_mmap_mutex); + i_mmap_unlock_write(mapping); } /* remove from the MM's tree and list */ @@ -1149,8 +1149,7 @@ static int do_mmap_private(struct vm_area_struct *vma, unsigned long len, unsigned long capabilities) { - struct page *pages; - unsigned long total, point, n; + unsigned long total, point; void *base; int ret, order; @@ -1182,33 +1181,23 @@ static int do_mmap_private(struct vm_area_struct *vma, order = get_order(len); kdebug("alloc order %d for %lx", order, len); - pages = alloc_pages(GFP_KERNEL, order); - if (!pages) - goto enomem; - total = 1 << order; - atomic_long_add(total, &mmap_pages_allocated); - point = len >> PAGE_SHIFT; - /* we allocated a power-of-2 sized page set, so we may want to trim off - * the excess */ + /* we don't want to allocate a power-of-2 sized page set */ if (sysctl_nr_trim_pages && total - point >= sysctl_nr_trim_pages) { - while (total > point) { - order = ilog2(total - point); - n = 1 << order; - kdebug("shave %lu/%lu @%lu", n, total - point, total); - atomic_long_sub(n, &mmap_pages_allocated); - total -= n; - set_page_refcounted(pages + total); - __free_pages(pages + total, order); - } + total = point; + kdebug("try to alloc exact %lu pages", total); + base = alloc_pages_exact(len, GFP_KERNEL); + } else { + base = (void *)__get_free_pages(GFP_KERNEL, order); } - for (point = 1; point < total; point++) - set_page_refcounted(&pages[point]); + if (!base) + goto enomem; + + atomic_long_add(total, &mmap_pages_allocated); - base = page_address(pages); region->vm_flags = vma->vm_flags |= VM_MAPPED_COPY; region->vm_start = (unsigned long) base; region->vm_end = region->vm_start + len; @@ -2094,14 +2083,14 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size, high = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; down_write(&nommu_region_sem); - mutex_lock(&inode->i_mapping->i_mmap_mutex); + i_mmap_lock_read(inode->i_mapping); /* search for VMAs that fall within the dead zone */ vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, low, high) { /* found one - only interested if it's shared out of the page * cache */ if (vma->vm_flags & VM_SHARED) { - mutex_unlock(&inode->i_mapping->i_mmap_mutex); + i_mmap_unlock_read(inode->i_mapping); up_write(&nommu_region_sem); return -ETXTBSY; /* not quite true, but near enough */ } @@ -2113,8 +2102,7 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size, * we don't check for any regions that start beyond the EOF as there * shouldn't be any */ - vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, - 0, ULONG_MAX) { + vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, 0, ULONG_MAX) { if (!(vma->vm_flags & VM_SHARED)) continue; @@ -2129,7 +2117,7 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size, } } - mutex_unlock(&inode->i_mapping->i_mmap_mutex); + i_mmap_unlock_read(inode->i_mapping); up_write(&nommu_region_sem); return 0; } diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 864bba99273532d0b10d4a6f9a80053bf8ac8784..d503e9ce1c7b7f82b1fa10918c34d933b23e3b01 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -281,14 +281,9 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task, if (oom_task_origin(task)) return OOM_SCAN_SELECT; - if (task->flags & PF_EXITING && !force_kill) { - /* - * If this task is not being ptraced on exit, then wait for it - * to finish before killing some other task unnecessarily. - */ - if (!(task->group_leader->ptrace & PT_TRACE_EXIT)) - return OOM_SCAN_ABORT; - } + if (task_will_free_mem(task) && !force_kill) + return OOM_SCAN_ABORT; + return OOM_SCAN_OK; } @@ -443,7 +438,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, * If the task is already exiting, don't alarm the sysadmin or kill * its children or threads, just set TIF_MEMDIE so it can die quickly */ - if (p->flags & PF_EXITING) { + if (task_will_free_mem(p)) { set_tsk_thread_flag(p, TIF_MEMDIE); put_task_struct(p); return; @@ -649,7 +644,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, * select it. The goal is to allow it to allocate so that it may * quickly exit and free its memory. */ - if (fatal_signal_pending(current) || current->flags & PF_EXITING) { + if (fatal_signal_pending(current) || task_will_free_mem(current)) { set_thread_flag(TIF_MEMDIE); return; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index df542feaac3bb6ce749c408cca37623af9ad19bf..fa974d87f60df7a614980148500da6dedb696cd8 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -48,6 +48,7 @@ #include <linux/backing-dev.h> #include <linux/fault-inject.h> #include <linux/page-isolation.h> +#include <linux/page_ext.h> #include <linux/debugobjects.h> #include <linux/kmemleak.h> #include <linux/compaction.h> @@ -55,9 +56,10 @@ #include <linux/prefetch.h> #include <linux/mm_inline.h> #include <linux/migrate.h> -#include <linux/page-debug-flags.h> +#include <linux/page_ext.h> #include <linux/hugetlb.h> #include <linux/sched/rt.h> +#include <linux/page_owner.h> #include <asm/sections.h> #include <asm/tlbflush.h> @@ -424,6 +426,42 @@ static inline void prep_zero_page(struct page *page, unsigned int order, #ifdef CONFIG_DEBUG_PAGEALLOC unsigned int _debug_guardpage_minorder; +bool _debug_pagealloc_enabled __read_mostly; +bool _debug_guardpage_enabled __read_mostly; + +static int __init early_debug_pagealloc(char *buf) +{ + if (!buf) + return -EINVAL; + + if (strcmp(buf, "on") == 0) + _debug_pagealloc_enabled = true; + + return 0; +} +early_param("debug_pagealloc", early_debug_pagealloc); + +static bool need_debug_guardpage(void) +{ + /* If we don't use debug_pagealloc, we don't need guard page */ + if (!debug_pagealloc_enabled()) + return false; + + return true; +} + +static void init_debug_guardpage(void) +{ + if (!debug_pagealloc_enabled()) + return; + + _debug_guardpage_enabled = true; +} + +struct page_ext_operations debug_guardpage_ops = { + .need = need_debug_guardpage, + .init = init_debug_guardpage, +}; static int __init debug_guardpage_minorder_setup(char *buf) { @@ -439,18 +477,44 @@ static int __init debug_guardpage_minorder_setup(char *buf) } __setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup); -static inline void set_page_guard_flag(struct page *page) +static inline void set_page_guard(struct zone *zone, struct page *page, + unsigned int order, int migratetype) { - __set_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); + struct page_ext *page_ext; + + if (!debug_guardpage_enabled()) + return; + + page_ext = lookup_page_ext(page); + __set_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags); + + INIT_LIST_HEAD(&page->lru); + set_page_private(page, order); + /* Guard pages are not available for any usage */ + __mod_zone_freepage_state(zone, -(1 << order), migratetype); } -static inline void clear_page_guard_flag(struct page *page) +static inline void clear_page_guard(struct zone *zone, struct page *page, + unsigned int order, int migratetype) { - __clear_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); + struct page_ext *page_ext; + + if (!debug_guardpage_enabled()) + return; + + page_ext = lookup_page_ext(page); + __clear_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags); + + set_page_private(page, 0); + if (!is_migrate_isolate(migratetype)) + __mod_zone_freepage_state(zone, (1 << order), migratetype); } #else -static inline void set_page_guard_flag(struct page *page) { } -static inline void clear_page_guard_flag(struct page *page) { } +struct page_ext_operations debug_guardpage_ops = { NULL, }; +static inline void set_page_guard(struct zone *zone, struct page *page, + unsigned int order, int migratetype) {} +static inline void clear_page_guard(struct zone *zone, struct page *page, + unsigned int order, int migratetype) {} #endif static inline void set_page_order(struct page *page, unsigned int order) @@ -581,12 +645,7 @@ static inline void __free_one_page(struct page *page, * merge with it and move up one order. */ if (page_is_guard(buddy)) { - clear_page_guard_flag(buddy); - set_page_private(buddy, 0); - if (!is_migrate_isolate(migratetype)) { - __mod_zone_freepage_state(zone, 1 << order, - migratetype); - } + clear_page_guard(zone, buddy, order, migratetype); } else { list_del(&buddy->lru); zone->free_area[order].nr_free--; @@ -755,6 +814,8 @@ static bool free_pages_prepare(struct page *page, unsigned int order) if (bad) return false; + reset_page_owner(page, order); + if (!PageHighMem(page)) { debug_check_no_locks_freed(page_address(page), PAGE_SIZE << order); @@ -861,23 +922,18 @@ static inline void expand(struct zone *zone, struct page *page, size >>= 1; VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]); -#ifdef CONFIG_DEBUG_PAGEALLOC - if (high < debug_guardpage_minorder()) { + if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC) && + debug_guardpage_enabled() && + high < debug_guardpage_minorder()) { /* * Mark as guard pages (or page), that will allow to * merge back to allocator when buddy will be freed. * Corresponding page table entries will not be touched, * pages will stay not present in virtual address space */ - INIT_LIST_HEAD(&page[size].lru); - set_page_guard_flag(&page[size]); - set_page_private(&page[size], high); - /* Guard pages are not available for any usage */ - __mod_zone_freepage_state(zone, -(1 << high), - migratetype); + set_page_guard(zone, &page[size], high, migratetype); continue; } -#endif list_add(&page[size].lru, &area->free_list[migratetype]); area->nr_free++; set_page_order(&page[size], high); @@ -935,6 +991,8 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags) if (order && (gfp_flags & __GFP_COMP)) prep_compound_page(page, order); + set_page_owner(page, order, gfp_flags); + return 0; } @@ -1507,8 +1565,11 @@ void split_page(struct page *page, unsigned int order) split_page(virt_to_page(page[0].shadow), order); #endif - for (i = 1; i < (1 << order); i++) + set_page_owner(page, 0, 0); + for (i = 1; i < (1 << order); i++) { set_page_refcounted(page + i); + set_page_owner(page + i, 0, 0); + } } EXPORT_SYMBOL_GPL(split_page); @@ -1548,6 +1609,7 @@ int __isolate_free_page(struct page *page, unsigned int order) } } + set_page_owner(page, order, 0); return 1UL << order; } @@ -4856,6 +4918,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, #endif init_waitqueue_head(&pgdat->kswapd_wait); init_waitqueue_head(&pgdat->pfmemalloc_wait); + pgdat_page_ext_init(pgdat); for (j = 0; j < MAX_NR_ZONES; j++) { struct zone *zone = pgdat->node_zones + j; @@ -4874,16 +4937,18 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, * and per-cpu initialisations */ memmap_pages = calc_memmap_size(size, realsize); - if (freesize >= memmap_pages) { - freesize -= memmap_pages; - if (memmap_pages) - printk(KERN_DEBUG - " %s zone: %lu pages used for memmap\n", - zone_names[j], memmap_pages); - } else - printk(KERN_WARNING - " %s zone: %lu pages exceeds freesize %lu\n", - zone_names[j], memmap_pages, freesize); + if (!is_highmem_idx(j)) { + if (freesize >= memmap_pages) { + freesize -= memmap_pages; + if (memmap_pages) + printk(KERN_DEBUG + " %s zone: %lu pages used for memmap\n", + zone_names[j], memmap_pages); + } else + printk(KERN_WARNING + " %s zone: %lu pages exceeds freesize %lu\n", + zone_names[j], memmap_pages, freesize); + } /* Account for reserved pages */ if (j == 0 && freesize > dma_reserve) { @@ -6221,9 +6286,9 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, if (!PageLRU(page)) found++; /* - * If there are RECLAIMABLE pages, we need to check it. - * But now, memory offline itself doesn't call shrink_slab() - * and it still to be fixed. + * If there are RECLAIMABLE pages, we need to check + * it. But now, memory offline itself doesn't call + * shrink_node_slabs() and it still to be fixed. */ /* * If the page is not RAM, page_count()should be 0. diff --git a/mm/page_ext.c b/mm/page_ext.c new file mode 100644 index 0000000000000000000000000000000000000000..d86fd2f5353fcb05d39c27887f64b81e148a2bc6 --- /dev/null +++ b/mm/page_ext.c @@ -0,0 +1,403 @@ +#include <linux/mm.h> +#include <linux/mmzone.h> +#include <linux/bootmem.h> +#include <linux/page_ext.h> +#include <linux/memory.h> +#include <linux/vmalloc.h> +#include <linux/kmemleak.h> +#include <linux/page_owner.h> + +/* + * struct page extension + * + * This is the feature to manage memory for extended data per page. + * + * Until now, we must modify struct page itself to store extra data per page. + * This requires rebuilding the kernel and it is really time consuming process. + * And, sometimes, rebuild is impossible due to third party module dependency. + * At last, enlarging struct page could cause un-wanted system behaviour change. + * + * This feature is intended to overcome above mentioned problems. This feature + * allocates memory for extended data per page in certain place rather than + * the struct page itself. This memory can be accessed by the accessor + * functions provided by this code. During the boot process, it checks whether + * allocation of huge chunk of memory is needed or not. If not, it avoids + * allocating memory at all. With this advantage, we can include this feature + * into the kernel in default and can avoid rebuild and solve related problems. + * + * To help these things to work well, there are two callbacks for clients. One + * is the need callback which is mandatory if user wants to avoid useless + * memory allocation at boot-time. The other is optional, init callback, which + * is used to do proper initialization after memory is allocated. + * + * The need callback is used to decide whether extended memory allocation is + * needed or not. Sometimes users want to deactivate some features in this + * boot and extra memory would be unneccessary. In this case, to avoid + * allocating huge chunk of memory, each clients represent their need of + * extra memory through the need callback. If one of the need callbacks + * returns true, it means that someone needs extra memory so that + * page extension core should allocates memory for page extension. If + * none of need callbacks return true, memory isn't needed at all in this boot + * and page extension core can skip to allocate memory. As result, + * none of memory is wasted. + * + * The init callback is used to do proper initialization after page extension + * is completely initialized. In sparse memory system, extra memory is + * allocated some time later than memmap is allocated. In other words, lifetime + * of memory for page extension isn't same with memmap for struct page. + * Therefore, clients can't store extra data until page extension is + * initialized, even if pages are allocated and used freely. This could + * cause inadequate state of extra data per page, so, to prevent it, client + * can utilize this callback to initialize the state of it correctly. + */ + +static struct page_ext_operations *page_ext_ops[] = { + &debug_guardpage_ops, +#ifdef CONFIG_PAGE_POISONING + &page_poisoning_ops, +#endif +#ifdef CONFIG_PAGE_OWNER + &page_owner_ops, +#endif +}; + +static unsigned long total_usage; + +static bool __init invoke_need_callbacks(void) +{ + int i; + int entries = ARRAY_SIZE(page_ext_ops); + + for (i = 0; i < entries; i++) { + if (page_ext_ops[i]->need && page_ext_ops[i]->need()) + return true; + } + + return false; +} + +static void __init invoke_init_callbacks(void) +{ + int i; + int entries = ARRAY_SIZE(page_ext_ops); + + for (i = 0; i < entries; i++) { + if (page_ext_ops[i]->init) + page_ext_ops[i]->init(); + } +} + +#if !defined(CONFIG_SPARSEMEM) + + +void __meminit pgdat_page_ext_init(struct pglist_data *pgdat) +{ + pgdat->node_page_ext = NULL; +} + +struct page_ext *lookup_page_ext(struct page *page) +{ + unsigned long pfn = page_to_pfn(page); + unsigned long offset; + struct page_ext *base; + + base = NODE_DATA(page_to_nid(page))->node_page_ext; +#ifdef CONFIG_DEBUG_VM + /* + * The sanity checks the page allocator does upon freeing a + * page can reach here before the page_ext arrays are + * allocated when feeding a range of pages to the allocator + * for the first time during bootup or memory hotplug. + */ + if (unlikely(!base)) + return NULL; +#endif + offset = pfn - round_down(node_start_pfn(page_to_nid(page)), + MAX_ORDER_NR_PAGES); + return base + offset; +} + +static int __init alloc_node_page_ext(int nid) +{ + struct page_ext *base; + unsigned long table_size; + unsigned long nr_pages; + + nr_pages = NODE_DATA(nid)->node_spanned_pages; + if (!nr_pages) + return 0; + + /* + * Need extra space if node range is not aligned with + * MAX_ORDER_NR_PAGES. When page allocator's buddy algorithm + * checks buddy's status, range could be out of exact node range. + */ + if (!IS_ALIGNED(node_start_pfn(nid), MAX_ORDER_NR_PAGES) || + !IS_ALIGNED(node_end_pfn(nid), MAX_ORDER_NR_PAGES)) + nr_pages += MAX_ORDER_NR_PAGES; + + table_size = sizeof(struct page_ext) * nr_pages; + + base = memblock_virt_alloc_try_nid_nopanic( + table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS), + BOOTMEM_ALLOC_ACCESSIBLE, nid); + if (!base) + return -ENOMEM; + NODE_DATA(nid)->node_page_ext = base; + total_usage += table_size; + return 0; +} + +void __init page_ext_init_flatmem(void) +{ + + int nid, fail; + + if (!invoke_need_callbacks()) + return; + + for_each_online_node(nid) { + fail = alloc_node_page_ext(nid); + if (fail) + goto fail; + } + pr_info("allocated %ld bytes of page_ext\n", total_usage); + invoke_init_callbacks(); + return; + +fail: + pr_crit("allocation of page_ext failed.\n"); + panic("Out of memory"); +} + +#else /* CONFIG_FLAT_NODE_MEM_MAP */ + +struct page_ext *lookup_page_ext(struct page *page) +{ + unsigned long pfn = page_to_pfn(page); + struct mem_section *section = __pfn_to_section(pfn); +#ifdef CONFIG_DEBUG_VM + /* + * The sanity checks the page allocator does upon freeing a + * page can reach here before the page_ext arrays are + * allocated when feeding a range of pages to the allocator + * for the first time during bootup or memory hotplug. + */ + if (!section->page_ext) + return NULL; +#endif + return section->page_ext + pfn; +} + +static void *__meminit alloc_page_ext(size_t size, int nid) +{ + gfp_t flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN; + void *addr = NULL; + + addr = alloc_pages_exact_nid(nid, size, flags); + if (addr) { + kmemleak_alloc(addr, size, 1, flags); + return addr; + } + + if (node_state(nid, N_HIGH_MEMORY)) + addr = vzalloc_node(size, nid); + else + addr = vzalloc(size); + + return addr; +} + +static int __meminit init_section_page_ext(unsigned long pfn, int nid) +{ + struct mem_section *section; + struct page_ext *base; + unsigned long table_size; + + section = __pfn_to_section(pfn); + + if (section->page_ext) + return 0; + + table_size = sizeof(struct page_ext) * PAGES_PER_SECTION; + base = alloc_page_ext(table_size, nid); + + /* + * The value stored in section->page_ext is (base - pfn) + * and it does not point to the memory block allocated above, + * causing kmemleak false positives. + */ + kmemleak_not_leak(base); + + if (!base) { + pr_err("page ext allocation failure\n"); + return -ENOMEM; + } + + /* + * The passed "pfn" may not be aligned to SECTION. For the calculation + * we need to apply a mask. + */ + pfn &= PAGE_SECTION_MASK; + section->page_ext = base - pfn; + total_usage += table_size; + return 0; +} +#ifdef CONFIG_MEMORY_HOTPLUG +static void free_page_ext(void *addr) +{ + if (is_vmalloc_addr(addr)) { + vfree(addr); + } else { + struct page *page = virt_to_page(addr); + size_t table_size; + + table_size = sizeof(struct page_ext) * PAGES_PER_SECTION; + + BUG_ON(PageReserved(page)); + free_pages_exact(addr, table_size); + } +} + +static void __free_page_ext(unsigned long pfn) +{ + struct mem_section *ms; + struct page_ext *base; + + ms = __pfn_to_section(pfn); + if (!ms || !ms->page_ext) + return; + base = ms->page_ext + pfn; + free_page_ext(base); + ms->page_ext = NULL; +} + +static int __meminit online_page_ext(unsigned long start_pfn, + unsigned long nr_pages, + int nid) +{ + unsigned long start, end, pfn; + int fail = 0; + + start = SECTION_ALIGN_DOWN(start_pfn); + end = SECTION_ALIGN_UP(start_pfn + nr_pages); + + if (nid == -1) { + /* + * In this case, "nid" already exists and contains valid memory. + * "start_pfn" passed to us is a pfn which is an arg for + * online__pages(), and start_pfn should exist. + */ + nid = pfn_to_nid(start_pfn); + VM_BUG_ON(!node_state(nid, N_ONLINE)); + } + + for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) { + if (!pfn_present(pfn)) + continue; + fail = init_section_page_ext(pfn, nid); + } + if (!fail) + return 0; + + /* rollback */ + for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) + __free_page_ext(pfn); + + return -ENOMEM; +} + +static int __meminit offline_page_ext(unsigned long start_pfn, + unsigned long nr_pages, int nid) +{ + unsigned long start, end, pfn; + + start = SECTION_ALIGN_DOWN(start_pfn); + end = SECTION_ALIGN_UP(start_pfn + nr_pages); + + for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) + __free_page_ext(pfn); + return 0; + +} + +static int __meminit page_ext_callback(struct notifier_block *self, + unsigned long action, void *arg) +{ + struct memory_notify *mn = arg; + int ret = 0; + + switch (action) { + case MEM_GOING_ONLINE: + ret = online_page_ext(mn->start_pfn, + mn->nr_pages, mn->status_change_nid); + break; + case MEM_OFFLINE: + offline_page_ext(mn->start_pfn, + mn->nr_pages, mn->status_change_nid); + break; + case MEM_CANCEL_ONLINE: + offline_page_ext(mn->start_pfn, + mn->nr_pages, mn->status_change_nid); + break; + case MEM_GOING_OFFLINE: + break; + case MEM_ONLINE: + case MEM_CANCEL_OFFLINE: + break; + } + + return notifier_from_errno(ret); +} + +#endif + +void __init page_ext_init(void) +{ + unsigned long pfn; + int nid; + + if (!invoke_need_callbacks()) + return; + + for_each_node_state(nid, N_MEMORY) { + unsigned long start_pfn, end_pfn; + + start_pfn = node_start_pfn(nid); + end_pfn = node_end_pfn(nid); + /* + * start_pfn and end_pfn may not be aligned to SECTION and the + * page->flags of out of node pages are not initialized. So we + * scan [start_pfn, the biggest section's pfn < end_pfn) here. + */ + for (pfn = start_pfn; pfn < end_pfn; + pfn = ALIGN(pfn + 1, PAGES_PER_SECTION)) { + + if (!pfn_valid(pfn)) + continue; + /* + * Nodes's pfns can be overlapping. + * We know some arch can have a nodes layout such as + * -------------pfn--------------> + * N0 | N1 | N2 | N0 | N1 | N2|.... + */ + if (pfn_to_nid(pfn) != nid) + continue; + if (init_section_page_ext(pfn, nid)) + goto oom; + } + } + hotplug_memory_notifier(page_ext_callback, 0); + pr_info("allocated %ld bytes of page_ext\n", total_usage); + invoke_init_callbacks(); + return; + +oom: + panic("Out of memory"); +} + +void __meminit pgdat_page_ext_init(struct pglist_data *pgdat) +{ +} + +#endif diff --git a/mm/page_owner.c b/mm/page_owner.c new file mode 100644 index 0000000000000000000000000000000000000000..9ab4a9b5bc091b80915850038d4d3f04eb331ee1 --- /dev/null +++ b/mm/page_owner.c @@ -0,0 +1,311 @@ +#include <linux/debugfs.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/uaccess.h> +#include <linux/bootmem.h> +#include <linux/stacktrace.h> +#include <linux/page_owner.h> +#include "internal.h" + +static bool page_owner_disabled = true; +bool page_owner_inited __read_mostly; + +static void init_early_allocated_pages(void); + +static int early_page_owner_param(char *buf) +{ + if (!buf) + return -EINVAL; + + if (strcmp(buf, "on") == 0) + page_owner_disabled = false; + + return 0; +} +early_param("page_owner", early_page_owner_param); + +static bool need_page_owner(void) +{ + if (page_owner_disabled) + return false; + + return true; +} + +static void init_page_owner(void) +{ + if (page_owner_disabled) + return; + + page_owner_inited = true; + init_early_allocated_pages(); +} + +struct page_ext_operations page_owner_ops = { + .need = need_page_owner, + .init = init_page_owner, +}; + +void __reset_page_owner(struct page *page, unsigned int order) +{ + int i; + struct page_ext *page_ext; + + for (i = 0; i < (1 << order); i++) { + page_ext = lookup_page_ext(page + i); + __clear_bit(PAGE_EXT_OWNER, &page_ext->flags); + } +} + +void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask) +{ + struct page_ext *page_ext; + struct stack_trace *trace; + + page_ext = lookup_page_ext(page); + + trace = &page_ext->trace; + trace->nr_entries = 0; + trace->max_entries = ARRAY_SIZE(page_ext->trace_entries); + trace->entries = &page_ext->trace_entries[0]; + trace->skip = 3; + save_stack_trace(&page_ext->trace); + + page_ext->order = order; + page_ext->gfp_mask = gfp_mask; + + __set_bit(PAGE_EXT_OWNER, &page_ext->flags); +} + +static ssize_t +print_page_owner(char __user *buf, size_t count, unsigned long pfn, + struct page *page, struct page_ext *page_ext) +{ + int ret; + int pageblock_mt, page_mt; + char *kbuf; + + kbuf = kmalloc(count, GFP_KERNEL); + if (!kbuf) + return -ENOMEM; + + ret = snprintf(kbuf, count, + "Page allocated via order %u, mask 0x%x\n", + page_ext->order, page_ext->gfp_mask); + + if (ret >= count) + goto err; + + /* Print information relevant to grouping pages by mobility */ + pageblock_mt = get_pfnblock_migratetype(page, pfn); + page_mt = gfpflags_to_migratetype(page_ext->gfp_mask); + ret += snprintf(kbuf + ret, count - ret, + "PFN %lu Block %lu type %d %s Flags %s%s%s%s%s%s%s%s%s%s%s%s\n", + pfn, + pfn >> pageblock_order, + pageblock_mt, + pageblock_mt != page_mt ? "Fallback" : " ", + PageLocked(page) ? "K" : " ", + PageError(page) ? "E" : " ", + PageReferenced(page) ? "R" : " ", + PageUptodate(page) ? "U" : " ", + PageDirty(page) ? "D" : " ", + PageLRU(page) ? "L" : " ", + PageActive(page) ? "A" : " ", + PageSlab(page) ? "S" : " ", + PageWriteback(page) ? "W" : " ", + PageCompound(page) ? "C" : " ", + PageSwapCache(page) ? "B" : " ", + PageMappedToDisk(page) ? "M" : " "); + + if (ret >= count) + goto err; + + ret += snprint_stack_trace(kbuf + ret, count - ret, + &page_ext->trace, 0); + if (ret >= count) + goto err; + + ret += snprintf(kbuf + ret, count - ret, "\n"); + if (ret >= count) + goto err; + + if (copy_to_user(buf, kbuf, ret)) + ret = -EFAULT; + + kfree(kbuf); + return ret; + +err: + kfree(kbuf); + return -ENOMEM; +} + +static ssize_t +read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) +{ + unsigned long pfn; + struct page *page; + struct page_ext *page_ext; + + if (!page_owner_inited) + return -EINVAL; + + page = NULL; + pfn = min_low_pfn + *ppos; + + /* Find a valid PFN or the start of a MAX_ORDER_NR_PAGES area */ + while (!pfn_valid(pfn) && (pfn & (MAX_ORDER_NR_PAGES - 1)) != 0) + pfn++; + + drain_all_pages(NULL); + + /* Find an allocated page */ + for (; pfn < max_pfn; pfn++) { + /* + * If the new page is in a new MAX_ORDER_NR_PAGES area, + * validate the area as existing, skip it if not + */ + if ((pfn & (MAX_ORDER_NR_PAGES - 1)) == 0 && !pfn_valid(pfn)) { + pfn += MAX_ORDER_NR_PAGES - 1; + continue; + } + + /* Check for holes within a MAX_ORDER area */ + if (!pfn_valid_within(pfn)) + continue; + + page = pfn_to_page(pfn); + if (PageBuddy(page)) { + unsigned long freepage_order = page_order_unsafe(page); + + if (freepage_order < MAX_ORDER) + pfn += (1UL << freepage_order) - 1; + continue; + } + + page_ext = lookup_page_ext(page); + + /* + * Some pages could be missed by concurrent allocation or free, + * because we don't hold the zone lock. + */ + if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) + continue; + + /* Record the next PFN to read in the file offset */ + *ppos = (pfn - min_low_pfn) + 1; + + return print_page_owner(buf, count, pfn, page, page_ext); + } + + return 0; +} + +static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone) +{ + struct page *page; + struct page_ext *page_ext; + unsigned long pfn = zone->zone_start_pfn, block_end_pfn; + unsigned long end_pfn = pfn + zone->spanned_pages; + unsigned long count = 0; + + /* Scan block by block. First and last block may be incomplete */ + pfn = zone->zone_start_pfn; + + /* + * Walk the zone in pageblock_nr_pages steps. If a page block spans + * a zone boundary, it will be double counted between zones. This does + * not matter as the mixed block count will still be correct + */ + for (; pfn < end_pfn; ) { + if (!pfn_valid(pfn)) { + pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES); + continue; + } + + block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); + block_end_pfn = min(block_end_pfn, end_pfn); + + page = pfn_to_page(pfn); + + for (; pfn < block_end_pfn; pfn++) { + if (!pfn_valid_within(pfn)) + continue; + + page = pfn_to_page(pfn); + + /* + * We are safe to check buddy flag and order, because + * this is init stage and only single thread runs. + */ + if (PageBuddy(page)) { + pfn += (1UL << page_order(page)) - 1; + continue; + } + + if (PageReserved(page)) + continue; + + page_ext = lookup_page_ext(page); + + /* Maybe overraping zone */ + if (test_bit(PAGE_EXT_OWNER, &page_ext->flags)) + continue; + + /* Found early allocated page */ + set_page_owner(page, 0, 0); + count++; + } + } + + pr_info("Node %d, zone %8s: page owner found early allocated %lu pages\n", + pgdat->node_id, zone->name, count); +} + +static void init_zones_in_node(pg_data_t *pgdat) +{ + struct zone *zone; + struct zone *node_zones = pgdat->node_zones; + unsigned long flags; + + for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { + if (!populated_zone(zone)) + continue; + + spin_lock_irqsave(&zone->lock, flags); + init_pages_in_zone(pgdat, zone); + spin_unlock_irqrestore(&zone->lock, flags); + } +} + +static void init_early_allocated_pages(void) +{ + pg_data_t *pgdat; + + drain_all_pages(NULL); + for_each_online_pgdat(pgdat) + init_zones_in_node(pgdat); +} + +static const struct file_operations proc_page_owner_operations = { + .read = read_page_owner, +}; + +static int __init pageowner_init(void) +{ + struct dentry *dentry; + + if (!page_owner_inited) { + pr_info("page_owner is disabled\n"); + return 0; + } + + dentry = debugfs_create_file("page_owner", S_IRUSR, NULL, + NULL, &proc_page_owner_operations); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + + return 0; +} +module_init(pageowner_init) diff --git a/mm/rmap.c b/mm/rmap.c index 45eba36fd67300b84f9e30dafb461f5cf2c34ad2..c52f43a69eeaa6517b957093b41f783cb7b2f892 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -23,7 +23,7 @@ * inode->i_mutex (while writing or truncating, not reading or faulting) * mm->mmap_sem * page->flags PG_locked (lock_page) - * mapping->i_mmap_mutex + * mapping->i_mmap_rwsem * anon_vma->rwsem * mm->page_table_lock or pte_lock * zone->lru_lock (in mark_page_accessed, isolate_lru_page) @@ -1260,7 +1260,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, /* * We need mmap_sem locking, Otherwise VM_LOCKED check makes * unstable result and race. Plus, We can't wait here because - * we now hold anon_vma->rwsem or mapping->i_mmap_mutex. + * we now hold anon_vma->rwsem or mapping->i_mmap_rwsem. * if trylock failed, the page remain in evictable lru and later * vmscan could retry to move the page to unevictable lru if the * page is actually mlocked. @@ -1635,7 +1635,7 @@ static struct anon_vma *rmap_walk_anon_lock(struct page *page, static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc) { struct anon_vma *anon_vma; - pgoff_t pgoff = page_to_pgoff(page); + pgoff_t pgoff; struct anon_vma_chain *avc; int ret = SWAP_AGAIN; @@ -1643,6 +1643,7 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc) if (!anon_vma) return ret; + pgoff = page_to_pgoff(page); anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { struct vm_area_struct *vma = avc->vma; unsigned long address = vma_address(page, vma); @@ -1676,7 +1677,7 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc) static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc) { struct address_space *mapping = page->mapping; - pgoff_t pgoff = page_to_pgoff(page); + pgoff_t pgoff; struct vm_area_struct *vma; int ret = SWAP_AGAIN; @@ -1684,13 +1685,15 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc) * The page lock not only makes sure that page->mapping cannot * suddenly be NULLified by truncation, it makes sure that the * structure at mapping cannot be freed and reused yet, - * so we can safely take mapping->i_mmap_mutex. + * so we can safely take mapping->i_mmap_rwsem. */ VM_BUG_ON_PAGE(!PageLocked(page), page); if (!mapping) return ret; - mutex_lock(&mapping->i_mmap_mutex); + + pgoff = page_to_pgoff(page); + i_mmap_lock_read(mapping); vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { unsigned long address = vma_address(page, vma); @@ -1711,9 +1714,8 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc) goto done; ret = rwc->file_nonlinear(page, mapping, rwc->arg); - done: - mutex_unlock(&mapping->i_mmap_mutex); + i_mmap_unlock_read(mapping); return ret; } diff --git a/mm/slab.c b/mm/slab.c index fee275b5b6b73fad237bdf8cfe2831af71480803..65b5dcb6f67107d6e46515196cfdf5cfbfd8a70b 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3015,7 +3015,7 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { nid = zone_to_nid(zone); - if (cpuset_zone_allowed(zone, flags | __GFP_HARDWALL) && + if (cpuset_zone_allowed(zone, flags) && get_node(cache, nid) && get_node(cache, nid)->free_objects) { obj = ____cache_alloc_node(cache, @@ -3182,6 +3182,7 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, memset(ptr, 0, cachep->object_size); } + memcg_kmem_put_cache(cachep); return ptr; } @@ -3247,6 +3248,7 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller) memset(objp, 0, cachep->object_size); } + memcg_kmem_put_cache(cachep); return objp; } diff --git a/mm/slub.c b/mm/slub.c index 765c5884d03d46b2b4a41b977d5cf2dfe2d878b1..fe376fe1f4fe3ce5526997ef700da9eee5c714a5 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1233,13 +1233,17 @@ static inline void kfree_hook(const void *x) kmemleak_free(x); } -static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) +static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, + gfp_t flags) { flags &= gfp_allowed_mask; lockdep_trace_alloc(flags); might_sleep_if(flags & __GFP_WAIT); - return should_failslab(s->object_size, flags, s->flags); + if (should_failslab(s->object_size, flags, s->flags)) + return NULL; + + return memcg_kmem_get_cache(s, flags); } static inline void slab_post_alloc_hook(struct kmem_cache *s, @@ -1248,6 +1252,7 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, flags &= gfp_allowed_mask; kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags); + memcg_kmem_put_cache(s); } static inline void slab_free_hook(struct kmem_cache *s, void *x) @@ -1665,8 +1670,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags, n = get_node(s, zone_to_nid(zone)); - if (n && cpuset_zone_allowed(zone, - flags | __GFP_HARDWALL) && + if (n && cpuset_zone_allowed(zone, flags) && n->nr_partial > s->min_partial) { object = get_partial_node(s, n, c, flags); if (object) { @@ -2384,10 +2388,9 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s, struct page *page; unsigned long tid; - if (slab_pre_alloc_hook(s, gfpflags)) + s = slab_pre_alloc_hook(s, gfpflags); + if (!s) return NULL; - - s = memcg_kmem_get_cache(s, gfpflags); redo: /* * Must read kmem_cache cpu data via this cpu ptr. Preemption is diff --git a/mm/vmacache.c b/mm/vmacache.c index 9f25af825dec6d929348e16db85a6c17ab00f31d..b6e3662fe339532d8b8f1578f58dd208dd23e249 100644 --- a/mm/vmacache.c +++ b/mm/vmacache.c @@ -17,6 +17,8 @@ void vmacache_flush_all(struct mm_struct *mm) { struct task_struct *g, *p; + count_vm_vmacache_event(VMACACHE_FULL_FLUSHES); + /* * Single threaded tasks need not iterate the entire * list of process. We can avoid the flushing as well diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 8a18196fcdff55e15db146ba4b7cf0324af0877e..39c338896416bd9f70d197ec30ddd003d935c65e 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2574,10 +2574,10 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v) if (!counters) return; - /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */ - smp_rmb(); if (v->flags & VM_UNINITIALIZED) return; + /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */ + smp_rmb(); memset(counters, 0, nr_node_ids * sizeof(unsigned int)); diff --git a/mm/vmscan.c b/mm/vmscan.c index a384339bf7186c277b861bd6199fd0734f6dc32e..bd9a72bc4a1b81f5b4a53e630360e725f6e1347b 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -229,9 +229,10 @@ EXPORT_SYMBOL(unregister_shrinker); #define SHRINK_BATCH 128 -static unsigned long -shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, - unsigned long nr_pages_scanned, unsigned long lru_pages) +static unsigned long shrink_slabs(struct shrink_control *shrinkctl, + struct shrinker *shrinker, + unsigned long nr_scanned, + unsigned long nr_eligible) { unsigned long freed = 0; unsigned long long delta; @@ -255,9 +256,9 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0); total_scan = nr; - delta = (4 * nr_pages_scanned) / shrinker->seeks; + delta = (4 * nr_scanned) / shrinker->seeks; delta *= freeable; - do_div(delta, lru_pages + 1); + do_div(delta, nr_eligible + 1); total_scan += delta; if (total_scan < 0) { pr_err("shrink_slab: %pF negative objects to delete nr=%ld\n", @@ -289,8 +290,8 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, total_scan = freeable * 2; trace_mm_shrink_slab_start(shrinker, shrinkctl, nr, - nr_pages_scanned, lru_pages, - freeable, delta, total_scan); + nr_scanned, nr_eligible, + freeable, delta, total_scan); /* * Normally, we should not scan less than batch_size objects in one @@ -339,34 +340,37 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, return freed; } -/* - * Call the shrink functions to age shrinkable caches - * - * Here we assume it costs one seek to replace a lru page and that it also - * takes a seek to recreate a cache object. With this in mind we age equal - * percentages of the lru and ageable caches. This should balance the seeks - * generated by these structures. +/** + * shrink_node_slabs - shrink slab caches of a given node + * @gfp_mask: allocation context + * @nid: node whose slab caches to target + * @nr_scanned: pressure numerator + * @nr_eligible: pressure denominator * - * If the vm encountered mapped pages on the LRU it increase the pressure on - * slab to avoid swapping. + * Call the shrink functions to age shrinkable caches. * - * We do weird things to avoid (scanned*seeks*entries) overflowing 32 bits. + * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set, + * unaware shrinkers will receive a node id of 0 instead. * - * `lru_pages' represents the number of on-LRU pages in all the zones which - * are eligible for the caller's allocation attempt. It is used for balancing - * slab reclaim versus page reclaim. + * @nr_scanned and @nr_eligible form a ratio that indicate how much of + * the available objects should be scanned. Page reclaim for example + * passes the number of pages scanned and the number of pages on the + * LRU lists that it considered on @nid, plus a bias in @nr_scanned + * when it encountered mapped pages. The ratio is further biased by + * the ->seeks setting of the shrink function, which indicates the + * cost to recreate an object relative to that of an LRU page. * - * Returns the number of slab objects which we shrunk. + * Returns the number of reclaimed slab objects. */ -unsigned long shrink_slab(struct shrink_control *shrinkctl, - unsigned long nr_pages_scanned, - unsigned long lru_pages) +unsigned long shrink_node_slabs(gfp_t gfp_mask, int nid, + unsigned long nr_scanned, + unsigned long nr_eligible) { struct shrinker *shrinker; unsigned long freed = 0; - if (nr_pages_scanned == 0) - nr_pages_scanned = SWAP_CLUSTER_MAX; + if (nr_scanned == 0) + nr_scanned = SWAP_CLUSTER_MAX; if (!down_read_trylock(&shrinker_rwsem)) { /* @@ -380,20 +384,17 @@ unsigned long shrink_slab(struct shrink_control *shrinkctl, } list_for_each_entry(shrinker, &shrinker_list, list) { - if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) { - shrinkctl->nid = 0; - freed += shrink_slab_node(shrinkctl, shrinker, - nr_pages_scanned, lru_pages); - continue; - } + struct shrink_control sc = { + .gfp_mask = gfp_mask, + .nid = nid, + }; - for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) { - if (node_online(shrinkctl->nid)) - freed += shrink_slab_node(shrinkctl, shrinker, - nr_pages_scanned, lru_pages); + if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) + sc.nid = 0; - } + freed += shrink_slabs(&sc, shrinker, nr_scanned, nr_eligible); } + up_read(&shrinker_rwsem); out: cond_resched(); @@ -1876,7 +1877,8 @@ enum scan_balance { * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan */ static void get_scan_count(struct lruvec *lruvec, int swappiness, - struct scan_control *sc, unsigned long *nr) + struct scan_control *sc, unsigned long *nr, + unsigned long *lru_pages) { struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; u64 fraction[2]; @@ -2022,6 +2024,7 @@ static void get_scan_count(struct lruvec *lruvec, int swappiness, some_scanned = false; /* Only use force_scan on second pass. */ for (pass = 0; !some_scanned && pass < 2; pass++) { + *lru_pages = 0; for_each_evictable_lru(lru) { int file = is_file_lru(lru); unsigned long size; @@ -2048,14 +2051,19 @@ static void get_scan_count(struct lruvec *lruvec, int swappiness, case SCAN_FILE: case SCAN_ANON: /* Scan one type exclusively */ - if ((scan_balance == SCAN_FILE) != file) + if ((scan_balance == SCAN_FILE) != file) { + size = 0; scan = 0; + } break; default: /* Look ma, no brain */ BUG(); } + + *lru_pages += size; nr[lru] = scan; + /* * Skip the second pass and don't force_scan, * if we found something to scan. @@ -2069,7 +2077,7 @@ static void get_scan_count(struct lruvec *lruvec, int swappiness, * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. */ static void shrink_lruvec(struct lruvec *lruvec, int swappiness, - struct scan_control *sc) + struct scan_control *sc, unsigned long *lru_pages) { unsigned long nr[NR_LRU_LISTS]; unsigned long targets[NR_LRU_LISTS]; @@ -2080,7 +2088,7 @@ static void shrink_lruvec(struct lruvec *lruvec, int swappiness, struct blk_plug plug; bool scan_adjusted; - get_scan_count(lruvec, swappiness, sc, nr); + get_scan_count(lruvec, swappiness, sc, nr, lru_pages); /* Record the original scan target for proportional adjustments later */ memcpy(targets, nr, sizeof(nr)); @@ -2258,7 +2266,8 @@ static inline bool should_continue_reclaim(struct zone *zone, } } -static bool shrink_zone(struct zone *zone, struct scan_control *sc) +static bool shrink_zone(struct zone *zone, struct scan_control *sc, + bool is_classzone) { unsigned long nr_reclaimed, nr_scanned; bool reclaimable = false; @@ -2269,6 +2278,7 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc) .zone = zone, .priority = sc->priority, }; + unsigned long zone_lru_pages = 0; struct mem_cgroup *memcg; nr_reclaimed = sc->nr_reclaimed; @@ -2276,13 +2286,15 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc) memcg = mem_cgroup_iter(root, NULL, &reclaim); do { + unsigned long lru_pages; struct lruvec *lruvec; int swappiness; lruvec = mem_cgroup_zone_lruvec(zone, memcg); swappiness = mem_cgroup_swappiness(memcg); - shrink_lruvec(lruvec, swappiness, sc); + shrink_lruvec(lruvec, swappiness, sc, &lru_pages); + zone_lru_pages += lru_pages; /* * Direct reclaim and kswapd have to scan all memory @@ -2302,6 +2314,25 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc) memcg = mem_cgroup_iter(root, memcg, &reclaim); } while (memcg); + /* + * Shrink the slab caches in the same proportion that + * the eligible LRU pages were scanned. + */ + if (global_reclaim(sc) && is_classzone) { + struct reclaim_state *reclaim_state; + + shrink_node_slabs(sc->gfp_mask, zone_to_nid(zone), + sc->nr_scanned - nr_scanned, + zone_lru_pages); + + reclaim_state = current->reclaim_state; + if (reclaim_state) { + sc->nr_reclaimed += + reclaim_state->reclaimed_slab; + reclaim_state->reclaimed_slab = 0; + } + } + vmpressure(sc->gfp_mask, sc->target_mem_cgroup, sc->nr_scanned - nr_scanned, sc->nr_reclaimed - nr_reclaimed); @@ -2376,12 +2407,7 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) struct zone *zone; unsigned long nr_soft_reclaimed; unsigned long nr_soft_scanned; - unsigned long lru_pages = 0; - struct reclaim_state *reclaim_state = current->reclaim_state; gfp_t orig_mask; - struct shrink_control shrink = { - .gfp_mask = sc->gfp_mask, - }; enum zone_type requested_highidx = gfp_zone(sc->gfp_mask); bool reclaimable = false; @@ -2394,12 +2420,18 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) if (buffer_heads_over_limit) sc->gfp_mask |= __GFP_HIGHMEM; - nodes_clear(shrink.nodes_to_scan); - for_each_zone_zonelist_nodemask(zone, z, zonelist, - gfp_zone(sc->gfp_mask), sc->nodemask) { + requested_highidx, sc->nodemask) { + enum zone_type classzone_idx; + if (!populated_zone(zone)) continue; + + classzone_idx = requested_highidx; + while (!populated_zone(zone->zone_pgdat->node_zones + + classzone_idx)) + classzone_idx--; + /* * Take care memory controller reclaiming has small influence * to global LRU. @@ -2409,9 +2441,6 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) GFP_KERNEL | __GFP_HARDWALL)) continue; - lru_pages += zone_reclaimable_pages(zone); - node_set(zone_to_nid(zone), shrink.nodes_to_scan); - if (sc->priority != DEF_PRIORITY && !zone_reclaimable(zone)) continue; /* Let kswapd poll it */ @@ -2450,7 +2479,7 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) /* need some check for avoid more shrink_zone() */ } - if (shrink_zone(zone, sc)) + if (shrink_zone(zone, sc, zone_idx(zone) == classzone_idx)) reclaimable = true; if (global_reclaim(sc) && @@ -2458,20 +2487,6 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) reclaimable = true; } - /* - * Don't shrink slabs when reclaiming memory from over limit cgroups - * but do shrink slab at least once when aborting reclaim for - * compaction to avoid unevenly scanning file/anon LRU pages over slab - * pages. - */ - if (global_reclaim(sc)) { - shrink_slab(&shrink, sc->nr_scanned, lru_pages); - if (reclaim_state) { - sc->nr_reclaimed += reclaim_state->reclaimed_slab; - reclaim_state->reclaimed_slab = 0; - } - } - /* * Restore to original mask to avoid the impact on the caller if we * promoted it to __GFP_HIGHMEM. @@ -2736,6 +2751,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, }; struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg); int swappiness = mem_cgroup_swappiness(memcg); + unsigned long lru_pages; sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); @@ -2751,7 +2767,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, * will pick up pages from other mem cgroup's as well. We hack * the priority and make it zero. */ - shrink_lruvec(lruvec, swappiness, &sc); + shrink_lruvec(lruvec, swappiness, &sc, &lru_pages); trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); @@ -2932,15 +2948,10 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining, static bool kswapd_shrink_zone(struct zone *zone, int classzone_idx, struct scan_control *sc, - unsigned long lru_pages, unsigned long *nr_attempted) { int testorder = sc->order; unsigned long balance_gap; - struct reclaim_state *reclaim_state = current->reclaim_state; - struct shrink_control shrink = { - .gfp_mask = sc->gfp_mask, - }; bool lowmem_pressure; /* Reclaim above the high watermark. */ @@ -2975,13 +2986,7 @@ static bool kswapd_shrink_zone(struct zone *zone, balance_gap, classzone_idx)) return true; - shrink_zone(zone, sc); - nodes_clear(shrink.nodes_to_scan); - node_set(zone_to_nid(zone), shrink.nodes_to_scan); - - reclaim_state->reclaimed_slab = 0; - shrink_slab(&shrink, sc->nr_scanned, lru_pages); - sc->nr_reclaimed += reclaim_state->reclaimed_slab; + shrink_zone(zone, sc, zone_idx(zone) == classzone_idx); /* Account for the number of pages attempted to reclaim */ *nr_attempted += sc->nr_to_reclaim; @@ -3042,7 +3047,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, count_vm_event(PAGEOUTRUN); do { - unsigned long lru_pages = 0; unsigned long nr_attempted = 0; bool raise_priority = true; bool pgdat_needs_compaction = (order > 0); @@ -3102,8 +3106,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, if (!populated_zone(zone)) continue; - lru_pages += zone_reclaimable_pages(zone); - /* * If any zone is currently balanced then kswapd will * not call compaction as it is expected that the @@ -3159,8 +3161,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, * that that high watermark would be met at 100% * efficiency. */ - if (kswapd_shrink_zone(zone, end_zone, &sc, - lru_pages, &nr_attempted)) + if (kswapd_shrink_zone(zone, end_zone, + &sc, &nr_attempted)) raise_priority = false; } @@ -3612,10 +3614,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), .may_swap = 1, }; - struct shrink_control shrink = { - .gfp_mask = sc.gfp_mask, - }; - unsigned long nr_slab_pages0, nr_slab_pages1; cond_resched(); /* @@ -3634,44 +3632,10 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) * priorities until we have enough memory freed. */ do { - shrink_zone(zone, &sc); + shrink_zone(zone, &sc, true); } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0); } - nr_slab_pages0 = zone_page_state(zone, NR_SLAB_RECLAIMABLE); - if (nr_slab_pages0 > zone->min_slab_pages) { - /* - * shrink_slab() does not currently allow us to determine how - * many pages were freed in this zone. So we take the current - * number of slab pages and shake the slab until it is reduced - * by the same nr_pages that we used for reclaiming unmapped - * pages. - */ - nodes_clear(shrink.nodes_to_scan); - node_set(zone_to_nid(zone), shrink.nodes_to_scan); - for (;;) { - unsigned long lru_pages = zone_reclaimable_pages(zone); - - /* No reclaimable slab or very low memory pressure */ - if (!shrink_slab(&shrink, sc.nr_scanned, lru_pages)) - break; - - /* Freed enough memory */ - nr_slab_pages1 = zone_page_state(zone, - NR_SLAB_RECLAIMABLE); - if (nr_slab_pages1 + nr_pages <= nr_slab_pages0) - break; - } - - /* - * Update nr_reclaimed by the number of slab pages we - * reclaimed from this zone. - */ - nr_slab_pages1 = zone_page_state(zone, NR_SLAB_RECLAIMABLE); - if (nr_slab_pages1 < nr_slab_pages0) - sc.nr_reclaimed += nr_slab_pages0 - nr_slab_pages1; - } - p->reclaim_state = NULL; current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); lockdep_clear_current_reclaim_state(); diff --git a/mm/vmstat.c b/mm/vmstat.c index 1b12d390dc6815d136cb0c812d00169c6f52895f..1284f89fca082aa907973d16ceed3f09d316a780 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -22,6 +22,8 @@ #include <linux/writeback.h> #include <linux/compaction.h> #include <linux/mm_inline.h> +#include <linux/page_ext.h> +#include <linux/page_owner.h> #include "internal.h" @@ -898,6 +900,7 @@ const char * const vmstat_text[] = { #ifdef CONFIG_DEBUG_VM_VMACACHE "vmacache_find_calls", "vmacache_find_hits", + "vmacache_full_flushes", #endif #endif /* CONFIG_VM_EVENTS_COUNTERS */ }; @@ -1017,6 +1020,104 @@ static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg) return 0; } +#ifdef CONFIG_PAGE_OWNER +static void pagetypeinfo_showmixedcount_print(struct seq_file *m, + pg_data_t *pgdat, + struct zone *zone) +{ + struct page *page; + struct page_ext *page_ext; + unsigned long pfn = zone->zone_start_pfn, block_end_pfn; + unsigned long end_pfn = pfn + zone->spanned_pages; + unsigned long count[MIGRATE_TYPES] = { 0, }; + int pageblock_mt, page_mt; + int i; + + /* Scan block by block. First and last block may be incomplete */ + pfn = zone->zone_start_pfn; + + /* + * Walk the zone in pageblock_nr_pages steps. If a page block spans + * a zone boundary, it will be double counted between zones. This does + * not matter as the mixed block count will still be correct + */ + for (; pfn < end_pfn; ) { + if (!pfn_valid(pfn)) { + pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES); + continue; + } + + block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); + block_end_pfn = min(block_end_pfn, end_pfn); + + page = pfn_to_page(pfn); + pageblock_mt = get_pfnblock_migratetype(page, pfn); + + for (; pfn < block_end_pfn; pfn++) { + if (!pfn_valid_within(pfn)) + continue; + + page = pfn_to_page(pfn); + if (PageBuddy(page)) { + pfn += (1UL << page_order(page)) - 1; + continue; + } + + if (PageReserved(page)) + continue; + + page_ext = lookup_page_ext(page); + + if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) + continue; + + page_mt = gfpflags_to_migratetype(page_ext->gfp_mask); + if (pageblock_mt != page_mt) { + if (is_migrate_cma(pageblock_mt)) + count[MIGRATE_MOVABLE]++; + else + count[pageblock_mt]++; + + pfn = block_end_pfn; + break; + } + pfn += (1UL << page_ext->order) - 1; + } + } + + /* Print counts */ + seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); + for (i = 0; i < MIGRATE_TYPES; i++) + seq_printf(m, "%12lu ", count[i]); + seq_putc(m, '\n'); +} +#endif /* CONFIG_PAGE_OWNER */ + +/* + * Print out the number of pageblocks for each migratetype that contain pages + * of other types. This gives an indication of how well fallbacks are being + * contained by rmqueue_fallback(). It requires information from PAGE_OWNER + * to determine what is going on + */ +static void pagetypeinfo_showmixedcount(struct seq_file *m, pg_data_t *pgdat) +{ +#ifdef CONFIG_PAGE_OWNER + int mtype; + + if (!page_owner_inited) + return; + + drain_all_pages(NULL); + + seq_printf(m, "\n%-23s", "Number of mixed blocks "); + for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) + seq_printf(m, "%12s ", migratetype_names[mtype]); + seq_putc(m, '\n'); + + walk_zones_in_node(m, pgdat, pagetypeinfo_showmixedcount_print); +#endif /* CONFIG_PAGE_OWNER */ +} + /* * This prints out statistics in relation to grouping pages by mobility. * It is expensive to collect so do not constantly read the file. @@ -1034,6 +1135,7 @@ static int pagetypeinfo_show(struct seq_file *m, void *arg) seq_putc(m, '\n'); pagetypeinfo_showfree(m, pgdat); pagetypeinfo_showblockcount(m, pgdat); + pagetypeinfo_showmixedcount(m, pgdat); return 0; } diff --git a/mm/zbud.c b/mm/zbud.c index ec71b37fb06cc9b9707314857bd8ba0addbb8394..4e387bea702eca644da2547c3bd8f9537291c189 100644 --- a/mm/zbud.c +++ b/mm/zbud.c @@ -132,7 +132,7 @@ static struct zbud_ops zbud_zpool_ops = { static void *zbud_zpool_create(gfp_t gfp, struct zpool_ops *zpool_ops) { - return zbud_create_pool(gfp, &zbud_zpool_ops); + return zbud_create_pool(gfp, zpool_ops ? &zbud_zpool_ops : NULL); } static void zbud_zpool_destroy(void *pool) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 839a48c3ca27b2bf008496c89867f7210aeb6610..4d0a063145ec45647a65654dd09388e8b7936fde 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -155,8 +155,6 @@ * (reason above) */ #define ZS_SIZE_CLASS_DELTA (PAGE_SIZE >> 8) -#define ZS_SIZE_CLASSES ((ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) / \ - ZS_SIZE_CLASS_DELTA + 1) /* * We do not maintain any list for completely empty or full pages @@ -170,6 +168,11 @@ enum fullness_group { ZS_FULL }; +/* + * number of size_classes + */ +static int zs_size_classes; + /* * We assign a page to ZS_ALMOST_EMPTY fullness group when: * n <= N / f, where @@ -214,7 +217,7 @@ struct link_free { }; struct zs_pool { - struct size_class size_class[ZS_SIZE_CLASSES]; + struct size_class **size_class; gfp_t flags; /* allocation flags used when growing pool */ atomic_long_t pages_allocated; @@ -468,7 +471,7 @@ static enum fullness_group fix_fullness_group(struct zs_pool *pool, if (newfg == currfg) goto out; - class = &pool->size_class[class_idx]; + class = pool->size_class[class_idx]; remove_zspage(page, class, currfg); insert_zspage(page, class, newfg); set_zspage_mapping(page, class_idx, newfg); @@ -629,6 +632,7 @@ static void init_zspage(struct page *first_page, struct size_class *class) struct page *next_page; struct link_free *link; unsigned int i = 1; + void *vaddr; /* * page->index stores offset of first object starting @@ -639,8 +643,8 @@ static void init_zspage(struct page *first_page, struct size_class *class) if (page != first_page) page->index = off; - link = (struct link_free *)kmap_atomic(page) + - off / sizeof(*link); + vaddr = kmap_atomic(page); + link = (struct link_free *)vaddr + off / sizeof(*link); while ((off += class->size) < PAGE_SIZE) { link->next = obj_location_to_handle(page, i++); @@ -654,7 +658,7 @@ static void init_zspage(struct page *first_page, struct size_class *class) */ next_page = get_next_page(page); link->next = obj_location_to_handle(next_page, 0); - kunmap_atomic(link); + kunmap_atomic(vaddr); page = next_page; off %= PAGE_SIZE; } @@ -784,7 +788,7 @@ static inline int __zs_cpu_up(struct mapping_area *area) */ if (area->vm_buf) return 0; - area->vm_buf = (char *)__get_free_page(GFP_KERNEL); + area->vm_buf = kmalloc(ZS_MAX_ALLOC_SIZE, GFP_KERNEL); if (!area->vm_buf) return -ENOMEM; return 0; @@ -792,8 +796,7 @@ static inline int __zs_cpu_up(struct mapping_area *area) static inline void __zs_cpu_down(struct mapping_area *area) { - if (area->vm_buf) - free_page((unsigned long)area->vm_buf); + kfree(area->vm_buf); area->vm_buf = NULL; } @@ -881,14 +884,10 @@ static struct notifier_block zs_cpu_nb = { .notifier_call = zs_cpu_notifier }; -static void zs_exit(void) +static void zs_unregister_cpu_notifier(void) { int cpu; -#ifdef CONFIG_ZPOOL - zpool_unregister_driver(&zs_zpool_driver); -#endif - cpu_notifier_register_begin(); for_each_online_cpu(cpu) @@ -898,31 +897,74 @@ static void zs_exit(void) cpu_notifier_register_done(); } -static int zs_init(void) +static int zs_register_cpu_notifier(void) { - int cpu, ret; + int cpu, uninitialized_var(ret); cpu_notifier_register_begin(); __register_cpu_notifier(&zs_cpu_nb); for_each_online_cpu(cpu) { ret = zs_cpu_notifier(NULL, CPU_UP_PREPARE, (void *)(long)cpu); - if (notifier_to_errno(ret)) { - cpu_notifier_register_done(); - goto fail; - } + if (notifier_to_errno(ret)) + break; } cpu_notifier_register_done(); + return notifier_to_errno(ret); +} + +static void init_zs_size_classes(void) +{ + int nr; + nr = (ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) / ZS_SIZE_CLASS_DELTA + 1; + if ((ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) % ZS_SIZE_CLASS_DELTA) + nr += 1; + + zs_size_classes = nr; +} + +static void __exit zs_exit(void) +{ #ifdef CONFIG_ZPOOL - zpool_register_driver(&zs_zpool_driver); + zpool_unregister_driver(&zs_zpool_driver); #endif + zs_unregister_cpu_notifier(); +} +static int __init zs_init(void) +{ + int ret = zs_register_cpu_notifier(); + + if (ret) { + zs_unregister_cpu_notifier(); + return ret; + } + + init_zs_size_classes(); + +#ifdef CONFIG_ZPOOL + zpool_register_driver(&zs_zpool_driver); +#endif return 0; -fail: - zs_exit(); - return notifier_to_errno(ret); +} + +static unsigned int get_maxobj_per_zspage(int size, int pages_per_zspage) +{ + return pages_per_zspage * PAGE_SIZE / size; +} + +static bool can_merge(struct size_class *prev, int size, int pages_per_zspage) +{ + if (prev->pages_per_zspage != pages_per_zspage) + return false; + + if (get_maxobj_per_zspage(prev->size, prev->pages_per_zspage) + != get_maxobj_per_zspage(size, pages_per_zspage)) + return false; + + return true; } /** @@ -937,33 +979,71 @@ static int zs_init(void) */ struct zs_pool *zs_create_pool(gfp_t flags) { - int i, ovhd_size; + int i; struct zs_pool *pool; + struct size_class *prev_class = NULL; - ovhd_size = roundup(sizeof(*pool), PAGE_SIZE); - pool = kzalloc(ovhd_size, GFP_KERNEL); + pool = kzalloc(sizeof(*pool), GFP_KERNEL); if (!pool) return NULL; - for (i = 0; i < ZS_SIZE_CLASSES; i++) { + pool->size_class = kcalloc(zs_size_classes, sizeof(struct size_class *), + GFP_KERNEL); + if (!pool->size_class) { + kfree(pool); + return NULL; + } + + /* + * Iterate reversly, because, size of size_class that we want to use + * for merging should be larger or equal to current size. + */ + for (i = zs_size_classes - 1; i >= 0; i--) { int size; + int pages_per_zspage; struct size_class *class; size = ZS_MIN_ALLOC_SIZE + i * ZS_SIZE_CLASS_DELTA; if (size > ZS_MAX_ALLOC_SIZE) size = ZS_MAX_ALLOC_SIZE; + pages_per_zspage = get_pages_per_zspage(size); + + /* + * size_class is used for normal zsmalloc operation such + * as alloc/free for that size. Although it is natural that we + * have one size_class for each size, there is a chance that we + * can get more memory utilization if we use one size_class for + * many different sizes whose size_class have same + * characteristics. So, we makes size_class point to + * previous size_class if possible. + */ + if (prev_class) { + if (can_merge(prev_class, size, pages_per_zspage)) { + pool->size_class[i] = prev_class; + continue; + } + } + + class = kzalloc(sizeof(struct size_class), GFP_KERNEL); + if (!class) + goto err; - class = &pool->size_class[i]; class->size = size; class->index = i; + class->pages_per_zspage = pages_per_zspage; spin_lock_init(&class->lock); - class->pages_per_zspage = get_pages_per_zspage(size); + pool->size_class[i] = class; + prev_class = class; } pool->flags = flags; return pool; + +err: + zs_destroy_pool(pool); + return NULL; } EXPORT_SYMBOL_GPL(zs_create_pool); @@ -971,9 +1051,15 @@ void zs_destroy_pool(struct zs_pool *pool) { int i; - for (i = 0; i < ZS_SIZE_CLASSES; i++) { + for (i = 0; i < zs_size_classes; i++) { int fg; - struct size_class *class = &pool->size_class[i]; + struct size_class *class = pool->size_class[i]; + + if (!class) + continue; + + if (class->index != i) + continue; for (fg = 0; fg < _ZS_NR_FULLNESS_GROUPS; fg++) { if (class->fullness_list[fg]) { @@ -981,7 +1067,10 @@ void zs_destroy_pool(struct zs_pool *pool) class->size, fg); } } + kfree(class); } + + kfree(pool->size_class); kfree(pool); } EXPORT_SYMBOL_GPL(zs_destroy_pool); @@ -999,8 +1088,8 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size) { unsigned long obj; struct link_free *link; - int class_idx; struct size_class *class; + void *vaddr; struct page *first_page, *m_page; unsigned long m_objidx, m_offset; @@ -1008,9 +1097,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size) if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE)) return 0; - class_idx = get_size_class_index(size); - class = &pool->size_class[class_idx]; - BUG_ON(class_idx != class->index); + class = pool->size_class[get_size_class_index(size)]; spin_lock(&class->lock); first_page = find_get_zspage(class); @@ -1031,11 +1118,11 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size) obj_handle_to_location(obj, &m_page, &m_objidx); m_offset = obj_idx_to_offset(m_page, m_objidx, class->size); - link = (struct link_free *)kmap_atomic(m_page) + - m_offset / sizeof(*link); + vaddr = kmap_atomic(m_page); + link = (struct link_free *)vaddr + m_offset / sizeof(*link); first_page->freelist = link->next; memset(link, POISON_INUSE, sizeof(*link)); - kunmap_atomic(link); + kunmap_atomic(vaddr); first_page->inuse++; /* Now move the zspage to another fullness group, if required */ @@ -1051,6 +1138,7 @@ void zs_free(struct zs_pool *pool, unsigned long obj) struct link_free *link; struct page *first_page, *f_page; unsigned long f_objidx, f_offset; + void *vaddr; int class_idx; struct size_class *class; @@ -1063,16 +1151,16 @@ void zs_free(struct zs_pool *pool, unsigned long obj) first_page = get_first_page(f_page); get_zspage_mapping(first_page, &class_idx, &fullness); - class = &pool->size_class[class_idx]; + class = pool->size_class[class_idx]; f_offset = obj_idx_to_offset(f_page, f_objidx, class->size); spin_lock(&class->lock); /* Insert this object in containing zspage's freelist */ - link = (struct link_free *)((unsigned char *)kmap_atomic(f_page) - + f_offset); + vaddr = kmap_atomic(f_page); + link = (struct link_free *)(vaddr + f_offset); link->next = first_page->freelist; - kunmap_atomic(link); + kunmap_atomic(vaddr); first_page->freelist = (void *)obj; first_page->inuse--; @@ -1124,7 +1212,7 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, obj_handle_to_location(handle, &page, &obj_idx); get_zspage_mapping(get_first_page(page), &class_idx, &fg); - class = &pool->size_class[class_idx]; + class = pool->size_class[class_idx]; off = obj_idx_to_offset(page, obj_idx, class->size); area = &get_cpu_var(zs_map_area); @@ -1158,7 +1246,7 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle) obj_handle_to_location(handle, &page, &obj_idx); get_zspage_mapping(get_first_page(page), &class_idx, &fg); - class = &pool->size_class[class_idx]; + class = pool->size_class[class_idx]; off = obj_idx_to_offset(page, obj_idx, class->size); area = this_cpu_ptr(&zs_map_area); diff --git a/mm/zswap.c b/mm/zswap.c index c1543061a192df68c0620079635f77989b8ab89b..0cfce9bc51e4946c561e48b8cb5f9e6cf7669d5f 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -149,11 +149,10 @@ static int __init zswap_comp_init(void) return 0; } -static void zswap_comp_exit(void) +static void __init zswap_comp_exit(void) { /* free percpu transforms */ - if (zswap_comp_pcpu_tfms) - free_percpu(zswap_comp_pcpu_tfms); + free_percpu(zswap_comp_pcpu_tfms); } /********************************* @@ -206,7 +205,7 @@ static struct zswap_tree *zswap_trees[MAX_SWAPFILES]; **********************************/ static struct kmem_cache *zswap_entry_cache; -static int zswap_entry_cache_create(void) +static int __init zswap_entry_cache_create(void) { zswap_entry_cache = KMEM_CACHE(zswap_entry, 0); return zswap_entry_cache == NULL; @@ -389,7 +388,7 @@ static struct notifier_block zswap_cpu_notifier_block = { .notifier_call = zswap_cpu_notifier }; -static int zswap_cpu_init(void) +static int __init zswap_cpu_init(void) { unsigned long cpu; diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index 45f145c6f84371e1f1905f1408e577f161cd0041..c14893b501a9588efdf6d538011a82e8ea023325 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -15,6 +15,7 @@ TARGETS += user TARGETS += sysctl TARGETS += firmware TARGETS += ftrace +TARGETS += exec TARGETS_HOTPLUG = cpu-hotplug TARGETS_HOTPLUG += memory-hotplug diff --git a/tools/testing/selftests/exec/.gitignore b/tools/testing/selftests/exec/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..64073e050c6a44a0120a81c426414c38165b536f --- /dev/null +++ b/tools/testing/selftests/exec/.gitignore @@ -0,0 +1,9 @@ +subdir* +script* +execveat +execveat.symlink +execveat.moved +execveat.path.ephemeral +execveat.ephemeral +execveat.denatured +xxxxxxxx* \ No newline at end of file diff --git a/tools/testing/selftests/exec/Makefile b/tools/testing/selftests/exec/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..66dfc2ce178896ffe0938b5dc09f1f339d93076c --- /dev/null +++ b/tools/testing/selftests/exec/Makefile @@ -0,0 +1,25 @@ +CC = $(CROSS_COMPILE)gcc +CFLAGS = -Wall +BINARIES = execveat +DEPS = execveat.symlink execveat.denatured script subdir +all: $(BINARIES) $(DEPS) + +subdir: + mkdir -p $@ +script: + echo '#!/bin/sh' > $@ + echo 'exit $$*' >> $@ + chmod +x $@ +execveat.symlink: execveat + ln -s -f $< $@ +execveat.denatured: execveat + cp $< $@ + chmod -x $@ +%: %.c + $(CC) $(CFLAGS) -o $@ $^ + +run_tests: all + ./execveat + +clean: + rm -rf $(BINARIES) $(DEPS) subdir.moved execveat.moved xxxxx* diff --git a/tools/testing/selftests/exec/execveat.c b/tools/testing/selftests/exec/execveat.c new file mode 100644 index 0000000000000000000000000000000000000000..33a5c06d95caa038f682c411ad26df0788d16f5b --- /dev/null +++ b/tools/testing/selftests/exec/execveat.c @@ -0,0 +1,397 @@ +/* + * Copyright (c) 2014 Google, Inc. + * + * Licensed under the terms of the GNU GPL License version 2 + * + * Selftests for execveat(2). + */ + +#define _GNU_SOURCE /* to get O_PATH, AT_EMPTY_PATH */ +#include <sys/sendfile.h> +#include <sys/stat.h> +#include <sys/syscall.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <errno.h> +#include <fcntl.h> +#include <limits.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +static char longpath[2 * PATH_MAX] = ""; +static char *envp[] = { "IN_TEST=yes", NULL, NULL }; +static char *argv[] = { "execveat", "99", NULL }; + +static int execveat_(int fd, const char *path, char **argv, char **envp, + int flags) +{ +#ifdef __NR_execveat + return syscall(__NR_execveat, fd, path, argv, envp, flags); +#else + errno = -ENOSYS; + return -1; +#endif +} + +#define check_execveat_fail(fd, path, flags, errno) \ + _check_execveat_fail(fd, path, flags, errno, #errno) +static int _check_execveat_fail(int fd, const char *path, int flags, + int expected_errno, const char *errno_str) +{ + int rc; + + errno = 0; + printf("Check failure of execveat(%d, '%s', %d) with %s... ", + fd, path?:"(null)", flags, errno_str); + rc = execveat_(fd, path, argv, envp, flags); + + if (rc > 0) { + printf("[FAIL] (unexpected success from execveat(2))\n"); + return 1; + } + if (errno != expected_errno) { + printf("[FAIL] (expected errno %d (%s) not %d (%s)\n", + expected_errno, strerror(expected_errno), + errno, strerror(errno)); + return 1; + } + printf("[OK]\n"); + return 0; +} + +static int check_execveat_invoked_rc(int fd, const char *path, int flags, + int expected_rc) +{ + int status; + int rc; + pid_t child; + int pathlen = path ? strlen(path) : 0; + + if (pathlen > 40) + printf("Check success of execveat(%d, '%.20s...%s', %d)... ", + fd, path, (path + pathlen - 20), flags); + else + printf("Check success of execveat(%d, '%s', %d)... ", + fd, path?:"(null)", flags); + child = fork(); + if (child < 0) { + printf("[FAIL] (fork() failed)\n"); + return 1; + } + if (child == 0) { + /* Child: do execveat(). */ + rc = execveat_(fd, path, argv, envp, flags); + printf("[FAIL]: execveat() failed, rc=%d errno=%d (%s)\n", + rc, errno, strerror(errno)); + exit(1); /* should not reach here */ + } + /* Parent: wait for & check child's exit status. */ + rc = waitpid(child, &status, 0); + if (rc != child) { + printf("[FAIL] (waitpid(%d,...) returned %d)\n", child, rc); + return 1; + } + if (!WIFEXITED(status)) { + printf("[FAIL] (child %d did not exit cleanly, status=%08x)\n", + child, status); + return 1; + } + if (WEXITSTATUS(status) != expected_rc) { + printf("[FAIL] (child %d exited with %d not %d)\n", + child, WEXITSTATUS(status), expected_rc); + return 1; + } + printf("[OK]\n"); + return 0; +} + +static int check_execveat(int fd, const char *path, int flags) +{ + return check_execveat_invoked_rc(fd, path, flags, 99); +} + +static char *concat(const char *left, const char *right) +{ + char *result = malloc(strlen(left) + strlen(right) + 1); + + strcpy(result, left); + strcat(result, right); + return result; +} + +static int open_or_die(const char *filename, int flags) +{ + int fd = open(filename, flags); + + if (fd < 0) { + printf("Failed to open '%s'; " + "check prerequisites are available\n", filename); + exit(1); + } + return fd; +} + +static void exe_cp(const char *src, const char *dest) +{ + int in_fd = open_or_die(src, O_RDONLY); + int out_fd = open(dest, O_RDWR|O_CREAT|O_TRUNC, 0755); + struct stat info; + + fstat(in_fd, &info); + sendfile(out_fd, in_fd, NULL, info.st_size); + close(in_fd); + close(out_fd); +} + +#define XX_DIR_LEN 200 +static int check_execveat_pathmax(int dot_dfd, const char *src, int is_script) +{ + int fail = 0; + int ii, count, len; + char longname[XX_DIR_LEN + 1]; + int fd; + + if (*longpath == '\0') { + /* Create a filename close to PATH_MAX in length */ + memset(longname, 'x', XX_DIR_LEN - 1); + longname[XX_DIR_LEN - 1] = '/'; + longname[XX_DIR_LEN] = '\0'; + count = (PATH_MAX - 3) / XX_DIR_LEN; + for (ii = 0; ii < count; ii++) { + strcat(longpath, longname); + mkdir(longpath, 0755); + } + len = (PATH_MAX - 3) - (count * XX_DIR_LEN); + if (len <= 0) + len = 1; + memset(longname, 'y', len); + longname[len] = '\0'; + strcat(longpath, longname); + } + exe_cp(src, longpath); + + /* + * Execute as a pre-opened file descriptor, which works whether this is + * a script or not (because the interpreter sees a filename like + * "/dev/fd/20"). + */ + fd = open(longpath, O_RDONLY); + if (fd > 0) { + printf("Invoke copy of '%s' via filename of length %lu:\n", + src, strlen(longpath)); + fail += check_execveat(fd, "", AT_EMPTY_PATH); + } else { + printf("Failed to open length %lu filename, errno=%d (%s)\n", + strlen(longpath), errno, strerror(errno)); + fail++; + } + + /* + * Execute as a long pathname relative to ".". If this is a script, + * the interpreter will launch but fail to open the script because its + * name ("/dev/fd/5/xxx....") is bigger than PATH_MAX. + */ + if (is_script) + fail += check_execveat_invoked_rc(dot_dfd, longpath, 0, 127); + else + fail += check_execveat(dot_dfd, longpath, 0); + + return fail; +} + +static int run_tests(void) +{ + int fail = 0; + char *fullname = realpath("execveat", NULL); + char *fullname_script = realpath("script", NULL); + char *fullname_symlink = concat(fullname, ".symlink"); + int subdir_dfd = open_or_die("subdir", O_DIRECTORY|O_RDONLY); + int subdir_dfd_ephemeral = open_or_die("subdir.ephemeral", + O_DIRECTORY|O_RDONLY); + int dot_dfd = open_or_die(".", O_DIRECTORY|O_RDONLY); + int dot_dfd_path = open_or_die(".", O_DIRECTORY|O_RDONLY|O_PATH); + int dot_dfd_cloexec = open_or_die(".", O_DIRECTORY|O_RDONLY|O_CLOEXEC); + int fd = open_or_die("execveat", O_RDONLY); + int fd_path = open_or_die("execveat", O_RDONLY|O_PATH); + int fd_symlink = open_or_die("execveat.symlink", O_RDONLY); + int fd_denatured = open_or_die("execveat.denatured", O_RDONLY); + int fd_denatured_path = open_or_die("execveat.denatured", + O_RDONLY|O_PATH); + int fd_script = open_or_die("script", O_RDONLY); + int fd_ephemeral = open_or_die("execveat.ephemeral", O_RDONLY); + int fd_ephemeral_path = open_or_die("execveat.path.ephemeral", + O_RDONLY|O_PATH); + int fd_script_ephemeral = open_or_die("script.ephemeral", O_RDONLY); + int fd_cloexec = open_or_die("execveat", O_RDONLY|O_CLOEXEC); + int fd_script_cloexec = open_or_die("script", O_RDONLY|O_CLOEXEC); + + /* Change file position to confirm it doesn't affect anything */ + lseek(fd, 10, SEEK_SET); + + /* Normal executable file: */ + /* dfd + path */ + fail += check_execveat(subdir_dfd, "../execveat", 0); + fail += check_execveat(dot_dfd, "execveat", 0); + fail += check_execveat(dot_dfd_path, "execveat", 0); + /* absolute path */ + fail += check_execveat(AT_FDCWD, fullname, 0); + /* absolute path with nonsense dfd */ + fail += check_execveat(99, fullname, 0); + /* fd + no path */ + fail += check_execveat(fd, "", AT_EMPTY_PATH); + /* O_CLOEXEC fd + no path */ + fail += check_execveat(fd_cloexec, "", AT_EMPTY_PATH); + /* O_PATH fd */ + fail += check_execveat(fd_path, "", AT_EMPTY_PATH); + + /* Mess with executable file that's already open: */ + /* fd + no path to a file that's been renamed */ + rename("execveat.ephemeral", "execveat.moved"); + fail += check_execveat(fd_ephemeral, "", AT_EMPTY_PATH); + /* fd + no path to a file that's been deleted */ + unlink("execveat.moved"); /* remove the file now fd open */ + fail += check_execveat(fd_ephemeral, "", AT_EMPTY_PATH); + + /* Mess with executable file that's already open with O_PATH */ + /* fd + no path to a file that's been deleted */ + unlink("execveat.path.ephemeral"); + fail += check_execveat(fd_ephemeral_path, "", AT_EMPTY_PATH); + + /* Invalid argument failures */ + fail += check_execveat_fail(fd, "", 0, ENOENT); + fail += check_execveat_fail(fd, NULL, AT_EMPTY_PATH, EFAULT); + + /* Symlink to executable file: */ + /* dfd + path */ + fail += check_execveat(dot_dfd, "execveat.symlink", 0); + fail += check_execveat(dot_dfd_path, "execveat.symlink", 0); + /* absolute path */ + fail += check_execveat(AT_FDCWD, fullname_symlink, 0); + /* fd + no path, even with AT_SYMLINK_NOFOLLOW (already followed) */ + fail += check_execveat(fd_symlink, "", AT_EMPTY_PATH); + fail += check_execveat(fd_symlink, "", + AT_EMPTY_PATH|AT_SYMLINK_NOFOLLOW); + + /* Symlink fails when AT_SYMLINK_NOFOLLOW set: */ + /* dfd + path */ + fail += check_execveat_fail(dot_dfd, "execveat.symlink", + AT_SYMLINK_NOFOLLOW, ELOOP); + fail += check_execveat_fail(dot_dfd_path, "execveat.symlink", + AT_SYMLINK_NOFOLLOW, ELOOP); + /* absolute path */ + fail += check_execveat_fail(AT_FDCWD, fullname_symlink, + AT_SYMLINK_NOFOLLOW, ELOOP); + + /* Shell script wrapping executable file: */ + /* dfd + path */ + fail += check_execveat(subdir_dfd, "../script", 0); + fail += check_execveat(dot_dfd, "script", 0); + fail += check_execveat(dot_dfd_path, "script", 0); + /* absolute path */ + fail += check_execveat(AT_FDCWD, fullname_script, 0); + /* fd + no path */ + fail += check_execveat(fd_script, "", AT_EMPTY_PATH); + fail += check_execveat(fd_script, "", + AT_EMPTY_PATH|AT_SYMLINK_NOFOLLOW); + /* O_CLOEXEC fd fails for a script (as script file inaccessible) */ + fail += check_execveat_fail(fd_script_cloexec, "", AT_EMPTY_PATH, + ENOENT); + fail += check_execveat_fail(dot_dfd_cloexec, "script", 0, ENOENT); + + /* Mess with script file that's already open: */ + /* fd + no path to a file that's been renamed */ + rename("script.ephemeral", "script.moved"); + fail += check_execveat(fd_script_ephemeral, "", AT_EMPTY_PATH); + /* fd + no path to a file that's been deleted */ + unlink("script.moved"); /* remove the file while fd open */ + fail += check_execveat(fd_script_ephemeral, "", AT_EMPTY_PATH); + + /* Rename a subdirectory in the path: */ + rename("subdir.ephemeral", "subdir.moved"); + fail += check_execveat(subdir_dfd_ephemeral, "../script", 0); + fail += check_execveat(subdir_dfd_ephemeral, "script", 0); + /* Remove the subdir and its contents */ + unlink("subdir.moved/script"); + unlink("subdir.moved"); + /* Shell loads via deleted subdir OK because name starts with .. */ + fail += check_execveat(subdir_dfd_ephemeral, "../script", 0); + fail += check_execveat_fail(subdir_dfd_ephemeral, "script", 0, ENOENT); + + /* Flag values other than AT_SYMLINK_NOFOLLOW => EINVAL */ + fail += check_execveat_fail(dot_dfd, "execveat", 0xFFFF, EINVAL); + /* Invalid path => ENOENT */ + fail += check_execveat_fail(dot_dfd, "no-such-file", 0, ENOENT); + fail += check_execveat_fail(dot_dfd_path, "no-such-file", 0, ENOENT); + fail += check_execveat_fail(AT_FDCWD, "no-such-file", 0, ENOENT); + /* Attempt to execute directory => EACCES */ + fail += check_execveat_fail(dot_dfd, "", AT_EMPTY_PATH, EACCES); + /* Attempt to execute non-executable => EACCES */ + fail += check_execveat_fail(dot_dfd, "Makefile", 0, EACCES); + fail += check_execveat_fail(fd_denatured, "", AT_EMPTY_PATH, EACCES); + fail += check_execveat_fail(fd_denatured_path, "", AT_EMPTY_PATH, + EACCES); + /* Attempt to execute nonsense FD => EBADF */ + fail += check_execveat_fail(99, "", AT_EMPTY_PATH, EBADF); + fail += check_execveat_fail(99, "execveat", 0, EBADF); + /* Attempt to execute relative to non-directory => ENOTDIR */ + fail += check_execveat_fail(fd, "execveat", 0, ENOTDIR); + + fail += check_execveat_pathmax(dot_dfd, "execveat", 0); + fail += check_execveat_pathmax(dot_dfd, "script", 1); + return fail; +} + +static void prerequisites(void) +{ + int fd; + const char *script = "#!/bin/sh\nexit $*\n"; + + /* Create ephemeral copies of files */ + exe_cp("execveat", "execveat.ephemeral"); + exe_cp("execveat", "execveat.path.ephemeral"); + exe_cp("script", "script.ephemeral"); + mkdir("subdir.ephemeral", 0755); + + fd = open("subdir.ephemeral/script", O_RDWR|O_CREAT|O_TRUNC, 0755); + write(fd, script, strlen(script)); + close(fd); +} + +int main(int argc, char **argv) +{ + int ii; + int rc; + const char *verbose = getenv("VERBOSE"); + + if (argc >= 2) { + /* If we are invoked with an argument, don't run tests. */ + const char *in_test = getenv("IN_TEST"); + + if (verbose) { + printf(" invoked with:"); + for (ii = 0; ii < argc; ii++) + printf(" [%d]='%s'", ii, argv[ii]); + printf("\n"); + } + + /* Check expected environment transferred. */ + if (!in_test || strcmp(in_test, "yes") != 0) { + printf("[FAIL] (no IN_TEST=yes in env)\n"); + return 1; + } + + /* Use the final argument as an exit code. */ + rc = atoi(argv[argc - 1]); + fflush(stdout); + } else { + prerequisites(); + if (verbose) + envp[1] = "VERBOSE=1"; + rc = run_tests(); + if (rc > 0) + printf("%d tests failed\n", rc); + } + return rc; +} diff --git a/tools/vm/Makefile b/tools/vm/Makefile index 3d907dacf2ac85c4e9d17a5dec1808859c27f5f5..ac884b65a0725fc9b4e2ed46a490996524fd96a3 100644 --- a/tools/vm/Makefile +++ b/tools/vm/Makefile @@ -1,6 +1,6 @@ # Makefile for vm tools # -TARGETS=page-types slabinfo +TARGETS=page-types slabinfo page_owner_sort LIB_DIR = ../lib/api LIBS = $(LIB_DIR)/libapikfs.a @@ -18,5 +18,5 @@ $(LIBS): $(CC) $(CFLAGS) -o $@ $< $(LDFLAGS) clean: - $(RM) page-types slabinfo + $(RM) page-types slabinfo page_owner_sort make -C $(LIB_DIR) clean diff --git a/tools/vm/page_owner_sort.c b/tools/vm/page_owner_sort.c new file mode 100644 index 0000000000000000000000000000000000000000..77147b42d598be985abd087e5eb8ea5ea95875fc --- /dev/null +++ b/tools/vm/page_owner_sort.c @@ -0,0 +1,144 @@ +/* + * User-space helper to sort the output of /sys/kernel/debug/page_owner + * + * Example use: + * cat /sys/kernel/debug/page_owner > page_owner_full.txt + * grep -v ^PFN page_owner_full.txt > page_owner.txt + * ./sort page_owner.txt sorted_page_owner.txt +*/ + +#include <stdio.h> +#include <stdlib.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <unistd.h> +#include <string.h> + +struct block_list { + char *txt; + int len; + int num; +}; + + +static struct block_list *list; +static int list_size; +static int max_size; + +struct block_list *block_head; + +int read_block(char *buf, int buf_size, FILE *fin) +{ + char *curr = buf, *const buf_end = buf + buf_size; + + while (buf_end - curr > 1 && fgets(curr, buf_end - curr, fin)) { + if (*curr == '\n') /* empty line */ + return curr - buf; + curr += strlen(curr); + } + + return -1; /* EOF or no space left in buf. */ +} + +static int compare_txt(const void *p1, const void *p2) +{ + const struct block_list *l1 = p1, *l2 = p2; + + return strcmp(l1->txt, l2->txt); +} + +static int compare_num(const void *p1, const void *p2) +{ + const struct block_list *l1 = p1, *l2 = p2; + + return l2->num - l1->num; +} + +static void add_list(char *buf, int len) +{ + if (list_size != 0 && + len == list[list_size-1].len && + memcmp(buf, list[list_size-1].txt, len) == 0) { + list[list_size-1].num++; + return; + } + if (list_size == max_size) { + printf("max_size too small??\n"); + exit(1); + } + list[list_size].txt = malloc(len+1); + list[list_size].len = len; + list[list_size].num = 1; + memcpy(list[list_size].txt, buf, len); + list[list_size].txt[len] = 0; + list_size++; + if (list_size % 1000 == 0) { + printf("loaded %d\r", list_size); + fflush(stdout); + } +} + +#define BUF_SIZE 1024 + +int main(int argc, char **argv) +{ + FILE *fin, *fout; + char buf[BUF_SIZE]; + int ret, i, count; + struct block_list *list2; + struct stat st; + + if (argc < 3) { + printf("Usage: ./program <input> <output>\n"); + perror("open: "); + exit(1); + } + + fin = fopen(argv[1], "r"); + fout = fopen(argv[2], "w"); + if (!fin || !fout) { + printf("Usage: ./program <input> <output>\n"); + perror("open: "); + exit(1); + } + + fstat(fileno(fin), &st); + max_size = st.st_size / 100; /* hack ... */ + + list = malloc(max_size * sizeof(*list)); + + for ( ; ; ) { + ret = read_block(buf, BUF_SIZE, fin); + if (ret < 0) + break; + + add_list(buf, ret); + } + + printf("loaded %d\n", list_size); + + printf("sorting ....\n"); + + qsort(list, list_size, sizeof(list[0]), compare_txt); + + list2 = malloc(sizeof(*list) * list_size); + + printf("culling\n"); + + for (i = count = 0; i < list_size; i++) { + if (count == 0 || + strcmp(list2[count-1].txt, list[i].txt) != 0) { + list2[count++] = list[i]; + } else { + list2[count-1].num += list[i].num; + } + } + + qsort(list2, count, sizeof(list[0]), compare_num); + + for (i = 0; i < count; i++) + fprintf(fout, "%d times:\n%s\n", list2[i].num, list2[i].txt); + + return 0; +} diff --git a/usr/Kconfig b/usr/Kconfig index 2d4c77eecf2ef7dec344fb3cf6d13a40041c7016..572dcf7b6a444893829474e6160930fcc73fe6f8 100644 --- a/usr/Kconfig +++ b/usr/Kconfig @@ -46,17 +46,17 @@ config INITRAMFS_ROOT_GID If you are not sure, leave it set to "0". config RD_GZIP - bool "Support initial ramdisks compressed using gzip" if EXPERT - default y + bool "Support initial ramdisks compressed using gzip" depends on BLK_DEV_INITRD + default y select DECOMPRESS_GZIP help Support loading of a gzip encoded initial ramdisk or cpio buffer. If unsure, say Y. config RD_BZIP2 - bool "Support initial ramdisks compressed using bzip2" if EXPERT - default !EXPERT + bool "Support initial ramdisks compressed using bzip2" + default y depends on BLK_DEV_INITRD select DECOMPRESS_BZIP2 help @@ -64,8 +64,8 @@ config RD_BZIP2 If unsure, say N. config RD_LZMA - bool "Support initial ramdisks compressed using LZMA" if EXPERT - default !EXPERT + bool "Support initial ramdisks compressed using LZMA" + default y depends on BLK_DEV_INITRD select DECOMPRESS_LZMA help @@ -73,17 +73,17 @@ config RD_LZMA If unsure, say N. config RD_XZ - bool "Support initial ramdisks compressed using XZ" if EXPERT - default !EXPERT + bool "Support initial ramdisks compressed using XZ" depends on BLK_DEV_INITRD + default y select DECOMPRESS_XZ help Support loading of a XZ encoded initial ramdisk or cpio buffer. If unsure, say N. config RD_LZO - bool "Support initial ramdisks compressed using LZO" if EXPERT - default !EXPERT + bool "Support initial ramdisks compressed using LZO" + default y depends on BLK_DEV_INITRD select DECOMPRESS_LZO help @@ -91,8 +91,8 @@ config RD_LZO If unsure, say N. config RD_LZ4 - bool "Support initial ramdisks compressed using LZ4" if EXPERT - default !EXPERT + bool "Support initial ramdisks compressed using LZ4" + default y depends on BLK_DEV_INITRD select DECOMPRESS_LZ4 help