diff --git a/Documentation/ABI/testing/sysfs-class-bdi b/Documentation/ABI/testing/sysfs-class-bdi index d773d5697cf588349b74fd7d19e63983472b1632..3187a18af6da5bbbcc9ee2a0592e27f62ab1983c 100644 --- a/Documentation/ABI/testing/sysfs-class-bdi +++ b/Documentation/ABI/testing/sysfs-class-bdi @@ -53,3 +53,11 @@ stable_pages_required (read-only) If set, the backing device requires that all pages comprising a write request must not be changed until writeout is complete. + +strictlimit (read-write) + + Forces per-BDI checks for the share of the given device in the write-back + cache even before the global background dirty limit is reached. This + is useful in situations where the global limit is much higher than + affordable for a given, relatively slow (or untrusted) device. Turning + strictlimit on has no visible effect if max_ratio is equal to 100%. diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt index 257e65714c6a216f3fce9ebce7398eb821f96176..875b2b56b87fc88131324bb10bbecc8594e02109 100644 --- a/Documentation/blockdev/zram.txt +++ b/Documentation/blockdev/zram.txt @@ -218,6 +218,7 @@ line of text and contains the following stats separated by whitespace: same_pages the number of same element filled pages written to this disk. No memory is allocated for such pages. pages_compacted the number of pages freed during compaction + huge_pages the number of incompressible pages 9) Deactivate: swapoff /dev/zram0 @@ -242,5 +243,29 @@ to backing storage rather than keeping it in memory. User should set up backing device via /sys/block/zramX/backing_dev before disksize setting. += memory tracking + +With CONFIG_ZRAM_MEMORY_TRACKING, the user can get information about each +zram block. It can be useful for catching cold or incompressible +pages of a process with pagemap. +If you enable the feature, you can see the block state via +/sys/kernel/debug/zram/zram0/block_state. The output is as follows: + + 300 75.033841 .wh + 301 63.806904 s.. + 302 63.806919 ..h + +First column is zram's block index. +Second column is the access time since the system was booted. +Third column is the state of the block +(s: same page, +w: page written to backing store, +h: huge page). + +The first line of the above example says that the 300th block was last +accessed at 75.033841 sec and that the block is huge and has been written +back to the backing storage. This is a debugging feature, so no one should +rely on it to work properly. + Nitin Gupta ngupta@vflare.org
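For example, the tracking interface described above can be exercised as follows (a rough sketch only: the device, size and workload are illustrative, and it assumes a kernel built with CONFIG_ZRAM_MEMORY_TRACKING=y and debugfs mounted at /sys/kernel/debug):

	modprobe zram num_devices=1
	echo 512M > /sys/block/zram0/disksize
	mkswap /dev/zram0
	swapon /dev/zram0
	# ... run a memory-hungry workload so some pages get swapped ...
	head /sys/kernel/debug/zram/zram0/block_state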
diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt index 74cdeaed9f7afb4f0b514f1213b38b2eda106f56..657fe1769c754db4cba884d5f38da0ed13bc3662 100644 --- a/Documentation/cgroup-v2.txt +++ b/Documentation/cgroup-v2.txt @@ -48,6 +48,7 @@ v1 is available under Documentation/cgroup-v1/. 5-2-1. Memory Interface Files 5-2-2. Usage Guidelines 5-2-3. Memory Ownership + 5-2-4. OOM Killer 5-3. IO 5-3-1. IO Interface Files 5-3-2. Writeback @@ -1005,10 +1006,17 @@ PAGE_SIZE multiple when read back. A read-write single value file which exists on non-root cgroups. The default is "0". - Best-effort memory protection. If the memory usages of a - cgroup and all its ancestors are below their low boundaries, - the cgroup's memory won't be reclaimed unless memory can be - reclaimed from unprotected cgroups. + Best-effort memory protection. If the memory usage of a + cgroup is within its effective low boundary, the cgroup's + memory won't be reclaimed unless memory can be reclaimed + from unprotected cgroups. + + The effective low boundary is limited by the memory.low values of + all ancestor cgroups. If there is memory.low overcommitment + (child cgroups are requesting more protected memory than the + parent will allow), then each child cgroup will get a part of + the parent's protection proportional to its actual memory usage + below memory.low. Putting more memory than generally available under this protection is discouraged.
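For example (hypothetical numbers, following the proportional rule described above)::

	parent:  effective memory.low = 100M
	child A: memory.low = 80M, usage below memory.low = 75M
	child B: memory.low = 80M, usage below memory.low = 50M

	A's effective protection = 100M * 75 / (75 + 50) = 60M
	B's effective protection = 100M * 50 / (75 + 50) = 40M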
@@ -1039,6 +1047,31 @@ PAGE_SIZE multiple when read back. high limit is used and monitored properly, this limit's utility is limited to providing the final safety net. + memory.oom_group + + A read-write single value file which exists on non-root + cgroups. The default is "0". + + If set, the OOM killer will consider the memory cgroup as an + indivisible memory consumer and compare it with other memory + consumers by its memory footprint. + If such a memory cgroup is selected as an OOM victim, all + processes belonging to it or its descendants will be killed. + + This applies to system-wide OOM conditions and to reaching + the hard memory limit of the cgroup or one of its ancestors. + If the OOM condition happens in a descendant cgroup with its own + memory limit, the memory cgroup can't be considered + as an OOM victim, and the OOM killer will not kill all of its + tasks. + + Also, the OOM killer respects a /proc/pid/oom_score_adj value of -1000 + and will never kill such an unkillable task, even if memory.oom_group + is set. + + If the cgroup-aware OOM killer is not enabled, an ENOTSUPP error + is returned on any attempt to access the file. + memory.events A read-only flat-keyed file which exists on non-root cgroups. The following entries are defined. Unless specified @@ -1199,6 +1232,22 @@ PAGE_SIZE multiple when read back. Swap usage hard limit. If a cgroup's swap usage reaches this limit, anonymous memory of the cgroup will not be swapped out. + memory.swap.events + A read-only flat-keyed file which exists on non-root cgroups. + The following entries are defined. Unless specified + otherwise, a value change in this file generates a file + modified event. + + max + The number of times the cgroup's swap usage was about + to go over the max boundary and swap allocation + failed. + + fail + The number of times swap allocation failed either + because of running out of swap system-wide or the max + limit. + Usage Guidelines ~~~~~~~~~~~~~~~~ @@ -1242,6 +1291,54 @@ to be accessed repeatedly by other cgroups, it may make sense to use POSIX_FADV_DONTNEED to relinquish the ownership of memory areas belonging to the affected files to ensure correct memory ownership. +OOM Killer +~~~~~~~~~~ + +The cgroup v2 memory controller implements a cgroup-aware OOM killer, +which means that it treats cgroups as first-class OOM entities. + +The cgroup-aware OOM logic is turned off by default and requires +passing the "groupoom" option when mounting cgroupfs. It can also +be enabled by remounting cgroupfs with the following command:: + + # mount -o remount,groupoom $MOUNT_POINT + +Under OOM conditions the memory controller tries to make the best +choice of a victim, looking for the memory cgroup with the largest +memory footprint, considering leaf cgroups and cgroups with the +memory.oom_group option set, which are treated as indivisible +memory consumers. + +By default, the OOM killer will kill the biggest task in the selected +memory cgroup. A user can change this behavior by enabling +the per-cgroup memory.oom_group option. If set, it causes +the OOM killer to kill all processes attached to the cgroup, +except processes with oom_score_adj set to -1000. + +This affects both system- and cgroup-wide OOMs. For a cgroup-wide OOM +the memory controller considers only cgroups belonging to the sub-tree +of the OOM'ing cgroup. + +Leaf cgroups and cgroups with the oom_group option set are compared based +on their cumulative memory usage. The root cgroup is treated as a +leaf memory cgroup as well, so it is compared with other leaf memory +cgroups. Due to internal implementation restrictions the size of +the root cgroup is the cumulative sum of oom_badness of all its tasks +(in other words, the oom_score_adj of each task is obeyed). Relying on +oom_score_adj (apart from OOM_SCORE_ADJ_MIN) can lead to over- or +underestimation of the root cgroup consumption and is therefore +discouraged. This might change in the future, however. + +If there are no cgroups with the memory controller enabled, +the OOM killer uses the "traditional" process-based approach. + +Please note that memory charges are not migrated when tasks +are moved between different memory cgroups. Moving tasks with a +significant memory footprint may therefore affect the OOM victim +selection logic. If that is the case, please consider creating a +common ancestor for the source and destination memory cgroups and +enabling oom_group on the ancestor level. + IO -- @@ -1934,17 +2031,8 @@ system performance due to overreclaim, to the point where the feature becomes self-defeating. The memory.low boundary on the other hand is a top-down allocated -reserve. A cgroup enjoys reclaim protection when it and all its -ancestors are below their low boundaries, which makes delegation of -subtrees possible. Secondly, new cgroups have no reserve per default -and in the common case most cgroups are eligible for the preferred -reclaim pass. This allows the new low boundary to be efficiently -implemented with just a minor addition to the generic reclaim code, -without the need for out-of-band data structures and reclaim passes. -Because the generic reclaim code considers all cgroups except for the -ones running low in the preferred first reclaim pass, overreclaim of -individual groups is eliminated as well, resulting in much better -overall workload performance. +reserve. A cgroup enjoys reclaim protection when it is within its low +boundary, which makes delegation of subtrees possible. The original high boundary, the hard limit, is defined as a strict limit that can not budge, even if the OOM killer has to be called.
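A minimal usage sketch of the interface described above (it assumes cgroup2 is mounted at /sys/fs/cgroup, a kernel that includes the cgroup-aware OOM killer, and an illustrative cgroup named "job")::

	# mount -o remount,groupoom /sys/fs/cgroup
	# echo "+memory" > /sys/fs/cgroup/cgroup.subtree_control
	# mkdir /sys/fs/cgroup/job
	# echo 1 > /sys/fs/cgroup/job/memory.oom_group
	# echo $$ > /sys/fs/cgroup/job/cgroup.procs

With this configuration, if the system (or an ancestor that hits its hard limit) goes OOM and "job" is selected as the victim, all tasks in "job" and its descendants are killed together instead of a single process.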
diff --git a/Documentation/features/vm/pte_special/arch-support.txt b/Documentation/features/vm/pte_special/arch-support.txt index 055004f467d2ccb2bbab3cc01d5c447573d18b25..cd05924ea875f6eac766ba3ceba19aaff314e98b 100644 --- a/Documentation/features/vm/pte_special/arch-support.txt +++ b/Documentation/features/vm/pte_special/arch-support.txt @@ -1,6 +1,6 @@ # # Feature name: pte_special -# Kconfig: __HAVE_ARCH_PTE_SPECIAL +# Kconfig: ARCH_HAS_PTE_SPECIAL # description: arch supports the pte_special()/pte_mkspecial() VM APIs # ----------------------- diff --git a/Documentation/sysctl/fs.txt b/Documentation/sysctl/fs.txt index 6c00c1e2743fa2048882514d8644a908fff3a475..819caf8ca05f8b4b16226c0f5d6f76068de09131 100644 --- a/Documentation/sysctl/fs.txt +++ b/Documentation/sysctl/fs.txt @@ -34,7 +34,9 @@ Currently, these files are in /proc/sys/fs: - overflowgid - pipe-user-pages-hard - pipe-user-pages-soft +- protected_fifos - protected_hardlinks +- protected_regular - protected_symlinks - suid_dumpable - super-max @@ -182,6 +184,24 @@ applied. ============================================================== +protected_fifos: + +The intent of this protection is to avoid unintentional writes to +an attacker-controlled FIFO, where a program expected to create a regular +file. + +When set to "0", writing to FIFOs is unrestricted. + +When set to "1" don't allow O_CREAT open on FIFOs that we don't own +in world writable sticky directories, unless they are owned by the +owner of the directory. + +When set to "2" it also applies to group writable sticky directories. + +This protection is based on the restrictions in Openwall. + +============================================================== + protected_hardlinks: A long-standing class of security issues is the hardlink-based @@ -202,6 +222,22 @@ This protection is based on the restrictions in Openwall and grsecurity. ============================================================== +protected_regular: + +This protection is similar to protected_fifos, but it +avoids writes to an attacker-controlled regular file, where a program +expected to create one. + +When set to "0", writing to regular files is unrestricted. + +When set to "1" don't allow O_CREAT open on regular files that we +don't own in world writable sticky directories, unless they are +owned by the owner of the directory. + +When set to "2" it also applies to group writable sticky directories. + +============================================================== + protected_symlinks: A long-standing class of security issues is the symlink-based diff --git a/arch/Kconfig b/arch/Kconfig index 4377b0cec976bb8d4632721b2a68ef984b6af626..e30c3fa343bde72a24b5b23efb24efc454e7d735 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -464,6 +464,10 @@ config GCC_PLUGIN_LATENT_ENTROPY config GCC_PLUGIN_STRUCTLEAK bool "Force initialization of variables containing userspace addresses" depends on GCC_PLUGINS + # Currently STRUCTLEAK inserts initialization out of live scope of + # variables from KASAN point of view. This leads to KASAN false + # positive reports. Prohibit this combination for now. + depends on !KASAN_EXTRA help This plugin zero-initializes any structures containing a __user attribute. 
This can prevent some classes of information diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index d76bf4a8374016043963f1c04d42700e64ad605a..8516e2b0239ad43e716cc8dbb0ce328014008239 100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -44,6 +44,7 @@ config ARC select HAVE_GENERIC_DMA_COHERENT select HAVE_KERNEL_GZIP select HAVE_KERNEL_LZMA + select ARCH_HAS_PTE_SPECIAL config MIGHT_HAVE_PCI bool diff --git a/arch/arc/include/asm/pgtable.h b/arch/arc/include/asm/pgtable.h index 08fe33830d4b17337757d548399c897aa240de36..8ec5599a0957e3f2314a63450f46588125d94a82 100644 --- a/arch/arc/include/asm/pgtable.h +++ b/arch/arc/include/asm/pgtable.h @@ -320,8 +320,6 @@ PTE_BIT_FUNC(mkexec, |= (_PAGE_EXECUTE)); PTE_BIT_FUNC(mkspecial, |= (_PAGE_SPECIAL)); PTE_BIT_FUNC(mkhuge, |= (_PAGE_HW_SZ)); -#define __HAVE_ARCH_PTE_SPECIAL - static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) { return __pte((pte_val(pte) & _PAGE_CHG_MASK) | pgprot_val(newprot)); diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 2d34c0a44877e85b178d8b9d2f354029e6d37883..fa0b190f8a38999c00ff0614d215e71df0447c46 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -8,6 +8,7 @@ config ARM select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_FORTIFY_SOURCE + select ARCH_HAS_PTE_SPECIAL if ARM_LPAE select ARCH_HAS_SET_MEMORY select ARCH_HAS_PHYS_TO_DMA select ARCH_HAS_STRICT_KERNEL_RWX if MMU && !XIP_KERNEL diff --git a/arch/arm/include/asm/page.h b/arch/arm/include/asm/page.h index 4355f0ec44d62e9b5d7c40132f28f0ff6710f387..f98baaec0a1588c433c7221ebe77d62e120e0b2e 100644 --- a/arch/arm/include/asm/page.h +++ b/arch/arm/include/asm/page.h @@ -17,6 +17,8 @@ #ifndef __ASSEMBLY__ +#include <linux/personality.h> /* For READ_IMPLIES_EXEC */ + #ifndef CONFIG_MMU #include <asm/page-nommu.h> diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h index 2a4836087358bcb51bc844b1eabe5dbf58c2dc37..6d50a11d779345349cd2155fa433e3b9d6a75d08 100644 --- a/arch/arm/include/asm/pgtable-3level.h +++ b/arch/arm/include/asm/pgtable-3level.h @@ -219,7 +219,6 @@ static inline pte_t pte_mkspecial(pte_t pte) pte_val(pte) |= L_PTE_SPECIAL; return pte; } -#define __HAVE_ARCH_PTE_SPECIAL #define pmd_write(pmd) (pmd_isclear((pmd), L_PMD_SECT_RDONLY)) #define pmd_dirty(pmd) (pmd_isset((pmd), L_PMD_SECT_DIRTY)) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index eb2cf4938f6db124f21701d35415a15352a2ff86..9a3f1b1ab50c78ad07df11f92658578d3f527631 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -17,6 +17,7 @@ config ARM64 select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA select ARCH_HAS_KCOV select ARCH_HAS_MEMBARRIER_SYNC_CORE + select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_SET_MEMORY select ARCH_HAS_SG_CHAIN select ARCH_HAS_STRICT_KERNEL_RWX diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 7c4c8f318ba999afdac0b41300c9613fe161e91d..9f82d6b53851e4b6bedbb28f6d0e7480acd622a6 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -306,8 +306,6 @@ static inline int pte_same(pte_t pte_a, pte_t pte_b) #define HPAGE_MASK (~(HPAGE_SIZE - 1)) #define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT) -#define __HAVE_ARCH_PTE_SPECIAL - static inline pte_t pgd_pte(pgd_t pgd) { return __pte(pgd_val(pgd)); diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 2618a9170a52a63f54ae2ed70cfed2be5a498dfe..c64aa6c48fd970abd383185d2c911b22960058f2 100644 --- a/arch/powerpc/Kconfig +++ 
b/arch/powerpc/Kconfig @@ -141,6 +141,7 @@ config PPC select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_PHYS_TO_DMA select ARCH_HAS_PMEM_API if PPC64 + select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_MEMBARRIER_CALLBACKS select ARCH_HAS_SCALED_CPUTIME if VIRT_CPU_ACCOUNTING_NATIVE select ARCH_HAS_SG_CHAIN diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 47b5ffc8715d9c02e342c31459f201d66c7de399..b3ac8948b2571901df90f9b6b826a54337db36d8 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -319,9 +319,6 @@ extern unsigned long pci_io_base; /* Advertise special mapping type for AGP */ #define HAVE_PAGE_AGP -/* Advertise support for _PAGE_SPECIAL */ -#define __HAVE_ARCH_PTE_SPECIAL - #ifndef __ASSEMBLY__ /* diff --git a/arch/powerpc/include/asm/pte-common.h b/arch/powerpc/include/asm/pte-common.h index c4a72c7a8c831749faa89b7be71cad1573b218c9..03dfddb1f49a5e80b66951235a2ac82b12f4717a 100644 --- a/arch/powerpc/include/asm/pte-common.h +++ b/arch/powerpc/include/asm/pte-common.h @@ -216,9 +216,6 @@ static inline bool pte_user(pte_t pte) #define PAGE_AGP (PAGE_KERNEL_NC) #define HAVE_PAGE_AGP -/* Advertise support for _PAGE_SPECIAL */ -#define __HAVE_ARCH_PTE_SPECIAL - #ifndef _PAGE_READ /* if not defined, we should not find _PAGE_WRITE too */ #define _PAGE_READ 0 diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index db5382d925660b6154e966ee0ad28028d9839325..526b0ea575a6138ac37ea242cd1013436634d9ff 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -35,6 +35,7 @@ config RISCV select THREAD_INFO_IN_TASK select RISCV_TIMER select GENERIC_IRQ_MULTI_HANDLER + select ARCH_HAS_PTE_SPECIAL config MMU def_bool y diff --git a/arch/riscv/include/asm/pgtable-bits.h b/arch/riscv/include/asm/pgtable-bits.h index 997ddbb1d37092abf4b54636fea035b5c678a207..2fa2942be221e480088ab1a5681c9240ef3a1e23 100644 --- a/arch/riscv/include/asm/pgtable-bits.h +++ b/arch/riscv/include/asm/pgtable-bits.h @@ -42,7 +42,4 @@ _PAGE_WRITE | _PAGE_EXEC | \ _PAGE_USER | _PAGE_GLOBAL)) -/* Advertise support for _PAGE_SPECIAL */ -#define __HAVE_ARCH_PTE_SPECIAL - #endif /* _ASM_RISCV_PGTABLE_BITS_H */ diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 199ac3e4da1d9008a4a6abcdd5bec55e41354b19..71776b72483554a80dea0cb84d182d411308c9f2 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -68,6 +68,7 @@ config S390 select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA select ARCH_HAS_KCOV + select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_SET_MEMORY select ARCH_HAS_SG_CHAIN select ARCH_HAS_STRICT_KERNEL_RWX diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 2d24d33bf188a0957927a6addd8a7eb38a5319f6..9809694e1389ef836f4d5ea1eb4d9b08be5e1260 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -171,7 +171,6 @@ static inline int is_module_addr(void *addr) #define _PAGE_WRITE 0x020 /* SW pte write bit */ #define _PAGE_SPECIAL 0x040 /* SW associated with special page */ #define _PAGE_UNUSED 0x080 /* SW bit for pgste usage state */ -#define __HAVE_ARCH_PTE_SPECIAL #ifdef CONFIG_MEM_SOFT_DIRTY #define _PAGE_SOFT_DIRTY 0x002 /* SW pte soft dirty bit */ diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 97fe2931647648329c1b4b0bb88e8c3ac81e885f..a6c75b6806d2b9ff7e91cda849280528286fcfcf 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -50,6 +50,7 @@ config SUPERH select HAVE_ARCH_AUDITSYSCALL 
select HAVE_FUTEX_CMPXCHG if FUTEX select HAVE_NMI + select ARCH_HAS_PTE_SPECIAL help The SuperH is a RISC processor targeted for use in embedded systems and consumer electronics; it was also used in the Sega Dreamcast diff --git a/arch/sh/include/asm/pgtable.h b/arch/sh/include/asm/pgtable.h index 89c513a982fcbbd83ac87a77e0c95c32782b73e6..f6abfe2bca93b8254278f635420b1fbc410b4a99 100644 --- a/arch/sh/include/asm/pgtable.h +++ b/arch/sh/include/asm/pgtable.h @@ -156,8 +156,6 @@ extern void page_table_range_init(unsigned long start, unsigned long end, #define HAVE_ARCH_UNMAPPED_AREA #define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN -#define __HAVE_ARCH_PTE_SPECIAL - #include <asm-generic/pgtable.h> #endif /* __ASM_SH_PGTABLE_H */ diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 8767e45f1b2b70953583a7a7157707696466d407..6b5a4f05dcb2f4c1c7488453083530fe7eedbcca 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -86,6 +86,7 @@ config SPARC64 select ARCH_USE_QUEUED_SPINLOCKS select GENERIC_TIME_VSYSCALL select ARCH_CLOCKSOURCE_DATA + select ARCH_HAS_PTE_SPECIAL config ARCH_DEFCONFIG string diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index 44d6ac47e035f8fe65b91771e8a758dffae4ae92..1393a8ac596b0a050433b1afcc7d8a97302f47f2 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -117,9 +117,6 @@ bool kern_addr_valid(unsigned long addr); #define _PAGE_PMD_HUGE _AC(0x0100000000000000,UL) /* Huge page */ #define _PAGE_PUD_HUGE _PAGE_PMD_HUGE -/* Advertise support for _PAGE_SPECIAL */ -#define __HAVE_ARCH_PTE_SPECIAL - /* SUN4U pte bits... */ #define _PAGE_SZ4MB_4U _AC(0x6000000000000000,UL) /* 4MB Page */ #define _PAGE_SZ512K_4U _AC(0x4000000000000000,UL) /* 512K Page */ diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index c07f492b871a8bf0f47cfb7be03a004dd740c2c1..afb91e62ef52e3c1373d1f3e2f1fc84d157455ad 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -58,6 +58,7 @@ config X86 select ARCH_HAS_KCOV if X86_64 select ARCH_HAS_MEMBARRIER_SYNC_CORE select ARCH_HAS_PMEM_API if X86_64 + select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_REFCOUNT select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64 select ARCH_HAS_SET_MEMORY diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 1e5a406739536d2376b81845899710017896367b..99fff853c9444466b6bd63163da17d9676773647 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -65,7 +65,6 @@ #define _PAGE_PKEY_BIT2 (_AT(pteval_t, 0)) #define _PAGE_PKEY_BIT3 (_AT(pteval_t, 0)) #endif -#define __HAVE_ARCH_PTE_SPECIAL #define _PAGE_PKEY_MASK (_PAGE_PKEY_BIT0 | \ _PAGE_PKEY_BIT1 | \ diff --git a/block/genhd.c b/block/genhd.c index dc7e089373b9444a0b83e176b196e2539e6ff59b..10dcc29b5e9d77e19e021465b102244259ad2c02 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -991,7 +991,7 @@ static int show_partition(struct seq_file *seqf, void *v) char buf[BDEVNAME_SIZE]; /* Don't show non-partitionable removeable devices or empty devices */ - if (!get_capacity(sgp) || (!disk_max_parts(sgp) && + if (!get_capacity(sgp) || (!(disk_max_parts(sgp) > 1) && (sgp->flags & GENHD_FL_REMOVABLE))) return 0; if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO) diff --git a/drivers/block/zram/Kconfig b/drivers/block/zram/Kconfig index ac3a31d433b2e9f698d051bfcd06f76d0efad971..635235759a0ab06b593c2d100f8b9f2ec75cc392 100644 --- a/drivers/block/zram/Kconfig +++ b/drivers/block/zram/Kconfig @@ -13,7 +13,7 @@ config ZRAM It has several use 
cases, for example: /tmp storage, use as swap disks and maybe many more. - See zram.txt for more information. + See Documentation/blockdev/zram.txt for more information. config ZRAM_WRITEBACK bool "Write back incompressible page to backing device" @@ -25,4 +25,14 @@ config ZRAM_WRITEBACK For this feature, admin should set up backing device via /sys/block/zramX/backing_dev. - See zram.txt for more infomration. + See Documentation/blockdev/zram.txt for more information. + +config ZRAM_MEMORY_TRACKING + bool "Track zRam block status" + depends on ZRAM && DEBUG_FS + help + With this feature, admin can track the state of allocated blocks + of zRAM. Admin could see the information via + /sys/kernel/debug/zram/zramX/block_state. + + See Documentation/blockdev/zram.txt for more information. diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 0f3fadd712307efc7b5cf343d18fb42bd76cbd43..68d727d89d38f956eecd6dcb07fa393117ec1253 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -31,6 +31,7 @@ #include <linux/err.h> #include <linux/idr.h> #include <linux/sysfs.h> +#include <linux/debugfs.h> #include <linux/cpuhotplug.h> #include "zram_drv.h" @@ -52,11 +53,28 @@ static size_t huge_class_size; static void zram_free_page(struct zram *zram, size_t index); +static void zram_slot_lock(struct zram *zram, u32 index) +{ + bit_spin_lock(ZRAM_LOCK, &zram->table[index].value); +} + +static void zram_slot_unlock(struct zram *zram, u32 index) +{ + bit_spin_unlock(ZRAM_LOCK, &zram->table[index].value); +} + static inline bool init_done(struct zram *zram) { return zram->disksize; } +static inline bool zram_allocated(struct zram *zram, u32 index) +{ + + return (zram->table[index].value >> (ZRAM_FLAG_SHIFT + 1)) || + zram->table[index].handle; +} + static inline struct zram *dev_to_zram(struct device *dev) { return (struct zram *)dev_to_disk(dev)->private_data; @@ -73,7 +91,7 @@ static void zram_set_handle(struct zram *zram, u32 index, unsigned long handle) } /* flag operations require table entry bit_spin_lock() being held */ -static int zram_test_flag(struct zram *zram, u32 index, +static bool zram_test_flag(struct zram *zram, u32 index, enum zram_pageflags flag) { return zram->table[index].value & BIT(flag); @@ -600,6 +618,113 @@ static int read_from_bdev(struct zram *zram, struct bio_vec *bvec, static void zram_wb_clear(struct zram *zram, u32 index) {} #endif +#ifdef CONFIG_ZRAM_MEMORY_TRACKING + +static struct dentry *zram_debugfs_root; + +static void zram_debugfs_create(void) +{ + zram_debugfs_root = debugfs_create_dir("zram", NULL); +} + +static void zram_debugfs_destroy(void) +{ + debugfs_remove_recursive(zram_debugfs_root); +} + +static void zram_accessed(struct zram *zram, u32 index) +{ + zram->table[index].ac_time = ktime_get_boottime(); +} + +static void zram_reset_access(struct zram *zram, u32 index) +{ + zram->table[index].ac_time = 0; +} + +static ssize_t read_block_state(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + char *kbuf; + ssize_t index, written = 0; + struct zram *zram = file->private_data; + unsigned long nr_pages = zram->disksize >> PAGE_SHIFT; + struct timespec64 ts; + + kbuf = kvmalloc(count, GFP_KERNEL); + if (!kbuf) + return -ENOMEM; + + down_read(&zram->init_lock); + if (!init_done(zram)) { + up_read(&zram->init_lock); + kvfree(kbuf); + return -EINVAL; + } + + for (index = *ppos; index < nr_pages; index++) { + int copied; + + zram_slot_lock(zram, index); + if (!zram_allocated(zram, index)) + goto next; + + 
ts = ktime_to_timespec64(zram->table[index].ac_time); + copied = snprintf(kbuf + written, count, + "%12lu %12lu.%06lu %c%c%c\n", + index, ts.tv_sec, ts.tv_nsec / NSEC_PER_USEC, + zram_test_flag(zram, index, ZRAM_SAME) ? 's' : '.', + zram_test_flag(zram, index, ZRAM_WB) ? 'w' : '.', + zram_test_flag(zram, index, ZRAM_HUGE) ? 'h' : '.'); + + if (count < copied) { + zram_slot_unlock(zram, index); + break; + } + written += copied; + count -= copied; +next: + zram_slot_unlock(zram, index); + *ppos += 1; + } + + up_read(&zram->init_lock); + if (copy_to_user(buf, kbuf, written)) + written = -EFAULT; + kvfree(kbuf); + + return written; +} + +static const struct file_operations proc_zram_block_state_op = { + .open = simple_open, + .read = read_block_state, + .llseek = default_llseek, +}; + +static void zram_debugfs_register(struct zram *zram) +{ + if (!zram_debugfs_root) + return; + + zram->debugfs_dir = debugfs_create_dir(zram->disk->disk_name, + zram_debugfs_root); + debugfs_create_file("block_state", 0400, zram->debugfs_dir, + zram, &proc_zram_block_state_op); +} + +static void zram_debugfs_unregister(struct zram *zram) +{ + debugfs_remove_recursive(zram->debugfs_dir); +} +#else +static void zram_debugfs_create(void) {}; +static void zram_debugfs_destroy(void) {}; +static void zram_accessed(struct zram *zram, u32 index) {}; +static void zram_reset_access(struct zram *zram, u32 index) {}; +static void zram_debugfs_register(struct zram *zram) {}; +static void zram_debugfs_unregister(struct zram *zram) {}; +#endif /* * We switched to per-cpu streams and this attr is not needed anymore. @@ -719,14 +844,15 @@ static ssize_t mm_stat_show(struct device *dev, max_used = atomic_long_read(&zram->stats.max_used_pages); ret = scnprintf(buf, PAGE_SIZE, - "%8llu %8llu %8llu %8lu %8ld %8llu %8lu\n", + "%8llu %8llu %8llu %8lu %8ld %8llu %8lu %8llu\n", orig_size << PAGE_SHIFT, (u64)atomic64_read(&zram->stats.compr_data_size), mem_used << PAGE_SHIFT, zram->limit_pages << PAGE_SHIFT, max_used << PAGE_SHIFT, (u64)atomic64_read(&zram->stats.same_pages), - pool_stats.pages_compacted); + pool_stats.pages_compacted, + (u64)atomic64_read(&zram->stats.huge_pages)); up_read(&zram->init_lock); return ret; @@ -753,16 +879,6 @@ static DEVICE_ATTR_RO(io_stat); static DEVICE_ATTR_RO(mm_stat); static DEVICE_ATTR_RO(debug_stat); -static void zram_slot_lock(struct zram *zram, u32 index) -{ - bit_spin_lock(ZRAM_ACCESS, &zram->table[index].value); -} - -static void zram_slot_unlock(struct zram *zram, u32 index) -{ - bit_spin_unlock(ZRAM_ACCESS, &zram->table[index].value); -} - static void zram_meta_free(struct zram *zram, u64 disksize) { size_t num_pages = disksize >> PAGE_SHIFT; @@ -805,6 +921,13 @@ static void zram_free_page(struct zram *zram, size_t index) { unsigned long handle; + zram_reset_access(zram, index); + + if (zram_test_flag(zram, index, ZRAM_HUGE)) { + zram_clear_flag(zram, index, ZRAM_HUGE); + atomic64_dec(&zram->stats.huge_pages); + } + if (zram_wb_enabled(zram) && zram_test_flag(zram, index, ZRAM_WB)) { zram_wb_clear(zram, index); atomic64_dec(&zram->stats.pages_stored); @@ -973,6 +1096,7 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, } if (unlikely(comp_len >= huge_class_size)) { + comp_len = PAGE_SIZE; if (zram_wb_enabled(zram) && allow_wb) { zcomp_stream_put(zram->comp); ret = write_to_bdev(zram, bvec, index, bio, &element); @@ -984,7 +1108,6 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, allow_wb = false; goto compress_again; } - comp_len = PAGE_SIZE; } /* @@ 
-1046,6 +1169,11 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, zram_slot_lock(zram, index); zram_free_page(zram, index); + if (comp_len == PAGE_SIZE) { + zram_set_flag(zram, index, ZRAM_HUGE); + atomic64_inc(&zram->stats.huge_pages); + } + if (flags) { zram_set_flag(zram, index, flags); zram_set_element(zram, index, element); @@ -1166,6 +1294,10 @@ static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, generic_end_io_acct(q, rw_acct, &zram->disk->part0, start_time); + zram_slot_lock(zram, index); + zram_accessed(zram, index); + zram_slot_unlock(zram, index); + if (unlikely(ret < 0)) { if (!is_write) atomic64_inc(&zram->stats.failed_reads); @@ -1577,6 +1709,7 @@ static int zram_add(void) } strlcpy(zram->compressor, default_compressor, sizeof(zram->compressor)); + zram_debugfs_register(zram); pr_info("Added device: %s\n", zram->disk->disk_name); return device_id; @@ -1610,6 +1743,7 @@ static int zram_remove(struct zram *zram) zram->claim = true; mutex_unlock(&bdev->bd_mutex); + zram_debugfs_unregister(zram); /* * Remove sysfs first, so no one will perform a disksize * store while we destroy the devices. This also helps during @@ -1712,6 +1846,7 @@ static void destroy_devices(void) { class_unregister(&zram_control_class); idr_for_each(&zram_index_idr, &zram_remove_cb, NULL); + zram_debugfs_destroy(); idr_destroy(&zram_index_idr); unregister_blkdev(zram_major, "zram"); cpuhp_remove_multi_state(CPUHP_ZCOMP_PREPARE); @@ -1733,6 +1868,7 @@ static int __init zram_init(void) return ret; } + zram_debugfs_create(); zram_major = register_blkdev(0, "zram"); if (zram_major <= 0) { pr_err("Unable to get major number\n"); diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 008861220723cbb0da9338e390ff9f2bf3205818..72c8584b6dfff46847a73dbc066ceeff3ecbcec1 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -43,10 +43,11 @@ /* Flags for zram pages (table[page_no].value) */ enum zram_pageflags { - /* Page consists the same element */ - ZRAM_SAME = ZRAM_FLAG_SHIFT, - ZRAM_ACCESS, /* page is now accessed */ + /* zram slot is locked */ + ZRAM_LOCK = ZRAM_FLAG_SHIFT, + ZRAM_SAME, /* Page consists the same element */ ZRAM_WB, /* page is stored on backing_device */ + ZRAM_HUGE, /* Incompressible page */ __NR_ZRAM_PAGEFLAGS, }; @@ -60,6 +61,9 @@ struct zram_table_entry { unsigned long element; }; unsigned long value; +#ifdef CONFIG_ZRAM_MEMORY_TRACKING + ktime_t ac_time; +#endif }; struct zram_stats { @@ -71,6 +75,7 @@ struct zram_stats { atomic64_t invalid_io; /* non-page-aligned I/O requests */ atomic64_t notify_free; /* no. of swap slot free notifications */ atomic64_t same_pages; /* no. of same element filled pages */ + atomic64_t huge_pages; /* no. of huge pages */ atomic64_t pages_stored; /* no. of pages currently stored */ atomic_long_t max_used_pages; /* no. of maximum pages stored */ atomic64_t writestall; /* no. 
of write slow paths */ @@ -107,5 +112,8 @@ struct zram { unsigned long nr_pages; spinlock_t bitmap_lock; #endif +#ifdef CONFIG_ZRAM_MEMORY_TRACKING + struct dentry *debugfs_dir; +#endif }; #endif diff --git a/fs/Kconfig b/fs/Kconfig index ac4ac908f001a28f5af87f59be4fd6afe398ef23..51f78a28072af865324d03a149575d69033ebf6a 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -203,6 +203,9 @@ config HUGETLBFS config HUGETLB_PAGE def_bool HUGETLBFS +config MEMFD_CREATE + def_bool TMPFS || HUGETLBFS + config ARCH_HAS_GIGANTIC_PAGE bool diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 4ad6f669fe34b21ec592cdd25b7b284fb0f74398..8676bb01b5a99aa433ad58ecc3e3c7aa276ba00a 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1621,8 +1621,8 @@ static int fill_files_note(struct memelfnote *note) if (size >= MAX_FILE_NOTE_SIZE) /* paranoia check */ return -EINVAL; size = round_up(size, PAGE_SIZE); - data = vmalloc(size); - if (!data) + data = kvmalloc(size, GFP_KERNEL); + if (ZERO_OR_NULL_PTR(data)) return -ENOMEM; start_end_ofs = data + 2; @@ -1639,7 +1639,7 @@ static int fill_files_note(struct memelfnote *note) filename = file_path(file, name_curpos, remaining); if (IS_ERR(filename)) { if (PTR_ERR(filename) == -ENAMETOOLONG) { - vfree(data); + kvfree(data); size = size * 5 / 4; goto alloc; } @@ -1932,7 +1932,7 @@ static void free_note_info(struct elf_note_info *info) kfree(t); } kfree(info->psinfo.data); - vfree(info->files.data); + kvfree(info->files.data); } #else @@ -2148,7 +2148,7 @@ static void free_note_info(struct elf_note_info *info) /* Free data possibly allocated by fill_files_note(): */ if (info->notes_files) - vfree(info->notes_files->data); + kvfree(info->notes_files->data); kfree(info->prstatus); kfree(info->psinfo); @@ -2294,8 +2294,8 @@ static int elf_core_dump(struct coredump_params *cprm) if (segs - 1 > ULONG_MAX / sizeof(*vma_filesz)) goto end_coredump; - vma_filesz = vmalloc((segs - 1) * sizeof(*vma_filesz)); - if (!vma_filesz) + vma_filesz = kvmalloc((segs - 1) * sizeof(*vma_filesz), GFP_KERNEL); + if (ZERO_OR_NULL_PTR(vma_filesz)) goto end_coredump; for (i = 0, vma = first_vma(current, gate_vma); vma != NULL; @@ -2402,7 +2402,7 @@ static int elf_core_dump(struct coredump_params *cprm) cleanup: free_note_info(&info); kfree(shdr4extnum); - vfree(vma_filesz); + kvfree(vma_filesz); kfree(phdr4note); kfree(elf); out: diff --git a/fs/dcache.c b/fs/dcache.c index 60df712262c2e5245e907b699bcd7d1803e864ba..399325060883fcf2d344b675e4307e44ccf40cdd 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -292,7 +292,8 @@ void take_dentry_name_snapshot(struct name_snapshot *name, struct dentry *dentry spin_unlock(&dentry->d_lock); name->name = p->name; } else { - memcpy(name->inline_name, dentry->d_iname, DNAME_INLINE_LEN); + memcpy(name->inline_name, dentry->d_iname, + dentry->d_name.len + 1); spin_unlock(&dentry->d_lock); name->name = name->inline_name; } diff --git a/fs/exec.c b/fs/exec.c index 183059c427b9c5552fc29d5eb01f90891930240f..32eea4c659093696ab72b6071a9ee0a1b0e00f52 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1338,6 +1338,7 @@ void setup_new_exec(struct linux_binprm * bprm) if (bprm->secureexec) { /* Make sure parent cannot signal privileged process. 
*/ current->pdeath_signal = 0; + current->signal->pdeath_signal_proc = 0; /* * For secureexec, reset the stack limit to sane default to diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index 3c6a9c156b7accb06d96905bc454fe41b2e7c857..cfa862ea19d2c343ad62603c1aeaddffb204582b 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -146,68 +146,82 @@ int _ore_get_io_state(struct ore_layout *layout, struct ore_io_state **pios) { struct ore_io_state *ios; - struct page **pages; - struct osd_sg_entry *sgilist; + size_t size_ios, size_extra, size_total; + void *ios_extra; + + /* + * The desired layout looks like this, with the extra_allocation + * items pointed at from fields within ios or per_dev: + struct __alloc_all_io_state { struct ore_io_state ios; struct ore_per_dev_state per_dev[numdevs]; union { struct osd_sg_entry sglist[sgs_per_dev * numdevs]; struct page *pages[num_par_pages]; - }; - } *_aios; - - if (likely(sizeof(*_aios) <= PAGE_SIZE)) { - _aios = kzalloc(sizeof(*_aios), GFP_KERNEL); - if (unlikely(!_aios)) { - ORE_DBGMSG("Failed kzalloc bytes=%zd\n", - sizeof(*_aios)); + } extra_allocation; + } whole_allocation; + + */ + + /* This should never happen, so abort early if it ever does. */ + if (sgs_per_dev && num_par_pages) { + ORE_DBGMSG("Tried to use both pages and sglist\n"); + *pios = NULL; + return -EINVAL; + } + + if (numdevs > (INT_MAX - sizeof(*ios)) / + sizeof(struct ore_per_dev_state)) + return -ENOMEM; + size_ios = sizeof(*ios) + sizeof(struct ore_per_dev_state) * numdevs; + + if (sgs_per_dev * numdevs > INT_MAX / sizeof(struct osd_sg_entry)) + return -ENOMEM; + if (num_par_pages > INT_MAX / sizeof(struct page *)) + return -ENOMEM; + size_extra = max(sizeof(struct osd_sg_entry) * (sgs_per_dev * numdevs), + sizeof(struct page *) * num_par_pages); + + size_total = size_ios + size_extra; + + if (likely(size_total <= PAGE_SIZE)) { + ios = kzalloc(size_total, GFP_KERNEL); + if (unlikely(!ios)) { + ORE_DBGMSG("Failed kzalloc bytes=%zd\n", size_total); *pios = NULL; return -ENOMEM; } - pages = num_par_pages ? _aios->pages : NULL; - sgilist = sgs_per_dev ? _aios->sglist : NULL; - ios = &_aios->ios; + ios_extra = (char *)ios + size_ios; } else { - struct __alloc_small_io_state { - struct ore_io_state ios; - struct ore_per_dev_state per_dev[numdevs]; - } *_aio_small; - union __extra_part { - struct osd_sg_entry sglist[sgs_per_dev * numdevs]; - struct page *pages[num_par_pages]; - } *extra_part; - - _aio_small = kzalloc(sizeof(*_aio_small), GFP_KERNEL); - if (unlikely(!_aio_small)) { + ios = kzalloc(size_ios, GFP_KERNEL); + if (unlikely(!ios)) { ORE_DBGMSG("Failed alloc first part bytes=%zd\n", - sizeof(*_aio_small)); + size_ios); *pios = NULL; return -ENOMEM; } - extra_part = kzalloc(sizeof(*extra_part), GFP_KERNEL); - if (unlikely(!extra_part)) { + ios_extra = kzalloc(size_extra, GFP_KERNEL); + if (unlikely(!ios_extra)) { ORE_DBGMSG("Failed alloc second part bytes=%zd\n", - sizeof(*extra_part)); - kfree(_aio_small); + size_extra); + kfree(ios); *pios = NULL; return -ENOMEM; } - pages = num_par_pages ? extra_part->pages : NULL; - sgilist = sgs_per_dev ? 
extra_part->sglist : NULL; /* In this case the per_dev[0].sgilist holds the pointer to * be freed */ - ios = &_aio_small->ios; ios->extra_part_alloc = true; } - if (pages) { - ios->parity_pages = pages; + if (num_par_pages) { + ios->parity_pages = ios_extra; ios->max_par_pages = num_par_pages; } - if (sgilist) { + if (sgs_per_dev) { + struct osd_sg_entry *sgilist = ios_extra; unsigned d; for (d = 0; d < numdevs; ++d) { diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c index 27cbdb6976495f5f830ca473e9b844fb1a2bd29d..199590f362030d2c4dad33968b2e0a12ef90299a 100644 --- a/fs/exofs/ore_raid.c +++ b/fs/exofs/ore_raid.c @@ -71,6 +71,11 @@ static int _sp2d_alloc(unsigned pages_in_unit, unsigned group_width, { struct __stripe_pages_2d *sp2d; unsigned data_devs = group_width - parity; + + /* + * Desired allocation layout is, though when larger than PAGE_SIZE, + * each struct __alloc_1p_arrays is separately allocated: + struct _alloc_all_bytes { struct __alloc_stripe_pages_2d { struct __stripe_pages_2d sp2d; @@ -82,55 +87,85 @@ static int _sp2d_alloc(unsigned pages_in_unit, unsigned group_width, char page_is_read[data_devs]; } __a1pa[pages_in_unit]; } *_aab; + struct __alloc_1p_arrays *__a1pa; struct __alloc_1p_arrays *__a1pa_end; - const unsigned sizeof__a1pa = sizeof(_aab->__a1pa[0]); + + */ + + char *__a1pa; + char *__a1pa_end; + + const size_t sizeof_stripe_pages_2d = + sizeof(struct __stripe_pages_2d) + + sizeof(struct __1_page_stripe) * pages_in_unit; + const size_t sizeof__a1pa = + ALIGN(sizeof(struct page *) * (2 * group_width) + data_devs, + sizeof(void *)); + const size_t sizeof__a1pa_arrays = sizeof__a1pa * pages_in_unit; + const size_t alloc_total = sizeof_stripe_pages_2d + + sizeof__a1pa_arrays; + unsigned num_a1pa, alloc_size, i; /* FIXME: check these numbers in ore_verify_layout */ - BUG_ON(sizeof(_aab->__asp2d) > PAGE_SIZE); + BUG_ON(sizeof_stripe_pages_2d > PAGE_SIZE); BUG_ON(sizeof__a1pa > PAGE_SIZE); - if (sizeof(*_aab) > PAGE_SIZE) { - num_a1pa = (PAGE_SIZE - sizeof(_aab->__asp2d)) / sizeof__a1pa; - alloc_size = sizeof(_aab->__asp2d) + sizeof__a1pa * num_a1pa; + /* + * If alloc_total would be larger than PAGE_SIZE, only allocate + * as many a1pa items as would fill the rest of the page, instead + * of the full pages_in_unit count. + */ + if (alloc_total > PAGE_SIZE) { + num_a1pa = (PAGE_SIZE - sizeof_stripe_pages_2d) / sizeof__a1pa; + alloc_size = sizeof_stripe_pages_2d + sizeof__a1pa * num_a1pa; } else { num_a1pa = pages_in_unit; - alloc_size = sizeof(*_aab); + alloc_size = alloc_total; } - _aab = kzalloc(alloc_size, GFP_KERNEL); - if (unlikely(!_aab)) { + *psp2d = sp2d = kzalloc(alloc_size, GFP_KERNEL); + if (unlikely(!sp2d)) { ORE_DBGMSG("!! Failed to alloc sp2d size=%d\n", alloc_size); return -ENOMEM; } + /* From here Just call _sp2d_free */ - sp2d = &_aab->__asp2d.sp2d; - *psp2d = sp2d; /* From here Just call _sp2d_free */ - - __a1pa = _aab->__a1pa; - __a1pa_end = __a1pa + num_a1pa; + /* Find start of a1pa area. */ + __a1pa = (char *)sp2d + sizeof_stripe_pages_2d; + /* Find end of the _allocated_ a1pa area. */ + __a1pa_end = __a1pa + alloc_size; + /* Allocate additionally needed a1pa items in PAGE_SIZE chunks. 
*/ for (i = 0; i < pages_in_unit; ++i) { + struct __1_page_stripe *stripe = &sp2d->_1p_stripes[i]; + if (unlikely(__a1pa >= __a1pa_end)) { num_a1pa = min_t(unsigned, PAGE_SIZE / sizeof__a1pa, pages_in_unit - i); + alloc_size = sizeof__a1pa * num_a1pa; - __a1pa = kcalloc(num_a1pa, sizeof__a1pa, GFP_KERNEL); + __a1pa = kzalloc(alloc_size, GFP_KERNEL); if (unlikely(!__a1pa)) { ORE_DBGMSG("!! Failed to _alloc_1p_arrays=%d\n", num_a1pa); return -ENOMEM; } - __a1pa_end = __a1pa + num_a1pa; + __a1pa_end = __a1pa + alloc_size; /* First *pages is marked for kfree of the buffer */ - sp2d->_1p_stripes[i].alloc = true; + stripe->alloc = true; } - sp2d->_1p_stripes[i].pages = __a1pa->pages; - sp2d->_1p_stripes[i].scribble = __a1pa->scribble ; - sp2d->_1p_stripes[i].page_is_read = __a1pa->page_is_read; - ++__a1pa; + /* + * Attach all _1p_stripes pointers to the allocation, which was + * either part of the original PAGE_SIZE allocation or the + * subsequent allocation in this loop. + */ + stripe->pages = (void *)__a1pa; + stripe->scribble = stripe->pages + group_width; + stripe->page_is_read = (char *)(stripe->scribble + group_width); + __a1pa += sizeof__a1pa; } sp2d->parity = parity; diff --git a/fs/exofs/super.c b/fs/exofs/super.c index 179cd5c2f52ac5f5199af0ae1002ddf9797037e5..fabd15e482befd156b1404ec65d799b8c2b86595 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -549,27 +549,26 @@ static int exofs_devs_2_odi(struct exofs_dt_device_info *dt_dev, static int __alloc_dev_table(struct exofs_sb_info *sbi, unsigned numdevs, struct exofs_dev **peds) { - struct __alloc_ore_devs_and_exofs_devs { - /* Twice bigger table: See exofs_init_comps() and comment at - * exofs_read_lookup_dev_table() - */ - struct ore_dev *oreds[numdevs * 2 - 1]; - struct exofs_dev eds[numdevs]; - } *aoded; + /* Twice bigger table: See exofs_init_comps() and comment at + * exofs_read_lookup_dev_table() + */ + const size_t numores = numdevs * 2 - 1; struct exofs_dev *eds; unsigned i; - aoded = kzalloc(sizeof(*aoded), GFP_KERNEL); - if (unlikely(!aoded)) { + sbi->oc.ods = kzalloc(numores * sizeof(struct ore_dev *) + + numdevs * sizeof(struct exofs_dev), GFP_KERNEL); + if (unlikely(!sbi->oc.ods)) { EXOFS_ERR("ERROR: failed allocating Device array[%d]\n", numdevs); return -ENOMEM; } - sbi->oc.ods = aoded->oreds; - *peds = eds = aoded->eds; + /* Start of allocated struct exofs_dev entries */ + *peds = eds = (void *)&sbi->oc.ods[numores]; + /* Initialize pointers into struct exofs_dev */ for (i = 0; i < numdevs; ++i) - aoded->oreds[i] = &eds[i].ored; + sbi->oc.ods[i] = &eds[i].ored; return 0; } diff --git a/fs/fcntl.c b/fs/fcntl.c index c42169459298ba774a10fdcab553ee38543a530b..12273b6ea56dbdeb2a2bff0c6846030e4fc9bcd5 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -23,7 +23,7 @@ #include <linux/rcupdate.h> #include <linux/pid_namespace.h> #include <linux/user_namespace.h> -#include <linux/shmem_fs.h> +#include <linux/memfd.h> #include <linux/compat.h> #include <linux/poll.h> diff --git a/fs/namei.c b/fs/namei.c index 186bd2464fd5a842480a37c55b57d60d9164cf86..3157c984414bea86a14871e530e162e2a9b48bb4 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -887,6 +887,8 @@ static inline void put_link(struct nameidata *nd) int sysctl_protected_symlinks __read_mostly = 0; int sysctl_protected_hardlinks __read_mostly = 0; +int sysctl_protected_fifos __read_mostly; +int sysctl_protected_regular __read_mostly; /** * may_follow_link - Check symlink following for unsafe situations @@ -1001,6 +1003,45 @@ static int may_linkat(struct path *link) return
-EPERM; } +/** + * may_create_in_sticky - Check whether an O_CREAT open in a sticky directory + * should be allowed, or not, on files that already + * exist. + * @dir: the sticky parent directory + * @inode: the inode of the file to open + * + * Block an O_CREAT open of a FIFO (or a regular file) when: + * - sysctl_protected_fifos (or sysctl_protected_regular) is enabled + * - the file already exists + * - we are in a sticky directory + * - we don't own the file + * - the owner of the directory doesn't own the file + * - the directory is world writable + * If the sysctl_protected_fifos (or sysctl_protected_regular) is set to 2 + * the directory doesn't have to be world writable: being group writable will + * be enough. + * + * Returns 0 if the open is allowed, -ve on error. + */ +static int may_create_in_sticky(struct dentry * const dir, + struct inode * const inode) +{ + if ((!sysctl_protected_fifos && S_ISFIFO(inode->i_mode)) || + (!sysctl_protected_regular && S_ISREG(inode->i_mode)) || + likely(!(dir->d_inode->i_mode & S_ISVTX)) || + uid_eq(inode->i_uid, dir->d_inode->i_uid) || + uid_eq(current_fsuid(), inode->i_uid)) + return 0; + + if (likely(dir->d_inode->i_mode & 0002) || + (dir->d_inode->i_mode & 0020 && + ((sysctl_protected_fifos >= 2 && S_ISFIFO(inode->i_mode)) || + (sysctl_protected_regular >= 2 && S_ISREG(inode->i_mode))))) { + return -EACCES; + } + return 0; +} + static __always_inline const char *get_link(struct nameidata *nd) { @@ -3342,9 +3383,14 @@ static int do_last(struct nameidata *nd, if (error) return error; audit_inode(nd->name, nd->path.dentry, 0); - error = -EISDIR; - if ((open_flag & O_CREAT) && d_is_dir(nd->path.dentry)) - goto out; + if (open_flag & O_CREAT) { + error = -EISDIR; + if (d_is_dir(nd->path.dentry)) + goto out; + error = may_create_in_sticky(dir, inode); + if (unlikely(error)) + goto out; + } error = -ENOTDIR; if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry)) goto out; diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index 63a1ca4b9deee22a786822229b8421bcdcf74e31..eb5c41284649e97266f2b882c878d10142faa146 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -384,8 +384,9 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg) static int __init dnotify_init(void) { - dnotify_struct_cache = KMEM_CACHE(dnotify_struct, SLAB_PANIC); - dnotify_mark_cache = KMEM_CACHE(dnotify_mark, SLAB_PANIC); + dnotify_struct_cache = KMEM_CACHE(dnotify_struct, + SLAB_PANIC|SLAB_ACCOUNT); + dnotify_mark_cache = KMEM_CACHE(dnotify_mark, SLAB_PANIC|SLAB_ACCOUNT); dnotify_group = fsnotify_alloc_group(&dnotify_fsnotify_ops); if (IS_ERR(dnotify_group)) diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index d94e8031fe5fb8e95a168a475ad0069df3016465..78cfdcfd9f8e770a34f91449a34df0c0ff8b704a 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -153,14 +153,16 @@ struct fanotify_event_info *fanotify_alloc_event(struct fsnotify_group *group, if (fanotify_is_perm_event(mask)) { struct fanotify_perm_event_info *pevent; - pevent = kmem_cache_alloc(fanotify_perm_event_cachep, gfp); + pevent = kmem_cache_alloc_memcg(fanotify_perm_event_cachep, gfp, + group->memcg); if (!pevent) return NULL; event = &pevent->fae; pevent->response = 0; goto init; } - event = kmem_cache_alloc(fanotify_event_cachep, gfp); + event = kmem_cache_alloc_memcg(fanotify_event_cachep, gfp, + group->memcg); if (!event) return NULL; init: __maybe_unused diff --git 
a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index ec4d8c59d0e379df56efef0c86d3d303326ff071..0cf45041dc326e1bfa2c5f834bb9d47de88094cb 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -16,6 +16,7 @@ #include <linux/uaccess.h> #include <linux/compat.h> #include <linux/sched/signal.h> +#include <linux/memcontrol.h> #include <asm/ioctls.h> @@ -756,6 +757,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) group->fanotify_data.user = user; atomic_inc(&user->fanotify_listeners); + group->memcg = get_mem_cgroup_from_mm(current->mm); oevent = fanotify_alloc_event(group, NULL, FS_Q_OVERFLOW, NULL); if (unlikely(!oevent)) { @@ -957,7 +959,8 @@ COMPAT_SYSCALL_DEFINE6(fanotify_mark, */ static int __init fanotify_user_setup(void) { - fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, SLAB_PANIC); + fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, + SLAB_PANIC|SLAB_ACCOUNT); fanotify_event_cachep = KMEM_CACHE(fanotify_event_info, SLAB_PANIC); if (IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS)) { fanotify_perm_event_cachep = diff --git a/fs/notify/group.c b/fs/notify/group.c index b7a4b6a69efa97f5c698ee1c93cb0b3992d80e49..3e56459f477398d622c30d079d4dac0e357cfb55 100644 --- a/fs/notify/group.c +++ b/fs/notify/group.c @@ -22,6 +22,7 @@ #include <linux/srcu.h> #include <linux/rculist.h> #include <linux/wait.h> +#include <linux/memcontrol.h> #include <linux/fsnotify_backend.h> #include "fsnotify.h" @@ -36,6 +37,9 @@ static void fsnotify_final_destroy_group(struct fsnotify_group *group) if (group->ops->free_group_priv) group->ops->free_group_priv(group); + if (group->memcg) + mem_cgroup_put(group->memcg); + kfree(group); } diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index 40dedb37a1f3093f97ebca766c78284b4ba6385a..b184bff93d02aac11e42d58356d3a464f09aff9d 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -98,7 +98,7 @@ int inotify_handle_event(struct fsnotify_group *group, i_mark = container_of(inode_mark, struct inotify_inode_mark, fsn_mark); - event = kmalloc(alloc_len, GFP_KERNEL); + event = kmalloc_memcg(alloc_len, GFP_KERNEL, group->memcg); if (unlikely(!event)) { /* * Treat lost event due to ENOMEM the same way as queue diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index ef32f36579589090535cd046ae3a193b77bb830d..3c152e350805f3bb453131ea9a814f034ebc18c2 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -38,6 +38,7 @@ #include <linux/uaccess.h> #include <linux/poll.h> #include <linux/wait.h> +#include <linux/memcontrol.h> #include "inotify.h" #include "../fdinfo.h" @@ -632,6 +633,7 @@ static struct fsnotify_group *inotify_new_group(unsigned int max_events) oevent->name_len = 0; group->max_events = max_events; + group->memcg = get_mem_cgroup_from_mm(current->mm); spin_lock_init(&group->inotify_data.idr_lock); idr_init(&group->inotify_data.idr); @@ -804,7 +806,8 @@ static int __init inotify_user_setup(void) BUG_ON(hweight32(ALL_INOTIFY_BITS) != 21); - inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark, SLAB_PANIC); + inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark, + SLAB_PANIC|SLAB_ACCOUNT); inotify_max_queued_events = 16384; init_user_ns.ucount_max[UCOUNT_INOTIFY_INSTANCES] = 128; diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c index 
d9ebe11c89909b8587fa3d7faa491c75426bed87..7ae41476a379491d7e7e93dff117497cf82a0a69 100644 --- a/fs/ocfs2/buffer_head_io.c +++ b/fs/ocfs2/buffer_head_io.c @@ -99,25 +99,34 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh, return ret; } +/* Caller must provide a bhs[] with all NULL or non-NULL entries, so it + * will be easier to handle read failure. + */ int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block, unsigned int nr, struct buffer_head *bhs[]) { int status = 0; unsigned int i; struct buffer_head *bh; + int new_bh = 0; trace_ocfs2_read_blocks_sync((unsigned long long)block, nr); if (!nr) goto bail; + /* Don't put the buffer head and re-assign it to NULL if it was + * allocated outside, since the caller can't be aware of this + * alteration! + */ + new_bh = (bhs[0] == NULL); + for (i = 0 ; i < nr ; i++) { if (bhs[i] == NULL) { bhs[i] = sb_getblk(osb->sb, block++); if (bhs[i] == NULL) { status = -ENOMEM; mlog_errno(status); - goto bail; + break; } } bh = bhs[i]; @@ -158,9 +167,26 @@ int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block, submit_bh(REQ_OP_READ, 0, bh); } +read_failure: for (i = nr; i > 0; i--) { bh = bhs[i - 1]; + if (unlikely(status)) { + if (new_bh && bh) { + /* If a middle bh fails, let the previous bh + * finish its read and then put it to + * avoid a bh leak + */ + if (!buffer_jbd(bh)) + wait_on_buffer(bh); + put_bh(bh); + bhs[i - 1] = NULL; + } else if (bh && buffer_uptodate(bh)) { + clear_buffer_uptodate(bh); + } + continue; + } + /* No need to wait on the buffer if it's managed by JBD. */ if (!buffer_jbd(bh)) wait_on_buffer(bh); @@ -170,8 +196,7 @@ int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block, * so we can safely record this and loop back * to cleanup the other buffers. */ status = -EIO; - put_bh(bh); - bhs[i - 1] = NULL; + goto read_failure; } } @@ -179,6 +204,9 @@ int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block, return status; } +/* Caller must provide a bhs[] with all NULL or non-NULL entries, so it + * will be easier to handle read failure. + */ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr, struct buffer_head *bhs[], int flags, int (*validate)(struct super_block *sb, @@ -188,6 +216,7 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr, int i, ignore_cache = 0; struct buffer_head *bh; struct super_block *sb = ocfs2_metadata_cache_get_super(ci); + int new_bh = 0; trace_ocfs2_read_blocks_begin(ci, (unsigned long long)block, nr, flags); @@ -213,6 +242,11 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr, goto bail; } + /* Don't put the buffer head and re-assign it to NULL if it was + * allocated outside, since the caller can't be aware of this + * alteration! + */ + new_bh = (bhs[0] == NULL); + ocfs2_metadata_cache_io_lock(ci); for (i = 0 ; i < nr ; i++) { if (bhs[i] == NULL) { @@ -221,7 +255,8 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr, ocfs2_metadata_cache_io_unlock(ci); status = -ENOMEM; mlog_errno(status); - goto bail; + /* Don't forget to put previous bh! */ + break; + } } bh = bhs[i]; @@ -316,16 +351,27 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr, } } - status = 0; - +read_failure: for (i = (nr - 1); i >= 0; i--) { bh = bhs[i]; if (!(flags & OCFS2_BH_READAHEAD)) { - if (status) { - /* Clear the rest of the buffers on error */ - put_bh(bh); - bhs[i] = NULL; + if (unlikely(status)) { + /* Clear the buffers on error, including those + * that had already been read successfully + */ + if (new_bh && bh) { + /* If a middle bh fails, let the previous bh + * finish its read and then put it to + * avoid a bh leak + */ + if (!buffer_jbd(bh)) + wait_on_buffer(bh); + put_bh(bh); + bhs[i] = NULL; + } else if (bh && buffer_uptodate(bh)) { + clear_buffer_uptodate(bh); + } continue; } /* We know this can't have changed as we hold the @@ -342,9 +388,7 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr, * for this bh as it's not marked locally * uptodate. */ status = -EIO; - put_bh(bh); - bhs[i] = NULL; - continue; + goto read_failure; } if (buffer_needs_validate(bh)) { @@ -354,11 +398,8 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr, BUG_ON(buffer_jbd(bh)); clear_buffer_needs_validate(bh); status = validate(sb, bh); - if (status) { - put_bh(bh); - bhs[i] = NULL; - continue; - } + if (status) + goto read_failure; } } diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 91a8889abf9ba9f4d3df399996346942f48c2555..2809e29d612dbeb564241597540028096cf812d4 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -540,11 +540,12 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg, struct bio *bio; struct page *page; +#define O2HB_BIO_VECS 16 /* Testing has shown this allocation to take long enough under * GFP_KERNEL that the local node can get fenced. It would be * nicest if we could pre-allocate these bios and avoid this * all together. */ - bio = bio_alloc(GFP_ATOMIC, 16); + bio = bio_alloc(GFP_ATOMIC, O2HB_BIO_VECS); if (!bio) { mlog(ML_ERROR, "Could not alloc slots BIO!\n"); bio = ERR_PTR(-ENOMEM); @@ -570,7 +571,10 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg, current_page, vec_len, vec_start); len = bio_add_page(bio, page, vec_len, vec_start); - if (len != vec_len) { + if (len == 0 && current_page == O2HB_BIO_VECS) { + /* bio is full now. */ + goto bail; + } else if (len != vec_len) { mlog(ML_ERROR, "Adding page[%d] to bio failed, " "page %p, len %d, vec_len %u, vec_start %u, " "bi_sector %llu\n", current_page, page, len, diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 97a972efab83b9312bec62e1b6bd378a413371d1..ce33ac354602eeb4ff120999a0aea40b37a0b0b2 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -3534,7 +3534,7 @@ static int ocfs2_downconvert_lock(struct ocfs2_super *osb, * we can recover correctly from node failure. Otherwise, we may get * invalid LVB in LKB, but without DLM_SBF_VALNOTVALID being set.
*/ - if (!ocfs2_is_o2cb_active() && + if (ocfs2_userspace_stack(osb) && lockres->l_ops->flags & LOCK_TYPE_USES_LVB) lvb = 1; diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 6ee94bc23f5b106421eaff374d3dc7aa5df9deac..6207beca9b051df57cfa41384ea75fb94514ee4c 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2343,7 +2343,7 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, written = __generic_file_write_iter(iocb, from); /* buffered aio wouldn't have proper lock coverage today */ - BUG_ON(written == -EIOCBQUEUED && !(iocb->ki_flags & IOCB_DIRECT)); + BUG_ON(written == -EIOCBQUEUED && !direct_io); /* * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io @@ -2463,7 +2463,7 @@ static ssize_t ocfs2_file_read_iter(struct kiocb *iocb, trace_generic_file_read_iter_ret(ret); /* buffered aio wouldn't have proper lock coverage today */ - BUG_ON(ret == -EIOCBQUEUED && !(iocb->ki_flags & IOCB_DIRECT)); + BUG_ON(ret == -EIOCBQUEUED && !direct_io); /* see ocfs2_file_write_iter */ if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) { diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index e5dcea6cee5ff678b33e78083c7240ddc56e3500..b63c97f4318e063889fe1ca203d19092c1abbedf 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -1378,15 +1378,23 @@ static int __ocfs2_recovery_thread(void *arg) int rm_quota_used = 0, i; struct ocfs2_quota_recovery *qrec; + /* Whether the quota supported. */ + int quota_enabled = OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, + OCFS2_FEATURE_RO_COMPAT_USRQUOTA) + || OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, + OCFS2_FEATURE_RO_COMPAT_GRPQUOTA); + status = ocfs2_wait_on_mount(osb); if (status < 0) { goto bail; } - rm_quota = kzalloc(osb->max_slots * sizeof(int), GFP_NOFS); - if (!rm_quota) { - status = -ENOMEM; - goto bail; + if (quota_enabled) { + rm_quota = kcalloc(osb->max_slots, sizeof(int), GFP_NOFS); + if (!rm_quota) { + status = -ENOMEM; + goto bail; + } } restart: status = ocfs2_super_lock(osb, 1); @@ -1422,9 +1430,14 @@ static int __ocfs2_recovery_thread(void *arg) * then quota usage would be out of sync until some node takes * the slot. So we remember which nodes need quota recovery * and when everything else is done, we recover quotas. */ - for (i = 0; i < rm_quota_used && rm_quota[i] != slot_num; i++); - if (i == rm_quota_used) - rm_quota[rm_quota_used++] = slot_num; + if (quota_enabled) { + for (i = 0; i < rm_quota_used + && rm_quota[i] != slot_num; i++) + ; + + if (i == rm_quota_used) + rm_quota[rm_quota_used++] = slot_num; + } status = ocfs2_recover_node(osb, node_num, slot_num); skip_recovery: @@ -1452,16 +1465,19 @@ static int __ocfs2_recovery_thread(void *arg) /* Now it is right time to recover quotas... 
We have to do this under * superblock lock so that no one can start using the slot (and crash) * before we recover it */ - for (i = 0; i < rm_quota_used; i++) { - qrec = ocfs2_begin_quota_recovery(osb, rm_quota[i]); - if (IS_ERR(qrec)) { - status = PTR_ERR(qrec); - mlog_errno(status); - continue; + if (quota_enabled) { + for (i = 0; i < rm_quota_used; i++) { + qrec = ocfs2_begin_quota_recovery(osb, rm_quota[i]); + if (IS_ERR(qrec)) { + status = PTR_ERR(qrec); + mlog_errno(status); + continue; + } + ocfs2_queue_recovery_completion(osb->journal, + rm_quota[i], + NULL, NULL, qrec, + ORPHAN_NEED_TRUNCATE); } - ocfs2_queue_recovery_completion(osb->journal, rm_quota[i], - NULL, NULL, qrec, - ORPHAN_NEED_TRUNCATE); } ocfs2_super_unlock(osb, 1); @@ -1483,7 +1499,8 @@ static int __ocfs2_recovery_thread(void *arg) mutex_unlock(&osb->recovery_lock); - kfree(rm_quota); + if (quota_enabled) + kfree(rm_quota); /* no one is callint kthread_stop() for us so the kthread() api * requires that we call do_exit(). And it isn't exported, but diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index d6c350ba25b96ec9886cdc11b46a94fb17769261..c4b029c43464e0d14424a8a9af216d9168ca4bc9 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c @@ -48,12 +48,6 @@ static char ocfs2_hb_ctl_path[OCFS2_MAX_HB_CTL_PATH] = "/sbin/ocfs2_hb_ctl"; */ static struct ocfs2_stack_plugin *active_stack; -inline int ocfs2_is_o2cb_active(void) -{ - return !strcmp(active_stack->sp_name, OCFS2_STACK_PLUGIN_O2CB); -} -EXPORT_SYMBOL_GPL(ocfs2_is_o2cb_active); - static struct ocfs2_stack_plugin *ocfs2_stack_lookup(const char *name) { struct ocfs2_stack_plugin *p; diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h index e3036e1790e86da7b4e13dcb0b8c88e3f19b6d50..f2dce10fae543c254dcb4e6628d357b60a3ac16c 100644 --- a/fs/ocfs2/stackglue.h +++ b/fs/ocfs2/stackglue.h @@ -298,9 +298,6 @@ void ocfs2_stack_glue_set_max_proto_version(struct ocfs2_protocol_version *max_p int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin); void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin); -/* In ocfs2_downconvert_lock(), we need to know which stack we are using */ -int ocfs2_is_o2cb_active(void); - extern struct kset *ocfs2_kset; #endif /* STACKGLUE_H */ diff --git a/fs/proc/base.c b/fs/proc/base.c index 1b2ede6abcdfc97f1ab87244a18e0cb4c9035e5f..2822c7ab609ce098b2c402311c53b3354ff912c0 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -205,33 +205,53 @@ static int proc_root_link(struct dentry *dentry, struct path *path) return result; } +static int __mem_open(struct inode *inode, struct file *file, unsigned int mode) +{ + struct mm_struct *mm = proc_mem_open(inode, mode); + + if (IS_ERR(mm)) + return PTR_ERR(mm); + + file->private_data = mm; + return 0; +} + +static int proc_pid_cmdline_open(struct inode *inode, struct file *file) +{ + return __mem_open(inode, file, PTRACE_MODE_READ); +} + +static int mem_release(struct inode *inode, struct file *file) +{ + struct mm_struct *mm = file->private_data; + if (mm) + mmdrop(mm); + return 0; +} + static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf, size_t _count, loff_t *pos) { - struct task_struct *tsk; - struct mm_struct *mm; + struct mm_struct *mm = file->private_data; char *page; unsigned long count = _count; unsigned long arg_start, arg_end, env_start, env_end; - unsigned long len1, len2, len; - unsigned long p; + unsigned long len1, len2; + char __user *buf0 = buf; + struct { + unsigned long p; + unsigned long len; + } cmdline[2]; char c; - ssize_t rv; 
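For reference, the /proc/<pid>/cmdline rework above follows the same mm lifetime pattern as the existing mem file handlers: the open handler pins the target mm with an mm_count reference via proc_mem_open(), the read path temporarily upgrades to an mm_users reference with mmget_not_zero(), and release drops the pin with mmdrop(). The sketch below is illustrative only; the example_* names are hypothetical and not part of the patch.

/* Minimal sketch, assuming this lives in fs/proc/ so proc_mem_open() is
 * visible via fs/proc/internal.h. Not part of the patch. */
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/ptrace.h>
#include "internal.h"		/* proc_mem_open() */

static int example_open(struct inode *inode, struct file *file)
{
	struct mm_struct *mm = proc_mem_open(inode, PTRACE_MODE_READ);

	if (IS_ERR(mm))
		return PTR_ERR(mm);
	file->private_data = mm;	/* mm_count reference held until release */
	return 0;
}

static ssize_t example_read(struct file *file, char __user *buf,
			    size_t count, loff_t *pos)
{
	struct mm_struct *mm = file->private_data;
	ssize_t ret = 0;

	/* The address space may already be gone; bail out quietly. */
	if (!mm || !mmget_not_zero(mm))
		return 0;
	/* ... access_remote_vm(mm, ...) and copy_to_user() would go here ... */
	mmput(mm);			/* drop the temporary mm_users reference */
	return ret;
}

static int example_release(struct inode *inode, struct file *file)
{
	struct mm_struct *mm = file->private_data;

	if (mm)
		mmdrop(mm);		/* pairs with proc_mem_open()'s mmgrab() */
	return 0;
}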
+ int rv; BUG_ON(*pos < 0); - tsk = get_proc_task(file_inode(file)); - if (!tsk) - return -ESRCH; - mm = get_task_mm(tsk); - put_task_struct(tsk); - if (!mm) - return 0; /* Check if process spawned far enough to have cmdline. */ - if (!mm->env_end) { - rv = 0; - goto out_mmput; - } + if (!mm || !mm->env_end) + return 0; + if (!mmget_not_zero(mm)) + return 0; page = (char *)__get_free_page(GFP_KERNEL); if (!page) { @@ -239,12 +259,12 @@ static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf, goto out_mmput; } - down_read(&mm->mmap_sem); + spin_lock(&mm->arg_lock); arg_start = mm->arg_start; arg_end = mm->arg_end; env_start = mm->env_start; env_end = mm->env_end; - up_read(&mm->mmap_sem); + spin_unlock(&mm->arg_lock); BUG_ON(arg_start > arg_end); BUG_ON(env_start > env_end); @@ -253,61 +273,31 @@ static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf, len2 = env_end - env_start; /* Empty ARGV. */ - if (len1 == 0) { - rv = 0; - goto out_free_page; - } + if (len1 == 0) + goto end; + /* * Inherently racy -- command line shares address space * with code and data. */ - rv = access_remote_vm(mm, arg_end - 1, &c, 1, 0); - if (rv <= 0) - goto out_free_page; - - rv = 0; + if (access_remote_vm(mm, arg_end - 1, &c, 1, 0) != 1) + goto end; + cmdline[0].p = arg_start; + cmdline[0].len = len1; if (c == '\0') { /* Command line (set of strings) occupies whole ARGV. */ - if (len1 <= *pos) - goto out_free_page; - - p = arg_start + *pos; - len = len1 - *pos; - while (count > 0 && len > 0) { - unsigned int _count; - int nr_read; - - _count = min3(count, len, PAGE_SIZE); - nr_read = access_remote_vm(mm, p, page, _count, 0); - if (nr_read < 0) - rv = nr_read; - if (nr_read <= 0) - goto out_free_page; - - if (copy_to_user(buf, page, nr_read)) { - rv = -EFAULT; - goto out_free_page; - } - - p += nr_read; - len -= nr_read; - buf += nr_read; - count -= nr_read; - rv += nr_read; - } + cmdline[1].len = 0; } else { /* * Command line (1 string) occupies ARGV and * extends into ENVP. */ - struct { - unsigned long p; - unsigned long len; - } cmdline[2] = { - { .p = arg_start, .len = len1 }, - { .p = env_start, .len = len2 }, - }; + cmdline[1].p = env_start; + cmdline[1].len = len2; + } + + { loff_t pos1 = *pos; unsigned int i; @@ -317,44 +307,40 @@ static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf, i++; } while (i < 2) { + unsigned long p; + unsigned long len; + p = cmdline[i].p + pos1; len = cmdline[i].len - pos1; while (count > 0 && len > 0) { - unsigned int _count, l; - int nr_read; - bool final; - - _count = min3(count, len, PAGE_SIZE); - nr_read = access_remote_vm(mm, p, page, _count, 0); - if (nr_read < 0) - rv = nr_read; - if (nr_read <= 0) - goto out_free_page; + unsigned int nr_read, nr_write; + + nr_read = min3(count, len, PAGE_SIZE); + nr_read = access_remote_vm(mm, p, page, nr_read, 0); + if (nr_read == 0) + goto end; /* * Command line can be shorter than whole ARGV * even if last "marker" byte says it is not. 
*/ - final = false; - l = strnlen(page, nr_read); - if (l < nr_read) { - nr_read = l; - final = true; - } + if (c == '\0') + nr_write = nr_read; + else + nr_write = strnlen(page, nr_read); - if (copy_to_user(buf, page, nr_read)) { + if (copy_to_user(buf, page, nr_write)) { rv = -EFAULT; goto out_free_page; } - p += nr_read; - len -= nr_read; - buf += nr_read; - count -= nr_read; - rv += nr_read; + p += nr_write; + len -= nr_write; + buf += nr_write; + count -= nr_write; - if (final) - goto out_free_page; + if (nr_write < nr_read) + goto end; } /* Only first chunk can be read partially. */ @@ -363,18 +349,21 @@ static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf, } } +end: + *pos += buf - buf0; + rv = buf - buf0; out_free_page: free_page((unsigned long)page); out_mmput: mmput(mm); - if (rv > 0) - *pos += rv; return rv; } static const struct file_operations proc_pid_cmdline_ops = { - .read = proc_pid_cmdline_read, - .llseek = generic_file_llseek, + .open = proc_pid_cmdline_open, + .read = proc_pid_cmdline_read, + .llseek = generic_file_llseek, + .release = mem_release, }; #ifdef CONFIG_KALLSYMS @@ -783,17 +772,6 @@ struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode) return mm; } -static int __mem_open(struct inode *inode, struct file *file, unsigned int mode) -{ - struct mm_struct *mm = proc_mem_open(inode, mode); - - if (IS_ERR(mm)) - return PTR_ERR(mm); - - file->private_data = mm; - return 0; -} - static int mem_open(struct inode *inode, struct file *file) { int ret = __mem_open(inode, file, PTRACE_MODE_ATTACH); @@ -887,14 +865,6 @@ loff_t mem_lseek(struct file *file, loff_t offset, int orig) return file->f_pos; } -static int mem_release(struct inode *inode, struct file *file) -{ - struct mm_struct *mm = file->private_data; - if (mm) - mmdrop(mm); - return 0; -} - static const struct file_operations proc_mem_operations = { .llseek = mem_lseek, .read = mem_read, @@ -929,10 +899,10 @@ static ssize_t environ_read(struct file *file, char __user *buf, if (!mmget_not_zero(mm)) goto free; - down_read(&mm->mmap_sem); + spin_lock(&mm->arg_lock); env_start = mm->env_start; env_end = mm->env_end; - up_read(&mm->mmap_sem); + spin_unlock(&mm->arg_lock); while (count > 0) { size_t this_len, max_len; diff --git a/fs/seq_file.c b/fs/seq_file.c index c6c27f1f9c9850634700e4898adae6ab7755e113..8389da9d0eb804e05ba82abc03125b2756cb60b7 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -709,11 +709,6 @@ void seq_put_decimal_ull_width(struct seq_file *m, const char *delimiter, if (m->count + width >= m->size) goto overflow; - if (num < 10) { - m->buf[m->count++] = num + '0'; - return; - } - len = num_to_str(m->buf + m->count, m->size - m->count, num, width); if (!len) goto overflow; @@ -800,11 +795,6 @@ void seq_put_decimal_ll(struct seq_file *m, const char *delimiter, long long num num = -num; } - if (num < 10) { - m->buf[m->count++] = num + '0'; - return; - } - len = num_to_str(m->buf + m->count, m->size - m->count, num, 0); if (!len) goto overflow; diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index c0e68f903011cb294fe34386dd42896766ab462b..92af04018176ea810ab6ebdb7466ca843eb5be76 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -81,6 +81,11 @@ enum { * Enable cpuset controller in v1 cgroup to use v2 behavior. */ CGRP_ROOT_CPUSET_V2_MODE = (1 << 4), + + /* + * Enable cgroup-aware OOM killer. 
+ */ + CGRP_GROUP_OOM = (1 << 5), }; /* cftype->flags */ diff --git a/include/linux/fs.h b/include/linux/fs.h index edaaa0aab8a4ccdf86d3b419c232276b29648480..0eedf745667b2dda1d0ac3843b928dbb4648316d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -73,6 +73,8 @@ extern struct inodes_stat_t inodes_stat; extern int leases_enable, lease_break_time; extern int sysctl_protected_symlinks; extern int sysctl_protected_hardlinks; +extern int sysctl_protected_fifos; +extern int sysctl_protected_regular; typedef __kernel_rwf_t rwf_t; diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index e64c0294f50bf82dd50d447e37a5c77f86eabc26..505d4fcc7a4ab1ec9f82020be8d908f3bc118bc5 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -84,6 +84,8 @@ struct fsnotify_event_private_data; struct fsnotify_fname; struct fsnotify_iter_info; +struct mem_cgroup; + /* * Each group much define these ops. The fsnotify infrastructure will call * these operations for each relevant group. @@ -129,6 +131,8 @@ struct fsnotify_event { * everything will be cleaned up. */ struct fsnotify_group { + const struct fsnotify_ops *ops; /* how this group handles things */ + /* * How the refcnt is used is up to each group. When the refcnt hits 0 * fsnotify will clean up all of the resources associated with this group. @@ -139,8 +143,6 @@ struct fsnotify_group { */ refcount_t refcnt; /* things with interest in this group */ - const struct fsnotify_ops *ops; /* how this group handles things */ - /* needed to send notification to userspace */ spinlock_t notification_lock; /* protect the notification_list */ struct list_head notification_list; /* list of event_holder this group needs to send to userspace */ @@ -162,6 +164,8 @@ struct fsnotify_group { atomic_t num_marks; /* 1 for each mark and 1 for not being * past the point of no return when freeing * a group */ + atomic_t user_waits; /* Number of tasks waiting for user + * response */ struct list_head marks_list; /* all inode marks for this group */ struct fasync_struct *fsn_fa; /* async notification */ @@ -169,8 +173,8 @@ struct fsnotify_group { struct fsnotify_event *overflow_event; /* Event we queue when the * notification list is too * full */ - atomic_t user_waits; /* Number of tasks waiting for user - * response */ + + struct mem_cgroup *memcg; /* memcg to charge allocations */ /* groups can define private fields here or use the void *private */ union { diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 3dfa3f260fc4381cf6d68feee81324e42eb7faab..593fb8d250a2cdb04b927625a1d98ed3dbf82b5a 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -29,6 +29,7 @@ #define LLONG_MIN (-LLONG_MAX - 1) #define ULLONG_MAX (~0ULL) #define SIZE_MAX (~(size_t)0) +#define PHYS_ADDR_MAX (~(phys_addr_t)0) #define U8_MAX ((u8)~0U) #define S8_MAX ((s8)(U8_MAX>>1)) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index d99b71bc2c667a512541d33894ec073ff1bb6c91..ab60ff55bdb33cc70977a5c4e87dfc71dc779edf 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -35,6 +35,7 @@ struct mem_cgroup; struct page; struct mm_struct; struct kmem_cache; +struct oom_control; /* Cgroup-specific page state, on top of universal node page state */ enum memcg_stat_item { @@ -53,6 +54,8 @@ enum memcg_memory_event { MEMCG_HIGH, MEMCG_MAX, MEMCG_OOM, + MEMCG_SWAP_MAX, + MEMCG_SWAP_FAIL, MEMCG_NR_MEMORY_EVENTS, }; @@ -179,8 +182,7 @@ struct mem_cgroup { struct page_counter kmem; struct 
page_counter tcpmem; - /* Normal memory consumption range */ - unsigned long low; + /* Upper bound of normal memory consumption range */ unsigned long high; /* Range enforcement for interrupt charges */ @@ -204,10 +206,20 @@ struct mem_cgroup { /* OOM-Killer disable */ int oom_kill_disable; + /* + * Treat the sub-tree as an indivisible memory consumer, + * kill all belonging tasks if the memory cgroup selected + * as OOM victim. + */ + bool oom_group; + /* memory.events */ atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS]; struct cgroup_file events_file; + /* handle for "memory.swap.events" */ + struct cgroup_file swap_events_file; + /* protect arrays of thresholds */ struct mutex thresholds_lock; @@ -343,11 +355,18 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *, struct pglist_data *); bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg); struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p); +struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm); + static inline struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){ return css ? container_of(css, struct mem_cgroup, css) : NULL; } +static inline void mem_cgroup_put(struct mem_cgroup *memcg) +{ + css_put(&memcg->css); +} + #define mem_cgroup_from_counter(counter, member) \ container_of(counter, struct mem_cgroup, member) @@ -462,7 +481,7 @@ unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec, void mem_cgroup_handle_over_high(void); -unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg); +unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg); void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p); @@ -486,6 +505,13 @@ static inline bool task_in_memcg_oom(struct task_struct *p) bool mem_cgroup_oom_synchronize(bool wait); +bool mem_cgroup_select_oom_victim(struct oom_control *oc); + +static inline bool mem_cgroup_oom_group(struct mem_cgroup *memcg) +{ + return memcg->oom_group; +} + #ifdef CONFIG_MEMCG_SWAP extern int do_swap_account; #endif @@ -793,6 +819,15 @@ static inline bool task_in_mem_cgroup(struct task_struct *task, return true; } +static inline struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) +{ + return NULL; +} + +static inline void mem_cgroup_put(struct mem_cgroup *memcg) +{ +} + static inline struct mem_cgroup * mem_cgroup_iter(struct mem_cgroup *root, struct mem_cgroup *prev, @@ -853,7 +888,7 @@ mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, return 0; } -static inline unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg) +static inline unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg) { return 0; } @@ -985,6 +1020,16 @@ static inline void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx) { } + +static inline bool mem_cgroup_select_oom_victim(struct oom_control *oc) +{ + return false; +} + +static inline bool mem_cgroup_oom_group(struct mem_cgroup *memcg) +{ + return false; +} #endif /* CONFIG_MEMCG */ /* idx can be of type enum memcg_stat_item or node_stat_item */ diff --git a/include/linux/memfd.h b/include/linux/memfd.h new file mode 100644 index 0000000000000000000000000000000000000000..4f1600413f916e9c606a8fb338dd4dd1dd607ee6 --- /dev/null +++ b/include/linux/memfd.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __LINUX_MEMFD_H +#define __LINUX_MEMFD_H + +#include <linux/file.h> + +#ifdef CONFIG_MEMFD_CREATE +extern long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg); +#else +static inline long 
memfd_fcntl(struct file *f, unsigned int c, unsigned long a) +{ + return -EINVAL; +} +#endif + +#endif /* __LINUX_MEMFD_H */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 21612347d311815fe2c526df1d6095bd28f3d4f8..49dd59ea90f7327d526dd42539ba4bab68574317 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -413,6 +413,8 @@ struct mm_struct { unsigned long exec_vm; /* VM_EXEC & ~VM_WRITE & ~VM_STACK */ unsigned long stack_vm; /* VM_STACK */ unsigned long def_flags; + + spinlock_t arg_lock; /* protect the below fields */ unsigned long start_code, end_code, start_data, end_data; unsigned long start_brk, brk, start_stack; unsigned long arg_start, arg_end, env_start, env_end; diff --git a/include/linux/oom.h b/include/linux/oom.h index 5bad038ac012e6e3047fd6a63a625c6814966d51..d4d41c01a74df9eab0ae09546a0e1c8a6e2ea7ac 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -10,6 +10,13 @@ #include <linux/sched/coredump.h> /* MMF_* */ #include <linux/mm.h> /* VM_FAULT* */ + +/* + * Special value returned by victim selection functions to indicate + * that are inflight OOM victims. + */ +#define INFLIGHT_VICTIM ((void *)-1UL) + struct zonelist; struct notifier_block; struct mem_cgroup; @@ -40,7 +47,8 @@ struct oom_control { /* Used by oom implementation, do not set */ unsigned long totalpages; - struct task_struct *chosen; + struct task_struct *chosen_task; + struct mem_cgroup *chosen_memcg; unsigned long chosen_points; }; @@ -111,6 +119,8 @@ extern void oom_killer_enable(void); extern struct task_struct *find_lock_task_mm(struct task_struct *p); +extern int oom_evaluate_task(struct task_struct *task, void *arg); + /* sysctls */ extern int sysctl_oom_dump_tasks; extern int sysctl_oom_kill_allocating_task; diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h index c15ab80ad32de4387879d29421b77521a73556e4..7902a727d3b6ea3c5c02adc5ad5e8a11e784b83b 100644 --- a/include/linux/page_counter.h +++ b/include/linux/page_counter.h @@ -7,10 +7,16 @@ #include <asm/page.h> struct page_counter { - atomic_long_t count; - unsigned long limit; + atomic_long_t usage; + unsigned long max; + unsigned long low; struct page_counter *parent; + /* effective memory.low and memory.low usage tracking */ + unsigned long elow; + atomic_long_t low_usage; + atomic_long_t children_low_usage; + /* legacy */ unsigned long watermark; unsigned long failcnt; @@ -25,14 +31,14 @@ struct page_counter { static inline void page_counter_init(struct page_counter *counter, struct page_counter *parent) { - atomic_long_set(&counter->count, 0); - counter->limit = PAGE_COUNTER_MAX; + atomic_long_set(&counter->usage, 0); + counter->max = PAGE_COUNTER_MAX; counter->parent = parent; } static inline unsigned long page_counter_read(struct page_counter *counter) { - return atomic_long_read(&counter->count); + return atomic_long_read(&counter->usage); } void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages); @@ -41,7 +47,8 @@ bool page_counter_try_charge(struct page_counter *counter, unsigned long nr_pages, struct page_counter **fail); void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages); -int page_counter_limit(struct page_counter *counter, unsigned long limit); +int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages); +void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages); int page_counter_memparse(const char *buf, const char *max, unsigned long *nr_pages); diff --git 
a/include/linux/pfn_t.h b/include/linux/pfn_t.h index a03c2642a87c259c23e1157dfaf21d46080eec91..21713dc14ce23605861f8ba71726648cce5c03c9 100644 --- a/include/linux/pfn_t.h +++ b/include/linux/pfn_t.h @@ -122,7 +122,7 @@ pud_t pud_mkdevmap(pud_t pud); #endif #endif /* __HAVE_ARCH_PTE_DEVMAP */ -#ifdef __HAVE_ARCH_PTE_SPECIAL +#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL static inline bool pfn_t_special(pfn_t pfn) { return (pfn.val & PFN_SPECIAL) == PFN_SPECIAL; @@ -132,5 +132,5 @@ static inline bool pfn_t_special(pfn_t pfn) { return false; } -#endif /* __HAVE_ARCH_PTE_SPECIAL */ +#endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */ #endif /* _LINUX_PFN_T_H_ */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 6fc99045658add62405a6a7fe8dcdef88dc24e29..7aa611b17e17872c91c3cf9da014e51cffbf0968 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1097,6 +1097,9 @@ struct task_struct { /* Number of pages to reclaim on returning to userland: */ unsigned int memcg_nr_pages_over_high; + + /* Used by memcontrol for targeted memcg charge: */ + struct mem_cgroup *target_memcg; #endif #ifdef CONFIG_UPROBES diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 4e1411bbbcfcdf253c6bd75eec3d1cb2705eb8aa..1548c3d61fdcd82f21baba2c770c3b0ef133a4af 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -206,6 +206,30 @@ static inline void memalloc_noreclaim_restore(unsigned int flags) current->flags = (current->flags & ~PF_MEMALLOC) | flags; } +#ifdef CONFIG_MEMCG +static inline struct mem_cgroup *memalloc_memcg_save(struct mem_cgroup *memcg) +{ + struct mem_cgroup *old_memcg = current->target_memcg; + + current->target_memcg = memcg; + return old_memcg; +} + +static inline void memalloc_memcg_restore(struct mem_cgroup *memcg) +{ + current->target_memcg = memcg; +} +#else +static inline struct mem_cgroup *memalloc_memcg_save(struct mem_cgroup *memcg) +{ + return NULL; +} + +static inline void memalloc_memcg_restore(struct mem_cgroup *memcg) +{ +} +#endif /* CONFIG_MEMCG */ + #ifdef CONFIG_MEMBARRIER enum { MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY = (1U << 0), diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index a7ce74c74e494ae05ba687bf9ad0497a690faed2..a52683a5fe90f1a5cde8285986eb6b9442577ebb 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -104,6 +104,9 @@ struct signal_struct { int group_stop_count; unsigned int flags; /* see SIGNAL_* flags below */ + /* The signal sent when the parent dies: */ + int pdeath_signal_proc; + /* * PR_SET_CHILD_SUBREAPER marks a process, like a service * manager, to re-parent orphan (double-forking) child processes diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 73b5e655a76ea835639629d8b9faab7f6399a721..f155dc607112e14420fb0eaf2865fb73b270f886 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -110,19 +110,6 @@ static inline bool shmem_file(struct file *file) extern bool shmem_charge(struct inode *inode, long pages); extern void shmem_uncharge(struct inode *inode, long pages); -#ifdef CONFIG_TMPFS - -extern long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg); - -#else - -static inline long memfd_fcntl(struct file *f, unsigned int c, unsigned long a) -{ - return -EINVAL; -} - -#endif - #ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE extern bool shmem_huge_enabled(struct vm_area_struct *vma); #else diff --git a/include/linux/slab.h b/include/linux/slab.h index 
81ebd71f8c03cc58bf393f3dad430e63970a622d..9ebe659bd4a5564ff6782612915fee916aad05ea 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -15,6 +15,7 @@ #include <linux/gfp.h> #include <linux/types.h> #include <linux/workqueue.h> +#include <linux/sched/mm.h> /* @@ -374,6 +375,21 @@ static __always_inline void kfree_bulk(size_t size, void **p) kmem_cache_free_bulk(NULL, size, p); } +/* + * Calling kmem_cache_alloc_memcg implicitly assumes that the caller + * wants a __GFP_ACCOUNT allocation. + */ +static __always_inline void *kmem_cache_alloc_memcg(struct kmem_cache *cachep, + gfp_t flags, + struct mem_cgroup *memcg) +{ + struct mem_cgroup *old_memcg = memalloc_memcg_save(memcg); + void *ptr = kmem_cache_alloc(cachep, flags | __GFP_ACCOUNT); + + memalloc_memcg_restore(old_memcg); + return ptr; +} + #ifdef CONFIG_NUMA void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment __malloc; void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node) __assume_slab_alignment __malloc; @@ -389,6 +405,21 @@ static __always_inline void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t f } #endif +/* + * Calling kmem_cache_alloc_node_memcg implicitly assumes that the caller + * wants a __GFP_ACCOUNT allocation. + */ +static __always_inline void * +kmem_cache_alloc_node_memcg(struct kmem_cache *cachep, gfp_t flags, int node, + struct mem_cgroup *memcg) +{ + struct mem_cgroup *old_memcg = memalloc_memcg_save(memcg); + void *ptr = kmem_cache_alloc_node(cachep, flags | __GFP_ACCOUNT, node); + + memalloc_memcg_restore(old_memcg); + return ptr; +} + #ifdef CONFIG_TRACING extern void *kmem_cache_alloc_trace(struct kmem_cache *, gfp_t, size_t) __assume_slab_alignment __malloc; @@ -517,6 +548,20 @@ static __always_inline void *kmalloc(size_t size, gfp_t flags) return __kmalloc(size, flags); } +/* + * Calling kmalloc_memcg implicitly assumes that the caller wants a + * __GFP_ACCOUNT allocation. + */ +static __always_inline void *kmalloc_memcg(size_t size, gfp_t flags, + struct mem_cgroup *memcg) +{ + struct mem_cgroup *old_memcg = memalloc_memcg_save(memcg); + void *ptr = kmalloc(size, flags | __GFP_ACCOUNT); + + memalloc_memcg_restore(old_memcg); + return ptr; +} + /* * Determine size used for the nth kmalloc cache. * return size or 0 if a kmalloc cache for that @@ -554,6 +599,20 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) return __kmalloc_node(size, flags, node); } +/* + * Calling kmalloc_node_memcg implicitly assumes that the caller wants a + * __GFP_ACCOUNT allocation. + */ +static __always_inline void * +kmalloc_node_memcg(size_t size, gfp_t flags, int node, struct mem_cgroup *memcg) +{ + struct mem_cgroup *old_memcg = memalloc_memcg_save(memcg); + void *ptr = kmalloc_node(size, flags | __GFP_ACCOUNT, node); + + memalloc_memcg_restore(old_memcg); + return ptr; +} + struct memcg_cache_array { struct rcu_head rcu; struct kmem_cache *entries[0]; diff --git a/include/linux/swap.h b/include/linux/swap.h index c063443d86381eabac4cde76d723dd0991091271..f73eafcaf4e9874beb234d82fcfca5f7aed607b2 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -172,8 +172,9 @@ enum { SWP_PAGE_DISCARD = (1 << 9), /* freed swap page-cluster discards */ SWP_STABLE_WRITES = (1 << 10), /* no overwrite PG_writeback pages */ SWP_SYNCHRONOUS_IO = (1 << 11), /* synchronous IO is efficient */ + SWP_VALID = (1 << 12), /* swap is valid to be operated on? */ /* add others here before... 
*/ - SWP_SCANNING = (1 << 12), /* refcount in scan_swap_map */ + SWP_SCANNING = (1 << 13), /* refcount in scan_swap_map */ }; #define SWAP_CLUSTER_MAX 32UL @@ -460,7 +461,7 @@ extern unsigned int count_swap_pages(int, int); extern sector_t map_swap_page(struct page *, struct block_device **); extern sector_t swapdev_block(int, pgoff_t); extern int page_swapcount(struct page *); -extern int __swap_count(struct swap_info_struct *si, swp_entry_t entry); +extern int __swap_count(swp_entry_t entry); extern int __swp_swapcount(swp_entry_t entry); extern int swp_swapcount(swp_entry_t entry); extern struct swap_info_struct *page_swap_info(struct page *); @@ -470,6 +471,12 @@ extern int try_to_free_swap(struct page *); struct backing_dev_info; extern int init_swap_address_space(unsigned int type, unsigned long nr_pages); extern void exit_swap_address_space(unsigned int type); +extern struct swap_info_struct *get_swap_device(swp_entry_t entry); + +static inline void put_swap_device(struct swap_info_struct *si) +{ + preempt_enable(); +} #else /* CONFIG_SWAP */ @@ -575,7 +582,7 @@ static inline int page_swapcount(struct page *page) return 0; } -static inline int __swap_count(struct swap_info_struct *si, swp_entry_t entry) +static inline int __swap_count(swp_entry_t entry) { return 0; } diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index af5f8c2df87ac51b401acab46c817b649e99893d..3165863aa187c14066962f8ec8a42a0c8b5dfbc1 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -207,4 +207,8 @@ struct prctl_mm_map { # define PR_SVE_VL_LEN_MASK 0xffff # define PR_SVE_VL_INHERIT (1 << 17) /* inherit across exec */ +/* Process-based variant of PDEATHSIG */ +#define PR_SET_PDEATHSIG_PROC 48 +#define PR_GET_PDEATHSIG_PROC 49 + #endif /* _LINUX_PRCTL_H */ diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 7f1b64e6eb63bad6fdd0aa3667f870c50949ac7f..59e4ba6c5ddeca4f6ce537d594efee5bd76fa517 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1748,6 +1748,9 @@ static int parse_cgroup_root_flags(char *data, unsigned int *root_flags) if (!strcmp(token, "nsdelegate")) { *root_flags |= CGRP_ROOT_NS_DELEGATE; continue; + } else if (!strcmp(token, "groupoom")) { + *root_flags |= CGRP_GROUP_OOM; + continue; } pr_err("cgroup2: unknown option \"%s\"\n", token); @@ -1764,6 +1767,11 @@ static void apply_cgroup_root_flags(unsigned int root_flags) cgrp_dfl_root.flags |= CGRP_ROOT_NS_DELEGATE; else cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE; + + if (root_flags & CGRP_GROUP_OOM) + cgrp_dfl_root.flags |= CGRP_GROUP_OOM; + else + cgrp_dfl_root.flags &= ~CGRP_GROUP_OOM; } } @@ -1771,6 +1779,8 @@ static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root { if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) seq_puts(seq, ",nsdelegate"); + if (cgrp_dfl_root.flags & CGRP_GROUP_OOM) + seq_puts(seq, ",groupoom"); return 0; } @@ -5958,7 +5968,8 @@ static struct kobj_attribute cgroup_delegate_attr = __ATTR_RO(delegate); static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return snprintf(buf, PAGE_SIZE, "nsdelegate\n"); + return snprintf(buf, PAGE_SIZE, "nsdelegate\n" + "groupoom\n"); } static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features); diff --git a/kernel/cred.c b/kernel/cred.c index ecf03657e71c9c2eb959cc2ee46cee0414f5f0ca..0192a94670e12a1c389b3f8008a0f59da527411c 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -448,6 +448,7 @@ int commit_creds(struct cred *new) if (task->mm) 
set_dumpable(task->mm, suid_dumpable); task->pdeath_signal = 0; + task->signal->pdeath_signal_proc = 0; smp_wmb(); } diff --git a/kernel/exit.c b/kernel/exit.c index c3c7ac5601140961563534441908b52ced5a557c..63f7cea6456ecf877cab8cdb68aecc732ca5b50a 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -635,6 +635,10 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p, if (unlikely(p->exit_state == EXIT_DEAD)) return; + if (p->signal->pdeath_signal_proc) + group_send_sig_info(p->signal->pdeath_signal_proc, + SEND_SIG_NOINFO, p); + /* We don't want people slaying init. */ p->exit_signal = SIGCHLD; diff --git a/kernel/fork.c b/kernel/fork.c index a5d21c42acfc8ca2d0e14a0969c9cdc9222fb42d..a05baf4e2ca867b5da4be24b8f1a11cc01727027 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -440,6 +440,13 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, continue; } charge = 0; + /* + * Don't duplicate many vmas if we've been oom-killed + */ + if (fatal_signal_pending(current)) { + retval = -EINTR; + goto out; + } if (mpnt->vm_flags & VM_ACCOUNT) { unsigned long len = vma_pages(mpnt); @@ -835,6 +842,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) tsk->fail_nth = 0; #endif +#ifdef CONFIG_MEMCG + tsk->target_memcg = NULL; +#endif return tsk; free_stack: @@ -899,6 +909,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm->pinned_vm = 0; memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); spin_lock_init(&mm->page_table_lock); + spin_lock_init(&mm->arg_lock); mm_init_cpumask(mm); mm_init_aio(mm); mm_init_owner(mm, p); @@ -1470,6 +1481,8 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) mutex_init(&sig->cred_guard_mutex); + sig->pdeath_signal_proc = current->signal->pdeath_signal_proc; + return 0; } diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 751593ed7c0b0b9cc9bf74735fb9bd5d7e100be2..32b479468e4d50d69530ad766ba7e3897afa5cc3 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -44,6 +44,7 @@ int __read_mostly sysctl_hung_task_warnings = 10; static int __read_mostly did_panic; static bool hung_task_show_lock; +static bool hung_task_call_panic; static struct task_struct *watchdog_task; @@ -127,10 +128,8 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) touch_nmi_watchdog(); if (sysctl_hung_task_panic) { - if (hung_task_show_lock) - debug_show_all_locks(); - trigger_all_cpu_backtrace(); - panic("hung_task: blocked tasks"); + hung_task_show_lock = true; + hung_task_call_panic = true; } } @@ -193,6 +192,10 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) rcu_read_unlock(); if (hung_task_show_lock) debug_show_all_locks(); + if (hung_task_call_panic) { + trigger_all_cpu_backtrace(); + panic("hung_task: blocked tasks"); + } } static long hung_timeout_jiffies(unsigned long last_checked, diff --git a/kernel/sys.c b/kernel/sys.c index ad692183dfe9327ca3fd9a35de137240f3f28b78..86e5ef1a56126dd57795a7a5d90128c869150c99 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1815,68 +1815,7 @@ SYSCALL_DEFINE1(umask, int, mask) return mask; } -static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) -{ - struct fd exe; - struct file *old_exe, *exe_file; - struct inode *inode; - int err; - - exe = fdget(fd); - if (!exe.file) - return -EBADF; - - inode = file_inode(exe.file); - - /* - * Because the original mm->exe_file points to executable file, make - * sure that this one is executable as well, to avoid breaking an - * 
overall picture. - */ - err = -EACCES; - if (!S_ISREG(inode->i_mode) || path_noexec(&exe.file->f_path)) - goto exit; - - err = inode_permission(inode, MAY_EXEC); - if (err) - goto exit; - - /* - * Forbid mm->exe_file change if old file still mapped. - */ - exe_file = get_mm_exe_file(mm); - err = -EBUSY; - if (exe_file) { - struct vm_area_struct *vma; - - down_read(&mm->mmap_sem); - for (vma = mm->mmap; vma; vma = vma->vm_next) { - if (!vma->vm_file) - continue; - if (path_equal(&vma->vm_file->f_path, - &exe_file->f_path)) - goto exit_err; - } - - up_read(&mm->mmap_sem); - fput(exe_file); - } - - err = 0; - /* set the new file, lockless */ - get_file(exe.file); - old_exe = xchg(&mm->exe_file, exe.file); - if (old_exe) - fput(old_exe); -exit: - fdput(exe); - return err; -exit_err: - up_read(&mm->mmap_sem); - fput(exe_file); - goto exit; -} - +#ifdef CONFIG_CHECKPOINT_RESTORE /* * WARNING: we don't require any capability here so be very careful * in what is allowed for modification from userspace. @@ -1968,7 +1907,68 @@ static int validate_prctl_map(struct prctl_mm_map *prctl_map) return error; } -#ifdef CONFIG_CHECKPOINT_RESTORE +static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) +{ + struct fd exe; + struct file *old_exe, *exe_file; + struct inode *inode; + int err; + + exe = fdget(fd); + if (!exe.file) + return -EBADF; + + inode = file_inode(exe.file); + + /* + * Because the original mm->exe_file points to executable file, make + * sure that this one is executable as well, to avoid breaking an + * overall picture. + */ + err = -EACCES; + if (!S_ISREG(inode->i_mode) || path_noexec(&exe.file->f_path)) + goto exit; + + err = inode_permission(inode, MAY_EXEC); + if (err) + goto exit; + + /* + * Forbid mm->exe_file change if old file still mapped. + */ + exe_file = get_mm_exe_file(mm); + err = -EBUSY; + if (exe_file) { + struct vm_area_struct *vma; + + down_read(&mm->mmap_sem); + for (vma = mm->mmap; vma; vma = vma->vm_next) { + if (!vma->vm_file) + continue; + if (path_equal(&vma->vm_file->f_path, + &exe_file->f_path)) + goto exit_err; + } + + up_read(&mm->mmap_sem); + fput(exe_file); + } + + err = 0; + /* set the new file, lockless */ + get_file(exe.file); + old_exe = xchg(&mm->exe_file, exe.file); + if (old_exe) + fput(old_exe); +exit: + fdput(exe); + return err; +exit_err: + up_read(&mm->mmap_sem); + fput(exe_file); + goto exit; +} + static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data_size) { struct prctl_mm_map prctl_map = { .exe_fd = (u32)-1, }; @@ -2011,7 +2011,11 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data return error; } - down_write(&mm->mmap_sem); + /* + * arg_lock protects concurent updates but we still need mmap_sem for + * read to exclude races with sys_brk. 
+ */ + down_read(&mm->mmap_sem); /* * We don't validate if these members are pointing to @@ -2025,6 +2029,7 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data * to any problem in kernel itself */ + spin_lock(&mm->arg_lock); mm->start_code = prctl_map.start_code; mm->end_code = prctl_map.end_code; mm->start_data = prctl_map.start_data; @@ -2036,6 +2041,7 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data mm->arg_end = prctl_map.arg_end; mm->env_start = prctl_map.env_start; mm->env_end = prctl_map.env_end; + spin_unlock(&mm->arg_lock); /* * Note this update of @saved_auxv is lockless thus @@ -2048,168 +2054,21 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data if (prctl_map.auxv_size) memcpy(mm->saved_auxv, user_auxv, sizeof(user_auxv)); - up_write(&mm->mmap_sem); + up_read(&mm->mmap_sem); return 0; } #endif /* CONFIG_CHECKPOINT_RESTORE */ -static int prctl_set_auxv(struct mm_struct *mm, unsigned long addr, - unsigned long len) -{ - /* - * This doesn't move the auxiliary vector itself since it's pinned to - * mm_struct, but it permits filling the vector with new values. It's - * up to the caller to provide sane values here, otherwise userspace - * tools which use this vector might be unhappy. - */ - unsigned long user_auxv[AT_VECTOR_SIZE]; - - if (len > sizeof(user_auxv)) - return -EINVAL; - - if (copy_from_user(user_auxv, (const void __user *)addr, len)) - return -EFAULT; - - /* Make sure the last entry is always AT_NULL */ - user_auxv[AT_VECTOR_SIZE - 2] = 0; - user_auxv[AT_VECTOR_SIZE - 1] = 0; - - BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv)); - - task_lock(current); - memcpy(mm->saved_auxv, user_auxv, len); - task_unlock(current); - - return 0; -} - static int prctl_set_mm(int opt, unsigned long addr, unsigned long arg4, unsigned long arg5) { - struct mm_struct *mm = current->mm; - struct prctl_mm_map prctl_map; - struct vm_area_struct *vma; - int error; - - if (arg5 || (arg4 && (opt != PR_SET_MM_AUXV && - opt != PR_SET_MM_MAP && - opt != PR_SET_MM_MAP_SIZE))) - return -EINVAL; - #ifdef CONFIG_CHECKPOINT_RESTORE if (opt == PR_SET_MM_MAP || opt == PR_SET_MM_MAP_SIZE) return prctl_set_mm_map(opt, (const void __user *)addr, arg4); #endif - if (!capable(CAP_SYS_RESOURCE)) - return -EPERM; - - if (opt == PR_SET_MM_EXE_FILE) - return prctl_set_mm_exe_file(mm, (unsigned int)addr); - - if (opt == PR_SET_MM_AUXV) - return prctl_set_auxv(mm, addr, arg4); - - if (addr >= TASK_SIZE || addr < mmap_min_addr) - return -EINVAL; - - error = -EINVAL; - - down_write(&mm->mmap_sem); - vma = find_vma(mm, addr); - - prctl_map.start_code = mm->start_code; - prctl_map.end_code = mm->end_code; - prctl_map.start_data = mm->start_data; - prctl_map.end_data = mm->end_data; - prctl_map.start_brk = mm->start_brk; - prctl_map.brk = mm->brk; - prctl_map.start_stack = mm->start_stack; - prctl_map.arg_start = mm->arg_start; - prctl_map.arg_end = mm->arg_end; - prctl_map.env_start = mm->env_start; - prctl_map.env_end = mm->env_end; - prctl_map.auxv = NULL; - prctl_map.auxv_size = 0; - prctl_map.exe_fd = -1; - - switch (opt) { - case PR_SET_MM_START_CODE: - prctl_map.start_code = addr; - break; - case PR_SET_MM_END_CODE: - prctl_map.end_code = addr; - break; - case PR_SET_MM_START_DATA: - prctl_map.start_data = addr; - break; - case PR_SET_MM_END_DATA: - prctl_map.end_data = addr; - break; - case PR_SET_MM_START_STACK: - prctl_map.start_stack = addr; - break; - case PR_SET_MM_START_BRK: - prctl_map.start_brk = 
addr; - break; - case PR_SET_MM_BRK: - prctl_map.brk = addr; - break; - case PR_SET_MM_ARG_START: - prctl_map.arg_start = addr; - break; - case PR_SET_MM_ARG_END: - prctl_map.arg_end = addr; - break; - case PR_SET_MM_ENV_START: - prctl_map.env_start = addr; - break; - case PR_SET_MM_ENV_END: - prctl_map.env_end = addr; - break; - default: - goto out; - } - - error = validate_prctl_map(&prctl_map); - if (error) - goto out; - - switch (opt) { - /* - * If command line arguments and environment - * are placed somewhere else on stack, we can - * set them up here, ARG_START/END to setup - * command line argumets and ENV_START/END - * for environment. - */ - case PR_SET_MM_START_STACK: - case PR_SET_MM_ARG_START: - case PR_SET_MM_ARG_END: - case PR_SET_MM_ENV_START: - case PR_SET_MM_ENV_END: - if (!vma) { - error = -EFAULT; - goto out; - } - } - - mm->start_code = prctl_map.start_code; - mm->end_code = prctl_map.end_code; - mm->start_data = prctl_map.start_data; - mm->end_data = prctl_map.end_data; - mm->start_brk = prctl_map.start_brk; - mm->brk = prctl_map.brk; - mm->start_stack = prctl_map.start_stack; - mm->arg_start = prctl_map.arg_start; - mm->arg_end = prctl_map.arg_end; - mm->env_start = prctl_map.env_start; - mm->env_end = prctl_map.env_end; - - error = 0; -out: - up_write(&mm->mmap_sem); - return error; + pr_warn_once("PR_SET_MM_* has been removed. Use PR_SET_MM_MAP instead\n"); + return -EINVAL; } #ifdef CONFIG_CHECKPOINT_RESTORE @@ -2265,6 +2124,17 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, case PR_GET_PDEATHSIG: error = put_user(me->pdeath_signal, (int __user *)arg2); break; + case PR_SET_PDEATHSIG_PROC: + if (!valid_signal(arg2)) { + error = -EINVAL; + break; + } + me->signal->pdeath_signal_proc = arg2; + break; + case PR_GET_PDEATHSIG_PROC: + error = put_user(me->signal->pdeath_signal_proc, + (int __user *)arg2); + break; case PR_GET_DUMPABLE: error = get_dumpable(me->mm); break; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 6a78cf70761db3c436316f8e3f7453d20962dc08..f45ed9e696eb55765f78c6b8862b421ee2e837a0 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1804,6 +1804,24 @@ static struct ctl_table fs_table[] = { .extra1 = &zero, .extra2 = &one, }, + { + .procname = "protected_fifos", + .data = &sysctl_protected_fifos, + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &two, + }, + { + .procname = "protected_regular", + .data = &sysctl_protected_regular, + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &two, + }, { .procname = "suid_dumpable", .data = &suid_dumpable, diff --git a/lib/bitmap.c b/lib/bitmap.c index a42eff7e8c48bd81dd3db841a1e04f1672980b91..58f9750e49c6867d7e26ab00ef28c62966168461 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -64,12 +64,9 @@ EXPORT_SYMBOL(__bitmap_equal); void __bitmap_complement(unsigned long *dst, const unsigned long *src, unsigned int bits) { - unsigned int k, lim = bits/BITS_PER_LONG; + unsigned int k, lim = BITS_TO_LONGS(bits); for (k = 0; k < lim; ++k) dst[k] = ~src[k]; - - if (bits % BITS_PER_LONG) - dst[k] = ~src[k]; } EXPORT_SYMBOL(__bitmap_complement); diff --git a/lib/find_bit_benchmark.c b/lib/find_bit_benchmark.c index 5985a25e6cbcff7392c50a27a4beea379ab5d62b..5367ffa5c18f9cf98a811dc0e1a2fa15433fa401 100644 --- a/lib/find_bit_benchmark.c +++ b/lib/find_bit_benchmark.c @@ -132,7 +132,12 @@ static int __init find_bit_test(void) test_find_next_bit(bitmap, BITMAP_LEN); 
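The new prctl pair is process-wide (stored in signal_struct) rather than per-thread like PR_SET_PDEATHSIG. A minimal, hypothetical userspace sketch is shown below; since PR_SET_PDEATHSIG_PROC (48) and PR_GET_PDEATHSIG_PROC (49) are introduced by this series, they are defined locally in case <sys/prctl.h> does not yet carry them.

/* Illustrative userspace usage only; not part of the patch. */
#include <stdio.h>
#include <signal.h>
#include <sys/prctl.h>

#ifndef PR_SET_PDEATHSIG_PROC		/* added by this series */
#define PR_SET_PDEATHSIG_PROC 48
#define PR_GET_PDEATHSIG_PROC 49
#endif

int main(void)
{
	int sig = 0;

	/* Deliver SIGTERM to this process when the parent process dies. */
	if (prctl(PR_SET_PDEATHSIG_PROC, SIGTERM, 0, 0, 0) < 0) {
		perror("PR_SET_PDEATHSIG_PROC");
		return 1;
	}
	if (prctl(PR_GET_PDEATHSIG_PROC, &sig, 0, 0, 0) < 0) {
		perror("PR_GET_PDEATHSIG_PROC");
		return 1;
	}
	printf("pdeath_signal_proc = %d\n", sig);
	return 0;
}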
test_find_next_zero_bit(bitmap, BITMAP_LEN); test_find_last_bit(bitmap, BITMAP_LEN); - test_find_first_bit(bitmap, BITMAP_LEN); + + /* + * test_find_first_bit() may take some time, so + * traverse only part of bitmap to avoid soft lockup. + */ + test_find_first_bit(bitmap, BITMAP_LEN / 10); test_find_next_and_bit(bitmap, bitmap2, BITMAP_LEN); pr_err("\nStart testing find_bit() with sparse bitmap\n"); diff --git a/lib/idr.c b/lib/idr.c index 823b813f08f862b4c80463ad69f91ba9acf62703..ed9c169c12bdc966448d96b79de30f899e185a34 100644 --- a/lib/idr.c +++ b/lib/idr.c @@ -4,9 +4,9 @@ #include <linux/idr.h> #include <linux/slab.h> #include <linux/spinlock.h> +#include <linux/xarray.h> DEFINE_PER_CPU(struct ida_bitmap *, ida_bitmap); -static DEFINE_SPINLOCK(simple_ida_lock); /** * idr_alloc_u32() - Allocate an ID. @@ -581,7 +581,7 @@ int ida_simple_get(struct ida *ida, unsigned int start, unsigned int end, if (!ida_pre_get(ida, gfp_mask)) return -ENOMEM; - spin_lock_irqsave(&simple_ida_lock, flags); + xa_lock_irqsave(&ida->ida_rt, flags); ret = ida_get_new_above(ida, start, &id); if (!ret) { if (id > max) { @@ -591,7 +591,7 @@ int ida_simple_get(struct ida *ida, unsigned int start, unsigned int end, ret = id; } } - spin_unlock_irqrestore(&simple_ida_lock, flags); + xa_unlock_irqrestore(&ida->ida_rt, flags); if (unlikely(ret == -EAGAIN)) goto again; @@ -615,8 +615,8 @@ void ida_simple_remove(struct ida *ida, unsigned int id) unsigned long flags; BUG_ON((int)id < 0); - spin_lock_irqsave(&simple_ida_lock, flags); + xa_lock_irqsave(&ida->ida_rt, flags); ida_remove(ida, id); - spin_unlock_irqrestore(&simple_ida_lock, flags); + xa_unlock_irqrestore(&ida->ida_rt, flags); } EXPORT_SYMBOL(ida_simple_remove); diff --git a/mm/Kconfig b/mm/Kconfig index 2d7ef6207e1e4d98a9be8b49965192301c9e83f5..f27a60d982ff1a500a3bf6e29aa64575f067152c 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -753,3 +753,6 @@ config GUP_BENCHMARK performance of get_user_pages_fast(). 
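The idr.c change above only swaps the global simple_ida_lock for the per-IDA xa_lock, so callers of the simple IDA API are unaffected. A minimal caller-side sketch is shown below; example_ida and the helper names are hypothetical.

/* Illustrative only -- caller view of the (unchanged) simple IDA API. */
#include <linux/idr.h>

static DEFINE_IDA(example_ida);

static int example_get_minor(void)
{
	/* Allocate an id in [0, 255]; locking inside ida_simple_get()
	 * now uses the IDA's own xa_lock rather than a global spinlock. */
	return ida_simple_get(&example_ida, 0, 256, GFP_KERNEL);
}

static void example_put_minor(int id)
{
	ida_simple_remove(&example_ida, id);
}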
See tools/testing/selftests/vm/gup_benchmark.c + +config ARCH_HAS_PTE_SPECIAL + bool diff --git a/mm/Makefile b/mm/Makefile index b4e54a9ae9c591cf6a60071da482e6da801c4e0a..8716bdabe1e69f4758d1eedf9c6c8d9d98d2bec2 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -105,3 +105,4 @@ obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o obj-$(CONFIG_PERCPU_STATS) += percpu-stats.o obj-$(CONFIG_HMM) += hmm.o +obj-$(CONFIG_MEMFD_CREATE) += memfd.o diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 023190c69dce71c528cccb7498e7456d41f674d3..2fc3f38e4c4ff044d009a275eca0e9a9ebb6e0f4 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -220,11 +220,46 @@ static ssize_t stable_pages_required_show(struct device *dev, } static DEVICE_ATTR_RO(stable_pages_required); +static ssize_t strictlimit_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct backing_dev_info *bdi = dev_get_drvdata(dev); + unsigned int val; + ssize_t ret; + + ret = kstrtouint(buf, 10, &val); + if (ret < 0) + return ret; + + switch (val) { + case 0: + bdi->capabilities &= ~BDI_CAP_STRICTLIMIT; + break; + case 1: + bdi->capabilities |= BDI_CAP_STRICTLIMIT; + break; + default: + return -EINVAL; + } + + return count; +} +static ssize_t strictlimit_show(struct device *dev, + struct device_attribute *attr, char *page) +{ + struct backing_dev_info *bdi = dev_get_drvdata(dev); + + return snprintf(page, PAGE_SIZE-1, "%d\n", + !!(bdi->capabilities & BDI_CAP_STRICTLIMIT)); +} +static DEVICE_ATTR_RW(strictlimit); + static struct attribute *bdi_dev_attrs[] = { &dev_attr_read_ahead_kb.attr, &dev_attr_min_ratio.attr, &dev_attr_max_ratio.attr, &dev_attr_stable_pages_required.attr, + &dev_attr_strictlimit.attr, NULL, }; ATTRIBUTE_GROUPS(bdi_dev); diff --git a/mm/gup.c b/mm/gup.c index 76af4cfeaf68149f365cf9b29bef64ebe6f1474b..cf43ff4168b69aa98ea252fae0a003acc81dfd84 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -212,53 +212,69 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma, unsigned long address, pud_t *pudp, unsigned int flags, unsigned int *page_mask) { - pmd_t *pmd; + pmd_t *pmd, pmdval; spinlock_t *ptl; struct page *page; struct mm_struct *mm = vma->vm_mm; pmd = pmd_offset(pudp, address); - if (pmd_none(*pmd)) + /* + * The READ_ONCE() will stabilize the pmdval in a register or + * on the stack so that it will stop changing under the code. 
+ */ + pmdval = READ_ONCE(*pmd); + if (pmd_none(pmdval)) return no_page_table(vma, flags); - if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) { + if (pmd_huge(pmdval) && vma->vm_flags & VM_HUGETLB) { page = follow_huge_pmd(mm, address, pmd, flags); if (page) return page; return no_page_table(vma, flags); } - if (is_hugepd(__hugepd(pmd_val(*pmd)))) { + if (is_hugepd(__hugepd(pmd_val(pmdval)))) { page = follow_huge_pd(vma, address, - __hugepd(pmd_val(*pmd)), flags, + __hugepd(pmd_val(pmdval)), flags, PMD_SHIFT); if (page) return page; return no_page_table(vma, flags); } retry: - if (!pmd_present(*pmd)) { + if (!pmd_present(pmdval)) { if (likely(!(flags & FOLL_MIGRATION))) return no_page_table(vma, flags); VM_BUG_ON(thp_migration_supported() && - !is_pmd_migration_entry(*pmd)); - if (is_pmd_migration_entry(*pmd)) + !is_pmd_migration_entry(pmdval)); + if (is_pmd_migration_entry(pmdval)) pmd_migration_entry_wait(mm, pmd); + pmdval = READ_ONCE(*pmd); + /* + * MADV_DONTNEED may convert the pmd to null because + * mmap_sem is held in read mode + */ + if (pmd_none(pmdval)) + return no_page_table(vma, flags); goto retry; } - if (pmd_devmap(*pmd)) { + if (pmd_devmap(pmdval)) { ptl = pmd_lock(mm, pmd); page = follow_devmap_pmd(vma, address, pmd, flags); spin_unlock(ptl); if (page) return page; } - if (likely(!pmd_trans_huge(*pmd))) + if (likely(!pmd_trans_huge(pmdval))) return follow_page_pte(vma, address, pmd, flags); - if ((flags & FOLL_NUMA) && pmd_protnone(*pmd)) + if ((flags & FOLL_NUMA) && pmd_protnone(pmdval)) return no_page_table(vma, flags); retry_locked: ptl = pmd_lock(mm, pmd); + if (unlikely(pmd_none(*pmd))) { + spin_unlock(ptl); + return no_page_table(vma, flags); + } if (unlikely(!pmd_present(*pmd))) { spin_unlock(ptl); if (likely(!(flags & FOLL_MIGRATION))) @@ -1351,7 +1367,7 @@ static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages) } } -#ifdef __HAVE_ARCH_PTE_SPECIAL +#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { @@ -1427,7 +1443,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, { return 0; } -#endif /* __HAVE_ARCH_PTE_SPECIAL */ +#endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */ #if defined(__HAVE_ARCH_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE) static int __gup_device_huge(unsigned long pfn, unsigned long addr, diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index eec1150125b9857f3f84185d1d5beeff64ec034f..68c2f2f3c05b76203fae22ad8cf727482c95216e 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -84,7 +84,7 @@ static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup, limit = round_down(PAGE_COUNTER_MAX, 1 << huge_page_order(&hstates[idx])); - ret = page_counter_limit(counter, limit); + ret = page_counter_set_max(counter, limit); VM_BUG_ON(ret); } } @@ -273,7 +273,7 @@ static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css, case RES_USAGE: return (u64)page_counter_read(counter) * PAGE_SIZE; case RES_LIMIT: - return (u64)counter->limit * PAGE_SIZE; + return (u64)counter->max * PAGE_SIZE; case RES_MAX_USAGE: return (u64)counter->watermark * PAGE_SIZE; case RES_FAILCNT: @@ -306,7 +306,7 @@ static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of, switch (MEMFILE_ATTR(of_cft(of)->private)) { case RES_LIMIT: mutex_lock(&hugetlb_limit_mutex); - ret = page_counter_limit(&h_cg->hugepage[idx], nr_pages); + ret = page_counter_set_max(&h_cg->hugepage[idx], nr_pages); 
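After the page_counter rename, the fields read as usage/max (plus the new low/elow tracking) instead of count/limit, and page_counter_limit() becomes page_counter_set_max(). The sketch below shows the renamed API from a controller's point of view; the function name and the 512-page value are illustrative, not from the patch.

/* Illustrative only -- page_counter API after the count/limit rename. */
#include <linux/page_counter.h>
#include <linux/printk.h>

static void example_counter_usage(struct page_counter *parent)
{
	struct page_counter counter, *fail;

	page_counter_init(&counter, parent);

	/* "limit" is now "max": cap this hypothetical counter at 512 pages. */
	if (page_counter_set_max(&counter, 512))
		return;

	/* Charging and reading are unchanged apart from the naming. */
	if (page_counter_try_charge(&counter, 1, &fail)) {
		pr_info("usage: %lu of %lu pages\n",
			page_counter_read(&counter), counter.max);
		page_counter_uncharge(&counter, 1);
	}
}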
mutex_unlock(&hugetlb_limit_mutex); break; default: diff --git a/mm/init-mm.c b/mm/init-mm.c index f94d5d15ebc07a853cd5b1c5b50d364dfd97d6a1..f0179c9c04c26084ba3e76c9954eeb25a2319d1d 100644 --- a/mm/init-mm.c +++ b/mm/init-mm.c @@ -22,6 +22,7 @@ struct mm_struct init_mm = { .mm_count = ATOMIC_INIT(1), .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem), .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock), + .arg_lock = __SPIN_LOCK_UNLOCKED(init_mm.arg_lock), .mmlist = LIST_HEAD_INIT(init_mm.mmlist), .user_ns = &init_user_ns, INIT_MM_CONTEXT(init_mm) diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index bc0e68f7dc756104ff47005a4f9403af7e08fe4e..135ce2838c893dfefa285805330aa00c054a597c 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -792,6 +792,41 @@ DEFINE_ASAN_SET_SHADOW(f5); DEFINE_ASAN_SET_SHADOW(f8); #ifdef CONFIG_MEMORY_HOTPLUG +static bool shadow_mapped(unsigned long addr) +{ + pgd_t *pgd = pgd_offset_k(addr); + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + if (pgd_none(*pgd)) + return false; + p4d = p4d_offset(pgd, addr); + if (p4d_none(*p4d)) + return false; + pud = pud_offset(p4d, addr); + if (pud_none(*pud)) + return false; + + /* + * We can't use pud_large() or pud_huge(), the first one + * is arch-specific, the last one depend on HUGETLB_PAGE. + * So let's abuse pud_bad(), if bud is bad it's has to + * because it's huge. + */ + if (pud_bad(*pud)) + return true; + pmd = pmd_offset(pud, addr); + if (pmd_none(*pmd)) + return false; + + if (pmd_bad(*pmd)) + return true; + pte = pte_offset_kernel(pmd, addr); + return !pte_none(*pte); +} + static int __meminit kasan_mem_notifier(struct notifier_block *nb, unsigned long action, void *data) { @@ -813,6 +848,14 @@ static int __meminit kasan_mem_notifier(struct notifier_block *nb, case MEM_GOING_ONLINE: { void *ret; + /* + * If shadow is mapped already than it must have been mapped + * during the boot. This could happen if we onlining previously + * offlined memory. + */ + if (shadow_mapped(shadow_start)) + return NOTIFY_OK; + ret = __vmalloc_node_range(shadow_size, PAGE_SIZE, shadow_start, shadow_end, GFP_KERNEL, PAGE_KERNEL, VM_NO_GUARD, @@ -824,8 +867,18 @@ static int __meminit kasan_mem_notifier(struct notifier_block *nb, kmemleak_ignore(ret); return NOTIFY_OK; } - case MEM_OFFLINE: - vfree((void *)shadow_start); + case MEM_OFFLINE: { + struct vm_struct *vm; + + /* + * Only hot-added memory have vm_area. Freeing shadow + * mapped during boot would be tricky, so we'll just + * have to keep it. + */ + vm = find_vm_area((void *)shadow_start); + if (vm) + vfree((void *)shadow_start); + } } return NOTIFY_OK; diff --git a/mm/list_lru.c b/mm/list_lru.c index fcfb6c89ed4777f46cb09f173c79719c3dbb46b9..d9c84c5bda1d559ae5e27c63ca32604d93387fa8 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -8,6 +8,7 @@ #include <linux/module.h> #include <linux/mm.h> #include <linux/list_lru.h> +#include <linux/prefetch.h> #include <linux/slab.h> #include <linux/mutex.h> #include <linux/memcontrol.h> @@ -133,6 +134,12 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item) struct list_lru_node *nlru = &lru->node[nid]; struct list_lru_one *l; + /* + * Prefetch the neighboring list entries to reduce lock hold time. 
+ */ + prefetchw(item->prev); + prefetchw(item->next); + spin_lock(&nlru->lock); if (!list_empty(item)) { l = list_lru_from_kmem(nlru, item); diff --git a/mm/memblock.c b/mm/memblock.c index 5108356ad8aaddc9d24d9a0edf716553d75507b5..eec988c21c7e473da97c7e66e09244158ff3577d 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -68,7 +68,7 @@ ulong __init_memblock choose_memblock_flags(void) /* adjust *@size so that (@base + *@size) doesn't overflow, return new size */ static inline phys_addr_t memblock_cap_size(phys_addr_t base, phys_addr_t *size) { - return *size = min(*size, (phys_addr_t)ULLONG_MAX - base); + return *size = min(*size, PHYS_ADDR_MAX - base); } /* @@ -925,7 +925,7 @@ void __init_memblock __next_mem_range(u64 *idx, int nid, ulong flags, r = &type_b->regions[idx_b]; r_start = idx_b ? r[-1].base + r[-1].size : 0; r_end = idx_b < type_b->cnt ? - r->base : (phys_addr_t)ULLONG_MAX; + r->base : PHYS_ADDR_MAX; /* * if idx_b advanced past idx_a, @@ -1041,7 +1041,7 @@ void __init_memblock __next_mem_range_rev(u64 *idx, int nid, ulong flags, r = &type_b->regions[idx_b]; r_start = idx_b ? r[-1].base + r[-1].size : 0; r_end = idx_b < type_b->cnt ? - r->base : (phys_addr_t)ULLONG_MAX; + r->base : PHYS_ADDR_MAX; /* * if idx_b advanced past idx_a, * break out to advance idx_a @@ -1516,13 +1516,13 @@ phys_addr_t __init_memblock memblock_end_of_DRAM(void) static phys_addr_t __init_memblock __find_max_addr(phys_addr_t limit) { - phys_addr_t max_addr = (phys_addr_t)ULLONG_MAX; + phys_addr_t max_addr = PHYS_ADDR_MAX; struct memblock_region *r; /* * translate the memory @limit size into the max address within one of * the memory memblock regions, if the @limit exceeds the total size - * of those regions, max_addr will keep original value ULLONG_MAX + * of those regions, max_addr will keep original value PHYS_ADDR_MAX */ for_each_memblock(memory, r) { if (limit <= r->size) { @@ -1537,7 +1537,7 @@ static phys_addr_t __init_memblock __find_max_addr(phys_addr_t limit) void __init memblock_enforce_memory_limit(phys_addr_t limit) { - phys_addr_t max_addr = (phys_addr_t)ULLONG_MAX; + phys_addr_t max_addr = PHYS_ADDR_MAX; if (!limit) return; @@ -1545,14 +1545,14 @@ void __init memblock_enforce_memory_limit(phys_addr_t limit) max_addr = __find_max_addr(limit); /* @limit exceeds the total size of the memory, do nothing */ - if (max_addr == (phys_addr_t)ULLONG_MAX) + if (max_addr == PHYS_ADDR_MAX) return; /* truncate both memory and reserved regions */ memblock_remove_range(&memblock.memory, max_addr, - (phys_addr_t)ULLONG_MAX); + PHYS_ADDR_MAX); memblock_remove_range(&memblock.reserved, max_addr, - (phys_addr_t)ULLONG_MAX); + PHYS_ADDR_MAX); } void __init memblock_cap_memory_range(phys_addr_t base, phys_addr_t size) @@ -1580,7 +1580,7 @@ void __init memblock_cap_memory_range(phys_addr_t base, phys_addr_t size) /* truncate the reserved regions */ memblock_remove_range(&memblock.reserved, 0, base); memblock_remove_range(&memblock.reserved, - base + size, (phys_addr_t)ULLONG_MAX); + base + size, PHYS_ADDR_MAX); } void __init memblock_mem_limit_remove_map(phys_addr_t limit) @@ -1593,7 +1593,7 @@ void __init memblock_mem_limit_remove_map(phys_addr_t limit) max_addr = __find_max_addr(limit); /* @limit exceeds the total size of the memory, do nothing */ - if (max_addr == (phys_addr_t)ULLONG_MAX) + if (max_addr == PHYS_ADDR_MAX) return; memblock_cap_memory_range(0, max_addr); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2bd3df3d101a777144f7e91895dbcbcdd0e455cc..25b148c2d2228639d2fc167a1e0caf63c870758d 100644 
--- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -678,7 +678,7 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) } EXPORT_SYMBOL(mem_cgroup_from_task); -static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) +struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) { struct mem_cgroup *memcg = NULL; @@ -701,6 +701,20 @@ static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) return memcg; } +static __always_inline struct mem_cgroup *get_mem_cgroup( + struct mem_cgroup *memcg, struct mm_struct *mm) +{ + if (unlikely(memcg)) { + rcu_read_lock(); + if (css_tryget_online(&memcg->css)) { + rcu_read_unlock(); + return memcg; + } + rcu_read_unlock(); + } + return get_mem_cgroup_from_mm(mm); +} + /** * mem_cgroup_iter - iterate over memory cgroup hierarchy * @root: hierarchy root @@ -888,7 +902,8 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg) * value, the function breaks the iteration loop and returns the value. * Otherwise, it will iterate over all tasks and return 0. * - * This function must not be called for the root memory cgroup. + * If memcg is the root memory cgroup, this function will iterate only + * over tasks belonging directly to the root memory cgroup. */ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg, int (*fn)(struct task_struct *, void *), void *arg) @@ -896,8 +911,6 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg, struct mem_cgroup *iter; int ret = 0; - BUG_ON(memcg == root_mem_cgroup); - for_each_mem_cgroup_tree(iter, memcg) { struct css_task_iter it; struct task_struct *task; @@ -906,7 +919,7 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg, while (!ret && (task = css_task_iter_next(&it))) ret = fn(task, arg); css_task_iter_end(&it); - if (ret) { + if (ret || memcg == root_mem_cgroup) { mem_cgroup_iter_break(memcg, iter); break; } @@ -1034,13 +1047,13 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg) unsigned long limit; count = page_counter_read(&memcg->memory); - limit = READ_ONCE(memcg->memory.limit); + limit = READ_ONCE(memcg->memory.max); if (count < limit) margin = limit - count; if (do_memsw_account()) { count = page_counter_read(&memcg->memsw); - limit = READ_ONCE(memcg->memsw.limit); + limit = READ_ONCE(memcg->memsw.max); if (count <= limit) margin = min(margin, limit - count); else @@ -1148,13 +1161,13 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n", K((u64)page_counter_read(&memcg->memory)), - K((u64)memcg->memory.limit), memcg->memory.failcnt); + K((u64)memcg->memory.max), memcg->memory.failcnt); pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n", K((u64)page_counter_read(&memcg->memsw)), - K((u64)memcg->memsw.limit), memcg->memsw.failcnt); + K((u64)memcg->memsw.max), memcg->memsw.failcnt); pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n", K((u64)page_counter_read(&memcg->kmem)), - K((u64)memcg->kmem.limit), memcg->kmem.failcnt); + K((u64)memcg->kmem.max), memcg->kmem.failcnt); for_each_mem_cgroup_tree(iter, memcg) { pr_info("Memory cgroup stats for "); @@ -1179,21 +1192,21 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) /* * Return the memory (and swap, if configured) limit for a memcg. 
*/ -unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg) +unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg) { - unsigned long limit; + unsigned long max; - limit = memcg->memory.limit; + max = memcg->memory.max; if (mem_cgroup_swappiness(memcg)) { - unsigned long memsw_limit; - unsigned long swap_limit; + unsigned long memsw_max; + unsigned long swap_max; - memsw_limit = memcg->memsw.limit; - swap_limit = memcg->swap.limit; - swap_limit = min(swap_limit, (unsigned long)total_swap_pages); - limit = min(limit + swap_limit, memsw_limit); + memsw_max = memcg->memsw.max; + swap_max = memcg->swap.max; + swap_max = min(swap_max, (unsigned long)total_swap_pages); + max = min(max + swap_max, memsw_max); } - return limit; + return max; } static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, @@ -2261,7 +2274,7 @@ struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep) if (current->memcg_kmem_skip_account) return cachep; - memcg = get_mem_cgroup_from_mm(current->mm); + memcg = get_mem_cgroup(current->target_memcg, current->mm); kmemcg_id = READ_ONCE(memcg->kmemcg_id); if (kmemcg_id < 0) goto out; @@ -2345,7 +2358,7 @@ int memcg_kmem_charge(struct page *page, gfp_t gfp, int order) if (memcg_kmem_bypass()) return 0; - memcg = get_mem_cgroup_from_mm(current->mm); + memcg = get_mem_cgroup(current->target_memcg, current->mm); if (!mem_cgroup_is_root(memcg)) { ret = memcg_kmem_charge_memcg(page, gfp, order, memcg); if (!ret) @@ -2444,10 +2457,10 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry, } #endif -static DEFINE_MUTEX(memcg_limit_mutex); +static DEFINE_MUTEX(memcg_max_mutex); -static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, - unsigned long limit, bool memsw) +static int mem_cgroup_resize_max(struct mem_cgroup *memcg, + unsigned long max, bool memsw) { bool enlarge = false; int ret; @@ -2460,22 +2473,22 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, break; } - mutex_lock(&memcg_limit_mutex); + mutex_lock(&memcg_max_mutex); /* * Make sure that the new limit (memsw or memory limit) doesn't - * break our basic invariant rule memory.limit <= memsw.limit. + * break our basic invariant rule memory.max <= memsw.max. */ - limits_invariant = memsw ? limit >= memcg->memory.limit : - limit <= memcg->memsw.limit; + limits_invariant = memsw ? 
max >= memcg->memory.max : + max <= memcg->memsw.max; if (!limits_invariant) { - mutex_unlock(&memcg_limit_mutex); + mutex_unlock(&memcg_max_mutex); ret = -EINVAL; break; } - if (limit > counter->limit) + if (max > counter->max) enlarge = true; - ret = page_counter_limit(counter, limit); - mutex_unlock(&memcg_limit_mutex); + ret = page_counter_set_max(counter, max); + mutex_unlock(&memcg_max_mutex); if (!ret) break; @@ -2592,6 +2605,224 @@ static inline bool memcg_has_children(struct mem_cgroup *memcg) return ret; } +static long memcg_oom_badness(struct mem_cgroup *memcg, + const nodemask_t *nodemask, + unsigned long totalpages) +{ + long points = 0; + int nid; + pg_data_t *pgdat; + + for_each_node_state(nid, N_MEMORY) { + if (nodemask && !node_isset(nid, *nodemask)) + continue; + + points += mem_cgroup_node_nr_lru_pages(memcg, nid, + LRU_ALL_ANON | BIT(LRU_UNEVICTABLE)); + + pgdat = NODE_DATA(nid); + points += lruvec_page_state(mem_cgroup_lruvec(pgdat, memcg), + NR_SLAB_UNRECLAIMABLE); + } + + points += memcg_page_state(memcg, MEMCG_KERNEL_STACK_KB) / + (PAGE_SIZE / 1024); + points += memcg_page_state(memcg, MEMCG_SOCK); + points += memcg_page_state(memcg, MEMCG_SWAP); + + return points; +} + +/* + * Checks if the given memcg is a valid OOM victim and returns a number, + * which means the following: + * -1: there are inflight OOM victim tasks belonging to the memcg + * 0: memcg is not eligible, e.g. all belonging tasks are protected + * by oom_score_adj set to OOM_SCORE_ADJ_MIN + * >0: memcg is eligible, and the returned value is an estimation + * of the memory footprint + */ +static long oom_evaluate_memcg(struct mem_cgroup *memcg, + const nodemask_t *nodemask, + unsigned long totalpages) +{ + struct css_task_iter it; + struct task_struct *task; + int eligible = 0; + + /* + * The root memory cgroup is a special case: + * we don't have the necessary stats to evaluate it exactly as + * leaf memory cgroups, so we approximate its oom_score + * by summing the oom_score of all belonging tasks that are + * owners of their mm structs. + * + * If there are inflight OOM victim tasks inside + * the root memcg, we return -1. + */ + if (memcg == root_mem_cgroup) { + struct css_task_iter it; + struct task_struct *task; + long score = 0; + + css_task_iter_start(&memcg->css, 0, &it); + while ((task = css_task_iter_next(&it))) { + if (tsk_is_oom_victim(task) && + !test_bit(MMF_OOM_SKIP, + &task->signal->oom_mm->flags)) { + score = -1; + break; + } + + task_lock(task); + if (!task->mm || task->mm->owner != task) { + task_unlock(task); + continue; + } + task_unlock(task); + + score += oom_badness(task, memcg, nodemask, + totalpages); + } + css_task_iter_end(&it); + + return score; + } + + /* + * Memcg is OOM eligible if there are OOM killable tasks inside. + * + * We treat tasks with oom_score_adj set to OOM_SCORE_ADJ_MIN + * as unkillable. + * + * If there are inflight OOM victim tasks inside the memcg, + * we return -1.
+ */ + css_task_iter_start(&memcg->css, 0, &it); + while ((task = css_task_iter_next(&it))) { + if (!eligible && + task->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) + eligible = 1; + + if (tsk_is_oom_victim(task) && + !test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags)) { + eligible = -1; + break; + } + } + css_task_iter_end(&it); + + if (eligible <= 0) + return eligible; + + return memcg_oom_badness(memcg, nodemask, totalpages); +} + +static void select_victim_memcg(struct mem_cgroup *root, struct oom_control *oc) +{ + struct mem_cgroup *iter, *group = NULL; + long group_score = 0; + + oc->chosen_memcg = NULL; + oc->chosen_points = 0; + + /* + * If OOM is memcg-wide, and the memcg has the oom_group flag set, + * all tasks belonging to the memcg should be killed. + * So, we mark the memcg as a victim. + */ + if (oc->memcg && mem_cgroup_oom_group(oc->memcg)) { + oc->chosen_memcg = oc->memcg; + css_get(&oc->chosen_memcg->css); + return; + } + + /* + * The oom_score is calculated for leaf memory cgroups (including + * the root memcg). + * Non-leaf oom_group cgroups accumulate the scores of their descendant + * leaf memory cgroups. + */ + rcu_read_lock(); + for_each_mem_cgroup_tree(iter, root) { + long score; + + /* + * We don't consider non-leaf non-oom_group memory cgroups + * as OOM victims. + */ + if (memcg_has_children(iter) && iter != root_mem_cgroup && + !mem_cgroup_oom_group(iter)) + continue; + + /* + * If group is not set or we've run out of the group's sub-tree, + * we should set group and reset group_score. + */ + if (!group || group == root_mem_cgroup || + !mem_cgroup_is_descendant(iter, group)) { + group = iter; + group_score = 0; + } + + if (memcg_has_children(iter) && iter != root_mem_cgroup) + continue; + + score = oom_evaluate_memcg(iter, oc->nodemask, oc->totalpages); + + /* + * Ignore empty and non-eligible memory cgroups. + */ + if (score == 0) + continue; + + /* + * If there are inflight OOM victims, we don't need + * to look further for new victims. + */ + if (score == -1) { + oc->chosen_memcg = INFLIGHT_VICTIM; + mem_cgroup_iter_break(root, iter); + break; + } + + group_score += score; + + if (group_score > oc->chosen_points) { + oc->chosen_points = group_score; + oc->chosen_memcg = group; + } + } + + if (oc->chosen_memcg && oc->chosen_memcg != INFLIGHT_VICTIM) + css_get(&oc->chosen_memcg->css); + + rcu_read_unlock(); +} + +bool mem_cgroup_select_oom_victim(struct oom_control *oc) +{ + struct mem_cgroup *root; + + if (mem_cgroup_disabled()) + return false; + + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) + return false; + + if (!(cgrp_dfl_root.flags & CGRP_GROUP_OOM)) + return false; + + if (oc->memcg) + root = oc->memcg; + else + root = root_mem_cgroup; + + select_victim_memcg(root, oc); + + return oc->chosen_memcg; +} + /* * Reclaims as many pages from the given memcg as possible.
* @@ -2757,7 +2988,7 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE; return (u64)page_counter_read(counter) * PAGE_SIZE; case RES_LIMIT: - return (u64)counter->limit * PAGE_SIZE; + return (u64)counter->max * PAGE_SIZE; case RES_MAX_USAGE: return (u64)counter->watermark * PAGE_SIZE; case RES_FAILCNT: @@ -2871,24 +3102,24 @@ static void memcg_free_kmem(struct mem_cgroup *memcg) } #endif /* !CONFIG_SLOB */ -static int memcg_update_kmem_limit(struct mem_cgroup *memcg, - unsigned long limit) +static int memcg_update_kmem_max(struct mem_cgroup *memcg, + unsigned long max) { int ret; - mutex_lock(&memcg_limit_mutex); - ret = page_counter_limit(&memcg->kmem, limit); - mutex_unlock(&memcg_limit_mutex); + mutex_lock(&memcg_max_mutex); + ret = page_counter_set_max(&memcg->kmem, max); + mutex_unlock(&memcg_max_mutex); return ret; } -static int memcg_update_tcp_limit(struct mem_cgroup *memcg, unsigned long limit) +static int memcg_update_tcp_max(struct mem_cgroup *memcg, unsigned long max) { int ret; - mutex_lock(&memcg_limit_mutex); + mutex_lock(&memcg_max_mutex); - ret = page_counter_limit(&memcg->tcpmem, limit); + ret = page_counter_set_max(&memcg->tcpmem, max); if (ret) goto out; @@ -2913,7 +3144,7 @@ static int memcg_update_tcp_limit(struct mem_cgroup *memcg, unsigned long limit) memcg->tcpmem_active = true; } out: - mutex_unlock(&memcg_limit_mutex); + mutex_unlock(&memcg_max_mutex); return ret; } @@ -2941,16 +3172,16 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of, } switch (MEMFILE_TYPE(of_cft(of)->private)) { case _MEM: - ret = mem_cgroup_resize_limit(memcg, nr_pages, false); + ret = mem_cgroup_resize_max(memcg, nr_pages, false); break; case _MEMSWAP: - ret = mem_cgroup_resize_limit(memcg, nr_pages, true); + ret = mem_cgroup_resize_max(memcg, nr_pages, true); break; case _KMEM: - ret = memcg_update_kmem_limit(memcg, nr_pages); + ret = memcg_update_kmem_max(memcg, nr_pages); break; case _TCP: - ret = memcg_update_tcp_limit(memcg, nr_pages); + ret = memcg_update_tcp_max(memcg, nr_pages); break; } break; @@ -3126,8 +3357,8 @@ static int memcg_stat_show(struct seq_file *m, void *v) /* Hierarchical information */ memory = memsw = PAGE_COUNTER_MAX; for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) { - memory = min(memory, mi->memory.limit); - memsw = min(memsw, mi->memsw.limit); + memory = min(memory, mi->memory.max); + memsw = min(memsw, mi->memsw.max); } seq_printf(m, "hierarchical_memory_limit %llu\n", (u64)memory * PAGE_SIZE); @@ -3626,7 +3857,7 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, *pheadroom = PAGE_COUNTER_MAX; while ((parent = parent_mem_cgroup(memcg))) { - unsigned long ceiling = min(memcg->memory.limit, memcg->high); + unsigned long ceiling = min(memcg->memory.max, memcg->high); unsigned long used = page_counter_read(&memcg->memory); *pheadroom = min(*pheadroom, ceiling - min(ceiling, used)); @@ -4031,6 +4262,14 @@ static struct cftype mem_cgroup_legacy_files[] = { static DEFINE_IDR(mem_cgroup_idr); +static void mem_cgroup_id_remove(struct mem_cgroup *memcg) +{ + if (memcg->id.id > 0) { + idr_remove(&mem_cgroup_idr, memcg->id.id); + memcg->id.id = 0; + } +} + static void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n) { VM_BUG_ON(atomic_read(&memcg->id.ref) <= 0); @@ -4041,8 +4280,7 @@ static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n) { VM_BUG_ON(atomic_read(&memcg->id.ref) < n); if (atomic_sub_and_test(n, 
&memcg->id.ref)) { - idr_remove(&mem_cgroup_idr, memcg->id.id); - memcg->id.id = 0; + mem_cgroup_id_remove(memcg); /* Memcg ID pins CSS */ css_put(&memcg->css); @@ -4179,8 +4417,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void) idr_replace(&mem_cgroup_idr, memcg, memcg->id.id); return memcg; fail: - if (memcg->id.id > 0) - idr_remove(&mem_cgroup_idr, memcg->id.id); + mem_cgroup_id_remove(memcg); __mem_cgroup_free(memcg); return NULL; } @@ -4239,6 +4476,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) return &memcg->css; fail: + mem_cgroup_id_remove(memcg); mem_cgroup_free(memcg); return ERR_PTR(-ENOMEM); } @@ -4270,7 +4508,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) } spin_unlock(&memcg->event_list_lock); - memcg->low = 0; + page_counter_set_low(&memcg->memory, 0); memcg_offline_kmem(memcg); wb_memcg_offline(memcg); @@ -4319,12 +4557,12 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - page_counter_limit(&memcg->memory, PAGE_COUNTER_MAX); - page_counter_limit(&memcg->swap, PAGE_COUNTER_MAX); - page_counter_limit(&memcg->memsw, PAGE_COUNTER_MAX); - page_counter_limit(&memcg->kmem, PAGE_COUNTER_MAX); - page_counter_limit(&memcg->tcpmem, PAGE_COUNTER_MAX); - memcg->low = 0; + page_counter_set_max(&memcg->memory, PAGE_COUNTER_MAX); + page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX); + page_counter_set_max(&memcg->memsw, PAGE_COUNTER_MAX); + page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX); + page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX); + page_counter_set_low(&memcg->memory, 0); memcg->high = PAGE_COUNTER_MAX; memcg->soft_limit = PAGE_COUNTER_MAX; memcg_wb_domain_size_changed(memcg); @@ -5064,7 +5302,7 @@ static u64 memory_current_read(struct cgroup_subsys_state *css, static int memory_low_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); - unsigned long low = READ_ONCE(memcg->low); + unsigned long low = READ_ONCE(memcg->memory.low); if (low == PAGE_COUNTER_MAX) seq_puts(m, "max\n"); @@ -5086,7 +5324,7 @@ static ssize_t memory_low_write(struct kernfs_open_file *of, if (err) return err; - memcg->low = low; + page_counter_set_low(&memcg->memory, low); return nbytes; } @@ -5131,7 +5369,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of, static int memory_max_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); - unsigned long max = READ_ONCE(memcg->memory.limit); + unsigned long max = READ_ONCE(memcg->memory.max); if (max == PAGE_COUNTER_MAX) seq_puts(m, "max\n"); @@ -5155,7 +5393,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of, if (err) return err; - xchg(&memcg->memory.limit, max); + xchg(&memcg->memory.max, max); for (;;) { unsigned long nr_pages = page_counter_read(&memcg->memory); @@ -5190,6 +5428,39 @@ static ssize_t memory_max_write(struct kernfs_open_file *of, return nbytes; } +static int memory_oom_group_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + bool oom_group = memcg->oom_group; + + if (!(cgrp_dfl_root.flags & CGRP_GROUP_OOM)) + return -ENOTSUPP; + + seq_printf(m, "%d\n", oom_group); + + return 0; +} + +static ssize_t memory_oom_group_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + int oom_group; + int err; + + if (!(cgrp_dfl_root.flags & CGRP_GROUP_OOM)) + return -ENOTSUPP; 
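The memory_oom_group_show()/memory_oom_group_write() handlers above only accept access when the cgroup-aware OOM killer is enabled on the unified hierarchy. A hedged userspace sketch of turning the knob on for one cgroup follows; the cgroup path is hypothetical and the write simply fails with an error when the "groupoom" mount option is not in effect.

    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            /* Hypothetical cgroup path, adjust to the actual hierarchy. */
            const char *path = "/sys/fs/cgroup/workload/memory.oom_group";
            int fd = open(path, O_WRONLY);

            if (fd < 0) {
                    perror("open");
                    return 1;
            }
            if (write(fd, "1", 1) != 1)
                    perror("write");        /* e.g. groupoom not enabled */
            close(fd);
            return 0;
    }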
+ + err = kstrtoint(strstrip(buf), 0, &oom_group); + if (err) + return err; + + memcg->oom_group = oom_group; + + return nbytes; +} + static int memory_events_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); @@ -5313,6 +5584,12 @@ static struct cftype memory_files[] = { .seq_show = memory_max_show, .write = memory_max_write, }, + { + .name = "oom_group", + .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE, + .seq_show = memory_oom_group_show, + .write = memory_oom_group_write, + }, { .name = "events", .flags = CFTYPE_NOT_ON_ROOT, @@ -5344,40 +5621,76 @@ struct cgroup_subsys memory_cgrp_subsys = { }; /** - * mem_cgroup_low - check if memory consumption is below the normal range + * mem_cgroup_low - check if memory consumption is in the normal range * @root: the top ancestor of the sub-tree being checked * @memcg: the memory cgroup to check * - * Returns %true if memory consumption of @memcg, and that of all - * ancestors up to (but not including) @root, is below the normal range. + * WARNING: This function is not stateless! It can only be used as part + * of a top-down tree iteration, not for isolated queries. + * + * Returns %true if memory consumption of @memcg is in the normal range. + * + * @root is exclusive; it is never low when looked at directly + * + * To provide a proper hierarchical behavior, effective memory.low value + * is used. + * + * Effective memory.low is always equal or less than the original memory.low. + * If there is no memory.low overcommittment (which is always true for + * top-level memory cgroups), these two values are equal. + * Otherwise, it's a part of parent's effective memory.low, + * calculated as a cgroup's memory.low usage divided by sum of sibling's + * memory.low usages, where memory.low usage is the size of actually + * protected memory. + * + * low_usage + * elow = min( memory.low, parent->elow * ------------------ ), + * siblings_low_usage * - * @root is exclusive; it is never low when looked at directly and isn't - * checked when traversing the hierarchy. + * | memory.current, if memory.current < memory.low + * low_usage = | + | 0, otherwise. * - * Excluding @root enables using memory.low to prioritize memory usage - * between cgroups within a subtree of the hierarchy that is limited by - * memory.high or memory.max. * - * For example, given cgroup A with children B and C: + * Such definition of the effective memory.low provides the expected + * hierarchical behavior: parent's memory.low value is limiting + * children, unprotected memory is reclaimed first and cgroups, + * which are not using their guarantee do not affect actual memory + * distribution. * - * A - * / \ - * B C + * For example, if there are memcgs A, A/B, A/C, A/D and A/E: * - * and + * A A/memory.low = 2G, A/memory.current = 6G + * //\\ + * BC DE B/memory.low = 3G B/memory.current = 2G + * C/memory.low = 1G C/memory.current = 2G + * D/memory.low = 0 D/memory.current = 2G + * E/memory.low = 10G E/memory.current = 0 * - * 1. A/memory.current > A/memory.high - * 2. A/B/memory.current < A/B/memory.low - * 3. A/C/memory.current >= A/C/memory.low + * and the memory pressure is applied, the following memory distribution + * is expected (approximately): * - * As 'A' is high, i.e. triggers reclaim from 'A', and 'B' is low, we - * should reclaim from 'C' until 'A' is no longer high or until we can - * no longer reclaim from 'C'. If 'A', i.e. 
@root, isn't excluded by - * mem_cgroup_low when reclaming from 'A', then 'B' won't be considered - * low and we will reclaim indiscriminately from both 'B' and 'C'. + * A/memory.current = 2G + * + * B/memory.current = 1.3G + * C/memory.current = 0.6G + * D/memory.current = 0 + * E/memory.current = 0 + * + * These calculations require constant tracking of the actual low usages + * (see propagate_low_usage()), as well as recursive calculation of + * effective memory.low values. But as we do call mem_cgroup_low() + * path for each memory cgroup top-down from the reclaim, + * it's possible to optimize this part, and save calculated elow + * for next usage. This part is intentionally racy, but it's ok, + * as memory.low is a best-effort mechanism. */ bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg) { + unsigned long usage, low_usage, siblings_low_usage; + unsigned long elow, parent_elow; + struct mem_cgroup *parent; + if (mem_cgroup_disabled()) return false; @@ -5386,12 +5699,30 @@ bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg) if (memcg == root) return false; - for (; memcg != root; memcg = parent_mem_cgroup(memcg)) { - if (page_counter_read(&memcg->memory) >= memcg->low) - return false; - } + elow = memcg->memory.low; + usage = page_counter_read(&memcg->memory); + parent = parent_mem_cgroup(memcg); - return true; + if (parent == root) + goto exit; + + parent_elow = READ_ONCE(parent->memory.elow); + elow = min(elow, parent_elow); + + if (!elow || !parent_elow) + goto exit; + + low_usage = min(usage, memcg->memory.low); + siblings_low_usage = atomic_long_read( + &parent->memory.children_low_usage); + + if (!low_usage || !siblings_low_usage) + goto exit; + + elow = min(elow, parent_elow * low_usage / siblings_low_usage); +exit: + memcg->memory.elow = elow; + return usage && usage <= elow; } /** @@ -6012,10 +6343,17 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) if (!memcg) return 0; + if (!entry.val) { + memcg_memory_event(memcg, MEMCG_SWAP_FAIL); + return 0; + } + memcg = mem_cgroup_id_get_online(memcg); if (!mem_cgroup_is_root(memcg) && !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) { + memcg_memory_event(memcg, MEMCG_SWAP_MAX); + memcg_memory_event(memcg, MEMCG_SWAP_FAIL); mem_cgroup_id_put(memcg); return -ENOMEM; } @@ -6067,7 +6405,7 @@ long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg) return nr_swap_pages; for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) nr_swap_pages = min_t(long, nr_swap_pages, - READ_ONCE(memcg->swap.limit) - + READ_ONCE(memcg->swap.max) - page_counter_read(&memcg->swap)); return nr_swap_pages; } @@ -6088,7 +6426,7 @@ bool mem_cgroup_swap_full(struct page *page) return false; for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) - if (page_counter_read(&memcg->swap) * 2 >= memcg->swap.limit) + if (page_counter_read(&memcg->swap) * 2 >= memcg->swap.max) return true; return false; @@ -6122,7 +6460,7 @@ static u64 swap_current_read(struct cgroup_subsys_state *css, static int swap_max_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); - unsigned long max = READ_ONCE(memcg->swap.limit); + unsigned long max = READ_ONCE(memcg->swap.max); if (max == PAGE_COUNTER_MAX) seq_puts(m, "max\n"); @@ -6144,15 +6482,27 @@ static ssize_t swap_max_write(struct kernfs_open_file *of, if (err) return err; - mutex_lock(&memcg_limit_mutex); - err = page_counter_limit(&memcg->swap, max); - mutex_unlock(&memcg_limit_mutex); + 
mutex_lock(&memcg_max_mutex); + err = page_counter_set_max(&memcg->swap, max); + mutex_unlock(&memcg_max_mutex); if (err) return err; return nbytes; } +static int swap_events_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + + seq_printf(m, "max %lu\n", + atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX])); + seq_printf(m, "fail %lu\n", + atomic_long_read(&memcg->memory_events[MEMCG_SWAP_FAIL])); + + return 0; +} + static struct cftype swap_files[] = { { .name = "swap.current", @@ -6165,6 +6515,12 @@ static struct cftype swap_files[] = { .seq_show = swap_max_show, .write = swap_max_write, }, + { + .name = "swap.events", + .flags = CFTYPE_NOT_ON_ROOT, + .file_offset = offsetof(struct mem_cgroup, swap_events_file), + .seq_show = swap_events_show, + }, { } /* terminate */ }; diff --git a/mm/memfd.c b/mm/memfd.c new file mode 100644 index 0000000000000000000000000000000000000000..27069518e3c598fcbd3564a019442e933a6cf038 --- /dev/null +++ b/mm/memfd.c @@ -0,0 +1,345 @@ +/* + * memfd_create system call and file sealing support + * + * Code was originally included in shmem.c, and broken out to facilitate + * use by hugetlbfs as well as tmpfs. + * + * This file is released under the GPL. + */ + +#include <linux/fs.h> +#include <linux/vfs.h> +#include <linux/pagemap.h> +#include <linux/file.h> +#include <linux/mm.h> +#include <linux/sched/signal.h> +#include <linux/khugepaged.h> +#include <linux/syscalls.h> +#include <linux/hugetlb.h> +#include <linux/shmem_fs.h> +#include <linux/memfd.h> +#include <uapi/linux/memfd.h> + +/* + * We need a tag: a new tag would expand every radix_tree_node by 8 bytes, + * so reuse a tag which we firmly believe is never set or cleared on tmpfs + * or hugetlbfs because they are memory only filesystems. + */ +#define MEMFD_TAG_PINNED PAGECACHE_TAG_TOWRITE +#define LAST_SCAN 4 /* about 150ms max */ + +static void memfd_tag_pins(struct address_space *mapping) +{ + struct radix_tree_iter iter; + void __rcu **slot; + pgoff_t start; + struct page *page; + + lru_add_drain(); + start = 0; + rcu_read_lock(); + + radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { + page = radix_tree_deref_slot(slot); + if (!page || radix_tree_exception(page)) { + if (radix_tree_deref_retry(page)) { + slot = radix_tree_iter_retry(&iter); + continue; + } + } else if (page_count(page) - page_mapcount(page) > 1) { + xa_lock_irq(&mapping->i_pages); + radix_tree_tag_set(&mapping->i_pages, iter.index, + MEMFD_TAG_PINNED); + xa_unlock_irq(&mapping->i_pages); + } + + if (need_resched()) { + slot = radix_tree_iter_resume(slot, &iter); + cond_resched_rcu(); + } + } + rcu_read_unlock(); +} + +/* + * Setting SEAL_WRITE requires us to verify there's no pending writer. However, + * via get_user_pages(), drivers might have some pending I/O without any active + * user-space mappings (eg., direct-IO, AIO). Therefore, we look at all pages + * and see whether it has an elevated ref-count. If so, we tag them and wait for + * them to be dropped. + * The caller must guarantee that no new user will acquire writable references + * to those pages to avoid races. 
+ */ +static int memfd_wait_for_pins(struct address_space *mapping) +{ + struct radix_tree_iter iter; + void __rcu **slot; + pgoff_t start; + struct page *page; + int error, scan; + + memfd_tag_pins(mapping); + + error = 0; + for (scan = 0; scan <= LAST_SCAN; scan++) { + if (!radix_tree_tagged(&mapping->i_pages, MEMFD_TAG_PINNED)) + break; + + if (!scan) + lru_add_drain_all(); + else if (schedule_timeout_killable((HZ << scan) / 200)) + scan = LAST_SCAN; + + start = 0; + rcu_read_lock(); + radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, + start, MEMFD_TAG_PINNED) { + + page = radix_tree_deref_slot(slot); + if (radix_tree_exception(page)) { + if (radix_tree_deref_retry(page)) { + slot = radix_tree_iter_retry(&iter); + continue; + } + + page = NULL; + } + + if (page && + page_count(page) - page_mapcount(page) != 1) { + if (scan < LAST_SCAN) + goto continue_resched; + + /* + * On the last scan, we clean up all those tags + * we inserted; but make a note that we still + * found pages pinned. + */ + error = -EBUSY; + } + + xa_lock_irq(&mapping->i_pages); + radix_tree_tag_clear(&mapping->i_pages, + iter.index, MEMFD_TAG_PINNED); + xa_unlock_irq(&mapping->i_pages); +continue_resched: + if (need_resched()) { + slot = radix_tree_iter_resume(slot, &iter); + cond_resched_rcu(); + } + } + rcu_read_unlock(); + } + + return error; +} + +static unsigned int *memfd_file_seals_ptr(struct file *file) +{ + if (shmem_file(file)) + return &SHMEM_I(file_inode(file))->seals; + +#ifdef CONFIG_HUGETLBFS + if (is_file_hugepages(file)) + return &HUGETLBFS_I(file_inode(file))->seals; +#endif + + return NULL; +} + +#define F_ALL_SEALS (F_SEAL_SEAL | \ + F_SEAL_SHRINK | \ + F_SEAL_GROW | \ + F_SEAL_WRITE) + +static int memfd_add_seals(struct file *file, unsigned int seals) +{ + struct inode *inode = file_inode(file); + unsigned int *file_seals; + int error; + + /* + * SEALING + * Sealing allows multiple parties to share a tmpfs or hugetlbfs file + * but restrict access to a specific subset of file operations. Seals + * can only be added, but never removed. This way, mutually untrusted + * parties can share common memory regions with a well-defined policy. + * A malicious peer can thus never perform unwanted operations on a + * shared object. + * + * Seals are only supported on special tmpfs or hugetlbfs files and + * always affect the whole underlying inode. Once a seal is set, it + * may prevent some kinds of access to the file. Currently, the + * following seals are defined: + * SEAL_SEAL: Prevent further seals from being set on this file + * SEAL_SHRINK: Prevent the file from shrinking + * SEAL_GROW: Prevent the file from growing + * SEAL_WRITE: Prevent write access to the file + * + * As we don't require any trust relationship between two parties, we + * must prevent seals from being removed. Therefore, sealing a file + * only adds a given set of seals to the file, it never touches + * existing seals. Furthermore, the "setting seals"-operation can be + * sealed itself, which basically prevents any further seal from being + * added. + * + * Semantics of sealing are only defined on volatile files. Only + * anonymous tmpfs and hugetlbfs files support sealing. More + * importantly, seals are never written to disk. Therefore, there's + * no plan to support it on other file types. 
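For the sealing rules spelled out in the comment above, a hedged userspace usage sketch of the memfd_create() plus F_ADD_SEALS flow implemented by this new file may help; it assumes a libc that exposes memfd_create() and the sealing fcntl() commands (glibc 2.27 or later), otherwise the raw syscall would have to be used instead.

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int main(void)
    {
            int fd = memfd_create("example", MFD_CLOEXEC | MFD_ALLOW_SEALING);

            if (fd < 0) {
                    perror("memfd_create");
                    return 1;
            }
            if (ftruncate(fd, 4096) < 0) {
                    perror("ftruncate");
                    return 1;
            }
            /* Freeze the size, then forbid adding any further seals. */
            if (fcntl(fd, F_ADD_SEALS,
                      F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL) < 0) {
                    perror("F_ADD_SEALS");
                    return 1;
            }
            printf("seals: 0x%x\n", fcntl(fd, F_GET_SEALS));
            return 0;
    }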
+ */ + + if (!(file->f_mode & FMODE_WRITE)) + return -EPERM; + if (seals & ~(unsigned int)F_ALL_SEALS) + return -EINVAL; + + inode_lock(inode); + + file_seals = memfd_file_seals_ptr(file); + if (!file_seals) { + error = -EINVAL; + goto unlock; + } + + if (*file_seals & F_SEAL_SEAL) { + error = -EPERM; + goto unlock; + } + + if ((seals & F_SEAL_WRITE) && !(*file_seals & F_SEAL_WRITE)) { + error = mapping_deny_writable(file->f_mapping); + if (error) + goto unlock; + + error = memfd_wait_for_pins(file->f_mapping); + if (error) { + mapping_allow_writable(file->f_mapping); + goto unlock; + } + } + + *file_seals |= seals; + error = 0; + +unlock: + inode_unlock(inode); + return error; +} + +static int memfd_get_seals(struct file *file) +{ + unsigned int *seals = memfd_file_seals_ptr(file); + + return seals ? *seals : -EINVAL; +} + +long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg) +{ + long error; + + switch (cmd) { + case F_ADD_SEALS: + /* disallow upper 32bit */ + if (arg > UINT_MAX) + return -EINVAL; + + error = memfd_add_seals(file, arg); + break; + case F_GET_SEALS: + error = memfd_get_seals(file); + break; + default: + error = -EINVAL; + break; + } + + return error; +} + +#define MFD_NAME_PREFIX "memfd:" +#define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1) +#define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN) + +#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB) + +SYSCALL_DEFINE2(memfd_create, + const char __user *, uname, + unsigned int, flags) +{ + unsigned int *file_seals; + struct file *file; + int fd, error; + char *name; + long len; + + if (!(flags & MFD_HUGETLB)) { + if (flags & ~(unsigned int)MFD_ALL_FLAGS) + return -EINVAL; + } else { + /* Allow huge page size encoding in flags. */ + if (flags & ~(unsigned int)(MFD_ALL_FLAGS | + (MFD_HUGE_MASK << MFD_HUGE_SHIFT))) + return -EINVAL; + } + + /* length includes terminating zero */ + len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1); + if (len <= 0) + return -EFAULT; + if (len > MFD_NAME_MAX_LEN + 1) + return -EINVAL; + + name = kmalloc(len + MFD_NAME_PREFIX_LEN, GFP_KERNEL); + if (!name) + return -ENOMEM; + + strcpy(name, MFD_NAME_PREFIX); + if (copy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, len)) { + error = -EFAULT; + goto err_name; + } + + /* terminating-zero may have changed after strnlen_user() returned */ + if (name[len + MFD_NAME_PREFIX_LEN - 1]) { + error = -EFAULT; + goto err_name; + } + + fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? 
O_CLOEXEC : 0); + if (fd < 0) { + error = fd; + goto err_name; + } + + if (flags & MFD_HUGETLB) { + struct user_struct *user = NULL; + + file = hugetlb_file_setup(name, 0, VM_NORESERVE, &user, + HUGETLB_ANONHUGE_INODE, + (flags >> MFD_HUGE_SHIFT) & + MFD_HUGE_MASK); + } else + file = shmem_file_setup(name, 0, VM_NORESERVE); + if (IS_ERR(file)) { + error = PTR_ERR(file); + goto err_fd; + } + file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; + file->f_flags |= O_RDWR | O_LARGEFILE; + + if (flags & MFD_ALLOW_SEALING) { + file_seals = memfd_file_seals_ptr(file); + *file_seals &= ~F_SEAL_SEAL; + } + + fd_install(fd, file); + kfree(name); + return fd; + +err_fd: + put_unused_fd(fd); +err_name: + kfree(name); + return error; +} diff --git a/mm/memory.c b/mm/memory.c index 01f5464e0fd28802753c0085e1d30f98a4557d3e..345e562a138dbf7f9274f2e40498d8fed0f2e0a3 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -817,17 +817,12 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, * PFNMAP mappings in order to support COWable mappings. * */ -#ifdef __HAVE_ARCH_PTE_SPECIAL -# define HAVE_PTE_SPECIAL 1 -#else -# define HAVE_PTE_SPECIAL 0 -#endif struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte, bool with_public_device) { unsigned long pfn = pte_pfn(pte); - if (HAVE_PTE_SPECIAL) { + if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL)) { if (likely(!pte_special(pte))) goto check_pfn; if (vma->vm_ops && vma->vm_ops->find_special_page) @@ -862,7 +857,7 @@ struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr, return NULL; } - /* !HAVE_PTE_SPECIAL case follows: */ + /* !CONFIG_ARCH_HAS_PTE_SPECIAL case follows: */ if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) { if (vma->vm_flags & VM_MIXEDMAP) { @@ -881,6 +876,7 @@ struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr, if (is_zero_pfn(pfn)) return NULL; + check_pfn: if (unlikely(pfn > highest_memmap_pfn)) { print_bad_pte(vma, addr, pte, NULL); @@ -904,7 +900,7 @@ struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, /* * There is no pmd_special() but there may be special pmds, e.g. * in a direct-access (dax) mapping, so let's just replicate the - * !HAVE_PTE_SPECIAL case from vm_normal_page() here. + * !CONFIG_ARCH_HAS_PTE_SPECIAL case from vm_normal_page() here. */ if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) { if (vma->vm_flags & VM_MIXEDMAP) { @@ -1933,7 +1929,8 @@ static int __vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, * than insert_pfn). If a zero_pfn were inserted into a VM_MIXEDMAP * without pte special, it would there be refcounted as a normal page. 
*/ - if (!HAVE_PTE_SPECIAL && !pfn_t_devmap(pfn) && pfn_t_valid(pfn)) { + if (!IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) && + !pfn_t_devmap(pfn) && pfn_t_valid(pfn)) { struct page *page; /* @@ -2925,7 +2922,7 @@ int do_swap_page(struct vm_fault *vmf) struct swap_info_struct *si = swp_swap_info(entry); if (si->flags & SWP_SYNCHRONOUS_IO && - __swap_count(si, entry) == 1) { + __swap_count(entry) == 1) { /* skip swapcache */ page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address); @@ -3035,7 +3032,6 @@ int do_swap_page(struct vm_fault *vmf) flush_icache_page(vma, page); if (pte_swp_soft_dirty(vmf->orig_pte)) pte = pte_mksoft_dirty(pte); - set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte); vmf->orig_pte = pte; @@ -3049,6 +3045,7 @@ int do_swap_page(struct vm_fault *vmf) mem_cgroup_commit_charge(page, memcg, true, false); activate_page(page); } + set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); swap_free(entry); if (mem_cgroup_swap_full(page) || diff --git a/mm/mincore.c b/mm/mincore.c index fc37afe226e65881613e7cd2b5384c521983aef6..a66f2052c7b116b1e8b8f2fe5842df6fb9cc42db 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -68,8 +68,16 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff) */ if (radix_tree_exceptional_entry(page)) { swp_entry_t swp = radix_to_swp_entry(page); - page = find_get_page(swap_address_space(swp), - swp_offset(swp)); + struct swap_info_struct *si; + + /* Prevent swap device to being swapoff under us */ + si = get_swap_device(swp); + if (si) { + page = find_get_page(swap_address_space(swp), + swp_offset(swp)); + put_swap_device(si); + } else + page = NULL; } } else page = find_get_page(mapping, pgoff); diff --git a/mm/mmap.c b/mm/mmap.c index 7da0cf5faceeb6846da0ce4f62fb6d8fc65877de..c13e0b9dfb16c8c0bcdb2bd230cba529db11cee3 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3024,6 +3024,25 @@ void exit_mmap(struct mm_struct *mm) /* mm's last user has gone, and its about to be pulled down */ mmu_notifier_release(mm); + if (unlikely(mm_is_oom_victim(mm))) { + /* + * Wait for oom_reap_task() to stop working on this mm. Because + * MMF_UNSTABLE is already set before calling down_read(), + * oom_reap_task() will not run on this mm after up_write(). + * oom_reap_task() also depends on a stable VM_LOCKED flag to + * indicate it should not unmap during munlock_vma_pages_all(). + * + * mm_is_oom_victim() cannot be set from under us because + * victim->mm is already set to NULL under task_lock before + * calling mmput() and victim->signal->oom_mm is set by the oom + * killer only if victim->mm is non-NULL while holding + * task_lock(). + */ + set_bit(MMF_UNSTABLE, &mm->flags); + down_write(&mm->mmap_sem); + up_write(&mm->mmap_sem); + } + if (mm->locked_vm) { vma = mm->mmap; while (vma) { @@ -3045,26 +3064,9 @@ void exit_mmap(struct mm_struct *mm) /* update_hiwater_rss(mm) here? but nobody should be looking */ /* Use -1 here to ensure all VMAs in the mm are unmapped */ unmap_vmas(&tlb, vma, 0, -1); - - if (unlikely(mm_is_oom_victim(mm))) { - /* - * Wait for oom_reap_task() to stop working on this - * mm. Because MMF_OOM_SKIP is already set before - * calling down_read(), oom_reap_task() will not run - * on this "mm" post up_write(). 
- * - * mm_is_oom_victim() cannot be set from under us - * either because victim->mm is already set to NULL - * under task_lock before calling mmput and oom_mm is - * set not NULL by the OOM killer only if victim->mm - * is found not NULL while holding the task_lock. - */ - set_bit(MMF_OOM_SKIP, &mm->flags); - down_write(&mm->mmap_sem); - up_write(&mm->mmap_sem); - } free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING); tlb_finish_mmu(&tlb, 0, -1); + set_bit(MMF_OOM_SKIP, &mm->flags); /* * Walk the list again, actually closing and freeing it, diff --git a/mm/oom_kill.c b/mm/oom_kill.c index ff992fa8760aa1e22fc8d5d07df12e293690e877..f79e58287f83d30c871fcf58d709b367bf314e6f 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -256,7 +256,7 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc) int nid; if (is_memcg_oom(oc)) { - oc->totalpages = mem_cgroup_get_limit(oc->memcg) ?: 1; + oc->totalpages = mem_cgroup_get_max(oc->memcg) ?: 1; return CONSTRAINT_MEMCG; } @@ -304,7 +304,7 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc) return CONSTRAINT_NONE; } -static int oom_evaluate_task(struct task_struct *task, void *arg) +int oom_evaluate_task(struct task_struct *task, void *arg) { struct oom_control *oc = arg; unsigned long points; @@ -338,26 +338,26 @@ static int oom_evaluate_task(struct task_struct *task, void *arg) goto next; /* Prefer thread group leaders for display purposes */ - if (points == oc->chosen_points && thread_group_leader(oc->chosen)) + if (points == oc->chosen_points && thread_group_leader(oc->chosen_task)) goto next; select: - if (oc->chosen) - put_task_struct(oc->chosen); + if (oc->chosen_task) + put_task_struct(oc->chosen_task); get_task_struct(task); - oc->chosen = task; + oc->chosen_task = task; oc->chosen_points = points; next: return 0; abort: - if (oc->chosen) - put_task_struct(oc->chosen); - oc->chosen = (void *)-1UL; + if (oc->chosen_task) + put_task_struct(oc->chosen_task); + oc->chosen_task = INFLIGHT_VICTIM; return 1; } /* * Simple selection loop. We choose the process with the highest number of - * 'points'. In case scan was aborted, oc->chosen is set to -1. + * 'points'. In case scan was aborted, oc->chosen_task is set to -1. */ static void select_bad_process(struct oom_control *oc) { @@ -521,12 +521,17 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) } /* - * MMF_OOM_SKIP is set by exit_mmap when the OOM reaper can't - * work on the mm anymore. The check for MMF_OOM_SKIP must run + * Tell all users of get_user/copy_from_user etc... that the content + * is no longer stable. No barriers really needed because unmapping + * should imply barriers already and the reader would hit a page fault + * if it stumbled over reaped memory. + * + * MMF_UNSTABLE is also set by exit_mmap when the OOM reaper shouldn't + * work on the mm anymore. The check for MMF_UNSTABLE must run * under mmap_sem for reading because it serializes against the * down_write();up_write() cycle in exit_mmap(). */ - if (test_bit(MMF_OOM_SKIP, &mm->flags)) { + if (test_and_set_bit(MMF_UNSTABLE, &mm->flags)) { up_read(&mm->mmap_sem); trace_skip_task_reaping(tsk->pid); goto unlock_oom; @@ -534,14 +539,6 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) trace_start_task_reaping(tsk->pid); - /* - * Tell all users of get_user/copy_from_user etc... that the content - * is no longer stable.
No barriers really needed because unmapping - * should imply barriers already and the reader would hit a page fault - * if it stumbled over a reaped memory. - */ - set_bit(MMF_UNSTABLE, &mm->flags); - for (vma = mm->mmap ; vma; vma = vma->vm_next) { if (!can_madv_dontneed_vma(vma)) continue; @@ -567,6 +564,7 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) tlb_finish_mmu(&tlb, start, end); } } + set_bit(MMF_OOM_SKIP, &mm->flags); pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n", task_pid_nr(tsk), tsk->comm, K(get_mm_counter(mm, MM_ANONPAGES)), @@ -594,7 +592,6 @@ static void oom_reap_task(struct task_struct *tsk) test_bit(MMF_OOM_SKIP, &mm->flags)) goto done; - pr_info("oom_reaper: unable to reap pid:%d (%s)\n", task_pid_nr(tsk), tsk->comm); debug_show_all_locks(); @@ -603,10 +600,11 @@ static void oom_reap_task(struct task_struct *tsk) tsk->oom_reaper_list = NULL; /* - * Hide this mm from OOM killer because it has been either reaped or - * somebody can't call up_write(mmap_sem). + * If the oom reaper could not get started on this mm and it has not yet + * reached exit_mmap(), set MMF_OOM_SKIP to disregard. */ - set_bit(MMF_OOM_SKIP, &mm->flags); + if (!test_bit(MMF_UNSTABLE, &mm->flags)) + set_bit(MMF_OOM_SKIP, &mm->flags); /* Drop a reference taken by wake_oom_reaper */ put_task_struct(tsk); @@ -830,67 +828,22 @@ static bool task_will_free_mem(struct task_struct *task) return ret; } -static void oom_kill_process(struct oom_control *oc, const char *message) +static void __oom_kill_process(struct task_struct *victim) { - struct task_struct *p = oc->chosen; - unsigned int points = oc->chosen_points; - struct task_struct *victim = p; - struct task_struct *child; - struct task_struct *t; + struct task_struct *p; struct mm_struct *mm; - unsigned int victim_points = 0; - static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL, - DEFAULT_RATELIMIT_BURST); bool can_oom_reap = true; /* - * If the task is already exiting, don't alarm the sysadmin or kill - * its children or threads, just give it access to memory reserves - * so it can die quickly + * __oom_kill_process() is used to kill all tasks belonging to + * the selected memory cgroup, so we should check that we're not + * trying to kill an unkillable task. */ - task_lock(p); - if (task_will_free_mem(p)) { - mark_oom_victim(p); - wake_oom_reaper(p); - task_unlock(p); - put_task_struct(p); + if (is_global_init(victim) || (victim->flags & PF_KTHREAD) || + victim->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) { + put_task_struct(victim); return; } - task_unlock(p); - - if (__ratelimit(&oom_rs)) - dump_header(oc, p); - - pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n", - message, task_pid_nr(p), p->comm, points); - - /* - * If any of p's children has a different mm and is eligible for kill, - * the one with the highest oom_badness() score is sacrificed for its - * parent. This attempts to lose the minimal amount of work done while - * still freeing memory. 
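With the hunk above, the OOM reaper now claims an mm with test_and_set_bit(MMF_UNSTABLE, ...): if the flag was already set, for example by exit_mmap(), the reaper backs off instead of unmapping. A hedged userspace analogue of that claim-once pattern, written with a C11 atomic fetch-or instead of test_and_set_bit(), is sketched below; the flag name is illustrative only.

    #include <stdatomic.h>
    #include <stdio.h>

    #define FLAG_UNSTABLE (1u << 0)    /* illustrative stand-in for MMF_UNSTABLE */

    static _Atomic unsigned int mm_flags;

    static int try_claim(const char *who)
    {
            unsigned int old = atomic_fetch_or(&mm_flags, FLAG_UNSTABLE);

            if (old & FLAG_UNSTABLE) {
                    printf("%s: flag already set, backing off\n", who);
                    return 0;
            }
            printf("%s: flag was clear, doing the teardown\n", who);
            return 1;
    }

    int main(void)
    {
            try_claim("exit_mmap");     /* first caller wins */
            try_claim("oom_reaper");    /* second caller skips the work */
            return 0;
    }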
- */ - read_lock(&tasklist_lock); - for_each_thread(p, t) { - list_for_each_entry(child, &t->children, sibling) { - unsigned int child_points; - - if (process_shares_mm(child, p->mm)) - continue; - /* - * oom_badness() returns 0 if the thread is unkillable - */ - child_points = oom_badness(child, - oc->memcg, oc->nodemask, oc->totalpages); - if (child_points > victim_points) { - put_task_struct(victim); - victim = child; - victim_points = child_points; - get_task_struct(victim); - } - } - } - read_unlock(&tasklist_lock); p = find_lock_task_mm(victim); if (!p) { @@ -965,6 +918,108 @@ static void oom_kill_process(struct oom_control *oc, const char *message) } #undef K +static void oom_kill_process(struct oom_control *oc, const char *message) +{ + struct task_struct *p = oc->chosen_task; + unsigned int points = oc->chosen_points; + struct task_struct *victim = p; + struct task_struct *child; + struct task_struct *t; + unsigned int victim_points = 0; + static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + + /* + * If the task is already exiting, don't alarm the sysadmin or kill + * its children or threads, just give it access to memory reserves + * so it can die quickly + */ + task_lock(p); + if (task_will_free_mem(p)) { + mark_oom_victim(p); + wake_oom_reaper(p); + task_unlock(p); + put_task_struct(p); + return; + } + task_unlock(p); + + if (__ratelimit(&oom_rs)) + dump_header(oc, p); + + pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n", + message, task_pid_nr(p), p->comm, points); + + /* + * If any of p's children has a different mm and is eligible for kill, + * the one with the highest oom_badness() score is sacrificed for its + * parent. This attempts to lose the minimal amount of work done while + * still freeing memory. + */ + read_lock(&tasklist_lock); + for_each_thread(p, t) { + list_for_each_entry(child, &t->children, sibling) { + unsigned int child_points; + + if (process_shares_mm(child, p->mm)) + continue; + /* + * oom_badness() returns 0 if the thread is unkillable + */ + child_points = oom_badness(child, + oc->memcg, oc->nodemask, oc->totalpages); + if (child_points > victim_points) { + put_task_struct(victim); + victim = child; + victim_points = child_points; + get_task_struct(victim); + } + } + } + read_unlock(&tasklist_lock); + + __oom_kill_process(victim); +} + +static int oom_kill_memcg_member(struct task_struct *task, void *unused) +{ + get_task_struct(task); + __oom_kill_process(task); + return 0; +} + +static bool oom_kill_memcg_victim(struct oom_control *oc) +{ + if (oc->chosen_memcg == NULL || oc->chosen_memcg == INFLIGHT_VICTIM) + return oc->chosen_memcg; + + /* + * If memory.oom_group is set, kill all tasks belonging to the sub-tree + * of the chosen memory cgroup, otherwise kill the task with the biggest + * memory footprint. + */ + if (mem_cgroup_oom_group(oc->chosen_memcg)) { + mem_cgroup_scan_tasks(oc->chosen_memcg, oom_kill_memcg_member, + NULL); + /* We have one or more terminating processes at this point. */ + oc->chosen_task = INFLIGHT_VICTIM; + } else { + oc->chosen_points = 0; + oc->chosen_task = NULL; + mem_cgroup_scan_tasks(oc->chosen_memcg, oom_evaluate_task, oc); + + if (oc->chosen_task == NULL || + oc->chosen_task == INFLIGHT_VICTIM) + goto out; + + __oom_kill_process(oc->chosen_task); + } + +out: + mem_cgroup_put(oc->chosen_memcg); + return oc->chosen_task; +} + /* * Determines whether the kernel must panic because of the panic_on_oom sysctl. 
*/ @@ -1017,6 +1072,7 @@ bool out_of_memory(struct oom_control *oc) { unsigned long freed = 0; enum oom_constraint constraint = CONSTRAINT_NONE; + bool delay = false; /* if set, delay next allocation attempt */ if (oom_killer_disabled) return false; @@ -1061,27 +1117,37 @@ bool out_of_memory(struct oom_control *oc) current->mm && !oom_unkillable_task(current, NULL, oc->nodemask) && current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) { get_task_struct(current); - oc->chosen = current; + oc->chosen_task = current; oom_kill_process(oc, "Out of memory (oom_kill_allocating_task)"); return true; } + if (mem_cgroup_select_oom_victim(oc) && oom_kill_memcg_victim(oc)) { + delay = true; + goto out; + } + select_bad_process(oc); /* Found nothing?!?! Either we hang forever, or we panic. */ - if (!oc->chosen && !is_sysrq_oom(oc) && !is_memcg_oom(oc)) { + if (!oc->chosen_task && !is_sysrq_oom(oc) && !is_memcg_oom(oc)) { dump_header(oc, NULL); panic("Out of memory and no killable processes...\n"); } - if (oc->chosen && oc->chosen != (void *)-1UL) { + if (oc->chosen_task && oc->chosen_task != INFLIGHT_VICTIM) { oom_kill_process(oc, !is_memcg_oom(oc) ? "Out of memory" : "Memory cgroup out of memory"); - /* - * Give the killed process a good chance to exit before trying - * to allocate memory again. - */ - schedule_timeout_killable(1); + delay = true; } - return !!oc->chosen; + +out: + /* + * Give the killed process a good chance to exit before trying + * to allocate memory again. + */ + if (delay) + schedule_timeout_killable(1); + + return !!oc->chosen_task; } /* diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a13077bb6de863fbe0e7a1bbadb8d50e43d1c5dc..d52e1c9f66b5520b37531e33b650eb9c18efef3c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6327,22 +6327,21 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) for (j = 0; j < MAX_NR_ZONES; j++) { struct zone *zone = pgdat->node_zones + j; - unsigned long size, realsize, freesize, memmap_pages; + unsigned long size, freesize, memmap_pages; unsigned long zone_start_pfn = zone->zone_start_pfn; unsigned long movable_size = 0; size = zone->spanned_pages; - realsize = freesize = zone->present_pages; + freesize = zone->present_pages; if (zone_end_pfn(zone) > node_end_pfn) node_end_pfn = zone_end_pfn(zone); - /* * Adjust freesize so that it accounts for how much memory * is used by this zone for memmap. This affects the watermark * and per-cpu initialisations */ - memmap_pages = calc_memmap_size(size, realsize); + memmap_pages = calc_memmap_size(size, freesize); if (!is_highmem_idx(j)) { if (freesize >= memmap_pages) { freesize -= memmap_pages; @@ -6374,7 +6373,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) * when the bootmem allocator frees pages into the buddy system. * And all highmem pages will be managed by the buddy system. */ - zone->managed_pages = is_highmem_idx(j) ? 
realsize : freesize; + zone->managed_pages = freesize; #ifdef CONFIG_NUMA zone->node = nid; #endif diff --git a/mm/page_counter.c b/mm/page_counter.c index 2a8df3ad60a4ecce1b4e4839f1b42033ad759a25..a5ff4cbc355a4d270dacfc556536c94999f9ad88 100644 --- a/mm/page_counter.c +++ b/mm/page_counter.c @@ -13,6 +13,28 @@ #include <linux/bug.h> #include <asm/page.h> +static void propagate_low_usage(struct page_counter *c, unsigned long usage) +{ + unsigned long low_usage, old; + long delta; + + if (!c->parent) + return; + + if (!c->low && !atomic_long_read(&c->low_usage)) + return; + + if (usage <= c->low) + low_usage = usage; + else + low_usage = 0; + + old = atomic_long_xchg(&c->low_usage, low_usage); + delta = low_usage - old; + if (delta) + atomic_long_add(delta, &c->parent->children_low_usage); +} + /** * page_counter_cancel - take pages out of the local counter * @counter: counter @@ -22,7 +44,8 @@ void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages) { long new; - new = atomic_long_sub_return(nr_pages, &counter->count); + new = atomic_long_sub_return(nr_pages, &counter->usage); + propagate_low_usage(counter, new); /* More uncharges than charges? */ WARN_ON_ONCE(new < 0); } @@ -41,7 +64,8 @@ void page_counter_charge(struct page_counter *counter, unsigned long nr_pages) for (c = counter; c; c = c->parent) { long new; - new = atomic_long_add_return(nr_pages, &c->count); + new = atomic_long_add_return(nr_pages, &c->usage); + propagate_low_usage(counter, new); /* * This is indeed racy, but we can live with some * inaccuracy in the watermark. @@ -82,9 +106,10 @@ bool page_counter_try_charge(struct page_counter *counter, * we either see the new limit or the setter sees the * counter has changed and retries. */ - new = atomic_long_add_return(nr_pages, &c->count); - if (new > c->limit) { - atomic_long_sub(nr_pages, &c->count); + new = atomic_long_add_return(nr_pages, &c->usage); + if (new > c->max) { + atomic_long_sub(nr_pages, &c->usage); + propagate_low_usage(counter, new); /* * This is racy, but we can live with some * inaccuracy in the failcnt. @@ -93,6 +118,7 @@ bool page_counter_try_charge(struct page_counter *counter, *fail = c; goto failed; } + propagate_low_usage(counter, new); /* * Just like with failcnt, we can live with some * inaccuracy in the watermark. @@ -123,20 +149,20 @@ void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages) } /** - * page_counter_limit - limit the number of pages allowed + * page_counter_set_max - set the maximum number of pages allowed * @counter: counter - * @limit: limit to set + * @nr_pages: limit to set * * Returns 0 on success, -EBUSY if the current number of pages on the * counter already exceeds the specified limit. * * The caller must serialize invocations on the same counter. */ -int page_counter_limit(struct page_counter *counter, unsigned long limit) +int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages) { for (;;) { unsigned long old; - long count; + long usage; /* * Update the limit while making sure that it's not @@ -149,21 +175,38 @@ int page_counter_limit(struct page_counter *counter, unsigned long limit) * the limit, so if it sees the old limit, we see the * modified counter and retry. 
*/ - count = atomic_long_read(&counter->count); + usage = atomic_long_read(&counter->usage); - if (count > limit) + if (usage > nr_pages) return -EBUSY; - old = xchg(&counter->limit, limit); + old = xchg(&counter->max, nr_pages); - if (atomic_long_read(&counter->count) <= count) + if (atomic_long_read(&counter->usage) <= usage) return 0; - counter->limit = old; + counter->max = old; cond_resched(); } } +/** + * page_counter_set_low - set the amount of protected memory + * @counter: counter + * @nr_pages: value to set + * + * The caller must serialize invocations on the same counter. + */ +void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages) +{ + struct page_counter *c; + + counter->low = nr_pages; + + for (c = counter; c; c = c->parent) + propagate_low_usage(c, atomic_long_read(&c->usage)); +} + /** * page_counter_memparse - memparse() for page counter limits * @buf: string to parse diff --git a/mm/page_owner.c b/mm/page_owner.c index 75d21a2259b3b1ff9d2ac624fc9fbab10100580d..77d9e791ae8acdbd2a796294a23c91e438e9bf41 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -274,7 +274,7 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m, */ for (; pfn < end_pfn; ) { if (!pfn_valid(pfn)) { - pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES); + pfn = ALIGN(pfn + 1, pageblock_nr_pages); continue; } @@ -541,7 +541,7 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone) unsigned long block_end_pfn; if (!pfn_valid(pfn)) { - pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES); + pfn = ALIGN(pfn + 1, pageblock_nr_pages); continue; } diff --git a/mm/shmem.c b/mm/shmem.c index 9d6c7e5954153b6b678ff4660b0dfddd143c21fb..cb4d7fa389d0a93a254d116a8fc59f13882a6978 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -327,7 +327,7 @@ static int shmem_radix_tree_replace(struct address_space *mapping, pgoff_t index, void *expected, void *replacement) { struct radix_tree_node *node; - void **pslot; + void __rcu **pslot; void *item; VM_BUG_ON(!expected); @@ -395,7 +395,7 @@ static bool shmem_confirm_swap(struct address_space *mapping, #ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE /* ifdef here to avoid bloating shmem.o when not necessary */ -int shmem_huge __read_mostly; +static int shmem_huge __read_mostly; #if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS) static int shmem_parse_huge(const char *str) @@ -682,7 +682,7 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping, pgoff_t start, pgoff_t end) { struct radix_tree_iter iter; - void **slot; + void __rcu **slot; struct page *page; unsigned long swapped = 0; @@ -1098,13 +1098,19 @@ static void shmem_evict_inode(struct inode *inode) static unsigned long find_swap_entry(struct radix_tree_root *root, void *item) { struct radix_tree_iter iter; - void **slot; + void __rcu **slot; unsigned long found = -1; unsigned int checked = 0; rcu_read_lock(); radix_tree_for_each_slot(slot, root, &iter, 0) { - if (*slot == item) { + void *entry = radix_tree_deref_slot(slot); + + if (radix_tree_deref_retry(entry)) { + slot = radix_tree_iter_retry(&iter); + continue; + } + if (entry == item) { found = iter.index; break; } @@ -1322,9 +1328,6 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) if (!swap.val) goto redirty; - if (mem_cgroup_try_charge_swap(page, swap)) - goto free_swap; - /* * Add inode to shmem_unuse()'s list of swapped-out inodes, * if it's not already there. 
Do it now before the page is @@ -1353,7 +1356,6 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) } mutex_unlock(&shmem_swaplist_mutex); -free_swap: put_swap_page(page, swap); redirty: set_page_dirty(page); @@ -2616,241 +2618,6 @@ static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence) return offset; } -/* - * We need a tag: a new tag would expand every radix_tree_node by 8 bytes, - * so reuse a tag which we firmly believe is never set or cleared on shmem. - */ -#define SHMEM_TAG_PINNED PAGECACHE_TAG_TOWRITE -#define LAST_SCAN 4 /* about 150ms max */ - -static void shmem_tag_pins(struct address_space *mapping) -{ - struct radix_tree_iter iter; - void **slot; - pgoff_t start; - struct page *page; - - lru_add_drain(); - start = 0; - rcu_read_lock(); - - radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { - page = radix_tree_deref_slot(slot); - if (!page || radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) { - slot = radix_tree_iter_retry(&iter); - continue; - } - } else if (page_count(page) - page_mapcount(page) > 1) { - xa_lock_irq(&mapping->i_pages); - radix_tree_tag_set(&mapping->i_pages, iter.index, - SHMEM_TAG_PINNED); - xa_unlock_irq(&mapping->i_pages); - } - - if (need_resched()) { - slot = radix_tree_iter_resume(slot, &iter); - cond_resched_rcu(); - } - } - rcu_read_unlock(); -} - -/* - * Setting SEAL_WRITE requires us to verify there's no pending writer. However, - * via get_user_pages(), drivers might have some pending I/O without any active - * user-space mappings (eg., direct-IO, AIO). Therefore, we look at all pages - * and see whether it has an elevated ref-count. If so, we tag them and wait for - * them to be dropped. - * The caller must guarantee that no new user will acquire writable references - * to those pages to avoid races. - */ -static int shmem_wait_for_pins(struct address_space *mapping) -{ - struct radix_tree_iter iter; - void **slot; - pgoff_t start; - struct page *page; - int error, scan; - - shmem_tag_pins(mapping); - - error = 0; - for (scan = 0; scan <= LAST_SCAN; scan++) { - if (!radix_tree_tagged(&mapping->i_pages, SHMEM_TAG_PINNED)) - break; - - if (!scan) - lru_add_drain_all(); - else if (schedule_timeout_killable((HZ << scan) / 200)) - scan = LAST_SCAN; - - start = 0; - rcu_read_lock(); - radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, - start, SHMEM_TAG_PINNED) { - - page = radix_tree_deref_slot(slot); - if (radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) { - slot = radix_tree_iter_retry(&iter); - continue; - } - - page = NULL; - } - - if (page && - page_count(page) - page_mapcount(page) != 1) { - if (scan < LAST_SCAN) - goto continue_resched; - - /* - * On the last scan, we clean up all those tags - * we inserted; but make a note that we still - * found pages pinned. 
- */ - error = -EBUSY; - } - - xa_lock_irq(&mapping->i_pages); - radix_tree_tag_clear(&mapping->i_pages, - iter.index, SHMEM_TAG_PINNED); - xa_unlock_irq(&mapping->i_pages); -continue_resched: - if (need_resched()) { - slot = radix_tree_iter_resume(slot, &iter); - cond_resched_rcu(); - } - } - rcu_read_unlock(); - } - - return error; -} - -static unsigned int *memfd_file_seals_ptr(struct file *file) -{ - if (file->f_op == &shmem_file_operations) - return &SHMEM_I(file_inode(file))->seals; - -#ifdef CONFIG_HUGETLBFS - if (file->f_op == &hugetlbfs_file_operations) - return &HUGETLBFS_I(file_inode(file))->seals; -#endif - - return NULL; -} - -#define F_ALL_SEALS (F_SEAL_SEAL | \ - F_SEAL_SHRINK | \ - F_SEAL_GROW | \ - F_SEAL_WRITE) - -static int memfd_add_seals(struct file *file, unsigned int seals) -{ - struct inode *inode = file_inode(file); - unsigned int *file_seals; - int error; - - /* - * SEALING - * Sealing allows multiple parties to share a shmem-file but restrict - * access to a specific subset of file operations. Seals can only be - * added, but never removed. This way, mutually untrusted parties can - * share common memory regions with a well-defined policy. A malicious - * peer can thus never perform unwanted operations on a shared object. - * - * Seals are only supported on special shmem-files and always affect - * the whole underlying inode. Once a seal is set, it may prevent some - * kinds of access to the file. Currently, the following seals are - * defined: - * SEAL_SEAL: Prevent further seals from being set on this file - * SEAL_SHRINK: Prevent the file from shrinking - * SEAL_GROW: Prevent the file from growing - * SEAL_WRITE: Prevent write access to the file - * - * As we don't require any trust relationship between two parties, we - * must prevent seals from being removed. Therefore, sealing a file - * only adds a given set of seals to the file, it never touches - * existing seals. Furthermore, the "setting seals"-operation can be - * sealed itself, which basically prevents any further seal from being - * added. - * - * Semantics of sealing are only defined on volatile files. Only - * anonymous shmem files support sealing. More importantly, seals are - * never written to disk. Therefore, there's no plan to support it on - * other file types. - */ - - if (!(file->f_mode & FMODE_WRITE)) - return -EPERM; - if (seals & ~(unsigned int)F_ALL_SEALS) - return -EINVAL; - - inode_lock(inode); - - file_seals = memfd_file_seals_ptr(file); - if (!file_seals) { - error = -EINVAL; - goto unlock; - } - - if (*file_seals & F_SEAL_SEAL) { - error = -EPERM; - goto unlock; - } - - if ((seals & F_SEAL_WRITE) && !(*file_seals & F_SEAL_WRITE)) { - error = mapping_deny_writable(file->f_mapping); - if (error) - goto unlock; - - error = shmem_wait_for_pins(file->f_mapping); - if (error) { - mapping_allow_writable(file->f_mapping); - goto unlock; - } - } - - *file_seals |= seals; - error = 0; - -unlock: - inode_unlock(inode); - return error; -} - -static int memfd_get_seals(struct file *file) -{ - unsigned int *seals = memfd_file_seals_ptr(file); - - return seals ? 
*seals : -EINVAL; -} - -long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg) -{ - long error; - - switch (cmd) { - case F_ADD_SEALS: - /* disallow upper 32bit */ - if (arg > UINT_MAX) - return -EINVAL; - - error = memfd_add_seals(file, arg); - break; - case F_GET_SEALS: - error = memfd_get_seals(file); - break; - default: - error = -EINVAL; - break; - } - - return error; -} - static long shmem_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { @@ -3673,93 +3440,6 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root) return 0; } -#define MFD_NAME_PREFIX "memfd:" -#define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1) -#define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN) - -#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB) - -SYSCALL_DEFINE2(memfd_create, - const char __user *, uname, - unsigned int, flags) -{ - unsigned int *file_seals; - struct file *file; - int fd, error; - char *name; - long len; - - if (!(flags & MFD_HUGETLB)) { - if (flags & ~(unsigned int)MFD_ALL_FLAGS) - return -EINVAL; - } else { - /* Allow huge page size encoding in flags. */ - if (flags & ~(unsigned int)(MFD_ALL_FLAGS | - (MFD_HUGE_MASK << MFD_HUGE_SHIFT))) - return -EINVAL; - } - - /* length includes terminating zero */ - len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1); - if (len <= 0) - return -EFAULT; - if (len > MFD_NAME_MAX_LEN + 1) - return -EINVAL; - - name = kmalloc(len + MFD_NAME_PREFIX_LEN, GFP_KERNEL); - if (!name) - return -ENOMEM; - - strcpy(name, MFD_NAME_PREFIX); - if (copy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, len)) { - error = -EFAULT; - goto err_name; - } - - /* terminating-zero may have changed after strnlen_user() returned */ - if (name[len + MFD_NAME_PREFIX_LEN - 1]) { - error = -EFAULT; - goto err_name; - } - - fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? 
O_CLOEXEC : 0); - if (fd < 0) { - error = fd; - goto err_name; - } - - if (flags & MFD_HUGETLB) { - struct user_struct *user = NULL; - - file = hugetlb_file_setup(name, 0, VM_NORESERVE, &user, - HUGETLB_ANONHUGE_INODE, - (flags >> MFD_HUGE_SHIFT) & - MFD_HUGE_MASK); - } else - file = shmem_file_setup(name, 0, VM_NORESERVE); - if (IS_ERR(file)) { - error = PTR_ERR(file); - goto err_fd; - } - file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; - file->f_flags |= O_RDWR | O_LARGEFILE; - - if (flags & MFD_ALLOW_SEALING) { - file_seals = memfd_file_seals_ptr(file); - *file_seals &= ~F_SEAL_SEAL; - } - - fd_install(fd, file); - kfree(name); - return fd; - -err_fd: - put_unused_fd(fd); -err_name: - kfree(name); - return error; -} - #endif /* CONFIG_TMPFS */ static void shmem_put_super(struct super_block *sb) diff --git a/mm/slab.c b/mm/slab.c index 2f308253c3d7689c677dbdc0b8dd7a686689bb7b..c1fe8099b3cdca687e092dec248223cf88350346 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2665,6 +2665,7 @@ static struct page *cache_grow_begin(struct kmem_cache *cachep, invalid_mask, &invalid_mask, flags, &flags); dump_stack(); } + WARN_ON_ONCE(cachep->ctor && (flags & __GFP_ZERO)); local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); check_irq_off(); @@ -3071,6 +3072,7 @@ static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, gfp_t flags, void *objp, unsigned long caller) { + WARN_ON_ONCE(cachep->ctor && (flags & __GFP_ZERO)); if (!objp) return objp; if (cachep->flags & SLAB_POISON) { diff --git a/mm/slob.c b/mm/slob.c index 623e8a5c46ce632d452e1863e65467cbfb878832..307c2c9feb441d99789f031b9bdc6eaf46234618 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -555,8 +555,10 @@ static void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node) flags, node); } - if (b && c->ctor) + if (b && c->ctor) { + WARN_ON_ONCE(flags & __GFP_ZERO); c->ctor(b); + } kmemleak_alloc_recursive(b, c->size, 1, c->flags, flags); return b; diff --git a/mm/slub.c b/mm/slub.c index 44aa7847324ac4f8ea99cfe9e53834d56c4818de..dc960401ce90ec0d9c383e74adf378c5e442b143 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2444,6 +2444,8 @@ static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags, struct kmem_cache_cpu *c = *pc; struct page *page; + WARN_ON_ONCE(s->ctor && (flags & __GFP_ZERO)); + freelist = get_partial(s, flags, node, c); if (freelist) diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index bd0276d5f66b17f6b6c78e503b91972c7677e599..640e68f8324b4e746b69e370db894c8df415208a 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -303,7 +303,6 @@ void __init sparse_mem_maps_populate_node(struct page **map_map, ms = __nr_to_section(pnum); pr_err("%s: sparsemem memory map backing failed some memory will not be available\n", __func__); - ms->section_mem_map = 0; } if (vmemmap_buf_start) { diff --git a/mm/sparse.c b/mm/sparse.c index 62eef264a7bd38313aa6855f90b4fa7fd52b0b76..835076861f932b8087e46f4de60c45c81c15c363 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -190,18 +190,22 @@ static inline int next_present_section_nr(int section_nr) section_nr++; if (present_section_nr(section_nr)) return section_nr; - } while ((section_nr < NR_MEM_SECTIONS) && - (section_nr <= __highest_present_section_nr)); + } while ((section_nr <= __highest_present_section_nr)); return -1; } #define for_each_present_section_nr(start, section_nr) \ for (section_nr = next_present_section_nr(start-1); \ ((section_nr >= 0) && \ - (section_nr < 
NR_MEM_SECTIONS) && \ (section_nr <= __highest_present_section_nr)); \ section_nr = next_present_section_nr(section_nr)) +/* + * Record how many memory sections are marked as present + * during system bootup. + */ +static int __initdata nr_present_sections; + /* Record a memory area against a node. */ void __init memory_present(int nid, unsigned long start, unsigned long end) { @@ -231,6 +235,7 @@ void __init memory_present(int nid, unsigned long start, unsigned long end) ms->section_mem_map = sparse_encode_early_nid(nid) | SECTION_IS_ONLINE; section_mark_present(ms); + nr_present_sections++; } } } @@ -446,7 +451,6 @@ void __init sparse_mem_maps_populate_node(struct page **map_map, ms = __nr_to_section(pnum); pr_err("%s: sparsemem memory map backing failed some memory will not be available\n", __func__); - ms->section_mem_map = 0; } } #endif /* !CONFIG_SPARSEMEM_VMEMMAP */ @@ -474,7 +478,6 @@ static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) pr_err("%s: sparsemem memory map backing failed some memory will not be available\n", __func__); - ms->section_mem_map = 0; return NULL; } #endif @@ -486,10 +489,12 @@ void __weak __meminit vmemmap_populate_print_last(void) /** * alloc_usemap_and_memmap - memory alloction for pageblock flags and vmemmap * @map: usemap_map for pageblock flags or mmap_map for vmemmap + * @unit_size: size of map unit */ static void __init alloc_usemap_and_memmap(void (*alloc_func) (void *, unsigned long, unsigned long, - unsigned long, int), void *data) + unsigned long, int), void *data, + int data_unit_size) { unsigned long pnum; unsigned long map_count; @@ -524,7 +529,7 @@ static void __init alloc_usemap_and_memmap(void (*alloc_func) map_count = 1; } /* ok, last chunk */ - alloc_func(data, pnum_begin, NR_MEM_SECTIONS, + alloc_func(data, pnum_begin, __highest_present_section_nr+1, map_count, nodeid_begin); } @@ -566,7 +571,8 @@ void __init sparse_init(void) if (!usemap_map) panic("can not allocate usemap_map\n"); alloc_usemap_and_memmap(sparse_early_usemaps_alloc_node, - (void *)usemap_map); + (void *)usemap_map, + sizeof(usemap_map[0])); #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER size2 = sizeof(struct page *) * NR_MEM_SECTIONS; @@ -574,21 +580,28 @@ void __init sparse_init(void) if (!map_map) panic("can not allocate map_map\n"); alloc_usemap_and_memmap(sparse_early_mem_maps_alloc_node, - (void *)map_map); + (void *)map_map, + sizeof(map_map[0])); #endif for_each_present_section_nr(0, pnum) { + struct mem_section *ms; + ms = __nr_to_section(pnum); usemap = usemap_map[pnum]; - if (!usemap) + if (!usemap) { + ms->section_mem_map = 0; continue; + } #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER map = map_map[pnum]; #else map = sparse_early_mem_map_alloc(pnum); #endif - if (!map) + if (!map) { + ms->section_mem_map = 0; continue; + } sparse_init_one_section(__nr_to_section(pnum), pnum, map, usemap); diff --git a/mm/swap_slots.c b/mm/swap_slots.c index f2641894f4409ad27fcc5319dbbcb336eae14a7d..f51ac051c0c9ede270c925081ba4be6428f9fe4b 100644 --- a/mm/swap_slots.c +++ b/mm/swap_slots.c @@ -317,7 +317,7 @@ swp_entry_t get_swap_page(struct page *page) if (PageTransHuge(page)) { if (IS_ENABLED(CONFIG_THP_SWAP)) get_swap_pages(1, true, &entry); - return entry; + goto out; } /* @@ -347,10 +347,14 @@ swp_entry_t get_swap_page(struct page *page) } mutex_unlock(&cache->alloc_lock); if (entry.val) - return entry; + goto out; } get_swap_pages(1, false, &entry); - +out: + if (mem_cgroup_try_charge_swap(page, entry)) { + put_swap_page(page, entry); + entry.val = 
0;
+	}
 	return entry;
 }
 
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 07f9aa2340c3a4b5c0b1138feffa1d901aa7ccae..c6b3eab73fde48ad42858b3831bf8889e86eb8f3 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -216,9 +216,6 @@ int add_to_swap(struct page *page)
 	if (!entry.val)
 		return 0;
 
-	if (mem_cgroup_try_charge_swap(page, entry))
-		goto fail;
-
 	/*
 	 * Radix-tree node allocations from PF_MEMALLOC contexts could
 	 * completely exhaust the page allocator. __GFP_NOMEMALLOC
@@ -337,8 +334,13 @@ struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma,
 			       unsigned long addr)
 {
 	struct page *page;
+	struct swap_info_struct *si;
 
+	si = get_swap_device(entry);
+	if (!si)
+		return NULL;
 	page = find_get_page(swap_address_space(entry), swp_offset(entry));
+	put_swap_device(si);
 	INC_CACHE_INFO(find_total);
 	if (page) {
@@ -381,8 +383,8 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 			struct vm_area_struct *vma, unsigned long addr,
 			bool *new_page_allocated)
 {
-	struct page *found_page, *new_page = NULL;
-	struct address_space *swapper_space = swap_address_space(entry);
+	struct page *found_page = NULL, *new_page = NULL;
+	struct swap_info_struct *si;
 	int err;
 
 	*new_page_allocated = false;
@@ -392,7 +394,12 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 		 * called after lookup_swap_cache() failed, re-calling
 		 * that would confuse statistics.
 		 */
-		found_page = find_get_page(swapper_space, swp_offset(entry));
+		si = get_swap_device(entry);
+		if (!si)
+			break;
+		found_page = find_get_page(swap_address_space(entry),
+					   swp_offset(entry));
+		put_swap_device(si);
 		if (found_page)
 			break;
 
diff --git a/mm/swapfile.c b/mm/swapfile.c
index cc2cf04d9018ad0c546648dfa6cb8167082bbc99..5a280972bd877deef545b28441dde965d6265b47 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -38,6 +38,7 @@
 #include <linux/export.h>
 #include <linux/swap_slots.h>
 #include <linux/sort.h>
+#include <linux/stop_machine.h>
 
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
@@ -1107,6 +1108,64 @@ static struct swap_info_struct *swap_info_get_cont(swp_entry_t entry,
 	return p;
 }
 
+/*
+ * Check whether the swap entry is valid in the swap device. If so,
+ * return a pointer to the swap_info_struct and keep the swap entry
+ * valid by preventing the swap device from being swapped off until
+ * put_swap_device() is called. Otherwise return NULL.
+ *
+ * Notice that swapoff or swapoff+swapon can still happen before the
+ * preempt_disable() in get_swap_device() or after the
+ * preempt_enable() in put_swap_device() if there isn't any other way
+ * to prevent swapoff, such as the page lock or the page table lock.
+ * The caller must be prepared for that. For example, the following
+ * situation is possible.
+ *
+ *   CPU1				CPU2
+ *   do_swap_page()
+ *     ...				swapoff+swapon
+ *     __read_swap_cache_async()
+ *       swapcache_prepare()
+ *         __swap_duplicate()
+ *           // check swap_map
+ *     // verify PTE not changed
+ *
+ * In __swap_duplicate(), the swap_map needs to be checked before it is
+ * changed, partly because the specified swap entry may be for another
+ * swap device which has been swapped off. And in do_swap_page(), after
+ * the page is read from the swap device, the PTE is verified, with the
+ * page table locked, to be unchanged, to check whether swapoff or
+ * swapoff+swapon has happened to the swap device.
+ */ +struct swap_info_struct *get_swap_device(swp_entry_t entry) +{ + struct swap_info_struct *si; + unsigned long type, offset; + + if (!entry.val) + goto out; + type = swp_type(entry); + if (type >= nr_swapfiles) + goto bad_nofile; + si = swap_info[type]; + + preempt_disable(); + if (!(si->flags & SWP_VALID)) + goto unlock_out; + offset = swp_offset(entry); + if (offset >= si->max) + goto unlock_out; + + return si; +bad_nofile: + pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val); +out: + return NULL; +unlock_out: + preempt_enable(); + return NULL; +} + static unsigned char __swap_entry_free(struct swap_info_struct *p, swp_entry_t entry, unsigned char usage) { @@ -1328,11 +1387,18 @@ int page_swapcount(struct page *page) return count; } -int __swap_count(struct swap_info_struct *si, swp_entry_t entry) +int __swap_count(swp_entry_t entry) { + struct swap_info_struct *si; pgoff_t offset = swp_offset(entry); + int count = 0; - return swap_count(si->swap_map[offset]); + si = get_swap_device(entry); + if (si) { + count = swap_count(si->swap_map[offset]); + put_swap_device(si); + } + return count; } static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry) @@ -1357,9 +1423,11 @@ int __swp_swapcount(swp_entry_t entry) int count = 0; struct swap_info_struct *si; - si = __swap_info_get(entry); - if (si) + si = get_swap_device(entry); + if (si) { count = swap_swapcount(si, entry); + put_swap_device(si); + } return count; } @@ -1800,8 +1868,6 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, dec_mm_counter(vma->vm_mm, MM_SWAPENTS); inc_mm_counter(vma->vm_mm, MM_ANONPAGES); get_page(page); - set_pte_at(vma->vm_mm, addr, pte, - pte_mkold(mk_pte(page, vma->vm_page_prot))); if (page == swapcache) { page_add_anon_rmap(page, vma, addr, false); mem_cgroup_commit_charge(page, memcg, true, false); @@ -1810,6 +1876,8 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, mem_cgroup_commit_charge(page, memcg, false, false); lru_cache_add_active_or_unevictable(page, vma); } + set_pte_at(vma->vm_mm, addr, pte, + pte_mkold(mk_pte(page, vma->vm_page_prot))); swap_free(entry); /* * Move the page to the active list so it is not @@ -2451,9 +2519,9 @@ static int swap_node(struct swap_info_struct *p) return bdev ? 
bdev->bd_disk->node_id : NUMA_NO_NODE; } -static void _enable_swap_info(struct swap_info_struct *p, int prio, - unsigned char *swap_map, - struct swap_cluster_info *cluster_info) +static void setup_swap_info(struct swap_info_struct *p, int prio, + unsigned char *swap_map, + struct swap_cluster_info *cluster_info) { int i; @@ -2478,7 +2546,11 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio, } p->swap_map = swap_map; p->cluster_info = cluster_info; - p->flags |= SWP_WRITEOK; +} + +static void _enable_swap_info(struct swap_info_struct *p) +{ + p->flags |= SWP_WRITEOK | SWP_VALID; atomic_long_add(p->pages, &nr_swap_pages); total_swap_pages += p->pages; @@ -2497,6 +2569,11 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio, add_to_avail_list(p); } +static int swap_onoff_stop(void *arg) +{ + return 0; +} + static void enable_swap_info(struct swap_info_struct *p, int prio, unsigned char *swap_map, struct swap_cluster_info *cluster_info, @@ -2505,7 +2582,17 @@ static void enable_swap_info(struct swap_info_struct *p, int prio, frontswap_init(p->type, frontswap_map); spin_lock(&swap_lock); spin_lock(&p->lock); - _enable_swap_info(p, prio, swap_map, cluster_info); + setup_swap_info(p, prio, swap_map, cluster_info); + spin_unlock(&p->lock); + spin_unlock(&swap_lock); + /* + * Guarantee swap_map, cluster_info, etc. fields are used + * between get/put_swap_device() only if SWP_VALID bit is set + */ + stop_machine(swap_onoff_stop, NULL, cpu_online_mask); + spin_lock(&swap_lock); + spin_lock(&p->lock); + _enable_swap_info(p); spin_unlock(&p->lock); spin_unlock(&swap_lock); } @@ -2514,7 +2601,8 @@ static void reinsert_swap_info(struct swap_info_struct *p) { spin_lock(&swap_lock); spin_lock(&p->lock); - _enable_swap_info(p, p->prio, p->swap_map, p->cluster_info); + setup_swap_info(p, p->prio, p->swap_map, p->cluster_info); + _enable_swap_info(p); spin_unlock(&p->lock); spin_unlock(&swap_lock); } @@ -2617,6 +2705,17 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) reenable_swap_slots_cache_unlock(); + spin_lock(&swap_lock); + spin_lock(&p->lock); + p->flags &= ~SWP_VALID; /* mark swap device as invalid */ + spin_unlock(&p->lock); + spin_unlock(&swap_lock); + /* + * wait for swap operations protected by get/put_swap_device() + * to complete + */ + stop_machine(swap_onoff_stop, NULL, cpu_online_mask); + flush_work(&p->discard_work); destroy_swap_extents(p); @@ -3360,22 +3459,16 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage) { struct swap_info_struct *p; struct swap_cluster_info *ci; - unsigned long offset, type; + unsigned long offset; unsigned char count; unsigned char has_cache; int err = -EINVAL; - if (non_swap_entry(entry)) + p = get_swap_device(entry); + if (!p) goto out; - type = swp_type(entry); - if (type >= nr_swapfiles) - goto bad_file; - p = swap_info[type]; offset = swp_offset(entry); - if (unlikely(offset >= p->max)) - goto out; - ci = lock_cluster_or_swap_info(p, offset); count = p->swap_map[offset]; @@ -3421,11 +3514,9 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage) unlock_out: unlock_cluster_or_swap_info(p, ci); out: + if (p) + put_swap_device(p); return err; - -bad_file: - pr_err("swap_dup: %s%08lx\n", Bad_file, entry.val); - goto out; } /* @@ -3517,6 +3608,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) struct page *list_page; pgoff_t offset; unsigned char count; + int ret = 0; /* * When debugging, it's easier to use __GFP_ZERO here; but it's better @@ -3524,15 +3616,15 
@@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
 	 */
 	page = alloc_page(gfp_mask | __GFP_HIGHMEM);
 
-	si = swap_info_get(entry);
+	si = get_swap_device(entry);
 	if (!si) {
 		/*
 		 * An acceptable race has occurred since the failing
-		 * __swap_duplicate(): the swap entry has been freed,
-		 * perhaps even the whole swap_map cleared for swapoff.
+		 * __swap_duplicate(): the swap device may be swapoff
 		 */
 		goto outer;
 	}
+	spin_lock(&si->lock);
 
 	offset = swp_offset(entry);
@@ -3550,9 +3642,8 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
 	}
 
 	if (!page) {
-		unlock_cluster(ci);
-		spin_unlock(&si->lock);
-		return -ENOMEM;
+		ret = -ENOMEM;
+		goto out;
 	}
 
 	/*
@@ -3604,10 +3695,11 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
 out:
 	unlock_cluster(ci);
 	spin_unlock(&si->lock);
+	put_swap_device(si);
 outer:
 	if (page)
 		__free_page(page);
-	return 0;
+	return ret;
 }
 
 /*
diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index e16d6713f2368473d96eaec321fda1685cb3cac5..24618dffc5cb07a6c9d1bfbdb4b07dbbdf9dbcd8 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -6257,6 +6257,13 @@ sub process {
			     "Avoid using bool as bitfield. Prefer bool bitfields as unsigned int or u<8|16|32>\n" . $herecurr);
 		}
 
+# check for bool use in .h files
+		if ($realfile =~ /\.h$/ &&
+		    $sline =~ /^.\s+bool\s*$Ident\s*(?::\s*\d+\s*)?;/) {
+			CHK("BOOL_MEMBER",
+			    "Avoid using bool structure members because of possible alignment issues - see: https://lkml.org/lkml/2017/11/21/384\n" . $herecurr);
+		}
+
# check for semaphores initialized locked
 		if ($line =~ /^.\s*sema_init.+,\W?0\W?\)/) {
 			WARN("CONSIDER_COMPLETION",
diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c
index ce2b89e9ad94eb9b5b657a8ccce703894c021e19..cf00c8520d3042aa379e9e51b69221e21eeeab6d 100644
--- a/security/apparmor/lsm.c
+++ b/security/apparmor/lsm.c
@@ -693,6 +693,7 @@ static void apparmor_bprm_committing_creds(struct linux_binprm *bprm)
 	aa_inherit_files(bprm->cred, current->files);
 
 	current->pdeath_signal = 0;
+	current->signal->pdeath_signal_proc = 0;
 
 	/* reset soft limits and set hard limits for the new label */
 	__aa_transition_rlimits(label, new_label);
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 4cafe6a19167613cb64b29ac59c895e91285b390..a5acf9dae6b72728cd3099e7c80bb65ce38962f1 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -2641,6 +2641,7 @@ static void selinux_bprm_committing_creds(struct linux_binprm *bprm)
 
 	/* Always clear parent death signal on SID transitions. */
 	current->pdeath_signal = 0;
+	current->signal->pdeath_signal_proc = 0;
 
 	/* Check whether the new SID can inherit resource limits from the old
 	 * SID. If not, reset all soft limits to the lower of the current