Commit 99baac21 authored by Minchan Kim's avatar Minchan Kim Committed by Linus Torvalds
Browse files

mm: fix MADV_[FREE|DONTNEED] TLB flush miss problem

Nadav reported parallel MADV_DONTNEED on same range has a stale TLB
problem and Mel fixed it[1] and found same problem on MADV_FREE[2].

Quote from Mel Gorman:
 "The race in question is CPU 0 running madv_free and updating some PTEs
  while CPU 1 is also running madv_free and looking at the same PTEs.
  CPU 1 may have writable TLB entries for a page but fail the pte_dirty
  check (because CPU 0 has updated it already) and potentially fail to
  flush.

  Hence, when madv_free on CPU 1 returns, there are still potentially
  writable TLB entries and the underlying PTE is still present so that a
  subsequent write does not necessarily propagate the dirty bit to the
  underlying PTE any more. Reclaim at some unknown time at the future
  may then see that the PTE is still clean and discard the page even
  though a write has happened in the meantime. I think this is possible
  but I could have missed some protection in madv_free that prevents it
  happening."

...
parent 0a2dd266
......@@ -168,8 +168,13 @@ arch_tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
static inline void
arch_tlb_finish_mmu(struct mmu_gather *tlb,
unsigned long start, unsigned long end)
unsigned long start, unsigned long end, bool force)
{
if (force) {
tlb->range_start = start;
tlb->range_end = end;
}
tlb_flush_mmu(tlb);
/* keep the page table cache within bounds */
......
......@@ -187,8 +187,10 @@ arch_tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
*/
static inline void
arch_tlb_finish_mmu(struct mmu_gather *tlb,
unsigned long start, unsigned long end)
unsigned long start, unsigned long end, bool force)
{
if (force)
tlb->need_flush = 1;
/*
* Note: tlb->nr may be 0 at this point, so we can't rely on tlb->start_addr and
* tlb->end_addr.
......
......@@ -77,8 +77,13 @@ static inline void tlb_flush_mmu(struct mmu_gather *tlb)
static inline void
arch_tlb_finish_mmu(struct mmu_gather *tlb,
unsigned long start, unsigned long end)
unsigned long start, unsigned long end, bool force)
{
if (force) {
tlb->start = start;
tlb->end = end;
}
tlb_flush_mmu(tlb);
}
......
......@@ -49,9 +49,9 @@ arch_tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
static inline void
arch_tlb_finish_mmu(struct mmu_gather *tlb,
unsigned long start, unsigned long end)
unsigned long start, unsigned long end, bool force)
{
if (tlb->fullmm)
if (tlb->fullmm || force)
flush_tlb_mm(tlb->mm);
/* keep the page table cache within bounds */
......
......@@ -87,8 +87,13 @@ tlb_flush_mmu(struct mmu_gather *tlb)
*/
static inline void
arch_tlb_finish_mmu(struct mmu_gather *tlb,
unsigned long start, unsigned long end)
unsigned long start, unsigned long end, bool force)
{
if (force) {
tlb->start = start;
tlb->end = end;
tlb->need_flush = 1;
}
tlb_flush_mmu(tlb);
/* keep the page table cache within bounds */
......
......@@ -116,7 +116,7 @@ void arch_tlb_gather_mmu(struct mmu_gather *tlb,
struct mm_struct *mm, unsigned long start, unsigned long end);
void tlb_flush_mmu(struct mmu_gather *tlb);
void arch_tlb_finish_mmu(struct mmu_gather *tlb,
unsigned long start, unsigned long end);
unsigned long start, unsigned long end, bool force);
extern bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page,
int page_size);
......
......@@ -538,6 +538,14 @@ static inline bool mm_tlb_flush_pending(struct mm_struct *mm)
return atomic_read(&mm->tlb_flush_pending) > 0;
}
/*
* Returns true if there are two above TLB batching threads in parallel.
*/
static inline bool mm_tlb_flush_nested(struct mm_struct *mm)
{
return atomic_read(&mm->tlb_flush_pending) > 1;
}
static inline void init_tlb_flush_pending(struct mm_struct *mm)
{
atomic_set(&mm->tlb_flush_pending, 0);
......
......@@ -272,10 +272,13 @@ void tlb_flush_mmu(struct mmu_gather *tlb)
* that were required.
*/
void arch_tlb_finish_mmu(struct mmu_gather *tlb,
unsigned long start, unsigned long end)
unsigned long start, unsigned long end, bool force)
{
struct mmu_gather_batch *batch, *next;
if (force)
__tlb_adjust_range(tlb, start, end - start);
tlb_flush_mmu(tlb);
/* keep the page table cache within bounds */
......@@ -404,12 +407,23 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
unsigned long start, unsigned long end)
{
arch_tlb_gather_mmu(tlb, mm, start, end);
inc_tlb_flush_pending(tlb->mm);
}
void tlb_finish_mmu(struct mmu_gather *tlb,
unsigned long start, unsigned long end)
{
arch_tlb_finish_mmu(tlb, start, end);
/*
* If there are parallel threads are doing PTE changes on same range
* under non-exclusive lock(e.g., mmap_sem read-side) but defer TLB
* flush by batching, a thread has stable TLB entry can fail to flush
* the TLB by observing pte_none|!pte_dirty, for example so flush TLB
* forcefully if we detect parallel PTE batching threads.
*/
bool force = mm_tlb_flush_nested(tlb->mm);
arch_tlb_finish_mmu(tlb, start, end, force);
dec_tlb_flush_pending(tlb->mm);
}
/*
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment