diff --git a/arch/arm/include/asm/tlb.h b/arch/arm/include/asm/tlb.h
index 7f5b2a2d3861947cb8c8196e40e4d400c39d9483..d5562f9ce60079139d360e5d6afac59469051454 100644
--- a/arch/arm/include/asm/tlb.h
+++ b/arch/arm/include/asm/tlb.h
@@ -168,8 +168,13 @@ arch_tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
 
 static inline void
 arch_tlb_finish_mmu(struct mmu_gather *tlb,
-			unsigned long start, unsigned long end)
+			unsigned long start, unsigned long end, bool force)
 {
+	if (force) {
+		tlb->range_start = start;
+		tlb->range_end = end;
+	}
+
 	tlb_flush_mmu(tlb);
 
 	/* keep the page table cache within bounds */
diff --git a/arch/ia64/include/asm/tlb.h b/arch/ia64/include/asm/tlb.h
index 93cadc04ac6268583bf04863f1de792c35ca7247..cbe5ac3699bf0f9dbdfd726c112f6fc6bd1271f0 100644
--- a/arch/ia64/include/asm/tlb.h
+++ b/arch/ia64/include/asm/tlb.h
@@ -187,8 +187,10 @@ arch_tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
  */
 static inline void
 arch_tlb_finish_mmu(struct mmu_gather *tlb,
-			unsigned long start, unsigned long end)
+			unsigned long start, unsigned long end, bool force)
 {
+	if (force)
+		tlb->need_flush = 1;
 	/*
 	 * Note: tlb->nr may be 0 at this point, so we can't rely on tlb->start_addr and
 	 * tlb->end_addr.
diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h
index d574d0820dc8ace763cbeb517a01901ac372cf87..2eb8ff0d6fca443543c32ac80ff690b4b67be1ef 100644
--- a/arch/s390/include/asm/tlb.h
+++ b/arch/s390/include/asm/tlb.h
@@ -77,8 +77,13 @@ static inline void tlb_flush_mmu(struct mmu_gather *tlb)
 
 static inline void
 arch_tlb_finish_mmu(struct mmu_gather *tlb,
-		unsigned long start, unsigned long end)
+		unsigned long start, unsigned long end, bool force)
 {
+	if (force) {
+		tlb->start = start;
+		tlb->end = end;
+	}
+
 	tlb_flush_mmu(tlb);
 }
 
diff --git a/arch/sh/include/asm/tlb.h b/arch/sh/include/asm/tlb.h
index 89786560dbd4ab119b16d213a9a163ac78f8567f..51a8bc967e75f1e3c96a70783e9da439310edbcb 100644
--- a/arch/sh/include/asm/tlb.h
+++ b/arch/sh/include/asm/tlb.h
@@ -49,9 +49,9 @@ arch_tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
 
 static inline void
 arch_tlb_finish_mmu(struct mmu_gather *tlb,
-		unsigned long start, unsigned long end)
+		unsigned long start, unsigned long end, bool force)
 {
-	if (tlb->fullmm)
+	if (tlb->fullmm || force)
 		flush_tlb_mm(tlb->mm);
 
 	/* keep the page table cache within bounds */
diff --git a/arch/um/include/asm/tlb.h b/arch/um/include/asm/tlb.h
index 2a901eca71456bb5854e711cb3611b44e54500ab..344d95619d0334659e6f4a9f3a5bff70ae95f67c 100644
--- a/arch/um/include/asm/tlb.h
+++ b/arch/um/include/asm/tlb.h
@@ -87,8 +87,13 @@ tlb_flush_mmu(struct mmu_gather *tlb)
  */
 static inline void
 arch_tlb_finish_mmu(struct mmu_gather *tlb,
-		unsigned long start, unsigned long end)
+		unsigned long start, unsigned long end, bool force)
 {
+	if (force) {
+		tlb->start = start;
+		tlb->end = end;
+		tlb->need_flush = 1;
+	}
 	tlb_flush_mmu(tlb);
 
 	/* keep the page table cache within bounds */
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index 8f71521e7a4422cc9258b6d77b7b2fdc3e8bb4f6..faddde44de8c902e6884e64eeb8b22bd0d11b75a 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -116,7 +116,7 @@ void arch_tlb_gather_mmu(struct mmu_gather *tlb,
 	struct mm_struct *mm, unsigned long start, unsigned long end);
 void tlb_flush_mmu(struct mmu_gather *tlb);
 void arch_tlb_finish_mmu(struct mmu_gather *tlb,
-			 unsigned long start, unsigned long end);
+			 unsigned long start, unsigned long end, bool force);
 extern bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page,
 				   int page_size);
 
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 892a7b0196fd58a64e7df9d49b009020d32f0094..3cadee0a350889f748e7b1a999b449ae003e9c3f 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -538,6 +538,14 @@ static inline bool mm_tlb_flush_pending(struct mm_struct *mm)
 	return atomic_read(&mm->tlb_flush_pending) > 0;
 }
 
+/*
+ * Returns true if two or more of the above TLB batching threads are running on this mm in parallel.
+ */
+static inline bool mm_tlb_flush_nested(struct mm_struct *mm)
+{
+	return atomic_read(&mm->tlb_flush_pending) > 1;
+}
+
 static inline void init_tlb_flush_pending(struct mm_struct *mm)
 {
 	atomic_set(&mm->tlb_flush_pending, 0);
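
Note on the hunk above: mm_tlb_flush_nested() piggybacks on the existing tlb_flush_pending counter. tlb_gather_mmu() now increments it and tlb_finish_mmu() decrements it (see the mm/memory.c hunk below), so a value greater than one at finish time means another mmu_gather is batching on the same mm concurrently. Below is a minimal user-space sketch of that counting scheme, using C11 atomics and invented names (batching_threads, begin_batch(), batch_is_nested(), finish_batch()) in place of the kernel's atomic_t API; it is an illustration of the idea, not kernel code.

/* Sketch of the nested-batching detection; names are invented, not kernel APIs. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int batching_threads;	/* plays the role of mm->tlb_flush_pending */

static void begin_batch(void)		/* analogous to tlb_gather_mmu() */
{
	atomic_fetch_add(&batching_threads, 1);
}

static bool batch_is_nested(void)	/* analogous to mm_tlb_flush_nested() */
{
	return atomic_load(&batching_threads) > 1;
}

static void finish_batch(void)		/* analogous to tlb_finish_mmu() */
{
	if (batch_is_nested())
		puts("another batcher is active: force a full-range flush");
	atomic_fetch_sub(&batching_threads, 1);
}

int main(void)
{
	begin_batch();			/* thread A starts batching */
	begin_batch();			/* thread B races on the same mm */
	finish_batch();			/* sees count == 2 -> forced flush */
	finish_batch();			/* last one out, nothing nested */
	return 0;
}

The check is only meaningful between the increment and the decrement, which is why the patch raises the counter in tlb_gather_mmu() rather than at the point of the flush itself.
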
diff --git a/mm/memory.c b/mm/memory.c
index 34cba5113e06e764a1b74b58eff560a9fbbdc222..e158f7ac67300b10b8827fe6825667506095f550 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -272,10 +272,13 @@ void tlb_flush_mmu(struct mmu_gather *tlb)
  *	that were required.
  */
 void arch_tlb_finish_mmu(struct mmu_gather *tlb,
-		unsigned long start, unsigned long end)
+		unsigned long start, unsigned long end, bool force)
 {
 	struct mmu_gather_batch *batch, *next;
 
+	if (force)
+		__tlb_adjust_range(tlb, start, end - start);
+
 	tlb_flush_mmu(tlb);
 
 	/* keep the page table cache within bounds */
@@ -404,12 +407,23 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
 			unsigned long start, unsigned long end)
 {
 	arch_tlb_gather_mmu(tlb, mm, start, end);
+	inc_tlb_flush_pending(tlb->mm);
 }
 
 void tlb_finish_mmu(struct mmu_gather *tlb,
 		unsigned long start, unsigned long end)
 {
-	arch_tlb_finish_mmu(tlb, start, end);
+	/*
+	 * If parallel threads do PTE changes on the same range under a
+	 * non-exclusive lock (e.g., mmap_sem held for read) while deferring
+	 * TLB flushes by batching, a thread with a stale TLB entry can fail
+	 * to flush it by observing pte_none or !pte_dirty, for example. So
+	 * flush the TLB forcefully if we detect parallel PTE batching threads.
+	 */
+	bool force = mm_tlb_flush_nested(tlb->mm);
+
+	arch_tlb_finish_mmu(tlb, start, end, force);
+	dec_tlb_flush_pending(tlb->mm);
 }
 
 /*
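
For context on why the force flag is needed at all: in the race the comment in tlb_finish_mmu() describes, a second thread unmapping the same range under mmap_sem held for read can find every PTE already cleared, batch nothing, and (before this patch) return without flushing, even though its CPU may still hold a stale translation. The user-space model below illustrates that decision only; the names (struct gather, finish()) are invented stand-ins for struct mmu_gather and arch_tlb_finish_mmu(), not the kernel interfaces.

/* Simplified model of the deferred-flush decision; not kernel code. */
#include <stdbool.h>
#include <stdio.h>

/* Invented stand-in for struct mmu_gather: just the accumulated flush range. */
struct gather {
	unsigned long start, end;
	bool have_range;
};

/* Mirrors the arch_tlb_finish_mmu() change: with force, adopt the caller's range. */
static void finish(struct gather *g, unsigned long start, unsigned long end, bool force)
{
	if (force && !g->have_range) {
		g->start = start;
		g->end = end;
		g->have_range = true;
	}
	if (g->have_range)
		printf("flush TLB for [%#lx, %#lx)\n", g->start, g->end);
	else
		puts("nothing batched, flush skipped (the pre-patch bug)");
}

int main(void)
{
	/* Thread B found every PTE already cleared, so it batched nothing. */
	struct gather g = { .have_range = false };

	finish(&g, 0x1000, 0x5000, false);	/* old behaviour: stale TLB entry may survive */

	g.have_range = false;
	finish(&g, 0x1000, 0x5000, true);	/* patched behaviour: forced flush of the range */
	return 0;
}

The real patch achieves the same effect via __tlb_adjust_range() in the generic arch_tlb_finish_mmu() and via the per-architecture range/need_flush assignments in the arch hunks above.
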