diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index d49cde0cdca79..204f5c8dfb777 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -222,12 +222,16 @@ extern void tlb_remove_table(struct mmu_gather *tlb, void *table);
 #define tlb_needs_table_invalidate() (true)
 #endif
 
+void tlb_remove_table_sync_one(void);
+
 #else
 
 #ifdef tlb_needs_table_invalidate
 #error tlb_needs_table_invalidate() requires MMU_GATHER_RCU_TABLE_FREE
 #endif
 
+static inline void tlb_remove_table_sync_one(void) { }
+
 #endif /* CONFIG_MMU_GATHER_RCU_TABLE_FREE */
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 6a5e3becc3a5f..63013df78f9bc 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -215,6 +215,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
 bool is_hugetlb_entry_migration(pte_t pte);
 void hugetlb_unshare_all_pmds(struct vm_area_struct *vma);
+void hugetlb_split(struct vm_area_struct *vma, unsigned long addr);
 
 #else /* !CONFIG_HUGETLB_PAGE */
@@ -420,6 +421,8 @@ static inline vm_fault_t hugetlb_fault(struct mm_struct *mm,
 static inline void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) { }
 
+static inline void hugetlb_split(struct vm_area_struct *vma, unsigned long addr) {}
+
 #endif /* !CONFIG_HUGETLB_PAGE */
 
 /*
  * hugepages at page global directory. If arch support
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 6326a985c283f..1e09beccde525 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2436,6 +2436,9 @@ static inline bool pgtable_pmd_page_ctor(struct page *page)
         if (!pmd_ptlock_init(page))
                 return false;
         __SetPageTable(page);
+#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
+        atomic_set(&page->pt_share_count, 0);
+#endif
         inc_lruvec_page_state(page, NR_PAGETABLE);
         return true;
 }
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 652ec687128e9..7745bdbc17884 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -155,6 +155,9 @@ struct page {
                         union {
                                 struct mm_struct *pt_mm; /* x86 pgds only */
                                 atomic_t pt_frag_refcount; /* powerpc */
+#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
+                                RH_KABI_BROKEN_INSERT(atomic_t pt_share_count)
+#endif
                         };
 #if ALLOC_SPLIT_PTLOCKS
                         spinlock_t *ptl;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 6ac25e7f51c7a..df8107c92e254 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -95,6 +95,8 @@ struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp;
 
 /* Forward declaration */
 static int hugetlb_acct_memory(struct hstate *h, long delta);
+static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
+                unsigned long start, unsigned long end, bool take_locks);
 
 static inline bool subpool_is_free(struct hugepage_subpool *spool)
 {
@@ -4631,6 +4633,39 @@ static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr)
         return 0;
 }
 
+void hugetlb_split(struct vm_area_struct *vma, unsigned long addr)
+{
+        /*
+         * PMD sharing is only possible for PUD_SIZE-aligned address ranges
+         * in HugeTLB VMAs. If we will lose PUD_SIZE alignment due to this
+         * split, unshare PMDs in the PUD_SIZE interval surrounding addr now.
+         * This function is called in the middle of a VMA split operation, with
+         * MM, VMA and rmap all write-locked to prevent concurrent page table
+         * walks (except hardware and gup_fast()).
+         */
+        mmap_assert_write_locked(vma->vm_mm);
+        i_mmap_assert_write_locked(vma->vm_file->f_mapping);
+
+        if (addr & ~PUD_MASK) {
+                unsigned long floor = addr & PUD_MASK;
+                unsigned long ceil = floor + PUD_SIZE;
+
+                if (floor >= vma->vm_start && ceil <= vma->vm_end) {
+                        /*
+                         * Locking:
+                         * Use take_locks=false here.
+                         * The file rmap lock is already held.
+                         * The hugetlb VMA lock can't be taken when we already
+                         * hold the file rmap lock, and we don't need it because
+                         * its purpose is to synchronize against concurrent page
+                         * table walks, which are not possible thanks to the
+                         * locks held by our caller.
+                         */
+                        hugetlb_unshare_pmds(vma, floor, ceil, /* take_locks = */ false);
+                }
+        }
+}
+
 static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma)
 {
         return huge_page_size(hstate_vma(vma));
@@ -4734,7 +4769,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
                             struct vm_area_struct *dst_vma,
                             struct vm_area_struct *src_vma)
 {
-        pte_t *src_pte, *dst_pte, entry, dst_entry;
+        pte_t *src_pte, *dst_pte, entry;
         struct page *ptepage;
         unsigned long addr;
         bool cow = is_cow_mapping(src_vma->vm_flags);
@@ -4773,30 +4808,20 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
                         break;
                 }
 
-                /*
-                 * If the pagetables are shared don't copy or take references.
-                 * dst_pte == src_pte is the common case of src/dest sharing.
-                 *
-                 * However, src could have 'unshared' and dst shares with
-                 * another vma. If dst_pte !none, this implies sharing.
-                 * Check here before taking page table lock, and once again
-                 * after taking the lock below.
-                 */
-                dst_entry = huge_ptep_get(dst_pte);
-                if ((dst_pte == src_pte) || !huge_pte_none(dst_entry))
+#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
+                /* If the pagetables are shared, there is nothing to do */
+                if (!!atomic_read(&virt_to_page(dst_pte)->pt_share_count))
                         continue;
+#endif
 
                 dst_ptl = huge_pte_lock(h, dst, dst_pte);
                 src_ptl = huge_pte_lockptr(h, src, src_pte);
                 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
                 entry = huge_ptep_get(src_pte);
-                dst_entry = huge_ptep_get(dst_pte);
 again:
-                if (huge_pte_none(entry) || !huge_pte_none(dst_entry)) {
+                if (huge_pte_none(entry)) {
                         /*
-                         * Skip if src entry none. Also, skip in the
-                         * unlikely case dst entry !none as this implies
-                         * sharing with another vma.
+                         * Skip if src entry none.
                          */
                         ;
                 } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) {
@@ -4876,7 +4901,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
                                 restore_reserve_on_error(h, dst_vma, addr, new);
                                 put_page(new);
 
-                                /* dst_entry won't change as in child */
+                                /* huge_ptep of dst_pte won't change as in child */
                                 goto again;
                         }
                         hugetlb_install_page(dst_vma, dst_pte, addr, new);
@@ -6697,7 +6722,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
                 spte = huge_pte_offset(svma->vm_mm, saddr,
                                        vma_mmu_pagesize(svma));
                 if (spte) {
-                        get_page(virt_to_page(spte));
+                        atomic_inc(&virt_to_page(spte)->pt_share_count);
                         break;
                 }
         }
@@ -6712,7 +6737,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
                                 (pmd_t *)((unsigned long)spte & PAGE_MASK));
                 mm_inc_nr_pmds(mm);
         } else {
-                put_page(virt_to_page(spte));
+                atomic_dec(&virt_to_page(spte)->pt_share_count);
         }
         spin_unlock(ptl);
 out:
@@ -6723,11 +6748,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
 /*
  * unmap huge page backed by shared pte.
  *
- * Hugetlb pte page is ref counted at the time of mapping.  If pte is shared
- * indicated by page_count > 1, unmap is achieved by clearing pud and
- * decrementing the ref count. If count == 1, the pte page is not shared.
- *
- * Called with page table lock held and i_mmap_rwsem held in write mode.
+ * Called with page table lock held.
  *
  * returns: 1 successfully unmapped a shared pte page
  *          0 the underlying pte page is not shared, or it is the last user
@@ -6735,17 +6756,26 @@
 int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
                                         unsigned long *addr, pte_t *ptep)
 {
+        unsigned long sz = huge_page_size(hstate_vma(vma));
         pgd_t *pgd = pgd_offset(mm, *addr);
         p4d_t *p4d = p4d_offset(pgd, *addr);
         pud_t *pud = pud_offset(p4d, *addr);
 
         i_mmap_assert_write_locked(vma->vm_file->f_mapping);
-        BUG_ON(page_count(virt_to_page(ptep)) == 0);
-        if (page_count(virt_to_page(ptep)) == 1)
+        if (sz != PMD_SIZE)
+                return 0;
+        if (!atomic_read(&virt_to_page(ptep)->pt_share_count))
                 return 0;
 
         pud_clear(pud);
-        put_page(virt_to_page(ptep));
+        /*
+         * Once our caller drops the rmap lock, some other process might be
+         * using this page table as a normal, non-hugetlb page table.
+         * Wait for pending gup_fast() in other threads to finish before letting
+         * that happen.
+         */
+        tlb_remove_table_sync_one();
+        atomic_dec(&virt_to_page(ptep)->pt_share_count);
         mm_dec_nr_pmds(mm);
         /*
          * This update of passed address optimizes loops sequentially
@@ -7037,25 +7067,27 @@ void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason)
 }
 
 /*
- * This function will unconditionally remove all the shared pmd pgtable entries
- * within the specific vma for a hugetlbfs memory range.
+ * If @take_locks is false, the caller must ensure that no concurrent page table
+ * access can happen (except for gup_fast() and hardware page walks).
+ * If @take_locks is true, we take the hugetlb VMA lock (to lock out things like
+ * concurrent page fault handling) and the file rmap lock.
  */
-void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
+static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
+                                 unsigned long start,
+                                 unsigned long end,
+                                 bool take_locks)
 {
         struct hstate *h = hstate_vma(vma);
         unsigned long sz = huge_page_size(h);
         struct mm_struct *mm = vma->vm_mm;
         struct mmu_notifier_range range;
-        unsigned long address, start, end;
+        unsigned long address;
         spinlock_t *ptl;
         pte_t *ptep;
 
         if (!(vma->vm_flags & VM_MAYSHARE))
                 return;
 
-        start = ALIGN(vma->vm_start, PUD_SIZE);
-        end = ALIGN_DOWN(vma->vm_end, PUD_SIZE);
-
         if (start >= end)
                 return;
@@ -7067,7 +7099,11 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
         mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
                                 start, end);
         mmu_notifier_invalidate_range_start(&range);
-        i_mmap_lock_write(vma->vm_file->f_mapping);
+        if (take_locks) {
+                i_mmap_lock_write(vma->vm_file->f_mapping);
+        } else {
+                i_mmap_assert_write_locked(vma->vm_file->f_mapping);
+        }
         for (address = start; address < end; address += PUD_SIZE) {
                 unsigned long tmp = address;
@@ -7080,7 +7116,9 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
                 spin_unlock(ptl);
         }
         flush_hugetlb_tlb_range(vma, start, end);
-        i_mmap_unlock_write(vma->vm_file->f_mapping);
+        if (take_locks) {
+                i_mmap_unlock_write(vma->vm_file->f_mapping);
+        }
         /*
          * No need to call mmu_notifier_invalidate_range(), see
          * Documentation/vm/mmu_notifier.rst.
@@ -7088,6 +7126,17 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
         mmu_notifier_invalidate_range_end(&range);
 }
 
+/*
+ * This function will unconditionally remove all the shared pmd pgtable entries
+ * within the specific vma for a hugetlbfs memory range.
+ */
+void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
+{
+        hugetlb_unshare_pmds(vma, ALIGN(vma->vm_start, PUD_SIZE),
+                        ALIGN_DOWN(vma->vm_end, PUD_SIZE),
+                        /* take_locks = */ true);
+}
+
 #ifdef CONFIG_CMA
 static bool cma_reserve_called __initdata;
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 6223b06e48027..f77c7f5b9d8f5 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1138,6 +1138,7 @@ static void collapse_huge_page(struct mm_struct *mm,
         _pmd = pmdp_collapse_flush(vma, address, pmd);
         spin_unlock(pmd_ptl);
         mmu_notifier_invalidate_range_end(&range);
+        tlb_remove_table_sync_one();
 
         spin_lock(pte_ptl);
         isolated = __collapse_huge_page_isolate(vma, address, pte,
@@ -1413,6 +1414,7 @@ static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *v
         ptl = pmd_lock(vma->vm_mm, pmdp);
         pmd = pmdp_collapse_flush(vma, addr, pmdp);
         spin_unlock(ptl);
+        tlb_remove_table_sync_one();
         mm_dec_nr_ptes(mm);
         page_table_check_pte_clear_range(mm, addr, pmd);
         pte_free(mm, pmd_pgtable(pmd));
diff --git a/mm/mmap.c b/mm/mmap.c
index 342479c826b3c..85442f81295d5 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -851,7 +851,15 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
                 }
         }
 again:
+        /*
+         * Get rid of huge pages and shared page tables straddling the split
+         * boundary.
+         */
         vma_adjust_trans_huge(orig_vma, start, end, adjust_next);
+        if (is_vm_hugetlb_page(orig_vma)) {
+                hugetlb_split(orig_vma, start);
+                hugetlb_split(orig_vma, end);
+        }
 
         if (file) {
                 mapping = file->f_mapping;
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index afb7185ffdc45..8d46aa7af6f40 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -140,7 +140,7 @@ static void tlb_remove_table_smp_sync(void *arg)
         /* Simply deliver the interrupt */
 }
 
-static void tlb_remove_table_sync_one(void)
+void tlb_remove_table_sync_one(void)
 {
         /*
          * This isn't an RCU grace period and hence the page-tables cannot be
@@ -164,8 +164,6 @@ static void tlb_remove_table_free(struct mmu_table_batch *batch)
 
 #else /* !CONFIG_MMU_GATHER_RCU_TABLE_FREE */
 
-static void tlb_remove_table_sync_one(void) { }
-
 static void tlb_remove_table_free(struct mmu_table_batch *batch)
 {
         __tlb_remove_table_free(batch);
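
Editor's note, not part of the patch: the diff replaces the old page_count()-based bookkeeping for shared hugetlb PMD page tables with a dedicated pt_share_count field that starts at 0 in pgtable_pmd_page_ctor(), is incremented in huge_pmd_share() and decremented in huge_pmd_unshare(), so a count of 0 now means "not shared". The standalone userspace C sketch below only models that accounting with C11 atomics; the names (struct pmd_table, pmd_table_init, share_pmd_table, unshare_pmd_table) are invented for the illustration and do not exist in the kernel.

/*
 * Illustrative model of the pt_share_count scheme, not kernel code.
 * Build with: cc -std=c11 pt_share_count_model.c
 */
#include <stdatomic.h>
#include <stdio.h>

/* Stand-in for the struct page backing a shareable PMD table. */
struct pmd_table {
	atomic_int pt_share_count;	/* 0 means "not shared" */
};

/* Models pgtable_pmd_page_ctor(): a fresh table starts out unshared. */
static void pmd_table_init(struct pmd_table *pt)
{
	atomic_store(&pt->pt_share_count, 0);
}

/* Models huge_pmd_share(): each additional sharer bumps the count. */
static void share_pmd_table(struct pmd_table *pt)
{
	atomic_fetch_add(&pt->pt_share_count, 1);
}

/*
 * Models huge_pmd_unshare(): if the table was never shared, return 0 and
 * leave it alone; otherwise drop one sharer and report success. In the real
 * patch this happens only after pud_clear() and tlb_remove_table_sync_one().
 */
static int unshare_pmd_table(struct pmd_table *pt)
{
	if (!atomic_load(&pt->pt_share_count))
		return 0;
	atomic_fetch_sub(&pt->pt_share_count, 1);
	return 1;
}

int main(void)
{
	struct pmd_table pt;

	pmd_table_init(&pt);
	printf("unshare on fresh table:  %d\n", unshare_pmd_table(&pt)); /* 0 */

	share_pmd_table(&pt);	/* a second mapping starts sharing the table */
	printf("unshare on shared table: %d\n", unshare_pmd_table(&pt)); /* 1 */
	printf("unshare once more:       %d\n", unshare_pmd_table(&pt)); /* 0 */
	return 0;
}

In the patch itself the decrement is deliberately placed after pud_clear() and tlb_remove_table_sync_one(), so any concurrent gup_fast() walkers are flushed out by IPI before the page table can be reused as an ordinary, non-hugetlb page table; the sketch above only captures the counting, not that ordering.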