From 17d14cb084bb6f5d326a6eaa0fafdf635ed45e29 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Wed, 3 Dec 2025 00:07:25 +0100 Subject: [PATCH 1/7] hugetlb: unshare some PMDs when splitting VMAs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-71577 cve-pre CVE-2025-38084 commit-author James Houghton commit b30c14cd61025eeea2f2e8569606cd167ba9ad2d upstream-diff Stable 5.15 backport bd9a23a4bb8a320ffa6e45cf09a068ec0a335350 was used for the actual (clean) cherry-pick PMD sharing can only be done in PUD_SIZE-aligned pieces of VMAs; however, it is possible that HugeTLB VMAs are split without unsharing the PMDs first. Without this fix, it is possible to hit the uffd-wp-related WARN_ON_ONCE in hugetlb_change_protection [1]. The key there is that hugetlb_unshare_all_pmds will not attempt to unshare PMDs in non-PUD_SIZE-aligned sections of the VMA. It might seem ideal to unshare in hugetlb_vm_op_open, but we need to unshare in both the new and old VMAs, so unsharing in hugetlb_vm_op_split seems natural. [1]: https://lore.kernel.org/linux-mm/CADrL8HVeOkj0QH5VZZbRzybNE8CG-tEGFshnA+bG9nMgcWtBSg@mail.gmail.com/ Link: https://lkml.kernel.org/r/20230104231910.1464197-1-jthoughton@google.com Fixes: 6dfeaff93be1 ("hugetlb/userfaultfd: unshare all pmds for hugetlbfs when register wp") Signed-off-by: James Houghton Reviewed-by: Mike Kravetz Acked-by: Peter Xu Cc: Axel Rasmussen Cc: Muchun Song Cc: Signed-off-by: Andrew Morton (cherry picked from commit b30c14cd61025eeea2f2e8569606cd167ba9ad2d) Signed-off-by: Marcin Wcisło --- mm/hugetlb.c | 44 +++++++++++++++++++++++++++++++++++--------- 1 file changed, 35 insertions(+), 9 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 6ac25e7f51c7a..2c625f6128cfe 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -95,6 +95,8 @@ struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp; /* Forward declaration */ static int hugetlb_acct_memory(struct hstate *h, long delta); +static void hugetlb_unshare_pmds(struct vm_area_struct *vma, + unsigned long start, unsigned long end); static inline bool subpool_is_free(struct hugepage_subpool *spool) { @@ -4628,6 +4630,25 @@ static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr) { if (addr & ~(huge_page_mask(hstate_vma(vma)))) return -EINVAL; + + /* + * PMD sharing is only possible for PUD_SIZE-aligned address ranges + * in HugeTLB VMAs. If we will lose PUD_SIZE alignment due to this + * split, unshare PMDs in the PUD_SIZE interval surrounding addr now. + */ + if (addr & ~PUD_MASK) { + /* + * hugetlb_vm_op_split is called right before we attempt to + * split the VMA. We will need to unshare PMDs in the old and + * new VMAs, so let's unshare before we split. + */ + unsigned long floor = addr & PUD_MASK; + unsigned long ceil = floor + PUD_SIZE; + + if (floor >= vma->vm_start && ceil <= vma->vm_end) + hugetlb_unshare_pmds(vma, floor, ceil); + } + return 0; } @@ -7036,26 +7057,21 @@ void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason) } } -/* - * This function will unconditionally remove all the shared pmd pgtable entries - * within the specific vma for a hugetlbfs memory range. 
- */
-void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
+static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
+				 unsigned long start,
+				 unsigned long end)
 {
 	struct hstate *h = hstate_vma(vma);
 	unsigned long sz = huge_page_size(h);
 	struct mm_struct *mm = vma->vm_mm;
 	struct mmu_notifier_range range;
-	unsigned long address, start, end;
+	unsigned long address;
 	spinlock_t *ptl;
 	pte_t *ptep;

 	if (!(vma->vm_flags & VM_MAYSHARE))
 		return;

-	start = ALIGN(vma->vm_start, PUD_SIZE);
-	end = ALIGN_DOWN(vma->vm_end, PUD_SIZE);
-
 	if (start >= end)
 		return;

@@ -7088,6 +7104,16 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
 	mmu_notifier_invalidate_range_end(&range);
 }

+/*
+ * This function will unconditionally remove all the shared pmd pgtable entries
+ * within the specific vma for a hugetlbfs memory range.
+ */
+void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
+{
+	hugetlb_unshare_pmds(vma, ALIGN(vma->vm_start, PUD_SIZE),
+			ALIGN_DOWN(vma->vm_end, PUD_SIZE));
+}
+
 #ifdef CONFIG_CMA
 static bool cma_reserve_called __initdata;

From c67de4c5ba52ebca36c04fbcbb0cdad3166e63f0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marcin=20Wcis=C5=82o?=
Date: Tue, 18 Nov 2025 23:22:37 +0100
Subject: [PATCH 2/7] mm: hugetlb: independent PMD page table shared count
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

jira VULN-46929
cve CVE-2024-57883
commit-author Liu Shixin
commit 59d9094df3d79443937add8700b2ef1a866b1081
upstream-diff Stable 5.15 backport 8410996eb6fea116fe1483ed977aacf580eee7b4
was used for the actual (clean) cherry-pick. Additionally the `atomic_t
pt_share_count' field in `include/linux/mm_types.h' was wrapped in the
RH_KABI_BROKEN_INSERT macro to avoid kABI checker complaints. This is
justified because the inserted field (which is compiled in, since
CONFIG_ARCH_WANT_HUGE_PMD_SHARE gets enabled for at least
`kernel-x86_64-rhel.config') is placed within a union which already
contained a field of the same type, `atomic_t pt_frag_refcount', so the
size of the union cannot change. Moreover, this union serves as scratch
space for the subsystems using the struct page: upon releasing ownership
to the buddy allocator the union contents no longer matter, and when the
page is allocated again the scratch space will be used by the new owner in
its own way.

The folio refcount may be increased unexpectedly through try_get_folio()
by callers such as split_huge_pages. In huge_pmd_unshare(), we use the
refcount to check whether a pmd page table is shared. The check is
incorrect if the refcount is increased by the above callers, and this can
cause the page table to be leaked:

BUG: Bad page state in process sh pfn:109324
page: refcount:0 mapcount:0 mapping:0000000000000000 index:0x66 pfn:0x109324
flags: 0x17ffff800000000(node=0|zone=2|lastcpupid=0xfffff)
page_type: f2(table)
raw: 017ffff800000000 0000000000000000 0000000000000000 0000000000000000
raw: 0000000000000066 0000000000000000 00000000f2000000 0000000000000000
page dumped because: nonzero mapcount
...
CPU: 31 UID: 0 PID: 7515 Comm: sh Kdump: loaded Tainted: G B 6.13.0-rc2master+ #7
Tainted: [B]=BAD_PAGE
Hardware name: QEMU KVM Virtual Machine, BIOS 0.0.0 02/06/2015
Call trace:
 show_stack+0x20/0x38 (C)
 dump_stack_lvl+0x80/0xf8
 dump_stack+0x18/0x28
 bad_page+0x8c/0x130
 free_page_is_bad_report+0xa4/0xb0
 free_unref_page+0x3cc/0x620
 __folio_put+0xf4/0x158
 split_huge_pages_all+0x1e0/0x3e8
 split_huge_pages_write+0x25c/0x2d8
 full_proxy_write+0x64/0xd8
 vfs_write+0xcc/0x280
 ksys_write+0x70/0x110
 __arm64_sys_write+0x24/0x38
 invoke_syscall+0x50/0x120
 el0_svc_common.constprop.0+0xc8/0xf0
 do_el0_svc+0x24/0x38
 el0_svc+0x34/0x128
 el0t_64_sync_handler+0xc8/0xd0
 el0t_64_sync+0x190/0x198

The issue may be triggered by damon, offline_page, page_idle, etc., which
will increase the refcount of the page table.

1. The page table itself will be discarded after reporting the
   "nonzero mapcount".
2. The HugeTLB page mapped by the page table misses being freed, since we
   treat the page table as shared and a shared page table will not be
   unmapped.

Fix it by introducing an independent PMD page table shared count. As
described by the comment, pt_index/pt_mm/pt_frag_refcount are used for
s390 gmap, x86 pgds and powerpc, and pt_share_count is used for
x86/arm64/riscv pmds, so we can reuse the field as pt_share_count.

Link: https://lkml.kernel.org/r/20241216071147.3984217-1-liushixin2@huawei.com
Fixes: 39dde65c9940 ("[PATCH] shared page table for hugetlb page")
Signed-off-by: Liu Shixin
Cc: Kefeng Wang
Cc: Ken Chen
Cc: Muchun Song
Cc: Nanyong Sun
Cc: Jane Chu
Cc:
Signed-off-by: Andrew Morton
(cherry picked from commit 59d9094df3d79443937add8700b2ef1a866b1081)
Signed-off-by: Marcin Wcisło
---
 include/linux/mm.h       |  3 +++
 include/linux/mm_types.h |  3 +++
 mm/hugetlb.c             | 18 ++++++++----------
 3 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 6326a985c283f..1e09beccde525 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2436,6 +2436,9 @@ static inline bool pgtable_pmd_page_ctor(struct page *page)
 	if (!pmd_ptlock_init(page))
 		return false;
 	__SetPageTable(page);
+#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
+	atomic_set(&page->pt_share_count, 0);
+#endif
 	inc_lruvec_page_state(page, NR_PAGETABLE);
 	return true;
 }
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 652ec687128e9..7745bdbc17884 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -155,6 +155,9 @@ struct page {
 			union {
 				struct mm_struct *pt_mm; /* x86 pgds only */
 				atomic_t pt_frag_refcount; /* powerpc */
+#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
+				RH_KABI_BROKEN_INSERT(atomic_t pt_share_count)
+#endif
 			};
 #if ALLOC_SPLIT_PTLOCKS
 			spinlock_t *ptl;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 2c625f6128cfe..6164822e1f18b 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -6718,7 +6718,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
 		spte = huge_pte_offset(svma->vm_mm, saddr,
 				vma_mmu_pagesize(svma));
 		if (spte) {
-			get_page(virt_to_page(spte));
+			atomic_inc(&virt_to_page(spte)->pt_share_count);
 			break;
 		}
 	}
@@ -6733,7 +6733,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
 			(pmd_t *)((unsigned long)spte & PAGE_MASK));
 		mm_inc_nr_pmds(mm);
 	} else {
-		put_page(virt_to_page(spte));
+		atomic_dec(&virt_to_page(spte)->pt_share_count);
 	}
 	spin_unlock(ptl);
 out:
@@ -6744,11 +6744,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
 /*
  * unmap huge page backed by shared pte.
  *
- * Hugetlb pte page is ref counted at the time of mapping.
If pte is shared - * indicated by page_count > 1, unmap is achieved by clearing pud and - * decrementing the ref count. If count == 1, the pte page is not shared. - * - * Called with page table lock held and i_mmap_rwsem held in write mode. + * Called with page table lock held. * * returns: 1 successfully unmapped a shared pte page * 0 the underlying pte page is not shared, or it is the last user @@ -6756,17 +6752,19 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long *addr, pte_t *ptep) { + unsigned long sz = huge_page_size(hstate_vma(vma)); pgd_t *pgd = pgd_offset(mm, *addr); p4d_t *p4d = p4d_offset(pgd, *addr); pud_t *pud = pud_offset(p4d, *addr); i_mmap_assert_write_locked(vma->vm_file->f_mapping); - BUG_ON(page_count(virt_to_page(ptep)) == 0); - if (page_count(virt_to_page(ptep)) == 1) + if (sz != PMD_SIZE) + return 0; + if (!atomic_read(&virt_to_page(ptep)->pt_share_count)) return 0; pud_clear(pud); - put_page(virt_to_page(ptep)); + atomic_dec(&virt_to_page(ptep)->pt_share_count); mm_dec_nr_pmds(mm); /* * This update of passed address optimizes loops sequentially From 26e1e1b27d35ee7c059eb125da3f5cbe484b352c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 18 Nov 2025 23:25:39 +0100 Subject: [PATCH 3/7] mm/hugetlb: unshare page tables during VMA split, not before MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-71577 cve CVE-2025-38084 commit-author Jann Horn commit 081056dc00a27bccb55ccc3c6f230a3d5fd3f7e0 upstream-diff Stable 5.15 backport 366298f2b04d2bf1f2f2b7078405bdf9df9bd5d0 was used for the actual (clean) cherry-pick Currently, __split_vma() triggers hugetlb page table unsharing through vm_ops->may_split(). This happens before the VMA lock and rmap locks are taken - which is too early, it allows racing VMA-locked page faults in our process and racing rmap walks from other processes to cause page tables to be shared again before we actually perform the split. Fix it by explicitly calling into the hugetlb unshare logic from __split_vma() in the same place where THP splitting also happens. At that point, both the VMA and the rmap(s) are write-locked. An annoying detail is that we can now call into the helper hugetlb_unshare_pmds() from two different locking contexts: 1. from hugetlb_split(), holding: - mmap lock (exclusively) - VMA lock - file rmap lock (exclusively) 2. hugetlb_unshare_all_pmds(), which I think is designed to be able to call us with only the mmap lock held (in shared mode), but currently only runs while holding mmap lock (exclusively) and VMA lock Backporting note: This commit fixes a racy protection that was introduced in commit b30c14cd6102 ("hugetlb: unshare some PMDs when splitting VMAs"); that commit claimed to fix an issue introduced in 5.13, but it should actually also go all the way back. 
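To make the alignment rule concrete, here is a minimal userspace sketch of
the check that hugetlb_split() performs. This is not kernel code: PUD_SIZE
and PUD_MASK are hard-coded to their x86-64 4K-page values, and the VMA
bounds and split address are hypothetical.

#include <stdbool.h>
#include <stdio.h>

#define PUD_SIZE (1UL << 30)		/* assumed: x86-64 with 4K pages */
#define PUD_MASK (~(PUD_SIZE - 1))

/*
 * Mirror of the decision in hugetlb_split(): splitting at a non-PUD-aligned
 * address forces unsharing of the PUD_SIZE interval surrounding the split
 * point, but only when that interval lies fully inside the VMA.
 */
static bool split_needs_unshare(unsigned long vm_start, unsigned long vm_end,
				unsigned long addr,
				unsigned long *floor, unsigned long *ceil)
{
	if (!(addr & ~PUD_MASK))
		return false;	/* split point is PUD-aligned, nothing to do */
	*floor = addr & PUD_MASK;
	*ceil = *floor + PUD_SIZE;
	return *floor >= vm_start && *ceil <= vm_end;
}

int main(void)
{
	/* hypothetical 4 GiB hugetlb VMA, split 2 MiB past a PUD boundary */
	unsigned long vm_start = 4UL << 30, vm_end = 8UL << 30;
	unsigned long addr = (5UL << 30) + (2UL << 20);
	unsigned long floor, ceil;

	if (split_needs_unshare(vm_start, vm_end, addr, &floor, &ceil))
		printf("unshare PMDs in [%#lx, %#lx)\n", floor, ceil);
	return 0;
}

The interval is only unshared when it lies entirely inside the VMA,
mirroring the rule that PMD sharing is only possible for PUD_SIZE-aligned
ranges fully covered by the VMA in the first place.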
[jannh@google.com: v2] Link: https://lkml.kernel.org/r/20250528-hugetlb-fixes-splitrace-v2-1-1329349bad1a@google.com Link: https://lkml.kernel.org/r/20250528-hugetlb-fixes-splitrace-v2-0-1329349bad1a@google.com Link: https://lkml.kernel.org/r/20250527-hugetlb-fixes-splitrace-v1-1-f4136f5ec58a@google.com Fixes: 39dde65c9940 ("[PATCH] shared page table for hugetlb page") Signed-off-by: Jann Horn Cc: Liam Howlett Reviewed-by: Lorenzo Stoakes Reviewed-by: Oscar Salvador Cc: Lorenzo Stoakes Cc: Vlastimil Babka Cc: [b30c14cd6102: hugetlb: unshare some PMDs when splitting VMAs] Cc: Signed-off-by: Andrew Morton (cherry picked from commit 081056dc00a27bccb55ccc3c6f230a3d5fd3f7e0) Signed-off-by: Marcin Wcisło --- include/linux/hugetlb.h | 3 +++ mm/hugetlb.c | 56 ++++++++++++++++++++++++++++++----------- mm/mmap.c | 8 ++++++ 3 files changed, 53 insertions(+), 14 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 6a5e3becc3a5f..63013df78f9bc 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -215,6 +215,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, bool is_hugetlb_entry_migration(pte_t pte); void hugetlb_unshare_all_pmds(struct vm_area_struct *vma); +void hugetlb_split(struct vm_area_struct *vma, unsigned long addr); #else /* !CONFIG_HUGETLB_PAGE */ @@ -420,6 +421,8 @@ static inline vm_fault_t hugetlb_fault(struct mm_struct *mm, static inline void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) { } +static inline void hugetlb_split(struct vm_area_struct *vma, unsigned long addr) {} + #endif /* !CONFIG_HUGETLB_PAGE */ /* * hugepages at page global directory. If arch support diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 6164822e1f18b..1db95fabbdad3 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -96,7 +96,7 @@ struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp; /* Forward declaration */ static int hugetlb_acct_memory(struct hstate *h, long delta); static void hugetlb_unshare_pmds(struct vm_area_struct *vma, - unsigned long start, unsigned long end); + unsigned long start, unsigned long end, bool take_locks); static inline bool subpool_is_free(struct hugepage_subpool *spool) { @@ -4630,26 +4630,40 @@ static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr) { if (addr & ~(huge_page_mask(hstate_vma(vma)))) return -EINVAL; + return 0; +} +void hugetlb_split(struct vm_area_struct *vma, unsigned long addr) +{ /* * PMD sharing is only possible for PUD_SIZE-aligned address ranges * in HugeTLB VMAs. If we will lose PUD_SIZE alignment due to this * split, unshare PMDs in the PUD_SIZE interval surrounding addr now. + * This function is called in the middle of a VMA split operation, with + * MM, VMA and rmap all write-locked to prevent concurrent page table + * walks (except hardware and gup_fast()). */ + mmap_assert_write_locked(vma->vm_mm); + i_mmap_assert_write_locked(vma->vm_file->f_mapping); + if (addr & ~PUD_MASK) { - /* - * hugetlb_vm_op_split is called right before we attempt to - * split the VMA. We will need to unshare PMDs in the old and - * new VMAs, so let's unshare before we split. - */ unsigned long floor = addr & PUD_MASK; unsigned long ceil = floor + PUD_SIZE; - if (floor >= vma->vm_start && ceil <= vma->vm_end) - hugetlb_unshare_pmds(vma, floor, ceil); + if (floor >= vma->vm_start && ceil <= vma->vm_end) { + /* + * Locking: + * Use take_locks=false here. + * The file rmap lock is already held. 
+ * The hugetlb VMA lock can't be taken when we already + * hold the file rmap lock, and we don't need it because + * its purpose is to synchronize against concurrent page + * table walks, which are not possible thanks to the + * locks held by our caller. + */ + hugetlb_unshare_pmds(vma, floor, ceil, /* take_locks = */ false); + } } - - return 0; } static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma) @@ -7055,9 +7069,16 @@ void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason) } } +/* + * If @take_locks is false, the caller must ensure that no concurrent page table + * access can happen (except for gup_fast() and hardware page walks). + * If @take_locks is true, we take the hugetlb VMA lock (to lock out things like + * concurrent page fault handling) and the file rmap lock. + */ static void hugetlb_unshare_pmds(struct vm_area_struct *vma, unsigned long start, - unsigned long end) + unsigned long end, + bool take_locks) { struct hstate *h = hstate_vma(vma); unsigned long sz = huge_page_size(h); @@ -7081,7 +7102,11 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma, mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, start, end); mmu_notifier_invalidate_range_start(&range); - i_mmap_lock_write(vma->vm_file->f_mapping); + if (take_locks) { + i_mmap_lock_write(vma->vm_file->f_mapping); + } else { + i_mmap_assert_write_locked(vma->vm_file->f_mapping); + } for (address = start; address < end; address += PUD_SIZE) { unsigned long tmp = address; @@ -7094,7 +7119,9 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma, spin_unlock(ptl); } flush_hugetlb_tlb_range(vma, start, end); - i_mmap_unlock_write(vma->vm_file->f_mapping); + if (take_locks) { + i_mmap_unlock_write(vma->vm_file->f_mapping); + } /* * No need to call mmu_notifier_invalidate_range(), see * Documentation/vm/mmu_notifier.rst. @@ -7109,7 +7136,8 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma, void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) { hugetlb_unshare_pmds(vma, ALIGN(vma->vm_start, PUD_SIZE), - ALIGN_DOWN(vma->vm_end, PUD_SIZE)); + ALIGN_DOWN(vma->vm_end, PUD_SIZE), + /* take_locks = */ true); } #ifdef CONFIG_CMA diff --git a/mm/mmap.c b/mm/mmap.c index 342479c826b3c..85442f81295d5 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -851,7 +851,15 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, } } again: + /* + * Get rid of huge pages and shared page tables straddling the split + * boundary. + */ vma_adjust_trans_huge(orig_vma, start, end, adjust_next); + if (is_vm_hugetlb_page(orig_vma)) { + hugetlb_split(orig_vma, start); + hugetlb_split(orig_vma, end); + } if (file) { mapping = file->f_mapping; From 6be0cf8926073f174ca19f1761f3ae9406b2127b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Fri, 5 Dec 2025 19:39:52 +0100 Subject: [PATCH 4/7] mm/khugepaged: fix GUP-fast interaction by sending IPI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-71586 cve-pre CVE-2025-38085 commit-author Jann Horn commit 2ba99c5e08812494bc57f319fb562f527d9bacd8 upstream-diff Resolved contextual conflicts due to lack of backported 8d3c106e19e8d251da31ff4cc7462e4565d65084 Since commit 70cbc3cc78a99 ("mm: gup: fix the fast GUP race against THP collapse"), the lockless_pages_from_mm() fastpath rechecks the pmd_t to ensure that the page table was not removed by khugepaged in between. 
However, lockless_pages_from_mm() still requires that the page table is not concurrently freed. Fix it by sending IPIs (if the architecture uses semi-RCU-style page table freeing) before freeing/reusing page tables. Link: https://lkml.kernel.org/r/20221129154730.2274278-2-jannh@google.com Link: https://lkml.kernel.org/r/20221128180252.1684965-2-jannh@google.com Link: https://lkml.kernel.org/r/20221125213714.4115729-2-jannh@google.com Fixes: ba76149f47d8 ("thp: khugepaged") Signed-off-by: Jann Horn Reviewed-by: Yang Shi Acked-by: David Hildenbrand Cc: John Hubbard Cc: Peter Xu Cc: Signed-off-by: Andrew Morton (cherry picked from commit 2ba99c5e08812494bc57f319fb562f527d9bacd8) Signed-off-by: Marcin Wcisło --- include/asm-generic/tlb.h | 4 ++++ mm/khugepaged.c | 2 ++ mm/mmu_gather.c | 4 +--- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index d49cde0cdca79..204f5c8dfb777 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -222,12 +222,16 @@ extern void tlb_remove_table(struct mmu_gather *tlb, void *table); #define tlb_needs_table_invalidate() (true) #endif +void tlb_remove_table_sync_one(void); + #else #ifdef tlb_needs_table_invalidate #error tlb_needs_table_invalidate() requires MMU_GATHER_RCU_TABLE_FREE #endif +static inline void tlb_remove_table_sync_one(void) { } + #endif /* CONFIG_MMU_GATHER_RCU_TABLE_FREE */ diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 6223b06e48027..f77c7f5b9d8f5 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1138,6 +1138,7 @@ static void collapse_huge_page(struct mm_struct *mm, _pmd = pmdp_collapse_flush(vma, address, pmd); spin_unlock(pmd_ptl); mmu_notifier_invalidate_range_end(&range); + tlb_remove_table_sync_one(); spin_lock(pte_ptl); isolated = __collapse_huge_page_isolate(vma, address, pte, @@ -1413,6 +1414,7 @@ static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *v ptl = pmd_lock(vma->vm_mm, pmdp); pmd = pmdp_collapse_flush(vma, addr, pmdp); spin_unlock(ptl); + tlb_remove_table_sync_one(); mm_dec_nr_ptes(mm); page_table_check_pte_clear_range(mm, addr, pmd); pte_free(mm, pmd_pgtable(pmd)); diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c index afb7185ffdc45..8d46aa7af6f40 100644 --- a/mm/mmu_gather.c +++ b/mm/mmu_gather.c @@ -140,7 +140,7 @@ static void tlb_remove_table_smp_sync(void *arg) /* Simply deliver the interrupt */ } -static void tlb_remove_table_sync_one(void) +void tlb_remove_table_sync_one(void) { /* * This isn't an RCU grace period and hence the page-tables cannot be @@ -164,8 +164,6 @@ static void tlb_remove_table_free(struct mmu_table_batch *batch) #else /* !CONFIG_MMU_GATHER_RCU_TABLE_FREE */ -static void tlb_remove_table_sync_one(void) { } - static void tlb_remove_table_free(struct mmu_table_batch *batch) { __tlb_remove_table_free(batch); From c27ed4d8b5f821596981f305624eb1b462df7630 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Fri, 5 Dec 2025 19:42:28 +0100 Subject: [PATCH 5/7] mm/hugetlb: fix huge_pmd_unshare() vs GUP-fast race MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-71586 cve CVE-2025-38085 commit-author Jann Horn commit 1013af4f585fccc4d3e5c5824d174de2257f7d6d upstream-diff Stable 5.15 backport a3d864c901a300c295692d129159fc3001a56185 was used for the actual (clean) cherry-pick. 
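As rough intuition for what the IPI broadcast buys both khugepaged and
huge_pmd_unshare(): tlb_remove_table_sync_one() returns only once every CPU
that was running gup_fast() with interrupts disabled has left that window,
so a cleared page table cannot be freed or reused under a concurrent
lockless walker. Below is a loose userspace model of that ordering, with
C11 threads/atomics standing in for CPUs and IRQ-off sections; every name
in it is an illustrative stand-in, not kernel API.

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>
#include <threads.h>

static _Atomic(int *) pmd;		/* stand-in for the pmd entry */
static atomic_int walkers_active;	/* stand-in for "CPU in gup_fast(), IRQs off" */

static int gup_fast_walker(void *arg)
{
	(void)arg;
	atomic_fetch_add(&walkers_active, 1);	/* like disabling interrupts */
	int *table = atomic_load(&pmd);
	if (table)
		(void)table[0];	/* the table must still be alive here */
	atomic_fetch_sub(&walkers_active, 1);	/* like re-enabling interrupts */
	return 0;
}

/* stand-in for tlb_remove_table_sync_one(): wait until no walker is mid-walk */
static void sync_one(void)
{
	while (atomic_load(&walkers_active))
		thrd_yield();
}

int main(void)
{
	thrd_t t;

	atomic_store(&pmd, calloc(512, sizeof(int)));
	thrd_create(&t, gup_fast_walker, NULL);

	int *old = atomic_exchange(&pmd, NULL);	/* like pud_clear() */
	sync_one();		/* walkers that saw "old" have now finished */
	free(old);		/* only now is it safe to free/reuse the table */

	thrd_join(t, NULL);
	puts("table freed safely");
	return 0;
}

The essential ordering is exchange-then-wait: the entry is made unreachable
first, and only then does the freeing side wait out any walker that might
have loaded the old pointer.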
huge_pmd_unshare() drops a reference on a page table that may have
previously been shared across processes, potentially turning it into a
normal page table used in another process in which unrelated VMAs can
afterwards be installed.

If this happens in the middle of a concurrent gup_fast(), gup_fast() could
end up walking the page tables of another process. While I don't see any
way in which that immediately leads to kernel memory corruption, it is
really weird and unexpected.

Fix it with an explicit broadcast IPI through tlb_remove_table_sync_one(),
just like we do in khugepaged when removing page tables for a THP collapse.

Link: https://lkml.kernel.org/r/20250528-hugetlb-fixes-splitrace-v2-2-1329349bad1a@google.com
Link: https://lkml.kernel.org/r/20250527-hugetlb-fixes-splitrace-v1-2-f4136f5ec58a@google.com
Fixes: 39dde65c9940 ("[PATCH] shared page table for hugetlb page")
Signed-off-by: Jann Horn
Reviewed-by: Lorenzo Stoakes
Cc: Liam Howlett
Cc: Muchun Song
Cc: Oscar Salvador
Cc: Vlastimil Babka
Cc:
Signed-off-by: Andrew Morton
(cherry picked from commit 1013af4f585fccc4d3e5c5824d174de2257f7d6d)
Signed-off-by: Marcin Wcisło
---
 mm/hugetlb.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 1db95fabbdad3..7e25e852731bf 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -6778,6 +6778,13 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
 		return 0;
 	pud_clear(pud);
+	/*
+	 * Once our caller drops the rmap lock, some other process might be
+	 * using this page table as a normal, non-hugetlb page table.
+	 * Wait for pending gup_fast() in other threads to finish before letting
+	 * that happen.
+	 */
+	tlb_remove_table_sync_one();
 	atomic_dec(&virt_to_page(ptep)->pt_share_count);
 	mm_dec_nr_pmds(mm);
 	/*

From ac7ebbbf8eb065593ac35a9ee8bd5dbecad55bcd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marcin=20Wcis=C5=82o?=
Date: Tue, 25 Nov 2025 21:20:55 +0100
Subject: [PATCH 6/7] mm/hugetlb: make detecting shared pte more reliable
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

jira VULN-46929
cve-bf CVE-2024-57883
commit-author Miaohe Lin
commit 3aa4ed8040e1535d95c03cef8b52cf11bf0d8546
upstream-diff Accounted for e95a9851787bbb3cd4deb40fe8bab03f731852d1 not
being backported to ciqlts9_2 - dropped the unnecessary braces in a
one-statement `if' conditional.

If the pagetables are shared, we shouldn't copy or take references. Since
src could have unshared and dst shares with another vma, huge_pte_none()
is thus used to determine whether dst_pte is shared. But this check isn't
reliable: in fact, a shared pte page can still contain pte_none entries.
The page count of the ptep page should be checked here instead, in order
to reliably determine whether the pte is shared.
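The unreliability being fixed here is easy to demonstrate outside the
kernel: a PMD page can be shared while a given slot in it was never
populated, so testing the slot for emptiness (the huge_pte_none() analog)
gives a false "not shared", while the reference count does not. A
hypothetical sketch, not kernel code:

#include <stdbool.h>
#include <stdio.h>

#define ENTRIES 512

struct pt_page {
	int refcount;			/* stand-in for the struct page refcount */
	unsigned long entry[ENTRIES];	/* 0 plays the role of pte_none */
};

/* unreliable: a shared table may legitimately contain empty slots */
static bool shared_by_entry(struct pt_page *pt, int i)
{
	return pt->entry[i] != 0;
}

/* reliable: sharing always shows up in the reference count */
static bool shared_by_refcount(struct pt_page *pt)
{
	return pt->refcount > 1;
}

int main(void)
{
	struct pt_page pt = { .refcount = 1 };	/* owner's reference */

	pt.refcount++;	/* a second VMA now shares this PMD page... */
	/* ...but slot 5 was never faulted in, so it is still "none" */
	printf("entry check: %d (wrong)\n", shared_by_entry(&pt, 5));
	printf("refcount check: %d (right)\n", shared_by_refcount(&pt));
	return 0;
}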
[lukas.bulwahn@gmail.com: remove unused local variable dst_entry in copy_hugetlb_page_range()] Link: https://lkml.kernel.org/r/20220822082525.26071-1-lukas.bulwahn@gmail.com Link: https://lkml.kernel.org/r/20220816130553.31406-7-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Signed-off-by: Lukas Bulwahn Reviewed-by: Mike Kravetz Cc: Muchun Song Signed-off-by: Andrew Morton (cherry picked from commit 3aa4ed8040e1535d95c03cef8b52cf11bf0d8546) Signed-off-by: Marcin Wcisło --- mm/hugetlb.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 7e25e852731bf..832fe8451197a 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4769,7 +4769,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) { - pte_t *src_pte, *dst_pte, entry, dst_entry; + pte_t *src_pte, *dst_pte, entry; struct page *ptepage; unsigned long addr; bool cow = is_cow_mapping(src_vma->vm_flags); @@ -4810,28 +4810,23 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, /* * If the pagetables are shared don't copy or take references. - * dst_pte == src_pte is the common case of src/dest sharing. * + * dst_pte == src_pte is the common case of src/dest sharing. * However, src could have 'unshared' and dst shares with - * another vma. If dst_pte !none, this implies sharing. - * Check here before taking page table lock, and once again - * after taking the lock below. + * another vma. So page_count of ptep page is checked instead + * to reliably determine whether pte is shared. */ - dst_entry = huge_ptep_get(dst_pte); - if ((dst_pte == src_pte) || !huge_pte_none(dst_entry)) + if (page_count(virt_to_page(dst_pte)) > 1) continue; dst_ptl = huge_pte_lock(h, dst, dst_pte); src_ptl = huge_pte_lockptr(h, src, src_pte); spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); entry = huge_ptep_get(src_pte); - dst_entry = huge_ptep_get(dst_pte); again: - if (huge_pte_none(entry) || !huge_pte_none(dst_entry)) { + if (huge_pte_none(entry)) { /* - * Skip if src entry none. Also, skip in the - * unlikely case dst entry !none as this implies - * sharing with another vma. + * Skip if src entry none. */ ; } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) { @@ -4911,7 +4906,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, restore_reserve_on_error(h, dst_vma, addr, new); put_page(new); - /* dst_entry won't change as in child */ + /* huge_ptep of dst_pte won't change as in child */ goto again; } hugetlb_install_page(dst_vma, dst_pte, addr, new); From 7fd33d1368b3865133ef7ca5e2900661288b1e69 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Wed, 26 Nov 2025 01:28:33 +0100 Subject: [PATCH 7/7] mm/hugetlb: fix copy_hugetlb_page_range() to use ->pt_share_count MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-46929 cve-bf CVE-2024-57883 commit-author Jane Chu commit 14967a9c7d247841b0312c48dcf8cd29e55a4cc8 upstream-diff | include/linux/mm_types.h Removed the definition of `ptdesc_pmd_is_shared()' function in alignment with stable-5.15 backport 8410996eb6fea116fe1483ed977aacf580eee7b4 (it omits the definition of `ptdesc_pmd_pts_*()' functions family, to which `ptdesc_pmd_is_shared()' belongs). mm/hugetlb.c copy_hugetlb_page_range() 1. 
Used CONFIG_ARCH_WANT_HUGE_PMD_SHARE instead of
CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING, because the latter was introduced
only in the non-backported commit 188cac58a8bcdf82c7f63275b68f7a46871e45d6.
2. Since `ptdesc_pmd_is_shared()' was not defined, read the
`pt_share_count' field directly, as is done in the stable-5.15 backport
8410996eb6fea116fe1483ed977aacf580eee7b4. (Compare changes to
`huge_pmd_unshare()' in `mm/hugetlb.c' between upstream
59d9094df3d79443937add8700b2ef1a866b1081 and stable-5.15
8410996eb6fea116fe1483ed977aacf580eee7b4.)

huge_pmd_unshare()
No change to the conditional. It was arguably not needed upstream either;
it was probably introduced only for the sake of clarity, given the
presence of the `ptdesc_pmd_is_shared()' function, which is missing here.

commit 59d9094df3d79 ("mm: hugetlb: independent PMD page table shared
count") introduced ->pt_share_count dedicated to hugetlb PMD share count
tracking, but omitted fixing copy_hugetlb_page_range(), leaving the
function relying on page_count() for tracking, which no longer works.

When lazy page table copying for hugetlb is disabled, that is, when commit
bcd51a3c679d ("hugetlb: lazy page table copies in fork()") is reverted,
fork()'ing with hugetlb PMD sharing quickly locks up:

[  239.446559] watchdog: BUG: soft lockup - CPU#75 stuck for 27s!
[  239.446611] RIP: 0010:native_queued_spin_lock_slowpath+0x7e/0x2e0
[  239.446631] Call Trace:
[  239.446633]  <TASK>
[  239.446636]  _raw_spin_lock+0x3f/0x60
[  239.446639]  copy_hugetlb_page_range+0x258/0xb50
[  239.446645]  copy_page_range+0x22b/0x2c0
[  239.446651]  dup_mmap+0x3e2/0x770
[  239.446654]  dup_mm.constprop.0+0x5e/0x230
[  239.446657]  copy_process+0xd17/0x1760
[  239.446660]  kernel_clone+0xc0/0x3e0
[  239.446661]  __do_sys_clone+0x65/0xa0
[  239.446664]  do_syscall_64+0x82/0x930
[  239.446668]  ? count_memcg_events+0xd2/0x190
[  239.446671]  ? syscall_trace_enter+0x14e/0x1f0
[  239.446676]  ? syscall_exit_work+0x118/0x150
[  239.446677]  ? arch_exit_to_user_mode_prepare.constprop.0+0x9/0xb0
[  239.446681]  ? clear_bhb_loop+0x30/0x80
[  239.446684]  ? clear_bhb_loop+0x30/0x80
[  239.446686]  entry_SYSCALL_64_after_hwframe+0x76/0x7e

There are two options to resolve the potential latent issue:
1. warn against PMD sharing in copy_hugetlb_page_range(),
2. fix it.
This patch opts for the second option. While at it, simplify the comment;
the details are no longer relevant.

Link: https://lkml.kernel.org/r/20250916004520.1604530-1-jane.chu@oracle.com
Fixes: 59d9094df3d7 ("mm: hugetlb: independent PMD page table shared count")
Signed-off-by: Jane Chu
Reviewed-by: Harry Yoo
Acked-by: Oscar Salvador
Acked-by: David Hildenbrand
Cc: Jann Horn
Cc: Liu Shixin
Cc: Muchun Song
Signed-off-by: Andrew Morton
(cherry picked from commit 14967a9c7d247841b0312c48dcf8cd29e55a4cc8)
Signed-off-by: Marcin Wcisło
---
 mm/hugetlb.c | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 832fe8451197a..df8107c92e254 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4808,16 +4808,11 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 			break;
 		}

-		/*
-		 * If the pagetables are shared don't copy or take references.
-		 *
-		 * dst_pte == src_pte is the common case of src/dest sharing.
-		 * However, src could have 'unshared' and dst shares with
-		 * another vma. So page_count of ptep page is checked instead
-		 * to reliably determine whether pte is shared.
- */ - if (page_count(virt_to_page(dst_pte)) > 1) +#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE + /* If the pagetables are shared, there is nothing to do */ + if (!!atomic_read(&virt_to_page(dst_pte)->pt_share_count)) continue; +#endif dst_ptl = huge_pte_lock(h, dst, dst_pte); src_ptl = huge_pte_lockptr(h, src, src_pte);
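Taken together with patch 2, the series leaves PMD sharing tracked in a
dedicated counter rather than in the generic page refcount. A small
userspace model of the idea (illustrative names, not the kernel API) shows
why: a transient speculative reference, the try_get_folio() analog below,
perturbs the refcount-based check but not the share-count-based one.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct pt_page {
	atomic_int refcount;		/* generic page reference count */
	atomic_int pt_share_count;	/* dedicated PMD-sharing count */
};

static void pmd_share(struct pt_page *pt)
{
	atomic_fetch_add(&pt->pt_share_count, 1);
}

/* old check: fooled by any extra page reference */
static bool shared_by_page_count(struct pt_page *pt)
{
	return atomic_load(&pt->refcount) > 1;
}

/* new check, as in huge_pmd_unshare() and copy_hugetlb_page_range() */
static bool shared_by_share_count(struct pt_page *pt)
{
	return atomic_load(&pt->pt_share_count) != 0;
}

int main(void)
{
	struct pt_page pt;

	atomic_init(&pt.refcount, 1);		/* the table's own reference */
	atomic_init(&pt.pt_share_count, 0);	/* not shared with anyone */

	atomic_fetch_add(&pt.refcount, 1);	/* speculative try_get_folio()-style ref */
	printf("page_count check: %d (false positive)\n", shared_by_page_count(&pt));
	printf("pt_share_count check: %d (correct)\n", shared_by_share_count(&pt));

	pmd_share(&pt);				/* an actual second sharer */
	printf("after real sharing: %d\n", shared_by_share_count(&pt));
	return 0;
}

The `!!atomic_read(...)' test added by this patch corresponds to the
share-count check in the sketch.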