From 9c479db580951c12b44f48039289ee409595e301 Mon Sep 17 00:00:00 2001
From: Brett Mastbergen
Date: Mon, 24 Nov 2025 13:56:34 -0500
Subject: [PATCH] mm: scrape LRU pages for offlined memcgs

jira KERNEL-172
feature Add ability to scrape LRU pages from offlined memcgs
commit-author Yu Zhao
commit-source v1-0001-mm-scrape-LRU-pages-for-offlined-memcgs.patch
commit-source-path Provided by Google Engineering
upstream-diff A few tweaks to the original patch were necessary:
  * Removed unused nid variable from scrape_offlined_memcgs
  * Switched extra2 to 8 (otherwise 'echo 8 > /proc/sys/vm/drop_caches'
    would be rejected)
  * Renamed nr_pages_to_scrape to offlined_memcg_nr_pages in the
    !CONFIG_MEMCG case to match the CONFIG_MEMCG case
  * Added 'return 0' to scrape_offlined_memcgs in the !CONFIG_MEMCG case

For offlined memcgs, kmem (slab) is reparented so that it does not hold
refcnts which would in turn prevent those memcgs from being released.
However, reparenting does not apply to LRU pages (pagecache), and
therefore they need to be scraped as well for offlined memcgs.
"echo 8 > /proc/sys/vm/drop_caches" was introduced for this reason.
And unlike "echo 1", it does not have performance impact on online
memcgs in terms of zapping pagecache.

Signed-off-by: Yu Zhao
Signed-off-by: Brett Mastbergen
---
 Documentation/sysctl/vm.txt | 10 ++++++++++
 fs/drop_caches.c            | 20 ++++++++++++++++++++
 include/linux/memcontrol.h  | 22 ++++++++++++++++++++++
 kernel/sysctl.c             |  3 ++-
 mm/memcontrol.c             |  6 ++++++
 mm/vmscan.c                 | 34 ++++++++++++++++++++++++++++++++++
 6 files changed, 94 insertions(+), 1 deletion(-)

diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index 6cac7f0744f80..b4814b91894dc 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -225,6 +225,8 @@ To free reclaimable slab objects (includes dentries and inodes):
 	echo 2 > /proc/sys/vm/drop_caches
 To free slab objects and pagecache:
 	echo 3 > /proc/sys/vm/drop_caches
+To scrape LRU pages from offlined memcgs:
+	echo 8 > /proc/sys/vm/drop_caches
 
 This is a non-destructive operation and will not free any dirty objects.
 To increase the number of objects freed by this operation, the user may run
@@ -249,6 +251,14 @@ used:
 These are informational only. They do not mean that anything is wrong
 with your system.  To disable them, echo 4 (bit 3) into drop_caches.
 
+Note that for offlined memcgs, kmem (slab) is reparented so that it
+does not hold refcnts which would in turn prevent those memcgs from
+being released. However, reparenting does not apply to LRU pages
+(pagecache), and therefore they need to be scraped as well for
+offlined memcgs. "echo 8" was introduced for this reason. And unlike
+"echo 1", it does not have performance impact on online memcgs in
+terms of zapping pagecache.
+
 ==============================================================
 
 extfrag_threshold
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index f0ef2cea50ac8..07fb9dfb9d5d9 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -10,6 +10,8 @@
 #include
 #include
 #include
+#include <linux/memcontrol.h>
+#include <linux/backing-dev.h>
 #include "internal.h"
 
 /* A global variable is a bit ugly, but it keeps the code simple */
@@ -66,6 +68,24 @@ int drop_caches_sysctl_handler(struct ctl_table *table, int write,
 			drop_slab();
 			count_vm_event(DROP_SLAB);
 		}
+		if (sysctl_drop_caches & 8) {
+			int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
+			unsigned long target = offlined_memcg_nr_pages();
+
+			while (nr_retries) {
+				unsigned long progress = scrape_offlined_memcgs(target);
+
+				if (progress >= target)
+					break;
+
+				if (!progress) {
+					congestion_wait(BLK_RW_ASYNC, HZ / 10);
+					nr_retries--;
+				}
+
+				target -= progress;
+			}
+		}
 		if (!stfu) {
 			pr_info("%s (%d): drop_caches: %d\n",
 				current->comm, task_pid_nr(current),
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index ab77fb4b0a073..9113177dd7246 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -73,6 +73,8 @@ struct mem_cgroup_reclaim_cookie {
 	unsigned int generation;
 };
 
+#define MEM_CGROUP_RECLAIM_RETRIES	5
+
 #ifdef CONFIG_MEMCG
 
 #define MEM_CGROUP_ID_SHIFT	16
@@ -1150,6 +1152,15 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
 						gfp_t gfp_mask,
 						unsigned long *total_scanned);
 
+static inline unsigned long offlined_memcg_nr_pages(void)
+{
+	extern atomic_t nr_offlined_memcgs;
+
+	return atomic_read(&nr_offlined_memcgs) * MEMCG_CHARGE_BATCH;
+}
+
+unsigned long scrape_offlined_memcgs(unsigned long nr_to_reclaim);
+
 #else /* CONFIG_MEMCG */
 
 #define MEM_CGROUP_ID_SHIFT	0
@@ -1526,6 +1537,17 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
 {
 	return 0;
 }
+
+static inline unsigned long offlined_memcg_nr_pages(void)
+{
+	return 0;
+}
+
+static inline unsigned long scrape_offlined_memcgs(unsigned long nr_to_reclaim)
+{
+	return 0;
+}
+
 #endif /* CONFIG_MEMCG */
 
 static inline void __inc_lruvec_kmem_state(void *p, enum node_stat_item idx)
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index e72da0181fc2d..53bdc9b687494 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -128,6 +128,7 @@ static int sixty = 60;
 static int __maybe_unused neg_one = -1;
 static int __maybe_unused two = 2;
 static int __maybe_unused four = 4;
+static int __maybe_unused eight = 8;
 static unsigned long zero_ul;
 static unsigned long one_ul = 1;
 static unsigned long long_max = LONG_MAX;
@@ -1483,7 +1484,7 @@ static struct ctl_table vm_table[] = {
 		.mode		= 0644,
 		.proc_handler	= drop_caches_sysctl_handler,
 		.extra1		= SYSCTL_ONE,
-		.extra2		= &four,
+		.extra2		= &eight,
 	},
 #ifdef CONFIG_COMPACTION
 	{
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 6e2a077af4c16..ca92e430242b4 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5594,6 +5594,8 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
 	return 0;
 }
 
+atomic_t nr_offlined_memcgs = ATOMIC_INIT(0);
+
 static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
@@ -5621,6 +5623,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
 
 	drain_all_stock(memcg);
 	memcg_percpu_stats_disable(memcg);
+
+	atomic_inc(&nr_offlined_memcgs);
 }
 
 static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
@@ -5628,6 +5632,8 @@ static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
 	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
 
 	invalidate_reclaim_iterators(memcg);
+
+	atomic_dec(&nr_offlined_memcgs);
 }
 
 static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 0a9ac5c53b567..7b928c94ca8aa 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -123,6 +123,9 @@ struct scan_control {
 	/* The file pages on the current node are dangerously low */
 	unsigned int file_is_tiny:1;
 
+	/* Scrape LRU pages from offlined memcgs */
+	unsigned int scrape_offlined_memcgs:1;
+
 	/* Allocation order */
 	s8 order;
 
@@ -3034,6 +3037,9 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
 				memcg_memory_event(memcg, MEMCG_LOW);
 		}
 
+		if (sc->scrape_offlined_memcgs && mem_cgroup_online(memcg))
+			continue;
+
 		reclaimed = sc->nr_reclaimed;
 		scanned = sc->nr_scanned;
 
@@ -4736,3 +4742,31 @@ void check_move_unevictable_pages(struct pagevec *pvec)
 	}
 }
 EXPORT_SYMBOL_GPL(check_move_unevictable_pages);
+
+#ifdef CONFIG_MEMCG
+unsigned long scrape_offlined_memcgs(unsigned long nr_to_reclaim)
+{
+	unsigned int flags;
+	unsigned long nr_reclaimed;
+	struct scan_control sc = {
+		.nr_to_reclaim = max(nr_to_reclaim, SWAP_CLUSTER_MAX),
+		.gfp_mask = GFP_KERNEL,
+		.target_mem_cgroup = root_mem_cgroup,
+		.reclaim_idx = MAX_NR_ZONES - 1,
+		.may_writepage = true,
+		.may_unmap = true,
+		.scrape_offlined_memcgs = true,
+	};
+	struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
+
+	set_task_reclaim_state(current, &sc.reclaim_state);
+	flags = memalloc_noreclaim_save();
+
+	nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
+
+	memalloc_noreclaim_restore(flags);
+	set_task_reclaim_state(current, NULL);
+
+	return nr_reclaimed;
+}
+#endif
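
Usage note (not part of the patch): the new drop_caches value is exercised from
userspace exactly like the existing ones, e.g. 'echo 8 > /proc/sys/vm/drop_caches'
as root. Below is a minimal, hypothetical C sketch of the same write for callers
that prefer not to shell out; it assumes only the interface this patch adds
(bit 8 accepted by the drop_caches sysctl) plus standard libc calls, and it must
be run as root to succeed.

/* Hypothetical userspace helper, not part of the kernel patch above.
 * Equivalent to "echo 8 > /proc/sys/vm/drop_caches": asks the kernel
 * to scrape LRU pages from offlined memcgs.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/proc/sys/vm/drop_caches";
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror(path);
		return 1;
	}
	if (write(fd, "8", 1) != 1) {
		perror("write");
		close(fd);
		return 1;
	}
	close(fd);
	return 0;
}

On the kernel side this write lands in the retry loop added to
drop_caches_sysctl_handler above, which keeps calling scrape_offlined_memcgs()
until roughly offlined_memcg_nr_pages() worth of pages has been reclaimed or the
retries are exhausted.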