From b41f544830299a08a5bd25c0432f20d5528cf501 Mon Sep 17 00:00:00 2001 From: Brett Mastbergen Date: Mon, 24 Nov 2025 13:48:57 -0500 Subject: [PATCH] mm: scrape LRU pages for offlined memcgs jira KERNEL-173 feature Add ability to scrape LRU pages from offlined memcgs commit-author: Yu Zhao commit-source v1-0001-mm-scrape-LRU-pages-for-offlined-memcgs.patch commit-source-path Provided by Google Engineering upstream-diff A few tweaks to the original patch were necessary: * Format changes because Documentation/sysctl/vm.txt has been changed to Documentation/admin-guide/sysctl/vm.rst * Removed unused nid variable from scrape_offlined_memcgs * Switched drop_caches_sysctl_handler to use SYSCTL_EIGHT (otherwise 'echo 8 > /proc/sys/vm/drop_caches' would be rejected) * Renamed nr_pages_to_scrape to offlined_memcg_nr_pages in the !CONFIG_MEMCG case to match the CONFIG_MEMCG case * Added 'return 0' to scrape_offlined_memcgs in the !CONFIG_MEMCG case For offlined memcgs, kmem (slab) is reparented so that it does not hold refcnts which would in turn prevent those memcgs from being released. However, reparenting does not apply to LRU pages (pagecache), and therefore they need to be scraped as well for offlined memcgs. "echo 8 > /proc/sys/vm/drop_caches" was introduced for this reason. And unlike "echo 1", it does not have performance impact on online memcgs in terms of zapping pagecache. 
Signed-off-by: Yu Zhao Signed-off-by: Brett Mastbergen --- Documentation/admin-guide/sysctl/vm.rst | 12 +++++++++ fs/drop_caches.c | 20 +++++++++++++++ fs/proc/proc_sysctl.c | 2 +- include/linux/memcontrol.h | 22 ++++++++++++++++ include/linux/sysctl.h | 11 ++++---- kernel/sysctl.c | 2 +- mm/memcontrol.c | 6 +++++ mm/vmscan.c | 34 +++++++++++++++++++++++++ 8 files changed, 102 insertions(+), 7 deletions(-) diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst index 36cf21f3b7ab3..6e0428843c5c6 100644 --- a/Documentation/admin-guide/sysctl/vm.rst +++ b/Documentation/admin-guide/sysctl/vm.rst @@ -243,6 +243,10 @@ To free slab objects and pagecache:: echo 3 > /proc/sys/vm/drop_caches +To scrape LRU pages from offlined memcgs:: + + echo 8 > /proc/sys/vm/drop_caches + This is a non-destructive operation and will not free any dirty objects. To increase the number of objects freed by this operation, the user may run `sync` prior to writing to /proc/sys/vm/drop_caches. This will minimize the @@ -266,6 +270,14 @@ used:: These are informational only. They do not mean that anything is wrong with your system. To disable them, echo 4 (bit 2) into drop_caches. +Note that for offlined memcgs, kmem (slab) is reparented so that it +does not hold refcnts which would in turn prevent those memcgs from +being released. However, reparenting does not apply to LRU pages +(pagecache), and therefore they need to be scraped as well for +offlined memcgs. "echo 8" was introduced for this reason. And unlike +"echo 1", it does not have performance impact on online memcgs in +terms of zapping pagecache. 
+ extfrag_threshold ================= diff --git a/fs/drop_caches.c b/fs/drop_caches.c index e619c31b6bd92..33c4a71222c93 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -10,6 +10,8 @@ #include <linux/writeback.h> #include <linux/sysctl.h> #include <linux/gfp.h> +#include <linux/backing-dev.h> +#include <linux/memcontrol.h> #include "internal.h" /* A global variable is a bit ugly, but it keeps the code simple */ @@ -66,6 +68,24 @@ int drop_caches_sysctl_handler(struct ctl_table *table, int write, drop_slab(); count_vm_event(DROP_SLAB); } + if (sysctl_drop_caches & 8) { + int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; + unsigned long target = offlined_memcg_nr_pages(); + + while (nr_retries) { + unsigned long progress = scrape_offlined_memcgs(target); + + if (progress >= target) + break; + + if (!progress) { + congestion_wait(BLK_RW_ASYNC, HZ / 10); + nr_retries--; + } + + target -= progress; + } + } if (!stfu) { pr_info("%s (%d): drop_caches: %d\n", current->comm, task_pid_nr(current), diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 0b7a00ed6c49b..c81262675ac93 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -26,7 +26,7 @@ static const struct file_operations proc_sys_dir_file_operations; static const struct inode_operations proc_sys_dir_operations; /* shared constants to be used in various sysctls */ -const int sysctl_vals[] = { -1, 0, 1, 2, 4, 100, 200, 1000, 3000, INT_MAX }; +const int sysctl_vals[] = { -1, 0, 1, 2, 4, 8, 100, 200, 1000, 3000, INT_MAX }; EXPORT_SYMBOL(sysctl_vals); /* Support for permanently empty directories */ diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 5f847f6e30a04..2bcda053bd87f 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -58,6 +58,8 @@ struct mem_cgroup_reclaim_cookie { unsigned int generation; }; +#define MEM_CGROUP_RECLAIM_RETRIES 5 + #ifdef CONFIG_MEMCG #define MEM_CGROUP_ID_SHIFT 16 @@ -1137,6 +1139,15 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, gfp_t gfp_mask, unsigned long *total_scanned); 
+static inline unsigned long offlined_memcg_nr_pages(void) +{ + extern atomic_t nr_offlined_memcgs; + + return atomic_read(&nr_offlined_memcgs) * MEMCG_CHARGE_BATCH; +} + +unsigned long scrape_offlined_memcgs(unsigned long nr_to_reclaim); + #else /* CONFIG_MEMCG */ #define MEM_CGROUP_ID_SHIFT 0 @@ -1545,6 +1556,17 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, { return 0; } + +static inline unsigned long offlined_memcg_nr_pages(void) +{ + return 0; +} + +static inline unsigned long scrape_offlined_memcgs(unsigned long nr_to_reclaim) +{ + return 0; +} + #endif /* CONFIG_MEMCG */ static inline void __inc_lruvec_kmem_state(void *p, enum node_stat_item idx) diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 7ee273ee23cfe..cc535c263f7d5 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -43,11 +43,12 @@ struct ctl_dir; #define SYSCTL_ONE ((void *)&sysctl_vals[2]) #define SYSCTL_TWO ((void *)&sysctl_vals[3]) #define SYSCTL_FOUR ((void *)&sysctl_vals[4]) -#define SYSCTL_ONE_HUNDRED ((void *)&sysctl_vals[5]) -#define SYSCTL_TWO_HUNDRED ((void *)&sysctl_vals[6]) -#define SYSCTL_ONE_THOUSAND ((void *)&sysctl_vals[7]) -#define SYSCTL_THREE_THOUSAND ((void *)&sysctl_vals[8]) -#define SYSCTL_INT_MAX ((void *)&sysctl_vals[9]) +#define SYSCTL_EIGHT ((void *)&sysctl_vals[5]) +#define SYSCTL_ONE_HUNDRED ((void *)&sysctl_vals[6]) +#define SYSCTL_TWO_HUNDRED ((void *)&sysctl_vals[7]) +#define SYSCTL_ONE_THOUSAND ((void *)&sysctl_vals[8]) +#define SYSCTL_THREE_THOUSAND ((void *)&sysctl_vals[9]) +#define SYSCTL_INT_MAX ((void *)&sysctl_vals[10]) extern const int sysctl_vals[]; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 538dfb457e076..f772c413e4e04 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2627,7 +2627,7 @@ static struct ctl_table vm_table[] = { .mode = 0200, .proc_handler = drop_caches_sysctl_handler, .extra1 = SYSCTL_ONE, - .extra2 = SYSCTL_FOUR, + .extra2 = SYSCTL_EIGHT, }, #ifdef CONFIG_COMPACTION { 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e5c776740504e..a60057a463b3b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5350,6 +5350,8 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) return -ENOMEM; } +atomic_t nr_offlined_memcgs = ATOMIC_INIT(0); + static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); @@ -5377,6 +5379,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) drain_all_stock(memcg); mem_cgroup_id_put(memcg); + + atomic_inc(&nr_offlined_memcgs); } static void mem_cgroup_css_released(struct cgroup_subsys_state *css) @@ -5384,6 +5388,8 @@ static void mem_cgroup_css_released(struct cgroup_subsys_state *css) struct mem_cgroup *memcg = mem_cgroup_from_css(css); invalidate_reclaim_iterators(memcg); + + atomic_dec(&nr_offlined_memcgs); } static void mem_cgroup_css_free(struct cgroup_subsys_state *css) diff --git a/mm/vmscan.c b/mm/vmscan.c index 4530549b2b843..cc8885fd40fef 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -123,6 +123,9 @@ struct scan_control { /* The file pages on the current node are dangerously low */ unsigned int file_is_tiny:1; + /* Scrape LRU pages from offlined memcgs */ + unsigned int scrape_offlined_memcgs:1; + /* Always discard instead of demoting to lower tier memory */ unsigned int no_demotion:1; @@ -3092,6 +3095,9 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc) memcg_memory_event(memcg, MEMCG_LOW); } + if (sc->scrape_offlined_memcgs && mem_cgroup_online(memcg)) + continue; + reclaimed = sc->nr_reclaimed; scanned = sc->nr_scanned; @@ -4816,3 +4822,31 @@ void check_move_unevictable_pages(struct pagevec *pvec) } } EXPORT_SYMBOL_GPL(check_move_unevictable_pages); + +#ifdef CONFIG_MEMCG +unsigned long scrape_offlined_memcgs(unsigned long nr_to_reclaim) +{ + unsigned int flags; + unsigned long nr_reclaimed; + struct scan_control sc = { + .nr_to_reclaim = max(nr_to_reclaim, 
SWAP_CLUSTER_MAX), + .gfp_mask = GFP_KERNEL, + .target_mem_cgroup = root_mem_cgroup, + .reclaim_idx = MAX_NR_ZONES - 1, + .may_writepage = true, + .may_unmap = true, + .scrape_offlined_memcgs = true, + }; + struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); + + set_task_reclaim_state(current, &sc.reclaim_state); + flags = memalloc_noreclaim_save(); + + nr_reclaimed = do_try_to_free_pages(zonelist, &sc); + + memalloc_noreclaim_restore(flags); + set_task_reclaim_state(current, NULL); + + return nr_reclaimed; +} +#endif