diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 6cac7f0744f80..b4814b91894dc 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -225,6 +225,8 @@ To free reclaimable slab objects (includes dentries and inodes): echo 2 > /proc/sys/vm/drop_caches To free slab objects and pagecache: echo 3 > /proc/sys/vm/drop_caches +To scrape LRU pages from offlined memcgs: + echo 8 > /proc/sys/vm/drop_caches This is a non-destructive operation and will not free any dirty objects. To increase the number of objects freed by this operation, the user may run @@ -249,6 +251,14 @@ used: These are informational only. They do not mean that anything is wrong with your system. To disable them, echo 4 (bit 3) into drop_caches. +Note that for offlined memcgs, kmem (slab) is reparented so that it +does not hold refcnts which would in turn prevent those memcgs from +being released. However, reparenting does not apply to LRU pages +(pagecache), and therefore they need to be scraped as well for +offlined memcgs. "echo 8" was introduced for this reason. And unlike +"echo 1", it does not have performance impact on online memcgs in +terms of zapping pagecache. + ============================================================== extfrag_threshold diff --git a/fs/drop_caches.c b/fs/drop_caches.c index f0ef2cea50ac8..07fb9dfb9d5d9 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -10,6 +10,8 @@ #include #include #include +#include +#include #include "internal.h" /* A global variable is a bit ugly, but it keeps the code simple */ @@ -66,6 +68,24 @@ int drop_caches_sysctl_handler(struct ctl_table *table, int write, drop_slab(); count_vm_event(DROP_SLAB); } + if (sysctl_drop_caches & 8) { + int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; + unsigned long target = offlined_memcg_nr_pages(); + + while (nr_retries) { + unsigned long progress = scrape_offlined_memcgs(target); + + if (progress >= target) + break; + + if (!progress) { + congestion_wait(BLK_RW_ASYNC, HZ / 10); + nr_retries--; + } + + target -= progress; + } + } if (!stfu) { pr_info("%s (%d): drop_caches: %d\n", current->comm, task_pid_nr(current), diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index ab77fb4b0a073..9113177dd7246 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -73,6 +73,8 @@ struct mem_cgroup_reclaim_cookie { unsigned int generation; }; +#define MEM_CGROUP_RECLAIM_RETRIES 5 + #ifdef CONFIG_MEMCG #define MEM_CGROUP_ID_SHIFT 16 @@ -1150,6 +1152,15 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, gfp_t gfp_mask, unsigned long *total_scanned); +static inline unsigned long offlined_memcg_nr_pages(void) +{ + extern atomic_t nr_offlined_memcgs; + + return atomic_read(&nr_offlined_memcgs) * MEMCG_CHARGE_BATCH; +} + +unsigned long scrape_offlined_memcgs(unsigned long nr_to_reclaim); + #else /* CONFIG_MEMCG */ #define MEM_CGROUP_ID_SHIFT 0 @@ -1526,6 +1537,17 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, { return 0; } + +static inline unsigned long offlined_memcg_nr_pages(void) +{ + return 0; +} + +static inline unsigned long scrape_offlined_memcgs(unsigned long nr_to_reclaim) +{ + return 0; +} + #endif /* CONFIG_MEMCG */ static inline void __inc_lruvec_kmem_state(void *p, enum node_stat_item idx) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index e72da0181fc2d..53bdc9b687494 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -128,6 +128,7 @@ static int sixty = 60; static int __maybe_unused neg_one = -1; static int __maybe_unused two = 2; static int __maybe_unused four = 4; +static int __maybe_unused eight = 8; static unsigned long zero_ul; static unsigned long one_ul = 1; static unsigned long long_max = LONG_MAX; @@ -1483,7 +1484,7 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = drop_caches_sysctl_handler, .extra1 = SYSCTL_ONE, - .extra2 = &four, + .extra2 = &eight, }, #ifdef CONFIG_COMPACTION { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6e2a077af4c16..ca92e430242b4 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5594,6 +5594,8 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) return 0; } +atomic_t nr_offlined_memcgs = ATOMIC_INIT(0); + static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); @@ -5621,6 +5623,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) drain_all_stock(memcg); memcg_percpu_stats_disable(memcg); + + atomic_inc(&nr_offlined_memcgs); } static void mem_cgroup_css_released(struct cgroup_subsys_state *css) @@ -5628,6 +5632,8 @@ static void mem_cgroup_css_released(struct cgroup_subsys_state *css) struct mem_cgroup *memcg = mem_cgroup_from_css(css); invalidate_reclaim_iterators(memcg); + + atomic_dec(&nr_offlined_memcgs); } static void mem_cgroup_css_free(struct cgroup_subsys_state *css) diff --git a/mm/vmscan.c b/mm/vmscan.c index 0a9ac5c53b567..7b928c94ca8aa 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -123,6 +123,9 @@ struct scan_control { /* The file pages on the current node are dangerously low */ unsigned int file_is_tiny:1; + /* Scrape LRU pages from offlined memcgs */ + unsigned int scrape_offlined_memcgs:1; + /* Allocation order */ s8 order; @@ -3034,6 +3037,9 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc) memcg_memory_event(memcg, MEMCG_LOW); } + if (sc->scrape_offlined_memcgs && mem_cgroup_online(memcg)) + continue; + reclaimed = sc->nr_reclaimed; scanned = sc->nr_scanned; @@ -4736,3 +4742,31 @@ void check_move_unevictable_pages(struct pagevec *pvec) } } EXPORT_SYMBOL_GPL(check_move_unevictable_pages); + +#ifdef CONFIG_MEMCG +unsigned long scrape_offlined_memcgs(unsigned long nr_to_reclaim) +{ + unsigned int flags; + unsigned long nr_reclaimed; + struct scan_control sc = { + .nr_to_reclaim = max(nr_to_reclaim, SWAP_CLUSTER_MAX), + .gfp_mask = GFP_KERNEL, + .target_mem_cgroup = root_mem_cgroup, + .reclaim_idx = MAX_NR_ZONES - 1, + .may_writepage = true, + .may_unmap = true, + .scrape_offlined_memcgs = true, + }; + struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); + + set_task_reclaim_state(current, &sc.reclaim_state); + flags = memalloc_noreclaim_save(); + + nr_reclaimed = do_try_to_free_pages(zonelist, &sc); + + memalloc_noreclaim_restore(flags); + set_task_reclaim_state(current, NULL); + + return nr_reclaimed; +} +#endif