diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst index 36cf21f3b7ab3..6e0428843c5c6 100644 --- a/Documentation/admin-guide/sysctl/vm.rst +++ b/Documentation/admin-guide/sysctl/vm.rst @@ -243,6 +243,10 @@ To free slab objects and pagecache:: echo 3 > /proc/sys/vm/drop_caches +To scrape LRU pages from offlined memcgs:: + + echo 8 > /proc/sys/vm/drop_caches + This is a non-destructive operation and will not free any dirty objects. To increase the number of objects freed by this operation, the user may run `sync` prior to writing to /proc/sys/vm/drop_caches. This will minimize the @@ -266,6 +270,14 @@ used:: These are informational only. They do not mean that anything is wrong with your system. To disable them, echo 4 (bit 2) into drop_caches. +Note that for offlined memcgs, kmem (slab) is reparented so that it +does not hold references that would in turn prevent those memcgs from +being released. However, reparenting does not apply to LRU pages +(pagecache), and therefore they need to be scraped as well for +offlined memcgs. "echo 8" was introduced for this reason. Unlike +"echo 1", it has no performance impact on online memcgs, because +it does not zap their pagecache. 
+ extfrag_threshold ================= diff --git a/fs/drop_caches.c b/fs/drop_caches.c index e619c31b6bd92..33c4a71222c93 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -10,6 +10,8 @@ #include #include #include +#include +#include #include "internal.h" /* A global variable is a bit ugly, but it keeps the code simple */ @@ -66,6 +68,24 @@ int drop_caches_sysctl_handler(struct ctl_table *table, int write, drop_slab(); count_vm_event(DROP_SLAB); } + if (sysctl_drop_caches & 8) { /* bit 3: scrape LRU pages of offlined memcgs */ + int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; + unsigned long target = offlined_memcg_nr_pages(); + + while (nr_retries) { + unsigned long progress = scrape_offlined_memcgs(target); + + if (progress >= target) + break; /* this pass covered the whole remaining target */ + + if (!progress) { /* nothing reclaimed: wait, then use up one retry */ + congestion_wait(BLK_RW_ASYNC, HZ / 10); + nr_retries--; + } + + target -= progress; /* the next pass asks only for the remainder */ + } + } if (!stfu) { pr_info("%s (%d): drop_caches: %d\n", current->comm, task_pid_nr(current), diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 0b7a00ed6c49b..c81262675ac93 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -26,7 +26,7 @@ static const struct file_operations proc_sys_dir_file_operations; static const struct inode_operations proc_sys_dir_operations; /* shared constants to be used in various sysctls */ -const int sysctl_vals[] = { -1, 0, 1, 2, 4, 100, 200, 1000, 3000, INT_MAX }; +const int sysctl_vals[] = { -1, 0, 1, 2, 4, 8, 100, 200, 1000, 3000, INT_MAX }; EXPORT_SYMBOL(sysctl_vals); /* Support for permanently empty directories */ diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 5f847f6e30a04..2bcda053bd87f 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -58,6 +58,8 @@ struct mem_cgroup_reclaim_cookie { unsigned int generation; }; +#define MEM_CGROUP_RECLAIM_RETRIES 5 + #ifdef CONFIG_MEMCG #define MEM_CGROUP_ID_SHIFT 16 @@ -1137,6 +1139,15 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, gfp_t gfp_mask, unsigned long *total_scanned); 
+static inline unsigned long offlined_memcg_nr_pages(void) +{ + extern atomic_t nr_offlined_memcgs; + + return atomic_read(&nr_offlined_memcgs) * MEMCG_CHARGE_BATCH; /* offlined-memcg count scaled by MEMCG_CHARGE_BATCH */ +} + +unsigned long scrape_offlined_memcgs(unsigned long nr_to_reclaim); + #else /* CONFIG_MEMCG */ #define MEM_CGROUP_ID_SHIFT 0 @@ -1545,6 +1556,17 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, { return 0; } + +static inline unsigned long offlined_memcg_nr_pages(void) +{ + return 0; /* !CONFIG_MEMCG stub: always reports zero pages */ +} + +static inline unsigned long scrape_offlined_memcgs(unsigned long nr_to_reclaim) +{ + return 0; /* !CONFIG_MEMCG stub: always reports zero progress */ +} + #endif /* CONFIG_MEMCG */ static inline void __inc_lruvec_kmem_state(void *p, enum node_stat_item idx) diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 7ee273ee23cfe..cc535c263f7d5 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -43,11 +43,12 @@ struct ctl_dir; #define SYSCTL_ONE ((void *)&sysctl_vals[2]) #define SYSCTL_TWO ((void *)&sysctl_vals[3]) #define SYSCTL_FOUR ((void *)&sysctl_vals[4]) -#define SYSCTL_ONE_HUNDRED ((void *)&sysctl_vals[5]) -#define SYSCTL_TWO_HUNDRED ((void *)&sysctl_vals[6]) -#define SYSCTL_ONE_THOUSAND ((void *)&sysctl_vals[7]) -#define SYSCTL_THREE_THOUSAND ((void *)&sysctl_vals[8]) -#define SYSCTL_INT_MAX ((void *)&sysctl_vals[9]) +#define SYSCTL_EIGHT ((void *)&sysctl_vals[5]) /* new slot; every later index below shifts up by one */ +#define SYSCTL_ONE_HUNDRED ((void *)&sysctl_vals[6]) +#define SYSCTL_TWO_HUNDRED ((void *)&sysctl_vals[7]) +#define SYSCTL_ONE_THOUSAND ((void *)&sysctl_vals[8]) +#define SYSCTL_THREE_THOUSAND ((void *)&sysctl_vals[9]) +#define SYSCTL_INT_MAX ((void *)&sysctl_vals[10]) extern const int sysctl_vals[]; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 538dfb457e076..f772c413e4e04 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2627,7 +2627,7 @@ static struct ctl_table vm_table[] = { .mode = 0200, .proc_handler = drop_caches_sysctl_handler, .extra1 = SYSCTL_ONE, - .extra2 = SYSCTL_FOUR, + .extra2 = SYSCTL_EIGHT, }, #ifdef CONFIG_COMPACTION { 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e5c776740504e..a60057a463b3b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5350,6 +5350,8 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) return -ENOMEM; } +atomic_t nr_offlined_memcgs = ATOMIC_INIT(0); + static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); @@ -5377,6 +5379,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) drain_all_stock(memcg); mem_cgroup_id_put(memcg); + + atomic_inc(&nr_offlined_memcgs); /* paired with the atomic_dec() in mem_cgroup_css_released() */ } static void mem_cgroup_css_released(struct cgroup_subsys_state *css) @@ -5384,6 +5388,8 @@ static void mem_cgroup_css_released(struct cgroup_subsys_state *css) struct mem_cgroup *memcg = mem_cgroup_from_css(css); invalidate_reclaim_iterators(memcg); + + atomic_dec(&nr_offlined_memcgs); /* paired with the atomic_inc() in mem_cgroup_css_offline() */ } static void mem_cgroup_css_free(struct cgroup_subsys_state *css) diff --git a/mm/vmscan.c b/mm/vmscan.c index 4530549b2b843..cc8885fd40fef 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -123,6 +123,9 @@ struct scan_control { /* The file pages on the current node are dangerously low */ unsigned int file_is_tiny:1; + /* Scrape LRU pages from offlined memcgs */ + unsigned int scrape_offlined_memcgs:1; + /* Always discard instead of demoting to lower tier memory */ unsigned int no_demotion:1; @@ -3092,6 +3095,9 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc) memcg_memory_event(memcg, MEMCG_LOW); } + if (sc->scrape_offlined_memcgs && mem_cgroup_online(memcg)) + continue; /* scrape mode: online memcgs are skipped */ + reclaimed = sc->nr_reclaimed; scanned = sc->nr_scanned; @@ -4816,3 +4822,31 @@ void check_move_unevictable_pages(struct pagevec *pvec) } } EXPORT_SYMBOL_GPL(check_move_unevictable_pages); + +#ifdef CONFIG_MEMCG +unsigned long scrape_offlined_memcgs(unsigned long nr_to_reclaim) +{ + unsigned int flags; + unsigned long nr_reclaimed; + struct scan_control sc = { + .nr_to_reclaim = max(nr_to_reclaim, 
SWAP_CLUSTER_MAX), /* request at least SWAP_CLUSTER_MAX pages */ + .gfp_mask = GFP_KERNEL, + .target_mem_cgroup = root_mem_cgroup, + .reclaim_idx = MAX_NR_ZONES - 1, + .may_writepage = true, + .may_unmap = true, + .scrape_offlined_memcgs = true, /* makes shrink_node_memcgs() skip online memcgs */ + }; + struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); + + set_task_reclaim_state(current, &sc.reclaim_state); + flags = memalloc_noreclaim_save(); + + nr_reclaimed = do_try_to_free_pages(zonelist, &sc); + + memalloc_noreclaim_restore(flags); + set_task_reclaim_state(current, NULL); + + return nr_reclaimed; /* value reported by do_try_to_free_pages() */ +} +#endif