Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions Documentation/admin-guide/sysctl/vm.rst
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,10 @@ To free slab objects and pagecache::

echo 3 > /proc/sys/vm/drop_caches

To scrape LRU pages from offlined memcgs::

echo 8 > /proc/sys/vm/drop_caches

This is a non-destructive operation and will not free any dirty objects.
To increase the number of objects freed by this operation, the user may run
`sync` prior to writing to /proc/sys/vm/drop_caches. This will minimize the
Expand All @@ -266,6 +270,14 @@ used::
These are informational only. They do not mean that anything is wrong
with your system. To disable them, echo 4 (bit 2) into drop_caches.

Note that for offlined memcgs, kmem (slab) is reparented so that it
does not hold references that would in turn prevent those memcgs from
being released. However, reparenting does not apply to LRU pages
(pagecache), so those pages must be scraped from offlined memcgs as
well. Writing 8 (bit 3) to drop_caches does exactly that. Unlike
"echo 1", it does not zap the pagecache of online memcgs and therefore
has no performance impact on them.


extfrag_threshold
=================
Expand Down
20 changes: 20 additions & 0 deletions fs/drop_caches.c
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
#include <linux/writeback.h>
#include <linux/sysctl.h>
#include <linux/gfp.h>
#include <linux/memcontrol.h>
#include <linux/backing-dev.h>
#include "internal.h"

/* A global variable is a bit ugly, but it keeps the code simple */
Expand Down Expand Up @@ -66,6 +68,24 @@ int drop_caches_sysctl_handler(struct ctl_table *table, int write,
drop_slab();
count_vm_event(DROP_SLAB);
}
if (sysctl_drop_caches & 8) {
int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
unsigned long target = offlined_memcg_nr_pages();

while (nr_retries) {
unsigned long progress = scrape_offlined_memcgs(target);

if (progress >= target)
break;

if (!progress) {
congestion_wait(BLK_RW_ASYNC, HZ / 10);
nr_retries--;
}

target -= progress;
}
}
if (!stfu) {
pr_info("%s (%d): drop_caches: %d\n",
current->comm, task_pid_nr(current),
Expand Down
2 changes: 1 addition & 1 deletion fs/proc/proc_sysctl.c
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ static const struct file_operations proc_sys_dir_file_operations;
static const struct inode_operations proc_sys_dir_operations;

/* shared constants to be used in various sysctls */
const int sysctl_vals[] = { -1, 0, 1, 2, 4, 100, 200, 1000, 3000, INT_MAX };
const int sysctl_vals[] = { -1, 0, 1, 2, 4, 8, 100, 200, 1000, 3000, INT_MAX };
EXPORT_SYMBOL(sysctl_vals);

/* Support for permanently empty directories */
Expand Down
22 changes: 22 additions & 0 deletions include/linux/memcontrol.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,8 @@ struct mem_cgroup_reclaim_cookie {
unsigned int generation;
};

#define MEM_CGROUP_RECLAIM_RETRIES 5

#ifdef CONFIG_MEMCG

#define MEM_CGROUP_ID_SHIFT 16
Expand Down Expand Up @@ -1137,6 +1139,15 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
gfp_t gfp_mask,
unsigned long *total_scanned);

/* Counter maintained by mm/memcontrol.c; declared here for the helper below. */
extern atomic_t nr_offlined_memcgs;

/**
 * offlined_memcg_nr_pages - page target for scraping offlined memcgs
 *
 * Scales the current value of nr_offlined_memcgs by MEMCG_CHARGE_BATCH.
 *
 * Return: the counter multiplied by MEMCG_CHARGE_BATCH, or 0 when the
 * counter is zero or negative.
 */
static inline unsigned long offlined_memcg_nr_pages(void)
{
	int nr = atomic_read(&nr_offlined_memcgs);

	/* A non-positive count must not be turned into a huge unsigned target. */
	if (nr <= 0)
		return 0;

	/*
	 * Widen before multiplying so the product is computed in
	 * unsigned long rather than wrapping at int/unsigned int width.
	 */
	return (unsigned long)nr * MEMCG_CHARGE_BATCH;
}

unsigned long scrape_offlined_memcgs(unsigned long nr_to_reclaim);

#else /* CONFIG_MEMCG */

#define MEM_CGROUP_ID_SHIFT 0
Expand Down Expand Up @@ -1545,6 +1556,17 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
{
return 0;
}

/* !CONFIG_MEMCG stub: there are no memcgs, so there is nothing to scrape. */
static inline unsigned long offlined_memcg_nr_pages(void)
{
	return 0UL;
}

/*
 * !CONFIG_MEMCG stub: ignores @nr_to_reclaim and reports that no pages
 * were reclaimed.
 */
static inline unsigned long scrape_offlined_memcgs(unsigned long nr_to_reclaim)
{
	return 0UL;
}

#endif /* CONFIG_MEMCG */

static inline void __inc_lruvec_kmem_state(void *p, enum node_stat_item idx)
Expand Down
11 changes: 6 additions & 5 deletions include/linux/sysctl.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,11 +43,12 @@ struct ctl_dir;
#define SYSCTL_ONE ((void *)&sysctl_vals[2])
#define SYSCTL_TWO ((void *)&sysctl_vals[3])
#define SYSCTL_FOUR ((void *)&sysctl_vals[4])
#define SYSCTL_ONE_HUNDRED ((void *)&sysctl_vals[5])
#define SYSCTL_TWO_HUNDRED ((void *)&sysctl_vals[6])
#define SYSCTL_ONE_THOUSAND ((void *)&sysctl_vals[7])
#define SYSCTL_THREE_THOUSAND ((void *)&sysctl_vals[8])
#define SYSCTL_INT_MAX ((void *)&sysctl_vals[9])
#define SYSCTL_EIGHT ((void *)&sysctl_vals[5])
#define SYSCTL_ONE_HUNDRED ((void *)&sysctl_vals[6])
#define SYSCTL_TWO_HUNDRED ((void *)&sysctl_vals[7])
#define SYSCTL_ONE_THOUSAND ((void *)&sysctl_vals[8])
#define SYSCTL_THREE_THOUSAND ((void *)&sysctl_vals[9])
#define SYSCTL_INT_MAX ((void *)&sysctl_vals[10])

extern const int sysctl_vals[];

Expand Down
2 changes: 1 addition & 1 deletion kernel/sysctl.c
Original file line number Diff line number Diff line change
Expand Up @@ -2627,7 +2627,7 @@ static struct ctl_table vm_table[] = {
.mode = 0200,
.proc_handler = drop_caches_sysctl_handler,
.extra1 = SYSCTL_ONE,
.extra2 = SYSCTL_FOUR,
.extra2 = SYSCTL_EIGHT,
},
#ifdef CONFIG_COMPACTION
{
Expand Down
6 changes: 6 additions & 0 deletions mm/memcontrol.c
Original file line number Diff line number Diff line change
Expand Up @@ -5350,6 +5350,8 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
return -ENOMEM;
}

/*
 * Number of memcgs that have been offlined but not yet released:
 * incremented at the end of mem_cgroup_css_offline() and decremented in
 * mem_cgroup_css_released().
 */
atomic_t nr_offlined_memcgs = ATOMIC_INIT(0);

static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
Expand Down Expand Up @@ -5377,13 +5379,17 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
drain_all_stock(memcg);

mem_cgroup_id_put(memcg);

atomic_inc(&nr_offlined_memcgs);
}

/*
 * css_released callback for the memory controller.
 *
 * Invalidates the reclaim iterators for the memcg behind @css and drops
 * it from the nr_offlined_memcgs count.
 */
static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
{
	invalidate_reclaim_iterators(mem_cgroup_from_css(css));

	/* Pairs with the atomic_inc() in mem_cgroup_css_offline(). */
	atomic_dec(&nr_offlined_memcgs);
}

static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
Expand Down
34 changes: 34 additions & 0 deletions mm/vmscan.c
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,9 @@ struct scan_control {
/* The file pages on the current node are dangerously low */
unsigned int file_is_tiny:1;

/* Scrape LRU pages from offlined memcgs */
unsigned int scrape_offlined_memcgs:1;

/* Always discard instead of demoting to lower tier memory */
unsigned int no_demotion:1;

Expand Down Expand Up @@ -3092,6 +3095,9 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
memcg_memory_event(memcg, MEMCG_LOW);
}

if (sc->scrape_offlined_memcgs && mem_cgroup_online(memcg))
continue;

reclaimed = sc->nr_reclaimed;
scanned = sc->nr_scanned;

Expand Down Expand Up @@ -4816,3 +4822,31 @@ void check_move_unevictable_pages(struct pagevec *pvec)
}
}
EXPORT_SYMBOL_GPL(check_move_unevictable_pages);

#ifdef CONFIG_MEMCG
/**
 * scrape_offlined_memcgs - reclaim LRU pages from offlined memcgs
 * @nr_to_reclaim: number of pages the caller wants reclaimed; values below
 *                 SWAP_CLUSTER_MAX are raised to SWAP_CLUSTER_MAX
 *
 * Runs do_try_to_free_pages() against root_mem_cgroup with the
 * scrape_offlined_memcgs scan-control flag set, using GFP_KERNEL and the
 * zonelist of the current NUMA node.
 *
 * NOTE(review): .priority and .may_swap are left zero-initialized here;
 * confirm that this is intended.
 *
 * Return: whatever do_try_to_free_pages() returns.
 */
unsigned long scrape_offlined_memcgs(unsigned long nr_to_reclaim)
{
	struct scan_control sc = {
		.scrape_offlined_memcgs = true,
		.target_mem_cgroup = root_mem_cgroup,
		.gfp_mask = GFP_KERNEL,
		.reclaim_idx = MAX_NR_ZONES - 1,
		.nr_to_reclaim = max(nr_to_reclaim, SWAP_CLUSTER_MAX),
		.may_unmap = true,
		.may_writepage = true,
	};
	struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
	unsigned long nr_freed;
	unsigned int noreclaim_flag;

	/* Set up the task's reclaim state and noreclaim flag for the call. */
	set_task_reclaim_state(current, &sc.reclaim_state);
	noreclaim_flag = memalloc_noreclaim_save();

	nr_freed = do_try_to_free_pages(zonelist, &sc);

	/* Undo the setup in reverse order. */
	memalloc_noreclaim_restore(noreclaim_flag);
	set_task_reclaim_state(current, NULL);

	return nr_freed;
}
#endif