Linux内核内存回收主要有快速内存回收、直接内存回收、kswapd内存回收,这篇文章主要讨论kswapd内存回收,也称为后台回收。
在系统启动过程执行start_kernel的时候会执行kswapd_init,在kswapd_init函数里执行kthread_run,为每个内存节点(node)创建kswapd内核线程。Kswapd内核线程函数体如下所示:
static int kswapd(void *p)
{
unsigned int alloc_order, reclaim_order;
unsigned int classzone_idx = MAX_NR_ZONES - 1;
pg_data_t *pgdat = (pg_data_t*)p;
struct task_struct *tsk = current;
const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
if (!cpumask_empty(cpumask))
set_cpus_allowed_ptr(tsk, cpumask);
/*
* Tell the memory management that we're a "memory allocator",
* and that if we need more memory we should get access to it
* regardless (see "__alloc_pages()"). "kswapd" should
* never get caught in the normal page freeing logic.
*
* (Kswapd normally doesn't need memory anyway, but sometimes
* you need a small amount of memory in order to be able to
* page out something else, and this flag essentially protects
* us from recursively trying to free more memory as we're
* trying to free the first piece of memory in the first place).
*/
//标识自己是kswap进程,并允许回写脏页到swap分区
tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
set_freezable();
pgdat->kswapd_order = 0;
pgdat->kswapd_classzone_idx = MAX_NR_ZONES;
for ( ; ; ) {
bool ret;
alloc_order = reclaim_order = pgdat->kswapd_order;
classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx);
kswapd_try_sleep:
//kswap进程尝试睡眠
kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order,
classzone_idx);
/* Read the new order and classzone_idx */
alloc_order = reclaim_order = pgdat->kswapd_order;
classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx);
pgdat->kswapd_order = 0;
pgdat->kswapd_classzone_idx = MAX_NR_ZONES;
ret = try_to_freeze();
if (kthread_should_stop())
break;
/*
* We can speed up thawing tasks if we don't call balance_pgdat
* after returning from the refrigerator
*/
if (ret)
continue;
/*
* Reclaim begins at the requested order but if a high-order
* reclaim fails then kswapd falls back to reclaiming for
* order-0. If that happens, kswapd will consider sleeping
* for the order it finished reclaiming at (reclaim_order)
* but kcompactd is woken to compact for the original
* request (alloc_order).
*/
trace_mm_vmscan_kswapd_wake(pgdat->node_id, classzone_idx,
alloc_order);
//开始页面回收
reclaim_order = balance_pgdat(pgdat, alloc_order, classzone_idx);
if (reclaim_order < alloc_order)
goto kswapd_try_sleep;
}
tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
return 0;
}
看下kswapd_try_to_sleep函数,该函数主要判断kswapd是否满足休眠状态,满足则继续休眠,即调度出去,让出CPU。
static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order,
unsigned int classzone_idx)
{
long remaining = 0;
DEFINE_WAIT(wait);
if (freezing(current) || kthread_should_stop())
return;
prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
/*
* Try to sleep for a short interval. Note that kcompactd will only be
* woken if it is possible to sleep for a short interval. This is
* deliberate on the assumption that if reclaim cannot keep an
* eligible zone balanced that it's also unlikely that compaction will
* succeed.
*/
if (prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) {
/*
* Compaction records what page blocks it recently failed to
* isolate pages from and skips them in the future scanning.
* When kswapd is going to sleep, it is reasonable to assume
* that pages and compaction may succeed so reset the cache.
*/
reset_isolation_suitable(pgdat);
/*
* We have freed the memory, now we should compact it to make
* allocation of the requested order possible.
*/
//由于我们目前系统有空闲内存,因此可以尝试内存整理进程
wakeup_kcompactd(pgdat, alloc_order, classzone_idx);
//kswap尝试睡眠0.1s,等待唤醒或超时
remaining = schedule_timeout(HZ/10);
/*
* If woken prematurely then reset kswapd_classzone_idx and
* order. The values will either be from a wakeup request or
* the previous request that slept prematurely.
*/
//如果睡眠中途被唤醒,重置kswap相关变量,用于这次的内存回收
if (remaining) {
pgdat->kswapd_classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx);
pgdat->kswapd_order = max(pgdat->kswapd_order, reclaim_order);
}
//将kswap从kswapd_wait链表中摘除,并设置为TASK_RUNNING
finish_wait(&pgdat->kswapd_wait, &wait);
prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
}
/*
* After a short sleep, check if it was a premature sleep. If not, then
* go fully to sleep until explicitly woken up.
*/
//如果kswap进程刚才睡眠超时后才返回,也就是remaining=0,意味着kswap可以真正的睡眠
//也就是可以睡久一点,同时再次确认下kswap进程可以睡眠
if (!remaining &&
prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) {
trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
/*
* vmstat counters are not perfectly accurate and the estimated
* value for counters such as NR_FREE_PAGES can deviate from the
* true value by nr_online_cpus * threshold. To avoid the zone
* watermarks being breached while under pressure, we reduce the
* per-cpu vmstat threshold while kswapd is awake and restore
* them before going back to sleep.
*/
set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
if (!kthread_should_stop())
schedule();//主动调度出去
set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
} else {
if (remaining)
count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
else
count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
}
finish_wait(&pgdat->kswapd_wait, &wait);
}
static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx)
{
/*
* The throttled processes are normally woken up in balance_pgdat() as
* soon as allow_direct_reclaim() is true. But there is a potential
* race between when kswapd checks the watermarks and a process gets
* throttled. There is also a potential race if processes get
* throttled, kswapd wakes, a large process exits thereby balancing the
* zones, which causes kswapd to exit balance_pgdat() before reaching
* the wake up checks. If kswapd is going to sleep, no process should
* be sleeping on pfmemalloc_wait, so wake them now if necessary. If
* the wake up is premature, processes will wake kswapd and get
* throttled again. The difference from wake ups in balance_pgdat() is
* that here we are under prepare_to_wait().
*/
if (waitqueue_active(&pgdat->pfmemalloc_wait))
wake_up_all(&pgdat->pfmemalloc_wait);
/* Hopeless node, leave it to direct reclaim */
//kswap回收失败次数达到16次,不再尝试回收,留给进程自行处理
if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
return true;
//如果至少有一个zone区域满足在high_wmark水位,kswap进程也无需再工作
if (pgdat_balanced(pgdat, order, classzone_idx)) {
clear_pgdat_congested(pgdat);
return true;
}
return false;
}
static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
{
int i;
unsigned long mark = -1;
struct zone *zone;
/*
* Check watermarks bottom-up as lower zones are more likely to
* meet watermarks.
*/
for (i = 0; i <= classzone_idx; i++) {
zone = pgdat->node_zones + i;
if (!managed_zone(zone))//被BUDDY管理的内存为0,则继续
continue;
mark = high_wmark_pages(zone);
if (zone_watermark_ok_safe(zone, order, mark, classzone_idx))
return true;
}
/*
* If a node has no populated zone within classzone_idx, it does not
* need balancing by definition. This can happen if a zone-restricted
* allocation tries to wake a remote kswapd.
*/
if (mark == -1)
return true;
return false;
}
//函数来更新该zone中与内存规整相关的数据,让本次对zone进行内存规整操作时,
//能够对该zone进行更大范围的页扫描,以此来提高本次内存规整的成功率
void reset_isolation_suitable(pg_data_t *pgdat)
{
int zoneid;
for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
struct zone *zone = &pgdat->node_zones[zoneid];
if (!populated_zone(zone))//除去内存空洞的总物理内存数目
continue;//为0则继续
/* Only flush if a full compaction finished recently */
if (zone->compact_blockskip_flush)//
__reset_isolation_suitable(zone);//调整内存规整的参数
}
}
看下populated_zone函数
/* Returns true if a zone has memory */
static inline bool populated_zone(struct zone *zone)
{
return zone->present_pages;//present_pages = spanned_pages - absent_pages,absent_pages表示内存空洞的数目
}
//通过__reset_isolation_suitable执行操作可以看出,就是加大内存规整对该zone的页的扫描返回(360度无死角扫描),
//力求本次内存规整成功.
static void __reset_isolation_suitable(struct zone *zone)
{
unsigned long migrate_pfn = zone->zone_start_pfn;
unsigned long free_pfn = zone_end_pfn(zone) - 1;
unsigned long reset_migrate = free_pfn;
unsigned long reset_free = migrate_pfn;
bool source_set = false;
bool free_set = false;
if (!zone->compact_blockskip_flush)
return;
zone->compact_blockskip_flush = false;
/*
* Walk the zone and update pageblock skip information. Source looks
* for PageLRU while target looks for PageBuddy. When the scanner
* is found, both PageBuddy and PageLRU are checked as the pageblock
* is suitable as both source and target.
*/
for (; migrate_pfn < free_pfn; migrate_pfn += pageblock_nr_pages,
free_pfn -= pageblock_nr_pages) {
cond_resched();
/* Update the migrate PFN */
if (__reset_isolation_pfn(zone, migrate_pfn, true, source_set) &&
migrate_pfn < reset_migrate) {
source_set = true;
reset_migrate = migrate_pfn;
zone->compact_init_migrate_pfn = reset_migrate;
zone->compact_cached_migrate_pfn[0] = reset_migrate;
zone->compact_cached_migrate_pfn[1] = reset_migrate;
}
/* Update the free PFN */
if (__reset_isolation_pfn(zone, free_pfn, free_set, true) &&
free_pfn > reset_free) {
free_set = true;
reset_free = free_pfn;
zone->compact_init_free_pfn = reset_free;
zone->compact_cached_free_pfn = reset_free;
}
}
/* Leave no distance if no suitable block was reset */
if (reset_migrate >= reset_free) {
zone->compact_cached_migrate_pfn[0] = migrate_pfn;
zone->compact_cached_migrate_pfn[1] = migrate_pfn;
zone->compact_cached_free_pfn = free_pfn;
}
}
接下来看下balance_pgdat函数,该函数主要实现了内存回收。
static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
{
int i;
unsigned long nr_soft_reclaimed;
unsigned long nr_soft_scanned;
unsigned long pflags;
unsigned long nr_boost_reclaim;
unsigned long zone_boosts[MAX_NR_ZONES] = { 0, };
bool boosted;
struct zone *zone;
struct scan_control sc = {
.gfp_mask = GFP_KERNEL,
.order = order,
.may_unmap = 1,//允许解除页到进程的映射
};
set_task_reclaim_state(current, &sc.reclaim_state);
psi_memstall_enter(&pflags);
__fs_reclaim_acquire();
count_vm_event(PAGEOUTRUN);
/*
* Account for the reclaim boost. Note that the zone boost is left in
* place so that parallel allocations that are near the watermark will
* stall or direct reclaim until kswapd is finished.
*/
nr_boost_reclaim = 0;
for (i = 0; i <= classzone_idx; i++) {
zone = pgdat->node_zones + i;
if (!managed_zone(zone))//managed_pages为0则continue
continue;
nr_boost_reclaim += zone->watermark_boost;//
zone_boosts[i] = zone->watermark_boost;
}
boosted = nr_boost_reclaim;
restart:
sc.priority = DEF_PRIORITY; //控制每次扫描数量,默认是总页数的1/4096
do {
unsigned long nr_reclaimed = sc.nr_reclaimed;
bool raise_priority = true;
bool balanced;
bool ret;
sc.reclaim_idx = classzone_idx;
/*
* If the number of buffer_heads exceeds the maximum allowed
* then consider reclaiming from all zones. This has a dual
* purpose -- on 64-bit systems it is expected that
* buffer_heads are stripped during active rotation. On 32-bit
* systems, highmem pages can pin lowmem memory and shrinking
* buffers can relieve lowmem pressure. Reclaim may still not
* go ahead if all eligible zones for the original allocation
* request are balanced to avoid excessive reclaim from kswapd.
*/
//如果buffer_head缓存太多就从最高内存域开始回收
//buffer_head最多占10%的ZONE_NORMAL
if (buffer_heads_over_limit) {
for (i = MAX_NR_ZONES - 1; i >= 0; i--) {
zone = pgdat->node_zones + i;
if (!managed_zone(zone))
continue;
sc.reclaim_idx = i;
break;
}
}
/*
* If the pgdat is imbalanced then ignore boosting and preserve
* the watermarks for a later time and restart. Note that the
* zone watermarks will be still reset at the end of balancing
* on the grounds that the normal reclaim should be enough to
* re-evaluate if boosting is required when kswapd next wakes.
*/
//如果有一个zone区域能满足当前内存分区需求,无需回收
//只有当所有zone都无法满足此order的需求,才会真正去回收内存
//这也是kswap停止工作的条件
balanced = pgdat_balanced(pgdat, sc.order, classzone_idx);
if (!balanced && nr_boost_reclaim) {
nr_boost_reclaim = 0;
goto restart;
}
/*
* If boosting is not active then only reclaim if there are no
* eligible zones. Note that sc.reclaim_idx is not used as
* buffer_heads_over_limit may have adjusted it.
*/
if (!nr_boost_reclaim && balanced)
goto out;
/* Limit the priority of boosting to avoid reclaim writeback */
if (nr_boost_reclaim && sc.priority == DEF_PRIORITY - 2)
raise_priority = false;
/*
* Do not writeback or swap pages for boosted reclaim. The
* intent is to relieve pressure not issue sub-optimal IO
* from reclaim context. If no pages are reclaimed, the
* reclaim will be aborted.
*/
sc.may_writepage = !laptop_mode && !nr_boost_reclaim;
sc.may_swap = !nr_boost_reclaim;
/*
* Do some background aging of the anon list, to give
* pages a chance to be referenced before reclaiming. All
* pages are rotated regardless of classzone as this is
* about consistent aging.
*/
//如果非活动匿名页太少,对匿名active链表做老化处理,
//让页面有机会在回收之前被引用,如果系统没有swap分区无需操作
age_active_anon(pgdat, &sc);
/*
* If we're getting trouble reclaiming, start doing writepage
* even in laptop mode.
*/
//如果优先级较高,就允许回写操作,以便能回收更多内存
if (sc.priority < DEF_PRIORITY - 2)
sc.may_writepage = 1;
/* Call soft limit reclaim before calling shrink_node. */
sc.nr_scanned = 0;
nr_soft_scanned = 0;
nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(pgdat, sc.order,
sc.gfp_mask, &nr_soft_scanned);
sc.nr_reclaimed += nr_soft_reclaimed;
/*
* There should be no need to raise the scanning priority if
* enough pages are already being scanned that that high
* watermark would be met at 100% efficiency.
*/
//针对该节点回收内存页面
if (kswapd_shrink_node(pgdat, &sc))
raise_priority = false;
/*
* If the low watermark is met there is no need for processes
* to be throttled on pfmemalloc_wait as they should not be
* able to safely make forward progress. Wake them
*/
//kswap处理完回收,判断是否需要让之前等待内存释放而睡眠的进程醒来
//有可能是回收到了需要的内存,也有可能是kswap无能为力,
if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
allow_direct_reclaim(pgdat))
wake_up_all(&pgdat->pfmemalloc_wait);
/* Check if kswapd should be suspending */
__fs_reclaim_release();
ret = try_to_freeze();
__fs_reclaim_acquire();
if (ret || kthread_should_stop())
break;
/*
* Raise priority if scanning rate is too low or there was no
* progress in reclaiming pages
*/
nr_reclaimed = sc.nr_reclaimed - nr_reclaimed;
nr_boost_reclaim -= min(nr_boost_reclaim, nr_reclaimed);
/*
* If reclaim made no progress for a boost, stop reclaim as
* IO cannot be queued and it could be an infinite loop in
* extreme circumstances.
*/
if (nr_boost_reclaim && !nr_reclaimed)
break;
//如果此次循环没有回收到页面,提高优先级,扫描更多页面
if (raise_priority || !nr_reclaimed)
sc.priority--;
} while (sc.priority >= 1);
//如果此次kswap没有回收到页面,失败次数加1,达到16次就放弃
if (!sc.nr_reclaimed)
pgdat->kswapd_failures++;
out:
/* If reclaim was boosted, account for the reclaim done in this pass */
if (boosted) {
unsigned long flags;
for (i = 0; i <= classzone_idx; i++) {
if (!zone_boosts[i])
continue;
/* Increments are under the zone lock */
zone = pgdat->node_zones + i;
spin_lock_irqsave(&zone->lock, flags);
zone->watermark_boost -= min(zone->watermark_boost, zone_boosts[i]);
spin_unlock_irqrestore(&zone->lock, flags);
}
/*
* As there is now likely space, wakeup kcompact to defragment
* pageblocks.
*/
wakeup_kcompactd(pgdat, pageblock_order, classzone_idx);
}
snapshot_refaults(NULL, pgdat);
__fs_reclaim_release();
psi_memstall_leave(&pflags);
set_task_reclaim_state(current, NULL);
/*
* Return the order kswapd stopped reclaiming at as
* prepare_kswapd_sleep() takes it into account. If another caller
* entered the allocator slow path while kswapd was awake, order will
* remain at the higher level.
*/
return sc.order;
}