我们知道,内核内存回收最终是通过 shrink_inactive_list->shrink_page_list 函数对各种 page 进行回收的。对于 mmapped 的文件页,需要先执行 try_to_unmap 函数,解除所有进程对该 page 物理内存的页表页目录映射;如果该函数返回 false(0),说明解除映射失败,该 page 的内存回收也随之失败。最近在开发以文件为单位回收 mmapped 文件页的功能时,发现大量 mmapped 文件页执行 try_to_unmap 函数时返回 0,也就是说解除页表页目录映射失败了,从而导致这些 page 内存回收失败。究竟是什么原因导致 try_to_unmap 函数解除 page 的页表页目录映射失败呢?
本文基于linux-4.18.0-240版本内核源码。先看下try_to_unmap函数执行流程,源码简化后如下:
- bool try_to_unmap(struct page *page, enum ttu_flags flags)
- {
- struct rmap_walk_control rwc = {
- .rmap_one = try_to_unmap_one,
- .arg = (void *)flags,
- .done = page_mapcount_is_zero,
- .anon_lock = page_lock_anon_vma_read,
- };
- // Walk the page's reverse mappings, calling try_to_unmap_one on each one to tear down the page-table mapping; if every mapping was removed, page_mapcount(page) is 0 and this function returns true, otherwise it returns false
- rmap_walk(page, &rwc);
- return !page_mapcount(page) ? true : false;
- }
- void rmap_walk(struct page *page, struct rmap_walk_control *rwc) // simplified listing: only the file-page path is shown
- {
- rmap_walk_file(page, rwc, false); // locked = false
- }
- static void rmap_walk_file(struct page *page, struct rmap_walk_control *rwc,
- bool locked)
- {
- struct address_space *mapping = page_mapping(page);
- pgoff_t pgoff_start, pgoff_end;
- struct vm_area_struct *vma;
- // Offset of the page within its file, in page-sized units (page->index for ordinary page-cache pages); it is the key for the i_mmap lookup below, not a virtual address
- pgoff_start = page_to_pgoff(page);
- pgoff_end = pgoff_start + hpage_nr_pages(page) - 1;
- // Iterate over the VMAs recorded in the file's i_mmap tree that cover [pgoff_start, pgoff_end]; one physical page may be mapped by several processes, so several VMAs can match
- vma_interval_tree_foreach(vma, &mapping->i_mmap,
- pgoff_start, pgoff_end) {
- // address is the virtual address at which this vma maps the page
- unsigned long address = vma_address(page, vma);
- ...............
- // Remove this vma's page-table mapping of the page; a false return from rmap_one aborts the whole walk
- if (!rwc->rmap_one(page, vma, address, rwc->arg))//try_to_unmap_one
- goto done;
- // If no process maps the page any more (mapcount is 0) there is nothing left to do, so jump to done
- if (rwc->done && rwc->done(page))//page_mapcount_is_zero
- goto done;
- }
- done:
- .........
- }
- static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
- unsigned long address, void *arg)
- {
- struct mm_struct *mm = vma->vm_mm;
- struct page_vma_mapped_walk pvmw = {
- .page = page,
- .vma = vma,
- // address is the virtual address at which vma maps the page
- .address = address,
- };
- ..........
- // Resolve address to the pte that maps the page; the pte pointer is made available in pvmw.pte
- while (page_vma_mapped_walk(&pvmw)) {
- .......
- // If the pte reports the page as recently accessed (non-zero "young" result), give up: set ret = false and leave the walk without clearing the pte. NOTE(review): upstream 4.18 appears to skip this check when TTU_IGNORE_ACCESS is set - confirm against mm/rmap.c
- if (ptep_clear_flush_young_notify(vma, address,
- pvmw.pte)) {
- ret = false;
- page_vma_mapped_walk_done(&pvmw);
- break;
- }
- .............
- // Clear the pte; this is the step that actually removes the page-table mapping of the page
- pteval = ptep_get_and_clear(mm, address, pvmw.pte);
- .......
- // Drop one reference from the page's mapcount
- page_remove_rmap(subpage, PageHuge(page));
- }
- }
因此,综合分析,怀疑原因是:try_to_unmap_one 在执行到“对pte页表项清0,真正解除page页表页目录映射”这一步之前,就因为 ptep_clear_flush_young_notify 返回非0而提前 break 了,导致 page 的页表页目录映射没有被解除。为什么这样怀疑?看下 ptep_clear_flush_young_notify 的源码就知道了:
- #define ptep_clear_flush_young_notify(__vma, __address, __ptep) \
- ({ /* value of the macro: result of ptep_clear_flush_young() OR-ed with the result of mmu_notifier_clear_flush_young() over [address, address + PAGE_SIZE) */ \
- int __young; \
- struct vm_area_struct *___vma = __vma; \
- unsigned long ___address = __address; \
- __young = ptep_clear_flush_young(___vma, ___address, __ptep); \
- __young |= mmu_notifier_clear_flush_young(___vma->vm_mm, \
- ___address, \
- ___address + \
- PAGE_SIZE); \
- __young; \
- })
- int ptep_clear_flush_young(struct vm_area_struct *vma,
- unsigned long address, pte_t *ptep)
- {
- return ptep_test_and_clear_young(vma, address, ptep); // simplified listing: result is passed through unchanged; try_to_unmap_one treats a non-zero value as "pte was young" and aborts the unmap
- }
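ptep_clear_flush_young_notify 最终调用 ptep_test_and_clear_young,检查并清除 pte 的访问位(young/accessed 位);如果该位之前已被置位,说明 page 最近被访问过,函数返回非0。因此怀疑是执行 ptep_clear_flush_young_notify 函数时,发现 page 对应 pte 页表项的访问位被置位了,然后 try_to_unmap_one 直接 break 并返回 false。用 systemtap 调试验证一下即可: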
- global unmap_begin = 0; // 1 while a try_to_unmap call is in flight. NOTE(review): script globals are shared by all threads, so concurrent try_to_unmap calls may interleave - confirm this is acceptable
- global unmap_fail = 0; // 1 once ptep_clear_flush_young returned non-zero during that call
- global page_mapcount; // page->_mapcount.counter sampled on entry to try_to_unmap
- probe kernel.function("try_to_unmap") // entry: arm the tracking flag and snapshot the mapcount
- {
- unmap_begin = 1;
- page_mapcount = $page->_mapcount->counter
- }
- probe kernel.function("ptep_clear_flush_young").return // record a non-zero ("young") result seen while try_to_unmap is running
- {
- if(unmap_begin && $return != 0){
- unmap_fail = 1
- }
- }
- probe kernel.function("try_to_unmap").return // report only calls that failed (return 0) after a "young" pte was seen, then reset the flags
- {
- if(unmap_fail && $return == 0){
- unmap_fail = 0
- printf("%s %d ret:%d page:%x $page->_mapcount:%d origin_page_mapcount:%d\n",execname(),tid(),$return,$page,$page->_mapcount->counter,page_mapcount)
- }
- unmap_begin = 0
- }
打印结果如下:
- hot_cold_file_t 10398 ret:0 page:ffffef4448acec00 $page->_mapcount:22 origin_page_mapcount:22
- hot_cold_file_t 10398 ret:0 page:ffffef4448acec40 $page->_mapcount:22 origin_page_mapcount:22
- hot_cold_file_t 10398 ret:0 page:ffffef4448380100 $page->_mapcount:22 origin_page_mapcount:22
- hot_cold_file_t 10398 ret:0 page:ffffef4448380140 $page->_mapcount:22 origin_page_mapcount:22
- hot_cold_file_t 10398 ret:0 page:ffffef44483c6880 $page->_mapcount:21 origin_page_mapcount:21
- hot_cold_file_t 10398 ret:0 page:ffffef44483c68c0 $page->_mapcount:21 origin_page_mapcount:21
- hot_cold_file_t 10398 ret:0 page:ffffef4444d19700 $page->_mapcount:21 origin_page_mapcount:21
- hot_cold_file_t 10398 ret:0 page:ffffef44483ca680 $page->_mapcount:22 origin_page_mapcount:22
- hot_cold_file_t 10398 ret:0 page:ffffef4448af6fc0 $page->_mapcount:4 origin_page_mapcount:4
- hot_cold_file_t 10398 ret:0 page:ffffef4448af0200 $page->_mapcount:4 origin_page_mapcount:4
- hot_cold_file_t 10398 ret:0 page:ffffef4448af0240 $page->_mapcount:4 origin_page_mapcount:4
- hot_cold_file_t 10398 ret:0 page:ffffef4448ffbb80 $page->_mapcount:4 origin_page_mapcount:4
- hot_cold_file_t 10398 ret:0 page:ffffef44486a1d40 $page->_mapcount:3 origin_page_mapcount:3
- hot_cold_file_t 10398 ret:0 page:ffffef44486a1d80 $page->_mapcount:3 origin_page_mapcount:3
- hot_cold_file_t 10398 ret:0 page:ffffef44486a1dc0 $page->_mapcount:3 origin_page_mapcount:3
- hot_cold_file_t 10398 ret:0 page:ffffef4448339140 $page->_mapcount:3 origin_page_mapcount:3
- hot_cold_file_t 10398 ret:0 page:ffffef4448d6fa80 $page->_mapcount:3 origin_page_mapcount:3
- hot_cold_file_t 10398 ret:0 page:ffffef4448d6fac0 $page->_mapcount:3 origin_page_mapcount:3
- hot_cold_file_t 10398 ret:0 page:ffffef444833e000 $page->_mapcount:3 origin_page_mapcount:3
- hot_cold_file_t 10398 ret:0 page:ffffef444833e040 $page->_mapcount:3 origin_page_mapcount:3
- hot_cold_file_t 10398 ret:0 page:ffffef444835f040 $page->_mapcount:3 origin_page_mapcount:3
- hot_cold_file_t 10398 ret:0 page:ffffef444835f080 $page->_mapcount:3 origin_page_mapcount:3
- hot_cold_file_t 10398 ret:0 page:ffffef444835f0c0 $page->_mapcount:3 origin_page_mapcount:3
- hot_cold_file_t 10398 ret:0 page:ffffef44486a1d00 $page->_mapcount:3 origin_page_mapcount:3
- hot_cold_file_t 10398 ret:0 page:ffffef4444d1fd00 $page->_mapcount:22 origin_page_mapcount:22
- hot_cold_file_t 10398 ret:0 page:ffffef4444d1fd40 $page->_mapcount:22 origin_page_mapcount:22
- hot_cold_file_t 10398 ret:0 page:ffffef4448ff8f80 $page->_mapcount:21 origin_page_mapcount:21
- hot_cold_file_t 10398 ret:0 page:ffffef4448ff8fc0 $page->_mapcount:21 origin_page_mapcount:21
这确实说明 unmap 失败是因为 page 映射的 pte 页表项最近被访问过(访问位被置位)。并且,这些 page 的 mapcount 都很大(注意 _mapcount 的计数从 -1 开始,实际映射数是打印值加 1),说明有多个进程 mmap 映射了这个 page,自然被访问的概率就更大,内存回收时需要尽可能避开这种 page!水平有限,如有错误请指出。