Bootstrap

为什么内核内存回收对page进行try_to_unmap会失败?

我们知道内核内存回收最后,是执行shrink_inactive_list->shrink_page_list函数对各种page进行内存回收,而针对mmapped的文件页需要执行try_to_unmap函数解除进程对该page的物理内存的页表页目录映射,如果函数返回0则解除失败,导致该page内存回收失败。最近在开发以文件为单位对mmapped文件页进行回收时,发现大量mmapped文件页在执行try_to_unmap函数返回0,也就是说解除页表页目录映射失败了,从而导致这些page内存回收失败。什么原因导致的try_to_unmap函数解除page的页表页目录映射失败呢?

本文基于linux-4.18.0-240版本内核源码。先看下try_to_unmap函数执行流程,源码简化后如下:

  1. bool try_to_unmap(struct page *page, enum ttu_flags flags)
  2. {
  3.     struct rmap_walk_control rwc = {
  4.         .rmap_one = try_to_unmap_one,
  5.         .arg = (void *)flags,
  6.         .done = page_mapcount_is_zero,
  7.         .anon_lock = page_lock_anon_vma_read,
  8.     };
  9.     //解除进程对该page的物理内存的页表页目录映射,如果成功则page_mapcount(page)为0,该函数返回true,否则返回false
  10.     rmap_walk(page, &rwc);
  11.    
  12.     return !page_mapcount(page) ? true : false;
  13. }
  14. void rmap_walk(struct page *page, struct rmap_walk_control *rwc)
  15. {
  16.     rmap_walk_file(page, rwc, false);
  17. }
  18. static void rmap_walk_file(struct page *page, struct rmap_walk_control *rwc,
  19.         bool locked)
  20. {
  21.     struct address_space *mapping = page_mapping(page);
  22.     pgoff_t pgoff_start, pgoff_end;
  23.     struct vm_area_struct *vma;
  24.     //就是返回page->index索引,也是进程映射该文件页page的vma的起始虚拟地址
  25.     pgoff_start = page_to_pgoff(page);
  26.     pgoff_end = pgoff_start + hpage_nr_pages(page) - 1;
  27.     //遍历该文件保存的所有vma的rb tree,得到映射page物理内存的所有虚拟地址vma。因为一个page的物理内存可能被多个进程映射,故会有多个vma
  28.     vma_interval_tree_foreach(vma, &mapping->i_mmap,
  29.             pgoff_start, pgoff_end) {
  30.         //address是映射page物理内存的起始虚拟地址
  31.         unsigned long address = vma_address(page, vma);
  32.         ...............
  33.         //解除进程对该page的物理内存的页表页目录映射
  34.         if (!rwc->rmap_one(page, vma, address, rwc->arg))//try_to_unmap_one
  35.             goto done;
  36.         //如果没有进程再映射page的物理内存了,page的mapcount为0,直接goto done
  37.         if (rwc->done && rwc->done(page))//page_mapcount_is_zero
  38.             goto done;
  39.     }
  40. done:
  41.    .........
  42. }
  43. static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
  44.              unsigned long address, void *arg)
  45. {
  46.     struct mm_struct *mm = vma->vm_mm;
  47.     struct page_vma_mapped_walk pvmw = {
  48.         .page = page,
  49.         .vma = vma,
  50.         //address是映射page物理内存的起始虚拟地址
  51.         .address = address,
  52.     };
  53.     ..........
  54.     //由映射page物理内存的起始虚拟地址address得到对应的页表项pte指针,保存到pvmw.pte
  55.     while (page_vma_mapped_walk(&pvmw)) {
  56.         .......
  57.         //如果page映射的页表项pte的访问位(young/accessed)置位了,说明page最近被访问过,结束unmap
  58.         if (ptep_clear_flush_young_notify(vma, address,
  59.                     pvmw.pte)) {
  60.             ret = false;
  61.             page_vma_mapped_walk_done(&pvmw);
  62.             break;
  63.         }
  64.         .............
  65.         //pte页表项清0,真正解除page页表页目录映射
  66.         pteval = ptep_get_and_clear(mm, address, pvmw.pte);
  67.         .......
  68.         //page的mapcount减1
  69.         page_remove_rmap(subpage, PageHuge(page));
  70.     }
  71. }

因此,综合分析,怀疑原因是try_to_unmap_one函数里ptep_clear_flush_young_notify返回了非0值(page映射的pte页表项的访问位被置位),于是直接break,后面的“对pte页表项清0,真正解除page页表页目录映射”根本没有执行,unmap就失败了!为什么这样怀疑,看下源码就知道了:

  1. #define ptep_clear_flush_young_notify(__vma, __address, __ptep)     \
  2. ({                                  \
  3.     int __young;                            \
  4.     struct vm_area_struct *___vma = __vma;              \
  5.     unsigned long ___address = __address;               \
  6.     __young = ptep_clear_flush_young(___vma, ___address, __ptep);   \
  7.     __young |= mmu_notifier_clear_flush_young(___vma->vm_mm,    \
  8.                           ___address,       \
  9.                           ___address +      \
  10.                             PAGE_SIZE); \
  11.     __young;                            \
  12. })
  13. int ptep_clear_flush_young(struct vm_area_struct *vma,
  14.                unsigned long address, pte_t *ptep)
  15. {
  16.     return ptep_test_and_clear_young(vma, address, ptep);
  17. }

怀疑是执行 ptep_clear_flush_young_notify 函数时,发现page的pte页表项的访问位(young/accessed)被置位了,即该page最近被进程访问过,然后try_to_unmap_one直接break返回false。用systemtap调试一下即可:

  1. global unmap_begin = 0;
  2. global unmap_fail = 0;
  3. global page_mapcount;
  4. probe kernel.function("try_to_unmap")
  5. {
  6.     unmap_begin = 1;
  7.     page_mapcount = $page->_mapcount->counter
  8. }
  9. probe kernel.function("ptep_clear_flush_young").return                                                                                                                                      
  10. {
  11.     if(unmap_begin && $return != 0){
  12.         unmap_fail = 1
  13.     }
  14. }
  15. probe kernel.function("try_to_unmap").return
  16. {
  17.     if(unmap_fail && $return == 0){
  18.         unmap_fail = 0
  19.         printf("%s %d ret:%d page:%x $page->_mapcount:%d origin_page_mapcount:%d\n",execname(),tid(),$return,$page,$page->_mapcount->counter,page_mapcount)
  20.     }
  21.     unmap_begin = 0
  22. }

打印

  • hot_cold_file_t 10398 ret:0 page:ffffef4448acec00 $page->_mapcount:22 origin_page_mapcount:22
  • hot_cold_file_t 10398 ret:0 page:ffffef4448acec40 $page->_mapcount:22 origin_page_mapcount:22
  • hot_cold_file_t 10398 ret:0 page:ffffef4448380100 $page->_mapcount:22 origin_page_mapcount:22
  • hot_cold_file_t 10398 ret:0 page:ffffef4448380140 $page->_mapcount:22 origin_page_mapcount:22
  • hot_cold_file_t 10398 ret:0 page:ffffef44483c6880 $page->_mapcount:21 origin_page_mapcount:21
  • hot_cold_file_t 10398 ret:0 page:ffffef44483c68c0 $page->_mapcount:21 origin_page_mapcount:21
  • hot_cold_file_t 10398 ret:0 page:ffffef4444d19700 $page->_mapcount:21 origin_page_mapcount:21
  • hot_cold_file_t 10398 ret:0 page:ffffef44483ca680 $page->_mapcount:22 origin_page_mapcount:22
  • hot_cold_file_t 10398 ret:0 page:ffffef4448af6fc0 $page->_mapcount:4 origin_page_mapcount:4
  • hot_cold_file_t 10398 ret:0 page:ffffef4448af0200 $page->_mapcount:4 origin_page_mapcount:4
  • hot_cold_file_t 10398 ret:0 page:ffffef4448af0240 $page->_mapcount:4 origin_page_mapcount:4
  • hot_cold_file_t 10398 ret:0 page:ffffef4448ffbb80 $page->_mapcount:4 origin_page_mapcount:4
  • hot_cold_file_t 10398 ret:0 page:ffffef44486a1d40 $page->_mapcount:3 origin_page_mapcount:3
  • hot_cold_file_t 10398 ret:0 page:ffffef44486a1d80 $page->_mapcount:3 origin_page_mapcount:3
  • hot_cold_file_t 10398 ret:0 page:ffffef44486a1dc0 $page->_mapcount:3 origin_page_mapcount:3
  • hot_cold_file_t 10398 ret:0 page:ffffef4448339140 $page->_mapcount:3 origin_page_mapcount:3
  • hot_cold_file_t 10398 ret:0 page:ffffef4448d6fa80 $page->_mapcount:3 origin_page_mapcount:3
  • hot_cold_file_t 10398 ret:0 page:ffffef4448d6fac0 $page->_mapcount:3 origin_page_mapcount:3
  • hot_cold_file_t 10398 ret:0 page:ffffef444833e000 $page->_mapcount:3 origin_page_mapcount:3
  • hot_cold_file_t 10398 ret:0 page:ffffef444833e040 $page->_mapcount:3 origin_page_mapcount:3
  • hot_cold_file_t 10398 ret:0 page:ffffef444835f040 $page->_mapcount:3 origin_page_mapcount:3
  • hot_cold_file_t 10398 ret:0 page:ffffef444835f080 $page->_mapcount:3 origin_page_mapcount:3
  • hot_cold_file_t 10398 ret:0 page:ffffef444835f0c0 $page->_mapcount:3 origin_page_mapcount:3
  • hot_cold_file_t 10398 ret:0 page:ffffef44486a1d00 $page->_mapcount:3 origin_page_mapcount:3
  • hot_cold_file_t 10398 ret:0 page:ffffef4444d1fd00 $page->_mapcount:22 origin_page_mapcount:22
  • hot_cold_file_t 10398 ret:0 page:ffffef4444d1fd40 $page->_mapcount:22 origin_page_mapcount:22
  • hot_cold_file_t 10398 ret:0 page:ffffef4448ff8f80 $page->_mapcount:21 origin_page_mapcount:21
  • hot_cold_file_t 10398 ret:0 page:ffffef4448ff8fc0 $page->_mapcount:21 origin_page_mapcount:21

这确实说明unmap失败是因为page映射的pte页表被访问了。并且,这些page的mapcount都很大,说明有多个进程mmap映射了这个page,自然被访问的概率就更大,内存回收时需要尽可能避开这种page!水平有限,如有错误请指出。

;