git.maemo.org Git - kernel-bfs/blob - kernel-power-2.6.28/debian/patches/issue_12309_0.diff

   1 Fix "system goes unresponsive under memory pressure and lots of
   2 dirty/writeback pages" bug.
   3
   4         http://lkml.org/lkml/2010/4/4/86
   5
   6 In the above thread, Andreas Mohr described that
   7
   8         Invoking any command locked up for minutes (note that I'm
   9         talking about attempted additional I/O to the _other_,
  10         _unaffected_ main system HDD - such as loading some shell
  11         binaries -, NOT the external SSD18M!!).
  12
  13 This happens when the following two conditions are both met:
  14  - under memory pressure
  15  - writing heavily to a slow device
  16
  17 OOM also happens in Andreas' system. The OOM trace shows that 3
  18 processes are stuck in wait_on_page_writeback() in the direct reclaim
  19 path. One in do_fork() and the other two in unix_stream_sendmsg(). They
  20 are blocked on this condition:
  21
  22         (sc->order && priority < DEF_PRIORITY - 2)
  23
  24 which was introduced in commit 78dc583d (vmscan: low order lumpy reclaim
  25 also should use PAGEOUT_IO_SYNC) one year ago. That condition may be too
  26 permissive. In Andreas' case, 512MB/1024 = 512KB. If the direct reclaim
  27 for the order-1 fork() allocation runs into a range of 512KB
  28 hard-to-reclaim LRU pages, it will be stalled.
  29
  30 It's a severe problem in three ways.
  31
  32 Firstly, it can easily happen in daily desktop usage.  vmscan priority
  33 can easily go below (DEF_PRIORITY - 2) on _local_ memory pressure. Even
  34 if the system has 50% globally reclaimable pages, it still has a good
  35 chance of having 0.1% sized hard-to-reclaim ranges. For example, a
  36 simple dd can easily create a big range (up to 20%) of dirty pages in
  37 the LRU lists. And order-1 to order-3 allocations are more than common
  38 with SLUB. Try "grep -v '1 :' /proc/slabinfo" to get the list of high
  39 order slab caches. For example, the order-1 radix_tree_node slab cache
  40 may stall applications at swap-in time; the order-3 inode cache on most
  41 filesystems may stall applications when trying to read some file; the
  42 order-2 proc_inode_cache may stall applications when trying to open a
  43 /proc file.
  44
  45 Secondly, once triggered, it will stall unrelated processes (not doing IO
  46 at all) in the system. This "one slow USB device stalls the whole system"
  47 avalanching effect is very bad.
  48
  49 Thirdly, once stalled, the stall time could be intolerably long for the
  50 users.  When there are 20MB of queued writeback pages and USB 1.1 is
  51 writing them at 1MB/s, wait_on_page_writeback() will be stuck for up to 20
  52 seconds.  Not to mention it may be called multiple times.
  53
  54 So raise the bar to only enable PAGEOUT_IO_SYNC when priority goes below
  55 DEF_PRIORITY/3, or 6.25% LRU size. As the default dirty throttle ratio is
  56 20%, it will hardly be triggered by pure dirty pages. We'd better treat
  57 PAGEOUT_IO_SYNC as a last-resort workaround -- its stall time is so
  58 uncomfortably long (easily goes beyond 1s).
  59
  60 The bar is only raised for (order < PAGE_ALLOC_COSTLY_ORDER) allocations,
  61 which are easy to satisfy in 1TB memory boxes. So, although 6.25% of
  62 memory could be an awful lot of pages to scan on a system with 1TB of
  63 memory, it won't really have to busy scan that much.
  64
  65 Andreas tested an older version of this patch and reported that it
  66 mostly fixed his problem. Mel Gorman helped improve it and KOSAKI
  67 Motohiro will fix it further in the next patch.
  68
  69 Reported-by: Andreas Mohr <andi@lisas.de>
  70 Reviewed-by: Minchan Kim <minchan.kim@gmail.com>
  71 Signed-off-by: Mel Gorman <mel@csn.ul.ie>
  72 Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
  73 ---
  74
  75 Index: kernel-2.6.28/mm/vmscan.c
  76 ===================================================================
  77 --- kernel-2.6.28.orig/mm/vmscan.c
  78 +++ kernel-2.6.28/mm/vmscan.c
  79 @@ -52,6 +52,12 @@ struct scan_control {
  80         /* Incremented by the number of inactive pages that were scanned */
  81         unsigned long nr_scanned;
  82
  83 +       /* Number of pages freed so far during a call to shrink_zones() */
  84 +       unsigned long nr_reclaimed;
  85 +
  86 +       /* How many pages shrink_list() should reclaim */
  87 +       unsigned long nr_to_reclaim;
  88 +
  89         /* This context's GFP mask */
  90         gfp_t gfp_mask;
  91
  92 @@ -72,6 +78,12 @@ struct scan_control {
  93
  94         int order;
  95
  96 +       /*
  97 +        * Intend to reclaim enough contiguous memory rather than to reclaim
  98 +        * enough memory overall. I.e., it's the mode for high-order allocation.
  99 +        */
 100 +       bool lumpy_reclaim_mode;
 101 +
 102         /* Which cgroup do we reclaim from */
 103         struct mem_cgroup *mem_cgroup;
 104
 105 @@ -549,7 +561,6 @@ void putback_lru_page(struct page *page)
 106  }
 107  #endif /* CONFIG_UNEVICTABLE_LRU */
 108
 109 -
 110  /*
 111   * shrink_page_list() returns the number of reclaimed pages
 112   */
 113 @@ -613,8 +624,10 @@ static unsigned long shrink_page_list(st
 114
 115                 referenced = page_referenced(page, 1, sc->mem_cgroup);
 116                 /* In active use or really unfreeable?  Activate it. */
 117 -               if (sc->order <= PAGE_ALLOC_COSTLY_ORDER &&
 118 -                                       referenced && page_mapping_inuse(page))
 119 +               if ( ( sc->order <= PAGE_ALLOC_COSTLY_ORDER &&
 120 +                                       referenced && page_mapping_inuse(page)
 121 +             ) || sc->lumpy_reclaim_mode /* ignore refrences in lumpy reclaim mode */
 122 +           )
 123                         goto activate_locked;
 124
 125  #ifdef CONFIG_SWAP
 126 @@ -1024,6 +1037,47 @@ int isolate_lru_page(struct page *page)
 127  }
 128
 129  /*
 130 + * Returns true if the caller should wait to clean dirty/writeback pages.
 131 + *
 132 + * If we are direct reclaiming for contiguous pages and we do not reclaim
 133 + * everything in the list, try again and wait for writeback IO to complete.
 134 + * This will stall high-order allocations noticeably. Only do that when really
 135 + * need to free the pages under high memory pressure.
 136 + */
 137 +static inline bool should_reclaim_stall(unsigned long nr_taken,
 138 +                                       unsigned long nr_freed,
 139 +                                       int priority,
 140 +                                       struct scan_control *sc)
 141 +{
 142 +       int lumpy_stall_priority;
 143 +
 144 +       /* kswapd should not stall on sync IO */
 145 +       if (current_is_kswapd())
 146 +               return false;
 147 +
 148 +       /* Only stall on lumpy reclaim */
 149 +       if (!sc->lumpy_reclaim_mode)
 150 +               return false;
 151 +
 152 +       /* If we have reclaimed everything on the isolated list, no stall */
 153 +       if (nr_freed == nr_taken)
 154 +               return false;
 155 +
 156 +       /*
 157 +        * For high-order allocations, there are two stall thresholds.
 158 +        * High-cost allocations stall immediately whereas lower
 159 +        * order allocations such as stacks require the scanning
 160 +        * priority to be much higher before stalling.
 161 +        */
 162 +       if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
 163 +               lumpy_stall_priority = DEF_PRIORITY;
 164 +       else
 165 +               lumpy_stall_priority = DEF_PRIORITY / 3;
 166 +
 167 +       return priority <= lumpy_stall_priority;
 168 +}
 169 +
 170 +/*
 171   * shrink_inactive_list() is a helper for shrink_zone().  It returns the number
 172   * of reclaimed pages
 173   */
 174 @@ -1047,7 +1101,7 @@ static unsigned long shrink_inactive_lis
 175                 unsigned long nr_freed;
 176                 unsigned long nr_active;
 177                 unsigned int count[NR_LRU_LISTS] = { 0, };
 178 -               int mode = ISOLATE_INACTIVE;
 179 +               int mode = sc->lumpy_reclaim_mode ? ISOLATE_BOTH : ISOLATE_INACTIVE;
 180
 181                 /*
 182                  * If we need a large contiguous chunk of memory, or have
 183 @@ -1088,15 +1142,8 @@ static unsigned long shrink_inactive_lis
 184                 nr_scanned += nr_scan;
 185                 nr_freed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC);
 186
 187 -               /*
 188 -                * If we are direct reclaiming for contiguous pages and we do
 189 -                * not reclaim everything in the list, try again and wait
 190 -                * for IO to complete. This will stall high-order allocations
 191 -                * but that should be acceptable to the caller
 192 -                */
 193 -               if (nr_freed < nr_taken && !current_is_kswapd() &&
 194 -                                       sc->order > PAGE_ALLOC_COSTLY_ORDER) {
 195 -                       congestion_wait(WRITE, HZ/10);
 196 +        /* Check if we should synchronously wait for writeback */
 197 +               if (should_reclaim_stall(nr_taken, nr_freed, priority, sc)) {
 198
 199                         /*
 200                          * The attempt at page out may have made some
 201 @@ -1404,6 +1451,20 @@ static void get_scan_ratio(struct zone *
 202         percent[1] = 100 - percent[0];
 203  }
 204
 205 +static void set_lumpy_reclaim_mode(int priority, struct scan_control *sc)
 206 +{
 207 +       /*
 208 +        * If we need a large contiguous chunk of memory, or have
 209 +        * trouble getting a small set of contiguous pages, we
 210 +        * will reclaim both active and inactive pages.
 211 +        */
 212 +       if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
 213 +               sc->lumpy_reclaim_mode = 1;
 214 +       else if (sc->order && priority < DEF_PRIORITY - 2)
 215 +               sc->lumpy_reclaim_mode = 1;
 216 +       else
 217 +               sc->lumpy_reclaim_mode = 0;
 218 +}
 219
 220  /*
 221   * This is a basic per-zone page freer.  Used by both kswapd and direct reclaim.
 222 @@ -1419,6 +1480,8 @@ static unsigned long shrink_zone(int pri
 223
 224         get_scan_ratio(zone, sc, percent);
 225
 226 +       set_lumpy_reclaim_mode(priority, sc);
 227 +
 228         for_each_evictable_lru(l) {
 229                 if (scan_global_lru(sc)) {
 230                         int file = is_file_lru(l);