Another patchset from CK to lighten vm load
authorPeter Hunt <peter_j_hunt@hotmail.com>
Thu, 26 May 2011 11:24:34 +0000 (11:24 +0000)
committerPeter Hunt <peter_j_hunt@hotmail.com>
Thu, 26 May 2011 11:24:34 +0000 (11:24 +0000)
kernel-bfs-2.6.28/debian/patches/mm-background_scan-2.patch [new file with mode: 0644]
kernel-bfs-2.6.28/debian/patches/mm-lots_watermark.diff [new file with mode: 0644]
kernel-bfs-2.6.28/debian/patches/series

diff --git a/kernel-bfs-2.6.28/debian/patches/mm-background_scan-2.patch b/kernel-bfs-2.6.28/debian/patches/mm-background_scan-2.patch
new file mode 100644 (file)
index 0000000..80286f8
--- /dev/null
@@ -0,0 +1,138 @@
+Add a background scanning timer to restore the watermarks to the pages_lots
+level, and only let it fire if kswapd has not run for the last 5 seconds.
+This allows us to balance all zones to the more generous pages_lots
+watermark at a time unrelated to page allocation, thus leading to lighter
+levels of vm load when called upon under page allocation.
+
+Signed-off-by: Con Kolivas <kernel@kolivas.org>
+
+The -ck patches modify mm/vmscan.c and add a timer to wake up kswapd every 5 
+seconds. This timer is initialized after the creation of the kswapd thread.
+
+The kswapd() thread function calls mod_timer at the front of its infinite 
+service loop (to reset the timer to 5 seconds in the future). mod_timer() 
+includes a BUG_ON() to assert that the timer's callback function is set.
+
+Since the wakeup timer is initialized after the kswapd thread is created, if 
+kswapd gets scheduled before kswapd_run() has prepared the timer, the 
+BUG_ON() check will throw a stack trace and immediately terminate the kswapd 
+thread.
+
+This patch modifies the kswapd_run() function in mm/vmscan.c to initialize the 
+watermark timer before starting the kswapd thread.
+
+Signed-off-by: Chase Venters <chase.venters at clientec.com>
+
+ include/linux/mmzone.h |    2 ++
+ mm/vmscan.c            |   42 ++++++++++++++++++++++++++++++++++++++++++
+ 2 files changed, 44 insertions(+)
+
+Index: linux-2.6.22-ck1/include/linux/mmzone.h
+===================================================================
+--- linux-2.6.22-ck1.orig/include/linux/mmzone.h       2007-07-09 18:44:40.000000000 +1000
++++ linux-2.6.22-ck1/include/linux/mmzone.h    2007-07-09 18:44:40.000000000 +1000
+@@ -13,6 +13,7 @@
+ #include <linux/init.h>
+ #include <linux/seqlock.h>
+ #include <linux/nodemask.h>
++#include <linux/timer.h>
+ #include <linux/pageblock-flags.h>
+ #include <linux/bounds.h>
+ #include <asm/atomic.h>
+@@ -452,6 +453,7 @@ typedef struct pglist_data {
+       wait_queue_head_t kswapd_wait;
+       struct task_struct *kswapd;
+       int kswapd_max_order;
++      struct timer_list watermark_timer;
+ } pg_data_t;
+ #define node_present_pages(nid)       (NODE_DATA(nid)->node_present_pages)
+Index: linux-2.6.22-ck1/mm/vmscan.c
+===================================================================
+--- linux-2.6.22-ck1.orig/mm/vmscan.c  2007-07-09 18:44:40.000000000 +1000
++++ linux-2.6.22-ck1/mm/vmscan.c       2007-07-09 18:44:40.000000000 +1000
+@@ -37,6 +37,7 @@
+ #include <linux/rwsem.h>
+ #include <linux/delay.h>
+ #include <linux/kthread.h>
++#include <linux/timer.h>
+ #include <linux/freezer.h>
+ #include <linux/memcontrol.h>
+ #include <linux/delayacct.h>
+@@ -1330,6 +1331,8 @@ out:
+       return nr_reclaimed;
+ }
++#define WT_EXPIRY     (HZ * 5)        /* Time to wakeup watermark_timer */
++
+ /*
+  * The background pageout daemon, started as a kernel thread
+  * from the init process. 
+@@ -1377,6 +1380,8 @@ static int kswapd(void *p)
+       for ( ; ; ) {
+               unsigned long new_order;
++              /* kswapd has been busy so delay watermark_timer */
++              mod_timer(&pgdat->watermark_timer, jiffies + WT_EXPIRY);
+               prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
+               new_order = pgdat->kswapd_max_order;
+               pgdat->kswapd_max_order = 0;
+@@ -1604,20 +1609,57 @@ static int __devinit cpu_callback(struct
+ }
+ /*
++ * We wake up kswapd every WT_EXPIRY till free ram is above pages_lots
++ */
++static void watermark_wakeup(unsigned long data)
++{
++      pg_data_t *pgdat = (pg_data_t *)data;
++      struct timer_list *wt = &pgdat->watermark_timer;
++      int i;
++
++      if (!waitqueue_active(&pgdat->kswapd_wait) || above_background_load())
++              goto out;
++      for (i = pgdat->nr_zones - 1; i >= 0; i--) {
++              struct zone *z = pgdat->node_zones + i;
++
++              if (!populated_zone(z) || is_highmem(z)) {
++                      /* We are better off leaving highmem full */
++                      continue;
++              }
++              if (!zone_watermark_ok(z, 0, z->pages_lots, 0, 0)) {
++                      wake_up_interruptible(&pgdat->kswapd_wait);
++                      goto out;
++              }
++      }
++out:
++      mod_timer(wt, jiffies + WT_EXPIRY);
++      return;
++}
++
++/*
+  * This kswapd start function will be called by init and node-hot-add.
+  * On node-hot-add, kswapd will moved to proper cpus if cpus are hot-added.
+  */
+ int kswapd_run(int nid)
+ {
+       pg_data_t *pgdat = NODE_DATA(nid);
++      struct timer_list *wt;
+       int ret = 0;
+       if (pgdat->kswapd)
+               return 0;
++      wt = &pgdat->watermark_timer;
++      init_timer(wt);
++      wt->data = (unsigned long)pgdat;
++      wt->function = watermark_wakeup;
++      wt->expires = jiffies + WT_EXPIRY;
++      add_timer(wt);
++
+       pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
+       if (IS_ERR(pgdat->kswapd)) {
+               /* failure at boot is fatal */
++              del_timer(wt);
+               BUG_ON(system_state == SYSTEM_BOOTING);
+               printk("Failed to start kswapd on node %d\n",nid);
+               ret = -1;
+
diff --git a/kernel-bfs-2.6.28/debian/patches/mm-lots_watermark.diff b/kernel-bfs-2.6.28/debian/patches/mm-lots_watermark.diff
new file mode 100644 (file)
index 0000000..c756bd6
--- /dev/null
@@ -0,0 +1,108 @@
+The vm currently performs scanning when allocating ram once the watermarks
+are below the pages_low value and tries to restore them to the pages_high
+watermark. The disadvantage of this is that we are scanning most aggressively
+at the same time we are allocating ram regardless of the stress the vm is
+under. Add a pages_lots watermark and allow the watermark to be relaxed
+according to the stress the vm is under at the time (according to the priority
+value). Thus we have more in reserve next time we are allocating ram and end
+up scanning less aggressively. Note the actual pages_lots isn't used directly
+in this code.
+
+Signed-off-by: Con Kolivas <kernel@kolivas.org>
+
+ include/linux/mmzone.h |    2 +-
+ mm/page_alloc.c        |    3 +++
+ mm/vmscan.c            |   17 ++++++++++++++---
+ 3 files changed, 18 insertions(+), 4 deletions(-)
+
+Index: linux-2.6.22-ck1/include/linux/mmzone.h
+===================================================================
+--- linux-2.6.22-ck1.orig/include/linux/mmzone.h       2007-07-09 18:44:34.000000000 +1000
++++ linux-2.6.22-ck1/include/linux/mmzone.h    2007-07-09 18:44:39.000000000 +1000
+@@ -181,7 +181,7 @@ enum zone_type {
+ struct zone {
+       /* Fields commonly accessed by the page allocator */
+-      unsigned long           pages_min, pages_low, pages_high;
++      unsigned long           pages_min, pages_low, pages_high, pages_lots;
+       /*
+        * We don't know if the memory that we're going to allocate will be freeable
+        * or/and it will be released eventually, so to avoid totally wasting several
+Index: linux-2.6.22-ck1/mm/page_alloc.c
+===================================================================
+--- linux-2.6.22-ck1.orig/mm/page_alloc.c      2007-07-09 18:44:34.000000000 +1000
++++ linux-2.6.22-ck1/mm/page_alloc.c   2007-07-09 18:44:39.000000000 +1000
+@@ -1570,6 +1570,7 @@ void show_free_areas(void)
+                       " min:%lukB"
+                       " low:%lukB"
+                       " high:%lukB"
++                      " lots:%lukB"
+                       " active_anon:%lukB"
+                       " inactive_anon:%lukB"
+                       " active_file:%lukB"
+@@ -1581,6 +1582,7 @@ void show_free_areas(void)
+                       K(zone->pages_min),
+                       K(zone->pages_low),
+                       K(zone->pages_high),
++                      K(zone->pages_lots),
+                       K(zone_page_state(zone, NR_ACTIVE_ANON)),
+                       K(zone_page_state(zone, NR_INACTIVE_ANON)),
+                       K(zone_page_state(zone, NR_ACTIVE_FILE)),
+@@ -3142,6 +3144,7 @@ void setup_per_zone_pages_min(void)
+               zone->pages_low   = zone->pages_min + (tmp >> 2);
+               zone->pages_high  = zone->pages_min + (tmp >> 1);
++              zone->pages_lots  = zone->pages_min + tmp;
+               setup_zone_migrate_reserve(zone);
+               spin_unlock_irqrestore(&zone->lock, flags);
+       }
+Index: linux-2.6.22-ck1/mm/vmscan.c
+===================================================================
+--- linux-2.6.22-ck1.orig/mm/vmscan.c  2007-07-09 18:44:39.000000000 +1000
++++ linux-2.6.22-ck1/mm/vmscan.c       2007-07-09 18:44:39.000000000 +1000
+@@ -1171,6 +1171,7 @@ loop_again:
+                */
+               for (i = pgdat->nr_zones - 1; i >= 0; i--) {
+                       struct zone *zone = pgdat->node_zones + i;
++                      unsigned long watermark;
+                       if (!populated_zone(zone))
+                               continue;
+@@ -1178,8 +1179,14 @@ loop_again:
+                               shrink_active_list(SWAP_CLUSTER_MAX, zone,
+                                                       &sc, priority, 0);
+-                      if (!zone_watermark_ok(zone, order, zone->pages_high,
+-                                             0, 0)) {
++                      /*
++                       * The watermark is relaxed depending on the
++                       * level of "priority" till it drops to
++                       * pages_high.
++                       */
++                      watermark = zone->pages_high + (zone->pages_high *
++                                  priority / DEF_PRIORITY);
++                      if (!zone_watermark_ok(zone, order, watermark, 0, 0)) {
+                               end_zone = i;
+                               break;
+                       }
+@@ -1206,6 +1213,7 @@ loop_again:
+               for (i = 0; i <= end_zone; i++) {
+                       struct zone *zone = pgdat->node_zones + i;
+                       int nr_slab;
++                      unsigned long watermark;
+                       if (!populated_zone(zone))
+                               continue;
+@@ -1213,7 +1221,10 @@ loop_again:
+                                       priority != DEF_PRIORITY)
+                               continue;
+-                      if (!zone_watermark_ok(zone, order, zone->pages_high,
++                      watermark = zone->pages_high + (zone->pages_high *
++                                  priority / DEF_PRIORITY);
++
++                      if (!zone_watermark_ok(zone, order, watermark,
+                                              end_zone, 0))
+                               all_zones_ok = 0;
+                       temp_priority[i] = priority;
+
index bc219dd..3339ffd 100644 (file)
@@ -38,8 +38,10 @@ sched-add-above-background-load-function.patch
 mm-make_swappiness_really_mean_it.patch
 mm-enable_swaptoken_only_when_swap_full.patch
 mm-drop_swap_cache_aggressively.patch
+mm-lots_watermark.diff
 mm-kswapd_inherit_prio-1.patch
 mm-idleprio_prio-1.patch
+mm-background_scan-2.patch
 mm-lru_cache_add_lru_tail.patch
 hz-raise_max.patch
 cpufreq-bfs_tweaks.patch