--- /dev/null
+Make it possible to have interactivity and responsiveness at very high load
+levels by making deadlines offset by the fork depth from init. This has a
+similar effect to 'nice'ing loads that are fork heavy. 'make' is a perfect
+example of this: with fork_depth_penalty enabled, 'make -j24' will be felt
+about as much as a plain 'make' normally would be.
+
+Note that this drastically affects CPU distribution, and also has the
+indirect side effect of partitioning CPU entitlement to different users as
+well. No assumption as to CPU distribution should be made based on past
+behaviour.
+
+This is achieved by separating out forks to new processes vs new threads.
+When a new process is detected, its fork depth is inherited from its parent
+across fork() and then is incremented by one. That fork_depth is then used
+to cause a relative offset of its deadline.
+
+This feature is enabled in this patch by default and can be optionally
+disabled.
+
+Threads are kept at the same fork_depth as their parent process, and can
+optionally have their CPU entitlement all managed as one process together
+by enabling the group_thread_accounting feature. This feature is disabled
+by default in this patch, as many desktop applications such as firefox,
+amarok, etc. are multithreaded. Disabling this feature while leaving
+fork_depth_penalty enabled (the default) biases CPU distribution towards
+desktop applications.
+
+Extensive testing is required to ensure this does not cause regressions in
+common workloads.
+
+There are two sysctls to enable/disable these features.
+
+They are in /proc/sys/kernel/
+
+group_thread_accounting - groups CPU accounting by threads
+fork_depth_penalty - penalises according to depth of forking from init
+
+-ck
+
+---
+ include/linux/sched.h | 7 +++
+ kernel/sched_bfs.c | 88 ++++++++++++++++++++++++++++++++++++++++++++++----
+ kernel/sysctl.c | 20 +++++++++++
+ 3 files changed, 108 insertions(+), 7 deletions(-)
+
+Index: linux-2.6.36-rc7-ck1/include/linux/sched.h
+===================================================================
+--- linux-2.6.36-rc7-ck1.orig/include/linux/sched.h 2010-10-08 09:39:38.016240768 +1100
++++ linux-2.6.36-rc7-ck1/include/linux/sched.h 2010-10-08 09:39:53.575007838 +1100
+@@ -1187,10 +1187,15 @@ struct task_struct {
+ unsigned int rt_priority;
+ #ifdef CONFIG_SCHED_BFS
+ int time_slice;
+- u64 deadline;
++ /* Virtual deadline in niffies, and when the deadline was set */
++ u64 deadline, deadline_niffy;
+ struct list_head run_list;
+ u64 last_ran;
+ u64 sched_time; /* sched_clock time spent running */
++ /* Number of threads currently requesting CPU time */
++ unsigned long threads_running;
++ /* Depth of forks from init */
++ int fork_depth;
+
+ unsigned long rt_timeout;
+ #else /* CONFIG_SCHED_BFS */
+Index: linux-2.6.36-rc7-ck1/kernel/sched_bfs.c
+===================================================================
+--- linux-2.6.36-rc7-ck1.orig/kernel/sched_bfs.c 2010-10-08 09:39:37.918242270 +1100
++++ linux-2.6.36-rc7-ck1/kernel/sched_bfs.c 2010-10-08 11:16:01.382198622 +1100
+@@ -139,6 +139,15 @@ int rr_interval __read_mostly = 6;
+ int sched_iso_cpu __read_mostly = 70;
+
+ /*
++ * group_thread_accounting - sysctl to decide whether to treat whole thread
++ * groups as a single entity for the purposes of CPU distribution.
++ */
++int group_thread_accounting __read_mostly;
++
++/* fork_depth_penalty - Whether to penalise CPU according to fork depth. */
++int fork_depth_penalty __read_mostly = 1;
++
++/*
+ * The relative length of deadline for each priority(nice) level.
+ */
+ static int prio_ratios[PRIO_RANGE] __read_mostly;
+@@ -661,11 +670,29 @@ static int isoprio_suitable(void)
+ return !grq.iso_refractory;
+ }
+
++static inline u64 __task_deadline_diff(struct task_struct *p);
++static inline u64 task_deadline_diff(struct task_struct *p);
++
+ /*
+ * Adding to the global runqueue. Enter with grq locked.
+ */
+ static void enqueue_task(struct task_struct *p)
+ {
++ s64 max_tdd = task_deadline_diff(p);
++
++ /*
++ * Make sure that when we're queueing this task again that it
++ * doesn't have any old deadlines from when the thread group was
++ * being penalised and cap the deadline to the highest it could
++ * be, based on the current number of threads running.
++ */
++ if (group_thread_accounting) {
++ max_tdd += p->group_leader->threads_running *
++ __task_deadline_diff(p);
++ }
++ if (p->deadline - p->deadline_niffy > max_tdd)
++ p->deadline = p->deadline_niffy + max_tdd;
++
+ if (!rt_task(p)) {
+ /* Check it hasn't gotten rt from PI */
+ if ((idleprio_task(p) && idleprio_suitable(p)) ||
+@@ -967,10 +994,13 @@ static int effective_prio(struct task_st
+ }
+
+ /*
+- * activate_task - move a task to the runqueue. Enter with grq locked.
++ * activate_task - move a task to the runqueue. Enter with grq locked. The
++ * number of threads running is stored in the group_leader struct.
+ */
+ static void activate_task(struct task_struct *p, struct rq *rq)
+ {
++ unsigned long *threads_running = &p->group_leader->threads_running;
++
+ update_clocks(rq);
+
+ /*
+@@ -987,6 +1017,14 @@ static void activate_task(struct task_st
+ p->prio = effective_prio(p);
+ if (task_contributes_to_load(p))
+ grq.nr_uninterruptible--;
++ /*
++ * Adjust deadline according to number of running threads within
++ * this thread group. This ends up distributing CPU to the thread
++ * group as a single entity.
++ */
++ ++*threads_running;
++ if (*threads_running > 1 && group_thread_accounting)
++ p->deadline += __task_deadline_diff(p);
+ enqueue_task(p);
+ grq.nr_running++;
+ inc_qnr();
+@@ -998,9 +1036,14 @@ static void activate_task(struct task_st
+ */
+ static inline void deactivate_task(struct task_struct *p)
+ {
++ unsigned long *threads_running = &p->group_leader->threads_running;
++
+ if (task_contributes_to_load(p))
+ grq.nr_uninterruptible++;
+ grq.nr_running--;
++ --*threads_running;
++ if (*threads_running > 0 && group_thread_accounting)
++ p->deadline -= __task_deadline_diff(p);
+ }
+
+ #ifdef CONFIG_SMP
+@@ -1635,6 +1678,10 @@ void wake_up_new_task(struct task_struct
+ parent = p->parent;
+ /* Unnecessary but small chance that the parent changed CPU */
+ set_task_cpu(p, task_cpu(parent));
++ if (!(clone_flags & CLONE_THREAD)) {
++ p->fork_depth++;
++ p->threads_running = 0;
++ }
+ activate_task(p, rq);
+ trace_sched_wakeup_new(p, 1);
+ if (!(clone_flags & CLONE_VM) && rq->curr == parent &&
+@@ -2524,11 +2571,20 @@ static inline u64 prio_deadline_diff(int
+ return (prio_ratios[user_prio] * rr_interval * (MS_TO_NS(1) / 128));
+ }
+
+-static inline u64 task_deadline_diff(struct task_struct *p)
++static inline u64 __task_deadline_diff(struct task_struct *p)
+ {
+ return prio_deadline_diff(TASK_USER_PRIO(p));
+ }
+
++static inline u64 task_deadline_diff(struct task_struct *p)
++{
++ u64 pdd = __task_deadline_diff(p);
++
++ if (fork_depth_penalty && p->fork_depth > 1)
++ pdd *= p->fork_depth;
++ return pdd;
++}
++
+ static inline u64 static_deadline_diff(int static_prio)
+ {
+ return prio_deadline_diff(USER_PRIO(static_prio));
+@@ -2545,8 +2601,24 @@ static inline int ms_longest_deadline_di
+ */
+ static void time_slice_expired(struct task_struct *p)
+ {
++ u64 tdd = task_deadline_diff(p);
++
++ /*
++ * We proportionately increase the deadline according to how many
++ * threads are running. This effectively makes a thread group have
++ * the same CPU as one task, no matter how many threads are running.
++ * time_slice_expired can be called when there may be none running
++ * when p is deactivated so we must explicitly test for more than 1.
++ */
++ if (group_thread_accounting) {
++ unsigned long *threads_running = &p->group_leader->threads_running;
++
++ if (*threads_running > 1)
++ tdd += *threads_running * __task_deadline_diff(p);
++ }
+ p->time_slice = timeslice();
+- p->deadline = grq.niffies + task_deadline_diff(p);
++ p->deadline_niffy = grq.niffies;
++ p->deadline = grq.niffies + tdd;
+ }
+
+ /*
+@@ -3513,7 +3585,7 @@ SYSCALL_DEFINE1(nice, int, increment)
+ *
+ * This is the priority value as seen by users in /proc.
+ * RT tasks are offset by -100. Normal tasks are centered around 1, value goes
+- * from 0 (SCHED_ISO) up to 82 (nice +19 SCHED_IDLEPRIO).
++ * from 0 (SCHED_ISO) upwards (to nice +19 SCHED_IDLEPRIO).
+ */
+ int task_prio(const struct task_struct *p)
+ {
+@@ -3525,8 +3597,12 @@ int task_prio(const struct task_struct *
+
+ /* Convert to ms to avoid overflows */
+ delta = NS_TO_MS(p->deadline - grq.niffies);
+- delta = delta * 40 / ms_longest_deadline_diff();
+- if (delta > 0 && delta <= 80)
++ if (fork_depth_penalty)
++ delta *= 4;
++ else
++ delta *= 40;
++ delta /= ms_longest_deadline_diff();
++ if (delta > 0)
+ prio += delta;
+ if (idleprio_task(p))
+ prio += 40;
+Index: linux-2.6.36-rc7-ck1/kernel/sysctl.c
+===================================================================
+--- linux-2.6.36-rc7-ck1.orig/kernel/sysctl.c 2010-10-08 09:39:11.603648964 +1100
++++ linux-2.6.36-rc7-ck1/kernel/sysctl.c 2010-10-08 09:39:53.579007778 +1100
+@@ -121,6 +121,8 @@ static int __maybe_unused one_hundred =
+ #ifdef CONFIG_SCHED_BFS
+ extern int rr_interval;
+ extern int sched_iso_cpu;
++extern int group_thread_accounting;
++extern int fork_depth_penalty;
+ static int __read_mostly one_thousand = 1000;
+ #endif
+ #ifdef CONFIG_PRINTK
+@@ -834,6 +836,24 @@ static struct ctl_table kern_table[] = {
+ .extra1 = &zero,
+ .extra2 = &one_hundred,
+ },
++ {
++ .procname = "group_thread_accounting",
++ .data = &group_thread_accounting,
++ .maxlen = sizeof (int),
++ .mode = 0644,
++ .proc_handler = &proc_dointvec_minmax,
++ .extra1 = &zero,
++ .extra2 = &one,
++ },
++ {
++ .procname = "fork_depth_penalty",
++ .data = &fork_depth_penalty,
++ .maxlen = sizeof (int),
++ .mode = 0644,
++ .proc_handler = &proc_dointvec_minmax,
++ .extra1 = &zero,
++ .extra2 = &one,
++ },
+ #endif
+ #if defined(CONFIG_S390) && defined(CONFIG_SMP)
+ {
--- /dev/null
+diff -uprN linux-2.6.28/block/genhd.c linux-2.6.28.new/block/genhd.c
+--- linux-2.6.28/block/genhd.c 2011-03-13 15:15:43.815647000 +0100
++++ linux-2.6.28.new/block/genhd.c 2011-03-15 10:13:35.145764805 +0100
+@@ -1129,6 +1129,8 @@ struct gendisk *alloc_disk_node(int mino
+ disk->part_tbl->part[0] = &disk->part0;
+
+ disk->minors = minors;
++ disk->flags |=
++ (GENHD_FL_REMAP_SWAPPED_PAGES | GENHD_FL_NOTIFY_REMAPPED_ONLY);
+ rand_initialize_disk(disk);
+ disk_to_dev(disk)->class = &block_class;
+ disk_to_dev(disk)->type = &disk_type;
+diff -uprN linux-2.6.28/include/linux/blkdev.h linux-2.6.28.new/include/linux/blkdev.h
+--- linux-2.6.28/include/linux/blkdev.h 2011-02-01 09:54:54.519982520 +0100
++++ linux-2.6.28.new/include/linux/blkdev.h 2011-02-01 10:15:39.369903561 +0100
+@@ -1068,6 +1068,8 @@ struct block_device_operations {
+ int (*media_changed) (struct gendisk *);
+ int (*revalidate_disk) (struct gendisk *);
+ int (*getgeo)(struct block_device *, struct hd_geometry *);
++ /* this callback is with swap_lock and sometimes page table lock held */
++ void (*swap_slot_free_notify) (struct block_device *, unsigned long);
+ struct module *owner;
+ };
+
+diff -uprN linux-2.6.28/include/linux/genhd.h linux-2.6.28.new/include/linux/genhd.h
+--- linux-2.6.28/include/linux/genhd.h 2011-03-13 15:23:58.275368057 +0100
++++ linux-2.6.28.new/include/linux/genhd.h 2011-03-15 10:14:01.575121499 +0100
+@@ -113,6 +113,8 @@ struct hd_struct {
+ #define GENHD_FL_UP 16
+ #define GENHD_FL_SUPPRESS_PARTITION_INFO 32
+ #define GENHD_FL_EXT_DEVT 64 /* allow extended devt */
++#define GENHD_FL_REMAP_SWAPPED_PAGES 128
++#define GENHD_FL_NOTIFY_REMAPPED_ONLY 256
+
+ #define BLK_SCSI_MAX_CMDS (256)
+ #define BLK_SCSI_CMD_PER_LONG (BLK_SCSI_MAX_CMDS / (sizeof(long) * 8))
+diff -uprN linux-2.6.28/mm/swapfile.c linux-2.6.28.new/mm/swapfile.c
+--- linux-2.6.28/mm/swapfile.c 2011-02-01 09:54:31.434289623 +0100
++++ linux-2.6.28.new/mm/swapfile.c 2011-03-15 10:14:50.998178343 +0100
+@@ -270,10 +270,23 @@ out:
+ return NULL;
+ }
+
++static void swap_entry_update(struct swap_info_struct *p, unsigned long offset)
++{
++ if (offset < p->lowest_bit)
++ p->lowest_bit = offset;
++ if (offset > p->highest_bit)
++ p->highest_bit = offset;
++ if (p->prio > swap_info[swap_list.next].prio)
++ swap_list.next = p - swap_info;
++ nr_swap_pages++;
++ p->inuse_pages--;
++}
++
+ static int swap_entry_free(struct swap_info_struct *p, unsigned long offset)
+ {
+ int count = p->swap_map[offset];
+ unsigned old;
++ struct gendisk *disk;
+
+ if (count >= SWAP_MAP_MAX)
+ return count;
+@@ -283,28 +296,40 @@ static int swap_entry_free(struct swap_i
+ if (count)
+ return count;
+
+- spin_lock(&p->remap_lock);
++ disk = p->bdev->bd_disk;
+
+- if (offset < p->lowest_bit)
+- p->lowest_bit = offset;
+- if (offset > p->highest_bit)
+- p->highest_bit = offset;
+- if (p->prio > swap_info[swap_list.next].prio)
+- swap_list.next = p - swap_info;
+- nr_swap_pages++;
+- p->inuse_pages--;
++ if (p->swap_remap) {
++ spin_lock(&p->remap_lock);
++ swap_entry_update(p, offset);
++ }
++ else {
++ swap_entry_update(p, offset);
++ if (disk->fops->swap_slot_free_notify)
++ disk->fops->swap_slot_free_notify(p->bdev, offset);
++ return 0;
++ }
+
+ /* Re-map the page number */
+ old = p->swap_remap[offset] & 0x7FFFFFFF;
+ /* Zero means it was not re-mapped */
+- if (!old)
+- goto out;
++ if (!old) {
++ /* Skip notify if flag is set or the page is used */
++ if ((disk->flags & GENHD_FL_NOTIFY_REMAPPED_ONLY) ||
++ (p->swap_remap[offset] & 0x80000000))
++ goto out;
++
++ old = offset;
++ goto notify;
++ }
+ /* Clear the re-mapping */
+ p->swap_remap[offset] &= 0x80000000;
+ /* Mark the re-mapped page as unused */
+ p->swap_remap[old] &= 0x7FFFFFFF;
+ /* Record how many free pages there are */
+ p->gaps_exist += 1;
++notify:
++ if (disk->fops->swap_slot_free_notify)
++ disk->fops->swap_slot_free_notify(p->bdev, old);
+ out:
+ spin_unlock(&p->remap_lock);
+ return 0;
+@@ -1110,6 +1135,8 @@ sector_t map_swap_page(struct swap_info_
+ struct swap_extent *start_se = se;
+ unsigned old;
+
++ if (!sis->swap_remap)
++ goto out;
+ /*
+ * Instead of using the offset we are given, re-map it to the next
+ * sequential position.
+@@ -1159,7 +1186,7 @@ sector_t map_swap_page(struct swap_info_
+ offset = old;
+ }
+ spin_unlock(&sis->remap_lock);
+-
++out:
+ for ( ; ; ) {
+ struct list_head *lh;
+
+@@ -1517,8 +1544,10 @@ SYSCALL_DEFINE1(swapoff, const char __us
+ p->flags = 0;
+ spin_unlock(&swap_lock);
+ mutex_unlock(&swapon_mutex);
+- kfree(p->gap_pool_arr);
+- vfree(p->swap_remap);
++ if (p->swap_remap) {
++ kfree(p->gap_pool_arr);
++ vfree(p->swap_remap);
++ }
+ vfree(swap_map);
+ inode = mapping->host;
+ if (S_ISBLK(inode->i_mode)) {
+@@ -1832,15 +1861,17 @@ SYSCALL_DEFINE2(swapon, const char __use
+ error = -ENOMEM;
+ goto bad_swap;
+ }
+- swap_remap = vmalloc(maxpages * sizeof(unsigned));
+- if (!swap_remap) {
+- error = -ENOMEM;
+- goto bad_swap;
++ if (p->bdev->bd_disk->flags & GENHD_FL_REMAP_SWAPPED_PAGES) {
++ swap_remap = vmalloc(maxpages * sizeof(unsigned));
++ if (!swap_remap) {
++ error = -ENOMEM;
++ goto bad_swap;
++ }
++ memset(swap_remap, 0, maxpages * sizeof(unsigned));
+ }
+
+ error = 0;
+ memset(swap_map, 0, maxpages * sizeof(short));
+- memset(swap_remap, 0, maxpages * sizeof(unsigned));
+ for (i = 0; i < swap_header->info.nr_badpages; i++) {
+ int page_nr = swap_header->info.badpages[i];
+ if (page_nr <= 0 || page_nr >= swap_header->info.last_page)
+@@ -1872,13 +1903,15 @@ SYSCALL_DEFINE2(swapon, const char __use
+ goto bad_swap;
+ }
+
+- p->gap_pool_arr = kmalloc(sizeof(struct swap_gap_node)*
+- SWAP_GAP_TREE_SIZE, GFP_KERNEL);
+- if (!p->gap_pool_arr) {
+- error = -ENOMEM;
+- goto bad_swap;
++ if (swap_remap) {
++ p->gap_pool_arr = kmalloc(sizeof(struct swap_gap_node)*
++ SWAP_GAP_TREE_SIZE, GFP_KERNEL);
++ if (!p->gap_pool_arr) {
++ error = -ENOMEM;
++ goto bad_swap;
++ }
++ p->gaps_tree = RB_ROOT;
+ }
+- p->gaps_tree = RB_ROOT;
+
+ mutex_lock(&swapon_mutex);
+ spin_lock(&swap_lock);
+@@ -1889,11 +1922,13 @@ SYSCALL_DEFINE2(swapon, const char __use
+ p->prio = --least_priority;
+ p->swap_map = swap_map;
+ p->swap_remap = swap_remap;
+- p->gap_next = 1;
+- p->gap_end = p->max - 1;
+- p->gaps_exist = p->max - 1;
+- spin_lock_init(&p->remap_lock);
+- mutex_init(&p->remap_mutex);
++ if (swap_remap) {
++ p->gap_next = 1;
++ p->gap_end = p->max - 1;
++ p->gaps_exist = p->max - 1;
++ spin_lock_init(&p->remap_lock);
++ mutex_init(&p->remap_mutex);
++ }
+ p->flags = SWP_ACTIVE;
+ nr_swap_pages += nr_good_pages;
+ total_swap_pages += nr_good_pages;
+@@ -1932,7 +1967,8 @@ bad_swap_2:
+ p->swap_file = NULL;
+ p->flags = 0;
+ spin_unlock(&swap_lock);
+- vfree(swap_remap);
++ if (swap_remap)
++ vfree(swap_remap);
+ vfree(swap_map);
+ if (swap_file)
+ filp_close(swap_file, NULL);