From 835ec08ba453b4bf94152feb11516e3732a5a40b Mon Sep 17 00:00:00 2001
From: Corey O'Connor
Date: Mon, 27 Sep 2010 14:00:48 -0700
Subject: [PATCH] adding initial BFS 330 to 350 patch

---
 .../debian/patches/bfs-330-to-350.patch    | 1036 ++++++++++++++++++++
 kernel-power-2.6.28/debian/patches/series  |    1 +
 2 files changed, 1037 insertions(+)
 create mode 100644 kernel-power-2.6.28/debian/patches/bfs-330-to-350.patch

diff --git a/kernel-power-2.6.28/debian/patches/bfs-330-to-350.patch b/kernel-power-2.6.28/debian/patches/bfs-330-to-350.patch
new file mode 100644
index 0000000..46af1f9
--- /dev/null
+++ b/kernel-power-2.6.28/debian/patches/bfs-330-to-350.patch
@@ -0,0 +1,1036 @@
+- Major overhaul of queued changes.
+
+- Microoptimise multiplications/divisions to be shifts where suitable.
+
+- Change ISO calculations to have their own lock so as to not grab the grq
+lock during a scheduler tick.
+
+- Change all deadline accounting to use nanosecond values.
+
+- Introduce local niffies variable which is updated from any runqueue using the
+TSC clock whenever the grq lock is taken. Use niffies to compare deadlines to.
+This will give much more granular deadlines when jiffies are low resolution
+such as 100Hz, and rarely will tasks have the same deadlines now.
+
+- Drop the "skip_clock_update" concept as we update the niffies each time we
+update the rq clocks, thus we want to update it more often.
+
+- Rework try_preempt.
+
+- Bypass rechecking deadline when we know that prev will run again in schedule.
+
+- Check to see if prev can run on an idle CPU when being descheduled as may
+happen when a task must use a certain CPU for affinity reasons.
+
+- Decrease maximum rr_interval possible to 1000 (one second) as there is no
+demonstrable advantage to higher values, and may overflow with other changes
+being introduced.
+
+- Check for when tasks are scheduled within a tick to see if they're in the
+1st or 2nd half of the tick. Use this to decide how far into last tick a task
+can run. This will make the greatest difference on lower Hz values with small
+rr intervals.
+
+- Change the test for exhausted time slice to 100us as rescheduling with less
+time available than this will either greatly overrun its quota or reschedule
+very quickly.
+
+- Change SCHED_BATCH tasks to refill timeslices and reset deadline every time
+they're descheduled as they've been flagged as latency insensitive, likely
+fully CPU bound tasks. This should decrease the impact running batch tasks
+has on other tasks.
+
+- Microoptimise before context switch in schedule()
+
+- Add a local last_task variable to each runqueue which keeps a copy of the
+last non-idle task that ran on this CPU. Use this value to determine that a
+task is still cache warm on this CPU even if it has run elsewhere in the
+meantime. This improves throughput on relatively idle systems with >2 logical
+CPUs.
+
+- Remove the first_time_slice concept as it wasn't contributing on any
+meaningful level but was adding to overhead, especially on sched_exit.
+
+- Check that when a task forks and loses timeslice to its child that it isn't
+due to be descheduled and ensure that a child inherits the proper variables
+from its parent across fork.
+
+- Fix try_preempt which may have been picking a higher priority runqueue on
+SMP if it had a later deadline. Remove the test for equal deadline as
+nanosecond deadline resolution means this will never happen.
+
+- Random other minor cleanups.
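The "shifts where suitable" item above shows up in the patch body below as the
MS_TO_NS/MS_TO_US/NS_TO_MS/NS_TO_US macros, which treat a millisecond as 2^20 ns
and a microsecond as 2^10 ns. As a rough illustration only, and not part of the
patch, here is a user-space sketch of those macros and the roughly 5% overshoot
they accept in exchange for avoiding multiplies and divides; the rr_ms value is
a made-up example, not a value taken from the patch.

#include <stdio.h>

/* Approximate conversions in the style of the patch: shifts instead of
 * multiplying/dividing by powers of ten. */
#define MS_TO_NS(t)  ((t) << 20)   /* ~1,048,576 ns per "ms" instead of 1,000,000 */
#define MS_TO_US(t)  ((t) << 10)   /* ~1,024 us per "ms" instead of 1,000 */
#define NS_TO_MS(t)  ((t) >> 20)
#define NS_TO_US(t)  ((t) >> 10)

int main(void)
{
	long long rr_ms = 6;	/* hypothetical rr_interval in milliseconds */
	long long exact = rr_ms * 1000000LL;
	long long approx = MS_TO_NS(rr_ms);

	printf("exact:  %lld ns\n", exact);
	printf("approx: %lld ns (error %+.1f%%)\n",
	       approx, 100.0 * (approx - exact) / exact);
	return 0;
}

The error is a constant factor in one direction, so deadline comparisons between
tasks are unaffected; only the absolute scale shifts slightly.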
+ +-ck +--- + + Documentation/scheduler/sched-BFS.txt | 2 + include/linux/sched.h | 4 + kernel/sched_bfs.c | 466 +++++++++++++++++++++------------- + kernel/sysctl.c | 4 + 4 files changed, 293 insertions(+), 183 deletions(-) + +Index: linux-2.6.35-bfs/Documentation/scheduler/sched-BFS.txt +=================================================================== +--- linux-2.6.35-bfs.orig/Documentation/scheduler/sched-BFS.txt 2010-09-25 08:18:30.134360792 +1000 ++++ linux-2.6.35-bfs/Documentation/scheduler/sched-BFS.txt 2010-09-25 08:20:25.830887001 +1000 +@@ -257,7 +257,7 @@ uniprocessor machine, and automatically + multiprocessor machines. The reasoning behind increasing the value on more CPUs + is that the effective latency is decreased by virtue of there being more CPUs on + BFS (for reasons explained above), and increasing the value allows for less +-cache contention and more throughput. Valid values are from 1 to 5000 ++cache contention and more throughput. Valid values are from 1 to 1000 + Decreasing the value will decrease latencies at the cost of decreasing + throughput, while increasing it will improve throughput, but at the cost of + worsening latencies. The accuracy of the rr interval is limited by HZ resolution +Index: linux-2.6.35-bfs/include/linux/sched.h +=================================================================== +--- linux-2.6.35-bfs.orig/include/linux/sched.h 2010-09-25 08:18:08.792894602 +1000 ++++ linux-2.6.35-bfs/include/linux/sched.h 2010-09-25 08:20:25.822886826 +1000 +@@ -1197,7 +1197,7 @@ struct task_struct { + int prio, static_prio, normal_prio; + unsigned int rt_priority; + #ifdef CONFIG_SCHED_BFS +- int time_slice, first_time_slice; ++ int time_slice; + u64 deadline; + struct list_head run_list; + u64 last_ran; +@@ -1547,7 +1547,7 @@ static inline void tsk_cpus_current(stru + + static inline void print_scheduler_version(void) + { +- printk(KERN_INFO"BFS CPU scheduler v0.330 by Con Kolivas.\n"); ++ printk(KERN_INFO"BFS CPU scheduler v0.350 by Con Kolivas.\n"); + } + + static inline int iso_task(struct task_struct *p) +Index: linux-2.6.35-bfs/kernel/sched_bfs.c +=================================================================== +--- linux-2.6.35-bfs.orig/kernel/sched_bfs.c 2010-09-25 08:18:08.804894864 +1000 ++++ linux-2.6.35-bfs/kernel/sched_bfs.c 2010-09-25 08:20:25.827886935 +1000 +@@ -106,10 +106,19 @@ + #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) + #define SCHED_PRIO(p) ((p)+MAX_RT_PRIO) + +-/* Some helpers for converting to/from various scales.*/ ++/* ++ * Some helpers for converting to/from various scales. Use shifts to get ++ * approximate multiples of ten for less overhead. ++ */ + #define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) +-#define MS_TO_NS(TIME) ((TIME) * 1000000) +-#define MS_TO_US(TIME) ((TIME) * 1000) ++#define HALF_JIFFY_NS (1000000000 / HZ / 2) ++#define HALF_JIFFY_US (1000000 / HZ / 2) ++#define MS_TO_NS(TIME) ((TIME) << 20) ++#define MS_TO_US(TIME) ((TIME) << 10) ++#define NS_TO_MS(TIME) ((TIME) >> 20) ++#define NS_TO_US(TIME) ((TIME) >> 10) ++ ++#define RESCHED_US (100) /* Reschedule if less than this many us left */ + + /* + * This is the time all tasks within the same priority round robin. +@@ -140,8 +149,9 @@ static inline unsigned long timeslice(vo + } + + /* +- * The global runqueue data that all CPUs work off. All data is protected +- * by grq.lock. ++ * The global runqueue data that all CPUs work off. Data is protected either ++ * by the global grq lock, or the discrete lock that precedes the data in this ++ * struct. 
+ */ + struct global_rq { + raw_spinlock_t lock; +@@ -150,17 +160,17 @@ struct global_rq { + unsigned long long nr_switches; + struct list_head queue[PRIO_LIMIT]; + DECLARE_BITMAP(prio_bitmap, PRIO_LIMIT + 1); +- int iso_ticks; +- int iso_refractory; + #ifdef CONFIG_SMP + unsigned long qnr; /* queued not running */ + cpumask_t cpu_idle_map; + int idle_cpus; + #endif +-#if BITS_PER_LONG < 64 +- unsigned long jiffies; +- u64 jiffies_64; +-#endif ++ /* Nanosecond jiffies */ ++ u64 niffies; ++ ++ raw_spinlock_t iso_lock; ++ int iso_ticks; ++ int iso_refractory; + }; + + /* There can be only one */ +@@ -176,8 +186,8 @@ struct rq { + u64 nohz_stamp; + unsigned char in_nohz_recently; + #endif ++ struct task_struct *last_task; + #endif +- unsigned int skip_clock_update; + + struct task_struct *curr, *idle; + struct mm_struct *prev_mm; +@@ -213,9 +223,11 @@ struct rq { + /* See if all cache siblings are idle */ + cpumask_t cache_siblings; + #endif ++ u64 last_niffy; /* Last time this RQ updated grq.niffies */ + #endif ++ u64 clock, old_clock, last_tick; ++ int dither; + +- u64 clock; + #ifdef CONFIG_SCHEDSTATS + + /* latency stats */ +@@ -286,15 +298,6 @@ struct root_domain { + static struct root_domain def_root_domain; + #endif + +-static inline int cpu_of(struct rq *rq) +-{ +-#ifdef CONFIG_SMP +- return rq->cpu; +-#else +- return 0; +-#endif +-} +- + #define rcu_dereference_check_sched_domain(p) \ + rcu_dereference_check((p), \ + rcu_read_lock_sched_held() || \ +@@ -310,17 +313,65 @@ static inline int cpu_of(struct rq *rq) + #define for_each_domain(cpu, __sd) \ + for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent) + ++static inline void update_rq_clock(struct rq *rq); ++ + #ifdef CONFIG_SMP + #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) + #define this_rq() (&__get_cpu_var(runqueues)) + #define task_rq(p) cpu_rq(task_cpu(p)) + #define cpu_curr(cpu) (cpu_rq(cpu)->curr) ++static inline int cpu_of(struct rq *rq) ++{ ++ return rq->cpu; ++} ++ ++/* ++ * Niffies are a globally increasing nanosecond counter. Whenever a runqueue ++ * clock is updated with the grq.lock held, it is an opportunity to update the ++ * niffies value. Any CPU can update it by adding how much its clock has ++ * increased since it last updated niffies, minus any added niffies by other ++ * CPUs. ++ */ ++static inline void update_clocks(struct rq *rq) ++{ ++ s64 ndiff; ++ ++ update_rq_clock(rq); ++ ndiff = rq->clock - rq->old_clock; ++ /* old_clock is only updated when we are updating niffies */ ++ rq->old_clock = rq->clock; ++ ndiff -= grq.niffies - rq->last_niffy; ++ /* ++ * Sanity check should sched_clock return bogus values or be limited to ++ * just jiffy resolution. Some time will always have passed. 
++ */ ++ if (unlikely(ndiff < 1 || ndiff > MS_TO_NS(rr_interval))) ++ ndiff = 1; ++ grq.niffies += ndiff; ++ rq->last_niffy = grq.niffies; ++} + #else /* CONFIG_SMP */ + static struct rq *uprq; + #define cpu_rq(cpu) (uprq) + #define this_rq() (uprq) + #define task_rq(p) (uprq) + #define cpu_curr(cpu) ((uprq)->curr) ++static inline int cpu_of(struct rq *rq) ++{ ++ return 0; ++} ++ ++static inline void update_clocks(struct rq *rq) ++{ ++ s64 ndiff; ++ ++ update_rq_clock(rq); ++ ndiff = rq->clock - rq->old_clock; ++ rq->old_clock = rq->clock; ++ if (unlikely(ndiff < 1 || ndiff > MS_TO_US(rr_interval))) ++ ndiff = 1; ++ grq.niffies += ndiff; ++} + #endif + #define raw_rq() (&__raw_get_cpu_var(runqueues)) + +@@ -335,13 +386,13 @@ static struct rq *uprq; + + /* + * All common locking functions performed on grq.lock. rq->clock is local to +- * the cpu accessing it so it can be modified just with interrupts disabled, +- * but looking up task_rq must be done under grq.lock to be safe. ++ * the CPU accessing it so it can be modified just with interrupts disabled ++ * when we're not updating niffies. ++ * Looking up task_rq must be done under grq.lock to be safe. + */ +-inline void update_rq_clock(struct rq *rq) ++static inline void update_rq_clock(struct rq *rq) + { +- if (!rq->skip_clock_update) +- rq->clock = sched_clock_cpu(cpu_of(rq)); ++ rq->clock = sched_clock_cpu(cpu_of(rq)); + } + + static inline int task_running(struct task_struct *p) +@@ -370,8 +421,8 @@ static inline void grq_lock_irq(void) + static inline void time_lock_grq(struct rq *rq) + __acquires(grq.lock) + { +- update_rq_clock(rq); + grq_lock(); ++ update_clocks(rq); + } + + static inline void grq_unlock_irq(void) +@@ -405,7 +456,7 @@ static inline struct rq + __acquires(grq.lock) + { + struct rq *rq = task_grq_lock(p, flags); +- update_rq_clock(rq); ++ update_clocks(rq); + return rq; + } + +@@ -420,7 +471,7 @@ static inline void time_task_grq_lock_ir + __acquires(grq.lock) + { + struct rq *rq = task_grq_lock_irq(p); +- update_rq_clock(rq); ++ update_clocks(rq); + } + + static inline void task_grq_unlock_irq(void) +@@ -515,33 +566,6 @@ static inline void finish_lock_switch(st + } + #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ + +-/* +- * In order to have a monotonic clock that does not wrap we have a 64 bit +- * unsigned long that's protected by grq.lock used in place of jiffies on +- * 32 bit builds. +- */ +-#if BITS_PER_LONG < 64 +-static inline void update_gjiffies(void) +-{ +- if (grq.jiffies != jiffies) { +- grq_lock(); +- grq.jiffies = jiffies; +- grq.jiffies_64++; +- grq_unlock(); +- } +-} +- +-#define gjiffies (grq.jiffies_64) +- +-#else /* BITS_PER_LONG < 64 */ +-static inline void update_gjiffies(void) +-{ +-} +- +-#define gjiffies jiffies +- +-#endif /* BITS_PER_LONG < 64 */ +- + static inline int deadline_before(u64 deadline, u64 time) + { + return (deadline < time); +@@ -574,17 +598,6 @@ static void dequeue_task(struct task_str + } + + /* +- * When a task is freshly forked, the first_time_slice flag is set to say +- * it has taken time_slice from its parent and if it exits on this first +- * time_slice it can return its time_slice back to the parent. +- */ +-static inline void reset_first_time_slice(struct task_struct *p) +-{ +- if (unlikely(p->first_time_slice)) +- p->first_time_slice = 0; +-} +- +-/* + * To determine if it's safe for a task of SCHED_IDLEPRIO to actually run as + * an idle task, we ensure none of the following conditions are met. 
+ */ +@@ -646,11 +659,11 @@ static inline int task_prio_ratio(struct + /* + * task_timeslice - all tasks of all priorities get the exact same timeslice + * length. CPU distribution is handled by giving different deadlines to +- * tasks of different priorities. ++ * tasks of different priorities. Use 128 as the base value for fast shifts. + */ + static inline int task_timeslice(struct task_struct *p) + { +- return (rr_interval * task_prio_ratio(p) / 100); ++ return (rr_interval * task_prio_ratio(p) / 128); + } + + #ifdef CONFIG_SMP +@@ -702,6 +715,15 @@ static int suitable_idle_cpus(struct tas + + static void resched_task(struct task_struct *p); + ++/* ++ * last_task stores the last non-idle task scheduled on the local rq for ++ * cache warmth testing. ++ */ ++static inline void set_last_task(struct rq *rq, struct task_struct *p) ++{ ++ rq->last_task = p; ++} ++ + #define CPUIDLE_CACHE_BUSY (1) + #define CPUIDLE_DIFF_CPU (2) + #define CPUIDLE_THREAD_BUSY (4) +@@ -724,6 +746,9 @@ static void resched_task(struct task_str + * Other node, other CPU, idle cache, idle threads. + * Other node, other CPU, busy cache, idle threads. + * Other node, other CPU, busy threads. ++ * ++ * If p was the last task running on this rq, then regardless of where ++ * it has been running since then, it is cache warm on this rq. + */ + static void resched_best_idle(struct task_struct *p) + { +@@ -756,11 +781,14 @@ static void resched_best_idle(struct tas + tmp_rq = cpu_rq(cpu_tmp); + + if (rq->cpu_locality[cpu_tmp]) { ++ /* Check rq->last_task hasn't been dereferenced */ ++ if (rq->last_task && p != rq->last_task) { + #ifdef CONFIG_NUMA +- if (rq->cpu_locality[cpu_tmp] > 1) +- ranking |= CPUIDLE_DIFF_NODE; ++ if (rq->cpu_locality[cpu_tmp] > 1) ++ ranking |= CPUIDLE_DIFF_NODE; + #endif +- ranking |= CPUIDLE_DIFF_CPU; ++ ranking |= CPUIDLE_DIFF_CPU; ++ } + } + #ifdef CONFIG_SCHED_MC + if (!(tmp_rq->cache_idle(cpu_tmp))) +@@ -802,6 +830,11 @@ static inline void resched_suitable_idle + static inline int + cache_distance(struct rq *task_rq, struct rq *rq, struct task_struct *p) + { ++ /* Check rq->last_task hasn't been dereferenced */ ++ if (likely(rq->last_task)) { ++ if (rq->last_task == p) ++ return 0; ++ } + return rq->cpu_locality[cpu_of(task_rq)] * task_timeslice(p); + } + #else /* CONFIG_SMP */ +@@ -840,6 +873,10 @@ cache_distance(struct rq *task_rq, struc + { + return 0; + } ++ ++static inline void set_last_task(struct rq *rq, struct task_struct *p) ++{ ++} + #endif /* CONFIG_SMP */ + + /* +@@ -887,7 +924,7 @@ static int effective_prio(struct task_st + */ + static void activate_task(struct task_struct *p, struct rq *rq) + { +- update_rq_clock(rq); ++ update_clocks(rq); + + /* + * Sleep time is in units of nanosecs, so shift by 20 to get a +@@ -1157,8 +1194,28 @@ EXPORT_SYMBOL_GPL(kick_process); + #endif + + #define rq_idle(rq) ((rq)->rq_prio == PRIO_LIMIT) +-#define task_idle(p) ((p)->prio == PRIO_LIMIT) + ++/* ++ * RT tasks preempt purely on priority. SCHED_NORMAL tasks preempt on the ++ * basis of earlier deadlines. SCHED_IDLEPRIO don't preempt anything else or ++ * between themselves, they cooperatively multitask. An idle rq scores as ++ * prio PRIO_LIMIT so it is always preempted. 
++ */ ++static inline int ++can_preempt(struct task_struct *p, int prio, unsigned long deadline, ++ unsigned int policy) ++{ ++ /* Better static priority RT task or better policy preemption */ ++ if (p->prio < prio) ++ return 1; ++ if (p->prio > prio) ++ return 0; ++ /* SCHED_NORMAL, BATCH and ISO will preempt based on deadline */ ++ if (!deadline_before(p->deadline, deadline)) ++ return 0; ++ return 1; ++} ++#ifdef CONFIG_SMP + #ifdef CONFIG_HOTPLUG_CPU + /* + * Check to see if there is a task that is affined only to offline CPUs but +@@ -1178,14 +1235,20 @@ static inline int online_cpus(struct tas + #endif + + /* +- * RT tasks preempt purely on priority. SCHED_NORMAL tasks preempt on the +- * basis of earlier deadlines. SCHED_BATCH, ISO and IDLEPRIO don't preempt +- * between themselves, they cooperatively multitask. An idle rq scores as +- * prio PRIO_LIMIT so it is always preempted. latest_deadline and +- * highest_prio_rq are initialised only to silence the compiler. When +- * all else is equal, still prefer this_rq. ++ * Check to see if p can run on cpu, and if not, whether there are any online ++ * CPUs it can run on instead. ++ */ ++static inline int needs_other_cpu(struct task_struct *p, int cpu) ++{ ++ if (unlikely(!cpu_isset(cpu, p->cpus_allowed) && online_cpus(p))) ++ return 1; ++ return 0; ++} ++ ++/* ++ * latest_deadline and highest_prio_rq are initialised only to silence the ++ * compiler. When all else is equal, still prefer this_rq. + */ +-#ifdef CONFIG_SMP + static void try_preempt(struct task_struct *p, struct rq *this_rq) + { + struct rq *highest_prio_rq = this_rq; +@@ -1193,6 +1256,10 @@ static void try_preempt(struct task_stru + int highest_prio; + cpumask_t tmp; + ++ /* IDLEPRIO tasks never preempt anything */ ++ if (p->policy == SCHED_IDLEPRIO) ++ return; ++ + if (suitable_idle_cpus(p)) { + resched_best_idle(p); + return; +@@ -1219,30 +1286,32 @@ static void try_preempt(struct task_stru + offset_deadline = rq->rq_deadline - + cache_distance(this_rq, rq, p); + +- if (rq_prio > highest_prio || +- (deadline_after(offset_deadline, latest_deadline) || +- (offset_deadline == latest_deadline && this_rq == rq))) { ++ if (rq_prio > highest_prio || (rq_prio == highest_prio && ++ deadline_after(offset_deadline, latest_deadline))) { + latest_deadline = offset_deadline; + highest_prio = rq_prio; + highest_prio_rq = rq; + } + } + +- if (p->prio > highest_prio || (p->prio == highest_prio && +- p->policy == SCHED_NORMAL && +- !deadline_before(p->deadline, latest_deadline))) ++ if (!can_preempt(p, highest_prio, highest_prio_rq->rq_deadline, ++ highest_prio_rq->rq_policy)) + return; + +- /* p gets to preempt highest_prio_rq->curr */ + resched_task(highest_prio_rq->curr); +- highest_prio_rq->skip_clock_update = 1; + } + #else /* CONFIG_SMP */ ++static inline int needs_other_cpu(struct task_struct *p, int cpu) ++{ ++ return 0; ++} ++ + static void try_preempt(struct task_struct *p, struct rq *this_rq) + { +- if (p->prio < uprq->rq_prio || +- (p->prio == uprq->rq_prio && p->policy == SCHED_NORMAL && +- deadline_before(p->deadline, uprq->rq_deadline))) ++ if (p->policy == SCHED_IDLEPRIO) ++ return; ++ if (can_preempt(p, uprq->rq_prio, uprq->rq_deadline, ++ uprq->rq_policy)) + resched_task(uprq->curr); + } + #endif /* CONFIG_SMP */ +@@ -1352,12 +1421,15 @@ int wake_up_state(struct task_struct *p, + return try_to_wake_up(p, state, 0); + } + ++static void time_slice_expired(struct task_struct *p); ++ + /* + * Perform scheduler related setup for a newly forked process p. 
+ * p is forked by current. + */ + void sched_fork(struct task_struct *p, int clone_flags) + { ++ struct task_struct *curr; + int cpu = get_cpu(); + struct rq *rq; + +@@ -1396,10 +1468,11 @@ void sched_fork(struct task_struct *p, i + p->sched_reset_on_fork = 0; + } + ++ curr = current; + /* + * Make sure we do not leak PI boosting priority to the child. + */ +- p->prio = current->normal_prio; ++ p->prio = curr->normal_prio; + + INIT_LIST_HEAD(&p->run_list); + #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) +@@ -1420,18 +1493,26 @@ void sched_fork(struct task_struct *p, i + * total amount of pending timeslices in the system doesn't change, + * resulting in more scheduling fairness. If it's negative, it won't + * matter since that's the same as being 0. current's time_slice is +- * actually in rq_time_slice when it's running. ++ * actually in rq_time_slice when it's running, as is its last_ran ++ * value. rq->rq_deadline is only modified within schedule() so it ++ * is always equal to current->deadline. + */ +- rq = task_grq_lock_irq(current); +- if (likely(rq->rq_time_slice > 0)) { ++ rq = task_grq_lock_irq(curr); ++ if (likely(rq->rq_time_slice >= RESCHED_US * 2)) { + rq->rq_time_slice /= 2; ++ p->time_slice = rq->rq_time_slice; ++ } else { + /* +- * The remainder of the first timeslice might be recovered by +- * the parent if the child exits early enough. ++ * Forking task has run out of timeslice. Reschedule it and ++ * start its child with a new time slice and deadline. The ++ * child will end up running first because its deadline will ++ * be slightly earlier. + */ +- p->first_time_slice = 1; ++ rq->rq_time_slice = 0; ++ set_tsk_need_resched(curr); ++ time_slice_expired(p); + } +- p->time_slice = rq->rq_time_slice; ++ p->last_ran = rq->rq_last_ran; + task_grq_unlock_irq(); + out: + put_cpu(); +@@ -1470,40 +1551,9 @@ void wake_up_new_task(struct task_struct + task_grq_unlock(&flags); + } + +-/* +- * Potentially available exiting-child timeslices are +- * retrieved here - this way the parent does not get +- * penalised for creating too many threads. +- * +- * (this cannot be used to 'generate' timeslices +- * artificially, because any timeslice recovered here +- * was given away by the parent in the first place.) 
+- */ ++/* Nothing to do here */ + void sched_exit(struct task_struct *p) + { +- struct task_struct *parent; +- unsigned long flags; +- struct rq *rq; +- +- if (unlikely(p->first_time_slice)) { +- int *par_tslice, *p_tslice; +- +- parent = p->parent; +- par_tslice = &parent->time_slice; +- p_tslice = &p->time_slice; +- +- rq = task_grq_lock(parent, &flags); +- /* The real time_slice of the "curr" task is on the rq var.*/ +- if (p == rq->curr) +- p_tslice = &rq->rq_time_slice; +- else if (parent == task_rq(parent)->curr) +- par_tslice = &rq->rq_time_slice; +- +- *par_tslice += *p_tslice; +- if (unlikely(*par_tslice > timeslice())) +- *par_tslice = timeslice(); +- task_grq_unlock(&flags); +- } + } + + #ifdef CONFIG_PREEMPT_NOTIFIERS +@@ -1981,7 +2031,7 @@ update_cpu_clock(struct rq *rq, struct t + else if (unlikely(time_diff > JIFFIES_TO_NS(1))) + time_diff = JIFFIES_TO_NS(1); + +- rq->rq_time_slice -= time_diff / 1000; ++ rq->rq_time_slice -= NS_TO_US(time_diff); + } + rq->rq_last_ran = rq->timekeep_clock = rq->clock; + } +@@ -1997,7 +2047,7 @@ static u64 do_task_delta_exec(struct tas + u64 ns = 0; + + if (p == rq->curr) { +- update_rq_clock(rq); ++ update_clocks(rq); + ns = rq->clock - rq->rq_last_ran; + if (unlikely((s64)ns < 0)) + ns = 0; +@@ -2171,10 +2221,22 @@ void account_idle_ticks(unsigned long ti + } + #endif + ++static inline void grq_iso_lock(void) ++ __acquires(grq.iso_lock) ++{ ++ raw_spin_lock(&grq.iso_lock); ++} ++ ++static inline void grq_iso_unlock(void) ++ __releases(grq.iso_lock) ++{ ++ raw_spin_unlock(&grq.iso_lock); ++} ++ + /* + * Functions to test for when SCHED_ISO tasks have used their allocated + * quota as real time scheduling and convert them back to SCHED_NORMAL. +- * Where possible, the data is tested lockless, to avoid grabbing grq_lock ++ * Where possible, the data is tested lockless, to avoid grabbing iso_lock + * because the occasional inaccurate result won't matter. However the + * tick data is only ever modified under lock. iso_refractory is only simply + * set to 0 or 1 so it's not worth grabbing the lock yet again for that. +@@ -2209,21 +2271,21 @@ static unsigned int test_ret_isorefracto + + static void iso_tick(void) + { +- grq_lock(); ++ grq_iso_lock(); + grq.iso_ticks += 100; +- grq_unlock(); ++ grq_iso_unlock(); + } + + /* No SCHED_ISO task was running so decrease rq->iso_ticks */ + static inline void no_iso_tick(void) + { + if (grq.iso_ticks) { +- grq_lock(); ++ grq_iso_lock(); + grq.iso_ticks -= grq.iso_ticks / ISO_PERIOD + 1; + if (unlikely(grq.iso_refractory && grq.iso_ticks < + ISO_PERIOD * (sched_iso_cpu * 115 / 128))) + clear_iso_refractory(); +- grq_unlock(); ++ grq_iso_unlock(); + } + } + +@@ -2262,10 +2324,23 @@ static void task_running_tick(struct rq + } + + /* SCHED_FIFO tasks never run out of timeslice. */ +- if (rq_idle(rq) || rq->rq_time_slice > 0 || rq->rq_policy == SCHED_FIFO) ++ if (rq->rq_policy == SCHED_FIFO) + return; ++ /* ++ * Tasks that were scheduled in the first half of a tick are not ++ * allowed to run into the 2nd half of the next tick if they will ++ * run out of time slice in the interim. Otherwise, if they have ++ * less than 100us of time slice left they will be rescheduled. ++ */ ++ if (rq->dither) { ++ if (rq->rq_time_slice > HALF_JIFFY_US) ++ return; ++ else ++ rq->rq_time_slice = 0; ++ } else if (rq->rq_time_slice >= RESCHED_US) ++ return; + +- /* p->time_slice <= 0. We only modify task_struct under grq lock */ ++ /* p->time_slice < RESCHED_US. 
We only modify task_struct under grq lock */ + p = rq->curr; + requeue_task(p); + grq_lock(); +@@ -2286,13 +2361,14 @@ void scheduler_tick(void) + struct rq *rq = cpu_rq(cpu); + + sched_clock_tick(); ++ /* grq lock not grabbed, so only update rq clock */ + update_rq_clock(rq); + update_cpu_clock(rq, rq->curr, 1); +- update_gjiffies(); + if (!rq_idle(rq)) + task_running_tick(rq); + else + no_iso_tick(); ++ rq->last_tick = rq->clock; + perf_event_task_tick(rq->curr); + } + +@@ -2354,7 +2430,7 @@ EXPORT_SYMBOL(sub_preempt_count); + #endif + + /* +- * Deadline is "now" in gjiffies + (offset by priority). Setting the deadline ++ * Deadline is "now" in niffies + (offset by priority). Setting the deadline + * is the key to everything. It distributes cpu fairly amongst tasks of the + * same nice value, it proportions cpu according to nice level, it means the + * task that last woke up the longest ago has the earliest deadline, thus +@@ -2364,7 +2440,7 @@ EXPORT_SYMBOL(sub_preempt_count); + */ + static inline int prio_deadline_diff(int user_prio) + { +- return (prio_ratios[user_prio] * rr_interval * HZ / (1000 * 100)) ? : 1; ++ return (prio_ratios[user_prio] * rr_interval * (MS_TO_NS(1) / 128)); + } + + static inline int task_deadline_diff(struct task_struct *p) +@@ -2377,25 +2453,33 @@ static inline int static_deadline_diff(i + return prio_deadline_diff(USER_PRIO(static_prio)); + } + +-static inline int longest_deadline_diff(void) ++static inline int ms_longest_deadline_diff(void) + { +- return prio_deadline_diff(39); ++ return NS_TO_MS(prio_deadline_diff(39)); + } + + /* + * The time_slice is only refilled when it is empty and that is when we set a + * new deadline. + */ +-static inline void time_slice_expired(struct task_struct *p) ++static void time_slice_expired(struct task_struct *p) + { +- reset_first_time_slice(p); + p->time_slice = timeslice(); +- p->deadline = gjiffies + task_deadline_diff(p); ++ p->deadline = grq.niffies + task_deadline_diff(p); + } + ++/* ++ * Timeslices below RESCHED_US are considered as good as expired as there's no ++ * point rescheduling when there's so little time left. SCHED_BATCH tasks ++ * have been flagged be not latency sensitive and likely to be fully CPU ++ * bound so every time they're rescheduled they have their time_slice ++ * refilled, but get a new later deadline to have little effect on ++ * SCHED_NORMAL tasks. 
++ ++ */ + static inline void check_deadline(struct task_struct *p) + { +- if (p->time_slice <= 0) ++ if (p->time_slice < RESCHED_US || batch_task(p)) + time_slice_expired(p); + } + +@@ -2433,7 +2517,7 @@ retry: + queue = grq.queue + idx; + list_for_each_entry(p, queue, run_list) { + /* Make sure cpu affinity is ok */ +- if (online_cpus(p) && !cpu_isset(cpu, p->cpus_allowed)) ++ if (needs_other_cpu(p, cpu)) + continue; + if (idx < MAX_RT_PRIO) { + /* We found an rt task */ +@@ -2560,12 +2644,14 @@ need_resched_nonpreemptible: + deactivate = 0; + schedule_debug(prev); + +- local_irq_disable(); +- update_rq_clock(rq); ++ grq_lock_irq(); ++ update_clocks(rq); + update_cpu_clock(rq, prev, 0); +- rq->skip_clock_update = 0; ++ if (rq->clock - rq->last_tick > HALF_JIFFY_NS) ++ rq->dither = 0; ++ else ++ rq->dither = 1; + +- grq_lock(); + clear_tsk_need_resched(prev); + + if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { +@@ -2581,36 +2667,54 @@ need_resched_nonpreemptible: + prev->time_slice = rq->rq_time_slice; + prev->deadline = rq->rq_deadline; + check_deadline(prev); +- return_task(prev, deactivate); +- /* Task changed affinity off this cpu */ +- if (unlikely(!cpus_intersects(prev->cpus_allowed, +- cpumask_of_cpu(cpu)))) { +- if (online_cpus(prev)) ++ prev->last_ran = rq->clock; ++ ++ /* Task changed affinity off this CPU */ ++ if (needs_other_cpu(prev, cpu)) ++ resched_suitable_idle(prev); ++ else if (!deactivate) { ++ if (!queued_notrunning()) { ++ /* ++ * We now know prev is the only thing that is ++ * awaiting CPU so we can bypass rechecking for ++ * the earliest deadline task and just run it ++ * again. ++ */ ++ grq_unlock_irq(); ++ goto rerun_prev_unlocked; ++ } else { ++ /* ++ * If prev got kicked off by a task that has to ++ * run on this CPU for affinity reasons then ++ * there may be an idle CPU it can go to. ++ */ + resched_suitable_idle(prev); + } ++ } ++ return_task(prev, deactivate); + } + +- if (likely(queued_notrunning())) { +- next = earliest_deadline_task(rq, idle); +- } else { ++ if (unlikely(!queued_notrunning())) { ++ /* ++ * This CPU is now truly idle as opposed to when idle is ++ * scheduled as a high priority task in its own right. 
++ */ + next = idle; + schedstat_inc(rq, sched_goidle); +- } +- +- prefetch(next); +- prefetch_stack(next); +- +- if (task_idle(next)) + set_cpuidle_map(cpu); +- else ++ } else { ++ next = earliest_deadline_task(rq, idle); ++ prefetch(next); ++ prefetch_stack(next); + clear_cpuidle_map(cpu); +- +- prev->last_ran = rq->clock; ++ } + + if (likely(prev != next)) { + sched_info_switch(prev, next); + perf_event_task_sched_out(prev, next); + ++ if (prev != idle) ++ set_last_task(rq, prev); + set_rq_task(rq, next); + grq.nr_switches++; + prev->oncpu = 0; +@@ -2629,6 +2733,7 @@ need_resched_nonpreemptible: + } else + grq_unlock_irq(); + ++rerun_prev_unlocked: + if (unlikely(reacquire_kernel_lock(current) < 0)) { + prev = rq->curr; + switch_count = &prev->nivcsw; +@@ -3324,8 +3429,9 @@ int task_prio(const struct task_struct * + if (prio <= 0) + goto out; + +- delta = p->deadline - gjiffies; +- delta = delta * 40 / longest_deadline_diff(); ++ /* Convert to ms to avoid overflows */ ++ delta = NS_TO_MS(p->deadline - grq.niffies); ++ delta = delta * 40 / ms_longest_deadline_diff(); + if (delta > 0 && delta <= 80) + prio += delta; + if (idleprio_task(p)) +@@ -3533,7 +3639,7 @@ recheck: + raw_spin_unlock_irqrestore(&p->pi_lock, flags); + goto recheck; + } +- update_rq_clock(rq); ++ update_clocks(rq); + p->sched_reset_on_fork = reset_on_fork; + + queued = task_queued(p); +@@ -4835,7 +4941,7 @@ migration_call(struct notifier_block *nf + __setscheduler(idle, rq, SCHED_NORMAL, 0); + idle->prio = PRIO_LIMIT; + set_rq_task(rq, idle); +- update_rq_clock(rq); ++ update_clocks(rq); + grq_unlock_irq(); + break; + +@@ -6531,12 +6637,14 @@ void __init sched_init(void) + int i; + struct rq *rq; + +- prio_ratios[0] = 100; ++ prio_ratios[0] = 128; + for (i = 1 ; i < PRIO_RANGE ; i++) + prio_ratios[i] = prio_ratios[i - 1] * 11 / 10; + + raw_spin_lock_init(&grq.lock); + grq.nr_running = grq.nr_uninterruptible = grq.nr_switches = 0; ++ grq.niffies = 0; ++ raw_spin_lock_init(&grq.iso_lock); + grq.iso_ticks = grq.iso_refractory = 0; + #ifdef CONFIG_SMP + init_defrootdomain(); +@@ -6549,7 +6657,9 @@ void __init sched_init(void) + rq = cpu_rq(i); + rq->user_pc = rq->nice_pc = rq->softirq_pc = rq->system_pc = + rq->iowait_pc = rq->idle_pc = 0; ++ rq->dither = 0; + #ifdef CONFIG_SMP ++ rq->last_niffy = 0; + rq->sd = NULL; + rq->rd = NULL; + rq->online = 0; +Index: linux-2.6.35-bfs/kernel/sysctl.c +=================================================================== +--- linux-2.6.35-bfs.orig/kernel/sysctl.c 2010-09-25 08:18:30.147361076 +1000 ++++ linux-2.6.35-bfs/kernel/sysctl.c 2010-09-25 08:20:25.823886848 +1000 +@@ -119,7 +119,7 @@ static int __maybe_unused one_hundred = + #ifdef CONFIG_SCHED_BFS + extern int rr_interval; + extern int sched_iso_cpu; +-static int __read_mostly five_thousand = 5000; ++static int __read_mostly one_thousand = 1000; + #endif + #ifdef CONFIG_PRINTK + static int ten_thousand = 10000; +@@ -794,7 +794,7 @@ static struct ctl_table kern_table[] = { + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .extra1 = &one, +- .extra2 = &five_thousand, ++ .extra2 = &one_thousand, + }, + { + .procname = "iso_cpu", diff --git a/kernel-power-2.6.28/debian/patches/series b/kernel-power-2.6.28/debian/patches/series index a8fde40..07fe2e7 100644 --- a/kernel-power-2.6.28/debian/patches/series +++ b/kernel-power-2.6.28/debian/patches/series @@ -33,6 +33,7 @@ overclock.diff bfs.patch bfs-316-to-318.patch bfs-318-to-330.patch +bfs-330-to-350.patch voltage_scaling_1.diff voltage_scaling_0.diff armthumb.diff -- 
1.7.9.5
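As a footnote to the deadline changes in the patch above: with prio_ratios[0]
raised to 128 and prio_deadline_diff() computed as
prio_ratios[user_prio] * rr_interval * (MS_TO_NS(1) / 128), the deadline offset
for the best nice level works out to roughly rr_interval expressed in
nanoseconds, growing by about 10% per nice level. The following standalone
sketch is not part of the patch; it assumes the usual 40 user nice levels and a
made-up rr_interval of 6 ms, and simply reproduces that table in user space.

#include <stdio.h>

#define PRIO_RANGE 40
#define MS_TO_NS(t) ((t) << 20)

static int prio_ratios[PRIO_RANGE];

int main(void)
{
	int rr_interval = 6;	/* hypothetical rr_interval in ms */
	int i;

	/* Same table build as sched_init() in the patch: base 128, +10% per level. */
	prio_ratios[0] = 128;
	for (i = 1; i < PRIO_RANGE; i++)
		prio_ratios[i] = prio_ratios[i - 1] * 11 / 10;

	/* prio_deadline_diff(): the offset added to grq.niffies to form a deadline. */
	for (i = 0; i < PRIO_RANGE; i += 13) {
		unsigned long long diff = (unsigned long long)prio_ratios[i] *
		    rr_interval * (MS_TO_NS(1) / 128);
		printf("user prio %2d -> deadline offset %llu ns\n", i, diff);
	}
	return 0;
}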