--- /dev/null
+- Major overhaul of queued changes.
+
+- Microoptimise multiplications/divisions to be shifts where suitable.
+
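+A rough standalone illustration of the shift trick (not the patch code itself;
+the real macros are in the sched_bfs.c hunks below): converting milliseconds
+to nanoseconds with a shift trades a small systematic error for avoiding a
+64-bit multiply on every conversion.
+
+#include <stdint.h>
+#include <stdio.h>
+
+/* Exact conversion: a 64-bit multiply per call. */
+static inline uint64_t ms_to_ns_exact(uint64_t ms)
+{
+        return ms * 1000000ULL;
+}
+
+/* Approximate conversion: 1ms ~= 2^20 ns (1048576), about 4.9% high. */
+static inline uint64_t ms_to_ns_shift(uint64_t ms)
+{
+        return ms << 20;
+}
+
+int main(void)
+{
+        uint64_t ms = 6;        /* e.g. a 6ms rr_interval */
+
+        printf("exact %llu ns, shifted %llu ns\n",
+               (unsigned long long)ms_to_ns_exact(ms),
+               (unsigned long long)ms_to_ns_shift(ms));
+        return 0;
+}
+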
+- Change ISO calculations to have their own lock so as not to grab the grq
+lock during a scheduler tick.
+
+- Change all deadline accounting to use nanosecond values.
+
+- Introduce a local niffies variable, updated from any runqueue via the TSC
+clock whenever the grq lock is taken, and compare deadlines against niffies.
+This gives much more granular deadlines when jiffies resolution is low (such
+as 100Hz), and tasks will now rarely have identical deadlines.
+
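+In rough outline the niffies bookkeeping looks like the following sketch (a
+simplified userspace rendition for illustration; the struct and function names
+are invented, and the real update_clocks() in the hunks below additionally
+clamps implausibly large clock deltas):
+
+#include <stdint.h>
+
+/* Invented, pared-down state purely for illustration. */
+struct rq_sketch {
+        uint64_t clock;         /* this CPU's nanosecond clock */
+        uint64_t old_clock;     /* clock value at the last niffies update */
+        uint64_t last_niffy;    /* niffies value this CPU last observed */
+};
+
+static uint64_t niffies;        /* global ns counter, lock held by caller */
+
+/* Fold this CPU's clock progress into the shared niffies counter,
+ * subtracting whatever other CPUs have added in the meantime so the
+ * counter advances at roughly wall-clock rate rather than once per CPU. */
+static void update_niffies(struct rq_sketch *rq)
+{
+        int64_t ndiff = (int64_t)(rq->clock - rq->old_clock);
+
+        rq->old_clock = rq->clock;
+        ndiff -= (int64_t)(niffies - rq->last_niffy);
+        if (ndiff < 1)
+                ndiff = 1;      /* some time has always passed */
+        niffies += ndiff;
+        rq->last_niffy = niffies;
+}
+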
+- Drop the "skip_clock_update" concept: since niffies are updated each time
+the rq clocks are updated, we now want to update them more often, not less.
+
+- Rework try_preempt.
+
+- Bypass rechecking the deadline when we know that prev will run again in
+schedule().
+
+- Check whether prev can run on an idle CPU when it is descheduled, as may
+happen when it is displaced by a task that must run on this CPU for affinity
+reasons.
+
+- Decrease the maximum possible rr_interval to 1000 (one second) as there is
+no demonstrable advantage to higher values, and higher values may overflow
+with the other changes being introduced.
+
+- Check, when tasks are scheduled within a tick, whether it is the first or
+second half of the tick, and use this to decide how far into the last tick a
+task can run. This will make the greatest difference at lower Hz values with
+small rr intervals.
+
+- Change the test for an exhausted time slice to 100us, as a task rescheduled
+with less time than this available will either greatly overrun its quota or
+be rescheduled again very quickly.
+
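+Taken together, the half-tick check and the 100us cutoff amount to a per-tick
+expiry decision roughly like this sketch (illustrative only; the helper is
+invented, while the constants mirror HALF_JIFFY_US and RESCHED_US in the hunks
+below):
+
+#include <stdbool.h>
+
+#define HZ              100
+#define HALF_JIFFY_US   (1000000 / HZ / 2)      /* half a tick, in us */
+#define RESCHED_US      100     /* below this a slice is as good as spent */
+
+/* dither is true when the task was scheduled in the first half of a tick,
+ * so it must not be allowed to run deep into the next tick if its
+ * remaining slice would expire in the interim. */
+static bool slice_expired(int time_slice_us, bool dither)
+{
+        if (dither)
+                return time_slice_us <= HALF_JIFFY_US;
+        return time_slice_us < RESCHED_US;
+}
+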
+- Change SCHED_BATCH tasks to refill their timeslice and reset their deadline
+every time they are descheduled, since they have been flagged as latency
+insensitive and are likely fully CPU bound. This should decrease the impact
+that running batch tasks has on other tasks.
+
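+As a sketch of the effect (illustrative only; the struct and helper are
+invented, but the 100us threshold and the refill-on-deschedule behaviour
+follow the check_deadline() change below), batch tasks take a new slice and a
+new, later deadline on every deschedule, so they keep drifting behind
+SCHED_NORMAL tasks:
+
+#include <stdint.h>
+
+#define RESCHED_US 100
+
+/* Invented, pared-down task state purely for illustration. */
+struct task_sketch {
+        int      time_slice;    /* microseconds remaining */
+        uint64_t deadline;      /* absolute, in nanoseconds (niffies) */
+        int      batch;         /* SCHED_BATCH? */
+};
+
+static void check_deadline_sketch(struct task_sketch *p, uint64_t now_ns,
+                                  int full_slice_us, uint64_t deadline_diff_ns)
+{
+        /* Normal tasks refill only once effectively out of slice; batch
+         * tasks refill (and move their deadline later) every time. */
+        if (p->time_slice < RESCHED_US || p->batch) {
+                p->time_slice = full_slice_us;
+                p->deadline = now_ns + deadline_diff_ns;
+        }
+}
+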
+- Microoptimise before the context switch in schedule().
+
+- Add a local last_task variable to each runqueue which keeps a copy of the
+last non-idle task that ran on this CPU. Use this value to determine that a
+task is still cache warm on this CPU even if it has run elsewhere in the
+meantime. This improves throughput on relatively idle systems with >2 logical
+CPUs.
+
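+The intent can be sketched as a zero "cache distance" when p was the last
+thing running here (illustrative only; the names are invented, and the real
+cache_distance() below weights locality by the task's timeslice length):
+
+/* Invented, minimal state purely for illustration. */
+struct rq_warm {
+        const void *last_task;  /* last non-idle task that ran on this CPU */
+        int locality_cost;      /* penalty for pulling a task from elsewhere */
+};
+
+/* Treat p as cache warm here if it was the last task this CPU ran,
+ * regardless of where it may have run since, by charging no cost. */
+static inline int cache_cost(const struct rq_warm *rq, const void *p)
+{
+        if (rq->last_task == p)
+                return 0;
+        return rq->locality_cost;
+}
+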
+- Remove the first_time_slice concept as it wasn't contributing anything
+meaningful but was adding overhead, especially in sched_exit.
+
+- When a task forks and gives up timeslice to its child, check whether that
+leaves it due to be descheduled, and ensure that the child inherits the
+proper variables from its parent across fork.
+
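+The fork-time handling amounts to roughly this sketch (illustrative only; the
+function is invented, while the two-quantum threshold matches the sched_fork()
+change below):
+
+#define RESCHED_US 100
+
+/* Give the child half of the parent's remaining slice, unless that would
+ * leave the parent with less than two reschedulable quanta; in that case
+ * mark the parent for rescheduling and give the child a fresh slice and
+ * deadline instead, so the child ends up running first. */
+static void fork_slice_sketch(int *parent_us, int *child_us,
+                              int *resched_parent, int *refill_child)
+{
+        if (*parent_us >= RESCHED_US * 2) {
+                *parent_us /= 2;
+                *child_us = *parent_us;
+                *resched_parent = 0;
+                *refill_child = 0;
+        } else {
+                *parent_us = 0;
+                *resched_parent = 1;    /* set_tsk_need_resched(parent) */
+                *refill_child = 1;      /* time_slice_expired(child) */
+        }
+}
+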
+- Fix try_preempt, which on SMP may have been picking a higher priority
+runqueue simply because it had a later deadline. Remove the test for equal
+deadlines, as nanosecond deadline resolution means this will never happen.
+
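+The corrected test reduces to priority first, deadline only as a tie-break,
+roughly as in this sketch (illustrative only; it mirrors the shape of the
+can_preempt() helper introduced below, with a lower numeric prio meaning a
+higher priority task):
+
+#include <stdint.h>
+
+static int can_preempt_sketch(int p_prio, uint64_t p_deadline,
+                              int rq_prio, uint64_t rq_deadline)
+{
+        if (p_prio < rq_prio)   /* strictly better priority always wins */
+                return 1;
+        if (p_prio > rq_prio)   /* strictly worse never does */
+                return 0;
+        /* Equal priority: only an earlier deadline preempts. */
+        return p_deadline < rq_deadline;
+}
+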
+- Random other minor cleanups.
+
+-ck
+---
+
+ Documentation/scheduler/sched-BFS.txt | 2
+ include/linux/sched.h | 4
+ kernel/sched_bfs.c | 466 +++++++++++++++++++++-------------
+ kernel/sysctl.c | 4
+ 4 files changed, 293 insertions(+), 183 deletions(-)
+
+Index: linux-2.6.35-bfs/Documentation/scheduler/sched-BFS.txt
+===================================================================
+--- linux-2.6.35-bfs.orig/Documentation/scheduler/sched-BFS.txt 2010-09-25 08:18:30.134360792 +1000
++++ linux-2.6.35-bfs/Documentation/scheduler/sched-BFS.txt 2010-09-25 08:20:25.830887001 +1000
+@@ -257,7 +257,7 @@ uniprocessor machine, and automatically
+ multiprocessor machines. The reasoning behind increasing the value on more CPUs
+ is that the effective latency is decreased by virtue of there being more CPUs on
+ BFS (for reasons explained above), and increasing the value allows for less
+-cache contention and more throughput. Valid values are from 1 to 5000
++cache contention and more throughput. Valid values are from 1 to 1000
+ Decreasing the value will decrease latencies at the cost of decreasing
+ throughput, while increasing it will improve throughput, but at the cost of
+ worsening latencies. The accuracy of the rr interval is limited by HZ resolution
+Index: linux-2.6.35-bfs/include/linux/sched.h
+===================================================================
+--- linux-2.6.35-bfs.orig/include/linux/sched.h 2010-09-25 08:18:08.792894602 +1000
++++ linux-2.6.35-bfs/include/linux/sched.h 2010-09-25 08:20:25.822886826 +1000
+@@ -1197,7 +1197,7 @@ struct task_struct {
+ int prio, static_prio, normal_prio;
+ unsigned int rt_priority;
+ #ifdef CONFIG_SCHED_BFS
+- int time_slice, first_time_slice;
++ int time_slice;
+ u64 deadline;
+ struct list_head run_list;
+ u64 last_ran;
+@@ -1547,7 +1547,7 @@ static inline void tsk_cpus_current(stru
+
+ static inline void print_scheduler_version(void)
+ {
+- printk(KERN_INFO"BFS CPU scheduler v0.330 by Con Kolivas.\n");
++ printk(KERN_INFO"BFS CPU scheduler v0.350 by Con Kolivas.\n");
+ }
+
+ static inline int iso_task(struct task_struct *p)
+Index: linux-2.6.35-bfs/kernel/sched_bfs.c
+===================================================================
+--- linux-2.6.35-bfs.orig/kernel/sched_bfs.c 2010-09-25 08:18:08.804894864 +1000
++++ linux-2.6.35-bfs/kernel/sched_bfs.c 2010-09-25 08:20:25.827886935 +1000
+@@ -106,10 +106,19 @@
+ #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
+ #define SCHED_PRIO(p) ((p)+MAX_RT_PRIO)
+
+-/* Some helpers for converting to/from various scales.*/
++/*
++ * Some helpers for converting to/from various scales. Use shifts to get
++ * approximate multiples of ten for less overhead.
++ */
+ #define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ))
+-#define MS_TO_NS(TIME) ((TIME) * 1000000)
+-#define MS_TO_US(TIME) ((TIME) * 1000)
++#define HALF_JIFFY_NS (1000000000 / HZ / 2)
++#define HALF_JIFFY_US (1000000 / HZ / 2)
++#define MS_TO_NS(TIME) ((TIME) << 20)
++#define MS_TO_US(TIME) ((TIME) << 10)
++#define NS_TO_MS(TIME) ((TIME) >> 20)
++#define NS_TO_US(TIME) ((TIME) >> 10)
++
++#define RESCHED_US (100) /* Reschedule if less than this many us left */
+
+ /*
+ * This is the time all tasks within the same priority round robin.
+@@ -140,8 +149,9 @@ static inline unsigned long timeslice(vo
+ }
+
+ /*
+- * The global runqueue data that all CPUs work off. All data is protected
+- * by grq.lock.
++ * The global runqueue data that all CPUs work off. Data is protected either
++ * by the global grq lock, or the discrete lock that precedes the data in this
++ * struct.
+ */
+ struct global_rq {
+ raw_spinlock_t lock;
+@@ -150,17 +160,17 @@ struct global_rq {
+ unsigned long long nr_switches;
+ struct list_head queue[PRIO_LIMIT];
+ DECLARE_BITMAP(prio_bitmap, PRIO_LIMIT + 1);
+- int iso_ticks;
+- int iso_refractory;
+ #ifdef CONFIG_SMP
+ unsigned long qnr; /* queued not running */
+ cpumask_t cpu_idle_map;
+ int idle_cpus;
+ #endif
+-#if BITS_PER_LONG < 64
+- unsigned long jiffies;
+- u64 jiffies_64;
+-#endif
++ /* Nanosecond jiffies */
++ u64 niffies;
++
++ raw_spinlock_t iso_lock;
++ int iso_ticks;
++ int iso_refractory;
+ };
+
+ /* There can be only one */
+@@ -176,8 +186,8 @@ struct rq {
+ u64 nohz_stamp;
+ unsigned char in_nohz_recently;
+ #endif
++ struct task_struct *last_task;
+ #endif
+- unsigned int skip_clock_update;
+
+ struct task_struct *curr, *idle;
+ struct mm_struct *prev_mm;
+@@ -213,9 +223,11 @@ struct rq {
+ /* See if all cache siblings are idle */
+ cpumask_t cache_siblings;
+ #endif
++ u64 last_niffy; /* Last time this RQ updated grq.niffies */
+ #endif
++ u64 clock, old_clock, last_tick;
++ int dither;
+
+- u64 clock;
+ #ifdef CONFIG_SCHEDSTATS
+
+ /* latency stats */
+@@ -286,15 +298,6 @@ struct root_domain {
+ static struct root_domain def_root_domain;
+ #endif
+
+-static inline int cpu_of(struct rq *rq)
+-{
+-#ifdef CONFIG_SMP
+- return rq->cpu;
+-#else
+- return 0;
+-#endif
+-}
+-
+ #define rcu_dereference_check_sched_domain(p) \
+ rcu_dereference_check((p), \
+ rcu_read_lock_sched_held() || \
+@@ -310,17 +313,65 @@ static inline int cpu_of(struct rq *rq)
+ #define for_each_domain(cpu, __sd) \
+ for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
+
++static inline void update_rq_clock(struct rq *rq);
++
+ #ifdef CONFIG_SMP
+ #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
+ #define this_rq() (&__get_cpu_var(runqueues))
+ #define task_rq(p) cpu_rq(task_cpu(p))
+ #define cpu_curr(cpu) (cpu_rq(cpu)->curr)
++static inline int cpu_of(struct rq *rq)
++{
++ return rq->cpu;
++}
++
++/*
++ * Niffies are a globally increasing nanosecond counter. Whenever a runqueue
++ * clock is updated with the grq.lock held, it is an opportunity to update the
++ * niffies value. Any CPU can update it by adding how much its clock has
++ * increased since it last updated niffies, minus any added niffies by other
++ * CPUs.
++ */
++static inline void update_clocks(struct rq *rq)
++{
++ s64 ndiff;
++
++ update_rq_clock(rq);
++ ndiff = rq->clock - rq->old_clock;
++ /* old_clock is only updated when we are updating niffies */
++ rq->old_clock = rq->clock;
++ ndiff -= grq.niffies - rq->last_niffy;
++ /*
++ * Sanity check should sched_clock return bogus values or be limited to
++ * just jiffy resolution. Some time will always have passed.
++ */
++ if (unlikely(ndiff < 1 || ndiff > MS_TO_NS(rr_interval)))
++ ndiff = 1;
++ grq.niffies += ndiff;
++ rq->last_niffy = grq.niffies;
++}
+ #else /* CONFIG_SMP */
+ static struct rq *uprq;
+ #define cpu_rq(cpu) (uprq)
+ #define this_rq() (uprq)
+ #define task_rq(p) (uprq)
+ #define cpu_curr(cpu) ((uprq)->curr)
++static inline int cpu_of(struct rq *rq)
++{
++ return 0;
++}
++
++static inline void update_clocks(struct rq *rq)
++{
++ s64 ndiff;
++
++ update_rq_clock(rq);
++ ndiff = rq->clock - rq->old_clock;
++ rq->old_clock = rq->clock;
++ if (unlikely(ndiff < 1 || ndiff > MS_TO_US(rr_interval)))
++ ndiff = 1;
++ grq.niffies += ndiff;
++}
+ #endif
+ #define raw_rq() (&__raw_get_cpu_var(runqueues))
+
+@@ -335,13 +386,13 @@ static struct rq *uprq;
+
+ /*
+ * All common locking functions performed on grq.lock. rq->clock is local to
+- * the cpu accessing it so it can be modified just with interrupts disabled,
+- * but looking up task_rq must be done under grq.lock to be safe.
++ * the CPU accessing it so it can be modified just with interrupts disabled
++ * when we're not updating niffies.
++ * Looking up task_rq must be done under grq.lock to be safe.
+ */
+-inline void update_rq_clock(struct rq *rq)
++static inline void update_rq_clock(struct rq *rq)
+ {
+- if (!rq->skip_clock_update)
+- rq->clock = sched_clock_cpu(cpu_of(rq));
++ rq->clock = sched_clock_cpu(cpu_of(rq));
+ }
+
+ static inline int task_running(struct task_struct *p)
+@@ -370,8 +421,8 @@ static inline void grq_lock_irq(void)
+ static inline void time_lock_grq(struct rq *rq)
+ __acquires(grq.lock)
+ {
+- update_rq_clock(rq);
+ grq_lock();
++ update_clocks(rq);
+ }
+
+ static inline void grq_unlock_irq(void)
+@@ -405,7 +456,7 @@ static inline struct rq
+ __acquires(grq.lock)
+ {
+ struct rq *rq = task_grq_lock(p, flags);
+- update_rq_clock(rq);
++ update_clocks(rq);
+ return rq;
+ }
+
+@@ -420,7 +471,7 @@ static inline void time_task_grq_lock_ir
+ __acquires(grq.lock)
+ {
+ struct rq *rq = task_grq_lock_irq(p);
+- update_rq_clock(rq);
++ update_clocks(rq);
+ }
+
+ static inline void task_grq_unlock_irq(void)
+@@ -515,33 +566,6 @@ static inline void finish_lock_switch(st
+ }
+ #endif /* __ARCH_WANT_UNLOCKED_CTXSW */
+
+-/*
+- * In order to have a monotonic clock that does not wrap we have a 64 bit
+- * unsigned long that's protected by grq.lock used in place of jiffies on
+- * 32 bit builds.
+- */
+-#if BITS_PER_LONG < 64
+-static inline void update_gjiffies(void)
+-{
+- if (grq.jiffies != jiffies) {
+- grq_lock();
+- grq.jiffies = jiffies;
+- grq.jiffies_64++;
+- grq_unlock();
+- }
+-}
+-
+-#define gjiffies (grq.jiffies_64)
+-
+-#else /* BITS_PER_LONG < 64 */
+-static inline void update_gjiffies(void)
+-{
+-}
+-
+-#define gjiffies jiffies
+-
+-#endif /* BITS_PER_LONG < 64 */
+-
+ static inline int deadline_before(u64 deadline, u64 time)
+ {
+ return (deadline < time);
+@@ -574,17 +598,6 @@ static void dequeue_task(struct task_str
+ }
+
+ /*
+- * When a task is freshly forked, the first_time_slice flag is set to say
+- * it has taken time_slice from its parent and if it exits on this first
+- * time_slice it can return its time_slice back to the parent.
+- */
+-static inline void reset_first_time_slice(struct task_struct *p)
+-{
+- if (unlikely(p->first_time_slice))
+- p->first_time_slice = 0;
+-}
+-
+-/*
+ * To determine if it's safe for a task of SCHED_IDLEPRIO to actually run as
+ * an idle task, we ensure none of the following conditions are met.
+ */
+@@ -646,11 +659,11 @@ static inline int task_prio_ratio(struct
+ /*
+ * task_timeslice - all tasks of all priorities get the exact same timeslice
+ * length. CPU distribution is handled by giving different deadlines to
+- * tasks of different priorities.
++ * tasks of different priorities. Use 128 as the base value for fast shifts.
+ */
+ static inline int task_timeslice(struct task_struct *p)
+ {
+- return (rr_interval * task_prio_ratio(p) / 100);
++ return (rr_interval * task_prio_ratio(p) / 128);
+ }
+
+ #ifdef CONFIG_SMP
+@@ -702,6 +715,15 @@ static int suitable_idle_cpus(struct tas
+
+ static void resched_task(struct task_struct *p);
+
++/*
++ * last_task stores the last non-idle task scheduled on the local rq for
++ * cache warmth testing.
++ */
++static inline void set_last_task(struct rq *rq, struct task_struct *p)
++{
++ rq->last_task = p;
++}
++
+ #define CPUIDLE_CACHE_BUSY (1)
+ #define CPUIDLE_DIFF_CPU (2)
+ #define CPUIDLE_THREAD_BUSY (4)
+@@ -724,6 +746,9 @@ static void resched_task(struct task_str
+ * Other node, other CPU, idle cache, idle threads.
+ * Other node, other CPU, busy cache, idle threads.
+ * Other node, other CPU, busy threads.
++ *
++ * If p was the last task running on this rq, then regardless of where
++ * it has been running since then, it is cache warm on this rq.
+ */
+ static void resched_best_idle(struct task_struct *p)
+ {
+@@ -756,11 +781,14 @@ static void resched_best_idle(struct tas
+ tmp_rq = cpu_rq(cpu_tmp);
+
+ if (rq->cpu_locality[cpu_tmp]) {
++ /* Check rq->last_task hasn't been dereferenced */
++ if (rq->last_task && p != rq->last_task) {
+ #ifdef CONFIG_NUMA
+- if (rq->cpu_locality[cpu_tmp] > 1)
+- ranking |= CPUIDLE_DIFF_NODE;
++ if (rq->cpu_locality[cpu_tmp] > 1)
++ ranking |= CPUIDLE_DIFF_NODE;
+ #endif
+- ranking |= CPUIDLE_DIFF_CPU;
++ ranking |= CPUIDLE_DIFF_CPU;
++ }
+ }
+ #ifdef CONFIG_SCHED_MC
+ if (!(tmp_rq->cache_idle(cpu_tmp)))
+@@ -802,6 +830,11 @@ static inline void resched_suitable_idle
+ static inline int
+ cache_distance(struct rq *task_rq, struct rq *rq, struct task_struct *p)
+ {
++ /* Check rq->last_task hasn't been dereferenced */
++ if (likely(rq->last_task)) {
++ if (rq->last_task == p)
++ return 0;
++ }
+ return rq->cpu_locality[cpu_of(task_rq)] * task_timeslice(p);
+ }
+ #else /* CONFIG_SMP */
+@@ -840,6 +873,10 @@ cache_distance(struct rq *task_rq, struc
+ {
+ return 0;
+ }
++
++static inline void set_last_task(struct rq *rq, struct task_struct *p)
++{
++}
+ #endif /* CONFIG_SMP */
+
+ /*
+@@ -887,7 +924,7 @@ static int effective_prio(struct task_st
+ */
+ static void activate_task(struct task_struct *p, struct rq *rq)
+ {
+- update_rq_clock(rq);
++ update_clocks(rq);
+
+ /*
+ * Sleep time is in units of nanosecs, so shift by 20 to get a
+@@ -1157,8 +1194,28 @@ EXPORT_SYMBOL_GPL(kick_process);
+ #endif
+
+ #define rq_idle(rq) ((rq)->rq_prio == PRIO_LIMIT)
+-#define task_idle(p) ((p)->prio == PRIO_LIMIT)
+
++/*
++ * RT tasks preempt purely on priority. SCHED_NORMAL tasks preempt on the
++ * basis of earlier deadlines. SCHED_IDLEPRIO don't preempt anything else or
++ * between themselves, they cooperatively multitask. An idle rq scores as
++ * prio PRIO_LIMIT so it is always preempted.
++ */
++static inline int
++can_preempt(struct task_struct *p, int prio, unsigned long deadline,
++ unsigned int policy)
++{
++ /* Better static priority RT task or better policy preemption */
++ if (p->prio < prio)
++ return 1;
++ if (p->prio > prio)
++ return 0;
++ /* SCHED_NORMAL, BATCH and ISO will preempt based on deadline */
++ if (!deadline_before(p->deadline, deadline))
++ return 0;
++ return 1;
++}
++#ifdef CONFIG_SMP
+ #ifdef CONFIG_HOTPLUG_CPU
+ /*
+ * Check to see if there is a task that is affined only to offline CPUs but
+@@ -1178,14 +1235,20 @@ static inline int online_cpus(struct tas
+ #endif
+
+ /*
+- * RT tasks preempt purely on priority. SCHED_NORMAL tasks preempt on the
+- * basis of earlier deadlines. SCHED_BATCH, ISO and IDLEPRIO don't preempt
+- * between themselves, they cooperatively multitask. An idle rq scores as
+- * prio PRIO_LIMIT so it is always preempted. latest_deadline and
+- * highest_prio_rq are initialised only to silence the compiler. When
+- * all else is equal, still prefer this_rq.
++ * Check to see if p can run on cpu, and if not, whether there are any online
++ * CPUs it can run on instead.
++ */
++static inline int needs_other_cpu(struct task_struct *p, int cpu)
++{
++ if (unlikely(!cpu_isset(cpu, p->cpus_allowed) && online_cpus(p)))
++ return 1;
++ return 0;
++}
++
++/*
++ * latest_deadline and highest_prio_rq are initialised only to silence the
++ * compiler. When all else is equal, still prefer this_rq.
+ */
+-#ifdef CONFIG_SMP
+ static void try_preempt(struct task_struct *p, struct rq *this_rq)
+ {
+ struct rq *highest_prio_rq = this_rq;
+@@ -1193,6 +1256,10 @@ static void try_preempt(struct task_stru
+ int highest_prio;
+ cpumask_t tmp;
+
++ /* IDLEPRIO tasks never preempt anything */
++ if (p->policy == SCHED_IDLEPRIO)
++ return;
++
+ if (suitable_idle_cpus(p)) {
+ resched_best_idle(p);
+ return;
+@@ -1219,30 +1286,32 @@ static void try_preempt(struct task_stru
+ offset_deadline = rq->rq_deadline -
+ cache_distance(this_rq, rq, p);
+
+- if (rq_prio > highest_prio ||
+- (deadline_after(offset_deadline, latest_deadline) ||
+- (offset_deadline == latest_deadline && this_rq == rq))) {
++ if (rq_prio > highest_prio || (rq_prio == highest_prio &&
++ deadline_after(offset_deadline, latest_deadline))) {
+ latest_deadline = offset_deadline;
+ highest_prio = rq_prio;
+ highest_prio_rq = rq;
+ }
+ }
+
+- if (p->prio > highest_prio || (p->prio == highest_prio &&
+- p->policy == SCHED_NORMAL &&
+- !deadline_before(p->deadline, latest_deadline)))
++ if (!can_preempt(p, highest_prio, highest_prio_rq->rq_deadline,
++ highest_prio_rq->rq_policy))
+ return;
+
+- /* p gets to preempt highest_prio_rq->curr */
+ resched_task(highest_prio_rq->curr);
+- highest_prio_rq->skip_clock_update = 1;
+ }
+ #else /* CONFIG_SMP */
++static inline int needs_other_cpu(struct task_struct *p, int cpu)
++{
++ return 0;
++}
++
+ static void try_preempt(struct task_struct *p, struct rq *this_rq)
+ {
+- if (p->prio < uprq->rq_prio ||
+- (p->prio == uprq->rq_prio && p->policy == SCHED_NORMAL &&
+- deadline_before(p->deadline, uprq->rq_deadline)))
++ if (p->policy == SCHED_IDLEPRIO)
++ return;
++ if (can_preempt(p, uprq->rq_prio, uprq->rq_deadline,
++ uprq->rq_policy))
+ resched_task(uprq->curr);
+ }
+ #endif /* CONFIG_SMP */
+@@ -1352,12 +1421,15 @@ int wake_up_state(struct task_struct *p,
+ return try_to_wake_up(p, state, 0);
+ }
+
++static void time_slice_expired(struct task_struct *p);
++
+ /*
+ * Perform scheduler related setup for a newly forked process p.
+ * p is forked by current.
+ */
+ void sched_fork(struct task_struct *p, int clone_flags)
+ {
++ struct task_struct *curr;
+ int cpu = get_cpu();
+ struct rq *rq;
+
+@@ -1396,10 +1468,11 @@ void sched_fork(struct task_struct *p, i
+ p->sched_reset_on_fork = 0;
+ }
+
++ curr = current;
+ /*
+ * Make sure we do not leak PI boosting priority to the child.
+ */
+- p->prio = current->normal_prio;
++ p->prio = curr->normal_prio;
+
+ INIT_LIST_HEAD(&p->run_list);
+ #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
+@@ -1420,18 +1493,26 @@ void sched_fork(struct task_struct *p, i
+ * total amount of pending timeslices in the system doesn't change,
+ * resulting in more scheduling fairness. If it's negative, it won't
+ * matter since that's the same as being 0. current's time_slice is
+- * actually in rq_time_slice when it's running.
++ * actually in rq_time_slice when it's running, as is its last_ran
++ * value. rq->rq_deadline is only modified within schedule() so it
++ * is always equal to current->deadline.
+ */
+- rq = task_grq_lock_irq(current);
+- if (likely(rq->rq_time_slice > 0)) {
++ rq = task_grq_lock_irq(curr);
++ if (likely(rq->rq_time_slice >= RESCHED_US * 2)) {
+ rq->rq_time_slice /= 2;
++ p->time_slice = rq->rq_time_slice;
++ } else {
+ /*
+- * The remainder of the first timeslice might be recovered by
+- * the parent if the child exits early enough.
++ * Forking task has run out of timeslice. Reschedule it and
++ * start its child with a new time slice and deadline. The
++ * child will end up running first because its deadline will
++ * be slightly earlier.
+ */
+- p->first_time_slice = 1;
++ rq->rq_time_slice = 0;
++ set_tsk_need_resched(curr);
++ time_slice_expired(p);
+ }
+- p->time_slice = rq->rq_time_slice;
++ p->last_ran = rq->rq_last_ran;
+ task_grq_unlock_irq();
+ out:
+ put_cpu();
+@@ -1470,40 +1551,9 @@ void wake_up_new_task(struct task_struct
+ task_grq_unlock(&flags);
+ }
+
+-/*
+- * Potentially available exiting-child timeslices are
+- * retrieved here - this way the parent does not get
+- * penalised for creating too many threads.
+- *
+- * (this cannot be used to 'generate' timeslices
+- * artificially, because any timeslice recovered here
+- * was given away by the parent in the first place.)
+- */
++/* Nothing to do here */
+ void sched_exit(struct task_struct *p)
+ {
+- struct task_struct *parent;
+- unsigned long flags;
+- struct rq *rq;
+-
+- if (unlikely(p->first_time_slice)) {
+- int *par_tslice, *p_tslice;
+-
+- parent = p->parent;
+- par_tslice = &parent->time_slice;
+- p_tslice = &p->time_slice;
+-
+- rq = task_grq_lock(parent, &flags);
+- /* The real time_slice of the "curr" task is on the rq var.*/
+- if (p == rq->curr)
+- p_tslice = &rq->rq_time_slice;
+- else if (parent == task_rq(parent)->curr)
+- par_tslice = &rq->rq_time_slice;
+-
+- *par_tslice += *p_tslice;
+- if (unlikely(*par_tslice > timeslice()))
+- *par_tslice = timeslice();
+- task_grq_unlock(&flags);
+- }
+ }
+
+ #ifdef CONFIG_PREEMPT_NOTIFIERS
+@@ -1981,7 +2031,7 @@ update_cpu_clock(struct rq *rq, struct t
+ else if (unlikely(time_diff > JIFFIES_TO_NS(1)))
+ time_diff = JIFFIES_TO_NS(1);
+
+- rq->rq_time_slice -= time_diff / 1000;
++ rq->rq_time_slice -= NS_TO_US(time_diff);
+ }
+ rq->rq_last_ran = rq->timekeep_clock = rq->clock;
+ }
+@@ -1997,7 +2047,7 @@ static u64 do_task_delta_exec(struct tas
+ u64 ns = 0;
+
+ if (p == rq->curr) {
+- update_rq_clock(rq);
++ update_clocks(rq);
+ ns = rq->clock - rq->rq_last_ran;
+ if (unlikely((s64)ns < 0))
+ ns = 0;
+@@ -2171,10 +2221,22 @@ void account_idle_ticks(unsigned long ti
+ }
+ #endif
+
++static inline void grq_iso_lock(void)
++ __acquires(grq.iso_lock)
++{
++ raw_spin_lock(&grq.iso_lock);
++}
++
++static inline void grq_iso_unlock(void)
++ __releases(grq.iso_lock)
++{
++ raw_spin_unlock(&grq.iso_lock);
++}
++
+ /*
+ * Functions to test for when SCHED_ISO tasks have used their allocated
+ * quota as real time scheduling and convert them back to SCHED_NORMAL.
+- * Where possible, the data is tested lockless, to avoid grabbing grq_lock
++ * Where possible, the data is tested lockless, to avoid grabbing iso_lock
+ * because the occasional inaccurate result won't matter. However the
+ * tick data is only ever modified under lock. iso_refractory is only simply
+ * set to 0 or 1 so it's not worth grabbing the lock yet again for that.
+@@ -2209,21 +2271,21 @@ static unsigned int test_ret_isorefracto
+
+ static void iso_tick(void)
+ {
+- grq_lock();
++ grq_iso_lock();
+ grq.iso_ticks += 100;
+- grq_unlock();
++ grq_iso_unlock();
+ }
+
+ /* No SCHED_ISO task was running so decrease rq->iso_ticks */
+ static inline void no_iso_tick(void)
+ {
+ if (grq.iso_ticks) {
+- grq_lock();
++ grq_iso_lock();
+ grq.iso_ticks -= grq.iso_ticks / ISO_PERIOD + 1;
+ if (unlikely(grq.iso_refractory && grq.iso_ticks <
+ ISO_PERIOD * (sched_iso_cpu * 115 / 128)))
+ clear_iso_refractory();
+- grq_unlock();
++ grq_iso_unlock();
+ }
+ }
+
+@@ -2262,10 +2324,23 @@ static void task_running_tick(struct rq
+ }
+
+ /* SCHED_FIFO tasks never run out of timeslice. */
+- if (rq_idle(rq) || rq->rq_time_slice > 0 || rq->rq_policy == SCHED_FIFO)
++ if (rq->rq_policy == SCHED_FIFO)
+ return;
++ /*
++ * Tasks that were scheduled in the first half of a tick are not
++ * allowed to run into the 2nd half of the next tick if they will
++ * run out of time slice in the interim. Otherwise, if they have
++ * less than 100us of time slice left they will be rescheduled.
++ */
++ if (rq->dither) {
++ if (rq->rq_time_slice > HALF_JIFFY_US)
++ return;
++ else
++ rq->rq_time_slice = 0;
++ } else if (rq->rq_time_slice >= RESCHED_US)
++ return;
+
+- /* p->time_slice <= 0. We only modify task_struct under grq lock */
++ /* p->time_slice < RESCHED_US. We only modify task_struct under grq lock */
+ p = rq->curr;
+ requeue_task(p);
+ grq_lock();
+@@ -2286,13 +2361,14 @@ void scheduler_tick(void)
+ struct rq *rq = cpu_rq(cpu);
+
+ sched_clock_tick();
++ /* grq lock not grabbed, so only update rq clock */
+ update_rq_clock(rq);
+ update_cpu_clock(rq, rq->curr, 1);
+- update_gjiffies();
+ if (!rq_idle(rq))
+ task_running_tick(rq);
+ else
+ no_iso_tick();
++ rq->last_tick = rq->clock;
+ perf_event_task_tick(rq->curr);
+ }
+
+@@ -2354,7 +2430,7 @@ EXPORT_SYMBOL(sub_preempt_count);
+ #endif
+
+ /*
+- * Deadline is "now" in gjiffies + (offset by priority). Setting the deadline
++ * Deadline is "now" in niffies + (offset by priority). Setting the deadline
+ * is the key to everything. It distributes cpu fairly amongst tasks of the
+ * same nice value, it proportions cpu according to nice level, it means the
+ * task that last woke up the longest ago has the earliest deadline, thus
+@@ -2364,7 +2440,7 @@ EXPORT_SYMBOL(sub_preempt_count);
+ */
+ static inline int prio_deadline_diff(int user_prio)
+ {
+- return (prio_ratios[user_prio] * rr_interval * HZ / (1000 * 100)) ? : 1;
++ return (prio_ratios[user_prio] * rr_interval * (MS_TO_NS(1) / 128));
+ }
+
+ static inline int task_deadline_diff(struct task_struct *p)
+@@ -2377,25 +2453,33 @@ static inline int static_deadline_diff(i
+ return prio_deadline_diff(USER_PRIO(static_prio));
+ }
+
+-static inline int longest_deadline_diff(void)
++static inline int ms_longest_deadline_diff(void)
+ {
+- return prio_deadline_diff(39);
++ return NS_TO_MS(prio_deadline_diff(39));
+ }
+
+ /*
+ * The time_slice is only refilled when it is empty and that is when we set a
+ * new deadline.
+ */
+-static inline void time_slice_expired(struct task_struct *p)
++static void time_slice_expired(struct task_struct *p)
+ {
+- reset_first_time_slice(p);
+ p->time_slice = timeslice();
+- p->deadline = gjiffies + task_deadline_diff(p);
++ p->deadline = grq.niffies + task_deadline_diff(p);
+ }
+
++/*
++ * Timeslices below RESCHED_US are considered as good as expired as there's no
++ * point rescheduling when there's so little time left. SCHED_BATCH tasks
++ * have been flagged be not latency sensitive and likely to be fully CPU
++ * bound so every time they're rescheduled they have their time_slice
++ * refilled, but get a new later deadline to have little effect on
++ * SCHED_NORMAL tasks.
++
++ */
+ static inline void check_deadline(struct task_struct *p)
+ {
+- if (p->time_slice <= 0)
++ if (p->time_slice < RESCHED_US || batch_task(p))
+ time_slice_expired(p);
+ }
+
+@@ -2433,7 +2517,7 @@ retry:
+ queue = grq.queue + idx;
+ list_for_each_entry(p, queue, run_list) {
+ /* Make sure cpu affinity is ok */
+- if (online_cpus(p) && !cpu_isset(cpu, p->cpus_allowed))
++ if (needs_other_cpu(p, cpu))
+ continue;
+ if (idx < MAX_RT_PRIO) {
+ /* We found an rt task */
+@@ -2560,12 +2644,14 @@ need_resched_nonpreemptible:
+ deactivate = 0;
+ schedule_debug(prev);
+
+- local_irq_disable();
+- update_rq_clock(rq);
++ grq_lock_irq();
++ update_clocks(rq);
+ update_cpu_clock(rq, prev, 0);
+- rq->skip_clock_update = 0;
++ if (rq->clock - rq->last_tick > HALF_JIFFY_NS)
++ rq->dither = 0;
++ else
++ rq->dither = 1;
+
+- grq_lock();
+ clear_tsk_need_resched(prev);
+
+ if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
+@@ -2581,36 +2667,54 @@ need_resched_nonpreemptible:
+ prev->time_slice = rq->rq_time_slice;
+ prev->deadline = rq->rq_deadline;
+ check_deadline(prev);
+- return_task(prev, deactivate);
+- /* Task changed affinity off this cpu */
+- if (unlikely(!cpus_intersects(prev->cpus_allowed,
+- cpumask_of_cpu(cpu)))) {
+- if (online_cpus(prev))
++ prev->last_ran = rq->clock;
++
++ /* Task changed affinity off this CPU */
++ if (needs_other_cpu(prev, cpu))
++ resched_suitable_idle(prev);
++ else if (!deactivate) {
++ if (!queued_notrunning()) {
++ /*
++ * We now know prev is the only thing that is
++ * awaiting CPU so we can bypass rechecking for
++ * the earliest deadline task and just run it
++ * again.
++ */
++ grq_unlock_irq();
++ goto rerun_prev_unlocked;
++ } else {
++ /*
++ * If prev got kicked off by a task that has to
++ * run on this CPU for affinity reasons then
++ * there may be an idle CPU it can go to.
++ */
+ resched_suitable_idle(prev);
+ }
++ }
++ return_task(prev, deactivate);
+ }
+
+- if (likely(queued_notrunning())) {
+- next = earliest_deadline_task(rq, idle);
+- } else {
++ if (unlikely(!queued_notrunning())) {
++ /*
++ * This CPU is now truly idle as opposed to when idle is
++ * scheduled as a high priority task in its own right.
++ */
+ next = idle;
+ schedstat_inc(rq, sched_goidle);
+- }
+-
+- prefetch(next);
+- prefetch_stack(next);
+-
+- if (task_idle(next))
+ set_cpuidle_map(cpu);
+- else
++ } else {
++ next = earliest_deadline_task(rq, idle);
++ prefetch(next);
++ prefetch_stack(next);
+ clear_cpuidle_map(cpu);
+-
+- prev->last_ran = rq->clock;
++ }
+
+ if (likely(prev != next)) {
+ sched_info_switch(prev, next);
+ perf_event_task_sched_out(prev, next);
+
++ if (prev != idle)
++ set_last_task(rq, prev);
+ set_rq_task(rq, next);
+ grq.nr_switches++;
+ prev->oncpu = 0;
+@@ -2629,6 +2733,7 @@ need_resched_nonpreemptible:
+ } else
+ grq_unlock_irq();
+
++rerun_prev_unlocked:
+ if (unlikely(reacquire_kernel_lock(current) < 0)) {
+ prev = rq->curr;
+ switch_count = &prev->nivcsw;
+@@ -3324,8 +3429,9 @@ int task_prio(const struct task_struct *
+ if (prio <= 0)
+ goto out;
+
+- delta = p->deadline - gjiffies;
+- delta = delta * 40 / longest_deadline_diff();
++ /* Convert to ms to avoid overflows */
++ delta = NS_TO_MS(p->deadline - grq.niffies);
++ delta = delta * 40 / ms_longest_deadline_diff();
+ if (delta > 0 && delta <= 80)
+ prio += delta;
+ if (idleprio_task(p))
+@@ -3533,7 +3639,7 @@ recheck:
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ goto recheck;
+ }
+- update_rq_clock(rq);
++ update_clocks(rq);
+ p->sched_reset_on_fork = reset_on_fork;
+
+ queued = task_queued(p);
+@@ -4835,7 +4941,7 @@ migration_call(struct notifier_block *nf
+ __setscheduler(idle, rq, SCHED_NORMAL, 0);
+ idle->prio = PRIO_LIMIT;
+ set_rq_task(rq, idle);
+- update_rq_clock(rq);
++ update_clocks(rq);
+ grq_unlock_irq();
+ break;
+
+@@ -6531,12 +6637,14 @@ void __init sched_init(void)
+ int i;
+ struct rq *rq;
+
+- prio_ratios[0] = 100;
++ prio_ratios[0] = 128;
+ for (i = 1 ; i < PRIO_RANGE ; i++)
+ prio_ratios[i] = prio_ratios[i - 1] * 11 / 10;
+
+ raw_spin_lock_init(&grq.lock);
+ grq.nr_running = grq.nr_uninterruptible = grq.nr_switches = 0;
++ grq.niffies = 0;
++ raw_spin_lock_init(&grq.iso_lock);
+ grq.iso_ticks = grq.iso_refractory = 0;
+ #ifdef CONFIG_SMP
+ init_defrootdomain();
+@@ -6549,7 +6657,9 @@ void __init sched_init(void)
+ rq = cpu_rq(i);
+ rq->user_pc = rq->nice_pc = rq->softirq_pc = rq->system_pc =
+ rq->iowait_pc = rq->idle_pc = 0;
++ rq->dither = 0;
+ #ifdef CONFIG_SMP
++ rq->last_niffy = 0;
+ rq->sd = NULL;
+ rq->rd = NULL;
+ rq->online = 0;
+Index: linux-2.6.35-bfs/kernel/sysctl.c
+===================================================================
+--- linux-2.6.35-bfs.orig/kernel/sysctl.c 2010-09-25 08:18:30.147361076 +1000
++++ linux-2.6.35-bfs/kernel/sysctl.c 2010-09-25 08:20:25.823886848 +1000
+@@ -119,7 +119,7 @@ static int __maybe_unused one_hundred =
+ #ifdef CONFIG_SCHED_BFS
+ extern int rr_interval;
+ extern int sched_iso_cpu;
+-static int __read_mostly five_thousand = 5000;
++static int __read_mostly one_thousand = 1000;
+ #endif
+ #ifdef CONFIG_PRINTK
+ static int ten_thousand = 10000;
+@@ -794,7 +794,7 @@ static struct ctl_table kern_table[] = {
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .extra1 = &one,
+- .extra2 = &five_thousand,
++ .extra2 = &one_thousand,
+ },
+ {
+ .procname = "iso_cpu",