+++ /dev/null
-- Major overhaul of queued changes.
-
-- Microoptimise multiplications/divisions to be shifts where suitable.
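-
-For illustration (a standalone sketch, not the patched kernel code): the new
-conversion macros below rely on a 10-bit shift standing in for a factor of
-1000 (1024, ~2.4% off) and a 20-bit shift for a factor of one million
-(1048576, ~4.9% off), close enough for quantities that are approximate
-anyway:
-
-    #include <stdint.h>
-    #include <stdio.h>
-
-    /* Same idea as the MS_TO_NS/NS_TO_US macros in the diff below. */
-    static inline uint64_t ms_to_ns_approx(uint64_t ms) { return ms << 20; }
-    static inline uint64_t ns_to_us_approx(uint64_t ns) { return ns >> 10; }
-
-    int main(void)
-    {
-        /* 6ms -> 6291456ns where the exact value is 6000000ns */
-        printf("%llu\n", (unsigned long long)ms_to_ns_approx(6));
-        /* 6291456ns -> 6144us where the exact value is 6291us */
-        printf("%llu\n", (unsigned long long)ns_to_us_approx(6291456));
-        return 0;
-    }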
-
-- Change ISO calculations to have their own lock so that the scheduler tick
-does not need to grab the grq lock.
-
-- Change all deadline accounting to use nanosecond values.
-
-- Introduce local niffies variable which is updated from any runqueue using
-the TSC clock whenever the grq lock is taken, and compare deadlines against
-niffies. This gives much more granular deadlines when jiffies are low
-resolution, such as 100Hz, and tasks will now rarely have identical deadlines.
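-
-For illustration, assuming the default rr_interval of 6: the nice-0 deadline
-offset works out to 128 * 6 * 8192ns, roughly 6.3ms, less than a single 10ms
-jiffy at 100Hz. With jiffy-resolution deadlines, two nice-0 tasks waking in
-the same tick would therefore get identical deadlines; with nanosecond
-niffies they almost never will.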
-
-- Drop the "skip_clock_update" concept as we update the niffies each time we
-update the rq clocks, thus we want to update it more often.
-
-- Rework try_preempt.
-
-- Bypass rechecking for the earliest deadline task in schedule() when we know
-that prev will run again.
-
-- When prev is being descheduled, check whether it can run on an idle CPU, as
-may happen when it is displaced by a task that must use this CPU for affinity
-reasons.
-
-- Decrease the maximum possible rr_interval to 1000 (one second) as there is
-no demonstrable advantage to higher values, and they may overflow with other
-changes being introduced.
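-
-Rough figures: the nice-19 ratio is 128 multiplied by 1.1 thirty-nine times,
-about 5100, so the nanosecond deadline offset becomes roughly
-5100 * rr_interval * 8192. At rr_interval = 1000 that is already about
-4 * 10^10ns (some 40 seconds), beyond the range of a 32-bit value, so larger
-intervals only increase the overflow risk for no scheduling benefit.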
-
-- Check whether tasks are scheduled within the 1st or 2nd half of a tick and
-use this to decide how far into its last tick a task can run. This makes the
-greatest difference at lower Hz values with small rr intervals.
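-
-Worked example at 100Hz (HALF_JIFFY_US = 5000): ticks are the only points
-where timeslices are accounted, so a task reaching a tick with 3ms of slice
-left would expire mid-tick and overrun until the following tick. If it was
-scheduled in the first half of a tick (rq->dither set), its slice is zeroed
-at the tick instead so it reschedules on time; with more than 5ms remaining
-it is left to run.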
-
-- Change the test for an exhausted time slice to 100us, as a task rescheduled
-with less time remaining than this will either greatly overrun its quota or
-be rescheduled again almost immediately.
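-
-For scale, with the default 6ms rr_interval: 100us is under 2% of a full
-timeslice. Letting a task keep, say, 40us of slice means either overrunning
-until the next tick (up to 10ms at 100Hz) or paying a full context switch
-for only a few tens of microseconds of runtime.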
-
-- Change SCHED_BATCH tasks to refill timeslices and reset deadline every time
-they're descheduled as they've been flagged as latency insensitive, likely
-fully CPU bound tasks. This should decrease the impact running batch tasks
-has on other tasks.
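-
-Illustration, again with the 6ms default: a nice-0 SCHED_BATCH task
-descheduled with 3ms of slice left now gets a fresh 6ms slice, but its
-deadline is also pushed ~6.3ms past the current niffies, so it queues behind
-SCHED_NORMAL tasks that woke earlier rather than competing with a stale,
-earlier deadline.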
-
-- Microoptimise before context switch in schedule()
-
-- Add a local last_task variable to each runqueue which keeps a copy of the
-last non-idle task that ran on this CPU. Use this value to determine that a
-task is still cache warm on this CPU even if it has run elsewhere in the
-meantime. This improves throughput on relatively idle systems with >2 logical
-CPUs.
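-
-For illustration: a task that last ran on CPU0, was briefly placed on CPU1
-by an idle or affinity decision, and then wakes again will still be pointed
-to by CPU0's last_task provided nothing else has run there since, so
-cache_distance() treats CPU0 as cache warm and the task migrates back rather
-than bouncing. The pointer is only ever compared, never dereferenced, so a
-stale last_task is harmless.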
-
-- Remove the first_time_slice concept as it wasn't contributing anything
-meaningful but was adding overhead, especially in sched_exit.
-
-- When a task forks and gives up half its timeslice to its child, check
-whether the remainder leaves it due to be descheduled, and ensure that the
-child inherits the proper variables from its parent across fork.
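-
-Worked example with a 6ms slice: a parent with 4ms left is over the
-2 * RESCHED_US (200us) threshold, so it keeps 2ms and the child receives
-2ms. A parent down to 150us has nothing worth donating, so it is marked to
-reschedule while the child starts with a full slice and a fresh deadline,
-slightly earlier than the one the parent will get, letting the child run
-first.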
-
-- Fix try_preempt which may have been picking a higher priority runqueue on
-SMP if it had a later deadline. Remove the test for equal deadline as
-nanosecond deadline resolution means this will never happen.
-
-- Random other minor cleanups.
-
--ck
----
-
- include/linux/sched.h | 4
- kernel/sched_bfs.c | 466 +++++++++++++++++++++-------------
- kernel/sysctl.c | 4
- 3 files changed, 293 insertions(+), 183 deletions(-)
-
-Index: kernel-2.6.28/include/linux/sched.h
-===================================================================
---- kernel-2.6.28.orig/include/linux/sched.h
-+++ kernel-2.6.28/include/linux/sched.h
-@@ -1121,8 +1121,8 @@ struct task_struct {
- int prio, static_prio, normal_prio;
- unsigned int rt_priority;
- #ifdef CONFIG_SCHED_BFS
-- int time_slice, first_time_slice;
-- unsigned long deadline;
-+ int time_slice;
-+ u64 deadline;
- struct list_head run_list;
- u64 last_ran;
- u64 sched_time; /* sched_clock time spent running */
-@@ -1426,7 +1426,7 @@ static inline void tsk_cpus_current(stru
-
- static inline void print_scheduler_version(void)
- {
-- printk(KERN_INFO"BFS CPU scheduler v0.330 by Con Kolivas ported by ToAsTcfh.\n");
-+ printk(KERN_INFO"BFS CPU scheduler v0.350 by Con Kolivas ported by ToAsTcfh.\n");
- }
-
- static inline int iso_task(struct task_struct *p)
-Index: kernel-2.6.28/kernel/sched_bfs.c
-===================================================================
---- kernel-2.6.28.orig/kernel/sched_bfs.c
-+++ kernel-2.6.28/kernel/sched_bfs.c
-@@ -102,10 +102,19 @@
- #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
- #define SCHED_PRIO(p) ((p)+MAX_RT_PRIO)
-
--/* Some helpers for converting to/from various scales.*/
-+/*
-+ * Some helpers for converting to/from various scales. Use shifts to
-+ * approximate powers of ten with less overhead.
-+ */
- #define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ))
--#define MS_TO_NS(TIME) ((TIME) * 1000000)
--#define MS_TO_US(TIME) ((TIME) * 1000)
-+#define HALF_JIFFY_NS (1000000000 / HZ / 2)
-+#define HALF_JIFFY_US (1000000 / HZ / 2)
-+#define MS_TO_NS(TIME) ((TIME) << 20)
-+#define MS_TO_US(TIME) ((TIME) << 10)
-+#define NS_TO_MS(TIME) ((TIME) >> 20)
-+#define NS_TO_US(TIME) ((TIME) >> 10)
-+
-+#define RESCHED_US (100) /* Reschedule if less than this many us left */
-
- #ifdef CONFIG_SMP
- /*
-@@ -157,8 +166,9 @@ static inline unsigned long timeslice(vo
- }
-
- /*
-- * The global runqueue data that all CPUs work off. All data is protected
-- * by grq.lock.
-+ * The global runqueue data that all CPUs work off. Data is protected either
-+ * by the global grq lock, or the discrete lock that precedes the data in this
-+ * struct.
- */
- struct global_rq {
- spinlock_t lock;
-@@ -167,17 +177,17 @@ struct global_rq {
- unsigned long long nr_switches;
- struct list_head queue[PRIO_LIMIT];
- DECLARE_BITMAP(prio_bitmap, PRIO_LIMIT + 1);
-- int iso_ticks;
-- int iso_refractory;
- #ifdef CONFIG_SMP
- unsigned long qnr; /* queued not running */
- cpumask_t cpu_idle_map;
- int idle_cpus;
- #endif
--#if BITS_PER_LONG < 64
-- unsigned long jiffies;
-- u64 jiffies_64;
--#endif
-+ /* Nanosecond jiffies */
-+ u64 niffies;
-+
-+ spinlock_t iso_lock;
-+ int iso_ticks;
-+ int iso_refractory;
- };
-
- /* There can be only one */
-@@ -192,8 +202,8 @@ struct rq {
- #ifdef CONFIG_NO_HZ
- unsigned char in_nohz_recently;
- #endif
-+ struct task_struct *last_task;
- #endif
-- unsigned int skip_clock_update;
-
- struct task_struct *curr, *idle;
- struct mm_struct *prev_mm;
-@@ -229,9 +239,11 @@ struct rq {
- /* See if all cache siblings are idle */
- cpumask_t cache_siblings;
- #endif
-+ u64 last_niffy; /* Last time this RQ updated grq.niffies */
- #endif
-+ u64 clock, old_clock, last_tick;
-+ int dither;
-
-- u64 clock;
- #ifdef CONFIG_SCHEDSTATS
-
- /* latency stats */
-@@ -290,14 +302,6 @@ struct root_domain {
- static struct root_domain def_root_domain;
- #endif
-
--static inline int cpu_of(struct rq *rq)
--{
--#ifdef CONFIG_SMP
-- return rq->cpu;
--#else
-- return 0;
--#endif
--}
-
- /*
- * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
-@@ -309,17 +313,65 @@ static inline int cpu_of(struct rq *rq)
- #define for_each_domain(cpu, __sd) \
- for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
-
-+static inline void update_rq_clock(struct rq *rq);
-+
- #ifdef CONFIG_SMP
- #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
- #define this_rq() (&__get_cpu_var(runqueues))
- #define task_rq(p) cpu_rq(task_cpu(p))
- #define cpu_curr(cpu) (cpu_rq(cpu)->curr)
-+static inline int cpu_of(struct rq *rq)
-+{
-+ return rq->cpu;
-+}
-+
-+/*
-+ * Niffies are a globally increasing nanosecond counter. Whenever a runqueue
-+ * clock is updated with the grq.lock held, it is an opportunity to update the
-+ * niffies value. Any CPU can update it by adding how much its clock has
-+ * increased since it last updated niffies, minus whatever other CPUs have
-+ * added to niffies in the meantime.
-+ */
-+static inline void update_clocks(struct rq *rq)
-+{
-+ s64 ndiff;
-+
-+ update_rq_clock(rq);
-+ ndiff = rq->clock - rq->old_clock;
-+ /* old_clock is only updated when we are updating niffies */
-+ rq->old_clock = rq->clock;
-+ ndiff -= grq.niffies - rq->last_niffy;
-+ /*
-+ * Sanity check should sched_clock return bogus values or be limited to
-+ * just jiffy resolution. Some time will always have passed.
-+ */
-+ if (unlikely(ndiff < 1 || ndiff > MS_TO_NS(rr_interval)))
-+ ndiff = 1;
-+ grq.niffies += ndiff;
-+ rq->last_niffy = grq.niffies;
-+}
- #else /* CONFIG_SMP */
- static struct rq *uprq;
- #define cpu_rq(cpu) (uprq)
- #define this_rq() (uprq)
- #define task_rq(p) (uprq)
- #define cpu_curr(cpu) ((uprq)->curr)
-+static inline int cpu_of(struct rq *rq)
-+{
-+ return 0;
-+}
-+
-+static inline void update_clocks(struct rq *rq)
-+{
-+ s64 ndiff;
-+
-+ update_rq_clock(rq);
-+ ndiff = rq->clock - rq->old_clock;
-+ rq->old_clock = rq->clock;
-+ if (unlikely(ndiff < 1 || ndiff > MS_TO_NS(rr_interval)))
-+ ndiff = 1;
-+ grq.niffies += ndiff;
-+}
- #endif
-
- #include "sched_stats.h"
-@@ -333,13 +385,13 @@ static struct rq *uprq;
-
- /*
- * All common locking functions performed on grq.lock. rq->clock is local to
-- * the cpu accessing it so it can be modified just with interrupts disabled,
-- * but looking up task_rq must be done under grq.lock to be safe.
-+ * the CPU accessing it so it can be modified just with interrupts disabled
-+ * when we're not updating niffies.
-+ * Looking up task_rq must be done under grq.lock to be safe.
- */
- static inline void update_rq_clock(struct rq *rq)
- {
-- if (!rq->skip_clock_update)
-- rq->clock = sched_clock_cpu(cpu_of(rq));
-+ rq->clock = sched_clock_cpu(cpu_of(rq));
- }
-
- static inline int task_running(struct task_struct *p)
-@@ -368,8 +420,8 @@ static inline void grq_lock_irq(void)
- static inline void time_lock_grq(struct rq *rq)
- __acquires(grq.lock)
- {
-- update_rq_clock(rq);
- grq_lock();
-+ update_clocks(rq);
- }
-
- static inline void grq_unlock_irq(void)
-@@ -403,7 +455,7 @@ static inline struct rq
- __acquires(grq.lock)
- {
- struct rq *rq = task_grq_lock(p, flags);
-- update_rq_clock(rq);
-+ update_clocks(rq);
- return rq;
- }
-
-@@ -418,7 +470,7 @@ static inline void time_task_grq_lock_ir
- __acquires(grq.lock)
- {
- struct rq *rq = task_grq_lock_irq(p);
-- update_rq_clock(rq);
-+ update_clocks(rq);
- }
-
- static inline void task_grq_unlock_irq(void)
-@@ -513,33 +565,6 @@ static inline void finish_lock_switch(st
- }
- #endif /* __ARCH_WANT_UNLOCKED_CTXSW */
-
--/*
-- * In order to have a monotonic clock that does not wrap we have a 64 bit
-- * unsigned long that's protected by grq.lock used in place of jiffies on
-- * 32 bit builds.
-- */
--#if BITS_PER_LONG < 64
--static inline void update_gjiffies(void)
--{
-- if (grq.jiffies != jiffies) {
-- grq_lock();
-- grq.jiffies = jiffies;
-- grq.jiffies_64++;
-- grq_unlock();
-- }
--}
--
--#define gjiffies (grq.jiffies_64)
--
--#else /* BITS_PER_LONG < 64 */
--static inline void update_gjiffies(void)
--{
--}
--
--#define gjiffies jiffies
--
--#endif /* BITS_PER_LONG < 64 */
--
- static inline int deadline_before(u64 deadline, u64 time)
- {
- return (deadline < time);
-@@ -572,17 +597,6 @@ static void dequeue_task(struct task_str
- }
-
- /*
-- * When a task is freshly forked, the first_time_slice flag is set to say
-- * it has taken time_slice from its parent and if it exits on this first
-- * time_slice it can return its time_slice back to the parent.
-- */
--static inline void reset_first_time_slice(struct task_struct *p)
--{
-- if (unlikely(p->first_time_slice))
-- p->first_time_slice = 0;
--}
--
--/*
- * To determine if it's safe for a task of SCHED_IDLEPRIO to actually run as
- * an idle task, we ensure none of the following conditions are met.
- */
-@@ -644,11 +658,11 @@ static inline int task_prio_ratio(struct
- /*
- * task_timeslice - all tasks of all priorities get the exact same timeslice
- * length. CPU distribution is handled by giving different deadlines to
-- * tasks of different priorities.
-+ * tasks of different priorities. Use 128 as the base value for fast shifts.
- */
- static inline int task_timeslice(struct task_struct *p)
- {
-- return (rr_interval * task_prio_ratio(p) / 100);
-+ return (rr_interval * task_prio_ratio(p) / 128);
- }
-
- #ifdef CONFIG_SMP
-@@ -700,6 +714,15 @@ static int suitable_idle_cpus(struct tas
-
- static void resched_task(struct task_struct *p);
-
-+/*
-+ * last_task stores the last non-idle task scheduled on the local rq for
-+ * cache warmth testing.
-+ */
-+static inline void set_last_task(struct rq *rq, struct task_struct *p)
-+{
-+ rq->last_task = p;
-+}
-+
- #define CPUIDLE_CACHE_BUSY (1)
- #define CPUIDLE_DIFF_CPU (2)
- #define CPUIDLE_THREAD_BUSY (4)
-@@ -722,6 +745,9 @@ static void resched_task(struct task_str
- * Other node, other CPU, idle cache, idle threads.
- * Other node, other CPU, busy cache, idle threads.
- * Other node, other CPU, busy threads.
-+ *
-+ * If p was the last task running on this rq, then regardless of where
-+ * it has been running since then, it is cache warm on this rq.
- */
- static void resched_best_idle(struct task_struct *p)
- {
-@@ -754,11 +780,14 @@ static void resched_best_idle(struct tas
- tmp_rq = cpu_rq(cpu_tmp);
-
- if (rq->cpu_locality[cpu_tmp]) {
-+ /* last_task may be stale; it is only compared, never dereferenced */
-+ if (rq->last_task && p != rq->last_task) {
- #ifdef CONFIG_NUMA
-- if (rq->cpu_locality[cpu_tmp] > 1)
-- ranking |= CPUIDLE_DIFF_NODE;
-+ if (rq->cpu_locality[cpu_tmp] > 1)
-+ ranking |= CPUIDLE_DIFF_NODE;
- #endif
-- ranking |= CPUIDLE_DIFF_CPU;
-+ ranking |= CPUIDLE_DIFF_CPU;
-+ }
- }
- #ifdef CONFIG_SCHED_MC
- if (!(tmp_rq->cache_idle(cpu_tmp)))
-@@ -800,6 +829,11 @@ static inline void resched_suitable_idle
- static inline int
- cache_distance(struct rq *task_rq, struct rq *rq, struct task_struct *p)
- {
-+ /* last_task may be stale; it is only compared, never dereferenced */
-+ if (likely(rq->last_task)) {
-+ if (rq->last_task == p)
-+ return 0;
-+ }
- return rq->cpu_locality[cpu_of(task_rq)] * task_timeslice(p);
- }
- #else /* CONFIG_SMP */
-@@ -839,6 +873,10 @@ cache_distance(struct rq *task_rq, struc
- {
- return 0;
- }
-+
-+static inline void set_last_task(struct rq *rq, struct task_struct *p)
-+{
-+}
- #endif /* CONFIG_SMP */
-
- /*
-@@ -886,7 +924,7 @@ static int effective_prio(struct task_st
- */
- static void activate_task(struct task_struct *p, struct rq *rq)
- {
-- update_rq_clock(rq);
-+ update_clocks(rq);
-
- /*
- * Sleep time is in units of nanosecs, so shift by 20 to get a
-@@ -1157,8 +1195,28 @@ void kick_process(struct task_struct *p)
- #endif
-
- #define rq_idle(rq) ((rq)->rq_prio == PRIO_LIMIT)
--#define task_idle(p) ((p)->prio == PRIO_LIMIT)
-
-+/*
-+ * RT tasks preempt purely on priority. SCHED_NORMAL tasks preempt on the
-+ * basis of earlier deadlines. SCHED_IDLEPRIO tasks don't preempt anything
-+ * else or each other; they cooperatively multitask. An idle rq scores as
-+ * prio PRIO_LIMIT so it is always preempted.
-+ */
-+static inline int
-+can_preempt(struct task_struct *p, int prio, unsigned long deadline,
-+ unsigned int policy)
-+{
-+ /* Better static priority RT task or better policy preemption */
-+ if (p->prio < prio)
-+ return 1;
-+ if (p->prio > prio)
-+ return 0;
-+ /* SCHED_NORMAL, BATCH and ISO will preempt based on deadline */
-+ if (!deadline_before(p->deadline, deadline))
-+ return 0;
-+ return 1;
-+}
-+#ifdef CONFIG_SMP
- #ifdef CONFIG_HOTPLUG_CPU
- /*
- * Check to see if there is a task that is affined only to offline CPUs but
-@@ -1179,14 +1237,20 @@ static inline int online_cpus(struct tas
-
-
- /*
-- * RT tasks preempt purely on priority. SCHED_NORMAL tasks preempt on the
-- * basis of earlier deadlines. SCHED_BATCH, ISO and IDLEPRIO don't preempt
-- * between themselves, they cooperatively multitask. An idle rq scores as
-- * prio PRIO_LIMIT so it is always preempted. latest_deadline and
-- * highest_prio_rq are initialised only to silence the compiler. When
-- * all else is equal, still prefer this_rq.
-+ * Check to see if p can run on cpu, and if not, whether there are any online
-+ * CPUs it can run on instead.
-+ */
-+static inline int needs_other_cpu(struct task_struct *p, int cpu)
-+{
-+ if (unlikely(!cpu_isset(cpu, p->cpus_allowed) && online_cpus(p)))
-+ return 1;
-+ return 0;
-+}
-+
-+/*
-+ * latest_deadline and highest_prio_rq are initialised only to silence the
-+ * compiler. When all else is equal, still prefer this_rq.
- */
--#ifdef CONFIG_SMP
- static void try_preempt(struct task_struct *p, struct rq *this_rq)
- {
- struct rq *highest_prio_rq = this_rq;
-@@ -1194,6 +1258,10 @@ static void try_preempt(struct task_stru
- int highest_prio;
- cpumask_t tmp;
-
-+ /* IDLEPRIO tasks never preempt anything */
-+ if (p->policy == SCHED_IDLEPRIO)
-+ return;
-+
- if (suitable_idle_cpus(p)) {
- resched_best_idle(p);
- return;
-@@ -1220,30 +1288,32 @@ static void try_preempt(struct task_stru
- offset_deadline = rq->rq_deadline -
- cache_distance(this_rq, rq, p);
-
-- if (rq_prio > highest_prio ||
-- (deadline_after(offset_deadline, latest_deadline) ||
-- (offset_deadline == latest_deadline && this_rq == rq))) {
-+ if (rq_prio > highest_prio || (rq_prio == highest_prio &&
-+ deadline_after(offset_deadline, latest_deadline))) {
- latest_deadline = offset_deadline;
- highest_prio = rq_prio;
- highest_prio_rq = rq;
- }
- }
-
-- if (p->prio > highest_prio || (p->prio == highest_prio &&
-- p->policy == SCHED_NORMAL &&
-- !deadline_before(p->deadline, latest_deadline)))
-+ if (!can_preempt(p, highest_prio, highest_prio_rq->rq_deadline,
-+ highest_prio_rq->rq_policy))
- return;
-
-- /* p gets to preempt highest_prio_rq->curr */
- resched_task(highest_prio_rq->curr);
-- highest_prio_rq->skip_clock_update = 1;
- }
- #else /* CONFIG_SMP */
-+static inline int needs_other_cpu(struct task_struct *p, int cpu)
-+{
-+ return 0;
-+}
-+
- static void try_preempt(struct task_struct *p, struct rq *this_rq)
- {
-- if (p->prio < uprq->rq_prio ||
-- (p->prio == uprq->rq_prio && p->policy == SCHED_NORMAL &&
-- deadline_before(p->deadline, uprq->rq_deadline)))
-+ if (p->policy == SCHED_IDLEPRIO)
-+ return;
-+ if (can_preempt(p, uprq->rq_prio, uprq->rq_deadline,
-+ uprq->rq_policy))
- resched_task(uprq->curr);
- }
- #endif /* CONFIG_SMP */
-@@ -1331,12 +1401,15 @@ int wake_up_state(struct task_struct *p,
- return try_to_wake_up(p, state, 0);
- }
-
-+static void time_slice_expired(struct task_struct *p);
-+
- /*
- * Perform scheduler related setup for a newly forked process p.
- * p is forked by current.
- */
- void sched_fork(struct task_struct *p, int clone_flags)
- {
-+ struct task_struct *curr;
- int cpu = get_cpu();
- struct rq *rq;
-
-@@ -1376,10 +1449,11 @@ void sched_fork(struct task_struct *p, i
- p->sched_reset_on_fork = 0;
- }
-
-+ curr = current;
- /*
- * Make sure we do not leak PI boosting priority to the child:
- */
-- p->prio = current->normal_prio;
-+ p->prio = curr->normal_prio;
-
- INIT_LIST_HEAD(&p->run_list);
- #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
-@@ -1400,18 +1474,26 @@ void sched_fork(struct task_struct *p, i
- * total amount of pending timeslices in the system doesn't change,
- * resulting in more scheduling fairness. If it's negative, it won't
- * matter since that's the same as being 0. current's time_slice is
-- * actually in rq_time_slice when it's running.
-+ * actually in rq_time_slice when it's running, as is its last_ran
-+ * value. rq->rq_deadline is only modified within schedule() so it
-+ * is always equal to current->deadline.
- */
-- rq = task_grq_lock_irq(current);
-- if (likely(rq->rq_time_slice > 0)) {
-+ rq = task_grq_lock_irq(curr);
-+ if (likely(rq->rq_time_slice >= RESCHED_US * 2)) {
- rq->rq_time_slice /= 2;
-+ p->time_slice = rq->rq_time_slice;
-+ } else {
- /*
-- * The remainder of the first timeslice might be recovered by
-- * the parent if the child exits early enough.
-+ * Forking task has run out of timeslice. Reschedule it and
-+ * start its child with a new time slice and deadline. The
-+ * child will end up running first because its deadline will
-+ * be slightly earlier.
- */
-- p->first_time_slice = 1;
-+ rq->rq_time_slice = 0;
-+ set_tsk_need_resched(curr);
-+ time_slice_expired(p);
- }
-- p->time_slice = rq->rq_time_slice;
-+ p->last_ran = rq->rq_last_ran;
- task_grq_unlock_irq();
- out:
- put_cpu();
-@@ -1452,40 +1534,9 @@ void wake_up_new_task(struct task_struct
- task_grq_unlock(&flags);
- }
-
--/*
-- * Potentially available exiting-child timeslices are
-- * retrieved here - this way the parent does not get
-- * penalised for creating too many threads.
-- *
-- * (this cannot be used to 'generate' timeslices
-- * artificially, because any timeslice recovered here
-- * was given away by the parent in the first place.)
-- */
-+/* Nothing to do here */
- void sched_exit(struct task_struct *p)
- {
-- struct task_struct *parent;
-- unsigned long flags;
-- struct rq *rq;
--
-- if (unlikely(p->first_time_slice)) {
-- int *par_tslice, *p_tslice;
--
-- parent = p->parent;
-- par_tslice = &parent->time_slice;
-- p_tslice = &p->time_slice;
--
-- rq = task_grq_lock(parent, &flags);
-- /* The real time_slice of the "curr" task is on the rq var.*/
-- if (p == rq->curr)
-- p_tslice = &rq->rq_time_slice;
-- else if (parent == task_rq(parent)->curr)
-- par_tslice = &rq->rq_time_slice;
--
-- *par_tslice += *p_tslice;
-- if (unlikely(*par_tslice > timeslice()))
-- *par_tslice = timeslice();
-- task_grq_unlock(&flags);
-- }
- }
-
- #ifdef CONFIG_PREEMPT_NOTIFIERS
-@@ -1900,7 +1951,7 @@ update_cpu_clock(struct rq *rq, struct t
- else if (unlikely(time_diff > JIFFIES_TO_NS(1)))
- time_diff = JIFFIES_TO_NS(1);
-
-- rq->rq_time_slice -= time_diff / 1000;
-+ rq->rq_time_slice -= NS_TO_US(time_diff);
- }
- rq->rq_last_ran = rq->timekeep_clock = rq->clock;
- }
-@@ -1916,7 +1967,7 @@ static u64 do_task_delta_exec(struct tas
- u64 ns = 0;
-
- if (p == rq->curr) {
-- update_rq_clock(rq);
-+ update_clocks(rq);
- ns = rq->clock - rq->rq_last_ran;
- if (unlikely((s64)ns < 0))
- ns = 0;
-@@ -2090,10 +2141,22 @@ void account_idle_ticks(unsigned long ti
- }
- #endif
-
-+static inline void grq_iso_lock(void)
-+ __acquires(grq.iso_lock)
-+{
-+ spin_lock(&grq.iso_lock);
-+}
-+
-+static inline void grq_iso_unlock(void)
-+ __releases(grq.iso_lock)
-+{
-+ spin_unlock(&grq.iso_lock);
-+}
-+
- /*
- * Functions to test for when SCHED_ISO tasks have used their allocated
- * quota as real time scheduling and convert them back to SCHED_NORMAL.
-- * Where possible, the data is tested lockless, to avoid grabbing grq_lock
-+ * Where possible, the data is tested lockless, to avoid grabbing iso_lock
- * because the occasional inaccurate result won't matter. However the
- * tick data is only ever modified under lock. iso_refractory is only simply
- * set to 0 or 1 so it's not worth grabbing the lock yet again for that.
-@@ -2128,21 +2191,21 @@ static unsigned int test_ret_isorefracto
-
- static void iso_tick(void)
- {
-- grq_lock();
-+ grq_iso_lock();
- grq.iso_ticks += 100;
-- grq_unlock();
-+ grq_iso_unlock();
- }
-
- /* No SCHED_ISO task was running so decrease rq->iso_ticks */
- static inline void no_iso_tick(void)
- {
- if (grq.iso_ticks) {
-- grq_lock();
-+ grq_iso_lock();
- grq.iso_ticks -= grq.iso_ticks / ISO_PERIOD + 1;
- if (unlikely(grq.iso_refractory && grq.iso_ticks <
- ISO_PERIOD * (sched_iso_cpu * 115 / 128)))
- clear_iso_refractory();
-- grq_unlock();
-+ grq_iso_unlock();
- }
- }
-
-@@ -2181,10 +2244,23 @@ static void task_running_tick(struct rq
- }
-
- /* SCHED_FIFO tasks never run out of timeslice. */
-- if (rq_idle(rq) || rq->rq_time_slice > 0 || rq->rq_policy == SCHED_FIFO)
-+ if (rq->rq_policy == SCHED_FIFO)
- return;
-+ /*
-+ * Tasks that were scheduled in the first half of a tick are not
-+ * allowed to run into the 2nd half of the next tick if they will
-+ * run out of time slice in the interim. Otherwise, if they have
-+ * less than 100us of time slice left they will be rescheduled.
-+ */
-+ if (rq->dither) {
-+ if (rq->rq_time_slice > HALF_JIFFY_US)
-+ return;
-+ else
-+ rq->rq_time_slice = 0;
-+ } else if (rq->rq_time_slice >= RESCHED_US)
-+ return;
-
-- /* p->time_slice <= 0. We only modify task_struct under grq lock */
-+ /* p->time_slice < RESCHED_US. We only modify task_struct under grq lock */
- p = rq->curr;
- requeue_task(p);
- grq_lock();
-@@ -2205,13 +2281,14 @@ void scheduler_tick(void)
- struct rq *rq = cpu_rq(cpu);
-
- sched_clock_tick();
-+ /* grq lock not grabbed, so only update rq clock */
- update_rq_clock(rq);
- update_cpu_clock(rq, rq->curr, 1);
-- update_gjiffies();
- if (!rq_idle(rq))
- task_running_tick(rq);
- else
- no_iso_tick();
-+ rq->last_tick = rq->clock;
- }
-
- #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
-@@ -2273,7 +2350,7 @@ EXPORT_SYMBOL(sub_preempt_count);
- #endif
-
- /*
-- * Deadline is "now" in gjiffies + (offset by priority). Setting the deadline
-+ * Deadline is "now" in niffies + (offset by priority). Setting the deadline
- * is the key to everything. It distributes cpu fairly amongst tasks of the
- * same nice value, it proportions cpu according to nice level, it means the
- * task that last woke up the longest ago has the earliest deadline, thus
-@@ -2283,7 +2360,7 @@ EXPORT_SYMBOL(sub_preempt_count);
- */
- static inline int prio_deadline_diff(int user_prio)
- {
-- return (prio_ratios[user_prio] * rr_interval * HZ / (1000 * 100)) ? : 1;
-+ return (prio_ratios[user_prio] * rr_interval * (MS_TO_NS(1) / 128));
- }
-
- static inline int task_deadline_diff(struct task_struct *p)
-@@ -2296,25 +2373,33 @@ static inline int static_deadline_diff(i
- return prio_deadline_diff(USER_PRIO(static_prio));
- }
-
--static inline int longest_deadline_diff(void)
-+static inline int ms_longest_deadline_diff(void)
- {
-- return prio_deadline_diff(39);
-+ return NS_TO_MS(prio_deadline_diff(39));
- }
-
- /*
- * The time_slice is only refilled when it is empty and that is when we set a
- * new deadline.
- */
--static inline void time_slice_expired(struct task_struct *p)
-+static void time_slice_expired(struct task_struct *p)
- {
-- reset_first_time_slice(p);
- p->time_slice = timeslice();
-- p->deadline = gjiffies + task_deadline_diff(p);
-+ p->deadline = grq.niffies + task_deadline_diff(p);
- }
-
-+/*
-+ * Timeslices below RESCHED_US are considered as good as expired, as
-+ * there's no point rescheduling when so little time is left. SCHED_BATCH
-+ * tasks have been flagged as not latency sensitive and are likely to be
-+ * fully CPU bound, so every time they're descheduled they have their
-+ * time_slice refilled. However, they get a new, later deadline each
-+ * time so that they have as little effect as possible on SCHED_NORMAL
-+ * tasks.
-+ */
- static inline void check_deadline(struct task_struct *p)
- {
-- if (p->time_slice <= 0)
-+ if (p->time_slice < RESCHED_US || batch_task(p))
- time_slice_expired(p);
- }
-
-@@ -2352,7 +2437,7 @@ retry:
- queue = grq.queue + idx;
- list_for_each_entry(p, queue, run_list) {
- /* Make sure cpu affinity is ok */
-- if (online_cpus(p) && !cpu_isset(cpu, p->cpus_allowed))
-+ if (needs_other_cpu(p, cpu))
- continue;
- if (idx < MAX_RT_PRIO) {
- /* We found an rt task */
-@@ -2479,12 +2564,14 @@ need_resched_nonpreemptible:
- deactivate = 0;
- schedule_debug(prev);
-
-- local_irq_disable();
-- update_rq_clock(rq);
-+ grq_lock_irq();
-+ update_clocks(rq);
- update_cpu_clock(rq, prev, 0);
-- rq->skip_clock_update = 0;
-+ if (rq->clock - rq->last_tick > HALF_JIFFY_NS)
-+ rq->dither = 0;
-+ else
-+ rq->dither = 1;
-
-- grq_lock();
- clear_tsk_need_resched(prev);
-
- if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
-@@ -2500,35 +2587,53 @@ need_resched_nonpreemptible:
- prev->time_slice = rq->rq_time_slice;
- prev->deadline = rq->rq_deadline;
- check_deadline(prev);
-- return_task(prev, deactivate);
-- /* Task changed affinity off this cpu */
-- if (unlikely(!cpus_intersects(prev->cpus_allowed,
-- cpumask_of_cpu(cpu)))) {
-- if (online_cpus(prev))
-+ prev->last_ran = rq->clock;
-+
-+ /* Task changed affinity off this CPU */
-+ if (needs_other_cpu(prev, cpu))
-+ resched_suitable_idle(prev);
-+ else if (!deactivate) {
-+ if (!queued_notrunning()) {
-+ /*
-+ * We now know prev is the only thing that is
-+ * awaiting CPU so we can bypass rechecking for
-+ * the earliest deadline task and just run it
-+ * again.
-+ */
-+ grq_unlock_irq();
-+ goto rerun_prev_unlocked;
-+ } else {
-+ /*
-+ * If prev got kicked off by a task that has to
-+ * run on this CPU for affinity reasons then
-+ * there may be an idle CPU it can go to.
-+ */
- resched_suitable_idle(prev);
- }
-+ }
-+ return_task(prev, deactivate);
- }
-
-- if (likely(queued_notrunning())) {
-- next = earliest_deadline_task(rq, idle);
-- } else {
-+ if (unlikely(!queued_notrunning())) {
-+ /*
-+ * This CPU is now truly idle as opposed to when idle is
-+ * scheduled as a high priority task in its own right.
-+ */
- next = idle;
- schedstat_inc(rq, sched_goidle);
-- }
--
-- prefetch(next);
-- prefetch_stack(next);
--
-- if (task_idle(next))
- set_cpuidle_map(cpu);
-- else
-+ } else {
-+ next = earliest_deadline_task(rq, idle);
-+ prefetch(next);
-+ prefetch_stack(next);
- clear_cpuidle_map(cpu);
--
-- prev->last_ran = rq->clock;
-+ }
-
- if (likely(prev != next)) {
- sched_info_switch(prev, next);
-
-+ if (prev != idle)
-+ set_last_task(rq, prev);
- set_rq_task(rq, next);
- grq.nr_switches++;
- prev->oncpu = 0;
-@@ -2547,10 +2652,15 @@ need_resched_nonpreemptible:
- } else
- grq_unlock_irq();
-
-- if (unlikely(reacquire_kernel_lock(current) < 0))
-+rerun_prev_unlocked:
-+ if (unlikely(reacquire_kernel_lock(current) < 0)) {
-+// prev = rq->curr;
-+// switch_count = &prev->nivcsw;
- goto need_resched_nonpreemptible;
-+ }
-+
- preempt_enable_no_resched();
-- if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
-+ if (need_resched())
- goto need_resched;
- }
- EXPORT_SYMBOL(schedule);
-@@ -3066,8 +3176,9 @@ int task_prio(const struct task_struct *
- if (prio <= 0)
- goto out;
-
-- delta = p->deadline - gjiffies;
-- delta = delta * 40 / longest_deadline_diff();
-+ /* Convert to ms to avoid overflows */
-+ delta = NS_TO_MS(p->deadline - grq.niffies);
-+ delta = delta * 40 / ms_longest_deadline_diff();
- if (delta > 0 && delta <= 80)
- prio += delta;
- if (idleprio_task(p))
-@@ -3266,7 +3377,7 @@ recheck:
- policy = oldpolicy = -1;
- goto recheck;
- }
-- update_rq_clock(rq);
-+ update_clocks(rq);
- p->sched_reset_on_fork = reset_on_fork;
-
- queued = task_queued(p);
-@@ -4444,7 +4556,6 @@ migration_call(struct notifier_block *nf
-
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
-- cpuset_lock(); /* around calls to cpuset_cpus_allowed_lock() */
- idle = rq->idle;
- /* Idle task back to normal (off runqueue, low prio) */
- grq_lock_irq();
-@@ -4453,9 +4564,8 @@ migration_call(struct notifier_block *nf
- __setscheduler(idle, rq, SCHED_NORMAL, 0);
- idle->prio = PRIO_LIMIT;
- set_rq_task(rq, idle);
-- update_rq_clock(rq);
-+ update_clocks(rq);
- grq_unlock_irq();
-- cpuset_unlock();
- break;
-
- case CPU_DYING:
-@@ -5982,12 +6093,14 @@ void __init sched_init(void)
- int i;
- struct rq *rq;
-
-- prio_ratios[0] = 100;
-+ prio_ratios[0] = 128;
- for (i = 1 ; i < PRIO_RANGE ; i++)
- prio_ratios[i] = prio_ratios[i - 1] * 11 / 10;
-
- spin_lock_init(&grq.lock);
- grq.nr_running = grq.nr_uninterruptible = grq.nr_switches = 0;
-+ grq.niffies = 0;
-+ spin_lock_init(&grq.iso_lock);
- grq.iso_ticks = grq.iso_refractory = 0;
- #ifdef CONFIG_SMP
- init_defrootdomain();
-@@ -6000,7 +6113,9 @@ void __init sched_init(void)
- rq = cpu_rq(i);
- rq->user_pc = rq->nice_pc = rq->softirq_pc = rq->system_pc =
- rq->iowait_pc = rq->idle_pc = 0;
-+ rq->dither = 0;
- #ifdef CONFIG_SMP
-+ rq->last_niffy = 0;
- rq->sd = NULL;
- rq->rd = NULL;
- rq->online = 0;
-@@ -6219,10 +6334,6 @@ cputime_t task_stime(struct task_struct
- }
- #endif
-
--void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
--{
--}
--
- inline cputime_t task_gtime(struct task_struct *p)
- {
- return p->gtime;
-Index: kernel-2.6.28/kernel/sysctl.c
-===================================================================
---- kernel-2.6.28.orig/kernel/sysctl.c
-+++ kernel-2.6.28/kernel/sysctl.c
-@@ -102,7 +102,7 @@ static int __read_mostly one_hundred = 1
- #ifdef CONFIG_SCHED_BFS
- extern int rr_interval;
- extern int sched_iso_cpu;
--static int __read_mostly five_thousand = 5000;
-+static int __read_mostly one_thousand = 1000;
- #endif
- /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
- static int maxolduid = 65535;
-@@ -732,7 +732,7 @@ static struct ctl_table kern_table[] = {
- .proc_handler = &proc_dointvec_minmax,
- .strategy = &sysctl_intvec,
- .extra1 = &one,
-- .extra2 = &five_thousand,
-+ .extra2 = &one_thousand,
- },
- {
- .ctl_name = CTL_UNNUMBERED,