From 835ec08ba453b4bf94152feb11516e3732a5a40b Mon Sep 17 00:00:00 2001
From: Corey O'Connor
Date: Mon, 27 Sep 2010 14:00:48 -0700
Subject: [PATCH] adding initial BFS 330 to 350 patch

---
 .../debian/patches/bfs-330-to-350.patch    | 1036 ++++++++++++++++++++
 kernel-power-2.6.28/debian/patches/series  |    1 +
 2 files changed, 1037 insertions(+)
 create mode 100644 kernel-power-2.6.28/debian/patches/bfs-330-to-350.patch

diff --git a/kernel-power-2.6.28/debian/patches/bfs-330-to-350.patch b/kernel-power-2.6.28/debian/patches/bfs-330-to-350.patch
new file mode 100644
index 0000000..46af1f9
--- /dev/null
+++ b/kernel-power-2.6.28/debian/patches/bfs-330-to-350.patch
@@ -0,0 +1,1036 @@
+- Major overhaul of queued changes.
+
+- Microoptimise multiplications/divisions to be shifts where suitable.
+
+- Change ISO calculations to have their own lock so as to not grab the grq
+lock during a scheduler tick.
+
+- Change all deadline accounting to use nanosecond values.
+
+- Introduce local niffies variable which is updated from any runqueue using the
+TSC clock whenever the grq lock is taken. Use niffies to compare deadlines to.
+This will give much more granular deadlines when jiffies are low resolution
+such as 100Hz, and rarely will tasks have the same deadlines now.
+
+- Drop the "skip_clock_update" concept as we update the niffies each time we
+update the rq clocks, thus we want to update it more often.
+
+- Rework try_preempt.
+
+- Bypass rechecking deadline when we know that prev will run again in schedule.
+
+- Check to see if prev can run on an idle CPU when being descheduled as may
+happen when a task must use a certain CPU for affinity reasons.
+
+- Decrease maximum rr_interval possible to 1000 (one second) as there is no
+demonstrable advantage to higher values, and may overflow with other changes
+being introduced.
+
+- Check for when tasks are scheduled within a tick to see if they're in the
+1st or 2nd half of the tick. Use this to decide how far into last tick a task
+can run. This will make the greatest difference on lower Hz values with small
+rr intervals.
+
+- Change the test for exhausted time slice to 100us as rescheduling with less
+time available than this will either greatly overrun its quota or reschedule
+very quickly.
+
+- Change SCHED_BATCH tasks to refill timeslices and reset deadline every time
+they're descheduled as they've been flagged as latency insensitive, likely
+fully CPU bound tasks. This should decrease the impact running batch tasks
+has on other tasks.
+
+- Microoptimise before context switch in schedule()
+
+- Add a local last_task variable to each runqueue which keeps a copy of the
+last non-idle task that ran on this CPU. Use this value to determine that a
+task is still cache warm on this CPU even if it has run elsewhere in the
+meantime. This improves throughput on relatively idle systems with >2 logical
+CPUs.
+
+- Remove the first_time_slice concept as it wasn't contributing on any
+meaningful level but was adding to overhead, especially on sched_exit.
+
+- Check that when a task forks and loses timeslice to its child that it isn't
+due to be descheduled and ensure that a child inherits the proper variables
+from its parent across fork.
+
+- Fix try_preempt which may have been picking a higher priority runqueue on
+SMP if it had a later deadline. Remove the test for equal deadline as
+nanosecond deadline resolution means this will never happen.
+
+- Random other minor cleanups.
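The "shifts where suitable" item above shows up in the patch body below as the
MS_TO_NS/MS_TO_US/NS_TO_MS/NS_TO_US macros, which treat a millisecond as 2^20 ns
and a microsecond as 2^10 ns. As a rough illustration only, and not part of the
patch, here is a user-space sketch of those macros and the roughly 5% overshoot
they accept in exchange for avoiding multiplies and divides; the rr_ms value is
a made-up example, not a value taken from the patch.

#include <stdio.h>

/* Approximate conversions in the style of the patch: shifts instead of
 * multiplying/dividing by powers of ten. */
#define MS_TO_NS(t)  ((t) << 20)   /* ~1,048,576 ns per "ms" instead of 1,000,000 */
#define MS_TO_US(t)  ((t) << 10)   /* ~1,024 us per "ms" instead of 1,000 */
#define NS_TO_MS(t)  ((t) >> 20)
#define NS_TO_US(t)  ((t) >> 10)

int main(void)
{
	long long rr_ms = 6;	/* hypothetical rr_interval in milliseconds */
	long long exact = rr_ms * 1000000LL;
	long long approx = MS_TO_NS(rr_ms);

	printf("exact:  %lld ns\n", exact);
	printf("approx: %lld ns (error %+.1f%%)\n",
	       approx, 100.0 * (approx - exact) / exact);
	return 0;
}

The error is a constant factor in one direction, so deadline comparisons between
tasks are unaffected; only the absolute scale shifts slightly.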
+ +-ck +--- + + Documentation/scheduler/sched-BFS.txt | 2 + include/linux/sched.h | 4 + kernel/sched_bfs.c | 466 +++++++++++++++++++++------------- + kernel/sysctl.c | 4 + 4 files changed, 293 insertions(+), 183 deletions(-) + +Index: linux-2.6.35-bfs/Documentation/scheduler/sched-BFS.txt +=================================================================== +--- linux-2.6.35-bfs.orig/Documentation/scheduler/sched-BFS.txt 2010-09-25 08:18:30.134360792 +1000 ++++ linux-2.6.35-bfs/Documentation/scheduler/sched-BFS.txt 2010-09-25 08:20:25.830887001 +1000 +@@ -257,7 +257,7 @@ uniprocessor machine, and automatically + multiprocessor machines. The reasoning behind increasing the value on more CPUs + is that the effective latency is decreased by virtue of there being more CPUs on + BFS (for reasons explained above), and increasing the value allows for less +-cache contention and more throughput. Valid values are from 1 to 5000 ++cache contention and more throughput. Valid values are from 1 to 1000 + Decreasing the value will decrease latencies at the cost of decreasing + throughput, while increasing it will improve throughput, but at the cost of + worsening latencies. The accuracy of the rr interval is limited by HZ resolution +Index: linux-2.6.35-bfs/include/linux/sched.h +=================================================================== +--- linux-2.6.35-bfs.orig/include/linux/sched.h 2010-09-25 08:18:08.792894602 +1000 ++++ linux-2.6.35-bfs/include/linux/sched.h 2010-09-25 08:20:25.822886826 +1000 +@@ -1197,7 +1197,7 @@ struct task_struct { + int prio, static_prio, normal_prio; + unsigned int rt_priority; + #ifdef CONFIG_SCHED_BFS +- int time_slice, first_time_slice; ++ int time_slice; + u64 deadline; + struct list_head run_list; + u64 last_ran; +@@ -1547,7 +1547,7 @@ static inline void tsk_cpus_current(stru + + static inline void print_scheduler_version(void) + { +- printk(KERN_INFO"BFS CPU scheduler v0.330 by Con Kolivas.\n"); ++ printk(KERN_INFO"BFS CPU scheduler v0.350 by Con Kolivas.\n"); + } + + static inline int iso_task(struct task_struct *p) +Index: linux-2.6.35-bfs/kernel/sched_bfs.c +=================================================================== +--- linux-2.6.35-bfs.orig/kernel/sched_bfs.c 2010-09-25 08:18:08.804894864 +1000 ++++ linux-2.6.35-bfs/kernel/sched_bfs.c 2010-09-25 08:20:25.827886935 +1000 +@@ -106,10 +106,19 @@ + #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) + #define SCHED_PRIO(p) ((p)+MAX_RT_PRIO) + +-/* Some helpers for converting to/from various scales.*/ ++/* ++ * Some helpers for converting to/from various scales. Use shifts to get ++ * approximate multiples of ten for less overhead. ++ */ + #define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) +-#define MS_TO_NS(TIME) ((TIME) * 1000000) +-#define MS_TO_US(TIME) ((TIME) * 1000) ++#define HALF_JIFFY_NS (1000000000 / HZ / 2) ++#define HALF_JIFFY_US (1000000 / HZ / 2) ++#define MS_TO_NS(TIME) ((TIME) << 20) ++#define MS_TO_US(TIME) ((TIME) << 10) ++#define NS_TO_MS(TIME) ((TIME) >> 20) ++#define NS_TO_US(TIME) ((TIME) >> 10) ++ ++#define RESCHED_US (100) /* Reschedule if less than this many us left */ + + /* + * This is the time all tasks within the same priority round robin. +@@ -140,8 +149,9 @@ static inline unsigned long timeslice(vo + } + + /* +- * The global runqueue data that all CPUs work off. All data is protected +- * by grq.lock. ++ * The global runqueue data that all CPUs work off. Data is protected either ++ * by the global grq lock, or the discrete lock that precedes the data in this ++ * struct. 
+ */ + struct global_rq { + raw_spinlock_t lock; +@@ -150,17 +160,17 @@ struct global_rq { + unsigned long long nr_switches; + struct list_head queue[PRIO_LIMIT]; + DECLARE_BITMAP(prio_bitmap, PRIO_LIMIT + 1); +- int iso_ticks; +- int iso_refractory; + #ifdef CONFIG_SMP + unsigned long qnr; /* queued not running */ + cpumask_t cpu_idle_map; + int idle_cpus; + #endif +-#if BITS_PER_LONG < 64 +- unsigned long jiffies; +- u64 jiffies_64; +-#endif ++ /* Nanosecond jiffies */ ++ u64 niffies; ++ ++ raw_spinlock_t iso_lock; ++ int iso_ticks; ++ int iso_refractory; + }; + + /* There can be only one */ +@@ -176,8 +186,8 @@ struct rq { + u64 nohz_stamp; + unsigned char in_nohz_recently; + #endif ++ struct task_struct *last_task; + #endif +- unsigned int skip_clock_update; + + struct task_struct *curr, *idle; + struct mm_struct *prev_mm; +@@ -213,9 +223,11 @@ struct rq { + /* See if all cache siblings are idle */ + cpumask_t cache_siblings; + #endif ++ u64 last_niffy; /* Last time this RQ updated grq.niffies */ + #endif ++ u64 clock, old_clock, last_tick; ++ int dither; + +- u64 clock; + #ifdef CONFIG_SCHEDSTATS + + /* latency stats */ +@@ -286,15 +298,6 @@ struct root_domain { + static struct root_domain def_root_domain; + #endif + +-static inline int cpu_of(struct rq *rq) +-{ +-#ifdef CONFIG_SMP +- return rq->cpu; +-#else +- return 0; +-#endif +-} +- + #define rcu_dereference_check_sched_domain(p) \ + rcu_dereference_check((p), \ + rcu_read_lock_sched_held() || \ +@@ -310,17 +313,65 @@ static inline int cpu_of(struct rq *rq) + #define for_each_domain(cpu, __sd) \ + for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent) + ++static inline void update_rq_clock(struct rq *rq); ++ + #ifdef CONFIG_SMP + #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) + #define this_rq() (&__get_cpu_var(runqueues)) + #define task_rq(p) cpu_rq(task_cpu(p)) + #define cpu_curr(cpu) (cpu_rq(cpu)->curr) ++static inline int cpu_of(struct rq *rq) ++{ ++ return rq->cpu; ++} ++ ++/* ++ * Niffies are a globally increasing nanosecond counter. Whenever a runqueue ++ * clock is updated with the grq.lock held, it is an opportunity to update the ++ * niffies value. Any CPU can update it by adding how much its clock has ++ * increased since it last updated niffies, minus any added niffies by other ++ * CPUs. ++ */ ++static inline void update_clocks(struct rq *rq) ++{ ++ s64 ndiff; ++ ++ update_rq_clock(rq); ++ ndiff = rq->clock - rq->old_clock; ++ /* old_clock is only updated when we are updating niffies */ ++ rq->old_clock = rq->clock; ++ ndiff -= grq.niffies - rq->last_niffy; ++ /* ++ * Sanity check should sched_clock return bogus values or be limited to ++ * just jiffy resolution. Some time will always have passed. 
++ */ ++ if (unlikely(ndiff < 1 || ndiff > MS_TO_NS(rr_interval))) ++ ndiff = 1; ++ grq.niffies += ndiff; ++ rq->last_niffy = grq.niffies; ++} + #else /* CONFIG_SMP */ + static struct rq *uprq; + #define cpu_rq(cpu) (uprq) + #define this_rq() (uprq) + #define task_rq(p) (uprq) + #define cpu_curr(cpu) ((uprq)->curr) ++static inline int cpu_of(struct rq *rq) ++{ ++ return 0; ++} ++ ++static inline void update_clocks(struct rq *rq) ++{ ++ s64 ndiff; ++ ++ update_rq_clock(rq); ++ ndiff = rq->clock - rq->old_clock; ++ rq->old_clock = rq->clock; ++ if (unlikely(ndiff < 1 || ndiff > MS_TO_US(rr_interval))) ++ ndiff = 1; ++ grq.niffies += ndiff; ++} + #endif + #define raw_rq() (&__raw_get_cpu_var(runqueues)) + +@@ -335,13 +386,13 @@ static struct rq *uprq; + + /* + * All common locking functions performed on grq.lock. rq->clock is local to +- * the cpu accessing it so it can be modified just with interrupts disabled, +- * but looking up task_rq must be done under grq.lock to be safe. ++ * the CPU accessing it so it can be modified just with interrupts disabled ++ * when we're not updating niffies. ++ * Looking up task_rq must be done under grq.lock to be safe. + */ +-inline void update_rq_clock(struct rq *rq) ++static inline void update_rq_clock(struct rq *rq) + { +- if (!rq->skip_clock_update) +- rq->clock = sched_clock_cpu(cpu_of(rq)); ++ rq->clock = sched_clock_cpu(cpu_of(rq)); + } + + static inline int task_running(struct task_struct *p) +@@ -370,8 +421,8 @@ static inline void grq_lock_irq(void) + static inline void time_lock_grq(struct rq *rq) + __acquires(grq.lock) + { +- update_rq_clock(rq); + grq_lock(); ++ update_clocks(rq); + } + + static inline void grq_unlock_irq(void) +@@ -405,7 +456,7 @@ static inline struct rq + __acquires(grq.lock) + { + struct rq *rq = task_grq_lock(p, flags); +- update_rq_clock(rq); ++ update_clocks(rq); + return rq; + } + +@@ -420,7 +471,7 @@ static inline void time_task_grq_lock_ir + __acquires(grq.lock) + { + struct rq *rq = task_grq_lock_irq(p); +- update_rq_clock(rq); ++ update_clocks(rq); + } + + static inline void task_grq_unlock_irq(void) +@@ -515,33 +566,6 @@ static inline void finish_lock_switch(st + } + #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ + +-/* +- * In order to have a monotonic clock that does not wrap we have a 64 bit +- * unsigned long that's protected by grq.lock used in place of jiffies on +- * 32 bit builds. +- */ +-#if BITS_PER_LONG < 64 +-static inline void update_gjiffies(void) +-{ +- if (grq.jiffies != jiffies) { +- grq_lock(); +- grq.jiffies = jiffies; +- grq.jiffies_64++; +- grq_unlock(); +- } +-} +- +-#define gjiffies (grq.jiffies_64) +- +-#else /* BITS_PER_LONG < 64 */ +-static inline void update_gjiffies(void) +-{ +-} +- +-#define gjiffies jiffies +- +-#endif /* BITS_PER_LONG < 64 */ +- + static inline int deadline_before(u64 deadline, u64 time) + { + return (deadline < time); +@@ -574,17 +598,6 @@ static void dequeue_task(struct task_str + } + + /* +- * When a task is freshly forked, the first_time_slice flag is set to say +- * it has taken time_slice from its parent and if it exits on this first +- * time_slice it can return its time_slice back to the parent. +- */ +-static inline void reset_first_time_slice(struct task_struct *p) +-{ +- if (unlikely(p->first_time_slice)) +- p->first_time_slice = 0; +-} +- +-/* + * To determine if it's safe for a task of SCHED_IDLEPRIO to actually run as + * an idle task, we ensure none of the following conditions are met. 
+ */ +@@ -646,11 +659,11 @@ static inline int task_prio_ratio(struct + /* + * task_timeslice - all tasks of all priorities get the exact same timeslice + * length. CPU distribution is handled by giving different deadlines to +- * tasks of different priorities. ++ * tasks of different priorities. Use 128 as the base value for fast shifts. + */ + static inline int task_timeslice(struct task_struct *p) + { +- return (rr_interval * task_prio_ratio(p) / 100); ++ return (rr_interval * task_prio_ratio(p) / 128); + } + + #ifdef CONFIG_SMP +@@ -702,6 +715,15 @@ static int suitable_idle_cpus(struct tas + + static void resched_task(struct task_struct *p); + ++/* ++ * last_task stores the last non-idle task scheduled on the local rq for ++ * cache warmth testing. ++ */ ++static inline void set_last_task(struct rq *rq, struct task_struct *p) ++{ ++ rq->last_task = p; ++} ++ + #define CPUIDLE_CACHE_BUSY (1) + #define CPUIDLE_DIFF_CPU (2) + #define CPUIDLE_THREAD_BUSY (4) +@@ -724,6 +746,9 @@ static void resched_task(struct task_str + * Other node, other CPU, idle cache, idle threads. + * Other node, other CPU, busy cache, idle threads. + * Other node, other CPU, busy threads. ++ * ++ * If p was the last task running on this rq, then regardless of where ++ * it has been running since then, it is cache warm on this rq. + */ + static void resched_best_idle(struct task_struct *p) + { +@@ -756,11 +781,14 @@ static void resched_best_idle(struct tas + tmp_rq = cpu_rq(cpu_tmp); + + if (rq->cpu_locality[cpu_tmp]) { ++ /* Check rq->last_task hasn't been dereferenced */ ++ if (rq->last_task && p != rq->last_task) { + #ifdef CONFIG_NUMA +- if (rq->cpu_locality[cpu_tmp] > 1) +- ranking |= CPUIDLE_DIFF_NODE; ++ if (rq->cpu_locality[cpu_tmp] > 1) ++ ranking |= CPUIDLE_DIFF_NODE; + #endif +- ranking |= CPUIDLE_DIFF_CPU; ++ ranking |= CPUIDLE_DIFF_CPU; ++ } + } + #ifdef CONFIG_SCHED_MC + if (!(tmp_rq->cache_idle(cpu_tmp))) +@@ -802,6 +830,11 @@ static inline void resched_suitable_idle + static inline int + cache_distance(struct rq *task_rq, struct rq *rq, struct task_struct *p) + { ++ /* Check rq->last_task hasn't been dereferenced */ ++ if (likely(rq->last_task)) { ++ if (rq->last_task == p) ++ return 0; ++ } + return rq->cpu_locality[cpu_of(task_rq)] * task_timeslice(p); + } + #else /* CONFIG_SMP */ +@@ -840,6 +873,10 @@ cache_distance(struct rq *task_rq, struc + { + return 0; + } ++ ++static inline void set_last_task(struct rq *rq, struct task_struct *p) ++{ ++} + #endif /* CONFIG_SMP */ + + /* +@@ -887,7 +924,7 @@ static int effective_prio(struct task_st + */ + static void activate_task(struct task_struct *p, struct rq *rq) + { +- update_rq_clock(rq); ++ update_clocks(rq); + + /* + * Sleep time is in units of nanosecs, so shift by 20 to get a +@@ -1157,8 +1194,28 @@ EXPORT_SYMBOL_GPL(kick_process); + #endif + + #define rq_idle(rq) ((rq)->rq_prio == PRIO_LIMIT) +-#define task_idle(p) ((p)->prio == PRIO_LIMIT) + ++/* ++ * RT tasks preempt purely on priority. SCHED_NORMAL tasks preempt on the ++ * basis of earlier deadlines. SCHED_IDLEPRIO don't preempt anything else or ++ * between themselves, they cooperatively multitask. An idle rq scores as ++ * prio PRIO_LIMIT so it is always preempted. 
++ */ ++static inline int ++can_preempt(struct task_struct *p, int prio, unsigned long deadline, ++ unsigned int policy) ++{ ++ /* Better static priority RT task or better policy preemption */ ++ if (p->prio < prio) ++ return 1; ++ if (p->prio > prio) ++ return 0; ++ /* SCHED_NORMAL, BATCH and ISO will preempt based on deadline */ ++ if (!deadline_before(p->deadline, deadline)) ++ return 0; ++ return 1; ++} ++#ifdef CONFIG_SMP + #ifdef CONFIG_HOTPLUG_CPU + /* + * Check to see if there is a task that is affined only to offline CPUs but +@@ -1178,14 +1235,20 @@ static inline int online_cpus(struct tas + #endif + + /* +- * RT tasks preempt purely on priority. SCHED_NORMAL tasks preempt on the +- * basis of earlier deadlines. SCHED_BATCH, ISO and IDLEPRIO don't preempt +- * between themselves, they cooperatively multitask. An idle rq scores as +- * prio PRIO_LIMIT so it is always preempted. latest_deadline and +- * highest_prio_rq are initialised only to silence the compiler. When +- * all else is equal, still prefer this_rq. ++ * Check to see if p can run on cpu, and if not, whether there are any online ++ * CPUs it can run on instead. ++ */ ++static inline int needs_other_cpu(struct task_struct *p, int cpu) ++{ ++ if (unlikely(!cpu_isset(cpu, p->cpus_allowed) && online_cpus(p))) ++ return 1; ++ return 0; ++} ++ ++/* ++ * latest_deadline and highest_prio_rq are initialised only to silence the ++ * compiler. When all else is equal, still prefer this_rq. + */ +-#ifdef CONFIG_SMP + static void try_preempt(struct task_struct *p, struct rq *this_rq) + { + struct rq *highest_prio_rq = this_rq; +@@ -1193,6 +1256,10 @@ static void try_preempt(struct task_stru + int highest_prio; + cpumask_t tmp; + ++ /* IDLEPRIO tasks never preempt anything */ ++ if (p->policy == SCHED_IDLEPRIO) ++ return; ++ + if (suitable_idle_cpus(p)) { + resched_best_idle(p); + return; +@@ -1219,30 +1286,32 @@ static void try_preempt(struct task_stru + offset_deadline = rq->rq_deadline - + cache_distance(this_rq, rq, p); + +- if (rq_prio > highest_prio || +- (deadline_after(offset_deadline, latest_deadline) || +- (offset_deadline == latest_deadline && this_rq == rq))) { ++ if (rq_prio > highest_prio || (rq_prio == highest_prio && ++ deadline_after(offset_deadline, latest_deadline))) { + latest_deadline = offset_deadline; + highest_prio = rq_prio; + highest_prio_rq = rq; + } + } + +- if (p->prio > highest_prio || (p->prio == highest_prio && +- p->policy == SCHED_NORMAL && +- !deadline_before(p->deadline, latest_deadline))) ++ if (!can_preempt(p, highest_prio, highest_prio_rq->rq_deadline, ++ highest_prio_rq->rq_policy)) + return; + +- /* p gets to preempt highest_prio_rq->curr */ + resched_task(highest_prio_rq->curr); +- highest_prio_rq->skip_clock_update = 1; + } + #else /* CONFIG_SMP */ ++static inline int needs_other_cpu(struct task_struct *p, int cpu) ++{ ++ return 0; ++} ++ + static void try_preempt(struct task_struct *p, struct rq *this_rq) + { +- if (p->prio < uprq->rq_prio || +- (p->prio == uprq->rq_prio && p->policy == SCHED_NORMAL && +- deadline_before(p->deadline, uprq->rq_deadline))) ++ if (p->policy == SCHED_IDLEPRIO) ++ return; ++ if (can_preempt(p, uprq->rq_prio, uprq->rq_deadline, ++ uprq->rq_policy)) + resched_task(uprq->curr); + } + #endif /* CONFIG_SMP */ +@@ -1352,12 +1421,15 @@ int wake_up_state(struct task_struct *p, + return try_to_wake_up(p, state, 0); + } + ++static void time_slice_expired(struct task_struct *p); ++ + /* + * Perform scheduler related setup for a newly forked process p. 
+ * p is forked by current. + */ + void sched_fork(struct task_struct *p, int clone_flags) + { ++ struct task_struct *curr; + int cpu = get_cpu(); + struct rq *rq; + +@@ -1396,10 +1468,11 @@ void sched_fork(struct task_struct *p, i + p->sched_reset_on_fork = 0; + } + ++ curr = current; + /* + * Make sure we do not leak PI boosting priority to the child. + */ +- p->prio = current->normal_prio; ++ p->prio = curr->normal_prio; + + INIT_LIST_HEAD(&p->run_list); + #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) +@@ -1420,18 +1493,26 @@ void sched_fork(struct task_struct *p, i + * total amount of pending timeslices in the system doesn't change, + * resulting in more scheduling fairness. If it's negative, it won't + * matter since that's the same as being 0. current's time_slice is +- * actually in rq_time_slice when it's running. ++ * actually in rq_time_slice when it's running, as is its last_ran ++ * value. rq->rq_deadline is only modified within schedule() so it ++ * is always equal to current->deadline. + */ +- rq = task_grq_lock_irq(current); +- if (likely(rq->rq_time_slice > 0)) { ++ rq = task_grq_lock_irq(curr); ++ if (likely(rq->rq_time_slice >= RESCHED_US * 2)) { + rq->rq_time_slice /= 2; ++ p->time_slice = rq->rq_time_slice; ++ } else { + /* +- * The remainder of the first timeslice might be recovered by +- * the parent if the child exits early enough. ++ * Forking task has run out of timeslice. Reschedule it and ++ * start its child with a new time slice and deadline. The ++ * child will end up running first because its deadline will ++ * be slightly earlier. + */ +- p->first_time_slice = 1; ++ rq->rq_time_slice = 0; ++ set_tsk_need_resched(curr); ++ time_slice_expired(p); + } +- p->time_slice = rq->rq_time_slice; ++ p->last_ran = rq->rq_last_ran; + task_grq_unlock_irq(); + out: + put_cpu(); +@@ -1470,40 +1551,9 @@ void wake_up_new_task(struct task_struct + task_grq_unlock(&flags); + } + +-/* +- * Potentially available exiting-child timeslices are +- * retrieved here - this way the parent does not get +- * penalised for creating too many threads. +- * +- * (this cannot be used to 'generate' timeslices +- * artificially, because any timeslice recovered here +- * was given away by the parent in the first place.) 
+- */ ++/* Nothing to do here */ + void sched_exit(struct task_struct *p) + { +- struct task_struct *parent; +- unsigned long flags; +- struct rq *rq; +- +- if (unlikely(p->first_time_slice)) { +- int *par_tslice, *p_tslice; +- +- parent = p->parent; +- par_tslice = &parent->time_slice; +- p_tslice = &p->time_slice; +- +- rq = task_grq_lock(parent, &flags); +- /* The real time_slice of the "curr" task is on the rq var.*/ +- if (p == rq->curr) +- p_tslice = &rq->rq_time_slice; +- else if (parent == task_rq(parent)->curr) +- par_tslice = &rq->rq_time_slice; +- +- *par_tslice += *p_tslice; +- if (unlikely(*par_tslice > timeslice())) +- *par_tslice = timeslice(); +- task_grq_unlock(&flags); +- } + } + + #ifdef CONFIG_PREEMPT_NOTIFIERS +@@ -1981,7 +2031,7 @@ update_cpu_clock(struct rq *rq, struct t + else if (unlikely(time_diff > JIFFIES_TO_NS(1))) + time_diff = JIFFIES_TO_NS(1); + +- rq->rq_time_slice -= time_diff / 1000; ++ rq->rq_time_slice -= NS_TO_US(time_diff); + } + rq->rq_last_ran = rq->timekeep_clock = rq->clock; + } +@@ -1997,7 +2047,7 @@ static u64 do_task_delta_exec(struct tas + u64 ns = 0; + + if (p == rq->curr) { +- update_rq_clock(rq); ++ update_clocks(rq); + ns = rq->clock - rq->rq_last_ran; + if (unlikely((s64)ns < 0)) + ns = 0; +@@ -2171,10 +2221,22 @@ void account_idle_ticks(unsigned long ti + } + #endif + ++static inline void grq_iso_lock(void) ++ __acquires(grq.iso_lock) ++{ ++ raw_spin_lock(&grq.iso_lock); ++} ++ ++static inline void grq_iso_unlock(void) ++ __releases(grq.iso_lock) ++{ ++ raw_spin_unlock(&grq.iso_lock); ++} ++ + /* + * Functions to test for when SCHED_ISO tasks have used their allocated + * quota as real time scheduling and convert them back to SCHED_NORMAL. +- * Where possible, the data is tested lockless, to avoid grabbing grq_lock ++ * Where possible, the data is tested lockless, to avoid grabbing iso_lock + * because the occasional inaccurate result won't matter. However the + * tick data is only ever modified under lock. iso_refractory is only simply + * set to 0 or 1 so it's not worth grabbing the lock yet again for that. +@@ -2209,21 +2271,21 @@ static unsigned int test_ret_isorefracto + + static void iso_tick(void) + { +- grq_lock(); ++ grq_iso_lock(); + grq.iso_ticks += 100; +- grq_unlock(); ++ grq_iso_unlock(); + } + + /* No SCHED_ISO task was running so decrease rq->iso_ticks */ + static inline void no_iso_tick(void) + { + if (grq.iso_ticks) { +- grq_lock(); ++ grq_iso_lock(); + grq.iso_ticks -= grq.iso_ticks / ISO_PERIOD + 1; + if (unlikely(grq.iso_refractory && grq.iso_ticks < + ISO_PERIOD * (sched_iso_cpu * 115 / 128))) + clear_iso_refractory(); +- grq_unlock(); ++ grq_iso_unlock(); + } + } + +@@ -2262,10 +2324,23 @@ static void task_running_tick(struct rq + } + + /* SCHED_FIFO tasks never run out of timeslice. */ +- if (rq_idle(rq) || rq->rq_time_slice > 0 || rq->rq_policy == SCHED_FIFO) ++ if (rq->rq_policy == SCHED_FIFO) + return; ++ /* ++ * Tasks that were scheduled in the first half of a tick are not ++ * allowed to run into the 2nd half of the next tick if they will ++ * run out of time slice in the interim. Otherwise, if they have ++ * less than 100us of time slice left they will be rescheduled. ++ */ ++ if (rq->dither) { ++ if (rq->rq_time_slice > HALF_JIFFY_US) ++ return; ++ else ++ rq->rq_time_slice = 0; ++ } else if (rq->rq_time_slice >= RESCHED_US) ++ return; + +- /* p->time_slice <= 0. We only modify task_struct under grq lock */ ++ /* p->time_slice < RESCHED_US. 
We only modify task_struct under grq lock */ + p = rq->curr; + requeue_task(p); + grq_lock(); +@@ -2286,13 +2361,14 @@ void scheduler_tick(void) + struct rq *rq = cpu_rq(cpu); + + sched_clock_tick(); ++ /* grq lock not grabbed, so only update rq clock */ + update_rq_clock(rq); + update_cpu_clock(rq, rq->curr, 1); +- update_gjiffies(); + if (!rq_idle(rq)) + task_running_tick(rq); + else + no_iso_tick(); ++ rq->last_tick = rq->clock; + perf_event_task_tick(rq->curr); + } + +@@ -2354,7 +2430,7 @@ EXPORT_SYMBOL(sub_preempt_count); + #endif + + /* +- * Deadline is "now" in gjiffies + (offset by priority). Setting the deadline ++ * Deadline is "now" in niffies + (offset by priority). Setting the deadline + * is the key to everything. It distributes cpu fairly amongst tasks of the + * same nice value, it proportions cpu according to nice level, it means the + * task that last woke up the longest ago has the earliest deadline, thus +@@ -2364,7 +2440,7 @@ EXPORT_SYMBOL(sub_preempt_count); + */ + static inline int prio_deadline_diff(int user_prio) + { +- return (prio_ratios[user_prio] * rr_interval * HZ / (1000 * 100)) ? : 1; ++ return (prio_ratios[user_prio] * rr_interval * (MS_TO_NS(1) / 128)); + } + + static inline int task_deadline_diff(struct task_struct *p) +@@ -2377,25 +2453,33 @@ static inline int static_deadline_diff(i + return prio_deadline_diff(USER_PRIO(static_prio)); + } + +-static inline int longest_deadline_diff(void) ++static inline int ms_longest_deadline_diff(void) + { +- return prio_deadline_diff(39); ++ return NS_TO_MS(prio_deadline_diff(39)); + } + + /* + * The time_slice is only refilled when it is empty and that is when we set a + * new deadline. + */ +-static inline void time_slice_expired(struct task_struct *p) ++static void time_slice_expired(struct task_struct *p) + { +- reset_first_time_slice(p); + p->time_slice = timeslice(); +- p->deadline = gjiffies + task_deadline_diff(p); ++ p->deadline = grq.niffies + task_deadline_diff(p); + } + ++/* ++ * Timeslices below RESCHED_US are considered as good as expired as there's no ++ * point rescheduling when there's so little time left. SCHED_BATCH tasks ++ * have been flagged be not latency sensitive and likely to be fully CPU ++ * bound so every time they're rescheduled they have their time_slice ++ * refilled, but get a new later deadline to have little effect on ++ * SCHED_NORMAL tasks. 
++ ++ */ + static inline void check_deadline(struct task_struct *p) + { +- if (p->time_slice <= 0) ++ if (p->time_slice < RESCHED_US || batch_task(p)) + time_slice_expired(p); + } + +@@ -2433,7 +2517,7 @@ retry: + queue = grq.queue + idx; + list_for_each_entry(p, queue, run_list) { + /* Make sure cpu affinity is ok */ +- if (online_cpus(p) && !cpu_isset(cpu, p->cpus_allowed)) ++ if (needs_other_cpu(p, cpu)) + continue; + if (idx < MAX_RT_PRIO) { + /* We found an rt task */ +@@ -2560,12 +2644,14 @@ need_resched_nonpreemptible: + deactivate = 0; + schedule_debug(prev); + +- local_irq_disable(); +- update_rq_clock(rq); ++ grq_lock_irq(); ++ update_clocks(rq); + update_cpu_clock(rq, prev, 0); +- rq->skip_clock_update = 0; ++ if (rq->clock - rq->last_tick > HALF_JIFFY_NS) ++ rq->dither = 0; ++ else ++ rq->dither = 1; + +- grq_lock(); + clear_tsk_need_resched(prev); + + if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { +@@ -2581,36 +2667,54 @@ need_resched_nonpreemptible: + prev->time_slice = rq->rq_time_slice; + prev->deadline = rq->rq_deadline; + check_deadline(prev); +- return_task(prev, deactivate); +- /* Task changed affinity off this cpu */ +- if (unlikely(!cpus_intersects(prev->cpus_allowed, +- cpumask_of_cpu(cpu)))) { +- if (online_cpus(prev)) ++ prev->last_ran = rq->clock; ++ ++ /* Task changed affinity off this CPU */ ++ if (needs_other_cpu(prev, cpu)) ++ resched_suitable_idle(prev); ++ else if (!deactivate) { ++ if (!queued_notrunning()) { ++ /* ++ * We now know prev is the only thing that is ++ * awaiting CPU so we can bypass rechecking for ++ * the earliest deadline task and just run it ++ * again. ++ */ ++ grq_unlock_irq(); ++ goto rerun_prev_unlocked; ++ } else { ++ /* ++ * If prev got kicked off by a task that has to ++ * run on this CPU for affinity reasons then ++ * there may be an idle CPU it can go to. ++ */ + resched_suitable_idle(prev); + } ++ } ++ return_task(prev, deactivate); + } + +- if (likely(queued_notrunning())) { +- next = earliest_deadline_task(rq, idle); +- } else { ++ if (unlikely(!queued_notrunning())) { ++ /* ++ * This CPU is now truly idle as opposed to when idle is ++ * scheduled as a high priority task in its own right. 
++ */ + next = idle; + schedstat_inc(rq, sched_goidle); +- } +- +- prefetch(next); +- prefetch_stack(next); +- +- if (task_idle(next)) + set_cpuidle_map(cpu); +- else ++ } else { ++ next = earliest_deadline_task(rq, idle); ++ prefetch(next); ++ prefetch_stack(next); + clear_cpuidle_map(cpu); +- +- prev->last_ran = rq->clock; ++ } + + if (likely(prev != next)) { + sched_info_switch(prev, next); + perf_event_task_sched_out(prev, next); + ++ if (prev != idle) ++ set_last_task(rq, prev); + set_rq_task(rq, next); + grq.nr_switches++; + prev->oncpu = 0; +@@ -2629,6 +2733,7 @@ need_resched_nonpreemptible: + } else + grq_unlock_irq(); + ++rerun_prev_unlocked: + if (unlikely(reacquire_kernel_lock(current) < 0)) { + prev = rq->curr; + switch_count = &prev->nivcsw; +@@ -3324,8 +3429,9 @@ int task_prio(const struct task_struct * + if (prio <= 0) + goto out; + +- delta = p->deadline - gjiffies; +- delta = delta * 40 / longest_deadline_diff(); ++ /* Convert to ms to avoid overflows */ ++ delta = NS_TO_MS(p->deadline - grq.niffies); ++ delta = delta * 40 / ms_longest_deadline_diff(); + if (delta > 0 && delta <= 80) + prio += delta; + if (idleprio_task(p)) +@@ -3533,7 +3639,7 @@ recheck: + raw_spin_unlock_irqrestore(&p->pi_lock, flags); + goto recheck; + } +- update_rq_clock(rq); ++ update_clocks(rq); + p->sched_reset_on_fork = reset_on_fork; + + queued = task_queued(p); +@@ -4835,7 +4941,7 @@ migration_call(struct notifier_block *nf + __setscheduler(idle, rq, SCHED_NORMAL, 0); + idle->prio = PRIO_LIMIT; + set_rq_task(rq, idle); +- update_rq_clock(rq); ++ update_clocks(rq); + grq_unlock_irq(); + break; + +@@ -6531,12 +6637,14 @@ void __init sched_init(void) + int i; + struct rq *rq; + +- prio_ratios[0] = 100; ++ prio_ratios[0] = 128; + for (i = 1 ; i < PRIO_RANGE ; i++) + prio_ratios[i] = prio_ratios[i - 1] * 11 / 10; + + raw_spin_lock_init(&grq.lock); + grq.nr_running = grq.nr_uninterruptible = grq.nr_switches = 0; ++ grq.niffies = 0; ++ raw_spin_lock_init(&grq.iso_lock); + grq.iso_ticks = grq.iso_refractory = 0; + #ifdef CONFIG_SMP + init_defrootdomain(); +@@ -6549,7 +6657,9 @@ void __init sched_init(void) + rq = cpu_rq(i); + rq->user_pc = rq->nice_pc = rq->softirq_pc = rq->system_pc = + rq->iowait_pc = rq->idle_pc = 0; ++ rq->dither = 0; + #ifdef CONFIG_SMP ++ rq->last_niffy = 0; + rq->sd = NULL; + rq->rd = NULL; + rq->online = 0; +Index: linux-2.6.35-bfs/kernel/sysctl.c +=================================================================== +--- linux-2.6.35-bfs.orig/kernel/sysctl.c 2010-09-25 08:18:30.147361076 +1000 ++++ linux-2.6.35-bfs/kernel/sysctl.c 2010-09-25 08:20:25.823886848 +1000 +@@ -119,7 +119,7 @@ static int __maybe_unused one_hundred = + #ifdef CONFIG_SCHED_BFS + extern int rr_interval; + extern int sched_iso_cpu; +-static int __read_mostly five_thousand = 5000; ++static int __read_mostly one_thousand = 1000; + #endif + #ifdef CONFIG_PRINTK + static int ten_thousand = 10000; +@@ -794,7 +794,7 @@ static struct ctl_table kern_table[] = { + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .extra1 = &one, +- .extra2 = &five_thousand, ++ .extra2 = &one_thousand, + }, + { + .procname = "iso_cpu", diff --git a/kernel-power-2.6.28/debian/patches/series b/kernel-power-2.6.28/debian/patches/series index a8fde40..07fe2e7 100644 --- a/kernel-power-2.6.28/debian/patches/series +++ b/kernel-power-2.6.28/debian/patches/series @@ -33,6 +33,7 @@ overclock.diff bfs.patch bfs-316-to-318.patch bfs-318-to-330.patch +bfs-330-to-350.patch voltage_scaling_1.diff voltage_scaling_0.diff armthumb.diff -- 
1.7.9.5
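As a footnote to the deadline changes in the patch above: with prio_ratios[0]
raised to 128 and prio_deadline_diff() computed as
prio_ratios[user_prio] * rr_interval * (MS_TO_NS(1) / 128), the deadline offset
for the best nice level works out to roughly rr_interval expressed in
nanoseconds, growing by about 10% per nice level. The following standalone
sketch is not part of the patch; it assumes the usual 40 user nice levels and a
made-up rr_interval of 6 ms, and simply reproduces that table in user space.

#include <stdio.h>

#define PRIO_RANGE 40
#define MS_TO_NS(t) ((t) << 20)

static int prio_ratios[PRIO_RANGE];

int main(void)
{
	int rr_interval = 6;	/* hypothetical rr_interval in ms */
	int i;

	/* Same table build as sched_init() in the patch: base 128, +10% per level. */
	prio_ratios[0] = 128;
	for (i = 1; i < PRIO_RANGE; i++)
		prio_ratios[i] = prio_ratios[i - 1] * 11 / 10;

	/* prio_deadline_diff(): the offset added to grq.niffies to form a deadline. */
	for (i = 0; i < PRIO_RANGE; i += 13) {
		unsigned long long diff = (unsigned long long)prio_ratios[i] *
		    rr_interval * (MS_TO_NS(1) / 128);
		printf("user prio %2d -> deadline offset %llu ns\n", i, diff);
	}
	return 0;
}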