git.maemo.org Git - kernel-bfs/blob - kernel-bfs-2.6.28/debian/patches/bfs-363-to-400.patch

   1 diff -urpN linux-2.6.28.orig/Documentation/scheduler/sched-BFS.txt linux-2.6.28/Documentation/scheduler/sched-BFS.txt
   2 --- linux-2.6.28.orig/Documentation/scheduler/sched-BFS.txt     2011-04-10 21:50:27.152902332 +0200
   3 +++ linux-2.6.28/Documentation/scheduler/sched-BFS.txt  2011-04-10 21:32:08.616315645 +0200
   4 @@ -1,359 +1,347 @@
   5 -***************
   6 -*** 0 ****
   7 ---- 1,356 ----
   8 -+ BFS - The Brain Fuck Scheduler by Con Kolivas.
   9 -+
  10 -+ Goals.
  11 -+
  12 -+ The goal of the Brain Fuck Scheduler, referred to as BFS from here on, is to
  13 -+ completely do away with the complex designs of the past for the cpu process
  14 -+ scheduler and instead implement one that is very simple in basic design.
  15 -+ The main focus of BFS is to achieve excellent desktop interactivity and
  16 -+ responsiveness without heuristics and tuning knobs that are difficult to
  17 -+ understand, impossible to model and predict the effect of, and when tuned to
  18 -+ one workload cause massive detriment to another.
  19 -+
  20 -+
  21 -+ Design summary.
  22 -+
  23 -+ BFS is best described as a single runqueue, O(n) lookup, earliest effective
  24 -+ virtual deadline first design, loosely based on EEVDF (earliest eligible virtual
  25 -+ deadline first) and my previous Staircase Deadline scheduler. Each component
  26 -+ shall be described in order to understand the significance of, and reasoning for
  27 -+ it. The codebase when the first stable version was released was approximately
  28 -+ 9000 lines less code than the existing mainline linux kernel scheduler (in
  29 -+ 2.6.31). This does not even take into account the removal of documentation and
  30 -+ the cgroups code that is not used.
  31 -+
  32 -+ Design reasoning.
  33 -+
  34 -+ The single runqueue refers to the queued but not running processes for the
  35 -+ entire system, regardless of the number of CPUs. The reason for going back to
  36 -+ a single runqueue design is that once multiple runqueues are introduced,
  37 -+ per-CPU or otherwise, there will be complex interactions as each runqueue will
  38 -+ be responsible for the scheduling latency and fairness of the tasks only on its
  39 -+ own runqueue, and to achieve fairness and low latency across multiple CPUs, any
  40 -+ advantage in throughput of having CPU local tasks causes other disadvantages.
  41 -+ This is due to requiring a very complex balancing system to at best achieve some
  42 -+ semblance of fairness across CPUs and can only maintain relatively low latency
  43 -+ for tasks bound to the same CPUs, not across them. To increase said fairness
  44 -+ and latency across CPUs, the advantage of local runqueue locking, which makes
  45 -+ for better scalability, is lost due to having to grab multiple locks.
  46 -+
  47 -+ A significant feature of BFS is that all accounting is done purely based on CPU
  48 -+ used and nowhere is sleep time used in any way to determine entitlement or
  49 -+ interactivity. Interactivity "estimators" that use some kind of sleep/run
  50 -+ algorithm are doomed to fail to detect all interactive tasks, and to falsely tag
  51 -+ tasks that aren't interactive as being so. The reason for this is that it is
  52 -+ close to impossible to determine that when a task is sleeping, whether it is
  53 -+ doing it voluntarily, as in a userspace application waiting for input in the
  54 -+ form of a mouse click or otherwise, or involuntarily, because it is waiting for
  55 -+ another thread, process, I/O, kernel activity or whatever. Thus, such an
  56 -+ estimator will introduce corner cases, and more heuristics will be required to
  57 -+ cope with those corner cases, introducing more corner cases and failed
  58 -+ interactivity detection and so on. Interactivity in BFS is built into the design
  59 -+ by virtue of the fact that tasks that are waking up have not used up their quota
  60 -+ of CPU time, and have earlier effective deadlines, thereby making it very likely
  61 -+ they will preempt any CPU bound task of equivalent nice level. See below for
  62 -+ more information on the virtual deadline mechanism. Even if they do not preempt
  63 -+ a running task, because the rr interval is guaranteed to have a bound upper
  64 -+ limit on how long a task will wait for, it will be scheduled within a timeframe
  65 -+ that will not cause visible interface jitter.
  66 -+
  67 -+
  68 -+ Design details.
  69 -+
  70 -+ Task insertion.
  71 -+
  72 -+ BFS inserts tasks into each relevant queue as an O(1) insertion into a double
  73 -+ linked list. On insertion, *every* running queue is checked to see if the newly
  74 -+ queued task can run on any idle queue, or preempt the lowest running task on the
  75 -+ system. This is how the cross-CPU scheduling of BFS achieves significantly lower
  76 -+ latency per extra CPU the system has. In this case the lookup is, in the worst
  77 -+ case scenario, O(n) where n is the number of CPUs on the system.
  78 -+
  79 -+ Data protection.
  80 -+
  81 -+ BFS has one single lock protecting the process local data of every task in the
  82 -+ global queue. Thus every insertion, removal and modification of task data in the
  83 -+ global runqueue needs to grab the global lock. However, once a task is taken by
  84 -+ a CPU, the CPU has its own local data copy of the running process' accounting
  85 -+ information which only that CPU accesses and modifies (such as during a
  86 -+ timer tick) thus allowing the accounting data to be updated lockless. Once a
  87 -+ CPU has taken a task to run, it removes it from the global queue. Thus the
  88 -+ global queue only ever has, at most,
  89 -+
  90 -+      (number of tasks requesting cpu time) - (number of logical CPUs) + 1
  91 -+
  92 -+ tasks in the global queue. This value is relevant for the time taken to look up
  93 -+ tasks during scheduling. This will increase if many tasks with CPU affinity set
  94 -+ in their policy to limit which CPUs they're allowed to run on if they outnumber
  95 -+ the number of CPUs. The +1 is because when rescheduling a task, the CPU's
  96 -+ currently running task is put back on the queue. Lookup will be described after
  97 -+ the virtual deadline mechanism is explained.
  98 -+
  99 -+ Virtual deadline.
 100 -+
 101 -+ The key to achieving low latency, scheduling fairness, and "nice level"
 102 -+ distribution in BFS is entirely in the virtual deadline mechanism. The one
 103 -+ tunable in BFS is the rr_interval, or "round robin interval". This is the
 104 -+ maximum time two SCHED_OTHER (or SCHED_NORMAL, the common scheduling policy)
 105 -+ tasks of the same nice level will be running for, or looking at it the other
 106 -+ way around, the longest duration two tasks of the same nice level will be
 107 -+ delayed for. When a task requests cpu time, it is given a quota (time_slice)
 108 -+ equal to the rr_interval and a virtual deadline. The virtual deadline is
 109 -+ offset from the current time in jiffies by this equation:
 110 -+
 111 -+      jiffies + (prio_ratio * rr_interval)
 112 -+
 113 -+ The prio_ratio is determined as a ratio compared to the baseline of nice -20
 114 -+ and increases by 10% per nice level. The deadline is a virtual one only in that
 115 -+ no guarantee is placed that a task will actually be scheduled by this time, but
 116 -+ it is used to compare which task should go next. There are three components to
 117 -+ how a task is next chosen. First is time_slice expiration. If a task runs out
 118 -+ of its time_slice, it is descheduled, the time_slice is refilled, and the
 119 -+ deadline reset to that formula above. Second is sleep, where a task no longer
 120 -+ is requesting CPU for whatever reason. The time_slice and deadline are _not_
 121 -+ adjusted in this case and are just carried over for when the task is next
 122 -+ scheduled. Third is preemption, and that is when a newly waking task is deemed
 123 -+ higher priority than a currently running task on any cpu by virtue of the fact
 124 -+ that it has an earlier virtual deadline than the currently running task. The
 125 -+ earlier deadline is the key to which task is next chosen for the first and
 126 -+ second cases. Once a task is descheduled, it is put back on the queue, and an
 127 -+ O(n) lookup of all queued-but-not-running tasks is done to determine which has
 128 -+ the earliest deadline and that task is chosen to receive CPU next. The one
 129 -+ caveat to this is that if a deadline has already passed (jiffies is greater
 130 -+ than the deadline), the tasks are chosen in FIFO (first in first out) order as
 131 -+ the deadlines are old and their absolute value becomes decreasingly relevant
 132 -+ apart from being a flag that they have been asleep and deserve CPU time ahead
 133 -+ of all later deadlines.
 134 -+
 135 -+ The CPU proportion of different nice tasks works out to be approximately the
 136 -+
 137 -+      (prio_ratio difference)^2
 138 -+
 139 -+ The reason it is squared is that a task's deadline does not change while it is
 140 -+ running unless it runs out of time_slice. Thus, even if the time actually
 141 -+ passes the deadline of another task that is queued, it will not get CPU time
 142 -+ unless the current running task deschedules, and the time "base" (jiffies) is
 143 -+ constantly moving.
 144 -+
 145 -+ Task lookup.
 146 -+
 147 -+ BFS has 103 priority queues. 100 of these are dedicated to the static priority
 148 -+ of realtime tasks, and the remaining 3 are, in order of best to worst priority,
 149 -+ SCHED_ISO (isochronous), SCHED_NORMAL, and SCHED_IDLEPRIO (idle priority
 150 -+ scheduling). When a task of these priorities is queued, a bitmap of running
 151 -+ priorities is set showing which of these priorities has tasks waiting for CPU
 152 -+ time. When a CPU is made to reschedule, the lookup for the next task to get
 153 -+ CPU time is performed in the following way:
 154 -+
 155 -+ First the bitmap is checked to see what static priority tasks are queued. If
 156 -+ any realtime priorities are found, the corresponding queue is checked and the
 157 -+ first task listed there is taken (provided CPU affinity is suitable) and lookup
 158 -+ is complete. If the priority corresponds to a SCHED_ISO task, they are also
 159 -+ taken in FIFO order (as they behave like SCHED_RR). If the priority corresponds
 160 -+ to either SCHED_NORMAL or SCHED_IDLEPRIO, then the lookup becomes O(n). At this
 161 -+ stage, every task in the runlist that corresponds to that priority is checked
 162 -+ to see which has the earliest set deadline, and (provided it has suitable CPU
 163 -+ affinity) it is taken off the runqueue and given the CPU. If a task has an
 164 -+ expired deadline, it is taken and the rest of the lookup aborted (as they are
 165 -+ chosen in FIFO order).
 166 -+
 167 -+ Thus, the lookup is O(n) in the worst case only, where n is as described
 168 -+ earlier, as tasks may be chosen before the whole task list is looked over.
 169 -+
 170 -+
 171 -+ Scalability.
 172 -+
 173 -+ The major limitations of BFS will be that of scalability, as the separate
 174 -+ runqueue designs will have less lock contention as the number of CPUs rises.
 175 -+ However they do not scale linearly even with separate runqueues as multiple
 176 -+ runqueues will need to be locked concurrently on such designs to be able to
 177 -+ achieve fair CPU balancing, to try and achieve some sort of nice-level fairness
 178 -+ across CPUs, and to achieve low enough latency for tasks on a busy CPU when
 179 -+ other CPUs would be more suited. BFS has the advantage that it requires no
 180 -+ balancing algorithm whatsoever, as balancing occurs by proxy simply because
 181 -+ all CPUs draw off the global runqueue, in priority and deadline order. Despite
 182 -+ the fact that scalability is _not_ the prime concern of BFS, it both shows very
 183 -+ good scalability to smaller numbers of CPUs and is likely a more scalable design
 184 -+ at these numbers of CPUs.
 185 -+
 186 -+ It also has some very low overhead scalability features built into the design
 187 -+ when it has been deemed their overhead is so marginal that they're worth adding.
 188 -+ The first is the local copy of the running process' data to the CPU it's running
 189 -+ on to allow that data to be updated lockless where possible. Then there is
 190 -+ deference paid to the last CPU a task was running on, by trying that CPU first
 191 -+ when looking for an idle CPU to use the next time it's scheduled. Finally there
 192 -+ is the notion of cache locality beyond the last running CPU. The sched_domains
 193 -+ information is used to determine the relative virtual "cache distance" that
 194 -+ other CPUs have from the last CPU a task was running on. CPUs with shared
 195 -+ caches, such as SMT siblings, or multicore CPUs with shared caches, are treated
 196 -+ as cache local. CPUs without shared caches are treated as not cache local, and
 197 -+ CPUs on different NUMA nodes are treated as very distant. This "relative cache
 198 -+ distance" is used by modifying the virtual deadline value when doing lookups.
 199 -+ Effectively, the deadline is unaltered between "cache local" CPUs, doubled for
 200 -+ "cache distant" CPUs, and quadrupled for "very distant" CPUs. The reasoning
 201 -+ behind the doubling of deadlines is as follows. The real cost of migrating a
 202 -+ task from one CPU to another is entirely dependant on the cache footprint of
 203 -+ the task, how cache intensive the task is, how long it's been running on that
 204 -+ CPU to take up the bulk of its cache, how big the CPU cache is, how fast and
 205 -+ how layered the CPU cache is, how fast a context switch is... and so on. In
 206 -+ other words, it's close to random in the real world where we do more than just
 207 -+ one sole workload. The only thing we can be sure of is that it's not free. So
 208 -+ BFS uses the principle that an idle CPU is a wasted CPU and utilising idle CPUs
 209 -+ is more important than cache locality, and cache locality only plays a part
 210 -+ after that. Doubling the effective deadline is based on the premise that the
 211 -+ "cache local" CPUs will tend to work on the same tasks up to double the number
 212 -+ of cache local CPUs, and once the workload is beyond that amount, it is likely
 213 -+ that none of the tasks are cache warm anywhere anyway. The quadrupling for NUMA
 214 -+ is a value I pulled out of my arse.
 215 -+
 216 -+ When choosing an idle CPU for a waking task, the cache locality is determined
 217 -+ according to where the task last ran and then idle CPUs are ranked from best
 218 -+ to worst to choose the most suitable idle CPU based on cache locality, NUMA
 219 -+ node locality and hyperthread sibling business. They are chosen in the
 220 -+ following preference (if idle):
 221 -+
 222 -+ * Same core, idle or busy cache, idle threads
 223 -+ * Other core, same cache, idle or busy cache, idle threads.
 224 -+ * Same node, other CPU, idle cache, idle threads.
 225 -+ * Same node, other CPU, busy cache, idle threads.
 226 -+ * Same core, busy threads.
 227 -+ * Other core, same cache, busy threads.
 228 -+ * Same node, other CPU, busy threads.
 229 -+ * Other node, other CPU, idle cache, idle threads.
 230 -+ * Other node, other CPU, busy cache, idle threads.
 231 -+ * Other node, other CPU, busy threads.
 232 -+
 233 -+ This shows the SMT or "hyperthread" awareness in the design as well which will
 234 -+ choose a real idle core first before a logical SMT sibling which already has
 235 -+ tasks on the physical CPU.
 236 -+
 237 -+ Early benchmarking of BFS suggested scalability dropped off at the 16 CPU mark.
 238 -+ However this benchmarking was performed on an earlier design that was far less
 239 -+ scalable than the current one so it's hard to know how scalable it is in terms
 240 -+ of both CPUs (due to the global runqueue) and heavily loaded machines (due to
 241 -+ O(n) lookup) at this stage. Note that in terms of scalability, the number of
 242 -+ _logical_ CPUs matters, not the number of _physical_ CPUs. Thus, a dual (2x)
 243 -+ quad core (4X) hyperthreaded (2X) machine is effectively a 16X. Newer benchmark
 244 -+ results are very promising indeed, without needing to tweak any knobs, features
 245 -+ or options. Benchmark contributions are most welcome.
 246 -+
 247 -+
 248 -+ Features
 249 -+
 250 -+ As the initial prime target audience for BFS was the average desktop user, it
 251 -+ was designed to not need tweaking, tuning or have features set to obtain benefit
 252 -+ from it. Thus the number of knobs and features has been kept to an absolute
 253 -+ minimum and should not require extra user input for the vast majority of cases.
 254 -+ There are precisely 2 tunables, and 2 extra scheduling policies. The rr_interval
 255 -+ and iso_cpu tunables, and the SCHED_ISO and SCHED_IDLEPRIO policies. In addition
 256 -+ to this, BFS also uses sub-tick accounting. What BFS does _not_ now feature is
 257 -+ support for CGROUPS. The average user should neither need to know what these
 258 -+ are, nor should they need to be using them to have good desktop behaviour.
 259 -+
 260 -+ rr_interval
 261 -+
 262 -+ There is only one "scheduler" tunable, the round robin interval. This can be
 263 -+ accessed in
 264 -+
 265 -+      /proc/sys/kernel/rr_interval
 266 -+
 267 -+ The value is in milliseconds, and the default value is set to 6 on a
 268 -+ uniprocessor machine, and automatically set to a progressively higher value on
 269 -+ multiprocessor machines. The reasoning behind increasing the value on more CPUs
 270 -+ is that the effective latency is decreased by virtue of there being more CPUs on
 271 -+ BFS (for reasons explained above), and increasing the value allows for less
 272 -+ cache contention and more throughput. Valid values are from 1 to 5000
 273 -+ Decreasing the value will decrease latencies at the cost of decreasing
 274 -+ throughput, while increasing it will improve throughput, but at the cost of
 275 -+ worsening latencies. The accuracy of the rr interval is limited by HZ resolution
 276 -+ of the kernel configuration. Thus, the worst case latencies are usually slightly
 277 -+ higher than this actual value. The default value of 6 is not an arbitrary one.
 278 -+ It is based on the fact that humans can detect jitter at approximately 7ms, so
 279 -+ aiming for much lower latencies is pointless under most circumstances. It is
 280 -+ worth noting this fact when comparing the latency performance of BFS to other
 281 -+ schedulers. Worst case latencies being higher than 7ms are far worse than
 282 -+ average latencies not being in the microsecond range.
 283 -+
 284 -+ Isochronous scheduling.
 285 -+
 286 -+ Isochronous scheduling is a unique scheduling policy designed to provide
 287 -+ near-real-time performance to unprivileged (ie non-root) users without the
 288 -+ ability to starve the machine indefinitely. Isochronous tasks (which means
 289 -+ "same time") are set using, for example, the schedtool application like so:
 290 -+
 291 -+      schedtool -I -e amarok
 292 -+
 293 -+ This will start the audio application "amarok" as SCHED_ISO. How SCHED_ISO works
 294 -+ is that it has a priority level between true realtime tasks and SCHED_NORMAL
 295 -+ which would allow them to preempt all normal tasks, in a SCHED_RR fashion (ie,
 296 -+ if multiple SCHED_ISO tasks are running, they purely round robin at rr_interval
 297 -+ rate). However if ISO tasks run for more than a tunable finite amount of time,
 298 -+ they are then demoted back to SCHED_NORMAL scheduling. This finite amount of
 299 -+ time is the percentage of _total CPU_ available across the machine, configurable
 300 -+ as a percentage in the following "resource handling" tunable (as opposed to a
 301 -+ scheduler tunable):
 302 -+
 303 -+      /proc/sys/kernel/iso_cpu
 304 -+
 305 -+ and is set to 70% by default. It is calculated over a rolling 5 second average
 306 -+ Because it is the total CPU available, it means that on a multi CPU machine, it
 307 -+ is possible to have an ISO task running as realtime scheduling indefinitely on
 308 -+ just one CPU, as the other CPUs will be available. Setting this to 100 is the
 309 -+ equivalent of giving all users SCHED_RR access and setting it to 0 removes the
 310 -+ ability to run any pseudo-realtime tasks.
 311 -+
 312 -+ A feature of BFS is that it detects when an application tries to obtain a
 313 -+ realtime policy (SCHED_RR or SCHED_FIFO) and the caller does not have the
 314 -+ appropriate privileges to use those policies. When it detects this, it will
 315 -+ give the task SCHED_ISO policy instead. Thus it is transparent to the user.
 316 -+ Because some applications constantly set their policy as well as their nice
 317 -+ level, there is potential for them to undo the override specified by the user
 318 -+ on the command line of setting the policy to SCHED_ISO. To counter this, once
 319 -+ a task has been set to SCHED_ISO policy, it needs superuser privileges to set
 320 -+ it back to SCHED_NORMAL. This will ensure the task remains ISO and all child
 321 -+ processes and threads will also inherit the ISO policy.
 322 -+
 323 -+ Idleprio scheduling.
 324 -+
 325 -+ Idleprio scheduling is a scheduling policy designed to give out CPU to a task
 326 -+ _only_ when the CPU would be otherwise idle. The idea behind this is to allow
 327 -+ ultra low priority tasks to be run in the background that have virtually no
 328 -+ effect on the foreground tasks. This is ideally suited to distributed computing
 329 -+ clients (like setiathome, folding, mprime etc) but can also be used to start
 330 -+ a video encode or so on without any slowdown of other tasks. To avoid this
 331 -+ policy from grabbing shared resources and holding them indefinitely, if it
 332 -+ detects a state where the task is waiting on I/O, the machine is about to
 333 -+ suspend to ram and so on, it will transiently schedule them as SCHED_NORMAL. As
 334 -+ per the Isochronous task management, once a task has been scheduled as IDLEPRIO,
 335 -+ it cannot be put back to SCHED_NORMAL without superuser privileges. Tasks can
 336 -+ be set to start as SCHED_IDLEPRIO with the schedtool command like so:
 337 -+
 338 -+      schedtool -D -e ./mprime
 339 -+
 340 -+ Subtick accounting.
 341 -+
 342 -+ It is surprisingly difficult to get accurate CPU accounting, and in many cases,
 343 -+ the accounting is done by simply determining what is happening at the precise
 344 -+ moment a timer tick fires off. This becomes increasingly inaccurate as the
 345 -+ timer tick frequency (HZ) is lowered. It is possible to create an application
 346 -+ which uses almost 100% CPU, yet by being descheduled at the right time, records
 347 -+ zero CPU usage. While the main problem with this is that there are possible
 348 -+ security implications, it is also difficult to determine how much CPU a task
 349 -+ really does use. BFS tries to use the sub-tick accounting from the TSC clock,
 350 -+ where possible, to determine real CPU usage. This is not entirely reliable, but
 351 -+ is far more likely to produce accurate CPU usage data than the existing designs
 352 -+ and will not show tasks as consuming no CPU usage when they actually are. Thus,
 353 -+ the amount of CPU reported as being used by BFS will more accurately represent
 354 -+ how much CPU the task itself is using (as is shown for example by the 'time'
 355 -+ application), so the reported values may be quite different to other schedulers.
 356 -+ Values reported as the 'load' are more prone to problems with this design, but
 357 -+ per process values are closer to real usage. When comparing throughput of BFS
 358 -+ to other designs, it is important to compare the actual completed work in terms
 359 -+ of total wall clock time taken and total work done, rather than the reported
 360 -+ "cpu usage".
 361 -+
 362 -+
 363 -+ Con Kolivas <kernel@kolivas.org> Thu Dec 3 2009
 364 +BFS - The Brain Fuck Scheduler by Con Kolivas.
 365 +
 366 +Goals.
 367 +
 368 +The goal of the Brain Fuck Scheduler, referred to as BFS from here on, is to
 369 +completely do away with the complex designs of the past for the cpu process
 370 +scheduler and instead implement one that is very simple in basic design.
 371 +The main focus of BFS is to achieve excellent desktop interactivity and
 372 +responsiveness without heuristics and tuning knobs that are difficult to
 373 +understand, impossible to model and predict the effect of, and when tuned to
 374 +one workload cause massive detriment to another.
 375 +
 376 +
 377 +Design summary.
 378 +
 379 +BFS is best described as a single runqueue, O(n) lookup, earliest effective
 380 +virtual deadline first design, loosely based on EEVDF (earliest eligible virtual
 381 +deadline first) and my previous Staircase Deadline scheduler. Each component
 382 +shall be described in order to understand the significance of, and reasoning for
 383 +it. The codebase when the first stable version was released was approximately
 384 +9000 lines less code than the existing mainline linux kernel scheduler (in
 385 +2.6.31). This does not even take into account the removal of documentation and
 386 +the cgroups code that is not used.
 387 +
 388 +Design reasoning.
 389 +
 390 +The single runqueue refers to the queued but not running processes for the
 391 +entire system, regardless of the number of CPUs. The reason for going back to
 392 +a single runqueue design is that once multiple runqueues are introduced,
 393 +per-CPU or otherwise, there will be complex interactions as each runqueue will
 394 +be responsible for the scheduling latency and fairness of the tasks only on its
 395 +own runqueue, and to achieve fairness and low latency across multiple CPUs, any
 396 +advantage in throughput of having CPU local tasks causes other disadvantages.
 397 +This is due to requiring a very complex balancing system to at best achieve some
 398 +semblance of fairness across CPUs and can only maintain relatively low latency
 399 +for tasks bound to the same CPUs, not across them. To increase said fairness
 400 +and latency across CPUs, the advantage of local runqueue locking, which makes
 401 +for better scalability, is lost due to having to grab multiple locks.
 402 +
 403 +A significant feature of BFS is that all accounting is done purely based on CPU
 404 +used and nowhere is sleep time used in any way to determine entitlement or
 405 +interactivity. Interactivity "estimators" that use some kind of sleep/run
 406 +algorithm are doomed to fail to detect all interactive tasks, and to falsely tag
 407 +tasks that aren't interactive as being so. The reason for this is that it is
 408 +close to impossible to determine, when a task is sleeping, whether it is
 409 +doing it voluntarily, as in a userspace application waiting for input in the
 410 +form of a mouse click or otherwise, or involuntarily, because it is waiting for
 411 +another thread, process, I/O, kernel activity or whatever. Thus, such an
 412 +estimator will introduce corner cases, and more heuristics will be required to
 413 +cope with those corner cases, introducing more corner cases and failed
 414 +interactivity detection and so on. Interactivity in BFS is built into the design
 415 +by virtue of the fact that tasks that are waking up have not used up their quota
 416 +of CPU time, and have earlier effective deadlines, thereby making it very likely
 417 +they will preempt any CPU bound task of equivalent nice level. See below for
 418 +more information on the virtual deadline mechanism. Even if they do not preempt
 419 +a running task, because the rr interval is guaranteed to have a bounded upper
 420 +limit on how long a task will wait for, it will be scheduled within a timeframe
 421 +that will not cause visible interface jitter.
 422 +
 423 +
 424 +Design details.
 425 +
 426 +Task insertion.
 427 +
 428 +BFS inserts tasks into each relevant queue as an O(1) insertion into a double
 429 +linked list. On insertion, *every* running queue is checked to see if the newly
 430 +queued task can run on any idle queue, or preempt the lowest running task on the
 431 +system. This is how the cross-CPU scheduling of BFS achieves significantly lower
 432 +latency per extra CPU the system has. In this case the lookup is, in the worst
 433 +case scenario, O(n) where n is the number of CPUs on the system.
 434 +
 435 +Data protection.
 436 +
 437 +BFS has one single lock protecting the process local data of every task in the
 438 +global queue. Thus every insertion, removal and modification of task data in the
 439 +global runqueue needs to grab the global lock. However, once a task is taken by
 440 +a CPU, the CPU has its own local data copy of the running process' accounting
 441 +information which only that CPU accesses and modifies (such as during a
 442 +timer tick) thus allowing the accounting data to be updated lockless. Once a
 443 +CPU has taken a task to run, it removes it from the global queue. Thus the
 444 +global queue only ever has, at most,
 445 +
 446 +       (number of tasks requesting cpu time) - (number of logical CPUs) + 1
 447 +
 448 +tasks in the global queue. This value is relevant for the time taken to look up
 449 +tasks during scheduling. This will increase if many tasks have CPU affinity set
 450 +in their policy to limit which CPUs they're allowed to run on and they outnumber
 451 +the CPUs. The +1 is because when rescheduling a task, the CPU's
 452 +currently running task is put back on the queue. Lookup will be described after
 453 +the virtual deadline mechanism is explained.
 454 +
 455 +Virtual deadline.
 456 +
 457 +The key to achieving low latency, scheduling fairness, and "nice level"
 458 +distribution in BFS is entirely in the virtual deadline mechanism. The one
 459 +tunable in BFS is the rr_interval, or "round robin interval". This is the
 460 +maximum time two SCHED_OTHER (or SCHED_NORMAL, the common scheduling policy)
 461 +tasks of the same nice level will be running for, or looking at it the other
 462 +way around, the longest duration two tasks of the same nice level will be
 463 +delayed for. When a task requests cpu time, it is given a quota (time_slice)
 464 +equal to the rr_interval and a virtual deadline. The virtual deadline is
 465 +offset from the current time in jiffies by this equation:
 466 +
 467 +       jiffies + (prio_ratio * rr_interval)
 468 +
 469 +The prio_ratio is determined as a ratio compared to the baseline of nice -20
 470 +and increases by 10% per nice level. The deadline is a virtual one only in that
 471 +no guarantee is placed that a task will actually be scheduled by this time, but
 472 +it is used to compare which task should go next. There are three components to
 473 +how a task is next chosen. First is time_slice expiration. If a task runs out
 474 +of its time_slice, it is descheduled, the time_slice is refilled, and the
 475 +deadline reset to that formula above. Second is sleep, where a task no longer
 476 +is requesting CPU for whatever reason. The time_slice and deadline are _not_
 477 +adjusted in this case and are just carried over for when the task is next
 478 +scheduled. Third is preemption, and that is when a newly waking task is deemed
 479 +higher priority than a currently running task on any cpu by virtue of the fact
 480 +that it has an earlier virtual deadline than the currently running task. The
 481 +earlier deadline is the key to which task is next chosen for the first and
 482 +second cases. Once a task is descheduled, it is put back on the queue, and an
 483 +O(n) lookup of all queued-but-not-running tasks is done to determine which has
 484 +the earliest deadline and that task is chosen to receive CPU next.
 485 +
 486 +The CPU proportion of different nice tasks works out to be approximately the
 487 +
 488 +       (prio_ratio difference)^2
 489 +
 490 +The reason it is squared is that a task's deadline does not change while it is
 491 +running unless it runs out of time_slice. Thus, even if the time actually
 492 +passes the deadline of another task that is queued, it will not get CPU time
 493 +unless the current running task deschedules, and the time "base" (jiffies) is
 494 +constantly moving.
 495 +
 496 +Task lookup.
 497 +
 498 +BFS has 103 priority queues. 100 of these are dedicated to the static priority
 499 +of realtime tasks, and the remaining 3 are, in order of best to worst priority,
 500 +SCHED_ISO (isochronous), SCHED_NORMAL, and SCHED_IDLEPRIO (idle priority
 501 +scheduling). When a task of these priorities is queued, a bitmap of running
 502 +priorities is set showing which of these priorities has tasks waiting for CPU
 503 +time. When a CPU is made to reschedule, the lookup for the next task to get
 504 +CPU time is performed in the following way:
 505 +
 506 +First the bitmap is checked to see what static priority tasks are queued. If
 507 +any realtime priorities are found, the corresponding queue is checked and the
 508 +first task listed there is taken (provided CPU affinity is suitable) and lookup
 509 +is complete. If the priority corresponds to a SCHED_ISO task, they are also
 510 +taken in FIFO order (as they behave like SCHED_RR). If the priority corresponds
 511 +to either SCHED_NORMAL or SCHED_IDLEPRIO, then the lookup becomes O(n). At this
 512 +stage, every task in the runlist that corresponds to that priority is checked
 513 +to see which has the earliest set deadline, and (provided it has suitable CPU
 514 +affinity) it is taken off the runqueue and given the CPU. If a task has an
 515 +expired deadline, it is taken and the rest of the lookup aborted (as they are
 516 +chosen in FIFO order).
 517 +
 518 +Thus, the lookup is O(n) in the worst case only, where n is as described
 519 +earlier, as tasks may be chosen before the whole task list is looked over.
 520 +
 521 +
 522 +Scalability.
 523 +
 524 +The major limitations of BFS will be that of scalability, as the separate
 525 +runqueue designs will have less lock contention as the number of CPUs rises.
 526 +However they do not scale linearly even with separate runqueues as multiple
 527 +runqueues will need to be locked concurrently on such designs to be able to
 528 +achieve fair CPU balancing, to try and achieve some sort of nice-level fairness
 529 +across CPUs, and to achieve low enough latency for tasks on a busy CPU when
 530 +other CPUs would be more suited. BFS has the advantage that it requires no
 531 +balancing algorithm whatsoever, as balancing occurs by proxy simply because
 532 +all CPUs draw off the global runqueue, in priority and deadline order. Despite
 533 +the fact that scalability is _not_ the prime concern of BFS, it both shows very
 534 +good scalability to smaller numbers of CPUs and is likely a more scalable design
 535 +at these numbers of CPUs.
 536 +
 537 +It also has some very low overhead scalability features built into the design
 538 +when it has been deemed their overhead is so marginal that they're worth adding.
 539 +The first is the local copy of the running process' data to the CPU it's running
 540 +on to allow that data to be updated lockless where possible. Then there is
 541 +deference paid to the last CPU a task was running on, by trying that CPU first
 542 +when looking for an idle CPU to use the next time it's scheduled. Finally there
 543 +is the notion of "sticky" tasks that are flagged when they are involuntarily
 544 +descheduled, meaning they still want further CPU time. This sticky flag is
 545 +used to bias heavily against those tasks being scheduled on a different CPU
 546 +unless that CPU would be otherwise idle. When a cpu frequency governor is used
 547 +that scales with CPU load, such as ondemand, sticky tasks are not scheduled
 548 +on a different CPU at all, preferring instead to go idle. This means the CPU
 549 +they were bound to is more likely to increase its speed while the other CPU
 550 +will go idle, thus speeding up total task execution time and likely decreasing
 551 +power usage. This is the only scenario where BFS will allow a CPU to go idle
 552 +in preference to scheduling a task on the earliest available spare CPU.
 553 +
 554 +The real cost of migrating a task from one CPU to another is entirely dependent
 555 +on the cache footprint of the task, how cache intensive the task is, how long
 556 +it's been running on that CPU to take up the bulk of its cache, how big the CPU
 557 +cache is, how fast and how layered the CPU cache is, how fast a context switch
 558 +is... and so on. In other words, it's close to random in the real world where we
 559 +do more than just one sole workload. The only thing we can be sure of is that
 560 +it's not free. So BFS uses the principle that an idle CPU is a wasted CPU and
 561 +utilising idle CPUs is more important than cache locality, and cache locality
 562 +only plays a part after that.
 563 +
 564 +When choosing an idle CPU for a waking task, the cache locality is determined
 565 +according to where the task last ran and then idle CPUs are ranked from best
 566 +to worst to choose the most suitable idle CPU based on cache locality, NUMA
 567 +node locality and hyperthread sibling busyness. They are chosen in the
 568 +following preference (if idle):
 569 +
 570 +* Same core, idle or busy cache, idle threads.
 571 +* Other core, same cache, idle or busy cache, idle threads.
 572 +* Same node, other CPU, idle cache, idle threads.
 573 +* Same node, other CPU, busy cache, idle threads.
 574 +* Same core, busy threads.
 575 +* Other core, same cache, busy threads.
 576 +* Same node, other CPU, busy threads.
 577 +* Other node, other CPU, idle cache, idle threads.
 578 +* Other node, other CPU, busy cache, idle threads.
 579 +* Other node, other CPU, busy threads.
 580 +
 581 +This shows the SMT or "hyperthread" awareness in the design as well which will
 582 +choose a real idle core first before a logical SMT sibling which already has
 583 +tasks on the physical CPU.
 584 +
 585 +Early benchmarking of BFS suggested scalability dropped off at the 16 CPU mark.
 586 +However this benchmarking was performed on an earlier design that was far less
 587 +scalable than the current one so it's hard to know how scalable it is in terms
 588 +of both CPUs (due to the global runqueue) and heavily loaded machines (due to
 589 +O(n) lookup) at this stage. Note that in terms of scalability, the number of
 590 +_logical_ CPUs matters, not the number of _physical_ CPUs. Thus, a dual (2x)
 591 +quad core (4X) hyperthreaded (2X) machine is effectively a 16X. Newer benchmark
 592 +results are very promising indeed, without needing to tweak any knobs, features
 593 +or options. Benchmark contributions are most welcome.
 594 +
 595 +
 596 +Features
 597 +
 598 +As the initial prime target audience for BFS was the average desktop user, it
 599 +was designed to not need tweaking, tuning or have features set to obtain benefit
 600 +from it. Thus the number of knobs and features has been kept to an absolute
 601 +minimum and should not require extra user input for the vast majority of cases.
 602 +There are precisely 2 tunables, and 2 extra scheduling policies. The rr_interval
 603 +and iso_cpu tunables, and the SCHED_ISO and SCHED_IDLEPRIO policies. In addition
 604 +to this, BFS also uses sub-tick accounting. What BFS does _not_ now feature is
 605 +support for CGROUPS. The average user should neither need to know what these
 606 +are, nor should they need to be using them to have good desktop behaviour.
 607 +
 608 +rr_interval
 609 +
 610 +There is only one "scheduler" tunable, the round robin interval. This can be
 611 +accessed in
 612 +
 613 +       /proc/sys/kernel/rr_interval
 614 +
 615 +The value is in milliseconds, and the default value is set to 6ms. Valid values
 616 +are from 1 to 1000. Decreasing the value will decrease latencies at the cost of
 617 +decreasing throughput, while increasing it will improve throughput, but at the
 618 +cost of worsening latencies. The accuracy of the rr interval is limited by HZ
 619 +resolution of the kernel configuration. Thus, the worst case latencies are
 620 +usually slightly higher than this actual value. BFS uses "dithering" to try and
 621 +minimise the effect the HZ limitation has. The default value of 6 is not an
 622 +arbitrary one. It is based on the fact that humans can detect jitter at
 623 +approximately 7ms, so aiming for much lower latencies is pointless under most
 624 +circumstances. It is worth noting this fact when comparing the latency
 625 +performance of BFS to other schedulers. Worst case latencies being higher than
 626 +7ms are far worse than average latencies not being in the microsecond range.
 627 +Experimentation has shown that rr intervals being increased up to 300 can
 628 +improve throughput but beyond that, scheduling noise from elsewhere prevents
 629 +further demonstrable throughput gains.
 630 +
 631 +Isochronous scheduling.
 632 +
 633 +Isochronous scheduling is a unique scheduling policy designed to provide
 634 +near-real-time performance to unprivileged (ie non-root) users without the
 635 +ability to starve the machine indefinitely. Isochronous tasks (which means
 636 +"same time") are set using, for example, the schedtool application like so:
 637 +
 638 +       schedtool -I -e amarok
 639 +
 640 +This will start the audio application "amarok" as SCHED_ISO. How SCHED_ISO works
 641 +is that it has a priority level between true realtime tasks and SCHED_NORMAL
 642 +which would allow them to preempt all normal tasks, in a SCHED_RR fashion (ie,
 643 +if multiple SCHED_ISO tasks are running, they purely round robin at rr_interval
 644 +rate). However if ISO tasks run for more than a tunable finite amount of time,
 645 +they are then demoted back to SCHED_NORMAL scheduling. This finite amount of
 646 +time is the percentage of _total CPU_ available across the machine, configurable
 647 +as a percentage in the following "resource handling" tunable (as opposed to a
 648 +scheduler tunable):
 649 +
 650 +       /proc/sys/kernel/iso_cpu
 651 +
 652 +and is set to 70% by default. It is calculated over a rolling 5 second average.
 653 +Because it is the total CPU available, it means that on a multi CPU machine, it
 654 +is possible to have an ISO task running as realtime scheduling indefinitely on
 655 +just one CPU, as the other CPUs will be available. Setting this to 100 is the
 656 +equivalent of giving all users SCHED_RR access and setting it to 0 removes the
 657 +ability to run any pseudo-realtime tasks.
 658 +
 659 +A feature of BFS is that it detects when an application tries to obtain a
 660 +realtime policy (SCHED_RR or SCHED_FIFO) and the caller does not have the
 661 +appropriate privileges to use those policies. When it detects this, it will
 662 +give the task SCHED_ISO policy instead. Thus it is transparent to the user.
 663 +Because some applications constantly set their policy as well as their nice
 664 +level, there is potential for them to undo the override specified by the user
 665 +on the command line of setting the policy to SCHED_ISO. To counter this, once
 666 +a task has been set to SCHED_ISO policy, it needs superuser privileges to set
 667 +it back to SCHED_NORMAL. This will ensure the task remains ISO and all child
 668 +processes and threads will also inherit the ISO policy.
 669 +
 670 +Idleprio scheduling.
 671 +
 672 +Idleprio scheduling is a scheduling policy designed to give out CPU to a task
 673 +_only_ when the CPU would be otherwise idle. The idea behind this is to allow
 674 +ultra low priority tasks to be run in the background that have virtually no
 675 +effect on the foreground tasks. This is ideally suited to distributed computing
 676 +clients (like setiathome, folding, mprime etc) but can also be used to start
 677 +a video encode or so on without any slowdown of other tasks. To prevent this
 678 +policy from grabbing shared resources and holding them indefinitely, if it
 679 +detects a state where the task is waiting on I/O, the machine is about to
 680 +suspend to ram and so on, it will transiently schedule them as SCHED_NORMAL. As
 681 +per the Isochronous task management, once a task has been scheduled as IDLEPRIO,
 682 +it cannot be put back to SCHED_NORMAL without superuser privileges. Tasks can
 683 +be set to start as SCHED_IDLEPRIO with the schedtool command like so:
 684 +
 685 +       schedtool -D -e ./mprime
 686 +
 687 +Subtick accounting.
 688 +
 689 +It is surprisingly difficult to get accurate CPU accounting, and in many cases,
 690 +the accounting is done by simply determining what is happening at the precise
 691 +moment a timer tick fires off. This becomes increasingly inaccurate as the
 692 +timer tick frequency (HZ) is lowered. It is possible to create an application
 693 +which uses almost 100% CPU, yet by being descheduled at the right time, records
 694 +zero CPU usage. While the main problem with this is that there are possible
 695 +security implications, it is also difficult to determine how much CPU a task
 696 +really does use. BFS tries to use the sub-tick accounting from the TSC clock,
 697 +where possible, to determine real CPU usage. This is not entirely reliable, but
 698 +is far more likely to produce accurate CPU usage data than the existing designs
 699 +and will not show tasks as consuming no CPU usage when they actually are. Thus,
 700 +the amount of CPU reported as being used by BFS will more accurately represent
 701 +how much CPU the task itself is using (as is shown for example by the 'time'
 702 +application), so the reported values may be quite different to other schedulers.
 703 +Values reported as the 'load' are more prone to problems with this design, but
 704 +per process values are closer to real usage. When comparing throughput of BFS
 705 +to other designs, it is important to compare the actual completed work in terms
 706 +of total wall clock time taken and total work done, rather than the reported
 707 +"cpu usage".
 708 +
 709 +
 710 +Con Kolivas <kernel@kolivas.org> Tue, 5 Apr 2011
 711 diff -urpN linux-2.6.28.orig/drivers/cpufreq/cpufreq.c linux-2.6.28/drivers/cpufreq/cpufreq.c
 712 --- linux-2.6.28.orig/drivers/cpufreq/cpufreq.c 2011-04-10 21:50:27.082902337 +0200
 713 +++ linux-2.6.28/drivers/cpufreq/cpufreq.c      2011-04-10 21:29:38.566326570 +0200
 714 @@ -28,6 +28,7 @@
 715  #include <linux/cpu.h>
 716  #include <linux/completion.h>
 717  #include <linux/mutex.h>
 718 +#include <linux/sched.h>
 719
 720  #define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_CORE, \
 721                                                 "cpufreq-core", msg)
 722 @@ -1460,6 +1461,12 @@ int __cpufreq_driver_target(struct cpufr
 723                 target_freq, relation);
 724         if (cpu_online(policy->cpu) && cpufreq_driver->target)
 725                 retval = cpufreq_driver->target(policy, target_freq, relation);
 726 +       if (likely(retval != -EINVAL)) {
 727 +               if (target_freq == policy->max)
 728 +                       cpu_nonscaling(policy->cpu);
 729 +               else
 730 +                       cpu_scaling(policy->cpu);
 731 +       }
 732
 733         return retval;
 734  }
 735 diff -urpN linux-2.6.28.orig/include/linux/sched.h linux-2.6.28/include/linux/sched.h
 736 --- linux-2.6.28.orig/include/linux/sched.h     2011-04-10 21:50:27.659568962 +0200
 737 +++ linux-2.6.28/include/linux/sched.h  2011-04-10 21:45:23.102924469 +0200
 738 @@ -1126,7 +1126,9 @@ struct task_struct {
 739         struct list_head run_list;
 740         u64 last_ran;
 741         u64 sched_time; /* sched_clock time spent running */
 742 -
 743 +#ifdef CONFIG_SMP
 744 +       int sticky; /* Soft affined flag */
 745 +#endif
 746         unsigned long rt_timeout;
 747  #else /* CONFIG_SCHED_BFS */
 748         const struct sched_class *sched_class;
 749 @@ -1409,6 +1411,8 @@ struct task_struct {
 750  #ifdef CONFIG_SCHED_BFS
 751  extern int grunqueue_is_locked(void);
 752  extern void grq_unlock_wait(void);
 753 +extern void cpu_scaling(int cpu);
 754 +extern void cpu_nonscaling(int cpu);
 755  #define tsk_seruntime(t)               ((t)->sched_time)
 756  #define tsk_rttimeout(t)               ((t)->rt_timeout)
 757  #define task_rq_unlock_wait(tsk)       grq_unlock_wait()
 758 @@ -1426,16 +1430,23 @@ static inline void tsk_cpus_current(stru
 759
 760  static inline void print_scheduler_version(void)
 761  {
 762 -       printk(KERN_INFO"BFS CPU scheduler v0.363 by Con Kolivas.\n");
 763 +       printk(KERN_INFO"BFS CPU scheduler v0.400 by Con Kolivas.\n");
 764  }
 765
 766  static inline int iso_task(struct task_struct *p)
 767  {
 768         return (p->policy == SCHED_ISO);
 769  }
 770 -#else
 771 +#else /* CFS */
 772  extern int runqueue_is_locked(void);
 773  extern void task_rq_unlock_wait(struct task_struct *p);
 774 +static inline void cpu_scaling(int cpu)
 775 +{
 776 +}
 777 +
 778 +static inline void cpu_nonscaling(int cpu)
 779 +{
 780 +}
 781  #define tsk_seruntime(t)       ((t)->se.sum_exec_runtime)
 782  #define tsk_rttimeout(t)       ((t)->rt.timeout)
 783
 784 diff -urpN linux-2.6.28.orig/kernel/sched_bfs.c linux-2.6.28/kernel/sched_bfs.c
 785 --- linux-2.6.28.orig/kernel/sched_bfs.c        2011-04-10 21:50:27.659568962 +0200
 786 +++ linux-2.6.28/kernel/sched_bfs.c     2011-04-10 21:43:43.506265052 +0200
 787 @@ -81,7 +81,7 @@
 788  #define idleprio_task(p)       unlikely((p)->policy == SCHED_IDLEPRIO)
 789  #define iso_task(p)            unlikely((p)->policy == SCHED_ISO)
 790  #define iso_queue(rq)          unlikely((rq)->rq_policy == SCHED_ISO)
 791 -#define ISO_PERIOD             ((5 * HZ * num_online_cpus()) + 1)
 792 +#define ISO_PERIOD             ((5 * HZ * grq.noc) + 1)
 793
 794  /*
 795   * Convert user-nice values [ -20 ... 0 ... 19 ]
 796 @@ -184,6 +184,7 @@ struct global_rq {
 797         cpumask_t cpu_idle_map;
 798         int idle_cpus;
 799  #endif
 800 +       int noc; /* num_online_cpus stored and updated when it changes */
 801         u64 niffies; /* Nanosecond jiffies */
 802         unsigned long last_jiffy; /* Last jiffy we updated niffies */
 803
 804 @@ -226,6 +227,8 @@ struct rq {
 805  #ifdef CONFIG_SMP
 806         int cpu;                /* cpu of this runqueue */
 807         int online;
 808 +       int scaling; /* This CPU is managed by a scaling CPU freq governor */
 809 +       struct task_struct *sticky_task;
 810
 811         struct root_domain *rd;
 812         struct sched_domain *sd;
 813 @@ -242,7 +245,11 @@ struct rq {
 814  #endif
 815         u64 last_niffy; /* Last time this RQ updated grq.niffies */
 816  #endif
 817 +#ifdef CONFIG_IRQ_TIME_ACCOUNTING
 818 +       u64 prev_irq_time;
 819 +#endif
 820         u64 clock, old_clock, last_tick;
 821 +       u64 clock_task;
 822         int dither;
 823
 824  #ifdef CONFIG_SCHEDSTATS
 825 @@ -407,9 +414,14 @@ static inline void update_clocks(struct
 826   * when we're not updating niffies.
 827   * Looking up task_rq must be done under grq.lock to be safe.
 828   */
 829 +static void update_rq_clock_task(struct rq *rq, s64 delta);
 830 +
 831  static inline void update_rq_clock(struct rq *rq)
 832  {
 833 -       rq->clock = sched_clock_cpu(cpu_of(rq));
 834 +       s64 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
 835 +
 836 +       rq->clock += delta;
 837 +       update_rq_clock_task(rq, delta);
 838  }
 839
 840  static inline int task_running(struct task_struct *p)
 841 @@ -741,10 +753,8 @@ static void resched_task(struct task_str
 842
 843  /*
 844   * The best idle CPU is chosen according to the CPUIDLE ranking above where the
 845 - * lowest value would give the most suitable CPU to schedule p onto next. We
 846 - * iterate from the last CPU upwards instead of using for_each_cpu_mask so as
 847 - * to be able to break out immediately if the last CPU is idle. The order works
 848 - * out to be the following:
 849 + * lowest value would give the most suitable CPU to schedule p onto next. The
 850 + * order works out to be the following:
 851   *
 852   * Same core, idle or busy cache, idle threads
 853   * Other core, same cache, idle or busy cache, idle threads.
 854 @@ -756,38 +766,19 @@ static void resched_task(struct task_str
 855   * Other node, other CPU, idle cache, idle threads.
 856   * Other node, other CPU, busy cache, idle threads.
 857   * Other node, other CPU, busy threads.
 858 - *
 859 - * If p was the last task running on this rq, then regardless of where
 860 - * it has been running since then, it is cache warm on this rq.
 861   */
 862 -static void resched_best_idle(struct task_struct *p)
 863 +static void
 864 +resched_best_mask(unsigned long best_cpu, struct rq *rq, cpumask_t *tmpmask)
 865  {
 866 -       unsigned long cpu_tmp, best_cpu, best_ranking;
 867 -       cpumask_t tmpmask;
 868 -       struct rq *rq;
 869 -       int iterate;
 870 +       unsigned long cpu_tmp, best_ranking;
 871
 872 -       cpus_and(tmpmask, p->cpus_allowed, grq.cpu_idle_map);
 873 -       iterate = cpus_weight(tmpmask);
 874 -       best_cpu = task_cpu(p);
 875 -       /*
 876 -        * Start below the last CPU and work up with next_cpu_nr as the last
 877 -        * CPU might not be idle or affinity might not allow it.
 878 -        */
 879 -       cpu_tmp = best_cpu - 1;
 880 -       rq = cpu_rq(best_cpu);
 881         best_ranking = ~0UL;
 882
 883 -       do {
 884 +       for_each_cpu_mask_nr(cpu_tmp, *tmpmask) {
 885                 unsigned long ranking;
 886                 struct rq *tmp_rq;
 887
 888                 ranking = 0;
 889 -               cpu_tmp = next_cpu_nr(cpu_tmp, tmpmask);
 890 -               if (cpu_tmp >= nr_cpu_ids) {
 891 -                       cpu_tmp = -1;
 892 -                       cpu_tmp = next_cpu_nr(cpu_tmp, tmpmask);
 893 -               }
 894                 tmp_rq = cpu_rq(cpu_tmp);
 895
 896  #ifdef CONFIG_NUMA
 897 @@ -815,37 +806,42 @@ static void resched_best_idle(struct tas
 898                                 break;
 899                         best_ranking = ranking;
 900                 }
 901 -       } while (--iterate > 0);
 902 +       }
 903
 904         resched_task(cpu_rq(best_cpu)->curr);
 905  }
 906
 907 +static void resched_best_idle(struct task_struct *p)
 908 +{
 909 +       cpumask_t tmpmask;
 910 +
 911 +       cpus_and(tmpmask, p->cpus_allowed, grq.cpu_idle_map);
 912 +       resched_best_mask(task_cpu(p), task_rq(p), &tmpmask);
 913 +}
 914 +
 915  static inline void resched_suitable_idle(struct task_struct *p)
 916  {
 917         if (suitable_idle_cpus(p))
 918                 resched_best_idle(p);
 919  }
 920 -
 921  /*
 922 - * The cpu cache locality difference between CPUs is used to determine how far
 923 - * to offset the virtual deadline. <2 difference in locality means that one
 924 - * timeslice difference is allowed longer for the cpu local tasks. This is
 925 - * enough in the common case when tasks are up to 2* number of CPUs to keep
 926 - * tasks within their shared cache CPUs only. CPUs on different nodes or not
 927 - * even in this domain (NUMA) have "4" difference, allowing 4 times longer
 928 - * deadlines before being taken onto another cpu, allowing for 2* the double
 929 - * seen by separate CPUs above.
 930 - * Simple summary: Virtual deadlines are equal on shared cache CPUs, double
 931 - * on separate CPUs and quadruple in separate NUMA nodes.
 932 + * Flags to tell us whether this CPU is running a CPU frequency governor that
 933 + * has slowed its speed or not. No locking required as the very rare wrongly
 934 + * read value would be harmless.
 935   */
 936 -static inline int
 937 -cache_distance(struct rq *task_rq, struct rq *rq, struct task_struct *p)
 938 +void cpu_scaling(int cpu)
 939  {
 940 -       int locality = rq->cpu_locality[cpu_of(task_rq)] - 2;
 941 +       cpu_rq(cpu)->scaling = 1;
 942 +}
 943
 944 -       if (locality > 0)
 945 -               return task_timeslice(p) << locality;
 946 -       return 0;
 947 +void cpu_nonscaling(int cpu)
 948 +{
 949 +       cpu_rq(cpu)->scaling = 0;
 950 +}
 951 +
 952 +static inline int scaling_rq(struct rq *rq)
 953 +{
 954 +       return rq->scaling;
 955  }
 956  #else /* CONFIG_SMP */
 957  static inline void inc_qnr(void)
 958 @@ -879,12 +875,25 @@ static inline void resched_suitable_idle
 959  {
 960  }
 961
 962 -static inline int
 963 -cache_distance(struct rq *task_rq, struct rq *rq, struct task_struct *p)
 964 +void cpu_scaling(int __unused)
 965 +{
 966 +}
 967 +
 968 +void cpu_nonscaling(int __unused)
 969 +{
 970 +}
 971 +
 972 +/*
 973 + * Although CPUs can scale in UP, there is nowhere else for tasks to go so this
 974 + * always returns 0.
 975 + */
 976 +static inline int scaling_rq(struct rq *rq)
 977  {
 978         return 0;
 979  }
 980  #endif /* CONFIG_SMP */
 981 +EXPORT_SYMBOL_GPL(cpu_scaling);
 982 +EXPORT_SYMBOL_GPL(cpu_nonscaling);
 983
 984  /*
 985   * activate_idle_task - move idle task to the _front_ of runqueue.
 986 @@ -974,6 +983,82 @@ void set_task_cpu(struct task_struct *p,
 987         smp_wmb();
 988         task_thread_info(p)->cpu = cpu;
 989  }
 990 +
 991 +static inline void clear_sticky(struct task_struct *p)
 992 +{
 993 +       p->sticky = 0;
 994 +}
 995 +
 996 +static inline int task_sticky(struct task_struct *p)
 997 +{
 998 +       return p->sticky;
 999 +}
1000 +
1001 +/* Reschedule the best idle CPU that is not this one. */
1002 +static void
1003 +resched_closest_idle(struct rq *rq, unsigned long cpu, struct task_struct *p)
1004 +{
1005 +       cpumask_t tmpmask;
1006 +
1007 +       cpus_and(tmpmask, p->cpus_allowed, grq.cpu_idle_map);
1008 +       cpu_clear(cpu, tmpmask);
1009 +       if (cpus_empty(tmpmask))
1010 +               return;
1011 +       resched_best_mask(cpu, rq, &tmpmask);
1012 +}
1013 +
1014 +/*
1015 + * We set the sticky flag on a task that is descheduled involuntarily meaning
1016 + * it is awaiting further CPU time. If the last sticky task is still sticky
1017 + * but unlucky enough to not be the next task scheduled, we unstick it and try
1018 + * to find it an idle CPU. Realtime tasks do not stick to minimise their
1019 + * latency at all times.
1020 + */
1021 +static inline void
1022 +swap_sticky(struct rq *rq, unsigned long cpu, struct task_struct *p)
1023 +{
1024 +       if (rq->sticky_task) {
1025 +               if (rq->sticky_task == p) {
1026 +                       p->sticky = 1;
1027 +                       return;
1028 +               }
1029 +               if (rq->sticky_task->sticky) {
1030 +                       rq->sticky_task->sticky = 0;
1031 +                       resched_closest_idle(rq, cpu, rq->sticky_task);
1032 +               }
1033 +       }
1034 +       if (!rt_task(p)) {
1035 +               p->sticky = 1;
1036 +               rq->sticky_task = p;
1037 +       } else {
1038 +               resched_closest_idle(rq, cpu, p);
1039 +               rq->sticky_task = NULL;
1040 +       }
1041 +}
1042 +
1043 +static inline void unstick_task(struct rq *rq, struct task_struct *p)
1044 +{
1045 +       rq->sticky_task = NULL;
1046 +       clear_sticky(p);
1047 +}
1048 +#else
1049 +static inline void clear_sticky(struct task_struct *p)
1050 +{
1051 +}
1052 +
1053 +static inline int task_sticky(struct task_struct *p)
1054 +{
1055 +       return 0;
1056 +}
1057 +
1058 +static inline void
1059 +swap_sticky(struct rq *rq, unsigned long cpu, struct task_struct *p)
1060 +{
1061 +}
1062 +
1063 +static inline void unstick_task(struct rq *rq, struct task_struct *p)
1064 +{
1065 +}
1066  #endif
1067
1068  /*
1069 @@ -984,6 +1069,7 @@ static inline void take_task(struct rq *
1070  {
1071         set_task_cpu(p, cpu_of(rq));
1072         dequeue_task(p);
1073 +       clear_sticky(p);
1074         dec_qnr();
1075  }
1076
1077 @@ -1266,6 +1352,13 @@ static void try_preempt(struct task_stru
1078         int highest_prio;
1079         cpumask_t tmp;
1080
1081 +       /*
1082 +        * We clear the sticky flag here because for a task to have called
1083 +        * try_preempt with the sticky flag enabled means some complicated
1084 +        * re-scheduling has occurred and we should ignore the sticky flag.
1085 +        */
1086 +       clear_sticky(p);
1087 +
1088         if (suitable_idle_cpus(p)) {
1089                 resched_best_idle(p);
1090                 return;
1091 @@ -1284,7 +1377,6 @@ static void try_preempt(struct task_stru
1092         highest_prio = -1;
1093
1094         for_each_cpu_mask_nr(cpu, tmp) {
1095 -               u64 offset_deadline;
1096                 struct rq *rq;
1097                 int rq_prio;
1098
1099 @@ -1293,12 +1385,9 @@ static void try_preempt(struct task_stru
1100                 if (rq_prio < highest_prio)
1101                         continue;
1102
1103 -               offset_deadline = rq->rq_deadline -
1104 -                                 cache_distance(this_rq, rq, p);
1105 -
1106                 if (rq_prio > highest_prio || (rq_prio == highest_prio &&
1107 -                   deadline_after(offset_deadline, latest_deadline))) {
1108 -                       latest_deadline = offset_deadline;
1109 +                   deadline_after(rq->rq_deadline, latest_deadline))) {
1110 +                       latest_deadline = rq->rq_deadline;
1111                         highest_prio = rq_prio;
1112                         highest_prio_rq = rq;
1113                 }
1114 @@ -1470,6 +1559,7 @@ void sched_fork(struct task_struct *p, i
1115  #endif
1116
1117         p->oncpu = 0;
1118 +       clear_sticky(p);
1119
1120  #ifdef CONFIG_PREEMPT
1121         /* Want to start with kernel preemption disabled. */
1122 @@ -1809,6 +1899,152 @@ DEFINE_PER_CPU(struct kernel_stat, kstat
1123
1124  EXPORT_PER_CPU_SYMBOL(kstat);
1125
1126 +#ifdef CONFIG_IRQ_TIME_ACCOUNTING
1127 +
1128 +/*
1129 + * There are no locks covering percpu hardirq/softirq time.
1130 + * They are only modified in account_system_vtime, on corresponding CPU
1131 + * with interrupts disabled. So, writes are safe.
1132 + * They are read and saved off onto struct rq in update_rq_clock().
1133 + * This may result in other CPU reading this CPU's irq time and can
1134 + * race with irq/account_system_vtime on this CPU. We would either get old
1135 + * or new value with a side effect of accounting a slice of irq time to wrong
1136 + * task when irq is in progress while we read rq->clock. That is a worthy
1137 + * compromise in place of having locks on each irq in account_system_vtime.
1138 + */
1139 +static DEFINE_PER_CPU(u64, cpu_hardirq_time);
1140 +static DEFINE_PER_CPU(u64, cpu_softirq_time);
1141 +
1142 +static DEFINE_PER_CPU(u64, irq_start_time);
1143 +static int sched_clock_irqtime;
1144 +
1145 +void enable_sched_clock_irqtime(void)
1146 +{
1147 +       sched_clock_irqtime = 1;
1148 +}
1149 +
1150 +void disable_sched_clock_irqtime(void)
1151 +{
1152 +       sched_clock_irqtime = 0;
1153 +}
1154 +
1155 +#ifndef CONFIG_64BIT
1156 +static DEFINE_PER_CPU(seqcount_t, irq_time_seq);        /* per-cpu sequence counter checked by irq_time_read() */
1157 +
1158 +static inline void irq_time_write_begin(void)
1159 +{       /* Called by account_system_vtime() before it updates the irq time counters. */
1160 +       __this_cpu_inc(irq_time_seq.sequence);   /* first bump of this CPU's sequence */
1161 +       smp_wmb();      /* write barrier between the bump and the counter updates that follow */
1162 +}
1163 +
1164 +static inline void irq_time_write_end(void)
1165 +{       /* Called by account_system_vtime() after it has updated the irq time counters. */
1166 +       smp_wmb();      /* write barrier between the counter updates and the closing bump */
1167 +       __this_cpu_inc(irq_time_seq.sequence);   /* second bump of this CPU's sequence */
1168 +}
1169 +
1170 +static inline u64 irq_time_read(int cpu)
1171 +{       /* Return softirq + hardirq time recorded for @cpu, sampled under irq_time_seq. */
1172 +       u64 irq_time;
1173 +       unsigned seq;
1174 +
1175 +       do {    /* re-sample for as long as read_seqcount_retry() asks for a retry */
1176 +               seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
1177 +               irq_time = per_cpu(cpu_softirq_time, cpu) +
1178 +                          per_cpu(cpu_hardirq_time, cpu);
1179 +       } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
1180 +
1181 +       return irq_time;
1182 +}
1183 +#else /* CONFIG_64BIT */
1184 +static inline void irq_time_write_begin(void)
1185 +{       /* no-op: no irq_time_seq exists in this configuration */
1186 +}
1187 +
1188 +static inline void irq_time_write_end(void)
1189 +{       /* no-op: no irq_time_seq exists in this configuration */
1190 +}
1191 +
1192 +static inline u64 irq_time_read(int cpu)
1193 +{       /* Plain sum without a retry loop; presumably safe because u64 loads are single accesses on 64-bit - TODO confirm */
1194 +       return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
1195 +}
1196 +#endif /* CONFIG_64BIT */
1197 +
1198 +/*
1199 + * Called before incrementing preempt_count on {soft,}irq_enter
1200 + * and before decrementing preempt_count on {soft,}irq_exit.
1201 + */
1202 +void account_system_vtime(struct task_struct *curr)
1203 +{       /* Charge the time since the previous call on this CPU to the hardirq or softirq counter. */
1204 +       unsigned long flags;
1205 +       s64 delta;
1206 +       int cpu;
1207 +
1208 +       if (!sched_clock_irqtime)
1209 +               return; /* accounting is switched off, see disable_sched_clock_irqtime() */
1210 +
1211 +       local_irq_save(flags);
1212 +
1213 +       cpu = smp_processor_id();
1214 +       delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
1215 +       __this_cpu_add(irq_start_time, delta);   /* irq_start_time now holds the clock value just sampled */
1216 +
1217 +       irq_time_write_begin();
1218 +       /*
1219 +        * We do not account for softirq time from ksoftirqd here.
1220 +        * We want to continue accounting softirq time to the ksoftirqd thread
1221 +        * in that case, so as not to confuse the scheduler with a special task
1222 +        * that does not consume any time, but still wants to run.
1223 +        */
1224 +       if (hardirq_count())
1225 +               __this_cpu_add(cpu_hardirq_time, delta);
1226 +       else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD))
1227 +               __this_cpu_add(cpu_softirq_time, delta);
1228 +       /* in every other case delta is dropped: neither counter is updated */
1229 +       irq_time_write_end();
1230 +       local_irq_restore(flags);
1231 +}
1232 +EXPORT_SYMBOL_GPL(account_system_vtime);
1233 +
1234 +static void update_rq_clock_task(struct rq *rq, s64 delta)
1235 +{       /* Advance rq->clock_task by @delta minus the irq time not yet consumed for this rq's CPU. */
1236 +       s64 irq_delta;
1237 +
1238 +       irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;       /* irq time accumulated since the last update */
1239 +
1240 +       /*
1241 +        * Since irq_time is only updated on {soft,}irq_exit, we might run into
1242 +        * this case when a previous update_rq_clock() happened inside a
1243 +        * {soft,}irq region.
1244 +        *
1245 +        * When this happens, we stop ->clock_task and only update the
1246 +        * prev_irq_time stamp to account for the part that fit, so that a next
1247 +        * update will consume the rest. This ensures ->clock_task is
1248 +        * monotonic.
1249 +        *
1250 +        * It does however cause some slight misattribution of {soft,}irq
1251 +        * time, a more accurate solution would be to update the irq_time using
1252 +        * the current rq->clock timestamp, except that would require using
1253 +        * atomic ops.
1254 +        */
1255 +       if (irq_delta > delta)
1256 +               irq_delta = delta;      /* clamp: never consume more irq time than @delta */
1257 +
1258 +       rq->prev_irq_time += irq_delta;
1259 +       delta -= irq_delta;     /* what is left is the non-irq share of the interval */
1260 +       rq->clock_task += delta;
1261 +}
1262 +
1263 +#else /* CONFIG_IRQ_TIME_ACCOUNTING */
1264 +
1265 +static void update_rq_clock_task(struct rq *rq, s64 delta)
1266 +{       /* Variant without CONFIG_IRQ_TIME_ACCOUNTING: no irq time is subtracted. */
1267 +       rq->clock_task += delta;        /* the whole @delta goes to clock_task */
1268 +}
1269 +
1270 +#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
1271 +
1272  /*
1273   * On each tick, see what percentage of that tick was attributed to each
1274   * component and add the percentage to the _pc values. Once a _pc value has
1275 @@ -2372,9 +2608,14 @@ static inline u64 static_deadline_diff(i
1276         return prio_deadline_diff(USER_PRIO(static_prio));
1277  }
1278
1279 +static inline u64 longest_deadline_diff(void)
1280 +{       /* Deadline offset for priority slot 39; u64 like static_deadline_diff() so the value is not narrowed to int. */
1281 +       return prio_deadline_diff(39);  /* 39 is presumably the last user priority slot - TODO confirm */
1282 +}
1283 +
1284  static inline int ms_longest_deadline_diff(void)
1285  {
1286 -       return NS_TO_MS(prio_deadline_diff(39));
1287 +       return NS_TO_MS(longest_deadline_diff());
1288  }
1289
1290  /*
1291 @@ -2444,7 +2685,19 @@ retry:
1292                         goto out_take;
1293                 }
1294
1295 -               dl = p->deadline + cache_distance(task_rq(p), rq, p);
1296 +               /*
1297 +                * Soft affinity happens here by not scheduling a task with
1298 +                * its sticky flag set that ran on a different CPU last when
1299 +                * the CPU is scaling, or by greatly biasing against its
1300 +                * deadline when not.
1301 +                */
1302 +               if (task_rq(p) != rq && task_sticky(p)) {
1303 +                       if (scaling_rq(rq))
1304 +                               continue;
1305 +                       else
1306 +                               dl = p->deadline + longest_deadline_diff();
1307 +               } else
1308 +                       dl = p->deadline;
1309
1310                 /*
1311                  * No rt tasks. Find the earliest deadline task. Now we're in
1312 @@ -2601,14 +2854,8 @@ need_resched_nonpreemptible:
1313                                 */
1314                                 grq_unlock_irq();
1315                                 goto rerun_prev_unlocked;
1316 -                       } else {
1317 -                               /*
1318 -                                * If prev got kicked off by a task that has to
1319 -                                * run on this CPU for affinity reasons then
1320 -                                * there may be an idle CPU it can go to.
1321 -                                */
1322 -                               resched_suitable_idle(prev);
1323 -                       }
1324 +                       } else
1325 +                               swap_sticky(rq, cpu, prev);
1326                 }
1327                 return_task(prev, deactivate);
1328         }
1329 @@ -2623,12 +2870,21 @@ need_resched_nonpreemptible:
1330                 set_cpuidle_map(cpu);
1331         } else {
1332                 next = earliest_deadline_task(rq, idle);
1333 -               prefetch(next);
1334 -               prefetch_stack(next);
1335 -               clear_cpuidle_map(cpu);
1336 +               if (likely(next->prio != PRIO_LIMIT)) {
1337 +                       prefetch(next);
1338 +                       prefetch_stack(next);
1339 +                       clear_cpuidle_map(cpu);
1340 +               } else
1341 +                       set_cpuidle_map(cpu);
1342         }
1343
1344         if (likely(prev != next)) {
1345 +               /*
1346 +                * Don't stick tasks when a real time task is going to run as
1347 +                * they may literally get stuck.
1348 +                */
1349 +               if (rt_task(next))
1350 +                       unstick_task(rq, prev);
1351                 sched_info_switch(prev, next);
1352
1353                 set_rq_task(rq, next);
1354 @@ -4009,7 +4265,6 @@ void init_idle(struct task_struct *idle,
1355         rcu_read_unlock();
1356         rq->curr = rq->idle = idle;
1357         idle->oncpu = 1;
1358 -       set_cpuidle_map(cpu);
1359         grq_unlock_irqrestore(&flags);
1360
1361         /* Set the preempt count _outside_ the spinlocks! */
1362 @@ -4245,6 +4500,7 @@ static void break_sole_affinity(int src_
1363                                        task_pid_nr(p), p->comm, src_cpu);
1364                         }
1365                 }
1366 +               clear_sticky(p);
1367         } while_each_thread(t, p);
1368  }
1369
1370 @@ -4569,6 +4825,7 @@ migration_call(struct notifier_block *nf
1371
1372                         set_rq_online(rq);
1373                 }
1374 +               grq.noc = num_online_cpus();
1375                 grq_unlock_irqrestore(&flags);
1376                 break;
1377
1378 @@ -4601,6 +4858,7 @@ migration_call(struct notifier_block *nf
1379                         BUG_ON(!cpu_isset(cpu, rq->rd->span));
1380                         set_rq_offline(rq);
1381                 }
1382 +               grq.noc = num_online_cpus();
1383                 grq_unlock_irqrestore(&flags);
1384                 break;
1385  #endif
1386 @@ -6005,7 +6263,7 @@ static int cache_cpu_idle(unsigned long
1387  void __init sched_init_smp(void)
1388  {
1389         struct sched_domain *sd;
1390 -       int cpu, cpus;
1391 +       int cpu;
1392
1393         cpumask_t non_isolated_cpus;
1394
1395 @@ -6035,14 +6293,6 @@ void __init sched_init_smp(void)
1396         if (set_cpus_allowed_ptr(current, &non_isolated_cpus) < 0)
1397                 BUG();
1398
1399 -       /*
1400 -        * Assume that every added cpu gives us slightly less overall latency
1401 -        * allowing us to increase the base rr_interval, non-linearly and with
1402 -        * an upper bound.
1403 -        */
1404 -       cpus = num_online_cpus();
1405 -       rr_interval = rr_interval * (4 * cpus + 4) / (cpus + 6);
1406 -
1407         grq_lock_irq();
1408         /*
1409          * Set up the relative cache distance of each online cpu from each
1410 @@ -6129,6 +6379,7 @@ void __init sched_init(void)
1411         grq.last_jiffy = jiffies;
1412         spin_lock_init(&grq.iso_lock);
1413         grq.iso_ticks = grq.iso_refractory = 0;
1414 +       grq.noc = 1;
1415  #ifdef CONFIG_SMP
1416         init_defrootdomain();
1417         grq.qnr = grq.idle_cpus = 0;
1418 @@ -6142,6 +6393,7 @@ void __init sched_init(void)
1419                               rq->iowait_pc = rq->idle_pc = 0;
1420                 rq->dither = 0;
1421  #ifdef CONFIG_SMP
1422 +               rq->sticky_task = NULL;
1423                 rq->last_niffy = 0;
1424                 rq->sd = NULL;
1425                 rq->rd = NULL;