git.maemo.org Git - kernel-bfs/blob - kernel-bfs-2.6.28/debian/patches/bfs401-penalise_fork_depth_account_threads.patch

   1 Make it possible to have interactivity and responsiveness at very high load
   2 levels by making deadlines offset by the fork depth from init. This has a
   3 similar effect to 'nice'ing loads that are fork heavy. 'make' is a perfect
   4 example of this and will, with fork_depth_penalty enabled, be felt as much
   5 at 'make -j24' as it normally would be with just 'make'.
   6
   7 Note that this drastically affects CPU distribution, and also has the
   8 indirect side effect of partitioning CPU entitlement to different users as
   9 well. No assumption as to CPU distribution should be made based on past
  10 behaviour.
  11
  12 This is achieved by separating out forks to new processes vs new threads.
  13 When a new process is detected, its fork depth is inherited from its parent
  14 across fork() and then is incremented by one. That fork_depth is then used
  15 to cause a relative offset of its deadline.
  16
  17 This feature is disabled in this patch by default and can be optionally
  18 enabled.
  19
  20 Threads are kept at the same fork_depth as their parent process, and can
  21 optionally have their CPU entitlement all managed as one process together
  22 by enabling the group_thread_accounting feature. This feature is disabled
  23 by default in this patch, as many desktop applications such as firefox,
  24 amarok, etc are multithreaded. By disabling this feature and enabling the
  25 fork_depth_penalty feature (default) it favours CPU towards desktop
  26 applications.
  27
  28 Extensive testing is required to ensure this does not cause regressions in
  29 common workloads.
  30
  31 There are two sysctls to enable/disable these features.
  32
  33 They are in /proc/sys/kernel/
  34
  35 group_thread_accounting - groups CPU accounting by threads
  36 fork_depth_penalty - penalises according to depth of forking from init
  37
  38 -ck
  39
  40 ---
  41  include/linux/sched.h |    7 +++
  42  kernel/sched_bfs.c    |   88 ++++++++++++++++++++++++++++++++++++++++++++++----
  43  kernel/sysctl.c       |   20 +++++++++++
  44  3 files changed, 108 insertions(+), 7 deletions(-)
  45
  46 Index: linux-2.6.36-rc7-ck1/include/linux/sched.h
  47 ===================================================================
  48 --- linux-2.6.36-rc7-ck1.orig/include/linux/sched.h     2010-10-08 09:39:38.016240768 +1100
  49 +++ linux-2.6.36-rc7-ck1/include/linux/sched.h  2010-10-08 09:39:53.575007838 +1100
  50 @@ -1187,10 +1187,15 @@ struct task_struct {
  51         unsigned int rt_priority;
  52  #ifdef CONFIG_SCHED_BFS
  53         int time_slice;
  54 -       u64 deadline;
  55 +       /* Virtual deadline in niffies, and when the deadline was set */
  56 +       u64 deadline, deadline_niffy;
  57         struct list_head run_list;
  58         u64 last_ran;
  59         u64 sched_time; /* sched_clock time spent running */
  60 +       /* Number of threads currently requesting CPU time */
  61 +       unsigned long threads_running;
  62 +       /* Depth of forks from init */
  63 +       int fork_depth;
  64  #ifdef CONFIG_SMP
  65         int sticky; /* Soft affined flag */
  66  #endif
  67 Index: linux-2.6.36-rc7-ck1/kernel/sched_bfs.c
  68 ===================================================================
  69 --- linux-2.6.36-rc7-ck1.orig/kernel/sched_bfs.c        2010-10-08 09:39:37.918242270 +1100
  70 +++ linux-2.6.36-rc7-ck1/kernel/sched_bfs.c     2010-10-08 11:16:01.382198622 +1100
  71 @@ -139,6 +139,15 @@ int rr_interval __read_mostly = 6;
  72  int sched_iso_cpu __read_mostly = 70;
  73
  74  /*
  75 + * group_thread_accounting - sysctl to decide whether to treat whole thread
  76 + * groups as a single entity for the purposes of CPU distribution.
  77 + */
  78 +int group_thread_accounting __read_mostly;
  79 +
  80 +/* fork_depth_penalty - Whether to penalise CPU according to fork depth. */
  81 +int fork_depth_penalty __read_mostly;
  82 +
  83 +/*
  84   * The relative length of deadline for each priority(nice) level.
  85   */
  86  static int prio_ratios[PRIO_RANGE] __read_mostly;
  87 @@ -661,11 +670,29 @@ static int isoprio_suitable(void)
  88         return !grq.iso_refractory;
  89  }
  90
  91 +static inline u64 __task_deadline_diff(struct task_struct *p);
  92 +static inline u64 task_deadline_diff(struct task_struct *p);
  93 +
  94  /*
  95   * Adding to the global runqueue. Enter with grq locked.
  96   */
  97  static void enqueue_task(struct task_struct *p)
  98  {
  99 +       s64 max_tdd = task_deadline_diff(p);
 100 +
 101 +       /*
 102 +        * Make sure that when we're queueing this task again that it
 103 +        * doesn't have any old deadlines from when the thread group was
 104 +        * being penalised and cap the deadline to the highest it could
 105 +        * be, based on the current number of threads running.
 106 +        */
 107 +       if (group_thread_accounting) {
 108 +               max_tdd += p->group_leader->threads_running *
 109 +                          __task_deadline_diff(p);
 110 +       }
 111 +       if (p->deadline - p->deadline_niffy > max_tdd)
 112 +               p->deadline = p->deadline_niffy + max_tdd;
 113 +
 114         if (!rt_task(p)) {
 115                 /* Check it hasn't gotten rt from PI */
 116                 if ((idleprio_task(p) && idleprio_suitable(p)) ||
 117 @@ -967,10 +994,13 @@ static int effective_prio(struct task_st
 118  }
 119
 120  /*
 121 - * activate_task - move a task to the runqueue. Enter with grq locked.
 122 + * activate_task - move a task to the runqueue. Enter with grq locked. The
 123 + * number of threads running is stored in the group_leader struct.
 124   */
 125  static void activate_task(struct task_struct *p, struct rq *rq)
 126  {
 127 +       unsigned long *threads_running = &p->group_leader->threads_running;
 128 +
 129         update_clocks(rq);
 130
 131         /*
 132 @@ -987,6 +1017,14 @@ static void activate_task(struct task_st
 133         p->prio = effective_prio(p);
 134         if (task_contributes_to_load(p))
 135                 grq.nr_uninterruptible--;
 136 +       /*
 137 +        * Adjust deadline according to number of running threads within
 138 +        * this thread group. This ends up distributing CPU to the thread
 139 +        * group as a single entity.
 140 +        */
 141 +       ++*threads_running;
 142 +       if (*threads_running > 1 && group_thread_accounting)
 143 +               p->deadline += __task_deadline_diff(p);
 144         enqueue_task(p);
 145         grq.nr_running++;
 146         inc_qnr();
 147 @@ -998,9 +1036,14 @@ static void activate_task(struct task_st
 148   */
 149  static inline void deactivate_task(struct task_struct *p)
 150  {
 151 +       unsigned long *threads_running = &p->group_leader->threads_running;
 152 +
 153         if (task_contributes_to_load(p))
 154                 grq.nr_uninterruptible++;
 155         grq.nr_running--;
 156 +       --*threads_running;
 157 +       if (*threads_running > 0 && group_thread_accounting)
 158 +               p->deadline -= __task_deadline_diff(p);
 159  }
 160
 161  #ifdef CONFIG_SMP
 162 @@ -1635,6 +1678,10 @@ void wake_up_new_task(struct task_struct
 163         parent = p->parent;
 164         /* Unnecessary but small chance that the parent changed CPU */
 165         set_task_cpu(p, task_cpu(parent));
 166 +       if (!(clone_flags & CLONE_THREAD)) {
 167 +               p->fork_depth++;
 168 +               p->threads_running = 0;
 169 +       }
 170         activate_task(p, rq);
 171         trace_mark(kernel_sched_wakeup_new,
 172                 "pid %d state %ld ## rq %p task %p rq->curr %p",
 173 @@ -2524,11 +2571,20 @@ static inline u64 prio_deadline_diff(int
 174         return (prio_ratios[user_prio] * rr_interval * (MS_TO_NS(1) / 128));
 175  }
 176
 177 -static inline u64 task_deadline_diff(struct task_struct *p)
 178 +static inline u64 __task_deadline_diff(struct task_struct *p)
 179  {
 180         return prio_deadline_diff(TASK_USER_PRIO(p));
 181  }
 182
 183 +static inline u64 task_deadline_diff(struct task_struct *p)
 184 +{
 185 +       u64 pdd = __task_deadline_diff(p);
 186 +
 187 +       if (fork_depth_penalty && p->fork_depth > 1)
 188 +               pdd *= p->fork_depth;
 189 +       return pdd;
 190 +}
 191 +
 192  static inline u64 static_deadline_diff(int static_prio)
 193  {
 194         return prio_deadline_diff(USER_PRIO(static_prio));
 195 @@ -2545,8 +2601,24 @@ static inline int ms_longest_deadline_di
 196   */
 197  static void time_slice_expired(struct task_struct *p)
 198  {
 199 +       u64 tdd = task_deadline_diff(p);
 200 +
 201 +       /*
 202 +        * We proportionately increase the deadline according to how many
 203 +        * threads are running. This effectively makes a thread group have
 204 +        * the same CPU as one task, no matter how many threads are running.
 205 +        * time_slice_expired can be called when there may be none running
 206 +        * when p is deactivated so we must explicitly test for more than 1.
 207 +        */
 208 +       if (group_thread_accounting) {
 209 +               unsigned long *threads_running = &p->group_leader->threads_running;
 210 +
 211 +               if (*threads_running > 1)
 212 +                       tdd += *threads_running * __task_deadline_diff(p);
 213 +       }
 214         p->time_slice = timeslice();
 215 -       p->deadline = grq.niffies + task_deadline_diff(p);
 216 +       p->deadline_niffy = grq.niffies;
 217 +       p->deadline = grq.niffies + tdd;
 218  }
 219
 220  /*
 221 @@ -3513,7 +3585,7 @@ SYSCALL_DEFINE1(nice, int, increment)
 222   *
 223   * This is the priority value as seen by users in /proc.
 224   * RT tasks are offset by -100. Normal tasks are centered around 1, value goes
 225 - * from 0 (SCHED_ISO) up to 82 (nice +19 SCHED_IDLEPRIO).
 226 + * from 0 (SCHED_ISO) upwards (to nice +19 SCHED_IDLEPRIO).
 227   */
 228  int task_prio(const struct task_struct *p)
 229  {
 230 @@ -3525,8 +3597,12 @@ int task_prio(const struct task_struct *
 231
 232         /* Convert to ms to avoid overflows */
 233         delta = NS_TO_MS(p->deadline - grq.niffies);
 234 -       delta = delta * 40 / ms_longest_deadline_diff();
 235 -       if (delta > 0 && delta <= 80)
 236 +       if (fork_depth_penalty)
 237 +               delta *= 4;
 238 +       else
 239 +               delta *= 40;
 240 +       delta /= ms_longest_deadline_diff();
 241 +       if (delta > 0)
 242                 prio += delta;
 243         if (idleprio_task(p))
 244                 prio += 40;
 245 Index: linux-2.6.36-rc7-ck1/kernel/sysctl.c
 246 ===================================================================
 247 --- linux-2.6.36-rc7-ck1.orig/kernel/sysctl.c   2010-10-08 09:39:11.603648964 +1100
 248 +++ linux-2.6.36-rc7-ck1/kernel/sysctl.c        2010-10-08 09:39:53.579007778 +1100
 249 @@ -121,6 +121,8 @@ static int __maybe_unused one_hundred =
 250  #ifdef CONFIG_SCHED_BFS
 251  extern int rr_interval;
 252  extern int sched_iso_cpu;
 253 +extern int group_thread_accounting;
 254 +extern int fork_depth_penalty;
 255  static int __read_mostly one_thousand = 1000;
 256  #endif
 257  /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
 258 @@ -834,6 +836,24 @@ static struct ctl_table kern_table[] = {
 259                 .extra1         = &zero,
 260                 .extra2         = &one_hundred,
 261         },
 262 +       {
 263 +               .procname       = "group_thread_accounting",
 264 +               .data           = &group_thread_accounting,
 265 +               .maxlen         = sizeof (int),
 266 +               .mode           = 0644,
 267 +               .proc_handler   = &proc_dointvec_minmax,
 268 +               .extra1         = &zero,
 269 +               .extra2         = &one,
 270 +       },
 271 +       {
 272 +               .procname       = "fork_depth_penalty",
 273 +               .data           = &fork_depth_penalty,
 274 +               .maxlen         = sizeof (int),
 275 +               .mode           = 0644,
 276 +               .proc_handler   = &proc_dointvec_minmax,
 277 +               .extra1         = &zero,
 278 +               .extra2         = &one,
 279 +       },
 280  #endif
 281  #if defined(CONFIG_S390) && defined(CONFIG_SMP)
 282         {
 283 v
 284 vv