From 3b15c77ab4949c6771c740011dd640390c63d00e Mon Sep 17 00:00:00 2001
From: tigerite
Date: Wed, 25 May 2011 22:50:40 +0200
Subject: [PATCH] BFS 401 update to hierarchical tree based penalty patch (that can also group threads and enable/disable features) - series updated accordingly, but both features disabled by default

---
 ...fs401-penalise_fork_depth_account_threads.patch | 284 ++++++++++++++++++++
 kernel-bfs-2.6.28/debian/patches/series | 2 +-
 2 files changed, 285 insertions(+), 1 deletion(-)
 create mode 100644 kernel-bfs-2.6.28/debian/patches/bfs401-penalise_fork_depth_account_threads.patch

diff --git a/kernel-bfs-2.6.28/debian/patches/bfs401-penalise_fork_depth_account_threads.patch b/kernel-bfs-2.6.28/debian/patches/bfs401-penalise_fork_depth_account_threads.patch
new file mode 100644
index 0000000..764ff6c
--- /dev/null
+++ b/kernel-bfs-2.6.28/debian/patches/bfs401-penalise_fork_depth_account_threads.patch
@@ -0,0 +1,284 @@
+Make it possible to have interactivity and responsiveness at very high load
+levels by making deadlines offset by the fork depth from init. This has a
+similar effect to 'nice'ing loads that are fork heavy. 'make' is a perfect
+example of this and will, with fork_depth_penalty enabled, be felt as much
+at 'make -j24' as it normally would be with just 'make'.
+
+Note that this drastically affects CPU distribution, and also has the
+indirect side effect of partitioning CPU entitlement to different users as
+well. No assumption as to CPU distribution should be made based on past
+behaviour.
+
+This is achieved by separating out forks to new processes vs new threads.
+When a new process is detected, its fork depth is inherited from its parent
+across fork() and then is incremented by one. That fork_depth is then used
+to cause a relative offset of its deadline.
+
+This feature is disabled in this patch by default and can be optionally
+enabled.
+
+Threads are kept at the same fork_depth as their parent process, and can
+optionally have their CPU entitlement all managed as one process together
+by enabling the group_thread_accounting feature. This feature is disabled
+by default in this patch, as many desktop applications such as firefox,
+amarok, etc are multithreaded. By disabling this feature and enabling the
+fork_depth_penalty feature (default) it favours CPU towards desktop
+applications.
+
+Extensive testing is required to ensure this does not cause regressions in
+common workloads.
+
+There are two sysctls to enable/disable these features.
+
+They are in /proc/sys/kernel/
+
+group_thread_accounting - groups CPU accounting by threads
+fork_depth_penalty - penalises according to depth of forking from init
+
+-ck
+
+---
+ include/linux/sched.h | 7 +++
+ kernel/sched_bfs.c | 88 ++++++++++++++++++++++++++++++++++++++++++++++----
+ kernel/sysctl.c | 20 +++++++++++
+ 3 files changed, 108 insertions(+), 7 deletions(-)
+
+Index: linux-2.6.36-rc7-ck1/include/linux/sched.h
+===================================================================
+--- linux-2.6.36-rc7-ck1.orig/include/linux/sched.h 2010-10-08 09:39:38.016240768 +1100
++++ linux-2.6.36-rc7-ck1/include/linux/sched.h 2010-10-08 09:39:53.575007838 +1100
+@@ -1187,10 +1187,15 @@ struct task_struct {
+ unsigned int rt_priority;
+ #ifdef CONFIG_SCHED_BFS
+ int time_slice;
+- u64 deadline;
++ /* Virtual deadline in niffies, and when the deadline was set */
++ u64 deadline, deadline_niffy;
+ struct list_head run_list;
+ u64 last_ran;
+ u64 sched_time; /* sched_clock time spent running */
++ /* Number of threads currently requesting CPU time */
++ unsigned long threads_running;
++ /* Depth of forks from init */
++ int fork_depth;
+ #ifdef CONFIG_SMP
+ int sticky; /* Soft affined flag */
+ #endif
+Index: linux-2.6.36-rc7-ck1/kernel/sched_bfs.c
+===================================================================
+--- linux-2.6.36-rc7-ck1.orig/kernel/sched_bfs.c 2010-10-08 09:39:37.918242270 +1100
++++ linux-2.6.36-rc7-ck1/kernel/sched_bfs.c 2010-10-08 11:16:01.382198622 +1100
+@@ -139,6 +139,15 @@ int rr_interval __read_mostly = 6;
+ int sched_iso_cpu __read_mostly = 70;
+
+ /*
++ * group_thread_accounting - sysctl to decide whether to treat whole thread
++ * groups as a single entity for the purposes of CPU distribution.
++ */
++int group_thread_accounting __read_mostly;
++
++/* fork_depth_penalty - Whether to penalise CPU according to fork depth. */
++int fork_depth_penalty __read_mostly;
++
++/*
+ * The relative length of deadline for each priority(nice) level.
+ */
+ static int prio_ratios[PRIO_RANGE] __read_mostly;
+@@ -661,11 +670,29 @@ static int isoprio_suitable(void)
+ return !grq.iso_refractory;
+ }
+
++static inline u64 __task_deadline_diff(struct task_struct *p);
++static inline u64 task_deadline_diff(struct task_struct *p);
++
+ /*
+ * Adding to the global runqueue. Enter with grq locked.
+ */
+ static void enqueue_task(struct task_struct *p)
+ {
++ s64 max_tdd = task_deadline_diff(p);
++
++ /*
++ * Make sure that when we're queueing this task again that it
++ * doesn't have any old deadlines from when the thread group was
++ * being penalised and cap the deadline to the highest it could
++ * be, based on the current number of threads running.
++ */
++ if (group_thread_accounting) {
++ max_tdd += p->group_leader->threads_running *
++ __task_deadline_diff(p);
++ }
++ if (p->deadline - p->deadline_niffy > max_tdd)
++ p->deadline = p->deadline_niffy + max_tdd;
++
+ if (!rt_task(p)) {
+ /* Check it hasn't gotten rt from PI */
+ if ((idleprio_task(p) && idleprio_suitable(p)) ||
+@@ -967,10 +994,13 @@ static int effective_prio(struct task_st
+ }
+
+ /*
+- * activate_task - move a task to the runqueue. Enter with grq locked.
++ * activate_task - move a task to the runqueue. Enter with grq locked. The
++ * number of threads running is stored in the group_leader struct.
+ */
+ static void activate_task(struct task_struct *p, struct rq *rq)
+ {
++ unsigned long *threads_running = &p->group_leader->threads_running;
++
+ update_clocks(rq);
+
+ /*
+@@ -987,6 +1017,14 @@ static void activate_task(struct task_st
+ p->prio = effective_prio(p);
+ if (task_contributes_to_load(p))
+ grq.nr_uninterruptible--;
++ /*
++ * Adjust deadline according to number of running threads within
++ * this thread group. This ends up distributing CPU to the thread
++ * group as a single entity.
++ */
++ ++*threads_running;
++ if (*threads_running > 1 && group_thread_accounting)
++ p->deadline += __task_deadline_diff(p);
+ enqueue_task(p);
+ grq.nr_running++;
+ inc_qnr();
+@@ -998,9 +1036,14 @@ static void activate_task(struct task_st
+ */
+ static inline void deactivate_task(struct task_struct *p)
+ {
++ unsigned long *threads_running = &p->group_leader->threads_running;
++
+ if (task_contributes_to_load(p))
+ grq.nr_uninterruptible++;
+ grq.nr_running--;
++ --*threads_running;
++ if (*threads_running > 0 && group_thread_accounting)
++ p->deadline -= __task_deadline_diff(p);
+ }
+
+ #ifdef CONFIG_SMP
+@@ -1635,6 +1678,10 @@ void wake_up_new_task(struct task_struct
+ parent = p->parent;
+ /* Unnecessary but small chance that the parent changed CPU */
+ set_task_cpu(p, task_cpu(parent));
++ if (!(clone_flags & CLONE_THREAD)) {
++ p->fork_depth++;
++ p->threads_running = 0;
++ }
+ activate_task(p, rq);
+ trace_mark(kernel_sched_wakeup_new,
+ "pid %d state %ld ## rq %p task %p rq->curr %p",
+@@ -2524,11 +2571,20 @@ static inline u64 prio_deadline_diff(int
+ return (prio_ratios[user_prio] * rr_interval * (MS_TO_NS(1) / 128));
+ }
+
+-static inline u64 task_deadline_diff(struct task_struct *p)
++static inline u64 __task_deadline_diff(struct task_struct *p)
+ {
+ return prio_deadline_diff(TASK_USER_PRIO(p));
+ }
+
++static inline u64 task_deadline_diff(struct task_struct *p)
++{
++ u64 pdd = __task_deadline_diff(p);
++
++ if (fork_depth_penalty && p->fork_depth > 1)
++ pdd *= p->fork_depth;
++ return pdd;
++}
++
+ static inline u64 static_deadline_diff(int static_prio)
+ {
+ return prio_deadline_diff(USER_PRIO(static_prio));
+@@ -2545,8 +2601,24 @@ static inline int ms_longest_deadline_di
+ */
+ static void time_slice_expired(struct task_struct *p)
+ {
++ u64 tdd = task_deadline_diff(p);
++
++ /*
++ * We proportionately increase the deadline according to how many
++ * threads are running. This effectively makes a thread group have
++ * the same CPU as one task, no matter how many threads are running.
++ * time_slice_expired can be called when there may be none running
++ * when p is deactivated so we must explicitly test for more than 1.
++ */
++ if (group_thread_accounting) {
++ unsigned long *threads_running = &p->group_leader->threads_running;
++
++ if (*threads_running > 1)
++ tdd += *threads_running * __task_deadline_diff(p);
++ }
+ p->time_slice = timeslice();
+- p->deadline = grq.niffies + task_deadline_diff(p);
++ p->deadline_niffy = grq.niffies;
++ p->deadline = grq.niffies + tdd;
+ }
+
+ /*
+@@ -3513,7 +3585,7 @@ SYSCALL_DEFINE1(nice, int, increment)
+ *
+ * This is the priority value as seen by users in /proc.
+ * RT tasks are offset by -100. Normal tasks are centered around 1, value goes
+- * from 0 (SCHED_ISO) up to 82 (nice +19 SCHED_IDLEPRIO).
++ * from 0 (SCHED_ISO) upwards (to nice +19 SCHED_IDLEPRIO).
+ */
+ int task_prio(const struct task_struct *p)
+ {
+@@ -3525,8 +3597,12 @@ int task_prio(const struct task_struct *
+
+ /* Convert to ms to avoid overflows */
+ delta = NS_TO_MS(p->deadline - grq.niffies);
+- delta = delta * 40 / ms_longest_deadline_diff();
+- if (delta > 0 && delta <= 80)
++ if (fork_depth_penalty)
++ delta *= 4;
++ else
++ delta *= 40;
++ delta /= ms_longest_deadline_diff();
++ if (delta > 0)
+ prio += delta;
+ if (idleprio_task(p))
+ prio += 40;
+Index: linux-2.6.36-rc7-ck1/kernel/sysctl.c
+===================================================================
+--- linux-2.6.36-rc7-ck1.orig/kernel/sysctl.c 2010-10-08 09:39:11.603648964 +1100
++++ linux-2.6.36-rc7-ck1/kernel/sysctl.c 2010-10-08 09:39:53.579007778 +1100
+@@ -121,6 +121,8 @@ static int __maybe_unused one_hundred =
+ #ifdef CONFIG_SCHED_BFS
+ extern int rr_interval;
+ extern int sched_iso_cpu;
++extern int group_thread_accounting;
++extern int fork_depth_penalty;
+ static int __read_mostly one_thousand = 1000;
+ #endif
+ /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
+@@ -834,6 +836,24 @@ static struct ctl_table kern_table[] = {
+ .extra1 = &zero,
+ .extra2 = &one_hundred,
+ },
++ {
++ .procname = "group_thread_accounting",
++ .data = &group_thread_accounting,
++ .maxlen = sizeof (int),
++ .mode = 0644,
++ .proc_handler = &proc_dointvec_minmax,
++ .extra1 = &zero,
++ .extra2 = &one,
++ },
++ {
++ .procname = "fork_depth_penalty",
++ .data = &fork_depth_penalty,
++ .maxlen = sizeof (int),
++ .mode = 0644,
++ .proc_handler = &proc_dointvec_minmax,
++ .extra1 = &zero,
++ .extra2 = &one,
++ },
+ #endif
+ #if defined(CONFIG_S390) && defined(CONFIG_SMP)
+ {
+v
+vv
diff --git a/kernel-bfs-2.6.28/debian/patches/series b/kernel-bfs-2.6.28/debian/patches/series
index 82aeba8..bf4a4ad 100644
--- a/kernel-bfs-2.6.28/debian/patches/series
+++ b/kernel-bfs-2.6.28/debian/patches/series
@@ -28,11 +28,11 @@ bfs-318-to-330.patch
 sched_reset_on_fork.diff
 bfs-330-to-350.patch
 bfs-350-to-357.patch
-#bfs357-penalise_fork_depth_account_threads.patch
 bfs-357-to-360.patch
 bfs-360-to-363.patch
 bfs-363-to-400.patch
 bfs-400-to-401.patch
+bfs401-penalise_fork_depth_account_threads.patch
 voltage_scaling_1.diff
 voltage_scaling_0.diff
 arm-proc-v7.diff
-- 
1.7.9.5
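
Not part of the commit above — a minimal standalone C sketch of the deadline arithmetic this patch introduces, for readers who want to see the effect of the two sysctls in isolation. It mirrors prio_deadline_diff()/task_deadline_diff()/time_slice_expired() in simplified form; the values fork_depth = 4, threads_running = 8 and the flat nice-0 prio_ratio of 128 are hypothetical stand-ins, and the real code works in niffies against grq.niffies and p->group_leader rather than printing offsets.

/*
 * Illustrative sketch only (not from the patch): how the virtual deadline
 * offset grows under fork_depth_penalty and group_thread_accounting.
 */
#include <stdio.h>
#include <stdint.h>

#define MS_TO_NS(x) ((x) * 1000000ULL)

static const int rr_interval = 6;	/* BFS default timeslice in ms */
static const int prio_ratio = 128;	/* nice-0 ratio; niced tasks are larger */

/* simplified prio_deadline_diff() for a nice-0 task */
static uint64_t prio_deadline_diff(void)
{
	return (uint64_t)prio_ratio * rr_interval * (MS_TO_NS(1) / 128);
}

/* simplified task_deadline_diff(): scale by depth of forks from init */
static uint64_t task_deadline_diff(int fork_depth_penalty, int fork_depth)
{
	uint64_t pdd = prio_deadline_diff();

	if (fork_depth_penalty && fork_depth > 1)
		pdd *= fork_depth;
	return pdd;
}

int main(void)
{
	/* hypothetical example values, not taken from the patch */
	int fork_depth = 4;		/* e.g. init -> shell -> make -> cc */
	unsigned long threads_running = 8;

	for (int fdp = 0; fdp <= 1; fdp++) {
		for (int gta = 0; gta <= 1; gta++) {
			uint64_t tdd = task_deadline_diff(fdp, fork_depth);

			/* as in time_slice_expired(): the whole thread group
			 * shares one entitlement when accounting is grouped */
			if (gta && threads_running > 1)
				tdd += threads_running * prio_deadline_diff();
			printf("fork_depth_penalty=%d group_thread_accounting=%d "
			       "deadline offset = %llu ns\n",
			       fdp, gta, (unsigned long long)tdd);
		}
	}
	return 0;
}

In this series both knobs default to 0; they are runtime-tunable through /proc/sys/kernel/fork_depth_penalty and /proc/sys/kernel/group_thread_accounting added by the sysctl.c hunk above.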