sched: Avoid frequent migration of running task

The power cost of a cpu can drop quite considerably when it goes idle.
As a result, the best choice for running a single task in a cluster
can vary quite rapidly. As the task keeps hopping cpus, the cpus it
vacates go idle and start looking like more favorable targets for the
task, which can leave it migrating on almost every scheduler tick!

Prevent this by keeping track of when a task started running on a cpu,
and allow task migration in the tick path (migration_needed()) for
energy-efficiency reasons only if the task has run sufficiently long,
as determined by the sysctl_sched_min_runtime tunable.

Note that the sysctl_sched_min_runtime setting is currently honored
only in the scheduler_tick()->migration_needed() path and not in the
idle_balance() path. In other words, a task could still be pulled to
another cpu by idle_balance(). This limitation should not matter for
the typical high-frequency-migration case (a single high-demand task
running on a high-performance cpu).
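
A minimal sketch of the new gate (run_start and sysctl_sched_min_runtime
are names from this patch; the helper name and surrounding code are
illustrative only, not the literal kernel code):

	static inline bool ran_long_enough(struct task_struct *p)
	{
		u64 delta = sched_clock() - p->run_start;

		return delta >= sysctl_sched_min_runtime;
	}

	/* tick path, before migrating the running task for power reasons */
	if (!ran_long_enough(p))
		return 0;	/* too soon, leave the task where it is */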

CRs-Fixed: 756570
Change-Id: I96413b7a81b623193c3bbcec6f3fa9dfec367d99
Signed-off-by: Srivatsa Vaddagiri <vatsa@codeaurora.org>
Author:    Srivatsa Vaddagiri 2014-11-18 13:19:39 +05:30
Committer: Olav Haugan
commit 32c6ac7c62 (parent 3986d38e2c)
6 changed files with 49 additions and 0 deletions


@@ -1250,6 +1250,24 @@ Non-small tasks will prefer to wake up on idle CPUs if this tunable is set to 1.
If the tunable is set to 0, non-small tasks will prefer to wake up on mostly
idle CPUs which are not completely idle, increasing task packing behavior.
** 7.24 sched_min_runtime
Appears at: /proc/sys/kernel/sched_min_runtime
Default value: 200000000 ns (200 ms)
This tunable helps avoid frequent migration of a task on account of
energy awareness. On every scheduler tick, a check is made (in
migration_needed()) whether the running task should be migrated to a
"better" cpu, one that could offer either better performance or lower
power. When deciding to migrate a task for power reasons, we want to
avoid "frequent" migrations (say, every tick), which could add more
overhead than they save. A task's 'run_start' attribute is set when it
starts running on a cpu, and migration_needed() uses it to rate-limit
such migrations: only once a task has been associated with a cpu (in
either running or runnable state) for more than 'sched_min_runtime' ns
is it considered eligible for migration in the tick path on account of
energy awareness.
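
For illustration only (not part of this patch), the tunable can be read
and written from user space like any other sysctl; the 100 ms value
below is just an example and writing requires root:

	#include <stdio.h>

	int main(void)
	{
		unsigned int ns;
		FILE *f = fopen("/proc/sys/kernel/sched_min_runtime", "r");

		if (!f)
			return 1;
		if (fscanf(f, "%u", &ns) != 1) {
			fclose(f);
			return 1;
		}
		fclose(f);
		printf("sched_min_runtime = %u ns (%u ms)\n", ns, ns / 1000000);

		/* Example: lower the threshold to 100 ms (needs root). */
		f = fopen("/proc/sys/kernel/sched_min_runtime", "w");
		if (f) {
			fprintf(f, "%u\n", 100000000u);
			fclose(f);
		}
		return 0;
	}
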
=========================
8. HMP SCHEDULER TRACE POINTS
=========================


@@ -1168,6 +1168,7 @@ struct task_struct {
* of this task
*/
u32 init_load_pct;
u64 run_start;
#endif
#ifdef CONFIG_CGROUP_SCHED
struct task_group *sched_task_group;


@@ -44,6 +44,7 @@ extern unsigned int sysctl_sched_cpu_high_irqload;
extern unsigned int sysctl_sched_freq_account_wait_time;
extern unsigned int sysctl_sched_migration_fixup;
extern unsigned int sysctl_sched_heavy_task_pct;
extern unsigned int sysctl_sched_min_runtime;
#if defined(CONFIG_SCHED_FREQ_INPUT) || defined(CONFIG_SCHED_HMP)
extern unsigned int sysctl_sched_init_task_load_pct;


@@ -2519,6 +2519,16 @@ static void restore_orig_mark_start(struct task_struct *p, u64 mark_start)
p->ravg.mark_start = mark_start;
}
/*
* Note down when task started running on a cpu. This information will be handy
* to avoid "too" frequent task migrations for a running task on account of
* power.
*/
static inline void note_run_start(struct task_struct *p, u64 wallclock)
{
p->run_start = wallclock;
}
#else /* CONFIG_SCHED_HMP */
static inline void fixup_busy_time(struct task_struct *p, int new_cpu) { }
@@ -2550,6 +2560,8 @@ restore_orig_mark_start(struct task_struct *p, u64 mark_start)
{
}
static inline void note_run_start(struct task_struct *p, u64 wallclock) { }
#endif /* CONFIG_SCHED_HMP */
#ifdef CONFIG_SMP
@@ -2581,6 +2593,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
trace_sched_migrate_task(p, new_cpu, pct_task_load(p));
note_run_start(p, -1);
if (task_cpu(p) != new_cpu) {
struct task_migration_notifier tmn;
@@ -4664,6 +4678,7 @@ need_resched:
prev->state = TASK_RUNNING;
} else {
deactivate_task(rq, prev, DEQUEUE_SLEEP);
note_run_start(prev, -1);
prev->on_rq = 0;
/*
@@ -4694,6 +4709,7 @@ need_resched:
update_task_ravg(next, rq, PICK_NEXT_TASK, wallclock, 0);
clear_tsk_need_resched(prev);
rq->skip_clock_update = 0;
note_run_start(next, wallclock);
BUG_ON(task_cpu(next) != cpu_of(rq));
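
Taken together, the core.c hunks above give run_start a simple
lifecycle; a condensed summary of the new call sites (paraphrasing the
diff, not additional code):

	/*
	 * set_task_cpu(p, new_cpu)            -> note_run_start(p, -1)
	 *     the task is being (re)placed on a cpu; the old timestamp is
	 *     discarded
	 *
	 * __schedule(), deactivate_task(prev) -> note_run_start(prev, -1)
	 *     the task blocks; it gets a fresh timestamp when it runs again
	 *
	 * __schedule(), after picking next    -> note_run_start(next, wallclock)
	 *     the task (re)starts running; wallclock is the reference point
	 *     that sched_clock() - p->run_start is later compared against
	 */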


@@ -1229,6 +1229,8 @@ unsigned int __read_mostly sched_init_task_load_pelt;
unsigned int __read_mostly sched_init_task_load_windows;
unsigned int __read_mostly sysctl_sched_init_task_load_pct = 15;
unsigned int __read_mostly sysctl_sched_min_runtime = 200000000; /* 200 ms */
static inline unsigned int task_load(struct task_struct *p)
{
if (sched_use_pelt)
@@ -2287,6 +2289,10 @@ static int lower_power_cpu_available(struct task_struct *p, int cpu)
int i;
int lowest_power_cpu = task_cpu(p);
int lowest_power = power_cost(p, task_cpu(p));
u64 delta = sched_clock() - p->run_start;
if (delta < sysctl_sched_min_runtime)
return 0;
/* Is a lower-powered idle CPU available which will fit this task? */
for_each_cpu_and(i, tsk_cpus_allowed(p), cpu_online_mask) {
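
A small worked example of what the default threshold means in practice
(HZ=100, i.e. a 10 ms tick, is assumed purely for the arithmetic; it is
not part of the patch):

	#include <stdio.h>

	int main(void)
	{
		unsigned long long min_runtime_ns = 200000000ULL; /* default: 200 ms */
		unsigned long long tick_ns = 10000000ULL;         /* 10 ms tick at HZ=100 */

		/* Ticks a task must spend on its cpu before the power-based
		 * check above can pass: 200000000 / 10000000 = 20. */
		printf("%llu ticks\n", min_runtime_ns / tick_ns);
		return 0;
	}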


@@ -374,6 +374,13 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = sched_hmp_proc_update_handler,
},
{
.procname = "sched_min_runtime",
.data = &sysctl_sched_min_runtime,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "sched_spill_load",
.data = &sysctl_sched_spill_load_pct,
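
One design note on the new entry above: it uses plain proc_dointvec, so
a write is parsed as a signed int and stored into the unsigned variable
without range checking. If bounds were wanted, a clamped variant would
look roughly like this (a sketch only, not part of the patch; 'zero' is
a hypothetical static int):

	static int zero;	/* hypothetical lower bound */

	{
		.procname	= "sched_min_runtime",
		.data		= &sysctl_sched_min_runtime,
		.maxlen		= sizeof(unsigned int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &zero,	/* reject negative writes */
	},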