sched/fair: Introduce scheduler boost for low latency workloads

Certain low-latency, bursty workloads require immediate use of the
highest-capacity CPUs in HMP systems. The existing load tracking
mechanisms may be unable to respond to a sudden surge in system load
within the required latency. Introduce the scheduler boost feature for
such workloads. While boost is in effect, the scheduler bypasses
regular load-based task placement and prefers the highest-capacity
CPUs in the system for all non-small fair sched class tasks. Provide
both a kernel and a userspace API for software that has a priori
knowledge of the system workload.

Change-Id: I783f585d1f8c97219e629d9c54f712318821922f
Signed-off-by: Syed Rameez Mustafa <rameezmustafa@codeaurora.org>
Authored by Syed Rameez Mustafa on 2014-05-28 13:30:26 -07:00; committed by Steve Muckle
parent a6fa50d177
commit 65eab4a6f5
4 changed files with 100 additions and 7 deletions

include/linux/sched.h

@@ -1880,6 +1880,15 @@ static inline int set_cpus_allowed_ptr(struct task_struct *p,
}
#endif
#ifdef CONFIG_SCHED_HMP
extern int sched_set_boost(int enable);
#else
static inline int sched_set_boost(int enable)
{
return -EINVAL;
}
#endif
#ifdef CONFIG_NO_HZ_COMMON
void calc_load_enter_idle(void);
void calc_load_exit_idle(void);
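A minimal sketch of kernel-side usage of the new API: only sched_set_boost() comes from this patch; the burst_begin()/burst_end() hooks and the driver context are hypothetical, and every successful enable is paired with a disable as the interface requires.

#include <linux/sched.h>
#include <linux/printk.h>

/* Hypothetical hooks bracketing a latency-critical burst. */
static int burst_begin(void)
{
	int ret;

	ret = sched_set_boost(1);	/* prefer highest-capacity CPUs from now on */
	if (ret)
		pr_warn("scheduler boost unavailable: %d\n", ret);
	return ret;
}

static void burst_end(void)
{
	sched_set_boost(0);		/* the enabler is responsible for disabling */
}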

include/linux/sched/sysctl.h

@@ -60,6 +60,7 @@ extern unsigned int sysctl_sched_downmigrate_pct;
extern int sysctl_sched_upmigrate_min_nice;
extern unsigned int sysctl_sched_enable_power_aware;
extern unsigned int sysctl_sched_powerband_limit_pct;
extern unsigned int sysctl_sched_boost;
#else /* CONFIG_SCHED_HMP */
@@ -100,6 +101,9 @@ extern int sched_migrate_notify_proc_handler(struct ctl_table *table,
extern int sched_hmp_proc_update_handler(struct ctl_table *table,
int write, void __user *buffer, size_t *lenp, loff_t *ppos);
extern int sched_boost_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos);
#ifdef CONFIG_SCHED_DEBUG
static inline unsigned int get_sysctl_timer_migration(void)
{

kernel/sched/fair.c

@@ -1329,6 +1329,14 @@ unsigned int __read_mostly sysctl_sched_downmigrate_pct = 60;
*/
int __read_mostly sysctl_sched_upmigrate_min_nice = 15;
/*
 * Scheduler boost is a mechanism to temporarily place tasks on CPUs
 * with a higher capacity than the ones where they would normally have
 * ended up given their load characteristics. Any entity enabling
 * boost is responsible for disabling it as well.
*/
unsigned int sysctl_sched_boost;
static inline int available_cpu_capacity(int cpu)
{
struct rq *rq = cpu_rq(cpu);
@@ -1384,6 +1392,11 @@ u64 scale_task_load(u64 task_load, int cpu)
static inline int is_big_task(struct task_struct *p)
{
unsigned int load = task_load(p);
int nice = TASK_NICE(p);
/* Todo: Provide cgroup-based control as well? */
if (nice > sysctl_sched_upmigrate_min_nice)
return 0;
load = scale_task_load(load, task_cpu(p));
@@ -1427,6 +1440,60 @@ int mostly_idle_cpu(int cpu)
&& rq->nr_running <= sysctl_sched_mostly_idle_nr_run);
}
static int boost_refcount;
static DEFINE_SPINLOCK(boost_lock);
static DEFINE_MUTEX(boost_mutex);
static inline int sched_boost(void)
{
return boost_refcount > 0;
}
int sched_set_boost(int enable)
{
unsigned long flags;
int ret = 0;
spin_lock_irqsave(&boost_lock, flags);
if (enable == 1) {
boost_refcount++;
} else if (!enable) {
if (boost_refcount >= 1)
boost_refcount--;
else
ret = -EINVAL;
} else {
ret = -EINVAL;
}
spin_unlock_irqrestore(&boost_lock, flags);
return ret;
}
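Because boost is reference counted rather than a simple flag, independent enablers can overlap safely: the scheduler stays boosted until the last enabler releases it, and an unbalanced disable is rejected. A hedged illustration of that contract (the two "subsystems" are made up for the example):

#include <linux/sched.h>

/* Illustration of the refcount semantics; not part of the patch. */
static void boost_pairing_example(void)
{
	sched_set_boost(1);	/* subsystem A: refcount 0 -> 1, boost takes effect */
	sched_set_boost(1);	/* subsystem B: refcount 1 -> 2, boost stays active */
	sched_set_boost(0);	/* subsystem A done: refcount 2 -> 1, still boosted */
	sched_set_boost(0);	/* subsystem B done: refcount 1 -> 0, boost ends    */
	/* A further sched_set_boost(0) here would return -EINVAL. */
}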
int sched_boost_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
int ret;
mutex_lock(&boost_mutex);
if (!write)
sysctl_sched_boost = sched_boost();
ret = proc_dointvec(table, write, buffer, lenp, ppos);
if (ret || !write)
goto done;
ret = (sysctl_sched_boost <= 1) ?
sched_set_boost(sysctl_sched_boost) : -EINVAL;
done:
mutex_unlock(&boost_mutex);
return ret;
}
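The handler above backs the /proc/sys/kernel/sched_boost entry registered in kernel/sysctl.c below: reads report whether boost is currently in effect, and only the values 0 and 1 are accepted on write. A sketch of a userspace client with a priori knowledge of a burst (error handling kept minimal for illustration):

#include <stdio.h>

/* Userspace sketch: toggle /proc/sys/kernel/sched_boost around a burst. */
static int set_sched_boost(int enable)
{
	FILE *f = fopen("/proc/sys/kernel/sched_boost", "w");

	if (!f)
		return -1;
	fprintf(f, "%d\n", enable ? 1 : 0);	/* values other than 0/1 get -EINVAL */
	return fclose(f);
}

int main(void)
{
	set_sched_boost(1);	/* before the latency-critical burst           */
	/* ... run the bursty workload ... */
	set_sched_boost(0);	/* pair the enable with a disable when done    */
	return 0;
}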
/*
* Task will fit on a cpu if it's bandwidth consumption on that cpu
* will be less than sched_upmigrate. A big task that was previously
@@ -1449,13 +1516,19 @@ static int task_will_fit(struct task_struct *p, int cpu)
rq->capacity == max_capacity)
return 1;
-	load = scale_task_load(task_load(p), cpu);
-	if (prev_rq->capacity > rq->capacity)
-		upmigrate = sched_downmigrate;
-	if (load < upmigrate)
-		return 1;
+	if (sched_boost()) {
+		if (rq->capacity > prev_rq->capacity)
+			return 1;
+	} else {
+		load = scale_task_load(task_load(p), cpu);
+		if (prev_rq->capacity > rq->capacity)
+			upmigrate = sched_downmigrate;
+		if (load < upmigrate)
+			return 1;
+	}
return 0;
}
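In other words, while boost is active the load comparison is skipped entirely: a candidate CPU fits only if it is at the maximum capacity level or has strictly higher capacity than the task's previous CPU. A simplified standalone model of that decision, with made-up capacity values (not kernel code):

#include <stdio.h>

/* Simplified model of task_will_fit() under boost; illustrative only. */
static int boosted_task_will_fit(int cpu_capacity, int prev_cpu_capacity,
				 int max_capacity)
{
	if (cpu_capacity == max_capacity)
		return 1;				/* biggest CPUs always fit     */
	return cpu_capacity > prev_cpu_capacity;	/* otherwise only upward moves */
}

int main(void)
{
	/* e.g. a little CPU of capacity 512 and a big CPU of capacity 1024 */
	printf("%d\n", boosted_task_will_fit(1024, 512, 1024));	/* 1: move up to the big CPU */
	printf("%d\n", boosted_task_will_fit(512, 1024, 1024));	/* 0: do not fall back down  */
	return 0;
}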
@@ -1617,7 +1690,7 @@ static int select_best_cpu(struct task_struct *p, int target)
void inc_nr_big_small_task(struct rq *rq, struct task_struct *p)
{
-	if (!task_will_fit(p, cpu_of(rq)))
+	if (is_big_task(p))
rq->nr_big_tasks++;
else if (is_small_task(p))
rq->nr_small_tasks++;
@@ -1625,7 +1698,7 @@ void inc_nr_big_small_task(struct rq *rq, struct task_struct *p)
void dec_nr_big_small_task(struct rq *rq, struct task_struct *p)
{
-	if (!task_will_fit(p, cpu_of(rq)))
+	if (is_big_task(p))
rq->nr_big_tasks--;
else if (is_small_task(p))
rq->nr_small_tasks--;

kernel/sysctl.c

@@ -411,6 +411,13 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = sched_hmp_proc_update_handler,
},
{
.procname = "sched_boost",
.data = &sysctl_sched_boost,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = sched_boost_handler,
},
#endif /* CONFIG_SCHED_HMP */
#ifdef CONFIG_SCHED_DEBUG
{