sched: Add cgroup-based criteria for upmigration

It may be desirable to discourage upmigration of tasks belonging to
some cgroups. Add a per-cgroup flag (upmigrate_discourage) that
discourages upmigration of a cgroup's tasks. Tasks of such a cgroup
are allowed to upmigrate only in an overcommitted scenario.

Change-Id: I1780e420af1b6865c5332fb55ee1ee408b74d8ce
Signed-off-by: Srivatsa Vaddagiri <vatsa@codeaurora.org>
Author: Srivatsa Vaddagiri <vatsa@codeaurora.org>
Date:   2015-02-06 18:05:53 +05:30
Parent: 93a10a8b28
Commit: 995fad6d1a
4 changed files with 94 additions and 8 deletions
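For context, here is a minimal userspace sketch of how the new attribute might be
exercised once this change is applied. The mount point /sys/fs/cgroup/cpu and the
group name "bg" are illustrative assumptions only; they are not part of this commit.

/*
 * Example only: discourage upmigration for the calling task by placing it
 * in a cgroup whose cpu.upmigrate_discourage flag is set.  Paths assumed.
 */
#include <stdio.h>
#include <stdlib.h>
#include <sys/stat.h>
#include <unistd.h>

static void write_str(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		exit(1);
	}
	fprintf(f, "%s", val);
	fclose(f);
}

int main(void)
{
	char pid[32];

	/* Create a child group under the cpu controller (path assumed). */
	mkdir("/sys/fs/cgroup/cpu/bg", 0755);

	/* Tasks of this group should no longer be classified as "big". */
	write_str("/sys/fs/cgroup/cpu/bg/cpu.upmigrate_discourage", "1");

	/* Move the current task into the group. */
	snprintf(pid, sizeof(pid), "%d", getpid());
	write_str("/sys/fs/cgroup/cpu/bg/tasks", pid);

	return 0;
}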

@@ -557,8 +557,13 @@ both tasks and CPUs to aid in the placement of tasks.
 /proc/sys/kernel/sched_upmigrate
-This value is a percentage. If a task consumes more than this much of
-a particular CPU, that CPU will be considered too small for the task.
+This value is a percentage. If a task consumes more than this much of a
+particular CPU, that CPU will be considered too small for the task. The task
+will thus be seen as a "big" task on that cpu and will be reflected in the
+nr_big_tasks statistic maintained for that cpu. Note that certain tasks (those
+whose nice value exceeds sched_upmigrate_min_nice or those belonging to a
+cgroup whose upmigrate_discourage flag is set) will never be classified as big
+tasks despite their high demand.

 - mostly_idle
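The classification rule above can be summarized as a small predicate. This is an
illustrative sketch only, not kernel code; the function name, parameters, and the
example values are made up for the purpose of the example.

/* Sketch of big-task classification as described above (not kernel code). */
static int classify_as_big(unsigned int load_pct,	/* task demand as % of this cpu */
			   int nice,
			   int cgroup_discouraged,	/* cpu.upmigrate_discourage set? */
			   unsigned int sched_upmigrate,	/* e.g. 80 (%) */
			   int sched_upmigrate_min_nice)
{
	/* Low-priority or discouraged tasks are never big, however busy. */
	if (nice > sched_upmigrate_min_nice || cgroup_discouraged)
		return 0;

	/* Otherwise a task above the threshold is big, e.g. 85% > 80%. */
	return load_pct > sched_upmigrate;
}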
@@ -1096,6 +1101,8 @@ A task whose nice value is greater than this tunable value will never
 be considered as a "big" task (it will not be allowed to run on a
 high-performance CPU).
+
+See also notes on 'cpu.upmigrate_discourage' tunable.

 *** 7.10 sched_enable_power_aware
 Appears at: /proc/sys/kernel/sched_enable_power_aware
@@ -1284,6 +1291,22 @@ account of energy awareness reasons.
 The same logic also applies to the load balancer path to avoid frequent
 migrations due to energy awareness.
+
+*** 7.25 cpu.upmigrate_discourage
+
+Default value : 0
+
+This is a cgroup attribute supported by the cpu resource controller. It
+normally appears at [root_cpu]/[name1]/../[name2]/cpu.upmigrate_discourage.
+Here "root_cpu" is the mount point for the cgroup (cpu resource control)
+filesystem, and name1, name2, etc. are names of cgroups that form a hierarchy.
+
+Setting this flag to 1 discourages upmigration for all tasks of the cgroup.
+High-demand tasks of such a cgroup will never be classified as big tasks and
+hence will not be upmigrated. Tasks of the cgroup are allowed to upmigrate
+only in an overcommitted scenario. See the notes on sched_spill_nr_run and
+sched_spill_load for how the overcommitment threshold is defined, and also
+the notes on the 'sched_upmigrate_min_nice' tunable.
+
 =========================
 8. HMP SCHEDULER TRACE POINTS
 =========================
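To make the "overcommitted scenario" exception concrete, the following is a
simplified model of the placement decision described above. It is not the kernel
implementation; the struct, helper names, and the tunable values shown are
assumptions for illustration.

/* Simplified model (not kernel code) of when a discouraged task may spill. */
struct little_cluster_state {
	unsigned int nr_running;	/* runnable tasks on a little cpu */
	unsigned int load_pct;		/* cumulative load, % of capacity */
};

/* Tunables from /proc/sys/kernel; the values here are only examples. */
static unsigned int sched_spill_nr_run = 10;
static unsigned int sched_spill_load = 100;

static int overcommitted(const struct little_cluster_state *s)
{
	return s->nr_running >= sched_spill_nr_run ||
	       s->load_pct > sched_spill_load;
}

/*
 * A task from an upmigrate_discourage cgroup is never counted as big, so
 * it is allowed onto a high-performance cpu only when the little cpus are
 * already overcommitted per the spill thresholds.
 */
static int may_place_on_big_cpu(int task_discouraged,
				const struct little_cluster_state *s)
{
	if (!task_discouraged)
		return 1;	/* usual big-task rules apply instead */
	return overcommitted(s);
}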

@@ -9808,6 +9808,45 @@ static int cpu_notify_on_migrate_write_u64(struct cgroup *cgrp,
 	return 0;
 }

+#ifdef CONFIG_SCHED_HMP
+
+static u64 cpu_upmigrate_discourage_read_u64(struct cgroup *cgrp,
+					     struct cftype *cft)
+{
+	struct task_group *tg = cgroup_tg(cgrp);
+
+	return tg->upmigrate_discouraged;
+}
+
+static int cpu_upmigrate_discourage_write_u64(struct cgroup *cgrp,
+				struct cftype *cft, u64 upmigrate_discourage)
+{
+	struct task_group *tg = cgroup_tg(cgrp);
+	int discourage = upmigrate_discourage > 0;
+
+	if (tg->upmigrate_discouraged == discourage)
+		return 0;
+
+	/*
+	 * Revisit big-task classification for tasks of this cgroup. It would
+	 * have been efficient to walk tasks of just this cgroup in running
+	 * state, but we don't have easy means to do that. Walk all tasks in
+	 * running state on all cpus instead and re-visit their big task
+	 * classification.
+	 */
+	get_online_cpus();
+	pre_big_small_task_count_change(cpu_online_mask);
+
+	tg->upmigrate_discouraged = discourage;
+
+	post_big_small_task_count_change(cpu_online_mask);
+	put_online_cpus();
+
+	return 0;
+}
+
+#endif	/* CONFIG_SCHED_HMP */
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
 				u64 shareval)
@@ -10091,6 +10130,13 @@ static struct cftype cpu_files[] = {
 		.read_u64 = cpu_notify_on_migrate_read_u64,
 		.write_u64 = cpu_notify_on_migrate_write_u64,
 	},
+#ifdef CONFIG_SCHED_HMP
+	{
+		.name = "upmigrate_discourage",
+		.read_u64 = cpu_upmigrate_discourage_read_u64,
+		.write_u64 = cpu_upmigrate_discourage_write_u64,
+	},
+#endif
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	{
 		.name = "shares",

@@ -1499,14 +1499,29 @@ u64 scale_load_to_cpu(u64 task_load, int cpu)
 	return task_load;
 }

+#ifdef CONFIG_CGROUP_SCHED
+
+static inline int upmigrate_discouraged(struct task_struct *p)
+{
+	return task_group(p)->upmigrate_discouraged;
+}
+
+#else
+
+static inline int upmigrate_discouraged(struct task_struct *p)
+{
+	return 0;
+}
+
+#endif
+
 /* Is a task "big" on its current cpu */
 static inline int is_big_task(struct task_struct *p)
 {
 	u64 load = task_load(p);
 	int nice = TASK_NICE(p);

-	/* Todo: Provide cgroup-based control as well? */
-	if (nice > sched_upmigrate_min_nice)
+	if (nice > sched_upmigrate_min_nice || upmigrate_discouraged(p))
 		return 0;

 	load = scale_load_to_cpu(load, task_cpu(p));
@@ -1693,8 +1708,7 @@ static int task_will_fit(struct task_struct *p, int cpu)
 		if (rq->capacity > prev_rq->capacity)
 			return 1;
 	} else {
-		/* Todo: Provide cgroup-based control as well? */
-		if (nice > sched_upmigrate_min_nice)
+		if (nice > sched_upmigrate_min_nice || upmigrate_discouraged(p))
 			return 1;

 		load = scale_load_to_cpu(task_load(p), cpu);
@@ -2642,8 +2656,8 @@ static inline int migration_needed(struct rq *rq, struct task_struct *p)
 	if (is_small_task(p))
 		return 0;

-	/* Todo: cgroup-based control? */
-	if (nice > sched_upmigrate_min_nice && rq->capacity > min_capacity)
+	if ((nice > sched_upmigrate_min_nice || upmigrate_discouraged(p)) &&
+			rq->capacity > min_capacity)
 		return MOVE_TO_LITTLE_CPU;

 	if (!task_will_fit(p, cpu_of(rq)))

@@ -137,6 +137,9 @@ struct task_group {
 	struct cgroup_subsys_state css;
 	bool notify_on_migrate;
+#ifdef CONFIG_SCHED_HMP
+	bool upmigrate_discouraged;
+#endif

 #ifdef CONFIG_FAIR_GROUP_SCHED
 	/* schedulable entities of this group on each cpu */