sched: Optimize select_best_cpu() to reduce execution time

select_best_cpu() is a crucial wakeup routine that determines how
long the scheduler takes to wake up a task. Optimize this routine to
reduce its execution time. The following changes have been made as
part of the optimization, listed in the order in which they build on
one another:

* Several routines called by select_best_cpu() recalculate task load
  and CPU load even though these are already known quantities. For
  example, mostly_idle_cpu_sync() calculates CPU load and
  task_will_fit() calculates task load before spill_threshold_crossed()
  recalculates both. Remove these redundant calculations by moving the
  task load and CPU load computations into the select_best_cpu() 'for'
  loop and passing them to any function that needs the information
  (see the first sketch after this list).

* Rewrite best_small_task_cpu() to avoid the existing two-pass
  approach. The two-pass approach was only in place to find the
  minimum power cluster for small task placement. This information
  can easily be established by looking at runqueue capacities: the
  cluster that does not have the highest capacity constitutes the
  minimum power cluster (see the second sketch after this list). A
  special CPU mask, mpc_mask, is required to safeguard against undue
  side effects on SMP systems. Also terminate the function early if
  the previous CPU is found to be mostly_idle.

* Reorganize code to ensure that no unnecessary computations or
  variable assignments are done. For example, there is no need to
  compute CPU load if that information does not end up getting used
  in any iteration of the 'for' loop.

* The tick logic for EA migrations unnecessarily checks the power
  cost of all CPUs, only for skip_cpu() to throw the result away
  later. Ensure that for EA migrations we only check CPUs within the
  same cluster and avoid running select_best_cpu() whenever possible.
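
To illustrate the first change, here is a condensed sketch of the new
loop structure (simplified, not the complete logic; the helper names
and signatures are the ones introduced in the hunks below):

    for_each_cpu_and(i, tsk_cpus_allowed(p), cpu_online_mask) {
            u64 tload = scale_load_to_cpu(task_load(p), i);
            u64 cpu_load = cpu_load_sync(i, sync);

            /* helpers consume the precomputed values ... */
            if (!eligible_cpu(tload, cpu_load, i, sync))
                    continue;

            /* ... as do task_load_will_fit(), mostly_idle_cpu_sync()
             * and spill_threshold_crossed(), which now take tload
             * and/or cpu_load as arguments instead of recomputing
             * them.
             */
    }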

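The single-pass minimum power cluster detection from the second
change reduces to the following sketch (again condensed from the
best_small_task_cpu() hunk below; temp and hmp_capable are locals of
that function):

    cpumask_and(&temp, &mpc_mask, cpu_possible_mask);
    hmp_capable = !cpumask_full(&temp);  /* capacities differ => HMP */

    for_each_cpu_and(i, tsk_cpus_allowed(p), cpu_online_mask) {
            struct rq *rq = cpu_rq(i);

            if (rq->max_possible_capacity == max_possible_capacity &&
                hmp_capable) {
                    /* CPU i is in the highest-capacity cluster, i.e.
                     * not the minimum power cluster; it is only kept
                     * as a fallback for small task placement.
                     */
                    continue;
            }
            /* otherwise CPU i is in the minimum power cluster */
    }
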
CRs-fixed: 849655
Change-Id: I4e722912fcf3fe4e365a826d4d92a4dd45c05ef3
Signed-off-by: Syed Rameez Mustafa <rameezmustafa@codeaurora.org>
[joonwoop@codeaurora.org: fixed cpufreq_notifier_policy() to set mpc_mask.
 added a comment about prerequisite of lower_power_cpu_available().
 s/struct rq * rq/struct rq *rq/.]
Signed-off-by: Joonwoo Park <joonwoop@codeaurora.org>
commit 89e7c56a27 (parent 6ccfdccee9)
Author:    Syed Rameez Mustafa, 2015-05-28 00:11:01 -07:00
Committer: Joonwoo Park
3 changed files with 139 additions and 125 deletions


@@ -1222,6 +1222,9 @@ unsigned int min_capacity = 1024; /* min(rq->capacity) */
unsigned int max_load_scale_factor = 1024; /* max possible load scale factor */
unsigned int max_possible_capacity = 1024; /* max(rq->max_possible_capacity) */
/* Mask of all CPUs that have max_possible_capacity */
cpumask_t mpc_mask = CPU_MASK_ALL;
/* Window size (in ns) */
__read_mostly unsigned int sched_ravg_window = 10000000;
@@ -2545,8 +2548,13 @@ static int cpufreq_notifier_policy(struct notifier_block *nb,
mplsf = div_u64(((u64) rq->load_scale_factor) *
rq->max_possible_freq, rq->max_freq);
if (mpc > highest_mpc)
if (mpc > highest_mpc) {
highest_mpc = mpc;
cpumask_clear(&mpc_mask);
cpumask_set_cpu(i, &mpc_mask);
} else if (mpc == highest_mpc) {
cpumask_set_cpu(i, &mpc_mask);
}
if (mplsf > highest_mplsf)
highest_mplsf = mplsf;


@@ -1715,11 +1715,9 @@ static inline u64 cpu_load_sync(int cpu, int sync)
}
static int
spill_threshold_crossed(struct task_struct *p, struct rq *rq, int cpu,
int sync)
spill_threshold_crossed(u64 task_load, u64 cpu_load, struct rq *rq)
{
u64 total_load = cpu_load_sync(cpu, sync) +
scale_load_to_cpu(task_load(p), cpu);
u64 total_load = task_load + cpu_load;
if (total_load > sched_spill_load ||
(rq->nr_running + 1) > sysctl_sched_spill_nr_run)
@@ -1737,10 +1735,9 @@ int mostly_idle_cpu(int cpu)
&& !sched_cpu_high_irqload(cpu);
}
static int mostly_idle_cpu_sync(int cpu, int sync)
static int mostly_idle_cpu_sync(int cpu, u64 load, int sync)
{
struct rq *rq = cpu_rq(cpu);
u64 load = cpu_load_sync(cpu, sync);
int nr_running;
nr_running = rq->nr_running;
@@ -1838,14 +1835,12 @@ done:
* sched_downmigrate. This will help avoid frequenty migrations for
* tasks with load close to the upmigrate threshold
*/
static int task_will_fit(struct task_struct *p, int cpu)
static int task_load_will_fit(struct task_struct *p, u64 task_load, int cpu)
{
u64 load;
int prev_cpu = task_cpu(p);
struct rq *prev_rq = cpu_rq(prev_cpu);
struct rq *prev_rq = cpu_rq(task_cpu(p));
struct rq *rq = cpu_rq(cpu);
int upmigrate = sched_upmigrate;
int nice = TASK_NICE(p);
int upmigrate, nice;
if (rq->capacity == max_capacity)
return 1;
@@ -1854,33 +1849,39 @@ static int task_will_fit(struct task_struct *p, int cpu)
if (rq->capacity > prev_rq->capacity)
return 1;
} else {
nice = TASK_NICE(p);
if (nice > sched_upmigrate_min_nice || upmigrate_discouraged(p))
return 1;
load = scale_load_to_cpu(task_load(p), cpu);
upmigrate = sched_upmigrate;
if (prev_rq->capacity > rq->capacity)
upmigrate = sched_downmigrate;
if (load < upmigrate)
if (task_load < upmigrate)
return 1;
}
return 0;
}
static int eligible_cpu(struct task_struct *p, int cpu, int sync)
static int task_will_fit(struct task_struct *p, int cpu)
{
u64 tload = scale_load_to_cpu(task_load(p), cpu);
return task_load_will_fit(p, tload, cpu);
}
static int eligible_cpu(u64 task_load, u64 cpu_load, int cpu, int sync)
{
struct rq *rq = cpu_rq(cpu);
if (sched_cpu_high_irqload(cpu))
return 0;
if (mostly_idle_cpu_sync(cpu, sync))
if (mostly_idle_cpu_sync(cpu, cpu_load, sync))
return 1;
if (rq->max_possible_capacity != max_possible_capacity)
return !spill_threshold_crossed(p, rq, cpu, sync);
return !spill_threshold_crossed(task_load, cpu_load, rq);
return 0;
}
@@ -1935,22 +1936,23 @@ unsigned int power_cost_at_freq(int cpu, unsigned int freq)
/* Return the cost of running task p on CPU cpu. This function
* currently assumes that task p is the only task which will run on
* the CPU. */
static unsigned int power_cost(struct task_struct *p, int cpu)
static unsigned int power_cost(u64 task_load, int cpu)
{
unsigned int task_freq, cur_freq;
struct rq *rq = cpu_rq(cpu);
u64 demand;
unsigned int task_freq;
unsigned int cur_freq = cpu_rq(cpu)->cur_freq;
if (!sysctl_sched_enable_power_aware)
return cpu_rq(cpu)->max_possible_capacity;
return rq->max_possible_capacity;
/* calculate % of max freq needed */
demand = scale_load_to_cpu(task_load(p), cpu) * 100;
demand = task_load * 100;
demand = div64_u64(demand, max_task_load());
task_freq = demand * cpu_rq(cpu)->max_possible_freq;
task_freq = demand * rq->max_possible_freq;
task_freq /= 100; /* khz needed */
cur_freq = rq->cur_freq;
task_freq = max(cur_freq, task_freq);
return power_cost_at_freq(cpu, task_freq);
@@ -1958,94 +1960,79 @@ static unsigned int power_cost(struct task_struct *p, int cpu)
static int best_small_task_cpu(struct task_struct *p, int sync)
{
int best_busy_cpu = -1, best_fallback_cpu = -1;
int best_mi_cpu = -1;
int min_cost_cpu = -1, min_cstate_cpu = -1;
int best_busy_cpu = -1, fallback_cpu = -1;
int min_cstate_cpu = -1;
int min_cstate = INT_MAX;
int min_fallback_cpu_cost = INT_MAX;
int min_cost = INT_MAX;
int i, cstate, cpu_cost;
u64 load, min_busy_load = ULLONG_MAX;
int cost_list[nr_cpu_ids];
int prev_cpu = task_cpu(p);
struct cpumask search_cpus;
int cpu_cost, min_cost = INT_MAX;
int i, cstate, prev_cpu;
int hmp_capable;
u64 tload, cpu_load, min_load = ULLONG_MAX;
cpumask_t mi_cpus = CPU_MASK_NONE;
cpumask_t temp;
cpumask_and(&search_cpus, tsk_cpus_allowed(p), cpu_online_mask);
cpumask_and(&temp, &mpc_mask, cpu_possible_mask);
hmp_capable = !cpumask_full(&temp);
if (cpumask_empty(&search_cpus))
return prev_cpu;
for_each_cpu_and(i, tsk_cpus_allowed(p), cpu_online_mask) {
struct rq *rq = cpu_rq(i);
prev_cpu = (i == task_cpu(p));
/* Take a first pass to find the lowest power cost CPU. This
will avoid a potential O(n^2) search */
for_each_cpu(i, &search_cpus) {
trace_sched_cpu_load(cpu_rq(i), idle_cpu(i),
mostly_idle_cpu_sync(i, sync),
sched_irqload(i), power_cost(p, i),
trace_sched_cpu_load(rq, idle_cpu(i),
mostly_idle_cpu_sync(i,
cpu_load_sync(i, sync), sync),
sched_irqload(i),
power_cost(scale_load_to_cpu(task_load(p),
i), i),
cpu_temp(i));
cpu_cost = power_cost(p, i);
if (cpu_cost < min_cost ||
(cpu_cost == min_cost && i == prev_cpu)) {
min_cost = cpu_cost;
min_cost_cpu = i;
}
cost_list[i] = cpu_cost;
}
/*
* Optimization to steer task towards the minimum power cost
* CPU if it's the task's previous CPU. The tradeoff is that
* we may have to check the same information again in pass 2.
*/
if (!cpu_rq(min_cost_cpu)->cstate &&
mostly_idle_cpu_sync(min_cost_cpu, sync) &&
!sched_cpu_high_irqload(min_cost_cpu) && min_cost_cpu == prev_cpu)
return min_cost_cpu;
for_each_cpu(i, &search_cpus) {
struct rq *rq = cpu_rq(i);
cstate = rq->cstate;
if (power_delta_exceeded(cost_list[i], min_cost)) {
if (cost_list[i] < min_fallback_cpu_cost ||
(cost_list[i] == min_fallback_cpu_cost &&
i == prev_cpu)) {
best_fallback_cpu = i;
min_fallback_cpu_cost = cost_list[i];
if (rq->max_possible_capacity == max_possible_capacity &&
hmp_capable) {
tload = scale_load_to_cpu(task_load(p), i);
cpu_cost = power_cost(tload, i);
if (cpu_cost < min_cost ||
(prev_cpu && cpu_cost == min_cost)) {
fallback_cpu = i;
min_cost = cpu_cost;
}
continue;
}
if (idle_cpu(i) && cstate && !sched_cpu_high_irqload(i)) {
if (sched_cpu_high_irqload(i))
continue;
/* Todo this can be optimized to avoid checking c-state
* and moving cstate assignment statement inside the if */
cstate = rq->cstate;
if (idle_cpu(i) && cstate) {
if (cstate < min_cstate ||
(cstate == min_cstate && i == prev_cpu)) {
(prev_cpu && cstate == min_cstate)) {
min_cstate_cpu = i;
min_cstate = cstate;
}
continue;
}
if (mostly_idle_cpu_sync(i, sync) &&
!sched_cpu_high_irqload(i)) {
if (best_mi_cpu == -1 || i == prev_cpu)
best_mi_cpu = i;
cpu_load = cpu_load_sync(i, sync);
if (mostly_idle_cpu_sync(i, cpu_load, sync)) {
if (prev_cpu)
return task_cpu(p);
cpumask_set_cpu(i, &mi_cpus);
continue;
}
load = cpu_load_sync(i, sync);
if (!spill_threshold_crossed(p, rq, i, sync)) {
if (load < min_busy_load ||
(load == min_busy_load && i == prev_cpu)) {
min_busy_load = load;
tload = scale_load_to_cpu(task_load(p), i);
if (!spill_threshold_crossed(tload, cpu_load, rq)) {
if (cpu_load < min_load ||
(prev_cpu && cpu_load == min_load)) {
min_load = cpu_load;
best_busy_cpu = i;
}
}
}
if (best_mi_cpu != -1)
return best_mi_cpu;
if (!cpumask_empty(&mi_cpus))
return cpumask_first(&mi_cpus);
if (min_cstate_cpu != -1)
return min_cstate_cpu;
@@ -2053,7 +2040,7 @@ static int best_small_task_cpu(struct task_struct *p, int sync)
if (best_busy_cpu != -1)
return best_busy_cpu;
return best_fallback_cpu;
return fallback_cpu;
}
#define UP_MIGRATION 1
@@ -2061,11 +2048,11 @@ static int best_small_task_cpu(struct task_struct *p, int sync)
#define EA_MIGRATION 3
#define IRQLOAD_MIGRATION 4
static int skip_cpu(struct task_struct *p, int cpu, int reason)
static int skip_cpu(struct task_struct *p, u64 task_load, int cpu, int reason)
{
struct rq *rq = cpu_rq(cpu);
struct rq *task_rq = task_rq(p);
int skip = 0;
int skip;
if (!reason)
return 0;
@@ -2084,14 +2071,15 @@ static int skip_cpu(struct task_struct *p, int cpu, int reason)
case EA_MIGRATION:
skip = rq->capacity < task_rq->capacity ||
power_cost(p, cpu) > power_cost(p, task_cpu(p));
power_cost(task_load, cpu) >
power_cost(task_load, task_cpu(p));
break;
case IRQLOAD_MIGRATION:
/* Purposely fall through */
default:
skip = (cpu == task_cpu(p));
skip = (rq == task_rq);
break;
}
@@ -2122,7 +2110,7 @@ static int select_packing_target(struct task_struct *p, int best_cpu)
/* Pick the first lowest power cpu as target */
for_each_cpu(i, &search_cpus) {
int cost = power_cost(p, i);
int cost = power_cost(scale_load_to_cpu(task_load(p), i), i);
if (cost < min_cost && !sched_cpu_high_irqload(i)) {
target = i;
@@ -2153,10 +2141,11 @@ static int select_best_cpu(struct task_struct *p, int target, int reason,
int sync)
{
int i, best_cpu = -1, fallback_idle_cpu = -1, min_cstate_cpu = -1;
int prev_cpu = task_cpu(p);
int prev_cpu;
int cpu_cost, min_cost = INT_MAX;
int min_idle_cost = INT_MAX, min_busy_cost = INT_MAX;
u64 load, min_load = ULLONG_MAX, min_fallback_load = ULLONG_MAX;
u64 tload, cpu_load;
u64 min_load = ULLONG_MAX, min_fallback_load = ULLONG_MAX;
int small_task = is_small_task(p);
int boost = sched_boost();
int cstate, min_cstate = INT_MAX;
@@ -2189,12 +2178,17 @@ static int select_best_cpu(struct task_struct *p, int target, int reason,
/* Todo : Optimize this loop */
for_each_cpu_and(i, tsk_cpus_allowed(p), cpu_online_mask) {
tload = scale_load_to_cpu(task_load(p), i);
cpu_load = cpu_load_sync(i, sync);
prev_cpu = (i == task_cpu(p));
trace_sched_cpu_load(cpu_rq(i), idle_cpu(i),
mostly_idle_cpu_sync(i, sync),
sched_irqload(i), power_cost(p, i),
mostly_idle_cpu_sync(i,
cpu_load_sync(i, sync), sync),
sched_irqload(i), power_cost(tload, i),
cpu_temp(i));
if (skip_cpu(p, i, reason))
if (skip_cpu(p, tload, i, reason))
continue;
/*
@@ -2202,14 +2196,13 @@ static int select_best_cpu(struct task_struct *p, int target, int reason,
* won't fit is our fallback if we can't find a CPU
* where the task will fit.
*/
if (!task_will_fit(p, i)) {
if (mostly_idle_cpu_sync(i, sync) &&
if (!task_load_will_fit(p, tload, i)) {
if (mostly_idle_cpu_sync(i, cpu_load, sync) &&
!sched_cpu_high_irqload(i)) {
load = cpu_load_sync(i, sync);
if (load < min_fallback_load ||
(load == min_fallback_load &&
i == prev_cpu)) {
min_fallback_load = load;
if (cpu_load < min_fallback_load ||
(cpu_load == min_fallback_load &&
prev_cpu)) {
min_fallback_load = cpu_load;
fallback_idle_cpu = i;
}
}
@@ -2220,7 +2213,7 @@ static int select_best_cpu(struct task_struct *p, int target, int reason,
if (prefer_idle == -1)
prefer_idle = cpu_rq(i)->prefer_idle;
if (!eligible_cpu(p, i, sync))
if (!eligible_cpu(tload, cpu_load, i, sync))
continue;
/*
@@ -2229,9 +2222,7 @@ static int select_best_cpu(struct task_struct *p, int target, int reason,
* spill.
*/
load = cpu_load_sync(i, sync);
cpu_cost = power_cost(p, i);
cstate = cpu_rq(i)->cstate;
cpu_cost = power_cost(tload, i);
/*
* If the task fits in a CPU in a lower power band, that
@@ -2261,6 +2252,8 @@ static int select_best_cpu(struct task_struct *p, int target, int reason,
*/
if (idle_cpu(i) || (sync && i == curr_cpu && prefer_idle &&
cpu_rq(i)->nr_running == 1)) {
cstate = cpu_rq(i)->cstate;
if (cstate > min_cstate)
continue;
@@ -2272,7 +2265,7 @@ static int select_best_cpu(struct task_struct *p, int target, int reason,
}
if (cpu_cost < min_idle_cost ||
(cpu_cost == min_idle_cost && i == prev_cpu)) {
(prev_cpu && cpu_cost == min_idle_cost)) {
min_idle_cost = cpu_cost;
min_cstate_cpu = i;
}
@@ -2284,11 +2277,11 @@ static int select_best_cpu(struct task_struct *p, int target, int reason,
* For CPUs that are not completely idle, pick one with the
* lowest load and break ties with power cost
*/
if (load > min_load)
if (cpu_load > min_load)
continue;
if (load < min_load) {
min_load = load;
if (cpu_load < min_load) {
min_load = cpu_load;
min_busy_cost = cpu_cost;
best_cpu = i;
continue;
@@ -2300,7 +2293,7 @@ static int select_best_cpu(struct task_struct *p, int target, int reason,
* more power efficient CPU option.
*/
if (cpu_cost < min_busy_cost ||
(cpu_cost == min_busy_cost && i == prev_cpu)) {
(prev_cpu && cpu_cost == min_busy_cost)) {
min_busy_cost = cpu_cost;
best_cpu = i;
}
@@ -2311,7 +2304,7 @@ static int select_best_cpu(struct task_struct *p, int target, int reason,
* best_cpu cannot have high irq load.
*/
if (min_cstate_cpu >= 0 && (prefer_idle > 0 || best_cpu < 0 ||
!mostly_idle_cpu_sync(best_cpu, sync)))
!mostly_idle_cpu_sync(best_cpu, min_load, sync)))
best_cpu = min_cstate_cpu;
done:
if (best_cpu < 0) {
@@ -2321,7 +2314,7 @@ done:
* prev_cpu. We may just benefit from having
* a hot cache.
*/
best_cpu = prev_cpu;
best_cpu = task_cpu(p);
else
best_cpu = fallback_idle_cpu;
}
@@ -2821,15 +2814,27 @@ static int lower_power_cpu_available(struct task_struct *p, int cpu)
{
int i;
int lowest_power_cpu = task_cpu(p);
int lowest_power = power_cost(p, task_cpu(p));
int lowest_power = power_cost(scale_load_to_cpu(task_load(p),
lowest_power_cpu), lowest_power_cpu);
struct cpumask search_cpus;
struct rq *rq = cpu_rq(cpu);
/*
* This function should be called only when task 'p' fits in the current
* CPU which can be ensured by task_will_fit() prior to this.
*/
cpumask_and(&search_cpus, tsk_cpus_allowed(p), cpu_online_mask);
cpumask_and(&search_cpus, &search_cpus, &rq->freq_domain_cpumask);
cpumask_clear_cpu(lowest_power_cpu, &search_cpus);
/* Is a lower-powered idle CPU available which will fit this task? */
for_each_cpu_and(i, tsk_cpus_allowed(p), cpu_online_mask) {
if (idle_cpu(i) && task_will_fit(p, i)) {
int idle_power_cost = power_cost(p, i);
if (idle_power_cost < lowest_power) {
for_each_cpu(i, &search_cpus) {
if (idle_cpu(i)) {
int cost =
power_cost(scale_load_to_cpu(task_load(p), i), i);
if (cost < lowest_power) {
lowest_power_cpu = i;
lowest_power = idle_power_cost;
lowest_power = cost;
}
}
}
@@ -2993,13 +2998,13 @@ static inline int find_new_hmp_ilb(int call_cpu, int type)
return 0;
}
static inline int power_cost(struct task_struct *p, int cpu)
static inline int power_cost(u64 task_load, int cpu)
{
return SCHED_POWER_SCALE;
}
static inline int
spill_threshold_crossed(struct task_struct *p, struct rq *rq, int cpu, int sync)
spill_threshold_crossed(u64 task_load, u64 cpu_load, struct rq *rq)
{
return 0;
}


@@ -869,6 +869,7 @@ extern unsigned int max_capacity;
extern unsigned int min_capacity;
extern unsigned int max_load_scale_factor;
extern unsigned int max_possible_capacity;
extern cpumask_t mpc_mask;
extern unsigned long capacity_scale_cpu_efficiency(int cpu);
extern unsigned long capacity_scale_cpu_freq(int cpu);
extern unsigned int sched_mostly_idle_load;