sched: per-cpu mostly_idle threshold

sched_mostly_idle_load and sched_mostly_idle_nr_run knobs help pack
tasks on cpus to some extent. In some cases, it may be desirable to
have different packing limits for different cpus. For example, pack to
a higher limit on high-performance cpus compared to power-efficient
cpus.

This patch removes the global mostly_idle tunables and makes them
per-cpu, thus letting task packing behavior be controlled in a
fine-grained manner.

Change-Id: Ifc254cda34b928eae9d6c342ce4c0f64e531e6c2
Signed-off-by: Srivatsa Vaddagiri <vatsa@codeaurora.org>
Srivatsa Vaddagiri 2014-11-04 15:25:50 +05:30
parent f0e281597c
commit ed7d7749e9
9 changed files with 176 additions and 65 deletions
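
As a rough illustration of the per-cpu control the commit message describes (not part of the patch itself), a userspace helper could write different packing limits to a high-performance and a power-efficient cpu through the new sysfs attributes; the cpu numbers and values below are hypothetical.

/*
 * Illustration only: set different packing limits per cpu via the
 * sysfs attributes added by this patch. Cpu numbers and limit values
 * are hypothetical examples.
 */
#include <stdio.h>

static int write_cpu_attr(int cpu, const char *attr, int val)
{
	char path[128];
	FILE *f;

	snprintf(path, sizeof(path),
		 "/sys/devices/system/cpu/cpu%d/%s", cpu, attr);
	f = fopen(path, "w");
	if (!f)
		return -1;
	fprintf(f, "%d\n", val);
	return fclose(f);
}

int main(void)
{
	/* pack more aggressively on a (hypothetical) high-performance cpu4 */
	write_cpu_attr(4, "sched_mostly_idle_nr_run", 5);
	write_cpu_attr(4, "sched_mostly_idle_load", 40);

	/* and less on a (hypothetical) power-efficient cpu0 */
	write_cpu_attr(0, "sched_mostly_idle_nr_run", 2);
	write_cpu_attr(0, "sched_mostly_idle_load", 10);
	return 0;
}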

View File

@ -564,23 +564,22 @@ both tasks and CPUs to aid in the placement of tasks.
The "mostly_idle" classification applies to CPUs. This
classification attempts to answer the following question: if a task
is put on this CPU, is it likely to be able to run soon? One
possible way to answer this question would be to just check whether
the CPU is idle or not. That may be too conservative however. The
CPU may be currently executing a very small task and could become
idle soon. Since the scheduler is tracking the demand of each task
it can make an educated guess as to whether a CPU will become idle
in the near future.
is put on this CPU, is it likely to be able to run with low contention for
bandwidth? One possible way to answer this question would be to just check
whether the CPU is idle or not. That may be too conservative however. The CPU
may be currently executing a very small task and could become idle soon. Since
the scheduler is tracking the demand of each task it can make an educated
guess as to whether a CPU will become idle in the near future.
There are two tunable parameters which are used to determine whether
a CPU is mostly idle:
/proc/sys/kernel/sched_mostly_idle_nr_run
/proc/sys/kernel/sched_mostly_idle_load
/sys/devices/system/cpu/cpuX/sched_mostly_idle_nr_run
/sys/devices/system/cpu/cpuX/sched_mostly_idle_load
If a CPU does not have more than sched_mostly_idle_nr_run runnable
tasks and is not more than sched_mostly_idle_load percent busy, it
is considered mostly idle.
Note that these tunables are per-cpu. If a CPU does not have more than
sched_mostly_idle_nr_run runnable tasks and is not more than
sched_mostly_idle_load percent busy, it is considered mostly idle.
- spill threshold
@ -1042,9 +1041,9 @@ IRQ_UPDATE
*** 7.1 sched_mostly_idle_nr_run
Appears at: /proc/sys/kernel/sched_mostly_idle_nr_run
Appears at: /sys/devices/system/cpu/cpuX/sched_mostly_idle_nr_run
Default value: 4
Default value: 3
If a CPU has this many runnable tasks (or less), it is considered
"mostly idle." A mostly idle CPU is a preferred destination for a
@ -1054,7 +1053,7 @@ than sched_mostly_idle_load percent busy.
*** 7.2 sched_mostly_idle_load
Appears at: /proc/sys/kernel/sched_mostly_idle_load
Appears at: /sys/devices/system/cpu/cpuX/sched_mostly_idle_load
Default value: 20
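
As a concrete reading of the rule above: a cpu counts as mostly idle only when both per-cpu thresholds hold. A minimal standalone sketch of that check (mirroring, in simplified form, the mostly_idle_cpu() change later in this commit; the numbers are examples only):

/*
 * Simplified illustration of the documented rule, not kernel code:
 * a cpu is "mostly idle" if it has no more than mostly_idle_nr_run
 * runnable tasks and is no more than mostly_idle_load percent busy.
 */
#include <stdio.h>

static int mostly_idle(int nr_running, int busy_pct,
		       int mostly_idle_nr_run, int mostly_idle_load)
{
	return nr_running <= mostly_idle_nr_run &&
	       busy_pct <= mostly_idle_load;
}

int main(void)
{
	/* defaults from the documentation: nr_run = 3, load = 20 */
	printf("%d\n", mostly_idle(2, 15, 3, 20));	/* 1: mostly idle */
	printf("%d\n", mostly_idle(4, 15, 3, 20));	/* 0: too many runnable tasks */
	return 0;
}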

View File

@ -168,6 +168,85 @@ static ssize_t show_crash_notes_size(struct device *dev,
static DEVICE_ATTR(crash_notes_size, 0400, show_crash_notes_size, NULL);
#endif
#ifdef CONFIG_SCHED_HMP
static ssize_t show_sched_mostly_idle_load(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct cpu *cpu = container_of(dev, struct cpu, dev);
ssize_t rc;
int cpunum;
int mostly_idle_pct;
cpunum = cpu->dev.id;
mostly_idle_pct = sched_get_cpu_mostly_idle_load(cpunum);
rc = snprintf(buf, PAGE_SIZE-2, "%d\n", mostly_idle_pct);
return rc;
}
static ssize_t __ref store_sched_mostly_idle_load(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t count)
{
struct cpu *cpu = container_of(dev, struct cpu, dev);
int cpuid = cpu->dev.id;
int mostly_idle_load, err;
err = kstrtoint(strstrip((char *)buf), 0, &mostly_idle_load);
if (err)
return err;
err = sched_set_cpu_mostly_idle_load(cpuid, mostly_idle_load);
if (err >= 0)
err = count;
return err;
}
static ssize_t show_sched_mostly_idle_nr_run(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct cpu *cpu = container_of(dev, struct cpu, dev);
ssize_t rc;
int cpunum;
int mostly_idle_nr_run;
cpunum = cpu->dev.id;
mostly_idle_nr_run = sched_get_cpu_mostly_idle_nr_run(cpunum);
rc = snprintf(buf, PAGE_SIZE-2, "%d\n", mostly_idle_nr_run);
return rc;
}
static ssize_t __ref store_sched_mostly_idle_nr_run(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t count)
{
struct cpu *cpu = container_of(dev, struct cpu, dev);
int cpuid = cpu->dev.id;
int mostly_idle_nr_run, err;
err = kstrtoint(strstrip((char *)buf), 0, &mostly_idle_nr_run);
if (err)
return err;
err = sched_set_cpu_mostly_idle_nr_run(cpuid, mostly_idle_nr_run);
if (err >= 0)
err = count;
return err;
}
static DEVICE_ATTR(sched_mostly_idle_load, 0664, show_sched_mostly_idle_load,
store_sched_mostly_idle_load);
static DEVICE_ATTR(sched_mostly_idle_nr_run, 0664,
show_sched_mostly_idle_nr_run, store_sched_mostly_idle_nr_run);
#endif
/*
* Print cpu online, possible, present, and system maps
*/
@ -337,6 +416,16 @@ int __cpuinit register_cpu(struct cpu *cpu, int num)
error = device_create_file(&cpu->dev,
&dev_attr_crash_notes_size);
#endif
#ifdef CONFIG_SCHED_HMP
if (!error)
error = device_create_file(&cpu->dev,
&dev_attr_sched_mostly_idle_load);
if (!error)
error = device_create_file(&cpu->dev,
&dev_attr_sched_mostly_idle_nr_run);
#endif
return error;
}

View File

@ -1913,9 +1913,15 @@ sched_set_cpu_cstate(int cpu, int cstate, int wakeup_energy, int wakeup_latency)
#endif
#ifdef CONFIG_SCHED_HMP
extern int sched_set_boost(int enable);
extern int sched_set_init_task_load(struct task_struct *p, int init_load_pct);
extern u32 sched_get_init_task_load(struct task_struct *p);
extern int sched_set_cpu_mostly_idle_load(int cpu, int mostly_idle_pct);
extern int sched_get_cpu_mostly_idle_load(int cpu);
extern int sched_set_cpu_mostly_idle_nr_run(int cpu, int nr_run);
extern int sched_get_cpu_mostly_idle_nr_run(int cpu);
#else
static inline int sched_set_boost(int enable)
{

View File

@ -55,9 +55,7 @@ extern int sysctl_sched_freq_dec_notify;
#ifdef CONFIG_SCHED_HMP
extern unsigned int sysctl_sched_spill_nr_run;
extern unsigned int sysctl_sched_mostly_idle_nr_run;
extern unsigned int sysctl_sched_spill_load_pct;
extern unsigned int sysctl_sched_mostly_idle_load_pct;
extern unsigned int sysctl_sched_small_task_pct;
extern unsigned int sysctl_sched_upmigrate_pct;
extern unsigned int sysctl_sched_downmigrate_pct;

View File

@ -2026,6 +2026,7 @@ void reset_all_window_stats(u64 window_start, unsigned int window_size)
u64 start_ts = sched_clock();
int reason = WINDOW_CHANGE;
unsigned int old = 0, new = 0;
unsigned int old_window_size = sched_ravg_window;
disable_window_stats();
@ -2048,8 +2049,13 @@ void reset_all_window_stats(u64 window_start, unsigned int window_size)
for_each_possible_cpu(cpu) {
struct rq *rq = cpu_rq(cpu);
if (window_start)
if (window_start) {
u32 mostly_idle_load = rq->mostly_idle_load;
rq->window_start = window_start;
rq->mostly_idle_load = div64_u64((u64)mostly_idle_load *
(u64)sched_ravg_window, (u64)old_window_size);
}
#ifdef CONFIG_SCHED_FREQ_INPUT
rq->curr_runnable_sum = rq->prev_runnable_sum = 0;
#endif
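
The window_start hunk above also rescales the stored mostly_idle_load when the stat window length changes, presumably because the absolute threshold is derived from max_task_load(), which tracks the window size; the rescale keeps the configured percentage constant (this assumes sched_ravg_window already holds the new length at that point). A standalone sketch of the same arithmetic with made-up values:

/*
 * Illustration only: proportional rescale of an absolute threshold
 * when the accounting window changes, so the configured percentage
 * stays the same. All values are invented for the example.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t old_window = 10000000ULL;	/* 10 ms, hypothetical */
	uint64_t new_window = 20000000ULL;	/* 20 ms, hypothetical */
	uint64_t threshold  = 2000000ULL;	/* 20% of the old window */

	/* same operation as the div64_u64() call in the hunk above */
	threshold = threshold * new_window / old_window;

	printf("%llu\n", (unsigned long long)threshold);	/* 4000000: still 20% */
	return 0;
}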
@ -8984,6 +8990,8 @@ void __init sched_init(void)
rq->window_start = 0;
rq->nr_small_tasks = rq->nr_big_tasks = 0;
rq->hmp_flags = 0;
rq->mostly_idle_load = pct_to_real(20);
rq->mostly_idle_nr_run = 3;
#ifdef CONFIG_SCHED_FREQ_INPUT
rq->old_busy_time = 0;
rq->curr_runnable_sum = rq->prev_runnable_sum = 0;

View File

@ -306,6 +306,8 @@ do { \
P(cpu_power);
#endif
#ifdef CONFIG_SCHED_HMP
P(mostly_idle_load);
P(mostly_idle_nr_run);
P(load_scale_factor);
P(capacity);
P(max_possible_capacity);
@ -398,7 +400,6 @@ static void sched_debug_header(struct seq_file *m)
P(sysctl_sched_child_runs_first);
P(sysctl_sched_features);
#ifdef CONFIG_SCHED_HMP
P(sched_mostly_idle_load);
P(sched_small_task);
P(sched_upmigrate);
P(sched_downmigrate);

View File

@ -1255,14 +1255,6 @@ unsigned int __read_mostly sched_enable_hmp = 0;
*/
unsigned int __read_mostly sysctl_sched_spill_nr_run = 10;
/*
* A cpu is considered practically idle, if:
*
* rq->nr_running <= sysctl_sched_mostly_idle_nr_run &&
* rq->cumulative_runnable_avg <= sched_mostly_idle_load
*/
unsigned int __read_mostly sysctl_sched_mostly_idle_nr_run = 3;
/*
* Control whether or not individual CPU power consumption is used to
* guide task placement.
@ -1276,16 +1268,6 @@ unsigned int __read_mostly sched_enable_power_aware = 0;
*/
unsigned int __read_mostly sysctl_sched_powerband_limit_pct = 20;
/*
* Conversion of *_pct to absolute form is based on max_task_load().
*
* For example:
* sched_mostly_idle_load =
* (sysctl_sched_mostly_idle_load_pct * max_task_load()) / 100;
*/
unsigned int __read_mostly sched_mostly_idle_load;
unsigned int __read_mostly sysctl_sched_mostly_idle_load_pct = 20;
/*
* CPUs with load greater than the sched_spill_load_threshold are not
* eligible for task placement. When all CPUs in a cluster achieve a
@ -1352,17 +1334,11 @@ static inline int available_cpu_capacity(int cpu)
return rq->capacity;
}
#define pct_to_real(tunable) \
(div64_u64((u64)tunable * (u64)max_task_load(), 100))
void set_hmp_defaults(void)
{
sched_spill_load =
pct_to_real(sysctl_sched_spill_load_pct);
sched_mostly_idle_load =
pct_to_real(sysctl_sched_mostly_idle_load_pct);
sched_small_task =
pct_to_real(sysctl_sched_small_task_pct);
@ -1401,6 +1377,44 @@ int sched_set_init_task_load(struct task_struct *p, int init_load_pct)
return 0;
}
int sched_set_cpu_mostly_idle_load(int cpu, int mostly_idle_pct)
{
struct rq *rq = cpu_rq(cpu);
if (mostly_idle_pct < 0 || mostly_idle_pct > 100)
return -EINVAL;
rq->mostly_idle_load = pct_to_real(mostly_idle_pct);
return 0;
}
int sched_get_cpu_mostly_idle_load(int cpu)
{
struct rq *rq = cpu_rq(cpu);
int mostly_idle_pct;
mostly_idle_pct = real_to_pct(rq->mostly_idle_load);
return mostly_idle_pct;
}
int sched_set_cpu_mostly_idle_nr_run(int cpu, int nr_run)
{
struct rq *rq = cpu_rq(cpu);
rq->mostly_idle_nr_run = nr_run;
return 0;
}
int sched_get_cpu_mostly_idle_nr_run(int cpu)
{
struct rq *rq = cpu_rq(cpu);
return rq->mostly_idle_nr_run;
}
/*
* 'load' is in reference to "best cpu" at its best frequency.
* Scale that in reference to a given cpu, accounting for how bad it is
@ -1462,9 +1476,12 @@ spill_threshold_crossed(struct task_struct *p, struct rq *rq, int cpu)
int mostly_idle_cpu(int cpu)
{
struct rq *rq = cpu_rq(cpu);
int mostly_idle;
return (cpu_load(cpu) <= sched_mostly_idle_load
&& rq->nr_running <= sysctl_sched_mostly_idle_nr_run);
mostly_idle = (cpu_load(cpu) <= rq->mostly_idle_load
&& rq->nr_running <= rq->mostly_idle_nr_run);
return mostly_idle;
}
static int boost_refcount;
@ -1988,10 +2005,9 @@ int sched_hmp_proc_update_handler(struct ctl_table *table, int write,
return ret;
if ((sysctl_sched_downmigrate_pct > sysctl_sched_upmigrate_pct) ||
(sysctl_sched_mostly_idle_load_pct >
sysctl_sched_spill_load_pct) || *data > 100) {
*data = old_val;
return -EINVAL;
*data > 100) {
*data = old_val;
return -EINVAL;
}
/*
@ -7138,8 +7154,8 @@ static inline int _nohz_kick_needed_hmp(struct rq *rq, int cpu, int *type)
int i;
if (rq->nr_running >= 2 && (rq->nr_running - rq->nr_small_tasks >= 2 ||
rq->nr_running > sysctl_sched_mostly_idle_nr_run ||
cpu_load(cpu) > sched_mostly_idle_load)) {
rq->nr_running > rq->mostly_idle_nr_run ||
cpu_load(cpu) > rq->mostly_idle_load)) {
if (rq->capacity == max_capacity)
return 1;

View File

@ -497,6 +497,8 @@ struct rq {
int capacity;
int max_possible_capacity;
u64 window_start;
u32 mostly_idle_load;
int mostly_idle_nr_run;
#ifdef CONFIG_SCHED_FREQ_INPUT
unsigned int old_busy_time;
@ -749,6 +751,12 @@ dec_cumulative_runnable_avg(struct rq *rq, struct task_struct *p)
BUG_ON((s64)rq->cumulative_runnable_avg < 0);
}
#define pct_to_real(tunable) \
(div64_u64((u64)tunable * (u64)max_task_load(), 100))
#define real_to_pct(tunable) \
(div64_u64((u64)tunable * (u64)100, (u64)max_task_load()))
#else /* CONFIG_SCHED_HMP */
static inline int pct_task_load(struct task_struct *p) { return 0; }
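
Worth noting: the new sysfs store path converts the written percentage with pct_to_real() while the show path converts back with real_to_pct(), so a value read back should match what was written, up to integer rounding. A small standalone sketch of that round trip, with an invented max_task_load value:

/*
 * Illustration of the pct <-> absolute-load round trip. The
 * max_task_load value is invented for the example.
 */
#include <stdint.h>
#include <stdio.h>

static const uint64_t max_task_load = 1024000ULL;	/* hypothetical */

static uint64_t pct_to_real(uint64_t pct)
{
	return pct * max_task_load / 100;
}

static uint64_t real_to_pct(uint64_t load)
{
	return load * 100 / max_task_load;
}

int main(void)
{
	uint64_t stored = pct_to_real(20);	/* what the store handler keeps */

	/* the show handler would report 20 again */
	printf("%llu\n", (unsigned long long)real_to_pct(stored));
	return 0;
}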

View File

@ -367,20 +367,6 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = sched_hmp_proc_update_handler,
},
{
.procname = "sched_mostly_idle_load",
.data = &sysctl_sched_mostly_idle_load_pct,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = sched_hmp_proc_update_handler,
},
{
.procname = "sched_mostly_idle_nr_run",
.data = &sysctl_sched_mostly_idle_nr_run,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "sched_spill_load",
.data = &sysctl_sched_spill_load_pct,