sched: use ktime instead of sched_clock for load tracking

At present, the HMP scheduler uses sched_clock to set up the window
boundary so that it is aligned with the timer interrupt, which ensures
the timer interrupt fires after the window rollover.  However, this
alignment does not last, since the timer rearms its next expiry based
on time measured by ktime, which is not coupled with sched_clock.

Convert sched_clock to ktime to avoid a wallclock discrepancy between
the scheduler and the timer, so that the scheduler's window boundary
stays aligned with the timer.

CRs-fixed: 933330
Change-Id: I4108819a4382f725b3ce6075eb46aab0cf670b7e
Signed-off-by: Joonwoo Park <joonwoop@codeaurora.org>
[pkondeti@codeaurora.org: resolved trivial merge conflicts]
Signed-off-by: Pavankumar Kondeti <pkondeti@codeaurora.org>
Author:    Joonwoo Park, 2015-11-24 14:33:26 -08:00
Committer: Pavankumar Kondeti
Parent:    244d90076c
Commit:    b136867440
5 changed files with 84 additions and 49 deletions
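
The rationale above is easier to follow with the clock mismatch made
concrete: the window boundary and the timer that is expected to fire
after it must be derived from the same clock.  The sketch below is not
part of the patch; it is a userspace illustration that uses
CLOCK_MONOTONIC (the clock behind ktime_get()) for both the window
boundary and the timer expiry, so the timer cannot fire before the
window rolls over.  WINDOW_NS, mono_ns() and window_start_ns are
illustrative names only.

/*
 * Userspace sketch only: derive the window boundary and the timer expiry
 * from the same clock so the wakeup never lands before the rollover.
 */
#include <stdint.h>
#include <stdio.h>
#include <time.h>

#define WINDOW_NS 10000000ULL	/* 10 ms window, standing in for the ravg window */

static uint64_t mono_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}

int main(void)
{
	uint64_t now = mono_ns();
	/* Align the window start to the clock that also drives the timer. */
	uint64_t window_start_ns = now - (now % WINDOW_NS);
	uint64_t next_rollover_ns = window_start_ns + WINDOW_NS;
	struct timespec expiry = {
		.tv_sec = next_rollover_ns / 1000000000ULL,
		.tv_nsec = (long)(next_rollover_ns % 1000000000ULL),
	};

	/* Arm an absolute timer on the same clock: it expires at or after rollover. */
	clock_nanosleep(CLOCK_MONOTONIC, TIMER_ABSTIME, &expiry, NULL);
	printf("woke at %llu ns, rollover at %llu ns\n",
	       (unsigned long long)mono_ns(),
	       (unsigned long long)next_rollover_ns);
	return 0;
}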

@@ -2065,6 +2065,7 @@ extern u64 cpu_clock(int cpu);
extern u64 local_clock(void);
extern u64 sched_clock_cpu(int cpu);
extern u64 sched_ktime_clock(void);
extern void sched_clock_init(void);
extern int sched_clock_initialized(void);

@@ -73,6 +73,7 @@ struct tick_sched {
extern void __init tick_init(void);
extern int tick_is_oneshot_available(void);
extern u64 jiffy_to_sched_clock(u64 *now, u64 *jiffy_sched_clock);
extern u64 jiffy_to_ktime_ns(u64 *now, u64 *jiffy_ktime_ns);
extern struct tick_device *tick_get_device(int cpu);
# ifdef CONFIG_HIGH_RES_TIMERS

@@ -74,6 +74,7 @@
#include <linux/binfmts.h>
#include <linux/context_tracking.h>
#include <linux/cpufreq.h>
#include <linux/syscore_ops.h>
#include <asm/switch_to.h>
#include <asm/tlb.h>
@@ -842,6 +843,47 @@ void resched_task(struct task_struct *p)
}
#endif /* CONFIG_SMP */
#ifdef CONFIG_SCHED_HMP
static ktime_t ktime_last;
static bool sched_ktime_suspended;
u64 sched_ktime_clock(void)
{
if (unlikely(sched_ktime_suspended))
return ktime_to_ns(ktime_last);
return ktime_to_ns(ktime_get());
}
static void sched_resume(void)
{
sched_ktime_suspended = false;
}
static int sched_suspend(void)
{
ktime_last = ktime_get();
sched_ktime_suspended = true;
return 0;
}
static struct syscore_ops sched_syscore_ops = {
.resume = sched_resume,
.suspend = sched_suspend
};
static int __init sched_init_ops(void)
{
register_syscore_ops(&sched_syscore_ops);
return 0;
}
late_initcall(sched_init_ops);
#else
u64 sched_ktime_clock(void)
{
return 0;
}
#endif /* CONFIG_SCHED_HMP */
#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
(defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
/*
@@ -1293,7 +1335,7 @@ static void _set_preferred_cluster(struct related_thread_group *grp)
* at same time. Avoid overhead in such cases of rechecking preferred
* cluster
*/
if (sched_clock() - grp->last_update <
if (sched_ktime_clock() - grp->last_update <
sched_grp_min_cluster_update_delta)
return;
@@ -1303,7 +1345,7 @@ static void _set_preferred_cluster(struct related_thread_group *grp)
}
grp->preferred_cluster = best_cluster(grp, combined_demand);
grp->last_update = sched_clock();
grp->last_update = sched_ktime_clock();
trace_sched_set_preferred_cluster(grp, combined_demand);
}
@@ -2304,16 +2346,20 @@ void sched_account_irqtime(int cpu, struct task_struct *curr,
{
struct rq *rq = cpu_rq(cpu);
unsigned long flags, nr_windows;
u64 cur_jiffies_ts, now;
u64 cur_jiffies_ts;
raw_spin_lock_irqsave(&rq->lock, flags);
now = sched_clock();
delta += (now - wallclock);
/*
* cputime (wallclock) uses sched_clock so use the same here for
* consistency.
*/
delta += sched_clock() - wallclock;
cur_jiffies_ts = get_jiffies_64();
if (is_idle_task(curr))
update_task_ravg(curr, rq, IRQ_UPDATE, now, delta);
update_task_ravg(curr, rq, IRQ_UPDATE, sched_ktime_clock(),
delta);
nr_windows = cur_jiffies_ts - rq->irqload_ts;
@@ -2350,13 +2396,14 @@ static void reset_task_stats(struct task_struct *p)
static inline void mark_task_starting(struct task_struct *p)
{
struct rq *rq = task_rq(p);
u64 wallclock = sched_clock();
u64 wallclock;
if (!rq->window_start || sched_disable_window_stats) {
reset_task_stats(p);
return;
}
wallclock = sched_ktime_clock();
p->ravg.mark_start = wallclock;
}
@@ -2365,12 +2412,11 @@ static inline void set_window_start(struct rq *rq)
int cpu = cpu_of(rq);
struct rq *sync_rq = cpu_rq(sync_cpu);
if (rq->window_start || !sched_enable_hmp ||
!sched_clock_initialized() || !sched_clock_cpu(cpu))
if (rq->window_start || !sched_enable_hmp)
return;
if (cpu == sync_cpu) {
rq->window_start = sched_clock();
rq->window_start = sched_ktime_clock();
} else {
raw_spin_unlock(&rq->lock);
double_rq_lock(rq, sync_rq);
@@ -2424,7 +2470,7 @@ void sched_exit(struct task_struct *p)
raw_spin_lock_irqsave(&rq->lock, flags);
/* rq->curr == p */
wallclock = sched_clock();
wallclock = sched_ktime_clock();
update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
dequeue_task(rq, p, 0);
reset_task_stats(p);
@@ -2482,7 +2528,7 @@ void reset_all_window_stats(u64 window_start, unsigned int window_size)
{
int cpu;
unsigned long flags;
u64 start_ts = sched_clock();
u64 start_ts = sched_ktime_clock();
int reason = WINDOW_CHANGE;
unsigned int old = 0, new = 0;
unsigned int old_window_size = sched_ravg_window;
@@ -2563,7 +2609,7 @@ void reset_all_window_stats(u64 window_start, unsigned int window_size)
local_irq_restore(flags);
trace_sched_reset_all_window_stats(window_start, window_size,
sched_clock() - start_ts, reason, old, new);
sched_ktime_clock() - start_ts, reason, old, new);
}
#ifdef CONFIG_SCHED_FREQ_INPUT
@@ -2586,7 +2632,7 @@ unsigned long sched_get_busy(int cpu)
* that the window stats are current by doing an update.
*/
raw_spin_lock_irqsave(&rq->lock, flags);
update_task_ravg(rq->curr, rq, TASK_UPDATE, sched_clock(), 0);
update_task_ravg(rq->curr, rq, TASK_UPDATE, sched_ktime_clock(), 0);
load = rq->old_busy_time = rq->prev_runnable_sum;
/*
@@ -2628,7 +2674,7 @@ void sched_set_io_is_busy(int val)
int sched_set_window(u64 window_start, unsigned int window_size)
{
u64 now, cur_jiffies, jiffy_sched_clock;
u64 now, cur_jiffies, jiffy_ktime_ns;
s64 ws;
unsigned long flags;
@@ -2638,23 +2684,25 @@ int sched_set_window(u64 window_start, unsigned int window_size)
mutex_lock(&policy_mutex);
/* Get a consistent view of sched_clock, jiffies, and the time
* since the last jiffy (based on last_jiffies_update). */
/*
* Get a consistent view of ktime, jiffies, and the time
* since the last jiffy (based on last_jiffies_update).
*/
local_irq_save(flags);
cur_jiffies = jiffy_to_sched_clock(&now, &jiffy_sched_clock);
cur_jiffies = jiffy_to_ktime_ns(&now, &jiffy_ktime_ns);
local_irq_restore(flags);
/* translate window_start from jiffies to nanoseconds */
ws = (window_start - cur_jiffies); /* jiffy difference */
ws *= TICK_NSEC;
ws += jiffy_sched_clock;
ws += jiffy_ktime_ns;
/* roll back calculated window start so that it is in
* the past (window stats must have a current window) */
while (ws > now)
ws -= (window_size * TICK_NSEC);
BUG_ON(sched_clock() < ws);
BUG_ON(sched_ktime_clock() < ws);
reset_all_window_stats(ws, window_size);
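
For reference, the jiffy-to-nanosecond translation above can be checked
with a standalone worked example.  This is not kernel code: TICK_NS and
all of the snapshot values below are made-up numbers, chosen only to
show how window_start, given in jiffies, is mapped onto the ktime
timeline and then rolled back until it lies in the past.

#include <stdint.h>
#include <stdio.h>

#define TICK_NS 10000000LL	/* 10 ms per jiffy, i.e. HZ = 100 */

int main(void)
{
	int64_t cur_jiffies = 1000;		/* jiffy count at the snapshot */
	int64_t jiffy_ktime_ns = 12000000000LL;	/* ktime (ns) at that jiffy    */
	int64_t now = 12003000000LL;		/* ktime (ns) right now        */
	int64_t window_start = 1002;		/* requested start, in jiffies */
	int64_t window_size = 2;		/* window length, in jiffies   */

	/* Translate the requested jiffy into the ktime domain. */
	int64_t ws = (window_start - cur_jiffies) * TICK_NS + jiffy_ktime_ns;

	/* Roll the window start back until it lies in the past. */
	while (ws > now)
		ws -= window_size * TICK_NS;

	/* Prints 12000000000, i.e. one window (20 ms) before the requested start. */
	printf("window starts at %lld ns (now = %lld ns)\n",
	       (long long)ws, (long long)now);
	return 0;
}
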
@@ -2679,7 +2727,7 @@ static void fixup_busy_time(struct task_struct *p, int new_cpu)
if (sched_disable_window_stats)
goto done;
wallclock = sched_clock();
wallclock = sched_ktime_clock();
update_task_ravg(task_rq(p)->curr, task_rq(p),
TASK_UPDATE,
@@ -3004,7 +3052,8 @@ static int cpufreq_notifier_trans(struct notifier_block *nb,
for_each_cpu_mask(i, cluster->cpus) {
struct rq *rq = cpu_rq(i);
raw_spin_lock_irqsave(&rq->lock, flags);
update_task_ravg(rq->curr, rq, TASK_UPDATE, sched_clock(), 0);
update_task_ravg(rq->curr, rq, TASK_UPDATE,
sched_ktime_clock(), 0);
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
@@ -3069,7 +3118,8 @@ static inline int update_preferred_cluster(struct related_thread_group *grp,
* has passed since we last updated preference
*/
if (abs(new_load - old_load) > sched_grp_min_task_load_delta ||
sched_clock() - p->grp->last_update > sched_ravg_window)
sched_ktime_clock() - p->grp->last_update >
sched_ravg_window)
return 1;
return 0;
@@ -3147,7 +3197,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
trace_sched_migrate_task(p, new_cpu, pct_task_load(p));
note_run_start(p, sched_clock());
note_run_start(p, sched_ktime_clock());
if (task_cpu(p) != new_cpu) {
struct task_migration_notifier tmn;
@@ -3687,7 +3737,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
raw_spin_lock(&rq->lock);
old_load = task_load(p);
grp = task_related_thread_group(p);
wallclock = sched_clock();
wallclock = sched_ktime_clock();
update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
heavy_task = heavy_task_wakeup(p, rq, TASK_WAKE);
update_task_ravg(p, rq, TASK_WAKE, wallclock, 0);
@@ -3782,7 +3832,7 @@ static void try_to_wake_up_local(struct task_struct *p)
goto out;
if (!p->on_rq) {
u64 wallclock = sched_clock();
u64 wallclock = sched_ktime_clock();
update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
update_task_ravg(p, rq, TASK_WAKE, wallclock, 0);
@@ -5146,7 +5196,7 @@ void scheduler_tick(void)
update_rq_clock(rq);
update_cpu_load_active(rq);
curr->sched_class->task_tick(rq, curr, 0);
update_task_ravg(rq->curr, rq, TASK_UPDATE, sched_clock(), 0);
update_task_ravg(rq->curr, rq, TASK_UPDATE, sched_ktime_clock(), 0);
raw_spin_unlock(&rq->lock);
perf_event_task_tick();
@@ -5422,7 +5472,7 @@ need_resched:
put_prev_task(rq, prev);
next = pick_next_task(rq);
wallclock = sched_clock();
wallclock = sched_ktime_clock();
if (!prev->on_rq)
task_note_last_sleep(prev, wallclock);
update_task_ravg(prev, rq, PUT_PREV_TASK, wallclock, 0);

@@ -3109,7 +3109,7 @@ static inline int is_cpu_throttling_imminent(int cpu)
static inline int is_task_migration_throttled(struct task_struct *p)
{
u64 delta = sched_clock() - p->run_start;
u64 delta = sched_ktime_clock() - p->run_start;
return delta < sched_min_runtime;
}

@@ -45,32 +45,15 @@ DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched);
*/
static ktime_t last_jiffies_update;
/*
* Conversion from ktime to sched_clock is error prone. Use this
* as a safetly margin when calculating the sched_clock value at
* a particular jiffy as last_jiffies_update uses ktime.
*/
#define SCHED_CLOCK_MARGIN 100000
static u64 ns_since_jiffy(void)
{
ktime_t delta;
delta = ktime_sub(ktime_get(), last_jiffies_update);
return ktime_to_ns(delta);
}
u64 jiffy_to_sched_clock(u64 *now, u64 *jiffy_sched_clock)
u64 jiffy_to_ktime_ns(u64 *now, u64 *jiffy_ktime_ns)
{
u64 cur_jiffies;
unsigned long seq;
do {
seq = read_seqbegin(&jiffies_lock);
*now = sched_clock();
*jiffy_sched_clock = *now -
(ns_since_jiffy() + SCHED_CLOCK_MARGIN);
*now = ktime_to_ns(ktime_get());
*jiffy_ktime_ns = ktime_to_ns(last_jiffies_update);
cur_jiffies = get_jiffies_64();
} while (read_seqretry(&jiffies_lock, seq));
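
The loop above depends on the jiffies_lock seqlock so that now,
last_jiffies_update and the jiffy count are captured as one consistent
snapshot.  A rough userspace sketch of that reader/retry pattern is
shown below; the simplified writer, the sequence counter and every name
in it are illustrative, not the kernel implementation.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static _Atomic unsigned int seq;		/* even: stable, odd: write in flight */
static uint64_t jiffies64 = 1000;		/* protected: jiffy counter           */
static uint64_t jiffy_time_ns = 1000000;	/* protected: time (ns) of that jiffy */

static void write_update(uint64_t new_jiffies, uint64_t new_time_ns)
{
	atomic_fetch_add_explicit(&seq, 1, memory_order_release);	/* -> odd  */
	jiffies64 = new_jiffies;
	jiffy_time_ns = new_time_ns;
	atomic_fetch_add_explicit(&seq, 1, memory_order_release);	/* -> even */
}

static void read_snapshot(uint64_t *jiffies_out, uint64_t *time_ns_out)
{
	unsigned int start;

	do {
		/* Wait until no writer is in the middle of an update. */
		do {
			start = atomic_load_explicit(&seq, memory_order_acquire);
		} while (start & 1);

		*jiffies_out = jiffies64;
		*time_ns_out = jiffy_time_ns;

		/* Retry if the sequence moved while we were reading. */
	} while (atomic_load_explicit(&seq, memory_order_acquire) != start);
}

int main(void)
{
	uint64_t j, t;

	write_update(1001, 11000000);
	read_snapshot(&j, &t);
	printf("jiffy %llu was at %llu ns\n",
	       (unsigned long long)j, (unsigned long long)t);
	return 0;
}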