From ad2b13536ace08dfcca4cf86b75a5d06efe06373 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 5 Mar 2013 15:14:05 +0100 Subject: [PATCH 01/52] tick: Call tick_init late To convert the clockevents code to cpumask_var_t we need to move the init call after the allocator setup. The earliest clockevents are registered from time_init(), as they need interrupts to be set up, so this is safe. Signed-off-by: Thomas Gleixner Link: http://lkml.kernel.org/r/20130306111537.304379448@linutronix.de Cc: Rusty Russell --- init/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/init/main.c b/init/main.c index 63534a141b4e..b3e061428545 100644 --- a/init/main.c +++ b/init/main.c @@ -494,7 +494,6 @@ asmlinkage void __init start_kernel(void) * Interrupts are still disabled. Do necessary setups, then * enable them */ - tick_init(); boot_cpu_init(); page_address_init(); printk(KERN_NOTICE "%s", linux_banner); @@ -551,6 +550,7 @@ asmlinkage void __init start_kernel(void) /* init some links before init_ISA_irqs() */ early_irq_init(); init_IRQ(); + tick_init(); init_timers(); hrtimers_init(); softirq_init(); From b352bc1cbc29134a356b5c16ee2281807a7b984e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 5 Mar 2013 14:25:32 +0100 Subject: [PATCH 02/52] tick: Convert broadcast cpu bitmaps to cpumask_var_t Signed-off-by: Thomas Gleixner Link: http://lkml.kernel.org/r/20130306111537.366394000@linutronix.de Cc: Rusty Russell --- kernel/time/tick-broadcast.c | 86 ++++++++++++++++++------------------ kernel/time/tick-common.c | 1 + kernel/time/tick-internal.h | 3 +- 3 files changed, 46 insertions(+), 44 deletions(-) diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 2fb8cb88df8d..35b887517766 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -28,9 +28,8 @@ */ static struct tick_device tick_broadcast_device; -/* FIXME: Use cpumask_var_t.
*/ -static DECLARE_BITMAP(tick_broadcast_mask, NR_CPUS); -static DECLARE_BITMAP(tmpmask, NR_CPUS); +static cpumask_var_t tick_broadcast_mask; +static cpumask_var_t tmpmask; static DEFINE_RAW_SPINLOCK(tick_broadcast_lock); static int tick_broadcast_force; @@ -50,7 +49,7 @@ struct tick_device *tick_get_broadcast_device(void) struct cpumask *tick_get_broadcast_mask(void) { - return to_cpumask(tick_broadcast_mask); + return tick_broadcast_mask; } /* @@ -74,7 +73,7 @@ int tick_check_broadcast_device(struct clock_event_device *dev) clockevents_exchange_device(tick_broadcast_device.evtdev, dev); tick_broadcast_device.evtdev = dev; - if (!cpumask_empty(tick_get_broadcast_mask())) + if (!cpumask_empty(tick_broadcast_mask)) tick_broadcast_start_periodic(dev); return 1; } @@ -123,7 +122,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) if (!tick_device_is_functional(dev)) { dev->event_handler = tick_handle_periodic; tick_device_setup_broadcast_func(dev); - cpumask_set_cpu(cpu, tick_get_broadcast_mask()); + cpumask_set_cpu(cpu, tick_broadcast_mask); tick_broadcast_start_periodic(tick_broadcast_device.evtdev); ret = 1; } else { @@ -134,7 +133,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) */ if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) { int cpu = smp_processor_id(); - cpumask_clear_cpu(cpu, tick_get_broadcast_mask()); + cpumask_clear_cpu(cpu, tick_broadcast_mask); tick_broadcast_clear_oneshot(cpu); } else { tick_device_setup_broadcast_func(dev); @@ -198,9 +197,8 @@ static void tick_do_periodic_broadcast(void) { raw_spin_lock(&tick_broadcast_lock); - cpumask_and(to_cpumask(tmpmask), - cpu_online_mask, tick_get_broadcast_mask()); - tick_do_broadcast(to_cpumask(tmpmask)); + cpumask_and(tmpmask, cpu_online_mask, tick_broadcast_mask); + tick_do_broadcast(tmpmask); raw_spin_unlock(&tick_broadcast_lock); } @@ -263,13 +261,12 @@ static void tick_do_broadcast_on_off(unsigned long *reason) if (!tick_device_is_functional(dev)) goto out; - bc_stopped = cpumask_empty(tick_get_broadcast_mask()); + bc_stopped = cpumask_empty(tick_broadcast_mask); switch (*reason) { case CLOCK_EVT_NOTIFY_BROADCAST_ON: case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: - if (!cpumask_test_cpu(cpu, tick_get_broadcast_mask())) { - cpumask_set_cpu(cpu, tick_get_broadcast_mask()); + if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_mask)) { if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) clockevents_shutdown(dev); @@ -279,8 +276,7 @@ static void tick_do_broadcast_on_off(unsigned long *reason) break; case CLOCK_EVT_NOTIFY_BROADCAST_OFF: if (!tick_broadcast_force && - cpumask_test_cpu(cpu, tick_get_broadcast_mask())) { - cpumask_clear_cpu(cpu, tick_get_broadcast_mask()); + cpumask_test_and_clear_cpu(cpu, tick_broadcast_mask)) { if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) tick_setup_periodic(dev, 0); @@ -288,7 +284,7 @@ static void tick_do_broadcast_on_off(unsigned long *reason) break; } - if (cpumask_empty(tick_get_broadcast_mask())) { + if (cpumask_empty(tick_broadcast_mask)) { if (!bc_stopped) clockevents_shutdown(bc); } else if (bc_stopped) { @@ -337,10 +333,10 @@ void tick_shutdown_broadcast(unsigned int *cpup) raw_spin_lock_irqsave(&tick_broadcast_lock, flags); bc = tick_broadcast_device.evtdev; - cpumask_clear_cpu(cpu, tick_get_broadcast_mask()); + cpumask_clear_cpu(cpu, tick_broadcast_mask); if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) { - if (bc && cpumask_empty(tick_get_broadcast_mask())) + if (bc && cpumask_empty(tick_broadcast_mask)) 
clockevents_shutdown(bc); } @@ -376,13 +372,13 @@ int tick_resume_broadcast(void) switch (tick_broadcast_device.mode) { case TICKDEV_MODE_PERIODIC: - if (!cpumask_empty(tick_get_broadcast_mask())) + if (!cpumask_empty(tick_broadcast_mask)) tick_broadcast_start_periodic(bc); broadcast = cpumask_test_cpu(smp_processor_id(), - tick_get_broadcast_mask()); + tick_broadcast_mask); break; case TICKDEV_MODE_ONESHOT: - if (!cpumask_empty(tick_get_broadcast_mask())) + if (!cpumask_empty(tick_broadcast_mask)) broadcast = tick_resume_broadcast_oneshot(bc); break; } @@ -395,15 +391,14 @@ int tick_resume_broadcast(void) #ifdef CONFIG_TICK_ONESHOT -/* FIXME: use cpumask_var_t. */ -static DECLARE_BITMAP(tick_broadcast_oneshot_mask, NR_CPUS); +static cpumask_var_t tick_broadcast_oneshot_mask; /* * Exposed for debugging: see timer_list.c */ struct cpumask *tick_get_broadcast_oneshot_mask(void) { - return to_cpumask(tick_broadcast_oneshot_mask); + return tick_broadcast_oneshot_mask; } static int tick_broadcast_set_event(ktime_t expires, int force) @@ -428,7 +423,7 @@ int tick_resume_broadcast_oneshot(struct clock_event_device *bc) */ void tick_check_oneshot_broadcast(int cpu) { - if (cpumask_test_cpu(cpu, to_cpumask(tick_broadcast_oneshot_mask))) { + if (cpumask_test_cpu(cpu, tick_broadcast_oneshot_mask)) { struct tick_device *td = &per_cpu(tick_cpu_device, cpu); clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_ONESHOT); @@ -448,13 +443,13 @@ static void tick_handle_oneshot_broadcast(struct clock_event_device *dev) again: dev->next_event.tv64 = KTIME_MAX; next_event.tv64 = KTIME_MAX; - cpumask_clear(to_cpumask(tmpmask)); + cpumask_clear(tmpmask); now = ktime_get(); /* Find all expired events */ - for_each_cpu(cpu, tick_get_broadcast_oneshot_mask()) { + for_each_cpu(cpu, tick_broadcast_oneshot_mask) { td = &per_cpu(tick_cpu_device, cpu); if (td->evtdev->next_event.tv64 <= now.tv64) - cpumask_set_cpu(cpu, to_cpumask(tmpmask)); + cpumask_set_cpu(cpu, tmpmask); else if (td->evtdev->next_event.tv64 < next_event.tv64) next_event.tv64 = td->evtdev->next_event.tv64; } @@ -462,7 +457,7 @@ again: /* * Wakeup the cpus which have an expired event. 
*/ - tick_do_broadcast(to_cpumask(tmpmask)); + tick_do_broadcast(tmpmask); /* * Two reasons for reprogram: @@ -518,16 +513,13 @@ void tick_broadcast_oneshot_control(unsigned long reason) raw_spin_lock_irqsave(&tick_broadcast_lock, flags); if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) { - if (!cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) { - cpumask_set_cpu(cpu, tick_get_broadcast_oneshot_mask()); + if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) { clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); if (dev->next_event.tv64 < bc->next_event.tv64) tick_broadcast_set_event(dev->next_event, 1); } } else { - if (cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) { - cpumask_clear_cpu(cpu, - tick_get_broadcast_oneshot_mask()); + if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) { clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); if (dev->next_event.tv64 != KTIME_MAX) tick_program_event(dev->next_event, 1); @@ -543,7 +535,7 @@ void tick_broadcast_oneshot_control(unsigned long reason) */ static void tick_broadcast_clear_oneshot(int cpu) { - cpumask_clear_cpu(cpu, tick_get_broadcast_oneshot_mask()); + cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask); } static void tick_broadcast_init_next_event(struct cpumask *mask, @@ -581,15 +573,14 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) * oneshot_mask bits for those and program the * broadcast device to fire. */ - cpumask_copy(to_cpumask(tmpmask), tick_get_broadcast_mask()); - cpumask_clear_cpu(cpu, to_cpumask(tmpmask)); - cpumask_or(tick_get_broadcast_oneshot_mask(), - tick_get_broadcast_oneshot_mask(), - to_cpumask(tmpmask)); + cpumask_copy(tmpmask, tick_broadcast_mask); + cpumask_clear_cpu(cpu, tmpmask); + cpumask_or(tick_broadcast_oneshot_mask, + tick_broadcast_oneshot_mask, tmpmask); - if (was_periodic && !cpumask_empty(to_cpumask(tmpmask))) { + if (was_periodic && !cpumask_empty(tmpmask)) { clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); - tick_broadcast_init_next_event(to_cpumask(tmpmask), + tick_broadcast_init_next_event(tmpmask, tick_next_period); tick_broadcast_set_event(tick_next_period, 1); } else @@ -639,7 +630,7 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup) * Clear the broadcast mask flag for the dead cpu, but do not * stop the broadcast device! 
*/ - cpumask_clear_cpu(cpu, tick_get_broadcast_oneshot_mask()); + cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask); raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } @@ -663,3 +654,12 @@ bool tick_broadcast_oneshot_available(void) } #endif + +void __init tick_broadcast_init(void) +{ + alloc_cpumask_var(&tick_broadcast_mask, GFP_NOWAIT); + alloc_cpumask_var(&tmpmask, GFP_NOWAIT); +#ifdef CONFIG_TICK_ONESHOT + alloc_cpumask_var(&tick_broadcast_oneshot_mask, GFP_NOWAIT); +#endif +} diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index b1600a6973f4..74413e396acc 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -416,4 +416,5 @@ static struct notifier_block tick_notifier = { void __init tick_init(void) { clockevents_register_notifier(&tick_notifier); + tick_broadcast_init(); } diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index cf3e59ed6dc0..46d9bd02844c 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -94,7 +94,7 @@ extern void tick_broadcast_on_off(unsigned long reason, int *oncpu); extern void tick_shutdown_broadcast(unsigned int *cpup); extern void tick_suspend_broadcast(void); extern int tick_resume_broadcast(void); - +extern void tick_broadcast_init(void); extern void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast); @@ -119,6 +119,7 @@ static inline void tick_broadcast_on_off(unsigned long reason, int *oncpu) { } static inline void tick_shutdown_broadcast(unsigned int *cpup) { } static inline void tick_suspend_broadcast(void) { } static inline int tick_resume_broadcast(void) { return 0; } +static inline void tick_broadcast_init(void) { } /* * Set the periodic handler in non broadcast mode From f9ae39d04ccdec8d8ecf532191b7056c279a22c0 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Sat, 2 Mar 2013 11:10:10 +0100 Subject: [PATCH 03/52] tick: Pass broadcast device to tick_broadcast_set_event() Pass the broadcast timer to tick_broadcast_set_event() instead of reevaluating tick_broadcast_device.evtdev. [ tglx: Massaged changelog ] Signed-off-by: Daniel Lezcano Cc: viresh.kumar@linaro.org Cc: jacob.jun.pan@linux.intel.com Cc: linux-arm-kernel@lists.infradead.org Cc: santosh.shilimkar@ti.com Cc: linaro-kernel@lists.linaro.org Cc: patches@linaro.org Cc: rickard.andersson@stericsson.com Cc: vincent.guittot@linaro.org Cc: linus.walleij@stericsson.com Cc: john.stultz@linaro.org Link: http://lkml.kernel.org/r/1362219013-18173-2-git-send-email-daniel.lezcano@linaro.org Signed-off-by: Thomas Gleixner --- kernel/time/tick-broadcast.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 35b887517766..70dd98ce18d7 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -401,10 +401,9 @@ struct cpumask *tick_get_broadcast_oneshot_mask(void) return tick_broadcast_oneshot_mask; } -static int tick_broadcast_set_event(ktime_t expires, int force) +static int tick_broadcast_set_event(struct clock_event_device *bc, + ktime_t expires, int force) { - struct clock_event_device *bc = tick_broadcast_device.evtdev; - if (bc->mode != CLOCK_EVT_MODE_ONESHOT) clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); @@ -474,7 +473,7 @@ again: * Rearm the broadcast device. 
If event expired, * repeat the above */ - if (tick_broadcast_set_event(next_event, 0)) + if (tick_broadcast_set_event(dev, next_event, 0)) goto again; } raw_spin_unlock(&tick_broadcast_lock); @@ -516,7 +515,7 @@ void tick_broadcast_oneshot_control(unsigned long reason) if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) { clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); if (dev->next_event.tv64 < bc->next_event.tv64) - tick_broadcast_set_event(dev->next_event, 1); + tick_broadcast_set_event(bc, dev->next_event, 1); } } else { if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) { @@ -582,7 +581,7 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); tick_broadcast_init_next_event(tmpmask, tick_next_period); - tick_broadcast_set_event(tick_next_period, 1); + tick_broadcast_set_event(bc, tick_next_period, 1); } else bc->next_event.tv64 = KTIME_MAX; } else { From d2348fb6fdc6d671ad45b62db237f76c8c115603 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Sat, 2 Mar 2013 11:10:11 +0100 Subject: [PATCH 04/52] tick: Dynamically set broadcast irq affinity When a cpu goes to a deep idle state where its local timer is shut down, it notifies the time framework to use the broadcast timer instead. Unfortunately, the broadcast device could wake up any CPU, including an idle one which is not concerned by the wake up at all. So in the worst case an idle CPU will wake up to send an IPI to the CPU whose timer expired. Provide an opt-in feature CLOCK_EVT_FEAT_DYNIRQ which tells the core that it should set the interrupt affinity of the broadcast interrupt to the cpu which has the earliest expiry time. This avoids unnecessary spurious wakeups and IPIs. [ tglx: Adapted to cpumask rework, silenced an uninitialized warning, massaged changelog ] Signed-off-by: Daniel Lezcano Cc: viresh.kumar@linaro.org Cc: jacob.jun.pan@linux.intel.com Cc: linux-arm-kernel@lists.infradead.org Cc: santosh.shilimkar@ti.com Cc: linaro-kernel@lists.linaro.org Cc: patches@linaro.org Cc: rickard.andersson@stericsson.com Cc: vincent.guittot@linaro.org Cc: linus.walleij@stericsson.com Cc: john.stultz@linaro.org Link: http://lkml.kernel.org/r/1362219013-18173-3-git-send-email-daniel.lezcano@linaro.org Signed-off-by: Thomas Gleixner --- include/linux/clockchips.h | 5 +++++ kernel/time/tick-broadcast.c | 39 ++++++++++++++++++++++++++++-------- 2 files changed, 36 insertions(+), 8 deletions(-) diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index 66346521cb65..494d33ea78f8 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -55,6 +55,11 @@ enum clock_event_nofitiers { #define CLOCK_EVT_FEAT_C3STOP 0x000008 #define CLOCK_EVT_FEAT_DUMMY 0x000010 +/* + * Core shall set the interrupt affinity dynamically in broadcast mode + */ +#define CLOCK_EVT_FEAT_DYNIRQ 0x000020 + /** * struct clock_event_device - clock event device descriptor * @event_handler: Assigned by the framework to be called by the low diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 70dd98ce18d7..380910db7157 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -401,13 +401,34 @@ struct cpumask *tick_get_broadcast_oneshot_mask(void) return tick_broadcast_oneshot_mask; } -static int tick_broadcast_set_event(struct clock_event_device *bc, +/* + * Set broadcast interrupt affinity + */ +static void tick_broadcast_set_affinity(struct clock_event_device *bc, + const struct cpumask *cpumask) +{ +
if (!(bc->features & CLOCK_EVT_FEAT_DYNIRQ)) + return; + + if (cpumask_equal(bc->cpumask, cpumask)) + return; + + bc->cpumask = cpumask; + irq_set_affinity(bc->irq, bc->cpumask); +} + +static int tick_broadcast_set_event(struct clock_event_device *bc, int cpu, ktime_t expires, int force) { + int ret; + if (bc->mode != CLOCK_EVT_MODE_ONESHOT) clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); - return clockevents_program_event(bc, expires, force); + ret = clockevents_program_event(bc, expires, force); + if (!ret) + tick_broadcast_set_affinity(bc, cpumask_of(cpu)); + return ret; } int tick_resume_broadcast_oneshot(struct clock_event_device *bc) @@ -436,7 +457,7 @@ static void tick_handle_oneshot_broadcast(struct clock_event_device *dev) { struct tick_device *td; ktime_t now, next_event; - int cpu; + int cpu, next_cpu = 0; raw_spin_lock(&tick_broadcast_lock); again: @@ -447,10 +468,12 @@ again: /* Find all expired events */ for_each_cpu(cpu, tick_broadcast_oneshot_mask) { td = &per_cpu(tick_cpu_device, cpu); - if (td->evtdev->next_event.tv64 <= now.tv64) + if (td->evtdev->next_event.tv64 <= now.tv64) { cpumask_set_cpu(cpu, tmpmask); - else if (td->evtdev->next_event.tv64 < next_event.tv64) + } else if (td->evtdev->next_event.tv64 < next_event.tv64) { next_event.tv64 = td->evtdev->next_event.tv64; + next_cpu = cpu; + } } /* @@ -473,7 +496,7 @@ again: * Rearm the broadcast device. If event expired, * repeat the above */ - if (tick_broadcast_set_event(dev, next_event, 0)) + if (tick_broadcast_set_event(dev, next_cpu, next_event, 0)) goto again; } raw_spin_unlock(&tick_broadcast_lock); @@ -515,7 +538,7 @@ void tick_broadcast_oneshot_control(unsigned long reason) if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) { clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); if (dev->next_event.tv64 < bc->next_event.tv64) - tick_broadcast_set_event(bc, dev->next_event, 1); + tick_broadcast_set_event(bc, cpu, dev->next_event, 1); } } else { if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) { @@ -581,7 +604,7 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); tick_broadcast_init_next_event(tmpmask, tick_next_period); - tick_broadcast_set_event(bc, tick_next_period, 1); + tick_broadcast_set_event(bc, cpu, tick_next_period, 1); } else bc->next_event.tv64 = KTIME_MAX; } else { From 26517f3e99248668315aee9460dcea21628cdd7f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 6 Mar 2013 11:18:35 +0000 Subject: [PATCH 05/52] tick: Avoid programming the local cpu timer if broadcast pending If the local cpu timer stops in deep idle, we arm the broadcast device and get woken by an IPI. Now when we return from deep idle we reenable the local cpu timer unconditionally before handling the IPI. But that's a pointless exercise: the timer is already expired and the IPI is on the way. And it's an expensive exercise as we use the forced reprogramming mode so that we do not lose a timer event. This forced reprogramming will loop at least once in the retry. To avoid this reprogramming, we mark the cpu in a pending bit mask before we send the IPI. Now when the IPI target cpu wakes up, it will see the pending bit set and skip the reprogramming. The reprogramming of the cpu local timer will happen in the IPI handler which runs the cpu local timer interrupt function. 
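Stripped to its core, the exit path on the IPI target then reduces to the following (a condensed sketch of the tick_broadcast_oneshot_control() hunk below, not the literal diff):

	clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
	if (dev->next_event.tv64 == KTIME_MAX)
		goto out;
	/* Broadcast IPI is on the way; its handler reprograms the timer */
	if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_pending_mask))
		goto out;
	tick_program_event(dev->next_event, 1);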
Reported-by: Jason Liu Signed-off-by: Thomas Gleixner Cc: LAK Cc: John Stultz Cc: Arjan van de Veen Cc: Lorenzo Pieralisi Tested-by: Santosh Shilimkar Link: http://lkml.kernel.org/r/20130306111537.431082074@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/tick-broadcast.c | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 380910db7157..005c0ca81a32 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -392,6 +392,7 @@ int tick_resume_broadcast(void) #ifdef CONFIG_TICK_ONESHOT static cpumask_var_t tick_broadcast_oneshot_mask; +static cpumask_var_t tick_broadcast_pending_mask; /* * Exposed for debugging: see timer_list.c @@ -470,6 +471,12 @@ again: td = &per_cpu(tick_cpu_device, cpu); if (td->evtdev->next_event.tv64 <= now.tv64) { cpumask_set_cpu(cpu, tmpmask); + /* + * Mark the remote cpu in the pending mask, so + * it can avoid reprogramming the cpu local + * timer in tick_broadcast_oneshot_control(). + */ + cpumask_set_cpu(cpu, tick_broadcast_pending_mask); } else if (td->evtdev->next_event.tv64 < next_event.tv64) { next_event.tv64 = td->evtdev->next_event.tv64; next_cpu = cpu; @@ -535,6 +542,7 @@ void tick_broadcast_oneshot_control(unsigned long reason) raw_spin_lock_irqsave(&tick_broadcast_lock, flags); if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) { + WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask)); if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) { clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); if (dev->next_event.tv64 < bc->next_event.tv64) @@ -543,10 +551,25 @@ void tick_broadcast_oneshot_control(unsigned long reason) } else { if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) { clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); - if (dev->next_event.tv64 != KTIME_MAX) - tick_program_event(dev->next_event, 1); + if (dev->next_event.tv64 == KTIME_MAX) + goto out; + /* + * The cpu which was handling the broadcast + * timer marked this cpu in the broadcast + * pending mask and fired the broadcast + * IPI. So we are going to handle the expired + * event anyway via the broadcast IPI + * handler. No need to reprogram the timer + * with an already expired event. + */ + if (cpumask_test_and_clear_cpu(cpu, + tick_broadcast_pending_mask)) + goto out; + + tick_program_event(dev->next_event, 1); } } +out: raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } @@ -683,5 +706,6 @@ void __init tick_broadcast_init(void) alloc_cpumask_var(&tmpmask, GFP_NOWAIT); #ifdef CONFIG_TICK_ONESHOT alloc_cpumask_var(&tick_broadcast_oneshot_mask, GFP_NOWAIT); + alloc_cpumask_var(&tick_broadcast_pending_mask, GFP_NOWAIT); #endif } From 989dcb645ca715129c5a2b39102c8334a20d9615 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 6 Mar 2013 11:18:35 +0000 Subject: [PATCH 06/52] tick: Handle broadcast wakeup of multiple cpus Some brilliant hardware implementations wake multiple cores when the broadcast timer fires. This leads to the following interesting problem: CPU0 CPU1 wakeup from idle wakeup from idle leave broadcast mode leave broadcast mode restart per cpu timer restart per cpu timer go back to idle handle broadcast (empty mask) enter broadcast mode program broadcast device enter broadcast mode program broadcast device So what happens is that due to the forced reprogramming of the cpu local timer, we need to set an event in the future.
Now if we manage to go back to idle before the timer fires, we switch off the timer and arm the broadcast device with an already expired time (covered by forced mode). So in the worst case we repeat the above ping-pong forever. Unfortunately we have no information about what caused the wakeup, but we can check the current time against the expiry time of the local cpu. If the local event is already in the past, we know that the broadcast timer is about to fire and send an IPI. So we mark ourselves as an IPI target even if we left broadcast mode and avoid the reprogramming of the local cpu timer. This still leaves the possibility that a CPU which is not handling the broadcast interrupt is going to reach idle again before the IPI arrives. This can't be solved in the core code and will be handled in follow-up patches. Reported-by: Jason Liu Signed-off-by: Thomas Gleixner Cc: LAK Cc: John Stultz Cc: Arjan van de Veen Cc: Lorenzo Pieralisi Tested-by: Santosh Shilimkar Link: http://lkml.kernel.org/r/20130306111537.492045206@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/tick-broadcast.c | 59 +++++++++++++++++++++++++++++++++++- 1 file changed, 58 insertions(+), 1 deletion(-) diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 005c0ca81a32..2100aad6b5f2 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -393,6 +393,7 @@ int tick_resume_broadcast(void) static cpumask_var_t tick_broadcast_oneshot_mask; static cpumask_var_t tick_broadcast_pending_mask; +static cpumask_var_t tick_broadcast_force_mask; /* * Exposed for debugging: see timer_list.c @@ -483,6 +484,10 @@ again: } } + /* Take care of enforced broadcast requests */ + cpumask_or(tmpmask, tmpmask, tick_broadcast_force_mask); + cpumask_clear(tick_broadcast_force_mask); + /* * Wakeup the cpus which have an expired event. */ @@ -518,6 +523,7 @@ void tick_broadcast_oneshot_control(unsigned long reason) struct clock_event_device *bc, *dev; struct tick_device *td; unsigned long flags; + ktime_t now; int cpu; /* @@ -545,7 +551,16 @@ void tick_broadcast_oneshot_control(unsigned long reason) WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask)); if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) { clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); - if (dev->next_event.tv64 < bc->next_event.tv64) + /* + * We only reprogram the broadcast timer if we + * did not mark ourselves in the force mask and + * if the cpu local event is earlier than the + * broadcast event. If the current CPU is in + * the force mask, then we are going to be + * woken by the IPI right away. + */ + if (!cpumask_test_cpu(cpu, tick_broadcast_force_mask) && + dev->next_event.tv64 < bc->next_event.tv64) tick_broadcast_set_event(bc, cpu, dev->next_event, 1); } } else { @@ -566,6 +581,47 @@ void tick_broadcast_oneshot_control(unsigned long reason) tick_broadcast_pending_mask)) goto out; + /* + * If the pending bit is not set, then we are + * either the CPU handling the broadcast + * interrupt or we got woken by something else. + * + * We are no longer in the broadcast mask, so + * if the cpu local expiry time is already + * reached, we would reprogram the cpu local + * timer with an already expired event. + * + * This can lead to a ping-pong when we return + * to idle and therefore rearm the broadcast + * timer before the cpu local timer was able + * to fire.
This happens because the forced + * reprogramming makes sure that the event + * will happen in the future and depending on + * the min_delta setting this might be far + * enough out that the ping-pong starts. + * + * If the cpu local next_event has expired + * then we know that the broadcast timer + * next_event has expired as well and + * broadcast is about to be handled. So we + * avoid reprogramming and enforce that the + * broadcast handler, which did not run yet, + * will invoke the cpu local handler. + * + * We cannot call the handler directly from + * here, because we might be in a NOHZ phase + * and we did not go through the irq_enter() + * nohz fixups. + */ + now = ktime_get(); + if (dev->next_event.tv64 <= now.tv64) { + cpumask_set_cpu(cpu, tick_broadcast_force_mask); + goto out; + } + /* + * We got woken by something else. Reprogram + * the cpu local timer device. + */ tick_program_event(dev->next_event, 1); } } @@ -707,5 +763,6 @@ void __init tick_broadcast_init(void) #ifdef CONFIG_TICK_ONESHOT alloc_cpumask_var(&tick_broadcast_oneshot_mask, GFP_NOWAIT); alloc_cpumask_var(&tick_broadcast_pending_mask, GFP_NOWAIT); + alloc_cpumask_var(&tick_broadcast_force_mask, GFP_NOWAIT); #endif } From eaa907c546f76222227dfc41784b22588af1e3d7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 6 Mar 2013 11:18:36 +0000 Subject: [PATCH 07/52] tick: Provide a check for a forced broadcast pending On the CPU which gets woken along with the target CPU of the broadcast the following happens: deep_idle() <-- spurious wakeup broadcast_exit() set forced bit enable interrupts <-- Nothing happens disable interrupts broadcast_enter() <-- Here we observe the forced bit is set deep_idle() Now after that the target CPU of the broadcast runs the broadcast handler and finds the other CPU in both the broadcast and the forced mask, sends the IPI and stuff gets back to normal. So it's not actually harmful, just more evidence for the theory that hardware designers have access to very special drug supplies. Now there is no point in going back to deep idle just to wake up again right away via an IPI. Provide a check which allows the idle code to avoid the deep idle transition. Signed-off-by: Thomas Gleixner Cc: LAK Cc: John Stultz Cc: Arjan van de Veen Cc: Lorenzo Pieralisi Tested-by: Santosh Shilimkar Cc: Jason Liu Link: http://lkml.kernel.org/r/20130306111537.565418308@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/clockchips.h | 6 ++++++ kernel/time/tick-broadcast.c | 12 ++++++++++++ 2 files changed, 18 insertions(+) diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index 494d33ea78f8..646aac136eed 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -175,6 +175,12 @@ extern void tick_broadcast(const struct cpumask *mask); extern int tick_receive_broadcast(void); #endif +#if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_TICK_ONESHOT) +extern int tick_check_broadcast_expired(void); +#else +static inline int tick_check_broadcast_expired(void) { return 0; } +#endif + #ifdef CONFIG_GENERIC_CLOCKEVENTS extern void clockevents_notify(unsigned long reason, void *arg); #else diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 2100aad6b5f2..d76d816afc5d 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -403,6 +403,18 @@ struct cpumask *tick_get_broadcast_oneshot_mask(void) return tick_broadcast_oneshot_mask; } +/* + * Called before going idle with interrupts disabled.
Checks whether a + * broadcast event from the other core is about to happen. We detected + * that in tick_broadcast_oneshot_control(). The callsite can use this + * to avoid a deep idle transition as we are about to get the + * broadcast IPI right away. + */ +int tick_check_broadcast_expired(void) +{ + return cpumask_test_cpu(smp_processor_id(), tick_broadcast_force_mask); +} + /* * Set broadcast interrupt affinity */ From 80bbe9f273a38f83ecfc50fe384a57f8428887bd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 6 Mar 2013 11:18:37 +0000 Subject: [PATCH 08/52] arm: Use tick broadcast expired check Avoid going back into deep idle if the tick broadcast IPI is about to fire. Signed-off-by: Thomas Gleixner Cc: LAK Cc: John Stultz Cc: Arjan van de Veen Cc: Lorenzo Pieralisi Tested-by: Santosh Shilimkar Cc: Jason Liu Link: http://lkml.kernel.org/r/20130306111537.640722922@linutronix.de --- arch/arm/kernel/process.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index 047d3e40e470..db4ffd09ee23 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -199,7 +199,16 @@ void cpu_idle(void) #ifdef CONFIG_PL310_ERRATA_769419 wmb(); #endif - if (hlt_counter) { + /* + * In poll mode we reenable interrupts and spin. + * + * Also if we detected in the wakeup from idle + * path that the tick broadcast device expired + * for us, we don't want to go deep idle as we + * know that the IPI is going to arrive right + * away + */ + if (hlt_counter || tick_check_broadcast_expired()) { local_irq_enable(); cpu_relax(); } else if (!need_resched()) { From 35b61edb41ffee58711850e76215b852386ddb10 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 6 Mar 2013 11:18:37 +0000 Subject: [PATCH 09/52] x86: Use tick broadcast expired check Avoid going back into deep idle if the tick broadcast IPI is about to fire. Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Arjan van de Veen Cc: x86@kernel.org Link: http://lkml.kernel.org/r/20130306111537.702278273@linutronix.de --- arch/x86/kernel/process.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 14ae10031ff0..aa524da03bba 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -336,6 +336,18 @@ void cpu_idle(void) local_touch_nmi(); local_irq_disable(); + /* + * We detected in the wakeup path that the + * tick broadcast device expired for us, but + * we raced with the other CPU and came back + * here before it was able to fire the IPI. + * No point in going idle. + */ + if (tick_check_broadcast_expired()) { + local_irq_enable(); + continue; + } + enter_idle(); /* Don't trace irqs off for idle */ From c30bd09915ea243603d7803d53de890c4a6f1474 Mon Sep 17 00:00:00 2001 From: Dong Zhu Date: Thu, 6 Dec 2012 22:03:34 +0800 Subject: [PATCH 10/52] timekeeping: Avoid adjusting kernel time when hwclock is kept in UTC time If the hardware clock is kept in local time, the kernel will adjust the time to be UTC time. But if the hardware clock is kept in UTC time, the system will make a dummy settimeofday call first (sys_tz.tz_minuteswest = 0) to make sure the time is not shifted, so at this point it is not necessary to set the kernel time when sys_tz.tz_minuteswest is zero.
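For reference, the dummy call issued from userspace (e.g. by hwclock) when the RTC runs in UTC boils down to something like this; an illustrative sketch only, the exact invocation depends on the userspace tool:

	#include <stddef.h>
	#include <sys/time.h>

	struct timezone tz = { .tz_minuteswest = 0, .tz_dsttime = 0 };

	/* NULL tv: only sys_tz is updated, so no time shift is applied */
	settimeofday(NULL, &tz);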
Signed-off-by: Dong Zhu [jstultz: Updated to merge with conflicting changes ] Signed-off-by: John Stultz --- kernel/time.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/kernel/time.c b/kernel/time.c index f8342a41efa6..effac571589f 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -138,13 +138,14 @@ int persistent_clock_is_local; */ static inline void warp_clock(void) { - struct timespec adjust; + if (sys_tz.tz_minuteswest != 0) { + struct timespec adjust; - adjust = current_kernel_time(); - if (sys_tz.tz_minuteswest != 0) persistent_clock_is_local = 1; - adjust.tv_sec += sys_tz.tz_minuteswest * 60; - do_settimeofday(&adjust); + adjust = current_kernel_time(); + adjust.tv_sec += sys_tz.tz_minuteswest * 60; + do_settimeofday(&adjust); + } } /* From 7859e404ae73fe4f38b8cfc1af19ea82f153084e Mon Sep 17 00:00:00 2001 From: John Stultz Date: Fri, 22 Feb 2013 12:33:29 -0800 Subject: [PATCH 11/52] timekeeping: Use inject_offset in warp_clock When warping the clock (from a local time RTC), use timekeeping_inject_offset() to atomically add the offset. This avoids any minor time error caused by the delay between reading the time, and then setting the adjusted time. Signed-off-by: John Stultz --- kernel/time.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/time.c b/kernel/time.c index effac571589f..d3617dbd3dca 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -142,9 +142,9 @@ static inline void warp_clock(void) struct timespec adjust; persistent_clock_is_local = 1; - adjust = current_kernel_time(); - adjust.tv_sec += sys_tz.tz_minuteswest * 60; - do_settimeofday(&adjust); + adjust.tv_sec = sys_tz.tz_minuteswest * 60; + adjust.tv_nsec = 0; + timekeeping_inject_offset(&adjust); } } From 3195ef59cb42cda3aeeb24a7fd2ba1b900c4a3cc Mon Sep 17 00:00:00 2001 From: Prarit Bhargava Date: Thu, 14 Feb 2013 12:02:54 -0500 Subject: [PATCH 12/52] x86: Do full rtc synchronization with ntp Every 11 minutes ntp attempts to update the x86 rtc with the current system time. Currently, the x86 code only updates the rtc if the system time is within +/-15 minutes of the current value of the rtc. This was done originally to avoid setting the RTC if the RTC was in localtime mode (common with Windows dualbooting). Other architectures do a full synchronization and now that we have better infrastructure to detect when the RTC is in localtime, there is no reason that x86 should be software limited to a 30 minute window. This patch changes the behavior of the kernel to do a full synchronization (year, month, day, hour, minute, and second) of the rtc when ntp requests a synchronization between the system time and the rtc. I've used the RTC library functions in this patchset as they do all the required bounds checking. 
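The conversion and validation pattern repeated in the hunks below is, in short (shown here for the CMOS case):

	struct rtc_time tm;

	rtc_time_to_tm(nowtime, &tm);	/* epoch seconds -> broken-down time */
	if (!rtc_valid_tm(&tm))		/* returns 0 when tm is sane */
		retval = set_rtc_time(&tm);	/* full y/m/d h:m:s write */
	else
		retval = -EINVAL;	/* refuse to write a bogus date */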
Cc: Thomas Gleixner Cc: John Stultz Cc: x86@kernel.org Cc: Matt Fleming Cc: David Vrabel Cc: Andrew Morton Cc: Andi Kleen Cc: linux-efi@vger.kernel.org Signed-off-by: Prarit Bhargava [jstultz: Tweak commit message, fold in build fix found by fengguang Also add select RTC_LIB to X86, per new dependency, as found by prarit] Signed-off-by: John Stultz --- arch/x86/Kconfig | 1 + arch/x86/kernel/rtc.c | 69 ++++++----------------------------- arch/x86/platform/efi/efi.c | 24 ++++++++---- arch/x86/platform/mrst/vrtc.c | 42 ++++++++++++--------- 4 files changed, 54 insertions(+), 82 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index a4f24f5b1218..26bd79261532 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -120,6 +120,7 @@ config X86 select OLD_SIGSUSPEND3 if X86_32 || IA32_EMULATION select OLD_SIGACTION if X86_32 select COMPAT_OLD_SIGACTION if IA32_EMULATION + select RTC_LIB config INSTRUCTION_DECODER def_bool y diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index 2e8f3d3b5641..198eb201ed3b 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c @@ -13,6 +13,7 @@ #include #include #include +#include #ifdef CONFIG_X86_32 /* @@ -36,70 +37,24 @@ EXPORT_SYMBOL(rtc_lock); * nowtime is written into the registers of the CMOS clock, it will * jump to the next second precisely 500 ms later. Check the Motorola * MC146818A or Dallas DS12887 data sheet for details. - * - * BUG: This routine does not handle hour overflow properly; it just - * sets the minutes. Usually you'll only notice that after reboot! */ int mach_set_rtc_mmss(unsigned long nowtime) { - int real_seconds, real_minutes, cmos_minutes; - unsigned char save_control, save_freq_select; - unsigned long flags; + struct rtc_time tm; int retval = 0; - spin_lock_irqsave(&rtc_lock, flags); - - /* tell the clock it's being set */ - save_control = CMOS_READ(RTC_CONTROL); - CMOS_WRITE((save_control|RTC_SET), RTC_CONTROL); - - /* stop and reset prescaler */ - save_freq_select = CMOS_READ(RTC_FREQ_SELECT); - CMOS_WRITE((save_freq_select|RTC_DIV_RESET2), RTC_FREQ_SELECT); - - cmos_minutes = CMOS_READ(RTC_MINUTES); - if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) - cmos_minutes = bcd2bin(cmos_minutes); - - /* - * since we're only adjusting minutes and seconds, - * don't interfere with hour overflow. 
This avoids - * messing with unknown time zones but requires your - * RTC not to be off by more than 15 minutes - */ - real_seconds = nowtime % 60; - real_minutes = nowtime / 60; - /* correct for half hour time zone */ - if (((abs(real_minutes - cmos_minutes) + 15)/30) & 1) - real_minutes += 30; - real_minutes %= 60; - - if (abs(real_minutes - cmos_minutes) < 30) { - if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) { - real_seconds = bin2bcd(real_seconds); - real_minutes = bin2bcd(real_minutes); - } - CMOS_WRITE(real_seconds, RTC_SECONDS); - CMOS_WRITE(real_minutes, RTC_MINUTES); + rtc_time_to_tm(nowtime, &tm); + if (!rtc_valid_tm(&tm)) { + retval = set_rtc_time(&tm); + if (retval) + printk(KERN_ERR "%s: RTC write failed with error %d\n", + __FUNCTION__, retval); } else { - printk_once(KERN_NOTICE - "set_rtc_mmss: can't update from %d to %d\n", - cmos_minutes, real_minutes); - retval = -1; + printk(KERN_ERR + "%s: Invalid RTC value: write of %lx to RTC failed\n", + __FUNCTION__, nowtime); + retval = -EINVAL; } - - /* The following flags have to be released exactly in this order, - * otherwise the DS12887 (popular MC146818A clone with integrated - * battery and quartz) will not reset the oscillator and will not - * update precisely 500 ms later. You won't find this mentioned in - * the Dallas Semiconductor data sheets, but who believes data - * sheets anyway ... -- Markus Kuhn - */ - CMOS_WRITE(save_control, RTC_CONTROL); - CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT); - - spin_unlock_irqrestore(&rtc_lock, flags); - return retval; } diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 5f2ecaf3f9d8..28d9efacc9b6 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -48,6 +48,7 @@ #include #include #include +#include #define EFI_DEBUG 1 @@ -258,10 +259,10 @@ static efi_status_t __init phys_efi_get_time(efi_time_t *tm, int efi_set_rtc_mmss(unsigned long nowtime) { - int real_seconds, real_minutes; efi_status_t status; efi_time_t eft; efi_time_cap_t cap; + struct rtc_time tm; status = efi.get_time(&eft, &cap); if (status != EFI_SUCCESS) { @@ -269,13 +270,20 @@ int efi_set_rtc_mmss(unsigned long nowtime) return -1; } - real_seconds = nowtime % 60; - real_minutes = nowtime / 60; - if (((abs(real_minutes - eft.minute) + 15)/30) & 1) - real_minutes += 30; - real_minutes %= 60; - eft.minute = real_minutes; - eft.second = real_seconds; + rtc_time_to_tm(nowtime, &tm); + if (!rtc_valid_tm(&tm)) { + eft.year = tm.tm_year + 1900; + eft.month = tm.tm_mon + 1; + eft.day = tm.tm_mday; + eft.hour = tm.tm_hour; + eft.minute = tm.tm_min; + eft.second = tm.tm_sec; + eft.nanosecond = 0; + } else { + printk(KERN_ERR + "%s: Invalid EFI RTC value: write of %lx to EFI RTC failed\n", + __FUNCTION__, nowtime); + return -1; + } status = efi.set_time(&eft); if (status != EFI_SUCCESS) { diff --git a/arch/x86/platform/mrst/vrtc.c b/arch/x86/platform/mrst/vrtc.c index 225bd0f0f675..d62b0a3b5c14 100644 --- a/arch/x86/platform/mrst/vrtc.c +++ b/arch/x86/platform/mrst/vrtc.c @@ -85,27 +85,35 @@ unsigned long vrtc_get_time(void) return mktime(year, mon, mday, hour, min, sec); } -/* Only care about the minutes and seconds */ int vrtc_set_mmss(unsigned long nowtime) { - int real_sec, real_min; unsigned long flags; - int vrtc_min; + struct rtc_time tm; + int year; + int retval = 0; - spin_lock_irqsave(&rtc_lock, flags); - vrtc_min = vrtc_cmos_read(RTC_MINUTES); - real_sec = nowtime % 60; - real_min = nowtime / 60; - if (((abs(real_min - vrtc_min) + 15)/30) & 1) - real_min += 30; - real_min %= 60; -
- vrtc_cmos_write(real_sec, RTC_SECONDS); - vrtc_cmos_write(real_min, RTC_MINUTES); - spin_unlock_irqrestore(&rtc_lock, flags); - - return 0; + rtc_time_to_tm(nowtime, &tm); + if (!rtc_valid_tm(&tm) && tm.tm_year >= 72) { + /* + * tm.tm_year is the number of years since 1900, and the + * vrtc needs the years since 1972. + */ + year = tm.tm_year - 72; + spin_lock_irqsave(&rtc_lock, flags); + vrtc_cmos_write(year, RTC_YEAR); + vrtc_cmos_write(tm.tm_mon, RTC_MONTH); + vrtc_cmos_write(tm.tm_mday, RTC_DAY_OF_MONTH); + vrtc_cmos_write(tm.tm_hour, RTC_HOURS); + vrtc_cmos_write(tm.tm_min, RTC_MINUTES); + vrtc_cmos_write(tm.tm_sec, RTC_SECONDS); + spin_unlock_irqrestore(&rtc_lock, flags); + } else { + printk(KERN_ERR + "%s: Invalid vRTC value: write of %lx to vRTC failed\n", + __FUNCTION__, nowtime); + retval = -EINVAL; + } + return retval; } void __init mrst_rtc_init(void) From c54fdbb2823d96b842d00c548e14dbc0dd37831d Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Tue, 12 Mar 2013 11:56:45 +0800 Subject: [PATCH 13/52] x86: Add cpu capability flag X86_FEATURE_NONSTOP_TSC_S3 On some new Intel Atom processors (Penwell and Cloverview), there is a feature that the TSC won't stop in S3 state; that is, the TSC value won't be reset to 0 after resume. This feature makes the TSC a more reliable clocksource and could benefit the timekeeping code during the system suspend/resume cycle, so add a flag for it. Signed-off-by: Feng Tang [jstultz: Fix checkpatch warning] Signed-off-by: John Stultz --- arch/x86/include/asm/cpufeature.h | 1 + arch/x86/kernel/cpu/intel.c | 12 ++++++++++++ 2 files changed, 13 insertions(+) diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 93fe929d1cee..a8466f203e62 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -100,6 +100,7 @@ #define X86_FEATURE_AMD_DCM (3*32+27) /* multi-node processor */ #define X86_FEATURE_APERFMPERF (3*32+28) /* APERFMPERF */ #define X86_FEATURE_EAGER_FPU (3*32+29) /* "eagerfpu" Non lazy FPU restore */ +#define X86_FEATURE_NONSTOP_TSC_S3 (3*32+30) /* TSC doesn't stop in S3 state */ /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */ #define X86_FEATURE_XMM3 (4*32+ 0) /* "pni" SSE-3 */ diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 1905ce98bee0..e7ae0d89e7e0 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -96,6 +96,18 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) sched_clock_stable = 1; } + /* Penwell and Cloverview have the TSC which doesn't sleep on S3 */ + if (c->x86 == 6) { + switch (c->x86_model) { + case 0x27: /* Penwell */ + case 0x35: /* Cloverview */ + set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC_S3); + break; + default: + break; + } + } + /* * There is a known erratum on Pentium III and Core Solo * and Core Duo CPUs. From 5caf4636259ae3af0efbb9bfc4cd97874b547c7d Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Tue, 12 Mar 2013 11:56:46 +0800 Subject: [PATCH 14/52] clocksource: Add new feature flag CLOCK_SOURCE_SUSPEND_NONSTOP Some x86 processors have a TSC clocksource, which continues to run even when the system is suspended. Also most OMAP platforms have a 32 KHz timer which has a similar capability. Add a feature flag so that it can be utilized.
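A clocksource driver opts in by setting the new flag in its struct clocksource flags field; the x86 TSC patch later in this series does exactly that:

	if (boot_cpu_has(X86_FEATURE_NONSTOP_TSC_S3))
		clocksource_tsc.flags |= CLOCK_SOURCE_SUSPEND_NONSTOP;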
Signed-off-by: Feng Tang Signed-off-by: John Stultz --- include/linux/clocksource.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index 27cfda427dd9..aa7032c7238f 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -206,6 +206,7 @@ struct clocksource { #define CLOCK_SOURCE_WATCHDOG 0x10 #define CLOCK_SOURCE_VALID_FOR_HRES 0x20 #define CLOCK_SOURCE_UNSTABLE 0x40 +#define CLOCK_SOURCE_SUSPEND_NONSTOP 0x80 /* simplify initialization of mask field */ #define CLOCKSOURCE_MASK(bits) (cycle_t)((bits) < 64 ? ((1ULL<<(bits))-1) : -1) From 82f9c080b22a5b859ae2b50822dfb6b812898fdb Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Tue, 12 Mar 2013 11:56:47 +0800 Subject: [PATCH 15/52] x86: tsc: Add support for new S3_NONSTOP feature Add support for new S3_NONSTOP feature Signed-off-by: Feng Tang Signed-off-by: John Stultz --- arch/x86/kernel/tsc.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 4b9ea101fe3b..098b3cfda72e 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -768,7 +768,8 @@ static cycle_t read_tsc(struct clocksource *cs) static void resume_tsc(struct clocksource *cs) { - clocksource_tsc.cycle_last = 0; + if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC_S3)) + clocksource_tsc.cycle_last = 0; } static struct clocksource clocksource_tsc = { @@ -939,6 +940,9 @@ static int __init init_tsc_clocksource(void) clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS; } + if (boot_cpu_has(X86_FEATURE_NONSTOP_TSC_S3)) + clocksource_tsc.flags |= CLOCK_SOURCE_SUSPEND_NONSTOP; + /* * Trust the results of the earlier calibration on systems * exporting a reliable TSC. From e445cf1c4257cc0238d72e4129eb4739f46fd3de Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Tue, 12 Mar 2013 11:56:48 +0800 Subject: [PATCH 16/52] timekeeping: utilize the suspend-nonstop clocksource to count suspended time There are some new processors whose TSC clocksource won't stop during suspend. Currently, after the system resumes, the kernel uses the persistent clock or the RTC to compensate for the sleep time, but with these nonstop clocksources, we could skip the special compensation from external sources, and just use the current clocksource for time recounting. This can solve some time drift bugs caused by some not-so-accurate or error-prone RTC devices. The current way to count suspended time is to first try the persistent clock, and then the RTC if the persistent clock can't be used. This patch changes the order to: suspend-nonstop clocksource -> persistent clock -> RTC When counting the sleep time with a nonstop clocksource, use an accurate way suggested by Jason Gunthorpe to cover very large delta cycles.
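The overflow-safe conversion in the timekeeping_resume() hunk below splits cycle_delta into chunks of at most max = ULLONG_MAX / mult cycles, so each multiplication by mult stays within 64 bits:

	u64 num, max = ULLONG_MAX;
	s64 nsec = 0;

	do_div(max, mult);	/* max cycles per overflow-free multiply */
	if (cycle_delta > max) {
		num = div64_u64(cycle_delta, max);
		nsec = (((u64) max * mult) >> shift) * num;
		cycle_delta -= num * max;
	}
	nsec += ((u64) cycle_delta * mult) >> shift;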
Signed-off-by: Feng Tang [jstultz: Small optimization, avoiding re-reading the clocksource] Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 58 ++++++++++++++++++++++++++++++++++----- 1 file changed, 51 insertions(+), 7 deletions(-) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 9a0bc98fbe1d..0355f125d585 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -788,22 +788,66 @@ void timekeeping_inject_sleeptime(struct timespec *delta) static void timekeeping_resume(void) { struct timekeeper *tk = &timekeeper; + struct clocksource *clock = tk->clock; unsigned long flags; - struct timespec ts; + struct timespec ts_new, ts_delta; + cycle_t cycle_now, cycle_delta; + bool suspendtime_found = false; - read_persistent_clock(&ts); + read_persistent_clock(&ts_new); clockevents_resume(); clocksource_resume(); write_seqlock_irqsave(&tk->lock, flags); - if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { - ts = timespec_sub(ts, timekeeping_suspend_time); - __timekeeping_inject_sleeptime(tk, &ts); + /* + * After the system resumes, we need to calculate the suspended time + * and compensate the OS time for it. There are 3 sources that could be + * used: Nonstop clocksource during suspend, persistent clock and rtc + * device. + * + * One specific platform may have 1 or 2 or all of them, and the + * preference will be: + * suspend-nonstop clocksource -> persistent clock -> rtc + * The less preferred source will only be tried if there is no better + * usable source. The rtc part is handled separately in rtc core code. + */ + cycle_now = clock->read(clock); + if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) && + cycle_now > clock->cycle_last) { + u64 num, max = ULLONG_MAX; + u32 mult = clock->mult; + u32 shift = clock->shift; + s64 nsec = 0; + + cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; + + /* + * "cycle_delta * mult" may cause a 64-bit overflow if the + * suspended time is too long.
In that case we need to do the + * 64-bit math carefully + */ + do_div(max, mult); + if (cycle_delta > max) { + num = div64_u64(cycle_delta, max); + nsec = (((u64) max * mult) >> shift) * num; + cycle_delta -= num * max; + } + nsec += ((u64) cycle_delta * mult) >> shift; + + ts_delta = ns_to_timespec(nsec); + suspendtime_found = true; + } else if (timespec_compare(&ts_new, &timekeeping_suspend_time) > 0) { + ts_delta = timespec_sub(ts_new, timekeeping_suspend_time); + suspendtime_found = true; } - /* re-base the last cycle value */ - tk->clock->cycle_last = tk->clock->read(tk->clock); + + if (suspendtime_found) + __timekeeping_inject_sleeptime(tk, &ts_delta); + + /* Re-base the last cycle value */ + clock->cycle_last = cycle_now; tk->ntp_error = 0; timekeeping_suspended = 0; timekeeping_update(tk, false); From 19919226c3f20e6bf5de3df96432ce80ffd63ff2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 22 Mar 2013 10:48:33 +0100 Subject: [PATCH 17/52] clockevents: Add missing tick_check_broadcast_expired() for CLOCKEVENTS=n Feng's build robot reports: arch/arm/kernel/process.c: In function 'cpu_idle': arch/arm/kernel/process.c:211:4: error: implicit declaration of function 'tick_check_broadcast_expired' [-Werror=implicit-function-declaration] Add the missing inline function for non-clockevents builds. Reported-by: Wu Fengguang Signed-off-by: Thomas Gleixner --- include/linux/clockchips.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index 646aac136eed..464e229e7d84 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -193,6 +193,7 @@ static inline void clockevents_suspend(void) {} static inline void clockevents_resume(void) {} #define clockevents_notify(reason, arg) do { } while (0) +static inline int tick_check_broadcast_expired(void) { return 0; } #endif From cc244ddae6d4c6902ac9d7d64023534f8c44a7eb Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 3 May 2012 12:30:07 -0700 Subject: [PATCH 18/52] timekeeping: Move TAI management into timekeeping core from ntp Currently NTP manages the TAI offset. Since there are plans for a CLOCK_TAI clockid, push the TAI management into the timekeeping core. CC: Thomas Gleixner CC: Eric Dumazet CC: Richard Cochran Signed-off-by: John Stultz --- include/linux/time.h | 2 ++ include/linux/timekeeper_internal.h | 3 ++ kernel/time/ntp.c | 18 ++++++------ kernel/time/timekeeping.c | 44 +++++++++++++++++++++++++++++ 4 files changed, 59 insertions(+), 8 deletions(-) diff --git a/include/linux/time.h b/include/linux/time.h index d4835dfdf25e..47210a175e78 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -181,6 +181,8 @@ extern struct timespec timespec_trunc(struct timespec t, unsigned gran); extern int timekeeping_valid_for_hres(void); extern u64 timekeeping_max_deferment(void); extern int timekeeping_inject_offset(struct timespec *ts); +extern s32 timekeeping_get_tai_offset(void); +extern void timekeeping_set_tai_offset(s32 tai_offset); struct tms; extern void do_sys_times(struct tms *); diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h index e1d558e237ec..ff94f436f8b7 100644 --- a/include/linux/timekeeper_internal.h +++ b/include/linux/timekeeper_internal.h @@ -62,6 +62,9 @@ struct timekeeper { ktime_t offs_boot; /* The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock.
*/ struct timespec raw_time; + /* The current UTC to TAI offset in seconds */ + s32 tai_offset; + /* Seqlock for all timekeeper values */ seqlock_t lock; }; diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 072bb066bb7d..59e2749be0fa 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -53,9 +53,6 @@ static int time_state = TIME_OK; /* clock status bits: */ static int time_status = STA_UNSYNC; -/* TAI offset (secs): */ -static long time_tai; - /* time adjustment (nsecs): */ static s64 time_offset; @@ -415,7 +412,6 @@ int second_overflow(unsigned long secs) else if (secs % 86400 == 0) { leap = -1; time_state = TIME_OOP; - time_tai++; printk(KERN_NOTICE "Clock: inserting leap second 23:59:60 UTC\n"); } @@ -425,7 +421,6 @@ int second_overflow(unsigned long secs) time_state = TIME_OK; else if ((secs + 1) % 86400 == 0) { leap = 1; - time_tai--; time_state = TIME_WAIT; printk(KERN_NOTICE "Clock: deleting leap second 23:59:59 UTC\n"); @@ -579,7 +574,9 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts) * Called with ntp_lock held, so we can access and modify * all the global NTP state: */ -static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts) +static inline void process_adjtimex_modes(struct timex *txc, + struct timespec *ts, + s32 *time_tai) { if (txc->modes & ADJ_STATUS) process_adj_status(txc, ts); @@ -613,7 +610,7 @@ static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts } if (txc->modes & ADJ_TAI && txc->constant > 0) - time_tai = txc->constant; + *time_tai = txc->constant; if (txc->modes & ADJ_OFFSET) ntp_update_offset(txc->offset); @@ -632,6 +629,7 @@ static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts int do_adjtimex(struct timex *txc) { struct timespec ts; + u32 time_tai, orig_tai; int result; /* Validate the data before disabling interrupts */ @@ -671,6 +669,7 @@ int do_adjtimex(struct timex *txc) } getnstimeofday(&ts); + orig_tai = time_tai = timekeeping_get_tai_offset(); raw_spin_lock_irq(&ntp_lock); @@ -687,7 +686,7 @@ int do_adjtimex(struct timex *txc) /* If there are input parameters, then process them: */ if (txc->modes) - process_adjtimex_modes(txc, &ts); + process_adjtimex_modes(txc, &ts, &time_tai); txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ, NTP_SCALE_SHIFT); @@ -716,6 +715,9 @@ int do_adjtimex(struct timex *txc) raw_spin_unlock_irq(&ntp_lock); + if (time_tai != orig_tai) + timekeeping_set_tai_offset(time_tai); + txc->time.tv_sec = ts.tv_sec; txc->time.tv_usec = ts.tv_nsec; if (!(time_status & STA_NANO)) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 0355f125d585..937098aab498 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -513,6 +513,48 @@ error: /* even if we error out, we forwarded the time, so call update */ } EXPORT_SYMBOL(timekeeping_inject_offset); + +/** + * timekeeping_get_tai_offset - Returns current TAI offset from UTC + * + */ +s32 timekeeping_get_tai_offset(void) +{ + struct timekeeper *tk = &timekeeper; + unsigned int seq; + s32 ret; + + do { + seq = read_seqbegin(&tk->lock); + ret = tk->tai_offset; + } while (read_seqretry(&tk->lock, seq)); + + return ret; +} + +/** + * __timekeeping_set_tai_offset - Lock free worker function + * + */ +void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset) +{ + tk->tai_offset = tai_offset; +} + +/** + * timekeeping_set_tai_offset - Sets the current TAI offset from UTC + * + */ +void timekeeping_set_tai_offset(s32 
tai_offset) +{ + struct timekeeper *tk = &timekeeper; + unsigned long flags; + + write_seqlock_irqsave(&tk->lock, flags); + __timekeeping_set_tai_offset(tk, tai_offset); + write_sequnlock_irqrestore(&tk->lock, flags); +} + /** * change_clocksource - Swaps clocksources if a new one is available * @@ -1143,6 +1185,8 @@ static inline void accumulate_nsecs_to_secs(struct timekeeper *tk) tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, ts)); + __timekeeping_set_tai_offset(tk, tk->tai_offset - leap); + clock_was_set_delayed(); } } From 1ff3c9677bff7e468e0c487d0ffefe4e901d33f4 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 3 May 2012 12:43:40 -0700 Subject: [PATCH 19/52] timekeeping: Add CLOCK_TAI clockid This adds a CLOCK_TAI clockid and the needed accessors. Cc: Thomas Gleixner Cc: Eric Dumazet Cc: Richard Cochran Signed-off-by: John Stultz --- include/linux/time.h | 1 + include/uapi/linux/time.h | 6 ++---- kernel/posix-timers.c | 10 ++++++++++ kernel/time/timekeeping.c | 30 ++++++++++++++++++++++++++++++ 4 files changed, 43 insertions(+), 4 deletions(-) diff --git a/include/linux/time.h b/include/linux/time.h index 47210a175e78..22d81b3c955b 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -183,6 +183,7 @@ extern u64 timekeeping_max_deferment(void); extern int timekeeping_inject_offset(struct timespec *ts); extern s32 timekeeping_get_tai_offset(void); extern void timekeeping_set_tai_offset(s32 tai_offset); +extern void timekeeping_clocktai(struct timespec *ts); struct tms; extern void do_sys_times(struct tms *); diff --git a/include/uapi/linux/time.h b/include/uapi/linux/time.h index 0d3c0edc3eda..e75e1b6ff27f 100644 --- a/include/uapi/linux/time.h +++ b/include/uapi/linux/time.h @@ -54,11 +54,9 @@ struct itimerval { #define CLOCK_BOOTTIME 7 #define CLOCK_REALTIME_ALARM 8 #define CLOCK_BOOTTIME_ALARM 9 +#define CLOCK_SGI_CYCLE 10 /* Hardware specific */ +#define CLOCK_TAI 11 -/* - * The IDs of various hardware clocks: - */ -#define CLOCK_SGI_CYCLE 10 #define MAX_CLOCKS 16 #define CLOCKS_MASK (CLOCK_REALTIME | CLOCK_MONOTONIC) #define CLOCKS_MONO CLOCK_MONOTONIC diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 6edbb2c55c22..fbfc5f1b7710 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -221,6 +221,11 @@ static int posix_get_boottime(const clockid_t which_clock, struct timespec *tp) return 0; } +static int posix_get_tai(clockid_t which_clock, struct timespec *tp) +{ + timekeeping_clocktai(tp); + return 0; +} /* * Initialize everything, well, just everything in Posix clocks/timers ;) @@ -261,6 +266,10 @@ static __init int init_posix_timers(void) .clock_getres = posix_get_coarse_res, .clock_get = posix_get_monotonic_coarse, }; + struct k_clock clock_tai = { + .clock_getres = hrtimer_get_res, + .clock_get = posix_get_tai, + }; struct k_clock clock_boottime = { .clock_getres = hrtimer_get_res, .clock_get = posix_get_boottime, @@ -278,6 +287,7 @@ static __init int init_posix_timers(void) posix_timers_register_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse); posix_timers_register_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse); posix_timers_register_clock(CLOCK_BOOTTIME, &clock_boottime); + posix_timers_register_clock(CLOCK_TAI, &clock_tai); posix_timers_cache = kmem_cache_create("posix_timers_cache", sizeof (struct k_itimer), 0, SLAB_PANIC, diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 937098aab498..8a842756572d 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -379,6 +379,36
@@ void ktime_get_ts(struct timespec *ts) } EXPORT_SYMBOL_GPL(ktime_get_ts); + +/** + * timekeeping_clocktai - Returns the TAI time of day in a timespec + * @ts: pointer to the timespec to be set + * + * Returns the time of day in a timespec. + */ +void timekeeping_clocktai(struct timespec *ts) +{ + struct timekeeper *tk = &timekeeper; + unsigned long seq; + u64 nsecs; + + WARN_ON(timekeeping_suspended); + + do { + seq = read_seqbegin(&tk->lock); + + ts->tv_sec = tk->xtime_sec + tk->tai_offset; + nsecs = timekeeping_get_ns(tk); + + } while (read_seqretry(&tk->lock, seq)); + + ts->tv_nsec = 0; + timespec_add_ns(ts, nsecs); + +} +EXPORT_SYMBOL(timekeeping_clocktai); + + #ifdef CONFIG_NTP_PPS /** From 90adda98b89aaf68b06014ecf805b6c477daa19b Mon Sep 17 00:00:00 2001 From: John Stultz Date: Mon, 21 Jan 2013 17:00:11 -0800 Subject: [PATCH 20/52] hrtimer: Add hrtimer support for CLOCK_TAI Add hrtimer support for CLOCK_TAI, as well as posix timer interfaces. Signed-off-by: John Stultz --- include/linux/hrtimer.h | 5 ++++- include/linux/timekeeper_internal.h | 2 ++ kernel/hrtimer.c | 14 +++++++++++++- kernel/posix-timers.c | 6 ++++++ kernel/time/timekeeping.c | 20 +++++++++++++++++++- 5 files changed, 44 insertions(+), 3 deletions(-) diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index cc07d2777bbe..d19a5c2d2270 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -157,6 +157,7 @@ enum hrtimer_base_type { HRTIMER_BASE_MONOTONIC, HRTIMER_BASE_REALTIME, HRTIMER_BASE_BOOTTIME, + HRTIMER_BASE_TAI, HRTIMER_MAX_CLOCK_BASES, }; @@ -327,7 +328,9 @@ extern ktime_t ktime_get(void); extern ktime_t ktime_get_real(void); extern ktime_t ktime_get_boottime(void); extern ktime_t ktime_get_monotonic_offset(void); -extern ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot); +extern ktime_t ktime_get_clocktai(void); +extern ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot, + ktime_t *offs_tai); DECLARE_PER_CPU(struct tick_device, tick_cpu_device); diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h index ff94f436f8b7..26700d870506 100644 --- a/include/linux/timekeeper_internal.h +++ b/include/linux/timekeeper_internal.h @@ -64,6 +64,8 @@ struct timekeeper { struct timespec raw_time; /* The current UTC to TAI offset in seconds */ s32 tai_offset; + /* Offset clock monotonic -> clock tai */ + ktime_t offs_tai; /* Seqlock for all timekeeper values */ seqlock_t lock; diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index cc47812d3feb..258720741d3e 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -83,6 +83,12 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = .get_time = &ktime_get_boottime, .resolution = KTIME_LOW_RES, }, + { + .index = HRTIMER_BASE_TAI, + .clockid = CLOCK_TAI, + .get_time = &ktime_get_clocktai, + .resolution = KTIME_LOW_RES, + }, } }; @@ -90,6 +96,7 @@ static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME, [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC, [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME, + [CLOCK_TAI] = HRTIMER_BASE_TAI, }; static inline int hrtimer_clockid_to_base(clockid_t clock_id) @@ -106,8 +113,10 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) { ktime_t xtim, mono, boot; struct timespec xts, tom, slp; + s32 tai_offset; get_xtime_and_monotonic_and_sleep_offset(&xts, &tom, &slp); + tai_offset = timekeeping_get_tai_offset(); xtim = timespec_to_ktime(xts); mono = ktime_add(xtim, timespec_to_ktime(tom)); 
@@ -115,6 +124,8 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim; base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono; base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot; + base->clock_base[HRTIMER_BASE_TAI].softirq_time = + ktime_add(xtim, ktime_set(tai_offset, 0)); } /* @@ -651,8 +662,9 @@ static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) { ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset; ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset; + ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset; - return ktime_get_update_offsets(offs_real, offs_boot); + return ktime_get_update_offsets(offs_real, offs_boot, offs_tai); } /* diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index fbfc5f1b7710..2a2e173d0a7a 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -269,6 +269,12 @@ static __init int init_posix_timers(void) struct k_clock clock_tai = { .clock_getres = hrtimer_get_res, .clock_get = posix_get_tai, + .nsleep = common_nsleep, + .nsleep_restart = hrtimer_nanosleep_restart, + .timer_create = common_timer_create, + .timer_set = common_timer_set, + .timer_get = common_timer_get, + .timer_del = common_timer_del, }; struct k_clock clock_boottime = { .clock_getres = hrtimer_get_res, diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 8a842756572d..8061ae0be7bd 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -67,6 +67,7 @@ static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec wtm) tk->wall_to_monotonic = wtm; set_normalized_timespec(&tmp, -wtm.tv_sec, -wtm.tv_nsec); tk->offs_real = timespec_to_ktime(tmp); + tk->offs_tai = ktime_sub(tk->offs_real, ktime_set(tk->tai_offset, 0)); } static void tk_set_sleep_time(struct timekeeper *tk, struct timespec t) @@ -409,6 +410,20 @@ void timekeeping_clocktai(struct timespec *ts) EXPORT_SYMBOL(timekeeping_clocktai); +/** + * ktime_get_clocktai - Returns the TAI time of day in a ktime + * + * Returns the time of day in a ktime. + */ +ktime_t ktime_get_clocktai(void) +{ + struct timespec ts; + + timekeeping_clocktai(&ts); + return timespec_to_ktime(ts); +} +EXPORT_SYMBOL(ktime_get_clocktai); + #ifdef CONFIG_NTP_PPS /** @@ -569,6 +584,7 @@ s32 timekeeping_get_tai_offset(void) void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset) { tk->tai_offset = tai_offset; + tk->offs_tai = ktime_sub(tk->offs_real, ktime_set(tai_offset, 0)); } /** @@ -1539,7 +1555,8 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, * Returns current monotonic time and updates the offsets * Called from hrtimer_interupt() or retrigger_next_event() */ -ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot) +ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot, + ktime_t *offs_tai) { struct timekeeper *tk = &timekeeper; ktime_t now; @@ -1554,6 +1571,7 @@ ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot) *offs_real = tk->offs_real; *offs_boot = tk->offs_boot; + *offs_tai = tk->offs_tai; } while (read_seqretry(&tk->lock, seq)); now = ktime_add_ns(ktime_set(secs, 0), nsecs); From 23a9537a6999fce16f06ca61fc6cac52c8fbdc86 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 Feb 2013 22:51:36 +0000 Subject: [PATCH 21/52] timekeeping: Calc stuff once Calculate the cycle interval shifted value once. 
No functional change, just makes the code more readable. Signed-off-by: Thomas Gleixner Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 8061ae0be7bd..c442a4ccccc9 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1250,15 +1250,16 @@ static inline void accumulate_nsecs_to_secs(struct timekeeper *tk) static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, u32 shift) { + cycle_t interval = tk->cycle_interval << shift; u64 raw_nsecs; /* If the offset is smaller than a shifted interval, do nothing */ - if (offset < tk->cycle_interval<<shift) + if (offset < interval) return offset; /* Accumulate one shifted interval */ - offset -= tk->cycle_interval << shift; - tk->clock->cycle_last += tk->cycle_interval << shift; + offset -= interval; + tk->clock->cycle_last += interval; tk->xtime_nsec += tk->xtime_interval << shift; accumulate_nsecs_to_secs(tk); From eb93e4d93093615c60cb7dd3dcb24e46bd7d62d4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 Feb 2013 22:51:36 +0000 Subject: [PATCH 22/52] timekeeping: Make jiffies_lock internal Nothing outside of the timekeeping core needs that lock. Signed-off-by: Thomas Gleixner Signed-off-by: John Stultz --- include/linux/jiffies.h | 1 - kernel/time/tick-internal.h | 2 ++ kernel/time/timekeeping.c | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index 82ed068b1ebe..8fb8edf12417 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -75,7 +75,6 @@ extern int register_refined_jiffies(long clock_tick_rate); */ extern u64 __jiffy_data jiffies_64; extern unsigned long volatile __jiffy_data jiffies; -extern seqlock_t jiffies_lock; #if (BITS_PER_LONG < 64) u64 get_jiffies_64(void); diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index cf3e59ed6dc0..f5c9207967cf 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -4,6 +4,8 @@ #include <linux/hrtimer.h> #include <linux/tick.h> +extern seqlock_t jiffies_lock; + #ifdef CONFIG_GENERIC_CLOCKEVENTS_BUILD #define TICK_DO_TIMER_NONE -1 diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index c442a4ccccc9..b0c648fc959f 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -23,6 +23,7 @@ #include <linux/stop_machine.h> #include <linux/pvclock_gtod.h> +#include "tick-internal.h" static struct timekeeper timekeeper; From 7e40672d930b369c1984457233ec5557aa53bfb8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 Feb 2013 22:51:37 +0000 Subject: [PATCH 23/52] timekeeping: Move lock out of timekeeper struct Make the lock a separate entity. Preparatory patch for shadow timekeeper structure.
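As an illustration of where this is headed, a minimal sketch (the shadow_timekeeper and update_sketch() names below are hypothetical at this point in the series; only timekeeper and timekeeper_lock come from this patch): with the seqlock at file scope, struct timekeeper becomes plain data that a writer can copy wholesale, which an update scheme based on a shadow copy requires:

	#include <linux/seqlock.h>
	#include <linux/string.h>
	#include <linux/timekeeper_internal.h>

	static struct timekeeper timekeeper;
	static struct timekeeper shadow_timekeeper;	/* hypothetical, arrives later */
	static DEFINE_SEQLOCK(timekeeper_lock);

	static void update_sketch(void)
	{
		unsigned long flags;

		write_seqlock_irqsave(&timekeeper_lock, flags);
		/* A plain memcpy() is only safe because the lock is no
		 * longer embedded in the structure being copied. */
		memcpy(&shadow_timekeeper, &timekeeper, sizeof(timekeeper));
		write_sequnlock_irqrestore(&timekeeper_lock, flags);
	}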
Signed-off-by: Thomas Gleixner [Merged with CLOCK_TAI changes] Signed-off-by: John Stultz --- include/linux/timekeeper_internal.h | 2 - kernel/time/timekeeping.c | 108 ++++++++++++++-------------- 2 files changed, 53 insertions(+), 57 deletions(-) diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h index 26700d870506..a151bd70e52b 100644 --- a/include/linux/timekeeper_internal.h +++ b/include/linux/timekeeper_internal.h @@ -67,8 +67,6 @@ struct timekeeper { /* Offset clock monotonic -> clock tai */ ktime_t offs_tai; - /* Seqlock for all timekeeper values */ - seqlock_t lock; }; static inline struct timespec tk_xtime(struct timekeeper *tk) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index b0c648fc959f..caede71c0a35 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -26,6 +26,7 @@ #include "tick-internal.h" static struct timekeeper timekeeper; +static DEFINE_SEQLOCK(timekeeper_lock); /* flag for if timekeeping is suspended */ int __read_mostly timekeeping_suspended; @@ -212,11 +213,11 @@ int pvclock_gtod_register_notifier(struct notifier_block *nb) unsigned long flags; int ret; - write_seqlock_irqsave(&tk->lock, flags); + write_seqlock_irqsave(&timekeeper_lock, flags); ret = raw_notifier_chain_register(&pvclock_gtod_chain, nb); /* update timekeeping data */ update_pvclock_gtod(tk); - write_sequnlock_irqrestore(&tk->lock, flags); + write_sequnlock_irqrestore(&timekeeper_lock, flags); return ret; } @@ -230,13 +231,12 @@ EXPORT_SYMBOL_GPL(pvclock_gtod_register_notifier); */ int pvclock_gtod_unregister_notifier(struct notifier_block *nb) { - struct timekeeper *tk = &timekeeper; unsigned long flags; int ret; - write_seqlock_irqsave(&tk->lock, flags); + write_seqlock_irqsave(&timekeeper_lock, flags); ret = raw_notifier_chain_unregister(&pvclock_gtod_chain, nb); - write_sequnlock_irqrestore(&tk->lock, flags); + write_sequnlock_irqrestore(&timekeeper_lock, flags); return ret; } @@ -296,12 +296,12 @@ int __getnstimeofday(struct timespec *ts) s64 nsecs = 0; do { - seq = read_seqbegin(&tk->lock); + seq = read_seqbegin(&timekeeper_lock); ts->tv_sec = tk->xtime_sec; nsecs = timekeeping_get_ns(tk); - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqretry(&timekeeper_lock, seq)); ts->tv_nsec = 0; timespec_add_ns(ts, nsecs); @@ -337,11 +337,11 @@ ktime_t ktime_get(void) WARN_ON(timekeeping_suspended); do { - seq = read_seqbegin(&tk->lock); + seq = read_seqbegin(&timekeeper_lock); secs = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; nsecs = timekeeping_get_ns(tk) + tk->wall_to_monotonic.tv_nsec; - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqretry(&timekeeper_lock, seq)); /* * Use ktime_set/ktime_add_ns to create a proper ktime on * 32-bit architectures without CONFIG_KTIME_SCALAR. 
@@ -368,12 +368,12 @@ void ktime_get_ts(struct timespec *ts) WARN_ON(timekeeping_suspended); do { - seq = read_seqbegin(&tk->lock); + seq = read_seqbegin(&timekeeper_lock); ts->tv_sec = tk->xtime_sec; nsec = timekeeping_get_ns(tk); tomono = tk->wall_to_monotonic; - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqretry(&timekeeper_lock, seq)); ts->tv_sec += tomono.tv_sec; ts->tv_nsec = 0; @@ -397,12 +397,12 @@ void timekeeping_clocktai(struct timespec *ts) WARN_ON(timekeeping_suspended); do { - seq = read_seqbegin(&tk->lock); + seq = read_seqbegin(&timekeeper_lock); ts->tv_sec = tk->xtime_sec + tk->tai_offset; nsecs = timekeeping_get_ns(tk); - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqretry(&timekeeper_lock, seq)); ts->tv_nsec = 0; timespec_add_ns(ts, nsecs); @@ -445,7 +445,7 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) WARN_ON_ONCE(timekeeping_suspended); do { - seq = read_seqbegin(&tk->lock); + seq = read_seqbegin(&timekeeper_lock); *ts_raw = tk->raw_time; ts_real->tv_sec = tk->xtime_sec; @@ -454,7 +454,7 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) nsecs_raw = timekeeping_get_ns_raw(tk); nsecs_real = timekeeping_get_ns(tk); - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqretry(&timekeeper_lock, seq)); timespec_add_ns(ts_raw, nsecs_raw); timespec_add_ns(ts_real, nsecs_real); @@ -494,7 +494,7 @@ int do_settimeofday(const struct timespec *tv) if (!timespec_valid_strict(tv)) return -EINVAL; - write_seqlock_irqsave(&tk->lock, flags); + write_seqlock_irqsave(&timekeeper_lock, flags); timekeeping_forward_now(tk); @@ -508,7 +508,7 @@ int do_settimeofday(const struct timespec *tv) timekeeping_update(tk, true); - write_sequnlock_irqrestore(&tk->lock, flags); + write_sequnlock_irqrestore(&timekeeper_lock, flags); /* signal hrtimers about time change */ clock_was_set(); @@ -533,7 +533,7 @@ int timekeeping_inject_offset(struct timespec *ts) if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) return -EINVAL; - write_seqlock_irqsave(&tk->lock, flags); + write_seqlock_irqsave(&timekeeper_lock, flags); timekeeping_forward_now(tk); @@ -550,7 +550,7 @@ int timekeeping_inject_offset(struct timespec *ts) error: /* even if we error out, we forwarded the time, so call update */ timekeeping_update(tk, true); - write_sequnlock_irqrestore(&tk->lock, flags); + write_sequnlock_irqrestore(&timekeeper_lock, flags); /* signal hrtimers about time change */ clock_was_set(); @@ -571,9 +571,9 @@ s32 timekeeping_get_tai_offset(void) s32 ret; do { - seq = read_seqbegin(&tk->lock); + seq = read_seqbegin(&timekeeper_lock); ret = tk->tai_offset; - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqretry(&timekeeper_lock, seq)); return ret; } @@ -597,9 +597,9 @@ void timekeeping_set_tai_offset(s32 tai_offset) struct timekeeper *tk = &timekeeper; unsigned long flags; - write_seqlock_irqsave(&tk->lock, flags); + write_seqlock_irqsave(&timekeeper_lock, flags); __timekeeping_set_tai_offset(tk, tai_offset); - write_sequnlock_irqrestore(&tk->lock, flags); + write_sequnlock_irqrestore(&timekeeper_lock, flags); } /** @@ -615,7 +615,7 @@ static int change_clocksource(void *data) new = (struct clocksource *) data; - write_seqlock_irqsave(&tk->lock, flags); + write_seqlock_irqsave(&timekeeper_lock, flags); timekeeping_forward_now(tk); if (!new->enable || new->enable(new) == 0) { @@ -626,7 +626,7 @@ static int change_clocksource(void *data) } timekeeping_update(tk, true); - write_sequnlock_irqrestore(&tk->lock, flags); 
+ write_sequnlock_irqrestore(&timekeeper_lock, flags); return 0; } @@ -676,11 +676,11 @@ void getrawmonotonic(struct timespec *ts) s64 nsecs; do { - seq = read_seqbegin(&tk->lock); + seq = read_seqbegin(&timekeeper_lock); nsecs = timekeeping_get_ns_raw(tk); *ts = tk->raw_time; - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqretry(&timekeeper_lock, seq)); timespec_add_ns(ts, nsecs); } @@ -696,11 +696,11 @@ int timekeeping_valid_for_hres(void) int ret; do { - seq = read_seqbegin(&tk->lock); + seq = read_seqbegin(&timekeeper_lock); ret = tk->clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqretry(&timekeeper_lock, seq)); return ret; } @@ -715,11 +715,11 @@ u64 timekeeping_max_deferment(void) u64 ret; do { - seq = read_seqbegin(&tk->lock); + seq = read_seqbegin(&timekeeper_lock); ret = tk->clock->max_idle_ns; - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqretry(&timekeeper_lock, seq)); return ret; } @@ -782,11 +782,9 @@ void __init timekeeping_init(void) boot.tv_nsec = 0; } - seqlock_init(&tk->lock); - ntp_init(); - write_seqlock_irqsave(&tk->lock, flags); + write_seqlock_irqsave(&timekeeper_lock, flags); clock = clocksource_default_clock(); if (clock->enable) clock->enable(clock); @@ -805,7 +803,7 @@ void __init timekeeping_init(void) tmp.tv_nsec = 0; tk_set_sleep_time(tk, tmp); - write_sequnlock_irqrestore(&tk->lock, flags); + write_sequnlock_irqrestore(&timekeeper_lock, flags); } /* time in seconds when suspend began */ @@ -853,7 +851,7 @@ void timekeeping_inject_sleeptime(struct timespec *delta) if (has_persistent_clock()) return; - write_seqlock_irqsave(&tk->lock, flags); + write_seqlock_irqsave(&timekeeper_lock, flags); timekeeping_forward_now(tk); @@ -861,7 +859,7 @@ void timekeeping_inject_sleeptime(struct timespec *delta) timekeeping_update(tk, true); - write_sequnlock_irqrestore(&tk->lock, flags); + write_sequnlock_irqrestore(&timekeeper_lock, flags); /* signal hrtimers about time change */ clock_was_set(); @@ -888,7 +886,7 @@ static void timekeeping_resume(void) clockevents_resume(); clocksource_resume(); - write_seqlock_irqsave(&tk->lock, flags); + write_seqlock_irqsave(&timekeeper_lock, flags); /* * After system resumes, we need to calculate the suspended time and @@ -940,7 +938,7 @@ static void timekeeping_resume(void) tk->ntp_error = 0; timekeeping_suspended = 0; timekeeping_update(tk, false); - write_sequnlock_irqrestore(&tk->lock, flags); + write_sequnlock_irqrestore(&timekeeper_lock, flags); touch_softlockup_watchdog(); @@ -959,7 +957,7 @@ static int timekeeping_suspend(void) read_persistent_clock(&timekeeping_suspend_time); - write_seqlock_irqsave(&tk->lock, flags); + write_seqlock_irqsave(&timekeeper_lock, flags); timekeeping_forward_now(tk); timekeeping_suspended = 1; @@ -982,7 +980,7 @@ static int timekeeping_suspend(void) timekeeping_suspend_time = timespec_add(timekeeping_suspend_time, delta_delta); } - write_sequnlock_irqrestore(&tk->lock, flags); + write_sequnlock_irqrestore(&timekeeper_lock, flags); clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); clocksource_suspend(); @@ -1322,7 +1320,7 @@ static void update_wall_time(void) int shift = 0, maxshift; unsigned long flags; - write_seqlock_irqsave(&tk->lock, flags); + write_seqlock_irqsave(&timekeeper_lock, flags); /* Make sure we're fully resumed: */ if (unlikely(timekeeping_suspended)) @@ -1377,7 +1375,7 @@ static void update_wall_time(void) timekeeping_update(tk, false); out: - write_sequnlock_irqrestore(&tk->lock, flags); + 
write_sequnlock_irqrestore(&timekeeper_lock, flags); } @@ -1425,13 +1423,13 @@ void get_monotonic_boottime(struct timespec *ts) WARN_ON(timekeeping_suspended); do { - seq = read_seqbegin(&tk->lock); + seq = read_seqbegin(&timekeeper_lock); ts->tv_sec = tk->xtime_sec; nsec = timekeeping_get_ns(tk); tomono = tk->wall_to_monotonic; sleep = tk->total_sleep_time; - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqretry(&timekeeper_lock, seq)); ts->tv_sec += tomono.tv_sec + sleep.tv_sec; ts->tv_nsec = 0; @@ -1490,10 +1488,10 @@ struct timespec current_kernel_time(void) unsigned long seq; do { - seq = read_seqbegin(&tk->lock); + seq = read_seqbegin(&timekeeper_lock); now = tk_xtime(tk); - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqretry(&timekeeper_lock, seq)); return now; } @@ -1506,11 +1504,11 @@ struct timespec get_monotonic_coarse(void) unsigned long seq; do { - seq = read_seqbegin(&tk->lock); + seq = read_seqbegin(&timekeeper_lock); now = tk_xtime(tk); mono = tk->wall_to_monotonic; - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqretry(&timekeeper_lock, seq)); set_normalized_timespec(&now, now.tv_sec + mono.tv_sec, now.tv_nsec + mono.tv_nsec); @@ -1541,11 +1539,11 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, unsigned long seq; do { - seq = read_seqbegin(&tk->lock); + seq = read_seqbegin(&timekeeper_lock); *xtim = tk_xtime(tk); *wtom = tk->wall_to_monotonic; *sleep = tk->total_sleep_time; - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqretry(&timekeeper_lock, seq)); } #ifdef CONFIG_HIGH_RES_TIMERS @@ -1566,7 +1564,7 @@ ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot, u64 secs, nsecs; do { - seq = read_seqbegin(&tk->lock); + seq = read_seqbegin(&timekeeper_lock); secs = tk->xtime_sec; nsecs = timekeeping_get_ns(tk); @@ -1574,7 +1572,7 @@ ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot, *offs_real = tk->offs_real; *offs_boot = tk->offs_boot; *offs_tai = tk->offs_tai; - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqretry(&timekeeper_lock, seq)); now = ktime_add_ns(ktime_set(secs, 0), nsecs); now = ktime_sub(now, *offs_real); @@ -1592,9 +1590,9 @@ ktime_t ktime_get_monotonic_offset(void) struct timespec wtom; do { - seq = read_seqbegin(&tk->lock); + seq = read_seqbegin(&timekeeper_lock); wtom = tk->wall_to_monotonic; - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqretry(&timekeeper_lock, seq)); return timespec_to_ktime(wtom); } From 9a7a71b1d0968fc2bd602b7481cde1d4872e01ff Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 Feb 2013 22:51:38 +0000 Subject: [PATCH 24/52] timekeeping: Split timekeeper_lock into lock and seqcount We want to shorten the seqcount write hold time. So split the seqlock into a lock and a seqcount. Open code the seqwrite_lock in the places which matter and drop the sequence counter update where it's pointless. 
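For reference, the read and write sides after the split look like this sketch (timekeeper_lock and timekeeper_seq are the names this patch introduces; writer_sketch(), reader_sketch() and sample_value are illustrative stand-ins, not kernel code):

	#include <linux/seqlock.h>
	#include <linux/spinlock.h>
	#include <linux/types.h>

	static DEFINE_RAW_SPINLOCK(timekeeper_lock);
	static seqcount_t timekeeper_seq;
	static s32 sample_value;	/* stand-in for the timekeeper data */

	/* Writers serialize against each other on the raw spinlock and
	 * hold the seqcount write section only around the data update. */
	static void writer_sketch(s32 new_value)
	{
		unsigned long flags;

		raw_spin_lock_irqsave(&timekeeper_lock, flags);
		write_seqcount_begin(&timekeeper_seq);
		sample_value = new_value;
		write_seqcount_end(&timekeeper_seq);
		raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
	}

	/* Readers never take the lock; they retry on the seqcount alone. */
	static s32 reader_sketch(void)
	{
		unsigned int seq;
		s32 ret;

		do {
			seq = read_seqcount_begin(&timekeeper_seq);
			ret = sample_value;
		} while (read_seqcount_retry(&timekeeper_seq, seq));

		return ret;
	}

The point of the split is that any work a writer does outside the write_seqcount_begin()/end() pair no longer makes concurrent readers spin.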
Signed-off-by: Thomas Gleixner [jstultz: Merge fixups from CLOCK_TAI collisions] Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 132 +++++++++++++++++++++----------------- 1 file changed, 73 insertions(+), 59 deletions(-) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index caede71c0a35..5e048e030c62 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -26,7 +26,8 @@ #include "tick-internal.h" static struct timekeeper timekeeper; -static DEFINE_SEQLOCK(timekeeper_lock); +static DEFINE_RAW_SPINLOCK(timekeeper_lock); +static seqcount_t timekeeper_seq; /* flag for if timekeeping is suspended */ int __read_mostly timekeeping_suspended; @@ -204,8 +205,6 @@ static void update_pvclock_gtod(struct timekeeper *tk) /** * pvclock_gtod_register_notifier - register a pvclock timedata update listener - * - * Must hold write on timekeeper.lock */ int pvclock_gtod_register_notifier(struct notifier_block *nb) { @@ -213,11 +212,10 @@ int pvclock_gtod_register_notifier(struct notifier_block *nb) unsigned long flags; int ret; - write_seqlock_irqsave(&timekeeper_lock, flags); + raw_spin_lock_irqsave(&timekeeper_lock, flags); ret = raw_notifier_chain_register(&pvclock_gtod_chain, nb); - /* update timekeeping data */ update_pvclock_gtod(tk); - write_sequnlock_irqrestore(&timekeeper_lock, flags); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); return ret; } @@ -226,23 +224,21 @@ EXPORT_SYMBOL_GPL(pvclock_gtod_register_notifier); /** * pvclock_gtod_unregister_notifier - unregister a pvclock * timedata update listener - * - * Must hold write on timekeeper.lock */ int pvclock_gtod_unregister_notifier(struct notifier_block *nb) { unsigned long flags; int ret; - write_seqlock_irqsave(&timekeeper_lock, flags); + raw_spin_lock_irqsave(&timekeeper_lock, flags); ret = raw_notifier_chain_unregister(&pvclock_gtod_chain, nb); - write_sequnlock_irqrestore(&timekeeper_lock, flags); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); return ret; } EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier); -/* must hold write on timekeeper.lock */ +/* must hold timekeeper_lock */ static void timekeeping_update(struct timekeeper *tk, bool clearntp) { if (clearntp) { @@ -296,12 +292,12 @@ int __getnstimeofday(struct timespec *ts) s64 nsecs = 0; do { - seq = read_seqbegin(&timekeeper_lock); + seq = read_seqcount_begin(&timekeeper_seq); ts->tv_sec = tk->xtime_sec; nsecs = timekeeping_get_ns(tk); - } while (read_seqretry(&timekeeper_lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); ts->tv_nsec = 0; timespec_add_ns(ts, nsecs); @@ -337,11 +333,11 @@ ktime_t ktime_get(void) WARN_ON(timekeeping_suspended); do { - seq = read_seqbegin(&timekeeper_lock); + seq = read_seqcount_begin(&timekeeper_seq); secs = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; nsecs = timekeeping_get_ns(tk) + tk->wall_to_monotonic.tv_nsec; - } while (read_seqretry(&timekeeper_lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); /* * Use ktime_set/ktime_add_ns to create a proper ktime on * 32-bit architectures without CONFIG_KTIME_SCALAR. 
@@ -368,12 +364,12 @@ void ktime_get_ts(struct timespec *ts) WARN_ON(timekeeping_suspended); do { - seq = read_seqbegin(&timekeeper_lock); + seq = read_seqcount_begin(&timekeeper_seq); ts->tv_sec = tk->xtime_sec; nsec = timekeeping_get_ns(tk); tomono = tk->wall_to_monotonic; - } while (read_seqretry(&timekeeper_lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); ts->tv_sec += tomono.tv_sec; ts->tv_nsec = 0; @@ -397,12 +393,12 @@ void timekeeping_clocktai(struct timespec *ts) WARN_ON(timekeeping_suspended); do { - seq = read_seqbegin(&timekeeper_lock); + seq = read_seqcount_begin(&timekeeper_seq); ts->tv_sec = tk->xtime_sec + tk->tai_offset; nsecs = timekeeping_get_ns(tk); - } while (read_seqretry(&timekeeper_lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); ts->tv_nsec = 0; timespec_add_ns(ts, nsecs); @@ -445,7 +441,7 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) WARN_ON_ONCE(timekeeping_suspended); do { - seq = read_seqbegin(&timekeeper_lock); + seq = read_seqcount_begin(&timekeeper_seq); *ts_raw = tk->raw_time; ts_real->tv_sec = tk->xtime_sec; @@ -454,7 +450,7 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) nsecs_raw = timekeeping_get_ns_raw(tk); nsecs_real = timekeeping_get_ns(tk); - } while (read_seqretry(&timekeeper_lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); timespec_add_ns(ts_raw, nsecs_raw); timespec_add_ns(ts_real, nsecs_real); @@ -494,7 +490,8 @@ int do_settimeofday(const struct timespec *tv) if (!timespec_valid_strict(tv)) return -EINVAL; - write_seqlock_irqsave(&timekeeper_lock, flags); + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&timekeeper_seq); timekeeping_forward_now(tk); @@ -508,7 +505,8 @@ int do_settimeofday(const struct timespec *tv) timekeeping_update(tk, true); - write_sequnlock_irqrestore(&timekeeper_lock, flags); + write_seqcount_end(&timekeeper_seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); /* signal hrtimers about time change */ clock_was_set(); @@ -533,7 +531,8 @@ int timekeeping_inject_offset(struct timespec *ts) if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) return -EINVAL; - write_seqlock_irqsave(&timekeeper_lock, flags); + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&timekeeper_seq); timekeeping_forward_now(tk); @@ -550,7 +549,8 @@ int timekeeping_inject_offset(struct timespec *ts) error: /* even if we error out, we forwarded the time, so call update */ timekeeping_update(tk, true); - write_sequnlock_irqrestore(&timekeeper_lock, flags); + write_seqcount_end(&timekeeper_seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); /* signal hrtimers about time change */ clock_was_set(); @@ -571,9 +571,9 @@ s32 timekeeping_get_tai_offset(void) s32 ret; do { - seq = read_seqbegin(&timekeeper_lock); + seq = read_seqcount_begin(&timekeeper_seq); ret = tk->tai_offset; - } while (read_seqretry(&timekeeper_lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); return ret; } @@ -597,9 +597,11 @@ void timekeeping_set_tai_offset(s32 tai_offset) struct timekeeper *tk = &timekeeper; unsigned long flags; - write_seqlock_irqsave(&timekeeper_lock, flags); + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&timekeeper_seq); __timekeeping_set_tai_offset(tk, tai_offset); - write_sequnlock_irqrestore(&timekeeper_lock, flags); + write_seqcount_end(&timekeeper_seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); } /** @@ -615,7 +617,8 @@ static 
int change_clocksource(void *data) new = (struct clocksource *) data; - write_seqlock_irqsave(&timekeeper_lock, flags); + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&timekeeper_seq); timekeeping_forward_now(tk); if (!new->enable || new->enable(new) == 0) { @@ -626,7 +629,8 @@ static int change_clocksource(void *data) } timekeeping_update(tk, true); - write_sequnlock_irqrestore(&timekeeper_lock, flags); + write_seqcount_end(&timekeeper_seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); return 0; } @@ -676,11 +680,11 @@ void getrawmonotonic(struct timespec *ts) s64 nsecs; do { - seq = read_seqbegin(&timekeeper_lock); + seq = read_seqcount_begin(&timekeeper_seq); nsecs = timekeeping_get_ns_raw(tk); *ts = tk->raw_time; - } while (read_seqretry(&timekeeper_lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); timespec_add_ns(ts, nsecs); } @@ -696,11 +700,11 @@ int timekeeping_valid_for_hres(void) int ret; do { - seq = read_seqbegin(&timekeeper_lock); + seq = read_seqcount_begin(&timekeeper_seq); ret = tk->clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; - } while (read_seqretry(&timekeeper_lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); return ret; } @@ -715,11 +719,11 @@ u64 timekeeping_max_deferment(void) u64 ret; do { - seq = read_seqbegin(&timekeeper_lock); + seq = read_seqcount_begin(&timekeeper_seq); ret = tk->clock->max_idle_ns; - } while (read_seqretry(&timekeeper_lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); return ret; } @@ -784,7 +788,8 @@ void __init timekeeping_init(void) ntp_init(); - write_seqlock_irqsave(&timekeeper_lock, flags); + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&timekeeper_seq); clock = clocksource_default_clock(); if (clock->enable) clock->enable(clock); @@ -803,7 +808,8 @@ void __init timekeeping_init(void) tmp.tv_nsec = 0; tk_set_sleep_time(tk, tmp); - write_sequnlock_irqrestore(&timekeeper_lock, flags); + write_seqcount_end(&timekeeper_seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); } /* time in seconds when suspend began */ @@ -851,7 +857,8 @@ void timekeeping_inject_sleeptime(struct timespec *delta) if (has_persistent_clock()) return; - write_seqlock_irqsave(&timekeeper_lock, flags); + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&timekeeper_seq); timekeeping_forward_now(tk); @@ -859,7 +866,8 @@ void timekeeping_inject_sleeptime(struct timespec *delta) timekeeping_update(tk, true); - write_sequnlock_irqrestore(&timekeeper_lock, flags); + write_seqcount_end(&timekeeper_seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); /* signal hrtimers about time change */ clock_was_set(); @@ -886,7 +894,8 @@ static void timekeeping_resume(void) clockevents_resume(); clocksource_resume(); - write_seqlock_irqsave(&timekeeper_lock, flags); + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&timekeeper_seq); /* * After system resumes, we need to calculate the suspended time and @@ -938,7 +947,8 @@ static void timekeeping_resume(void) tk->ntp_error = 0; timekeeping_suspended = 0; timekeeping_update(tk, false); - write_sequnlock_irqrestore(&timekeeper_lock, flags); + write_seqcount_end(&timekeeper_seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); touch_softlockup_watchdog(); @@ -957,7 +967,8 @@ static int timekeeping_suspend(void) read_persistent_clock(&timekeeping_suspend_time); - write_seqlock_irqsave(&timekeeper_lock, flags); + raw_spin_lock_irqsave(&timekeeper_lock, flags); + 
write_seqcount_begin(&timekeeper_seq); timekeeping_forward_now(tk); timekeeping_suspended = 1; @@ -980,7 +991,8 @@ static int timekeeping_suspend(void) timekeeping_suspend_time = timespec_add(timekeeping_suspend_time, delta_delta); } - write_sequnlock_irqrestore(&timekeeper_lock, flags); + write_seqcount_end(&timekeeper_seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); clocksource_suspend(); @@ -1320,7 +1332,8 @@ static void update_wall_time(void) int shift = 0, maxshift; unsigned long flags; - write_seqlock_irqsave(&timekeeper_lock, flags); + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&timekeeper_seq); /* Make sure we're fully resumed: */ if (unlikely(timekeeping_suspended)) @@ -1375,7 +1388,8 @@ static void update_wall_time(void) timekeeping_update(tk, false); out: - write_sequnlock_irqrestore(&timekeeper_lock, flags); + write_seqcount_end(&timekeeper_seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); } @@ -1423,13 +1437,13 @@ void get_monotonic_boottime(struct timespec *ts) WARN_ON(timekeeping_suspended); do { - seq = read_seqbegin(&timekeeper_lock); + seq = read_seqcount_begin(&timekeeper_seq); ts->tv_sec = tk->xtime_sec; nsec = timekeeping_get_ns(tk); tomono = tk->wall_to_monotonic; sleep = tk->total_sleep_time; - } while (read_seqretry(&timekeeper_lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); ts->tv_sec += tomono.tv_sec + sleep.tv_sec; ts->tv_nsec = 0; @@ -1488,10 +1502,10 @@ struct timespec current_kernel_time(void) unsigned long seq; do { - seq = read_seqbegin(&timekeeper_lock); + seq = read_seqcount_begin(&timekeeper_seq); now = tk_xtime(tk); - } while (read_seqretry(&timekeeper_lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); return now; } @@ -1504,11 +1518,11 @@ struct timespec get_monotonic_coarse(void) unsigned long seq; do { - seq = read_seqbegin(&timekeeper_lock); + seq = read_seqcount_begin(&timekeeper_seq); now = tk_xtime(tk); mono = tk->wall_to_monotonic; - } while (read_seqretry(&timekeeper_lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); set_normalized_timespec(&now, now.tv_sec + mono.tv_sec, now.tv_nsec + mono.tv_nsec); @@ -1539,11 +1553,11 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, unsigned long seq; do { - seq = read_seqbegin(&timekeeper_lock); + seq = read_seqcount_begin(&timekeeper_seq); *xtim = tk_xtime(tk); *wtom = tk->wall_to_monotonic; *sleep = tk->total_sleep_time; - } while (read_seqretry(&timekeeper_lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); } #ifdef CONFIG_HIGH_RES_TIMERS @@ -1564,7 +1578,7 @@ ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot, u64 secs, nsecs; do { - seq = read_seqbegin(&timekeeper_lock); + seq = read_seqcount_begin(&timekeeper_seq); secs = tk->xtime_sec; nsecs = timekeeping_get_ns(tk); @@ -1572,7 +1586,7 @@ ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot, *offs_real = tk->offs_real; *offs_boot = tk->offs_boot; *offs_tai = tk->offs_tai; - } while (read_seqretry(&timekeeper_lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); now = ktime_add_ns(ktime_set(secs, 0), nsecs); now = ktime_sub(now, *offs_real); @@ -1590,9 +1604,9 @@ ktime_t ktime_get_monotonic_offset(void) struct timespec wtom; do { - seq = read_seqbegin(&timekeeper_lock); + seq = read_seqcount_begin(&timekeeper_seq); wtom = tk->wall_to_monotonic; - } while (read_seqretry(&timekeeper_lock, seq)); + } while 
(read_seqcount_retry(&timekeeper_seq, seq)); return timespec_to_ktime(wtom); } From 8ffbc7d9ff0064ce000788272a609fa79d06ead1 Mon Sep 17 00:00:00 2001 From: David Daney Date: Wed, 13 Mar 2013 12:20:38 -0700 Subject: [PATCH 25/52] hrtimer/trivial: Fix comment text in hrtimer.c The comments mention HRTIMER_ABS and HRTIMER_REL, but these symbols don't exist; the proper names are HRTIMER_MODE_ABS and HRTIMER_MODE_REL. Signed-off-by: David Daney Cc: Jiri Kosina Link: http://lkml.kernel.org/r/1363202438-21234-1-git-send-email-ddaney.cavm@gmail.com Signed-off-by: Thomas Gleixner --- kernel/hrtimer.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index cc47812d3feb..031184e3d872 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1010,7 +1010,8 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, * @timer: the timer to be added * @tim: expiry time * @delta_ns: "slack" range for the timer - * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL) + * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or + * relative (HRTIMER_MODE_REL) * * Returns: * 0 on success @@ -1027,7 +1028,8 @@ EXPORT_SYMBOL_GPL(hrtimer_start_range_ns); * hrtimer_start - (re)start an hrtimer on the current CPU * @timer: the timer to be added * @tim: expiry time - * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL) + * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or + * relative (HRTIMER_MODE_REL) * * Returns: * 0 on success From cfea7d7e452f57682a0bb55a55e9f79c569558c2 Mon Sep 17 00:00:00 2001 From: Rado Vrbovsky Date: Fri, 8 Feb 2013 12:37:30 -0500 Subject: [PATCH 26/52] tick: Change log level of NOHZ: local_softirq_pending message The "NOHZ: local_softirq_pending" message is largely informational. It makes extra work for customers that have a policy of investigating all kernel log messages logged at <= KERN_ERR log level. This patch lowers the message to warning level. [ tglx: Use pr_warn() ] Signed-off-by: Rado Vrbovsky Cc: Don Zickus Link: http://lkml.kernel.org/r/2037057938.893524.1360345050772.JavaMail.root@redhat.com Signed-off-by: Thomas Gleixner --- kernel/time/tick-sched.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index a19a39952c1b..225f8bf19095 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -482,8 +482,8 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) if (ratelimit < 10 && (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) { - printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", - (unsigned int) local_softirq_pending()); + pr_warn("NOHZ: local_softirq_pending %02x\n", + (unsigned int) local_softirq_pending()); ratelimit++; } return false; From dd5d70e869f960bde6376f4447fff59f16186cf5 Mon Sep 17 00:00:00 2001 From: Fengguang Wu Date: Mon, 25 Mar 2013 12:24:24 -0700 Subject: [PATCH 27/52] timekeeping: __timekeeping_set_tai_offset can be static Yet again, the kbuild test robot saves the day, noting I left out defining __timekeeping_set_tai_offset as static. It even sent me this patch.
Reported-by: Fengguang Wu Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 5e048e030c62..c5feb7aa3acb 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -582,7 +582,7 @@ s32 timekeeping_get_tai_offset(void) * __timekeeping_set_tai_offset - Lock free worker function * */ -void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset) +static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset) { tk->tai_offset = tai_offset; tk->offs_tai = ktime_sub(tk->offs_real, ktime_set(tai_offset, 0)); From 8011657b9e63cb2e914b9a0f75233b910c1854cb Mon Sep 17 00:00:00 2001 From: Christian Daudt Date: Wed, 13 Mar 2013 14:27:27 -0700 Subject: [PATCH 28/52] ARM: bcm281xx: Add timer driver (driver portion) This adds support for the Broadcom timer, used in the following SoCs: BCM11130, BCM11140, BCM11351, BCM28145, BCM28155 Updates from V6: - Split DT portion into a separate patch Updates from V5: - Rebase to latest arm-soc/for-next Updates from V4: - Switch code to use CLOCKSOURCE_OF_DECLARE Updates from V3: - Migrate to 3.9 timer framework updates Updates from V2: - prepend static fns + fields with kona_ Updates from V1: - Rename bcm_timer.c to bcm_kona_timer.c - Pull .h into bcm_kona_timer.c - Make timers static - Clean up comment block - Switched to using clockevents_config_and_register - Added an error to the get_timer loop if it repeats too much - Added to Documentation/devicetree/bindings/arm/bcm/bcm,kona-timer.txt - Added missing readl to timer_disable_and_clear Note: bcm,kona-timer was kept as the 'compatible' field to make it specific enough for when there are multiple bcm timers (bcm,timer is too generic). 
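For orientation, a device tree node matching what kona_timers_init() looks up might read as follows (a hypothetical example in DT syntax; the unit address, reg window, interrupt specifier and frequency are illustrative, not taken from a real board file). The driver maps reg via of_iomap(), takes the first interrupt via irq_of_parse_and_map(), and panics if clock-frequency is absent, so that property is effectively mandatory:

	timer@35006000 {
		compatible = "bcm,kona-timer";	/* matches bcm_timer_ids */
		reg = <0x35006000 0x1000>;	/* illustrative address/size */
		interrupts = <0x0 7 0x4>;	/* illustrative GIC specifier */
		clock-frequency = <32768>;	/* read into arch_timer_rate */
	};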
Signed-off-by: Christian Daudt Acked-by: Arnd Bergmann Acked-by: John Stultz Reviewed-by: Stephen Warren Signed-off-by: John Stultz --- arch/arm/mach-bcm/Kconfig | 1 + arch/arm/mach-bcm/board_bcm.c | 7 +- drivers/clocksource/Makefile | 1 + drivers/clocksource/bcm_kona_timer.c | 211 +++++++++++++++++++++++++++ 4 files changed, 215 insertions(+), 5 deletions(-) create mode 100644 drivers/clocksource/bcm_kona_timer.c diff --git a/arch/arm/mach-bcm/Kconfig b/arch/arm/mach-bcm/Kconfig index bf02471d7e7c..f11289519c39 100644 --- a/arch/arm/mach-bcm/Kconfig +++ b/arch/arm/mach-bcm/Kconfig @@ -6,6 +6,7 @@ config ARCH_BCM select ARM_ERRATA_764369 if SMP select ARM_GIC select CPU_V7 + select CLKSRC_OF select GENERIC_CLOCKEVENTS select GENERIC_TIME select GPIO_BCM diff --git a/arch/arm/mach-bcm/board_bcm.c b/arch/arm/mach-bcm/board_bcm.c index f0f9abafad29..259593540477 100644 --- a/arch/arm/mach-bcm/board_bcm.c +++ b/arch/arm/mach-bcm/board_bcm.c @@ -16,14 +16,11 @@ #include #include #include +#include #include #include -static void timer_init(void) -{ -} - static void __init board_init(void) { @@ -35,7 +32,7 @@ static const char * const bcm11351_dt_compat[] = { "bcm,bcm11351", NULL, }; DT_MACHINE_START(BCM11351_DT, "Broadcom Application Processor") .init_irq = irqchip_init, - .init_time = timer_init, + .init_time = clocksource_of_init, .init_machine = board_init, .dt_compat = bcm11351_dt_compat, MACHINE_END diff --git a/drivers/clocksource/Makefile b/drivers/clocksource/Makefile index 4d8283aec5b5..96e25319659b 100644 --- a/drivers/clocksource/Makefile +++ b/drivers/clocksource/Makefile @@ -19,6 +19,7 @@ obj-$(CONFIG_ARCH_BCM2835) += bcm2835_timer.o obj-$(CONFIG_SUNXI_TIMER) += sunxi_timer.o obj-$(CONFIG_ARCH_TEGRA) += tegra20_timer.o obj-$(CONFIG_VT8500_TIMER) += vt8500_timer.o +obj-$(CONFIG_ARCH_BCM) += bcm_kona_timer.o obj-$(CONFIG_ARM_ARCH_TIMER) += arm_arch_timer.o obj-$(CONFIG_CLKSRC_METAG_GENERIC) += metag_generic.o diff --git a/drivers/clocksource/bcm_kona_timer.c b/drivers/clocksource/bcm_kona_timer.c new file mode 100644 index 000000000000..350f49356458 --- /dev/null +++ b/drivers/clocksource/bcm_kona_timer.c @@ -0,0 +1,211 @@ +/* + * Copyright (C) 2012 Broadcom Corporation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation version 2. + * + * This program is distributed "as is" WITHOUT ANY WARRANTY of any + * kind, whether express or implied; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ */ + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + + +#define KONA_GPTIMER_STCS_OFFSET 0x00000000 +#define KONA_GPTIMER_STCLO_OFFSET 0x00000004 +#define KONA_GPTIMER_STCHI_OFFSET 0x00000008 +#define KONA_GPTIMER_STCM0_OFFSET 0x0000000C + +#define KONA_GPTIMER_STCS_TIMER_MATCH_SHIFT 0 +#define KONA_GPTIMER_STCS_COMPARE_ENABLE_SHIFT 4 + +struct kona_bcm_timers { + int tmr_irq; + void __iomem *tmr_regs; +}; + +static struct kona_bcm_timers timers; + +static u32 arch_timer_rate; + +/* + * We use the peripheral timers for system tick, the cpu global timer for + * profile tick + */ +static void kona_timer_disable_and_clear(void __iomem *base) +{ + uint32_t reg; + + /* + * clear and disable interrupts + * We are using compare/match register 0 for our system interrupts + */ + reg = readl(base + KONA_GPTIMER_STCS_OFFSET); + + /* Clear compare (0) interrupt */ + reg |= 1 << KONA_GPTIMER_STCS_TIMER_MATCH_SHIFT; + /* disable compare */ + reg &= ~(1 << KONA_GPTIMER_STCS_COMPARE_ENABLE_SHIFT); + + writel(reg, base + KONA_GPTIMER_STCS_OFFSET); + +} + +static void +kona_timer_get_counter(void *timer_base, uint32_t *msw, uint32_t *lsw) +{ + void __iomem *base = IOMEM(timer_base); + int loop_limit = 4; + + /* + * Read 64-bit free running counter + * 1. Read hi-word + * 2. Read low-word + * 3. Read hi-word again + * 4.1 + * if new hi-word is not equal to previously read hi-word, then + * start from #1 + * 4.2 + * if new hi-word is equal to previously read hi-word then stop. + */ + + while (--loop_limit) { + *msw = readl(base + KONA_GPTIMER_STCHI_OFFSET); + *lsw = readl(base + KONA_GPTIMER_STCLO_OFFSET); + if (*msw == readl(base + KONA_GPTIMER_STCHI_OFFSET)) + break; + } + if (!loop_limit) { + pr_err("bcm_kona_timer: getting counter failed.\n"); + pr_err(" Timer will be impacted\n"); + } + + return; +} + +static const struct of_device_id bcm_timer_ids[] __initconst = { + {.compatible = "bcm,kona-timer"}, + {}, +}; + +static void __init kona_timers_init(void) +{ + struct device_node *node; + u32 freq; + + node = of_find_matching_node(NULL, bcm_timer_ids); + + if (!node) + panic("No timer"); + + if (!of_property_read_u32(node, "clock-frequency", &freq)) + arch_timer_rate = freq; + else + panic("clock-frequency not set in the .dts file"); + + /* Setup IRQ numbers */ + timers.tmr_irq = irq_of_parse_and_map(node, 0); + + /* Setup IO addresses */ + timers.tmr_regs = of_iomap(node, 0); + + kona_timer_disable_and_clear(timers.tmr_regs); +} + +static int kona_timer_set_next_event(unsigned long clc, + struct clock_event_device *unused) +{ + /* + * timer (0) is disabled by the timer interrupt already + * so, here we reload the next event value and re-enable + * the timer. + * + * This way, we are potentially losing the time between + * timer-interrupt->set_next_event. CPU local timers, when + * they come in should get rid of skew. 
+ */ + + uint32_t lsw, msw; + uint32_t reg; + + kona_timer_get_counter(timers.tmr_regs, &msw, &lsw); + + /* Load the "next" event tick value */ + writel(lsw + clc, timers.tmr_regs + KONA_GPTIMER_STCM0_OFFSET); + + /* Enable compare */ + reg = readl(timers.tmr_regs + KONA_GPTIMER_STCS_OFFSET); + reg |= (1 << KONA_GPTIMER_STCS_COMPARE_ENABLE_SHIFT); + writel(reg, timers.tmr_regs + KONA_GPTIMER_STCS_OFFSET); + + return 0; +} + +static void kona_timer_set_mode(enum clock_event_mode mode, + struct clock_event_device *unused) +{ + switch (mode) { + case CLOCK_EVT_MODE_ONESHOT: + /* by default mode is one shot don't do any thing */ + break; + case CLOCK_EVT_MODE_UNUSED: + case CLOCK_EVT_MODE_SHUTDOWN: + default: + kona_timer_disable_and_clear(timers.tmr_regs); + } +} + +static struct clock_event_device kona_clockevent_timer = { + .name = "timer 1", + .features = CLOCK_EVT_FEAT_ONESHOT, + .set_next_event = kona_timer_set_next_event, + .set_mode = kona_timer_set_mode +}; + +static void __init kona_timer_clockevents_init(void) +{ + kona_clockevent_timer.cpumask = cpumask_of(0); + clockevents_config_and_register(&kona_clockevent_timer, + arch_timer_rate, 6, 0xffffffff); +} + +static irqreturn_t kona_timer_interrupt(int irq, void *dev_id) +{ + struct clock_event_device *evt = &kona_clockevent_timer; + + kona_timer_disable_and_clear(timers.tmr_regs); + evt->event_handler(evt); + return IRQ_HANDLED; +} + +static struct irqaction kona_timer_irq = { + .name = "Kona Timer Tick", + .flags = IRQF_TIMER, + .handler = kona_timer_interrupt, +}; + +static void __init kona_timer_init(void) +{ + kona_timers_init(); + kona_timer_clockevents_init(); + setup_irq(timers.tmr_irq, &kona_timer_irq); + kona_timer_set_next_event((arch_timer_rate / HZ), NULL); +} + +CLOCKSOURCE_OF_DECLARE(bcm_kona, "bcm,kona-timer", + kona_timer_init); From ad460967a2953496ad76b1c22091ea99f21b4e86 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Fri, 22 Mar 2013 11:59:04 -0700 Subject: [PATCH 29/52] ntp: Split out timex validation from do_adjtimex Split out the timex validation done in do_adjtimex into a separate function. This will help simplify logic in following patches. Cc: Thomas Gleixner Cc: Richard Cochran Cc: Prarit Bhargava Signed-off-by: John Stultz --- kernel/time/ntp.c | 41 ++++++++++++++++++++++++++++------------- 1 file changed, 28 insertions(+), 13 deletions(-) diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 59e2749be0fa..457d2ba245fe 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -622,17 +622,13 @@ static inline void process_adjtimex_modes(struct timex *txc, ntp_update_frequency(); } -/* - * adjtimex mainly allows reading (and writing, if superuser) of - * kernel time-keeping variables. used by xntpd. - */ -int do_adjtimex(struct timex *txc) -{ - struct timespec ts; - u32 time_tai, orig_tai; - int result; - /* Validate the data before disabling interrupts */ + +/** + * ntp_validate_timex - Ensures the timex is ok for use in do_adjtimex + */ +int ntp_validate_timex(struct timex *txc) +{ if (txc->modes & ADJ_ADJTIME) { /* singleshot must not be used with any other mode bits */ if (!(txc->modes & ADJ_OFFSET_SINGLESHOT)) @@ -644,7 +640,6 @@ int do_adjtimex(struct timex *txc) /* In order to modify anything, you gotta be super-user! */ if (txc->modes && !capable(CAP_SYS_TIME)) return -EPERM; - /* * if the quartz is off by more than 10% then * something is VERY wrong! 
@@ -655,12 +650,32 @@ return -EINVAL; } + if ((txc->modes & ADJ_SETOFFSET) && (!capable(CAP_SYS_TIME))) + return -EPERM; + + return 0; +} + + +/* + * adjtimex mainly allows reading (and writing, if superuser) of + * kernel time-keeping variables. used by xntpd. + */ +int do_adjtimex(struct timex *txc) +{ + struct timespec ts; + u32 time_tai, orig_tai; + int result; + + /* Validate the data before disabling interrupts */ + result = ntp_validate_timex(txc); + if (result) + return result; + if (txc->modes & ADJ_SETOFFSET) { struct timespec delta; delta.tv_sec = txc->time.tv_sec; delta.tv_nsec = txc->time.tv_usec; - if (!capable(CAP_SYS_TIME)) - return -EPERM; if (!(txc->modes & ADJ_NANO)) delta.tv_nsec *= 1000; result = timekeeping_inject_offset(&delta); From aa6f9c595d857328e5d815e5b94c0e7cd31a6b59 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Fri, 22 Mar 2013 11:31:29 -0700 Subject: [PATCH 30/52] ntp: Move do_adjtimex() and hardpps() functions to timekeeping.c In preparation for changing the ntp locking rules, move do_adjtimex and hardpps accessor functions to timekeeping.c, but keep the code logic in ntp.c. This patch also introduces an ntp_internal.h file so that the timekeeping-specific interfaces of ntp.c can be shared with timekeeping.c without being exposed more widely. Cc: Thomas Gleixner Cc: Richard Cochran Cc: Prarit Bhargava Signed-off-by: John Stultz --- include/linux/timex.h | 7 ------- kernel/time/ntp.c | 9 ++++----- kernel/time/ntp_internal.h | 11 +++++++++++ kernel/time/timekeeping.c | 21 +++++++++++++++++++++ 4 files changed, 36 insertions(+), 12 deletions(-) create mode 100644 kernel/time/ntp_internal.h diff --git a/include/linux/timex.h b/include/linux/timex.h index 5ec87c60b97c..b3726e61368e 100644 --- a/include/linux/timex.h +++ b/include/linux/timex.h @@ -125,9 +125,6 @@ extern unsigned long tick_usec; /* USER_HZ period (usec) */ extern unsigned long tick_nsec; /* SHIFTED_HZ period (nsec) */ -extern void ntp_init(void); -extern void ntp_clear(void); - /* Required to safely shift negative values */ #define shift_right(x, s) ({ \ __typeof__(x) __x = (x); \ @@ -140,10 +137,6 @@ extern void ntp_clear(void); #define NTP_INTERVAL_FREQ (HZ) #define NTP_INTERVAL_LENGTH (NSEC_PER_SEC/NTP_INTERVAL_FREQ) -/* Returns how long ticks are at present, in ns / 2^NTP_SCALE_SHIFT. */ -extern u64 ntp_tick_length(void); - -extern int second_overflow(unsigned long secs); extern int do_adjtimex(struct timex *); extern void hardpps(const struct timespec *, const struct timespec *); diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 457d2ba245fe..8b107068d7e3 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -18,6 +18,7 @@ #include <linux/rtc.h> #include "tick-internal.h" +#include "ntp_internal.h" /* * NTP timekeeping variables: */ @@ -661,7 +662,7 @@ int ntp_validate_timex(struct timex *txc) * adjtimex mainly allows reading (and writing, if superuser) of * kernel time-keeping variables. used by xntpd. */ -int do_adjtimex(struct timex *txc) +int __do_adjtimex(struct timex *txc) { struct timespec ts; u32 time_tai, orig_tai; @@ -911,7 +912,7 @@ static void hardpps_update_phase(long error) } /* - * hardpps() - discipline CPU clock oscillator to external PPS signal + * __hardpps() - discipline CPU clock oscillator to external PPS signal * * This routine is called at each PPS signal arrival in order to * discipline the CPU clock oscillator to the PPS signal.
It takes two @@ -922,7 +923,7 @@ static void hardpps_update_phase(long error) * This code is based on David Mills's reference nanokernel * implementation. It was mostly rewritten but keeps the same idea. */ -void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) +void __hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) { struct pps_normtime pts_norm, freq_norm; unsigned long flags; @@ -976,8 +977,6 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) raw_spin_unlock_irqrestore(&ntp_lock, flags); } -EXPORT_SYMBOL(hardpps); - #endif /* CONFIG_NTP_PPS */ static int __init ntp_tick_adj_setup(char *str) diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h new file mode 100644 index 000000000000..fdee80cb34f3 --- /dev/null +++ b/kernel/time/ntp_internal.h @@ -0,0 +1,11 @@ +#ifndef _LINUX_NTP_INTERNAL_H +#define _LINUX_NTP_INTERNAL_H + +extern void ntp_init(void); +extern void ntp_clear(void); +/* Returns how long ticks are at present, in ns / 2^NTP_SCALE_SHIFT. */ +extern u64 ntp_tick_length(void); +extern int second_overflow(unsigned long secs); +extern int __do_adjtimex(struct timex *); +extern void __hardpps(const struct timespec *, const struct timespec *); +#endif /* _LINUX_NTP_INTERNAL_H */ diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index c5feb7aa3acb..a138ec2cde3e 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -24,6 +24,7 @@ #include #include "tick-internal.h" +#include "ntp_internal.h" static struct timekeeper timekeeper; static DEFINE_RAW_SPINLOCK(timekeeper_lock); @@ -1612,6 +1613,26 @@ ktime_t ktime_get_monotonic_offset(void) } EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset); +/** + * do_adjtimex() - Accessor function to NTP __do_adjtimex function + */ +int do_adjtimex(struct timex *txc) +{ + return __do_adjtimex(txc); +} + + +#ifdef CONFIG_NTP_PPS +/** + * hardpps() - Accessor function to NTP __hardpps function + */ +void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) +{ + __hardpps(phase_ts, raw_ts); +} +EXPORT_SYMBOL(hardpps); +#endif + /** * xtime_update() - advances the timekeeping infrastructure * @ticks: number of ticks, that have elapsed since the last call. From e4085693f629ded8ac8c35b5cdd324d20242990b Mon Sep 17 00:00:00 2001 From: John Stultz Date: Fri, 22 Mar 2013 12:08:52 -0700 Subject: [PATCH 31/52] ntp: Move timex validation to timekeeping do_adjtimex call. Move logic that does not need the ntp state to be done in the timekeeping do_adjtimex() call. Cc: Thomas Gleixner Cc: Richard Cochran Cc: Prarit Bhargava Signed-off-by: John Stultz --- kernel/time/ntp.c | 5 ----- kernel/time/ntp_internal.h | 1 + kernel/time/timekeeping.c | 7 +++++++ 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 8b107068d7e3..2dc60c6fe76b 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -668,11 +668,6 @@ int __do_adjtimex(struct timex *txc) u32 time_tai, orig_tai; int result; - /* Validate the data before disabling interrupts */ - result = ntp_validate_timex(txc); - if (result) - return result; - if (txc->modes & ADJ_SETOFFSET) { struct timespec delta; delta.tv_sec = txc->time.tv_sec; diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h index fdee80cb34f3..a2a397659e46 100644 --- a/kernel/time/ntp_internal.h +++ b/kernel/time/ntp_internal.h @@ -6,6 +6,7 @@ extern void ntp_clear(void); /* Returns how long ticks are at present, in ns / 2^NTP_SCALE_SHIFT. 
*/ extern u64 ntp_tick_length(void); extern int second_overflow(unsigned long secs); +extern int ntp_validate_timex(struct timex *); extern int __do_adjtimex(struct timex *); extern void __hardpps(const struct timespec *, const struct timespec *); #endif /* _LINUX_NTP_INTERNAL_H */ diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index a138ec2cde3e..f6c8a7279157 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1618,6 +1618,13 @@ EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset); */ int do_adjtimex(struct timex *txc) { + int ret; + + /* Validate the data before disabling interrupts */ + ret = ntp_validate_timex(txc); + if (ret) + return ret; + return __do_adjtimex(txc); } From 87ace39b7168bd9d352c1c52b6f5d88eb1876cf8 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Fri, 22 Mar 2013 12:28:15 -0700 Subject: [PATCH 32/52] ntp: Rework do_adjtimex to take timespec and tai arguments In order to change the locking rules, we need to provide the timespec and tai values rather than having the ntp logic acquire these values itself. Cc: Thomas Gleixner Cc: Richard Cochran Cc: Prarit Bhargava Signed-off-by: John Stultz --- kernel/time/ntp.c | 18 +++++------------- kernel/time/ntp_internal.h | 2 +- kernel/time/timekeeping.c | 13 +++++++++++-- 3 files changed, 17 insertions(+), 16 deletions(-) diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 2dc60c6fe76b..d17e13c0147d 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -662,10 +662,8 @@ int ntp_validate_timex(struct timex *txc) * adjtimex mainly allows reading (and writing, if superuser) of * kernel time-keeping variables. used by xntpd. */ -int __do_adjtimex(struct timex *txc) +int __do_adjtimex(struct timex *txc, struct timespec *ts, s32 *time_tai) { - struct timespec ts; - u32 time_tai, orig_tai; int result; if (txc->modes & ADJ_SETOFFSET) { @@ -679,9 +677,6 @@ int __do_adjtimex(struct timex *txc) return result; } - getnstimeofday(&ts); - orig_tai = time_tai = timekeeping_get_tai_offset(); - raw_spin_lock_irq(&ntp_lock); if (txc->modes & ADJ_ADJTIME) { @@ -697,7 +692,7 @@ int __do_adjtimex(struct timex *txc) /* If there are input parameters, then process them: */ if (txc->modes) - process_adjtimex_modes(txc, &ts, &time_tai); + process_adjtimex_modes(txc, ts, time_tai); txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ, NTP_SCALE_SHIFT); @@ -719,18 +714,15 @@ int __do_adjtimex(struct timex *txc) txc->precision = 1; txc->tolerance = MAXFREQ_SCALED / PPM_SCALE; txc->tick = tick_usec; - txc->tai = time_tai; + txc->tai = *time_tai; /* fill PPS status fields */ pps_fill_timex(txc); raw_spin_unlock_irq(&ntp_lock); - if (time_tai != orig_tai) - timekeeping_set_tai_offset(time_tai); - - txc->time.tv_sec = ts.tv_sec; - txc->time.tv_usec = ts.tv_nsec; + txc->time.tv_sec = ts->tv_sec; + txc->time.tv_usec = ts->tv_nsec; if (!(time_status & STA_NANO)) txc->time.tv_usec /= NSEC_PER_USEC; diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h index a2a397659e46..1950cb4ca2a4 100644 --- a/kernel/time/ntp_internal.h +++ b/kernel/time/ntp_internal.h @@ -7,6 +7,6 @@ extern void ntp_clear(void); extern u64 ntp_tick_length(void); extern int second_overflow(unsigned long secs); extern int ntp_validate_timex(struct timex *); -extern int __do_adjtimex(struct timex *); +extern int __do_adjtimex(struct timex *, struct timespec *, s32 *); extern void __hardpps(const struct timespec *, const struct timespec *); #endif /* _LINUX_NTP_INTERNAL_H */ diff --git a/kernel/time/timekeeping.c
b/kernel/time/timekeeping.c index f6c8a7279157..5f7a2330dc3c 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1618,6 +1618,8 @@ EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset); */ int do_adjtimex(struct timex *txc) { + struct timespec ts; + s32 tai, orig_tai; int ret; /* Validate the data before disabling interrupts */ @@ -1625,9 +1627,16 @@ int do_adjtimex(struct timex *txc) if (ret) return ret; - return __do_adjtimex(txc); -} + getnstimeofday(&ts); + orig_tai = tai = timekeeping_get_tai_offset(); + ret = __do_adjtimex(txc, &ts, &tai); + + if (tai != orig_tai) + timekeeping_set_tai_offset(tai); + + return ret; +} #ifdef CONFIG_NTP_PPS /** From cef90377fab488bd1f959efda178fb83250cf61d Mon Sep 17 00:00:00 2001 From: John Stultz Date: Fri, 22 Mar 2013 15:04:13 -0700 Subject: [PATCH 33/52] timekeeping: Move ADJ_SETOFFSET to top level do_adjtimex() Since ADJ_SETOFFSET adjusts the timekeeping state, process it as part of the top level do_adjtimex() function in timekeeping.c. This avoids deadlocks that could occur once we change the ntp locking rules. Cc: Thomas Gleixner Cc: Richard Cochran Cc: Prarit Bhargava Signed-off-by: John Stultz --- kernel/time/ntp.c | 11 ----------- kernel/time/timekeeping.c | 11 +++++++++++ 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index d17e13c0147d..a331ebc32e21 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -666,17 +666,6 @@ int __do_adjtimex(struct timex *txc, struct timespec *ts, s32 *time_tai) { int result; - if (txc->modes & ADJ_SETOFFSET) { - struct timespec delta; - delta.tv_sec = txc->time.tv_sec; - delta.tv_nsec = txc->time.tv_usec; - if (!(txc->modes & ADJ_NANO)) - delta.tv_nsec *= 1000; - result = timekeeping_inject_offset(&delta); - if (result) - return result; - } - raw_spin_lock_irq(&ntp_lock); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 5f7a2330dc3c..e44915c7b16c 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1627,6 +1627,17 @@ int do_adjtimex(struct timex *txc) if (ret) return ret; + if (txc->modes & ADJ_SETOFFSET) { + struct timespec delta; + delta.tv_sec = txc->time.tv_sec; + delta.tv_nsec = txc->time.tv_usec; + if (!(txc->modes & ADJ_NANO)) + delta.tv_nsec *= 1000; + ret = timekeeping_inject_offset(&delta); + if (ret) + return ret; + } + getnstimeofday(&ts); orig_tai = tai = timekeeping_get_tai_offset(); From 06c017fdd4dc48451a29ac37fc1db4a3f86b7f40 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Fri, 22 Mar 2013 11:37:28 -0700 Subject: [PATCH 34/52] timekeeping: Hold timekeeping locks in do_adjtimex and hardpps In moving the NTP state to be protected by the timekeeping locks, be sure to acquire the timekeeping locks prior to calling ntp functions.
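To make the intended nesting concrete, a minimal sketch follows (illustrative only, not part of the diffs in this series; the lock, seqcount and __do_adjtimex() names are the ones used below, while the wrapper function itself is made up):

    /*
     * Sketch: writers serialize on timekeeper_lock, then enter the
     * seqcount write section so that lockless readers retry while the
     * NTP/timekeeping state is being changed.
     */
    static void locked_ntp_call_sketch(struct timex *txc,
                                       struct timespec *ts, s32 *tai)
    {
            unsigned long flags;

            raw_spin_lock_irqsave(&timekeeper_lock, flags);
            write_seqcount_begin(&timekeeper_seq);

            __do_adjtimex(txc, ts, tai);    /* ntp state mutated here */

            write_seqcount_end(&timekeeper_seq);
            raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
    }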
Cc: Thomas Gleixner Cc: Richard Cochran Cc: Prarit Bhargava Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index e44915c7b16c..d10bd734b151 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -787,10 +787,10 @@ void __init timekeeping_init(void) boot.tv_nsec = 0; } - ntp_init(); - raw_spin_lock_irqsave(&timekeeper_lock, flags); write_seqcount_begin(&timekeeper_seq); + ntp_init(); + clock = clocksource_default_clock(); if (clock->enable) clock->enable(clock); @@ -1618,6 +1618,7 @@ EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset); */ int do_adjtimex(struct timex *txc) { + unsigned long flags; struct timespec ts; s32 tai, orig_tai; int ret; @@ -1641,8 +1642,14 @@ int do_adjtimex(struct timex *txc) getnstimeofday(&ts); orig_tai = tai = timekeeping_get_tai_offset(); + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&timekeeper_seq); + ret = __do_adjtimex(txc, &ts, &tai); + write_seqcount_end(&timekeeper_seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + if (tai != orig_tai) timekeeping_set_tai_offset(tai); @@ -1655,7 +1662,15 @@ */ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) { + unsigned long flags; + + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&timekeeper_seq); + __hardpps(phase_ts, raw_ts); + + write_seqcount_end(&timekeeper_seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); } EXPORT_SYMBOL(hardpps); #endif From 0b5154fb9040cca94e7d9893384c0e78bfe2d296 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Fri, 22 Mar 2013 14:20:03 -0700 Subject: [PATCH 35/52] timekeeping: Simplify tai updating from do_adjtimex Since we are taking the timekeeping locks, just go ahead and update any tai change directly, rather than dropping the lock and calling a function that will just take it again. Cc: Thomas Gleixner Cc: Richard Cochran Cc: Prarit Bhargava Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index d10bd734b151..f93f60cd97ad 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1618,9 +1618,10 @@ EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset); */ int do_adjtimex(struct timex *txc) { + struct timekeeper *tk = &timekeeper; unsigned long flags; struct timespec ts; - s32 tai, orig_tai; + s32 tai; int ret; /* Validate the data before disabling interrupts */ @@ -1640,19 +1641,17 @@ int do_adjtimex(struct timex *txc) getnstimeofday(&ts); - orig_tai = tai = timekeeping_get_tai_offset(); raw_spin_lock_irqsave(&timekeeper_lock, flags); write_seqcount_begin(&timekeeper_seq); + tai = tk->tai_offset; ret = __do_adjtimex(txc, &ts, &tai); + __timekeeping_set_tai_offset(tk, tai); write_seqcount_end(&timekeeper_seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); - if (tai != orig_tai) - timekeeping_set_tai_offset(tai); - return ret; } From a076b2146fabb0894cae5e0189a8ba3f1502d737 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Fri, 22 Mar 2013 11:52:03 -0700 Subject: [PATCH 36/52] ntp: Remove ntp_lock, using the timekeeping locks to protect ntp state In order to properly handle the NTP state in future changes to the timekeeping lock management, this patch moves the management of all of the ntp state under the timekeeping locks.
This allows us to remove the ntp_lock. Cc: Thomas Gleixner Cc: Richard Cochran Cc: Prarit Bhargava Signed-off-by: John Stultz --- kernel/time/ntp.c | 41 ++++------------------------------------- 1 file changed, 4 insertions(+), 37 deletions(-) diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index a331ebc32e21..12ff13a838c6 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -22,10 +22,10 @@ /* * NTP timekeeping variables: + * + * Note: All of the NTP state is protected by the timekeeping locks. */ -DEFINE_RAW_SPINLOCK(ntp_lock); - /* USER_HZ period (usecs): */ unsigned long tick_usec = TICK_USEC; @@ -132,8 +132,6 @@ static inline void pps_reset_freq_interval(void) /** * pps_clear - Clears the PPS state variables - * - * Must be called while holding a write on the ntp_lock */ static inline void pps_clear(void) { @@ -148,8 +146,6 @@ static inline void pps_clear(void) /* Decrease pps_valid to indicate that another second has passed since * the last PPS signal. When it reaches 0, indicate that PPS signal is * missing. - * - * Must be called while holding a write on the ntp_lock */ static inline void pps_dec_valid(void) { @@ -344,10 +340,6 @@ static void ntp_update_offset(long offset) */ void ntp_clear(void) { - unsigned long flags; - - raw_spin_lock_irqsave(&ntp_lock, flags); - time_adjust = 0; /* stop active adjtime() */ time_status |= STA_UNSYNC; time_maxerror = NTP_PHASE_LIMIT; @@ -360,20 +352,12 @@ void ntp_clear(void) /* Clear PPS state variables */ pps_clear(); - raw_spin_unlock_irqrestore(&ntp_lock, flags); - } u64 ntp_tick_length(void) { - unsigned long flags; - s64 ret; - - raw_spin_lock_irqsave(&ntp_lock, flags); - ret = tick_length; - raw_spin_unlock_irqrestore(&ntp_lock, flags); - return ret; + return tick_length; } @@ -391,9 +375,6 @@ int second_overflow(unsigned long secs) { s64 delta; int leap = 0; - unsigned long flags; - - raw_spin_lock_irqsave(&ntp_lock, flags); /* * Leap second processing. 
If in leap-insert state at the end of the @@ -475,8 +456,6 @@ int second_overflow(unsigned long secs) time_adjust = 0; out: - raw_spin_unlock_irqrestore(&ntp_lock, flags); - return leap; } @@ -571,10 +550,7 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts) time_status |= txc->status & ~STA_RONLY; } -/* - * Called with ntp_lock held, so we can access and modify - * all the global NTP state: - */ + static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts, s32 *time_tai) @@ -666,8 +642,6 @@ int __do_adjtimex(struct timex *txc, struct timespec *ts, s32 *time_tai) { int result; - raw_spin_lock_irq(&ntp_lock); - if (txc->modes & ADJ_ADJTIME) { long save_adjust = time_adjust; @@ -708,8 +682,6 @@ int __do_adjtimex(struct timex *txc, struct timespec *ts, s32 *time_tai) /* fill PPS status fields */ pps_fill_timex(txc); - raw_spin_unlock_irq(&ntp_lock); - txc->time.tv_sec = ts->tv_sec; txc->time.tv_usec = ts->tv_nsec; if (!(time_status & STA_NANO)) @@ -906,8 +878,6 @@ void __hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) pts_norm = pps_normalize_ts(*phase_ts); - raw_spin_lock_irqsave(&ntp_lock, flags); - /* clear the error bits, they will be set again if needed */ time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR); @@ -919,7 +889,6 @@ void __hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) * just start the frequency interval */ if (unlikely(pps_fbase.tv_sec == 0)) { pps_fbase = *raw_ts; - raw_spin_unlock_irqrestore(&ntp_lock, flags); return; } @@ -934,7 +903,6 @@ void __hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) time_status |= STA_PPSJITTER; /* restart the frequency calibration interval */ pps_fbase = *raw_ts; - raw_spin_unlock_irqrestore(&ntp_lock, flags); pr_err("hardpps: PPSJITTER: bad pulse\n"); return; } @@ -951,7 +919,6 @@ void __hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) hardpps_update_phase(pts_norm.nsec); - raw_spin_unlock_irqrestore(&ntp_lock, flags); } #endif /* CONFIG_NTP_PPS */ From 14a3b6abe98c8f53a13522610c257accef7321df Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 Feb 2013 22:51:38 +0000 Subject: [PATCH 37/52] timekeeping: Store cycle_last value in timekeeper struct as well For implementing a shadow timekeeper and a split calculation/update region we need to store the cycle_last value in the timekeeper and update the value in the clocksource struct only in the update region. Add the extra storage to the timekeeper. Signed-off-by: Thomas Gleixner Signed-off-by: John Stultz --- include/linux/timekeeper_internal.h | 2 ++ kernel/time/timekeeping.c | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h index a151bd70e52b..c1825eb436ed 100644 --- a/include/linux/timekeeper_internal.h +++ b/include/linux/timekeeper_internal.h @@ -20,6 +20,8 @@ struct timekeeper { u32 shift; /* Number of clock cycles in one NTP interval. */ cycle_t cycle_interval; + /* Last cycle value (also stored in clock->cycle_last) */ + cycle_t cycle_last; /* Number of clock shifted nano seconds in one NTP interval. 
*/ u64 xtime_interval; /* shifted nano seconds left over when rounding cycle_interval */ diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index f93f60cd97ad..4c276b2d022d 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -101,7 +101,7 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) old_clock = tk->clock; tk->clock = clock; - clock->cycle_last = clock->read(clock); + tk->cycle_last = clock->cycle_last = clock->read(clock); /* Do the ns -> cycle conversion first, using original mult */ tmp = NTP_INTERVAL_LENGTH; @@ -266,7 +266,7 @@ static void timekeeping_forward_now(struct timekeeper *tk) clock = tk->clock; cycle_now = clock->read(clock); cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; - clock->cycle_last = cycle_now; + tk->cycle_last = clock->cycle_last = cycle_now; tk->xtime_nsec += cycle_delta * tk->mult; From 7ec98e15aa049b7a2ca73485f31cf4f90c34e2dd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 Feb 2013 22:51:39 +0000 Subject: [PATCH 38/52] timekeeping: Delay update of clock->cycle_last For calculating the new timekeeper values store the new cycle_last value in the timekeeper and update the clock->cycle_last just when we actually update the new values. Signed-off-by: Thomas Gleixner Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 4c276b2d022d..38ac782c0ef8 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1271,7 +1271,7 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, /* Accumulate one shifted interval */ offset -= interval; - tk->clock->cycle_last += interval; + tk->cycle_last += interval; tk->xtime_nsec += tk->xtime_interval << shift; accumulate_nsecs_to_secs(tk); @@ -1386,6 +1386,8 @@ static void update_wall_time(void) */ accumulate_nsecs_to_secs(tk); + /* Update clock->cycle_last with the new value */ + clock->cycle_last = tk->cycle_last; timekeeping_update(tk, false); out: From 48cdc135d4840aab8efd2fc3bacb5d7dfd94a9c8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 Feb 2013 22:51:40 +0000 Subject: [PATCH 39/52] timekeeping: Implement a shadow timekeeper Use the shadow timekeeper to do the update_wall_time() adjustments and then copy it over to the real timekeeper. Keep the shadow timekeeper in sync when updating stuff outside of update_wall_time(). This allows us to limit the timekeeper_seq hold time to the update of the real timekeeper and the vsyscall data in the next patch. 
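The scheme can be summarized with a short sketch (illustrative only; the helper name is invented, the variables match the code below, and the narrow seqcount section shown here is only fully realized by the next patch):

    /* Sketch: compute on the shadow copy, publish with one memcpy. */
    static void shadow_update_sketch(void)
    {
            struct timekeeper *real_tk = &timekeeper;
            struct timekeeper *tk = &shadow_timekeeper;

            /* Heavy accumulation work happens here on the shadow copy,
             * while readers keep seeing a consistent real_tk. */

            write_seqcount_begin(&timekeeper_seq);
            memcpy(real_tk, tk, sizeof(*tk));       /* publish atomically */
            write_seqcount_end(&timekeeper_seq);
    }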
Signed-off-by: Thomas Gleixner Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 41 +++++++++++++++++++++++++++------------ 1 file changed, 29 insertions(+), 12 deletions(-) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 38ac782c0ef8..d20ffdad62e8 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -29,6 +29,7 @@ static struct timekeeper timekeeper; static DEFINE_RAW_SPINLOCK(timekeeper_lock); static seqcount_t timekeeper_seq; +static struct timekeeper shadow_timekeeper; /* flag for if timekeeping is suspended */ int __read_mostly timekeeping_suspended; @@ -240,7 +241,7 @@ int pvclock_gtod_unregister_notifier(struct notifier_block *nb) EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier); /* must hold timekeeper_lock */ -static void timekeeping_update(struct timekeeper *tk, bool clearntp) +static void timekeeping_update(struct timekeeper *tk, bool clearntp, bool mirror) { if (clearntp) { tk->ntp_error = 0; @@ -248,6 +249,9 @@ static void timekeeping_update(struct timekeeper *tk, bool clearntp) } update_vsyscall(tk); update_pvclock_gtod(tk); + + if (mirror) + memcpy(&shadow_timekeeper, &timekeeper, sizeof(timekeeper)); } /** @@ -504,7 +508,7 @@ int do_settimeofday(const struct timespec *tv) tk_set_xtime(tk, tv); - timekeeping_update(tk, true); + timekeeping_update(tk, true, true); write_seqcount_end(&timekeeper_seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); @@ -548,7 +552,7 @@ int timekeeping_inject_offset(struct timespec *ts) tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *ts)); error: /* even if we error out, we forwarded the time, so call update */ - timekeeping_update(tk, true); + timekeeping_update(tk, true, true); write_seqcount_end(&timekeeper_seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); @@ -628,7 +632,7 @@ static int change_clocksource(void *data) if (old->disable) old->disable(old); } - timekeeping_update(tk, true); + timekeeping_update(tk, true, true); write_seqcount_end(&timekeeper_seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); @@ -809,6 +813,8 @@ void __init timekeeping_init(void) tmp.tv_nsec = 0; tk_set_sleep_time(tk, tmp); + memcpy(&shadow_timekeeper, &timekeeper, sizeof(timekeeper)); + write_seqcount_end(&timekeeper_seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); } @@ -865,7 +871,7 @@ void timekeeping_inject_sleeptime(struct timespec *delta) __timekeeping_inject_sleeptime(tk, delta); - timekeeping_update(tk, true); + timekeeping_update(tk, true, true); write_seqcount_end(&timekeeper_seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); @@ -947,7 +953,7 @@ static void timekeeping_resume(void) clock->cycle_last = cycle_now; tk->ntp_error = 0; timekeeping_suspended = 0; - timekeeping_update(tk, false); + timekeeping_update(tk, false, true); write_seqcount_end(&timekeeper_seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); @@ -1328,7 +1334,8 @@ static inline void old_vsyscall_fixup(struct timekeeper *tk) static void update_wall_time(void) { struct clocksource *clock; - struct timekeeper *tk = &timekeeper; + struct timekeeper *real_tk = &timekeeper; + struct timekeeper *tk = &shadow_timekeeper; cycle_t offset; int shift = 0, maxshift; unsigned long flags; @@ -1340,16 +1347,16 @@ static void update_wall_time(void) if (unlikely(timekeeping_suspended)) goto out; - clock = tk->clock; + clock = real_tk->clock; #ifdef CONFIG_ARCH_USES_GETTIMEOFFSET - offset = tk->cycle_interval; + offset = real_tk->cycle_interval; #else offset = (clock->read(clock) - 
clock->cycle_last) & clock->mask; #endif /* Check if there's really nothing to do */ - if (offset < tk->cycle_interval) + if (offset < real_tk->cycle_interval) goto out; /* @@ -1388,12 +1395,22 @@ static void update_wall_time(void) /* Update clock->cycle_last with the new value */ clock->cycle_last = tk->cycle_last; - timekeeping_update(tk, false); + /* + * Update the real timekeeper. + * + * We could avoid this memcpy by switching pointers, but that + * requires changes to all other timekeeper usage sites as + * well, i.e. move the timekeeper pointer getter into the + * spinlocked/seqcount protected sections. And we trade this + * memcpy under the timekeeper_seq against one before we start + * updating. + */ + memcpy(real_tk, tk, sizeof(*tk)); + timekeeping_update(real_tk, false, false); out: write_seqcount_end(&timekeeper_seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); - } /** From ca4523cda429712fc135c5db50920d90eb776a6c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 Feb 2013 22:51:40 +0000 Subject: [PATCH 40/52] timekeeping: Shorten seq_count region Shorten the seqcount write hold region to the actual update of the timekeeper and the related data (e.g vsyscall). On a contemporary x86 system this reduces the maximum latencies on Preempt-RT from 8us to 4us on the non-timekeeping cores. Signed-off-by: Thomas Gleixner Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index d20ffdad62e8..c4d2a8751f3e 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1341,7 +1341,6 @@ static void update_wall_time(void) unsigned long flags; raw_spin_lock_irqsave(&timekeeper_lock, flags); - write_seqcount_begin(&timekeeper_seq); /* Make sure we're fully resumed: */ if (unlikely(timekeeping_suspended)) @@ -1393,6 +1392,7 @@ static void update_wall_time(void) */ accumulate_nsecs_to_secs(tk); + write_seqcount_begin(&timekeeper_seq); /* Update clock->cycle_last with the new value */ clock->cycle_last = tk->cycle_last; /* @@ -1407,9 +1407,8 @@ static void update_wall_time(void) */ memcpy(real_tk, tk, sizeof(*tk)); timekeeping_update(real_tk, false, false); - -out: write_seqcount_end(&timekeeper_seq); +out: raw_spin_unlock_irqrestore(&timekeeper_lock, flags); } From 8f294b5a139ee4b75e890ad5b443c93d1e558a8b Mon Sep 17 00:00:00 2001 From: Prarit Bhargava Date: Mon, 8 Apr 2013 08:47:15 -0400 Subject: [PATCH 41/52] hrtimer: Add expiry time overflow check in hrtimer_interrupt The settimeofday01 test in the LTP testsuite effectively does gettimeofday(current time); settimeofday(Jan 1, 1970 + 100 seconds); settimeofday(current time); This test causes a stack trace to be displayed on the console during the setting of timeofday to Jan 1, 1970 + 100 seconds: [ 131.066751] ------------[ cut here ]------------ [ 131.096448] WARNING: at kernel/time/clockevents.c:209 clockevents_program_event+0x135/0x140() [ 131.104935] Hardware name: Dinar [ 131.108150] Modules linked in: sg nfsv3 nfs_acl nfsv4 auth_rpcgss nfs dns_resolver fscache lockd sunrpc nf_conntrack_netbios_ns nf_conntrack_broadcast ipt_MASQUERADE ip6table_mangle ip6t_REJECT nf_conntrack_ipv6 nf_defrag_ipv6 iptable_nat nf_nat_ipv4 nf_nat iptable_mangle ipt_REJECT nf_conntrack_ipv4 nf_defrag_ipv4 xt_conntrack nf_conntrack ebtable_filter ebtables ip6table_filter ip6_tables iptable_filter ip_tables kvm_amd kvm sp5100_tco bnx2 i2c_piix4 crc32c_intel k10temp fam15h_power ghash_clmulni_intel 
amd64_edac_mod pcspkr serio_raw edac_mce_amd edac_core microcode xfs libcrc32c sr_mod sd_mod cdrom ata_generic crc_t10dif pata_acpi radeon i2c_algo_bit drm_kms_helper ttm drm ahci pata_atiixp libahci libata usb_storage i2c_core dm_mirror dm_region_hash dm_log dm_mod [ 131.176784] Pid: 0, comm: swapper/28 Not tainted 3.8.0+ #6 [ 131.182248] Call Trace: [ 131.184684] [] warn_slowpath_common+0x7f/0xc0 [ 131.191312] [] warn_slowpath_null+0x1a/0x20 [ 131.197131] [] clockevents_program_event+0x135/0x140 [ 131.203721] [] tick_program_event+0x24/0x30 [ 131.209534] [] hrtimer_interrupt+0x131/0x230 [ 131.215437] [] ? cpufreq_p4_target+0x130/0x130 [ 131.221509] [] smp_apic_timer_interrupt+0x69/0x99 [ 131.227839] [] apic_timer_interrupt+0x6d/0x80 [ 131.233816] [] ? sched_clock_cpu+0xc5/0x120 [ 131.240267] [] ? cpuidle_wrap_enter+0x50/0xa0 [ 131.246252] [] ? cpuidle_wrap_enter+0x49/0xa0 [ 131.252238] [] cpuidle_enter_tk+0x10/0x20 [ 131.257877] [] cpuidle_idle_call+0xa9/0x260 [ 131.263692] [] cpu_idle+0xaf/0x120 [ 131.268727] [] start_secondary+0x255/0x257 [ 131.274449] ---[ end trace 1151a50552231615 ]--- When we change the system time to a low value like this, the value of timekeeper->offs_real will be a negative value. It seems that the WARN occurs because an hrtimer has been started in the time between the releasing of the timekeeper lock and the IPI call (via a call to on_each_cpu) in clock_was_set() in the do_settimeofday() code. The end result is that a REALTIME_CLOCK timer has been added with softexpires = expires = KTIME_MAX. The hrtimer_interrupt() fires/is called and the loop at kernel/hrtimer.c:1289 is executed. In this loop the code subtracts the clock base's offset (which was set to timekeeper->offs_real in do_settimeofday()) from the current hrtimer_cpu_base->expiry value (which was KTIME_MAX): KTIME_MAX - (a negative value) = overflow A simple check for an overflow can resolve this problem. Using KTIME_MAX instead of the overflow value will result in the hrtimer function being run, and the reprogramming of the timer after that. Cc: Thomas Gleixner Cc: stable@vger.kernel.org Reviewed-by: Rik van Riel Signed-off-by: Prarit Bhargava [jstultz: Tweaked commit subject] Signed-off-by: John Stultz --- kernel/hrtimer.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index d6830d5ae730..071e093c4486 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1323,6 +1323,8 @@ retry: expires = ktime_sub(hrtimer_get_expires(timer), base->offset); + if (expires.tv64 < 0) + expires.tv64 = KTIME_MAX; if (expires.tv64 < expires_next.tv64) expires_next = expires; break; From 51fd36f3fad8447c487137ae26b9d0b3ce77bb25 Mon Sep 17 00:00:00 2001 From: David Engraf Date: Tue, 19 Mar 2013 13:29:55 +0100 Subject: [PATCH 42/52] hrtimer: Fix ktime_add_ns() overflow on 32bit architectures One can trigger an overflow when using ktime_add_ns() on a 32bit architecture not supporting CONFIG_KTIME_SCALAR. When passing a very high value for u64 nsec, e.g. 7881299347898368000 the do_div() function converts this value to seconds (7881299347) which is still too high to pass to the ktime_set() function as long. The result is a negative value. The problem on my system occurs in tick-sched.c, tick_nohz_stop_sched_tick(), when time_delta is set to timekeeping_max_deferment(). The check for time_delta < KTIME_MAX is valid, thus ktime_add_ns() is called with a too large value resulting in a negative expire value.
This leads to an endless loop in the ticker code: time_delta: 7881299347898368000 expires = ktime_add_ns(last_update, time_delta) expires: negative value This fix caps the value to KTIME_MAX. This error doesn't occur on 64bit or architectures supporting CONFIG_KTIME_SCALAR (e.g. ARM, x86-32). Cc: stable@vger.kernel.org Signed-off-by: David Engraf [jstultz: Minor tweaks to commit message & header] Signed-off-by: John Stultz --- kernel/hrtimer.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 071e093c4486..c0875ae0de17 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -286,6 +286,10 @@ ktime_t ktime_add_ns(const ktime_t kt, u64 nsec) } else { unsigned long rem = do_div(nsec, NSEC_PER_SEC); + /* Make sure nsec fits into long */ + if (unlikely(nsec > KTIME_SEC_MAX)) + return (ktime_t){ .tv64 = KTIME_MAX }; + tmp = ktime_set((long)nsec, rem); } From 4e8f8b34b92b6514cc070aeb94d317cadd5071d7 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 10 Apr 2013 12:41:49 -0700 Subject: [PATCH 43/52] timekeeping: Make sure to notify hrtimers when TAI offset changes Now that we have CLOCK_TAI timers, make sure we notify hrtimer code when TAI offset is changed. Signed-off-by: John Stultz Link: http://lkml.kernel.org/r/1365622909-953-1-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index c4d2a8751f3e..675f720a848b 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -607,6 +607,7 @@ void timekeeping_set_tai_offset(s32 tai_offset) __timekeeping_set_tai_offset(tk, tai_offset); write_seqcount_end(&timekeeper_seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + clock_was_set(); } /** @@ -1639,7 +1640,7 @@ int do_adjtimex(struct timex *txc) struct timekeeper *tk = &timekeeper; unsigned long flags; struct timespec ts; - s32 tai; + s32 orig_tai, tai; int ret; /* Validate the data before disabling interrupts */ @@ -1663,10 +1664,13 @@ int do_adjtimex(struct timex *txc) raw_spin_lock_irqsave(&timekeeper_lock, flags); write_seqcount_begin(&timekeeper_seq); - tai = tk->tai_offset; + orig_tai = tai = tk->tai_offset; ret = __do_adjtimex(txc, &ts, &tai); - __timekeeping_set_tai_offset(tk, tai); + if (tai != orig_tai) { + __timekeeping_set_tai_offset(tk, tai); + clock_was_set_delayed(); + } write_seqcount_end(&timekeeper_seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); From 5ed67f05f66c41e39880a6d61358438a25f9fee5 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Mon, 11 Mar 2013 13:12:21 +0400 Subject: [PATCH 44/52] posix timers: Allocate timer id per process (v2) Currently the kernel generates IDs for posix timers in a global manner -- there's a kernel-wide IDR tree from which IDs are created. This makes it impossible to recreate a timer with a desired ID (in particular this is done by the CRIU checkpoint-restore project) -- since these IDs are global it may happen that, at the time we recreate a timer, the ID we want for it is already taken by some other timer. In order to address this, replace the IDR tree with a global hash table for timers and make timer IDs unique per signal_struct (to which timers are linked anyway). With this, two timers belonging to different processes may have equal IDs and we can recreate either of them with the ID we want.
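The user-visible effect can be sketched with a small, hypothetical userspace program (illustrative only; it uses just standard POSIX calls): run it in two independent processes and both can now be handed the same timer_t value, because IDs are allocated per signal_struct instead of from one global IDR:

    #include <signal.h>
    #include <stdio.h>
    #include <time.h>

    int main(void)
    {
            struct sigevent sev = {
                    .sigev_notify = SIGEV_SIGNAL,
                    .sigev_signo  = SIGALRM,
            };
            timer_t id;

            /* With per-process allocation, the first timer of every
             * process typically gets the same small ID. */
            if (timer_create(CLOCK_MONOTONIC, &sev, &id) == 0)
                    printf("timer id: %ld\n", (long)id);
            return 0;
    }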
Signed-off-by: Pavel Emelyanov Cc: Peter Zijlstra Cc: Michael Kerrisk Cc: Matthew Helsley Link: http://lkml.kernel.org/r/513D9FF5.9010004@parallels.com Signed-off-by: Thomas Gleixner --- include/linux/posix-timers.h | 1 + include/linux/sched.h | 3 +- kernel/posix-timers.c | 106 +++++++++++++++++++++++------------ 3 files changed, 72 insertions(+), 38 deletions(-) diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index 042058fdb0af..60bac697a91b 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -55,6 +55,7 @@ struct cpu_timer_list { /* POSIX.1b interval timer structure. */ struct k_itimer { struct list_head list; /* free/ allocate list */ + struct hlist_node t_hash; spinlock_t it_lock; clockid_t it_clock; /* which timer type */ timer_t it_id; /* timer id */ diff --git a/include/linux/sched.h b/include/linux/sched.h index d35d2b6ddbfb..d13341b55096 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -526,7 +526,8 @@ struct signal_struct { unsigned int has_child_subreaper:1; /* POSIX.1b Interval Timers */ - struct list_head posix_timers; + int posix_timer_id; + struct list_head posix_timers; /* ITIMER_REAL timer for the process */ struct hrtimer real_timer; diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 2a2e173d0a7a..34d75926b843 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -40,38 +40,31 @@ #include #include #include -#include +#include #include #include #include #include #include #include +#include /* - * Management arrays for POSIX timers. Timers are kept in slab memory - * Timer ids are allocated by an external routine that keeps track of the - * id and the timer. The external interface is: - * - * void *idr_find(struct idr *idp, int id); to find timer_id - * int idr_get_new(struct idr *idp, void *ptr); to get a new id and - * related it to - * void idr_remove(struct idr *idp, int id); to release - * void idr_init(struct idr *idp); to initialize - * which we supply. - * The idr_get_new *may* call slab for more memory so it must not be - * called under a spin lock. Likewise idr_remore may release memory - * (but it may be ok to do this under a lock...). - * idr_find is just a memory look up and is quite fast. A -1 return - * indicates that the requested id does not exist. + * Management arrays for POSIX timers. Timers are now kept in static hash table + * with 512 entries. + * Timer ids are allocated by local routine, which selects proper hash head by + * key, constructed from current->signal address and per signal struct counter. + * This keeps timer ids unique per process, but now they can intersect between + * processes. 
*/ /* * Lets keep our timers in a slab cache :-) */ static struct kmem_cache *posix_timers_cache; -static struct idr posix_timers_id; -static DEFINE_SPINLOCK(idr_lock); + +static DEFINE_HASHTABLE(posix_timers_hashtable, 9); +static DEFINE_SPINLOCK(hash_lock); /* * we assume that the new SIGEV_THREAD_ID shares no bits with the other @@ -152,6 +145,57 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags); __timr; \ }) +static int hash(struct signal_struct *sig, unsigned int nr) +{ + return hash_32(hash32_ptr(sig) ^ nr, HASH_BITS(posix_timers_hashtable)); +} + +static struct k_itimer *__posix_timers_find(struct hlist_head *head, + struct signal_struct *sig, + timer_t id) +{ + struct hlist_node *node; + struct k_itimer *timer; + + hlist_for_each_entry_rcu(timer, head, t_hash) { + if ((timer->it_signal == sig) && (timer->it_id == id)) + return timer; + } + return NULL; +} + +static struct k_itimer *posix_timer_by_id(timer_t id) +{ + struct signal_struct *sig = current->signal; + struct hlist_head *head = &posix_timers_hashtable[hash(sig, id)]; + + return __posix_timers_find(head, sig, id); +} + +static int posix_timer_add(struct k_itimer *timer) +{ + struct signal_struct *sig = current->signal; + int first_free_id = sig->posix_timer_id; + struct hlist_head *head; + int ret = -ENOENT; + + do { + spin_lock(&hash_lock); + head = &posix_timers_hashtable[hash(sig, sig->posix_timer_id)]; + if (!__posix_timers_find(head, sig, sig->posix_timer_id)) { + hlist_add_head_rcu(&timer->t_hash, head); + ret = sig->posix_timer_id; + } + if (++sig->posix_timer_id < 0) + sig->posix_timer_id = 0; + if ((sig->posix_timer_id == first_free_id) && (ret == -ENOENT)) + /* Loop over all possible ids completed */ + ret = -EAGAIN; + spin_unlock(&hash_lock); + } while (ret == -ENOENT); + return ret; +} + static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) { spin_unlock_irqrestore(&timr->it_lock, flags); @@ -298,7 +342,6 @@ static __init int init_posix_timers(void) posix_timers_cache = kmem_cache_create("posix_timers_cache", sizeof (struct k_itimer), 0, SLAB_PANIC, NULL); - idr_init(&posix_timers_id); return 0; } @@ -520,9 +563,9 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set) { if (it_id_set) { unsigned long flags; - spin_lock_irqsave(&idr_lock, flags); - idr_remove(&posix_timers_id, tmr->it_id); - spin_unlock_irqrestore(&idr_lock, flags); + spin_lock_irqsave(&hash_lock, flags); + hlist_del_rcu(&tmr->t_hash); + spin_unlock_irqrestore(&hash_lock, flags); } put_pid(tmr->it_pid); sigqueue_free(tmr->sigq); @@ -568,22 +611,11 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, return -EAGAIN; spin_lock_init(&new_timer->it_lock); - - idr_preload(GFP_KERNEL); - spin_lock_irq(&idr_lock); - error = idr_alloc(&posix_timers_id, new_timer, 0, 0, GFP_NOWAIT); - spin_unlock_irq(&idr_lock); - idr_preload_end(); - if (error < 0) { - /* - * Weird looking, but we return EAGAIN if the IDR is - * full (proper POSIX return value for this) - */ - if (error == -ENOSPC) - error = -EAGAIN; + new_timer_id = posix_timer_add(new_timer); + if (new_timer_id < 0) { + error = new_timer_id; goto out; } - new_timer_id = error; it_id_set = IT_ID_SET; new_timer->it_id = (timer_t) new_timer_id; @@ -661,7 +693,7 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) return NULL; rcu_read_lock(); - timr = idr_find(&posix_timers_id, (int)timer_id); + timr = posix_timer_by_id(timer_id); if (timr) { spin_lock_irqsave(&timr->it_lock, *flags); if 
(timr->it_signal == current->signal) { From 48f6a7a511ef8823fdff39afee0320092d43a8a0 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Mon, 11 Mar 2013 13:12:45 +0400 Subject: [PATCH 45/52] posix-timers: Introduce /proc/PID/timers file Currently the kernel doesn't provide any API for getting info about what posix timers are configured by processes. It's implied that a process which configured some timers knows what it did. However, for external tools it's impossible to get this information. In particular, this is critical for the checkpoint-restore project to have this info. Introduce a per-pid proc file with information about posix timers. Since these timers are shared between threads, this file is present on the tgid level only; there is no such thing in the tid subdirs. The file format is expected to be "/proc/PID/smaps"-like, i.e. each timer will occupy several lines to allow for future extension. Each new timer entry starts with the ID: line, which is added by this patch. Signed-off-by: Pavel Emelyanov Cc: Peter Zijlstra Cc: Michael Kerrisk Cc: Matthew Helsley Link: http://lkml.kernel.org/r/513DA00D.6070009@parallels.com Signed-off-by: Thomas Gleixner --- fs/proc/base.c | 83 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) diff --git a/fs/proc/base.c b/fs/proc/base.c index 69078c7cef1f..01def9f8aa74 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -86,6 +86,7 @@ #include #include #include +#include #ifdef CONFIG_HARDWALL #include #endif @@ -2013,6 +2014,85 @@ static const struct file_operations proc_map_files_operations = { .llseek = default_llseek, }; +struct timers_private { + struct pid *pid; + struct task_struct *task; + struct sighand_struct *sighand; + unsigned long flags; +}; + +static void *timers_start(struct seq_file *m, loff_t *pos) +{ + struct timers_private *tp = m->private; + + tp->task = get_pid_task(tp->pid, PIDTYPE_PID); + if (!tp->task) + return ERR_PTR(-ESRCH); + + tp->sighand = lock_task_sighand(tp->task, &tp->flags); + if (!tp->sighand) + return ERR_PTR(-ESRCH); + + return seq_list_start(&tp->task->signal->posix_timers, *pos); +} + +static void *timers_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct timers_private *tp = m->private; + return seq_list_next(v, &tp->task->signal->posix_timers, pos); +} + +static void timers_stop(struct seq_file *m, void *v) +{ + struct timers_private *tp = m->private; + + if (tp->sighand) { + unlock_task_sighand(tp->task, &tp->flags); + tp->sighand = NULL; + } + + if (tp->task) { + put_task_struct(tp->task); + tp->task = NULL; + } +} + +static int show_timer(struct seq_file *m, void *v) +{ + struct k_itimer *timer; + + timer = list_entry((struct list_head *)v, struct k_itimer, list); + seq_printf(m, "ID: %d\n", timer->it_id); + + return 0; +} + +static const struct seq_operations proc_timers_seq_ops = { + .start = timers_start, + .next = timers_next, + .stop = timers_stop, + .show = show_timer, +}; + +static int proc_timers_open(struct inode *inode, struct file *file) +{ + struct timers_private *tp; + + tp = __seq_open_private(file, &proc_timers_seq_ops, + sizeof(struct timers_private)); + if (!tp) + return -ENOMEM; + + tp->pid = proc_pid(inode); + return 0; +} + +static const struct file_operations proc_timers_operations = { + .open = proc_timers_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; #endif /* CONFIG_CHECKPOINT_RESTORE */ static struct dentry *proc_pident_instantiate(struct inode *dir, @@ -2583,6 +2663,9 @@ static const struct pid_entry tgid_base_stuff[] = {
REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations), REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations), #endif +#ifdef CONFIG_CHECKPOINT_RESTORE + REG("timers", S_IRUGO, proc_timers_operations), +#endif }; static int proc_tgid_base_readdir(struct file * filp, From 57b8015e07a70301e9ec9f324db1a8b73b5a1e2b Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Mon, 11 Mar 2013 13:13:08 +0400 Subject: [PATCH 46/52] posix-timers: Show sigevent info in proc file Previous patch added proc file to list posix timers created by task. Expand the information provided in this file by adding info about notification method, with which timers were created. I.e. after the "ID:" line there go 1. "signal:" line, that shows signal number and sigval bits; 2. "notify:" line, that shows the timer notification method. Thus the timer entry would looke like this: ID: 123 signal: 14/0000000000b005d0 notify: signal/pid.732 This information is enough to understand how timer_create() was called for each particular timer. Signed-off-by: Pavel Emelyanov Cc: Peter Zijlstra Cc: Michael Kerrisk Cc: Matthew Helsley Link: http://lkml.kernel.org/r/513DA024.80404@parallels.com Signed-off-by: Thomas Gleixner --- fs/proc/base.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/fs/proc/base.c b/fs/proc/base.c index 01def9f8aa74..a19308604145 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2018,6 +2018,7 @@ struct timers_private { struct pid *pid; struct task_struct *task; struct sighand_struct *sighand; + struct pid_namespace *ns; unsigned long flags; }; @@ -2060,9 +2061,24 @@ static void timers_stop(struct seq_file *m, void *v) static int show_timer(struct seq_file *m, void *v) { struct k_itimer *timer; + struct timers_private *tp = m->private; + int notify; + static char *nstr[] = { + [SIGEV_SIGNAL] = "signal", + [SIGEV_NONE] = "none", + [SIGEV_THREAD] = "thread", + }; timer = list_entry((struct list_head *)v, struct k_itimer, list); + notify = timer->it_sigev_notify; + seq_printf(m, "ID: %d\n", timer->it_id); + seq_printf(m, "signal: %d/%p\n", timer->sigq->info.si_signo, + timer->sigq->info.si_value.sival_ptr); + seq_printf(m, "notify: %s/%s.%d\n", + nstr[notify & ~SIGEV_THREAD_ID], + (notify & SIGEV_THREAD_ID) ? "tid" : "pid", + pid_nr_ns(timer->it_pid, tp->ns)); return 0; } @@ -2084,6 +2100,7 @@ static int proc_timers_open(struct inode *inode, struct file *file) return -ENOMEM; tp->pid = proc_pid(inode); + tp->ns = inode->i_sb->s_fs_info; return 0; } From 60cf7ea849e77c8782dee147cfb8c38d1984236e Mon Sep 17 00:00:00 2001 From: Nathan Zimmer Date: Tue, 26 Mar 2013 19:56:29 -0500 Subject: [PATCH 47/52] timer_list: Split timer_list_show_tickdevices Split timer_list_show_tickdevices() into the header printout and pull the rest up to timer_list_show. 
This is a preparatory patch for converting timer_list to a proper seq_file with its own iterator. Signed-off-by: Nathan Zimmer Reported-by: Dave Jones Cc: John Stultz Cc: Stephen Boyd Link: http://lkml.kernel.org/r/1364345790-14577-2-git-send-email-nzimmer@sgi.com Signed-off-by: Thomas Gleixner --- kernel/time/timer_list.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index af5a7e9f164b..380a58977490 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -133,7 +133,6 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now) struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); int i; - SEQ_printf(m, "\n"); SEQ_printf(m, "cpu: %d\n", cpu); for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { SEQ_printf(m, " clock %d:\n", i); @@ -187,6 +186,7 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now) #undef P #undef P_ns + SEQ_printf(m, "\n"); } #ifdef CONFIG_GENERIC_CLOCKEVENTS @@ -195,7 +195,6 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu) { struct clock_event_device *dev = td->evtdev; - SEQ_printf(m, "\n"); SEQ_printf(m, "Tick Device: mode: %d\n", td->mode); if (cpu < 0) SEQ_printf(m, "Broadcast device\n"); @@ -230,12 +229,11 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu) print_name_offset(m, dev->event_handler); SEQ_printf(m, "\n"); SEQ_printf(m, " retries: %lu\n", dev->retries); + SEQ_printf(m, "\n"); } -static void timer_list_show_tickdevices(struct seq_file *m) +static void timer_list_show_tickdevices_header(struct seq_file *m) { - int cpu; - #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST print_tickdevice(m, tick_get_broadcast_device(), -1); SEQ_printf(m, "tick_broadcast_mask: %08lx\n", @@ -246,12 +244,7 @@ static void timer_list_show_tickdevices(struct seq_file *m) #endif SEQ_printf(m, "\n"); #endif - for_each_online_cpu(cpu) - print_tickdevice(m, tick_get_device(cpu), cpu); - SEQ_printf(m, "\n"); } -#else -static void timer_list_show_tickdevices(struct seq_file *m) { } #endif static int timer_list_show(struct seq_file *m, void *v) @@ -262,12 +255,16 @@ static int timer_list_show(struct seq_file *m, void *v) SEQ_printf(m, "Timer List Version: v0.7\n"); SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); + SEQ_printf(m, "\n"); for_each_online_cpu(cpu) print_cpu(m, cpu, now); - SEQ_printf(m, "\n"); - timer_list_show_tickdevices(m); +#ifdef CONFIG_GENERIC_CLOCKEVENTS + timer_list_show_tickdevices_header(m); + for_each_online_cpu(cpu) + print_tickdevice(m, tick_get_device(cpu), cpu); +#endif return 0; } From b3956a896ea57f25cacd74708b8fab611543a81d Mon Sep 17 00:00:00 2001 From: Nathan Zimmer Date: Tue, 26 Mar 2013 19:56:30 -0500 Subject: [PATCH 48/52] timer_list: Convert timer list to be a proper seq_file When running with 4096 cores, attempting to read /proc/timer_list will fail with an ENOMEM condition. On a sufficiently large system the total amount of data is more than 4MB, so it won't fit into a single buffer. The failure can also occur on smaller systems when memory fragmentation is high, as reported by Dave Jones. Convert /proc/timer_list to a proper seq_file with its own iterator. This is a little more complex given that we have to make two passes with two separate headers. sysrq_timer_list_show also needed to be updated to reflect the fact that now timer_list_show only does one cpu at a time.
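For reference, the seq_file iterator contract that the conversion relies on, as a hedged sketch (all names below are invented; the real iterator additionally encodes the two-pass cpu walk in its private state): ->start maps the file position to an element, ->show emits one record, ->next advances, and the seq_file core calls them repeatedly so output is produced in page-sized chunks instead of one huge buffer:

    static long demo_records[] = { 1, 2, 3 };   /* hypothetical data */

    static void *demo_start(struct seq_file *m, loff_t *pos)
    {
            return *pos < (loff_t)ARRAY_SIZE(demo_records) ?
                    &demo_records[*pos] : NULL;
    }

    static void *demo_next(struct seq_file *m, void *v, loff_t *pos)
    {
            ++*pos;
            return demo_start(m, pos);
    }

    static void demo_stop(struct seq_file *m, void *v)
    {
    }

    static int demo_show(struct seq_file *m, void *v)
    {
            seq_printf(m, "%ld\n", *(long *)v);     /* one record per call */
            return 0;
    }

    static const struct seq_operations demo_sops = {
            .start = demo_start,
            .next  = demo_next,
            .stop  = demo_stop,
            .show  = demo_show,
    };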
Signed-off-by: Nathan Zimmer Reported-by: Dave Jones Cc: John Stultz Cc: Stephen Boyd Link: http://lkml.kernel.org/r/1364345790-14577-3-git-send-email-nzimmer@sgi.com Signed-off-by: Thomas Gleixner --- kernel/time/timer_list.c | 91 ++++++++++++++++++++++++++++++++++------ 1 file changed, 78 insertions(+), 13 deletions(-) diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 380a58977490..3bdf28323012 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -20,6 +20,13 @@ #include + +struct timer_list_iter { + int cpu; + bool second_pass; + u64 now; +}; + typedef void (*print_fn_t)(struct seq_file *m, unsigned int *classes); DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases); @@ -247,43 +254,101 @@ static void timer_list_show_tickdevices_header(struct seq_file *m) } #endif -static int timer_list_show(struct seq_file *m, void *v) +static inline void timer_list_header(struct seq_file *m, u64 now) { - u64 now = ktime_to_ns(ktime_get()); - int cpu; - SEQ_printf(m, "Timer List Version: v0.7\n"); SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); SEQ_printf(m, "\n"); +} - for_each_online_cpu(cpu) - print_cpu(m, cpu, now); +static int timer_list_show(struct seq_file *m, void *v) +{ + struct timer_list_iter *iter = v; + u64 now = ktime_to_ns(ktime_get()); + if (iter->cpu == -1 && !iter->second_pass) + timer_list_header(m, now); + else if (!iter->second_pass) + print_cpu(m, iter->cpu, iter->now); #ifdef CONFIG_GENERIC_CLOCKEVENTS - timer_list_show_tickdevices_header(m); - for_each_online_cpu(cpu) - print_tickdevice(m, tick_get_device(cpu), cpu); + else if (iter->cpu == -1 && iter->second_pass) + timer_list_show_tickdevices_header(m); + else + print_tickdevice(m, tick_get_device(iter->cpu), iter->cpu); #endif - return 0; } void sysrq_timer_list_show(void) { - timer_list_show(NULL, NULL); + u64 now = ktime_to_ns(ktime_get()); + int cpu; + + timer_list_header(NULL, now); + + for_each_online_cpu(cpu) + print_cpu(NULL, cpu, now); + +#ifdef CONFIG_GENERIC_CLOCKEVENTS + timer_list_show_tickdevices_header(NULL); + for_each_online_cpu(cpu) + print_tickdevice(NULL, tick_get_device(cpu), cpu); +#endif + return; } +static void *timer_list_start(struct seq_file *file, loff_t *offset) +{ + struct timer_list_iter *iter = file->private; + + if (!*offset) { + iter->cpu = -1; + iter->now = ktime_to_ns(ktime_get()); + } else if (iter->cpu >= nr_cpu_ids) { +#ifdef CONFIG_GENERIC_CLOCKEVENTS + if (!iter->second_pass) { + iter->cpu = -1; + iter->second_pass = true; + } else + return NULL; +#else + return NULL; +#endif + } + return iter; +} + +static void *timer_list_next(struct seq_file *file, void *v, loff_t *offset) +{ + struct timer_list_iter *iter = file->private; + iter->cpu = cpumask_next(iter->cpu, cpu_online_mask); + ++*offset; + return timer_list_start(file, offset); +} + +static void timer_list_stop(struct seq_file *seq, void *v) +{ +} + +static const struct seq_operations timer_list_sops = { + .start = timer_list_start, + .next = timer_list_next, + .stop = timer_list_stop, + .show = timer_list_show, +}; + static int timer_list_open(struct inode *inode, struct file *filp) { - return single_open(filp, timer_list_show, NULL); + return seq_open_private(filp, &timer_list_sops, + sizeof(struct timer_list_iter)); } static const struct file_operations timer_list_fops = { .open = timer_list_open, .read = seq_read, .llseek = seq_lseek, - .release = single_release, + .release = seq_release_private, }; 
static int __init init_timer_list_procfs(void) From c038c1c44179c80da6201f91ef354e48d5689617 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Wed, 17 Apr 2013 10:26:06 -0700 Subject: [PATCH 49/52] clockevents: Switch into oneshot mode even if broadcast registered late tick_oneshot_notify() is used to notify a particular CPU to try to switch into oneshot mode after a oneshot capable tick device is registered and tick_clock_notify() is used to notify all CPUs to try to switch into oneshot mode after a high res clocksource is registered. There is one caveat; if the tick devices suffer from FEAT_C3_STOP we don't try to switch into oneshot mode unless we have a oneshot capable broadcast device already registered. If the broadcast device is registered after the tick devices that have FEAT_C3_STOP we'll never try to switch into oneshot mode again, causing us to be stuck in periodic mode forever. Avoid this scenario by calling tick_clock_notify() after we register the broadcast device so that we try to switch into oneshot mode on all CPUs one more time. [ tglx: Adopted to timers/core and added a comment ] Signed-off-by: Stephen Boyd Link: http://lkml.kernel.org/r/1366219566-29783-1-git-send-email-sboyd@codeaurora.org Signed-off-by: Thomas Gleixner --- kernel/time/tick-broadcast.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index d76d816afc5d..f8d2109ef0a2 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -75,6 +75,16 @@ int tick_check_broadcast_device(struct clock_event_device *dev) tick_broadcast_device.evtdev = dev; if (!cpumask_empty(tick_broadcast_mask)) tick_broadcast_start_periodic(dev); + /* + * Inform all cpus about this. We might be in a situation + * where we did not switch to oneshot mode because the per cpu + * devices are affected by CLOCK_EVT_FEAT_C3STOP and the lack + * of a oneshot capable broadcast device. Without that + * notification the systems stays stuck in periodic mode + * forever. + */ + if (dev->features & CLOCK_EVT_FEAT_ONESHOT) + tick_clock_notify(); return 1; } From d2054b2c11682495fca41e9d4092e654df63b517 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 18 Apr 2013 12:51:19 +0200 Subject: [PATCH 50/52] posix-timers: Remove unused variable Remove the unused variable *node introduced by commit 5ed67f05 (posix timers: Allocate timer id per process) Signed-off-by: Thomas Gleixner Cc: Pavel Emelyanov --- kernel/posix-timers.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 34d75926b843..424c2d4265c9 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -154,7 +154,6 @@ static struct k_itimer *__posix_timers_find(struct hlist_head *head, struct signal_struct *sig, timer_t id) { - struct hlist_node *node; struct k_itimer *timer; hlist_for_each_entry_rcu(timer, head, t_hash) { From 77c675ba18836802f6b73d2d773481d06ebc0f04 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 22 Apr 2013 09:37:04 +0200 Subject: [PATCH 51/52] timekeeping: Update tk->cycle_last in resume commit 7ec98e15aa (timekeeping: Delay update of clock->cycle_last) forgot to update tk->cycle_last in the resume path. This results in a stale value versus clock->cycle_last and prevents resume in the worst case. 
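The failure mode is easy to see in a sketch (illustrative only; the function is made up, the delta computation matches the timekeeping code): cycle deltas are taken with a masked subtraction, so a cycle_last left ahead of the freshly read counter does not produce a small negative number but a huge bogus interval:

    /* Sketch: the delta computation that goes wrong after resume. */
    static cycle_t resume_delta_sketch(struct clocksource *clock,
                                       cycle_t cycle_last)
    {
            cycle_t now = clock->read(clock);

            /*
             * If cycle_last was not re-based on resume, 'now' can be
             * behind it, and the mask turns the difference into an
             * enormous forward jump instead of a tiny one.
             */
            return (now - cycle_last) & clock->mask;
    }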
From 77c675ba18836802f6b73d2d773481d06ebc0f04 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Mon, 22 Apr 2013 09:37:04 +0200
Subject: [PATCH 51/52] timekeeping: Update tk->cycle_last in resume

commit 7ec98e15aa (timekeeping: Delay update of clock->cycle_last)
forgot to update tk->cycle_last in the resume path. This results in a
stale value versus clock->cycle_last and prevents resume in the worst
case.

Reported-by: Jiri Slaby
Reported-and-tested-by: Borislav Petkov
Acked-by: John Stultz
Cc: Linux-pm mailing list
Link: http://lkml.kernel.org/r/alpine.LFD.2.02.1304211648150.21884@ionos
Signed-off-by: Thomas Gleixner
---
 kernel/time/timekeeping.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 675f720a848b..98cd470bbe49 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -951,7 +951,7 @@ static void timekeeping_resume(void)
 		__timekeeping_inject_sleeptime(tk, &ts_delta);
 
 	/* Re-base the last cycle value */
-	clock->cycle_last = cycle_now;
+	tk->cycle_last = clock->cycle_last = cycle_now;
 	tk->ntp_error = 0;
 	timekeeping_suspended = 0;
 	timekeeping_update(tk, false, true);
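[ Editorial note: why a stale cycle_last hurts: elapsed nanoseconds are
  derived from (cycle_now - cycle_last) scaled by the clocksource's
  mult/shift pair, so a cycle_last that is not re-based at resume makes
  the whole suspend interval show up as elapsed time in one reading. The
  following is a toy user-space model of that conversion, not kernel
  code; tk_model, delta_ns() and the constants are made up for
  illustration: ]

#include <stdint.h>
#include <stdio.h>

struct tk_model {
	uint64_t cycle_last;	/* counter value at the last update */
	uint32_t mult;		/* cycles -> ns scale factor */
	uint32_t shift;
};

/* Mirrors the (cycle_now - cycle_last) * mult >> shift conversion
 * (the kernel additionally masks the delta to the counter width). */
static uint64_t delta_ns(const struct tk_model *tk, uint64_t cycle_now)
{
	uint64_t delta = cycle_now - tk->cycle_last;
	return (delta * tk->mult) >> tk->shift;
}

int main(void)
{
	/* mult/shift chosen so one cycle == one nanosecond */
	struct tk_model tk = { .cycle_last = 1000, .mult = 1 << 10, .shift = 10 };
	uint64_t resume_now = 900000000000ULL;	/* counter after a long suspend */

	/* Stale cycle_last: the entire suspend appears as elapsed time. */
	printf("stale:   %llu ns\n", (unsigned long long)delta_ns(&tk, resume_now));

	/* Re-based, as in the fix: tk->cycle_last = clock->cycle_last = cycle_now */
	tk.cycle_last = resume_now;
	printf("rebased: %llu ns\n", (unsigned long long)delta_ns(&tk, resume_now + 42));
	return 0;
}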
From 6f7a05d7018de222e40ca003721037a530979974 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Thu, 25 Apr 2013 11:45:53 +0200
Subject: [PATCH 52/52] clockevents: Set dummy handler on CPU_DEAD shutdown

Vitaliy reported that a per cpu HPET timer interrupt crashes the
system during hibernation. What happens is that the per cpu HPET timer
gets shut down when the nonboot cpus are stopped. When the nonboot
cpus are onlined again, the HPET code sets up the MSI interrupt, which
fires before the clock event device is registered. The event handler
is still set to hrtimer_interrupt, which then crashes the machine due
to highres mode not being active.

See http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=700333

There is no real good way to avoid that in the HPET code. The HPET
code already has a mechanism to detect spurious interrupts when event
handler == NULL for a similar reason.

We can handle that in the clockevent/tick layer and replace the
previous functional handler with a dummy handler like we do in
tick_setup_new_device().

The original clockevents code did this in
clockevents_exchange_device(), but that got removed by commit
7c1e76897 (clockevents: prevent clockevent event_handler ending up
handler_noop), which forgot to fix it up in tick_shutdown(). Same
issue with the broadcast device.

Reported-by: Vitaliy Fillipov
Cc: Ben Hutchings
Cc: stable@vger.kernel.org
Cc: 700333@bugs.debian.org
Signed-off-by: Thomas Gleixner
---
 kernel/time/tick-broadcast.c | 4 ++++
 kernel/time/tick-common.c    | 1 +
 2 files changed, 5 insertions(+)

diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 6e23fde83dbe..61d00a8cdf2f 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -66,6 +66,8 @@ static void tick_broadcast_start_periodic(struct clock_event_device *bc)
  */
 int tick_check_broadcast_device(struct clock_event_device *dev)
 {
+	struct clock_event_device *cur = tick_broadcast_device.evtdev;
+
 	if ((dev->features & CLOCK_EVT_FEAT_DUMMY) ||
 	    (tick_broadcast_device.evtdev &&
 	     tick_broadcast_device.evtdev->rating >= dev->rating) ||
@@ -73,6 +75,8 @@ int tick_check_broadcast_device(struct clock_event_device *dev)
 		return 0;
 
 	clockevents_exchange_device(tick_broadcast_device.evtdev, dev);
+	if (cur)
+		cur->event_handler = clockevents_handle_noop;
 	tick_broadcast_device.evtdev = dev;
 	if (!cpumask_empty(tick_broadcast_mask))
 		tick_broadcast_start_periodic(dev);

diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 74413e396acc..6176a3e45709 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -323,6 +323,7 @@ static void tick_shutdown(unsigned int *cpup)
 		 */
 		dev->mode = CLOCK_EVT_MODE_UNUSED;
 		clockevents_exchange_device(dev, NULL);
+		dev->event_handler = clockevents_handle_noop;
 		td->evtdev = NULL;
 	}
 	raw_spin_unlock_irqrestore(&tick_device_lock, flags);
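[ Editorial note: the fix is an instance of a general teardown rule --
  never leave a stale function pointer reachable from a live interrupt
  source; park it on a no-op instead. The sketch below shows the pattern
  in generic user-space C with illustrative names (event_device,
  handle_noop, irq_fires); it is not the kernel's clockevents API: ]

#include <stdio.h>

struct event_device {
	const char *name;
	void (*event_handler)(struct event_device *dev);
};

static void handle_noop(struct event_device *dev)
{
	/* A late or spurious event after shutdown is simply ignored. */
}

/* Stands in for the interrupt path: always calls through the pointer. */
static void irq_fires(struct event_device *dev)
{
	dev->event_handler(dev);
}

static void shutdown_device(struct event_device *dev)
{
	/* ... hardware-specific quiescing would go here ... */

	/* The equivalent of the tick_shutdown() fix: park the handler so
	 * an interrupt firing in the teardown/re-setup window is harmless. */
	dev->event_handler = handle_noop;
}

int main(void)
{
	struct event_device hpet = { .name = "hpet-percpu" };

	hpet.event_handler = handle_noop;	/* safe default at setup */
	shutdown_device(&hpet);
	irq_fires(&hpet);	/* with a stale handler this would crash */
	printf("late interrupt ignored\n");
	return 0;
}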