Merge branch 'tip/perf/core' of git://git.kernel.org/pub/scm/linux/kernel/git/rostedt/linux-2.6-trace into perf/core

2011-02-17 14:11:15 +01:00 · 2011-02-17 14:11:15 +01:00 · 5beda5f6e4
parent ba3dd36c67 6752ab4a9c
commit 5beda5f6e4
9 changed files with 74 additions and 197 deletions
--- a/Documentation/trace/ftrace-design.txt
+++ b/Documentation/trace/ftrace-design.txt
@ -247,6 +247,13 @@ You need very few things to get the syscalls tracing in an arch.
 - Support the TIF_SYSCALL_TRACEPOINT thread flags.
 - Put the trace_sys_enter() and trace_sys_exit() tracepoints calls from ptrace
  in the ptrace syscalls tracing path.
+- If the system call table on this arch is more complicated than a simple array
+  of addresses of the system calls, implement an arch_syscall_addr to return
+  the address of a given system call.
+- If the symbol names of the system calls do not match the function names on
+  this arch, define ARCH_HAS_SYSCALL_MATCH_SYM_NAME in asm/ftrace.h and
+  implement arch_syscall_match_sym_name with the appropriate logic to return
+  true if the function name corresponds with the symbol name.
 - Tag this arch as HAVE_SYSCALL_TRACEPOINTS.


--- a/Documentation/trace/ftrace.txt
+++ b/Documentation/trace/ftrace.txt
@ -80,11 +80,11 @@ of ftrace. Here is a list of some of the key files:
 	tracers listed here can be configured by
 	echoing their name into current_tracer.

-  tracing_enabled:
+  tracing_on:

-	This sets or displays whether the current_tracer
-	is activated and tracing or not. Echo 0 into this
-	file to disable the tracer or 1 to enable it.
+	This sets or displays whether writing to the trace
+	ring buffer is enabled. Echo 0 into this file to disable
+	the tracer or 1 to enable it.

  trace:

@ -202,10 +202,6 @@ Here is the list of current tracers that may be configured.
 	to draw a graph of function calls similar to C code
 	source.

-  "sched_switch"
-
-	Traces the context switches and wakeups between tasks.
-
  "irqsoff"

 	Traces the areas that disable interrupts and saves
@ -273,39 +269,6 @@ format, the function name that was traced "path_put" and the
 parent function that called this function "path_walk". The
 timestamp is the time at which the function was entered.

-The sched_switch tracer also includes tracing of task wakeups
-and context switches.
-
-     ksoftirqd/1-7     [01]  1453.070013:      7:115:R   +  2916:115:S
-     ksoftirqd/1-7     [01]  1453.070013:      7:115:R   +    10:115:S
-     ksoftirqd/1-7     [01]  1453.070013:      7:115:R ==>    10:115:R
-        events/1-10    [01]  1453.070013:     10:115:S ==>  2916:115:R
-     kondemand/1-2916  [01]  1453.070013:   2916:115:S ==>     7:115:R
-     ksoftirqd/1-7     [01]  1453.070013:      7:115:S ==>     0:140:R
-
-Wake ups are represented by a "+" and the context switches are
-shown as "==>".  The format is:
-
- Context switches:
-
-       Previous task              Next Task
-
-  <pid>:<prio>:<state>  ==>  <pid>:<prio>:<state>
-
- Wake ups:
-
-       Current task               Task waking up
-
-  <pid>:<prio>:<state>    +  <pid>:<prio>:<state>
-
-The prio is the internal kernel priority, which is the inverse
-of the priority that is usually displayed by user-space tools.
-Zero represents the highest priority (99). Prio 100 starts the
-"nice" priorities with 100 being equal to nice -20 and 139 being
-nice 19. The prio "140" is reserved for the idle task which is
-the lowest priority thread (pid 0).
-
-
 Latency trace format
 --------------------

@ -491,79 +454,6 @@ x494] <- /root/a.out[+0x4a8] <- /lib/libc-2.7.so[+0x1e1a6]
                   latencies, as described in "Latency
                   trace format".

-sched_switch
------------
-
-This tracer simply records schedule switches. Here is an example
-of how to use it.
-
- # echo sched_switch > current_tracer
- # echo 1 > tracing_enabled
- # sleep 1
- # echo 0 > tracing_enabled
- # cat trace
-
-# tracer: sched_switch
-#
-#           TASK-PID   CPU#    TIMESTAMP  FUNCTION
-#              | |      |          |         |
-            bash-3997  [01]   240.132281:   3997:120:R   +  4055:120:R
-            bash-3997  [01]   240.132284:   3997:120:R ==>  4055:120:R
-           sleep-4055  [01]   240.132371:   4055:120:S ==>  3997:120:R
-            bash-3997  [01]   240.132454:   3997:120:R   +  4055:120:S
-            bash-3997  [01]   240.132457:   3997:120:R ==>  4055:120:R
-           sleep-4055  [01]   240.132460:   4055:120:D ==>  3997:120:R
-            bash-3997  [01]   240.132463:   3997:120:R   +  4055:120:D
-            bash-3997  [01]   240.132465:   3997:120:R ==>  4055:120:R
-          <idle>-0     [00]   240.132589:      0:140:R   +     4:115:S
-          <idle>-0     [00]   240.132591:      0:140:R ==>     4:115:R
-     ksoftirqd/0-4     [00]   240.132595:      4:115:S ==>     0:140:R
-          <idle>-0     [00]   240.132598:      0:140:R   +     4:115:S
-          <idle>-0     [00]   240.132599:      0:140:R ==>     4:115:R
-     ksoftirqd/0-4     [00]   240.132603:      4:115:S ==>     0:140:R
-           sleep-4055  [01]   240.133058:   4055:120:S ==>  3997:120:R
- [...]
-
-
-As we have discussed previously about this format, the header
-shows the name of the trace and points to the options. The
-"FUNCTION" is a misnomer since here it represents the wake ups
-and context switches.
-
-The sched_switch file only lists the wake ups (represented with
-'+') and context switches ('==>') with the previous task or
-current task first followed by the next task or task waking up.
-The format for both of these is PID:KERNEL-PRIO:TASK-STATE.
-Remember that the KERNEL-PRIO is the inverse of the actual
-priority with zero (0) being the highest priority and the nice
-values starting at 100 (nice -20). Below is a quick chart to map
-the kernel priority to user land priorities.
-
-   Kernel Space                     User Space
- ===============================================================
-   0(high) to  98(low)     user RT priority 99(high) to 1(low)
-                           with SCHED_RR or SCHED_FIFO
- ---------------------------------------------------------------
-  99                       sched_priority is not used in scheduling
-                           decisions(it must be specified as 0)
- ---------------------------------------------------------------
- 100(high) to 139(low)     user nice -20(high) to 19(low)
- ---------------------------------------------------------------
- 140                       idle task priority
- ---------------------------------------------------------------
-
-The task states are:
-
- R - running : wants to run, may not actually be running
- S - sleep   : process is waiting to be woken up (handles signals)
- D - disk sleep (uninterruptible sleep) : process must be woken up
-					(ignores signals)
- T - stopped : process suspended
- t - traced  : process is being traced (with something like gdb)
- Z - zombie  : process waiting to be cleaned up
- X - unknown
-
-
 ftrace_enabled
 --------------

@ -607,10 +497,10 @@ an example:
 # echo irqsoff > current_tracer
 # echo latency-format > trace_options
 # echo 0 > tracing_max_latency
- # echo 1 > tracing_enabled
+ # echo 1 > tracing_on
 # ls -ltr
 [...]
- # echo 0 > tracing_enabled
+ # echo 0 > tracing_on
 # cat trace
 # tracer: irqsoff
 #
@ -715,10 +605,10 @@ is much like the irqsoff tracer.
 # echo preemptoff > current_tracer
 # echo latency-format > trace_options
 # echo 0 > tracing_max_latency
- # echo 1 > tracing_enabled
+ # echo 1 > tracing_on
 # ls -ltr
 [...]
- # echo 0 > tracing_enabled
+ # echo 0 > tracing_on
 # cat trace
 # tracer: preemptoff
 #
@ -863,10 +753,10 @@ tracers.
 # echo preemptirqsoff > current_tracer
 # echo latency-format > trace_options
 # echo 0 > tracing_max_latency
- # echo 1 > tracing_enabled
+ # echo 1 > tracing_on
 # ls -ltr
 [...]
- # echo 0 > tracing_enabled
+ # echo 0 > tracing_on
 # cat trace
 # tracer: preemptirqsoff
 #
@ -1026,9 +916,9 @@ Instead of performing an 'ls', we will run 'sleep 1' under
 # echo wakeup > current_tracer
 # echo latency-format > trace_options
 # echo 0 > tracing_max_latency
- # echo 1 > tracing_enabled
+ # echo 1 > tracing_on
 # chrt -f 5 sleep 1
- # echo 0 > tracing_enabled
+ # echo 0 > tracing_on
 # cat trace
 # tracer: wakeup
 #
@ -1140,9 +1030,9 @@ ftrace_enabled is set; otherwise this tracer is a nop.

 # sysctl kernel.ftrace_enabled=1
 # echo function > current_tracer
- # echo 1 > tracing_enabled
+ # echo 1 > tracing_on
 # usleep 1
- # echo 0 > tracing_enabled
+ # echo 0 > tracing_on
 # cat trace
 # tracer: function
 #
@ -1180,7 +1070,7 @@ int trace_fd;
 [...]
 int main(int argc, char *argv[]) {
 	[...]
-	trace_fd = open(tracing_file("tracing_enabled"), O_WRONLY);
+	trace_fd = open(tracing_file("tracing_on"), O_WRONLY);
 	[...]
 	if (condition_hit()) {
 		write(trace_fd, "0", 1);
@ -1631,9 +1521,9 @@ If I am only interested in sys_nanosleep and hrtimer_interrupt:
 # echo sys_nanosleep hrtimer_interrupt \
 		> set_ftrace_filter
 # echo function > current_tracer
- # echo 1 > tracing_enabled
+ # echo 1 > tracing_on
 # usleep 1
- # echo 0 > tracing_enabled
+ # echo 0 > tracing_on
 # cat trace
 # tracer: ftrace
 #
@ -1879,9 +1769,9 @@ different. The trace is live.
 # echo function > current_tracer
 # cat trace_pipe > /tmp/trace.out &
 [1] 4153
- # echo 1 > tracing_enabled
+ # echo 1 > tracing_on
 # usleep 1
- # echo 0 > tracing_enabled
+ # echo 0 > tracing_on
 # cat trace
 # tracer: function
 #
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@ -132,11 +132,11 @@ extern struct trace_event_functions exit_syscall_print_funcs;
 		.class			= &event_class_syscall_enter,	\
 		.event.funcs            = &enter_syscall_print_funcs,	\
 		.data			= (void *)&__syscall_meta_##sname,\
+		.flags			= TRACE_EVENT_FL_CAP_ANY,	\
 	};								\
 	static struct ftrace_event_call __used				\
 	  __attribute__((section("_ftrace_events")))			\
-	 *__event_enter_##sname = &event_enter_##sname;			\
-	__TRACE_EVENT_FLAGS(enter_##sname, TRACE_EVENT_FL_CAP_ANY)
+	 *__event_enter_##sname = &event_enter_##sname;

 #define SYSCALL_TRACE_EXIT_EVENT(sname)					\
 	static struct syscall_metadata __syscall_meta_##sname;		\
@ -146,11 +146,11 @@ extern struct trace_event_functions exit_syscall_print_funcs;
 		.class			= &event_class_syscall_exit,	\
 		.event.funcs		= &exit_syscall_print_funcs,	\
 		.data			= (void *)&__syscall_meta_##sname,\
+		.flags			= TRACE_EVENT_FL_CAP_ANY,	\
 	};								\
 	static struct ftrace_event_call __used				\
 	  __attribute__((section("_ftrace_events")))			\
-	*__event_exit_##sname = &event_exit_##sname;			\
-	__TRACE_EVENT_FLAGS(exit_##sname, TRACE_EVENT_FL_CAP_ANY)
+	*__event_exit_##sname = &event_exit_##sname;

 #define SYSCALL_METADATA(sname, nb)				\
 	SYSCALL_TRACE_ENTER_EVENT(sname);			\
@ -158,6 +158,7 @@ extern struct trace_event_functions exit_syscall_print_funcs;
 	static struct syscall_metadata __used			\
 	  __syscall_meta_##sname = {				\
 		.name 		= "sys"#sname,			\
+		.syscall_nr	= -1,	/* Filled in at boot */	\
 		.nb_args 	= nb,				\
 		.types		= types_##sname,		\
 		.args		= args_##sname,			\
@ -175,6 +176,7 @@ extern struct trace_event_functions exit_syscall_print_funcs;
 	static struct syscall_metadata __used			\
 	  __syscall_meta__##sname = {				\
 		.name 		= "sys_"#sname,			\
+		.syscall_nr	= -1,	/* Filled in at boot */	\
 		.nb_args 	= 0,				\
 		.enter_event	= &event_enter__##sname,	\
 		.exit_event	= &event_exit__##sname,		\
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@ -2163,10 +2163,14 @@ rb_reserve_next_event(struct ring_buffer *buffer,
 		delta = diff;
 		if (unlikely(test_time_stamp(delta))) {
 			WARN_ONCE(delta > (1ULL << 59),
-				  KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n",
+				  KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s",
 				  (unsigned long long)delta,
 				  (unsigned long long)ts,
-				  (unsigned long long)cpu_buffer->write_stamp);
+				  (unsigned long long)cpu_buffer->write_stamp,
+				  sched_clock_stable ? "" :
+				  "If you just came from a suspend/resume,\n"
+				  "please switch to the trace global clock:\n"
+				  "  echo global > /sys/kernel/debug/tracing/trace_clock\n");
 			add_timestamp = 1;
 		}
 	}
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@ -2710,6 +2710,10 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf,

 	mutex_lock(&trace_types_lock);
 	if (tracer_enabled ^ val) {
+
+		/* Only need to warn if this is used to change the state */
+		WARN_ONCE(1, "tracing_enabled is deprecated. Use tracing_on");
+
 		if (val) {
 			tracer_enabled = 1;
 			if (current_trace->start)
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@ -247,51 +247,3 @@ void tracing_sched_switch_assign_trace(struct trace_array *tr)
 	ctx_trace = tr;
 }

-static void stop_sched_trace(struct trace_array *tr)
-{
-	tracing_stop_sched_switch_record();
-}
-
-static int sched_switch_trace_init(struct trace_array *tr)
-{
-	ctx_trace = tr;
-	tracing_reset_online_cpus(tr);
-	tracing_start_sched_switch_record();
-	return 0;
-}
-
-static void sched_switch_trace_reset(struct trace_array *tr)
-{
-	if (sched_ref)
-		stop_sched_trace(tr);
-}
-
-static void sched_switch_trace_start(struct trace_array *tr)
-{
-	sched_stopped = 0;
-}
-
-static void sched_switch_trace_stop(struct trace_array *tr)
-{
-	sched_stopped = 1;
-}
-
-static struct tracer sched_switch_trace __read_mostly =
-{
-	.name		= "sched_switch",
-	.init		= sched_switch_trace_init,
-	.reset		= sched_switch_trace_reset,
-	.start		= sched_switch_trace_start,
-	.stop		= sched_switch_trace_stop,
-	.wait_pipe	= poll_wait_pipe,
-#ifdef CONFIG_FTRACE_SELFTEST
-	.selftest    = trace_selftest_startup_sched_switch,
-#endif
-};
-
-__init static int init_sched_switch_trace(void)
-{
-	return register_tracer(&sched_switch_trace);
-}
-device_initcall(init_sched_switch_trace);
-
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@ -60,6 +60,19 @@ extern struct syscall_metadata *__stop_syscalls_metadata[];

 static struct syscall_metadata **syscalls_metadata;

+#ifndef ARCH_HAS_SYSCALL_MATCH_SYM_NAME
+static inline bool arch_syscall_match_sym_name(const char *sym, const char *name)
+{
+	/*
+	 * Only compare after the "sys" prefix. Archs that use
+	 * syscall wrappers may have syscalls symbols aliases prefixed
+	 * with "SyS" instead of "sys", leading to an unwanted
+	 * mismatch.
+	 */
+	return !strcmp(sym + 3, name + 3);
+}
+#endif
+
 static __init struct syscall_metadata *
 find_syscall_meta(unsigned long syscall)
 {
@ -72,14 +85,11 @@ find_syscall_meta(unsigned long syscall)
 	stop = __stop_syscalls_metadata;
 	kallsyms_lookup(syscall, NULL, NULL, NULL, str);

+	if (arch_syscall_match_sym_name(str, "sys_ni_syscall"))
+		return NULL;
+
 	for ( ; start < stop; start++) {
-		/*
-		 * Only compare after the "sys" prefix. Archs that use
-		 * syscall wrappers may have syscalls symbols aliases prefixed
-		 * with "SyS" instead of "sys", leading to an unwanted
-		 * mismatch.
-		 */
-		if ((*start)->name && !strcmp((*start)->name + 3, str + 3))
+		if ((*start)->name && arch_syscall_match_sym_name(str, (*start)->name))
 			return *start;
 	}
 	return NULL;
@ -359,7 +369,7 @@ int reg_event_syscall_enter(struct ftrace_event_call *call)
 	int num;

 	num = ((struct syscall_metadata *)call->data)->syscall_nr;
-	if (num < 0 || num >= NR_syscalls)
+	if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
 		return -ENOSYS;
 	mutex_lock(&syscall_trace_lock);
 	if (!sys_refcount_enter)
@ -377,7 +387,7 @@ void unreg_event_syscall_enter(struct ftrace_event_call *call)
 	int num;

 	num = ((struct syscall_metadata *)call->data)->syscall_nr;
-	if (num < 0 || num >= NR_syscalls)
+	if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
 		return;
 	mutex_lock(&syscall_trace_lock);
 	sys_refcount_enter--;
@ -393,7 +403,7 @@ int reg_event_syscall_exit(struct ftrace_event_call *call)
 	int num;

 	num = ((struct syscall_metadata *)call->data)->syscall_nr;
-	if (num < 0 || num >= NR_syscalls)
+	if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
 		return -ENOSYS;
 	mutex_lock(&syscall_trace_lock);
 	if (!sys_refcount_exit)
@ -411,7 +421,7 @@ void unreg_event_syscall_exit(struct ftrace_event_call *call)
 	int num;

 	num = ((struct syscall_metadata *)call->data)->syscall_nr;
-	if (num < 0 || num >= NR_syscalls)
+	if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
 		return;
 	mutex_lock(&syscall_trace_lock);
 	sys_refcount_exit--;
@ -424,6 +434,14 @@ void unreg_event_syscall_exit(struct ftrace_event_call *call)
 int init_syscall_trace(struct ftrace_event_call *call)
 {
 	int id;
+	int num;
+
+	num = ((struct syscall_metadata *)call->data)->syscall_nr;
+	if (num < 0 || num >= NR_syscalls) {
+		pr_debug("syscall %s metadata not mapped, disabling ftrace event\n",
+				((struct syscall_metadata *)call->data)->name);
+		return -ENOSYS;
+	}

 	if (set_syscall_print_fmt(call) < 0)
 		return -ENOMEM;
@ -438,7 +456,7 @@ int init_syscall_trace(struct ftrace_event_call *call)
 	return id;
 }

-unsigned long __init arch_syscall_addr(int nr)
+unsigned long __init __weak arch_syscall_addr(int nr)
 {
 	return (unsigned long)sys_call_table[nr];
 }
--- a/scripts/kconfig/streamline_config.pl
+++ b/scripts/kconfig/streamline_config.pl
@ -1,6 +1,6 @@
 #!/usr/bin/perl -w
 #
-# Copywrite 2005-2009 - Steven Rostedt
+# Copyright 2005-2009 - Steven Rostedt
 # Licensed under the terms of the GNU GPL License version 2
 #
 #  It's simple enough to figure out how this works.
--- a/tools/testing/ktest/ktest.pl
+++ b/tools/testing/ktest/ktest.pl
@ -1,6 +1,6 @@
 #!/usr/bin/perl -w
 #
-# Copywrite 2010 - Steven Rostedt <srostedt@redhat.com>, Red Hat Inc.
+# Copyright 2010 - Steven Rostedt <srostedt@redhat.com>, Red Hat Inc.
 # Licensed under the terms of the GNU GPL License version 2
 #