From 9f498cc5be7e013d8d6e4c616980ed0ffc8680d2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 23 Jul 2009 14:46:33 +0200 Subject: [PATCH] perf_counter: Full task tracing In order to be able to distinguish between no samples due to inactivity and no samples due to task ended, Arjan asked for PERF_EVENT_EXIT events. This is useful to the boot delay instrumentation (bootchart) app. This patch changes the PERF_EVENT_FORK to be emitted on every clone, and adds PERF_EVENT_EXIT to be emitted on task exit, after the task's counters have been closed. This task tracing is controlled through: attr.comm || attr.mmap and through the new attr.task field. Suggested-by: Arjan van de Ven Cc: Paul Mackerras Cc: Anton Blanchard Signed-off-by: Peter Zijlstra [ cleaned up perf_counter.h a bit ] Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 13 +++- kernel/fork.c | 4 +- kernel/perf_counter.c | 113 ++++++++++++++++++++++------------- 3 files changed, 84 insertions(+), 46 deletions(-) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index bd15d7a5f5ce..e604e6ef72dd 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -181,8 +181,9 @@ struct perf_counter_attr { freq : 1, /* use freq, not period */ inherit_stat : 1, /* per task counts */ enable_on_exec : 1, /* next exec enables */ + task : 1, /* trace fork/exit */ - __reserved_1 : 51; + __reserved_1 : 50; __u32 wakeup_events; /* wakeup every n events */ __u32 __reserved_2; @@ -308,6 +309,15 @@ enum perf_event_type { */ PERF_EVENT_COMM = 3, + /* + * struct { + * struct perf_event_header header; + * u32 pid, ppid; + * u32 tid, ptid; + * }; + */ + PERF_EVENT_EXIT = 4, + /* * struct { * struct perf_event_header header; @@ -323,6 +333,7 @@ enum perf_event_type { * struct { * struct perf_event_header header; * u32 pid, ppid; + * u32 tid, ptid; * }; */ PERF_EVENT_FORK = 7, diff --git a/kernel/fork.c b/kernel/fork.c index 29b532e718f7..466531eb92cc 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1269,6 +1269,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, write_unlock_irq(&tasklist_lock); proc_fork_connector(p); cgroup_post_fork(p); + perf_counter_fork(p); return p; bad_fork_free_pid: @@ -1410,9 +1411,6 @@ long do_fork(unsigned long clone_flags, init_completion(&vfork); } - if (!(clone_flags & CLONE_THREAD)) - perf_counter_fork(p); - audit_finish_fork(p); tracehook_report_clone(regs, clone_flags, nr, p); diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 48471d75ae01..199ed4771315 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -42,6 +42,7 @@ static int perf_overcommit __read_mostly = 1; static atomic_t nr_counters __read_mostly; static atomic_t nr_mmap_counters __read_mostly; static atomic_t nr_comm_counters __read_mostly; +static atomic_t nr_task_counters __read_mostly; /* * perf counter paranoia level: @@ -1654,6 +1655,8 @@ static void free_counter(struct perf_counter *counter) atomic_dec(&nr_mmap_counters); if (counter->attr.comm) atomic_dec(&nr_comm_counters); + if (counter->attr.task) + atomic_dec(&nr_task_counters); } if (counter->destroy) @@ -2831,10 +2834,12 @@ perf_counter_read_event(struct perf_counter *counter, } /* - * fork tracking + * task tracking -- fork/exit + * + * enabled by: attr.comm | attr.mmap | attr.task */ -struct perf_fork_event { +struct perf_task_event { struct task_struct *task; struct { @@ -2842,37 +2847,42 @@ struct perf_fork_event { u32 pid; u32 ppid; + u32 tid; + u32 ptid; } event; }; -static void perf_counter_fork_output(struct perf_counter *counter, - struct perf_fork_event *fork_event) +static void perf_counter_task_output(struct perf_counter *counter, + struct perf_task_event *task_event) { struct perf_output_handle handle; - int size = fork_event->event.header.size; - struct task_struct *task = fork_event->task; + int size = task_event->event.header.size; + struct task_struct *task = task_event->task; int ret = perf_output_begin(&handle, counter, size, 0, 0); if (ret) return; - fork_event->event.pid = perf_counter_pid(counter, task); - fork_event->event.ppid = perf_counter_pid(counter, task->real_parent); + task_event->event.pid = perf_counter_pid(counter, task); + task_event->event.ppid = perf_counter_pid(counter, task->real_parent); - perf_output_put(&handle, fork_event->event); + task_event->event.tid = perf_counter_tid(counter, task); + task_event->event.ptid = perf_counter_tid(counter, task->real_parent); + + perf_output_put(&handle, task_event->event); perf_output_end(&handle); } -static int perf_counter_fork_match(struct perf_counter *counter) +static int perf_counter_task_match(struct perf_counter *counter) { - if (counter->attr.comm || counter->attr.mmap) + if (counter->attr.comm || counter->attr.mmap || counter->attr.task) return 1; return 0; } -static void perf_counter_fork_ctx(struct perf_counter_context *ctx, - struct perf_fork_event *fork_event) +static void perf_counter_task_ctx(struct perf_counter_context *ctx, + struct perf_task_event *task_event) { struct perf_counter *counter; @@ -2881,19 +2891,19 @@ static void perf_counter_fork_ctx(struct perf_counter_context *ctx, rcu_read_lock(); list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { - if (perf_counter_fork_match(counter)) - perf_counter_fork_output(counter, fork_event); + if (perf_counter_task_match(counter)) + perf_counter_task_output(counter, task_event); } rcu_read_unlock(); } -static void perf_counter_fork_event(struct perf_fork_event *fork_event) +static void perf_counter_task_event(struct perf_task_event *task_event) { struct perf_cpu_context *cpuctx; struct perf_counter_context *ctx; cpuctx = &get_cpu_var(perf_cpu_context); - perf_counter_fork_ctx(&cpuctx->ctx, fork_event); + perf_counter_task_ctx(&cpuctx->ctx, task_event); put_cpu_var(perf_cpu_context); rcu_read_lock(); @@ -2903,32 +2913,40 @@ static void perf_counter_fork_event(struct perf_fork_event *fork_event) */ ctx = rcu_dereference(current->perf_counter_ctxp); if (ctx) - perf_counter_fork_ctx(ctx, fork_event); + perf_counter_task_ctx(ctx, task_event); rcu_read_unlock(); } +static void perf_counter_task(struct task_struct *task, int new) +{ + struct perf_task_event task_event; + + if (!atomic_read(&nr_comm_counters) && + !atomic_read(&nr_mmap_counters) && + !atomic_read(&nr_task_counters)) + return; + + task_event = (struct perf_task_event){ + .task = task, + .event = { + .header = { + .type = new ? PERF_EVENT_FORK : PERF_EVENT_EXIT, + .misc = 0, + .size = sizeof(task_event.event), + }, + /* .pid */ + /* .ppid */ + /* .tid */ + /* .ptid */ + }, + }; + + perf_counter_task_event(&task_event); +} + void perf_counter_fork(struct task_struct *task) { - struct perf_fork_event fork_event; - - if (!atomic_read(&nr_comm_counters) && - !atomic_read(&nr_mmap_counters)) - return; - - fork_event = (struct perf_fork_event){ - .task = task, - .event = { - .header = { - .type = PERF_EVENT_FORK, - .misc = 0, - .size = sizeof(fork_event.event), - }, - /* .pid */ - /* .ppid */ - }, - }; - - perf_counter_fork_event(&fork_event); + perf_counter_task(task, 1); } /* @@ -3887,6 +3905,8 @@ done: atomic_inc(&nr_mmap_counters); if (counter->attr.comm) atomic_inc(&nr_comm_counters); + if (counter->attr.task) + atomic_inc(&nr_task_counters); } return counter; @@ -4248,8 +4268,10 @@ void perf_counter_exit_task(struct task_struct *child) struct perf_counter_context *child_ctx; unsigned long flags; - if (likely(!child->perf_counter_ctxp)) + if (likely(!child->perf_counter_ctxp)) { + perf_counter_task(child, 0); return; + } local_irq_save(flags); /* @@ -4267,15 +4289,22 @@ void perf_counter_exit_task(struct task_struct *child) * incremented the context's refcount before we do put_ctx below. */ spin_lock(&child_ctx->lock); - child->perf_counter_ctxp = NULL; /* * If this context is a clone; unclone it so it can't get * swapped to another process while we're removing all * the counters from it. */ unclone_ctx(child_ctx); - spin_unlock(&child_ctx->lock); - local_irq_restore(flags); + spin_unlock_irqrestore(&child_ctx->lock, flags); + + /* + * Report the task dead after unscheduling the counters so that we + * won't get any samples after PERF_EVENT_EXIT. We can however still + * get a few PERF_EVENT_READ events. + */ + perf_counter_task(child, 0); + + child->perf_counter_ctxp = NULL; /* * We can recurse on the same lock type through: