commit 8b62450e8ff5670c82fd98957d7c3e3625b7bb23 Author: Andi Kleen Date: Fri Oct 28 00:30:59 2011 -0700 kernel disabled support (warning: causes rare crashes) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index e0786e3..a45f177 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -218,9 +218,12 @@ struct perf_event_attr { precise_ip : 2, /* skid constraint */ mmap_data : 1, /* non-exec mmap data */ sample_id_all : 1, /* sample_type all events */ - - __reserved_1 : 45; - + /* + * task_disable_after_exec: + * Disable per task events after exec + */ + task_disable_after_exec : 1, + __reserved_1 : 44; union { __u32 wakeup_events; /* wakeup every n events */ __u32 wakeup_watermark; /* bytes before wakeup */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 14a6c7b..af8c3a6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1774,6 +1774,7 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t * #define PF_FROZEN 0x00010000 /* frozen for system suspend */ #define PF_FSTRANS 0x00020000 /* inside a filesystem transaction */ #define PF_KSWAPD 0x00040000 /* I am kswapd */ +#define PF_PERF_DISABLED 0x00080000 /* task perf is disabled for this task */ #define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */ #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ #define PF_RANDOMIZE 0x00400000 /* randomize virtual address space */ diff --git a/kernel/events/core.c b/kernel/events/core.c index 9efe710..bda8fe2 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1893,7 +1893,14 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, cpuctx = __get_cpu_context(ctx); if (!cpuctx->task_ctx) return; - + if (unlikely(!next)) { + ctx_sched_out(ctx, cpuctx, EVENT_ALL); + cpuctx->task_ctx = NULL; + return; + } + if (next->flags & PF_PERF_DISABLED) + return; + rcu_read_lock(); parent = rcu_dereference(ctx->parent_ctx); next_ctx = 
next->perf_event_ctxp[ctxn]; @@ -1910,7 +1917,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, */ raw_spin_lock(&ctx->lock); raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); - if (context_equiv(ctx, next_ctx)) { + if (context_equiv(ctx, next_ctx) && task != next) { /* * XXX do we need a memory barrier of sorts * wrt to rcu_dereference() of perf_event_ctxp @@ -2107,7 +2114,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, struct perf_cpu_context *cpuctx; cpuctx = __get_cpu_context(ctx); - if (cpuctx->task_ctx == ctx) + if (cpuctx->task_ctx == ctx || (task->flags & PF_PERF_DISABLED)) return; perf_pmu_disable(ctx->pmu); @@ -2389,16 +2396,19 @@ void perf_event_task_tick(void) static int event_enable_on_exec(struct perf_event *event, struct perf_event_context *ctx) { - if (!event->attr.enable_on_exec) - return 0; - - event->attr.enable_on_exec = 0; - if (event->state >= PERF_EVENT_STATE_INACTIVE) - return 0; - - __perf_event_mark_enabled(event, ctx); + if (event->attr.task_disable_after_exec) { + event->attr.task_disable_after_exec = 0; + current->flags |= PF_PERF_DISABLED; + } - return 1; + if (event->attr.enable_on_exec) { + event->attr.enable_on_exec = 0; + if (event->state < PERF_EVENT_STATE_INACTIVE) { + __perf_event_mark_enabled(event, ctx); + return 1; + } + } + return 0; } /* @@ -3321,25 +3331,27 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) int perf_event_task_enable(void) { - struct perf_event *event; - - mutex_lock(&current->perf_event_mutex); - list_for_each_entry(event, &current->perf_event_list, owner_entry) - perf_event_for_each_child(event, perf_event_enable); - mutex_unlock(&current->perf_event_mutex); + struct task_struct *task = current; + local_irq_disable(); + __perf_event_task_sched_out(task, NULL); + current->flags &= ~PF_PERF_DISABLED; + /* In again to reenable cgroups */ + __perf_event_task_sched_in(task); + local_irq_enable(); return 0; } int 
perf_event_task_disable(void) { - struct perf_event *event; - - mutex_lock(&current->perf_event_mutex); - list_for_each_entry(event, &current->perf_event_list, owner_entry) - perf_event_for_each_child(event, perf_event_disable); - mutex_unlock(&current->perf_event_mutex); + struct task_struct *task = current; + local_irq_disable(); + __perf_event_task_sched_out(task, task); + current->flags |= PF_PERF_DISABLED; + /* In again to reenable cgroups */ + __perf_event_task_sched_in(task); + local_irq_enable(); return 0; } @@ -5269,6 +5281,9 @@ void __perf_sw_event(u32 event_id, u64 nr, int nmi, struct perf_sample_data data; int rctx; + if (current->flags & PF_PERF_DISABLED) + return; + preempt_disable_notrace(); rctx = perf_swevent_get_recursion_context(); if (rctx < 0) diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt index 5a520f8..056133d 100644 --- a/tools/perf/Documentation/perf-record.txt +++ b/tools/perf/Documentation/perf-record.txt @@ -148,6 +148,13 @@ an empty cgroup (monitor all the time) using, e.g., -G foo,,bar. Cgroups must ha corresponding events, i.e., they always refer to events defined earlier on the command line. +--disabled:: +start the events disabled in the target process. The target process +can then use prctl(PR_TASK_PERF_EVENTS_ENABLE) to enable the counters +when needed and PR_TASK_PERF_EVENTS_DISABLE to disable them again. +This is useful to measure only specific regions in a program. +This only works without -a. + SEE ALSO -------- linkperf:perf-stat[1], linkperf:perf-list[1] diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt index 918cc38..5776b47 100644 --- a/tools/perf/Documentation/perf-stat.txt +++ b/tools/perf/Documentation/perf-stat.txt @@ -94,6 +94,13 @@ an empty cgroup (monitor all the time) using, e.g., -G foo,,bar. Cgroups must ha corresponding events, i.e., they always refer to events defined earlier on the command line. 
+--disabled:: +start the events disabled in the target process. The target process +can then use prctl(PR_TASK_PERF_EVENTS_ENABLE) to enable the counters +when needed and PR_TASK_PERF_EVENTS_DISABLE to disable them again. +This is useful to measure only specific regions in a program. +This only works without -a. + EXAMPLES -------- diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 8e2c857..455069b 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -66,6 +66,7 @@ static bool sample_time = false; static bool no_buildid = false; static bool no_buildid_cache = false; static struct perf_evlist *evsel_list; +static bool disabled = false; static long samples = 0; static u64 bytes_written = 0; @@ -227,6 +228,9 @@ static void config_attr(struct perf_evsel *evsel, struct perf_evlist *evlist) attr->disabled = 1; attr->enable_on_exec = 1; } + + if (disabled) + attr->task_disable_after_exec = 1; } static bool perf_evlist__equal(struct perf_evlist *evlist, @@ -787,6 +791,8 @@ const struct option record_options[] = { OPT_CALLBACK('G', "cgroup", &evsel_list, "name", "monitor event in cgroup name only", parse_cgroups), + OPT_BOOLEAN(0, "disabled", &disabled, + "set up counters, but don't enable them"), OPT_END() }; diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index a9f0671..bb592bf 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -191,6 +191,7 @@ static int big_num_opt = -1; static const char *cpu_list; static const char *csv_sep = NULL; static bool csv_output = false; +static bool disabled = false; static volatile int done = 0; @@ -285,6 +286,9 @@ static int create_perf_stat_counter(struct perf_evsel *evsel) attr->enable_on_exec = 1; } + if (disabled) + attr->task_disable_after_exec = 1; + return perf_evsel__open_per_thread(evsel, evsel_list->threads, false); } @@ -1059,6 +1063,8 @@ static const struct option options[] = { OPT_CALLBACK('G', "cgroup", &evsel_list, "name", "monitor 
event in cgroup name only", parse_cgroups), + OPT_BOOLEAN(0, "disabled", &disabled, + "set up counters, but don't enable them"), OPT_END() };