From d6b17bebd79dae2e3577f2ea27a832af4991a5e6 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Mon, 3 May 2010 15:14:33 +0200
Subject: perf: Provide a new deterministic events reordering algorithm

The current events reordering algorithm is based on a heuristic that
gets broken once we deal with a very fast flow of events.

Indeed the time period based flushing is not suitable anymore
in the following case, assuming we have a flush period of two
seconds.

    CPU 0           |        CPU 1
                    |
  cnt1 timestamps   |      cnt1 timestamps
                    |
    0               |         0
    1               |         1
    2               |         2
    3               |         3
    [...]           |        [...]
    4 seconds later

If we spend too much time to read the buffers (case of a lot of
events to record in each buffers or when we have a lot of CPU buffers
to read), in the next pass the CPU 0 buffer could contain a slice
of several seconds of events. We'll read them all and notice we've
reached the period to flush. In the above example we flush the first
half of the CPU 0 buffer, then we read the CPU 1 buffer where we
have events that were on the flush slice and then the reordering
fails.

It's simple to reproduce with:

	perf lock record perf bench sched messaging

To solve this, we use a new solution that doesn't rely on an
heuristical time slice period anymore but on a deterministic basis
based on how perf record does its job.

perf record saves the buffers through passes. A pass is a tour
on every buffers from every CPUs. This is made in order: for
each CPU we read the buffers of every counters. So the more
buffers we visit, the later will be the timstamps of their events.

When perf record finishes a pass it records a
PERF_RECORD_FINISHED_ROUND pseudo event.
We record the max timestamp t found in the pass n. Assuming these
timestamps are monotonic across cpus, we know that if a buffer
still has events with timestamps below t, they will be all available
and then read in the pass n + 1.
Hence when we start to read the pass n + 2, we can safely flush every
events with timestamps below t.

      ============ PASS n =================
         CPU 0         |   CPU 1
                       |
      cnt1 timestamps  |   cnt2 timestamps
            1          |         2
            2          |         3
            -          |         4  <--- max recorded

      ============ PASS n + 1 ==============
         CPU 0         |   CPU 1
                       |
      cnt1 timestamps  |   cnt2 timestamps
            3          |         5
            4          |         6
            5          |         7 <---- max recorded

        Flush every events below timestamp 4

      ============ PASS n + 2 ==============
         CPU 0         |   CPU 1
                       |
      cnt1 timestamps  |   cnt2 timestamps
            6          |         8
            7          |         9
            -          |         10

        Flush every events below timestamp 7
        etc...

It also works on perf.data versions that don't have
PERF_RECORD_FINISHED_ROUND pseudo events. The difference is that
the events will be only flushed in the end of the perf.data
processing. It will then consume more memory and scale less with
large perf.data files.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Tom Zanussi <tzanussi@gmail.com>
Cc: Masami Hiramatsu <mhiramat@redhat.com>
---
 tools/perf/util/session.h | 36 +++++++++++++++++++++---------------
 1 file changed, 21 insertions(+), 15 deletions(-)

(limited to 'tools/perf/util/session.h')

diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h
index f2b2c6a3a49d..568fd08a478d 100644
--- a/tools/perf/util/session.h
+++ b/tools/perf/util/session.h
@@ -14,7 +14,8 @@ struct thread;
 
 struct ordered_samples {
 	u64			last_flush;
-	u64			flush_limit;
+	u64			next_flush;
+	u64			max_timestamp;
 	struct list_head	samples_head;
 	struct sample_queue	*last_inserted;
 };
@@ -41,23 +42,28 @@ struct perf_session {
 	char filename[0];
 };
 
+struct perf_event_ops;
+
 typedef int (*event_op)(event_t *self, struct perf_session *session);
+typedef int (*event_op2)(event_t *self, struct perf_session *session,
+			 struct perf_event_ops *ops);
 
 struct perf_event_ops {
-	event_op sample,
-		 mmap,
-		 comm,
-		 fork,
-		 exit,
-		 lost,
-		 read,
-		 throttle,
-		 unthrottle,
-		 attr,
-		 event_type,
-		 tracing_data,
-		 build_id;
-	bool	ordered_samples;
+	event_op	sample,
+			mmap,
+			comm,
+			fork,
+			exit,
+			lost,
+			read,
+			throttle,
+			unthrottle,
+			attr,
+			event_type,
+			tracing_data,
+			build_id;
+	event_op2	finished_round;
+	bool		ordered_samples;
 };
 
 struct perf_session *perf_session__new(const char *filename, int mode, bool force, bool repipe);
-- 
cgit v1.2.3