#! /usr/bin/perl
#
# Copyright © 2017 Intel Corporation
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice (including the next
# paragraph) shall be included in all copies or substantial portions of the
# Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
#

use strict;
use warnings;
use 5.010;

my $gid = 0;
my (%db, %vdb, %queue, %submit, %notify, %rings, %ctxdb, %ringmap, %reqwait,
    %ctxtimelines);
my (%cids, %ctxmap);
my $cid = 0;
my %queues;
my @freqs;

use constant VENG => '255:254';

my $max_requests = 1000;
my $width_us = 32000;
my $correct_durations = 0;
my %ignore_ring;
my %skip_box;
my $html = 0;
my $trace = 0;
my $avg_delay_stats = 0;
my $gpu_timeline = 0;
my $colour_contexts = 0;

my @args;

sub arg_help
{
	return unless scalar(@_);

	if ($_[0] eq '--help' or $_[0] eq '-h') {
		shift @_;
print <<ENDHELP;
Notes:

   The tool parse the output generated by the 'perf script' command after the
   correct set of i915 tracepoints have been collected via perf record.

   To collect the data:

	./trace.pl --trace [command-to-be-profiled]

   The above will invoke perf record, or alternatively it can be done directly:

	perf record -a -c 1 -e i915:intel_gpu_freq_change, \
			       i915:i915_request_add, \
			       i915:i915_request_submit, \
			       i915:i915_request_in, \
			       i915:i915_request_out, \
			       dma_fence:dma_fence_signaled, \
			       i915:i915_request_wait_begin, \
			       i915:i915_request_wait_end \
			       [command-to-be-profiled]

   Then create the log file with:

	perf script >trace.log

   This file in turn should be piped into this tool which will generate some
   statistics out of it, or if --html was given HTML output.

   HTML can be viewed from a directory containing the 'vis' JavaScript module.
   On Ubuntu this can be installed like this:

	apt-get install npm
	npm install vis

Usage:
   trace.pl <options> <input-file >output-file

      --help / -h			This help text
      --max-items=num / -m num		Maximum number of boxes to put on the
					timeline. More boxes means more work for
					the JavaScript engine in the browser.
      --zoom-width-ms=ms / -z ms	Width of the initial timeline zoom
      --split-requests / -s		Try to split out request which were
					submitted together due coalescing in the
					driver. May not be 100% accurate and may
					influence the per-engine statistics so
					use with care.
      --ignore-ring=id / -i id		Ignore ring with the numerical id when
					parsing the log (enum intel_engine_id).
					Can be given multiple times.
      --skip-box=name / -x name		Do not put a certain type of a box on
					the timeline. One of: queue, ready,
					execute and ctxsave.
					Can be given multiple times.
      --html				Generate HTML output.
      --trace cmd			Trace the following command.
      --avg-delay-stats			Print average delay stats.
      --gpu-timeline			Draw overall GPU busy timeline.
      --colour-contexts / -c		Use different colours for different
					context execution boxes.
ENDHELP

		exit 0;
	}

	return @_;
}

sub arg_html
{
	return unless scalar(@_);

	if ($_[0] eq '--html') {
		shift @_;
		$html = 1;
	}

	return @_;
}

sub arg_avg_delay_stats
{
	return unless scalar(@_);

	if ($_[0] eq '--avg-delay-stats') {
		shift @_;
		$avg_delay_stats = 1;
	}

	return @_;
}

sub arg_gpu_timeline
{
	return unless scalar(@_);

	if ($_[0] eq '--gpu-timeline') {
		shift @_;
		$gpu_timeline = 1;
	}

	return @_;
}

sub arg_trace
{
	my @events = ( 'i915:intel_gpu_freq_change',
		       'i915:i915_request_add',
		       'i915:i915_request_submit',
		       'i915:i915_request_in',
		       'i915:i915_request_out',
		       'dma_fence:dma_fence_signaled',
		       'i915:i915_request_wait_begin',
		       'i915:i915_request_wait_end' );

	return unless scalar(@_);

	if ($_[0] eq '--trace') {
		shift @_;

		unshift @_, '--';
		unshift @_, join(',', @events);
		unshift @_, ('perf', 'record', '-a', '-c', '1', '-q', '-o', 'perf.data', '-e');

		exec @_;
	}

	return @_;
}

sub arg_max_requests
{
	my $val;

	return unless scalar(@_);

	if ($_[0] eq '--max-requests' or $_[0] eq '-m') {
		shift @_;
		$val = shift @_;
	} elsif ($_[0] =~ /--max-requests=(\d+)/) {
		shift @_;
		$val = $1;
	}

	$max_requests = int($val) if defined $val;

	return @_;
}

sub arg_zoom_width
{
	my $val;

	return unless scalar(@_);

	if ($_[0] eq '--zoom-width-ms' or $_[0] eq '-z') {
		shift @_;
		$val = shift @_;
	} elsif ($_[0] =~ /--zoom-width-ms=(\d+)/) {
		shift @_;
		$val = $1;
	}

	$width_us = int($val) * 1000 if defined $val;

	return @_;
}

sub arg_split_requests
{
	return unless scalar(@_);

	if ($_[0] eq '--split-requests' or $_[0] eq '-s') {
		shift @_;
		$correct_durations = 1;
	}

	return @_;
}

sub arg_ignore_ring
{
	my $val;

	return unless scalar(@_);

	if ($_[0] eq '--ignore-ring' or $_[0] eq '-i') {
		shift @_;
		$val = shift @_;
	} elsif ($_[0] =~ /--ignore-ring=(\d+)/) {
		shift @_;
		$val = $1;
	}

	$ignore_ring{$val} = 1 if defined $val;

	return @_;
}

sub arg_skip_box
{
	my $val;

	return unless scalar(@_);

	if ($_[0] eq '--skip-box' or $_[0] eq '-x') {
		shift @_;
		$val = shift @_;
	} elsif ($_[0] =~ /--skip-box=(\d+)/) {
		shift @_;
		$val = $1;
	}

	$skip_box{$val} = 1 if defined $val;

	return @_;
}

sub arg_colour_contexts
{
	return unless scalar(@_);

	if ($_[0] eq '--colour-contexts' or
	    $_[0] eq '--color-contexts' or
	    $_[0] eq '-c') {
		shift @_;
		$colour_contexts = 1;
	}

	return @_;
}

@args = @ARGV;
while (@args) {
	my $left = scalar(@args);

	@args = arg_help(@args);
	@args = arg_html(@args);
	@args = arg_avg_delay_stats(@args);
	@args = arg_gpu_timeline(@args);
	@args = arg_trace(@args);
	@args = arg_max_requests(@args);
	@args = arg_zoom_width(@args);
	@args = arg_split_requests(@args);
	@args = arg_ignore_ring(@args);
	@args = arg_skip_box(@args);
	@args = arg_colour_contexts(@args);

	last if $left == scalar(@args);
}

die if scalar(@args);

@ARGV = @args;

sub db_key
{
	my ($ring, $ctx, $seqno) = @_;

	return $ring . '/' . $ctx . '/' . $seqno;
}

sub notify_key
{
	my ($ctx, $seqno) = @_;

	return $ctx . '/' . $seqno;
}

sub sanitize_ctx
{
	my ($ctx, $ring) = @_;

	if (exists $ctxdb{$ctx} and $ctxdb{$ctx} > 1) {
		return $ctx . '.' . $ctxdb{$ctx};
	} else {
		return $ctx;
	}
}

sub is_veng
{
	my ($class, $instance) = split ':', shift;

	return $instance eq '254';
}

# Main input loop - parse lines and build the internal representation of the
# trace using a hash of requests and some auxilliary data structures.
my $prev_freq = 0;
my $prev_freq_ts = 0;
while (<>) {
	my @fields;
	my $tp_name;
	my %tp;
	my ($time, $ctx, $ring, $seqno, $orig_ctx, $key);

	chomp;
	@fields = split ' ';

	chop $fields[3];
	$time = int($fields[3] * 1000000.0 + 0.5);

	$tp_name = $fields[4];

	splice @fields, 0, 5;

	foreach my $f (@fields) {
		my ($k, $v);

		next unless $f =~ m/=/;
		($k, $v) = ($`, $');
		$k = 'global' if $k eq 'global_seqno';
		chop $v if substr($v, -1, 1) eq ',';
		$tp{$k} = $v;

		$tp{'ring'} = $tp{'engine'} if $k eq 'engine';
	}

	next if exists $tp{'ring'} and exists $ignore_ring{$tp{'ring'}};

	if (exists $tp{'ring'} and exists $tp{'seqno'}) {
		$ring = $tp{'ring'};
		$seqno = $tp{'seqno'};

		if (exists $tp{'ctx'}) {
			$ctx = $tp{'ctx'};
			$orig_ctx = $ctx;
			$ctx = sanitize_ctx($ctx, $ring);
			$ring = VENG if is_veng($ring);
			$key = db_key($ring, $ctx, $seqno);
		}
	}

	if ($tp_name eq 'i915:i915_request_wait_begin:') {
		my %rw;

		next if exists $reqwait{$key};
		die if $ring eq VENG and not exists $queues{$ctx};

		$rw{'key'} = $key;
		$rw{'ring'} = $ring;
		$rw{'seqno'} = $seqno;
		$rw{'ctx'} = $ctx;
		$rw{'start'} = $time;
		$reqwait{$key} = \%rw;
	} elsif ($tp_name eq 'i915:i915_request_wait_end:') {
		die if $ring eq VENG and not exists $queues{$ctx};

		if (exists $reqwait{$key}) {
			$reqwait{$key}->{'end'} = $time;
		} else { # Virtual engine
			my $vkey = db_key(VENG, $ctx, $seqno);

			die unless exists $reqwait{$vkey};

			# If the wait started on the virtual engine, attribute
			# it to it completely.
			$reqwait{$vkey}->{'end'} = $time;
		}
	} elsif ($tp_name eq 'i915:i915_request_add:') {
		if (exists $queue{$key}) {
			$ctxdb{$orig_ctx}++;
			$ctx = sanitize_ctx($orig_ctx, $ring);
			$key = db_key($ring, $ctx, $seqno);
		} else {
			$ctxdb{$orig_ctx} = 1;
		}

		$queue{$key} = $time;
		if ($ring eq VENG and not exists $queues{$ctx}) {
			$queues{$ctx} = 1 ;
			$cids{$ctx} = $cid++;
			$ctxmap{$cids{$ctx}} = $ctx;
		}
	} elsif ($tp_name eq 'i915:i915_request_submit:') {
		die if exists $submit{$key};
		die unless exists $queue{$key};
		die if $ring eq VENG and not exists $queues{$ctx};

		$submit{$key} = $time;
	} elsif ($tp_name eq 'i915:i915_request_in:') {
		my ($q, $s);
		my %req;

		# preemption
		delete $db{$key} if exists $db{$key};

		unless (exists $queue{$key}) {
			# Virtual engine
			my $vkey = db_key(VENG, $ctx, $seqno);
			my %req;

			die unless exists $queues{$ctx};
			die unless exists $queue{$vkey};
			die unless exists $submit{$vkey};

			# Create separate request record on the queue timeline
			$q = $queue{$vkey};
			$s = $submit{$vkey};
			$req{'queue'} = $q;
			$req{'submit'} = $s;
			$req{'start'} = $time;
			$req{'end'} = $time;
			$req{'ring'} = VENG;
			$req{'seqno'} = $seqno;
			$req{'ctx'} = $ctx;
			$req{'name'} = $ctx . '/' . $seqno;
			$req{'global'} = $tp{'global'};
			$req{'port'} = $tp{'port'};

			$vdb{$vkey} = \%req;
		} else {
			$q = $queue{$key};
			$s = $submit{$key};
		}

		$req{'start'} = $time;
		$req{'ring'} = $ring;
		$req{'seqno'} = $seqno;
		$req{'ctx'} = $ctx;
		$ctxtimelines{$ctx . '/' . $ring} = 1;
		$req{'name'} = $ctx . '/' . $seqno;
		$req{'global'} = $tp{'global'};
		$req{'port'} = $tp{'port'};
		$req{'queue'} = $q;
		$req{'submit'} = $s;
		$req{'virtual'} = 1 if exists $queues{$ctx};
		$rings{$ring} = $gid++ unless exists $rings{$ring};
		$ringmap{$rings{$ring}} = $ring;
		$db{$key} = \%req;
	} elsif ($tp_name eq 'i915:i915_request_out:') {
		if ($tp{'completed?'}) {
			my $nkey;

			die unless exists $db{$key};
			die unless exists $db{$key}->{'start'};
			die if exists $db{$key}->{'end'};

			$nkey = notify_key($ctx, $seqno);

			$db{$key}->{'end'} = $time;
			$db{$key}->{'notify'} = $notify{$nkey}
						if exists $notify{$nkey};
		} else {
			delete $db{$key};
		}
	} elsif ($tp_name eq 'dma_fence:dma_fence_signaled:') {
		my $nkey;

		next unless $tp{'driver'} eq 'i915' and
			    $tp{'timeline'} eq 'signaled';

		$nkey = notify_key($tp{'context'}, $tp{'seqno'});

		die if exists $notify{$nkey};
		$notify{$nkey} = $time unless exists $notify{$nkey};
	} elsif ($tp_name eq 'i915:intel_gpu_freq_change:') {
		push @freqs, [$prev_freq_ts, $time, $prev_freq] if $prev_freq;
		$prev_freq_ts = $time;
		$prev_freq = $tp{'new_freq'};
	}
}

# Sanitation pass to fixup up out of order notify and context complete, and to
# find the largest seqno to be used for timeline sorting purposes.
my $max_seqno = 0;
foreach my $key (keys %db) {
	my $nkey = notify_key($db{$key}->{'ctx'}, $db{$key}->{'seqno'});

	die unless exists $db{$key}->{'start'};

	$max_seqno = $db{$key}->{'seqno'} if $db{$key}->{'seqno'} > $max_seqno;

	# Notify arrived after context complete?
	$db{$key}->{'notify'} = $notify{$nkey} if not exists $db{$key}->{'notify'}
						  and exists $notify{$nkey};

	# No notify but we have end?
	$db{$key}->{'notify'} = $db{$key}->{'end'} if exists $db{$key}->{'end'} and
						      not exists $db{$key}->{'notify'};

	# If user interrupt arrived out of order push it back to be no later
	# than request out.
	if (exists $db{$key}->{'end'} and exists $db{$key}->{'notify'} and
	    $db{$key}->{'notify'} > $db{$key}->{'end'}) {
		$db{$key}->{'notify'} = $db{$key}->{'end'};
	}
}

my $key_count = scalar(keys %db);

my %engine_timelines;

sub sortStart {
	my $as = $db{$a}->{'start'};
	my $bs = $db{$b}->{'start'};
	my $val;

	$val = $as <=> $bs;
	$val = $a cmp $b if $val == 0;

	return $val;
}

sub get_engine_timeline {
	my ($ring) = @_;
	my @timeline;

	return $engine_timelines{$ring} if exists $engine_timelines{$ring};

	@timeline = grep { $db{$_}->{'ring'} eq $ring } keys %db;
	@timeline = sort sortStart @timeline;
	$engine_timelines{$ring} = \@timeline;

	return \@timeline;
}

# Fix up coalesced requests by ending them either when the following same
# context request with known end ends, or when a different context starts.
foreach my $gid (sort keys %rings) {
	my $ring = $ringmap{$rings{$gid}};
	my $timeline = get_engine_timeline($ring);
	my $last_complete = -1;
	my $last_ctx = -1;
	my $complete;

	foreach my $pos (0..$#{$timeline}) {
		my $key = @{$timeline}[$pos];
		my ($ctx, $end);

		next if exists $db{$key}->{'end'};

		$db{$key}->{'no-end'} = 1;
		$ctx = $db{$key}->{'ctx'};

		if ($pos > $last_complete or $ctx != $last_ctx) {
			my $next = $pos;

			undef $complete;

			while ($next < $#{$timeline}) {
				my $next_key = ${$timeline}[++$next];
				if ($ctx == $db{$next_key}->{'ctx'} and
				    exists $db{$next_key}->{'end'}) {
					$last_ctx = $db{$next_key}->{'ctx'};
					$last_complete = $next;
					$complete = $next_key;
					last;
				}
			}
		}

		if (defined $complete) {
			if ($ctx == $db{$complete}->{'ctx'}) {
				$end = $db{$complete}->{'end'};
			} else {
				$end = $db{$complete}->{'start'};
			}
		} else {
			# No next submission. Use notify if available or give up.
			if (exists $db{$key}->{'notify'}) {
				$end = $db{$key}->{'notify'};
			} else {
				$end = $db{$key}->{'start'};
				$db{$key}->{'incomplete'} = 1;
			}
		}

		unless (exists $db{$key}->{'notify'}) {
			$db{$key}->{'notify'} = $end;
			$db{$key}->{'no-notify'} = 1;
		}
		$db{$key}->{'end'} = $end;
		$db{$key}->{'notify'} = $end if $db{$key}->{'notify'} > $end;
	}
}

my $re_sort = 1;
my @sorted_keys;

sub maybe_sort_keys
{
	if ($re_sort) {
		@sorted_keys = sort sortStart keys %db;
		$re_sort = 0;
		die "Database changed size?!" unless scalar(@sorted_keys) ==
						     $key_count;
	}
}

maybe_sort_keys();

my %ctx_timelines;

sub sortContext {
	my $as = $db{$a}->{'seqno'};
	my $bs = $db{$b}->{'seqno'};
	my $val;

	$val = $as <=> $bs;

	die if $val == 0;

	return $val;
}

sub get_ctx_timeline {
	my ($ctx, $ring, $key) = @_;
	my @timeline;

	return $ctx_timelines{$key} if exists $ctx_timelines{$key};

	@timeline = grep { $db{$_}->{'ring'} eq $ring and
			   $db{$_}->{'ctx'} == $ctx } @sorted_keys;
	# FIXME seqno restart
	@timeline = sort sortContext @timeline;

	$ctx_timelines{$key} = \@timeline;

	return \@timeline;
}

# Split out merged batches if requested.
if ($correct_durations) {
	# Shift !port0 requests start time to after the previous context on the
	# same timeline has finished.
	foreach my $gid (sort keys %rings) {
		my $ring = $ringmap{$rings{$gid}};
		my $timeline = get_engine_timeline($ring);
		my $complete;

		foreach my $pos (0..$#{$timeline}) {
			my $key = @{$timeline}[$pos];
			my $prev = $complete;
			my $pkey;

			$complete = $key unless exists $db{$key}->{'no-end'};
			$pkey = $complete;

			next if $db{$key}->{'port'} == 0;

			$pkey = $prev if $complete eq $key;

			die unless defined $pkey;

			$db{$key}->{'start'} = $db{$pkey}->{'end'};
			$db{$key}->{'start'} = $db{$pkey}->{'notify'} if $db{$key}->{'start'} > $db{$key}->{'end'};

			die if $db{$key}->{'start'} > $db{$key}->{'end'};

			$re_sort = 1;
		}
	}

	maybe_sort_keys();

	# Batch with no-end (no request_out) means it was submitted as part of
	# coalesced context. This means it's start time should be set to the end
	# time of a following request on this context timeline.
	foreach my $tkey (sort keys %ctxtimelines) {
		my ($ctx, $ring) = split '/', $tkey;
		my $timeline = get_ctx_timeline($ctx, $ring, $tkey);
		my $last_complete = -1;
		my $complete;

		foreach my $pos (0..$#{$timeline}) {
			my $key = @{$timeline}[$pos];
			my $next_key;

			next unless exists $db{$key}->{'no-end'};
			last if $pos == $#{$timeline};

			# Shift following request to start after the current
			# one, but only if that wouldn't make it zero duration,
			# which would indicate notify arrived after context
			# complete.
			$next_key = ${$timeline}[$pos + 1];
			if (exists $db{$key}->{'notify'} and
			    $db{$key}->{'notify'} < $db{$key}->{'end'}) {
				$db{$next_key}->{'engine-start'} = $db{$next_key}->{'start'};
				$db{$next_key}->{'start'} = $db{$key}->{'notify'};
				$re_sort = 1;
			}
		}
	}
}

maybe_sort_keys();

# GPU time accounting
my (%running, %runnable, %queued, %batch_avg, %batch_total_avg, %batch_count);
my (%submit_avg, %execute_avg, %ctxsave_avg);

my $last_ts = 0;
my $first_ts;
my $min_ctx;

foreach my $key (@sorted_keys) {
	my $ring = $db{$key}->{'ring'};
	my $end = $db{$key}->{'end'};
	my $start = $db{$key}->{'start'};
	my $engine_start = $db{$key}->{'engine_start'};
	my $notify = $db{$key}->{'notify'};

	$first_ts = $db{$key}->{'queue'} if not defined $first_ts or $db{$key}->{'queue'} < $first_ts;
	$last_ts = $end if $end > $last_ts;
	$min_ctx = $db{$key}->{'ctx'} if not defined $min_ctx or
					 $db{$key}->{'ctx'} < $min_ctx;

	unless (exists $db{$key}->{'no-end'}) {
		$db{$key}->{'context-complete-delay'} = $end - $notify;
	} else {
		$db{$key}->{'context-complete-delay'} = 0;
	}

	$engine_start = $db{$key}->{'start'} unless defined $engine_start;
	$db{$key}->{'execute-delay'} = $engine_start - $db{$key}->{'submit'};
	$db{$key}->{'submit-delay'} = $db{$key}->{'submit'} - $db{$key}->{'queue'};
	unless (exists $db{$key}->{'no-notify'}) {
		$db{$key}->{'duration'} = $notify - $start;
	} else {
		$db{$key}->{'duration'} = 0;
	}

	$running{$ring} += $end - $start if $correct_durations or
					    not exists $db{$key}->{'no-end'};
	unless (exists $db{$key}->{'virtual'}) {
		$runnable{$ring} += $db{$key}->{'execute-delay'};
		$queued{$ring} += $start - $db{$key}->{'execute-delay'} - $db{$key}->{'queue'};
	}

	$batch_count{$ring}++;

	$batch_avg{$ring} += $db{$key}->{'duration'};
	$batch_total_avg{$ring} += $end - $start;

	$submit_avg{$ring} += $db{$key}->{'submit-delay'};
	$execute_avg{$ring} += $db{$key}->{'execute-delay'};
	$ctxsave_avg{$ring} += $db{$key}->{'context-complete-delay'};
}

foreach my $ring (sort keys %batch_avg) {
	$batch_avg{$ring} /= $batch_count{$ring};
	$batch_total_avg{$ring} /= $batch_count{$ring};
	$submit_avg{$ring} /= $batch_count{$ring};
	$execute_avg{$ring} /= $batch_count{$ring};
	$ctxsave_avg{$ring} /= $batch_count{$ring};
}

# Calculate engine idle time
my %flat_busy;
foreach my $gid (sort keys %rings) {
	my $ring = $ringmap{$rings{$gid}};
	my (@s_, @e_);

	# Extract all GPU busy intervals and sort them.
	foreach my $key (@sorted_keys) {
		next unless $db{$key}->{'ring'} eq $ring;
		die if $db{$key}->{'start'} > $db{$key}->{'end'};
		push @s_, $db{$key}->{'start'};
		push @e_, $db{$key}->{'end'};
	}

	die unless $#s_ == $#e_;

	# Flatten the intervals.
	for my $i (1..$#s_) {
		last if $i >= @s_; # End of array.
		die if $e_[$i] < $s_[$i];
		if ($s_[$i] <= $e_[$i - 1]) {
			# Current entry overlaps with the previous one. We need
			# to merge end of the previous interval from the list
			# with the start of the current one.
			if ($e_[$i] >= $e_[$i - 1]) {
				splice @e_, $i - 1, 1;
			} else {
				splice @e_, $i, 1;
			}
			splice @s_, $i, 1;
			# Continue with the same element when list got squashed.
			redo;
		}
	}

	# Add up all busy times.
	my $total = 0;
	for my $i (0..$#s_) {
		die if $e_[$i] < $s_[$i];

		$total = $total + ($e_[$i] - $s_[$i]);
	}

	$flat_busy{$ring} = $total;
}

# Calculate overall GPU idle time
my @gpu_intervals;
my (@s_, @e_);

# Extract all GPU busy intervals and sort them.
foreach my $key (@sorted_keys) {
	push @s_, $db{$key}->{'start'};
	push @e_, $db{$key}->{'end'};
	die if $db{$key}->{'start'} > $db{$key}->{'end'};
}

die unless $#s_ == $#e_;

# Flatten the intervals (copy & paste of the flattening loop above)
for my $i (1..$#s_) {
	last if $i >= @s_;
	die if $e_[$i] < $s_[$i];
	die if $s_[$i] < $s_[$i - 1];
	if ($s_[$i] <= $e_[$i - 1]) {
		if ($e_[$i] >= $e_[$i - 1]) {
			splice @e_, $i - 1, 1;
		} else {
			splice @e_, $i, 1;
		}
		splice @s_, $i, 1;
		redo;
	}
}

# Add up all busy times.
my $total = 0;
for my $i (0..$#s_) {
	die if $e_[$i] < $s_[$i];

	$total = $total + ($e_[$i] - $s_[$i]);
}

# Generate data for the GPU timeline if requested
if ($gpu_timeline) {
	for my $i (0..$#s_) {
		push @gpu_intervals, [ $s_[$i], $e_[$i] ];
	}
}

$flat_busy{'gpu-busy'} = $total / ($last_ts - $first_ts) * 100.0;
$flat_busy{'gpu-idle'} = (1.0 - $total / ($last_ts - $first_ts)) * 100.0;

# Add up all request waits per engine
my %reqw;
foreach my $key (keys %reqwait) {
	$reqw{$reqwait{$key}->{'ring'}} += $reqwait{$key}->{'end'} - $reqwait{$key}->{'start'};
}

# Add up all request waits per virtual engine
my %vreqw;
foreach my $key (keys %reqwait) {
	$vreqw{$reqwait{$key}->{'ctx'}} += $reqwait{$key}->{'end'} - $reqwait{$key}->{'start'};
}

say sprintf('GPU: %.2f%% idle, %.2f%% busy',
	     $flat_busy{'gpu-idle'}, $flat_busy{'gpu-busy'}) unless $html;

my $timeline_text = $colour_contexts ?
		    'per context coloured shading like' : 'box shading like';

my %ctx_colours;
my $ctx_table;

sub generate_ctx_table
{
	my @states = ('queue', 'ready', 'execute', 'ctxsave', 'incomplete');
	my $max_show = 6;
	my (@ctxts, @disp_ctxts);
	my $step;

	if( $colour_contexts ) {
		@ctxts = sort keys %ctxdb;
	} else {
		@ctxts = ($min_ctx);
	}

	# Limit number of shown context examples
	$step = int(scalar(@ctxts) / $max_show);
	if ($step) {
		foreach my $i (0..$#ctxts) {
			push @disp_ctxts, $ctxts[$i] unless $i % $step;
			last if scalar(@disp_ctxts) == $max_show;
		}
	} else {
		@disp_ctxts = @ctxts;
	}

	$ctx_table .= '<table>';

	foreach my $ctx (@disp_ctxts) {
		$ctx_table .= "<tr>\n";
		$ctx_table .= "  <td>Context $ctx</td>\n" if $colour_contexts;
		foreach my $state (@states) {
			$ctx_table .= "  <td align='center' valign='middle'><div style='" . box_style($ctx, $state) . " padding-top: 6px; padding-bottom: 6px; padding-left: 6x; padding-right: 6px;'>" . uc($state) . "</div></td>\n";
		}
		$ctx_table .= "</tr>\n";
	}

	$ctx_table .= '</table>';
}

sub generate_ctx_colours
{
	my $num_ctx = keys %ctxdb;
	my $i = 0;

	foreach my $ctx (sort keys %ctxdb) {
		$ctx_colours{$ctx} = int(360 / $num_ctx * $i++);
	}
}


generate_ctx_colours() if $html and $colour_contexts;
generate_ctx_table() if $html;

print <<ENDHTML if $html;
<!DOCTYPE HTML>
<html>
<head>
  <title>i915 GT timeline</title>

  <style type="text/css">
    body, html {
      font-family: sans-serif;
    }
  </style>

  <script src="node_modules/vis/dist/vis.js"></script>
  <link href="node_modules/vis//dist/vis.css" rel="stylesheet" type="text/css" />
</head>
<body>
<p>
<b>Timeline request view is $timeline_text:</b>
<table>
<tr>
<td>
$ctx_table
</td>
<td>
QUEUE = requests executing on the GPU<br>
READY = runnable requests waiting for a slot on GPU<br>
EXECUTE = requests waiting on fences and dependencies before they are runnable<br>
CTXSAVE = GPU saving the context image<br>
INCOMPLETE = request of unknown completion time
<p>
Boxes are in format 'ctx-id/seqno'.
</p>
<p>
Use Ctrl+scroll-action to zoom-in/out and scroll-action or dragging to move around the timeline.
</p>
<button onclick="toggleStacking()">Toggle overlap stacking</button>
</td>
</tr>
</table>
<p>
<b>GPU idle: $flat_busy{'gpu-idle'}%</b>
<br>
<b>GPU busy: $flat_busy{'gpu-busy'}%</b>
</p>
<div id="visualization"></div>

<script type="text/javascript">
  var container = document.getElementById('visualization');

  var groups = new vis.DataSet([
ENDHTML

#   var groups = new vis.DataSet([
# 	{id: 1, content: 'g0'},
# 	{id: 2, content: 'g1'}
#   ]);

sub html_stats
{
	my ($stats, $group, $id) = @_;
	my $veng = exists $stats->{'virtual'} ? 1 : 0;
	my $name;

	$name = $veng ? 'Virtual' : 'Ring';
	$name .= $group;
	$name .= '<br><small><br>';
	unless ($veng) {
		$name .= sprintf('%.2f', $stats->{'idle'}) . '% idle<br><br>';
		$name .= sprintf('%.2f', $stats->{'busy'}) . '% busy<br>';
	}
	$name .= sprintf('%.2f', $stats->{'runnable'}) . '% runnable<br>';
	$name .= sprintf('%.2f', $stats->{'queued'}) . '% queued<br><br>';
	$name .= sprintf('%.2f', $stats->{'wait'}) . '% wait<br><br>';
	$name .= $stats->{'count'} . ' batches<br>';
	unless ($veng) {
		$name .= sprintf('%.2f', $stats->{'avg'}) . 'us avg batch<br>';
		$name .= sprintf('%.2f', $stats->{'total-avg'}) . 'us avg engine batch<br>';
	}
	$name .= '</small>';

	print "\t{id: $id, content: '$name'},\n";
}

sub stdio_stats
{
	my ($stats, $group, $id) = @_;
	my $veng = exists $stats->{'virtual'} ? 1 : 0;
	my $str;

	$str = $veng ? 'Virtual' : 'Ring';
	$str .= $group . ': ';
	$str .= $stats->{'count'} . ' batches, ';
	unless ($veng) {
		$str .= sprintf('%.2f (%.2f) avg batch us, ',
				$stats->{'avg'}, $stats->{'total-avg'});
		$str .= sprintf('%.2f', $stats->{'idle'}) . '% idle, ';
		$str .= sprintf('%.2f', $stats->{'busy'}) . '% busy, ';
	}

	$str .= sprintf('%.2f', $stats->{'runnable'}) . '% runnable, ';
	$str .= sprintf('%.2f', $stats->{'queued'}) . '% queued, ';
	$str .= sprintf('%.2f', $stats->{'wait'}) . '% wait';

	if ($avg_delay_stats and not $veng) {
		$str .= ', submit/execute/save-avg=(';
		$str .= sprintf('%.2f/%.2f/%.2f)', $stats->{'submit'}, $stats->{'execute'}, $stats->{'save'});
	}

	say $str;
}

print "\t{id: 0, content: 'Freq'},\n" if $html;
print "\t{id: 1, content: 'GPU'},\n" if $gpu_timeline;

my $engine_start_id = $gpu_timeline ? 2 : 1;

foreach my $group (sort keys %rings) {
	my $name;
	my $ring = $ringmap{$rings{$group}};
	my $id = $engine_start_id + $rings{$group};
	my $elapsed = $last_ts - $first_ts;
	my %stats;

	$stats{'idle'} = (1.0 - $flat_busy{$ring} / $elapsed) * 100.0;
	$stats{'busy'} = $running{$ring} / $elapsed * 100.0;
	if (exists $runnable{$ring}) {
		$stats{'runnable'} = $runnable{$ring} / $elapsed * 100.0;
	} else {
		$stats{'runnable'} = 0;
	}
	if (exists $queued{$ring}) {
		$stats{'queued'} = $queued{$ring} / $elapsed * 100.0;
	} else {
		$stats{'queued'} = 0;
	}
	$reqw{$ring} = 0 unless exists $reqw{$ring};
	$stats{'wait'} = $reqw{$ring} / $elapsed * 100.0;
	$stats{'count'} = $batch_count{$ring};
	$stats{'avg'} = $batch_avg{$ring};
	$stats{'total-avg'} = $batch_total_avg{$ring};
	$stats{'submit'} = $submit_avg{$ring};
	$stats{'execute'} = $execute_avg{$ring};
	$stats{'save'} = $ctxsave_avg{$ring};

	if ($html) {
		html_stats(\%stats, $group, $id);
	} else {
		stdio_stats(\%stats, $group, $id);
	}
}

sub sortVQueue {
	my $as = $vdb{$a}->{'queue'};
	my $bs = $vdb{$b}->{'queue'};
	my $val;

	$val = $as <=> $bs;
	$val = $a cmp $b if $val == 0;

	return $val;
}

my @sorted_vkeys = sort sortVQueue keys %vdb;
my (%vqueued, %vrunnable);

foreach my $key (@sorted_vkeys) {
	my $ctx = $vdb{$key}->{'ctx'};

	$vdb{$key}->{'submit-delay'} = $vdb{$key}->{'submit'} - $vdb{$key}->{'queue'};
	$vdb{$key}->{'execute-delay'} = $vdb{$key}->{'start'} - $vdb{$key}->{'submit'};

	$vqueued{$ctx} += $vdb{$key}->{'submit-delay'};
	$vrunnable{$ctx} += $vdb{$key}->{'execute-delay'};
}

my $veng_id = $engine_start_id + scalar(keys %rings);

foreach my $cid (sort keys %ctxmap) {
	my $ctx = $ctxmap{$cid};
	my $elapsed = $last_ts - $first_ts;
	my %stats;

	$stats{'virtual'} = 1;
	if (exists $vrunnable{$ctx}) {
		$stats{'runnable'} = $vrunnable{$ctx} / $elapsed * 100.0;
	} else {
		$stats{'runnable'} = 0;
	}
	if (exists $vqueued{$ctx}) {
		$stats{'queued'} = $vqueued{$ctx} / $elapsed * 100.0;
	} else {
		$stats{'queued'} = 0;
	}
	$vreqw{$ctx} = 0 unless exists $vreqw{$ctx};
	$stats{'wait'} = $vreqw{$ctx} / $elapsed * 100.0;
	$stats{'count'} = scalar(grep {$ctx == $vdb{$_}->{'ctx'}} keys %vdb);

	if ($html) {
		html_stats(\%stats, $cid, $veng_id++);
	} else {
		stdio_stats(\%stats, $cid, $veng_id++);
	}
}

exit 0 unless $html;

print <<ENDHTML;
  ]);

  var items = new vis.DataSet([
ENDHTML

sub sortQueue {
	my $as = $db{$a}->{'queue'};
	my $bs = $db{$b}->{'queue'};
	my $val;

	$val = $as <=> $bs;
	$val = $a cmp $b if $val == 0;

	return $val;
}

sub ctx_colour
{
	my ($ctx, $stage, $lfac) = (@_);
	my ($s, $l);
	my $val;

	unless ($colour_contexts) {
		if ($stage eq 'queue') {
			$val = 210;
			$s = 65;
			$l = 52;
		} elsif ($stage eq 'ready') {
			$val = 0;
			$s = 0;
			$l = 47;
		} elsif ($stage eq 'execute') {
			$val = 346;
			$s = 68;
			$l = 65;
		} elsif ($stage eq 'ctxsave') {
			$val = 26;
			$s = 90;
			$l = 52;
		} elsif ($stage eq 'incomplete') {
			$val = 0;
			$s = 85;
			$l = 50;
		}
	} else {
		if ($stage eq 'queue') {
			$s = 35;
			$l = 85;
		} elsif ($stage eq 'ready') {
			$s = 35;
			$l = 45;
		} elsif ($stage eq 'execute') {
			$s = 80;
			$l = 65;
		} elsif ($stage eq 'ctxsave') {
			$s = 75;
			$l = 70;
		} elsif ($stage eq 'incomplete') {
			$s = 80;
			$l = 25;
		}

		$val = $ctx_colours{$ctx};
	}

	$l = int($l * $lfac);

	return "hsl($val, $s%, $l%)";
}

sub box_style
{
	my ($ctx, $stage) = @_;
	my $deg;
	my $text_col = 'white';

	if ($stage eq 'queue') {
		$deg = 90;
		$text_col = 'black' if $colour_contexts;
	} elsif ($stage eq 'ready') {
		$deg = 45;
	} elsif ($stage eq 'execute') {
		$deg = 0;
		$text_col = 'black' if $colour_contexts;
	} elsif ($stage eq 'ctxsave') {
		$deg = 105;
		$text_col = 'black' if $colour_contexts;
	} elsif ($stage eq 'incomplete') {
		$deg = 0;
	}

	return "color: $text_col; background: repeating-linear-gradient(" .
		$deg . 'deg, ' .
		ctx_colour($ctx, $stage, 1.0) . ', ' .
		ctx_colour($ctx, $stage, 1.0) . ' 10px, ' .
		ctx_colour($ctx, $stage, 0.90) . ' 10px, ' .
		ctx_colour($ctx, $stage, 0.90) . ' 20px);';
}

my $i = 0;
my $req = 0;
foreach my $key (sort sortQueue keys %db) {
	my ($name, $ctx, $seqno) = ($db{$key}->{'name'}, $db{$key}->{'ctx'}, $db{$key}->{'seqno'});
	my ($queue, $start, $notify, $end) = ($db{$key}->{'queue'}, $db{$key}->{'start'}, $db{$key}->{'notify'}, $db{$key}->{'end'});
	my $engine_start = $db{$key}->{'engine-start'};
	my $submit = $queue + $db{$key}->{'submit-delay'};
	my ($content, $style);
	my $group = $engine_start_id + $rings{$db{$key}->{'ring'}};
	my $subgroup = $ctx - $min_ctx;
	my $type = ' type: \'range\',';
	my $startend;
	my $skey;

	# submit to execute
	unless (exists $skip_box{'queue'} or exists $db{$key}->{'virtual'}) {
		$skey = 2 * $max_seqno * $ctx + 2 * $seqno;
		$style = box_style($ctx, 'queue');
		$content = "$name<br>$db{$key}->{'submit-delay'}us <small>($db{$key}->{'execute-delay'}us)</small>";
		$startend = 'start: ' . $queue . ', end: ' . $submit;
		print "\t{id: $i, key: $skey, $type group: $group, subgroup: $subgroup, subgroupOrder: $subgroup, content: '$content', $startend, style: \'$style\'},\n";
		$i++;
	}

	# execute to start
	$engine_start = $db{$key}->{'start'} unless defined $engine_start;
	unless (exists $skip_box{'ready'} or exists $db{$key}->{'virtual'}) {
		$skey = 2 * $max_seqno * $ctx + 2 * $seqno + 1;
		$style = box_style($ctx, 'ready');
		$content = "<small>$name<br>$db{$key}->{'execute-delay'}us</small>";
		$startend = 'start: ' . $submit . ', end: ' . $engine_start;
		print "\t{id: $i, key: $skey, $type group: $group, subgroup: $subgroup, subgroupOrder: $subgroup, content: '$content', $startend, style: \'$style\'},\n";
		$i++;
	}

	# start to user interrupt
	unless (exists $skip_box{'execute'}) {
		$skey = -2 * $max_seqno * $ctx - 2 * $seqno - 1;
		$style = box_style($ctx,
				   exists $db{$key}->{'incomplete'} ?
				   'incomplete' : 'execute');
		$content = "$name <small>$db{$key}->{'port'}</small>";
		$content .= ' <small><i>???</i></small> ' if exists $db{$key}->{'incomplete'};
		$content .= ' <small><i>++</i></small> ' if exists $db{$key}->{'no-end'};
		$content .= ' <small><i>+</i></small> ' if exists $db{$key}->{'no-notify'};
		$content .= "<br>$db{$key}->{'duration'}us <small>($db{$key}->{'context-complete-delay'}us)</small>";
		$startend = 'start: ' . $start . ', end: ' . $notify;
		print "\t{id: $i, key: $skey, $type group: $group, subgroup: $subgroup, subgroupOrder: $subgroup, content: '$content', $startend, style: \'$style\'},\n";
		$i++;
	}

	# user interrupt to context complete
	unless (exists $skip_box{'ctxsave'} or exists $db{$key}->{'no-end'}) {
		$skey = -2 * $max_seqno * $ctx - 2 * $seqno;
		$style = box_style($ctx, 'ctxsave');
		my $ctxsave = $db{$key}->{'end'} - $db{$key}->{'notify'};
		$content = "<small>$name<br>${ctxsave}us</small>";
		$content .= ' <small><i>???</i></small> ' if exists $db{$key}->{'incomplete'};
		$content .= ' <small><i>++</i></small> ' if exists $db{$key}->{'no-end'};
		$content .= ' <small><i>+</i></small> ' if exists $db{$key}->{'no-notify'};
		$startend = 'start: ' . $notify . ', end: ' . $end;
		print "\t{id: $i, key: $skey, $type group: $group, subgroup: $subgroup, subgroupOrder: $subgroup, content: '$content', $startend, style: \'$style\'},\n";
		$i++;
	}

	$last_ts = $end;

	last if ++$req > $max_requests;
}

push @freqs, [$prev_freq_ts, $last_ts, $prev_freq] if $prev_freq;

foreach my $item (@freqs) {
	my ($start, $end, $freq) = @$item;
	my $startend;

	next if $start > $last_ts;

	$start = $first_ts if $start < $first_ts;
	$end = $last_ts if $end > $last_ts;
	$startend = 'start: ' . $start . ', end: ' . $end;
	print "\t{id: $i, type: 'range', group: 0, content: '$freq', $startend},\n";
	$i++;
}

if ($gpu_timeline) {
	foreach my $item (@gpu_intervals) {
		my ($start, $end) = @$item;
		my $startend;

		next if $start > $last_ts;

		$start = $first_ts if $start < $first_ts;
		$end = $last_ts if $end > $last_ts;
		$startend = 'start: ' . $start . ', end: ' . $end;
		print "\t{id: $i, type: 'range', group: 1, $startend},\n";
		$i++;
	}
}

$req = 0;
$veng_id = $engine_start_id + scalar(keys %rings);
foreach my $key (@sorted_vkeys) {
	my ($name, $ctx, $seqno) = ($vdb{$key}->{'name'}, $vdb{$key}->{'ctx'}, $vdb{$key}->{'seqno'});
	my $queue = $vdb{$key}->{'queue'};
	my $submit = $vdb{$key}->{'submit'};
	my $engine_start = $db{$key}->{'engine-start'};
	my ($content, $style, $startend, $skey);
	my $group = $veng_id + $cids{$ctx};
	my $subgroup = $ctx - $min_ctx;
	my $type = ' type: \'range\',';
	my $duration;

	# submit to execute
	unless (exists $skip_box{'queue'}) {
		$skey = 2 * $max_seqno * $ctx + 2 * $seqno;
		$style = box_style($ctx, 'queue');
		$content = "$name<br>$vdb{$key}->{'submit-delay'}us <small>($vdb{$key}->{'execute-delay'}us)</small>";
		$startend = 'start: ' . $queue . ', end: ' . $submit;
		print "\t{id: $i, key: $skey, $type group: $group, subgroup: $subgroup, subgroupOrder: $subgroup, content: '$content', $startend, style: \'$style\'},\n";
		$i++;
	}

	# execute to start
	$engine_start = $vdb{$key}->{'start'} unless defined $engine_start;
	unless (exists $skip_box{'ready'}) {
		$skey = 2 * $max_seqno * $ctx + 2 * $seqno + 1;
		$style = box_style($ctx, 'ready');
		$content = "<small>$name<br>$vdb{$key}->{'execute-delay'}us</small>";
		$startend = 'start: ' . $submit . ', end: ' . $engine_start;
		print "\t{id: $i, key: $skey, $type group: $group, subgroup: $subgroup, subgroupOrder: $subgroup, content: '$content', $startend, style: \'$style\'},\n";
		$i++;
	}

	last if ++$req > $max_requests;
}

my $end_ts = $first_ts + $width_us;
$first_ts = $first_ts;

print <<ENDHTML;
  ]);

  function majorAxis(date, scale, step) {
	var s = date / 1000000;
	var precision;

	if (scale == 'millisecond')
		precision = 6;
	else if (scale == 'second')
		precision = 3;
	else
		precision = 0;

	return s.toFixed(precision) + "s";
  }

  function minorAxis(date, scale, step) {
	var t = date;
	var precision;
	var unit;

	if (scale == 'millisecond') {
		t %= 1000;
		precision = 0;
		unit = 'us';
	} else if (scale == 'second') {
		t /= 1000;
		t %= 1000;
		precision = 0;
		unit = 'ms';
	} else {
		t /= 1000000;
		precision = 1;
		unit = 's';
	}

	return t.toFixed(precision) + unit;
  }

  // Configuration for the Timeline
  var options = { groupOrder: 'content',
		  horizontalScroll: true,
		  stack: false,
		  stackSubgroups: false,
		  zoomKey: 'ctrlKey',
		  orientation: 'top',
		  format: { majorLabels: majorAxis, minorLabels: minorAxis },
		  start: $first_ts,
		  end: $end_ts};

  // Create a Timeline
  var timeline = new vis.Timeline(container, items, groups, options);

  function toggleStacking() {
	options.stack = !options.stack;
	options.stackSubgroups = !options.stackSubgroups;
	timeline.setOptions(options);
  }
ENDHTML

print <<ENDHTML;
</script>
</body>
</html>
ENDHTML
