# Copyright (c) 2013 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.


import json
import logging
import os
import re
import shlex
import shutil
import sys
import tempfile


# TODO(b/206008069): remove this when migrated to the new environment.
sys.path.insert(0,
                '/usr/local/lib/python2.7/dist-packages/six-1.16.0-py2.7.egg')
try:
    # This is weird, but something seems to import six before the egg path is
    # inserted, so force a reload to pick up the pinned version.
    import six
    if six.PY2:
        reload(six)
    else:
        import importlib
        importlib.reload(six)
    logging.debug("six version is {}".format(six.__version__))
    if six.__version__ != '1.16.0':
        logging.debug(sys.path)
except ImportError as e:
    logging.warning("Could not import six due to %s", e)

from contextlib import contextmanager

from autotest_lib.client.common_lib import error
from autotest_lib.server import test
from autotest_lib.server import utils
from autotest_lib.server.cros import telemetry_runner
from autotest_lib.server.cros.crosperf import device_setup_utils
from autotest_lib.site_utils import test_runner_utils

WAIT_FOR_CMD_TIMEOUT_SECS = 60
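# SSH options shared by scp commands to the DUT: skip host key checks, run
# non-interactively, and keep long-running connections alive.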
DUT_COMMON_SSH_OPTIONS = [
        '-o StrictHostKeyChecking=no', '-o UserKnownHostsFile=/dev/null',
        '-o BatchMode=yes', '-o ConnectTimeout=30',
        '-o ServerAliveInterval=900', '-o ServerAliveCountMax=3',
        '-o ConnectionAttempts=4', '-o Protocol=2'
]
DUT_SCP_OPTIONS = ' '.join(DUT_COMMON_SSH_OPTIONS)

RSA_KEY = '-i %s' % test_runner_utils.TEST_KEY_PATH
DUT_CHROME_RESULTS_DIR = '/usr/local/telemetry/src'

TURBOSTAT_LOG = 'turbostat.log'
CPUSTATS_LOG = 'cpustats.log'
CPUINFO_LOG = 'cpuinfo.log'
TOP_LOG = 'top.log'
WAIT_TIME_LOG = 'wait_time.log'

# Result Statuses
SUCCESS_STATUS = 'SUCCESS'
WARNING_STATUS = 'WARNING'
FAILED_STATUS = 'FAILED'

# Regex for the RESULT output lines understood by chrome buildbot.
# TODO b:169251326 terms below are set outside of this codebase and
# should be updated when possible ("slave"). # nocheck
# Keep in sync with
# chromium/tools/build/scripts/slave/performance_log_processor.py. # nocheck
RESULTS_REGEX = re.compile(r'(?P<IMPORTANT>\*)?RESULT '
                           r'(?P<GRAPH>[^:]*): (?P<TRACE>[^=]*)= '
                           r'(?P<VALUE>[\{\[]?[-\d\., ]+[\}\]]?)('
                           r' ?(?P<UNITS>.+))?')
HISTOGRAM_REGEX = re.compile(r'(?P<IMPORTANT>\*)?HISTOGRAM '
                             r'(?P<GRAPH>[^:]*): (?P<TRACE>[^=]*)= '
                             r'(?P<VALUE_JSON>{.*})(?P<UNITS>.+)?')


class telemetry_Crosperf(test.test):
    """
    Run one or more telemetry benchmarks under the crosperf script.

    """
    version = 1

    def scp_telemetry_results(self, client_ip, dut, file_path, host_dir,
                              ignore_status=False):
        """
        Copy telemetry results from dut.

        @param client_ip: The ip address of the DUT
        @param dut: The autotest host object representing DUT.
        @param file_path: The file to copy from DUT.
        @param host_dir: The directory on host to put the file .

        @returns status code for scp command.

        """
        cmd = ['scp', DUT_SCP_OPTIONS, RSA_KEY, '-v']
        port = ''

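        # A dut host object takes precedence; otherwise parse "ip[:port]" from
        # client_ip.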
        if dut:
            port = dut.port
            ip = dut.hostname
        else:
            ip_and_port = client_ip.split(':')
            ip = ip_and_port[0]
            if len(ip_and_port) > 1:
                port = ip_and_port[1]

        if port:
            cmd.extend(['-P', str(port)])

        src = 'root@%s:%s' % (ip, file_path)
        cmd.extend([src, host_dir])
        command = ' '.join(cmd)

        logging.debug('Retrieving Results: %s', command)
        try:
            result = utils.run(
                    command,
                    timeout=WAIT_FOR_CMD_TIMEOUT_SECS,
                    ignore_status=ignore_status)
            exit_code = result.exit_status
        except Exception as e:
            logging.error('Failed to retrieve results: %s', e)
            raise

        if exit_code:
            logging.error('Command "%s" returned non-zero status: %d', command,
                          exit_code)
        return exit_code

    @contextmanager
    def no_background(self, *_args):
        """
        Background stub.

        """
        yield

    @contextmanager
    def run_in_background_with_log(self, cmd, dut, log_path):
        """
        Get cpustats periodically in background.

        NOTE: No error handling, exception or error report if command fails
        to run in background. Command failure is silenced.

        """
        logging.info('Running in background:\n%s', cmd)
        pid = dut.run_background(cmd)
        try:
            # TODO(denik http://crbug.com/966514): replace with more reliable
            # way to check run success/failure in background.
            # Wait some time before checking the process.
            check = dut.run('sleep 5; kill -0 %s' % pid, ignore_status=True)
            if check.exit_status != 0:
                # command wasn't started correctly
                logging.error(
                        "Background command wasn't started correctly.\n"
                        'Command:\n%s', cmd)
                pid = ''
                yield
                return

            logging.info('Background process started successfully, pid %s',
                         pid)
            yield

        finally:
            if pid:
                # Stop background processes.
                logging.info('Killing background process, pid %s', pid)
                # Kill the process blindly. OK if it's already gone.
                # There is an issue when underlying child processes stay alive
                # while the parent main process is killed.
                # The solution is to kill the chain of processes via process
                # group id.
                dut.run('pgid=$(cat /proc/%s/stat | cut -d")" -f2 | '
                        'cut -d" " -f4) && kill -- -$pgid 2>/dev/null' % pid,
                        ignore_status=True)

                # Copy the results to results directory with silenced failure.
                scp_res = self.scp_telemetry_results(
                        '', dut, log_path, self.resultsdir, ignore_status=True)
                if scp_res:
                    logging.error(
                            'scp of background log %s failed '
                            'with error %d.', log_path, scp_res)

    def run_cpustats_in_background(self, dut, log_name):
        """
        Run command to collect CPU stats in background.

        """
        log_path = '/tmp/%s' % log_name
        cpu_stats_cmd = (
                'cpulog=%s; '
                'rm -f ${cpulog}; '
                # Stop after 720 * 30s = 6 hours if not killed at the
                # clean-up phase.
                'for i in {1..720} ; do '
                # Collect current CPU frequency on all cores and thermal data.
                ' paste -d" " '
                '  <(ls /sys/devices/system/cpu/cpu*/cpufreq/cpuinfo_cur_freq)'
                '  <(cat /sys/devices/system/cpu/cpu*/cpufreq/cpuinfo_cur_freq)'
                '  >> ${cpulog} || break; '  # exit loop if fails.
                ' paste -d" "'
                '   <(cat /sys/class/thermal/thermal_zone*/type)'
                '   <(cat /sys/class/thermal/thermal_zone*/temp)'
                # Filter in thermal data from only CPU-related sources.
                '  | grep -iE "soc|cpu|pkg|big|little" >> ${cpulog} || break; '
                # Repeat every 30 sec.
                ' sleep 30; '
                'done;') % log_path

        return self.run_in_background_with_log(cpu_stats_cmd, dut, log_path)

    def run_top_in_background(self, dut, log_name, interval_in_sec):
        """
        Run top in background.

        """
        log_path = '/tmp/%s' % log_name
        top_cmd = (
                # Run top in batch mode with specified interval and filter out
                # top system summary and processes not consuming %CPU.
                # Output of each iteration is separated by a blank line.
                'HOME=/usr/local COLUMNS=128 top -bi -d%.1f'
                ' | grep -E "^[ 0-9]|^$" > %s;') % (interval_in_sec, log_path)

        return self.run_in_background_with_log(top_cmd, dut, log_path)

    def run_turbostat_in_background(self, dut, log_name):
        """
        Run turbostat in background.

        """
        log_path = '/tmp/%s' % log_name
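        # Sample turbostat every 10 seconds and redirect its output to the log
        # file on the DUT.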
        turbostat_cmd = (
                'nohup turbostat --quiet --interval 10 '
                '--show=CPU,Bzy_MHz,Avg_MHz,TSC_MHz,Busy%%,IRQ,CoreTmp '
                '1> %s') % log_path

        return self.run_in_background_with_log(turbostat_cmd, dut, log_path)

    def run_cpuinfo(self, dut, log_name):
        """
        Collect CPU info of "dut" into "log_name" file.

        """
        cpuinfo_cmd = (
                'for cpunum in '
                "   $(awk '/^processor/ { print $NF ; }' /proc/cpuinfo ) ; do "
                ' for i in'
                ' `ls -d /sys/devices/system/cpu/cpu"${cpunum}"/cpufreq/'
                '{cpuinfo_cur_freq,scaling_*_freq,scaling_governor} '
                '     2>/dev/null` ; do '
                '  paste -d" " <(echo "${i}") <(cat "${i}"); '
                ' done;'
                'done;'
                'for cpudata in'
                '  /sys/devices/system/cpu/intel_pstate/no_turbo'
                '  /sys/devices/system/cpu/online; do '
                ' if [[ -e "${cpudata}" ]] ; then '
                '  paste <(echo "${cpudata}") <(cat "${cpudata}"); '
                ' fi; '
                'done; '
                # Adding thermal data to the log.
                'paste -d" "'
                '  <(cat /sys/class/thermal/thermal_zone*/type)'
                '  <(cat /sys/class/thermal/thermal_zone*/temp)')

        # Get CPUInfo at the end of the test.
        logging.info('Get cpuinfo: %s', cpuinfo_cmd)
        with open(os.path.join(self.resultsdir, log_name),
                  'w') as cpu_log_file:
            # Handle command failure gracefully.
            res = dut.run(
                    cpuinfo_cmd, stdout_tee=cpu_log_file, ignore_status=True)
            if res.exit_status:
                logging.error('Get cpuinfo command failed with %d.',
                              res.exit_status)

    def run_once(self, args, client_ip='', dut=None):
        """
        Run a single telemetry test.

        @param args: A dictionary of the arguments that were passed
                to this test.
        @param client_ip: The IP address of the DUT.
        @param dut: The autotest host object representing the DUT.

        @returns A TelemetryResult instance with the results of this execution.
        """
        test_name = args.get('test', '')
        test_args = args.get('test_args', '')
        profiler_args = args.get('profiler_args', '')

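        # dut_config is passed from crosperf as a JSON-encoded dictionary of
        # device setup options.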
        dut_config_str = args.get('dut_config', '{}')
        dut_config = json.loads(dut_config_str)
        # Set up the device with dut_config arguments before running the test.
        if dut_config:
            wait_time = device_setup_utils.setup_device(dut, dut_config)
            # Wait time can be used to accumulate cooldown time in Crosperf.
            with open(os.path.join(self.resultsdir, WAIT_TIME_LOG), 'w') as f:
                f.write(str(wait_time))

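        # Request histogram output; the test later expects histograms.json in
        # the results directory.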
        output_format = 'histograms'

        # For local runs, we set local=True and use the local chrome source to
        # run tests; for lab runs, we use the drone instead.
        # Defaults to True.
        local = args.get('local', 'true').lower() == 'true'

        # If run_local=true, the telemetry benchmark runs on the DUT;
        # otherwise it runs remotely from the host.
        # Defaults to False.
        # TODO(zhizhouy): It is better to change the field name from "run_local"
        # to "telemetry_on_dut" in crosperf experiment files for consistency.
        telemetry_on_dut = args.get('run_local', '').lower() == 'true'

        # Run the test and collect profiles if needed.
        try:
            # If profiler_args is specified, add several more options to the
            # command so that run_benchmark collects system-wide profiles.
            if profiler_args:
                profiler_opts = [
                        '--interval-profiling-period=story_run',
                        '--interval-profiling-target=system_wide',
                        '--interval-profiler-options="%s"' % profiler_args
                ]

            top_interval = dut_config.get('top_interval', 0)
            turbostat = dut_config.get('turbostat')

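            # Pick the background collectors: CPU stats whenever a DUT host
            # object is available, turbostat and top only when also requested
            # via dut_config; otherwise fall back to the no-op stub.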
            run_cpuinfo = self.run_cpustats_in_background if dut \
                else self.no_background
            run_turbostat = self.run_turbostat_in_background if (
                    dut and turbostat) else self.no_background
            run_top = self.run_top_in_background if (
                    dut and top_interval > 0) else self.no_background

            # FIXME(denik): replace with ExitStack.
            with run_cpuinfo(dut, CPUSTATS_LOG) as _cpu_cm, \
                run_turbostat(dut, TURBOSTAT_LOG) as _turbo_cm, \
                run_top(dut, TOP_LOG, top_interval) as _top_cm:

                arguments = []
                if test_args:
                    arguments.extend(shlex.split(test_args))
                if profiler_args:
                    arguments.extend(profiler_opts)
                logging.debug('Telemetry Arguments: %s', arguments)
                perf_value_writer = self
                artifacts = bool(profiler_args)
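                # The telemetry runner executes run_benchmark either on the
                # DUT or from the host, depending on telemetry_on_dut.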
                with telemetry_runner.TelemetryRunnerFactory().get_runner(
                        dut, local=local,
                        telemetry_on_dut=telemetry_on_dut) as tr:
                    result = tr.run_telemetry_benchmark(
                            test_name,
                            perf_value_writer,
                            *arguments,
                            ex_output_format=output_format,
                            results_dir=self.resultsdir,
                            no_verbose=True,
                            artifacts=artifacts)
                    logging.info('Telemetry completed with exit status: %s.',
                                 result.status)
                    logging.info('output: %s\n', result.output)

        except (error.TestFail, error.TestWarn):
            logging.debug('Telemetry test did not succeed.')
            raise
        except:
            logging.debug('Unexpected failure on telemetry_Crosperf.')
            raise
        finally:
            if dut:
                self.run_cpuinfo(dut, CPUINFO_LOG)

        # Check whether the result file exists.
        filepath = os.path.join(self.resultsdir, 'histograms.json')
        if not os.path.exists(filepath):
            raise RuntimeError('Missing results file: %s' % filepath)

        # Copy the perf data file into the test_that profiling directory,
        # if necessary. It always comes from the DUT.
        if profiler_args:
            filepath = os.path.join(self.resultsdir, 'artifacts')
            if not os.path.isabs(filepath):
                raise RuntimeError('Expected absolute path of '
                                   'artifacts: %s' % filepath)
            perf_exist = False
            for root, _dirs, files in os.walk(filepath):
                for f in files:
                    if f.endswith('.perf.data'):
                        perf_exist = True
                        src_file = os.path.join(root, f)
                        # results-cache.py in crosperf supports multiple
                        # perf.data files, but only if each one is named
                        # exactly 'perf.data'. Therefore, create a subdirectory
                        # for each perf.data file.
                        # Use mkdtemp to make sure the directory name is
                        # unique.
                        dst_dir = tempfile.mkdtemp(dir=self.profdir,
                                                   prefix=''.join(
                                                           f.split('.')[:-2]))
                        dst_file = os.path.join(dst_dir, 'perf.data')
                        shutil.copyfile(src_file, dst_file)
            if not perf_exist:
                raise error.TestFail('Error: No profiles collected, the test '
                                     'may not have run correctly.')

        # In a skylab run, the environment variable SYNCHRONOUS_OFFLOAD_DIR is
        # set to a directory; in a test_that run, it is not set.
        synchronous_dir = os.getenv('SYNCHRONOUS_OFFLOAD_DIR', '')
        # For skylab runs, push logs and JSON results to the synchronous
        # offload directory so reports can access them immediately.
        if synchronous_dir and os.path.isdir(synchronous_dir):
            try:
                dst = os.path.join(synchronous_dir, 'results')
                shutil.copytree(self.resultsdir, dst)
            except Exception as e:
                raise RuntimeError(
                        'Failed to copy results in %s to synchronous offload'
                        ' directory %s: %s' %
                        (self.resultsdir, synchronous_dir, e))

        return result
