Commit 80451b07 authored by Sergiy Belozorov, committed by Commit Bot

[tools] Implement confidence-based number of runs

R=machenbach@chromium.org, tmrts@chromium.org

Bug: chromium:880724
Change-Id: I2b8ede244fa09868eef384b967223a3788ddd2a1
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/1581180
Commit-Queue: Sergiy Belozorov <sergiyb@chromium.org>
Reviewed-by: Michael Achenbach <machenbach@chromium.org>
Cr-Commit-Position: refs/heads/master@{#61370}
parent 615d61bf
......@@ -66,3 +66,11 @@ wheel: <
name: "infra/python/wheels/mock-py2_py3"
version: "version:2.0.0"
>
# Used by:
# tools/run_perf.py
# tools/unittests/run_perf_test.py
wheel: <
name: "infra/python/wheels/numpy/${vpython_platform}"
version: "version:1.11.3"
>
......@@ -3875,6 +3875,12 @@ group("gn_all") {
}
}
group("v8_python_base") {
data = [
".vpython",
]
}
group("v8_clusterfuzz") {
testonly = true
......
......@@ -44,6 +44,7 @@ group("v8_perf") {
testonly = true
data_deps = [
"..:v8_python_base",
"cctest:cctest",
"..:d8",
"../tools:v8_android_test_runner_deps",
......
......@@ -43,6 +43,7 @@ group("v8_testrunner") {
testonly = true
data_deps = [
"..:v8_python_base",
"..:v8_dump_build_config",
":v8_android_test_runner_deps",
]
......
#!/usr/bin/env python
# Copyright 2014 the V8 project authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
......@@ -120,6 +119,8 @@ import sys
import time
import traceback
import numpy
from testrunner.local import android
from testrunner.local import command
from testrunner.local import utils
......@@ -142,6 +143,7 @@ RESULT_STDDEV_RE = re.compile(r'^\{([^\}]+)\}$')
RESULT_LIST_RE = re.compile(r'^\[([^\]]+)\]$')
TOOLS_BASE = os.path.abspath(os.path.dirname(__file__))
INFRA_FAILURE_RETCODE = 87
MIN_RUNS_FOR_CONFIDENCE = 10
def GeometricMean(values):
......@@ -150,7 +152,7 @@ def GeometricMean(values):
The mean is calculated using log to avoid overflow.
"""
values = map(float, values)
return str(math.exp(sum(map(math.log, values)) / len(values)))
return math.exp(sum(map(math.log, values)) / len(values))
class ResultTracker(object):
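Aside (not part of the change): the log-sum form returned by GeometricMean above is equivalent to the textbook definition (x_1 * ... * x_n) ** (1/n), but avoids overflowing the intermediate product for long lists of large values. A minimal standalone sketch, with an illustrative sanity check:

import math

def geometric_mean(values):
  # exp of the average log equals the nth root of the product,
  # without ever forming the potentially huge product itself.
  values = [float(v) for v in values]
  return math.exp(sum(math.log(v) for v in values) / len(values))

assert abs(geometric_mean([2.0, 8.0]) - 4.0) < 1e-9  # sqrt(2 * 8) == 4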
......@@ -241,6 +243,42 @@ class ResultTracker(object):
with open(file_name, 'w') as f:
f.write(json.dumps(self.ToDict()))
def HasEnoughRuns(self, graph_config, confidence_level):
"""Checks if the mean of the results for a given trace config is within
0.1% of the true value with the specified confidence level.
This assumes a Gaussian distribution of the noise and is based on
https://en.wikipedia.org/wiki/68%E2%80%9395%E2%80%9399.7_rule.
Args:
graph_config: An instance of GraphConfig.
confidence_level: Number of standard deviations from the mean that all
values must lie within. Typical values are 1, 2 and 3 and correspond
to 68%, 95% and 99.7% probability that the measured value is within
0.1% of the true value.
Returns:
True if the specified confidence level has been achieved.
"""
if not isinstance(graph_config, TraceConfig):
return all(self.HasEnoughRuns(child, confidence_level)
for child in graph_config.children)
trace = self.traces.get(graph_config.name, {})
results = trace.get('results', [])
logging.debug('HasEnoughRuns for %s', graph_config.name)
if len(results) < MIN_RUNS_FOR_CONFIDENCE:
logging.debug(' Ran %d times, need at least %d',
len(results), MIN_RUNS_FOR_CONFIDENCE)
return False
logging.debug(' Results: %d entries', len(results))
mean = numpy.mean(results)
mean_stderr = numpy.std(results) / numpy.sqrt(len(results))
logging.debug(' Mean: %.2f, mean_stderr: %.2f', mean, mean_stderr)
return confidence_level * mean_stderr < mean / 1000.0
def __str__(self): # pragma: no cover
return json.dumps(self.ToDict(), indent=2, separators=(',', ': '))
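To make the stopping rule in HasEnoughRuns above concrete, here is a hedged, standalone sketch of the same check, assuming roughly Gaussian noise: sampling continues until confidence_level standard errors of the mean fit within 0.1% of the mean. The helper name and sample data below are illustrative only, not part of the change:

import numpy

MIN_RUNS_FOR_CONFIDENCE = 10

def has_enough_runs(results, confidence_level):
  # With too few samples the standard-error estimate is unreliable.
  if len(results) < MIN_RUNS_FOR_CONFIDENCE:
    return False
  mean = numpy.mean(results)
  mean_stderr = numpy.std(results) / numpy.sqrt(len(results))
  # Stop once confidence_level standard errors fit inside 0.1% of the mean.
  return confidence_level * mean_stderr < mean / 1000.0

# Ten very stable samples around 100.0 satisfy confidence_level=2 (~95%),
# while noisier measurements would keep the benchmark running.
print(has_enough_runs([100.0 + 0.01 * i for i in range(10)], 2))  # True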
......@@ -383,8 +421,8 @@ class TraceConfig(GraphConfig):
stddev = None
try:
result = str(float(
re.search(self.results_regexp, output.stdout, re.M).group(1)))
result = float(
re.search(self.results_regexp, output.stdout, re.M).group(1))
except ValueError:
result_tracker.AddError(
'Regexp "%s" returned a non-numeric for test %s.' %
......@@ -740,6 +778,7 @@ class AndroidPlatform(Platform): # pragma: no cover
output.duration = time.time() - start
return output
class CustomMachineConfiguration:
def __init__(self, disable_aslr = False, governor = None):
self.aslr_backup = None
......@@ -844,6 +883,12 @@ class CustomMachineConfiguration:
raise Exception('Could not set CPU governor. Present value is %s'
% cur_value )
class MaxTotalDurationReachedError(Exception):
"""Exception used to stop running tests when max total duration is reached."""
pass
def Main(argv):
parser = argparse.ArgumentParser()
parser.add_argument('--arch',
......@@ -900,12 +945,28 @@ def Main(argv):
'--filter=JSTests/TypedArrays/ will run only TypedArray '
'benchmarks from the JSTests suite.',
default='')
parser.add_argument('--confidence-level', type=int,
help='Repeatedly runs each benchmark until the specified '
'confidence level is reached. The value is interpreted '
'as the number of standard deviations from the mean that '
'all values must lie within. Typical values are 1, 2 and '
'3 and correspond to 68%, 95% and 99.7% probability that '
'the measured value is within 0.1% of the true value. '
'Larger values result in more retries and thus longer '
'runtime, but also provide more reliable results. Also '
'see --max-total-duration flag.')
parser.add_argument('--max-total-duration', type=int, default=7140, # 1h 59m
help='Max total duration in seconds allowed for retries '
'across all tests. This is especially useful in '
'combination with the --confidence-level flag.')
parser.add_argument('--dump-logcats-to',
help='Writes logcat output from each test into specified '
'directory. Only supported for android targets.')
parser.add_argument("--run-count", type=int, default=0,
help="Override the run count specified by the test "
"suite. The default 0 uses the suite's config.")
parser.add_argument('--run-count', type=int, default=0,
help='Override the run count specified by the test '
'suite. The default 0 uses the suite\'s config.')
parser.add_argument('-v', '--verbose', default=False, action='store_true',
help='Be verbose and print debug output.')
parser.add_argument('suite', nargs='+', help='Path to the suite config file.')
try:
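For illustration, a confidence-driven run using the new flags could be invoked roughly as tools/run_perf.py --confidence-level=2 --max-total-duration=3600 path/to/suite.json (the suite path and duration budget here are placeholders): each benchmark is then repeated until HasEnoughRuns reports the ~95% criterion is met, or until the total duration budget across all tests is exhausted.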
......@@ -914,7 +975,8 @@ def Main(argv):
return INFRA_FAILURE_RETCODE
logging.basicConfig(
level=logging.INFO, format='%(asctime)s %(levelname)-8s %(message)s')
level=logging.DEBUG if args.verbose else logging.INFO,
format='%(asctime)s %(levelname)-8s %(message)s')
if args.arch == 'auto': # pragma: no cover
args.arch = utils.DefaultArch()
......@@ -973,8 +1035,7 @@ def Main(argv):
result_tracker = ResultTracker()
result_tracker_secondary = ResultTracker()
# We use list here to allow modification in nested function below.
have_failed_tests = [False]
have_failed_tests = False
with CustomMachineConfiguration(governor = args.cpu_governor,
disable_aslr = args.noaslr) as conf:
for path in args.suite:
......@@ -1000,6 +1061,8 @@ def Main(argv):
platform.PreTests(node, path)
# Traverse graph/trace tree and iterate over all runnables.
start = time.time()
try:
for runnable in FlattenRunnables(root, NodeCB):
runnable_name = '/'.join(runnable.graphs)
if (not runnable_name.startswith(args.filter) and
......@@ -1007,9 +1070,27 @@ def Main(argv):
continue
logging.info('>>> Running suite: %s', runnable_name)
def RunGenerator(runnable):
if args.confidence_level:
counter = 0
while not result_tracker.HasEnoughRuns(
runnable, args.confidence_level):
yield counter
counter += 1
else:
for i in range(0, max(1, args.run_count or runnable.run_count)):
yield i
for i in RunGenerator(runnable):
attempts_left = runnable.retry_count + 1
while attempts_left:
total_duration = time.time() - start
if total_duration > args.max_total_duration:
logging.info(
'>>> Stopping now since running for too long (%ds > %ds)',
total_duration, args.max_total_duration)
raise MaxTotalDurationReachedError()
output, output_secondary = platform.Run(
runnable, i, secondary=args.shell_dir_secondary)
result_tracker.AddRunnableDuration(runnable, output.duration)
......@@ -1025,7 +1106,7 @@ def Main(argv):
attempts_left -= 1
if not attempts_left: # ignore failures until last attempt
have_failed_tests[0] = True
have_failed_tests = True
else:
logging.info('>>> Retrying suite: %s', runnable_name)
......@@ -1033,6 +1114,8 @@ def Main(argv):
result_tracker.timeouts.append(runnable_name)
if runnable.has_near_timeouts:
result_tracker.near_timeouts.append(runnable_name)
except MaxTotalDurationReachedError:
have_failed_tests = True
platform.PostExecution()
......@@ -1048,7 +1131,7 @@ def Main(argv):
print('Secondary results:', result_tracker_secondary)
if (result_tracker.errors or result_tracker_secondary.errors or
have_failed_tests[0]):
have_failed_tests):
return 1
return 0
......