Commit 80451b07 authored by Sergiy Belozorov, committed by Commit Bot

[tools] Implement confidence-based number of runs

R=machenbach@chromium.org, tmrts@chromium.org

Bug: chromium:880724
Change-Id: I2b8ede244fa09868eef384b967223a3788ddd2a1
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/1581180
Commit-Queue: Sergiy Belozorov <sergiyb@chromium.org>
Reviewed-by: Michael Achenbach <machenbach@chromium.org>
Cr-Commit-Position: refs/heads/master@{#61370}
parent 615d61bf
...@@ -66,3 +66,11 @@ wheel: < ...@@ -66,3 +66,11 @@ wheel: <
name: "infra/python/wheels/mock-py2_py3" name: "infra/python/wheels/mock-py2_py3"
version: "version:2.0.0" version: "version:2.0.0"
> >
# Used by:
# tools/run_perf.py
# tools/unittests/run_perf_test.py
wheel: <
name: "infra/python/wheels/numpy/${vpython_platform}"
version: "version:1.11.3"
>
...@@ -3875,6 +3875,12 @@ group("gn_all") { ...@@ -3875,6 +3875,12 @@ group("gn_all") {
} }
} }
# Group whose only content is the ".vpython" file, listed under `data`.
# Other groups pull it in through data_deps ("..:v8_python_base"); in this
# change those are v8_perf and v8_testrunner.
# NOTE(review): presumably this ships the vpython spec next to the Python
# tools so their wheels (e.g. numpy for tools/run_perf.py) can be resolved on
# the bots - confirm against the isolate/swarming setup.
group("v8_python_base") {
data = [
".vpython",
]
}
group("v8_clusterfuzz") { group("v8_clusterfuzz") {
testonly = true testonly = true
......
...@@ -44,6 +44,7 @@ group("v8_perf") { ...@@ -44,6 +44,7 @@ group("v8_perf") {
testonly = true testonly = true
data_deps = [ data_deps = [
"..:v8_python_base",
"cctest:cctest", "cctest:cctest",
"..:d8", "..:d8",
"../tools:v8_android_test_runner_deps", "../tools:v8_android_test_runner_deps",
......
...@@ -43,6 +43,7 @@ group("v8_testrunner") { ...@@ -43,6 +43,7 @@ group("v8_testrunner") {
testonly = true testonly = true
data_deps = [ data_deps = [
"..:v8_python_base",
"..:v8_dump_build_config", "..:v8_dump_build_config",
":v8_android_test_runner_deps", ":v8_android_test_runner_deps",
] ]
......
#!/usr/bin/env python
# Copyright 2014 the V8 project authors. All rights reserved. # Copyright 2014 the V8 project authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be # Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file. # found in the LICENSE file.
...@@ -120,6 +119,8 @@ import sys ...@@ -120,6 +119,8 @@ import sys
import time import time
import traceback import traceback
import numpy
from testrunner.local import android from testrunner.local import android
from testrunner.local import command from testrunner.local import command
from testrunner.local import utils from testrunner.local import utils
...@@ -142,6 +143,7 @@ RESULT_STDDEV_RE = re.compile(r'^\{([^\}]+)\}$') ...@@ -142,6 +143,7 @@ RESULT_STDDEV_RE = re.compile(r'^\{([^\}]+)\}$')
RESULT_LIST_RE = re.compile(r'^\[([^\]]+)\]$') RESULT_LIST_RE = re.compile(r'^\[([^\]]+)\]$')
TOOLS_BASE = os.path.abspath(os.path.dirname(__file__)) TOOLS_BASE = os.path.abspath(os.path.dirname(__file__))
INFRA_FAILURE_RETCODE = 87 INFRA_FAILURE_RETCODE = 87
MIN_RUNS_FOR_CONFIDENCE = 10
def GeometricMean(values): def GeometricMean(values):
...@@ -150,7 +152,7 @@ def GeometricMean(values): ...@@ -150,7 +152,7 @@ def GeometricMean(values):
The mean is calculated using log to avoid overflow. The mean is calculated using log to avoid overflow.
""" """
values = map(float, values) values = map(float, values)
return str(math.exp(sum(map(math.log, values)) / len(values))) return math.exp(sum(map(math.log, values)) / len(values))
class ResultTracker(object): class ResultTracker(object):
...@@ -241,6 +243,42 @@ class ResultTracker(object): ...@@ -241,6 +243,42 @@ class ResultTracker(object):
with open(file_name, 'w') as f: with open(file_name, 'w') as f:
f.write(json.dumps(self.ToDict())) f.write(json.dumps(self.ToDict()))
def HasEnoughRuns(self, graph_config, confidence_level):
  """Checks if the mean of the results for a given trace config is within
  0.1% of the true value with the specified confidence level.

  This assumes Gaussian distribution of the noise and is based on
  https://en.wikipedia.org/wiki/68%E2%80%9395%E2%80%9399.7_rule.

  For a non-trace node the check is applied recursively and succeeds only if
  every child has enough runs.

  Args:
    graph_config: An instance of GraphConfig.
    confidence_level: Number of standard errors of the mean that must fit
      within 0.1% of the measured mean. Typical values are 1, 2 and 3 and
      correspond to 68%, 95% and 99.7% probability that the measured value is
      within 0.1% of the true value.

  Returns:
    True if the specified confidence level has been achieved, False otherwise
    (including when fewer than MIN_RUNS_FOR_CONFIDENCE results are recorded).
  """
  if not isinstance(graph_config, TraceConfig):
    return all(self.HasEnoughRuns(child, confidence_level)
               for child in graph_config.children)

  trace = self.traces.get(graph_config.name, {})
  results = trace.get('results', [])
  logging.debug('HasEnoughRuns for %s', graph_config.name)

  if len(results) < MIN_RUNS_FOR_CONFIDENCE:
    logging.debug(' Ran %d times, need at least %d',
                  len(results), MIN_RUNS_FOR_CONFIDENCE)
    return False

  logging.debug(' Results: %d entries', len(results))
  mean = numpy.mean(results)
  # The standard error of the mean is estimated from the *sample* standard
  # deviation, hence ddof=1 (numpy.std defaults to the population formula,
  # which underestimates the spread for a finite number of runs).
  mean_stderr = numpy.std(results, ddof=1) / numpy.sqrt(len(results))
  logging.debug(' Mean: %.2f, mean_stderr: %.2f', mean, mean_stderr)
  # Compare against the magnitude of the mean: with a signed mean a negative
  # metric could never satisfy the criterion and would be re-run forever.
  return confidence_level * mean_stderr < abs(mean) / 1000.0
def __str__(self): # pragma: no cover def __str__(self): # pragma: no cover
return json.dumps(self.ToDict(), indent=2, separators=(',', ': ')) return json.dumps(self.ToDict(), indent=2, separators=(',', ': '))
...@@ -383,8 +421,8 @@ class TraceConfig(GraphConfig): ...@@ -383,8 +421,8 @@ class TraceConfig(GraphConfig):
stddev = None stddev = None
try: try:
result = str(float( result = float(
re.search(self.results_regexp, output.stdout, re.M).group(1))) re.search(self.results_regexp, output.stdout, re.M).group(1))
except ValueError: except ValueError:
result_tracker.AddError( result_tracker.AddError(
'Regexp "%s" returned a non-numeric for test %s.' % 'Regexp "%s" returned a non-numeric for test %s.' %
...@@ -740,6 +778,7 @@ class AndroidPlatform(Platform): # pragma: no cover ...@@ -740,6 +778,7 @@ class AndroidPlatform(Platform): # pragma: no cover
output.duration = time.time() - start output.duration = time.time() - start
return output return output
class CustomMachineConfiguration: class CustomMachineConfiguration:
def __init__(self, disable_aslr = False, governor = None): def __init__(self, disable_aslr = False, governor = None):
self.aslr_backup = None self.aslr_backup = None
...@@ -844,6 +883,12 @@ class CustomMachineConfiguration: ...@@ -844,6 +883,12 @@ class CustomMachineConfiguration:
raise Exception('Could not set CPU governor. Present value is %s' raise Exception('Could not set CPU governor. Present value is %s'
% cur_value ) % cur_value )
class MaxTotalDurationReachedError(Exception):
  """Signals that the time budget for running tests has been used up.

  Raised to stop running tests once the max total duration is reached.
  """
def Main(argv): def Main(argv):
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument('--arch', parser.add_argument('--arch',
...@@ -900,12 +945,28 @@ def Main(argv): ...@@ -900,12 +945,28 @@ def Main(argv):
'--filter=JSTests/TypedArrays/ will run only TypedArray ' '--filter=JSTests/TypedArrays/ will run only TypedArray '
'benchmarks from the JSTests suite.', 'benchmarks from the JSTests suite.',
default='') default='')
# NOTE: argparse expands help strings with the % operator, so every literal
# percent sign has to be written as '%%'; a bare '%' makes --help raise
# instead of printing the usage text.
parser.add_argument('--confidence-level', type=int,
                    help='Repeatedly runs each benchmark until specified '
                    'confidence level is reached. The value is interpreted '
                    'as the number of standard deviations from the mean that '
                    'all values must lie within. Typical values are 1, 2 and '
                    '3 and correspond to 68%%, 95%% and 99.7%% probability '
                    'that the measured value is within 0.1%% of the true '
                    'value. Larger values result in more retries and thus '
                    'longer runtime, but also provide more reliable results. '
                    'Also see --max-total-duration flag.')
parser.add_argument('--max-total-duration', type=int, default=7140, # 1h 59m
help='Max total duration in seconds allowed for retries '
'across all tests. This is especially useful in '
'combination with the --confidence-level flag.')
parser.add_argument('--dump-logcats-to', parser.add_argument('--dump-logcats-to',
help='Writes logcat output from each test into specified ' help='Writes logcat output from each test into specified '
'directory. Only supported for android targets.') 'directory. Only supported for android targets.')
parser.add_argument("--run-count", type=int, default=0, parser.add_argument('--run-count', type=int, default=0,
help="Override the run count specified by the test " help='Override the run count specified by the test '
"suite. The default 0 uses the suite's config.") 'suite. The default 0 uses the suite\'s config.')
parser.add_argument('-v', '--verbose', default=False, action='store_true',
help='Be verbose and print debug output.')
parser.add_argument('suite', nargs='+', help='Path to the suite config file.') parser.add_argument('suite', nargs='+', help='Path to the suite config file.')
try: try:
...@@ -914,7 +975,8 @@ def Main(argv): ...@@ -914,7 +975,8 @@ def Main(argv):
return INFRA_FAILURE_RETCODE return INFRA_FAILURE_RETCODE
logging.basicConfig( logging.basicConfig(
level=logging.INFO, format='%(asctime)s %(levelname)-8s %(message)s') level=logging.DEBUG if args.verbose else logging.INFO,
format='%(asctime)s %(levelname)-8s %(message)s')
if args.arch == 'auto': # pragma: no cover if args.arch == 'auto': # pragma: no cover
args.arch = utils.DefaultArch() args.arch = utils.DefaultArch()
...@@ -973,8 +1035,7 @@ def Main(argv): ...@@ -973,8 +1035,7 @@ def Main(argv):
result_tracker = ResultTracker() result_tracker = ResultTracker()
result_tracker_secondary = ResultTracker() result_tracker_secondary = ResultTracker()
# We use list here to allow modification in nested function below. have_failed_tests = False
have_failed_tests = [False]
with CustomMachineConfiguration(governor = args.cpu_governor, with CustomMachineConfiguration(governor = args.cpu_governor,
disable_aslr = args.noaslr) as conf: disable_aslr = args.noaslr) as conf:
for path in args.suite: for path in args.suite:
...@@ -1000,6 +1061,8 @@ def Main(argv): ...@@ -1000,6 +1061,8 @@ def Main(argv):
platform.PreTests(node, path) platform.PreTests(node, path)
# Traverse graph/trace tree and iterate over all runnables. # Traverse graph/trace tree and iterate over all runnables.
start = time.time()
try:
for runnable in FlattenRunnables(root, NodeCB): for runnable in FlattenRunnables(root, NodeCB):
runnable_name = '/'.join(runnable.graphs) runnable_name = '/'.join(runnable.graphs)
if (not runnable_name.startswith(args.filter) and if (not runnable_name.startswith(args.filter) and
...@@ -1007,9 +1070,27 @@ def Main(argv): ...@@ -1007,9 +1070,27 @@ def Main(argv):
continue continue
logging.info('>>> Running suite: %s', runnable_name) logging.info('>>> Running suite: %s', runnable_name)
def RunGenerator(runnable):
if args.confidence_level:
counter = 0
while not result_tracker.HasEnoughRuns(
runnable, args.confidence_level):
yield counter
counter += 1
else:
for i in range(0, max(1, args.run_count or runnable.run_count)): for i in range(0, max(1, args.run_count or runnable.run_count)):
yield i
for i in RunGenerator(runnable):
attempts_left = runnable.retry_count + 1 attempts_left = runnable.retry_count + 1
while attempts_left: while attempts_left:
total_duration = time.time() - start
if total_duration > args.max_total_duration:
logging.info(
'>>> Stopping now since running for too long (%ds > %ds)',
total_duration, args.max_total_duration)
raise MaxTotalDurationReachedError()
output, output_secondary = platform.Run( output, output_secondary = platform.Run(
runnable, i, secondary=args.shell_dir_secondary) runnable, i, secondary=args.shell_dir_secondary)
result_tracker.AddRunnableDuration(runnable, output.duration) result_tracker.AddRunnableDuration(runnable, output.duration)
...@@ -1025,7 +1106,7 @@ def Main(argv): ...@@ -1025,7 +1106,7 @@ def Main(argv):
attempts_left -= 1 attempts_left -= 1
if not attempts_left: # ignore failures until last attempt if not attempts_left: # ignore failures until last attempt
have_failed_tests[0] = True have_failed_tests = True
else: else:
logging.info('>>> Retrying suite: %s', runnable_name) logging.info('>>> Retrying suite: %s', runnable_name)
...@@ -1033,6 +1114,8 @@ def Main(argv): ...@@ -1033,6 +1114,8 @@ def Main(argv):
result_tracker.timeouts.append(runnable_name) result_tracker.timeouts.append(runnable_name)
if runnable.has_near_timeouts: if runnable.has_near_timeouts:
result_tracker.near_timeouts.append(runnable_name) result_tracker.near_timeouts.append(runnable_name)
except MaxTotalDurationReachedError:
have_failed_tests = True
platform.PostExecution() platform.PostExecution()
...@@ -1048,7 +1131,7 @@ def Main(argv): ...@@ -1048,7 +1131,7 @@ def Main(argv):
print('Secondary results:', result_tracker_secondary) print('Secondary results:', result_tracker_secondary)
if (result_tracker.errors or result_tracker_secondary.errors or if (result_tracker.errors or result_tracker_secondary.errors or
have_failed_tests[0]): have_failed_tests):
return 1 return 1
return 0 return 0
......
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment