Commit 3c2a7b80 authored by ojan@chromium.org

Improve flakiness logic for print-flaky-tests.

We only compare the 2nd attempt of a run to the test's expected results to
see if it was very flaky. Instead, we should see if any of the runs had
an unexpected result. That way, any test that gets marked flaky will
stop getting printed out by this script as soon as the bots have cycled once.
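
In rough terms, the new check looks like this (an illustrative sketch, not
the patch itself; had_unexpected_result is a hypothetical helper built on the
result_was_expected call used in the patch):

    from webkitpy.layout_tests.models.test_expectations import TestExpectations

    def had_unexpected_result(result_enums, expected_enums):
        # Flag the entry if *any* attempt ran unexpectedly, not just the 2nd one.
        return any(
            not TestExpectations.result_was_expected(
                result, expected_enums, test_needs_rebaselining=False)
            for result in result_enums)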

Also, use TestExpectations.result_was_expected to determine whether a result
was actually expected. The previous "in" check didn't work for cases like Text,
which is a subset of Failure rather than being listed verbatim in
TestExpectations as Text.
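
For example (a hedged sketch; it assumes the module-level PASS, FAIL and TEXT
enum values exported by test_expectations):

    from webkitpy.layout_tests.models.test_expectations import (
        TestExpectations, PASS, FAIL, TEXT)

    expected = [PASS, FAIL]  # e.g. a "[ Pass Failure ]" expectation line
    actual = TEXT            # a text mismatch on one attempt

    # A plain membership test misses the subtyping: TEXT is not literally
    # listed, so the result is wrongly treated as unexpected.
    print(actual in expected)  # False

    # result_was_expected() knows a Text failure satisfies a Failure expectation.
    print(TestExpectations.result_was_expected(
        actual, expected, test_needs_rebaselining=False))  # True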

Finally, just use the length of the results entry to decide whether something
is very flaky vs. regular flaky. If there are <= 2 entries, then it's not
very flaky. That catches both the case of a test that reliably fails and the
case of a test that fails the first attempt and runs as expected on the
second (not necessarily passing on the second attempt, just matching its
listing in TestExpectations for that run).
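
Roughly, the length heuristic reads like this (a standalone sketch with a
hypothetical is_very_flaky helper, not the code in the patch; result_types is
the list of result codes recorded for one entry):

    def is_very_flaky(result_types):
        # No retry happened, so the test did not flake at all.
        if len(result_types) <= 1:
            return False
        # Exactly two entries: it failed the first attempt and then ran as
        # expected on the first retry (a pass stops further retries), so it
        # is only regular flaky.
        if len(result_types) == 2:
            return False
        # Three or more entries: the first retry was also unexpected.
        return True

    print(is_very_flaky(['F', 'P']))       # False: recovered on the first retry
    print(is_very_flaky(['F', 'F', 'P']))  # True: needed more than one retry

The patch additionally requires that at least one of the attempts was
unexpected before reporting the entry (see _flaky_types_in_results below).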

R=joelo@chromium.org

Review URL: https://codereview.chromium.org/1289163002 .

git-svn-id: svn://svn.chromium.org/blink/trunk@200768 bbb929c8-8fbe-4397-9dbb-9b2b20218538
parent 2f5747ed
@@ -36,7 +36,7 @@ import urllib
import urllib2
from webkitpy.layout_tests.port import builders
from webkitpy.layout_tests.models.test_expectations import TestExpectations
from webkitpy.layout_tests.models.test_expectations import TestExpectations, PASS
from webkitpy.layout_tests.models.test_expectations import TestExpectationLine
@@ -152,8 +152,6 @@ class BotTestExpectations(object):
# The JSON can contain results for expectations, not just actual result types.
NON_RESULT_TYPES = ['S', 'X'] # SLOW, SKIP
PASS_EXPECTATION = 'PASS'
# specifiers arg is used in unittests to avoid the static dependency on builders.
def __init__(self, results_json, specifiers=None):
self.results_json = results_json
@@ -167,7 +165,7 @@ class BotTestExpectations(object):
line.path = test_path # FIXME: Should this be normpath?
line.matching_tests = [test_path]
line.bugs = ["crbug.com/FILE_A_BUG_BEFORE_COMMITTING_THIS"]
line.expectations = sorted(map(self.results_json.expectation_for_type, flaky_types))
line.expectations = sorted(flaky_types)
line.specifiers = self.specifiers
return line
@@ -178,7 +176,7 @@ class BotTestExpectations(object):
flaky_types = self._flaky_types_in_results(entry, only_ignore_very_flaky)
if len(flaky_types) <= 1:
continue
flakes_by_path[test_path] = sorted(map(self.results_json.expectation_for_type, flaky_types))
flakes_by_path[test_path] = sorted(flaky_types)
return flakes_by_path
def unexpected_results_by_path(self):
@@ -247,12 +245,18 @@ class BotTestExpectations(object):
return results
def _result_to_enum(self, result):
return TestExpectations.EXPECTATIONS[result.lower()]
def _flaky_types_in_results(self, results_entry, only_ignore_very_flaky):
flaky_results = set()
latest_expectations = [self.PASS_EXPECTATION]
# Always include pass as an expected result. Passes will never turn the bot red.
# This fixes cases where the expectations have an implicit Pass, e.g. [ Slow ].
latest_expectations = [PASS]
if self.results_json.EXPECTATIONS_KEY in results_entry:
latest_expectations = results_entry[self.results_json.EXPECTATIONS_KEY].split(' ')
expectations_list = results_entry[self.results_json.EXPECTATIONS_KEY].split(' ')
latest_expectations += [self._result_to_enum(expectation) for expectation in expectations_list]
for result_item in results_entry[self.results_json.RESULTS_KEY]:
_, result_types_str = self.results_json.occurances_and_type_from_result_item(result_item)
@@ -261,7 +265,7 @@ class BotTestExpectations(object):
for result_type in result_types_str:
# TODO(ojan): Remove this if-statement once crbug.com/514378 is fixed.
if result_type not in self.NON_RESULT_TYPES:
result_types.append(result_type)
result_types.append(self.results_json.expectation_for_type(result_type))
# It didn't flake if it didn't retry.
if len(result_types) <= 1:
@@ -270,13 +274,29 @@ class BotTestExpectations(object):
# If the test ran as expected after only one retry, it's not very flaky.
# It's only very flaky if it failed the first run and the first retry
# and then ran as expected in one of the subsequent retries.
if only_ignore_very_flaky and self.results_json.expectation_for_type(result_types[1]) in latest_expectations:
# Once we get a pass, we don't retry again. So if the first retry passed,
# then there should only be 2 entries because we don't do another retry.
# TODO(ojan): Reenable this assert once crbug.com/514378 is fixed.
# assert(len(result_types) == 2)
# If there are only two entries, then that means it failed on the first
# try and ran as expected on the second because otherwise we'd have
# a third entry from the next try.
second_result_type_enum_value = self._result_to_enum(result_types[1])
if only_ignore_very_flaky and len(result_types) == 2:
continue
flaky_results = flaky_results.union(set(result_types))
has_unexpected_results = False
for result_type in result_types:
result_enum = self._result_to_enum(result_type)
# TODO(ojan): We really should be grabbing the expected results from the time
# of the run instead of looking at the latest expected results. That's a lot
# more complicated though. So far we've been looking at the aggregated
# results_small.json off test_results.appspot, which has all the information
# for the last 100 runs. In order to do this, we'd need to look at the
# individual runs' full_results.json, which would be slow and more complicated.
# The only thing we lose by not fixing this is that a test that was flaky
# and got fixed will still get printed out until 100 runs have passed.
if not TestExpectations.result_was_expected(result_enum, latest_expectations, test_needs_rebaselining=False):
has_unexpected_results = True
break
if has_unexpected_results:
flaky_results = flaky_results.union(set(result_types))
return flaky_results
@@ -135,6 +135,12 @@ class BotTestExpectationsTest(unittest.TestCase):
'maybeflaky.html': self._results_from_string('FP'),
'notflakypass.html': self._results_from_string('P'),
'notflakyfail.html': self._results_from_string('F'),
# Even if there are no expected results, it's not very flaky if it didn't do multiple retries.
# This accounts for the latest expectations not necessarily matching the expectations
# at the time of the given run.
'notverflakynoexpected.html': self._results_from_string('FT'),
# If the test is flaky, but marked as such, it shouldn't get printed out.
'notflakyexpected.html': {'results': [[2, 'FFFP']], 'expected': 'PASS FAIL'},
}
}
}
@@ -144,6 +150,7 @@ class BotTestExpectationsTest(unittest.TestCase):
self._assert_expectations(test_data, {
'foo/veryflaky.html': sorted(["TEXT", "PASS"]),
'foo/notverflakynoexpected.html': ['TEXT', 'TIMEOUT'],
'foo/maybeflaky.html': sorted(["TEXT", "PASS"]),
}, only_ignore_very_flaky=False)
@@ -26,6 +26,7 @@
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import logging
import os
import optparse
from webkitpy.tool.multicommandtool import AbstractDeclarativeCommand
@@ -35,15 +36,30 @@ from webkitpy.layout_tests.port import builders
from webkitpy.common.net import sheriff_calendar
_log = logging.getLogger(__name__)
class FlakyTests(AbstractDeclarativeCommand):
name = "print-flaky-tests"
help_text = "Print out flaky lines from the flakiness dashboard"
show_in_main_help = True
FLAKY_TEST_CONTENTS = '''%s
FLAKINESS_DASHBOARD_URL = 'https://test-results.appspot.com/dashboards/flakiness_dashboard.html#tests=%s'
BUG_TEMPLATE = 'https://code.google.com/p/chromium/issues/entry?owner=FILL_ME_IN&status=Assigned&labels=Pri-1,Cr-Blink,FlakyLayoutTest&summary=XXXXXXX%20is%20flaky&comment=XXXXXXX%20is%20flaky.%0A%0AIt%20failed%20twice%20and%20then%20passed%20on%20the%203rd%20or%204th%20retry.%20This%20is%20too%20flaky.%20The%20test%20will%20be%20skipped%20until%20it%27s%20fixed.%20If%20not%20fixed%20in%203%20months,%20it%20will%20be%20deleted%20or%20perma-skipped.%0A%0AIn%20the%20flakiness%20dashboard,%20the%20turquoise%20boxes%20are%20runs%20where%20the%20test%20failed%20and%20then%20passed%20on%20retry.%0A%0Ahttp://test-results.appspot.com/dashboards/flakiness_dashboard.html%23tests=XXXXXXX'
HEADER = '''Manually add bug numbers for these and then put the lines in LayoutTests/TestExpectations.
Look up the test in the flakiness dashboard first to see if the platform
specifiers should be made more general.
Bug template:
%s
''' % BUG_TEMPLATE
Manually add bug numbers for these and then put the lines in LayoutTests/TestExpectations.
TODO(ojan): Write a script to file/assign the bugs then create a bot to do this automatically.
OUTPUT = '''%s
%s
Flakiness dashboard: %s
'''
def __init__(self):
@@ -67,6 +83,13 @@ TODO(ojan): Write a script to file/assign the bugs then create a bot to do this
models.append(model)
expectations = factory.expectations_for_builder(builder_name)
# TODO(ojan): We should also skip bots that haven't uploaded recently,
# e.g. if they're >24h stale.
if not expectations:
_log.error("Can't load flakiness data for builder: %s" % builder_name)
continue
for line in expectations.expectation_lines(only_ignore_very_flaky=True):
# TODO(ojan): Find a way to merge specifiers instead of removing build types.
# We can't just union because some specifiers will change the meaning of others.
@@ -93,5 +116,10 @@ TODO(ojan): Write a script to file/assign the bugs then create a bot to do this
fs = tool.filesystem
lines = filter(lambda line: fs.exists(fs.join(port.layout_tests_dir(), line.path)), lines)
print self.FLAKY_TEST_CONTENTS % TestExpectations.list_to_string(lines) # pylint: disable=E1601
test_names = [line.name for line in lines]
flakiness_dashboard_url = self.FLAKINESS_DASHBOARD_URL % ','.join(test_names)
expectations_string = TestExpectations.list_to_string(lines)
# pylint: disable=E1601
print self.OUTPUT % (self.HEADER, expectations_string, flakiness_dashboard_url)
@@ -71,11 +71,9 @@ class FlakyTestsTest(CommandsTest):
tool = MockTool()
command.expectations_factory = FakeBotTestExpectationsFactory
options = MockOptions(upload=True)
expected_stdout = '''
Manually add bug numbers for these and then put the lines in LayoutTests/TestExpectations.
TODO(ojan): Write a script to file/assign the bugs then create a bot to do this automatically.
'''
expected_stdout = flakytests.FlakyTests.OUTPUT % (
flakytests.FlakyTests.HEADER,
'',
flakytests.FlakyTests.FLAKINESS_DASHBOARD_URL % '') + '\n'
self.assert_execute_outputs(command, options=options, tool=tool, expected_stdout=expected_stdout)