Commit 888c0fe7 authored by George Engelbrecht's avatar George Engelbrecht Committed by LUCI CQ

ReadHttpResponse: Handle 429s with retries and adjust retry count

We've had reliability issues with gerrit, primarily related to
429 status codes but also DDoS bans. The light DDoS bans can be
very short lived and we extend retries to handle this. Longer
term bans can take up to an hour to lift.

BUG=chromium:1071590
TEST=./gerrit_client_test.py

Change-Id: Iaf68c0d9cc7375aa58367ec0447d54a99f8ebf39
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/tools/depot_tools/+/2153089
Commit-Queue: George Engelbrecht <engeg@google.com>
Reviewed-by: 's avatarEdward Lesmes <ehmaldonado@chromium.org>
Reviewed-by: 's avatarMike Frysinger <vapier@chromium.org>
parent 8aebb60a
......@@ -44,10 +44,12 @@ else:
from io import StringIO
LOGGER = logging.getLogger()
# With a starting sleep time of 1.5 seconds, 2^n exponential backoff, and seven
# total tries, the sleep time between the first and last tries will be 94.5 sec.
TRY_LIMIT = 3
# With a starting sleep time of 10.0 seconds, x <= [1.8-2.2]x backoff, and five
# total tries, the sleep time between the first and last tries will be ~7 min.
TRY_LIMIT = 5
SLEEP_TIME = 10.0
MAX_BACKOFF = 2.2
MIN_BACKOFF = 1.8
# Controls the transport protocol used to communicate with Gerrit.
# This is parameterized primarily to enable GerritTestCase.
......@@ -302,7 +304,7 @@ class GceAuthenticator(Authenticator):
@staticmethod
def _get(url, **kwargs):
next_delay_sec = 1
next_delay_sec = 1.0
for i in range(TRY_LIMIT):
p = urllib.parse.urlparse(url)
if p.scheme not in ('http', 'https'):
......@@ -323,7 +325,7 @@ class GceAuthenticator(Authenticator):
LOGGER.info('Will retry in %d seconds (%d more times)...',
next_delay_sec, TRY_LIMIT - i - 1)
time_sleep(next_delay_sec)
next_delay_sec *= 2
next_delay_sec *= random.uniform(MIN_BACKOFF, MAX_BACKOFF)
return None, None
@classmethod
......@@ -418,7 +420,7 @@ def ReadHttpResponse(conn, accept_statuses=frozenset([200])):
Common additions include 204, 400, and 404.
Returns: A string buffer containing the connection's reply.
"""
sleep_time = 1.5
sleep_time = SLEEP_TIME
for idx in range(TRY_LIMIT):
before_response = time.time()
response, contents = conn.request(**conn.req_params)
......@@ -434,9 +436,10 @@ def ReadHttpResponse(conn, accept_statuses=frozenset([200])):
# If response.status is an accepted status,
# or response.status < 500 then the result is final; break retry loop.
# If the response is 404/409 it might be because of replication lag,
# so keep trying anyway.
# so keep trying anyway. If it is 429, it is generally ok to retry after
# a backoff.
if (response.status in accept_statuses
or response.status < 500 and response.status not in [404, 409]):
or response.status < 500 and response.status not in [404, 409, 429]):
LOGGER.debug('got response %d for %s %s', response.status,
conn.req_params['method'], conn.req_params['uri'])
# If 404 was in accept_statuses, then it's expected that the file might
......@@ -459,7 +462,7 @@ def ReadHttpResponse(conn, accept_statuses=frozenset([200])):
LOGGER.info('Will retry in %d seconds (%d more times)...',
sleep_time, TRY_LIMIT - idx - 1)
time_sleep(sleep_time)
sleep_time = sleep_time * 2
sleep_time *= random.uniform(MIN_BACKOFF, MAX_BACKOFF)
# end of retries loop
if response.status in accept_statuses:
......
......@@ -202,8 +202,8 @@ class GceAuthenticatorTest(unittest.TestCase):
def testIsGce_500(self):
httplib2.Http().request.return_value = (mock.Mock(status=500), None)
self.assertFalse(self.GceAuthenticator.is_gce())
self.assertEqual(
[mock.call(1), mock.call(2)], gerrit_util.time_sleep.mock_calls)
last_call = gerrit_util.time_sleep.mock_calls[-1]
self.assertLessEqual(last_call, mock.call(43.0))
def testIsGce_FailsThenSucceeds(self):
response = mock.Mock(status=200)
......@@ -363,17 +363,23 @@ class GerritUtilTest(unittest.TestCase):
self.assertEqual(404, cm.exception.http_status)
def testReadHttpResponse_ServerError(self):
def readHttpResponse_ServerErrorHelper(self, status):
conn = mock.Mock(req_params={'uri': 'uri', 'method': 'method'})
conn.request.return_value = (mock.Mock(status=500), b'')
conn.request.return_value = (mock.Mock(status=status), b'')
with self.assertRaises(gerrit_util.GerritError) as cm:
gerrit_util.ReadHttpResponse(conn)
self.assertEqual(500, cm.exception.http_status)
self.assertEqual(status, cm.exception.http_status)
self.assertEqual(gerrit_util.TRY_LIMIT, len(conn.request.mock_calls))
self.assertEqual(
[mock.call(1.5), mock.call(3)], gerrit_util.time_sleep.mock_calls)
last_call = gerrit_util.time_sleep.mock_calls[-1]
self.assertLessEqual(last_call, mock.call(422.0))
def testReadHttpResponse_ServerError(self):
self.readHttpResponse_ServerErrorHelper(status=404)
self.readHttpResponse_ServerErrorHelper(status=409)
self.readHttpResponse_ServerErrorHelper(status=429)
self.readHttpResponse_ServerErrorHelper(status=500)
def testReadHttpResponse_ServerErrorAndSuccess(self):
conn = mock.Mock(req_params={'uri': 'uri', 'method': 'method'})
......@@ -384,7 +390,7 @@ class GerritUtilTest(unittest.TestCase):
self.assertEqual('content✔', gerrit_util.ReadHttpResponse(conn).getvalue())
self.assertEqual(2, len(conn.request.mock_calls))
gerrit_util.time_sleep.assert_called_once_with(1.5)
gerrit_util.time_sleep.assert_called_once_with(10.0)
def testReadHttpResponse_Expected404(self):
conn = mock.Mock()
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment