From: Michael Brown Date: Tue, 21 Apr 2026 15:31:52 +0000 (+0100) Subject: [cloud] Retry all Alibaba Cloud API calls X-Git-Url: http://git.ipfire.org/index.cgi?a=commitdiff_plain;ds=sidebyside;p=thirdparty%2Fipxe.git [cloud] Retry all Alibaba Cloud API calls Experimentation suggests Alibaba Cloud API calls are extremely unreliable, with a failure rate around 1%. It is therefore necessary to allow for retrying basically every API call. Some API calls (e.g. DescribeImages or ModifyImageAttribute) are naturally idempotent and so safe to retry. Some non-idempotent API calls (e.g. CopyImage) support explicit idempotence tokens. The remaining API calls may simply fail on a retry, if the original request happened to succeed but failed to return a response. We could write convoluted retry logic around the non-idempotent calls, but this would substantially increase the complexity of the already unnecessarily complex code. For now, we assume that retrying non-idempotent requests is probably more likely to fix transient failures than to cause additional problems. Signed-off-by: Michael Brown --- diff --git a/contrib/cloud/ali-import b/contrib/cloud/ali-import index 7b5df222a..88c08cb7f 100755 --- a/contrib/cloud/ali-import +++ b/contrib/cloud/ali-import @@ -86,6 +86,29 @@ IPXE_STORAGE_PREFIX = 'ipxe-upload-temp-' POLL_INTERVAL_SEC = 5 POLL_MAX_RETRIES = 100 +# Experimentation suggests Alibaba Cloud API calls are extremely +# unreliable, with a failure rate around 1%. It is therefore +# necessary to allow for retrying basically every API call. +# +# Some API calls (e.g. DescribeImages or ModifyImageAttribute) are +# naturally idempotent and so safe to retry. Some non-idempotent API +# calls (e.g. CopyImage) support explicit idempotence tokens. The +# remaining API calls may simply fail on a retry, if the original +# request happened to succeed but failed to return a response. +# +# We could write convoluted retry logic around the non-idempotent +# calls, but this would substantially increase the complexity of the +# already unnecessarily complex code. For now, we assume that +# retrying non-idempotent requests is probably more likely to fix +# transient failures than to cause additional problems. +# +RUNTIME_OPTS = util.models.RuntimeOptions( + autoretry=True, + max_attempts=5, + connect_timeout=10000, + read_timeout=120000, +) + # For regions in mainland China, the Chinese state censorship laws # prohibit direct access to OSS bucket contents. # @@ -227,13 +250,13 @@ def delete_temp_function(clients, func): """Remove temporary function""" logger.info("delete function %s %s" % (clients.region, func)) assert func.startswith(IPXE_STORAGE_PREFIX) - clients.fc.delete_function(func) + clients.fc.delete_function_with_options(func, {}, RUNTIME_OPTS) def create_temp_function(clients, role): """Create temporary function (and remove any stale temporary functions)""" req = fc.models.ListFunctionsRequest(prefix=IPXE_STORAGE_PREFIX) try: - rsp = clients.fc.list_functions(req) + rsp = clients.fc.list_functions_with_options(req, {}, RUNTIME_OPTS) except openapi.client.UnretryableException: # AliCloud provides no other way to detect non-working regions return None @@ -259,7 +282,7 @@ def create_temp_function(clients, role): timeout=FC_TIMEOUT_SEC, ) req = fc.models.CreateFunctionRequest(body=body) - rsp = clients.fc.create_function(req) + rsp = clients.fc.create_function_with_options(req, {}, RUNTIME_OPTS) logger.info("create function %s %s" % (clients.region, func)) return func @@ -271,13 +294,7 @@ def call_temp_function(clients, func, payload): ) body = json.dumps(payload) req = fc.models.InvokeFunctionRequest(body=body) - run = util.models.RuntimeOptions( - autoretry=True, - max_attempts=5, - connect_timeout=10000, - read_timeout=120000, - ) - rsp = clients.fc.invoke_function_with_options(func, req, hdr, run) + rsp = clients.fc.invoke_function_with_options(func, req, hdr, RUNTIME_OPTS) log = base64.b64decode(rsp.headers.get('x-fc-log-result', b'')).decode() if rsp.status_code != http.HTTPStatus.OK: raise RuntimeError(rsp) @@ -368,7 +385,7 @@ def delete_image(clients, name): image_name=name, image_owner_alias='self', ) - rsp = clients.ecs.describe_images(req) + rsp = clients.ecs.describe_images_with_options(req, RUNTIME_OPTS) for image in rsp.body.images.image or (): logger.info("delete image %s %s (%s)" % (clients.region, image.image_name, image.image_id)) @@ -378,12 +395,14 @@ def delete_image(clients, name): image_id=image.image_id, is_public=False, ) - rsp = clients.ecs.modify_image_share_permission(req) + rsp = clients.ecs.modify_image_share_permission_with_options( + req, RUNTIME_OPTS + ) req = ecs.models.DeleteImageRequest( region_id=clients.region, image_id=image.image_id ) - rsp = clients.ecs.delete_image(req) + rsp = clients.ecs.delete_image_with_options(req, RUNTIME_OPTS) def wait_for_task(clients, task_id): """Wait for task to complete""" @@ -394,7 +413,10 @@ def wait_for_task(clients, task_id): region_id=clients.region, task_ids=task_id, ) - rsp = clients.ecs.describe_tasks(req) + try: + rsp = clients.ecs.describe_tasks_with_options(req, RUNTIME_OPTS) + except openapi.client.UnretryableException: + continue assert len(rsp.body.task_set.task) == 1 assert rsp.body.task_set.task[0].task_id == task_id status = rsp.body.task_set.task[0].task_status @@ -412,7 +434,10 @@ def wait_for_image(clients, image_id): region_id=clients.region, image_id=image_id, ) - rsp = clients.ecs.describe_images(req) + try: + rsp = clients.ecs.describe_images_with_options(req, RUNTIME_OPTS) + except openapi.client.UnretryableException: + continue if len(rsp.body.images.image): assert len(rsp.body.images.image) == 1 assert rsp.body.images.image[0].image_id == image_id @@ -437,8 +462,9 @@ def import_image(clients, image, bucket): architecture=image.arch, boot_mode=image.mode, disk_device_mapping=[disk], + client_token=str(uuid4()), ) - rsp = clients.ecs.import_image(req) + rsp = clients.ecs.import_image_with_options(req, RUNTIME_OPTS) image_id = rsp.body.image_id task_id = rsp.body.task_id wait_for_task(clients, task_id) @@ -456,8 +482,9 @@ def copy_image(clients, image, image_id, censored): image_id=image_id, destination_region_id=censored.region, destination_image_name=image.name, + client_token=str(uuid4()), ) - rsp = clients.ecs.copy_image(req) + rsp = clients.ecs.copy_image_with_options(req, RUNTIME_OPTS) copy_id = rsp.body.image_id wait_for_image(censored, copy_id) logger.info("image %s %s (%s)" % (censored.region, image.name, copy_id)) @@ -471,14 +498,16 @@ def finalise_image(clients, image, image_id): image_id=image_id, image_family=image.family, ) - rsp = clients.ecs.modify_image_attribute(req) + rsp = clients.ecs.modify_image_attribute_with_options(req, RUNTIME_OPTS) if image.public: req = ecs.models.ModifyImageSharePermissionRequest( region_id=clients.region, image_id=image_id, is_public=True, ) - rsp = clients.ecs.modify_image_share_permission(req) + rsp = clients.ecs.modify_image_share_permission_with_options( + req, RUNTIME_OPTS + ) # Parse command-line arguments parser = argparse.ArgumentParser(description="Import Alibaba Cloud image")