git.ipfire.org Git - thirdparty/ipxe.git/commitdiff
[cloud] Retry all Alibaba Cloud API calls master
author: Michael Brown <mcb30@ipxe.org>
Tue, 21 Apr 2026 15:31:52 +0000 (16:31 +0100)
committer: Michael Brown <mcb30@ipxe.org>
Tue, 21 Apr 2026 15:33:29 +0000 (16:33 +0100)
Experimentation suggests Alibaba Cloud API calls are extremely
unreliable, with a failure rate around 1%.  It is therefore necessary
to allow for retrying basically every API call.

Some API calls (e.g. DescribeImages or ModifyImageAttribute) are
naturally idempotent and so safe to retry.  Some non-idempotent API
calls (e.g. CopyImage) support explicit idempotence tokens.  The
remaining API calls may simply fail on a retry, if the original
request happened to succeed but failed to return a response.

We could write convoluted retry logic around the non-idempotent calls,
but this would substantially increase the complexity of the already
unnecessarily complex code.  For now, we assume that retrying
non-idempotent requests is probably more likely to fix transient
failures than to cause additional problems.

Signed-off-by: Michael Brown <mcb30@ipxe.org>
contrib/cloud/ali-import

index 7b5df222aab660520e36abd31a81268dcda5ad3b..88c08cb7fffeb638132cf7f24d061f15cae4081a 100755 (executable)
@@ -86,6 +86,29 @@ IPXE_STORAGE_PREFIX = 'ipxe-upload-temp-'
 POLL_INTERVAL_SEC = 5
 POLL_MAX_RETRIES = 100
 
+# Experimentation suggests Alibaba Cloud API calls are extremely
+# unreliable, with a failure rate around 1%.  It is therefore
+# necessary to allow for retrying basically every API call.
+#
+# Some API calls (e.g. DescribeImages or ModifyImageAttribute) are
+# naturally idempotent and so safe to retry.  Some non-idempotent API
+# calls (e.g. CopyImage) support explicit idempotence tokens.  The
+# remaining API calls may simply fail on a retry, if the original
+# request happened to succeed but failed to return a response.
+#
+# We could write convoluted retry logic around the non-idempotent
+# calls, but this would substantially increase the complexity of the
+# already unnecessarily complex code.  For now, we assume that
+# retrying non-idempotent requests is probably more likely to fix
+# transient failures than to cause additional problems.
+#
+RUNTIME_OPTS = util.models.RuntimeOptions(
+    autoretry=True,
+    max_attempts=5,
+    connect_timeout=10000,
+    read_timeout=120000,
+)
+
 # For regions in mainland China, the Chinese state censorship laws
 # prohibit direct access to OSS bucket contents.
 #
@@ -227,13 +250,13 @@ def delete_temp_function(clients, func):
     """Remove temporary function"""
     logger.info("delete function %s %s" % (clients.region, func))
     assert func.startswith(IPXE_STORAGE_PREFIX)
-    clients.fc.delete_function(func)
+    clients.fc.delete_function_with_options(func, {}, RUNTIME_OPTS)
 
 def create_temp_function(clients, role):
     """Create temporary function (and remove any stale temporary functions)"""
     req = fc.models.ListFunctionsRequest(prefix=IPXE_STORAGE_PREFIX)
     try:
-        rsp = clients.fc.list_functions(req)
+        rsp = clients.fc.list_functions_with_options(req, {}, RUNTIME_OPTS)
     except openapi.client.UnretryableException:
         # AliCloud provides no other way to detect non-working regions
         return None
@@ -259,7 +282,7 @@ def create_temp_function(clients, role):
         timeout=FC_TIMEOUT_SEC,
     )
     req = fc.models.CreateFunctionRequest(body=body)
-    rsp = clients.fc.create_function(req)
+    rsp = clients.fc.create_function_with_options(req, {}, RUNTIME_OPTS)
     logger.info("create function %s %s" % (clients.region, func))
     return func
 
@@ -271,13 +294,7 @@ def call_temp_function(clients, func, payload):
     )
     body = json.dumps(payload)
     req = fc.models.InvokeFunctionRequest(body=body)
-    run = util.models.RuntimeOptions(
-        autoretry=True,
-        max_attempts=5,
-        connect_timeout=10000,
-        read_timeout=120000,
-    )
-    rsp = clients.fc.invoke_function_with_options(func, req, hdr, run)
+    rsp = clients.fc.invoke_function_with_options(func, req, hdr, RUNTIME_OPTS)
     log = base64.b64decode(rsp.headers.get('x-fc-log-result', b'')).decode()
     if rsp.status_code != http.HTTPStatus.OK:
         raise RuntimeError(rsp)
@@ -368,7 +385,7 @@ def delete_image(clients, name):
         image_name=name,
         image_owner_alias='self',
     )
-    rsp = clients.ecs.describe_images(req)
+    rsp = clients.ecs.describe_images_with_options(req, RUNTIME_OPTS)
     for image in rsp.body.images.image or ():
         logger.info("delete image %s %s (%s)" %
                     (clients.region, image.image_name, image.image_id))
@@ -378,12 +395,14 @@ def delete_image(clients, name):
                 image_id=image.image_id,
                 is_public=False,
             )
-            rsp = clients.ecs.modify_image_share_permission(req)
+            rsp = clients.ecs.modify_image_share_permission_with_options(
+                req, RUNTIME_OPTS
+            )
         req = ecs.models.DeleteImageRequest(
             region_id=clients.region,
             image_id=image.image_id
         )
-        rsp = clients.ecs.delete_image(req)
+        rsp = clients.ecs.delete_image_with_options(req, RUNTIME_OPTS)
 
 def wait_for_task(clients, task_id):
     """Wait for task to complete"""
@@ -394,7 +413,10 @@ def wait_for_task(clients, task_id):
             region_id=clients.region,
             task_ids=task_id,
         )
-        rsp = clients.ecs.describe_tasks(req)
+        try:
+            rsp = clients.ecs.describe_tasks_with_options(req, RUNTIME_OPTS)
+        except openapi.client.UnretryableException:
+            continue
         assert len(rsp.body.task_set.task) == 1
         assert rsp.body.task_set.task[0].task_id == task_id
         status = rsp.body.task_set.task[0].task_status
@@ -412,7 +434,10 @@ def wait_for_image(clients, image_id):
             region_id=clients.region,
             image_id=image_id,
         )
-        rsp = clients.ecs.describe_images(req)
+        try:
+            rsp = clients.ecs.describe_images_with_options(req, RUNTIME_OPTS)
+        except openapi.client.UnretryableException:
+            continue
         if len(rsp.body.images.image):
             assert len(rsp.body.images.image) == 1
             assert rsp.body.images.image[0].image_id == image_id
@@ -437,8 +462,9 @@ def import_image(clients, image, bucket):
         architecture=image.arch,
         boot_mode=image.mode,
         disk_device_mapping=[disk],
+        client_token=str(uuid4()),
     )
-    rsp = clients.ecs.import_image(req)
+    rsp = clients.ecs.import_image_with_options(req, RUNTIME_OPTS)
     image_id = rsp.body.image_id
     task_id = rsp.body.task_id
     wait_for_task(clients, task_id)
@@ -456,8 +482,9 @@ def copy_image(clients, image, image_id, censored):
         image_id=image_id,
         destination_region_id=censored.region,
         destination_image_name=image.name,
+        client_token=str(uuid4()),
     )
-    rsp = clients.ecs.copy_image(req)
+    rsp = clients.ecs.copy_image_with_options(req, RUNTIME_OPTS)
     copy_id = rsp.body.image_id
     wait_for_image(censored, copy_id)
     logger.info("image %s %s (%s)" % (censored.region, image.name, copy_id))
@@ -471,14 +498,16 @@ def finalise_image(clients, image, image_id):
         image_id=image_id,
         image_family=image.family,
     )
-    rsp = clients.ecs.modify_image_attribute(req)
+    rsp = clients.ecs.modify_image_attribute_with_options(req, RUNTIME_OPTS)
     if image.public:
         req = ecs.models.ModifyImageSharePermissionRequest(
             region_id=clients.region,
             image_id=image_id,
             is_public=True,
         )
-        rsp = clients.ecs.modify_image_share_permission(req)
+        rsp = clients.ecs.modify_image_share_permission_with_options(
+            req, RUNTIME_OPTS
+        )
 
 # Parse command-line arguments
 parser = argparse.ArgumentParser(description="Import Alibaba Cloud image")