Skip to content

Error handling

Retry behavior

Errors inside a step

When your code throws an exception from inside a step, the SDK retries the step according to its retry strategy. The SDK checkpoints each retry attempt and ends the Lambda invocation; the backend then re-invokes the Lambda function at the required time. This means you do not consume compute time while waiting between retry attempts.

Errors outside a step

When your code throws an exception from outside a step, the SDK marks the execution as FAILED and returns the error immediately. The SDK does not retry it. Wrap error-prone code in a step for automatic retry.

When retries are exhausted

When a step exhausts all retry attempts, the SDK checkpoints the final error and throws it to your handler. You can catch it there and handle it as required.

import { withDurableExecution, StepError } from "@aws/durable-execution-sdk-js";

/**
 * Durable handler that runs the "process-order" step and turns a StepError
 * raised by that step into an error payload. Any other error is re-thrown.
 */
export const handler = withDurableExecution(async (event, context) => {
  try {
    const processed = await context.step("process-order", async (stepCtx) => {
      const { orderId } = event as { orderId?: string };
      if (orderId) {
        return { orderId, status: "processed" };
      }
      throw new Error("orderId is required");
    });
    return processed;
  } catch (err) {
    // Only StepError is handled here; everything else propagates unchanged.
    if (!(err instanceof StepError)) {
      throw err;
    }
    const causeMessage = err.cause?.message;
    context.logger.error("Step failed", { cause: causeMessage });
    return { error: causeMessage ?? "Step failed" };
  }
});
from aws_durable_execution_sdk_python.context import DurableContext, StepContext, durable_step
from aws_durable_execution_sdk_python.exceptions import CallableRuntimeError
from aws_durable_execution_sdk_python.execution import durable_execution


@durable_step
def process_order(step_context: StepContext, order_id: str) -> dict:
    """Return a processed-order record, or raise ValueError for an empty order_id."""
    if order_id:
        return {"order_id": order_id, "status": "processed"}
    raise ValueError("order_id is required")


@durable_execution
def lambda_handler(event: dict, context: DurableContext) -> dict:
    """Run the process_order step and map a CallableRuntimeError to an error payload."""
    try:
        outcome = context.step(process_order(event.get("orderId", "")))
    except CallableRuntimeError as step_error:
        # Log the error type, then return the error message to the caller.
        context.logger.error("Step failed", extra={"error_type": step_error.error_type})
        return {"error": step_error.message}
    else:
        return outcome
import java.util.Map;
import software.amazon.lambda.durable.DurableContext;
import software.amazon.lambda.durable.DurableHandler;
import software.amazon.lambda.durable.exception.StepFailedException;

/**
 * Example handler that runs a single "process-order" step and turns a
 * {@link StepFailedException} from that step into an error response.
 */
public class BasicErrorHandling extends DurableHandler<Map<String, String>, Map<String, Object>> {

    /**
     * Runs the "process-order" step and wraps its result under the "result" key.
     *
     * @param event   request payload; the step reads its "orderId" entry
     * @param context durable context used to run the step and to log
     * @return a map with "result" on success, or "error" when the step failed
     */
    @Override
    public Map<String, Object> handleRequest(Map<String, String> event, DurableContext context) {
        Map<String, String> processed;
        try {
            processed = context.step("process-order", Map.class, stepCtx -> {
                String orderId = event.get("orderId");
                if (orderId != null && !orderId.isEmpty()) {
                    return Map.of("orderId", orderId, "status", "processed");
                }
                throw new IllegalArgumentException("orderId is required");
            });
        } catch (StepFailedException failure) {
            context.getLogger().error("Step failed: " + failure.getMessage());
            return Map.of("error", failure.getErrorObject().errorMessage());
        }
        return Map.of("result", processed);
    }
}

Replay throws the same error

When the SDK replays a completed operation from its checkpoint, it returns the checkpointed result without re-running the step body. If the checkpointed result was an error, the SDK re-throws that same error at the same point in your code. Your handler sees the same exception it did on the original execution.

Exception types

Each SDK wraps step failures in its own exception type. The type carries the original error and the operation details.

graph TD
  DOE[DurableOperationError]
  DOE --> StepError
  DOE --> CallbackError
  DOE --> CallbackTimeoutError
  DOE --> CallbackSubmitterError
  DOE --> InvokeError
  DOE --> ChildContextError
  DOE --> WaitForConditionError
  SIE[StepInterruptedError]
import {
  DurableOperationError,
  StepError,
  CallbackError,
  CallbackTimeoutError,
  CallbackSubmitterError,
  InvokeError,
  ChildContextError,
  WaitForConditionError,
  StepInterruptedError,
} from "@aws/durable-execution-sdk-js";

// DurableOperationError
//   StepError              — step failed after retries exhausted
//   CallbackError          — callback operation failed
//   CallbackTimeoutError   — callback timed out
//   CallbackSubmitterError — callback submitter failed
//   InvokeError            — invoke operation failed
//   ChildContextError      — child context failed
//   WaitForConditionError  — wait-for-condition failed
//
// StepInterruptedError     — at-most-once step interrupted before checkpoint
//   (not a DurableOperationError; thrown directly)

export {
  DurableOperationError,
  StepError,
  CallbackError,
  CallbackTimeoutError,
  CallbackSubmitterError,
  InvokeError,
  ChildContextError,
  WaitForConditionError,
  StepInterruptedError,
};

DurableOperationError is the base class for all operation-level failures. Each subclass corresponds to a specific operation type. The cause property holds the original error your code threw.

StepInterruptedError is not a DurableOperationError. The SDK throws it when an at-most-once step started but Lambda was interrupted before the SDK checkpointed the result. See Step interrupted below.

graph TD
  DEE[DurableExecutionsError]
  DEE --> UnrecoverableError
  DEE --> ValidationError
  DEE --> SerDesError
  DEE --> UserlandError
  UnrecoverableError --> ExecutionError
  UnrecoverableError --> InvocationError
  ExecutionError --> CallbackError
  InvocationError --> StepInterruptedError
  UserlandError --> CallableRuntimeError
from aws_durable_execution_sdk_python.exceptions import (
    DurableExecutionsError,
    UnrecoverableError,
    ExecutionError,
    InvocationError,
    CallbackError,
    ValidationError,
    SerDesError,
    StepInterruptedError,
    CallableRuntimeError,
    UserlandError,
)

# DurableExecutionsError
#   UnrecoverableError
#     ExecutionError         — fails execution without retry
#       CallbackError        — callback handling failed
#     InvocationError        — triggers Lambda retry
#       StepInterruptedError — at-most-once step interrupted
#   ValidationError          — invalid arguments to SDK operations
#   SerDesError              — serialization or deserialization failed
#   UserlandError
#     CallableRuntimeError   — wraps exceptions thrown inside step functions

DurableExecutionsError is the base class for all SDK exceptions.

CallableRuntimeError wraps any exception your code throws inside a step. Its error_type, message, data, and stack_trace attributes carry the details of the original exception.

ExecutionError fails the execution without retry. InvocationError causes Lambda to retry the entire invocation. Both carry a termination_reason attribute.

StepInterruptedError is a subclass of InvocationError. The SDK raises it when an at-most-once step started but Lambda was interrupted before the SDK checkpointed the result. See Step interrupted below.

graph TD
  DEE[DurableExecutionException]
  DEE --> UDEE[UnrecoverableDurableExecutionException]
  DEE --> DOE[DurableOperationException]
  DEE --> SerDesException
  UDEE --> IllegalDurableOperationException
  UDEE --> NonDeterministicExecutionException
  DOE --> StepException
  DOE --> CallbackException
  StepException --> StepFailedException
  StepException --> StepInterruptedException
  CallbackException --> CallbackFailedException
import software.amazon.lambda.durable.exception.DurableExecutionException;
import software.amazon.lambda.durable.exception.DurableOperationException;
import software.amazon.lambda.durable.exception.UnrecoverableDurableExecutionException;
import software.amazon.lambda.durable.exception.StepException;
import software.amazon.lambda.durable.exception.StepFailedException;
import software.amazon.lambda.durable.exception.StepInterruptedException;
import software.amazon.lambda.durable.exception.CallbackException;
import software.amazon.lambda.durable.exception.CallbackFailedException;
import software.amazon.lambda.durable.exception.SerDesException;
import software.amazon.lambda.durable.exception.IllegalDurableOperationException;
import software.amazon.lambda.durable.exception.NonDeterministicExecutionException;

// DurableExecutionException
//   UnrecoverableDurableExecutionException — execution terminated immediately
//     IllegalDurableOperationException     — illegal SDK operation detected
//     NonDeterministicExecutionException   — non-deterministic replay detected
//   DurableOperationException              — operation-level failure with details
//     StepException
//       StepFailedException                — step failed after retries exhausted
//       StepInterruptedException           — at-most-once step interrupted
//     CallbackException
//       CallbackFailedException            — callback failed with error from external system
//   SerDesException                        — serialization or deserialization failed

DurableExecutionException is the base class for all SDK exceptions.

DurableOperationException carries the failed Operation and its ErrorObject. Use getOperation(), getErrorObject(), and getOperationStatus() to inspect the failure.

StepFailedException is thrown when a step exhausts all retry attempts.

StepInterruptedException is thrown when an at-most-once step started but Lambda was interrupted before the SDK checkpointed the result. See Step interrupted below.

Validation errors

The SDK does not retry validation errors. The SDK throws validation errors when you pass invalid arguments to an SDK operation, such as a negative duration or an empty operation name.

The SDK throws TypeError for invalid configuration values.

import {
  withDurableExecution,
  createRetryStrategy,
} from "@aws/durable-execution-sdk-js";

/**
 * Durable handler showing that an invalid retry configuration is rejected
 * with a TypeError, which is logged and mapped to an error payload.
 */
export const handler = withDurableExecution(async (event, context) => {
  try {
    // The SDK throws a TypeError for invalid configuration values;
    // here the initial delay is negative.
    const retryStrategy = createRetryStrategy({
      maxAttempts: 3,
      initialDelay: { seconds: -1 }, // invalid: negative delay
    });
  } catch (err) {
    // Anything that is not a TypeError is re-thrown untouched.
    if (!(err instanceof TypeError)) {
      throw err;
    }
    context.logger.error("Invalid SDK configuration", { message: err.message });
    return { error: "InvalidConfiguration" };
  }
  return { status: "ok" };
});

The SDK raises ValidationError for invalid configuration values.

from aws_durable_execution_sdk_python.config import Duration
from aws_durable_execution_sdk_python.context import DurableContext
from aws_durable_execution_sdk_python.exceptions import ValidationError
from aws_durable_execution_sdk_python.execution import durable_execution


@durable_execution
def lambda_handler(event: dict, context: DurableContext) -> dict:
    """Show that an invalid Duration is rejected with ValidationError.

    Returns an "InvalidConfiguration" error payload when ValidationError is
    raised, otherwise {"status": "ok"}.
    """
    # The SDK raises ValidationError for invalid configuration values.
    # For example, passing a negative duration:
    try:
        duration = Duration(seconds=-1)  # invalid
    except ValidationError as e:
        # "message" is a reserved LogRecord attribute: the standard logging
        # module raises KeyError when it appears in `extra`, so the detail is
        # logged under a different key.
        context.logger.error("Invalid SDK configuration", extra={"error_message": str(e)})
        return {"error": "InvalidConfiguration", "message": str(e)}
    return {"status": "ok"}

The SDK throws IllegalArgumentException for invalid configuration values.

import java.time.Duration;
import java.util.Map;
import software.amazon.lambda.durable.DurableContext;
import software.amazon.lambda.durable.DurableHandler;

/**
 * Example handler showing that an invalid wait duration is rejected with
 * {@link IllegalArgumentException}.
 */
public class ValidationErrorExample extends DurableHandler<Map<String, Object>, Map<String, Object>> {

    /**
     * Requests a wait shorter than one second and reports the resulting
     * validation failure.
     *
     * @param event   request payload (not read by this example)
     * @param context durable context used to request the wait and to log
     * @return an "InvalidConfiguration" error map when the wait is rejected,
     *         otherwise a map with status "ok"
     */
    @Override
    public Map<String, Object> handleRequest(Map<String, Object> event, DurableContext context) {
        // The SDK throws IllegalArgumentException for invalid configuration values;
        // this duration is shorter than the 1 second minimum for context.wait().
        Duration tooShort = Duration.ofMillis(500);
        try {
            context.wait("short-wait", tooShort);
        } catch (IllegalArgumentException rejected) {
            String reason = rejected.getMessage();
            context.getLogger().error("Invalid SDK configuration: " + reason);
            return Map.of("error", "InvalidConfiguration", "message", reason);
        }
        return Map.of("status", "ok");
    }
}

Step interrupted

When you configure a step with at-most-once semantics, the SDK runs the step body at most once per retry attempt. If Lambda is interrupted after the step body starts but before the SDK checkpoints the result, the SDK does not re-run the step on the next invocation. Instead, it throws a step-interrupted exception.

Use at-most-once semantics for operations with side effects that must not run more than once, such as charging a payment or sending a notification. When you catch a step-interrupted exception, check the external system to determine whether the operation succeeded before deciding how to proceed.

import {
  withDurableExecution,
  StepSemantics,
  StepInterruptedError,
} from "@aws/durable-execution-sdk-js";

/**
 * Durable handler that charges a payment in an at-most-once-per-retry step
 * and reports an "unknown" status when that step is interrupted.
 */
export const handler = withDurableExecution(async (event, context) => {
  try {
    const charge = await context.step(
      "charge-payment",
      async (stepCtx) => {
        const { amount } = event as { amount: number };
        return chargePayment(amount);
      },
      { semantics: StepSemantics.AtMostOncePerRetry },
    );
    return { status: "charged", result: charge };
  } catch (err) {
    // Only the interrupted-step case is handled; other errors propagate.
    if (!(err instanceof StepInterruptedError)) {
      throw err;
    }
    // The step body started, but Lambda was interrupted before the SDK
    // checkpointed its result, and the SDK will not run it again on the next
    // invocation. Whether the charge went through has to be verified against
    // the payment system.
    context.logger.warn("Payment step interrupted — check payment system");
    return { status: "unknown" };
  }
});

/** Placeholder charge: returns the requested amount under the `charged` key. */
function chargePayment(amount: number): { charged: number } {
  const receipt = { charged: amount };
  return receipt;
}
from aws_durable_execution_sdk_python.config import StepConfig, StepSemantics
from aws_durable_execution_sdk_python.context import DurableContext, StepContext, durable_step
from aws_durable_execution_sdk_python.exceptions import StepInterruptedError
from aws_durable_execution_sdk_python.execution import durable_execution


@durable_step
def charge_payment(step_context: StepContext, amount: float) -> dict:
    """Step body: delegate the charge to charge_external_system and return its result."""
    receipt = charge_external_system(amount)
    return receipt


@durable_execution
def lambda_handler(event: dict, context: DurableContext) -> dict:
    """Charge a payment in an at-most-once-per-retry step.

    Returns a "charged" payload with the step result, or {"status": "unknown"}
    when the step is interrupted.
    """
    try:
        outcome = context.step(
            charge_payment(event["amount"]),
            config=StepConfig(step_semantics=StepSemantics.AT_MOST_ONCE_PER_RETRY),
        )
    except StepInterruptedError:
        # The step body started, but Lambda was interrupted before the SDK
        # checkpointed its result, and the SDK will not run it again on the
        # next invocation. Whether the charge went through has to be verified
        # against the payment system.
        context.logger.warning("Payment step interrupted — check payment system")
        return {"status": "unknown"}
    else:
        return {"status": "charged", "result": outcome}


def charge_external_system(amount: float) -> dict:
    """Placeholder charge: return the requested amount under the "charged" key."""
    receipt = dict(charged=amount)
    return receipt
import java.util.Map;
import software.amazon.lambda.durable.DurableContext;
import software.amazon.lambda.durable.DurableHandler;
import software.amazon.lambda.durable.config.StepConfig;
import software.amazon.lambda.durable.config.StepSemantics;
import software.amazon.lambda.durable.exception.StepInterruptedException;

/**
 * Example handler that charges a payment in an at-most-once-per-retry step and
 * reports an "unknown" status when that step is interrupted.
 */
public class StepInterrupted extends DurableHandler<Map<String, Object>, Map<String, Object>> {

    /**
     * Runs the "charge-payment" step with at-most-once-per-retry semantics.
     *
     * @param event   request payload; its "amount" entry must be a number
     * @param context durable context used to run the step and to log
     * @return a map with status "charged" and the step result, or status
     *         "unknown" when the step was interrupted
     */
    @Override
    public Map<String, Object> handleRequest(Map<String, Object> event, DurableContext context) {
        StepConfig config = StepConfig.builder()
                .semantics(StepSemantics.AT_MOST_ONCE_PER_RETRY)
                .build();
        try {
            Map<String, Object> result = context.step(
                    "charge-payment", Map.class,
                    // The payload is an untyped map, so a whole-number amount may be
                    // an Integer or Long rather than a Double. Converting through
                    // Number accepts any numeric type instead of failing the cast.
                    stepCtx -> chargePayment(((Number) event.get("amount")).doubleValue()),
                    config);
            return Map.of("status", "charged", "result", result);
        } catch (StepInterruptedException e) {
            // The step started but Lambda was interrupted before the result was
            // checkpointed. The SDK will not re-run the step on the next invocation.
            // Inspect your payment system to determine whether the charge succeeded.
            context.getLogger().warn("Payment step interrupted — check payment system");
            return Map.of("status", "unknown");
        }
    }

    /** Placeholder charge: returns the requested amount under the "charged" key. */
    private Map<String, Object> chargePayment(double amount) {
        return Map.of("charged", amount);
    }
}

Serialization errors

The SDK serializes step results to checkpoint storage. The default serializer handles standard types for each language. Custom serializers should throw the appropriate exception type when they encounter a value they cannot handle.

When a custom Serdes implementation throws during serialize or deserialize, the SDK throws SerdesFailedError as an unhandled exception. The durable functions backend retries the invocation.

import { withDurableExecution, Serdes, SerdesContext } from "@aws/durable-execution-sdk-js";

// Custom Serdes built directly on JSON.stringify / JSON.parse with no error
// handling, so anything they throw (for example on circular references)
// propagates out of the Serdes.
// The SDK terminates the Lambda invocation when serialization fails.
const strictSerdes: Serdes<unknown> = {
  serialize: async (value: unknown, _context: SerdesContext) => JSON.stringify(value),
  // An absent payload stays undefined; everything else is parsed as JSON.
  deserialize: async (data: string | undefined, _context: SerdesContext) =>
    data === undefined ? undefined : JSON.parse(data),
};

/**
 * Durable handler that runs the "build-result" step with the custom
 * `strictSerdes` and returns the step's result.
 */
export const handler = withDurableExecution(async (event, context) => {
  const built = await context.step(
    "build-result",
    // The returned object is plain JSON-compatible data.
    async (stepCtx) => ({ message: "hello", timestamp: Date.now() }),
    { serdes: strictSerdes },
  );
  return built;
});

When serialization or deserialization fails, the SDK raises SerDesError, returns a FAILED status response, and does not retry.

from aws_durable_execution_sdk_python.context import DurableContext, StepContext, durable_step
from aws_durable_execution_sdk_python.exceptions import SerDesError
from aws_durable_execution_sdk_python.execution import durable_execution


@durable_step
def build_result(step_context: StepContext) -> dict:
    """Step body: return a small dict that the default serdes can serialize."""
    payload = {"message": "hello"}
    return payload


@durable_execution
def lambda_handler(event: dict, context: DurableContext) -> dict:
    """Run the build_result step, logging and re-raising any SerDesError."""
    # SerDesError is raised when the SDK cannot serialize or deserialize a value.
    # The default serdes handles standard Python types. Custom serdes implementations
    # should raise SerDesError when they encounter a value they cannot handle.
    try:
        result = context.step(build_result())
        return result
    except SerDesError as e:
        # "message" is a reserved LogRecord attribute: the standard logging
        # module raises KeyError when it appears in `extra`, which would mask
        # the SerDesError, so the detail is logged under a different key.
        context.logger.error("Serialization failed", extra={"error_message": str(e)})
        raise

When serialization or deserialization fails, the SDK throws SerDesException. The executor catches it, returns a FAILED status response, and does not retry.

import java.util.Map;
import software.amazon.lambda.durable.DurableContext;
import software.amazon.lambda.durable.DurableHandler;
import software.amazon.lambda.durable.exception.SerDesException;

/**
 * Example handler that runs a "build-result" step and logs, then re-throws,
 * any {@link SerDesException} raised while doing so.
 */
public class SerdesErrorExample extends DurableHandler<Map<String, Object>, Map<String, Object>> {

    /**
     * Runs the "build-result" step and returns its result.
     *
     * @param event   request payload (not read by this example)
     * @param context durable context used to run the step and to log
     * @return the map produced by the step
     * @throws SerDesException re-thrown after logging
     */
    @Override
    public Map<String, Object> handleRequest(Map<String, Object> event, DurableContext context) {
        // The SDK throws SerDesException when it cannot serialize or deserialize
        // a value. A custom SerDes implementation should throw the same type for
        // values it cannot handle.
        try {
            Map<String, Object> built =
                    context.step("build-result", Map.class, stepCtx -> Map.of("message", "hello"));
            return built;
        } catch (SerDesException failure) {
            context.getLogger().error("Serialization failed: " + failure.getMessage());
            throw failure;
        }
    }
}

See also