diff --git a/examples/src/test/java/software/amazon/lambda/durable/examples/OtelXRayIntegrationTest.java b/examples/src/test/java/software/amazon/lambda/durable/examples/OtelXRayIntegrationTest.java index 581514a3d..bee76a806 100644 --- a/examples/src/test/java/software/amazon/lambda/durable/examples/OtelXRayIntegrationTest.java +++ b/examples/src/test/java/software/amazon/lambda/durable/examples/OtelXRayIntegrationTest.java @@ -144,14 +144,13 @@ void simpleSteps_producesUnifiedTraceInXRay() throws Exception { // Find the trace that contains our durable spans var durableTrace = allTraces.stream() - .filter(trace -> - trace.segments().stream().anyMatch(seg -> segmentContains(seg, "durable.step:create-greeting"))) + .filter(trace -> trace.segments().stream().anyMatch(seg -> segmentContains(seg, "create-greeting"))) .findFirst() .orElse(null); assertNotNull( durableTrace, - "Expected to find a trace with durable.step:create-greeting segment. " + "Found " + traces.size() + "Expected to find a trace with create-greeting segment. " + "Found " + traces.size() + " traces in the time window. Segment names: " + allTraces.stream() .flatMap(t -> t.segments().stream()) @@ -165,12 +164,10 @@ void simpleSteps_producesUnifiedTraceInXRay() throws Exception { // Verify expected span names appear in the trace assertTrue( - allSegmentText.contains("durable.invocation"), - "Expected durable.invocation span in trace. Segments: " + summarizeSegments(segmentDocuments)); - assertTrue( - allSegmentText.contains("durable.step:create-greeting"), - "Expected durable.step:create-greeting span in trace"); - assertTrue(allSegmentText.contains("durable.step:transform"), "Expected durable.step:transform span in trace"); + allSegmentText.contains("invocation"), + "Expected invocation span in trace. Segments: " + summarizeSegments(segmentDocuments)); + assertTrue(allSegmentText.contains("create-greeting"), "Expected create-greeting span in trace"); + assertTrue(allSegmentText.contains("transform"), "Expected transform span in trace"); // Verify all segments share the same trace ID (single unified trace) var uniqueTraceIds = @@ -218,14 +215,13 @@ void waitAndResume_producesUnifiedTraceAcrossInvocations() throws Exception { // Find the trace containing our durable spans var durableTrace = allTraces.stream() - .filter(trace -> - trace.segments().stream().anyMatch(seg -> segmentContains(seg, "durable.step:before-wait"))) + .filter(trace -> trace.segments().stream().anyMatch(seg -> segmentContains(seg, "before-wait"))) .findFirst() .orElse(null); assertNotNull( durableTrace, - "Expected to find a trace with durable.step:before-wait segment. " + "Found " + traces.size() + "Expected to find a trace with before-wait segment. " + "Found " + traces.size() + " traces in the time window."); // 4. Verify multi-invocation trace structure @@ -234,14 +230,12 @@ void waitAndResume_producesUnifiedTraceAcrossInvocations() throws Exception { var allSegmentText = String.join("\n", segmentDocuments); // Verify spans from BOTH invocations appear in the same trace - assertTrue( - allSegmentText.contains("durable.step:before-wait"), "Expected before-wait span from first invocation"); - assertTrue( - allSegmentText.contains("durable.step:after-wait"), "Expected after-wait span from second invocation"); - assertTrue(allSegmentText.contains("durable.wait:pause"), "Expected wait:pause span in trace"); + assertTrue(allSegmentText.contains("before-wait"), "Expected before-wait span from first invocation"); + assertTrue(allSegmentText.contains("after-wait"), "Expected after-wait span from second invocation"); + assertTrue(allSegmentText.contains("pause"), "Expected wait:pause span in trace"); // Verify multiple invocation spans (one per Lambda invocation) - var invocationCount = countOccurrences(allSegmentText, "durable.invocation"); + var invocationCount = countOccurrences(allSegmentText, "invocation"); assertTrue( invocationCount >= 2, "Expected at least 2 invocation spans (multi-invocation), got " + invocationCount); @@ -264,7 +258,9 @@ private List queryTracesWithRetry(Instant startTime, Instant endTi throws InterruptedException { // Query by durable.invocation service — our spans are in a separate trace from Lambda's // built-in X-Ray segment (durable backend propagates its own trace root) - var filterExpression = "service(\"durable.invocation\")"; + // Filter by the Lambda function's service name — each function has a unique one. + // This avoids picking up traces from other durable functions that share service.name="invocation". + var filterExpression = "service(\"" + functionName + functionNameSuffix + "\")"; for (int attempt = 0; attempt < XRAY_QUERY_RETRIES; attempt++) { var response = xrayClient.getTraceSummaries(GetTraceSummariesRequest.builder() .startTime(startTime) diff --git a/examples/template.yaml b/examples/template.yaml index c9fc1d5e7..d528b8b42 100644 --- a/examples/template.yaml +++ b/examples/template.yaml @@ -370,9 +370,6 @@ Resources: - !Sub - arn:aws:lambda:${AWS::Region}:901920570463:layer:aws-otel-java-agent-${AdotArch}-ver-1-32-0:6 - AdotArch: amd64 - Environment: - Variables: - AWS_LAMBDA_EXEC_WRAPPER: /opt/otel-handler OtelXRayWaitExampleFunction: Type: AWS::Serverless::Function @@ -389,9 +386,6 @@ Resources: - !Sub - arn:aws:lambda:${AWS::Region}:901920570463:layer:aws-otel-java-agent-${AdotArch}-ver-1-32-0:6 - AdotArch: amd64 - Environment: - Variables: - AWS_LAMBDA_EXEC_WRAPPER: /opt/otel-handler RetryInvokeExampleFunction: Type: AWS::Serverless::Function diff --git a/otel-plugin/src/main/java/software/amazon/lambda/durable/otel/OtelPlugin.java b/otel-plugin/src/main/java/software/amazon/lambda/durable/otel/OtelPlugin.java index d2016da93..e75bdead2 100644 --- a/otel-plugin/src/main/java/software/amazon/lambda/durable/otel/OtelPlugin.java +++ b/otel-plugin/src/main/java/software/amazon/lambda/durable/otel/OtelPlugin.java @@ -5,14 +5,17 @@ import static software.amazon.lambda.durable.otel.SpanAttributes.*; import io.opentelemetry.api.common.AttributeKey; +import io.opentelemetry.api.common.Attributes; import io.opentelemetry.api.trace.Span; import io.opentelemetry.api.trace.SpanContext; +import io.opentelemetry.api.trace.SpanKind; import io.opentelemetry.api.trace.StatusCode; import io.opentelemetry.api.trace.TraceFlags; import io.opentelemetry.api.trace.TraceState; import io.opentelemetry.api.trace.Tracer; import io.opentelemetry.context.Context; import io.opentelemetry.context.Scope; +import io.opentelemetry.sdk.resources.Resource; import io.opentelemetry.sdk.trace.SdkTracerProvider; import io.opentelemetry.sdk.trace.SdkTracerProviderBuilder; import java.time.Instant; @@ -128,6 +131,12 @@ public OtelPlugin(SdkTracerProviderBuilder tracerProviderBuilder, ContextExtract public OtelPlugin( SdkTracerProviderBuilder tracerProviderBuilder, ContextExtractor contextExtractor, boolean enableMdc) { this.idGenerator = new DeterministicIdGenerator(); + + // Set service.name to "invocation" — X-Ray uses this as the display name for SERVER spans, + // creating a separate service node in the trace map labeled "invocation". + var resource = Resource.create(Attributes.of(AttributeKey.stringKey("service.name"), "invocation")); + tracerProviderBuilder.setResource(resource); + this.tracerProvider = tracerProviderBuilder.setIdGenerator(idGenerator).build(); this.tracer = tracerProvider.get(INSTRUMENTATION_NAME); this.contextExtractor = contextExtractor; @@ -152,10 +161,17 @@ public void onInvocationStart(InvocationInfo info) { } // If no extracted context, idGenerator falls back to ARN-derived trace ID - // Determine parent context for the invocation span + // Determine parent context for the invocation span. + // When the X-Ray header has a Parent field, create the invocation span as a + // child of that segment. This connects our OTLP-exported spans to the Lambda + // service's X-Ray segments. Context parentContext; - if (extractedContext != null && extractedContext.parentSpanId() != null) { - // Create a remote parent span context from the X-Ray Parent field + var activeSpan = Span.fromContext(Context.current()); + if (activeSpan.getSpanContext().isValid()) { + // An auto-instrumenter (e.g., ADOT agent) is active — parent to its span + parentContext = Context.current(); + } else if (extractedContext != null && extractedContext.parentSpanId() != null) { + // No active instrumenter, but X-Ray header has parent — create remote parent var parentSpanContext = SpanContext.createFromRemoteParent( extractedContext.traceId(), extractedContext.parentSpanId(), @@ -166,8 +182,10 @@ public void onInvocationStart(InvocationInfo info) { parentContext = Context.root(); } - // Create invocation span as child of Lambda's X-Ray segment (via Parent field) - var spanBuilder = tracer.spanBuilder("durable.invocation") + // Create a SERVER span to establish a separate X-Ray service node. + // X-Ray uses service.name for the segment display name. + var spanBuilder = tracer.spanBuilder("invocation") + .setSpanKind(SpanKind.SERVER) .setParent(parentContext) .setAttribute(DURABLE_EXECUTION_ARN, info.durableExecutionArn()) .setAttribute(DURABLE_FIRST_INVOCATION, info.isFirstInvocation()); @@ -227,8 +245,6 @@ public void onInvocationEnd(InvocationEndInfo info) { public void onOperationStart(OperationInfo info) { if (info.id() == null) return; - idGenerator.setNextSpanOperationId(info.id()); - var parentContext = resolveParentContext(info.parentId()); var spanBuilder = tracer.spanBuilder(spanName(info.type(), info.subType(), info.name())) @@ -237,6 +253,19 @@ public void onOperationStart(OperationInfo info) { .setAttribute(DURABLE_OPERATION_ID, info.id()) .setAttribute(DURABLE_OPERATION_TYPE, info.type()); + if (info.isContinuation()) { + // Operation was already started in a prior invocation — use a random span ID + // and add a Link to the deterministic span from the original invocation for correlation. + var deterministicSpanId = idGenerator.generateSpanIdForOperation(info.id()); + var traceId = idGenerator.generateTraceId(); + var linkedSpanContext = + SpanContext.create(traceId, deterministicSpanId, TraceFlags.getSampled(), TraceState.getDefault()); + spanBuilder.addLink(linkedSpanContext); + } else { + // First execution — use deterministic span ID so continuations can link back + idGenerator.setNextSpanOperationId(info.id()); + } + if (info.name() != null) { spanBuilder.setAttribute(DURABLE_OPERATION_NAME, info.name()); } @@ -400,17 +429,17 @@ private Context resolveParentContext(String parentId) { private static String spanName(String type, String subType, String name) { if (name != null) { - return "durable." + (subType != null ? subType.toLowerCase() : type.toLowerCase()) + ":" + name; + return name; } - return "durable." + (subType != null ? subType.toLowerCase() : type.toLowerCase()); + return subType != null ? subType.toLowerCase() : type.toLowerCase(); } private static String attemptSpanName(String type, String subType, String name, Integer attempt) { var base = spanName(type, subType, name); if (attempt != null) { - return base + " [attempt " + attempt + "]"; + return base + " attempt " + attempt; } - return base + " [fn]"; + return base; } private static String attemptKey(String operationId, Integer attempt) { diff --git a/otel-plugin/src/main/java/software/amazon/lambda/durable/otel/XRayContextExtractor.java b/otel-plugin/src/main/java/software/amazon/lambda/durable/otel/XRayContextExtractor.java index 9eb0f0664..f458004bf 100644 --- a/otel-plugin/src/main/java/software/amazon/lambda/durable/otel/XRayContextExtractor.java +++ b/otel-plugin/src/main/java/software/amazon/lambda/durable/otel/XRayContextExtractor.java @@ -25,14 +25,21 @@ public class XRayContextExtractor implements ContextExtractor { private static final Logger logger = LoggerFactory.getLogger(XRayContextExtractor.class); private static final String XRAY_ENV_VAR = "_X_AMZN_TRACE_ID"; + private static final String XRAY_SYSTEM_PROPERTY = "com.amazonaws.xray.traceHeader"; private static final Pattern HEX_32 = Pattern.compile("[0-9a-f]{32}"); private static final Pattern HEX_16 = Pattern.compile("[0-9a-f]{16}"); @Override public ExtractedContext extract() { - var traceHeader = System.getenv(XRAY_ENV_VAR); + // Try system property first — Lambda runtime updates this per invocation + // and it avoids JVM environment variable caching issues. + var traceHeader = System.getProperty(XRAY_SYSTEM_PROPERTY); if (traceHeader == null || traceHeader.isEmpty()) { - logger.debug("No X-Ray trace header found in environment"); + // Fallback to environment variable + traceHeader = System.getenv(XRAY_ENV_VAR); + } + if (traceHeader == null || traceHeader.isEmpty()) { + logger.debug("No X-Ray trace header found in environment or system properties"); return null; } diff --git a/otel-plugin/src/test/java/software/amazon/lambda/durable/otel/OtelPluginTest.java b/otel-plugin/src/test/java/software/amazon/lambda/durable/otel/OtelPluginTest.java index f2a48ffc1..9d092a618 100644 --- a/otel-plugin/src/test/java/software/amazon/lambda/durable/otel/OtelPluginTest.java +++ b/otel-plugin/src/test/java/software/amazon/lambda/durable/otel/OtelPluginTest.java @@ -4,6 +4,7 @@ import static org.junit.jupiter.api.Assertions.*; +import io.opentelemetry.api.trace.SpanKind; import io.opentelemetry.api.trace.StatusCode; import io.opentelemetry.sdk.testing.exporter.InMemorySpanExporter; import io.opentelemetry.sdk.trace.SdkTracerProvider; @@ -44,10 +45,60 @@ void invocationStart_and_end_createsSpan() { assertEquals(1, spans.size()); var span = spans.get(0); - assertEquals("durable.invocation", span.getName()); + assertEquals("invocation", span.getName()); assertEquals(StatusCode.UNSET, span.getStatus().getStatusCode()); } + @Test + void invocationSpan_hasServerKind_forSeparateXRayNode() { + plugin.onInvocationStart(new InvocationInfo("req-1", "arn:exec1", true)); + plugin.onInvocationEnd(new InvocationEndInfo("req-1", "arn:exec1", true, InvocationStatus.SUCCEEDED, null)); + + var span = spanExporter.getFinishedSpanItems().get(0); + assertEquals( + SpanKind.SERVER, + span.getKind(), + "Invocation span must be SERVER kind so X-Ray creates a separate service node"); + } + + @Test + void operationSpanName_usesOperationName_withoutPrefix() { + plugin.onInvocationStart(new InvocationInfo("req-1", "arn:exec1", true)); + plugin.onOperationStart( + new OperationInfo("op-1", "create-greeting", "STEP", "Step", null, Instant.now(), null, false)); + plugin.onOperationEnd(new OperationEndInfo( + "op-1", "create-greeting", "STEP", "Step", null, Instant.now(), Instant.now(), null)); + plugin.onInvocationEnd(new InvocationEndInfo("req-1", "arn:exec1", true, InvocationStatus.SUCCEEDED, null)); + + var operationSpan = spanExporter.getFinishedSpanItems().stream() + .filter(s -> s.getName().equals("create-greeting")) + .findFirst() + .orElseThrow(); + assertEquals( + "create-greeting", + operationSpan.getName(), + "Operation span should use the operation name directly without 'durable.' prefix"); + } + + @Test + void attemptSpanName_usesOperationNameWithAttemptNumber() { + plugin.onInvocationStart(new InvocationInfo("req-1", "arn:exec1", true)); + plugin.onUserFunctionStart( + new UserFunctionStartInfo("op-1", "process-order", "STEP", "Step", null, Instant.now(), false, 1)); + plugin.onUserFunctionEnd(new UserFunctionEndInfo( + "op-1", "process-order", "STEP", "Step", null, Instant.now(), Instant.now(), false, 1, true, null)); + plugin.onInvocationEnd(new InvocationEndInfo("req-1", "arn:exec1", true, InvocationStatus.SUCCEEDED, null)); + + var attemptSpan = spanExporter.getFinishedSpanItems().stream() + .filter(s -> s.getName().contains("attempt")) + .findFirst() + .orElseThrow(); + assertEquals( + "process-order attempt 1", + attemptSpan.getName(), + "Attempt span should be 'name attempt N' without brackets or prefix"); + } + @Test void invocationEnd_withFailure_setsErrorStatus() { plugin.onInvocationStart(new InvocationInfo("req-123", "arn:exec1", true)); @@ -67,7 +118,7 @@ void operationStart_createsSpan_operationEnd_endsIt() { var end = Instant.parse("2026-06-01T10:00:05Z"); // Operation span created at start - plugin.onOperationStart(new OperationInfo("op-hash-1", "my-step", "STEP", "Step", null, start, null)); + plugin.onOperationStart(new OperationInfo("op-hash-1", "my-step", "STEP", "Step", null, start, null, false)); // Operation span ended at completion plugin.onOperationEnd(new OperationEndInfo("op-hash-1", "my-step", "STEP", "Step", null, start, end, null)); @@ -81,7 +132,7 @@ void operationStart_createsSpan_operationEnd_endsIt() { .filter(s -> s.getName().contains("step")) .findFirst() .orElseThrow(); - assertEquals("durable.step:my-step", operationSpan.getName()); + assertEquals("my-step", operationSpan.getName()); } @Test @@ -142,7 +193,7 @@ void fullLifecycle_producesCorrectSpanHierarchy() { plugin.onInvocationStart(new InvocationInfo("req-1", arn, true)); // Step 1: operation starts, user function runs, operation completes - plugin.onOperationStart(new OperationInfo("op-1", "step-a", "STEP", "Step", null, Instant.now(), null)); + plugin.onOperationStart(new OperationInfo("op-1", "step-a", "STEP", "Step", null, Instant.now(), null, false)); plugin.onUserFunctionStart( new UserFunctionStartInfo("op-1", "step-a", "STEP", "Step", null, Instant.now(), false, 1)); plugin.onUserFunctionEnd(new UserFunctionEndInfo( @@ -151,7 +202,7 @@ void fullLifecycle_producesCorrectSpanHierarchy() { new OperationEndInfo("op-1", "step-a", "STEP", "Step", null, Instant.now(), Instant.now(), null)); // Step 2: operation starts, user function runs, operation completes - plugin.onOperationStart(new OperationInfo("op-2", "step-b", "STEP", "Step", null, Instant.now(), null)); + plugin.onOperationStart(new OperationInfo("op-2", "step-b", "STEP", "Step", null, Instant.now(), null, false)); plugin.onUserFunctionStart( new UserFunctionStartInfo("op-2", "step-b", "STEP", "Step", null, Instant.now(), false, 1)); plugin.onUserFunctionEnd(new UserFunctionEndInfo( @@ -194,7 +245,7 @@ void operationNotCompleted_spanEndedAtInvocationEnd() { plugin.onInvocationStart(new InvocationInfo("req-1", "arn:exec1", true)); // Operation starts but never completes (e.g., wait operation, invocation suspends) - plugin.onOperationStart(new OperationInfo("op-1", "my-wait", "WAIT", "Wait", null, Instant.now(), null)); + plugin.onOperationStart(new OperationInfo("op-1", "my-wait", "WAIT", "Wait", null, Instant.now(), null, false)); // Invocation ends without onOperationEnd being called plugin.onInvocationEnd(new InvocationEndInfo("req-1", "arn:exec1", true, InvocationStatus.PENDING, null)); @@ -207,7 +258,7 @@ void operationNotCompleted_spanEndedAtInvocationEnd() { .filter(s -> s.getName().contains("wait")) .findFirst() .orElseThrow(); - assertEquals("durable.wait:my-wait", operationSpan.getName()); + assertEquals("my-wait", operationSpan.getName()); } @Test @@ -266,7 +317,8 @@ void xrayExtraction_allSpansShareExtractedTraceId() { false); xrayPlugin.onInvocationStart(new InvocationInfo("req-1", "arn:exec1", true)); - xrayPlugin.onOperationStart(new OperationInfo("op-1", "step-a", "STEP", "Step", null, Instant.now(), null)); + xrayPlugin.onOperationStart( + new OperationInfo("op-1", "step-a", "STEP", "Step", null, Instant.now(), null, false)); xrayPlugin.onUserFunctionStart( new UserFunctionStartInfo("op-1", "step-a", "STEP", "Step", null, Instant.now(), false, 1)); xrayPlugin.onUserFunctionEnd(new UserFunctionEndInfo( @@ -351,14 +403,16 @@ void xrayExtraction_multipleInvocations_sameTraceId_unifiedTrace() { // First invocation xrayPlugin.onInvocationStart(new InvocationInfo("req-1", "arn:exec1", true)); - xrayPlugin.onOperationStart(new OperationInfo("op-1", "step-1", "STEP", "Step", null, Instant.now(), null)); + xrayPlugin.onOperationStart( + new OperationInfo("op-1", "step-1", "STEP", "Step", null, Instant.now(), null, false)); xrayPlugin.onOperationEnd( new OperationEndInfo("op-1", "step-1", "STEP", "Step", null, Instant.now(), Instant.now(), null)); xrayPlugin.onInvocationEnd(new InvocationEndInfo("req-1", "arn:exec1", true, InvocationStatus.PENDING, null)); // Second invocation (same execution, same X-Ray Root from backend) xrayPlugin.onInvocationStart(new InvocationInfo("req-2", "arn:exec1", false)); - xrayPlugin.onOperationStart(new OperationInfo("op-2", "step-2", "STEP", "Step", null, Instant.now(), null)); + xrayPlugin.onOperationStart( + new OperationInfo("op-2", "step-2", "STEP", "Step", null, Instant.now(), null, false)); xrayPlugin.onOperationEnd( new OperationEndInfo("op-2", "step-2", "STEP", "Step", null, Instant.now(), Instant.now(), null)); xrayPlugin.onInvocationEnd( @@ -438,7 +492,7 @@ void operationEnd_withoutMatchingStart_createsContinuationSpanWithLink() { .filter(s -> s.getName().contains("wait")) .findFirst() .orElseThrow(); - assertEquals("durable.wait:my-wait", continuationSpan.getName()); + assertEquals("my-wait", continuationSpan.getName()); assertFalse(continuationSpan.getLinks().isEmpty(), "Continuation span should have a Link"); } @@ -527,11 +581,11 @@ void childOperation_parentedToParentOperationSpan() { // Parent context operation plugin.onOperationStart(new OperationInfo( - "op-parent", "my-context", "CONTEXT", "RunInChildContext", null, Instant.now(), null)); + "op-parent", "my-context", "CONTEXT", "RunInChildContext", null, Instant.now(), null, false)); // Child operation with parentId pointing to parent plugin.onOperationStart( - new OperationInfo("op-child", "inner-step", "STEP", "Step", "op-parent", Instant.now(), null)); + new OperationInfo("op-child", "inner-step", "STEP", "Step", "op-parent", Instant.now(), null, false)); plugin.onOperationEnd(new OperationEndInfo( "op-child", "inner-step", "STEP", "Step", "op-parent", Instant.now(), Instant.now(), null)); @@ -565,14 +619,14 @@ void multiInvocation_stepWaitStep_producesCorrectSpans() { // Invocation 1: step completes, wait starts plugin.onInvocationStart(new InvocationInfo("req-1", arn, true)); - plugin.onOperationStart(new OperationInfo("op-1", "step-A", "STEP", "Step", null, Instant.now(), null)); + plugin.onOperationStart(new OperationInfo("op-1", "step-A", "STEP", "Step", null, Instant.now(), null, false)); plugin.onUserFunctionStart( new UserFunctionStartInfo("op-1", "step-A", "STEP", "Step", null, Instant.now(), false, 1)); plugin.onUserFunctionEnd(new UserFunctionEndInfo( "op-1", "step-A", "STEP", "Step", null, Instant.now(), Instant.now(), false, 1, true, null)); plugin.onOperationEnd( new OperationEndInfo("op-1", "step-A", "STEP", "Step", null, Instant.now(), Instant.now(), null)); - plugin.onOperationStart(new OperationInfo("op-2", "pause", "WAIT", "Wait", null, Instant.now(), null)); + plugin.onOperationStart(new OperationInfo("op-2", "pause", "WAIT", "Wait", null, Instant.now(), null, false)); plugin.onInvocationEnd(new InvocationEndInfo("req-1", arn, true, InvocationStatus.PENDING, null)); // Invocation 1 should have: step op + step attempt + wait (PENDING) + invocation = 4 @@ -585,7 +639,7 @@ void multiInvocation_stepWaitStep_producesCorrectSpans() { plugin.onInvocationStart(new InvocationInfo("req-2", arn, false)); plugin.onOperationEnd( new OperationEndInfo("op-2", "pause", "WAIT", "Wait", null, Instant.now(), Instant.now(), null)); - plugin.onOperationStart(new OperationInfo("op-3", "step-B", "STEP", "Step", null, Instant.now(), null)); + plugin.onOperationStart(new OperationInfo("op-3", "step-B", "STEP", "Step", null, Instant.now(), null, false)); plugin.onUserFunctionStart( new UserFunctionStartInfo("op-3", "step-B", "STEP", "Step", null, Instant.now(), false, 1)); plugin.onUserFunctionEnd(new UserFunctionEndInfo( @@ -604,9 +658,110 @@ void multiInvocation_stepWaitStep_producesCorrectSpans() { // Wait continuation should have a Link var waitContinuation = inv2Spans.stream() - .filter(s -> s.getName().contains("wait")) + .filter(s -> s.getName().contains("pause")) .findFirst() .orElseThrow(); assertFalse(waitContinuation.getLinks().isEmpty()); } + + // ─── Cross-invocation step retry scenario ──────────────────────────── + + @Test + void crossInvocation_stepRetry_attemptsParentedToRespectiveInvocations() { + var arn = "arn:aws:lambda:us-east-1:123:function:test:$LATEST/durable/exec1"; + + // Invocation 1: step starts, attempt 1 fails, invocation suspended during retry poll + plugin.onInvocationStart(new InvocationInfo("req-1", arn, true)); + plugin.onOperationStart( + new OperationInfo("op-1", "process-payment", "STEP", "Step", null, Instant.now(), null, false)); + plugin.onUserFunctionStart( + new UserFunctionStartInfo("op-1", "process-payment", "STEP", "Step", null, Instant.now(), false, 1)); + plugin.onUserFunctionEnd(new UserFunctionEndInfo( + "op-1", + "process-payment", + "STEP", + "Step", + null, + Instant.now(), + Instant.now(), + false, + 1, + false, + new RuntimeException("payment failed"))); + plugin.onInvocationEnd(new InvocationEndInfo("req-1", arn, true, InvocationStatus.PENDING, null)); + + var inv1Spans = spanExporter.getFinishedSpanItems(); + // operation span (PENDING) + attempt 1 span + invocation span = 3 + assertEquals(3, inv1Spans.size()); + + var inv1OperationSpan = inv1Spans.stream() + .filter(s -> + s.getName().contains("process-payment") && !s.getName().contains("attempt")) + .findFirst() + .orElseThrow(); + var attempt1Span = inv1Spans.stream() + .filter(s -> s.getName().contains("attempt 1")) + .findFirst() + .orElseThrow(); + + // Attempt 1 should be parented to invocation 1's operation span + assertEquals( + inv1OperationSpan.getSpanId(), + attempt1Span.getParentSpanId(), + "Attempt 1 should be parented to invocation 1's operation span"); + + spanExporter.reset(); + + // Invocation 2: step is replayed (continuation), attempt 2 executes and succeeds + plugin.onInvocationStart(new InvocationInfo("req-2", arn, false)); + // isContinuation=true: this operation was already started in a prior invocation + plugin.onOperationStart( + new OperationInfo("op-1", "process-payment", "STEP", "Step", null, Instant.now(), null, true)); + plugin.onUserFunctionStart( + new UserFunctionStartInfo("op-1", "process-payment", "STEP", "Step", null, Instant.now(), false, 2)); + plugin.onUserFunctionEnd(new UserFunctionEndInfo( + "op-1", "process-payment", "STEP", "Step", null, Instant.now(), Instant.now(), false, 2, true, null)); + plugin.onOperationEnd(new OperationEndInfo( + "op-1", "process-payment", "STEP", "Step", null, Instant.now(), Instant.now(), null)); + plugin.onInvocationEnd(new InvocationEndInfo("req-2", arn, false, InvocationStatus.SUCCEEDED, null)); + + var inv2Spans = spanExporter.getFinishedSpanItems(); + // operation span + attempt 2 span + invocation span = 3 + assertEquals(3, inv2Spans.size()); + + var inv2OperationSpan = inv2Spans.stream() + .filter(s -> + s.getName().contains("process-payment") && !s.getName().contains("attempt")) + .findFirst() + .orElseThrow(); + var attempt2Span = inv2Spans.stream() + .filter(s -> s.getName().contains("attempt 2")) + .findFirst() + .orElseThrow(); + + // Attempt 2 should be parented to invocation 2's operation span + assertEquals( + inv2OperationSpan.getSpanId(), + attempt2Span.getParentSpanId(), + "Attempt 2 should be parented to invocation 2's operation span"); + + // The two operation spans must have DIFFERENT span IDs (the bug was they were the same) + assertNotEquals( + inv1OperationSpan.getSpanId(), + inv2OperationSpan.getSpanId(), + "Continuation operation span must have a different span ID from the original"); + + // The continuation operation span should have a Link to the original for correlation + assertFalse( + inv2OperationSpan.getLinks().isEmpty(), + "Continuation operation span should have a Link to the original"); + + // All spans share the same trace ID + var allSpans = new java.util.ArrayList<>(inv1Spans); + allSpans.addAll(inv2Spans); + var traceId = allSpans.get(0).getTraceId(); + assertTrue( + allSpans.stream().allMatch(s -> s.getTraceId().equals(traceId)), + "All spans across invocations should share the same trace ID"); + } } diff --git a/otel-plugin/src/test/java/software/amazon/lambda/durable/otel/XRayContextExtractorTest.java b/otel-plugin/src/test/java/software/amazon/lambda/durable/otel/XRayContextExtractorTest.java index 55038cf59..0f5a196d0 100644 --- a/otel-plugin/src/test/java/software/amazon/lambda/durable/otel/XRayContextExtractorTest.java +++ b/otel-plugin/src/test/java/software/amazon/lambda/durable/otel/XRayContextExtractorTest.java @@ -21,6 +21,23 @@ void extract_withoutEnvVar_returnsNull() { assertNull(context); } + @Test + void extract_readsFromSystemProperty_whenEnvVarMissing() { + // Simulate Lambda runtime setting the system property (env var not set in tests) + var traceHeader = "Root=1-6a43574f-2cf3140a69b0c8fe165f9503;Parent=53995c3f42cd8ad8;Sampled=1"; + System.setProperty("com.amazonaws.xray.traceHeader", traceHeader); + try { + var extractor = new XRayContextExtractor(); + var context = extractor.extract(); + + assertNotNull(context, "Should extract context from system property when env var is missing"); + assertEquals("6a43574f2cf3140a69b0c8fe165f9503", context.traceId()); + assertEquals("53995c3f42cd8ad8", context.parentSpanId()); + } finally { + System.clearProperty("com.amazonaws.xray.traceHeader"); + } + } + @Test void extract_implementsContextExtractor() { // Verify XRayContextExtractor implements the ContextExtractor interface diff --git a/sdk-integration-tests/src/test/java/software/amazon/lambda/durable/OtelPluginIntegrationTest.java b/sdk-integration-tests/src/test/java/software/amazon/lambda/durable/OtelPluginIntegrationTest.java index 1658e8774..9c0408b46 100644 --- a/sdk-integration-tests/src/test/java/software/amazon/lambda/durable/OtelPluginIntegrationTest.java +++ b/sdk-integration-tests/src/test/java/software/amazon/lambda/durable/OtelPluginIntegrationTest.java @@ -55,9 +55,9 @@ void simpleStep_producesInvocationAndOperationAndAttemptSpans() { assertTrue(spans.size() >= 3, "Expected at least 3 spans, got " + spans.size()); // Verify span names - assertSpanExists(spans, "durable.invocation"); - assertSpanExists(spans, "durable.step:greet"); - assertSpanExists(spans, "durable.step:greet [attempt 1]"); + assertSpanExists(spans, "invocation"); + assertSpanExists(spans, "greet"); + assertSpanExists(spans, "greet attempt 1"); // All spans share the same trace ID var traceId = spans.get(0).getTraceId(); @@ -84,12 +84,12 @@ void multipleSteps_producesSpansForEach() { // 1 invocation + 3 operations + 3 attempts = 7 assertTrue(spans.size() >= 7, "Expected at least 7 spans, got " + spans.size()); - assertSpanExists(spans, "durable.step:step-a"); - assertSpanExists(spans, "durable.step:step-b"); - assertSpanExists(spans, "durable.step:step-c"); - assertSpanExists(spans, "durable.step:step-a [attempt 1]"); - assertSpanExists(spans, "durable.step:step-b [attempt 1]"); - assertSpanExists(spans, "durable.step:step-c [attempt 1]"); + assertSpanExists(spans, "step-a"); + assertSpanExists(spans, "step-b"); + assertSpanExists(spans, "step-c"); + assertSpanExists(spans, "step-a attempt 1"); + assertSpanExists(spans, "step-b attempt 1"); + assertSpanExists(spans, "step-c attempt 1"); } @Test @@ -153,9 +153,8 @@ void waitAndResume_producesSpansAcrossInvocations() { assertTrue(allSpans.size() > spansAfterFirstInvocation, "Second invocation should produce additional spans"); // Verify both invocations produced invocation spans - var invocationSpans = allSpans.stream() - .filter(s -> s.getName().equals("durable.invocation")) - .toList(); + var invocationSpans = + allSpans.stream().filter(s -> s.getName().equals("invocation")).toList(); assertEquals(2, invocationSpans.size(), "Should have 2 invocation spans (one per run)"); } @@ -178,7 +177,7 @@ void waitCompletedDuringSuspension_producesOperationSpan() { // The wait operation span should be open (ended with PENDING status at invocation end) var spansAfterFirst = spanExporter.getFinishedSpanItems(); var waitSpansAfterFirst = spansAfterFirst.stream() - .filter(s -> s.getName().equals("durable.wait:pause")) + .filter(s -> s.getName().equals("pause")) .toList(); assertEquals(1, waitSpansAfterFirst.size(), "Wait span should exist after first invocation (ended as PENDING)"); @@ -190,9 +189,8 @@ void waitCompletedDuringSuspension_producesOperationSpan() { // After second invocation, the wait should have a second operation span // (fired by onOperationEnd via updatedOperationIds) showing it completed var allSpans = spanExporter.getFinishedSpanItems(); - var waitSpansTotal = allSpans.stream() - .filter(s -> s.getName().equals("durable.wait:pause")) - .toList(); + var waitSpansTotal = + allSpans.stream().filter(s -> s.getName().equals("pause")).toList(); assertEquals( 2, waitSpansTotal.size(), @@ -267,8 +265,8 @@ void childContext_producesNestedSpans() { // + inner step operation + inner step attempt = 5 assertTrue(spans.size() >= 5, "Should have at least 5 spans for child context, got " + spans.size()); - assertSpanExists(spans, "durable.runinchildcontext:child"); - assertSpanExists(spans, "durable.step:inner"); + assertSpanExists(spans, "child"); + assertSpanExists(spans, "inner"); } @Test @@ -293,7 +291,7 @@ void failedStep_producesErrorSpan() { // Invocation span should have error status var invocationSpan = spans.stream() - .filter(s -> s.getName().equals("durable.invocation")) + .filter(s -> s.getName().equals("invocation")) .findFirst() .orElseThrow(); assertEquals( @@ -420,8 +418,8 @@ void parallelOperation_producesSpansForEachBranch() { spans.stream().anyMatch(s -> s.getName().contains("fan-out")), "Should have parallel operation span. Got: " + spans.stream().map(SpanData::getName).toList()); - assertSpanExists(spans, "durable.step:step-A"); - assertSpanExists(spans, "durable.step:step-B"); + assertSpanExists(spans, "step-A"); + assertSpanExists(spans, "step-B"); } @Test @@ -440,9 +438,9 @@ void nestedChildContext_producesCorrectHierarchy() { var spans = spanExporter.getFinishedSpanItems(); - assertSpanExists(spans, "durable.runinchildcontext:outer"); - assertSpanExists(spans, "durable.runinchildcontext:inner"); - assertSpanExists(spans, "durable.step:deep-step"); + assertSpanExists(spans, "outer"); + assertSpanExists(spans, "inner"); + assertSpanExists(spans, "deep-step"); } @Test @@ -475,16 +473,13 @@ void multipleWaits_producesSpansAcrossMultipleInvocations() { var allSpans = spanExporter.getFinishedSpanItems(); // Should have 3 invocation spans - var invocationSpans = allSpans.stream() - .filter(s -> s.getName().equals("durable.invocation")) - .toList(); + var invocationSpans = + allSpans.stream().filter(s -> s.getName().equals("invocation")).toList(); assertEquals(3, invocationSpans.size(), "Should have 3 invocation spans"); // Should have wait-A and wait-B spans - assertTrue( - allSpans.stream().anyMatch(s -> s.getName().equals("durable.wait:wait-A")), "Should have wait-A span"); - assertTrue( - allSpans.stream().anyMatch(s -> s.getName().equals("durable.wait:wait-B")), "Should have wait-B span"); + assertTrue(allSpans.stream().anyMatch(s -> s.getName().equals("wait-A")), "Should have wait-A span"); + assertTrue(allSpans.stream().anyMatch(s -> s.getName().equals("wait-B")), "Should have wait-B span"); } @Test diff --git a/sdk/src/main/java/software/amazon/lambda/durable/plugin/OperationInfo.java b/sdk/src/main/java/software/amazon/lambda/durable/plugin/OperationInfo.java index 74c23d687..f58b94d70 100644 --- a/sdk/src/main/java/software/amazon/lambda/durable/plugin/OperationInfo.java +++ b/sdk/src/main/java/software/amazon/lambda/durable/plugin/OperationInfo.java @@ -17,6 +17,9 @@ * @param startTimestamp when the operation started — on first execution this is a local {@code Instant.now()} which may * slightly differ from the timestamp recorded by the backend; on replay it comes from the backend checkpoint * @param endTimestamp when the operation ended (null if still running) + * @param isContinuation true if this operation was already started in a prior invocation and is being re-executed in + * the current invocation (e.g., a step retrying across invocations). Plugins can use this to avoid generating + * duplicate span IDs. * @deprecated This is a preview API that is experimental and may be changed or removed in future releases. */ @Deprecated @@ -27,4 +30,5 @@ public record OperationInfo( String subType, String parentId, Instant startTimestamp, - Instant endTimestamp) {} + Instant endTimestamp, + boolean isContinuation) {} diff --git a/sdk/src/main/java/software/amazon/lambda/durable/plugin/PluginInfoConverter.java b/sdk/src/main/java/software/amazon/lambda/durable/plugin/PluginInfoConverter.java index 6c7b30ff5..5a72f7689 100644 --- a/sdk/src/main/java/software/amazon/lambda/durable/plugin/PluginInfoConverter.java +++ b/sdk/src/main/java/software/amazon/lambda/durable/plugin/PluginInfoConverter.java @@ -32,7 +32,8 @@ public static OperationInfo toOperationInfo(Operation operation, OperationIdenti identifier.subType() != null ? identifier.subType().getValue() : null, parentId, operation != null ? operation.startTimestamp() : Instant.now(), - operation != null ? operation.endTimestamp() : null); + operation != null ? operation.endTimestamp() : null, + operation != null); } /** diff --git a/sdk/src/test/java/software/amazon/lambda/durable/plugin/PluginRunnerTest.java b/sdk/src/test/java/software/amazon/lambda/durable/plugin/PluginRunnerTest.java index 991af3265..a9e533fd0 100644 --- a/sdk/src/test/java/software/amazon/lambda/durable/plugin/PluginRunnerTest.java +++ b/sdk/src/test/java/software/amazon/lambda/durable/plugin/PluginRunnerTest.java @@ -142,7 +142,7 @@ private static InvocationEndInfo invocationEndInfo() { } private static OperationInfo operationInfo() { - return new OperationInfo("op-1", "test-step", "STEP", null, null, Instant.now(), null); + return new OperationInfo("op-1", "test-step", "STEP", null, null, Instant.now(), null, false); } private static OperationEndInfo operationEndInfo() {