Commit ce4eadd
fix: apply per-request timeout in DA retriever instead of per-height
Previously, the timeout for DA retrieval was derived from the parent context, which could cause the overall operation to time out even when each individual request completed within its own timeout. This was problematic for DA heights with thousands of blocks, where multiple batches of 100 blobs each need to be retrieved.

Changes:

- Use context.Background() for each GetIDs and Get batch request to ensure each gets a fresh, independent timeout
- Check parent context cancellation between requests to still respect graceful shutdown
- Add comprehensive tests verifying per-request timeout behavior
1 parent b5f5640 commit ce4eadd
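In outline, the fix follows a common Go pattern: derive each request's timeout from `context.Background()` so it is independent of any parent deadline, and poll the parent context between requests so cancellation still short-circuits the loop. The following is a minimal, self-contained sketch of that pattern only — `retrieveAll` and `doRequest` are hypothetical names for illustration, not code from this commit:

```go
package main

import (
	"context"
	"fmt"
	"time"
)

// retrieveAll sketches the per-request timeout pattern: the parent ctx is
// consulted only for cancellation between requests, while each request runs
// against a fresh timeout that is independent of any parent deadline.
func retrieveAll(ctx context.Context, batches [][]byte, timeout time.Duration,
	doRequest func(context.Context, []byte) error) error {
	for _, batch := range batches {
		// Respect graceful shutdown: bail out between requests if cancelled.
		if err := ctx.Err(); err != nil {
			return err
		}
		// Fresh, independent timeout per request.
		reqCtx, cancel := context.WithTimeout(context.Background(), timeout)
		err := doRequest(reqCtx, batch)
		cancel() // release the timer promptly; a deferred cancel would pile up in a loop
		if err != nil {
			return err
		}
	}
	return nil
}

func main() {
	// Three slow batches, each well under its own 100ms timeout but 150ms in
	// total — this succeeds, whereas a single shared deadline could fail.
	batches := [][]byte{[]byte("a"), []byte("b"), []byte("c")}
	err := retrieveAll(context.Background(), batches, 100*time.Millisecond,
		func(ctx context.Context, _ []byte) error {
			select {
			case <-time.After(50 * time.Millisecond):
				return nil
			case <-ctx.Done():
				return ctx.Err()
			}
		})
	fmt.Println("err:", err) // err: <nil>
}
```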

File tree

3 files changed: +250 -14 lines changed

block/internal/da/client.go

Lines changed: 46 additions & 4 deletions
```diff
@@ -149,11 +149,25 @@ func (c *client) Submit(ctx context.Context, data [][]byte, gasPrice float64, na
 }
 
 // Retrieve retrieves blobs from the DA layer at the specified height and namespace.
+// Each request (GetIDs and each batch of Get) has its own independent timeout,
+// ensuring that retrieval of heights with many blobs doesn't fail due to an overall timeout.
 func (c *client) Retrieve(ctx context.Context, height uint64, namespace []byte) coreda.ResultRetrieve {
-	// 1. Get IDs
-	getIDsCtx, cancel := context.WithTimeout(ctx, c.defaultTimeout)
-	defer cancel()
+	// Check for parent context cancellation before starting
+	if err := ctx.Err(); err != nil {
+		return coreda.ResultRetrieve{
+			BaseResult: coreda.BaseResult{
+				Code:      coreda.StatusError,
+				Message:   fmt.Sprintf("context cancelled before retrieval: %s", err.Error()),
+				Height:    height,
+				Timestamp: time.Now(),
+			},
+		}
+	}
+
+	// 1. Get IDs with per-request timeout (independent of parent context deadline)
+	getIDsCtx, cancel := context.WithTimeout(context.Background(), c.defaultTimeout)
 	idsResult, err := c.da.GetIDs(getIDsCtx, height, namespace)
+	cancel()
 	if err != nil {
 		// Handle specific "not found" error
 		if strings.Contains(err.Error(), coreda.ErrBlobNotFound.Error()) {
@@ -202,13 +216,41 @@ func (c *client) Retrieve(ctx context.Context, height uint64, namespace []byte)
 			},
 		}
 	}
+	// Check for parent context cancellation after GetIDs
+	if err := ctx.Err(); err != nil {
+		return coreda.ResultRetrieve{
+			BaseResult: coreda.BaseResult{
+				Code:      coreda.StatusError,
+				Message:   fmt.Sprintf("context cancelled after GetIDs: %s", err.Error()),
+				Height:    height,
+				Timestamp: time.Now(),
+			},
+		}
+	}
+
 	// 2. Get Blobs using the retrieved IDs in batches
+	// Each batch has its own independent timeout to ensure large retrievals don't timeout
 	batchSize := 100
 	blobs := make([][]byte, 0, len(idsResult.IDs))
 	for i := 0; i < len(idsResult.IDs); i += batchSize {
+		// Check for parent context cancellation before each batch
+		if err := ctx.Err(); err != nil {
+			c.logger.Debug().Uint64("height", height).Int("batch_start", i).Err(err).Msg("Context cancelled during batch retrieval")
+			return coreda.ResultRetrieve{
+				BaseResult: coreda.BaseResult{
+					Code:      coreda.StatusError,
+					Message:   fmt.Sprintf("context cancelled during batch retrieval at %d: %s", i, err.Error()),
+					Height:    height,
+					Timestamp: time.Now(),
+				},
+			}
+		}
+
 		end := min(i+batchSize, len(idsResult.IDs))
 
-		getBlobsCtx, cancel := context.WithTimeout(ctx, c.defaultTimeout)
+		// Use context.Background() for timeout to ensure each batch gets a fresh timeout
+		// independent of any parent context deadline
+		getBlobsCtx, cancel := context.WithTimeout(context.Background(), c.defaultTimeout)
 		batchBlobs, err := c.da.Get(getBlobsCtx, idsResult.IDs[i:end], namespace)
 		cancel()
 		if err != nil {
```
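Two details of the diff above are worth noting. First, detaching the request contexts from the parent via `context.Background()` means parent cancellation no longer propagates into an in-flight call; the explicit `ctx.Err()` checks before retrieval, after GetIDs, and before each batch are what restore graceful-shutdown behavior, at the cost of waiting out at most one in-flight request. Second, the switch from `defer cancel()` to an immediate `cancel()` after each call matters inside the loop: deferred cancels would all run only when Retrieve returns, keeping every per-batch timer alive until then.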

block/internal/da/client_test.go

Lines changed: 188 additions & 0 deletions
```diff
@@ -3,6 +3,7 @@ package da
 import (
 	"context"
 	"errors"
+	"fmt"
 	"testing"
 	"time"
 
@@ -456,3 +457,190 @@ func TestClient_Retrieve_Timeout(t *testing.T) {
 		assert.Assert(t, result.Message != "")
 	})
 }
+
+func TestClient_Retrieve_PerRequestTimeout(t *testing.T) {
+	logger := zerolog.Nop()
+	dataLayerHeight := uint64(100)
+	encodedNamespace := coreda.NamespaceFromString("test-namespace")
+
+	t.Run("each batch gets independent timeout", func(t *testing.T) {
+		// Create 250 IDs to force 3 batches (100, 100, 50)
+		mockIDs := make([][]byte, 250)
+		for i := range mockIDs {
+			mockIDs[i] = []byte(fmt.Sprintf("id%d", i))
+		}
+		mockTimestamp := time.Now()
+
+		batchCount := 0
+		batchTimeout := 50 * time.Millisecond
+
+		mockDAInstance := &mockDA{
+			getIDsFunc: func(ctx context.Context, height uint64, namespace []byte) (*coreda.GetIDsResult, error) {
+				return &coreda.GetIDsResult{
+					IDs:       mockIDs,
+					Timestamp: mockTimestamp,
+				}, nil
+			},
+			getFunc: func(ctx context.Context, ids []coreda.ID, namespace []byte) ([]coreda.Blob, error) {
+				batchCount++
+				// Simulate some delay for each batch (less than timeout)
+				time.Sleep(20 * time.Millisecond)
+
+				// Verify each batch's context has its own deadline
+				deadline, ok := ctx.Deadline()
+				assert.Assert(t, ok, "batch should have a deadline")
+				// The deadline should be roughly batchTimeout from now (within tolerance)
+				remaining := time.Until(deadline)
+				assert.Assert(t, remaining > 0, "deadline should be in the future")
+				assert.Assert(t, remaining <= batchTimeout, "deadline should be at most batchTimeout")
+
+				// Return mock blobs
+				blobs := make([][]byte, len(ids))
+				for i := range blobs {
+					blobs[i] = []byte("blob")
+				}
+				return blobs, nil
+			},
+		}
+
+		client := NewClient(Config{
+			DA:             mockDAInstance,
+			Logger:         logger,
+			Namespace:      "test-namespace",
+			DataNamespace:  "test-data-namespace",
+			DefaultTimeout: batchTimeout,
+		})
+
+		result := client.Retrieve(context.Background(), dataLayerHeight, encodedNamespace.Bytes())
+
+		assert.Equal(t, coreda.StatusSuccess, result.Code)
+		assert.Equal(t, 3, batchCount, "should have made 3 batch requests")
+		assert.Equal(t, 250, len(result.Data), "should have retrieved all blobs")
+	})
+
+	t.Run("succeeds even when total time exceeds single timeout", func(t *testing.T) {
+		// This test verifies that even if the total operation takes longer than
+		// a single timeout period, it succeeds because each individual request
+		// gets its own fresh timeout.
+		mockIDs := make([][]byte, 300) // 3 batches
+		for i := range mockIDs {
+			mockIDs[i] = []byte(fmt.Sprintf("id%d", i))
+		}
+		mockTimestamp := time.Now()
+
+		perRequestTimeout := 100 * time.Millisecond
+		delayPerBatch := 40 * time.Millisecond // Each batch takes 40ms
+
+		mockDAInstance := &mockDA{
+			getIDsFunc: func(ctx context.Context, height uint64, namespace []byte) (*coreda.GetIDsResult, error) {
+				time.Sleep(delayPerBatch) // GetIDs also takes time
+				return &coreda.GetIDsResult{
+					IDs:       mockIDs,
+					Timestamp: mockTimestamp,
+				}, nil
+			},
+			getFunc: func(ctx context.Context, ids []coreda.ID, namespace []byte) ([]coreda.Blob, error) {
+				time.Sleep(delayPerBatch)
+				blobs := make([][]byte, len(ids))
+				for i := range blobs {
+					blobs[i] = []byte("blob")
+				}
+				return blobs, nil
+			},
+		}
+
+		client := NewClient(Config{
+			DA:             mockDAInstance,
+			Logger:         logger,
+			Namespace:      "test-namespace",
+			DataNamespace:  "test-data-namespace",
+			DefaultTimeout: perRequestTimeout,
+		})
+
+		start := time.Now()
+		result := client.Retrieve(context.Background(), dataLayerHeight, encodedNamespace.Bytes())
+		elapsed := time.Since(start)
+
+		// Total time: GetIDs (40ms) + 3 batches * 40ms = 160ms
+		// This is greater than perRequestTimeout (100ms), but should still succeed
+		// because each individual request completes within its timeout
+		assert.Equal(t, coreda.StatusSuccess, result.Code)
+		assert.Assert(t, elapsed > perRequestTimeout, "total time should exceed single timeout")
+		assert.Equal(t, 300, len(result.Data))
+	})
+
+	t.Run("respects parent context cancellation", func(t *testing.T) {
+		// Use 5 batches to ensure we have enough time to cancel mid-operation
+		mockIDs := make([][]byte, 500) // 5 batches of 100
+		for i := range mockIDs {
+			mockIDs[i] = []byte(fmt.Sprintf("id%d", i))
+		}
+		mockTimestamp := time.Now()
+		batchCount := 0
+
+		mockDAInstance := &mockDA{
+			getIDsFunc: func(ctx context.Context, height uint64, namespace []byte) (*coreda.GetIDsResult, error) {
+				return &coreda.GetIDsResult{
+					IDs:       mockIDs,
+					Timestamp: mockTimestamp,
+				}, nil
+			},
+			getFunc: func(ctx context.Context, ids []coreda.ID, namespace []byte) ([]coreda.Blob, error) {
+				batchCount++
+				time.Sleep(50 * time.Millisecond) // Each batch takes 50ms
+				blobs := make([][]byte, len(ids))
+				for i := range blobs {
+					blobs[i] = []byte("blob")
+				}
+				return blobs, nil
+			},
+		}
+
+		client := NewClient(Config{
+			DA:             mockDAInstance,
+			Logger:         logger,
+			Namespace:      "test-namespace",
+			DataNamespace:  "test-data-namespace",
+			DefaultTimeout: 1 * time.Second,
+		})
+
+		// Create a context that will be cancelled after the second batch completes
+		// but before all batches finish
+		ctx, cancel := context.WithCancel(context.Background())
+		go func() {
+			time.Sleep(120 * time.Millisecond) // Cancel after ~2 batches (2 * 50ms = 100ms)
+			cancel()
+		}()
+
+		result := client.Retrieve(ctx, dataLayerHeight, encodedNamespace.Bytes())
+
+		// Should fail due to context cancellation
+		assert.Equal(t, coreda.StatusError, result.Code)
+		assert.Assert(t, batchCount < 5, "should not complete all batches, got %d", batchCount)
+	})
+
+	t.Run("returns early if parent context already cancelled", func(t *testing.T) {
+		mockDAInstance := &mockDA{
+			getIDsFunc: func(ctx context.Context, height uint64, namespace []byte) (*coreda.GetIDsResult, error) {
+				t.Fatal("GetIDs should not be called if context is already cancelled")
+				return nil, nil
+			},
+		}
+
+		client := NewClient(Config{
+			DA:             mockDAInstance,
+			Logger:         logger,
+			Namespace:      "test-namespace",
+			DataNamespace:  "test-data-namespace",
+			DefaultTimeout: 1 * time.Second,
+		})
+
+		ctx, cancel := context.WithCancel(context.Background())
+		cancel() // Cancel immediately
+
+		result := client.Retrieve(ctx, dataLayerHeight, encodedNamespace.Bytes())
+
+		assert.Equal(t, coreda.StatusError, result.Code)
+		assert.Assert(t, result.Message != "")
+	})
+}
```

block/internal/syncing/da_retriever_test.go

Lines changed: 16 additions & 10 deletions
```diff
@@ -27,6 +27,11 @@ import (
 
 // newTestDARetriever creates a DA retriever for testing with the given DA implementation
 func newTestDARetriever(t *testing.T, mockDA coreda.DA, cfg config.Config, gen genesis.Genesis) *daRetriever {
+	return newTestDARetrieverWithTimeout(t, mockDA, cfg, gen, 0)
+}
+
+// newTestDARetrieverWithTimeout creates a DA retriever for testing with a custom timeout
+func newTestDARetrieverWithTimeout(t *testing.T, mockDA coreda.DA, cfg config.Config, gen genesis.Genesis, timeout time.Duration) *daRetriever {
 	t.Helper()
 	if cfg.DA.Namespace == "" {
 		cfg.DA.Namespace = "test-ns"
@@ -39,10 +44,11 @@ func newTestDARetriever(t *testing.T, mockDA coreda.DA, cfg config.Config, gen g
 	require.NoError(t, err)
 
 	daClient := da.NewClient(da.Config{
-		DA:            mockDA,
-		Logger:        zerolog.Nop(),
-		Namespace:     cfg.DA.Namespace,
-		DataNamespace: cfg.DA.DataNamespace,
+		DA:             mockDA,
+		Logger:         zerolog.Nop(),
+		DefaultTimeout: timeout, // 0 means use default (30s)
+		Namespace:      cfg.DA.Namespace,
+		DataNamespace:  cfg.DA.DataNamespace,
 	})
 
 	return NewDARetriever(daClient, cm, gen, zerolog.Nop())
@@ -111,18 +117,19 @@ func TestDARetriever_RetrieveFromDA_HeightFromFuture(t *testing.T) {
 }
 
 func TestDARetriever_RetrieveFromDA_Timeout(t *testing.T) {
-	t.Skip("Skipping flaky timeout test - timing is now controlled by DA client")
-
 	mockDA := testmocks.NewMockDA(t)
 
+	// Use a short timeout for testing
+	testTimeout := 50 * time.Millisecond
+
 	// Mock GetIDs to hang longer than the timeout
 	mockDA.EXPECT().GetIDs(mock.Anything, mock.Anything, mock.Anything).
 		Run(func(ctx context.Context, height uint64, namespace []byte) {
 			<-ctx.Done()
 		}).
 		Return(nil, context.DeadlineExceeded).Maybe()
 
-	r := newTestDARetriever(t, mockDA, config.DefaultConfig(), genesis.Genesis{})
+	r := newTestDARetrieverWithTimeout(t, mockDA, config.DefaultConfig(), genesis.Genesis{}, testTimeout)
 
 	start := time.Now()
 	events, err := r.RetrieveFromDA(context.Background(), 42)
@@ -135,9 +142,8 @@ func TestDARetriever_RetrieveFromDA_Timeout(t *testing.T) {
 	assert.Len(t, events, 0)
 
 	// Verify timeout occurred approximately at expected time (with some tolerance)
-	// DA client has a 30-second default timeout
-	assert.Greater(t, duration, 29*time.Second, "should timeout after approximately 30 seconds")
-	assert.Less(t, duration, 35*time.Second, "should not take much longer than timeout")
+	assert.GreaterOrEqual(t, duration, testTimeout, "should timeout after approximately the configured timeout")
+	assert.Less(t, duration, testTimeout*3, "should not take much longer than timeout")
 }
 
 func TestDARetriever_RetrieveFromDA_TimeoutFast(t *testing.T) {
```
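From a caller's perspective, the context passed to `Retrieve` now governs cancellation only; per-request deadlines come from the client's `DefaultTimeout`. A hedged sketch of a caller inside package `da` follows — the `fetchHeight` helper is hypothetical, `Retrieve` and the `ResultRetrieve` fields (`Code`, `Data`, `Message`) are taken from the diffs above, and `Data` is assumed to be `[][]byte` as the tests suggest:

```go
// Hypothetical helper, assumed to live in package da so it can use the
// unexported *client type. Cancelling ctx stops Retrieve between requests;
// it does not shorten the per-request timeouts.
func fetchHeight(ctx context.Context, c *client, height uint64, ns []byte) ([][]byte, error) {
	result := c.Retrieve(ctx, height, ns)
	if result.Code != coreda.StatusSuccess {
		return nil, fmt.Errorf("retrieve at height %d: %s", height, result.Message)
	}
	return result.Data, nil
}
```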
