Add 529 error handling with tool call cleanup and backoff strategy

Co-authored-by: dallin <dallin@buster.so>
2025-07-14 17:09:54 +00:00 · 2025-07-14 17:09:54 +00:00 · 7988ccbb58
parent 58542c204d
commit 7988ccbb58
7 changed files with 504 additions and 0 deletions
--- a/packages/ai/src/utils/retry/healing-strategies.ts
+++ b/packages/ai/src/utils/retry/healing-strategies.ts
@ -43,6 +43,14 @@ export function determineHealingStrategy(
      };
    }

+    // 529 Overloaded errors - special handling, cleanup is done separately
+    case 'overloaded-error':
+      return {
+        shouldRemoveLastAssistantMessage: false,
+        healingMessage: null, // No healing message needed, cleanup handles it
+        backoffMultiplier: 2, // Longer backoff for overload
+      };
+
    // Network/server errors - just retry with backoff
    case 'network-timeout':
    case 'server-error':
--- a/packages/ai/src/utils/retry/index.ts
+++ b/packages/ai/src/utils/retry/index.ts
@ -10,6 +10,7 @@ export {
  logRetryInfo,
  logMessagesAfterHealing,
  handleRetryWithHealing,
+  cleanupIncompleteToolCalls,
 } from './retry-helpers';
 export {
  determineHealingStrategy,
--- a/packages/ai/src/utils/retry/retry-agent-stream.ts
+++ b/packages/ai/src/utils/retry/retry-agent-stream.ts
@ -202,6 +202,19 @@ export function detectRetryableError(

    // Server errors (5xx)
    if (error.statusCode && error.statusCode >= 500 && error.statusCode < 600) {
+      // Special handling for 529 overloaded errors
+      if (error.statusCode === 529) {
+        return {
+          type: 'overloaded-error',
+          originalError: error,
+          healingMessage: {
+            role: 'user',
+            content: `Server overloaded (529). Retrying after cleanup...`,
+          },
+          requiresMessageCleanup: true, // New flag
+        };
+      }
+
      return {
        type: 'server-error',
        originalError: error,
--- a/packages/ai/src/utils/retry/retry-helpers.ts
+++ b/packages/ai/src/utils/retry/retry-helpers.ts
@ -311,6 +311,62 @@ export function logMessagesAfterHealing(
  });
 }

+/**
+ * Cleans up incomplete tool calls from message history
+ * This is specifically for handling 529 errors that disconnect mid-stream
+ * while generating tool calls, leaving orphaned tool calls without results
+ */
+export function cleanupIncompleteToolCalls(messages: CoreMessage[]): CoreMessage[] {
+  const cleaned = [...messages];
+
+  // Find the last assistant message
+  for (let i = cleaned.length - 1; i >= 0; i--) {
+    const msg = cleaned[i];
+    if (!msg) continue; // Add guard for undefined
+
+    if (msg.role === 'assistant' && Array.isArray(msg.content)) {
+      const toolCalls = msg.content.filter(
+        c => typeof c === 'object' && 'type' in c && c.type === 'tool-call'
+      );
+
+      // Check if any tool calls lack results
+      const orphanedToolCalls = toolCalls.filter(toolCall => {
+        // Look ahead for matching tool result
+        for (let j = i + 1; j < cleaned.length; j++) {
+          const nextMsg = cleaned[j];
+          if (!nextMsg) continue; // Add guard for undefined
+          
+          if (nextMsg.role === 'tool' && Array.isArray(nextMsg.content)) {
+            const hasResult = nextMsg.content.some(
+              c => typeof c === 'object' && 
+                  'type' in c && 
+                  c.type === 'tool-result' && 
+                  'toolCallId' in c &&
+                  'toolCallId' in toolCall &&
+                  c.toolCallId === toolCall.toolCallId
+            );
+            if (hasResult) return false;
+          }
+        }
+        return true; // No result found
+      });
+
+      if (orphanedToolCalls.length > 0) {
+        // Remove the entire assistant message with orphaned tool calls
+        console.info(`cleanupIncompleteToolCalls: Removing assistant message with ${orphanedToolCalls.length} orphaned tool calls`, {
+          messageIndex: i,
+          orphanedToolCallIds: orphanedToolCalls.map(tc => 'toolCallId' in tc ? tc.toolCallId : 'unknown'),
+          textContent: msg.content.filter(c => typeof c === 'object' && 'type' in c && c.type === 'text').map(c => 'text' in c ? c.text : ''),
+        });
+        cleaned.splice(i, 1);
+        break; // Only clean up the last problematic message
+      }
+    }
+  }
+
+  return cleaned;
+}
+
 /**
 * Handles retry logic after a RetryWithHealingError is caught
 * Returns the healed messages and whether to continue without healing
@ -328,6 +384,23 @@ export async function handleRetryWithHealing(
  // Determine the healing strategy
  const healingStrategy = determineHealingStrategy(retryableError, context);

+  // Special handling for 529 overloaded errors
+  if (retryableError.type === 'overloaded-error') {
+    const cleanedMessages = cleanupIncompleteToolCalls(currentMessages);
+
+    console.info(`${context.currentStep}: Cleaned incomplete tool calls after 529 error`, {
+      originalCount: currentMessages.length,
+      cleanedCount: cleanedMessages.length,
+      removed: currentMessages.length - cleanedMessages.length,
+    });
+
+    return {
+      healedMessages: cleanedMessages,
+      shouldContinueWithoutHealing: false,
+      backoffDelay: calculateBackoffDelay(retryCount) * 2, // Longer backoff for overload
+    };
+  }
+
  // For network/server errors, just retry without healing
  if (shouldRetryWithoutHealing(retryableError.type)) {
    const backoffDelay =
--- a/packages/ai/src/utils/retry/types.ts
+++ b/packages/ai/src/utils/retry/types.ts
@ -7,6 +7,7 @@ export interface RetryableError {
    | 'empty-response'
    | 'rate-limit'
    | 'server-error'
+    | 'overloaded-error'
    | 'network-timeout'
    | 'stream-interruption'
    | 'json-parse-error'
@ -14,6 +15,7 @@ export interface RetryableError {
    | 'unknown-error';
  originalError?: Error | unknown;
  healingMessage: CoreMessage;
+  requiresMessageCleanup?: boolean; // New optional flag for 529 errors
 }

 /**
--- a/packages/ai/tests/utils/retry/cleanup-incomplete-tool-calls.test.ts
+++ b/packages/ai/tests/utils/retry/cleanup-incomplete-tool-calls.test.ts
@ -0,0 +1,204 @@
+import { describe, it, expect } from 'vitest';
+import type { CoreMessage } from 'ai';
+import { cleanupIncompleteToolCalls } from '../../../src/utils/retry';
+
+describe('cleanupIncompleteToolCalls', () => {
+  it('should remove assistant message with orphaned tool call', () => {
+    const messages: CoreMessage[] = [
+      {
+        role: 'user',
+        content: 'Hello'
+      },
+      {
+        role: 'assistant',
+        content: [
+          { type: 'text', text: 'Let me help' },
+          { type: 'tool-call', toolCallId: '123', toolName: 'getTodo', args: {} }
+        ]
+      }
+      // No tool result - orphaned
+    ];
+
+    const cleaned = cleanupIncompleteToolCalls(messages);
+    expect(cleaned).toHaveLength(1);
+    expect(cleaned[0]?.role).toBe('user');
+  });
+
+  it('should preserve complete tool call/result pairs', () => {
+    const messages: CoreMessage[] = [
+      {
+        role: 'assistant',
+        content: [
+          { type: 'tool-call', toolCallId: '123', toolName: 'getTodo', args: {} }
+        ]
+      },
+      {
+        role: 'tool',
+        content: [
+          { type: 'tool-result', toolCallId: '123', toolName: 'getTodo', result: { todo: 'test' } }
+        ]
+      }
+    ];
+
+    const cleaned = cleanupIncompleteToolCalls(messages);
+    expect(cleaned).toHaveLength(2);
+    expect(cleaned[0]?.role).toBe('assistant');
+    expect(cleaned[1]?.role).toBe('tool');
+  });
+
+  it('should handle mixed complete and incomplete tool calls', () => {
+    const messages: CoreMessage[] = [
+      {
+        role: 'assistant',
+        content: [
+          { type: 'tool-call', toolCallId: '123', toolName: 'getTodo', args: {} },
+          { type: 'tool-call', toolCallId: '456', toolName: 'createTodo', args: { title: 'test' } }
+        ]
+      },
+      {
+        role: 'tool',
+        content: [
+          { type: 'tool-result', toolCallId: '123', toolName: 'getTodo', result: { todo: 'test' } }
+        ]
+      }
+      // Missing result for toolCallId '456' - partially orphaned
+    ];
+
+    const cleaned = cleanupIncompleteToolCalls(messages);
+    expect(cleaned).toHaveLength(0); // Both messages removed because assistant message had orphaned call
+  });
+
+  it('should handle multiple assistant messages and only clean the last one', () => {
+    const messages: CoreMessage[] = [
+      {
+        role: 'assistant',
+        content: [
+          { type: 'tool-call', toolCallId: '111', toolName: 'getTodo', args: {} }
+        ]
+      },
+      {
+        role: 'tool',
+        content: [
+          { type: 'tool-result', toolCallId: '111', toolName: 'getTodo', result: { todo: 'first' } }
+        ]
+      },
+      {
+        role: 'user',
+        content: 'Another request'
+      },
+      {
+        role: 'assistant',
+        content: [
+          { type: 'text', text: 'Processing...' },
+          { type: 'tool-call', toolCallId: '222', toolName: 'createTodo', args: { title: 'new' } }
+        ]
+      }
+      // No result for '222' - orphaned
+    ];
+
+    const cleaned = cleanupIncompleteToolCalls(messages);
+    expect(cleaned).toHaveLength(3); // Only last assistant message removed
+    expect(cleaned[0]?.role).toBe('assistant');
+    expect(cleaned[1]?.role).toBe('tool');
+    expect(cleaned[2]?.role).toBe('user');
+  });
+
+  it('should handle assistant messages with only text content', () => {
+    const messages: CoreMessage[] = [
+      {
+        role: 'user',
+        content: 'Hello'
+      },
+      {
+        role: 'assistant',
+        content: 'Hi there, how can I help?'
+      }
+    ];
+
+    const cleaned = cleanupIncompleteToolCalls(messages);
+    expect(cleaned).toHaveLength(2); // No changes
+    expect(cleaned).toEqual(messages);
+  });
+
+  it('should handle empty message array', () => {
+    const messages: CoreMessage[] = [];
+    const cleaned = cleanupIncompleteToolCalls(messages);
+    expect(cleaned).toHaveLength(0);
+  });
+
+  it('should handle messages with mixed content types', () => {
+    const messages: CoreMessage[] = [
+      {
+        role: 'assistant',
+        content: [
+          { type: 'text', text: 'Let me search for that' },
+          { type: 'tool-call', toolCallId: '789', toolName: 'search', args: { query: 'test' } },
+          { type: 'text', text: 'Searching now...' }
+        ]
+      }
+      // No tool result - orphaned
+    ];
+
+    const cleaned = cleanupIncompleteToolCalls(messages);
+    expect(cleaned).toHaveLength(0); // Removed due to orphaned tool call
+  });
+
+  it('should handle tool results that appear before their calls', () => {
+    const messages: CoreMessage[] = [
+      {
+        role: 'tool',
+        content: [
+          { type: 'tool-result', toolCallId: '999', toolName: 'getTodo', result: { error: 'Not found' } }
+        ]
+      },
+      {
+        role: 'assistant',
+        content: [
+          { type: 'tool-call', toolCallId: '999', toolName: 'getTodo', args: {} }
+        ]
+      }
+    ];
+
+    const cleaned = cleanupIncompleteToolCalls(messages);
+    expect(cleaned).toHaveLength(1); // Assistant message removed, tool result remains
+    expect(cleaned[0]?.role).toBe('tool');
+  });
+
+  it('should handle multiple tool results in single message', () => {
+    const messages: CoreMessage[] = [
+      {
+        role: 'assistant',
+        content: [
+          { type: 'tool-call', toolCallId: 'abc', toolName: 'getTodo', args: {} },
+          { type: 'tool-call', toolCallId: 'def', toolName: 'createTodo', args: { title: 'new' } }
+        ]
+      },
+      {
+        role: 'tool',
+        content: [
+          { type: 'tool-result', toolCallId: 'abc', toolName: 'getTodo', result: { todo: 'existing' } },
+          { type: 'tool-result', toolCallId: 'def', toolName: 'createTodo', result: { id: 1, title: 'new' } }
+        ]
+      }
+    ];
+
+    const cleaned = cleanupIncompleteToolCalls(messages);
+    expect(cleaned).toHaveLength(2); // All complete, no changes
+    expect(cleaned).toEqual(messages);
+  });
+
+  it('should handle assistant message with no tool calls', () => {
+    const messages: CoreMessage[] = [
+      {
+        role: 'assistant',
+        content: [
+          { type: 'text', text: 'Here is your answer' }
+        ]
+      }
+    ];
+
+    const cleaned = cleanupIncompleteToolCalls(messages);
+    expect(cleaned).toHaveLength(1); // No changes
+    expect(cleaned).toEqual(messages);
+  });
+});
--- a/packages/ai/tests/utils/retry/retry-529-handling.test.ts
+++ b/packages/ai/tests/utils/retry/retry-529-handling.test.ts
@ -0,0 +1,203 @@
+import { describe, it, expect, vi } from 'vitest';
+import type { CoreMessage } from 'ai';
+import { APICallError } from 'ai';
+import { detectRetryableError, handleRetryWithHealing } from '../../../src/utils/retry';
+import type { RetryableError, WorkflowContext } from '../../../src/utils/retry';
+
+describe('529 error handling', () => {
+  it('should detect 529 as overloaded-error type', () => {
+    const error = new APICallError({
+      message: 'Server overloaded',
+      statusCode: 529,
+      responseHeaders: {},
+      responseBody: 'Server is overloaded, please try again',
+      url: 'https://api.example.com',
+      requestBodyValues: {}
+    });
+
+    const result = detectRetryableError(error);
+    expect(result).not.toBeNull();
+    expect(result?.type).toBe('overloaded-error');
+    expect(result?.requiresMessageCleanup).toBe(true);
+    expect(result?.healingMessage.role).toBe('user');
+    expect(result?.healingMessage.content).toContain('Server overloaded (529)');
+  });
+
+  it('should apply cleanup for overloaded errors in handleRetryWithHealing', async () => {
+    const messagesWithOrphan: CoreMessage[] = [
+      {
+        role: 'user',
+        content: 'Please analyze this data'
+      },
+      {
+        role: 'assistant',
+        content: [
+          { type: 'text', text: 'Let me analyze that for you' },
+          { type: 'tool-call', toolCallId: 'tc-123', toolName: 'analyzeData', args: { data: 'test' } }
+        ]
+      }
+      // No tool result - connection interrupted
+    ];
+
+    const retryableError: RetryableError = {
+      type: 'overloaded-error',
+      originalError: new Error('529'),
+      healingMessage: { role: 'user', content: 'Server overloaded (529). Retrying after cleanup...' },
+      requiresMessageCleanup: true
+    };
+
+    const context: WorkflowContext = { 
+      currentStep: 'analyst',
+      availableTools: new Set(['analyzeData', 'createMetrics'])
+    };
+
+    const result = await handleRetryWithHealing(
+      retryableError,
+      messagesWithOrphan,
+      0,
+      context
+    );
+
+    expect(result.healedMessages).toHaveLength(1); // Only user message remains
+    expect(result.healedMessages[0]?.role).toBe('user');
+    expect(result.shouldContinueWithoutHealing).toBe(false);
+    expect(result.backoffDelay).toBeGreaterThan(0);
+  });
+
+  it('should handle 529 differently from other 5xx errors', () => {
+    const error529 = new APICallError({
+      message: 'Server overloaded',
+      statusCode: 529,
+      responseHeaders: {},
+      responseBody: 'Overloaded',
+      url: 'https://api.example.com',
+      requestBodyValues: {}
+    });
+
+    const error500 = new APICallError({
+      message: 'Internal server error',
+      statusCode: 500,
+      responseHeaders: {},
+      responseBody: 'Internal error',
+      url: 'https://api.example.com',
+      requestBodyValues: {}
+    });
+
+    const result529 = detectRetryableError(error529);
+    const result500 = detectRetryableError(error500);
+
+    expect(result529?.type).toBe('overloaded-error');
+    expect(result529?.requiresMessageCleanup).toBe(true);
+
+    expect(result500?.type).toBe('server-error');
+    expect(result500?.requiresMessageCleanup).toBeUndefined();
+  });
+
+  it('should use longer backoff for 529 errors', async () => {
+    const messages: CoreMessage[] = [
+      { role: 'user', content: 'Test' }
+    ];
+
+    const retryableError: RetryableError = {
+      type: 'overloaded-error',
+      originalError: new Error('529'),
+      healingMessage: { role: 'user', content: 'Retrying...' }
+    };
+
+    const context: WorkflowContext = { 
+      currentStep: 'analyst', // Changed from 'test' to valid value
+      availableTools: new Set()
+    };
+
+    const result = await handleRetryWithHealing(
+      retryableError,
+      messages,
+      1, // retryCount
+      context
+    );
+
+    // Backoff should be multiplied by 2 for overloaded errors
+    expect(result.backoffDelay).toBeGreaterThan(1000); // Base backoff is usually 1000ms for retry 1
+  });
+
+  it('should log cleanup details', async () => {
+    const consoleSpy = vi.spyOn(console, 'info');
+    
+    const messagesWithMultipleOrphans: CoreMessage[] = [
+      {
+        role: 'assistant',
+        content: [
+          { type: 'tool-call', toolCallId: 'tc-1', toolName: 'tool1', args: {} },
+          { type: 'tool-call', toolCallId: 'tc-2', toolName: 'tool2', args: {} }
+        ]
+      }
+    ];
+
+    const retryableError: RetryableError = {
+      type: 'overloaded-error',
+      originalError: new Error('529'),
+      healingMessage: { role: 'user', content: 'Retrying...' }
+    };
+
+    const context: WorkflowContext = { 
+      currentStep: 'analyst',
+      availableTools: new Set()
+    };
+
+    await handleRetryWithHealing(
+      retryableError,
+      messagesWithMultipleOrphans,
+      0,
+      context
+    );
+
+    expect(consoleSpy).toHaveBeenCalledWith(
+      'analyst: Cleaned incomplete tool calls after 529 error',
+      expect.objectContaining({
+        originalCount: 1,
+        cleanedCount: 0,
+        removed: 1
+      })
+    );
+
+    consoleSpy.mockRestore();
+  });
+
+  it('should preserve messages when no cleanup is needed', async () => {
+    const completeMessages: CoreMessage[] = [
+      {
+        role: 'assistant',
+        content: [
+          { type: 'tool-call', toolCallId: 'tc-123', toolName: 'getTodo', args: {} }
+        ]
+      },
+      {
+        role: 'tool',
+        content: [
+          { type: 'tool-result', toolCallId: 'tc-123', toolName: 'getTodo', result: { todo: 'test' } }
+        ]
+      }
+    ];
+
+    const retryableError: RetryableError = {
+      type: 'overloaded-error',
+      originalError: new Error('529'),
+      healingMessage: { role: 'user', content: 'Retrying...' }
+    };
+
+    const context: WorkflowContext = { 
+      currentStep: 'analyst',
+      availableTools: new Set()
+    };
+
+    const result = await handleRetryWithHealing(
+      retryableError,
+      completeMessages,
+      0,
+      context
+    );
+
+    expect(result.healedMessages).toHaveLength(2); // No cleanup needed
+    expect(result.healedMessages).toEqual(completeMessages);
+  });
+});