From 7988ccbb5898998b3f531b8fd5c79dddbdf9c4d4 Mon Sep 17 00:00:00 2001
From: Cursor Agent <cursoragent@cursor.com>
Date: Mon, 14 Jul 2025 17:09:54 +0000
Subject: [PATCH] Add 529 error handling with tool call cleanup and backoff
 strategy

Co-authored-by: dallin <dallin@buster.so>
---
 .../ai/src/utils/retry/healing-strategies.ts  |   8 +
 packages/ai/src/utils/retry/index.ts          |   1 +
 .../ai/src/utils/retry/retry-agent-stream.ts  |  13 ++
 packages/ai/src/utils/retry/retry-helpers.ts  |  73 +++++++
 packages/ai/src/utils/retry/types.ts          |   2 +
 .../cleanup-incomplete-tool-calls.test.ts     | 204 ++++++++++++++++++
 .../utils/retry/retry-529-handling.test.ts    | 203 +++++++++++++++++
 7 files changed, 504 insertions(+)
 create mode 100644 packages/ai/tests/utils/retry/cleanup-incomplete-tool-calls.test.ts
 create mode 100644 packages/ai/tests/utils/retry/retry-529-handling.test.ts

diff --git a/packages/ai/src/utils/retry/healing-strategies.ts b/packages/ai/src/utils/retry/healing-strategies.ts
index 73d4de7db..0df644584 100644
--- a/packages/ai/src/utils/retry/healing-strategies.ts
+++ b/packages/ai/src/utils/retry/healing-strategies.ts
@@ -43,6 +43,14 @@ export function determineHealingStrategy(
       };
     }
 
+    // 529 Overloaded errors - special handling, cleanup is done separately
+    case 'overloaded-error':
+      return {
+        shouldRemoveLastAssistantMessage: false,
+        healingMessage: null, // No healing message needed, cleanup handles it
+        backoffMultiplier: 2, // Longer backoff for overload
+      };
+
     // Network/server errors - just retry with backoff
     case 'network-timeout':
     case 'server-error':
diff --git a/packages/ai/src/utils/retry/index.ts b/packages/ai/src/utils/retry/index.ts
index 91392b4fc..c0a18e17b 100644
--- a/packages/ai/src/utils/retry/index.ts
+++ b/packages/ai/src/utils/retry/index.ts
@@ -10,6 +10,7 @@ export {
   logRetryInfo,
   logMessagesAfterHealing,
   handleRetryWithHealing,
+  cleanupIncompleteToolCalls,
 } from './retry-helpers';
 export {
   determineHealingStrategy,
diff --git a/packages/ai/src/utils/retry/retry-agent-stream.ts b/packages/ai/src/utils/retry/retry-agent-stream.ts
index 7c9ee3f9e..bf31c02c5 100644
--- a/packages/ai/src/utils/retry/retry-agent-stream.ts
+++ b/packages/ai/src/utils/retry/retry-agent-stream.ts
@@ -202,6 +202,19 @@ export function detectRetryableError(
 
     // Server errors (5xx)
     if (error.statusCode && error.statusCode >= 500 && error.statusCode < 600) {
+      // Special handling for 529 overloaded errors
+      if (error.statusCode === 529) {
+        return {
+          type: 'overloaded-error',
+          originalError: error,
+          healingMessage: {
+            role: 'user',
+            content: `Server overloaded (529). Retrying after cleanup...`,
+          },
+          requiresMessageCleanup: true, // New flag
+        };
+      }
+
       return {
         type: 'server-error',
         originalError: error,
diff --git a/packages/ai/src/utils/retry/retry-helpers.ts b/packages/ai/src/utils/retry/retry-helpers.ts
index 6fb8dcb28..1978ad85f 100644
--- a/packages/ai/src/utils/retry/retry-helpers.ts
+++ b/packages/ai/src/utils/retry/retry-helpers.ts
@@ -311,6 +311,62 @@ export function logMessagesAfterHealing(
   });
 }
 
+/**
+ * Cleans up incomplete tool calls from message history
+ * This is specifically for handling 529 errors that disconnect mid-stream
+ * while generating tool calls, leaving orphaned tool calls without results
+ */
+export function cleanupIncompleteToolCalls(messages: CoreMessage[]): CoreMessage[] {
+  const cleaned = [...messages];
+
+  // Find the last assistant message
+  for (let i = cleaned.length - 1; i >= 0; i--) {
+    const msg = cleaned[i];
+    if (!msg) continue; // Add guard for undefined
+
+    if (msg.role === 'assistant' && Array.isArray(msg.content)) {
+      const toolCalls = msg.content.filter(
+        c => typeof c === 'object' && 'type' in c && c.type === 'tool-call'
+      );
+
+      // Check if any tool calls lack results
+      const orphanedToolCalls = toolCalls.filter(toolCall => {
+        // Look ahead for matching tool result
+        for (let j = i + 1; j < cleaned.length; j++) {
+          const nextMsg = cleaned[j];
+          if (!nextMsg) continue; // Add guard for undefined
+          
+          if (nextMsg.role === 'tool' && Array.isArray(nextMsg.content)) {
+            const hasResult = nextMsg.content.some(
+              c => typeof c === 'object' && 
+                  'type' in c && 
+                  c.type === 'tool-result' && 
+                  'toolCallId' in c &&
+                  'toolCallId' in toolCall &&
+                  c.toolCallId === toolCall.toolCallId
+            );
+            if (hasResult) return false;
+          }
+        }
+        return true; // No result found
+      });
+
+      if (orphanedToolCalls.length > 0) {
+        // Remove the entire assistant message with orphaned tool calls
+        console.info(`cleanupIncompleteToolCalls: Removing assistant message with ${orphanedToolCalls.length} orphaned tool calls`, {
+          messageIndex: i,
+          orphanedToolCallIds: orphanedToolCalls.map(tc => 'toolCallId' in tc ? tc.toolCallId : 'unknown'),
+          textContent: msg.content.filter(c => typeof c === 'object' && 'type' in c && c.type === 'text').map(c => 'text' in c ? c.text : ''),
+        });
+        cleaned.splice(i, 1);
+        break; // Only clean up the last problematic message
+      }
+    }
+  }
+
+  return cleaned;
+}
+
 /**
  * Handles retry logic after a RetryWithHealingError is caught
  * Returns the healed messages and whether to continue without healing
@@ -328,6 +384,23 @@ export async function handleRetryWithHealing(
   // Determine the healing strategy
   const healingStrategy = determineHealingStrategy(retryableError, context);
 
+  // Special handling for 529 overloaded errors
+  if (retryableError.type === 'overloaded-error') {
+    const cleanedMessages = cleanupIncompleteToolCalls(currentMessages);
+
+    console.info(`${context.currentStep}: Cleaned incomplete tool calls after 529 error`, {
+      originalCount: currentMessages.length,
+      cleanedCount: cleanedMessages.length,
+      removed: currentMessages.length - cleanedMessages.length,
+    });
+
+    return {
+      healedMessages: cleanedMessages,
+      shouldContinueWithoutHealing: false,
+      backoffDelay: calculateBackoffDelay(retryCount) * 2, // Longer backoff for overload
+    };
+  }
+
   // For network/server errors, just retry without healing
   if (shouldRetryWithoutHealing(retryableError.type)) {
     const backoffDelay =
diff --git a/packages/ai/src/utils/retry/types.ts b/packages/ai/src/utils/retry/types.ts
index c15cf0628..f80cc6ee2 100644
--- a/packages/ai/src/utils/retry/types.ts
+++ b/packages/ai/src/utils/retry/types.ts
@@ -7,6 +7,7 @@ export interface RetryableError {
     | 'empty-response'
     | 'rate-limit'
     | 'server-error'
+    | 'overloaded-error'
     | 'network-timeout'
     | 'stream-interruption'
     | 'json-parse-error'
@@ -14,6 +15,7 @@ export interface RetryableError {
     | 'unknown-error';
   originalError?: Error | unknown;
   healingMessage: CoreMessage;
+  requiresMessageCleanup?: boolean; // New optional flag for 529 errors
 }
 
 /**
diff --git a/packages/ai/tests/utils/retry/cleanup-incomplete-tool-calls.test.ts b/packages/ai/tests/utils/retry/cleanup-incomplete-tool-calls.test.ts
new file mode 100644
index 000000000..24a3e95a4
--- /dev/null
+++ b/packages/ai/tests/utils/retry/cleanup-incomplete-tool-calls.test.ts
@@ -0,0 +1,204 @@
+import { describe, it, expect } from 'vitest';
+import type { CoreMessage } from 'ai';
+import { cleanupIncompleteToolCalls } from '../../../src/utils/retry';
+
+describe('cleanupIncompleteToolCalls', () => {
+  it('should remove assistant message with orphaned tool call', () => {
+    const messages: CoreMessage[] = [
+      {
+        role: 'user',
+        content: 'Hello'
+      },
+      {
+        role: 'assistant',
+        content: [
+          { type: 'text', text: 'Let me help' },
+          { type: 'tool-call', toolCallId: '123', toolName: 'getTodo', args: {} }
+        ]
+      }
+      // No tool result - orphaned
+    ];
+
+    const cleaned = cleanupIncompleteToolCalls(messages);
+    expect(cleaned).toHaveLength(1);
+    expect(cleaned[0]?.role).toBe('user');
+  });
+
+  it('should preserve complete tool call/result pairs', () => {
+    const messages: CoreMessage[] = [
+      {
+        role: 'assistant',
+        content: [
+          { type: 'tool-call', toolCallId: '123', toolName: 'getTodo', args: {} }
+        ]
+      },
+      {
+        role: 'tool',
+        content: [
+          { type: 'tool-result', toolCallId: '123', toolName: 'getTodo', result: { todo: 'test' } }
+        ]
+      }
+    ];
+
+    const cleaned = cleanupIncompleteToolCalls(messages);
+    expect(cleaned).toHaveLength(2);
+    expect(cleaned[0]?.role).toBe('assistant');
+    expect(cleaned[1]?.role).toBe('tool');
+  });
+
+  it('should handle mixed complete and incomplete tool calls', () => {
+    const messages: CoreMessage[] = [
+      {
+        role: 'assistant',
+        content: [
+          { type: 'tool-call', toolCallId: '123', toolName: 'getTodo', args: {} },
+          { type: 'tool-call', toolCallId: '456', toolName: 'createTodo', args: { title: 'test' } }
+        ]
+      },
+      {
+        role: 'tool',
+        content: [
+          { type: 'tool-result', toolCallId: '123', toolName: 'getTodo', result: { todo: 'test' } }
+        ]
+      }
+      // Missing result for toolCallId '456' - partially orphaned
+    ];
+
+    const cleaned = cleanupIncompleteToolCalls(messages);
+    expect(cleaned).toHaveLength(0); // Both messages removed because assistant message had orphaned call
+  });
+
+  it('should handle multiple assistant messages and only clean the last one', () => {
+    const messages: CoreMessage[] = [
+      {
+        role: 'assistant',
+        content: [
+          { type: 'tool-call', toolCallId: '111', toolName: 'getTodo', args: {} }
+        ]
+      },
+      {
+        role: 'tool',
+        content: [
+          { type: 'tool-result', toolCallId: '111', toolName: 'getTodo', result: { todo: 'first' } }
+        ]
+      },
+      {
+        role: 'user',
+        content: 'Another request'
+      },
+      {
+        role: 'assistant',
+        content: [
+          { type: 'text', text: 'Processing...' },
+          { type: 'tool-call', toolCallId: '222', toolName: 'createTodo', args: { title: 'new' } }
+        ]
+      }
+      // No result for '222' - orphaned
+    ];
+
+    const cleaned = cleanupIncompleteToolCalls(messages);
+    expect(cleaned).toHaveLength(3); // Only last assistant message removed
+    expect(cleaned[0]?.role).toBe('assistant');
+    expect(cleaned[1]?.role).toBe('tool');
+    expect(cleaned[2]?.role).toBe('user');
+  });
+
+  it('should handle assistant messages with only text content', () => {
+    const messages: CoreMessage[] = [
+      {
+        role: 'user',
+        content: 'Hello'
+      },
+      {
+        role: 'assistant',
+        content: 'Hi there, how can I help?'
+      }
+    ];
+
+    const cleaned = cleanupIncompleteToolCalls(messages);
+    expect(cleaned).toHaveLength(2); // No changes
+    expect(cleaned).toEqual(messages);
+  });
+
+  it('should handle empty message array', () => {
+    const messages: CoreMessage[] = [];
+    const cleaned = cleanupIncompleteToolCalls(messages);
+    expect(cleaned).toHaveLength(0);
+  });
+
+  it('should handle messages with mixed content types', () => {
+    const messages: CoreMessage[] = [
+      {
+        role: 'assistant',
+        content: [
+          { type: 'text', text: 'Let me search for that' },
+          { type: 'tool-call', toolCallId: '789', toolName: 'search', args: { query: 'test' } },
+          { type: 'text', text: 'Searching now...' }
+        ]
+      }
+      // No tool result - orphaned
+    ];
+
+    const cleaned = cleanupIncompleteToolCalls(messages);
+    expect(cleaned).toHaveLength(0); // Removed due to orphaned tool call
+  });
+
+  it('should handle tool results that appear before their calls', () => {
+    const messages: CoreMessage[] = [
+      {
+        role: 'tool',
+        content: [
+          { type: 'tool-result', toolCallId: '999', toolName: 'getTodo', result: { error: 'Not found' } }
+        ]
+      },
+      {
+        role: 'assistant',
+        content: [
+          { type: 'tool-call', toolCallId: '999', toolName: 'getTodo', args: {} }
+        ]
+      }
+    ];
+
+    const cleaned = cleanupIncompleteToolCalls(messages);
+    expect(cleaned).toHaveLength(1); // Assistant message removed, tool result remains
+    expect(cleaned[0]?.role).toBe('tool');
+  });
+
+  it('should handle multiple tool results in single message', () => {
+    const messages: CoreMessage[] = [
+      {
+        role: 'assistant',
+        content: [
+          { type: 'tool-call', toolCallId: 'abc', toolName: 'getTodo', args: {} },
+          { type: 'tool-call', toolCallId: 'def', toolName: 'createTodo', args: { title: 'new' } }
+        ]
+      },
+      {
+        role: 'tool',
+        content: [
+          { type: 'tool-result', toolCallId: 'abc', toolName: 'getTodo', result: { todo: 'existing' } },
+          { type: 'tool-result', toolCallId: 'def', toolName: 'createTodo', result: { id: 1, title: 'new' } }
+        ]
+      }
+    ];
+
+    const cleaned = cleanupIncompleteToolCalls(messages);
+    expect(cleaned).toHaveLength(2); // All complete, no changes
+    expect(cleaned).toEqual(messages);
+  });
+
+  it('should handle assistant message with no tool calls', () => {
+    const messages: CoreMessage[] = [
+      {
+        role: 'assistant',
+        content: [
+          { type: 'text', text: 'Here is your answer' }
+        ]
+      }
+    ];
+
+    const cleaned = cleanupIncompleteToolCalls(messages);
+    expect(cleaned).toHaveLength(1); // No changes
+    expect(cleaned).toEqual(messages);
+  });
+});
\ No newline at end of file
diff --git a/packages/ai/tests/utils/retry/retry-529-handling.test.ts b/packages/ai/tests/utils/retry/retry-529-handling.test.ts
new file mode 100644
index 000000000..6baccb26c
--- /dev/null
+++ b/packages/ai/tests/utils/retry/retry-529-handling.test.ts
@@ -0,0 +1,203 @@
+import { describe, it, expect, vi } from 'vitest';
+import type { CoreMessage } from 'ai';
+import { APICallError } from 'ai';
+import { detectRetryableError, handleRetryWithHealing } from '../../../src/utils/retry';
+import type { RetryableError, WorkflowContext } from '../../../src/utils/retry';
+
+describe('529 error handling', () => {
+  it('should detect 529 as overloaded-error type', () => {
+    const error = new APICallError({
+      message: 'Server overloaded',
+      statusCode: 529,
+      responseHeaders: {},
+      responseBody: 'Server is overloaded, please try again',
+      url: 'https://api.example.com',
+      requestBodyValues: {}
+    });
+
+    const result = detectRetryableError(error);
+    expect(result).not.toBeNull();
+    expect(result?.type).toBe('overloaded-error');
+    expect(result?.requiresMessageCleanup).toBe(true);
+    expect(result?.healingMessage.role).toBe('user');
+    expect(result?.healingMessage.content).toContain('Server overloaded (529)');
+  });
+
+  it('should apply cleanup for overloaded errors in handleRetryWithHealing', async () => {
+    const messagesWithOrphan: CoreMessage[] = [
+      {
+        role: 'user',
+        content: 'Please analyze this data'
+      },
+      {
+        role: 'assistant',
+        content: [
+          { type: 'text', text: 'Let me analyze that for you' },
+          { type: 'tool-call', toolCallId: 'tc-123', toolName: 'analyzeData', args: { data: 'test' } }
+        ]
+      }
+      // No tool result - connection interrupted
+    ];
+
+    const retryableError: RetryableError = {
+      type: 'overloaded-error',
+      originalError: new Error('529'),
+      healingMessage: { role: 'user', content: 'Server overloaded (529). Retrying after cleanup...' },
+      requiresMessageCleanup: true
+    };
+
+    const context: WorkflowContext = { 
+      currentStep: 'analyst',
+      availableTools: new Set(['analyzeData', 'createMetrics'])
+    };
+
+    const result = await handleRetryWithHealing(
+      retryableError,
+      messagesWithOrphan,
+      0,
+      context
+    );
+
+    expect(result.healedMessages).toHaveLength(1); // Only user message remains
+    expect(result.healedMessages[0]?.role).toBe('user');
+    expect(result.shouldContinueWithoutHealing).toBe(false);
+    expect(result.backoffDelay).toBeGreaterThan(0);
+  });
+
+  it('should handle 529 differently from other 5xx errors', () => {
+    const error529 = new APICallError({
+      message: 'Server overloaded',
+      statusCode: 529,
+      responseHeaders: {},
+      responseBody: 'Overloaded',
+      url: 'https://api.example.com',
+      requestBodyValues: {}
+    });
+
+    const error500 = new APICallError({
+      message: 'Internal server error',
+      statusCode: 500,
+      responseHeaders: {},
+      responseBody: 'Internal error',
+      url: 'https://api.example.com',
+      requestBodyValues: {}
+    });
+
+    const result529 = detectRetryableError(error529);
+    const result500 = detectRetryableError(error500);
+
+    expect(result529?.type).toBe('overloaded-error');
+    expect(result529?.requiresMessageCleanup).toBe(true);
+
+    expect(result500?.type).toBe('server-error');
+    expect(result500?.requiresMessageCleanup).toBeUndefined();
+  });
+
+  it('should use longer backoff for 529 errors', async () => {
+    const messages: CoreMessage[] = [
+      { role: 'user', content: 'Test' }
+    ];
+
+    const retryableError: RetryableError = {
+      type: 'overloaded-error',
+      originalError: new Error('529'),
+      healingMessage: { role: 'user', content: 'Retrying...' }
+    };
+
+    const context: WorkflowContext = { 
+      currentStep: 'analyst', // Changed from 'test' to valid value
+      availableTools: new Set()
+    };
+
+    const result = await handleRetryWithHealing(
+      retryableError,
+      messages,
+      1, // retryCount
+      context
+    );
+
+    // Backoff should be multiplied by 2 for overloaded errors
+    expect(result.backoffDelay).toBeGreaterThan(1000); // Base backoff is usually 1000ms for retry 1
+  });
+
+  it('should log cleanup details', async () => {
+    const consoleSpy = vi.spyOn(console, 'info');
+    
+    const messagesWithMultipleOrphans: CoreMessage[] = [
+      {
+        role: 'assistant',
+        content: [
+          { type: 'tool-call', toolCallId: 'tc-1', toolName: 'tool1', args: {} },
+          { type: 'tool-call', toolCallId: 'tc-2', toolName: 'tool2', args: {} }
+        ]
+      }
+    ];
+
+    const retryableError: RetryableError = {
+      type: 'overloaded-error',
+      originalError: new Error('529'),
+      healingMessage: { role: 'user', content: 'Retrying...' }
+    };
+
+    const context: WorkflowContext = { 
+      currentStep: 'analyst',
+      availableTools: new Set()
+    };
+
+    await handleRetryWithHealing(
+      retryableError,
+      messagesWithMultipleOrphans,
+      0,
+      context
+    );
+
+    expect(consoleSpy).toHaveBeenCalledWith(
+      'analyst: Cleaned incomplete tool calls after 529 error',
+      expect.objectContaining({
+        originalCount: 1,
+        cleanedCount: 0,
+        removed: 1
+      })
+    );
+
+    consoleSpy.mockRestore();
+  });
+
+  it('should preserve messages when no cleanup is needed', async () => {
+    const completeMessages: CoreMessage[] = [
+      {
+        role: 'assistant',
+        content: [
+          { type: 'tool-call', toolCallId: 'tc-123', toolName: 'getTodo', args: {} }
+        ]
+      },
+      {
+        role: 'tool',
+        content: [
+          { type: 'tool-result', toolCallId: 'tc-123', toolName: 'getTodo', result: { todo: 'test' } }
+        ]
+      }
+    ];
+
+    const retryableError: RetryableError = {
+      type: 'overloaded-error',
+      originalError: new Error('529'),
+      healingMessage: { role: 'user', content: 'Retrying...' }
+    };
+
+    const context: WorkflowContext = { 
+      currentStep: 'analyst',
+      availableTools: new Set()
+    };
+
+    const result = await handleRetryWithHealing(
+      retryableError,
+      completeMessages,
+      0,
+      context
+    );
+
+    expect(result.healedMessages).toHaveLength(2); // No cleanup needed
+    expect(result.healedMessages).toEqual(completeMessages);
+  });
+});
\ No newline at end of file