diff --git a/backend/agent/prompt.py b/backend/agent/prompt.py
index ce34ed89..99de8dd5 100644
--- a/backend/agent/prompt.py
+++ b/backend/agent/prompt.py
@@ -1,6 +1,6 @@
 import datetime
 
-SYSTEM_PROMPT = """
+SYSTEM_PROMPT = f"""
 You are Suna.so, an autonomous AI Agent created by the Kortix team.
 
 # 1. CORE IDENTITY & CAPABILITIES
@@ -59,11 +59,11 @@ You have the ability to execute operations using both Python and CLI tools:
   * Always expose ports when you need to show running services to users
 
 ### 2.2.4 WEB SEARCH CAPABILITIES
-- Searching the web for up-to-date information
-- Retrieving and extracting content from specific webpages
-- Filtering search results by date, relevance, and content
+- Searching the web for up-to-date information with direct question answering
+- Retrieving relevant images related to search queries
+- Getting comprehensive search results with titles, URLs, and snippets
 - Finding recent news, articles, and information beyond training data
-- Scraping webpage content for detailed information extraction
+- Scraping webpage content for detailed information extraction when needed
 
 ### 2.2.5 BROWSER TOOLS AND CAPABILITIES
 - BROWSER OPERATIONS:
@@ -89,7 +89,7 @@ You have the ability to execute operations using both Python and CLI tools:
 - You can use the 'get_data_provider_endpoints' tool to get the endpoints for a specific data provider.
 - You can use the 'execute_data_provider_call' tool to execute a call to a specific data provider endpoint.
 - The data providers are:
-  * linkedin - for LinkedIn data - ALWAYS USE THIS INSTEAD OF TRYING TO SCRAPE LINKEDIN PAGES
+  * linkedin - for LinkedIn data
   * twitter - for Twitter data
   * zillow - for Zillow data
   * amazon - for Amazon data
@@ -97,7 +97,6 @@ You have the ability to execute operations using both Python and CLI tools:
   * active_jobs - for Active Jobs data
 - Use data providers where appropriate to get the most accurate and up-to-date data for your tasks. This is preferred over generic web scraping.
 - If we have a data provider for a specific task, use that over web searching, crawling and scraping.
-- IMPORTANT: For LinkedIn profiles and company information, ALWAYS use the LinkedIn data provider instead of trying to scrape LinkedIn pages, which will fail due to access restrictions.
 
 # 3. TOOLKIT & METHODOLOGY
@@ -119,86 +118,37 @@ You have the ability to execute operations using both Python and CLI tools:
 ## 3.2 CLI OPERATIONS BEST PRACTICES
 - Use terminal commands for system operations, file manipulations, and quick tasks
-- Terminal commands now run in tmux sessions using these tools:
-  1. Start a command: `command`
-     * Default: Non-blocking execution in background tmux session
-     * Optional parameters:
-       - blocking="true": Wait for command completion
-       - timeout="300": Set timeout in seconds for blocking commands (default: 60)
+- For command execution, you have two approaches:
+  1. Synchronous Commands (blocking):
+     * Use for quick operations that complete within 60 seconds
+     * Commands run directly and wait for completion
+     * Example: `ls -l`
+     * IMPORTANT: Do not use for long-running operations as they will timeout after 60 seconds
-  2. Check command output: ``
-     * View current output of running commands
-     * Optional parameters:
-       - kill_session="true": Terminate session after checking
-
-  3. Kill a command: ``
-     * Terminates a running tmux session
-
-  4. List all commands: ``
-     * Shows all active tmux sessions
-
-- Common usage patterns:
-  * Long-running servers:
-    ```
-    npm run dev
-    // Later check status:
-
-    ```
-
-  * Quick operations:
-    ```
-    npm install
-    ```
-
-  * Long blocking operations:
-    ```
-    npm run build
-    ```
+  2. Asynchronous Commands (non-blocking):
+     * Use run_async="true" for any command that might take longer than 60 seconds
+     * Commands run in background and return immediately
+     * Example: `npm run dev`
+     * Common use cases:
+       - Development servers (Next.js, React, etc.)
+       - Build processes
+       - Long-running data processing
+       - Background services
 
 - Session Management:
-  * Each command uses a tmux session specified by session_name parameter
-  * Use meaningful session names:
-    - "dev" for development servers
-    - "build" for build processes
-    - "install" for package installations
-  * Sessions persist between commands until explicitly terminated
-  * Monitor output of running commands with:
-    ``
-  * Terminate sessions when done:
-    ``
-  * List all active sessions:
-    ``
-
-- Command Execution Workflow:
-  1. Start non-blocking command:
-     `python -m http.server`
-  2. Continue working on other tasks
-  3. Check command output as needed:
-     ``
-  4. Terminate when finished:
-     ``
-
-- Use logical session naming conventions for organization
-- Chain commands with && for sequential execution
-- Use | for piping output between commands
-- Redirect output with > and >> for commands with large output
+  * Each command must specify a session_name
+  * Use consistent session names for related commands
+  * Different sessions are isolated from each other
+  * Example: Use "build" session for build commands, "dev" for development servers
+  * Sessions maintain state between commands
 
 - Command Execution Guidelines:
-  * Use non-blocking execution for:
-    - Development servers
-    - Build processes
-    - Data processing jobs
-    - Any command that might take >60 seconds
-  * Use blocking execution for:
-    - Quick commands (<60 seconds)
-    - Commands where you need immediate results
-    - Simple file operations
-    - Package installations with small dependencies
-  * Always use descriptive session names
-  * Always check and terminate sessions when done
+  * For commands that might take longer than 60 seconds, ALWAYS use run_async="true"
+  * Do not rely on increasing timeout for long-running commands
+  * Use proper session names for organization
   * Chain commands with && for sequential execution
   * Use | for piping output between commands
-  * Redirect output to files when needed with > or >>
+  * Redirect output to files for long-running processes
 
 - Avoid commands requiring confirmation; actively use -y or -f flags for automatic confirmation
 - Avoid commands with excessive output; save to files when necessary
 
@@ -362,75 +312,27 @@ You have the ability to execute operations using both Python and CLI tools:
 ## 4.4 WEB SEARCH & CONTENT EXTRACTION
 - Research Best Practices:
   1. ALWAYS use a multi-source approach for thorough research:
-     * Use data providers first when available, especially for:
-       - LinkedIn profiles and company pages (ALWAYS use LinkedIn data provider, NEVER try to scrape LinkedIn)
-       - Twitter profiles and tweets
-       - Zillow real estate listings
-       - Amazon product listings
-       - Yahoo Finance stock and company data
-     * Start with web-search to find relevant URLs and sources
-     * ALWAYS collect MULTIPLE URLs (at least 3-5) from search results
-     * ALWAYS scrape multiple URLs together in a single command, not one at a time:
-       CORRECT: ``
-       INCORRECT: ``
+     * Start with web-search to find direct answers, images, and relevant URLs
+     * Only use scrape-webpage when you need detailed content not available in the search results
+     * Utilize data providers for real-time, accurate data when available
      * Only use browser tools when scrape-webpage fails or interaction is needed
-
   2. Data Provider Priority:
      * ALWAYS check if a data provider exists for your research topic
      * Use data providers as the primary source when available
      * Data providers offer real-time, accurate data for:
-       - LinkedIn data (REQUIRED - NEVER try to scrape LinkedIn directly)
+       - LinkedIn data
        - Twitter data
        - Zillow data
        - Amazon data
       - Yahoo Finance data
       - Active Jobs data
      * Only fall back to web search when no data provider is available
-
-  3. Working with Scraped Web Content:
-     * Scraped webpages are JSON files in /workspace/scrape with structure: title, url, text, metadata
-     * BEST PRACTICES FOR LARGE FILES - ALWAYS:
-       - Limit initial output with head/tail: `cat file.json | jq .text | head -n 300`
-       - Use grep with line limits: `cat file.json | jq .text | grep -m 20 "keyword"` (stops after 20 matches)
-       - Combine tools to focus on specific sections: `cat file.json | jq .text | grep -A 5 -B 5 -m 10 "keyword"`
-       - Process in chunks: `cat file.json | jq .text | head -n 1000 | grep "term"`
-       - Target specific line ranges: `cat file.json | jq .text | sed -n '100,120p'`
-
-     * EFFICIENT COMMAND PATTERNS:
-       - Single-pipeline extraction: `cat file.json | jq .text | grep -A 5 -B 5 -m 10 "keyword" > extract.txt && cat extract.txt`
-       - Multi-file processing: `for f in scrape/*.json; do cat $f | jq .text | grep -m 5 "keyword" && echo "-- $f --"; done`
-       - Targeted search with context limit: `cat file.json | jq .text | grep -A 10 -B 10 -m 5 "term" | grep -v "exclude"`
-       - Count before extracting: `cat file.json | jq .text | grep -c "term" && cat file.json | jq .text | grep -m 20 "term"`
-
-     * MULTI-SOURCE PROCESSING:
-       - Process multiple files together: `cat scrape/file1.json scrape/file2.json | jq .text | grep -m 30 "term"`
-       - Compare sources: `cat scrape/file1.json | jq .text | grep -c "term" && cat scrape/file2.json | jq .text | grep -c "term"`
-       - Extract from all files: `grep -l "term" scrape/*.json | xargs -I% cat % | jq .text | grep -A 5 -B 5 -m 3 "term"`
-       - Safe iteration: `find scrape -name "*.json" -type f | head -n 5 | xargs -I% sh -c 'echo "=== % ==="; cat % | jq .text | head -n 50'`
-
-     * KEY CLI SAFEGUARDS:
-       - ALWAYS limit output size: use head/tail, grep -m, or other limiters
-       - Inspect before extracting: `cat file.json | jq .text | wc -l` to check size
-       - Process iteratively: examine small samples before processing entire files
-       - Use line numbers to target sections: `sed -n '100,200p' file.txt`
-       - Prefer targeted extraction over retrieving entire files
-
-     * INFORMATION GATHERING WORKFLOW:
-       1. First check file size and structure: `du -h file.json && cat file.json | jq .text | wc -l`
-       2. Extract focused samples: `cat file.json | jq .text | grep -m 10 -A 3 -B 3 "keyword"`
-       3. Refine search with additional context: `cat file.json | jq .text | grep -m 5 -A 10 -B 10 "term1" | grep "term2"`
-       4. Analyze multiple sources in parallel with safeguards against excessive output
-       5. Summarize findings from targeted extracts, not entire documents
-
-  4. Efficient Research Workflow:
+  3. Research Workflow:
      a. First check for relevant data providers
      b. If no data provider exists:
-       - Start with web-search to find relevant URLs:
-         ``
-       - Then scrape MULTIPLE relevant URLs at once (NEVER just one):
-         ``
-       - Process scraped content with CLI tools in single command chains using limits:
-         * `cat scrape/20240601_123456_example_com.json | jq .text | grep -m 30 -A 10 -B 10 "key concept" > findings.txt && cat findings.txt`
+       - Use web-search to get direct answers, images, and relevant URLs
+       - Only if you need specific details not found in search results:
+         * Use scrape-webpage on specific URLs from web-search results
        - Only if scrape-webpage fails or if the page requires interaction:
         * Use direct browser tools (browser_navigate_to, browser_go_back, browser_wait, browser_click_element, browser_input_text, browser_send_keys, browser_switch_tab, browser_close_tab, browser_scroll_down, browser_scroll_up, browser_scroll_to_text, browser_get_dropdown_options, browser_select_dropdown_option, browser_drag_drop, browser_click_coordinates etc.)
         * This is needed for:
@@ -444,31 +346,41 @@ You have the ability to execute operations using both Python and CLI tools:
   e. Document sources and timestamps
 
 - Web Search Best Practices:
-  1. Use specific, targeted search queries to obtain the most relevant results
+  1. Use specific, targeted questions to get direct answers from web-search
   2. Include key terms and contextual information in search queries
   3. Filter search results by date when freshness is important
-  4. Use include_text/exclude_text parameters to refine search results
+  4. Review the direct answer, images, and search results
   5. Analyze multiple search results to cross-validate information
 
-- Web Content Extraction Workflow:
-  1. ALWAYS start with web-search to find relevant URLs
-  2. Use scrape-webpage on URLs from web-search results
-  3. Only if scrape-webpage fails or if the page requires interaction:
-    - Use direct browser tools (browser_navigate_to, browser_go_back, browser_wait, browser_click_element, browser_input_text, browser_send_keys, browser_switch_tab, browser_close_tab, browser_scroll_down, browser_scroll_up, browser_scroll_to_text, browser_get_dropdown_options, browser_select_dropdown_option, browser_drag_drop, browser_click_coordinates etc.)
+- Content Extraction Decision Tree:
+  1. ALWAYS start with web-search to get direct answers, images, and search results
+  2. Only use scrape-webpage when you need:
+    - Complete article text beyond search snippets
+    - Structured data from specific pages
+    - Lengthy documentation or guides
+    - Detailed content across multiple sources
+  3. Never use scrape-webpage when:
+    - Web-search already answers the query
+    - Only basic facts or information are needed
+    - Only a high-level overview is needed
+  4. Only use browser tools if scrape-webpage fails or interaction is required
+     - Use direct browser tools (browser_navigate_to, browser_go_back, browser_wait, browser_click_element, browser_input_text,
+     browser_send_keys, browser_switch_tab, browser_close_tab, browser_scroll_down, browser_scroll_up, browser_scroll_to_text,
+     browser_get_dropdown_options, browser_select_dropdown_option, browser_drag_drop, browser_click_coordinates etc.)
     - This is needed for:
       * Dynamic content loading
       * JavaScript-heavy sites
       * Pages requiring login
       * Interactive elements
       * Infinite scroll pages
-  4. DO NOT use browser tools directly unless scrape-webpage fails or interaction is required
-  5. Maintain this strict workflow order: web-search → scrape-webpage → direct browser tools (if needed)
+     DO NOT use browser tools directly unless interaction is required.
+  5. Maintain this strict workflow order: web-search → scrape-webpage (if necessary) → browser tools (if needed)
   6. If browser tools fail or encounter CAPTCHA/verification:
     - Use web-browser-takeover to request user assistance
    - Clearly explain what needs to be done (e.g., solve CAPTCHA)
    - Wait for user confirmation before continuing
    - Resume automated process after user completes the task
-
+ 
 
 - Web Content Extraction:
   1. Verify URL validity before scraping
   2. Extract and save content to files for further processing
@@ -687,4 +599,4 @@ def get_system_prompt():
     '''
     Returns the system prompt
     '''
-    return SYSTEM_PROMPT.replace("//", "#")
\ No newline at end of file
+    return SYSTEM_PROMPT
\ No newline at end of file
diff --git a/backend/agent/prompt.txt b/backend/agent/prompt.txt
index 255daf2c..ebbf4164 100644
--- a/backend/agent/prompt.txt
+++ b/backend/agent/prompt.txt
@@ -841,7 +841,7 @@ Ask user a question and wait for response. Use for: 1) Requesting clarification
+        num_results="20">
diff --git a/backend/agent/sample_responses/2.txt b/backend/agent/sample_responses/2.txt
index c868d7e4..c7d74926 100644
--- a/backend/agent/sample_responses/2.txt
+++ b/backend/agent/sample_responses/2.txt
@@ -43,31 +43,31 @@ First, I'll create a todo list to organize our work and make sure we cover all a
 Now I'll start researching flight options and the best regions to visit based on your interests. Let's begin by searching for flight information and gathering essential data about traveling to Japan from Seattle.
-
+
 Let me also search for information about the best places to visit in Japan, especially focusing on historical sites and cultural experiences:
-
+
 Now let's look for information about Nara deer park and other cultural experiences you mentioned:
-
+
-
+
-
+
-
+
 Let's also research hidden gems and romantic spots in Japan for your proposal:
-
+
-
+
 Now let's look at transportation options within Japan:
-
+
 Based on the searches, I'm going to compile information to create your itinerary. First, let me mark some research tasks as complete and update our todo list:
@@ -112,7 +112,7 @@ Based on the searches, I'm going to compile information to create your itinerary
 Now I'll start crafting your detailed 7-day itinerary based on the research. First, let me research accommodation options within your budget:
-
+
 Based on all the research, I'm now going to create a detailed itinerary file:
diff --git a/backend/agent/tools/web_search_tool.py b/backend/agent/tools/web_search_tool.py
index 517edae6..25ef62b9 100644
--- a/backend/agent/tools/web_search_tool.py
+++ b/backend/agent/tools/web_search_tool.py
@@ -66,22 +66,26 @@ class SandboxWebSearchTool(SandboxToolsBase):
        The web-search tool allows you to search the internet for real-time information. Use this tool when you need to find current information, research topics, or verify facts.
 
-       The tool returns information including:
-       - Titles of relevant web pages
-       - URLs for accessing the pages
-       - Published dates (when available)
+       THE TOOL NOW RETURNS:
+       - Direct answer to your query from search results
+       - Relevant images when available
+       - Detailed search results including titles, URLs, and snippets
+
+       WORKFLOW RECOMMENDATION:
+       1. Use web-search first with a specific question to get direct answers
+       2. Only use scrape-webpage if you need more detailed information from specific pages
        -->
+            query="latest AI research on transformer models"
+            num_results="20">
        '''
    )
@@ -116,33 +120,18 @@ class SandboxWebSearchTool(SandboxToolsBase):
             search_response = await self.tavily_client.search(
                 query=query,
                 max_results=num_results,
-                include_answer=False,
-                include_images=False,
+                include_images=True,
+                include_answer="advanced",
+                search_depth="advanced",
             )
-
-            # Normalize the response format
-            raw_results = (
-                search_response.get("results")
-                if isinstance(search_response, dict)
-                else search_response
-            )
-
-            # Format results consistently
-            formatted_results = []
-            for result in raw_results:
-                formatted_result = {
-                    "title": result.get("title", ""),
-                    "url": result.get("url", ""),
-                }
-
-                formatted_results.append(formatted_result)
-            logging.info(f"Retrieved {len(formatted_results)} search results for query: '{query}'")
+            # Return the complete Tavily response
+            # This includes the query, answer, results, images and more
+            logging.info(f"Retrieved search results for query: '{query}' with answer and {len(search_response.get('results', []))} results")
 
-            # Return a properly formatted ToolResult with the search results directly
             return ToolResult(
                 success=True,
-                output=json.dumps(formatted_results, ensure_ascii=False)
+                output=json.dumps(search_response, ensure_ascii=False)
             )
 
         except Exception as e:
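Because web_search now places the Tavily payload verbatim in ToolResult.output, the shape that the rest of this diff relies on is worth spelling out. Below is a minimal TypeScript sketch of that contract, limited to the fields the backend call above and the frontend parsing further down actually touch; the interface names are illustrative only, and any additional keys Tavily returns are simply passed through untouched.

```typescript
// Sketch of the JSON contract implied by this diff; not an exhaustive Tavily schema.
interface TavilySearchResult {
  title: string;    // rendered as the result title
  url: string;      // rendered as the result link
  content: string;  // used as the snippet by extractSearchResults
}

interface TavilySearchResponse {
  query: string;                  // echoed search query (read by extractSearchQuery)
  answer?: string;                // present because include_answer="advanced"
  images?: string[];              // present because include_images=True
  results: TavilySearchResult[];
}
```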
@@ -176,40 +165,50 @@ class SandboxWebSearchTool(SandboxToolsBase):
                 {"param_name": "urls", "node_type": "attribute", "path": "."}
             ],
             example='''
-
-
+
+            query="what is Kortix AI and what are they building?"
+            num_results="20">
-
-
+
+            urls="https://www.kortix.ai/,https://github.com/kortix-ai/suna">
-
-
-
-
+
+
        '''
    )
 
    async def scrape_webpage(
diff --git a/frontend/src/components/thread/tool-views/WebSearchToolView.tsx b/frontend/src/components/thread/tool-views/WebSearchToolView.tsx
index 2be673d1..3b4a6905 100644
--- a/frontend/src/components/thread/tool-views/WebSearchToolView.tsx
+++ b/frontend/src/components/thread/tool-views/WebSearchToolView.tsx
@@ -5,6 +5,7 @@ import {
   CheckCircle,
   AlertTriangle,
   ExternalLink,
+  Image as ImageIcon,
 } from 'lucide-react';
 import { ToolViewProps } from './types';
 import {
@@ -28,6 +29,28 @@ export function WebSearchToolView({
   const query = extractSearchQuery(assistantContent);
   const searchResults = extractSearchResults(toolContent);
   const toolTitle = getToolTitle(name);
+
+  // Extract additional data from Tavily response
+  const [answer, setAnswer] = React.useState<string | null>(null);
+  const [images, setImages] = React.useState<string[]>([]);
+
+  React.useEffect(() => {
+    if (toolContent) {
+      try {
+        const parsedContent = JSON.parse(toolContent);
+        // Check if it's the new Tavily response format with answer
+        if (parsedContent.answer && typeof parsedContent.answer === 'string') {
+          setAnswer(parsedContent.answer);
+        }
+        // Check for images array
+        if (parsedContent.images && Array.isArray(parsedContent.images)) {
+          setImages(parsedContent.images);
+        }
+      } catch (e) {
+        // Silently fail - the view will work without these extras
+      }
+    }
+  }, [toolContent]);
 
   return (
@@ -77,6 +100,52 @@ export function WebSearchToolView({
         ) : searchResults.length > 0 ? (
+            {/* Answer section */}
+            {answer && (
+              <div>
+                <div>
+                  Answer
+                </div>
+                <div>
+                  {answer}
+                </div>
+              </div>
+            )}
+
+            {/* Images section */}
+            {images.length > 0 && (
+              <div>
+                <div>
+                  <ImageIcon />
+                  Images
+                </div>
+                <div>
+                  {images.map((image, idx) => (
+                    <div key={idx}>
+                      <img
+                        src={image}
+                        alt={`Search result ${idx + 1}`}
+                        onError={(e) => {
+                          const target = e.target as HTMLImageElement;
+                          target.src = "data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' width='24' height='24' viewBox='0 0 24 24' fill='none' stroke='%23888' stroke-width='2' stroke-linecap='round' stroke-linejoin='round'%3E%3Crect x='3' y='3' width='18' height='18' rx='2' ry='2'%3E%3C/rect%3E%3Ccircle cx='8.5' cy='8.5' r='1.5'%3E%3C/circle%3E%3Cpolyline points='21 15 16 10 5 21'%3E%3C/polyline%3E%3C/svg%3E";
+                          target.classList.add("p-4");
+                        }}
+                      />
+                    </div>
+                  ))}
+                </div>
+              </div>
+            )}
+
+            {/* Results section */}
Found {searchResults.length} results
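Read on its own, the new useEffect in WebSearchToolView boils down to a small standalone parser. The sketch below is purely illustrative (the helper name is hypothetical and not part of this diff): it pulls the optional answer and image URLs out of the raw tool output and falls back to empty values for old-format payloads.

```typescript
// Hypothetical helper mirroring the useEffect above; not part of this diff.
function extractTavilyExtras(toolContent: string | undefined): {
  answer: string | null;
  images: string[];
} {
  if (!toolContent) return { answer: null, images: [] };
  try {
    const parsed = JSON.parse(toolContent);
    return {
      // Only accept a string answer, matching the typeof check in the component.
      answer: typeof parsed.answer === 'string' ? parsed.answer : null,
      // Only accept an array of image URLs, matching the Array.isArray check.
      images: Array.isArray(parsed.images) ? parsed.images : [],
    };
  } catch {
    // Old-format payloads do not parse into this shape; the view still works without extras.
    return { answer: null, images: [] };
  }
}
```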
diff --git a/frontend/src/components/thread/tool-views/utils.ts b/frontend/src/components/thread/tool-views/utils.ts
index aa7f3001..8516ec89 100644
--- a/frontend/src/components/thread/tool-views/utils.ts
+++ b/frontend/src/components/thread/tool-views/utils.ts
@@ -289,30 +289,37 @@ export function extractSearchQuery(content: string | undefined): string | null {
   let contentToSearch = content; // Start with the original content
 
-  // 3. Try parsing as JSON first, as the relevant content might be nested
+  // Try parsing as JSON first
   try {
-    const parsedOuter = JSON.parse(content);
-    if (typeof parsedOuter.content === 'string') {
+    const parsedContent = JSON.parse(content);
+
+    // Check if it's the new Tavily response format
+    if (parsedContent.query && typeof parsedContent.query === 'string') {
+      return parsedContent.query;
+    }
+
+    // Continue with existing logic for backward compatibility
+    if (typeof parsedContent.content === 'string') {
       // If the outer content is JSON and has a 'content' string field,
       // use that inner content for searching the query.
-      contentToSearch = parsedOuter.content;
+      contentToSearch = parsedContent.content;
       // Also check common JSON structures within the outer parsed object itself
-      if (typeof parsedOuter.query === 'string') {
-        return parsedOuter.query;
+      if (typeof parsedContent.query === 'string') {
+        return parsedContent.query;
       }
       if (
-        typeof parsedOuter.arguments === 'object' &&
-        parsedOuter.arguments !== null &&
-        typeof parsedOuter.arguments.query === 'string'
+        typeof parsedContent.arguments === 'object' &&
+        parsedContent.arguments !== null &&
+        typeof parsedContent.arguments.query === 'string'
       ) {
-        return parsedOuter.arguments.query;
+        return parsedContent.arguments.query;
       }
       if (
-        Array.isArray(parsedOuter.tool_calls) &&
-        parsedOuter.tool_calls.length > 0
+        Array.isArray(parsedContent.tool_calls) &&
+        parsedContent.tool_calls.length > 0
       ) {
-        const toolCall = parsedOuter.tool_calls[0];
+        const toolCall = parsedContent.tool_calls[0];
         if (
           typeof toolCall.arguments === 'object' &&
           toolCall.arguments !== null &&
@@ -766,10 +773,20 @@ export function extractSearchResults(
 ): Array<{ title: string; url: string; snippet?: string }> {
   if (!content) return [];
 
-  // First try the standard JSON extraction methods
+  // First check if it's the new Tavily response format
   try {
-    // Try to parse JSON content first
     const parsedContent = JSON.parse(content);
+
+    // Check if this is the new Tavily response format
+    if (parsedContent.results && Array.isArray(parsedContent.results)) {
+      return parsedContent.results.map(result => ({
+        title: result.title || '',
+        url: result.url || '',
+        snippet: result.content || '',
+      }));
+    }
+
+    // Continue with existing logic for backward compatibility
     if (parsedContent.content && typeof parsedContent.content === 'string') {
       // Look for a tool_result tag
       const toolResultMatch = parsedContent.content.match(