mirror of https://github.com/kortix-ai/suna.git
Scrape-and-read based approach v1: non-optimal, but better for context
This commit is contained in:
parent
8847515d1c
commit
3a0f6db03e
|
@ -1,6 +1,6 @@
|
||||||
import datetime
|
import datetime
|
||||||
|
|
||||||
SYSTEM_PROMPT = f"""
|
SYSTEM_PROMPT = """
|
||||||
You are Suna.so, an autonomous AI Agent created by the Kortix team.
|
You are Suna.so, an autonomous AI Agent created by the Kortix team.
|
||||||
|
|
||||||
# 1. CORE IDENTITY & CAPABILITIES
|
# 1. CORE IDENTITY & CAPABILITIES
|
||||||
|
@ -89,7 +89,7 @@ You have the ability to execute operations using both Python and CLI tools:
|
||||||
- You can use the 'get_data_provider_endpoints' tool to get the endpoints for a specific data provider.
|
- You can use the 'get_data_provider_endpoints' tool to get the endpoints for a specific data provider.
|
||||||
- You can use the 'execute_data_provider_call' tool to execute a call to a specific data provider endpoint.
|
- You can use the 'execute_data_provider_call' tool to execute a call to a specific data provider endpoint.
|
||||||
- The data providers are:
|
- The data providers are:
|
||||||
* linkedin - for LinkedIn data
|
* linkedin - for LinkedIn data - ALWAYS USE THIS INSTEAD OF TRYING TO SCRAPE LINKEDIN PAGES
|
||||||
* twitter - for Twitter data
|
* twitter - for Twitter data
|
||||||
* zillow - for Zillow data
|
* zillow - for Zillow data
|
||||||
* amazon - for Amazon data
|
* amazon - for Amazon data
|
||||||
|
@ -97,6 +97,7 @@ You have the ability to execute operations using both Python and CLI tools:
|
||||||
* active_jobs - for Active Jobs data
|
* active_jobs - for Active Jobs data
|
||||||
- Use data providers where appropriate to get the most accurate and up-to-date data for your tasks. This is preferred over generic web scraping.
|
- Use data providers where appropriate to get the most accurate and up-to-date data for your tasks. This is preferred over generic web scraping.
|
||||||
- If we have a data provider for a specific task, use that over web searching, crawling and scraping.
|
- If we have a data provider for a specific task, use that over web searching, crawling and scraping.
|
||||||
|
- IMPORTANT: For LinkedIn profiles and company information, ALWAYS use the LinkedIn data provider instead of trying to scrape LinkedIn pages, which will fail due to access restrictions.
|
||||||
|
|
||||||
# 3. TOOLKIT & METHODOLOGY
|
# 3. TOOLKIT & METHODOLOGY
|
||||||
|
|
||||||
|
@ -361,16 +362,24 @@ You have the ability to execute operations using both Python and CLI tools:
|
||||||
## 4.4 WEB SEARCH & CONTENT EXTRACTION
|
## 4.4 WEB SEARCH & CONTENT EXTRACTION
|
||||||
- Research Best Practices:
|
- Research Best Practices:
|
||||||
1. ALWAYS use a multi-source approach for thorough research:
|
1. ALWAYS use a multi-source approach for thorough research:
|
||||||
|
* Use data providers first when available, especially for:
|
||||||
|
- LinkedIn profiles and company pages (ALWAYS use LinkedIn data provider, NEVER try to scrape LinkedIn)
|
||||||
|
- Twitter profiles and tweets
|
||||||
|
- Zillow real estate listings
|
||||||
|
- Amazon product listings
|
||||||
|
- Yahoo Finance stock and company data
|
||||||
* Start with web-search to find relevant URLs and sources
|
* Start with web-search to find relevant URLs and sources
|
||||||
* Use scrape-webpage on URLs from web-search results to get detailed content
|
* ALWAYS collect MULTIPLE URLs (at least 3-5) from search results
|
||||||
* Utilize data providers for real-time, accurate data when available
|
* ALWAYS scrape multiple URLs together in a single command, not one at a time:
|
||||||
|
CORRECT: `<scrape-webpage urls="url1,url2,url3,url4,url5"></scrape-webpage>`
|
||||||
|
INCORRECT: `<scrape-webpage urls="url1"></scrape-webpage>`
|
||||||
* Only use browser tools when scrape-webpage fails or interaction is needed
|
* Only use browser tools when scrape-webpage fails or interaction is needed
|
||||||
|
|
||||||
2. Data Provider Priority:
|
2. Data Provider Priority:
|
||||||
* ALWAYS check if a data provider exists for your research topic
|
* ALWAYS check if a data provider exists for your research topic
|
||||||
* Use data providers as the primary source when available
|
* Use data providers as the primary source when available
|
||||||
* Data providers offer real-time, accurate data for:
|
* Data providers offer real-time, accurate data for:
|
||||||
- LinkedIn data
|
- LinkedIn data (REQUIRED - NEVER try to scrape LinkedIn directly)
|
||||||
- Twitter data
|
- Twitter data
|
||||||
- Zillow data
|
- Zillow data
|
||||||
- Amazon data
|
- Amazon data
|
||||||
|
@ -379,51 +388,49 @@ You have the ability to execute operations using both Python and CLI tools:
|
||||||
* Only fall back to web search when no data provider is available
|
* Only fall back to web search when no data provider is available
|
||||||
|
|
||||||
3. Working with Scraped Web Content:
|
3. Working with Scraped Web Content:
|
||||||
* Scraped webpages are saved as JSON files in the /workspace/scrape directory
|
* Scraped webpages are JSON files in /workspace/scrape with structure: title, url, text, metadata
|
||||||
* The JSON structure includes: title, url, text (markdown content), and metadata
|
* BEST PRACTICES FOR LARGE FILES - ALWAYS:
|
||||||
* For efficient analysis of large scraped files, ALWAYS use chained CLI commands:
|
- Limit initial output with head/tail: `cat file.json | jq .text | head -n 300`
|
||||||
- Chain commands with && for processing in a single operation:
|
- Use grep with line limits: `cat file.json | jq .text | grep -m 20 "keyword"` (stops after 20 matches)
|
||||||
`mkdir -p temp_data && cat scrape/file.json | jq .text > temp_data/content.md && grep -A 10 "keyword" temp_data/content.md`
|
- Combine tools to focus on specific sections: `cat file.json | jq .text | grep -A 5 -B 5 -m 10 "keyword"`
|
||||||
- Combine jq, grep, awk, and other tools in a single pipeline:
|
- Process in chunks: `cat file.json | jq .text | head -n 1000 | grep "term"`
|
||||||
`cat scrape/file.json | jq .text | grep -C 5 "important topic" | awk '{{print $1, $2}}' > results.txt`
|
- Target specific line ranges: `cat file.json | jq .text | sed -n '100,120p'`
|
||||||
- Extract and process in one command:
|
|
||||||
`cat scrape/file.json | jq .text | grep -A 10 -B 10 "search term" | grep -v "exclude term" > relevant_section.txt && cat relevant_section.txt`
|
|
||||||
|
|
||||||
* IMPORTANT: Process multiple search results simultaneously with && for efficiency:
|
* EFFICIENT COMMAND PATTERNS:
|
||||||
- Process multiple files in one command:
|
- Single-pipeline extraction: `cat file.json | jq .text | grep -A 5 -B 5 -m 10 "keyword" > extract.txt && cat extract.txt`
|
||||||
`cat scrape/file1.json | jq .text | grep "term" > result1.txt && cat scrape/file2.json | jq .text | grep "term" > result2.txt && cat result1.txt result2.txt > combined.txt`
|
- Multi-file processing: `for f in scrape/*.json; do cat $f | jq .text | grep -m 5 "keyword" && echo "-- $f --"; done`
|
||||||
- Compare multiple sources in parallel:
|
- Targeted search with context limit: `cat file.json | jq .text | grep -A 10 -B 10 -m 5 "term" | grep -v "exclude"`
|
||||||
`cat scrape/source1.json | jq .text | grep -o "key term" | wc -l > count1.txt && cat scrape/source2.json | jq .text | grep -o "key term" | wc -l > count2.txt && echo "Source 1: $(cat count1.txt) occurrences, Source 2: $(cat count2.txt) occurrences"`
|
- Count before extracting: `cat file.json | jq .text | grep -c "term" && cat file.json | jq .text | grep -m 20 "term"`
|
||||||
- Search across multiple files:
|
|
||||||
`for f in scrape/*.json; do cat $f | jq .text | grep -l "search term" && echo "Found in $f"; done`
|
|
||||||
- Extract same information from multiple sources:
|
|
||||||
`cat scrape/file1.json | jq .metadata.date > dates.txt && cat scrape/file2.json | jq .metadata.date >> dates.txt && cat scrape/file3.json | jq .metadata.date >> dates.txt && sort dates.txt`
|
|
||||||
|
|
||||||
* Use these command combinations for specific tasks:
|
* MULTI-SOURCE PROCESSING:
|
||||||
- Preview content: `cat scrape/file.json | jq .text | head -n 20`
|
- Process multiple files together: `cat scrape/file1.json scrape/file2.json | jq .text | grep -m 30 "term"`
|
||||||
- Find sections with context: `cat scrape/file.json | jq .text | grep -A 10 -B 10 "keyword" | less`
|
- Compare sources: `cat scrape/file1.json | jq .text | grep -c "term" && cat scrape/file2.json | jq .text | grep -c "term"`
|
||||||
- Extract metadata: `cat scrape/file.json | jq .metadata`
|
- Extract from all files: `grep -l "term" scrape/*.json | xargs -I% cat % | jq .text | grep -A 5 -B 5 -m 3 "term"`
|
||||||
- Count occurrences: `cat scrape/file.json | jq .text | grep -o "term" | wc -l`
|
- Safe iteration: `find scrape -name "*.json" -type f | head -n 5 | xargs -I% sh -c 'echo "=== % ==="; cat % | jq .text | head -n 50'`
|
||||||
- Extract and save relevant parts: `cat scrape/file.json | jq .text | awk '/start pattern/,/end pattern/' > extract.txt`
|
|
||||||
|
|
||||||
* For structured analysis:
|
* KEY CLI SAFEGUARDS:
|
||||||
1. Extract focused sections and process them in a single command chain
|
- ALWAYS limit output size: use head/tail, grep -m, or other limiters
|
||||||
2. Avoid multiple separate commands when a single pipeline can accomplish the task
|
- Inspect before extracting: `cat file.json | jq .text | wc -l` to check size
|
||||||
3. Use temporary files only when necessary, prefer direct pipelines
|
- Process iteratively: examine small samples before processing entire files
|
||||||
4. ALWAYS chain commands with && when processing multiple files or sources
|
- Use line numbers to target sections: `sed -n '100,200p' file.txt`
|
||||||
5. Group related operations together into a single command execution
|
- Prefer targeted extraction over retrieving entire files
|
||||||
|
|
||||||
* When sharing scraped content in responses:
|
* INFORMATION GATHERING WORKFLOW:
|
||||||
1. Extract only the most relevant sections using efficient command chains
|
1. First check file size and structure: `du -h file.json && cat file.json | jq .text | wc -l`
|
||||||
2. Include original source URL attribution
|
2. Extract focused samples: `cat file.json | jq .text | grep -m 10 -A 3 -B 3 "keyword"`
|
||||||
3. Cite from multiple sources simultaneously using parallel processing
|
3. Refine search with additional context: `cat file.json | jq .text | grep -m 5 -A 10 -B 10 "term1" | grep "term2"`
|
||||||
|
4. Analyze multiple sources in parallel with safeguards against excessive output
|
||||||
|
5. Summarize findings from targeted extracts, not entire documents
|
||||||
|
|
||||||
4. Research Workflow:
|
4. Efficient Research Workflow:
|
||||||
a. First check for relevant data providers
|
a. First check for relevant data providers
|
||||||
b. If no data provider exists:
|
b. If no data provider exists:
|
||||||
- Use web-search to find relevant URLs
|
- Start with web-search to find relevant URLs:
|
||||||
- Use scrape-webpage on URLs from web-search results
|
`<web-search query="topic" num_results="10"></web-search>`
|
||||||
- Process scraped content with CLI tools (grep, jq, awk, etc.)
|
- Then scrape MULTIPLE relevant URLs at once (NEVER just one):
|
||||||
|
`<scrape-webpage urls="url1,url2,url3,url4,url5"></scrape-webpage>`
|
||||||
|
- Process scraped content with CLI tools in single command chains using limits:
|
||||||
|
* `cat scrape/20240601_123456_example_com.json | jq .text | grep -m 30 -A 10 -B 10 "key concept" > findings.txt && cat findings.txt`
|
||||||
- Only if scrape-webpage fails or if the page requires interaction:
|
- Only if scrape-webpage fails or if the page requires interaction:
|
||||||
* Use direct browser tools (browser_navigate_to, browser_go_back, browser_wait, browser_click_element, browser_input_text, browser_send_keys, browser_switch_tab, browser_close_tab, browser_scroll_down, browser_scroll_up, browser_scroll_to_text, browser_get_dropdown_options, browser_select_dropdown_option, browser_drag_drop, browser_click_coordinates etc.)
|
* Use direct browser tools (browser_navigate_to, browser_go_back, browser_wait, browser_click_element, browser_input_text, browser_send_keys, browser_switch_tab, browser_close_tab, browser_scroll_down, browser_scroll_up, browser_scroll_to_text, browser_get_dropdown_options, browser_select_dropdown_option, browser_drag_drop, browser_click_coordinates etc.)
|
||||||
* This is needed for:
|
* This is needed for:
|
||||||
|
@ -680,4 +687,4 @@ def get_system_prompt():
|
||||||
'''
|
'''
|
||||||
Returns the system prompt
|
Returns the system prompt
|
||||||
'''
|
'''
|
||||||
return SYSTEM_PROMPT
|
return SYSTEM_PROMPT.replace("//", "#")
|
|
@ -19,7 +19,7 @@ from agent.tools.sb_files_tool import SandboxFilesTool
|
||||||
from agent.tools.sb_browser_tool import SandboxBrowserTool
|
from agent.tools.sb_browser_tool import SandboxBrowserTool
|
||||||
from agent.tools.data_providers_tool import DataProvidersTool
|
from agent.tools.data_providers_tool import DataProvidersTool
|
||||||
from agent.prompt import get_system_prompt
|
from agent.prompt import get_system_prompt
|
||||||
from utils import logger
|
from utils.logger import logger
|
||||||
from utils.auth_utils import get_account_id_from_thread
|
from utils.auth_utils import get_account_id_from_thread
|
||||||
from services.billing import check_billing_status
|
from services.billing import check_billing_status
|
||||||
from agent.tools.sb_vision_tool import SandboxVisionTool
|
from agent.tools.sb_vision_tool import SandboxVisionTool
|
||||||
|
@ -39,7 +39,7 @@ async def run_agent(
|
||||||
enable_context_manager: bool = True
|
enable_context_manager: bool = True
|
||||||
):
|
):
|
||||||
"""Run the development agent with specified configuration."""
|
"""Run the development agent with specified configuration."""
|
||||||
print(f"🚀 Starting agent with model: {model_name}")
|
logger.info(f"🚀 Starting agent with model: {model_name}")
|
||||||
|
|
||||||
thread_manager = ThreadManager()
|
thread_manager = ThreadManager()
|
||||||
|
|
||||||
|
@ -90,7 +90,7 @@ async def run_agent(
|
||||||
|
|
||||||
while continue_execution and iteration_count < max_iterations:
|
while continue_execution and iteration_count < max_iterations:
|
||||||
iteration_count += 1
|
iteration_count += 1
|
||||||
print(f"🔄 Running iteration {iteration_count} of {max_iterations}...")
|
logger.info(f"🔄 Running iteration {iteration_count} of {max_iterations}...")
|
||||||
|
|
||||||
# Billing check on each iteration - still needed within the iterations
|
# Billing check on each iteration - still needed within the iterations
|
||||||
can_run, message, subscription = await check_billing_status(client, account_id)
|
can_run, message, subscription = await check_billing_status(client, account_id)
|
||||||
|
@ -108,7 +108,7 @@ async def run_agent(
|
||||||
if latest_message.data and len(latest_message.data) > 0:
|
if latest_message.data and len(latest_message.data) > 0:
|
||||||
message_type = latest_message.data[0].get('type')
|
message_type = latest_message.data[0].get('type')
|
||||||
if message_type == 'assistant':
|
if message_type == 'assistant':
|
||||||
print(f"Last message was from assistant, stopping execution")
|
logger.info(f"Last message was from assistant, stopping execution")
|
||||||
continue_execution = False
|
continue_execution = False
|
||||||
break
|
break
|
||||||
|
|
||||||
|
@ -215,7 +215,7 @@ async def run_agent(
|
||||||
)
|
)
|
||||||
|
|
||||||
if isinstance(response, dict) and "status" in response and response["status"] == "error":
|
if isinstance(response, dict) and "status" in response and response["status"] == "error":
|
||||||
print(f"Error response from run_thread: {response.get('message', 'Unknown error')}")
|
logger.error(f"Error response from run_thread: {response.get('message', 'Unknown error')}")
|
||||||
yield response
|
yield response
|
||||||
break
|
break
|
||||||
|
|
||||||
|
@ -228,7 +228,7 @@ async def run_agent(
|
||||||
async for chunk in response:
|
async for chunk in response:
|
||||||
# If we receive an error chunk, we should stop after this iteration
|
# If we receive an error chunk, we should stop after this iteration
|
||||||
if isinstance(chunk, dict) and chunk.get('type') == 'status' and chunk.get('status') == 'error':
|
if isinstance(chunk, dict) and chunk.get('type') == 'status' and chunk.get('status') == 'error':
|
||||||
print(f"Error chunk detected: {chunk.get('message', 'Unknown error')}")
|
logger.error(f"Error chunk detected: {chunk.get('message', 'Unknown error')}")
|
||||||
error_detected = True
|
error_detected = True
|
||||||
yield chunk # Forward the error chunk
|
yield chunk # Forward the error chunk
|
||||||
continue # Continue processing other chunks but don't break yet
|
continue # Continue processing other chunks but don't break yet
|
||||||
|
@ -256,27 +256,27 @@ async def run_agent(
|
||||||
xml_tool = 'web-browser-takeover'
|
xml_tool = 'web-browser-takeover'
|
||||||
|
|
||||||
last_tool_call = xml_tool
|
last_tool_call = xml_tool
|
||||||
print(f"Agent used XML tool: {xml_tool}")
|
logger.info(f"Agent used XML tool: {xml_tool}")
|
||||||
except json.JSONDecodeError:
|
except json.JSONDecodeError:
|
||||||
# Handle cases where content might not be valid JSON
|
# Handle cases where content might not be valid JSON
|
||||||
print(f"Warning: Could not parse assistant content JSON: {chunk.get('content')}")
|
logger.warning(f"Warning: Could not parse assistant content JSON: {chunk.get('content')}")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Error processing assistant chunk: {e}")
|
logger.error(f"Error processing assistant chunk: {e}")
|
||||||
|
|
||||||
yield chunk
|
yield chunk
|
||||||
|
|
||||||
# Check if we should stop based on the last tool call or error
|
# Check if we should stop based on the last tool call or error
|
||||||
if error_detected:
|
if error_detected:
|
||||||
print(f"Stopping due to error detected in response")
|
logger.info(f"Stopping due to error detected in response")
|
||||||
break
|
break
|
||||||
|
|
||||||
if last_tool_call in ['ask', 'complete', 'web-browser-takeover']:
|
if last_tool_call in ['ask', 'complete', 'web-browser-takeover']:
|
||||||
print(f"Agent decided to stop with tool: {last_tool_call}")
|
logger.info(f"Agent decided to stop with tool: {last_tool_call}")
|
||||||
continue_execution = False
|
continue_execution = False
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
# Just log the error and re-raise to stop all iterations
|
# Just log the error and re-raise to stop all iterations
|
||||||
error_msg = f"Error during response streaming: {str(e)}"
|
error_msg = f"Error during response streaming: {str(e)}"
|
||||||
print(f"Error: {error_msg}")
|
logger.error(f"Error: {error_msg}")
|
||||||
yield {
|
yield {
|
||||||
"type": "status",
|
"type": "status",
|
||||||
"status": "error",
|
"status": "error",
|
||||||
|
@ -288,7 +288,7 @@ async def run_agent(
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
# Just log the error and re-raise to stop all iterations
|
# Just log the error and re-raise to stop all iterations
|
||||||
error_msg = f"Error running thread: {str(e)}"
|
error_msg = f"Error running thread: {str(e)}"
|
||||||
print(f"Error: {error_msg}")
|
logger.error(f"Error: {error_msg}")
|
||||||
yield {
|
yield {
|
||||||
"type": "status",
|
"type": "status",
|
||||||
"status": "error",
|
"status": "error",
|
||||||
|
|
|
@ -37,7 +37,7 @@ class SandboxWebSearchTool(SandboxToolsBase):
|
||||||
"type": "function",
|
"type": "function",
|
||||||
"function": {
|
"function": {
|
||||||
"name": "web_search",
|
"name": "web_search",
|
||||||
"description": "Search the web for up-to-date information on a specific topic using the Tavily API. This tool allows you to gather real-time information from the internet to answer user queries, research topics, validate facts, and find recent developments. Results include titles, URLs, summaries, and publication dates. Use this tool for discovering relevant web pages before potentially crawling them for complete content.",
|
"description": "Search the web for up-to-date information on a specific topic using the Tavily API. This tool allows you to gather real-time information from the internet to answer user queries, research topics, validate facts, and find recent developments. Results include titles, URLs, and publication dates. Use this tool for discovering relevant web pages before potentially crawling them for complete content.",
|
||||||
"parameters": {
|
"parameters": {
|
||||||
"type": "object",
|
"type": "object",
|
||||||
"properties": {
|
"properties": {
|
||||||
|
@ -45,11 +45,6 @@ class SandboxWebSearchTool(SandboxToolsBase):
|
||||||
"type": "string",
|
"type": "string",
|
||||||
"description": "The search query to find relevant web pages. Be specific and include key terms to improve search accuracy. For best results, use natural language questions or keyword combinations that precisely describe what you're looking for."
|
"description": "The search query to find relevant web pages. Be specific and include key terms to improve search accuracy. For best results, use natural language questions or keyword combinations that precisely describe what you're looking for."
|
||||||
},
|
},
|
||||||
# "summary": {
|
|
||||||
# "type": "boolean",
|
|
||||||
# "description": "Whether to include a summary of each search result. Summaries provide key context about each page without requiring full content extraction. Set to true to get concise descriptions of each result.",
|
|
||||||
# "default": True
|
|
||||||
# },
|
|
||||||
"num_results": {
|
"num_results": {
|
||||||
"type": "integer",
|
"type": "integer",
|
||||||
"description": "The number of search results to return. Increase for more comprehensive research or decrease for focused, high-relevance results.",
|
"description": "The number of search results to return. Increase for more comprehensive research or decrease for focused, high-relevance results.",
|
||||||
|
@ -64,7 +59,6 @@ class SandboxWebSearchTool(SandboxToolsBase):
|
||||||
tag_name="web-search",
|
tag_name="web-search",
|
||||||
mappings=[
|
mappings=[
|
||||||
{"param_name": "query", "node_type": "attribute", "path": "."},
|
{"param_name": "query", "node_type": "attribute", "path": "."},
|
||||||
# {"param_name": "summary", "node_type": "attribute", "path": "."},
|
|
||||||
{"param_name": "num_results", "node_type": "attribute", "path": "."}
|
{"param_name": "num_results", "node_type": "attribute", "path": "."}
|
||||||
],
|
],
|
||||||
example='''
|
example='''
|
||||||
|
@ -87,14 +81,13 @@ class SandboxWebSearchTool(SandboxToolsBase):
|
||||||
<!-- Another search example -->
|
<!-- Another search example -->
|
||||||
<web-search
|
<web-search
|
||||||
query="healthy breakfast recipes"
|
query="healthy breakfast recipes"
|
||||||
num_results="20">
|
num_results="10">
|
||||||
</web-search>
|
</web-search>
|
||||||
'''
|
'''
|
||||||
)
|
)
|
||||||
async def web_search(
|
async def web_search(
|
||||||
self,
|
self,
|
||||||
query: str,
|
query: str,
|
||||||
# summary: bool = True,
|
|
||||||
num_results: int = 20
|
num_results: int = 20
|
||||||
) -> ToolResult:
|
) -> ToolResult:
|
||||||
"""
|
"""
|
||||||
|
@ -119,6 +112,7 @@ class SandboxWebSearchTool(SandboxToolsBase):
|
||||||
num_results = 20
|
num_results = 20
|
||||||
|
|
||||||
# Execute the search with Tavily
|
# Execute the search with Tavily
|
||||||
|
logging.info(f"Executing web search for query: '{query}' with {num_results} results")
|
||||||
search_response = await self.tavily_client.search(
|
search_response = await self.tavily_client.search(
|
||||||
query=query,
|
query=query,
|
||||||
max_results=num_results,
|
max_results=num_results,
|
||||||
|
@ -141,16 +135,10 @@ class SandboxWebSearchTool(SandboxToolsBase):
|
||||||
"url": result.get("url", ""),
|
"url": result.get("url", ""),
|
||||||
}
|
}
|
||||||
|
|
||||||
# if summary:
|
|
||||||
# # Prefer full content; fall back to description
|
|
||||||
# formatted_result["snippet"] = (
|
|
||||||
# result.get("content") or
|
|
||||||
# result.get("description") or
|
|
||||||
# ""
|
|
||||||
# )
|
|
||||||
|
|
||||||
formatted_results.append(formatted_result)
|
formatted_results.append(formatted_result)
|
||||||
|
|
||||||
|
logging.info(f"Retrieved {len(formatted_results)} search results for query: '{query}'")
|
||||||
|
|
||||||
# Return a properly formatted ToolResult with the search results directly
|
# Return a properly formatted ToolResult with the search results directly
|
||||||
return ToolResult(
|
return ToolResult(
|
||||||
success=True,
|
success=True,
|
||||||
|
@ -159,6 +147,7 @@ class SandboxWebSearchTool(SandboxToolsBase):
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
error_message = str(e)
|
error_message = str(e)
|
||||||
|
logging.error(f"Error performing web search for '{query}': {error_message}")
|
||||||
simplified_message = f"Error performing web search: {error_message[:200]}"
|
simplified_message = f"Error performing web search: {error_message[:200]}"
|
||||||
if len(error_message) > 200:
|
if len(error_message) > 200:
|
||||||
simplified_message += "..."
|
simplified_message += "..."
|
||||||
|
@ -168,39 +157,32 @@ class SandboxWebSearchTool(SandboxToolsBase):
|
||||||
"type": "function",
|
"type": "function",
|
||||||
"function": {
|
"function": {
|
||||||
"name": "scrape_webpage",
|
"name": "scrape_webpage",
|
||||||
"description": "Retrieve the complete text content of a specific webpage using Firecrawl. This tool extracts the full text content from any accessible web page and returns it for analysis, processing, or reference. The extracted text includes the main content of the page without HTML markup. Note that some pages may have limitations on access due to paywalls, access restrictions, or dynamic content loading.",
|
"description": "Extract full text content from multiple webpages in a single operation. IMPORTANT: You should ALWAYS collect multiple relevant URLs from web-search results and scrape them all in a single call for efficiency. This tool saves time by processing multiple pages simultaneously rather than one at a time. The extracted text includes the main content of each page without HTML markup.",
|
||||||
"parameters": {
|
"parameters": {
|
||||||
"type": "object",
|
"type": "object",
|
||||||
"properties": {
|
"properties": {
|
||||||
"url": {
|
"urls": {
|
||||||
"type": "string",
|
"type": "string",
|
||||||
"description": "The complete URL of the webpage to scrape. This should be a valid, accessible web address including the protocol (http:// or https://). The tool will attempt to extract all text content from this URL."
|
"description": "Multiple URLs to scrape, separated by commas. You should ALWAYS include several URLs when possible for efficiency. Example: 'https://example.com/page1,https://example.com/page2,https://example.com/page3'"
|
||||||
},
|
|
||||||
"result_name": {
|
|
||||||
"type": "string",
|
|
||||||
"description": "Name to use for the saved result file. If not provided, a name will be generated from the URL.",
|
|
||||||
"default": ""
|
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"required": ["url"]
|
"required": ["urls"]
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
@xml_schema(
|
@xml_schema(
|
||||||
tag_name="scrape-webpage",
|
tag_name="scrape-webpage",
|
||||||
mappings=[
|
mappings=[
|
||||||
{"param_name": "url", "node_type": "attribute", "path": "."},
|
{"param_name": "urls", "node_type": "attribute", "path": "."}
|
||||||
{"param_name": "result_name", "node_type": "attribute", "path": ".", "required": False}
|
|
||||||
],
|
],
|
||||||
example='''
|
example='''
|
||||||
<!--
|
<!--
|
||||||
The scrape-webpage tool extracts the complete text content from web pages using Firecrawl.
|
The scrape-webpage tool extracts the complete text content from web pages using Firecrawl.
|
||||||
IMPORTANT WORKFLOW RULES:
|
IMPORTANT WORKFLOW RULES:
|
||||||
1. ALWAYS use web-search first to find relevant URLs
|
1. ALWAYS use web-search first to find relevant URLs
|
||||||
2. Then use scrape-webpage on URLs from web-search results
|
2. COLLECT MULTIPLE URLs from web-search results, not just one
|
||||||
3. Only if scrape-webpage fails or if the page requires interaction:
|
3. ALWAYS scrape multiple URLs at once for efficiency, rather than making separate calls
|
||||||
- Use direct browser tools (browser_navigate_to, browser_click_element, etc.)
|
4. Only use browser tools if scrape-webpage fails
|
||||||
- This is needed for dynamic content, JavaScript-heavy sites, or pages requiring interaction
|
|
||||||
|
|
||||||
Firecrawl Features:
|
Firecrawl Features:
|
||||||
- Converts web pages into clean markdown
|
- Converts web pages into clean markdown
|
||||||
|
@ -217,58 +199,116 @@ class SandboxWebSearchTool(SandboxToolsBase):
|
||||||
num_results="5">
|
num_results="5">
|
||||||
</web-search>
|
</web-search>
|
||||||
|
|
||||||
<!-- 2. Then scrape specific URLs from search results -->
|
<!-- 2. WRONG WAY (inefficient) - don't do this -->
|
||||||
|
<!-- Don't scrape just one URL at a time -->
|
||||||
<scrape-webpage
|
<scrape-webpage
|
||||||
url="https://example.com/research/ai-paper-2024"
|
urls="https://example.com/research/ai-paper-2024">
|
||||||
result_name="ai_research_paper">
|
|
||||||
</scrape-webpage>
|
</scrape-webpage>
|
||||||
|
|
||||||
<!-- 3. Only if scrape fails or interaction needed, use browser tools -->
|
<!-- 3. CORRECT WAY (efficient) - do this instead -->
|
||||||
<!-- Example of when to use browser tools:
|
<!-- Always scrape multiple URLs in a single call -->
|
||||||
- Dynamic content loading
|
<scrape-webpage
|
||||||
- JavaScript-heavy sites
|
urls="https://example.com/research/paper1,https://example.com/research/paper2,https://example.com/research/paper3,https://example.com/research/paper4">
|
||||||
- Pages requiring login
|
</scrape-webpage>
|
||||||
- Interactive elements
|
|
||||||
- Infinite scroll pages
|
|
||||||
-->
|
|
||||||
'''
|
'''
|
||||||
)
|
)
|
||||||
async def scrape_webpage(
|
async def scrape_webpage(
|
||||||
self,
|
self,
|
||||||
url: str,
|
urls: str
|
||||||
result_name: str = ""
|
|
||||||
) -> ToolResult:
|
) -> ToolResult:
|
||||||
"""
|
"""
|
||||||
Retrieve the complete text content of a webpage using Firecrawl and save it to a file.
|
Retrieve the complete text content of multiple webpages in a single efficient operation.
|
||||||
|
|
||||||
This function scrapes the specified URL and extracts the full text content from the page.
|
ALWAYS collect multiple relevant URLs from search results and scrape them all at once
|
||||||
The extracted text is saved to a file in the /workspace/scrape directory.
|
rather than making separate calls for each URL. This is much more efficient.
|
||||||
|
|
||||||
Parameters:
|
Parameters:
|
||||||
- url: The URL of the webpage to scrape
|
- urls: Multiple URLs to scrape, separated by commas
|
||||||
- result_name: Optional name for the result file (if not provided, generated from URL)
|
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
logging.info(f"Starting to scrape webpage: {url}")
|
logging.info(f"Starting to scrape webpages: {urls}")
|
||||||
|
|
||||||
# Ensure sandbox is initialized
|
# Ensure sandbox is initialized
|
||||||
await self._ensure_sandbox()
|
await self._ensure_sandbox()
|
||||||
|
|
||||||
# Parse the URL parameter exactly as it would appear in XML
|
# Parse the URLs parameter
|
||||||
if not url:
|
if not urls:
|
||||||
logging.warning("Scrape attempt with empty URL")
|
logging.warning("Scrape attempt with empty URLs")
|
||||||
return self.fail_response("A valid URL is required.")
|
return self.fail_response("Valid URLs are required.")
|
||||||
|
|
||||||
|
# Split the URLs string into a list
|
||||||
|
url_list = [url.strip() for url in urls.split(',') if url.strip()]
|
||||||
|
|
||||||
|
if not url_list:
|
||||||
|
logging.warning("No valid URLs found in the input")
|
||||||
|
return self.fail_response("No valid URLs provided.")
|
||||||
|
|
||||||
# Handle url parameter (as it would appear in XML)
|
if len(url_list) == 1:
|
||||||
if isinstance(url, str):
|
logging.warning("Only a single URL provided - for efficiency you should scrape multiple URLs at once")
|
||||||
# Add protocol if missing
|
|
||||||
if not (url.startswith('http://') or url.startswith('https://')):
|
logging.info(f"Processing {len(url_list)} URLs: {url_list}")
|
||||||
url = 'https://' + url
|
|
||||||
logging.info(f"Added https:// protocol to URL: {url}")
|
# Process each URL and collect results
|
||||||
|
results = []
|
||||||
|
for url in url_list:
|
||||||
|
try:
|
||||||
|
# Add protocol if missing
|
||||||
|
if not (url.startswith('http://') or url.startswith('https://')):
|
||||||
|
url = 'https://' + url
|
||||||
|
logging.info(f"Added https:// protocol to URL: {url}")
|
||||||
|
|
||||||
|
# Scrape this URL
|
||||||
|
result = await self._scrape_single_url(url)
|
||||||
|
results.append(result)
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logging.error(f"Error processing URL {url}: {str(e)}")
|
||||||
|
results.append({
|
||||||
|
"url": url,
|
||||||
|
"success": False,
|
||||||
|
"error": str(e)
|
||||||
|
})
|
||||||
|
|
||||||
|
# Summarize results
|
||||||
|
successful = sum(1 for r in results if r.get("success", False))
|
||||||
|
failed = len(results) - successful
|
||||||
|
|
||||||
|
# Create success/failure message
|
||||||
|
if successful == len(results):
|
||||||
|
message = f"Successfully scraped all {len(results)} URLs. Results saved to:"
|
||||||
|
for r in results:
|
||||||
|
if r.get("file_path"):
|
||||||
|
message += f"\n- {r.get('file_path')}"
|
||||||
|
elif successful > 0:
|
||||||
|
message = f"Scraped {successful} URLs successfully and {failed} failed. Results saved to:"
|
||||||
|
for r in results:
|
||||||
|
if r.get("success", False) and r.get("file_path"):
|
||||||
|
message += f"\n- {r.get('file_path')}"
|
||||||
|
message += "\n\nFailed URLs:"
|
||||||
|
for r in results:
|
||||||
|
if not r.get("success", False):
|
||||||
|
message += f"\n- {r.get('url')}: {r.get('error', 'Unknown error')}"
|
||||||
else:
|
else:
|
||||||
logging.warning(f"Invalid URL type: {type(url)}")
|
error_details = "; ".join([f"{r.get('url')}: {r.get('error', 'Unknown error')}" for r in results])
|
||||||
return self.fail_response("URL must be a string.")
|
return self.fail_response(f"Failed to scrape all {len(results)} URLs. Errors: {error_details}")
|
||||||
|
|
||||||
|
return ToolResult(
|
||||||
|
success=True,
|
||||||
|
output=message
|
||||||
|
)
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
error_message = str(e)
|
||||||
|
logging.error(f"Error in scrape_webpage: {error_message}")
|
||||||
|
return self.fail_response(f"Error processing scrape request: {error_message[:200]}")
|
||||||
|
|
||||||
|
async def _scrape_single_url(self, url: str) -> dict:
|
||||||
|
"""
|
||||||
|
Helper function to scrape a single URL and return the result information.
|
||||||
|
"""
|
||||||
|
logging.info(f"Scraping single URL: {url}")
|
||||||
|
|
||||||
|
try:
|
||||||
# ---------- Firecrawl scrape endpoint ----------
|
# ---------- Firecrawl scrape endpoint ----------
|
||||||
logging.info(f"Sending request to Firecrawl for URL: {url}")
|
logging.info(f"Sending request to Firecrawl for URL: {url}")
|
||||||
async with httpx.AsyncClient() as client:
|
async with httpx.AsyncClient() as client:
|
||||||
|
@ -328,27 +368,17 @@ class SandboxWebSearchTool(SandboxToolsBase):
|
||||||
formatted_result["metadata"] = data["data"]["metadata"]
|
formatted_result["metadata"] = data["data"]["metadata"]
|
||||||
logging.info(f"Added metadata: {data['data']['metadata'].keys()}")
|
logging.info(f"Added metadata: {data['data']['metadata'].keys()}")
|
||||||
|
|
||||||
# Create a safe filename from the URL or use provided result_name
|
# Create a simple filename from the URL domain and date
|
||||||
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
|
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||||
if result_name:
|
|
||||||
safe_filename = f"{timestamp}_{result_name}"
|
|
||||||
else:
|
|
||||||
# Extract domain and path from URL for the filename
|
|
||||||
from urllib.parse import urlparse
|
|
||||||
parsed_url = urlparse(url)
|
|
||||||
domain = parsed_url.netloc.replace("www.", "")
|
|
||||||
path = parsed_url.path.rstrip("/")
|
|
||||||
if path:
|
|
||||||
last_part = path.split("/")[-1]
|
|
||||||
safe_filename = f"{timestamp}_{domain}_{last_part}"
|
|
||||||
else:
|
|
||||||
safe_filename = f"{timestamp}_{domain}"
|
|
||||||
# Clean up filename
|
|
||||||
safe_filename = "".join([c if c.isalnum() else "_" for c in safe_filename])[:60]
|
|
||||||
|
|
||||||
# Ensure .json extension
|
# Extract domain from URL for the filename
|
||||||
if not safe_filename.endswith('.json'):
|
from urllib.parse import urlparse
|
||||||
safe_filename += '.json'
|
parsed_url = urlparse(url)
|
||||||
|
domain = parsed_url.netloc.replace("www.", "")
|
||||||
|
|
||||||
|
# Clean up domain for filename
|
||||||
|
domain = "".join([c if c.isalnum() else "_" for c in domain])
|
||||||
|
safe_filename = f"{timestamp}_{domain}.json"
|
||||||
|
|
||||||
logging.info(f"Generated filename: {safe_filename}")
|
logging.info(f"Generated filename: {safe_filename}")
|
||||||
|
|
||||||
|
@ -365,34 +395,24 @@ class SandboxWebSearchTool(SandboxToolsBase):
|
||||||
json_content.encode()
|
json_content.encode()
|
||||||
)
|
)
|
||||||
|
|
||||||
return ToolResult(
|
return {
|
||||||
success=True,
|
"url": url,
|
||||||
output=f"Successfully saved the scrape of the website under path '{results_file_path}'."
|
"success": True,
|
||||||
)
|
"title": title,
|
||||||
|
"file_path": results_file_path,
|
||||||
|
"content_length": len(markdown_content)
|
||||||
|
}
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
error_message = str(e)
|
error_message = str(e)
|
||||||
# Log the full error for debugging
|
logging.error(f"Error scraping URL '{url}': {error_message}")
|
||||||
logging.error(f"Scraping error for URL '{url}': {error_message}")
|
|
||||||
|
|
||||||
# Create a more informative error message for the user
|
# Create an error result
|
||||||
if "timeout" in error_message.lower():
|
return {
|
||||||
user_message = f"The request timed out while trying to scrape the webpage. The site might be slow or blocking automated access."
|
"url": url,
|
||||||
elif "connection" in error_message.lower():
|
"success": False,
|
||||||
user_message = f"Could not connect to the website. The site might be down or blocking access."
|
"error": error_message
|
||||||
elif "404" in error_message:
|
}
|
||||||
user_message = f"The webpage was not found (404 error). Please check if the URL is correct."
|
|
||||||
elif "403" in error_message:
|
|
||||||
user_message = f"Access to the webpage was forbidden (403 error). The site may be blocking automated access."
|
|
||||||
elif "401" in error_message:
|
|
||||||
user_message = f"Authentication required to access this webpage (401 error)."
|
|
||||||
else:
|
|
||||||
user_message = f"Error scraping webpage: {error_message[:200]}"
|
|
||||||
if len(error_message) > 200:
|
|
||||||
user_message += "..."
|
|
||||||
|
|
||||||
return self.fail_response(user_message)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
async def test_web_search():
|
async def test_web_search():
|
||||||
|
|
|
@ -351,21 +351,13 @@ class BrowserAutomation:
|
||||||
self.browser = await playwright.chromium.launch(**launch_options)
|
self.browser = await playwright.chromium.launch(**launch_options)
|
||||||
print("Browser launched with minimal options")
|
print("Browser launched with minimal options")
|
||||||
|
|
||||||
try:
|
# Directly create a single page
|
||||||
await self.get_current_page()
|
print("Creating a new page...")
|
||||||
print("Found existing page, using it")
|
page = await self.browser.new_page()
|
||||||
self.current_page_index = 0
|
self.pages.append(page)
|
||||||
except Exception as page_error:
|
self.current_page_index = 0
|
||||||
print(f"Error finding existing page, creating new one. ( {page_error})")
|
print("Browser initialization completed successfully")
|
||||||
# page = await self.browser.new_page()
|
|
||||||
print("New page created successfully")
|
|
||||||
# self.pages.append(page)
|
|
||||||
self.current_page_index = 0
|
|
||||||
# Navigate to about:blank to ensure page is ready
|
|
||||||
# await page.goto("google.com", timeout=30000)
|
|
||||||
# print("Navigated to google.com")
|
|
||||||
|
|
||||||
print("Browser initialization completed successfully")
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Browser startup error: {str(e)}")
|
print(f"Browser startup error: {str(e)}")
|
||||||
traceback.print_exc()
|
traceback.print_exc()
|
||||||
|
|
Loading…
Reference in New Issue