mirror of https://github.com/kortix-ai/suna.git
Scrape-and-read based approach v1: non-optimal, but better for context
This commit is contained in:
parent
8847515d1c
commit
3a0f6db03e
|
@ -1,6 +1,6 @@
|
|||
import datetime
|
||||
|
||||
SYSTEM_PROMPT = f"""
|
||||
SYSTEM_PROMPT = """
|
||||
You are Suna.so, an autonomous AI Agent created by the Kortix team.
|
||||
|
||||
# 1. CORE IDENTITY & CAPABILITIES
|
||||
|
@ -89,7 +89,7 @@ You have the ability to execute operations using both Python and CLI tools:
|
|||
- You can use the 'get_data_provider_endpoints' tool to get the endpoints for a specific data provider.
|
||||
- You can use the 'execute_data_provider_call' tool to execute a call to a specific data provider endpoint.
|
||||
- The data providers are:
|
||||
* linkedin - for LinkedIn data
|
||||
* linkedin - for LinkedIn data - ALWAYS USE THIS INSTEAD OF TRYING TO SCRAPE LINKEDIN PAGES
|
||||
* twitter - for Twitter data
|
||||
* zillow - for Zillow data
|
||||
* amazon - for Amazon data
|
||||
|
@ -97,6 +97,7 @@ You have the ability to execute operations using both Python and CLI tools:
|
|||
* active_jobs - for Active Jobs data
|
||||
- Use data providers where appropriate to get the most accurate and up-to-date data for your tasks. This is preferred over generic web scraping.
|
||||
- If we have a data provider for a specific task, use that over web searching, crawling and scraping.
|
||||
- IMPORTANT: For LinkedIn profiles and company information, ALWAYS use the LinkedIn data provider instead of trying to scrape LinkedIn pages, which will fail due to access restrictions.
|
||||
|
||||
# 3. TOOLKIT & METHODOLOGY
|
||||
|
||||
|
@ -361,16 +362,24 @@ You have the ability to execute operations using both Python and CLI tools:
|
|||
## 4.4 WEB SEARCH & CONTENT EXTRACTION
|
||||
- Research Best Practices:
|
||||
1. ALWAYS use a multi-source approach for thorough research:
|
||||
* Use data providers first when available, especially for:
|
||||
- LinkedIn profiles and company pages (ALWAYS use LinkedIn data provider, NEVER try to scrape LinkedIn)
|
||||
- Twitter profiles and tweets
|
||||
- Zillow real estate listings
|
||||
- Amazon product listings
|
||||
- Yahoo Finance stock and company data
|
||||
* Start with web-search to find relevant URLs and sources
|
||||
* Use scrape-webpage on URLs from web-search results to get detailed content
|
||||
* Utilize data providers for real-time, accurate data when available
|
||||
* ALWAYS collect MULTIPLE URLs (at least 3-5) from search results
|
||||
* ALWAYS scrape multiple URLs together in a single command, not one at a time:
|
||||
CORRECT: `<scrape-webpage urls="url1,url2,url3,url4,url5"></scrape-webpage>`
|
||||
INCORRECT: `<scrape-webpage urls="url1"></scrape-webpage>`
|
||||
* Only use browser tools when scrape-webpage fails or interaction is needed
|
||||
|
||||
2. Data Provider Priority:
|
||||
* ALWAYS check if a data provider exists for your research topic
|
||||
* Use data providers as the primary source when available
|
||||
* Data providers offer real-time, accurate data for:
|
||||
- LinkedIn data
|
||||
- LinkedIn data (REQUIRED - NEVER try to scrape LinkedIn directly)
|
||||
- Twitter data
|
||||
- Zillow data
|
||||
- Amazon data
|
||||
|
@ -379,51 +388,49 @@ You have the ability to execute operations using both Python and CLI tools:
|
|||
* Only fall back to web search when no data provider is available
|
||||
|
||||
3. Working with Scraped Web Content:
|
||||
* Scraped webpages are saved as JSON files in the /workspace/scrape directory
|
||||
* The JSON structure includes: title, url, text (markdown content), and metadata
|
||||
* For efficient analysis of large scraped files, ALWAYS use chained CLI commands:
|
||||
- Chain commands with && for processing in a single operation:
|
||||
`mkdir -p temp_data && cat scrape/file.json | jq .text > temp_data/content.md && grep -A 10 "keyword" temp_data/content.md`
|
||||
- Combine jq, grep, awk, and other tools in a single pipeline:
|
||||
`cat scrape/file.json | jq .text | grep -C 5 "important topic" | awk '{{print $1, $2}}' > results.txt`
|
||||
- Extract and process in one command:
|
||||
`cat scrape/file.json | jq .text | grep -A 10 -B 10 "search term" | grep -v "exclude term" > relevant_section.txt && cat relevant_section.txt`
|
||||
* Scraped webpages are JSON files in /workspace/scrape with structure: title, url, text, metadata
|
||||
* BEST PRACTICES FOR LARGE FILES - ALWAYS:
|
||||
- Limit initial output with head/tail: `cat file.json | jq .text | head -n 300`
|
||||
- Use grep with line limits: `cat file.json | jq .text | grep -m 20 "keyword"` (stops after 20 matches)
|
||||
- Combine tools to focus on specific sections: `cat file.json | jq .text | grep -A 5 -B 5 -m 10 "keyword"`
|
||||
- Process in chunks: `cat file.json | jq .text | head -n 1000 | grep "term"`
|
||||
- Target specific line ranges: `cat file.json | jq .text | sed -n '100,120p'`
|
||||
|
||||
* IMPORTANT: Process multiple search results simultaneously with && for efficiency:
|
||||
- Process multiple files in one command:
|
||||
`cat scrape/file1.json | jq .text | grep "term" > result1.txt && cat scrape/file2.json | jq .text | grep "term" > result2.txt && cat result1.txt result2.txt > combined.txt`
|
||||
- Compare multiple sources in parallel:
|
||||
`cat scrape/source1.json | jq .text | grep -o "key term" | wc -l > count1.txt && cat scrape/source2.json | jq .text | grep -o "key term" | wc -l > count2.txt && echo "Source 1: $(cat count1.txt) occurrences, Source 2: $(cat count2.txt) occurrences"`
|
||||
- Search across multiple files:
|
||||
`for f in scrape/*.json; do cat $f | jq .text | grep -l "search term" && echo "Found in $f"; done`
|
||||
- Extract same information from multiple sources:
|
||||
`cat scrape/file1.json | jq .metadata.date > dates.txt && cat scrape/file2.json | jq .metadata.date >> dates.txt && cat scrape/file3.json | jq .metadata.date >> dates.txt && sort dates.txt`
|
||||
* EFFICIENT COMMAND PATTERNS:
|
||||
- Single-pipeline extraction: `cat file.json | jq .text | grep -A 5 -B 5 -m 10 "keyword" > extract.txt && cat extract.txt`
|
||||
- Multi-file processing: `for f in scrape/*.json; do cat $f | jq .text | grep -m 5 "keyword" && echo "-- $f --"; done`
|
||||
- Targeted search with context limit: `cat file.json | jq .text | grep -A 10 -B 10 -m 5 "term" | grep -v "exclude"`
|
||||
- Count before extracting: `cat file.json | jq .text | grep -c "term" && cat file.json | jq .text | grep -m 20 "term"`
|
||||
|
||||
* Use these command combinations for specific tasks:
|
||||
- Preview content: `cat scrape/file.json | jq .text | head -n 20`
|
||||
- Find sections with context: `cat scrape/file.json | jq .text | grep -A 10 -B 10 "keyword" | less`
|
||||
- Extract metadata: `cat scrape/file.json | jq .metadata`
|
||||
- Count occurrences: `cat scrape/file.json | jq .text | grep -o "term" | wc -l`
|
||||
- Extract and save relevant parts: `cat scrape/file.json | jq .text | awk '/start pattern/,/end pattern/' > extract.txt`
|
||||
* MULTI-SOURCE PROCESSING:
|
||||
- Process multiple files together: `cat scrape/file1.json scrape/file2.json | jq .text | grep -m 30 "term"`
|
||||
- Compare sources: `cat scrape/file1.json | jq .text | grep -c "term" && cat scrape/file2.json | jq .text | grep -c "term"`
|
||||
- Extract from all files: `grep -l "term" scrape/*.json | xargs -I% cat % | jq .text | grep -A 5 -B 5 -m 3 "term"`
|
||||
- Safe iteration: `find scrape -name "*.json" -type f | head -n 5 | xargs -I% sh -c 'echo "=== % ==="; cat % | jq .text | head -n 50'`
|
||||
|
||||
* For structured analysis:
|
||||
1. Extract focused sections and process them in a single command chain
|
||||
2. Avoid multiple separate commands when a single pipeline can accomplish the task
|
||||
3. Use temporary files only when necessary, prefer direct pipelines
|
||||
4. ALWAYS chain commands with && when processing multiple files or sources
|
||||
5. Group related operations together into a single command execution
|
||||
* KEY CLI SAFEGUARDS:
|
||||
- ALWAYS limit output size: use head/tail, grep -m, or other limiters
|
||||
- Inspect before extracting: `cat file.json | jq .text | wc -l` to check size
|
||||
- Process iteratively: examine small samples before processing entire files
|
||||
- Use line numbers to target sections: `sed -n '100,200p' file.txt`
|
||||
- Prefer targeted extraction over retrieving entire files
|
||||
|
||||
* When sharing scraped content in responses:
|
||||
1. Extract only the most relevant sections using efficient command chains
|
||||
2. Include original source URL attribution
|
||||
3. Cite from multiple sources simultaneously using parallel processing
|
||||
* INFORMATION GATHERING WORKFLOW:
|
||||
1. First check file size and structure: `du -h file.json && cat file.json | jq .text | wc -l`
|
||||
2. Extract focused samples: `cat file.json | jq .text | grep -m 10 -A 3 -B 3 "keyword"`
|
||||
3. Refine search with additional context: `cat file.json | jq .text | grep -m 5 -A 10 -B 10 "term1" | grep "term2"`
|
||||
4. Analyze multiple sources in parallel with safeguards against excessive output
|
||||
5. Summarize findings from targeted extracts, not entire documents
|
||||
|
||||
4. Research Workflow:
|
||||
4. Efficient Research Workflow:
|
||||
a. First check for relevant data providers
|
||||
b. If no data provider exists:
|
||||
- Use web-search to find relevant URLs
|
||||
- Use scrape-webpage on URLs from web-search results
|
||||
- Process scraped content with CLI tools (grep, jq, awk, etc.)
|
||||
- Start with web-search to find relevant URLs:
|
||||
`<web-search query="topic" num_results="10"></web-search>`
|
||||
- Then scrape MULTIPLE relevant URLs at once (NEVER just one):
|
||||
`<scrape-webpage urls="url1,url2,url3,url4,url5"></scrape-webpage>`
|
||||
- Process scraped content with CLI tools in single command chains using limits:
|
||||
* `cat scrape/20240601_123456_example_com.json | jq .text | grep -m 30 -A 10 -B 10 "key concept" > findings.txt && cat findings.txt`
|
||||
- Only if scrape-webpage fails or if the page requires interaction:
|
||||
* Use direct browser tools (browser_navigate_to, browser_go_back, browser_wait, browser_click_element, browser_input_text, browser_send_keys, browser_switch_tab, browser_close_tab, browser_scroll_down, browser_scroll_up, browser_scroll_to_text, browser_get_dropdown_options, browser_select_dropdown_option, browser_drag_drop, browser_click_coordinates etc.)
|
||||
* This is needed for:
|
||||
|
@ -680,4 +687,4 @@ def get_system_prompt():
|
|||
'''
|
||||
Returns the system prompt
|
||||
'''
|
||||
return SYSTEM_PROMPT
|
||||
return SYSTEM_PROMPT.replace("//", "#")
|
|
@ -19,7 +19,7 @@ from agent.tools.sb_files_tool import SandboxFilesTool
|
|||
from agent.tools.sb_browser_tool import SandboxBrowserTool
|
||||
from agent.tools.data_providers_tool import DataProvidersTool
|
||||
from agent.prompt import get_system_prompt
|
||||
from utils import logger
|
||||
from utils.logger import logger
|
||||
from utils.auth_utils import get_account_id_from_thread
|
||||
from services.billing import check_billing_status
|
||||
from agent.tools.sb_vision_tool import SandboxVisionTool
|
||||
|
@ -39,7 +39,7 @@ async def run_agent(
|
|||
enable_context_manager: bool = True
|
||||
):
|
||||
"""Run the development agent with specified configuration."""
|
||||
print(f"🚀 Starting agent with model: {model_name}")
|
||||
logger.info(f"🚀 Starting agent with model: {model_name}")
|
||||
|
||||
thread_manager = ThreadManager()
|
||||
|
||||
|
@ -90,7 +90,7 @@ async def run_agent(
|
|||
|
||||
while continue_execution and iteration_count < max_iterations:
|
||||
iteration_count += 1
|
||||
print(f"🔄 Running iteration {iteration_count} of {max_iterations}...")
|
||||
logger.info(f"🔄 Running iteration {iteration_count} of {max_iterations}...")
|
||||
|
||||
# Billing check on each iteration - still needed within the iterations
|
||||
can_run, message, subscription = await check_billing_status(client, account_id)
|
||||
|
@ -108,7 +108,7 @@ async def run_agent(
|
|||
if latest_message.data and len(latest_message.data) > 0:
|
||||
message_type = latest_message.data[0].get('type')
|
||||
if message_type == 'assistant':
|
||||
print(f"Last message was from assistant, stopping execution")
|
||||
logger.info(f"Last message was from assistant, stopping execution")
|
||||
continue_execution = False
|
||||
break
|
||||
|
||||
|
@ -215,7 +215,7 @@ async def run_agent(
|
|||
)
|
||||
|
||||
if isinstance(response, dict) and "status" in response and response["status"] == "error":
|
||||
print(f"Error response from run_thread: {response.get('message', 'Unknown error')}")
|
||||
logger.error(f"Error response from run_thread: {response.get('message', 'Unknown error')}")
|
||||
yield response
|
||||
break
|
||||
|
||||
|
@ -228,7 +228,7 @@ async def run_agent(
|
|||
async for chunk in response:
|
||||
# If we receive an error chunk, we should stop after this iteration
|
||||
if isinstance(chunk, dict) and chunk.get('type') == 'status' and chunk.get('status') == 'error':
|
||||
print(f"Error chunk detected: {chunk.get('message', 'Unknown error')}")
|
||||
logger.error(f"Error chunk detected: {chunk.get('message', 'Unknown error')}")
|
||||
error_detected = True
|
||||
yield chunk # Forward the error chunk
|
||||
continue # Continue processing other chunks but don't break yet
|
||||
|
@ -256,27 +256,27 @@ async def run_agent(
|
|||
xml_tool = 'web-browser-takeover'
|
||||
|
||||
last_tool_call = xml_tool
|
||||
print(f"Agent used XML tool: {xml_tool}")
|
||||
logger.info(f"Agent used XML tool: {xml_tool}")
|
||||
except json.JSONDecodeError:
|
||||
# Handle cases where content might not be valid JSON
|
||||
print(f"Warning: Could not parse assistant content JSON: {chunk.get('content')}")
|
||||
logger.warning(f"Warning: Could not parse assistant content JSON: {chunk.get('content')}")
|
||||
except Exception as e:
|
||||
print(f"Error processing assistant chunk: {e}")
|
||||
logger.error(f"Error processing assistant chunk: {e}")
|
||||
|
||||
yield chunk
|
||||
|
||||
# Check if we should stop based on the last tool call or error
|
||||
if error_detected:
|
||||
print(f"Stopping due to error detected in response")
|
||||
logger.info(f"Stopping due to error detected in response")
|
||||
break
|
||||
|
||||
if last_tool_call in ['ask', 'complete', 'web-browser-takeover']:
|
||||
print(f"Agent decided to stop with tool: {last_tool_call}")
|
||||
logger.info(f"Agent decided to stop with tool: {last_tool_call}")
|
||||
continue_execution = False
|
||||
except Exception as e:
|
||||
# Just log the error and re-raise to stop all iterations
|
||||
error_msg = f"Error during response streaming: {str(e)}"
|
||||
print(f"Error: {error_msg}")
|
||||
logger.error(f"Error: {error_msg}")
|
||||
yield {
|
||||
"type": "status",
|
||||
"status": "error",
|
||||
|
@ -288,7 +288,7 @@ async def run_agent(
|
|||
except Exception as e:
|
||||
# Just log the error and re-raise to stop all iterations
|
||||
error_msg = f"Error running thread: {str(e)}"
|
||||
print(f"Error: {error_msg}")
|
||||
logger.error(f"Error: {error_msg}")
|
||||
yield {
|
||||
"type": "status",
|
||||
"status": "error",
|
||||
|
|
|
@ -37,7 +37,7 @@ class SandboxWebSearchTool(SandboxToolsBase):
|
|||
"type": "function",
|
||||
"function": {
|
||||
"name": "web_search",
|
||||
"description": "Search the web for up-to-date information on a specific topic using the Tavily API. This tool allows you to gather real-time information from the internet to answer user queries, research topics, validate facts, and find recent developments. Results include titles, URLs, summaries, and publication dates. Use this tool for discovering relevant web pages before potentially crawling them for complete content.",
|
||||
"description": "Search the web for up-to-date information on a specific topic using the Tavily API. This tool allows you to gather real-time information from the internet to answer user queries, research topics, validate facts, and find recent developments. Results include titles, URLs, and publication dates. Use this tool for discovering relevant web pages before potentially crawling them for complete content.",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
|
@ -45,11 +45,6 @@ class SandboxWebSearchTool(SandboxToolsBase):
|
|||
"type": "string",
|
||||
"description": "The search query to find relevant web pages. Be specific and include key terms to improve search accuracy. For best results, use natural language questions or keyword combinations that precisely describe what you're looking for."
|
||||
},
|
||||
# "summary": {
|
||||
# "type": "boolean",
|
||||
# "description": "Whether to include a summary of each search result. Summaries provide key context about each page without requiring full content extraction. Set to true to get concise descriptions of each result.",
|
||||
# "default": True
|
||||
# },
|
||||
"num_results": {
|
||||
"type": "integer",
|
||||
"description": "The number of search results to return. Increase for more comprehensive research or decrease for focused, high-relevance results.",
|
||||
|
@ -64,7 +59,6 @@ class SandboxWebSearchTool(SandboxToolsBase):
|
|||
tag_name="web-search",
|
||||
mappings=[
|
||||
{"param_name": "query", "node_type": "attribute", "path": "."},
|
||||
# {"param_name": "summary", "node_type": "attribute", "path": "."},
|
||||
{"param_name": "num_results", "node_type": "attribute", "path": "."}
|
||||
],
|
||||
example='''
|
||||
|
@ -87,14 +81,13 @@ class SandboxWebSearchTool(SandboxToolsBase):
|
|||
<!-- Another search example -->
|
||||
<web-search
|
||||
query="healthy breakfast recipes"
|
||||
num_results="20">
|
||||
num_results="10">
|
||||
</web-search>
|
||||
'''
|
||||
)
|
||||
async def web_search(
|
||||
self,
|
||||
query: str,
|
||||
# summary: bool = True,
|
||||
query: str,
|
||||
num_results: int = 20
|
||||
) -> ToolResult:
|
||||
"""
|
||||
|
@ -119,6 +112,7 @@ class SandboxWebSearchTool(SandboxToolsBase):
|
|||
num_results = 20
|
||||
|
||||
# Execute the search with Tavily
|
||||
logging.info(f"Executing web search for query: '{query}' with {num_results} results")
|
||||
search_response = await self.tavily_client.search(
|
||||
query=query,
|
||||
max_results=num_results,
|
||||
|
@ -141,16 +135,10 @@ class SandboxWebSearchTool(SandboxToolsBase):
|
|||
"url": result.get("url", ""),
|
||||
}
|
||||
|
||||
# if summary:
|
||||
# # Prefer full content; fall back to description
|
||||
# formatted_result["snippet"] = (
|
||||
# result.get("content") or
|
||||
# result.get("description") or
|
||||
# ""
|
||||
# )
|
||||
|
||||
formatted_results.append(formatted_result)
|
||||
|
||||
logging.info(f"Retrieved {len(formatted_results)} search results for query: '{query}'")
|
||||
|
||||
# Return a properly formatted ToolResult with the search results directly
|
||||
return ToolResult(
|
||||
success=True,
|
||||
|
@ -159,6 +147,7 @@ class SandboxWebSearchTool(SandboxToolsBase):
|
|||
|
||||
except Exception as e:
|
||||
error_message = str(e)
|
||||
logging.error(f"Error performing web search for '{query}': {error_message}")
|
||||
simplified_message = f"Error performing web search: {error_message[:200]}"
|
||||
if len(error_message) > 200:
|
||||
simplified_message += "..."
|
||||
|
@ -168,39 +157,32 @@ class SandboxWebSearchTool(SandboxToolsBase):
|
|||
"type": "function",
|
||||
"function": {
|
||||
"name": "scrape_webpage",
|
||||
"description": "Retrieve the complete text content of a specific webpage using Firecrawl. This tool extracts the full text content from any accessible web page and returns it for analysis, processing, or reference. The extracted text includes the main content of the page without HTML markup. Note that some pages may have limitations on access due to paywalls, access restrictions, or dynamic content loading.",
|
||||
"description": "Extract full text content from multiple webpages in a single operation. IMPORTANT: You should ALWAYS collect multiple relevant URLs from web-search results and scrape them all in a single call for efficiency. This tool saves time by processing multiple pages simultaneously rather than one at a time. The extracted text includes the main content of each page without HTML markup.",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"url": {
|
||||
"urls": {
|
||||
"type": "string",
|
||||
"description": "The complete URL of the webpage to scrape. This should be a valid, accessible web address including the protocol (http:// or https://). The tool will attempt to extract all text content from this URL."
|
||||
},
|
||||
"result_name": {
|
||||
"type": "string",
|
||||
"description": "Name to use for the saved result file. If not provided, a name will be generated from the URL.",
|
||||
"default": ""
|
||||
"description": "Multiple URLs to scrape, separated by commas. You should ALWAYS include several URLs when possible for efficiency. Example: 'https://example.com/page1,https://example.com/page2,https://example.com/page3'"
|
||||
}
|
||||
},
|
||||
"required": ["url"]
|
||||
"required": ["urls"]
|
||||
}
|
||||
}
|
||||
})
|
||||
@xml_schema(
|
||||
tag_name="scrape-webpage",
|
||||
mappings=[
|
||||
{"param_name": "url", "node_type": "attribute", "path": "."},
|
||||
{"param_name": "result_name", "node_type": "attribute", "path": ".", "required": False}
|
||||
{"param_name": "urls", "node_type": "attribute", "path": "."}
|
||||
],
|
||||
example='''
|
||||
<!--
|
||||
The scrape-webpage tool extracts the complete text content from web pages using Firecrawl.
|
||||
IMPORTANT WORKFLOW RULES:
|
||||
1. ALWAYS use web-search first to find relevant URLs
|
||||
2. Then use scrape-webpage on URLs from web-search results
|
||||
3. Only if scrape-webpage fails or if the page requires interaction:
|
||||
- Use direct browser tools (browser_navigate_to, browser_click_element, etc.)
|
||||
- This is needed for dynamic content, JavaScript-heavy sites, or pages requiring interaction
|
||||
2. COLLECT MULTIPLE URLs from web-search results, not just one
|
||||
3. ALWAYS scrape multiple URLs at once for efficiency, rather than making separate calls
|
||||
4. Only use browser tools if scrape-webpage fails
|
||||
|
||||
Firecrawl Features:
|
||||
- Converts web pages into clean markdown
|
||||
|
@ -217,58 +199,116 @@ class SandboxWebSearchTool(SandboxToolsBase):
|
|||
num_results="5">
|
||||
</web-search>
|
||||
|
||||
<!-- 2. Then scrape specific URLs from search results -->
|
||||
<!-- 2. WRONG WAY (inefficient) - don't do this -->
|
||||
<!-- Don't scrape just one URL at a time -->
|
||||
<scrape-webpage
|
||||
url="https://example.com/research/ai-paper-2024"
|
||||
result_name="ai_research_paper">
|
||||
urls="https://example.com/research/ai-paper-2024">
|
||||
</scrape-webpage>
|
||||
|
||||
<!-- 3. Only if scrape fails or interaction needed, use browser tools -->
|
||||
<!-- Example of when to use browser tools:
|
||||
- Dynamic content loading
|
||||
- JavaScript-heavy sites
|
||||
- Pages requiring login
|
||||
- Interactive elements
|
||||
- Infinite scroll pages
|
||||
-->
|
||||
<!-- 3. CORRECT WAY (efficient) - do this instead -->
|
||||
<!-- Always scrape multiple URLs in a single call -->
|
||||
<scrape-webpage
|
||||
urls="https://example.com/research/paper1,https://example.com/research/paper2,https://example.com/research/paper3,https://example.com/research/paper4">
|
||||
</scrape-webpage>
|
||||
'''
|
||||
)
|
||||
async def scrape_webpage(
|
||||
self,
|
||||
url: str,
|
||||
result_name: str = ""
|
||||
urls: str
|
||||
) -> ToolResult:
|
||||
"""
|
||||
Retrieve the complete text content of a webpage using Firecrawl and save it to a file.
|
||||
Retrieve the complete text content of multiple webpages in a single efficient operation.
|
||||
|
||||
This function scrapes the specified URL and extracts the full text content from the page.
|
||||
The extracted text is saved to a file in the /workspace/scrape directory.
|
||||
ALWAYS collect multiple relevant URLs from search results and scrape them all at once
|
||||
rather than making separate calls for each URL. This is much more efficient.
|
||||
|
||||
Parameters:
|
||||
- url: The URL of the webpage to scrape
|
||||
- result_name: Optional name for the result file (if not provided, generated from URL)
|
||||
- urls: Multiple URLs to scrape, separated by commas
|
||||
"""
|
||||
try:
|
||||
logging.info(f"Starting to scrape webpage: {url}")
|
||||
logging.info(f"Starting to scrape webpages: {urls}")
|
||||
|
||||
# Ensure sandbox is initialized
|
||||
await self._ensure_sandbox()
|
||||
|
||||
# Parse the URL parameter exactly as it would appear in XML
|
||||
if not url:
|
||||
logging.warning("Scrape attempt with empty URL")
|
||||
return self.fail_response("A valid URL is required.")
|
||||
# Parse the URLs parameter
|
||||
if not urls:
|
||||
logging.warning("Scrape attempt with empty URLs")
|
||||
return self.fail_response("Valid URLs are required.")
|
||||
|
||||
# Split the URLs string into a list
|
||||
url_list = [url.strip() for url in urls.split(',') if url.strip()]
|
||||
|
||||
if not url_list:
|
||||
logging.warning("No valid URLs found in the input")
|
||||
return self.fail_response("No valid URLs provided.")
|
||||
|
||||
# Handle url parameter (as it would appear in XML)
|
||||
if isinstance(url, str):
|
||||
# Add protocol if missing
|
||||
if not (url.startswith('http://') or url.startswith('https://')):
|
||||
url = 'https://' + url
|
||||
logging.info(f"Added https:// protocol to URL: {url}")
|
||||
if len(url_list) == 1:
|
||||
logging.warning("Only a single URL provided - for efficiency you should scrape multiple URLs at once")
|
||||
|
||||
logging.info(f"Processing {len(url_list)} URLs: {url_list}")
|
||||
|
||||
# Process each URL and collect results
|
||||
results = []
|
||||
for url in url_list:
|
||||
try:
|
||||
# Add protocol if missing
|
||||
if not (url.startswith('http://') or url.startswith('https://')):
|
||||
url = 'https://' + url
|
||||
logging.info(f"Added https:// protocol to URL: {url}")
|
||||
|
||||
# Scrape this URL
|
||||
result = await self._scrape_single_url(url)
|
||||
results.append(result)
|
||||
|
||||
except Exception as e:
|
||||
logging.error(f"Error processing URL {url}: {str(e)}")
|
||||
results.append({
|
||||
"url": url,
|
||||
"success": False,
|
||||
"error": str(e)
|
||||
})
|
||||
|
||||
# Summarize results
|
||||
successful = sum(1 for r in results if r.get("success", False))
|
||||
failed = len(results) - successful
|
||||
|
||||
# Create success/failure message
|
||||
if successful == len(results):
|
||||
message = f"Successfully scraped all {len(results)} URLs. Results saved to:"
|
||||
for r in results:
|
||||
if r.get("file_path"):
|
||||
message += f"\n- {r.get('file_path')}"
|
||||
elif successful > 0:
|
||||
message = f"Scraped {successful} URLs successfully and {failed} failed. Results saved to:"
|
||||
for r in results:
|
||||
if r.get("success", False) and r.get("file_path"):
|
||||
message += f"\n- {r.get('file_path')}"
|
||||
message += "\n\nFailed URLs:"
|
||||
for r in results:
|
||||
if not r.get("success", False):
|
||||
message += f"\n- {r.get('url')}: {r.get('error', 'Unknown error')}"
|
||||
else:
|
||||
logging.warning(f"Invalid URL type: {type(url)}")
|
||||
return self.fail_response("URL must be a string.")
|
||||
|
||||
error_details = "; ".join([f"{r.get('url')}: {r.get('error', 'Unknown error')}" for r in results])
|
||||
return self.fail_response(f"Failed to scrape all {len(results)} URLs. Errors: {error_details}")
|
||||
|
||||
return ToolResult(
|
||||
success=True,
|
||||
output=message
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
error_message = str(e)
|
||||
logging.error(f"Error in scrape_webpage: {error_message}")
|
||||
return self.fail_response(f"Error processing scrape request: {error_message[:200]}")
|
||||
|
||||
async def _scrape_single_url(self, url: str) -> dict:
|
||||
"""
|
||||
Helper function to scrape a single URL and return the result information.
|
||||
"""
|
||||
logging.info(f"Scraping single URL: {url}")
|
||||
|
||||
try:
|
||||
# ---------- Firecrawl scrape endpoint ----------
|
||||
logging.info(f"Sending request to Firecrawl for URL: {url}")
|
||||
async with httpx.AsyncClient() as client:
|
||||
|
@ -328,27 +368,17 @@ class SandboxWebSearchTool(SandboxToolsBase):
|
|||
formatted_result["metadata"] = data["data"]["metadata"]
|
||||
logging.info(f"Added metadata: {data['data']['metadata'].keys()}")
|
||||
|
||||
# Create a safe filename from the URL or use provided result_name
|
||||
# Create a simple filename from the URL domain and date
|
||||
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||
if result_name:
|
||||
safe_filename = f"{timestamp}_{result_name}"
|
||||
else:
|
||||
# Extract domain and path from URL for the filename
|
||||
from urllib.parse import urlparse
|
||||
parsed_url = urlparse(url)
|
||||
domain = parsed_url.netloc.replace("www.", "")
|
||||
path = parsed_url.path.rstrip("/")
|
||||
if path:
|
||||
last_part = path.split("/")[-1]
|
||||
safe_filename = f"{timestamp}_{domain}_{last_part}"
|
||||
else:
|
||||
safe_filename = f"{timestamp}_{domain}"
|
||||
# Clean up filename
|
||||
safe_filename = "".join([c if c.isalnum() else "_" for c in safe_filename])[:60]
|
||||
|
||||
# Ensure .json extension
|
||||
if not safe_filename.endswith('.json'):
|
||||
safe_filename += '.json'
|
||||
# Extract domain from URL for the filename
|
||||
from urllib.parse import urlparse
|
||||
parsed_url = urlparse(url)
|
||||
domain = parsed_url.netloc.replace("www.", "")
|
||||
|
||||
# Clean up domain for filename
|
||||
domain = "".join([c if c.isalnum() else "_" for c in domain])
|
||||
safe_filename = f"{timestamp}_{domain}.json"
|
||||
|
||||
logging.info(f"Generated filename: {safe_filename}")
|
||||
|
||||
|
@ -365,34 +395,24 @@ class SandboxWebSearchTool(SandboxToolsBase):
|
|||
json_content.encode()
|
||||
)
|
||||
|
||||
return ToolResult(
|
||||
success=True,
|
||||
output=f"Successfully saved the scrape of the website under path '{results_file_path}'."
|
||||
)
|
||||
return {
|
||||
"url": url,
|
||||
"success": True,
|
||||
"title": title,
|
||||
"file_path": results_file_path,
|
||||
"content_length": len(markdown_content)
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
error_message = str(e)
|
||||
# Log the full error for debugging
|
||||
logging.error(f"Scraping error for URL '{url}': {error_message}")
|
||||
logging.error(f"Error scraping URL '{url}': {error_message}")
|
||||
|
||||
# Create a more informative error message for the user
|
||||
if "timeout" in error_message.lower():
|
||||
user_message = f"The request timed out while trying to scrape the webpage. The site might be slow or blocking automated access."
|
||||
elif "connection" in error_message.lower():
|
||||
user_message = f"Could not connect to the website. The site might be down or blocking access."
|
||||
elif "404" in error_message:
|
||||
user_message = f"The webpage was not found (404 error). Please check if the URL is correct."
|
||||
elif "403" in error_message:
|
||||
user_message = f"Access to the webpage was forbidden (403 error). The site may be blocking automated access."
|
||||
elif "401" in error_message:
|
||||
user_message = f"Authentication required to access this webpage (401 error)."
|
||||
else:
|
||||
user_message = f"Error scraping webpage: {error_message[:200]}"
|
||||
if len(error_message) > 200:
|
||||
user_message += "..."
|
||||
|
||||
return self.fail_response(user_message)
|
||||
|
||||
# Create an error result
|
||||
return {
|
||||
"url": url,
|
||||
"success": False,
|
||||
"error": error_message
|
||||
}
|
||||
|
||||
if __name__ == "__main__":
|
||||
async def test_web_search():
|
||||
|
|
|
@ -351,21 +351,13 @@ class BrowserAutomation:
|
|||
self.browser = await playwright.chromium.launch(**launch_options)
|
||||
print("Browser launched with minimal options")
|
||||
|
||||
try:
|
||||
await self.get_current_page()
|
||||
print("Found existing page, using it")
|
||||
self.current_page_index = 0
|
||||
except Exception as page_error:
|
||||
print(f"Error finding existing page, creating new one. ( {page_error})")
|
||||
# page = await self.browser.new_page()
|
||||
print("New page created successfully")
|
||||
# self.pages.append(page)
|
||||
self.current_page_index = 0
|
||||
# Navigate to about:blank to ensure page is ready
|
||||
# await page.goto("google.com", timeout=30000)
|
||||
# print("Navigated to google.com")
|
||||
|
||||
print("Browser initialization completed successfully")
|
||||
# Directly create a single page
|
||||
print("Creating a new page...")
|
||||
page = await self.browser.new_page()
|
||||
self.pages.append(page)
|
||||
self.current_page_index = 0
|
||||
print("Browser initialization completed successfully")
|
||||
|
||||
except Exception as e:
|
||||
print(f"Browser startup error: {str(e)}")
|
||||
traceback.print_exc()
|
||||
|
|
Loading…
Reference in New Issue