mirror of https://github.com/kortix-ai/suna.git
tool view wip
This commit is contained in:
parent
4729999bb8
commit
d5c59b1acb
|
@ -5,6 +5,7 @@ from datetime import datetime
|
|||
import os
|
||||
from dotenv import load_dotenv
|
||||
from agentpress.tool import Tool, ToolResult, openapi_schema, xml_schema
|
||||
import json
|
||||
|
||||
# TODO: add subpages, etc. to the filters, as it's sometimes necessary
|
||||
|
||||
|
@ -92,30 +93,13 @@ class WebSearchTool(Tool):
|
|||
) -> ToolResult:
|
||||
"""
|
||||
Search the web using the Exa API to find relevant and up-to-date information.
|
||||
|
||||
This function performs a web search based on the provided query and returns a list
|
||||
of relevant search results. Each result includes metadata about the webpage, such as
|
||||
title, URL, summary (if requested), publication date, and relevance score.
|
||||
|
||||
The returned data for each result includes:
|
||||
- Title: The title of the webpage
|
||||
- URL: The URL of the webpage
|
||||
- Summary: A brief summary of the webpage content (if summary=True)
|
||||
- Published Date: When the content was published (if available)
|
||||
- Score: The relevance score of the result
|
||||
|
||||
Parameters:
|
||||
- query: The search query to find relevant web pages
|
||||
- summary: Whether to include a summary of the results (default: True)
|
||||
- num_results: The number of results to return (default: 20)
|
||||
"""
|
||||
try:
|
||||
# Ensure we have a valid query
|
||||
if not query or not isinstance(query, str):
|
||||
return self.fail_response("A valid search query is required.")
|
||||
|
||||
# ---------- Tavily search parameters ----------
|
||||
# num_results normalisation (1‑50)
|
||||
# Normalize num_results
|
||||
if num_results is None:
|
||||
num_results = 20
|
||||
elif isinstance(num_results, int):
|
||||
|
@ -136,30 +120,36 @@ class WebSearchTool(Tool):
|
|||
include_images=False,
|
||||
)
|
||||
|
||||
# `tavily` may return a dict with `results` or a bare list
|
||||
# Normalize the response format
|
||||
raw_results = (
|
||||
search_response.get("results")
|
||||
if isinstance(search_response, dict)
|
||||
else search_response
|
||||
)
|
||||
|
||||
# Format results consistently
|
||||
formatted_results = []
|
||||
for result in raw_results:
|
||||
formatted_result = {
|
||||
"Title": result.get("title"),
|
||||
"URL": result.get("url"),
|
||||
"title": result.get("title", ""),
|
||||
"url": result.get("url", ""),
|
||||
}
|
||||
|
||||
if summary:
|
||||
# Prefer full content; fall back to description
|
||||
if result.get("content"):
|
||||
formatted_result["Summary"] = result["content"]
|
||||
elif result.get("description"):
|
||||
formatted_result["Summary"] = result["description"]
|
||||
formatted_result["snippet"] = (
|
||||
result.get("content") or
|
||||
result.get("description") or
|
||||
""
|
||||
)
|
||||
|
||||
formatted_results.append(formatted_result)
|
||||
|
||||
return self.success_response(formatted_results)
|
||||
# Return a properly formatted ToolResult
|
||||
return ToolResult(
|
||||
success=True,
|
||||
output=json.dumps(formatted_results, ensure_ascii=False)
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
error_message = str(e)
|
||||
|
@ -257,11 +247,8 @@ class WebSearchTool(Tool):
|
|||
)
|
||||
response.raise_for_status()
|
||||
data = response.json()
|
||||
print(f"--- Raw Tavily Response ---")
|
||||
print(data)
|
||||
print(f"--------------------------")
|
||||
|
||||
# Normalise Tavily extract output to a list of dicts
|
||||
# Normalize Tavily extract output to a list of dicts
|
||||
extracted = []
|
||||
if isinstance(data, list):
|
||||
extracted = data
|
||||
|
@ -273,18 +260,25 @@ class WebSearchTool(Tool):
|
|||
else:
|
||||
extracted = [data]
|
||||
|
||||
# Format results consistently
|
||||
formatted_results = []
|
||||
for item in extracted:
|
||||
formatted_result = {
|
||||
"Title": item.get("title"),
|
||||
"URL": item.get("url") or url,
|
||||
"Text":item.get("raw_content") or item.get("content") or item.get("text")
|
||||
"title": item.get("title", ""),
|
||||
"url": item.get("url", url),
|
||||
"content": item.get("raw_content") or item.get("content") or item.get("text", "")
|
||||
}
|
||||
|
||||
if item.get("published_date"):
|
||||
formatted_result["Published Date"] = item["published_date"]
|
||||
formatted_result["published_date"] = item["published_date"]
|
||||
|
||||
formatted_results.append(formatted_result)
|
||||
|
||||
return self.success_response(formatted_results)
|
||||
# Return a properly formatted ToolResult
|
||||
return ToolResult(
|
||||
success=True,
|
||||
output=json.dumps(formatted_results, ensure_ascii=False)
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
error_message = str(e)
|
||||
|
|
|
@ -111,14 +111,14 @@ def create_sandbox(password: str):
|
|||
"CHROME_DEBUGGING_HOST": "localhost",
|
||||
"CHROME_CDP": ""
|
||||
},
|
||||
# ports=[
|
||||
# 7788, # Gradio default port
|
||||
# 6080, # noVNC web interface
|
||||
ports=[
|
||||
# 7788, # Gradio default port
|
||||
6080, # noVNC web interface
|
||||
# 5900, # VNC port
|
||||
# 5901, # VNC port
|
||||
# 9222, # Chrome remote debugging port
|
||||
# 8080 # HTTP website port
|
||||
# ]
|
||||
8080 # HTTP website port
|
||||
]
|
||||
))
|
||||
logger.info(f"Sandbox created with ID: {sandbox.id}")
|
||||
|
||||
|
|
|
@ -193,7 +193,7 @@ export function NavAgents() {
|
|||
) : null}
|
||||
</div>
|
||||
|
||||
<SidebarMenu className="overflow-y-auto max-h-[calc(100vh-200px)]">
|
||||
<SidebarMenu className="overflow-y-auto max-h-[calc(100vh-200px)] [&::-webkit-scrollbar]:hidden [-ms-overflow-style:'none'] [scrollbar-width:'none']">
|
||||
{state === "collapsed" && (
|
||||
<SidebarMenuItem>
|
||||
<Tooltip>
|
||||
|
|
|
@ -77,7 +77,7 @@ export function SidebarLeft({
|
|||
}, [state, setOpen]);
|
||||
|
||||
return (
|
||||
<Sidebar collapsible="icon" className="border-r-0 bg-background/95 backdrop-blur-sm" {...props}>
|
||||
<Sidebar collapsible="icon" className="border-r-0 bg-background/95 backdrop-blur-sm [&::-webkit-scrollbar]:hidden [-ms-overflow-style:'none'] [scrollbar-width:'none']" {...props}>
|
||||
<SidebarHeader className="px-2 py-2">
|
||||
<div className="flex h-[40px] items-center px-1 relative">
|
||||
<Link href="/dashboard">
|
||||
|
@ -100,7 +100,7 @@ export function SidebarLeft({
|
|||
)}
|
||||
</div>
|
||||
</SidebarHeader>
|
||||
<SidebarContent>
|
||||
<SidebarContent className="[&::-webkit-scrollbar]:hidden [-ms-overflow-style:'none'] [scrollbar-width:'none']">
|
||||
<NavAgents />
|
||||
</SidebarContent>
|
||||
{state !== "collapsed" && (
|
||||
|
|
|
@ -312,7 +312,7 @@ export function ToolCallSidePanel({
|
|||
};
|
||||
|
||||
return (
|
||||
<div className="fixed inset-y-0 right-0 w-[90%] sm:w-[450px] md:w-[500px] lg:w-[550px] xl:w-[650px] border-l flex flex-col z-10 h-screen">
|
||||
<div className={`fixed inset-y-0 right-0 w-[90%] sm:w-[450px] md:w-[500px] lg:w-[550px] xl:w-[650px] border-l flex flex-col z-10 h-screen transition-all duration-200 ease-in-out ${!isOpen ? 'translate-x-full' : ''}`}>
|
||||
<div className="flex-1 flex flex-col overflow-hidden">
|
||||
{renderContent()}
|
||||
</div>
|
||||
|
|
|
@ -3,6 +3,7 @@ import { FileCode, FileSymlink, FolderPlus, FileX, Replace, CheckCircle, AlertTr
|
|||
import { ToolViewProps } from "./types";
|
||||
import { extractFilePath, extractFileContent, getFileType, formatTimestamp, getToolTitle } from "./utils";
|
||||
import { GenericToolView } from "./GenericToolView";
|
||||
import { Markdown } from "@/components/ui/markdown";
|
||||
import { cn } from "@/lib/utils";
|
||||
|
||||
// Type for operation type
|
||||
|
@ -70,7 +71,7 @@ export function FileOperationToolView({
|
|||
: undefined;
|
||||
|
||||
// Add state for view mode toggle (code or preview) - moved before any conditional returns
|
||||
const [viewMode, setViewMode] = useState<'code' | 'preview'>(isHtml ? 'preview' : 'code');
|
||||
const [viewMode, setViewMode] = useState<'code' | 'preview'>(isHtml || isMarkdown ? 'preview' : 'code');
|
||||
|
||||
// Fall back to generic view if file path is missing or if content is missing for non-delete operations
|
||||
if ((!filePath && !showDebugInfo) || (operation !== "delete" && !fileContent)) {
|
||||
|
@ -152,14 +153,43 @@ export function FileOperationToolView({
|
|||
</button>
|
||||
</div>
|
||||
)}
|
||||
{/* View switcher for Markdown files */}
|
||||
{isMarkdown && isSuccess && (
|
||||
<div className="flex rounded-md overflow-hidden border border-zinc-200 dark:border-zinc-700">
|
||||
<button
|
||||
onClick={() => setViewMode('code')}
|
||||
className={cn(
|
||||
"flex items-center gap-1 text-xs px-2 py-1 transition-colors",
|
||||
viewMode === 'code'
|
||||
? "bg-zinc-800 text-zinc-100 dark:bg-zinc-700 dark:text-zinc-100"
|
||||
: "bg-zinc-200 text-zinc-700 dark:bg-zinc-800 dark:text-zinc-400 hover:bg-zinc-300 dark:hover:bg-zinc-700"
|
||||
)}
|
||||
>
|
||||
<Code className="h-3 w-3" />
|
||||
<span>Code</span>
|
||||
</button>
|
||||
<button
|
||||
onClick={() => setViewMode('preview')}
|
||||
className={cn(
|
||||
"flex items-center gap-1 text-xs px-2 py-1 transition-colors",
|
||||
viewMode === 'preview'
|
||||
? "bg-zinc-800 text-zinc-100 dark:bg-zinc-700 dark:text-zinc-100"
|
||||
: "bg-zinc-200 text-zinc-700 dark:bg-zinc-800 dark:text-zinc-400 hover:bg-zinc-300 dark:hover:bg-zinc-700"
|
||||
)}
|
||||
>
|
||||
<Eye className="h-3 w-3" />
|
||||
<span>Preview</span>
|
||||
</button>
|
||||
</div>
|
||||
)}
|
||||
<span className="text-xs text-zinc-500 dark:text-zinc-400 bg-zinc-200 dark:bg-zinc-800 px-2 py-0.5 rounded">
|
||||
{fileType}
|
||||
</span>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{/* File Content */}
|
||||
{(!isHtml || viewMode === 'code' || !htmlPreviewUrl || !isSuccess) && (
|
||||
{/* File Content (Code View) */}
|
||||
{viewMode === 'code' || (!isHtml && !isMarkdown) || !isSuccess ? (
|
||||
<div className="flex-1 overflow-auto bg-white dark:bg-zinc-950 text-zinc-900 dark:text-zinc-100">
|
||||
<div className="min-w-full table">
|
||||
{contentLines.map((line, idx) => (
|
||||
|
@ -175,7 +205,7 @@ export function FileOperationToolView({
|
|||
<div className="table-row h-4"></div>
|
||||
</div>
|
||||
</div>
|
||||
)}
|
||||
) : null}
|
||||
|
||||
{/* HTML Preview with iframe */}
|
||||
{isHtml && viewMode === 'preview' && htmlPreviewUrl && isSuccess && (
|
||||
|
@ -190,6 +220,15 @@ export function FileOperationToolView({
|
|||
</div>
|
||||
)}
|
||||
|
||||
{/* Markdown Preview */}
|
||||
{isMarkdown && viewMode === 'preview' && isSuccess && (
|
||||
<div className="flex-1 overflow-auto bg-white dark:bg-zinc-950 text-zinc-900 dark:text-zinc-100 p-4">
|
||||
<Markdown className="text-sm prose prose-sm dark:prose-invert max-w-none">
|
||||
{fileContent}
|
||||
</Markdown>
|
||||
</div>
|
||||
)}
|
||||
|
||||
{/* External link button for HTML files */}
|
||||
{isHtml && viewMode === 'preview' && htmlPreviewUrl && isSuccess && (
|
||||
<div className="bg-zinc-100 dark:bg-zinc-900 p-2 border-t border-zinc-200 dark:border-zinc-800 flex justify-end">
|
||||
|
|
|
@ -13,6 +13,15 @@ export function WebSearchToolView({
|
|||
isSuccess = true,
|
||||
isStreaming = false
|
||||
}: ToolViewProps) {
|
||||
console.log({
|
||||
name,
|
||||
assistantContent,
|
||||
toolContent,
|
||||
assistantTimestamp,
|
||||
toolTimestamp,
|
||||
isSuccess,
|
||||
isStreaming
|
||||
});
|
||||
const query = extractSearchQuery(assistantContent);
|
||||
const searchResults = extractSearchResults(toolContent);
|
||||
const toolTitle = getToolTitle(name);
|
||||
|
@ -20,47 +29,47 @@ export function WebSearchToolView({
|
|||
return (
|
||||
<div className="flex flex-col h-full">
|
||||
<div className="flex-1 p-4 overflow-auto">
|
||||
<div className="border border-zinc-200 dark:border-zinc-800 rounded-md overflow-hidden h-full flex flex-col">
|
||||
<div className="flex items-center p-2 bg-zinc-100 dark:bg-zinc-900 justify-between border-b border-zinc-200 dark:border-zinc-800">
|
||||
<div className="h-full flex flex-col">
|
||||
<div className="flex items-center p-2 justify-between mb-3">
|
||||
<div className="flex items-center">
|
||||
<Search className="h-4 w-4 mr-2 text-zinc-600 dark:text-zinc-400" />
|
||||
<span className="text-xs font-medium text-zinc-700 dark:text-zinc-300">Search Results</span>
|
||||
<Search className="h-4 w-4 mr-2 text-zinc-500 dark:text-zinc-400" />
|
||||
<span className="text-xs font-medium text-zinc-600 dark:text-zinc-300">Search Results</span>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div className="px-4 py-3 border-b border-zinc-200 dark:border-zinc-800 bg-zinc-50 dark:bg-zinc-900">
|
||||
<div className="flex items-center">
|
||||
<div className="text-xs font-medium mr-2 text-zinc-600 dark:text-zinc-400">Query:</div>
|
||||
<div className="text-xs py-1 px-2 rounded-md flex-1 bg-zinc-100 dark:bg-zinc-800 text-zinc-800 dark:text-zinc-300">{query || 'Unknown query'}</div>
|
||||
<div className="px-2 mb-4">
|
||||
<div className="flex items-center bg-zinc-100 dark:bg-zinc-800/50 rounded p-2">
|
||||
<div className="text-xs font-medium mr-2 text-zinc-600 dark:text-zinc-400 shrink-0">Query:</div>
|
||||
<div className="text-xs flex-1 text-zinc-800 dark:text-zinc-300 truncate">{query || 'Unknown query'}</div>
|
||||
</div>
|
||||
<div className="mt-1.5 text-xs text-zinc-500 dark:text-zinc-400">
|
||||
<div className="mt-2 text-xs text-zinc-500 dark:text-zinc-400">
|
||||
{isStreaming ? 'Searching...' : searchResults.length > 0 ? `Found ${searchResults.length} results` : 'No results found'}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div className="flex-1 overflow-auto bg-white dark:bg-zinc-950">
|
||||
<div className="flex-1 overflow-auto bg-white dark:bg-zinc-950 rounded-md border border-zinc-200 dark:border-zinc-800">
|
||||
{isStreaming ? (
|
||||
<div className="p-6 text-center flex-1 flex flex-col items-center justify-center">
|
||||
<CircleDashed className="h-8 w-8 mx-auto mb-2 text-blue-500 animate-spin" />
|
||||
<div className="p-6 text-center flex-1 flex flex-col items-center justify-center h-full">
|
||||
<CircleDashed className="h-6 w-6 mx-auto mb-2 text-blue-500 animate-spin" />
|
||||
<p className="text-sm font-medium text-zinc-700 dark:text-zinc-300">Searching the web...</p>
|
||||
<p className="text-xs mt-1 text-zinc-500 dark:text-zinc-400">This might take a moment</p>
|
||||
</div>
|
||||
) : searchResults.length > 0 ? (
|
||||
<div className="divide-y divide-zinc-100 dark:divide-zinc-800">
|
||||
{searchResults.map((result, idx) => (
|
||||
<div key={idx} className="p-4 space-y-1.5 hover:bg-zinc-50 dark:hover:bg-zinc-900/50 transition-colors">
|
||||
<div key={idx} className="p-3 space-y-1">
|
||||
<div className="flex flex-col">
|
||||
<div className="text-xs text-emerald-600 dark:text-emerald-400 truncate">
|
||||
<div className="text-xs text-zinc-500 dark:text-zinc-400 truncate mb-0.5">
|
||||
{cleanUrl(result.url)}
|
||||
</div>
|
||||
<a
|
||||
href={result.url}
|
||||
target="_blank"
|
||||
rel="noopener noreferrer"
|
||||
className="text-sm text-blue-600 dark:text-blue-500 hover:underline font-medium flex items-center gap-1"
|
||||
className="text-sm text-blue-600 dark:text-blue-400 hover:underline font-medium flex items-center gap-1"
|
||||
>
|
||||
{result.title}
|
||||
<ExternalLink className="h-3 w-3 opacity-70" />
|
||||
<ExternalLink className="h-3 w-3 opacity-60" />
|
||||
</a>
|
||||
</div>
|
||||
{result.snippet && (
|
||||
|
@ -72,36 +81,35 @@ export function WebSearchToolView({
|
|||
))}
|
||||
</div>
|
||||
) : (
|
||||
<div className="p-6 text-center text-zinc-500 flex-1 flex flex-col items-center justify-center">
|
||||
<Search className="h-6 w-6 mx-auto mb-2 opacity-50" />
|
||||
<p className="text-sm font-medium text-zinc-700 dark:text-zinc-300">No results found</p>
|
||||
<p className="text-xs mt-1 text-zinc-500 dark:text-zinc-400">Try a different search query</p>
|
||||
<div className="p-6 text-center text-zinc-500 flex-1 flex flex-col items-center justify-center h-full">
|
||||
<Search className="h-6 w-6 mx-auto mb-2 opacity-40" />
|
||||
<p className="text-sm font-medium text-zinc-600 dark:text-zinc-300">No results found</p>
|
||||
<p className="text-xs mt-1 text-zinc-500 dark:text-zinc-400">Try refining your search query</p>
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{/* Footer */}
|
||||
<div className="p-4 border-t border-zinc-200 dark:border-zinc-800">
|
||||
<div className="p-3 border-t border-zinc-200 dark:border-zinc-800 bg-zinc-50 dark:bg-zinc-900/50">
|
||||
<div className="flex items-center justify-between text-xs text-zinc-500 dark:text-zinc-400">
|
||||
{!isStreaming && (
|
||||
<div className="flex items-center gap-2">
|
||||
<div className="flex items-center gap-1.5">
|
||||
{isSuccess ? (
|
||||
<CheckCircle className="h-3.5 w-3.5 text-emerald-500" />
|
||||
) : (
|
||||
<AlertTriangle className="h-3.5 w-3.5 text-red-500" />
|
||||
)}
|
||||
<span>
|
||||
{isSuccess ? 'Search completed successfully' : 'Search failed'}
|
||||
{isSuccess ? 'Search completed' : 'Search failed'}
|
||||
</span>
|
||||
</div>
|
||||
)}
|
||||
|
||||
{isStreaming && (
|
||||
<div className="flex items-center gap-2">
|
||||
<div className="flex items-center gap-1.5">
|
||||
<CircleDashed className="h-3.5 w-3.5 text-blue-500 animate-spin" />
|
||||
<span>Searching the web...</span>
|
||||
<span>Searching...</span>
|
||||
</div>
|
||||
)}
|
||||
|
||||
|
|
|
@ -246,8 +246,59 @@ export function extractBrowserOperation(toolName: string | undefined): string {
|
|||
// Helper to extract search query
|
||||
/**
 * Extract the web-search query from an assistant message.
 *
 * Lookup order:
 *  1. If `content` parses as JSON, structured fields on the parsed object:
 *     `query`, then `arguments.query`, then the `arguments` of each entry in
 *     `tool_calls` (`arguments` may be an object or a JSON-encoded string).
 *  2. A `query="..."` attribute on a `<web-search ...>` tag.
 *  3. Any `query="..."` / `query='...'` attribute (least specific fallback).
 *
 * Steps 2 and 3 run against the parsed object's `content` string when it has
 * one, otherwise against the raw input.
 *
 * @param content Raw assistant message: plain text, an XML-style tool call, or JSON.
 * @returns The query string, or null when no non-empty query can be found.
 */
export function extractSearchQuery(content: string | undefined): string | null {
  if (!content) return null;

  // Text the attribute regexes run against; replaced by the nested `content`
  // string when the message turns out to be a JSON envelope.
  let contentToSearch = content;

  let parsed: unknown = null;
  try {
    parsed = JSON.parse(content);
  } catch {
    // Not JSON: fall through to the attribute regexes on the raw string.
  }

  if (typeof parsed === 'object' && parsed !== null) {
    const outer = parsed as {
      content?: unknown;
      query?: unknown;
      arguments?: unknown;
      tool_calls?: unknown;
    };

    // Structured fields are checked whether or not a `content` string exists;
    // a payload such as {"arguments": {"query": "..."}} has no `content` at all.
    if (typeof outer.query === 'string') {
      return outer.query;
    }

    const fromArguments = readQueryArgument(outer.arguments);
    if (fromArguments !== null) {
      return fromArguments;
    }

    if (Array.isArray(outer.tool_calls)) {
      // Scan every call: the search call is not necessarily the first one.
      for (const call of outer.tool_calls) {
        if (typeof call !== 'object' || call === null) continue;
        const fromCall = readQueryArgument((call as { arguments?: unknown }).arguments);
        if (fromCall !== null) {
          return fromCall;
        }
      }
    }

    if (typeof outer.content === 'string') {
      contentToSearch = outer.content;
    }
  }

  // The closing quote must be the same character as the opening one (\1), so
  // an apostrophe inside a double-quoted query no longer truncates the value.
  const xmlQueryMatch = contentToSearch.match(/<web-search[^>]*query=(["'])([\s\S]*?)\1[^>]*>/i);
  if (xmlQueryMatch && xmlQueryMatch[2]) {
    const xmlQuery = xmlQueryMatch[2].trim();
    if (xmlQuery) return xmlQuery;
  }

  // Less specific fallback: a bare query attribute anywhere in the text.
  const simpleAttrMatch = contentToSearch.match(/query=(["'])([\s\S]*?)\1/i);
  if (simpleAttrMatch && simpleAttrMatch[2]) {
    const attrQuery = simpleAttrMatch[2].trim();
    if (attrQuery) return attrQuery;
  }

  return null;
}

/**
 * Read a string `query` field from a tool-call `arguments` value, which may be
 * an object or a JSON-encoded string. Returns null when no such field exists.
 */
function readQueryArgument(args: unknown): string | null {
  let candidate = args;
  if (typeof candidate === 'string') {
    try {
      candidate = JSON.parse(candidate);
    } catch {
      return null;
    }
  }
  if (typeof candidate === 'object' && candidate !== null) {
    const query = (candidate as { query?: unknown }).query;
    if (typeof query === 'string') {
      return query;
    }
  }
  return null;
}
|
||||
|
||||
// Helper to extract search results from tool response
|
||||
|
@ -308,24 +359,54 @@ export function extractSearchResults(content: string | undefined): Array<{ title
|
|||
/**
 * Scan free-form text for http(s) URLs and pair each one with a nearby title.
 *
 * Each matched URL is cleaned (trailing encoded `<`, markup exposed by
 * decoding, trailing ellipsis and sentence punctuation are removed) and
 * de-duplicated. The title is taken, in order of preference, from a
 * `<content>...</content>` element near the URL, a `Title: ...` label or a
 * quoted string directly preceding a URL, and finally `cleanUrl(url)`.
 *
 * @param content Text to scan.
 * @returns One `{ title, url }` entry per distinct URL, in order of first appearance.
 */
export function extractUrlsAndTitles(content: string): Array<{ title: string, url: string, snippet?: string }> {
  const results: Array<{ title: string, url: string, snippet?: string }> = [];
  const seenUrls = new Set<string>();

  // Stop at whitespace, a double quote, or the start of a tag.
  const urlRegex = /https?:\/\/[^\s"<]+/g;
  let match: RegExpExecArray | null;

  while ((match = urlRegex.exec(content)) !== null) {
    const rawUrl = match[0];

    // Drop a dangling encoded "<" left over from a following tag.
    let url = rawUrl.replace(/%3C$/i, '');

    // decodeURI (unlike decodeURIComponent) leaves reserved escapes such as
    // %2F, %3F, %26 and %23 intact, so the decoded link still points at the
    // same resource.
    try {
      url = decodeURI(url);
    } catch (e) {
      // Malformed escape sequence: keep the undecoded URL.
      console.warn("Failed to decode URL component:", url, e);
    }

    url = url
      .replace(/<[\s\S]*$/, '')      // markup that only became visible after decoding
      .replace(/\u2026$/, '')        // trailing ellipsis (…)
      .replace(/[',.;:!?]+$/, '');   // trailing sentence punctuation / stray quote

    // Skip URLs reduced to a bare scheme, and ones already collected.
    if (/^https?:\/\/$/i.test(url) || seenUrls.has(url)) {
      continue;
    }
    seenUrls.add(url);

    // Window around the *matched* text; the cleaned URL may differ in length.
    const urlIndex = match.index;
    const surroundingText = content.substring(
      Math.max(0, urlIndex - 100),
      urlIndex + rawUrl.length + 150
    );

    const contentMatch = surroundingText.match(/<content>([^<]+)<\/content>/i);
    const titleMatch = surroundingText.match(/Title[:\s]+([^\n<]+)/i) ||
      surroundingText.match(/\"(.*?)\"[\s\n]*?https?:\/\//);

    let title = cleanUrl(url); // Default to the cleaned URL
    if (contentMatch && contentMatch[1].trim()) {
      title = contentMatch[1].trim();
    } else if (titleMatch && titleMatch[1].trim()) {
      title = titleMatch[1].trim();
    }

    results.push({
      title: title,
      url: url
      // Snippet extraction could be added here if needed
    });
  }

  return results;
}
|
||||
|
|
Loading…
Reference in New Issue