Merge branch 'evals' of https://github.com/buster-so/buster into evals

2025-04-16 09:26:33 -06:00 · 2025-04-16 09:26:33 -06:00 · 4a832d984a
parent 5e2099bc6e 9a8b4ace87
commit 4a832d984a
12 changed files with 155 additions and 34 deletions
--- a/api/libs/agents/src/agents/buster_multi_agent.rs
+++ b/api/libs/agents/src/agents/buster_multi_agent.rs
@ -72,7 +72,7 @@ impl BusterMultiAgent {
        let create_dashboard_files_tool = CreateDashboardFilesTool::new(Arc::clone(&self.agent));
        let modify_dashboard_files_tool = ModifyDashboardFilesTool::new(Arc::clone(&self.agent));
        let message_user_clarifying_question_tool = MessageUserClarifyingQuestion::new();
-        let done_tool = Done::new();
+        let done_tool = Done::new(Arc::clone(&self.agent));
        let no_search_needed_tool = NoSearchNeededTool::new(Arc::clone(&self.agent));
        let review_tool = ReviewPlan::new(Arc::clone(&self.agent));
@ -99,7 +99,11 @@ impl BusterMultiAgent {
        });
        let review_condition =
-            Some(|state: &HashMap<String, Value>| -> bool { state.contains_key("review_needed") });
+            Some(|state: &HashMap<String, Value>| -> bool {
                state.get("review_needed")
                    .and_then(|value| value.as_bool())
                    .unwrap_or(false)
            });
        let planning_tools_condition = Some(|state: &HashMap<String, Value>| -> bool {
            let searched_catalog = state
@ -281,7 +285,11 @@ impl BusterMultiAgent {
        };
        let needs_review_condition =
-            |state: &HashMap<String, Value>| -> bool { state.contains_key("review_needed") };
+            |state: &HashMap<String, Value>| -> bool { 
                state.get("review_needed")
                    .and_then(|value| value.as_bool())
                    .unwrap_or(false)
            };
        // Add prompt rules (order matters)
        // The agent will use the prompt associated with the first condition that evaluates to true.
--- a/api/libs/agents/src/agents/modes/create_plan_prompt.rs
+++ b/api/libs/agents/src/agents/modes/create_plan_prompt.rs
@ -133,6 +133,7 @@ To determine whether to use a Straightforward Plan or an Investigative Plan, con
 - When creating a plan that involves generating assets (visualizations and dashboards), do not include a separate step for delivering these assets, as they are automatically displayed to the user upon creation.
 - Assume that all datasets required for the plan are available, as their availability has already been confirmed in the previous step.
 - If the user's request includes aspects that are not supported (e.g., specific visualizations, forecasts, etc.), do not include these in the step-by-step plan. Instead, mention them in the note section of the plan, and specify that they should be addressed in the final response to the user.
 - The tools used for creating plans include a `todos` argument. This argument is a list of short summary points. **Crucially, each step in the generated plan must correspond to exactly one item in the `todos` list.** These `todos` serve as a concise overview of the plan's execution steps. Do not include any review steps in the `todos` list, as reviews are handled separately.
 **Examples**  
--- a/api/libs/agents/src/agents/modes/review_prompt.rs
+++ b/api/libs/agents/src/agents/modes/review_prompt.rs
@ -20,7 +20,7 @@ Tool Calling
 You have two tools to do your job:
 review_plan: Marks a task as complete. Needs todo_item (an integer) to specify which task (starts at 1).
-done: Sends the final response to the user and ends the workflow.
+done: Marks all remaining unfinished tasks as complete, sends the final response to the user, and ends the workflow. Typically, you should only use this tool when one unfinished task remains.
 Follow these rules:
--- a/api/libs/agents/src/tools/categories/file_tools/create_dashboards.rs
+++ b/api/libs/agents/src/tools/categories/file_tools/create_dashboards.rs
@ -305,9 +305,20 @@ impl ToolExecutor for CreateDashboardFilesTool {
        let duration = start_time.elapsed().as_millis() as i64;
        self.agent
-            .set_state_value(String::from("metrics_available"), Value::Bool(true))
+            .set_state_value(String::from("dashboards_available"), Value::Bool(true))
            .await;
        self.agent
            .set_state_value(String::from("files_available"), Value::Bool(true)) 
            .await;
        // Set review_needed flag if execution was successful
        if failed_files.is_empty() {
            self.agent
                .set_state_value(String::from("review_needed"), Value::Bool(true))
                .await;
        }
        Ok(CreateDashboardFilesOutput {
            message,
            duration,
--- a/api/libs/agents/src/tools/categories/file_tools/create_metrics.rs
+++ b/api/libs/agents/src/tools/categories/file_tools/create_metrics.rs
@ -243,6 +243,13 @@ impl ToolExecutor for CreateMetricFilesTool {
            .set_state_value(String::from("files_available"), Value::Bool(true))
            .await;
        // Set review_needed flag if execution was successful
        if failed_files.is_empty() {
            self.agent
                .set_state_value(String::from("review_needed"), Value::Bool(true))
                .await;
        }
        Ok(CreateMetricFilesOutput {
            message,
            duration,
--- a/api/libs/agents/src/tools/categories/file_tools/modify_dashboards.rs
+++ b/api/libs/agents/src/tools/categories/file_tools/modify_dashboards.rs
@ -359,6 +359,13 @@ impl ToolExecutor for ModifyDashboardFilesTool {
                .map(|(file_name, error)| FailedFileModification { file_name, error }),
        );
        // Set review_needed flag if execution was successful
        if output.failed_files.is_empty() {
            self.agent
                .set_state_value(String::from("review_needed"), Value::Bool(true))
                .await;
        }
        Ok(output)
    }
--- a/api/libs/agents/src/tools/categories/file_tools/modify_metrics.rs
+++ b/api/libs/agents/src/tools/categories/file_tools/modify_metrics.rs
@ -421,6 +421,13 @@ impl ToolExecutor for ModifyMetricFilesTool {
                .map(|(file_name, error)| FailedFileModification { file_name, error }),
        );
        // Set review_needed flag if execution was successful
        if output.failed_files.is_empty() {
            self.agent
                .set_state_value(String::from("review_needed"), Value::Bool(true))
                .await;
        }
        Ok(output)
    }
--- a/api/libs/agents/src/tools/categories/planning_tools/create_plan_investigative.rs
+++ b/api/libs/agents/src/tools/categories/planning_tools/create_plan_investigative.rs
@ -81,7 +81,7 @@ impl ToolExecutor for CreatePlanInvestigative {
                    },
                    "todos": {
                        "type": "array",
-                        "description": "Ordered todo points summarizing the plan. There should be one todo for each step in the plan, in order. For example, if the plan has two steps, plan_todos should have two items, each summarizing a step. Do not include review or response steps—these will be handled by a separate agent.",
+                        "description": "Ordered todo points summarizing the steps of the plan. There should be max one todo for each step in the plan, in order. For example, if the plan has two steps, plan_todos should have two items, each summarizing a step. Do not include review or response steps—these will be handled by a separate agent.",
                        "items": { "type": "string" },
                    },
                },
--- a/api/libs/agents/src/tools/categories/planning_tools/create_plan_straightforward.rs
+++ b/api/libs/agents/src/tools/categories/planning_tools/create_plan_straightforward.rs
@ -58,9 +58,17 @@ impl ToolExecutor for CreatePlanStraightforward {
            .set_state_value(String::from("todos"), Value::Array(todos_state_objects))
            .await;
-        let todos_string = params.todos.iter().map(|item| format!("[ ] {}", item)).collect::<Vec<_>>().join("\n");
+        let todos_string = params
            .todos
            .iter()
            .map(|item| format!("[ ] {}", item))
            .collect::<Vec<_>>()
            .join("\n");
-        Ok(CreatePlanStraightforwardOutput { success: true, todos: todos_string })
+        Ok(CreatePlanStraightforwardOutput {
            success: true,
            todos: todos_string,
        })
    }
    async fn get_schema(&self) -> Value {
@ -81,8 +89,7 @@ impl ToolExecutor for CreatePlanStraightforward {
                    },
                    "todos": {
                        "type": "array",
-                        "description": "Ordered todo points summarizing the plan. There should be one todo for each step in the plan, in order. For example, if the plan has two steps, plan_todos should have two items, each summarizing a step. Do not include review or response steps—these will be handled by a separate agent.",
+                        "description": "Ordered todo points summarizing the steps of the plan. There should be max one todo for each step in the plan, in order. For example, if the plan has two steps, plan_todos should have two items, each summarizing a step. Do not include review or response steps—these will be handled by a separate agent.",                        "items": { "type": "string" },
                        "items": { "type": "string" },
                    },
                },
                "additionalProperties": false
--- a/api/libs/agents/src/tools/categories/planning_tools/review_plan.rs
+++ b/api/libs/agents/src/tools/categories/planning_tools/review_plan.rs
@ -14,7 +14,7 @@ pub struct ReviewPlanOutput {
 #[derive(Debug, Deserialize)]
 pub struct ReviewPlanInput {
-    pub todo_item: usize, // 0-based index
+    pub todo_items: Vec<usize>, // 1-based index
 }
 pub struct ReviewPlan {
@ -45,16 +45,32 @@ impl ToolExecutor for ReviewPlan {
            }
        };
-        let idx = params.todo_item;
+        let total_todos = todos.len();
        if idx >= todos.len() {
            return Err(anyhow::anyhow!("todo_item index {} out of range ({} todos)", idx, todos.len()));
        }
-        // Mark the todo at the given index as complete
+        for idx_one_based in &params.todo_items {
-        if let Some(Value::Object(map)) = todos.get_mut(idx) {
+            // Convert 1-based index to 0-based index
-            map.insert("completed".to_string(), Value::Bool(true));
+            if *idx_one_based == 0 {
-        } else {
+                return Err(anyhow::anyhow!("todo_item index cannot be 0, indexing starts from 1."));
-            return Err(anyhow::anyhow!("Todo item at index {} is not a valid object.", idx));
+            }
            let idx_zero_based = *idx_one_based - 1;
            if idx_zero_based >= total_todos {
                return Err(anyhow::anyhow!(
                    "todo_item index {} out of range ({} todos, 1-based)",
                    idx_one_based,
                    total_todos
                ));
            }
            // Mark the todo at the given index as complete
            if let Some(Value::Object(map)) = todos.get_mut(idx_zero_based) {
                map.insert("completed".to_string(), Value::Bool(true));
            } else {
                return Err(anyhow::anyhow!(
                    "Todo item at index {} (1-based) is not a valid object.",
                    idx_one_based
                ));
            }
        }
        // Save the updated todos back to state
@ -74,23 +90,31 @@ impl ToolExecutor for ReviewPlan {
            .collect::<Vec<_>>()
            .join("\n");
        // Set review_needed to false after review
        self.agent
            .set_state_value(String::from("review_needed"), Value::Bool(false))
            .await;
        Ok(ReviewPlanOutput { success: true, todos: todos_string })
    }
    async fn get_schema(&self) -> Value {
        serde_json::json!({
            "name": self.get_name(),
-            "description": "Marks a task as complete by its index in the to-do list.",
+            "description": "Marks one or more tasks as complete by their 1-based indices in the to-do list.",
            "parameters": {
                "type": "object",
                "properties": {
-                    "todo_item": {
+                    "todo_items": {
-                        "type": "integer",
+                        "type": "array",
-                        "description": "The 0-based index of the task to mark as complete (0 is the first item).",
+                        "items": {
-                        "minimum": 0
+                            "type": "integer",
                            "minimum": 1
                        },
                        "description": "A list of 1-based indices of the tasks to mark as complete (1 is the first item)."
                    }
                },
-                "required": ["todo_item"]
+                "required": ["todo_items"]
            }
        })
    }
--- a/api/libs/agents/src/tools/categories/response_tools/done.rs
+++ b/api/libs/agents/src/tools/categories/response_tools/done.rs
@ -2,8 +2,9 @@ use anyhow::Result;
 use async_trait::async_trait;
 use serde::{Deserialize, Serialize};
 use serde_json::Value;
 use std::sync::Arc;
-use crate::tools::ToolExecutor;
+use crate::{agent::Agent, tools::ToolExecutor};
 #[derive(Debug, Deserialize, Serialize)]
 pub struct DoneInput {
@ -14,13 +15,16 @@ pub struct DoneInput {
 #[derive(Debug, Serialize, Deserialize)]
 pub struct DoneOutput {
    pub success: bool,
    pub todos: String,
 }
-pub struct Done;
+pub struct Done {
    agent: Arc<Agent>,
 }
 impl Done {
-    pub fn new() -> Self {
+    pub fn new(agent: Arc<Agent>) -> Self {
-        Self
+        Self { agent }
    }
 }
@ -34,15 +38,60 @@ impl ToolExecutor for Done {
    }
    async fn execute(&self, params: Self::Params, _tool_call_id: String) -> Result<Self::Output> {
        // Get the current todos from state
        let mut todos = match self.agent.get_state_value("todos").await {
            Some(Value::Array(arr)) => arr,
            _ => {
                // If no todos exist, just return success without a list
                return Ok(DoneOutput { success: true, todos: "No to-do list found.".to_string() });
            }
        };
        let mut marked_by_done = vec![]; // Track items marked by this tool
        // Mark all remaining unfinished todos as complete
        for (idx, todo_val) in todos.iter_mut().enumerate() {
            if let Value::Object(map) = todo_val {
                let is_completed = map.get("completed").and_then(Value::as_bool).unwrap_or(false);
                if !is_completed {
                    map.insert("completed".to_string(), Value::Bool(true));
                    marked_by_done.push(idx); // Track 0-based index
                }
            } else {
                 // Handle invalid item format if necessary, maybe log a warning?
                 eprintln!("Warning: Invalid todo item format at index {}", idx);
            }
        }
        // Save the updated todos back to state
        self.agent.set_state_value("todos".to_string(), Value::Array(todos.clone())).await; // Clone needed for iteration below
        // Format the output string, potentially noting items marked by 'done'
        let todos_string = todos.iter().enumerate()
            .map(|(idx, todo_val)| {
                if let Value::Object(map) = todo_val {
                    let completed = map.get("completed").and_then(Value::as_bool).unwrap_or(false); // Should always be true now
                    let todo_text = map.get("todo").and_then(Value::as_str).unwrap_or("Invalid todo text");
                    let annotation = if marked_by_done.contains(&idx) { " *Marked complete by calling the done tool" } else { "" };
                    format!("[x] {}{}", todo_text, annotation)
                } else {
                    "Invalid todo item format".to_string()
                }
            })
            .collect::<Vec<_>>()
            .join("\n");
        // This tool signals the end of the workflow and provides the final response.
        // The actual agent termination logic resides elsewhere.
-        Ok(DoneOutput { success: true })
+        Ok(DoneOutput { success: true, todos: todos_string }) // Include todos in output
    }
    async fn get_schema(&self) -> Value {
        serde_json::json!({
            "name": self.get_name(),
-            "description": "Use when you have completed your workflow and are ready to send a final response to the user.",
+            "description": "Marks all remaining unfinished tasks as complete, sends a final response to the user, and ends the workflow. Use this when the workflow is finished.",
            "parameters": {
                "type": "object",
                "required": [
--- a/api/libs/handlers/src/chats/post_chat_handler.rs
+++ b/api/libs/handlers/src/chats/post_chat_handler.rs
@ -1503,7 +1503,7 @@ fn transform_tool_message(
        "done" | "message_notify_user" | "message_user_clarifying_question" => vec![],
        // Add specific handling for no_search_needed to return nothing
-        "no_search_needed" => vec![],
+        "no_search_needed" | "review_plan" => vec![],
        // Existing tool result processing - pass duration
        "search_data_catalog" => tool_data_catalog_search(id.clone(), content, delta_duration)?,
@ -2157,7 +2157,7 @@ fn transform_assistant_tool_message(
                    // parser.clear_related_chunks(tool_id.clone()); // Example hypothetical parser method
                }
            }
-            "no_search_needed" => {
+            "no_search_needed" | "review_plan" => {
                 // Clear tracker since this tool doesn't use chunking for its reasoning output
                tracker.clear_chunk(tool_id.clone());
            }