Merge branch 'evals' of https://github.com/buster-so/buster into evals

This commit is contained in:
Nate Kelley 2025-04-16 13:49:45 -06:00
commit 7b9ca21062
No known key found for this signature in database
GPG Key ID: FD90372AB8D98B4F
6 changed files with 97 additions and 154 deletions

View File

@ -16,9 +16,9 @@ use crate::tools::{
categories::{
file_tools::{
CreateDashboardFilesTool, CreateMetricFilesTool, ModifyDashboardFilesTool,
ModifyMetricFilesTool,
ModifyMetricFilesTool, SearchDataCatalogTool,
},
response_tools::{Done, MessageUserClarifyingQuestion},
response_tools::Done,
},
IntoToolCallExecutor,
};
@ -47,6 +47,7 @@ pub fn get_configuration(agent_data: &ModeAgentData) -> ModeConfiguration {
let create_dashboard_files_tool = CreateDashboardFilesTool::new(agent_clone.clone());
let modify_dashboard_files_tool = ModifyDashboardFilesTool::new(agent_clone.clone());
let done_tool = Done::new(agent_clone.clone());
let search_data_catalog_tool = SearchDataCatalogTool::new(agent_clone.clone());
// --- Define Conditions based on Agent State (as per original load_tools) ---
// Base condition: Plan and context must exist (implicitly true if we are in this mode)
@ -87,6 +88,9 @@ pub fn get_configuration(agent_data: &ModeAgentData) -> ModeConfiguration {
review_needed || all_todos_complete
});
// Condition for search tool (always available)
let always_available = Some(|_state: &HashMap<String, Value>| -> bool { true });
// Add tools to the agent with conditions
agent_clone
.add_tool(
@ -123,6 +127,13 @@ pub fn get_configuration(agent_data: &ModeAgentData) -> ModeConfiguration {
done_condition,
)
.await;
agent_clone
.add_tool(
search_data_catalog_tool.get_name(),
search_data_catalog_tool.into_tool_call_executor(),
always_available,
)
.await;
Ok(())
})
@ -142,7 +153,9 @@ pub fn get_configuration(agent_data: &ModeAgentData) -> ModeConfiguration {
// Keep the prompt constant, but it's no longer pub
const PROMPT: &str = r##"### Role & Task
You are Buster, an expert analytics and data engineer. Your job is to assess what data is available and then provide fast, accurate answers to analytics questions from non-technical users. You do this by analyzing user requests, searching across a data catalog, and building metrics or dashboards.
You are Buster, an expert analytics and data engineer. Your job is to assess what data is available (provided via search results) and then provide fast, accurate answers to analytics questions from non-technical users. You do this by analyzing user requests, using the provided data context, and building metrics or dashboards.
**Crucially, you MUST only reference datasets, tables, columns, and values that have been explicitly provided to you through the results of data catalog searches in the conversation history or current context. Do not assume or invent data structures or content. Base all data operations strictly on the provided context.**
Today's date is {TODAYS_DATE}.
@ -150,14 +163,15 @@ Today's date is {TODAYS_DATE}.
## Workflow Summary
1. **Search the data catalog** to locate relevant data.
2. **Assess the adequacy** of the search results:
3. **Create a plan** using the appropriate create plan tool.
1. **Review the provided data context** from previous search steps.
2. **Assess the adequacy** of the *available* data context for the current request.
3. **Create a plan** using the appropriate create plan tool, based *only* on the available data.
4. **Execute the plan** by creating assets such as metrics or dashboards.
- Execute the plan to the best of your ability.
- If only certain aspects of the plan are possible, proceed to do whatever is possible.
- Execute the plan to the best of your ability using *only* the available data.
- If you encounter errors or realize data is missing *during* execution, use the appropriate search tool to find the necessary data *before* continuing or resorting to the `finish_and_respond` tool.
- If only certain aspects of the plan are possible with the available data (even after searching again), proceed to do whatever is possible.
5. **Send a final response to the user** with the `finish_and_respond` tool.
- If you were not able to accomplish all aspects of the user request, address the things that were not possible in your final response.
- If you were not able to accomplish all aspects of the user request (due to missing data that couldn't be found), address the things that were not possible in your final response.
---
@ -169,8 +183,9 @@ You have access to a set of tools to perform actions and deliver results. Adhere
2. **Follow the tool call schema precisely**, including all required parameters.
3. **Only use provided tools**, as availability may vary dynamically based on the task.
4. **Avoid mentioning tool names** in explanations or outputs (e.g., say "I searched the data catalog" instead of naming the tool).
5. **If the data required is not available**, use the `finish_and_respond` tool to inform the user (do not ask the user to provide you with the required data), signaling the end of your workflow.
6. **Do not ask clarifying questions.** If the user's request is ambiguous, do not ask clarifying questions. Make reasonable assumptions and proceed to accomplish the task.
5. **If the data required is not available** in your current context, first use the search tool to attempt to find it. If the necessary data *still* cannot be found after a reasonable search attempt, *then* use the `finish_and_respond` tool to inform the user, signaling the end of your workflow for that request.
6. **Do not ask clarifying questions.** If the user's request is ambiguous, make reasonable assumptions based on the *available data context* and proceed to accomplish the task.
7. **Strictly Adhere to Available Data**: Reiterate: NEVER reference datasets, tables, columns, or values not present in the data context provided by search tools. Do not hallucinate or invent data.
---
@ -247,140 +262,3 @@ pub const MODEL: Option<&str> = None;
/// Build this mode's system prompt for the given date by filling the
/// `{TODAYS_DATE}` placeholder in the prompt template.
pub fn get_prompt(todays_date: &str) -> String {
    let template = PROMPT;
    template.replace("{TODAYS_DATE}", todays_date)
}
// Load tools relevant to the analysis/execution mode
/// Registers the analysis/execution tool set on `agent`, gating each tool
/// behind a condition evaluated against the agent's state map.
///
/// Availability rules (as encoded by the conditions below):
/// - create metric: requires `"data_context"` and `"plan_available"`
/// - modify metric: additionally requires `"metrics_available"`
/// - create dashboard: requires `"data_context"`, `"plan_available"`,
///   and `"metrics_available"` (dashboards are built from metrics)
/// - modify dashboard: additionally requires `"dashboards_available"`
/// - clarifying question: always available
/// - done: available once `"review_needed"` is true or every entry in
///   `"todos"` has `"completed": true`
pub async fn load_tools(agent: &Arc<Agent>) -> Result<()> {
    use crate::tools::{
        categories::{
            file_tools::{
                // File tools are the core of this mode
                CreateDashboardFilesTool,
                CreateMetricFilesTool,
                ModifyDashboardFilesTool,
                ModifyMetricFilesTool,
            },
            response_tools::{
                // Response tools for completion or issues
                Done,
                MessageUserClarifyingQuestion,
            },
        },
        IntoToolCallExecutor,
    };

    let create_metric_files_tool = CreateMetricFilesTool::new(Arc::clone(agent));
    let modify_metric_files_tool = ModifyMetricFilesTool::new(Arc::clone(agent));
    let create_dashboard_files_tool = CreateDashboardFilesTool::new(Arc::clone(agent));
    let modify_dashboard_files_tool = ModifyDashboardFilesTool::new(Arc::clone(agent));
    let message_user_clarifying_question_tool = MessageUserClarifyingQuestion::new();
    let done_tool = Done::new(Arc::clone(agent));

    // --- Define Conditions based on Agent State ---
    // These conditions depend on the plan and available assets; refinement may be
    // needed based on the actual state keys written elsewhere.
    // NOTE(review): key names ("data_context", "plan_available", etc.) are
    // assumed to match what other modes write into the state map — confirm.

    // Create Metric: Plan exists, context exists
    let create_metric_condition = Some(|state: &HashMap<String, Value>| -> bool {
        state.contains_key("data_context") && state.contains_key("plan_available")
    });

    // Modify Metric: Plan, context, and *some* metrics must exist
    let modify_metric_condition = Some(|state: &HashMap<String, Value>| -> bool {
        state.contains_key("data_context")
            && state.contains_key("plan_available")
            && state.contains_key("metrics_available") // Check if state includes metrics
    });

    // Create Dashboard: Plan, context, and metrics must exist
    let create_dashboard_condition = Some(|state: &HashMap<String, Value>| -> bool {
        state.contains_key("data_context")
            && state.contains_key("plan_available")
            && state.contains_key("metrics_available")
    });

    // Modify Dashboard: Plan, context, and *some* dashboards must exist
    let modify_dashboard_condition = Some(|state: &HashMap<String, Value>| -> bool {
        state.contains_key("data_context")
            && state.contains_key("plan_available")
            && state.contains_key("dashboards_available") // Check if state includes dashboards
    });

    // Done tool: Available when plan execution might be complete or needs review
    // Using the complex condition from follow-up, assuming it's relevant here too
    let done_condition = Some(|state: &HashMap<String, Value>| -> bool {
        let review_needed = state
            .get("review_needed")
            .and_then(Value::as_bool)
            .unwrap_or(false);
        let all_todos_complete = state
            .get("todos") // Assuming plan execution updates 'todos'
            .and_then(Value::as_array)
            .map(|todos| {
                todos.iter().all(|todo| {
                    todo.get("completed")
                        .and_then(Value::as_bool)
                        .unwrap_or(false)
                })
            })
            .unwrap_or(false);
        review_needed || all_todos_complete
    });

    // Clarifying question: Always available
    let always_available = Some(|_state: &HashMap<String, Value>| -> bool { true });

    // --- Add Tools with Conditions ---
    // `get_name()` is called on each tool immediately before it is consumed by
    // `into_tool_call_executor()`; no separate name bindings are needed.
    agent
        .add_tool(
            create_metric_files_tool.get_name(),
            create_metric_files_tool.into_tool_call_executor(),
            create_metric_condition,
        )
        .await;
    agent
        .add_tool(
            modify_metric_files_tool.get_name(),
            modify_metric_files_tool.into_tool_call_executor(),
            modify_metric_condition,
        )
        .await;
    agent
        .add_tool(
            create_dashboard_files_tool.get_name(),
            create_dashboard_files_tool.into_tool_call_executor(),
            create_dashboard_condition,
        )
        .await;
    agent
        .add_tool(
            modify_dashboard_files_tool.get_name(),
            modify_dashboard_files_tool.into_tool_call_executor(),
            modify_dashboard_condition,
        )
        .await;
    agent
        .add_tool(
            message_user_clarifying_question_tool.get_name(),
            message_user_clarifying_question_tool.into_tool_call_executor(),
            always_available,
        )
        .await;
    agent
        .add_tool(
            done_tool.get_name(),
            done_tool.into_tool_call_executor(),
            done_condition,
        )
        .await;
    Ok(())
}

View File

@ -478,6 +478,7 @@ definitions:
- metric
metricColumnId:
type: string
description: The column ID to use for the metric value
metricValueAggregate:
type: string
enum:
@ -488,7 +489,65 @@ definitions:
- min
- count
- first
description: Optional - only used when the user specifically requests it, otherwise leave blank
description: Optional aggregation method for the metric value, defaults to sum if not specified
metricHeader:
oneOf:
- type: string
description: Simple string title for the metric header
- type: object
properties:
columnId:
type: string
description: Which column to use for the header
useValue:
type: boolean
description: Whether to display the key or the value in the chart
aggregate:
type: string
enum:
- sum
- average
- median
- max
- min
- count
- first
description: Optional aggregation method, defaults to sum
required:
- columnId
- useValue
description: Configuration for a derived metric header
metricSubHeader:
oneOf:
- type: string
description: Simple string title for the metric sub-header
- type: object
properties:
columnId:
type: string
description: Which column to use for the sub-header
useValue:
type: boolean
description: Whether to display the key or the value in the chart
aggregate:
type: string
enum:
- sum
- average
- median
- max
- min
- count
- first
description: Optional aggregation method, defaults to sum
required:
- columnId
- useValue
description: Configuration for a derived metric sub-header
metricValueLabel:
oneOf:
- type: string
description: Custom label to display with the metric value
required:
- selectedChartType
- metricColumnId

View File

@ -218,7 +218,7 @@ impl ToolExecutor for CreateMetricFilesTool {
let failures: Vec<String> = failed_files
.iter()
.map(|(name, error)| format!("Failed to create '{}': {}. Please recreate the metric from scratch rather than attempting to modify.", name, error))
.map(|(name, error)| format!("Failed to create '{}': {}.\n\nPlease recreate the metric from scratch rather than attempting to modify. This error could be due to:\n- Using a dataset that doesn't exist (please reevaluate the available datasets in the chat conversation)\n- Invalid configuration in the metric file\n- Special characters in the metric name or SQL query\n- Syntax errors in the SQL query", name, error))
.collect();
if failures.len() == 1 {

View File

@ -418,7 +418,13 @@ impl ToolExecutor for ModifyMetricFilesTool {
batch
.failed_updates
.into_iter()
.map(|(file_name, error)| FailedFileModification { file_name, error }),
.map(|(file_name, error)| {
let error_message = format!("Failed to modify '{}': {}.\n\nPlease attempt to modify the metric again. This error could be due to:\n- Using a dataset that doesn't exist (please reevaluate the available datasets in the chat conversation)\n- Invalid configuration in the metric file\n- Special characters in the metric name or SQL query\n- Syntax errors in the SQL query", file_name, error);
FailedFileModification {
file_name,
error: error_message,
}
}),
);
// Set review_needed flag if execution was successful

View File

@ -363,7 +363,7 @@ async fn rerank_datasets(
query,
documents,
model: ReRankModel::EnglishV3,
top_n: Some(30),
top_n: Some(50),
..Default::default()
};

View File

@ -456,10 +456,10 @@ pub struct MetricChartConfig {
pub metric_value_aggregate: Option<String>,
#[serde(skip_serializing_if = "Option::is_none")]
#[serde(alias = "metric_header")]
pub metric_header: Option<String>,
pub metric_header: Option<serde_json::Value>,
#[serde(skip_serializing_if = "Option::is_none")]
#[serde(alias = "metric_sub_header")]
pub metric_sub_header: Option<String>,
pub metric_sub_header: Option<serde_json::Value>,
#[serde(skip_serializing_if = "Option::is_none")]
#[serde(alias = "metric_value_label")]
pub metric_value_label: Option<String>,