sql dialect guidance and passing

2025-05-05 17:35:36 -06:00 · 2025-05-05 17:35:36 -06:00 · 8fd0ce820d
parent 8b76581454
commit 8fd0ce820d
7 changed files with 128 additions and 40 deletions
--- a/api/libs/agents/src/agents/buster_multi_agent.rs
+++ b/api/libs/agents/src/agents/buster_multi_agent.rs
@ -57,15 +57,31 @@ impl ModeProvider for BusterModeProvider {
    ) -> Result<ModeConfiguration> {
        let current_mode = determine_agent_state(state);

+        // Extract syntax (it might be None if not set yet, which is fine)
+        let data_source_syntax = state
+            .get("data_source_syntax")
+            .and_then(|v| v.as_str())
+            .map(|s| s.to_string());
+
        // Call the appropriate get_configuration function based on the mode
+        // Pass the extracted syntax (or None) to all modes
        let mode_config = match current_mode {
-            AgentState::Initializing => modes::initialization::get_configuration(&self.agent_data),
-            AgentState::DataCatalogSearch => {
-                modes::data_catalog_search::get_configuration(&self.agent_data)
+            AgentState::Initializing => {
+                modes::initialization::get_configuration(&self.agent_data, data_source_syntax)
+            }
+            AgentState::DataCatalogSearch => {
+                modes::data_catalog_search::get_configuration(&self.agent_data, data_source_syntax)
+            }
+            AgentState::Planning => {
+                modes::planning::get_configuration(&self.agent_data, data_source_syntax)
+            }
+            AgentState::AnalysisExecution => {
+                // Syntax may still be None here; analysis mode then defaults to PostgreSQL guidance
+                modes::analysis::get_configuration(&self.agent_data, data_source_syntax)
+            }
+            AgentState::Review => {
+                modes::review::get_configuration(&self.agent_data, data_source_syntax)
            }
-            AgentState::Planning => modes::planning::get_configuration(&self.agent_data),
-            AgentState::AnalysisExecution => modes::analysis::get_configuration(&self.agent_data),
-            AgentState::Review => modes::review::get_configuration(&self.agent_data),
        };

        Ok(mode_config)
@ -120,7 +136,8 @@ impl BusterMultiAgent {
            .into_iter()
            .filter_map(|ds| ds.yml_content) // Get Some(String), filter out None
            .map(|content| serde_yaml::from_str::<YamlRoot>(&content)) // Parse String -> Result<YamlRoot, Error>
-            .filter_map(|result| { // Handle Result
+            .filter_map(|result| {
+                // Handle Result
                match result {
                    Ok(parsed_root) => {
                        // Extract info from the first model if available
@ -130,7 +147,7 @@ impl BusterMultiAgent {
                            tracing::warn!("Parsed YAML has no models");
                            None
                        }
-                    },
+                    }
                    Err(e) => {
                        tracing::warn!("Failed to parse dataset YAML: {}", e);
                        None // Filter out errors
--- a/api/libs/agents/src/agents/modes/analysis.rs
+++ b/api/libs/agents/src/agents/modes/analysis.rs
@ -24,12 +24,27 @@ use crate::tools::{
 };

 // Function to get the configuration for the AnalysisExecution mode
-pub fn get_configuration(agent_data: &ModeAgentData) -> ModeConfiguration {
-    // 1. Get the prompt, formatted with current data
-    let prompt = PROMPT.replace("{TODAYS_DATE}", &agent_data.todays_date);
-    // Note: This prompt doesn't use {DATASETS}
+pub fn get_configuration(agent_data: &ModeAgentData, data_source_syntax: Option<String>) -> ModeConfiguration {
+    // Determine SQL dialect guidance based on syntax
+    let syntax = data_source_syntax.as_deref().unwrap_or("postgres"); // Default to postgres
+    let sql_dialect_guidance = match syntax {
+        "snowflake" => SNOWFLAKE_DIALECT_GUIDANCE.to_string(),
+        "bigquery" => BIGQUERY_DIALECT_GUIDANCE.to_string(),
+        "redshift" => REDSHIFT_DIALECT_GUIDANCE.to_string(),
+        "mysql" | "mariadb" => MYSQL_MARIADB_DIALECT_GUIDANCE.to_string(),
+        "sqlserver" => SQLSERVER_DIALECT_GUIDANCE.to_string(),
+        "databricks" => DATABRICKS_DIALECT_GUIDANCE.to_string(),
+        "supabase" => POSTGRES_DIALECT_GUIDANCE.to_string(), // Supabase uses Postgres
+        "postgres" => POSTGRES_DIALECT_GUIDANCE.to_string(), // Explicit postgres case
+        _ => POSTGRES_DIALECT_GUIDANCE.to_string(), // Default to Postgres for any others
+    };

-    // 2. Define the model for this mode (Using default based on original MODEL = None)
+    // 1. Get the prompt, formatted with current data and SQL guidance
+    let prompt = PROMPT
+        .replace("{TODAYS_DATE}", &agent_data.todays_date)
+        .replace("{SQL_DIALECT_GUIDANCE}", &sql_dialect_guidance);
+
+    // 2. Define the model for this mode
    let model = "gemini-2.5-pro-exp-03-25".to_string();

    // 3. Define the tool loader closure
@ -139,7 +154,7 @@ pub fn get_configuration(agent_data: &ModeAgentData) -> ModeConfiguration {
        })
    });

-    // 4. Define terminating tools for this mode (From original load_tools)
+    // 4. Define terminating tools for this mode
    let terminating_tools = vec![Done::get_name()];

    // 5. Construct and return the ModeConfiguration
@ -151,7 +166,71 @@ pub fn get_configuration(agent_data: &ModeAgentData) -> ModeConfiguration {
    }
 }

-// Keep the prompt constant, but it's no longer pub
+// Per-dialect SQL guidance snippets substituted into the prompt's {SQL_DIALECT_GUIDANCE} placeholder
+const POSTGRES_DIALECT_GUIDANCE: &str = r##"
+- **Date/Time Functions (PostgreSQL/Supabase)**:
+  - **`DATE_TRUNC`**: Prefer `DATE_TRUNC('day', column)`, `DATE_TRUNC('week', column)`, `DATE_TRUNC('month', column)`, etc., for grouping time series data. Note that `'week'` starts on Monday.
+  - **`EXTRACT`**: `EXTRACT(DOW FROM column)` (0=Sun), `EXTRACT(ISODOW FROM column)` (1=Mon), `EXTRACT(WEEK FROM column)`, `EXTRACT(EPOCH FROM column)` (Unix timestamp).
+  - **Intervals**: Use `INTERVAL '1 day'`, `INTERVAL '1 month'`, etc.
+  - **Current Date/Time**: `CURRENT_DATE`, `CURRENT_TIMESTAMP`, `NOW()`.
+"##;
+
+const SNOWFLAKE_DIALECT_GUIDANCE: &str = r##"
+- **Date/Time Functions (Snowflake)**:
+  - **`DATE_TRUNC`**: Similar usage: `DATE_TRUNC('DAY', column)`, `DATE_TRUNC('WEEK', column)`, `DATE_TRUNC('MONTH', column)`. Week start depends on the `WEEK_START` parameter (default `0`, which behaves as a Monday start).
+  - **`EXTRACT`**: `EXTRACT(dayofweek FROM column)` (0=Sun), `EXTRACT(dayofweekiso FROM column)` (1=Mon), `EXTRACT(weekiso FROM column)`. Use `DATE_PART` for more options (e.g., `DATE_PART('epoch_second', column)`).
+  - **DateAdd/DateDiff**: Use `DATEADD(day, 1, column)`, `DATEDIFF(day, start_date, end_date)`.
+  - **Intervals**: Use `INTERVAL '1 DAY'`, `INTERVAL '1 MONTH'`.
+  - **Current Date/Time**: `CURRENT_DATE()`, `CURRENT_TIMESTAMP()`, `SYSDATE()`.
+"##;
+
+const BIGQUERY_DIALECT_GUIDANCE: &str = r##"
+- **Date/Time Functions (BigQuery)**:
+  - **`DATE_TRUNC`**: `DATE_TRUNC(column, DAY)`, `DATE_TRUNC(column, WEEK)`, `DATE_TRUNC(column, MONTH)`, etc. Week starts Sunday by default, use `WEEK(MONDAY)` for Monday start.
+  - **`EXTRACT`**: `EXTRACT(DAYOFWEEK FROM column)` (1=Sun, 7=Sat), `EXTRACT(ISOWEEK FROM column)`.
+  - **DateAdd/DateDiff**: Use `DATE_ADD(column, INTERVAL 1 DAY)`, `DATE_SUB(column, INTERVAL 1 MONTH)`, `DATE_DIFF(end_date, start_date, DAY)`.
+  - **Intervals**: Use `INTERVAL 1 DAY`, `INTERVAL 1 MONTH`.
+  - **Current Date/Time**: `CURRENT_DATE()`, `CURRENT_TIMESTAMP()`, `CURRENT_DATETIME()`.
+"##;
+
+// Add constants for other dialects
+const REDSHIFT_DIALECT_GUIDANCE: &str = r##"
+- **Date/Time Functions (Redshift)**:
+  - **`DATE_TRUNC`**: Similar to PostgreSQL: `DATE_TRUNC('day', column)`, `DATE_TRUNC('week', column)`, `DATE_TRUNC('month', column)`. Week starts Monday.
+  - **`EXTRACT`**: `EXTRACT(DOW FROM column)` (0=Sun), `EXTRACT(EPOCH FROM column)`. Also supports `DATE_PART` (e.g., `DATE_PART(w, column)` for week).
+  - **DateAdd/DateDiff**: Use `DATEADD(day, 1, column)`, `DATEDIFF(day, start_date, end_date)`.
+  - **Intervals**: Use `INTERVAL '1 day'`, `INTERVAL '1 month'`.
+  - **Current Date/Time**: `GETDATE()`, `CURRENT_DATE`, `SYSDATE`.
+"##;
+
+const MYSQL_MARIADB_DIALECT_GUIDANCE: &str = r##"
+- **Date/Time Functions (MySQL/MariaDB)**:
+  - **`DATE_FORMAT`**: Use `DATE_FORMAT(column, '%Y-%m-01')` for month truncation. For week truncation (Monday start), use `DATE_SUB(DATE(column), INTERVAL WEEKDAY(column) DAY)` (`WEEKDAY` returns 0 for Monday).
+  - **`EXTRACT`**: `EXTRACT(WEEK FROM column)`, `EXTRACT(MONTH FROM column)`. `DAYOFWEEK` is not an `EXTRACT` unit; use `DAYOFWEEK(column)` (1=Sun, 7=Sat). `UNIX_TIMESTAMP(column)` for epoch seconds.
+  - **DateAdd/DateDiff**: Use `DATE_ADD(column, INTERVAL 1 DAY)`, `DATE_SUB(column, INTERVAL 1 MONTH)`, `DATEDIFF(end_date, start_date)`.
+  - **Intervals**: Use `INTERVAL 1 DAY`, `INTERVAL 1 MONTH`.
+  - **Current Date/Time**: `CURDATE()`, `NOW()`, `CURRENT_TIMESTAMP`.
+"##;
+
+const SQLSERVER_DIALECT_GUIDANCE: &str = r##"
+- **Date/Time Functions (SQL Server)**:
+  - **`DATETRUNC`**: Available in SQL Server 2022+: `DATETRUNC(day, column)`, `DATETRUNC(week, column)`, `DATETRUNC(month, column)` (datepart is an unquoted keyword). Week start depends on `DATEFIRST` setting.
+  - **`DATEPART`**: `DATEPART(weekday, column)`, `DATEPART(iso_week, column)`. There is no `epoch` datepart; for epoch seconds use `DATEDIFF_BIG(second, '1970-01-01', column)`.
+  - **DateAdd/DateDiff**: Use `DATEADD(day, 1, column)`, `DATEDIFF(day, start_date, end_date)`.
+  - **Intervals**: Generally handled by `DATEADD`/`DATEDIFF`.
+  - **Current Date/Time**: `GETDATE()`, `SYSDATETIME()`, `CURRENT_TIMESTAMP`.
+"##;
+
+const DATABRICKS_DIALECT_GUIDANCE: &str = r##"
+- **Date/Time Functions (Databricks SQL)**:
+  - **`DATE_TRUNC`**: `DATE_TRUNC('DAY', column)`, `DATE_TRUNC('WEEK', column)`, `DATE_TRUNC('MONTH', column)`. Week starts Monday.
+  - **`EXTRACT`**: `EXTRACT(DAYOFWEEK FROM column)` (1=Sun, 7=Sat), `EXTRACT(WEEK FROM column)`. `unix_timestamp(column)` for epoch seconds.
+  - **DateAdd/DateDiff**: Use `date_add(column, 1)`, `date_sub(column, 30)`, `datediff(end_date, start_date)`.
+  - **Intervals**: Use `INTERVAL 1 DAY`, `INTERVAL 1 MONTH`.
+  - **Current Date/Time**: `current_date()`, `current_timestamp()`.
+"##;
+
+// Prompt template; {TODAYS_DATE} and {SQL_DIALECT_GUIDANCE} are filled in by get_configuration
 const PROMPT: &str = r##"### Role & Task
 You are Buster, an expert analytics and data engineer. Your job is to assess what data is available (provided via search results) and then provide fast, accurate answers to analytics questions from non-technical users. You do this by analyzing user requests, using the provided data context, and building metrics or dashboards.

@ -239,7 +318,10 @@ To conclude your worklow, you use the `finish_and_respond` tool to send a final
 ---

 ## SQL Best Practices and Constraints** (when creating new metrics)
- USE POSTGRESQL SYNTAX
+
+**Current SQL Dialect Guidance:**
+{SQL_DIALECT_GUIDANCE}
+
 - **Keep Queries Simple**: Strive for simplicity and clarity in your SQL. Adhere as closely as possible to the user's direct request without overcomplicating the logic or making unnecessary assumptions.
 - **Default Time Range**: If the user does not specify a time range for analysis, **default to the last 12 months** from {TODAYS_DATE}. Clearly state this assumption if making it.
 - **Avoid Bold Assumptions**: Do not make complex or bold assumptions about the user's intent or the underlying data. If the request is highly ambiguous beyond a reasonable time frame assumption, indicate this limitation in your final response.
@ -285,8 +367,3 @@ You MUST plan extensively before each function call, and reflect extensively on

 // No specific model override for analysis/execution mode
 pub const MODEL: Option<&str> = None;
-
-// Function to get the formatted prompt for this mode
-pub fn get_prompt(todays_date: &str) -> String {
-    PROMPT.replace("{TODAYS_DATE}", todays_date)
-}
--- a/api/libs/agents/src/agents/modes/data_catalog_search.rs
+++ b/api/libs/agents/src/agents/modes/data_catalog_search.rs
@ -21,7 +21,7 @@ use crate::tools::{
 };

 // Function to get the configuration for the DataCatalogSearch mode
-pub fn get_configuration(agent_data: &ModeAgentData) -> ModeConfiguration {
+pub fn get_configuration(agent_data: &ModeAgentData, _data_source_syntax: Option<String>) -> ModeConfiguration {
    // 1. Get the prompt, formatted with current data
    let prompt = DATA_CATALOG_SEARCH_PROMPT
        .replace("{DATASETS}", &agent_data.dataset_with_descriptions.join("\n\n")) // Deref Arc and Vec to get slice for join
--- a/api/libs/agents/src/agents/modes/initialization.rs
+++ b/api/libs/agents/src/agents/modes/initialization.rs
@ -18,7 +18,7 @@ use crate::tools::{
 };

 // Function to get the configuration for the Initialization mode
-pub fn get_configuration(agent_data: &ModeAgentData) -> ModeConfiguration {
+pub fn get_configuration(agent_data: &ModeAgentData, _data_source_syntax: Option<String>) -> ModeConfiguration {
    // 1. Get the prompt, formatted with current data
    let prompt = INTIALIZATION_PROMPT
        .replace("{DATASETS}", &agent_data.dataset_with_descriptions.join("\n\n"))
--- a/api/libs/agents/src/agents/modes/planning.rs
+++ b/api/libs/agents/src/agents/modes/planning.rs
@ -21,7 +21,7 @@ use crate::tools::{
 };

 // Function to get the configuration for the Planning mode
-pub fn get_configuration(agent_data: &ModeAgentData) -> ModeConfiguration {
+pub fn get_configuration(agent_data: &ModeAgentData, _data_source_syntax: Option<String>) -> ModeConfiguration {
    // 1. Get the prompt, formatted with current data
    let prompt = PLANNING_PROMPT
        .replace("{TODAYS_DATE}", &agent_data.todays_date)
--- a/api/libs/agents/src/agents/modes/review.rs
+++ b/api/libs/agents/src/agents/modes/review.rs
@ -19,12 +19,12 @@ use crate::tools::{
 };

 // Function to get the configuration for the Review mode
-pub fn get_configuration(_agent_data: &ModeAgentData) -> ModeConfiguration {
+pub fn get_configuration(_agent_data: &ModeAgentData, _data_source_syntax: Option<String>) -> ModeConfiguration {
    // 1. Get the prompt (doesn't need formatting for this mode)
-    let prompt = REVIEW_PROMPT.to_string();
+    let prompt = REVIEW_PROMPT.to_string(); // Use the correct constant

    // 2. Define the model for this mode (From original MODEL const)
-    let model = "gemini-2.0-flash-001".to_string();
+    let model = "gemini-2.5-pro-exp-03-25".to_string();

    // 3. Define the tool loader closure
    let tool_loader: Box<
--- a/api/libs/agents/src/tools/categories/file_tools/common.rs
+++ b/api/libs/agents/src/tools/categories/file_tools/common.rs
@ -223,25 +223,19 @@ properties:
      RULE: Follow general quoting rules. CANNOT contain ':'.

  # SQL QUERY
-  ### SQL Best Practices and Constraints** (when creating new metrics)  
-  #  - **Constraints**: Only join tables with explicit entity relationships.  
-  #  - **SQL Requirements**:  
-  #    - Use schema-qualified table names (`<DATABASE_NAME>.<SCHEMA_NAME>.<TABLE_NAME>`).  
-  #    - Use fully qualified column names with table aliases (e.g., `<table_alias>.<column>`).
-  #    - Select specific columns (avoid `SELECT *` or `COUNT(*)`).  
-  #    - Use CTEs instead of subqueries, and use snake_case for naming them.  
-  #    - Use `DISTINCT` (not `DISTINCT ON`) with matching `GROUP BY`/`SORT BY` clauses.  
-  #    - Show entity names rather than just IDs.  
-  #    - Handle date conversions appropriately.  
-  #    - Order dates in ascending order.
-  #    - Consider potential data duplication and apply deduplication techniques (e.g., `DISTINCT`, `GROUP BY`) where necessary.
+  # Describes how the SQL should be formatted within the YAML
  sql:
    required: true
    type: string
    description: |
-      SQL query using YAML pipe syntax (|)
+      SQL query using YAML pipe syntax (|).
      The SQL query should be formatted with proper indentation using the YAML pipe (|) syntax.
      This ensures the multi-line SQL is properly parsed while preserving whitespace and newlines.
+      Example:
+        sql: |
+          SELECT column1, column2
+          FROM my_table
+          WHERE condition;

  # CHART CONFIGURATION
  chartConfig: