mirror of https://github.com/buster-so/buster.git
commit 79688add0b
@@ -62,4 +62,6 @@ Cargo.lock
 node_modules/
+prds/
+docs/
 .cargo/
@@ -2,21 +2,15 @@ use serde_json::{json, Value};
 
 pub fn dataset_selector_system_prompt(datasets: &String) -> String {
     format!(
-        r#"### DATASET/MODEL INFORMATION
-{}
+        r#"You're responsible for picking out the most relevant datasets to aid in answering the user's requests with SQL.
 
 ### TASK
 Your task is to pick all the datasets required to answer the user question/request. You can and should combine multiple datasets through joins when needed to provide complete answers. Try to remain consistent with previous dataset selections.
 
 If you can't join datasets together with explicit entity relationships defined, explain why.
 
-### GENERAL INSTRUCTIONS
-Here are some general instructions:
-- Your task is to identify all datasets that could be useful when combined to answer the user's request
-- Feel free to select multiple datasets that can be joined together to provide more complete answers
-- If the user requests advanced analysis like predictions, forecasts, correlation, impact analysis, etc., identify all datasets that could be combined for the analysis
-- Consider relationships between datasets and how they can be joined to provide comprehensive answers
-- Multiple dataset can be selected even while one completely answers the user request.
-"#,
+
+### DATASET/MODEL INFORMATION
+{}"#,
         datasets
     )
 }
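A minimal usage sketch for the reworked selector prompt: a caller serializes the candidate models into the `datasets` string, then pairs the system prompt with the user's question. The `build_selector_messages` helper and the (role, content) tuple layout are hypothetical, not part of this commit.

```rust
// Hypothetical caller for dataset_selector_system_prompt (defined above):
// build the system prompt, then assemble (role, content) message pairs
// for whatever chat-completion client sits downstream.
fn build_selector_messages(datasets: &String, user_request: &str) -> Vec<(String, String)> {
    let system = dataset_selector_system_prompt(datasets);
    vec![
        ("system".to_string(), system),
        ("user".to_string(), user_request.to_string()),
    ]
}
```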
@@ -6,61 +6,66 @@ pub fn sql_gen_system_prompt(
     data_source_type: &String,
 ) -> String {
     format!(
-        r#"### MODEL/VIEW INFORMATION
+        r#"# OBJECTIVE
+Generate a **single** {} query based on the provided analysis plan.
+
+# CONSTRAINTS
+- Output only the SQL query wrapped in ```sql tags
+- Do not include explanations or commentary
+- Do not suggest using other platforms or tools
+- Only join tables with explicit entity relationships
+- Stay within the provided dataset
+
+# SQL REQUIREMENTS
+- Use schema-qualified table names (<SCHEMA_NAME>.<TABLE_NAME>)
+- Select specific columns (no SELECT * or COUNT(*))
+- Use CTEs instead of subqueries with snake_case names
+- Use DISTINCT (not DISTINCT ON) with matching GROUP BY/SORT BY
+- Show entity names, not just IDs
+- Handle date conversions appropriately
+- Order dates ascending
+- Include date fields for time series
+- Reference database identifiers for cross-database queries
+- Format output for the specified visualization type
+- Maintain consistent data structure across requests unless changes required
+- Use explicit ordering for custom buckets/categories
+
+# TIME AND NAMING CONVENTIONS
+- Default to last 1 year if no timeframe specified
+- Maintain user-specified time ranges until changed
+- Include units in column names for time values
+- Concatenate first/last names by default
+- Use numerical weekday format (1-7)
+- Only use specific dates when explicitly requested
+
+# CONTEXT
+## Dataset Information
 {}
 
-### MODEL/VIEW REASONING
 {}
 
-### RELEVANT BUSINESS TERMS/DOMAIN SPECIFIC LANGUAGE
+## Domain Terms
 {}
 
-### RELEVANT VALUES FROM DATASET
+## Dataset Values
 {}
 
-### TASK
-Your task is to generate a **single** {} query based on the thoughts that are provided to you.
-
-Format the SQL for the visualization/report that is specified.
-
-Do not respond to the user telling them to use predictive modeling tooling in another platform. This is a SQL generation tool.
-
-Do not respond with an explanation of the SQL you are generating. Generate just the SQL.
-
-Do not join datasets together that don't have explicit entity relationships defined.
-
-Please output the SQL delimited in ```sql tags.
-
-### GENERAL SQL REQUIREMENTS
-- Never use placeholder values or comments suggesting value replacement (e.g. `WHERE id = <ID>` or `-- Replace with actual value`)
-- specific days, months, years etc shouldnt be included in your queries unless the user explicitly specifies dates to filter for. don't mention that you're assuming anything please and just reference the filter the same way the user asked for it.
-- Use CTEs instead of subqueries and name them with snake_case.
-- Do not use `DISTINCT ON` only DISTINCT and remember to use distinct columns in `GROUP BY` and `SORT BY` clauses.
-- When displaying entities with names, show the name, not just the id.
-- When performing operations on dates, remember to convert to the appropriate types.
-- Always order dates in ascending order.
-- When working with time series data, always return a date field.
-- You must use the schema when referencing tables. Like this pattern <SCHEMA_NAME>.<TABLE_NAME>
-- Pay attention to the database identifier. It may be used to reference across multiple databases.
-- Never use the 'SELECT *' or 'SELECT COUNT(*)' command. You must select the columns you want to see/use.
-- Users may mention formatting or charting. Although this task is specific to SQL generation, the user is referring to future steps for visualization.
-- A request for a line chart should default to using a date-related field unless the user specifies otherwise or it is not available.
-- Try to keep the data format (columns selected, aggregation, ordering, etc.) consistent from request to request unless the user request absolutely requires you to change it.
-- If data is missing from the datasets, explain that to the user and try to deliver the best results you can.
-- Never ask the user for datasets, columns, etc.
-- If returning time units like day, hours, seconds, etc. please make sure column names say so.
-- Concatenate first and last names when possible, unless the user specifies otherwise.
-- If the user does not specify a time frame, default to the last 1 year.
-- If the user specifies a time range during the conversation, maintain that time frame perpetually until specified otherwise
-- If returning weekdays, please return them in numerical format (e.g. 1 for Monday, 2 for Tuesday, etc.) In your explanation, don't mention that you're returning the day of the week in numerical format.
-- If you make custom buckets/categories, make sure to explicitly order them."#,
-        datasets_string, explanation, terms, relevant_values, data_source_type
+## Data Source
+{}"#,
+        data_source_type, datasets_string, explanation, terms, relevant_values, data_source_type
     )
 }
 
-pub fn sql_gen_user_prompt(request: String, thought_process: String) -> String {
+pub fn sql_gen_user_prompt(request: String, analysis_plan: String) -> String {
     format!(
-        "## USER REQUEST\n{}\n\n## THOUGHT PROCESS\n{}",
-        request, thought_process
+        r#"# TASK
+Generate SQL based on this request and analysis plan.
+
+# USER REQUEST
+{}
+
+# ANALYSIS PLAN
+{}"#,
+        request, analysis_plan
    )
 }
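Because the new system prompt requires the model to return only a query wrapped in ```sql tags, a downstream extraction step is implied. Here is a sketch of one way to pull the statement out of the response; the `extract_sql` helper and its fence handling are assumptions about surrounding code, not part of this commit.

```rust
// Hypothetical helper: extract the statement from the ```sql fenced
// block that the system prompt above asks the model to emit.
fn extract_sql(response: &str) -> Option<String> {
    let start = response.find("```sql")? + "```sql".len();
    let rest = &response[start..];
    let end = rest.find("```")?;
    let sql = rest[..end].trim();
    (!sql.is_empty()).then(|| sql.to_string())
}
```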
@@ -6,65 +6,80 @@ pub fn sql_gen_thought_system_prompt(
     data_source_type: &String,
 ) -> String {
     format!(
-        r#"### MODEL/VIEW INFORMATION
+        r#"# OBJECTIVE
+Your goal is to generate a plan for a SQL query that best answers the user's request. Your response should be a clear, structured plan that:
+
+1. Determines the most appropriate visualization type from:
+   - Single value metrics
+   - Time series: line, multi-line, dual axis
+   - Comparisons: bar, grouped bar, stacked bar, pie, donut
+   - Relationships: scatter, combo chart
+   - Detailed data: table/report
+
+2. Produces accurate results by:
+   - Using only explicitly defined entity relationships
+   - Working with available data (no assumptions about other tables)
+   - Handling data quality issues (missing values, formatting)
+   - Considering column descriptions and business context
+
+# OUTPUT FORMAT
+Provide your response as a numbered list:
+<step_number>. **<decision_point>**: <explanation>
+
+End with:
+**Final Decision**: <summary of approach>
+
+# CONSTRAINTS
+- Only join tables with explicit entity relationships
+- Stay within the provided dataset
+- Prioritize data quality and accuracy
+- Follow user-specified visualization requirements if given
+
+**You will not be writing a sql query, but rather a plan for a sql query.**
+
+# CONTEXT
+## Dataset Information
 {}
 
-### MODEL/VIEW REASONING
+## Business Context
 {}
 
-### RELEVANT BUSINESS TERMS/DOMAIN SPECIFIC LANGUAGE
+## Domain Terms
 {}
 
-### RELEVANT VALUES FROM DATASET
+## Dataset Values
 {}
 
-### DATA SOURCE TYPE
-{}
-
-### TASK
-Your task is to build a thought process based on the context above. Specifically thinking through the dataset, its columns and descriptions, relevant terms (if any), and relevant values (if any)
-
-First, you should think about and decide on a visualization type that would be the best for for answering the request. Your options are: line, bar, pie, donut, scatter, multi line, dual axis line, combo chart, grouped bar, stacked bar, metric (for a single aggregated value), and ultimately if no visualization is suitable, you could use a table/report. If the user specifies a chart, you should think about how to fit the data on the specified one.
-
-The thought process should be an ordered list that you should think through in order to best answer the user request with SQL. You will not generate a full SQL statement, but instead generate the steps of the framework to best think through the request and provide the best answer to the user.
-
-Your last thought should be a final decision that aggregates up the entire thought process.
-
-This can be any number of thoughts that you deem necessary to fully think through the solution to the users request.
-
-You will be doing this from user request to request. Do not repeat yourself.
-
-You should only join datasets that have explicit entity relationships defined.
-
-### GENERAL INSTRUCTIONS
-- Do not repeat the same thought from message to message.
-- NEVER ASSUME ANOTHER TABLE HAS THE DATA. Try to best answer the user request based on the dataset you have access to.
-- You should be decisive in your thoughts.
-- Think through data hygiene and data quality. Such as missing values, formatting, etc.
-- Consider the column descriptions in your selection.
-- A table/report visualization is best for when multiple non-plottable columns are returned from the query.
-- If the user asks for a chart, think through the best way to display the data in that chart.
-
-### OUTPUT STYLE
-Always output each step as <Number>. **<Thought Title>**:<Thought Content>
-
-<Number> is the step number.
-<Thought Title> is the title of the thought.
-<Thought Content> is the content of the thought.
-...
-
-<Number>. **Final Decision**: is the final decision of the thought process.
-
-#"#,
+## Data Source
+{}"#,
         dataset, explanation, terms, relevant_values, data_source_type
     )
 }
 
 pub fn sql_gen_thought_user_prompt(request: String, sql: Option<String>) -> String {
     let prompt = if let Some(sql) = sql {
-        format!("## USER REQUEST\n{}\n\n## GENERATED SQL\n{}", request, sql)
+        format!(
+            r#"# TASK
+Analyze this request and propose a SQL solution.
+
+# USER REQUEST
+{}
+
+# CURRENT IMPLEMENTATION
+{}
+
+Review the current SQL implementation and suggest improvements if needed."#,
+            request, sql
+        )
     } else {
-        format!("## USER REQUEST\n{}", request)
+        format!(
+            r#"# TASK
+Analyze this request and propose a SQL solution.
+
+# USER REQUEST
+{}"#,
+            request
+        )
     };
 
     prompt
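The new OUTPUT FORMAT section pins the plan to `<step_number>. **<decision_point>**: <explanation>` lines ending in a **Final Decision** step, which makes the response mechanically parseable. A sketch of such a parser follows; the `PlanStep` struct and `parse_plan` function are illustrative assumptions, not code from this commit.

```rust
// Hypothetical parser for the plan format requested above:
// lines shaped like `1. **Title**: explanation`.
#[derive(Debug)]
struct PlanStep {
    number: u32,
    title: String,
    content: String,
}

fn parse_plan(plan: &str) -> Vec<PlanStep> {
    plan.lines()
        .filter_map(|line| {
            // `1. **Title**: content` splits cleanly on ". **" and "**:".
            let (num, rest) = line.split_once(". **")?;
            let (title, content) = rest.split_once("**:")?;
            Some(PlanStep {
                number: num.trim().parse().ok()?,
                title: title.trim().to_string(),
                content: content.trim().to_string(),
            })
        })
        .collect()
}
```

The last parsed step's title would then be `Final Decision`, matching the summary the prompt requires.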
@@ -97,7 +97,7 @@ pub struct RedshiftCredentials {
 pub struct SnowflakeCredentials {
     pub account_id: String,
     pub warehouse_id: String,
-    pub database_id: String,
+    pub database_id: Option<String>,
     pub username: String,
     pub password: String,
     pub role: Option<String>,
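Loosening `database_id` to `Option<String>` matters most at deserialization time: stored credential JSON that omits the key now parses instead of erroring (the route handler below does `serde_json::from_str(&credentials_string)?`). A sketch under the assumption that the struct derives serde's `Deserialize`; only the fields visible in this hunk are shown.

```rust
use serde::Deserialize;

// Field layout mirrors the hunk above; the derive itself is assumed.
#[derive(Deserialize)]
pub struct SnowflakeCredentials {
    pub account_id: String,
    pub warehouse_id: String,
    pub database_id: Option<String>, // missing key now deserializes to None
    pub username: String,
    pub password: String,
    pub role: Option<String>,
}

fn parse_without_database() -> serde_json::Result<SnowflakeCredentials> {
    // No "database_id" or "role" keys: succeeds now that both are Option.
    serde_json::from_str(
        r#"{"account_id":"abc123","warehouse_id":"wh","username":"u","password":"p"}"#,
    )
}
```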
@@ -5,12 +5,11 @@ use crate::utils::query_engine::credentials::SnowflakeCredentials;
 
 pub async fn get_snowflake_client(
     credentials: &SnowflakeCredentials,
-    database: Option<String>,
 ) -> Result<SnowflakeApi, Error> {
     let snowflake_client = match SnowflakeApi::with_password_auth(
         &credentials.account_id,
-        Some(credentials.warehouse_id.as_str()),
-        database.as_deref(),
+        Some(&credentials.warehouse_id),
+        credentials.database_id.as_deref(),
         None,
         &credentials.username,
         credentials.role.as_deref(),
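With the `database` parameter gone, the database selection rides along inside the credentials, and every call site collapses to a single-argument call, as the hunks below show. A sketch of the updated caller pattern; the `connect` wrapper and the anyhow error mapping are illustrative, not part of this commit.

```rust
use snowflake_api::SnowflakeApi;

// Hypothetical wrapper around the new single-argument signature; the
// database now comes from credentials.database_id inside the client fn.
async fn connect(credentials: &SnowflakeCredentials) -> anyhow::Result<SnowflakeApi> {
    get_snowflake_client(credentials)
        .await
        .map_err(|e| anyhow::anyhow!("Error getting snowflake client: {:?}", e))
}
```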
@@ -229,7 +229,7 @@ async fn route_to_query(
         DataSourceType::Snowflake => {
             let credentials: SnowflakeCredentials = serde_json::from_str(&credentials_string)?;
 
-            let mut snowflake_client = match get_snowflake_client(&credentials, None).await {
+            let mut snowflake_client = match get_snowflake_client(&credentials).await {
                 Ok(snowflake_client) => snowflake_client,
                 Err(e) => {
                     tracing::error!("There was an issue while establishing a connection to the parent data source: {}", e);
@@ -222,7 +222,7 @@ async fn get_snowflake_columns_batch(
     credentials: &SnowflakeCredentials,
     database: Option<String>,
 ) -> Result<Vec<DatasetColumnRecord>> {
-    let snowflake_client = get_snowflake_client(credentials, database).await?;
+    let snowflake_client = get_snowflake_client(credentials).await?;
 
     // Build the IN clause for (schema, table) pairs
     let table_pairs: Vec<String> = datasets
@@ -647,7 +647,7 @@ async fn get_snowflake_columns(
     schema_name: &String,
     credentials: &SnowflakeCredentials,
 ) -> Result<Vec<DatasetColumnRecord>> {
-    let snowflake_client = get_snowflake_client(credentials, None).await?;
+    let snowflake_client = get_snowflake_client(credentials).await?;
 
     let uppercase_dataset_name = dataset_name.to_uppercase();
     let uppercase_schema_name = schema_name.to_uppercase();
@@ -345,7 +345,7 @@ async fn get_bigquery_tables_and_views(
 async fn get_snowflake_tables_and_views(
     credentials: &SnowflakeCredentials,
 ) -> Result<Vec<DatasetRecord>> {
-    let snowflake_client = get_snowflake_client(credentials, None).await?;
+    let snowflake_client = get_snowflake_client(credentials).await?;
 
     let schema_list = credentials.schemas.clone().unwrap_or_else(|| vec![]);
     let schema_string = if !schema_list.is_empty() {
@@ -95,7 +95,7 @@ pub async fn test_data_source_connection(
         _ => return Err(anyhow!("Invalid credential type")),
     };
 
-    match get_snowflake_client(&credential, None).await {
+    match get_snowflake_client(&credential).await {
        Ok(client) => client,
        Err(e) => return Err(anyhow!("Error getting snowflake client: {:?}", e)),
    };
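After `test_data_source_connection` obtains a client, a cheap probe query is the usual way to confirm the connection actually works. A sketch of such a follow-up; `exec` follows the snowflake-api crate's query interface, and the probe statement itself is an assumption, not code from this commit.

```rust
use snowflake_api::SnowflakeApi;

// Hypothetical connectivity probe using an already-constructed client.
async fn probe(client: &mut SnowflakeApi) -> anyhow::Result<()> {
    client
        .exec("SELECT 1")
        .await
        .map_err(|e| anyhow::anyhow!("Probe query failed: {:?}", e))?;
    Ok(())
}
```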