mirror of https://github.com/buster-so/buster.git
Enhance database type mapping and YAML generation
- Expand type mapping to support multiple database systems (Snowflake, PostgreSQL, BigQuery, Redshift, MySQL)
- Add a function to clean quotes from the generated YAML
- Improve type categorization for dimensions, measures, and unsupported types
- Modify the YAML description enhancement prompt so that it no longer emits quotes
This commit is contained in:
parent
3bfe77339a
commit
ca0d8704c2
|
@ -94,36 +94,88 @@ enum ColumnMappingType {
|
||||||
Unsupported,
|
Unsupported,
|
||||||
}
|
}
|
||||||
|
|
||||||
fn map_snowflake_type(type_str: &str) -> ColumnMappingType {
|
fn map_database_type(type_str: &str) -> ColumnMappingType {
|
||||||
// Convert to uppercase for consistent matching
|
// Convert to uppercase for consistent matching
|
||||||
let type_upper = type_str.to_uppercase();
|
let type_upper = type_str.to_uppercase();
|
||||||
|
|
||||||
match type_upper.as_str() {
|
match type_upper.as_str() {
|
||||||
// Numeric types that should be measures
|
// Numeric types that should be measures
|
||||||
|
// Common numeric types across databases
|
||||||
"NUMBER" | "DECIMAL" | "NUMERIC" | "FLOAT" | "REAL" | "DOUBLE" | "INT" | "INTEGER" |
|
"NUMBER" | "DECIMAL" | "NUMERIC" | "FLOAT" | "REAL" | "DOUBLE" | "INT" | "INTEGER" |
|
||||||
"BIGINT" | "SMALLINT" | "TINYINT" | "BYTEINT" => ColumnMappingType::Measure(type_str.to_string()),
|
"BIGINT" | "SMALLINT" | "TINYINT" | "BYTEINT" |
|
||||||
|
// PostgreSQL specific
|
||||||
|
"DOUBLE PRECISION" | "SERIAL" | "BIGSERIAL" | "SMALLSERIAL" | "MONEY" |
|
||||||
|
// BigQuery specific
|
||||||
|
"INT64" | "FLOAT64" | "NUMERIC" | "BIGNUMERIC" |
|
||||||
|
// Redshift specific (mostly same as PostgreSQL)
|
||||||
|
"DECIMAL" | "DOUBLE PRECISION" |
|
||||||
|
// MySQL specific
|
||||||
|
"MEDIUMINT" | "FLOAT4" | "FLOAT8" | "DOUBLE PRECISION" | "DEC" | "FIXED" =>
|
||||||
|
ColumnMappingType::Measure(type_str.to_string()),
|
||||||
|
|
||||||
// Date/Time types
|
// Date/Time types
|
||||||
"DATE" | "DATETIME" | "TIME" | "TIMESTAMP" | "TIMESTAMP_LTZ" |
|
// Common date/time types
|
||||||
"TIMESTAMP_NTZ" | "TIMESTAMP_TZ" => ColumnMappingType::Dimension(type_str.to_string()),
|
"DATE" | "DATETIME" | "TIME" | "TIMESTAMP" |
|
||||||
|
// Snowflake specific
|
||||||
|
"TIMESTAMP_LTZ" | "TIMESTAMP_NTZ" | "TIMESTAMP_TZ" |
|
||||||
|
// PostgreSQL specific
|
||||||
|
"TIMESTAMPTZ" | "TIMESTAMP WITH TIME ZONE" | "TIMESTAMP WITHOUT TIME ZONE" | "INTERVAL" |
|
||||||
|
// BigQuery specific
|
||||||
|
"DATETIME" | "TIMESTAMP" | "DATE" | "TIME" |
|
||||||
|
// Redshift specific
|
||||||
|
"TIMETZ" | "TIMESTAMPTZ" |
|
||||||
|
// MySQL specific
|
||||||
|
"YEAR" =>
|
||||||
|
ColumnMappingType::Dimension(type_str.to_string()),
|
||||||
|
|
||||||
// String types
|
// String types
|
||||||
"TEXT" | "STRING" | "VARCHAR" | "CHAR" | "CHARACTER" => ColumnMappingType::Dimension(type_str.to_string()),
|
// Common string types
|
||||||
|
"TEXT" | "STRING" | "VARCHAR" | "CHAR" | "CHARACTER" |
|
||||||
|
// PostgreSQL specific
|
||||||
|
"CHARACTER VARYING" | "NAME" | "CITEXT" | "CIDR" | "INET" | "MACADDR" | "UUID" |
|
||||||
|
// BigQuery specific
|
||||||
|
"STRING" | "BYTES" |
|
||||||
|
// Redshift specific
|
||||||
|
"BPCHAR" | "NCHAR" | "NVARCHAR" |
|
||||||
|
// MySQL specific
|
||||||
|
"TINYTEXT" | "MEDIUMTEXT" | "LONGTEXT" | "ENUM" | "SET" | "JSON" =>
|
||||||
|
ColumnMappingType::Dimension(type_str.to_string()),
|
||||||
|
|
||||||
// Boolean type
|
// Boolean type
|
||||||
"BOOLEAN" | "BOOL" => ColumnMappingType::Dimension(type_str.to_string()),
|
"BOOLEAN" | "BOOL" | "BIT" =>
|
||||||
|
ColumnMappingType::Dimension(type_str.to_string()),
|
||||||
|
|
||||||
// Unsupported types
|
// Binary/BLOB types
|
||||||
"ARRAY" | "OBJECT" | "VARIANT" => ColumnMappingType::Unsupported,
|
"BINARY" | "VARBINARY" | "BLOB" | "BYTEA" | "MEDIUMBLOB" | "LONGBLOB" | "TINYBLOB" =>
|
||||||
|
ColumnMappingType::Unsupported,
|
||||||
|
|
||||||
|
// Geometric types (PostgreSQL)
|
||||||
|
"POINT" | "LINE" | "LSEG" | "BOX" | "PATH" | "POLYGON" | "CIRCLE" | "GEOMETRY" =>
|
||||||
|
ColumnMappingType::Unsupported,
|
||||||
|
|
||||||
|
// Array/JSON/Complex types
|
||||||
|
"ARRAY" | "OBJECT" | "VARIANT" | "JSONB" | "HSTORE" | "XML" | "STRUCT" | "RECORD" =>
|
||||||
|
ColumnMappingType::Unsupported,
|
||||||
|
|
||||||
// Default to dimension for unknown types
|
// Default to dimension for unknown types
|
||||||
_ => {
|
_ => {
|
||||||
tracing::warn!("Unknown Snowflake type: {}, defaulting to dimension", type_str);
|
tracing::warn!("Unknown database type: {}, defaulting to dimension", type_str);
|
||||||
ColumnMappingType::Dimension(type_str.to_string())
|
ColumnMappingType::Dimension(type_str.to_string())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Returns a copy of `yaml` with every single-quote (`'`) and double-quote
/// (`"`) character removed.
///
/// All other characters, including whitespace and newlines, are kept in their
/// original order. The removal is unconditional: quote characters that occur
/// inside a value (for example an apostrophe in a description) are dropped as
/// well.
fn clean_yaml_quotes(yaml: &str) -> String {
    // One pass and one allocation, instead of two chained `replace` calls
    // that each build a full intermediate copy of the document.
    yaml.chars()
        .filter(|ch| !matches!(ch, '\'' | '"'))
        .collect()
}
|
||||||
|
|
||||||
pub async fn generate_datasets(
|
pub async fn generate_datasets(
|
||||||
Extension(user): Extension<User>,
|
Extension(user): Extension<User>,
|
||||||
Json(request): Json<GenerateDatasetRequest>,
|
Json(request): Json<GenerateDatasetRequest>,
|
||||||
|
@ -181,9 +233,10 @@ async fn enhance_yaml_with_descriptions(yaml: String) -> Result<String> {
|
||||||
LlmMessage::new(
|
LlmMessage::new(
|
||||||
"developer".to_string(),
|
"developer".to_string(),
|
||||||
"You are a YAML description enhancer. Your output must be wrapped in markdown code blocks using ```yml format.
|
"You are a YAML description enhancer. Your output must be wrapped in markdown code blocks using ```yml format.
|
||||||
Your task is to ONLY replace text matching exactly \"{NEED DESCRIPTION HERE}\" with appropriate descriptions. Do not modify any other parts of the YAML or other descriptions without the placeholder. You should still return the entire YAML in your output.
|
Your task is to ONLY replace text matching exactly {NEED DESCRIPTION HERE} with appropriate descriptions. Do not modify any other parts of the YAML or other descriptions without the placeholder. You should still return the entire YAML in your output.
|
||||||
DO NOT modify any other part of the YAML.
|
DO NOT modify any other part of the YAML.
|
||||||
DO NOT add any explanations or text outside the ```yml block.
|
DO NOT add any explanations or text outside the ```yml block.
|
||||||
|
No double or single quotes.
|
||||||
Return the complete YAML wrapped in markdown, with only the placeholders replaced.".to_string(),
|
Return the complete YAML wrapped in markdown, with only the placeholders replaced.".to_string(),
|
||||||
),
|
),
|
||||||
LlmMessage::new(
|
LlmMessage::new(
|
||||||
|
@ -240,7 +293,7 @@ async fn generate_model_yaml(
|
||||||
|
|
||||||
// Process each column and categorize as dimension or measure
|
// Process each column and categorize as dimension or measure
|
||||||
for col in model_columns {
|
for col in model_columns {
|
||||||
match map_snowflake_type(&col.type_) {
|
match map_database_type(&col.type_) {
|
||||||
ColumnMappingType::Dimension(semantic_type) => {
|
ColumnMappingType::Dimension(semantic_type) => {
|
||||||
dimensions.push(Dimension {
|
dimensions.push(Dimension {
|
||||||
name: col.name.clone(),
|
name: col.name.clone(),
|
||||||
|
@ -282,10 +335,13 @@ async fn generate_model_yaml(
|
||||||
|
|
||||||
let yaml = serde_yaml::to_string(&config)?;
|
let yaml = serde_yaml::to_string(&config)?;
|
||||||
|
|
||||||
|
|
||||||
// Enhance descriptions using OpenAI
|
// Enhance descriptions using OpenAI
|
||||||
let enhanced_yaml = enhance_yaml_with_descriptions(yaml).await?;
|
let enhanced_yaml = enhance_yaml_with_descriptions(yaml).await?;
|
||||||
|
|
||||||
Ok(enhanced_yaml)
|
let cleaned_yaml = clean_yaml_quotes(&enhanced_yaml);
|
||||||
|
|
||||||
|
Ok(cleaned_yaml)
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn generate_datasets_handler(
|
async fn generate_datasets_handler(
|
||||||
|
|
Loading…
Reference in New Issue