Enhance database type mapping and YAML generation

- Expand type mapping to support multiple database systems (Snowflake, PostgreSQL, BigQuery, Redshift, MySQL)
- Add function to clean quotes from generated YAML
- Improve type categorization for dimensions, measures, and unsupported types
- Modify YAML description enhancement to remove quotes
This commit is contained in:
dal 2025-02-25 21:30:18 -07:00
parent 3bfe77339a
commit ca0d8704c2
No known key found for this signature in database
GPG Key ID: 16F4B0E1E9F61122
1 changed files with 68 additions and 12 deletions

View File

@ -94,36 +94,88 @@ enum ColumnMappingType {
Unsupported,
}
fn map_snowflake_type(type_str: &str) -> ColumnMappingType {
fn map_database_type(type_str: &str) -> ColumnMappingType {
// Convert to uppercase for consistent matching
let type_upper = type_str.to_uppercase();
match type_upper.as_str() {
// Numeric types that should be measures
// Common numeric types across databases
"NUMBER" | "DECIMAL" | "NUMERIC" | "FLOAT" | "REAL" | "DOUBLE" | "INT" | "INTEGER" |
"BIGINT" | "SMALLINT" | "TINYINT" | "BYTEINT" => ColumnMappingType::Measure(type_str.to_string()),
"BIGINT" | "SMALLINT" | "TINYINT" | "BYTEINT" |
// PostgreSQL specific
"DOUBLE PRECISION" | "SERIAL" | "BIGSERIAL" | "SMALLSERIAL" | "MONEY" |
// BigQuery specific
"INT64" | "FLOAT64" | "NUMERIC" | "BIGNUMERIC" |
// Redshift specific (mostly same as PostgreSQL)
"DECIMAL" | "DOUBLE PRECISION" |
// MySQL specific
"MEDIUMINT" | "FLOAT4" | "FLOAT8" | "DOUBLE PRECISION" | "DEC" | "FIXED" =>
ColumnMappingType::Measure(type_str.to_string()),
// Date/Time types
"DATE" | "DATETIME" | "TIME" | "TIMESTAMP" | "TIMESTAMP_LTZ" |
"TIMESTAMP_NTZ" | "TIMESTAMP_TZ" => ColumnMappingType::Dimension(type_str.to_string()),
// Common date/time types
"DATE" | "DATETIME" | "TIME" | "TIMESTAMP" |
// Snowflake specific
"TIMESTAMP_LTZ" | "TIMESTAMP_NTZ" | "TIMESTAMP_TZ" |
// PostgreSQL specific
"TIMESTAMPTZ" | "TIMESTAMP WITH TIME ZONE" | "TIMESTAMP WITHOUT TIME ZONE" | "INTERVAL" |
// BigQuery specific
"DATETIME" | "TIMESTAMP" | "DATE" | "TIME" |
// Redshift specific
"TIMETZ" | "TIMESTAMPTZ" |
// MySQL specific
"YEAR" =>
ColumnMappingType::Dimension(type_str.to_string()),
// String types
"TEXT" | "STRING" | "VARCHAR" | "CHAR" | "CHARACTER" => ColumnMappingType::Dimension(type_str.to_string()),
// Common string types
"TEXT" | "STRING" | "VARCHAR" | "CHAR" | "CHARACTER" |
// PostgreSQL specific
"CHARACTER VARYING" | "NAME" | "CITEXT" | "CIDR" | "INET" | "MACADDR" | "UUID" |
// BigQuery specific
"STRING" | "BYTES" |
// Redshift specific
"BPCHAR" | "NCHAR" | "NVARCHAR" |
// MySQL specific
"TINYTEXT" | "MEDIUMTEXT" | "LONGTEXT" | "ENUM" | "SET" | "JSON" =>
ColumnMappingType::Dimension(type_str.to_string()),
// Boolean type
"BOOLEAN" | "BOOL" => ColumnMappingType::Dimension(type_str.to_string()),
"BOOLEAN" | "BOOL" | "BIT" =>
ColumnMappingType::Dimension(type_str.to_string()),
// Unsupported types
"ARRAY" | "OBJECT" | "VARIANT" => ColumnMappingType::Unsupported,
// Binary/BLOB types
"BINARY" | "VARBINARY" | "BLOB" | "BYTEA" | "MEDIUMBLOB" | "LONGBLOB" | "TINYBLOB" =>
ColumnMappingType::Unsupported,
// Geometric types (PostgreSQL)
"POINT" | "LINE" | "LSEG" | "BOX" | "PATH" | "POLYGON" | "CIRCLE" | "GEOMETRY" =>
ColumnMappingType::Unsupported,
// Array/JSON/Complex types
"ARRAY" | "OBJECT" | "VARIANT" | "JSONB" | "HSTORE" | "XML" | "STRUCT" | "RECORD" =>
ColumnMappingType::Unsupported,
// Default to dimension for unknown types
_ => {
tracing::warn!("Unknown Snowflake type: {}, defaulting to dimension", type_str);
tracing::warn!("Unknown database type: {}, defaulting to dimension", type_str);
ColumnMappingType::Dimension(type_str.to_string())
}
}
}
// Add a new function to clean up quotes in YAML
fn clean_yaml_quotes(yaml: &str) -> String {
// First remove all single quotes
let no_single_quotes = yaml.replace('\'', "");
// Then remove all double quotes
let no_quotes = no_single_quotes.replace('"', "");
no_quotes
}
pub async fn generate_datasets(
Extension(user): Extension<User>,
Json(request): Json<GenerateDatasetRequest>,
@ -181,9 +233,10 @@ async fn enhance_yaml_with_descriptions(yaml: String) -> Result<String> {
LlmMessage::new(
"developer".to_string(),
"You are a YAML description enhancer. Your output must be wrapped in markdown code blocks using ```yml format.
Your task is to ONLY replace text matching exactly \"{NEED DESCRIPTION HERE}\" with appropriate descriptions. Do not modify any other parts of the YAML or other descriptions without the placeholder. You should still return the entire YAML in your output.
Your task is to ONLY replace text matching exactly {NEED DESCRIPTION HERE} with appropriate descriptions. Do not modify any other parts of the YAML or other descriptions without the placeholder. You should still return the entire YAML in your output.
DO NOT modify any other part of the YAML.
DO NOT add any explanations or text outside the ```yml block.
No double or single quotes.
Return the complete YAML wrapped in markdown, with only the placeholders replaced.".to_string(),
),
LlmMessage::new(
@ -240,7 +293,7 @@ async fn generate_model_yaml(
// Process each column and categorize as dimension or measure
for col in model_columns {
match map_snowflake_type(&col.type_) {
match map_database_type(&col.type_) {
ColumnMappingType::Dimension(semantic_type) => {
dimensions.push(Dimension {
name: col.name.clone(),
@ -282,10 +335,13 @@ async fn generate_model_yaml(
let yaml = serde_yaml::to_string(&config)?;
// Enhance descriptions using OpenAI
let enhanced_yaml = enhance_yaml_with_descriptions(yaml).await?;
let cleaned_yaml = clean_yaml_quotes(&enhanced_yaml);
Ok(enhanced_yaml)
Ok(cleaned_yaml)
}
async fn generate_datasets_handler(