mirror of https://github.com/buster-so/buster.git
Add support for YAML-based dataset deployment and enhance dataset structures
This commit is contained in:
parent d962af3883
commit 79b3df107d
@@ -89,6 +89,7 @@ tokio-postgres = "0.7"
 futures-util = "0.3"
 rayon = "1.10.0"
 diesel_migrations = "2.0.0"
+serde_yaml = "0.9.34"
 
 [profile.release]
 debug = false
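The lone manifest change pulls in serde_yaml, which powers the YAML parsing added later in this commit. A minimal sketch of the serde_yaml 0.9 API in use; the `Config` struct here is a stand-in for illustration, not a type from the commit:

use serde::Deserialize;

// Stand-in struct for illustration; the commit's real target is the
// `BusterModel` type defined further down.
#[derive(Debug, Deserialize)]
struct Config {
    version: i32,
}

fn main() -> Result<(), serde_yaml::Error> {
    // serde_yaml::from_str drives any serde::Deserialize implementation.
    let cfg: Config = serde_yaml::from_str("version: 1")?;
    assert_eq!(cfg.version, 1);
    Ok(())
}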
@@ -0,0 +1,3 @@
+-- This file should undo anything in `up.sql`
+ALTER TABLE datasets
+DROP CONSTRAINT datasets_database_name_data_source_id_key;
@@ -0,0 +1,9 @@
+-- Your SQL goes here
+-- First delete any duplicate rows
+DELETE FROM datasets a USING datasets b
+WHERE a.id > b.id
+AND a.database_name = b.database_name
+AND a.data_source_id = b.data_source_id;
+
+ALTER TABLE datasets
+ADD CONSTRAINT datasets_database_name_data_source_id_key UNIQUE (id, database_name, data_source_id);
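On ordering in `up.sql`: Postgres will not add a UNIQUE constraint while rows that would violate it exist, which is why the duplicate rows are deleted first. A sketch of the same two steps issued through diesel, assuming an already-established `PgConnection`:

use diesel::{sql_query, PgConnection, RunQueryDsl};

// Hypothetical helper mirroring the migration above.
fn apply_constraint(conn: &mut PgConnection) -> diesel::QueryResult<()> {
    // Keep the lowest id per (database_name, data_source_id) pair.
    sql_query(
        "DELETE FROM datasets a USING datasets b \
         WHERE a.id > b.id \
         AND a.database_name = b.database_name \
         AND a.data_source_id = b.data_source_id",
    )
    .execute(conn)?;
    // Only after deduplication can the constraint be added safely.
    sql_query(
        "ALTER TABLE datasets \
         ADD CONSTRAINT datasets_database_name_data_source_id_key \
         UNIQUE (id, database_name, data_source_id)",
    )
    .execute(conn)?;
    Ok(())
}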
@@ -340,6 +340,14 @@ impl DatasetType {
             _ => None,
         }
     }
+
+    pub fn to_string(&self) -> &'static str {
+        match *self {
+            DatasetType::Table => "table",
+            DatasetType::View => "view",
+            DatasetType::MaterializedView => "materialized view",
+        }
+    }
 }
 
 impl ToSql<sql_types::DatasetTypeEnum, Pg> for DatasetType {
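The new `to_string` is an inherent method returning `&'static str` (inherent methods take precedence over trait methods at call sites), and its lowercase keywords feed the `CREATE {type} ...` statement assembled later in this diff:

// Usage sketch: the returned keyword slots directly into DDL.
assert_eq!(DatasetType::View.to_string(), "view");
assert_eq!(DatasetType::MaterializedView.to_string(), "materialized view");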
@@ -4,6 +4,7 @@ use chrono::{DateTime, Utc};
 use diesel::{upsert::excluded, ExpressionMethods, QueryDsl, SelectableHelper};
 use diesel_async::RunQueryDsl;
 use serde::{Deserialize, Serialize};
+use serde_yaml;
 use std::collections::HashSet;
 use uuid::Uuid;
 
@@ -19,15 +20,26 @@ use crate::{
         query_engine::{
             credentials::get_data_source_credentials,
             import_dataset_columns::retrieve_dataset_columns,
+            write_query_engine::write_query_engine,
         },
         user::user_info::get_user_organization_id,
     },
 };
 
 #[derive(Debug, Deserialize)]
-pub struct DeployDatasetsRequest {
+#[serde(untagged)]
+pub enum DeployDatasetsRequest {
+    Full(Vec<FullDeployDatasetsRequest>),
+    Simple { id: Uuid, sql: String, yml: String },
+}
+
+#[derive(Debug, Deserialize)]
+pub struct FullDeployDatasetsRequest {
+    pub id: Option<Uuid>,
     pub data_source_name: String,
     pub env: String,
+    #[serde(rename = "type")]
+    pub type_: String,
     pub name: String,
     pub model: Option<String>,
     pub schema: String,
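With `#[serde(untagged)]`, serde tries the variants in declaration order and picks the first whose shape matches, so one route now accepts both payload forms: a JSON array deserializes as `Full`, an object with `id`/`sql`/`yml` as `Simple`. A sketch, assuming `serde_json` is available (as it is wherever axum's `Json` extractor works) and with invented field values:

// An object body falls through Full (which wants an array) to Simple.
let body = r#"{
    "id": "b3c97c28-0000-0000-0000-000000000000",
    "sql": "SELECT * FROM raw.sales",
    "yml": "version: 1\nmodels: []"
}"#;
let req: DeployDatasetsRequest = serde_json::from_str(body).unwrap();
assert!(matches!(req, DeployDatasetsRequest::Simple { .. }));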
@@ -62,11 +74,65 @@ pub struct DeployDatasetsResponse {
     pub ids: Vec<Uuid>,
 }
 
+#[derive(Debug, Deserialize)]
+pub struct BusterModel {
+    pub version: i32,
+    pub models: Vec<Model>,
+}
+
+#[derive(Debug, Deserialize)]
+pub struct Model {
+    pub name: String,
+    pub data_source_name: String,
+    pub schema: String,
+    pub env: String,
+    pub description: String,
+    pub model: Option<String>,
+    #[serde(rename = "type")]
+    pub type_: String,
+    pub entities: Vec<Entity>,
+    pub dimensions: Vec<Dimension>,
+    pub measures: Vec<Measure>,
+}
+
+#[derive(Debug, Deserialize)]
+pub struct Entity {
+    pub name: String,
+    pub expr: String,
+    #[serde(rename = "type")]
+    pub entity_type: String,
+}
+
+#[derive(Debug, Deserialize)]
+pub struct Dimension {
+    pub name: String,
+    pub expr: String,
+    #[serde(rename = "type")]
+    pub dimension_type: String,
+    pub description: String,
+}
+
+#[derive(Debug, Deserialize)]
+pub struct Measure {
+    pub name: String,
+    pub expr: String,
+    pub agg: String,
+    pub description: String,
+}
+
 pub async fn deploy_datasets(
     Extension(user): Extension<User>,
-    Json(request): Json<Vec<DeployDatasetsRequest>>,
+    Json(request): Json<DeployDatasetsRequest>,
 ) -> Result<ApiResponse<DeployDatasetsResponse>, (axum::http::StatusCode, String)> {
-    let _ = match deploy_datasets_handler(&user.id, request).await {
+    let requests = match process_deploy_request(request).await {
+        Ok(requests) => requests,
+        Err(e) => {
+            tracing::error!("Error processing deploy request: {:?}", e);
+            return Err((axum::http::StatusCode::BAD_REQUEST, e.to_string()));
+        }
+    };
+
+    let _ = match deploy_datasets_handler(&user.id, requests).await {
         Ok(dataset) => dataset,
         Err(e) => {
             tracing::error!("Error creating dataset: {:?}", e);
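For reference, YAML of the shape these structs accept; every value below is invented. The `#[serde(rename = "type")]` attributes map the YAML `type` keys onto `type_`, `entity_type`, and `dimension_type`, and the omitted `model` key lands as `None`:

fn parse_example() -> Result<BusterModel, serde_yaml::Error> {
    // Hypothetical semantic-model document; names and values are made up.
    let yml = r#"
version: 1
models:
  - name: orders
    data_source_name: warehouse
    schema: analytics
    env: prod
    description: One row per order
    type: view
    entities:
      - name: customer
        expr: customer_id
        type: foreign
    dimensions:
      - name: status
        expr: status
        type: string
        description: Current order status
    measures:
      - name: revenue
        expr: amount
        agg: sum
        description: Summed order amount
"#;
    serde_yaml::from_str(yml)
}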
@@ -77,7 +143,79 @@ pub async fn deploy_datasets(
     Ok(ApiResponse::OK)
 }
 
-async fn deploy_datasets_handler(user_id: &Uuid, requests: Vec<DeployDatasetsRequest>) -> Result<()> {
+async fn process_deploy_request(
+    request: DeployDatasetsRequest,
+) -> Result<Vec<FullDeployDatasetsRequest>> {
+    match request {
+        DeployDatasetsRequest::Full(requests) => Ok(requests),
+        DeployDatasetsRequest::Simple { id, sql, yml } => {
+            let model: BusterModel = serde_yaml::from_str(&yml)?;
+            let mut requests = Vec::new();
+
+            for semantic_model in model.models {
+                // Create the view in the data source
+                let mut columns = Vec::new();
+
+                // Process dimensions
+                for dim in semantic_model.dimensions {
+                    columns.push(DeployDatasetsColumnsRequest {
+                        name: dim.name,
+                        description: dim.description,
+                        semantic_type: Some(String::from("dimension")),
+                        expr: Some(dim.expr),
+                        type_: Some(dim.dimension_type),
+                        agg: None,
+                    });
+                }
+
+                // Process measures
+                for measure in semantic_model.measures {
+                    columns.push(DeployDatasetsColumnsRequest {
+                        name: measure.name,
+                        description: measure.description,
+                        semantic_type: Some(String::from("measure")),
+                        expr: Some(measure.expr),
+                        type_: None,
+                        agg: Some(measure.agg),
+                    });
+                }
+
+                // Process entity relationships
+                let entity_relationships = semantic_model
+                    .entities
+                    .into_iter()
+                    .map(|entity| DeployDatasetsEntityRelationshipsRequest {
+                        name: entity.name,
+                        expr: entity.expr,
+                        type_: entity.entity_type,
+                    })
+                    .collect();
+
+                requests.push(FullDeployDatasetsRequest {
+                    id: Some(id),
+                    data_source_name: semantic_model.data_source_name,
+                    env: semantic_model.env,
+                    type_: semantic_model.type_,
+                    name: semantic_model.name,
+                    model: semantic_model.model,
+                    schema: semantic_model.schema,
+                    description: semantic_model.description,
+                    sql_definition: Some(sql.clone()),
+                    entity_relationships: Some(entity_relationships),
+                    columns,
+                    yml_file: Some(yml.clone()),
+                });
+            }
+
+            Ok(requests)
+        }
+    }
+}
+
+async fn deploy_datasets_handler(
+    user_id: &Uuid,
+    requests: Vec<FullDeployDatasetsRequest>,
+) -> Result<()> {
     // Get the user organization id.
     let organization_id = get_user_organization_id(&user_id).await?;
 
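One behavior worth spelling out: a `Simple` request fans out into one `FullDeployDatasetsRequest` per model in the YAML, and every generated request reuses the same `id` plus clones of the same `sql` and `yml`. A sketch, inside some async fn returning `anyhow::Result<()>`, where `two_model_yaml` is a hypothetical YAML string whose `models` list has two entries:

let request = DeployDatasetsRequest::Simple {
    id: Uuid::new_v4(),
    sql: "SELECT * FROM raw.sales".to_string(),
    yml: two_model_yaml, // hypothetical two-model document
};
let requests = process_deploy_request(request).await?;
assert_eq!(requests.len(), 2);
// Both fan-out requests carry the dataset id from the Simple payload.
assert_eq!(requests[0].id, requests[1].id);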
@@ -124,13 +262,15 @@ async fn deploy_datasets_handler(user_id: &Uuid, requests: Vec<DeployDatasetsReq
             })
             .ok_or(anyhow!("Data source not found"))?;
 
+        let database_name = req.name.replace(" ", "_");
+
         let dataset = Dataset {
-            id: Uuid::new_v4(),
+            id: req.id.unwrap_or_else(|| Uuid::new_v4()),
             name: req.name.clone(),
             data_source_id: data_source.id,
             created_at: Utc::now(),
             updated_at: Utc::now(),
-            database_name: req.name.clone(),
+            database_name,
             when_to_use: Some(req.description.clone()),
             when_not_to_use: None,
             type_: DatasetType::View,
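The derived `database_name` replaces the earlier raw `req.name.clone()`; the same derivation appears in `post_dataset_handler` below. The sanitization is spaces-only:

// Only spaces become underscores; other characters pass through as-is.
assert_eq!("Monthly Sales".replace(" ", "_"), "Monthly_Sales");
assert_eq!("orders-2024!".replace(" ", "_"), "orders-2024!"); // unchanged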
@@ -146,7 +286,7 @@ async fn deploy_datasets_handler(user_id: &Uuid, requests: Vec<DeployDatasetsReq
             imported: false,
             organization_id,
             model: req.model.clone(),
-            yml_file: None,
+            yml_file: req.yml_file.clone(),
         };
 
         datasets.push(dataset);
@@ -155,7 +295,11 @@ async fn deploy_datasets_handler(user_id: &Uuid, requests: Vec<DeployDatasetsReq
     // Upsert the datasets into the database.
     let inserted_datasets = diesel::insert_into(datasets::table)
         .values(&datasets)
-        .on_conflict((datasets::database_name, datasets::data_source_id))
+        .on_conflict((
+            datasets::id,
+            datasets::database_name,
+            datasets::data_source_id,
+        ))
         .do_update()
         .set((
             datasets::name.eq(excluded(datasets::name)),
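The widened conflict target lines up with the three-column UNIQUE constraint created in the migration above; Postgres requires an ON CONFLICT column list to correspond to a unique index or constraint. Roughly the SQL this diesel chain produces, sketched with column lists elided (not verbatim diesel output):

// Approximate shape of the generated upsert; "..." marks elided columns.
let approx_sql = "\
    INSERT INTO datasets (id, name, database_name, data_source_id, ...) \
    VALUES ($1, $2, $3, $4, ...) \
    ON CONFLICT (id, database_name, data_source_id) \
    DO UPDATE SET name = EXCLUDED.name, ...";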
@@ -339,8 +483,6 @@ async fn deploy_datasets_handler(user_id: &Uuid, requests: Vec<DeployDatasetsReq
     // Handle entity relationships
     let mut entity_relationships_to_upsert: Vec<EntityRelationship> = Vec::new();
 
-    println!("inserted_datasets: {:?}", inserted_datasets);
-
     for req in &requests {
         if let Some(relationships) = &req.entity_relationships {
             let current_dataset = inserted_datasets
@@ -397,6 +539,19 @@ async fn deploy_datasets_handler(user_id: &Uuid, requests: Vec<DeployDatasetsReq
         .map_err(|e| anyhow!("Failed to upsert entity relationships: {}", e))?;
     }
 
+    for dataset in inserted_datasets {
+        let view_name = format!("{}.{}", dataset.schema, dataset.database_name);
+        let view_sql = format!(
+            "CREATE {} {} AS {}",
+            dataset.type_.to_string(),
+            view_name,
+            dataset.definition
+        );
+
+        // Write the view to the data source
+        write_query_engine(&dataset.id, &view_sql).await?;
+    }
+
     // TODO: Need to send back the updated and inserted objects.
     Ok(())
 }
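Tracing the new loop with invented values shows the DDL it emits; `to_string` here is the inherent method added on `DatasetType` above:

// Invented inputs: schema "analytics", database_name "Monthly_Sales",
// definition "SELECT * FROM raw.sales", and a plain view type.
let view_name = format!("{}.{}", "analytics", "Monthly_Sales");
let view_sql = format!(
    "CREATE {} {} AS {}",
    DatasetType::View.to_string(), // "view"
    view_name,
    "SELECT * FROM raw.sales",
);
assert_eq!(
    view_sql,
    "CREATE view analytics.Monthly_Sales AS SELECT * FROM raw.sales"
);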
@@ -26,7 +26,8 @@ pub async fn post_dataset(
     Extension(user): Extension<User>,
     Json(request): Json<PostDatasetReq>,
 ) -> Result<ApiResponse<Dataset>, (axum::http::StatusCode, String)> {
-    let dataset = match post_dataset_handler(&user.id, &request.data_source_id, &request.name).await {
+    let dataset = match post_dataset_handler(&user.id, &request.data_source_id, &request.name).await
+    {
         Ok(dataset) => dataset,
         Err(e) => {
             tracing::error!("Error creating dataset: {:?}", e);
@@ -37,7 +38,11 @@ pub async fn post_dataset(
     Ok(ApiResponse::JsonData(dataset))
 }
 
-async fn post_dataset_handler(user_id: &Uuid, data_source_id: &Uuid, name: &str) -> Result<Dataset> {
+async fn post_dataset_handler(
+    user_id: &Uuid,
+    data_source_id: &Uuid,
+    name: &str,
+) -> Result<Dataset> {
     // Get the user organization id.
     let organization_id = get_user_organization_id(&user_id).await?;
 
@@ -87,6 +92,8 @@ async fn post_dataset_handler(user_id: &Uuid, data_source_id: &Uuid, name: &str)
         Err(e) => return Err(anyhow!("Data sources not found: {}", e)),
     };
 
+    let database_name = name.replace(" ", "_");
+
     let dataset = Dataset {
         id: Uuid::new_v4(),
         name: name.to_string(),
@@ -94,7 +101,7 @@ async fn post_dataset_handler(user_id: &Uuid, data_source_id: &Uuid, name: &str)
         organization_id: organization_id.clone(),
         created_at: chrono::Utc::now(),
         updated_at: chrono::Utc::now(),
-        database_name: String::new(),
+        database_name,
         when_to_use: None,
         when_not_to_use: None,
         type_: DatasetType::Table,
@@ -106,30 +106,6 @@ async fn post_dataset_handler(
         }
     };
 
-    let name = format!("dataset_index_{}", dataset_id.clone());
-    let new_collection_schema = json!({
-        "name": name,
-        "fields": [
-            {"name": "id", "type": "string"},
-            {"name": "value", "type": "string"},
-            {"name": "dataset_id", "type": "string"},
-            {"name": "dataset_column_id", "type": "string"},
-            {"name": "value_embedding", "type": "float[]", "embed": {
-                "from": [
-                    "value"
-                ],
-                "model_config": {
-                    "model_name": "ts/jina-embeddings-v2-base-en"
-                }
-            }},
-        ],
-    });
-
-    match typesense::create_collection(new_collection_schema).await {
-        Ok(_) => (),
-        Err(e) => return Err(anyhow!("Error creating dataset index: {}", e)),
-    };
-
     let dataset_state = match get_dataset_state(&dataset_id, &user_id).await {
         Ok(dataset_state) => dataset_state,
         Err(e) => return Err(anyhow!("Error getting dataset state: {}", e)),