bugfix: data metadata processing

This commit is contained in:
dal 2025-04-21 15:53:03 -06:00
parent 1819eb1859
commit 7e866c54b1
No known key found for this signature in database
GPG Key ID: 16F4B0E1E9F61122
1 changed file with 54 additions and 51 deletions

View File

@@ -99,98 +99,101 @@ fn compute_column_metadata(data: &[IndexMap<String, DataType>]) -> Vec<ColumnMet
columns.iter().map(|column_name| { columns.iter().map(|column_name| {
let mut value_map = HashSet::new(); let mut value_map = HashSet::new();
let mut min_value = None; let mut min_value_numeric: Option<f64> = None; // Use specific name
let mut max_value = None; let mut max_value_numeric: Option<f64> = None; // Use specific name
let mut is_date_type = false;
let mut min_value_str: Option<String> = None; let mut min_value_str: Option<String> = None;
let mut max_value_str: Option<String> = None; let mut max_value_str: Option<String> = None;
let mut determined_type: Option<(SimpleType, ColumnType)> = None;
for row in data { for row in data {
if let Some(value) = row.get(column_name) { if let Some(value) = row.get(column_name) {
// Track unique values (up to a reasonable limit) // Track unique values (up to a reasonable limit)
if value_map.len() < 100 { if value_map.len() < 100 {
value_map.insert(format!("{:?}", value)); value_map.insert(format!("{:?}", value)); // format! handles nulls acceptably
} }
// Calculate min/max for appropriate types // Determine type from first non-null value encountered
if determined_type.is_none() {
match value {
// Check for non-null variants using matches! for conciseness
DataType::Int2(Some(_)) | DataType::Int4(Some(_)) | DataType::Int8(Some(_)) |
DataType::Float4(Some(_)) | DataType::Float8(Some(_)) | DataType::Text(Some(_)) |
DataType::Bool(Some(_)) | DataType::Date(Some(_)) | DataType::Timestamp(Some(_)) |
DataType::Timestamptz(Some(_)) | DataType::Json(Some(_)) | DataType::Uuid(Some(_)) |
DataType::Decimal(Some(_)) | DataType::Time(Some(_)) => {
determined_type = Some(determine_types(value));
}
// If it's a Null variant or Unknown, keep looking
_ => {}
}
}
// Calculate min/max based on value's actual type in this row
match value { match value {
DataType::Int2(Some(v)) => { DataType::Int2(Some(v)) => {
let n = *v as f64; let n = *v as f64;
min_value = Some(min_value.map_or(n, |min: f64| min.min(n))); min_value_numeric = Some(min_value_numeric.map_or(n, |min| min.min(n)));
max_value = Some(max_value.map_or(n, |max: f64| max.max(n))); max_value_numeric = Some(max_value_numeric.map_or(n, |max| max.max(n)));
} }
DataType::Int4(Some(v)) => { DataType::Int4(Some(v)) => {
let n = *v as f64; let n = *v as f64;
min_value = Some(min_value.map_or(n, |min: f64| min.min(n))); min_value_numeric = Some(min_value_numeric.map_or(n, |min| min.min(n)));
max_value = Some(max_value.map_or(n, |max: f64| max.max(n))); max_value_numeric = Some(max_value_numeric.map_or(n, |max| max.max(n)));
} }
DataType::Int8(Some(v)) => { DataType::Int8(Some(v)) => {
let n = *v as f64; let n = *v as f64;
min_value = Some(min_value.map_or(n, |min: f64| min.min(n))); min_value_numeric = Some(min_value_numeric.map_or(n, |min| min.min(n)));
max_value = Some(max_value.map_or(n, |max: f64| max.max(n))); max_value_numeric = Some(max_value_numeric.map_or(n, |max| max.max(n)));
} }
DataType::Float4(Some(v)) => { DataType::Float4(Some(v)) => {
let n = *v as f64; let n = *v as f64;
min_value = Some(min_value.map_or(n, |min: f64| min.min(n))); min_value_numeric = Some(min_value_numeric.map_or(n, |min| min.min(n)));
max_value = Some(max_value.map_or(n, |max: f64| max.max(n))); max_value_numeric = Some(max_value_numeric.map_or(n, |max| max.max(n)));
} }
DataType::Float8(Some(v)) => { DataType::Float8(Some(v)) => {
let n = *v as f64; let n = *v as f64;
min_value = Some(min_value.map_or(n, |min: f64| min.min(n))); min_value_numeric = Some(min_value_numeric.map_or(n, |min| min.min(n)));
max_value = Some(max_value.map_or(n, |max: f64| max.max(n))); max_value_numeric = Some(max_value_numeric.map_or(n, |max| max.max(n)));
} }
DataType::Date(Some(date)) => { DataType::Date(Some(date)) => {
is_date_type = true; update_date_min_max(&date.to_string(), &mut min_value_str, &mut max_value_str);
let date_str = date.to_string();
update_date_min_max(&date_str, &mut min_value_str, &mut max_value_str);
} }
DataType::Timestamp(Some(ts)) => { DataType::Timestamp(Some(ts)) => {
is_date_type = true; update_date_min_max(&ts.to_string(), &mut min_value_str, &mut max_value_str);
let ts_str = ts.to_string();
update_date_min_max(&ts_str, &mut min_value_str, &mut max_value_str);
} }
DataType::Timestamptz(Some(ts)) => { DataType::Timestamptz(Some(ts)) => {
is_date_type = true; update_date_min_max(&ts.to_string(), &mut min_value_str, &mut max_value_str);
let ts_str = ts.to_string();
update_date_min_max(&ts_str, &mut min_value_str, &mut max_value_str);
} }
// Ignore nulls and non-comparable types for min/max calculation
_ => {} _ => {}
} }
} }
} }
// Determine the column type and simple type // Finalize types - default if no non-null value was found
let column_type = first_row.get(column_name).unwrap(); let (simple_type, column_type) = determined_type.unwrap_or((SimpleType::Other, ColumnType::Other));
let (simple_type, column_type) = determine_types(column_type);
// Format min/max values appropriately based on type // Format min/max values appropriately based on determined simple_type
let (min_value, max_value) = if is_date_type { let (min_value_json, max_value_json) = match simple_type {
( SimpleType::Number => (
min_value_str.map_or(serde_json::Value::Null, |v| serde_json::Value::String(v)), min_value_numeric.and_then(|v| serde_json::Number::from_f64(v).map(serde_json::Value::Number))
max_value_str.map_or(serde_json::Value::Null, |v| serde_json::Value::String(v)), .unwrap_or(serde_json::Value::Null),
) max_value_numeric.and_then(|v| serde_json::Number::from_f64(v).map(serde_json::Value::Number))
} else { .unwrap_or(serde_json::Value::Null),
( ),
min_value.map_or(serde_json::Value::Null, |v| { SimpleType::Date => (
match serde_json::Number::from_f64(v) { min_value_str.map_or(serde_json::Value::Null, serde_json::Value::String),
Some(num) => serde_json::Value::Number(num), max_value_str.map_or(serde_json::Value::Null, serde_json::Value::String),
None => serde_json::Value::Null, ),
} // Don't provide min/max for other types
}), _ => (serde_json::Value::Null, serde_json::Value::Null),
max_value.map_or(serde_json::Value::Null, |v| {
match serde_json::Number::from_f64(v) {
Some(num) => serde_json::Value::Number(num),
None => serde_json::Value::Null,
}
}),
)
}; };
ColumnMetaData { ColumnMetaData {
name: column_name.to_lowercase(), name: column_name.to_lowercase(),
min_value, min_value: min_value_json,
max_value, max_value: max_value_json,
unique_values: value_map.len() as i32, unique_values: value_map.len() as i32, // Count includes distinct null representations
simple_type, simple_type,
column_type, column_type,
} }